def test_build1(self):
    # Build a GeneIDMapGPL from the GPL96 sample annotation and compare
    # both directions of the resulting map with the reference dictionaries.
    geneidmap = GeneIDMapGPL()
    # GPL96.txt
    sample_path = os.path.abspath(
        os.path.join(self.test_data_root, 'gpl96_sample.txt'))
    sample_fh = DSV.getHandle(sample_path)
    anno_dsv = DSV(self.dbm, self.testdb, sample_fh,
                   dtname=self.annoTable, delimiter='\t', comment='#')
    anno_dsv.create()
    anno_dsv.loadAll()
    anno_dsv.close()
    # build test map
    geneidmap.build(anno_dsv, None, self.testdb)
    self.assertTrue(geneidmap.built)
    self.assertIsInstance(geneidmap.dbt, DBTable)
    # forward direction first, then backward, each checked against its reference
    forward = dict(geneidmap.gene2emid.getFwdMap())
    self.assertEqual(self.ref_fwd1, forward)
    backward = dict(geneidmap.gene2emid.getBwdMap())
    self.assertEqual(self.ref_bwd1, backward)
def test_recache2(self):
    # Recache a DataSet after the underlying table has been emptied.
    handle = DSV.getHandle(self.num_dsv_path)
    # default DSV, dialect and delimiter sniffed
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    table.create()
    table.loadAll()
    # data set by default spanning all rows and all columns
    ds = DataSet(dbtable=table)
    table.close()
    numpy.testing.assert_array_almost_equal(self.array1, ds.array)
    # wipe out underlying table; commit after each statement
    cursor = table.db.cursor()
    for statement in ('delete from "%s";' % self.test_dtname, 'vacuum;'):
        cursor.execute(statement)
        table.db.commit()
    # NOTE: before numpy 1.6.0, empty file in loadtxt() generates IOError,
    # with 1.6.0+ only warning
    if not check_min_numpy_version(1, 6, 0):
        with self.assertRaises(Error):
            ds.recache()
        return
    # perform recache (we shall see empty array)
    # suppress numpy warning of empty source file
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ds.recache()
        numpy.testing.assert_array_almost_equal(self.empty_array, ds.array)
def test_init10(self):
    # delimiter sniffed, header extracted (default), ID not resolved
    handle = DSV.getHandle(self.num_dsv_path)
    table = DSV(self.dbm, self.testdb, handle,
                dtname=self.test_dtname,
                make_missing_ID_column=False)
    # header must stay exactly as found in the file
    self.assertSequenceEqual(self.num_dsv_actual_header, table.header)
    table.create()
    # freshly created table exists but holds no rows yet
    self.assertTrue(table.isCreated())
    self.assertTrue(table.isEmpty())
    table.close()
def test_init9(self):
    # delimiter sniffed, header extracted (default), ID resolved
    handle = DSV.getHandle(self.num_dsv_path)
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    # header must match the desired (ID-resolved) reference header
    self.assertSequenceEqual(self.num_dsv_desired_header, table.header)
    table.create()
    # freshly created table exists but holds no rows yet
    self.assertTrue(table.isCreated())
    self.assertTrue(table.isEmpty())
    table.close()
def test_init12(self):
    # delimiter sniffed, header supplied (proper length)
    handle = DSV.getHandle(self.num_dsv_path)
    # one 'C<n>' name per column of the actual header, numbered from 1
    column_count = len(self.num_dsv_actual_header)
    supplied_header = tuple('C%d' % (i + 1) for i in range(column_count))
    table = DSV(self.dbm, self.testdb, handle,
                dtname=self.test_dtname, header=supplied_header)
    self.assertSequenceEqual(supplied_header, table.header)
    table.create()
    # freshly created table exists but holds no rows yet
    self.assertTrue(table.isCreated())
    self.assertTrue(table.isEmpty())
    table.close()
def test_init11(self):
    # delimiter sniffed, header auto-generated
    handle = DSV.getHandle(self.num_dsv_path)
    # an empty header tuple is passed; the expected header below is
    # the column numbers '1'..'N'
    table = DSV(self.dbm, self.testdb, handle,
                dtname=self.test_dtname, header=())
    column_count = len(self.num_dsv_actual_header)
    expected_header = tuple('%d' % (i + 1) for i in range(column_count))
    self.assertSequenceEqual(expected_header, table.header)
    table.create()
    # freshly created table exists but holds no rows yet
    self.assertTrue(table.isCreated())
    self.assertTrue(table.isEmpty())
    table.close()
def test_init7(self):
    # DataSet restricted to the rows in self.sample_rows_1.
    handle = DSV.getHandle(self.num_dsv_path)
    # default DSV, dialect and delimiter sniffed
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    table.create()
    table.loadAll()
    table.close()
    # get only first row
    first_row_ds = DataSet(dbtable=table, rows=self.sample_rows_1)
    numpy.testing.assert_array_almost_equal(self.array_v1,
                                            first_row_ds.array)
def test_init6(self):
    # DataSet over a fully loaded table, no row/column restriction.
    handle = DSV.getHandle(self.num_dsv_path)
    # default DSV, dialect and delimiter sniffed
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    table.create()
    table.loadAll()
    table.close()
    # data set by default spanning all rows and all columns
    full_ds = DataSet(dbtable=table)
    numpy.testing.assert_array_almost_equal(self.array1, full_ds.array)
def setUp(self):
    # Fixture: load ssdata.dsv into the test database and prepare three
    # SubsetSizeCategorizer instances (thresholds 2, 3 and 0).
    self.test_data_root = TEST_INVARIANTS['test_data_root']
    self.test_write_root = TEST_INVARIANTS['test_write_root']
    self.testdb = 'DB1'
    self.test_dtname = 'Test1'
    self.dbm = DBManager(self.test_write_root)

    # ssdata.dsv
    self.ssdata_dsv_path = os.path.abspath(
        os.path.join(self.test_data_root, 'ssdata.dsv'))
    self.ssdata_comment = '#'
    handle = DSV.getHandle(self.ssdata_dsv_path)
    table = DSV(self.dbm, self.testdb, handle,
                dtname=self.test_dtname, comment=self.ssdata_comment)
    table.create()
    table.loadAll()
    table.close()
    self.ssdata_dsv1 = table
    self.ssdata_samples = ('S1', 'S2', 'S3', 'S4', 'S5')

    # hierarchy tests
    self.pkc2id = {'PKC1': ('R1', 'R2', 'R3', 'R4'), 'PKC2': ('R5', )}
    self.pkc = ('PKC1', 'PKC2')

    # first categorizer
    self.size_thr1 = 2
    cat1 = SubsetSizeCategorizer(self.size_thr1, ID='Cat1')
    self.cat1 = cat1
    self.cat1_uniq_le = cat1.uniquifyCategory(cat1.ROW_SIZE_LESSER)
    self.cat1_uniq_gt = cat1.uniquifyCategory(cat1.ROW_SIZE_GREATER)

    # second categorizer
    self.size_thr2 = 3
    cat2 = SubsetSizeCategorizer(self.size_thr2, ID='Cat2')
    self.cat2 = cat2
    self.cat2_uniq_le = cat2.uniquifyCategory(cat2.ROW_SIZE_LESSER)
    self.cat2_uniq_gt = cat2.uniquifyCategory(cat2.ROW_SIZE_GREATER)

    # third categorizer
    self.size_thr3 = 0
    cat3 = SubsetSizeCategorizer(self.size_thr3, ID='Cat3')
    self.cat3 = cat3
    self.cat3_uniq_le = cat3.uniquifyCategory(cat3.ROW_SIZE_LESSER)
    self.cat3_uniq_gt = cat3.uniquifyCategory(cat3.ROW_SIZE_GREATER)

    # categorizer instances by ID, plus two orderings of those IDs
    self.cinst = {'Cat1': cat1, 'Cat2': cat2, 'Cat3': cat3}
    self.cmap1 = ['Cat1', 'Cat2', 'Cat3']
    self.cmap2 = ['Cat3', 'Cat1', 'Cat2']
    self.symbols = list(self.pkc)
def setUp(self):
    # Fixture: load the HGNC sample into the test database and define the
    # reference previous-symbol and synonym maps.
    self.test_data_root = TEST_INVARIANTS['test_data_root']
    self.test_write_root = TEST_INVARIANTS['test_write_root']
    self.testdb = 'DB1'
    self.hgncTable = 'HGNC'
    self.dbm = DBManager(self.test_write_root)
    #
    sample_path = os.path.abspath(
        os.path.join(self.test_data_root, 'hgnc_sample_2.txt'))
    sample_fh = DSV.getHandle(sample_path)
    # tab-delimited file with '#' comment lines
    self.hgnc_dsv = DSV(self.dbm, self.testdb, sample_fh,
                        dtname=self.hgncTable,
                        delimiter='\t', comment='#')
    self.hgnc_dsv.create()
    self.hgnc_dsv.loadAll()
    self.hgnc_dsv.close()
    #
    self.withdrawn_pattern = '%~withdrawn'
    self.symbol_col = 'Approved Symbol'
    #
    # NOTE: we use unicode since we do not reparse immediately after querying
    self.ref_previous1 = {
        u'NTRK4': [u'DDR1'],
        u'PTK3A': [u'DDR1'],
        u'NEP': [u'DDR1'],
        u'CAK': [u'DDR1'],
        u'EDDR1': [u'DDR1'],
        u'C19orf72': [u'DCAF15'],
    }
    self.ref_synonyms1 = {
        u'A1': [u'RFC2'],
        u'BEHAB': [u'BCAN'],
        u'CD167': [u'DDR1'],
        u'CSPG7': [u'BCAN'],
        u'DRC3': [u'EPS8L1'],
        u'FLJ20258': [u'EPS8L1'],
        u'MGC13038': [u'BCAN'],
        u'MGC23164': [u'EPS8L1'],
        u'MGC4642': [u'EPS8L1'],
        u'MGC99481': [u'DCAF15'],
        u'RFC40': [u'RFC2'],
        u'RTK6': [u'DDR1'],
    }
def test_init3(self):
    # DataSet construction with cols='BBB' on a created (but not loaded)
    # table must raise Error.
    handle = DSV.getHandle(self.num_dsv_path)
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    table.create()
    table.close()
    self.assertRaises(Error, DataSet, dbtable=table, cols='BBB')
def test_loadall1(self):
    # loadAll(debug=True) on the numeric DSV must return exactly the two
    # reference insert statements below (rows V1 and V2).
    handle = DSV.getHandle(self.num_dsv_path)
    # default DSV
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    self.assertSequenceEqual(self.num_dsv_desired_header, table.header)
    table.create()
    self.assertTrue(table.isCreated())
    self.assertTrue(table.isEmpty())
    # load from file
    statements = table.loadAll(debug=True)
    table.close()
    # reference statement for row V1 (adjacent literals form one string)
    expected_v1 = (
        'insert into "Test1" values ("V1","7.29865639942","7.1839394018853","8.08785988003525","8.43784327460378","7.56725674896063","7.17150350961048",'
        '"8.23772125375395","7.26860393651388","6.74186036580687","7.55493056104098","7.37521470969549","6.35468766815909",'
        '"7.03794441889888","6.75197742759923","7.26608934160658","8.70335292880697","6.85443361759566","7.59055769774248",'
        '"8.01751559655053","6.99993079846214","7.10871523619365","7.65161630470663","6.71058065426046","6.64437907655326",'
        '"6.93172233805358","7.61870427987243","6.9634175191832","6.37433009206648","6.34485366708736","6.0977075555399",'
        '"6.9061361459302","6.54264897912374","6.31961323363347","6.16533391728077","6.90481905323935","6.7168440158265",'
        '"7.22535319774288","6.20123577217092","6.93391118518623","6.82985307889579","6.35468239627533","7.09693639659124",'
        '"7.60449775270475","7.12266778930967","6.35835046528365","6.76414046791","6.17508883882112","6.52508274039929",'
        '"7.11162248509395","6.89152906126555","6.49949720627377","6.69448041622817","6.37526926527225","5.80401273298264",'
        '"7.12987703240072","6.05831629170905","6.81624397767137","6.66820808623227","6.64998519558867","6.42308111524492",'
        '"7.58672787003923","3.84767749509431","6.71665724008276","6.35468766815909","6.54859953448512","7.23447515724748",'
        '"6.70007125889196","6.28445976227631","6.75206243946758","6.7168440158265","6.55922419484843","6.93675713126568",'
        '"6.80067557800434","6.50103393612957","6.91542815411986","6.19960368164491","7.6448783709798","6.2125929974423",'
        '"6.35468766815909","7.32784699996015","6.14659907126786","6.7168440158265","6.8825610653412","6.72831600642366",'
        '"6.46374697412319","5.79584776993902","6.0825372527799","7.1204899554919","6.39620062779895","6.35814627516342",'
        '"6.35814627516342")'
    )
    # reference statement for row V2
    expected_v2 = (
        'insert into "Test1" values ("V2","2.38904325749261","2.37588862645719","2.37310583895584","2.38904325749261","2.42091222425779","2.38904325749261",'
        '"2.38626046999126","2.38904325749261","2.38904325749261","2.41002306956031","2.38904325749261","2.38904325749261",'
        '"2.38904325749261","2.37310583895584","2.38626046999126","2.34429782913723","2.38904325749261","2.98112952430922",'
        '"2.34553574786241","2.37310583895584","2.39660701797421","2.38904325749261","2.40955866820479","2.38626046999126",'
        '"2.35577218230877","2.39443448171899","2.34433277775847","2.69053923836483","2.38430054425455","2.86158891209344",'
        '"2.34595261411454","2.89813268468409","2.42777977950130","2.38626046999126","2.44904175049461","3.55795174775419",'
        '"2.66896481156844","2.38626046999126","2.71772299956764","2.61602731442131","2.56996895766296","3.86202701130675",'
        '"2.38904325749261","2.35577218230877","2.60505670342601","3.12697260562512","2.38904325749261","3.15740854425796",'
        '"2.65364423092787","2.45124596034905","3.14913252263311","2.38904325749261","2.39700474393300","2.38904325749261",'
        '"2.46188514405506","3.23873137510437","2.55373906857937","3.39601442806742","3.16936129560691","3.18777558546775",'
        '"2.38904325749261","2.38904325749261","2.38626046999126","2.34553574786241","2.35577218230877","2.38624782221570",'
        '"2.35577218230877","2.38904325749261","2.74265374191966","2.37188401381886","2.37588862645719","2.38904325749261",'
        '"2.35577218230877","2.35121946858936","2.49946444329392","2.38904325749261","2.34553574786241","2.93960156829307",'
        '"2.38904325749261","2.39182604499395","2.38904325749261","2.35315614841910","2.47149945385376","2.38626046999126",'
        '"2.39596753440869","2.38904325749261","2.40223987191512","2.34715558421848","2.38210356896247","2.34719053283972",'
        '"2.76820667786915")'
    )
    self.assertSequenceEqual([expected_v1, expected_v2], statements)
def test_init4(self):
    # DataSet construction with the "none" row and column selections on a
    # created (but not loaded) table must raise Error.
    handle = DSV.getHandle(self.num_dsv_path)
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    table.create()
    table.close()
    self.assertRaises(Error, DataSet,
                      dbtable=table,
                      rows=self.sample_rows_none,
                      cols=self.sample_cols_none)
def test_init1(self):
    # default DSV, dialect and delimiter sniffed
    handle = DSV.getHandle(self.num_dsv_path)
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    # constructing the DSV alone must not create the table
    self.assertFalse(table.isCreated())
    # the sniffed delimiter for this file is a comma
    self.assertEqual(',', table.dialect.delimiter)
    table.close()
def test_init6(self):
    # delimiter sniffed, comment resolved successfully
    handle = DSV.getHandle(self.num_dsv_path)
    table = DSV(self.dbm, self.testdb, handle,
                dtname=self.test_dtname, comment='#')
    # constructing the DSV alone must not create the table
    self.assertFalse(table.isCreated())
    # the supplied comment character is stored unchanged
    self.assertEqual('#', table.comment)
    table.close()
def test_loadall2(self):
    # Load the numeric DSV and verify two columns straight from the
    # database.
    handle = DSV.getHandle(self.num_dsv_path)
    # default DSV
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    self.assertSequenceEqual(self.num_dsv_desired_header, table.header)
    table.create()
    table.loadAll()
    table.close()
    # low level checks
    cursor = table.db.cursor()

    def column_values(column):
        # select one column from the table and stringify every value
        cursor.execute('select %s from %s' % (column, table.name))
        return [str(row[0]) for row in cursor.fetchall()]

    # first header column against the reference rows
    self.assertSequenceEqual(
        self.num_rows, column_values(self.num_dsv_desired_header[0]))
    # header column at index 3 against its reference values
    self.assertSequenceEqual(
        self.num_column_3, column_values(self.num_dsv_desired_header[3]))
def test_loadall3(self):
    # loadAll(debug=True) on the annotation DSV must return exactly the two
    # reference insert statements below (probes 1007_s_at and 1053_at).
    handle = DSV.getHandle(self.anno_dsv_path)
    table = DSV(self.dbm, self.testdb, handle,
                dtname=self.test_dtname, comment=self.anno_comment)
    table.create()
    statements = table.loadAll(debug=True)
    table.close()
    # NOTE(review): 'H**o sapiens' below is reproduced exactly as found;
    # it looks like a mangled 'Homo sapiens' -- confirm against the data file
    expected_1007 = (
        'insert into "Test1" values ("1007_s_at","U48705","","H**o sapiens","Mar 11, 2009",'
        '"Exemplar sequence","Affymetrix Proprietary Database",'
        '"U48705 /FEATURE=mRNA /DEFINITION=HSU48705 Human receptor tyrosine kinase DDR gene, complete cds",'
        '"U48705","discoidin domain receptor tyrosine kinase 1",'
        '"DDR1","780","NM_001954 /// NM_013993 /// NM_013994",'
        '"0006468 // protein amino acid phosphorylation // inferred from electronic annotation ///'
        ' 0007155 // cell adhesion // traceable author statement ///'
        ' 0007155 // cell adhesion // inferred from electronic annotation ///'
        ' 0007169 // transmembrane receptor protein tyrosine kinase signaling pathway // inferred from electronic annotation",'
        '"0005887 // integral to plasma membrane // traceable author statement ///'
        ' 0016020 // membrane // inferred from electronic annotation ///'
        ' 0016021 // integral to membrane // inferred from electronic annotation",'
        '"0000166 // nucleotide binding // inferred from electronic annotation ///'
        ' 0004672 // protein kinase activity // inferred from electronic annotation ///'
        ' 0004713 // protein tyrosine kinase activity // inferred from electronic annotation ///'
        ' 0004714 // transmembrane receptor protein tyrosine kinase activity // traceable author statement ///'
        ' 0004714 // transmembrane receptor protein tyrosine kinase activity // inferred from electronic annotation ///'
        ' 0004872 // receptor activity // inferred from electronic annotation ///'
        ' 0005515 // protein binding // inferred from physical interaction ///'
        ' 0005524 // ATP binding // inferred from electronic annotation ///'
        ' 0016301 // kinase activity // inferred from electronic annotation ///'
        ' 0016740 // transferase activity // inferred from electronic annotation")'
    )
    expected_1053 = (
        'insert into "Test1" values ("1053_at","M87338","","H**o sapiens","Mar 11, 2009",'
        '"Exemplar sequence","GenBank",'
        '"M87338 /FEATURE= /DEFINITION=HUMA1SBU Human replication factor C, 40-kDa subunit (A1) mRNA, complete cds",'
        '"M87338","replication factor C (activator 1) 2, 40kDa",'
        '"RFC2","5982","NM_002914 /// NM_181471",'
        '"0006260 // DNA replication // not recorded ///'
        ' 0006260 // DNA replication // inferred from electronic annotation ///'
        ' 0006297 // nucleotide-excision repair, DNA gap filling // not recorded",'
        '"0005634 // nucleus // inferred from electronic annotation ///'
        ' 0005654 // nucleoplasm // not recorded ///'
        ' 0005663 // DNA replication factor C complex // inferred from direct assay ///'
        ' 0005663 // DNA replication factor C complex // inferred from electronic annotation",'
        '"0000166 // nucleotide binding // inferred from electronic annotation ///'
        ' 0003677 // DNA binding // inferred from electronic annotation ///'
        ' 0003689 // DNA clamp loader activity // inferred from electronic annotation ///'
        ' 0005515 // protein binding // inferred from physical interaction ///'
        ' 0005524 // ATP binding // traceable author statement ///'
        ' 0005524 // ATP binding // inferred from electronic annotation ///'
        ' 0017111 // nucleoside-triphosphatase activity // inferred from electronic annotation")'
    )
    self.assertSequenceEqual([expected_1007, expected_1053], statements)
def test_build1(self):
    # Build a PKCIDMapGOGPL from the GPL96 sample annotation and verify the
    # bidirectional map: every reference probe maps to its reference terms
    # (and back), and no control probe appears in either direction.
    pkcidmap = PKCIDMapGOGPL()
    # GPL96.txt
    # (only the small sample file is loaded here, not 'GPL96.txt.bz2')
    gpl_path = os.path.abspath(
        os.path.join(self.test_data_root, 'gpl96_sample.txt'))
    gpl_comment = '#'
    gpl_delimiter = '\t'
    gpl_fh = DSV.getHandle(gpl_path)
    gpl_dsv = DSV(self.dbm, self.testdb, gpl_fh,
                  dtname=self.annoTable,
                  delimiter=gpl_delimiter, comment=gpl_comment)
    gpl_dsv.create()
    gpl_dsv.loadAll()
    gpl_dsv.close()
    pkcidmap.build(gpl_dsv, self.testdb)
    self.assertTrue(pkcidmap.built)
    self.assertIsInstance(pkcidmap.dbt, DBTable)
    # test bidirectional map
    bwdmap = pkcidmap.pkc2emid.getBwdMap()
    fwdmap = pkcidmap.pkc2emid.getFwdMap()
    # check valid probes and terms for existence
    # NOTE: items() instead of iteritems() so the loop also runs on
    # Python 3, where dict.iteritems() no longer exists
    for ref_probe, ref_terms in self.test_bwd.items():
        # backward map
        self.assertIn(ref_probe, bwdmap)
        terms = bwdmap[ref_probe]
        self.assertEqual(ref_terms, terms)
        # forward map
        for term in terms:
            self.assertIn(term, fwdmap)
            probes = fwdmap[term]
            self.assertIn(ref_probe, probes)
    # check control probes for nonexistence
    for test_ctrl in self.test_ctrl_probes:
        # backward map
        self.assertNotIn(test_ctrl, bwdmap)
        for probes in fwdmap.values():
            # forward map
            self.assertNotIn(test_ctrl, probes)
def test_init2(self):
    # predefined delimiter, resolved successfully
    # NOTE: class does not check if delimiter is valid at this point
    handle = DSV.getHandle(self.num_dsv_path)
    table = DSV(self.dbm, self.testdb, handle,
                dtname=self.test_dtname, delimiter='\t')
    # constructing the DSV alone must not create the table
    self.assertFalse(table.isCreated())
    # a tab delimiter must select the csv module's 'excel-tab' dialect
    expected_dialect = csv.get_dialect('excel-tab')
    self.assertEqual(expected_dialect, table.dialect)
    self.assertEqual('\t', table.dialect.delimiter)
    table.close()
def test_loadall4(self):
    # Load the annotation DSV and verify the first column plus a selection
    # of annotation columns straight from the database.
    handle = DSV.getHandle(self.anno_dsv_path)
    table = DSV(self.dbm, self.testdb, handle,
                dtname=self.test_dtname, comment=self.anno_comment)
    self.assertSequenceEqual(self.anno_header, table.header)
    table.create()
    table.loadAll()
    table.close()
    # low level checks
    cursor = table.db.cursor()
    # first header column against the reference rows
    cursor.execute('select %s from %s' % (self.anno_header[0], table.name))
    first_column = [str(row[0]) for row in cursor.fetchall()]
    self.assertSequenceEqual(self.anno_rows, first_column)
    # selected annotation columns, quoted and fetched in one query
    quoted_cols = ','.join([quote(c) for c in self.anno_columns])
    cursor.execute('select %s from %s' % (quoted_cols, table.name))
    fetched = cursor.fetchall()
    # regroup the row-wise result into {column name: [values as str]}
    by_column = dict(
        (name, [str(row[pos]) for row in fetched])
        for pos, name in enumerate(self.anno_columns))
    self.assertDictEqual(self.anno_columns_dict, by_column)
def test_em2annotation1(self):
    # get_em2annotation() on the table of a built GeneIDMapGPL must equal
    # the reference mapping.
    geneidmap = GeneIDMapGPL()
    # GPL96.txt
    sample_path = os.path.abspath(
        os.path.join(self.test_data_root, 'gpl96_sample.txt'))
    sample_fh = DSV.getHandle(sample_path)
    anno_dsv = DSV(self.dbm, self.testdb, sample_fh,
                   dtname=self.annoTable, delimiter='\t', comment='#')
    anno_dsv.create()
    anno_dsv.loadAll()
    anno_dsv.close()
    geneidmap.build(anno_dsv, None, self.testdb)
    annotation = get_em2annotation(geneidmap.dbt)
    self.assertEqual(self.ref_em2a, annotation)
def setUp(self):
    # Fixture: load labels1.dsv and labels2.dsv into the test database and
    # define sample orderings together with their expected responses.
    self.test_write_root = TEST_INVARIANTS['test_write_root']
    self.test_data_root = TEST_INVARIANTS['test_data_root']
    self.sample_data_root = self.test_write_root
    self.testdb = 'DB1'
    self.dbm = DBManager(self.sample_data_root)
    #
    # ---- first label table ----
    self.test_dtname1 = 'LABELS1'
    self.lab1_dsv_path = os.path.abspath(
        os.path.join(self.test_data_root, 'labels1.dsv'))
    self.lab1_fh = DSV.getHandle(self.lab1_dsv_path, 'rb')
    self.lab1_dsv = DSV(self.dbm, self.testdb, self.lab1_fh,
                        dtname=self.test_dtname1)
    self.lab1_dsv.create()
    self.lab1_dsv.loadAll()
    self.lab1_dsv.close()
    self.lab1_cnt = {
        'A1': '1', 'A2': '1', 'A3': '1', 'A4': '1',
        'B1': '-1', 'B2': '-1', 'B3': '-1', 'B4': '-1',
    }
    # case 1: natural order
    self.lab1_samples1 = ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4']
    self.lab1_resp1 = ['1', '1', '1', '1', '-1', '-1', '-1', '-1']
    self.lab1_samples_resp1 = ['A1', 'A2', 'A3', 'A4',
                               'B1', 'B2', 'B3', 'B4']
    # case 2: reversed order
    self.lab1_samples2 = ['B4', 'B3', 'B2', 'B1', 'A4', 'A3', 'A2', 'A1']
    self.lab1_resp2 = ['-1', '-1', '-1', '-1', '1', '1', '1', '1']
    self.lab1_samples_resp2 = ['B4', 'B3', 'B2', 'B1',
                               'A4', 'A3', 'A2', 'A1']
    # case 3: interleaved order
    self.lab1_samples3 = ['A1', 'B1', 'A2', 'B2', 'A3', 'B3', 'A4', 'B4']
    self.lab1_resp3 = ['1', '-1', '1', '-1', '1', '-1', '1', '-1']
    self.lab1_samples_resp3 = ['A1', 'B1', 'A2', 'B2',
                               'A3', 'B3', 'A4', 'B4']
    # case 4: subset of the samples
    self.lab1_samples4 = ['A1', 'B1', 'B4', 'A3']
    self.lab1_resp4 = ['1', '-1', '-1', '1']
    self.lab1_samples_resp4 = ['A1', 'B1', 'B4', 'A3']
    # case 5: a None entry among the samples
    self.lab1_samples5 = ['A1', 'B1', None, 'A3']
    self.lab1_resp5 = ['1', '-1', '1']
    self.lab1_samples_resp5 = ['A1', 'B1', 'A3']
    # case 6: names not present in lab1_cnt
    # NOTE(review): 'B4' 'QQQ7546dsfsdfs453' is an implicit concatenation
    # (one element, 'B4QQQ7546dsfsdfs453'); the expected values below rely
    # on it -- confirm whether a comma was intended
    self.lab1_samples6 = ['A1', 'XXX1', 'B4' 'QQQ7546dsfsdfs453']
    self.lab1_resp6 = ['1']
    self.lab1_samples_resp6 = ['A1']
    #
    # ---- second label table ----
    self.test_dtname2 = 'LABELS2'
    self.lab2_dsv_path = os.path.abspath(
        os.path.join(self.test_data_root, 'labels2.dsv'))
    self.lab2_fh = DSV.getHandle(self.lab2_dsv_path, 'rb')
    self.lab2_dsv = DSV(self.dbm, self.testdb, self.lab2_fh,
                        dtname=self.test_dtname2)
    self.lab2_dsv.create()
    self.lab2_dsv.loadAll()
    self.lab2_dsv.close()
    # reference label maps with ("N") and without ("Y") the '0' entries
    self.lab2_cntN = {
        'A1': '1', 'A2': '1', 'A3': '1', 'A4': '1',
        'B1': '-1', 'B2': '-1', 'B3': '-1', 'B4': '-1',
        'PP': '0', 'QQ': '0', 'RR': '0', 'SS': '0',
    }
    self.lab2_cntY = {
        'A1': '1', 'A2': '1', 'A3': '1', 'A4': '1',
        'B1': '-1', 'B2': '-1', 'B3': '-1', 'B4': '-1',
    }
    self.lab2_samples_order1 = ['A1', 'A2', 'A3', 'A4',
                                'B1', 'B2', 'B3', 'B4']
    self.lab2_samples_resp1 = ['A1', 'A2', 'A3', 'A4',
                               'B1', 'B2', 'B3', 'B4']
    self.lab2_samples_order2 = ['A1', 'A2', 'QQ', 'A3', 'A4', 'SS',
                                'B1', 'B2', 'B3', 'RR', 'B4', 'PP']
    self.lab2_samples_resp2 = ['A1', 'A2', 'A3', 'A4',
                               'B1', 'B2', 'B3', 'B4']
def test_init5(self):
    # DataSet over a created but never loaded table.
    handle = DSV.getHandle(self.num_dsv_path)
    # default DSV, dialect and delimiter sniffed
    table = DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname)
    # data set by default spanning all rows and all columns
    # data not loaded, we shall see empty array
    table.create()
    table.close()
    # NOTE: before numpy 1.6.0, empty file in loadtxt() generates IOError,
    # with 1.6.0+ only warning
    if not check_min_numpy_version(1, 6, 0):
        self.assertRaises(Error, DataSet, dbtable=table)
        return
    # suppress numpy warning of empty source file
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        empty_ds = DataSet(dbtable=table)
        numpy.testing.assert_equal(self.empty_array, empty_ds.array)
class TestLabels1(unittest.TestCase):
    """Tests of Labels built from the labels1.dsv and labels2.dsv tables."""

    def setUp(self):
        # Load both label tables into the test database and define sample
        # orderings together with their expected responses.
        self.test_write_root = TEST_INVARIANTS['test_write_root']
        self.test_data_root = TEST_INVARIANTS['test_data_root']
        self.sample_data_root = self.test_write_root
        self.testdb = 'DB1'
        self.dbm = DBManager(self.sample_data_root)
        #
        # ---- first label table ----
        self.test_dtname1 = 'LABELS1'
        self.lab1_dsv_path = os.path.abspath(
            os.path.join(self.test_data_root, 'labels1.dsv'))
        self.lab1_fh = DSV.getHandle(self.lab1_dsv_path, 'rb')
        self.lab1_dsv = DSV(self.dbm, self.testdb, self.lab1_fh,
                            dtname=self.test_dtname1)
        self.lab1_dsv.create()
        self.lab1_dsv.loadAll()
        self.lab1_dsv.close()
        self.lab1_cnt = {
            'A1': '1', 'A2': '1', 'A3': '1', 'A4': '1',
            'B1': '-1', 'B2': '-1', 'B3': '-1', 'B4': '-1',
        }
        # case 1: natural order
        self.lab1_samples1 = ['A1', 'A2', 'A3', 'A4',
                              'B1', 'B2', 'B3', 'B4']
        self.lab1_resp1 = ['1', '1', '1', '1', '-1', '-1', '-1', '-1']
        self.lab1_samples_resp1 = ['A1', 'A2', 'A3', 'A4',
                                   'B1', 'B2', 'B3', 'B4']
        # case 2: reversed order
        self.lab1_samples2 = ['B4', 'B3', 'B2', 'B1',
                              'A4', 'A3', 'A2', 'A1']
        self.lab1_resp2 = ['-1', '-1', '-1', '-1', '1', '1', '1', '1']
        self.lab1_samples_resp2 = ['B4', 'B3', 'B2', 'B1',
                                   'A4', 'A3', 'A2', 'A1']
        # case 3: interleaved order
        self.lab1_samples3 = ['A1', 'B1', 'A2', 'B2',
                              'A3', 'B3', 'A4', 'B4']
        self.lab1_resp3 = ['1', '-1', '1', '-1', '1', '-1', '1', '-1']
        self.lab1_samples_resp3 = ['A1', 'B1', 'A2', 'B2',
                                   'A3', 'B3', 'A4', 'B4']
        # case 4: subset of the samples
        self.lab1_samples4 = ['A1', 'B1', 'B4', 'A3']
        self.lab1_resp4 = ['1', '-1', '-1', '1']
        self.lab1_samples_resp4 = ['A1', 'B1', 'B4', 'A3']
        # case 5: a None entry among the samples
        self.lab1_samples5 = ['A1', 'B1', None, 'A3']
        self.lab1_resp5 = ['1', '-1', '1']
        self.lab1_samples_resp5 = ['A1', 'B1', 'A3']
        # case 6: names not present in lab1_cnt
        # NOTE(review): 'B4' 'QQQ7546dsfsdfs453' is an implicit
        # concatenation (one element, 'B4QQQ7546dsfsdfs453'); the expected
        # values below rely on it -- confirm whether a comma was intended
        self.lab1_samples6 = ['A1', 'XXX1', 'B4' 'QQQ7546dsfsdfs453']
        self.lab1_resp6 = ['1']
        self.lab1_samples_resp6 = ['A1']
        #
        # ---- second label table ----
        self.test_dtname2 = 'LABELS2'
        self.lab2_dsv_path = os.path.abspath(
            os.path.join(self.test_data_root, 'labels2.dsv'))
        self.lab2_fh = DSV.getHandle(self.lab2_dsv_path, 'rb')
        self.lab2_dsv = DSV(self.dbm, self.testdb, self.lab2_fh,
                            dtname=self.test_dtname2)
        self.lab2_dsv.create()
        self.lab2_dsv.loadAll()
        self.lab2_dsv.close()
        # reference label maps with ("N") and without ("Y") the '0' entries
        self.lab2_cntN = {
            'A1': '1', 'A2': '1', 'A3': '1', 'A4': '1',
            'B1': '-1', 'B2': '-1', 'B3': '-1', 'B4': '-1',
            'PP': '0', 'QQ': '0', 'RR': '0', 'SS': '0',
        }
        self.lab2_cntY = {
            'A1': '1', 'A2': '1', 'A3': '1', 'A4': '1',
            'B1': '-1', 'B2': '-1', 'B3': '-1', 'B4': '-1',
        }
        self.lab2_samples_order1 = ['A1', 'A2', 'A3', 'A4',
                                    'B1', 'B2', 'B3', 'B4']
        self.lab2_samples_resp1 = ['A1', 'A2', 'A3', 'A4',
                                   'B1', 'B2', 'B3', 'B4']
        self.lab2_samples_order2 = ['A1', 'A2', 'QQ', 'A3', 'A4', 'SS',
                                    'B1', 'B2', 'B3', 'RR', 'B4', 'PP']
        self.lab2_samples_resp2 = ['A1', 'A2', 'A3', 'A4',
                                   'B1', 'B2', 'B3', 'B4']

    def tearDown(self):
        # Close the manager, then delete the database files it left behind.
        self.dbm.close()
        leftovers = (
            os.path.abspath('%s/%s.db' % (self.test_write_root,
                                          self.testdb)),
            os.path.abspath('%s/%s.root.db' % (self.test_write_root,
                                               SYSTEM_NAME_LC)),
        )
        for db_path in leftovers:
            if os.path.exists(db_path):
                os.remove(db_path)
        self.dbm = None

    def _label_cases(self):
        # (requested samples, expected labels) pairs, in test order
        return (
            (self.lab1_samples1, self.lab1_resp1),
            (self.lab1_samples2, self.lab1_resp2),
            (self.lab1_samples3, self.lab1_resp3),
            (self.lab1_samples4, self.lab1_resp4),
            (self.lab1_samples5, self.lab1_resp5),
            (self.lab1_samples6, self.lab1_resp6),
        )

    def _check_labels(self, lab):
        # getLabels() must return the expected label strings for each case
        for samples, expected in self._label_cases():
            self.assertEqual(expected, lab.getLabels(samples))

    def _check_label_arrays(self, lab):
        # getLabels(as_array=True) must equal the expected labels as floats
        for samples, expected in self._label_cases():
            actual = lab.getLabels(samples, as_array=True)
            reference = np.array([float(l) for l in expected])
            np.testing.assert_array_equal(actual, reference)

    def test_init1(self):
        lab = Labels(self.lab1_dsv)
        self.assertEqual(self.lab1_cnt, lab.labels)
        # without an explicit argument the unused-sample label is '0'
        self.assertEqual('0', lab.unused_sample_label)

    def test_init2(self):
        lab = Labels(self.lab1_dsv, unused_sample_label='XXX')
        self.assertEqual('XXX', lab.unused_sample_label)
        self.assertEqual(self.lab1_cnt, lab.labels)

    def test_getLabels1(self):
        self._check_labels(Labels(self.lab1_dsv))

    def test_getLabels2(self):
        self._check_label_arrays(Labels(self.lab1_dsv))

    def test_getLabels3(self):
        lab = Labels(self.lab2_dsv)
        # the '0'-labelled samples must not appear in the label map
        self.assertNotEqual(self.lab2_cntN, lab.labels)
        self.assertEqual(self.lab2_cntY, lab.labels)
        self._check_labels(lab)

    def test_getLabels4(self):
        self._check_label_arrays(Labels(self.lab2_dsv))

    def test_getSamples1(self):
        lab1 = Labels(self.lab1_dsv)
        cases = (
            (self.lab1_samples1, self.lab1_samples_resp1),
            (self.lab1_samples2, self.lab1_samples_resp2),
            (self.lab1_samples3, self.lab1_samples_resp3),
            (self.lab1_samples4, self.lab1_samples_resp4),
            (self.lab1_samples5, self.lab1_samples_resp5),
            (self.lab1_samples6, self.lab1_samples_resp6),
        )
        for requested, expected in cases:
            self.assertEqual(expected, lab1.getSamples(requested))

    def test_getSamples2(self):
        lab2 = Labels(self.lab2_dsv)
        cases = (
            (self.lab2_samples_order1, self.lab2_samples_resp1),
            (self.lab2_samples_order2, self.lab2_samples_resp2),
        )
        for requested, expected in cases:
            self.assertEqual(expected, lab2.getSamples(requested))
def test_init7(self):
    # delimiter sniffed, comment not resolved: passing a tuple as the
    # comment argument must make the DSV constructor raise Error.
    dsv1_fh = DSV.getHandle(self.num_dsv_path)
    try:
        with self.assertRaises(Error):
            DSV(self.dbm, self.testdb, dsv1_fh,
                dtname=self.test_dtname, comment=('#',))
    finally:
        # no DSV instance exists to close the handle after the failed
        # construction, so release it here
        # NOTE(review): assumes getHandle() returns a file-like object
        # with close() -- confirm
        dsv1_fh.close()
def test_init5(self):
    # delimiter not resolved: passing the integer 100 as the delimiter
    # must make the DSV constructor raise Error.
    dsv1_fh = DSV.getHandle(self.num_dsv_path)
    try:
        with self.assertRaises(Error):
            DSV(self.dbm, self.testdb, dsv1_fh,
                dtname=self.test_dtname, delimiter=100)
    finally:
        # no DSV instance exists to close the handle after the failed
        # construction, so release it here
        # NOTE(review): assumes getHandle() returns a file-like object
        # with close() -- confirm
        dsv1_fh.close()
class TestPKDrivenDBDataManager1(unittest.TestCase):
    """Tests of PKDrivenDBDataManager over the ssdata.dsv table."""

    def setUp(self):
        # Load ssdata.dsv into the test database and define the reference
        # subsets and categorizers.
        self.test_data_root = TEST_INVARIANTS['test_data_root']
        self.test_write_root = TEST_INVARIANTS['test_write_root']
        self.testdb = 'DB1'
        self.test_dtname = 'Test1'
        self.dbm = DBManager(self.test_write_root)

        # ssdata.dsv
        self.ssdata_dsv_path = os.path.abspath(
            os.path.join(self.test_data_root, 'ssdata.dsv'))
        self.ssdata_comment = '#'
        handle = DSV.getHandle(self.ssdata_dsv_path)
        self.ssdata_dsv1 = DSV(self.dbm, self.testdb, handle,
                               dtname=self.test_dtname,
                               comment=self.ssdata_comment)
        self.ssdata_dsv1.create()
        self.ssdata_dsv1.loadAll()
        self.ssdata_dsv1.close()
        self.ssdata_samples = ('S1', 'S2', 'S3', 'S4', 'S5')

        def info_only(pkc_id, rows, cols):
            # expected getSubset() result when only subset info is requested
            info = {
                'pkcID': pkc_id,
                'dtable': self.ssdata_dsv1,
                'rows': rows,
                'cols': list(cols),
            }
            return (info, None)

        # pkcidmap
        self.pkc2id1 = {
            'PKC1': ('R1', 'R2'),
            'PKC2': ('R3', ),
            'PKC3': ('R4', 'R5'),
        }
        self.pkc1 = ('PKC1', 'PKC2', 'PKC3')

        # all samples
        self.ss_cols1 = '*'
        self.ref_ss1 = [
            info_only('PKC1', ['R1', 'R2'], self.ssdata_samples),
            info_only('PKC2', ['R3'], self.ssdata_samples),
            info_only('PKC3', ['R4', 'R5'], self.ssdata_samples),
        ]

        # selected samples
        self.ss_cols2 = ('S1', 'S4', 'S5')
        self.ref_ss2 = [
            info_only('PKC1', ['R1', 'R2'], self.ss_cols2),
            info_only('PKC2', ['R3'], self.ss_cols2),
            info_only('PKC3', ['R4', 'R5'], self.ss_cols2),
        ]

        # selected samples, different order and selection of PKC IDs
        self.pkc2 = ('PKC3', 'PKC1')
        self.ref_ss3 = [
            info_only('PKC3', ['R4', 'R5'], self.ss_cols2),
            info_only('PKC1', ['R1', 'R2'], self.ss_cols2),
        ]

        # reference data arrays when only the data set is requested
        self.ref_ss4 = [
            (None, numpy.array([
                [2.44753543273, 42.9497086717, 30.8331998765],
                [42.1888598933, 39.1743921225, 15.9744094108],
            ])),
            (None, numpy.array([
                [16.5734780715, 14.8233987496, 21.7385342744],
                [60.0958378228, 98.4321570519, 71.9193619126],
            ])),
        ]

        # for categorization tests
        self.pkc2id2 = {'PKC1': ('R1', 'R2', 'R3', 'R4'), 'PKC2': ('R5', )}
        self.pkc3 = ('PKC1', 'PKC2')
        self.size_thr = 3
        self.cat1 = SubsetSizeCategorizer(self.size_thr)
        self.exp_categories1 = ['>', '<=']
        self.cat2 = NullCategorizer()
        self.exp_categories2 = [self.cat2.NULL] * len(self.pkc3)

    def tearDown(self):
        # Close the manager, then delete the database files it left behind.
        self.dbm.close()
        leftovers = (
            os.path.abspath('%s/%s.db' % (self.test_write_root,
                                          self.testdb)),
            os.path.abspath('%s/%s.root.db' % (self.test_write_root,
                                               SYSTEM_NAME_LC)),
        )
        for db_path in leftovers:
            if os.path.exists(db_path):
                os.remove(db_path)
        self.dbm = None

    def _manager(self, pkc2id):
        # data manager over the ssdata table with a mocked PKC->ID map
        return PKDrivenDBDataManager(self.ssdata_dsv1, MockPKCIDMap(pkc2id))

    def _info_subsets(self, pkc_ids, samples):
        # getSubset() results (info only, no data set) for each PKC ID
        pkdm = self._manager(self.pkc2id1)
        return [
            pkdm.getSubset(pkc, forSamples=samples,
                           get_ssinfo=True, get_dataset=False)
            for pkc in pkc_ids
        ]

    def _categories(self, categorizer):
        # fetch every data set for self.pkc3 first, then categorize each
        pkdm = self._manager(self.pkc2id2)
        datasets = [
            pkdm.getSubset(pkc, forSamples=self.ss_cols2,
                           get_ssinfo=False, get_dataset=True)[1]
            for pkc in self.pkc3
        ]
        return [
            PKDrivenDBDataManager.categorizeSubset(ds, categorizer)
            for ds in datasets
        ]

    def test_init1(self):
        pkdm = self._manager(self.pkc2id1)
        self.assertEqual(self.pkc2id1, pkdm.pkcidmap.pkc2emid)
        self.assertSequenceEqual(self.ssdata_samples, pkdm.all_samples)

    def test_getSubset1(self):
        self.assertEqual(self.ref_ss1,
                         self._info_subsets(self.pkc1, self.ss_cols1))

    def test_getSubset2(self):
        self.assertEqual(self.ref_ss2,
                         self._info_subsets(self.pkc1, self.ss_cols2))

    def test_getSubset3(self):
        self.assertEqual(self.ref_ss3,
                         self._info_subsets(self.pkc2, self.ss_cols2))

    def test_getSubset4(self):
        pkdm = self._manager(self.pkc2id1)
        subsets = [
            pkdm.getSubset(pkc, forSamples=self.ss_cols2,
                           get_ssinfo=False, get_dataset=True)
            for pkc in self.pkc2
        ]
        for expected, actual in zip(self.ref_ss4, subsets):
            exp_info, exp_array = expected
            # compare the info slot, then the data set's array
            self.assertEqual(exp_info, actual[0])
            numpy.testing.assert_array_equal(exp_array, actual[1].array)

    def test_categorizeSubset1(self):
        self.assertSequenceEqual(self.exp_categories1,
                                 self._categories(self.cat1))

    def test_categorizeSubset2(self):
        self.assertSequenceEqual(self.exp_categories2,
                                 self._categories(self.cat2))
def test_close1(self):
    """Calling ``loadAll`` on a DSV that was already closed raises ``Error``."""
    handle = DSV.getHandle(self.anno_dsv_path)
    closed_dsv = DSV(self.dbm, self.testdb, handle,
                     dtname=self.test_dtname, comment=self.anno_comment)
    # run the full lifecycle once: create, load, close
    closed_dsv.create()
    closed_dsv.loadAll()
    closed_dsv.close()
    # a further load attempt on the closed table is expected to fail
    self.assertRaises(Error, closed_dsv.loadAll)
def setUp(self):
    """Load ssdata.dsv into a test table and build the reference fixtures."""
    self.test_data_root = TEST_INVARIANTS['test_data_root']
    self.test_write_root = TEST_INVARIANTS['test_write_root']
    self.testdb = 'DB1'
    self.test_dtname = 'Test1'
    self.dbm = DBManager(self.test_write_root)

    # ssdata.dsv
    self.ssdata_dsv_path = os.path.abspath(
        os.path.join(self.test_data_root, 'ssdata.dsv'))
    self.ssdata_comment = '#'
    source_fh = DSV.getHandle(self.ssdata_dsv_path)
    table = DSV(self.dbm, self.testdb, source_fh,
                dtname=self.test_dtname, comment=self.ssdata_comment)
    self.ssdata_dsv1 = table
    table.create()
    table.loadAll()
    table.close()
    self.ssdata_samples = ('S1', 'S2', 'S3', 'S4', 'S5')

    def info_only(pkc_id, rows, cols):
        # reference pair for a subset-info-only request: (info dict, None)
        info = {
            'pkcID': pkc_id,
            'dtable': table,
            'rows': rows,
            'cols': list(cols),
        }
        return (info, None)

    # pkcidmap
    self.pkc2id1 = {
        'PKC1': ('R1', 'R2'),
        'PKC2': ('R3', ),
        'PKC3': ('R4', 'R5'),
    }
    self.pkc1 = ('PKC1', 'PKC2', 'PKC3')

    # references for the '*' selection list every sample name
    self.ss_cols1 = '*'
    self.ref_ss1 = [
        info_only('PKC1', ['R1', 'R2'], self.ssdata_samples),
        info_only('PKC2', ['R3'], self.ssdata_samples),
        info_only('PKC3', ['R4', 'R5'], self.ssdata_samples),
    ]

    self.ss_cols2 = ('S1', 'S4', 'S5')
    self.ref_ss2 = [
        info_only('PKC1', ['R1', 'R2'], self.ss_cols2),
        info_only('PKC2', ['R3'], self.ss_cols2),
        info_only('PKC3', ['R4', 'R5'], self.ss_cols2),
    ]

    self.pkc2 = ('PKC3', 'PKC1')
    self.ref_ss3 = [
        info_only('PKC3', ['R4', 'R5'], self.ss_cols2),
        info_only('PKC1', ['R1', 'R2'], self.ss_cols2),
    ]

    # reference pairs for dataset-only requests: (None, expected array)
    pkc3_values = numpy.array([
        [2.44753543273, 42.9497086717, 30.8331998765],
        [42.1888598933, 39.1743921225, 15.9744094108],
    ])
    pkc1_values = numpy.array([
        [16.5734780715, 14.8233987496, 21.7385342744],
        [60.0958378228, 98.4321570519, 71.9193619126],
    ])
    self.ref_ss4 = [(None, pkc3_values), (None, pkc1_values)]

    # for categorization tests
    self.pkc2id2 = {'PKC1': ('R1', 'R2', 'R3', 'R4'), 'PKC2': ('R5', )}
    self.pkc3 = ('PKC1', 'PKC2')
    self.size_thr = 3
    self.cat1 = SubsetSizeCategorizer(self.size_thr)
    self.exp_categories1 = ['>', '<=']
    self.cat2 = NullCategorizer()
    self.exp_categories2 = [self.cat2.NULL] * len(self.pkc3)
def test_init13(self):
    """Constructing a DSV with a header of the wrong length raises ``Error``."""
    handle = DSV.getHandle(self.num_dsv_path)
    # delimiter sniffed, header supplied (improper length): twice as many
    # column names as the file's actual header holds
    column_count = len(self.num_dsv_actual_header) * 2
    oversized_header = tuple(
        'C%d' % idx for idx in range(1, column_count + 1))
    with self.assertRaises(Error):
        DSV(self.dbm, self.testdb, handle, dtname=self.test_dtname,
            header=oversized_header)