def test_fastq_unzipped():
    """Run the FastQ reader checks on both the plain and the gzipped input."""
    for source in [data, datagz]:
        # instantiation
        reader = fastq.FastQ(source)
        assert reader.data_format == "Illumina_1.8+"

        # Count twice: re-running the counters (decompression with zlib)
        # must give the same answer the second time.
        for _ in range(2):
            assert reader.count_lines() == 1000
        for _ in range(2):
            assert reader.count_reads() == 250

        # Extract the head of the file, first into an unzipped output and
        # then into a zipped one.
        for options in ({}, {"suffix": ".gz"}):
            head_file = TempFile(**options)
            reader.extract_head(100, head_file.name)
            head_reader = fastq.FastQ(head_file.name)
            assert head_reader.count_lines() == 100
            head_file.delete()

        # context-manager protocol and length
        with FastQ(source) as opened:
            assert len(opened) == 250

        # random selection, then re-selection from the returned value
        with TempFile() as selected_file:
            picked = reader.select_random_reads(10, selected_file.name)
            reader.select_random_reads(picked, selected_file.name)
def test_snpeff():
    """Launch snpEff on a custom reference, then check a bad reference aborts.

    The log temp file and the files left in the working directory are
    removed even when the run itself fails.
    """
    # a custom reference
    fh_log = TempFile()
    try:
        mydata = snpeff.SnpEff(reference=sequana_data("JB409847.gbk"),
                               log=fh_log.name)
        with TempFile() as fh:
            mydata.launch_snpeff(sequana_data("JB409847.vcf"), fh.name)
    finally:
        fh_log.delete()

        # cleanup: each file is optional, so a missing one is not an error
        for leftover in ("snpEff.config", "snpEff_genes.txt",
                         "snpEff_summary.html"):
            try:
                os.remove(leftover)
            except OSError:
                pass

    # An unknown reference is expected to terminate with SystemExit. Any
    # other exception propagates and fails the test with its own traceback.
    try:
        snpeff.SnpEff(reference="dummy")
    except SystemExit:
        pass
    else:
        raise AssertionError("SnpEff(reference='dummy') should exit")
def test_attrdict():
    """Check attribute/item access, nested access and JSON I/O of AttrDict."""
    a = tools.AttrDict(value=1)
    assert a.value == 1
    assert 'value' in list(a.keys())
    assert 1 in (a.values())

    # attribute and item access are interchangeable
    a.description = 'test'
    assert a['description'] == 'test'

    a['output'] = 'txt'
    assert a.output == 'txt'

    # nested dictionaries are reachable as nested attributes
    d = {'a': {'b': 1}, 'aa': 2}
    ad = tools.AttrDict(**d)
    assert ad.a.b == 1
    ad.a.b = 2
    assert ad.a.b == 2

    ad['d'] = 4
    assert ad.d == 4

    # update() is expected to reject the integer 1. The failure is raised
    # from the ``else`` branch so that it cannot be swallowed by the
    # ``except`` clause.
    try:
        ad.update(1)
    except Exception:
        pass
    else:
        raise AssertionError("AttrDict.update(1) should raise")

    # check json capabilities
    fh = TempFile()
    try:
        ad.to_json()
        ad.to_json(filename=fh.name)
        ad.from_json(fh.name)
    finally:
        fh.delete()
def test_add_locus_with_modification():
    """Check add_locus_in_fasta when the GBK locus name differs from the FASTA.

    A copy of the GBK file is written with the locus renamed to
    ``DUMMY_JB409847``; the FASTA produced by ``add_locus_in_fasta`` must
    then contain the ``DUMMY`` tag.
    """
    # Alter the original GBK to alter the locus name
    with open(sequana_data("JB409847.gbk"), "r") as fin:
        data = fin.read()
    newdata = data.replace("JB409847", "DUMMY_JB409847")

    fh = TempFile(suffix="gbk")
    try:
        with open(fh.name, 'w') as fout:
            fout.write(newdata)

        # Now we read this new GBK file that has a different locus name as
        # compared to the fasta
        mydata = snpeff.SnpEff(reference=fh.name)

        # Here is the corresponding FASTA
        fasta = sequana_data("JB409847.fasta")

        with TempFile(suffix="fasta") as fh2:
            mydata.add_locus_in_fasta(fasta, fh2.name)

            # cleanup: the config file is optional
            try:
                os.remove("snpEff.config")
            except OSError:
                pass

            # In theory, in the newly created fasta file, we should find
            # back the DUMMY tag. Read it while the temp file still exists.
            with open(fh2.name, "r") as fin:
                data = fin.read()
            assert "DUMMY" in data
    finally:
        fh.delete()
def test_attrdict():
    """Check attribute/item access, nested access and JSON I/O of AttrDict."""
    a = tools.AttrDict(value=1)
    assert a.value == 1
    assert 'value' in list(a.keys())
    assert 1 in (a.values())

    # attribute and item access are interchangeable
    a.description = 'test'
    assert a['description'] == 'test'

    a['output'] = 'txt'
    assert a.output == 'txt'

    # nested dictionaries are reachable as nested attributes
    d = {'a': {'b': 1}, 'aa': 2}
    ad = tools.AttrDict(**d)
    assert ad.a.b == 1
    ad.a.b = 2
    assert ad.a.b == 2

    ad['d'] = 4
    assert ad.d == 4

    # update() is expected to reject the integer 1. The failure is raised
    # from the ``else`` branch so that it cannot be swallowed by the
    # ``except`` clause.
    try:
        ad.update(1)
    except Exception:
        pass
    else:
        raise AssertionError("AttrDict.update(1) should raise")

    # check json capabilities
    fh = TempFile()
    try:
        ad.to_json()
        ad.to_json(filename=fh.name)
        ad.from_json(fh.name)
    finally:
        fh.delete()
def test_yeast_annotations():
    """Build yeast annotations, export them and reload them from a pickle."""
    from easydev import gsf

    filename = gsf('msdas', "data", "YEAST_raw_sample.csv")
    r = MassSpecReader(filename, verbose=verbose)
    a = AnnotationsYeast(r, verbose=verbose)

    # 200 is enough to get gene name cases and ambiguous gene names cases
    # e.g., ALD3_YEAST ['P54114', 'P40047']
    # NOTE(review): ``.ix`` no longer exists in pandas >= 1.0. Replace it
    # with ``.loc`` or ``.iloc`` once the index type of ``a.df`` is confirmed.
    a.df = a.df.ix[0:200]
    a.get_uniprot_entries()
    a.update_mapping()
    a.set_annotations()
    a.annotations.Sequence

    t = TempFile()
    try:
        a.to_csv(t.name)
    finally:
        t.delete()

    try:
        a.to_pickle("test", overwrite=True)

        # Writing again without overwrite must be refused.
        try:
            a.to_pickle("test", overwrite=False)
        except IOError:
            pass
        else:
            raise AssertionError(
                "to_pickle(overwrite=False) should raise IOError")

        a.read_pickle("YEAST_annotations_test.pkl")

        # create constructor given the annotations
        a = AnnotationsYeast(r, verbose=verbose,
                             annotations="YEAST_annotations_test.pkl")
        a.get_uniprot_entries()  # populate entry and entry_names in the df
        a.plot_goid_histogram()
    finally:
        # cleanup: the pickle may be missing if the first export failed
        try:
            os.remove("YEAST_annotations_test.pkl")
        except OSError:
            pass
def test_pacbio_input_bam(tmpdir):
    """Build the PacBio input-BAM module from a summary with mocked images."""
    import json
    import os

    from sequana.utils import config

    # we need a summary and a bunch of images
    filename = sequana_data("summary_pacbio_qc1.json")
    with open(filename) as fin:
        summary = json.load(fin)

    # mock the image files found in the summary
    pngname = sequana_data("no_data.jpg")
    for image_key in ("gc_vs_length", "hist_gc_content", "hist_read_length",
                      "hist_snr", "hist_zmw"):
        summary["images"][image_key] = pngname

    summary_file = TempFile()
    # we also need an output handler
    ff = TempFile()
    try:
        with open(summary_file.name, "w") as fout:
            json.dump(summary, fout)

        # Now that we have this new summary file, let us use it. The module
        # receives the output directory and the bare file name separately,
        # so split the temp path instead of assuming it lives in /tmp.
        # (The previous ``ff.name.split("/")[1]`` returned "tmp", not the
        # file name.)
        output_dir, output_name = os.path.split(ff.name)
        config.output_dir = output_dir
        pacbio_input_bam.PacbioInputBAMModule(summary_file.name, output_name)
    finally:
        # cleanup
        summary_file.delete()
        ff.delete()
def test_sequana_config():
    """Check SequanaConfig constructors and the save/reload round trip."""
    s = snaketools.Module("compressor")
    config = snaketools.SequanaConfig(s.config)

    assert config.config.get("compressor")["source"] == "fastq.gz"
    assert config.config.get("kraken:dummy") is None

    # --------------------------------- tests different constructors
    config = snaketools.SequanaConfig()
    config = snaketools.SequanaConfig({"test": 1})
    assert config.config.test == 1
    # with a dictionary
    config = snaketools.SequanaConfig(config.config)
    # with a sequanaConfig instance
    config = snaketools.SequanaConfig(config)

    # with a non-yaml file. The failure is raised from the ``else`` branch
    # so that it cannot be swallowed by the ``except`` clause.
    json_filename = sequana_data('test_summary_fastq_stats.json')
    try:
        snaketools.SequanaConfig(json_filename)
    except Exception:
        pass
    else:
        raise AssertionError("SequanaConfig should reject a non-yaml file")

    # with a string that is not an existing file
    try:
        snaketools.SequanaConfig("dummy_dummy")
    except Exception:
        pass
    else:
        raise AssertionError("SequanaConfig('dummy_dummy') should raise")

    # Test an exception
    s = snaketools.Module("compressor")
    config = snaketools.SequanaConfig(s.config)
    config._recursive_update(config._yaml_code,
                             {"input_directory_dummy": "test"})

    #config.check_config_with_schema(s.schema_config)

    # loop over all pipelines, read the config, save it and check the content
    # is identical. This requires to remove the templates. We want to make
    # sure the empty strings are kept and that "no value" are kept as well
    #
    #    field1: ""
    #    field2:
    #
    # is unchanged
    from easydev import TempFile

    output = TempFile(suffix=".yaml")
    try:
        for pipeline in snaketools.pipeline_names:
            config_filename = Module(pipeline)._get_config()
            cfg1 = SequanaConfig(config_filename)
            cfg1.cleanup()  # remove templates and strip strings

            cfg1.save(output.name)
            cfg2 = SequanaConfig(output.name)
            assert cfg2._yaml_code == cfg1._yaml_code
            cfg2._update_config()
            assert cfg1.config == cfg2.config
    finally:
        output.delete()
def test_input():
    """Call the fastq_head entry point with named, then positional, arguments."""
    from easydev import TempFile

    target = TempFile(suffix=".fastq.gz")
    source = sequana_data('Hm2_GTGAAA_L005_R2_001.fastq.gz')

    named_form = [prog, '--input', source, '--nlines', "100",
                  "--output", target.name]
    positional_form = [prog, source, "100", target.name]

    # Same request expressed in the two supported argument styles.
    for argv in (named_form, positional_form):
        fastq_head.main(argv)

    target.delete()
def score_sc2(self, prediction_file):
    """Run DREAM_Olfaction_scoring_Q2.pl and return the first row of its output.

    :param prediction_file: path of the prediction file handed to the Perl
        script.
    :return: the first row of the tab-separated file written by the script,
        as a pandas Series.
    """
    fh = TempFile()
    try:
        # only the second item returned by download_gs() is used here
        _, gs2 = self.download_gs()
        script = self.classpath + os.sep + "DREAM_Olfaction_scoring_Q2.pl"
        cmd = "perl %s %s %s %s" % (script, prediction_file, fh.name, gs2)
        shellcmd(cmd)
        # ``.ix`` was removed from pandas; ``.iloc[0]`` selects the first row.
        df = pd.read_csv(fh.name, sep='\t', index_col=None).iloc[0]
    finally:
        # remove the temp file even when the script or the parsing fails
        fh.delete()
    return df
def test_sequana_config():
    """Check SequanaConfig constructors and the save/reload round trip."""
    s = snaketools.Module("quality_control")
    config = snaketools.SequanaConfig(s.config)

    assert config.config.get("kraken:dummy", "test") == "test"
    assert config.config.get("kraken:dummy") is None

    # --------------------------------- tests different constructors
    config = snaketools.SequanaConfig()
    config = snaketools.SequanaConfig({"test": 1})
    assert config.config.test == 1
    # with a dictionary
    config = snaketools.SequanaConfig(config.config)
    # with a sequanaConfig instance
    config = snaketools.SequanaConfig(config)

    # with a non-yaml file. The failure is raised from the ``else`` branch
    # so that it cannot be swallowed by the ``except`` clause.
    json_filename = sequana_data('test_summary_fastq_stats.json')
    try:
        snaketools.SequanaConfig(json_filename)
    except Exception:
        pass
    else:
        raise AssertionError("SequanaConfig should reject a non-yaml file")

    # with a string that is not an existing file
    try:
        snaketools.SequanaConfig("dummy_dummy")
    except Exception:
        pass
    else:
        raise AssertionError("SequanaConfig('dummy_dummy') should raise")

    # Test an exception
    s = snaketools.Module("quality_control")
    config = snaketools.SequanaConfig(s.config)
    config._recursive_update(config._yaml_code,
                             {"input_directory_dummy": "test"})

    # loop over all pipelines, read the config, save it and check the content
    # is identical. This requires to remove the templates. We want to make
    # sure the empty strings are kept and that "no value" are kept as well
    #
    #    field1: ""
    #    field2:
    #
    # is unchanged
    from easydev import TempFile

    output = TempFile(suffix=".yaml")
    try:
        for pipeline in snaketools.pipeline_names:
            config_filename = Module(pipeline)._get_config()
            cfg1 = SequanaConfig(config_filename)
            cfg1.cleanup()  # remove templates and strip strings

            cfg1.save(output.name)
            cfg2 = SequanaConfig(output.name)
            assert cfg2._yaml_code == cfg1._yaml_code
            cfg2._update_config()
            assert cfg1.config == cfg2.config
    finally:
        output.delete()
def sbmlqual_from_datasets(identifier):
    """Round-trip ``PKN-<identifier>.sif`` through SBML-qual and compare.

    The model reloaded from the exported SBML-qual file must equal the
    original SIF model, and both must equal the one read from
    ``PKN-<identifier>.xml``.
    """
    # start from an empty model that will receive the exported content
    reloaded = SIF()
    stem = "PKN-" + identifier
    from_sif = SIF(cnodata(stem + ".sif"))

    exported = TempFile()
    from_sif.to_sbmlqual(exported.name)
    reloaded.read_sbmlqual(exported.name)
    exported.delete()

    assert reloaded == from_sif

    from_xml = SIF(cnodata(stem + ".xml"))
    assert reloaded == from_xml
    assert from_sif == from_xml
def test_phosphogrid():
    """Run PhosphoGRID on the proteins of the small yeast data set."""
    reader = MassSpecReader(get_yeast_small_data(), verbose=False)
    proteins = list(reader.df.Protein)
    gene_names = set(proteins)

    grid = phosphogrid.PhosphoGRID(directory="../share/data")
    grid.run(gene_names=gene_names)

    # export to a temporary SIF file and draw the result
    sif_file = TempFile(suffix='.sif')
    grid.export2sif(filename=sif_file.name)
    grid.plot()
    sif_file.delete()
def score_A(self, filename):
    """Score a prediction file with weighted_average_concordance_index.pl.

    :param filename: path of the prediction file handed to the Perl script.
    :return: dictionary with a single ``'Results'`` key holding a pandas
        Series: the per-column means of the script output (without the
        z-scores), the weighted average c-index and its p-value.

    Calls ``sys.exit(1)`` when the output of the script cannot be read.
    """
    import sys

    from easydev import TempFile

    script = self._pj(
        [self.classpath, 'weighted_average_concordance_index.pl'])
    datadir = self._pj([self.classpath, 'data'])

    fh = TempFile()
    try:
        cmd = "perl %s %s %s %s" % (script, filename, datadir, fh.name)
        shellcmd(cmd, verbose=True, ignore_errors=True)

        try:
            df = pd.read_csv(fh.name, sep='\t', header=None)
        except Exception:
            print("Something wrong in the Scoring while executing \n %s. " % cmd)
            print(
                "\n The D7C4 challenge requires a Perl package to be installed"
            )
            print("See D7C4 documentation e.g., on dreamtools.readthedocs.org")
            sys.exit(1)
    finally:
        # remove the temp file on every path, including the sys.exit one
        fh.delete()

    df.columns = [
        'DrugID', 'probabilistic c-index', 'weighted probabilistic c-index',
        'zscores'
    ]
    df = df.set_index('DrugID')

    # ``.ix`` was removed from pandas; these are label lookups, so ``.loc``.
    column_sums = df.sum()
    ws = (column_sums / column_sums.loc['zscores'])
    ws = ws.loc['weighted probabilistic c-index']

    results = df.mean()
    results['weight average probabilistic c-index'] = ws
    del results['zscores']

    # Finally compute pvalues based on precomputed scores
    precomp = pd.read_csv(self._pj([
        self.classpath, 'data', 'DREAM7_DrugSensitivity1_drug_zscores.txt'
    ]), sep='\t', skiprows=6, header=None)
    # row 31 of the precomputed table holds the overall mean (column 1)
    # and the overall variance (column 2)
    overall_row = precomp.loc[31]
    overall_mean = overall_row[1]
    overall_var = overall_row[2]

    pval = 1 - (.5 * (math.erf(
        (ws - overall_mean) / (math.sqrt(2 * overall_var))) + 1))
    results['weight average probabilistic c-index p-value'] = pval
    return {'Results': results}
def test_read_ic50():
    """Exercise the IC50 reader: basic API, duplicated columns, header formats."""
    # -------------------------------- functionalities
    r = IC50(ic50_test)
    # we can also instantiate from an existing instance
    r = IC50(r)

    # test repr (a bare ``r`` statement does not call it)
    repr(r)
    # and print statement
    print(r)

    # the copy method
    assert r == r.copy()

    r.hist()
    r.plot_ic50_count()
    r.cosmicIds

    f = TempFile()
    try:
        r.to_csv(f.name)
    finally:
        f.delete()

    # columns may be duplicated
    r = IC50(ic50_test)
    df = pd.concat([r.df, r.df[999]], axis=1)
    # Creating a new instance from it should raise an error. The failure is
    # raised from the ``else`` branch so the ``except`` cannot swallow it.
    try:
        IC50(df)
    except Exception:
        pass
    else:
        raise AssertionError("IC50 should reject duplicated columns")

    # ---------------------------------------- different IC50 formats
    # test all files available
    # NOTE(review): if ``location`` is a full path, this prefix test never
    # matches; confirm whether ``key.startswith`` was intended.
    for key in testing.keys():
        filename = testing[key].location
        if filename.startswith('ic50_test'):
            ic = IC50(filename)

    # some specific checks:
    #ic = IC50(testing['ic50_test_header_drug_prefix_only'].location)
    #assert ic.df.shape == (2,2)
    #assert all(ic.df.columns == ['1','2'])
    ic = IC50(testing['ic50_test_header_no_drug_prefix'].location)
    assert ic.drugIds == [1, 2]

    ic = IC50(testing['ic50_test_header_drug_prefix_only'].location)
    assert ic.drugIds == [1, 2]

    ic = IC50(testing['ic50_test_header_mixed_drug_prefix'].location)
    assert ic.drugIds == [1, 2]
def test_read_ic50():
    """Exercise the IC50 reader: basic API, duplicated columns, header formats."""
    # -------------------------------- functionalities
    r = IC50(ic50_test)
    # we can also instantiate from an existing instance
    r = IC50(r)

    # test repr (a bare ``r`` statement does not call it)
    repr(r)
    # and print statement
    print(r)

    # the copy method
    assert r == r.copy()

    r.hist()
    r.plot_ic50_count()
    r.cosmicIds

    f = TempFile()
    try:
        r.to_csv(f.name)
    finally:
        f.delete()

    # columns may be duplicated
    r = IC50(ic50_test)
    df = pd.concat([r.df, r.df[999]], axis=1)
    # Creating a new instance from it should raise an error. The failure is
    # raised from the ``else`` branch so the ``except`` cannot swallow it.
    try:
        IC50(df)
    except Exception:
        pass
    else:
        raise AssertionError("IC50 should reject duplicated columns")

    # ---------------------------------------- different IC50 formats
    # test all files available
    # NOTE(review): if ``location`` is a full path, this prefix test never
    # matches; confirm whether ``key.startswith`` was intended.
    for key in testing.keys():
        filename = testing[key].location
        if filename.startswith('ic50_test'):
            ic = IC50(filename)

    # some specific checks:
    #ic = IC50(testing['ic50_test_header_drug_prefix_only'].location)
    #assert ic.df.shape == (2,2)
    #assert all(ic.df.columns == ['1','2'])
    ic = IC50(testing['ic50_test_header_no_drug_prefix'].location)
    assert ic.drugIds == [1, 2]

    ic = IC50(testing['ic50_test_header_drug_prefix_only'].location)
    assert ic.drugIds == [1, 2]

    ic = IC50(testing['ic50_test_header_mixed_drug_prefix'].location)
    assert ic.drugIds == [1, 2]
def test_d2c1():
    """Score the D2C1 template and exercise template creation and comparison."""
    challenge = D2C1()
    challenge.test()

    # the downloaded template must score the reference AUPR
    template = challenge.download_template()
    scores = challenge.score(template)
    assert_almost_equal(scores['AUPR'], 0.2563463, 7)

    # template creation into a throw-away file
    from easydev import TempFile
    scratch = TempFile()
    challenge._create_templates(filename=scratch.name)
    scratch.delete()

    challenge.score_and_compare_with_lb(challenge.download_template())
def score_sc1(self, prediction_file):
    """Compute all results and compare user prediction with all
    official participants

    This scoring function can take a long time (about 5-10 minutes).

    :param prediction_file: path of the prediction file handed to the
        DREAM_Olfaction_scoring_Q1.pl script.
    :return: the first row of the tab-separated file written by the script,
        as a pandas Series.
    """
    fh = TempFile()
    try:
        # only the first item returned by download_gs() is used here
        gs1, _ = self.download_gs()
        script = self.classpath + os.sep + "DREAM_Olfaction_scoring_Q1.pl"
        cmd = "perl %s %s %s %s" % (script, prediction_file, fh.name, gs1)
        shellcmd(cmd)
        # ``.ix`` was removed from pandas; ``.iloc[0]`` selects the first row.
        df = pd.read_csv(fh.name, sep='\t', index_col=None).iloc[0]
    finally:
        # remove the temp file even when the script or the parsing fails
        fh.delete()
    return df
def test_read_write_from_cnograph():
    """Round-trip PKN-ToyPB through SBML-qual, without and with AND gates."""
    # plain graph
    original = CNOGraph(cnodata("PKN-ToyPB.sif"))
    xml_file = TempFile(suffix='.xml')
    original.to_sbmlqual(xml_file.name)
    restored = CNOGraph(xml_file.name)
    assert original == restored
    xml_file.delete()

    # same graph after expanding the AND gates
    original = CNOGraph(cnodata("PKN-ToyPB.sif"))
    original.expand_and_gates()
    xml_file = TempFile(suffix='.xml')
    original.to_sbmlqual(xml_file.name)
    restored = CNOGraph(xml_file.name)
    xml_file.delete()
    assert original == restored
def test_simple_sbmlqual():
    """Round-trip a small hand-built graph through SBML-qual.

    The graph mixes a simple OR, a simple link, a mix of OR and AND, and a
    single AND.
    """
    graph = CNOGraph()
    for reaction in ("!A=C", "C=D", "B=C"):
        graph.add_reaction(reaction)
    graph.expand_and_gates()
    # these reactions are added after the expansion on purpose
    for reaction in ("a1=b", "a2=b", "D^b=E"):
        graph.add_reaction(reaction)

    xml_file = TempFile(suffix='.xml')
    graph.to_sbmlqual(xml_file.name)
    restored = CNOGraph(xml_file.name)
    xml_file.delete()
    assert graph == restored
def test_MSReader():
    """Exercise MassSpecReader construction, attributes, indexing and export.

    Each "must fail" check raises its failure from the ``else`` branch so
    that the ``except`` clause cannot swallow it.
    """
    # we can just create an instance
    r = MassSpecReader(verbose=verbose)

    # fails if wrong file
    try:
        MassSpecReader("dummy.csv", verbose=verbose)
    except Exception:
        pass
    else:
        raise AssertionError("MassSpecReader('dummy.csv') should raise")

    filename = yeast.get_yeast_filenames()[0]
    r = MassSpecReader(filename, verbose=verbose)
    print(r)

    # attribute access only
    r.mode
    r.N
    r.df
    r.measurements
    r.metadata

    # setting the mode to None is expected to be refused
    try:
        r.mode = None
    except Exception:
        pass
    else:
        raise AssertionError("setting mode to None should raise")

    r.sort_psites_ors_only()

    # supported indexing forms
    r['DIG1']
    r['DIG1', "S142"]
    r['DIG1_S142']

    # a three-item key is expected to be refused
    try:
        r['DIG1', 'S142', 'dummy']
    except Exception:
        pass
    else:
        raise AssertionError("a three-item key should raise")

    r.sequences
    r.psites

    from easydev import TempFile
    f = TempFile()
    try:
        r.to_csv(f.name)
    finally:
        f.delete()
def test_yeast_june():
    """Run the "june" cleanup/export chain on the module-level ``data`` object."""
    from easydev import TempFile

    scaffold = gsf("msdas", "data", "PKN-yeastScaffold.sif")
    data.cleanup_june()
    # the export is expected to return exactly three items
    _c, _m, _e = data.export_pkn_and_midas_june(scaffold)

    # MIDAS export into a throw-away file
    midas_file = TempFile()
    data.to_midas(midas_file.name)
    midas_file.delete()

    # accessors and plots; the returned values are not checked
    data.get_cv()
    data.get_midas()
    data.pcolor_na()
    data.plot_timeseries("DIG1_S126+S127")
def score_A(self, filename):
    """Score a prediction file with weighted_average_concordance_index.pl.

    :param filename: path of the prediction file handed to the Perl script.
    :return: dictionary with a single ``'Results'`` key holding a pandas
        Series: the per-column means of the script output (without the
        z-scores), the weighted average c-index and its p-value.

    Calls ``sys.exit(1)`` when the output of the script cannot be read.
    """
    import sys

    from easydev import TempFile

    script = self._pj([self.classpath,
                       'weighted_average_concordance_index.pl'])
    datadir = self._pj([self.classpath, 'data'])

    fh = TempFile()
    try:
        cmd = "perl %s %s %s %s" % (script, filename, datadir, fh.name)
        shellcmd(cmd, verbose=True, ignore_errors=True)

        try:
            df = pd.read_csv(fh.name, sep='\t', header=None)
        except Exception:
            print("Something wrong in the Scoring while executing \n %s. " % cmd)
            print("\n The D7C4 challenge requires a Perl package to be installed")
            print("See D7C4 documentation e.g., on dreamtools.readthedocs.org")
            sys.exit(1)
    finally:
        # remove the temp file on every path, including the sys.exit one
        fh.delete()

    df.columns = ['DrugID', 'probabilistic c-index',
                  'weighted probabilistic c-index', 'zscores']
    df = df.set_index('DrugID')

    # ``.ix`` was removed from pandas; these are label lookups, so ``.loc``.
    column_sums = df.sum()
    ws = (column_sums / column_sums.loc['zscores'])
    ws = ws.loc['weighted probabilistic c-index']

    results = df.mean()
    results['weight average probabilistic c-index'] = ws
    del results['zscores']

    # Finally compute pvalues based on precomputed scores
    precomp = pd.read_csv(
        self._pj([self.classpath, 'data',
                  'DREAM7_DrugSensitivity1_drug_zscores.txt']),
        sep='\t', skiprows=6, header=None)
    # row 31 of the precomputed table holds the overall mean (column 1)
    # and the overall variance (column 2)
    overall_row = precomp.loc[31]
    overall_mean = overall_row[1]
    overall_var = overall_row[2]

    pval = 1 - (.5 * (math.erf((ws - overall_mean) /
                               (math.sqrt(2 * overall_var))) + 1))
    results['weight average probabilistic c-index p-value'] = pval
    return {'Results': results}
def test_config_parser():
    """Check CNOConfigParser equality and the save/reload round trip."""
    from easydev import TempFile

    def _make_parser():
        # Sections are created GA first, General second, and added in the
        # opposite order.
        ga = ParamsGA()
        general = ParamsGeneral()
        parser = CNOConfigParser()
        parser.add_section(general)
        parser.add_section(ga)
        return parser

    # two parsers built the same way compare equal
    c1 = _make_parser()
    c2 = _make_parser()
    assert c1 == c2

    # and so does a parser reloaded from the saved file
    saved = TempFile()
    c1.save(saved.name)
    c2 = CNOConfigParser(saved.name)
    saved.delete()
    assert c1 == c2
def test_models():
    """Build Models from a dataframe, from another instance and from a CSV file.

    :return: the ``(m1, m2)`` pair, built from the dataframe and from ``m1``.
    """
    data = np.array([
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1],
        [1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1],
    ])
    columns = [u'EGF=PI3K', u'TNFa=PI3K', u'Jnk=cJun', u'PI3K=Akt',
               u'Raf=Mek', u'!Akt=Mek', u'Mek=p90RSK', u'Mek=Erk',
               u'Erk=Hsp27', u'TNFa=Jnk', u'TNFa=NFkB', u'TNFa=Hsp27',
               u'EGF=Raf', u'EGF^TNFa=PI3K', u'Raf^!Akt=Mek',
               u'Erk^TNFa=Hsp27']
    df = pd.DataFrame(data, columns=columns)

    fh = TempFile()
    try:
        df.to_csv(fh.name)
        m1 = Models(df)
        m2 = Models(m1)
        # there is an index column with no name
        m3 = Models(fh.name, index_col=0)
    finally:
        fh.delete()

    # An invalid constructor argument is expected to be refused. The failure
    # is raised from the ``else`` branch so the ``except`` cannot swallow it.
    try:
        Models(1)
    except Exception:
        pass
    else:
        raise AssertionError("Models(1) should raise")

    return m1, m2

    # NOTE(review): everything below is unreachable because of the ``return``
    # above. It is kept as in the original; decide whether the early return
    # or these checks should go.
    assert m1 == m2
    assert m1 == m3

    # plots
    m1.plot()
    m1.plot(1)
    m1.plot('cv')
    m1.errorbar()
    m1.heatmap()

    # exporters
    fh = TempFile()
    m1.to_csv(fh.name)
    fh.delete()

    fh = TempFile()
    m1.to_sif(fh.name)
    fh.delete()

    # m1 and m2 are identical. Adding them gets rid of duplicates so it
    # should be equal to itself.
    m1 == m1 + m2
def score_A(self, filename):
    """Score a prediction file with weighted_average_concordance_index.pl.

    :param filename: path of the prediction file handed to the Perl script.
    :return: dictionary with a single ``'Results'`` key holding a pandas
        Series: the per-column means of the script output (without the
        z-scores), the weighted average c-index and its p-value.
    """
    from easydev import TempFile

    script = self._pj([self._path2data,
                       'weighted_average_concordance_index.pl'])
    datadir = self._pj([self._path2data, 'data'])

    fh = TempFile()
    try:
        cmd = "perl %s %s %s %s" % (script, filename, datadir, fh.name)
        shellcmd(cmd, verbose=True, ignore_errors=True)
        df = pd.read_csv(fh.name, sep='\t', header=None)
    finally:
        # remove the temp file even when the script output cannot be read
        fh.delete()

    df.columns = ['DrugID', 'probabilistic c-index',
                  'weighted probabilistic c-index', 'zscores']
    df = df.set_index('DrugID')

    # ``.ix`` was removed from pandas; these are label lookups, so ``.loc``.
    column_sums = df.sum()
    ws = (column_sums / column_sums.loc['zscores'])
    ws = ws.loc['weighted probabilistic c-index']

    results = df.mean()
    # NOTE(review): "probabilitis" is misspelled, but the key is part of the
    # returned data and is kept as is for callers that may rely on it.
    results['weight average probabilitis c-index'] = ws
    del results['zscores']

    # Finally compute pvalues based on precomputed scores
    precomp = pd.read_csv(
        self._pj([self._path2data, 'data',
                  'DREAM7_DrugSensitivity1_drug_zscores.txt']),
        sep='\t', skiprows=6, header=None)
    # row 31 of the precomputed table holds the overall mean (column 1)
    # and the overall variance (column 2)
    overall_row = precomp.loc[31]
    overall_mean = overall_row[1]
    overall_var = overall_row[2]

    pval = 1 - (.5 * (math.erf((ws - overall_mean) /
                               (math.sqrt(2 * overall_var))) + 1))
    results['weight average probabilitis c-index p-value'] = pval
    return {'Results': results}