def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh, min_id, min_length, breaklen, use_qry_length, check_flanking):
    '''Runs nucmer with qry_fasta against ref_fasta and chooses the best match.

    The nucmer coords file is written into a temporary directory made in the
    current working directory. That directory is removed before returning,
    and also when nucmer or the loading of its output raises.

    Returns a tuple (best_hit, nucmer_matches). When nothing was loaded
    from the coords file, returns (None, {}).
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.closest_nucmer_match.', dir=os.getcwd())
    try:
        coords_file = os.path.join(tmpdir, 'nucmer_vs_cluster_refs.coords')
        pymummer.nucmer.Runner(
            ref_fasta,
            qry_fasta,
            coords_file,
            min_id=min_id,
            min_length=min_length,
            breaklen=breaklen,
            maxmatch=True,
        ).run()
        nucmer_matches = RefSeqChooser._load_nucmer_coords_file(coords_file, log_fh=log_fh)
    finally:
        # Without this, a failed nucmer run left tmp.closest_nucmer_match.* behind
        common.rmtree(tmpdir)

    if len(nucmer_matches) == 0:
        return None, {}

    best_hit = RefSeqChooser._choose_best_nucmer_match(
        nucmer_matches,
        use_qry_length=use_qry_length,
        check_flanking=check_flanking,
    )
    return best_hit, nucmer_matches
def test_assemble_with_fermilite(self):
    '''test _assemble_with_fermilite'''
    reads1 = os.path.join(data_dir, 'assembly_assemble_with_fermilite.reads_1.fq')
    reads2 = os.path.join(data_dir, 'assembly_assemble_with_fermilite.reads_2.fq')
    expected_log = os.path.join(data_dir, 'assembly_assemble_with_fermilite.expected.log')
    expected_fa = os.path.join(data_dir, 'assembly_assemble_with_fermilite.expected.fa')
    tmp_dir = 'tmp.test_assemble_with_fermilite'
    tmp_log = 'tmp.test_assemble_with_fermilite.log'

    # The handle is given to Assembly, so it stays open while the assembler
    # runs. Using "with" means it is closed even when the assembly raises.
    with open(tmp_log, 'w') as tmp_log_fh:
        print('First line', file=tmp_log_fh)
        a = assembly.Assembly(reads1, reads2, 'not needed', 'not needed', tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', tmp_log_fh, 'not needed')
        a._assemble_with_fermilite()
        self.assertTrue(a.assembled_ok)

    # Log must be closed (flushed) before comparing it with the expected one
    self.assertTrue(filecmp.cmp(expected_log, tmp_log, shallow=False))
    self.assertTrue(filecmp.cmp(expected_fa, os.path.join(tmp_dir, 'debug_all_contigs.fa'), shallow=False))
    common.rmtree(tmp_dir)
    os.unlink(tmp_log)
def test_load_fasta_files_and_write_clusters_file(self):
    '''test _load_fasta_files_and_write_clusters_file'''
    indir = os.path.join(data_dir, 'pubmlst_ref_prepare.test_load_fa_and_clusters.in')
    outdir = 'tmp.test.pubmlst_ref_prepare.test_load_fa_and_clusters'
    # A directory left by an earlier failed run would make os.mkdir raise,
    # so clear it first and register the removal for every exit path.
    common.rmtree(outdir)
    os.mkdir(outdir)
    self.addCleanup(common.rmtree, outdir)

    r_prep = pubmlst_ref_preparer.PubmlstRefPreparer('species', outdir)
    profile_file = os.path.join(indir, 'profile.txt')
    r_prep.profile = mlst_profile.MlstProfile(profile_file)
    r_prep._load_fasta_files_and_write_clusters_file(indir)

    expected_cluster_tsv = os.path.join(data_dir, 'pubmlst_ref_prepare.test_load_fa_and_clusters.expect.tsv')
    self.assertTrue(filecmp.cmp(expected_cluster_tsv, r_prep.clusters_file, shallow=False))

    expected_fasta_files = [os.path.join(indir, x) for x in ['gene1.tfa', 'gene2.tfa']]
    self.assertEqual(expected_fasta_files, r_prep.fasta_files)

    expected_seqs = {
        'gene1': {
            'gene1_1': pyfastaq.sequences.Fasta('gene1_1', 'ACGT'),
            'gene1_2': pyfastaq.sequences.Fasta('gene1_2', 'AAAA'),
        },
        'gene2': {
            'gene2_1': pyfastaq.sequences.Fasta('gene2_1', 'GGGG'),
            'gene2_2': pyfastaq.sequences.Fasta('gene2_2', 'TTTT'),
        },
    }
    self.assertEqual(expected_seqs, r_prep.sequences)
def run(self, reads_out1, reads_out2):
    '''Writes the reads of this cluster that cd-hit-est-2d matches to the references.

    All reads of the cluster are first written as FASTA to a temporary
    directory (made in the current working directory) and compared with
    self.references_fa. The IDs taken from the resulting .clstr file are
    then used to write the wanted reads to reads_out1 and reads_out2.
    The temporary directory is removed even if one of the steps raises.

    Returns a tuple (total_reads, total_bases) as returned by
    self.readstore.get_reads.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.filter_reads.', dir=os.getcwd())
    try:
        all_reads_fasta = os.path.join(tmpdir, 'all_reads_for_cdhit.fa')
        self.readstore.get_reads(self.cluster_name, all_reads_fasta, fasta=True, log_fh=self.log_fh)

        cdhit_out = os.path.join(tmpdir, 'cdhit')
        ReadFilter._run_cdhit_est_2d(
            self.references_fa,
            all_reads_fasta,
            cdhit_out,
            self.extern_progs.exe('cdhit2d'),
            verbose=True,
            verbose_fh=self.log_fh,
        )

        wanted_read_ids = ReadFilter._cdhit_clstr_to_reads(cdhit_out + '.clstr')
        total_reads, total_bases = self.readstore.get_reads(
            self.cluster_name,
            reads_out1,
            out2=reads_out2,
            log_fh=self.log_fh,
            wanted_ids=wanted_read_ids,
        )
    finally:
        # Previously a failing cd-hit run left tmp.filter_reads.* behind
        common.rmtree(tmpdir)

    return total_reads, total_bases
def test_assemble_with_spades_fail(self):
    '''test _assemble_with_spades handles spades fail'''
    stem = 'assembly_test_assemble_with_spades_fails_reads_'
    fwd_reads = os.path.join(data_dir, stem + '1.fq')
    rev_reads = os.path.join(data_dir, stem + '2.fq')
    work_dir = 'tmp.test_assemble_with_spades_fail'
    log_name = 'tmp.test_assemble_with_spades_fail.log'

    # The log handle is passed to Assembly, so the assembler has to run
    # while the file is still open
    with open(log_name, 'w') as log_fh:
        print('First line', file=log_fh)
        common.rmtree(work_dir)
        asm = assembly.Assembly(
            fwd_reads,
            rev_reads,
            'not needed',
            'not needed',
            work_dir,
            'not_needed_for_this_test.fa',
            'not_needed_for_this_test.bam',
            log_fh,
            'not needed',
            assembler="spades",
            spades_options=" --only-assembler",
        )
        asm._assemble_with_spades()

    self.assertFalse(asm.assembled_ok)
    common.rmtree(work_dir)
    os.unlink(log_name)
def test_assemble_with_spades(self):
    '''test _assemble_with_spades'''
    stem = 'assembly_test_assemble_with_spades_reads_'
    fwd_reads = os.path.join(data_dir, stem + '1.fq')
    rev_reads = os.path.join(data_dir, stem + '2.fq')
    work_dir = 'tmp.test_assemble_with_spades'
    log_name = 'tmp.test_assemble_with_spades.log'

    # The log handle is passed to Assembly, so the assembler has to run
    # while the file is still open
    with open(log_name, 'w') as log_fh:
        print('First line', file=log_fh)
        common.rmtree(work_dir)
        # spades_options=" --only-assembler" is used because error correction
        # cannot determine the quality offset on this artificial dataset
        asm = assembly.Assembly(
            fwd_reads,
            rev_reads,
            'not needed',
            'not needed',
            work_dir,
            'not_needed_for_this_test.fa',
            'not_needed_for_this_test.bam',
            log_fh,
            'not needed',
            assembler="spades",
            spades_options=" --only-assembler",
        )
        asm._assemble_with_spades()

    self.assertTrue(asm.assembled_ok)
    common.rmtree(work_dir)
    os.unlink(log_name)
def test_run_all_noncoding(self):
    '''test run with no metadata input, all sequences are noncoding'''
    fasta_in = [
        os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'),
        os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'),
        os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'),
    ]
    extern_progs = external_progs.ExternalProgs()
    refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, all_coding='no', genetic_code=1)

    tmp_out = 'tmp.ref_preparer_test_run'
    # Start from a clean slate and make sure the output directory is removed
    # on every exit path: previously one failed comparison left it behind.
    common.rmtree(tmp_out)
    self.addCleanup(common.rmtree, tmp_out)
    refprep.run(tmp_out)

    expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out')
    test_files = [
        '00.auto_metadata.tsv',
        '01.filter.check_metadata.tsv',
        '01.filter.check_genes.log',
        '01.filter.check_noncoding.log',
        '01.filter.check_metadata.log',
        '02.cdhit.all.fa',
        '02.cdhit.clusters.tsv',
        '02.cdhit.gene.fa',
        '02.cdhit.gene.varonly.fa',
        '02.cdhit.noncoding.fa',
        '02.cdhit.noncoding.varonly.fa',
    ]

    for filename in test_files:
        expected = os.path.join(expected_outdir, filename)
        got = os.path.join(tmp_out, filename)
        self.assertTrue(filecmp.cmp(expected, got, shallow=False))
def _run_cluster(obj, verbose, clean, fails_dir, remaining_clusters, remaining_clusters_lock):
    '''Runs one cluster object and returns it, whatever happened.

    Nothing is run if fails_dir already contains any entry. If obj.run()
    raises anything at all, the failure is reported on stderr and an empty
    file named after the cluster is made in fails_dir. When clean is true,
    obj.root_dir is deleted afterwards on a best-effort basis.
    '''
    if os.listdir(fails_dir):
        print('Other clusters failed. Will not start cluster', obj.name, file=sys.stderr)
        return obj

    if verbose:
        print('Start running cluster', obj.name, 'in directory', obj.root_dir, flush=True)

    try:
        obj.run(remaining_clusters=remaining_clusters, remaining_clusters_lock=remaining_clusters_lock)
    except BaseException:
        # Same reach as a bare except: every failure is turned into a marker file
        print('Failed cluster:', obj.name, file=sys.stderr)
        marker = os.path.join(fails_dir, obj.name)
        with open(marker, 'w'):
            pass

    if verbose:
        print('Finished running cluster', obj.name, 'in directory', obj.root_dir, flush=True)

    if not clean:
        return obj

    if verbose:
        print('Deleting cluster dir', obj.root_dir, flush=True)

    if os.path.exists(obj.root_dir):
        try:
            common.rmtree(obj.root_dir)
        except BaseException:
            # Deleting is best effort only; a failure here is ignored
            pass

    return obj
def _get_from_vfdb_common(self, outprefix, filename, info_text):
    '''Downloads filename from the VFDB website and extracts it to outprefix.fa/.tsv.

    The download goes into the directory outprefix.tmp.download, which must
    not exist already. Unless self.debug is set, that directory is removed
    afterwards, also when the download or the parsing fails, so that a
    later retry is not blocked by a leftover directory.

    info_text is accepted but not used here.

    Raises Error if the download directory cannot be made.
    '''
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'

    try:
        os.mkdir(tmpdir)
    except OSError as err:
        raise Error('Error mkdir ' + tmpdir) from err

    try:
        zip_path = os.path.join(tmpdir, filename)
        common.download_file(
            'http://www.mgc.ac.cn/VFs/Down/' + filename,
            zip_path,
            max_attempts=self.max_download_attempts,
            sleep_time=self.sleep_time,
            verbose=True,
        )
        print('Extracting files ... ', end='', flush=True)
        vparser = vfdb_parser.VfdbParser(zip_path, outprefix)
        vparser.run()
    finally:
        if not self.debug:
            common.rmtree(tmpdir)

    print('done')
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'

    print('Extracted core DNA sequence dataset and metadata. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
def _extract_files(cls, zip_file, outdir):
    '''Extracts the annotations, fasta and header mappings files from zip_file.

    outdir is made by this method and must not exist already. Only members
    whose names look like one of the three wanted files are extracted.

    Returns a dictionary with keys 'annotations', 'fasta' and
    'header_mappings'; each value is the name of the member inside the zip
    archive, which is also its path relative to outdir.

    Raises Error if outdir cannot be made, or if any of the three files is
    missing from the archive (in which case outdir is removed again).
    '''
    original_files = {'annotations': None, 'fasta': None, 'header_mappings': None}

    try:
        os.mkdir(outdir)
    except OSError as err:
        raise Error('Error making directory ' + outdir) from err

    # Context manager so that the archive handle is always closed
    with zipfile.ZipFile(zip_file) as zfile:
        for member in zfile.namelist():
            if '_annotations_' in member:
                original_files['annotations'] = member
            elif '_database_' in member and member.endswith('.fasta'):
                original_files['fasta'] = member
            elif '_header_mappings_' in member:
                original_files['header_mappings'] = member
            else:
                continue

            zfile.extract(member, path=outdir)

    if None in original_files.values():
        common.rmtree(outdir)
        raise Error('Error. Not all expected files found in downloaded megares zipfile. ' + str(original_files))

    return original_files
def test_extract_files_ok(self):
    '''test _extract_files when all ok'''
    zip_file = os.path.join(data_dir, 'megares_zip_parse_extract_files_ok.zip')
    tmp_dir = 'tmp.test_megares_extract_files_ok'
    # Clear any leftover from an earlier failed run, and make sure the
    # directory is removed on every exit path, including failed assertions.
    common.rmtree(tmp_dir)
    self.addCleanup(common.rmtree, tmp_dir)

    got = megares_zip_parser.MegaresZipParser._extract_files(zip_file, tmp_dir)
    common_dir = os.path.join('megares_zip_parse_extract_files_ok', 'megares_v1.01')
    expected = {
        'annotations': os.path.join(common_dir, 'megares_annotations_v1.01.csv'),
        'fasta': os.path.join(common_dir, 'megares_database_v1.01.fasta'),
        'header_mappings': os.path.join(common_dir, 'megares_to_external_header_mappings_v1.01.tsv'),
    }
    self.assertEqual(expected, got)

    for filename in expected.values():
        self.assertTrue(os.path.exists(os.path.join(tmp_dir, filename)))
def test_make_prepareref_dir(self):
    '''test make_prepareref_dir'''
    outdir = 'tmp.make_prepareref_dir'
    common.rmtree(outdir)
    tb.make_prepareref_dir(outdir)
    self.assertTrue(os.path.exists(outdir))
    # NOTE(review): this test used to build the path outdir/00.params.json
    # but never used it. Presumably a check that the file exists was
    # intended - confirm what make_prepareref_dir writes before adding one.
    common.rmtree(outdir)
def _get_from_argannot(self, outprefix):
    '''Downloads the ARG-ANNOT zip file and writes outprefix.fa and outprefix.tsv.

    The archive is downloaded and unzipped inside the directory
    outprefix.tmp.download, which must not exist already. The process
    changes into that directory while doing so; the original working
    directory is always restored, also when the download or unzip fails.
    Unless self.debug is set, the download directory is removed afterwards,
    including after a failure.

    Sequence names are rewritten as the first whitespace-separated word of
    the original name, with "(x)" changed to "x.". The original name is
    kept in the last column of the tsv file.

    Raises Error if the download directory cannot be made or entered.
    '''
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError as err:
        raise Error('Error mkdir/chdir ' + tmpdir) from err

    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'

    try:
        zipfile = 'arg-annot-database_doc.zip'
        common.download_file(
            'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip',
            zipfile,
            max_attempts=self.max_download_attempts,
            sleep_time=self.sleep_time,
            verbose=True,
        )
        common.syscall('unzip ' + zipfile)
        os.chdir(current_dir)
        print('Extracted files.')

        genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
        seq_reader = pyfastaq.sequences.file_reader(genes_file)
        f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
        f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

        for seq in seq_reader:
            original_id = seq.id
            seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
            print(seq, file=f_out_fa)
            print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_tsv)

        pyfastaq.utils.close(f_out_tsv)
        pyfastaq.utils.close(f_out_fa)
    finally:
        # Never leave the process inside the download directory
        os.chdir(current_dir)
        if not self.debug:
            common.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print(argannot_ref)
def _get_from_virulencefinder(self, outprefix):
    '''Downloads the virulencefinder data and writes outprefix.fa and outprefix.tsv.

    If self.version is 'old', a zip file is fetched with curl and unzipped
    inside outprefix.tmp.download (which must not exist already). Otherwise
    the data is fetched by _get_genetic_epi_database_from_bitbucket, using
    self.version as the git commit. Every *.fsa file in the download
    directory is fixed and its sequences are added to the output files.

    Sequence names have their first "_" changed to "." and spaces changed
    to "_". A name that was seen before gets ".N" appended, where N is the
    number of times it has now been seen.

    The original working directory is always restored. Unless self.debug
    is set, the download directory is removed afterwards, including when
    the download or the combining of the files fails.

    Raises Error if, for the 'old' version, the download directory cannot
    be made or entered.
    '''
    outprefix = os.path.abspath(outprefix)
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()
    use_old_download = self.version == 'old'

    # The directory is made before the try/finally below, so that a
    # directory that already existed is never deleted by the cleanup.
    if use_old_download:
        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            raise Error('Error mkdir/chdir ' + tmpdir) from err
    else:
        RefGenesGetter._get_genetic_epi_database_from_bitbucket('virulencefinder', tmpdir, git_commit=self.version)
        os.chdir(tmpdir)

    try:
        if use_old_download:
            zipfile = 'virulencefinder.zip'
            cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
            print('Downloading data with:', cmd, sep='\n')
            common.syscall(cmd)
            common.syscall('unzip ' + zipfile)

        print('Combining downloaded fasta files...')
        fout_fa = pyfastaq.utils.open_file_write(final_fasta)
        fout_tsv = pyfastaq.utils.open_file_write(final_tsv)
        name_count = {}

        for filename in os.listdir(tmpdir):
            if not filename.endswith('.fsa'):
                continue

            print(' ', filename)
            fix_file = os.path.join(tmpdir, filename + '.fix.fsa')
            RefGenesGetter._fix_virulencefinder_fasta_file(os.path.join(tmpdir, filename), fix_file)
            file_reader = pyfastaq.sequences.file_reader(fix_file)

            for seq in file_reader:
                original_id = seq.id
                seq.id = seq.id.replace('_', '.', 1)
                seq.id = seq.id.replace(' ', '_')
                if seq.id in name_count:
                    name_count[seq.id] += 1
                    seq.id = seq.id + '.' + str(name_count[seq.id])
                else:
                    name_count[seq.id] = 1
                print(seq, file=fout_fa)
                print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)

        pyfastaq.utils.close(fout_fa)
        pyfastaq.utils.close(fout_tsv)
        print('\nFinished combining files\n')
    finally:
        # Never leave the process inside the download directory
        os.chdir(current_dir)
        if not self.debug:
            common.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen al 2014, PMID: 24574290\n')
def run(self):
    '''Runs cd-hit and returns the clusters read from its .bak.clstr file.

    cd-hit writes into a temporary directory made in the current working
    directory. That directory is removed before returning, and also when
    the command or the parsing of its output raises.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
    try:
        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
        cluster_info_outfile = cdhit_fasta + '.bak.clstr'
        cmd = self.get_run_cmd(cdhit_fasta)
        common.syscall(cmd, verbose=self.verbose)
        clusters = self._get_clusters_from_bak_file(cluster_info_outfile, self.min_cluster_number)
    finally:
        # Previously a failing cd-hit run left tmp.run_cd-hit.* behind
        common.rmtree(tmpdir)

    return clusters
def test_rmtree(self):
    '''test rmtree removes a directory that has a file in it'''
    scratch = 'tmp.rmtree'
    os.mkdir(scratch)
    # An empty file, so that the directory is not empty when it is removed
    open(os.path.join(scratch, 'foo'), 'w').close()
    self.assertTrue(os.path.exists(scratch))
    common.rmtree(scratch)
    self.assertFalse(os.path.exists(scratch))
def run(self):
    '''Downloads the megares zip file and writes the output files for self.outprefix.

    The archive is saved as self.zip_file and the wanted files are
    extracted into self.zip_file + '.tmp.extract'. Once the extraction has
    succeeded, that directory is always removed, also when loading the
    data or writing the output fails. The zip file itself is only deleted
    after everything succeeded.
    '''
    common.download_file(self.zip_url, self.zip_file, verbose=True)
    tmpdir = self.zip_file + '.tmp.extract'
    # Kept outside the try block: if the extraction raises because tmpdir
    # could not be made, a directory that already existed is left untouched.
    original_files = MegaresZipParser._extract_files(self.zip_file, tmpdir)

    try:
        annotation_data = MegaresZipParser._load_annotations_file(os.path.join(tmpdir, original_files['annotations']))
        header_data = MegaresZipParser._load_header_mappings_file(os.path.join(tmpdir, original_files['header_mappings']))
        sequences = {}
        pyfastaq.tasks.file_to_dict(os.path.join(tmpdir, original_files['fasta']), sequences)
        MegaresZipParser._write_files(self.outprefix, sequences, annotation_data, header_data)
    finally:
        common.rmtree(tmpdir)

    os.unlink(self.zip_file)
def _get_xml_file_tree(self):
    '''Downloads the pubmlst databases XML file and returns it parsed by ET.parse.

    The file is downloaded into a temporary directory made in the current
    working directory. Unless self.debug is set, that directory is removed
    afterwards, also when the download or the parsing raises.
    '''
    xml_url = 'http://pubmlst.org/data/dbases.xml'
    tmpdir = tempfile.mkdtemp(prefix='tmp.get_pubmlst_xml', dir=os.getcwd())
    try:
        xml_file = os.path.join(tmpdir, 'out.xml')
        self._download_file(xml_url, xml_file)
        xml_tree = ET.parse(xml_file)
    finally:
        if not self.debug:
            common.rmtree(tmpdir)

    return xml_tree
def test_full_run_ok_gene_start_mismatch(self):
    '''test complete run where gene extended because too different at end for full nucmer match'''
    stem = 'cluster_test_full_run_ok_gene_start_mismatch'
    ref_fasta = os.path.join(data_dir, stem + '.fa')
    ref_tsv = os.path.join(data_dir, stem + '.metadata.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.' + stem
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=112, total_reads_bases=1080)
    clust.run()

    want = [
        'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene'
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)
def test_full_run_smtls_snp_varonly_nonc(self):
    '''test complete run where samtools calls a snp in a presence/absence noncoding sequence'''
    stem = 'cluster_full_run_smtls_snp_varonly_nonc'
    ref_fasta = os.path.join(data_dir, stem + '.fa')
    ref_tsv = os.path.join(data_dir, stem + '.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.' + stem
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320)
    clust.run()

    want = [
        'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_seq'
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)
def test_full_run_insert_codon(self):
    '''Test complete run where there is a inserted codon'''
    stem = 'cluster_test_full_run_insert_codon'
    ref_fasta = os.path.join(data_dir, stem + '.fa')
    ref_tsv = os.path.join(data_dir, stem + '.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.cluster_test_full_insert_codon'
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=292, total_reads_bases=20900)
    clust.run()

    want = [
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t108\t108\t92.31\tcluster_name.l15.c30.ctg.1\t1115\t19.9\t0\t.\tp\t.\t0\tS25_M26insELI\tINS\t73\t73\tA\t554\t554\tG\t24\tG\t24\t.\tGeneric description of presence_absence1'
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)
def test_full_run_partial_assembly(self):
    '''Test complete run where only part of the ref gene is present in the reads'''
    stem = 'cluster_test_full_run_partial_asmbly'
    ref_fasta = os.path.join(data_dir, stem + '.fa')
    ref_tsv = os.path.join(data_dir, stem + '.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.cluster_test_full_run_partial_assembly'
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=278, total_reads_bases=15020)
    clust.run()

    want = [
        'presence_absence1\tpresence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.l15.c17.ctg.1\t949\t20.5\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1'
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)
def test_full_run_ok_variants_only_variant_not_present_always_report(self):
    '''test complete run of cluster on a variants only gene when variant not present but always report variant'''
    input_stem = 'cluster_test_full_run_ok_variants_only'
    ref_fasta = os.path.join(data_dir, input_stem + '.fa')
    ref_tsv = os.path.join(data_dir, 'cluster_full_run_varonly.not_present.always_report.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.cluster_full_run_varonly.not_present.always_report'
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, input_stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=66, total_reads_bases=3300)
    clust.run()

    want = [
        'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1'
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)
def test_full_run_no_reads_after_filtering(self):
    '''test complete run of cluster when filtering removes all reads'''
    stem = 'cluster_test_full_run_no_reads_after_filtering'
    ref_fasta = os.path.join(data_dir, stem + '.in.fa')
    ref_tsv = os.path.join(data_dir, stem + '.in.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.test_full_run_no_reads_after_filtering'
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=0, total_reads_bases=0)
    clust.run()

    # One report line: seven leading fields, then 24 columns of '.'
    leading_fields = ['.', '.', '.', '.', '64', '0', 'cluster_name']
    want_line = '\t'.join(leading_fields + ['.'] * 24)
    self.assertEqual([want_line], clust.report_lines)
    self.assertFalse(clust.status_flag.has('ref_seq_choose_fail'))
    self.assertTrue(clust.status_flag.has('assembly_fail'))
    common.rmtree(work_dir)
def test_full_run_multiple_vars_in_codon(self):
    '''Test complete run where there is a codon with a SNP and an indel'''
    stem = 'cluster_test_full_run_multiple_vars'
    ref_fasta = os.path.join(data_dir, stem + '.fa')
    ref_tsv = os.path.join(data_dir, stem + '.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.' + stem
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=292, total_reads_bases=20900)
    clust.run()

    want = [
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\t.\tMULTIPLE\t25\t26\tGA\t487\t489\tCAT\t27;26;25\tC;A;T\t27;26;25\t.\tGeneric description of presence_absence1',
        'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\tA10fs\tFSHIFT\t28\t28\tG\t491\t491\tG\t26\tG\t26\t.\tGeneric description of presence_absence1',
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)
def test_full_run_smtls_snp_varonly_gene_2(self):
    '''test complete run where samtools calls a snp in a variant only gene'''
    # Named _2 because test_full_run_smtls_snp_varonly_gene is thought to
    # test the same functionality. Both tests are kept anyway.
    stem = 'cluster_full_run_smtls_snp_varonly_gene_2'
    ref_fasta = os.path.join(data_dir, stem + '.fa')
    ref_tsv = os.path.join(data_dir, stem + '.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.' + stem
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320)
    clust.run()

    want = [
        'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene'
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)
def clean_cluster_dir(d, exclude=None):
    '''Cleans up all files made except original ones in a cluster directory.

    Everything directly inside directory d is deleted, except for
    genes.fa, reads_1.fq, reads_2.fq and any names given in exclude
    (an iterable of names, or None). Directories are removed recursively.
    Does nothing if d does not exist.
    '''
    if not os.path.exists(d):
        return

    keep = {'genes.fa', 'reads_1.fq', 'reads_2.fq'}
    if exclude is not None:
        keep.update(exclude)

    for name in os.listdir(d):
        if name in keep:
            continue

        full_path = os.path.join(d, name)
        if os.path.isdir(full_path):
            common.rmtree(full_path)
        else:
            os.unlink(full_path)
def _extract_files(cls, zip_file, outdir):
    '''Extracts the annotations, fasta and header mappings files from zip_file.

    outdir is made by this method and must not exist already. Only members
    whose names look like one of the wanted files are extracted. If more
    than one member matches the same kind of file, the one that sorts last
    by name is the one that gets reported.

    Returns a dictionary with keys 'annotations', 'fasta' and
    'header_mappings'; each value is the name of the member inside the zip
    archive, which is also its path relative to outdir.

    Raises Error if outdir cannot be made, or if any of the three files is
    missing from the archive (in which case outdir is removed again).
    '''
    original_files = {'annotations': None, 'fasta': None, 'header_mappings': None}

    try:
        os.mkdir(outdir)
    except OSError as err:
        raise Error('Error making directory ' + outdir) from err

    # Old <2.0.0 megares has eg these files:
    #   megares_annotations_v1.01.csv
    #   megares_database_v1.01.fasta
    #   megares_to_external_header_mappings_v1.01.tsv
    # megares 2.0.0 has these files:
    #   megares_drugs_annotations_v2.00.csv
    #   megares_drugs_database_v2.00.fasta
    #   megares_modified_annotations_v2.00.csv
    #   megares_modified_database_v2.00.fasta
    #   megares_to_external_header_mappings_v2.00.csv
    # The sequences in *_modified_* files seem to be a superset of
    # *_drugs_*, so use the *_modified_* ones. This will happen
    # as long as we loop over sorted filenames, because the _modified_
    # csv and fasta are listed last
    # Context manager so that the archive handle is always closed
    with zipfile.ZipFile(zip_file) as zfile:
        for member in sorted(zfile.namelist()):
            if '_annotations_' in member:
                original_files['annotations'] = member
            elif '_database_' in member and member.endswith('.fasta'):
                original_files['fasta'] = member
            elif '_header_mappings_' in member:
                original_files['header_mappings'] = member
            else:
                continue

            zfile.extract(member, path=outdir)

    if None in original_files.values():
        common.rmtree(outdir)
        raise Error('Error. Not all expected files found in downloaded megares zipfile. ' + str(original_files))

    return original_files
def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self):
    '''test complete run where samtools calls a snp at a known snp location in a presence/absence noncoding and sample has the var'''
    stem = 'cluster_test_full_run_smtls_snp_varonly_nonc'
    ref_fasta = os.path.join(data_dir, stem + '.fa')
    ref_tsv = os.path.join(data_dir, stem + '.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320)
    clust.run()

    # There should be no extra 'HET' line: the snp is already known about,
    # so it is included in the report line of the known snp
    want = [
        'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tA18G\t1\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:1:A18G:.:Description of A18G snp\t.'
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)
def test_full_run_smtls_snp_varonly_gene(self):
    '''test complete run where samtools calls a snp at a known snp location in a variant only gene, gene does have variant'''
    stem = 'cluster_full_run_smtls_snp_varonly_gene'
    ref_fasta = os.path.join(data_dir, stem + '.fa')
    ref_tsv = os.path.join(data_dir, stem + '.tsv')
    refdata = reference_data.ReferenceData([ref_fasta], [ref_tsv])

    # Work on a fresh copy of the input cluster directory
    work_dir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var'
    common.rmtree(work_dir)
    shutil.copytree(os.path.join(data_dir, stem), work_dir)

    clust = cluster.Cluster(work_dir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320)
    clust.run()

    # There should be no extra 'HET' line: the snp is already known about,
    # so it is included in the report line of the known snp
    want = [
        'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tp\tI6M\t1\t.\t.\t16\t18\tATG\t135\t137\tATG\t65;64;63\tA;T;G,A\t65;64;32,31\tref_gene:1:1:I6M:.:Description of I6M snp\t.'
    ]
    self.assertEqual(want, clust.report_lines)
    common.rmtree(work_dir)