def test_add(self):
    '''Test add: each added value is counted in the bin it falls into'''
    hist = histogram.Histogram(3)
    # (value to add, expected contents of hist.bins straight after adding it)
    steps = [
        (4, {3: 1}),
        (4, {3: 2}),
        (42, {3: 2, 42: 1}),
    ]
    for value, expected_bins in steps:
        hist.add(value)
        self.assertEqual(expected_bins, hist.bins)
def _load_minimap_insert_histogram(cls, infile, bin_size):
    '''Build a Histogram from a tab-separated file of value/count pairs.

    Each non-blank line of infile must hold exactly two tab-separated
    integer fields: a value and the number of times it was seen. Every
    pair is added to a new histogram.Histogram created with bin_size.

    Blank (or whitespace-only) lines, such as a trailing empty line at
    the end of the file, are skipped instead of triggering an unpacking
    error.

    Returns the populated histogram.

    Raises ValueError if a non-blank line does not have exactly two
    tab-separated fields, or if either field is not an integer.
    '''
    hist = histogram.Histogram(bin_size)

    with open(infile) as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # Nothing to parse on an empty line; previously this
                # failed with "not enough values to unpack".
                continue

            value, count = stripped.split('\t')
            hist.add(int(value), count=int(count))

    return hist
def test_len(self):
    '''Test __len__: the length grows by one for every value added'''
    hist = histogram.Histogram(1)
    self.assertEqual(0, len(hist))

    # Adding a repeated value (1 twice) must still increase the length.
    for expected_len, value in enumerate((1, 1, 2), start=1):
        hist.add(value)
        self.assertEqual(expected_len, len(hist))
def test_stats(self):
    '''Test stats on a histogram whose bin counts are set by hand'''
    hist = histogram.Histogram(1)
    for value in range(1, 11):
        hist.add(value)

    # Overwrite the counts of selected bins directly: (bin key, count)
    overrides = (
        (3, 3),
        (4, 3),
        (5, 5),
        (6, 3),
        (7, 2),
        (8, 2),
    )
    for bin_key, count in overrides:
        hist.bins[bin_key] = count

    expected = (2.5, 5.5, 10.5, 0.91)
    self.assertEqual(expected, hist.stats())
def test_to_bin(self):
    '''Test _to_bin: values map to the expected bin for a bin size of 3'''
    hist = histogram.Histogram(3)
    # (input value, expected bin)
    cases = [
        (0, 0),
        (1, 0),
        (2, 0),
        (3, 3),
        (4, 3),
        (5, 3),
        (6, 6),
    ]
    for value, expected_bin in cases:
        self.assertEqual(hist._to_bin(value), expected_bin)
def test_load_minimap_insert_histogram(self):
    '''test _load_minimap_insert_histogram against a hand-built histogram'''
    bin_size = 10
    infile = os.path.join(
        data_dir, 'clusters_test_load_minimap_insert_histogram.in')
    got = clusters.Clusters._load_minimap_insert_histogram(infile, bin_size)

    # (value, count) pairs the loaded histogram is expected to match
    value_counts = [
        (85, 1),
        (86, 2),
        (90, 4),
        (91, 6),
        (97, 10),
        (100, 7),
        (111, 3),
    ]
    expected = histogram.Histogram(bin_size)
    for value, count in value_counts:
        expected.add(value, count=count)

    self.assertEqual(expected, got)
def __init__(
    self,
    refdata_dir,
    reads_1,
    reads_2,
    outdir,
    extern_progs,
    version_report_lines=None,
    assembly_kmer=21,
    assembly_coverage=100,
    threads=1,
    verbose=False,
    assembler='fermilite',
    spades_mode='rna',
    spades_options=None,
    max_insert=1000,
    min_scaff_depth=10,
    nucmer_min_id=90,
    nucmer_min_len=20,
    nucmer_breaklen=200,
    assembled_threshold=0.95,
    unique_threshold=0.03,
    max_gene_nt_extend=30,
    clean=True,
    tmp_dir=None,
):
    '''Store the run settings, create the output directories and pick a
    temporary directory.

    Side effects:
      * Calls self._load_reference_data_from_dir(refdata_dir) to fill
        self.refdata and self.cluster_ids.
      * Creates outdir, outdir/Logs and outdir/.fails with os.mkdir.
      * If clean is true, creates a tempfile.TemporaryDirectory (prefix
        'ariba.tmp.') inside tmp_dir; otherwise creates outdir/clusters
        and uses that as the temporary directory.
      * Tries to install self._receive_signal as the handler for every
        name in the signal module starting with "SIG", except SIGCHLD
        and SIGCLD. Names that cannot be registered are skipped.

    tmp_dir, when None, falls back to $ARIBA_TMPDIR, then $TMPDIR, then
    outdir.

    version_report_lines, when None, is replaced by a new empty list.

    All other arguments are stored unchanged on attributes of the same
    name (reads_1, reads_2 and outdir are made absolute first).

    Raises Error if any directory cannot be created (this includes the
    directory already existing) or if the chosen tmp_dir does not exist.
    '''
    self.refdata_dir = os.path.abspath(refdata_dir)
    self.refdata, self.cluster_ids = self._load_reference_data_from_dir(
        refdata_dir)
    self.reads_1 = os.path.abspath(reads_1)
    self.reads_2 = os.path.abspath(reads_2)
    self.outdir = os.path.abspath(outdir)
    self.extern_progs = extern_progs
    self.clusters_tsv = os.path.abspath(
        os.path.join(refdata_dir, '02.cdhit.clusters.tsv'))
    self.all_ref_seqs_fasta = os.path.abspath(
        os.path.join(refdata_dir, '02.cdhit.all.fa'))

    # Give every instance its own list rather than sharing a default.
    if version_report_lines is None:
        self.version_report_lines = []
    else:
        self.version_report_lines = version_report_lines

    self.clean = clean
    self.logs_dir = os.path.join(self.outdir, 'Logs')

    self.assembler = assembler
    self.assembly_kmer = assembly_kmer
    self.assembly_coverage = assembly_coverage
    self.spades_mode = spades_mode
    self.spades_options = spades_options

    self.cdhit_files_prefix = os.path.join(self.refdata_dir, 'cdhit')
    self.cdhit_cluster_representatives_fa = self.cdhit_files_prefix + '.cluster_representatives.fa'
    self.bam_prefix = os.path.join(self.outdir, 'map_reads_to_cluster_reps')
    self.bam = self.bam_prefix + '.bam'
    self.report_file_all_tsv = os.path.join(self.outdir, 'debug.report.tsv')
    self.report_file_filtered = os.path.join(self.outdir, 'report.tsv')
    self.mlst_reports_prefix = os.path.join(self.outdir, 'mlst_report')
    self.mlst_profile_file = os.path.join(self.refdata_dir, 'pubmlst.profile.txt')
    self.catted_assembled_seqs_fasta = os.path.join(
        self.outdir, 'assembled_seqs.fa.gz')
    self.catted_genes_matching_refs_fasta = os.path.join(
        self.outdir, 'assembled_genes.fa.gz')
    self.catted_assemblies_fasta = os.path.join(self.outdir, 'assemblies.fa.gz')
    self.threads = threads
    self.verbose = verbose

    self.max_insert = max_insert

    self.insert_hist_bin = 10
    self.insert_hist = histogram.Histogram(self.insert_hist_bin)
    self.insert_size = None
    self.insert_sspace_sd = None
    self.insert_proper_pair_max = None

    self.min_scaff_depth = min_scaff_depth
    self.nucmer_min_id = nucmer_min_id
    self.nucmer_min_len = nucmer_min_len
    self.nucmer_breaklen = nucmer_breaklen

    self.assembled_threshold = assembled_threshold
    self.unique_threshold = unique_threshold
    self.max_gene_nt_extend = max_gene_nt_extend

    self.cluster_to_dir = {}  # gene name -> abs path of cluster directory
    self.clusters = {}  # gene name -> Cluster object
    self.cluster_read_counts = {}  # gene name -> number of reads
    self.cluster_base_counts = {}  # gene name -> number of bases
    self.pool = None
    self.fails_dir = os.path.join(self.outdir, '.fails')
    self.clusters_all_ran_ok = True

    for d in [self.outdir, self.logs_dir, self.fails_dir]:
        try:
            os.mkdir(d)
        except OSError as err:
            # Only filesystem failures are converted; the original OS
            # error stays attached as the cause for debugging.
            raise Error('Error mkdir ' + d) from err

    if tmp_dir is None:
        if 'ARIBA_TMPDIR' in os.environ:
            tmp_dir = os.path.abspath(os.environ['ARIBA_TMPDIR'])
        elif 'TMPDIR' in os.environ:
            tmp_dir = os.path.abspath(os.environ['TMPDIR'])
        else:
            tmp_dir = self.outdir

    if not os.path.exists(tmp_dir):
        raise Error('Temporary directory ' + tmp_dir + ' not found. Cannot continue')

    if self.clean:
        # Keep a reference to the TemporaryDirectory object: the
        # directory is removed when that object is cleaned up.
        self.tmp_dir_obj = tempfile.TemporaryDirectory(
            prefix='ariba.tmp.', dir=os.path.abspath(tmp_dir))
        self.tmp_dir = self.tmp_dir_obj.name
    else:
        self.tmp_dir_obj = None
        self.tmp_dir = os.path.join(self.outdir, 'clusters')
        try:
            os.mkdir(self.tmp_dir)
        except OSError as err:
            raise Error('Error making directory ' + self.tmp_dir) from err

    if self.verbose:
        print('Temporary directory:', self.tmp_dir)

    signal_names = [
        x for x in dir(signal)
        if x.startswith("SIG") and x not in {'SIGCHLD', 'SIGCLD'}
    ]
    for i in signal_names:
        try:
            signum = getattr(signal, i)
            signal.signal(signum, self._receive_signal)
        except (OSError, RuntimeError, ValueError):
            # Best effort: some names are not catchable signals (e.g.
            # SIGKILL, SIG_DFL) and registration fails outside the main
            # thread. Skip those rather than abort construction.
            pass
def __init__(
    self,
    db_fasta,
    reads_1,
    reads_2,
    outdir,
    assembly_kmer=21,
    threads=1,
    verbose=False,
    assembler='velvet',
    smalt_k=13,
    smalt_s=2,
    smalt_min_id=0.9,
    spades_other=None,
    max_insert=1000,
    min_scaff_depth=10,
    nucmer_min_id=90,
    nucmer_min_len=50,
    nucmer_breaklen=50,
    assembled_threshold=0.95,
    unique_threshold=0.03,
    bcftools_exe='bcftools',
    gapfiller_exe='GapFiller.pl',
    samtools_exe='samtools',
    smalt_exe='smalt',
    bowtie2_exe='bowtie2',
    bowtie2_preset='very-sensitive-local',
    spades_exe='spades.py',
    sspace_exe='SSPACE_Basic_v2.0.pl',
    velvet_exe='velvet',  # prefix of velvet{g,h}
    cdhit_seq_identity_threshold=0.9,
    cdhit_length_diff_cutoff=0.9,
    run_cd_hit=True,
    clean=1,
):
    '''Store the run settings, locate optional external tools and create
    the output directories.

    db_fasta, reads_1, reads_2 and outdir are converted to absolute
    paths. The remaining arguments are stored unchanged on attributes of
    the same name, except velvet_exe, which is stored as self.velvet.

    sspace_exe and gapfiller_exe are looked up with shutil.which. If
    SSPACE is not found a warning is printed to stderr and both
    self.sspace_exe and self.gapfiller_exe are set to None. If only
    GapFiller is missing a warning is printed and self.gapfiller_exe is
    None. Tools that are found are stored as their os.path.realpath.

    Side effects: creates outdir and outdir/Clusters with os.mkdir.

    Raises:
      AssertionError: if assembler is not 'velvet' or 'spades'.
      Error: if either directory cannot be created (this includes the
        directory already existing).
    '''
    self.db_fasta = os.path.abspath(db_fasta)
    self.reads_1 = os.path.abspath(reads_1)
    self.reads_2 = os.path.abspath(reads_2)
    self.outdir = os.path.abspath(outdir)
    self.clusters_outdir = os.path.join(self.outdir, 'Clusters')
    self.clusters_info_file = os.path.join(self.outdir, 'clusters.tsv')
    self.clean = clean

    self.assembler = assembler
    # NOTE(review): assert is stripped under "python -O"; it is kept so
    # that callers still see AssertionError for a bad assembler name.
    assert self.assembler in ['velvet', 'spades']
    self.assembly_kmer = assembly_kmer
    self.spades_other = spades_other

    self.db_fasta_clustered = os.path.join(self.outdir, 'genes.clustered.fa')
    self.cluster_ids = {}
    self.bam_prefix = os.path.join(self.outdir, 'map_all_reads')
    self.bam = self.bam_prefix + '.bam'
    self.report_file_tsv = os.path.join(self.outdir, 'report.tsv')
    self.report_file_xls = os.path.join(self.outdir, 'report.xls')
    self.threads = threads
    self.verbose = verbose

    self.smalt_k = smalt_k
    self.smalt_s = smalt_s
    self.smalt_min_id = smalt_min_id
    self.max_insert = max_insert
    self.smalt_exe = smalt_exe
    self.bowtie2_exe = bowtie2_exe
    self.bowtie2_preset = bowtie2_preset

    self.insert_hist_bin = 10
    self.insert_hist = histogram.Histogram(self.insert_hist_bin)
    self.insert_size = None
    self.insert_sspace_sd = None
    self.insert_proper_pair_max = None

    self.min_scaff_depth = min_scaff_depth
    self.nucmer_min_id = nucmer_min_id
    self.nucmer_min_len = nucmer_min_len
    self.nucmer_breaklen = nucmer_breaklen

    self.assembled_threshold = assembled_threshold
    self.unique_threshold = unique_threshold

    self.cluster_to_dir = {}  # gene name -> abs path of cluster directory
    self.clusters = {}  # gene name -> Cluster object

    self.bcftools_exe = bcftools_exe

    self.sspace_exe = shutil.which(sspace_exe)
    if self.sspace_exe is None:
        print('WARNING: SSPACE not found. Scaffolding and gap filling will be skipped!', file=sys.stderr)
        self.gapfiller_exe = None
    else:
        self.sspace_exe = os.path.realpath(self.sspace_exe)  # otherwise sspace dies loading packages
        self.gapfiller_exe = shutil.which(gapfiller_exe)
        if self.gapfiller_exe is None:
            print('WARNING: GapFiller not found. No gap filling will be run after scaffolding!', file=sys.stderr)
        else:
            self.gapfiller_exe = os.path.realpath(self.gapfiller_exe)  # otherwise gapfiller dies loading packages

    self.samtools_exe = samtools_exe
    self.spades_exe = spades_exe

    self.velvet = velvet_exe

    self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold
    self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff
    self.run_cd_hit = run_cd_hit

    for d in [self.outdir, self.clusters_outdir]:
        try:
            os.mkdir(d)
        except OSError as err:
            # Only filesystem failures are converted; the original OS
            # error stays attached as the cause for debugging.
            raise Error('Error mkdir ' + d) from err