def assemble(args):
    """Run the assembly workflow for the given command-line arguments.

    The parsed arguments (merged with ``externals``) are published through the
    module-level globals ``parameters``, ``prefix`` and ``reads``.  Every
    ``pe`` entry must name exactly two comma-separated files and every ``se``
    entry exactly one.  Unless ``parameters['onlyEval']`` is set the reads are
    handed to ``mainprocess().launch``; otherwise ``parameters['reference']``
    is evaluated directly.  The report returned by ``postprocess().launch``
    is printed as sorted, indented JSON.
    """
    global reads, prefix, parameters
    parameters = add_args(args).__dict__
    parameters.update(externals)
    prefix = parameters['prefix']

    reads = []
    # One rule per library kind: (library specs, required file count, message).
    library_rules = (
        (parameters['pe'], 2,
         'Allows 2 reads per PE library. You specified {0}'),
        (parameters['se'], 1,
         'Allows one file per SE library. You specified {0}'),
    )
    for specs, expected, message in library_rules:
        for spec in specs:
            file_names = spec.split(',')
            if len(file_names) > 0:
                assert len(file_names) == expected, message.format(
                    len(file_names))
                reads.append(file_names)

    n_files = sum([len(lib) for lib in reads])
    logger('Load in {0} read files from {1} libraries'.format(
        n_files, len(reads)))

    if parameters['onlyEval']:
        assembly = parameters['reference']
    else:
        assembly = mainprocess().launch(reads)
    report = postprocess().launch(assembly)

    import json
    print(json.dumps(report, sort_keys=True, indent=2))
def returnOverlap(self, blastab, param):
    """Collect overlaps between alignment rows via ``tab2overlaps``.

    blastab -- iterable of alignment rows; columns 1, 8, 9 and 15 are read.
    param   -- sequence whose 2nd and 3rd items are passed to
               ``tab2overlaps`` as ``ovl_l`` and ``ovl_p``.

    Returns a single array stacking every row that ``tab2overlaps`` reported
    with a positive third column.
    """
    logger('Calculate overlaps.')
    ovl_l, ovl_p = param[1:]

    # Later rows overwrite earlier ones, so every name in column 1 ends up
    # mapped to the index of its last row.
    contig_index = {}
    for row_no, row in enumerate(blastab):
        contig_index[row[1]] = row_no

    records = []
    for row in blastab:
        low, high = sorted([row[8], row[9]])
        records.append([contig_index[row[1]], row[15], low, high])
    records.sort(key=itemgetter(0, 2, 3))
    tabs = np.array(records, dtype=int)

    # Fixed-size work buffer.  Its last row is initialised to [0, 1, -1] and
    # re-read after every call: the search continues while its first value is
    # non-negative (presumably a resume position -- confirm in tab2overlaps).
    overlaps = np.empty(shape=[1000001, 3], dtype=int)
    overlaps[-1, :] = [0, 1, -1]

    batches = []
    while overlaps[-1, 0] >= 0:
        logger('Searching {0} / {1} tabs'.format(overlaps[-1, 0], len(tabs)))
        overlaps[:-1, :] = -1
        overlaps = tab2overlaps(tabs, ovl_l, ovl_p, len(tabs), overlaps)
        batches.append(overlaps[overlaps.T[2] > 0])

    res = np.vstack(batches)
    logger('Identified {0} overlaps.'.format(len(res)))
    return res
def evaluate(profile, cluster, stepwise, ave_gene_length=1000.):
    """Score a clustering table against its allelic profiles.

    profile         -- path of the tab-delimited profile table.
    cluster         -- path of the tab-delimited cluster table.
    stepwise        -- step used to sub-sample the cluster columns.
    ave_gene_length -- forwarded to ``get_silhouette``.

    Rows are restricted to the names present in both tables and the profile
    rows are re-ordered to follow the cluster table.  The three scores
    (``shannon_index``, ``get_similarity``, ``get_silhouette``) are stored in
    ``evalHCC.npz``; nothing is returned.
    """
    with uopen(profile) as handle:
        logger('Loading profiles ...')
        header = handle.readline().strip().split('\t')
        # The index column is the first one whose title contains '#ST';
        # column 0 is used when there is none.
        st_columns = [i for i, title in enumerate(header)
                      if title.find('#ST') >= 0]
        if len(st_columns) <= 0:
            st_columns = [0]
        allele_columns = [i for i, title in enumerate(header)
                          if not title.startswith('#')]
        profile_table = pd.read_csv(handle, sep='\t', header=None,
                                    index_col=0,
                                    usecols=st_columns + allele_columns)
        profile_names = profile_table.index.values
        profile_values = profile_table.values

    with uopen(cluster) as handle:
        logger('Loading hierCC ...')
        header = handle.readline().strip().split('\t')
        level_columns = [0] + [i for i, title in enumerate(header)
                               if not title.startswith('#')]
        cluster_table = pd.read_csv(handle, sep='\t', header=None,
                                    index_col=0, usecols=level_columns)
        cluster_names = cluster_table.index.values
        cluster_values = cluster_table.values
        picked = np.arange(0, cluster_values.shape[1], stepwise)
        cluster_values = cluster_values[:, picked]

    # Keep only clustered entries that also have a profile.
    shared = np.in1d(cluster_names, profile_names)
    cluster_values, cluster_names = cluster_values[shared], cluster_names[shared]

    # Re-order the profile rows so that they follow the cluster table.
    rank = {name: pos for pos, name in enumerate(cluster_names)}
    pairs = np.array([[pos, rank[name]]
                      for pos, name in enumerate(profile_names)
                      if name in rank])
    profile_order = pairs[np.argsort(pairs.T[1]), 0]
    profile_names = profile_names[profile_order]
    profile_values = profile_values[profile_order]

    shannon = shannon_index(cluster_values)
    similarity = get_similarity('adjusted_rand_score', cluster_values,
                                stepwise)
    silhouette = get_silhouette(profile_values, cluster_values, stepwise,
                                ave_gene_length)
    np.savez_compressed('evalHCC.npz', shannon=shannon,
                        similarity=similarity, silhouette=silhouette)
    logger('Done. Results saved in evalHCC.npz')
def get_quality(self, reference, reads):
    """Map ``reads`` onto ``reference`` and write a per-base quality FASTQ.

    Workflow:
      1. Align the reads with the mapper named in ``parameters['mapper']``
         ('minimap2', 'bwa', anything else falls back to bowtie).
      2. Sum ``samtools depth`` over all BAM files and summarise every contig
         as ``[length, depth, relative depth]``.
      3. Run Pilon on the contigs with a positive depth and parse its VCF:
         confident single-base '0/0' records receive a quality character,
         every other record is copied to ``<prefix>.mapping.difference``,
         and the surroundings of '1/1' indels are reset to '!'.
      4. Write ``<prefix>.result.fastq`` for the contigs whose relative depth
         reaches the lower ``parameters['cont_depth']`` bound.

    Reads the module-level ``parameters`` and ``prefix`` and ``self.snps``.
    Returns the path of the FASTQ file.
    """
    if parameters['mapper'] == 'minimap2':
        bams = self.__run_minimap(prefix, reference, reads, )
    elif parameters['mapper'] != 'bwa':
        bams = self.__run_bowtie(prefix, reference, reads, )
    else:
        bams = self.__run_bwa(prefix, reference, reads, )

    # Every base starts with the lowest quality character.
    sequence = readFasta(reference)
    for n, s in sequence.items():
        q = ['!'] * len(s)
        sequence[n] = [s, q]

    # Per-site read depth, summed over all BAM files.
    sites = {n: np.array([0 for ss in s[1]]) for n, s in sequence.items()}
    for bam in bams:
        if bam is not None:
            depth = Popen(
                '{samtools} depth -q 0 -Q 0 {bam}'.format(
                    bam=bam, **parameters).split(),
                stdout=PIPE, universal_newlines=True)
            for line in depth.stdout:
                part = line.strip().split()
                if len(part) > 2 and float(part[2]) > 0:
                    sites[part[0]][int(part[1]) - 1] += float(part[2])

    # Per contig: [length, max(median, pseudo-counted geometric mean), 0.].
    sites = {
        n: [s.size,
            np.max([np.median(s), np.exp(np.mean(np.log(s + 0.5))) - 0.5]),
            0.]
        for n, s in sites.items()
    }

    # Average depth = length-weighted mean over the longest contigs that
    # together cover at least half of the total length.
    depth = np.array(list(sites.values()))
    depth = depth[np.argsort(-depth.T[0])]
    size = np.sum(depth.T[0])
    acc = [0, 0]
    for d in depth:
        acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0] * d[1]
        if acc[0] * 2 >= size:
            break
    ave_depth = acc[1] / acc[0]
    exp_mut_depth = max(ave_depth * 0.2, 2.)
    for n, s in sites.items():
        s[2] = s[1] / ave_depth
    logger('Average read depth: {0}'.format(ave_depth))

    sequence = {n: s for n, s in sequence.items() if sites[n][1] > 0.}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([s[0][site:(site + 100)]
                              for site in xrange(0, len(s[0]), 100)])))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        bam_opt=bam_opt, **parameters)
    Popen(pilon_cmd.split(), stdout=PIPE,
          universal_newlines=True).communicate()
    if not os.path.isfile('{0}.mapping.vcf'.format(prefix)):
        # No VCF was produced: retry with an explicit list of fixes.
        pilon_cmd = '{pilon} --fix snps,indels,gaps,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(), stdout=PIPE, stderr=PIPE,
              universal_newlines=True).communicate()

    cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
    logger('Contigs with less than {0} depth will be removed from the assembly'.format(cont_depth[0] * ave_depth))
    logger('Contigs with more than {0} depth will be treated as duplicates'.format(cont_depth[1] * ave_depth))

    indels = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin, \
            open('{0}.mapping.difference'.format(prefix), 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if sites[part[0]][2] < cont_depth[0] or sites[part[0]][2] >= cont_depth[1]:
                continue
            if part[-1] == '1/1':
                # Use this record's own position.  The previous code read
                # ``site``, a variable only assigned further down for an
                # earlier '0/0' record, so the masked window was misplaced
                # (or the name was unbound for the first indel).
                pos = int(part[1])
                if len(part[3]) > 1:
                    indels.append([part[0], max(0, pos - 1),
                                   pos - 1 + len(part[3]) + 2])
                elif len(part[4]) > 1 and part[4] != '<DUP>':
                    indels.append([part[0], max(0, pos - 2),
                                   pos - 1 + len(part[3]) + 2])

            try:
                if part[-1] == '0/0' and len(part[3]) == 1 and len(part[4]) == 1:
                    # INFO entries are read by position with a 3-character
                    # prefix stripped (presumably DP=, BQ= and AF= -- confirm
                    # against the Pilon version in use).
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    af = 100 - sorted([float(af) for af in pp[6][3:].split(',')])[-1]
                    if af <= 20 and dp >= 2 and dp * af / 100. <= exp_mut_depth and (
                            part[6] == 'PASS' or
                            (part[6] == 'LowCov' and parameters['metagenome'])):
                        site = int(part[1]) - 1
                        qual = chr(int(pp[4][3:]) + 33)
                        sequence[part[0]][1][site] = qual
                    else:
                        fout.write(line)
                else:
                    fout.write(line)
            except Exception:
                # Records that cannot be parsed are kept as differences.
                fout.write(line)

    # Bases around confirmed indels fall back to the lowest quality.
    for n, s, e in indels:
        sequence[n][1][s:e] = ['!'] * len(sequence[n][1][s:e])

    if self.snps is not None:
        # Bases around entries of self.snps get at least quality chr(40 + 33).
        for n, snvs in self.snps.items():
            for site, snv in snvs:
                if snv.find('N') >= 0:
                    continue
                if snv.startswith('+'):
                    s, e = site - 4, site + 3 + len(snv)
                else:
                    s, e = site - 4, site + 4
                for k in xrange(s, e):
                    sequence[n][1][k] = max(chr(40 + 33), sequence[n][1][k])

    with open('{0}.result.fastq'.format(prefix), 'w') as fout:
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            if sites[n][2] >= cont_depth[0]:
                fout.write('@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format(
                    p + '_' + n, s, ''.join(q), *sites[n]))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    logger('Final result is written into {0}'.format('{0}.result.fastq'.format(prefix)))
    return '{0}.result.fastq'.format(prefix)
def reduce_depth(self, reads):
    # Down-sample read libraries so that their total size stays within
    # parameters['max_base'], optionally cutting leading bases and renaming
    # the reads.
    #
    # reads   -- list with one dict per library, mapping a library type
    #            ('MP', 'PE' or anything else, treated as single-end) to a
    #            list of file names.
    # returns -- list with one dict per library, mapping library types to the
    #            names of the newly written files.
    encode = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    read_stats = [{} for library in reads]
    new_reads = [{} for library in reads]
    for lib_id, (libraries, stat, new_libs) in enumerate(zip(reads, read_stats, new_reads)):
        # read_information = [number of bases, number of reads] of the library
        read_information = [0, 0]
        for lib_type, library in libraries.items():
            stat[lib_type] = []
            for fname in library:
                # `wc` over the sequence lines: characters minus lines gives
                # the bases, lines gives the reads.
                p = Popen("{pigz} -cd {0}|awk 'NR%4==2'|wc".format(
                    fname, **externals), shell=True, stdout=PIPE, universal_newlines=True).communicate()[0].strip(
                    ).split()
                n_base, n_read = int(p[2]) - int(p[1]), int(p[0])
                read_information[0] += n_base
                read_information[1] += n_read
                # Base composition (A, C, G, T, other) of the first 10
                # positions, sampled from every 5th read among the first
                # 200000 lines.
                bcomp = [[0, 0, 0, 0, 0] for i in range(10)]
                p = Popen(
                    "{pigz} -cd {0}|head -200000|awk 'NR%20==2'".format(
                        fname, **externals), shell=True, stdout=PIPE, stderr=PIPE, universal_newlines=True)
                for line in p.stdout:
                    for b, bc in zip(line[:10], bcomp):
                        bc[encode.get(b, 4)] += 1
                # Scan from position 9 backwards: the first position where one
                # symbol reaches 80% of the counts (or, for the first two
                # positions, where more than 10% are non-ACGT) marks the end
                # of the leading stretch that will be cut.
                seq_start = 0
                for c in range(9, -1, -1):
                    bc = bcomp[c]
                    if max(bc) / 0.8 >= sum(bc) or (c < 2 and bc[4] > 0.1 * sum(bc)):
                        seq_start = c + 1
                        break
                stat[lib_type].append([n_base, seq_start])
        logger('Obtained {1} bases in {2} reads after Trimming in Lib {0}'.format(lib_id, *read_information))
        n_base = read_information[0]
        sample_freq2 = float(
            parameters['max_base']
        ) / n_base if parameters['max_base'] > 0 and n_base > 0 else 1.
        if sample_freq2 >= 1:
            # Nothing to reduce: every file records the overall ratio.
            for ss in stat.values():
                for s in ss:
                    s.append(sample_freq2)
        else:
            # Hand out the base budget in the order MP, PE, SE; each type
            # gets whatever the earlier types left over.
            max_base = float(parameters['max_base'])
            for lib_type in ('MP', 'PE', 'SE'):
                if lib_type in stat:
                    ss = stat[lib_type]
                    n_base = sum([s[0] for s in ss])
                    sample_freq = float(max_base) / n_base
                    for s in ss:
                        s.append(sample_freq)
                    max_base = 0. if n_base >= max_base else max_base - n_base
        if 0 < sample_freq2 < 1:
            logger('Read depth too high. Subsampling.')
        for lib_type, library in libraries.items():
            if stat[lib_type][0][-1] > 0:
                if lib_type == 'MP':
                    new_libs[lib_type] = [
                        '{0}.2.{1}.m.fastq.gz'.format(
                            parameters['prefix'], lib_id)
                    ]
                elif lib_type == 'PE':
                    new_libs[lib_type] = [
                        '{0}.2.{1}.r1.fastq.gz'.format(
                            parameters['prefix'], lib_id),
                        '{0}.2.{1}.r2.fastq.gz'.format(
                            parameters['prefix'], lib_id)
                    ]
                else:
                    new_libs[lib_type] = [
                        '{0}.2.{1}.s.fastq.gz'.format(
                            parameters['prefix'], lib_id)
                    ]
                for f_id, (lib, s, nlib) in enumerate(
                        zip(library, stat[lib_type], new_libs[lib_type])):
                    sample_freq = s[-1]
                    # The awk programs keep read number nr when
                    # int(nr*freq) > int((nr-1)*freq).  Four variants follow:
                    # rename or keep the header line, and cut or keep the
                    # leading s[1] bases of the sequence and quality lines.
                    if parameters['noRename'] == False:
                        if s[1] > 0:
                            logger(
                                'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                .format(s[1], lib))
                            Popen(
                                "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print \"@{4}_{5}_\"nr}} else {{print \"+\"}} }} }}'|{pigz} > {1}"
                                .format(lib, nlib, min(sample_freq, 1.), s[1] + 1, lib_id, lib_type, **externals), shell=True).wait()
                        else:
                            Popen(
                                "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print \"@{4}_{5}_\"nr}} else {{print \"+\"}} }} }}'|{pigz} > {1}"
                                .format(lib, nlib, min(sample_freq, 1.), s[1] + 1, lib_id, lib_type, **externals), shell=True).wait()
                    else:
                        if s[1] > 0:
                            logger(
                                'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                                .format(s[1], lib))
                            Popen(
                                "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print $0}} else {{print \"+\"}} }} }}'|{pigz} > {1}"
                                .format(lib, nlib, min(sample_freq, 1.), s[1] + 1, lib_id, **externals), shell=True).wait()
                        else:
                            Popen(
                                "{pigz} -cd {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print $0}} else {{print \"+\"}} }} }}'|{pigz} > {1}"
                                .format(lib, nlib, min(sample_freq, 1.), s[1] + 1, lib_id, **externals), shell=True).wait()
            # Best-effort removal of the input files of this library type.
            # NOTE(review): the original indentation was lost; this loop is
            # assumed to run for every library type, whether or not new files
            # were written above -- confirm against upstream.
            for lib in library:
                try:
                    os.unlink(lib)
                except:
                    pass
    return new_reads
def hierCC(args):
    """Assign complete-linkage clusters (completeCC) to allelic profiles.

    ``args`` is parsed with ``get_args``; the fields read here are
    ``profile``, ``output``, ``incremental``, ``partition`` and ``delta``.

    The profile table is loaded into the module-level global ``mat`` (first
    column = type id, remaining columns = loci whose title does not start
    with '#') and ``n_loci`` is set accordingly.  For every partition (or for
    all types when no partition file exists) pairwise distances are clustered
    with complete linkage, and every merge at distance ``d`` copies the
    columns after ``d`` from the group holding the first-listed type to the
    other group.

    Output files:
      <output>.completeCC.npz -- array ``completeCC``
      <output>.completeCC.gz  -- tab-delimited text, all levels or only those
                                 listed in ``params.delta``
    """
    global mat, n_loci
    params = get_args(args)
    ot = time.time()
    cluster_file = params.output + '.completeCC.npz'
    pool = Pool(10)

    mat = pd.read_csv(params.profile, sep='\t', header=None, dtype=str).values
    allele_columns = np.array(
        [i == 0 or (not h.startswith('#')) for i, h in enumerate(mat[0])])
    mat = mat[1:, allele_columns].astype(int)
    n_loci = mat.shape[1] - 1
    logger(
        '{0}: Loaded in allelic profiles with dimension: {1} and {2}. The first column is assumed to be type id.'
        .format(time.time() - ot, *mat.shape))

    if os.path.isfile(params.incremental):
        od = np.load(params.incremental, allow_pickle=True)
        cls = od['completeCC']
        typed = {c[0]: i for i, c in enumerate(cls) if c[0] > 0}
        if len(typed) > 0:
            logger('{0}: Loaded in {1} old completeCC assignments.'.format(
                time.time() - ot, len(typed)))
            # Move the previously assigned types to the front.
            mat_idx = np.array([t in typed for t in mat.T[0]])
            mat[:] = np.vstack([mat[mat_idx], mat[(mat_idx) == False]])
    else:
        typed = {}

    if os.path.isfile(params.partition):
        st_idx = {str(st): i for i, st in enumerate(mat.T[0])}
        from collections import defaultdict
        partitions = defaultdict(list)
        for st, grp in pd.read_csv(params.partition, sep='\t',
                                   dtype=str).values:
            partitions[grp].append(st_idx[st])
            st_idx[st] = -1
        logger('{0}: Load in {1} partition(s)'.format(time.time() - ot,
                                                      len(partitions)))
        # What is left are the types that belong to no partition.
        st_idx = {k: v for k, v in st_idx.items() if v >= 0}
    else:
        partitions = {'all': np.arange(mat.shape[0])}
        st_idx = {}

    # Every type starts as its own cluster at every level; types outside all
    # partitions are zeroed and dropped before saving.
    res = np.repeat(mat.T[0], mat.shape[1]).reshape(mat.shape)
    res[list(st_idx.values()), :] = 0

    for key, indices in sorted(partitions.items()):
        if len(indices) <= 1:
            continue
        logger('{0}: Partition {1} contains {2} STs'.format(
            time.time() - ot, key, len(indices)))
        mat2 = mat[indices]
        logger('{0}: Start to calculate pairwise distances'.format(
            time.time() - ot))
        dist = get_distances(params.output, mat2, pool)
        logger('{0}: Start complete linkage clustering'.format(
            time.time() - ot))
        cls = linkage(ssd.squareform(dist), method='complete')
        logger('{0}: Start completeCC assignments'.format(time.time() - ot))

        # descendents[k] lists the leaves below node k; the leaves come
        # first, followed by one slot per merge step.
        descendents = [[i] for i in np.arange(dist.shape[0])] + \
                      [None for i in np.arange(dist.shape[0] - 1)]
        for idx, c in enumerate(cls.astype(int)):
            n_id = idx + dist.shape[0]
            d = sorted([int(c[0]), int(c[1])],
                       key=lambda x: descendents[x][0])
            min_id = descendents[d[0]][0]
            descendents[n_id] = descendents[d[0]] + descendents[d[1]]
            for tgt in descendents[d[1]]:
                res[indices[tgt], c[2] + 1:] = res[indices[min_id], c[2] + 1:]

    res = res[res.T[0] > 0]
    np.savez_compressed(cluster_file, completeCC=res)

    if not params.delta:
        with uopen(params.output + '.completeCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(i) for i in np.arange(n_loci)])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(rr) for rr in r]) + '\n')
    else:
        # A list, not a bare map(): the levels are iterated once for the
        # header and again for every row, and a Python 3 map object is
        # exhausted after the first pass (every row used to be empty).
        deltas = [int(d) for d in params.delta.split(',')]
        with uopen(params.output + '.completeCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(i) for i in deltas])))
            # NOTE(review): the header announces an ST_id column, but the
            # rows hold only the selected levels -- confirm whether r[0]
            # should be written as well.
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(r[i + 1]) for i in deltas]) + '\n')
    del res
    logger(
        'NUMPY clustering result (for incremental completeCC): {0}.completeCC.npz'
        .format(params.output))
    logger(
        'TEXT clustering result (for visual inspection): {0}.completeCC.gz'.
        format(params.output))
def nomenclature(query, reference, ref_aa='', **params):
    """Call alleles in ``query`` by comparison against ``reference``.

    query     -- passed to ``seqOperation().write_query``.
    reference -- reference set used for the comparison.
    ref_aa    -- optional file of translated references; it is rebuilt with
                 ``seqOperation().write_refsets`` when it is not an existing
                 file.
    params    -- accepted for call compatibility.
                 NOTE(review): the body reads the module-level ``parameters``
                 rather than ``params`` -- confirm this is intended.

    Returns whatever ``blastParser().form_alleles`` produces.
    """
    # write query
    logger('EnSign starts')
    sequence, qry_fna, qry_faa = seqOperation().write_query(query)
    # .values() exists on both Python 2 and Python 3 mappings, whereas the
    # former .itervalues() is Python-2 only.
    logger('Read in {0} bases as query'.format(
        sum([len(s[0]) for s in sequence.values()])))

    # write refset
    if not os.path.isfile(str(ref_aa)):
        ref_aa = seqOperation().write_refsets(reference)
    logger('Prepare translated references')

    # do comparison
    blasttab = dualBlast().run_ublast(fna_target=qry_fna,
                                      faa_target=qry_faa,
                                      fna_query=reference,
                                      faa_query=ref_aa)

    # filter
    blasttab_parser = blastParser()
    blasttab = blasttab_parser.linear_merge(blasttab, **parameters)
    logger('Merge closely located hits. {0} hits'.format(len(blasttab)))
    loci = blasttab_parser.parse_ublast(blasttab, parameters)
    logger('Identify homologous groups. {0} groups'.format(
        len([1 for lc in loci if lc != '__non_specific__'])))
    regions = blasttab_parser.inter_loci_overlap(loci, parameters)
    logger('Resolve potential paralogs. {0} regions'.format(len(regions)))

    # submission
    alleles = blasttab_parser.form_alleles(regions, sequence,
                                           parameters['unique_key'],
                                           parameters['high_quality'],
                                           parameters)
    logger('Generate allelic sequences. {0} remains'.format(len(alleles)))
    #results = blasttab_parser.typing(alleles, parameters, dbname, scheme, submission=submission)
    return alleles
def read_matrix(fname):
    """Load a SNP matrix file and collapse identical site patterns.

    The file starts with optional '##' metadata lines ('## Constant_bases',
    '## Sequence_length:', '## Missing_region:'), followed by a column header
    and one tab-delimited row per site (sequence name, position, bases).

    Returns a tuple ``(names, sites, snps, seqLens, missing)``:
      names   -- titles of the sample columns
      sites   -- object array of [sequence, position, pattern id, categories]
      snps    -- object array of [pattern id, weight, codes, kind]; the
                 constant-base patterns are appended last with kind 0
      seqLens -- object array of [name, length]
      missing -- object array of [name, start, end]
    """
    # A dict, so that files without a '## Constant_bases' line still work:
    # the collection is consumed with .items() further down, which failed on
    # the former list default.
    invariant = {}
    seqLens, missing = [], []
    with uopen(fname) as fin:
        for line_id, line in enumerate(fin):
            if line.startswith('##'):
                if line.startswith('## Constant_bases'):
                    part = line[2:].strip().split()
                    # Keys are the character codes of A, C, G and T.
                    invariant = dict(
                        zip([65, 67, 71, 84], [float(v) for v in part[1:]]))
                elif line.startswith('## Sequence_length:'):
                    part = line[2:].strip().split()
                    seqLens.append([part[1], int(part[2])])
                elif line.startswith('## Missing_region:'):
                    part = line[2:].strip().split()
                    missing.append([part[1], int(part[2]), int(part[3])])
            elif line.startswith('#'):
                # Column header: sample columns are those not starting with
                # '#', weight columns those starting with '#!W'.
                part = np.array(line.strip().split('\t'))
                cols = np.where(
                    (1 - np.char.startswith(part, '#')).astype(bool))[0]
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
            else:
                # No '#' header line: everything after the first two columns
                # is treated as a sample.
                part = np.array(line.strip().split('\t'))
                cols = np.ones(part.shape, dtype=bool)
                cols[:2] = False
                w_cols = np.char.startswith(part, '#!W')
                names = part[cols]
                break

        bases, weights, sites = [], [], []
        for mat in pd.read_csv(fin, header=None, sep='\t',
                               usecols=cols.tolist() + w_cols.tolist() + [0, 1],
                               chunksize=10000, engine='c', dtype=str,
                               low_memory=False, na_filter=False):
            mat = mat.values
            logger('{0}\t{1}\t{2}\t{3}'.format(
                mat[0, 0], mat[0, 1],
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
                len(sites)))
            for m in mat:
                # '-' is prepended so that it is always one of the
                # categories; sites with fewer than two other categories
                # carry no information and are skipped.
                btype, bidx = np.unique(['-'] + m[cols].tolist(),
                                        return_inverse=True)
                if btype.size <= 2:
                    continue
                sites.append([m[0], int(m[1]), 1, np.array([])])
                weights.append(
                    m[w_cols].astype(float).prod() if w_cols.size else 1.)
                if '.' in btype or max(map(len, btype)) > 1:
                    # Categories that are not single characters are kept as
                    # indices, renumbered so that '-' becomes 0 (45 is only
                    # a temporary marker during the shift).
                    missing_val = np.where(btype == '-')[0][0]
                    bidx[bidx == missing_val] = 45
                    bidx[bidx < missing_val] += 1
                    bidx[bidx == 45] = 0
                    sites[-1][3] = np.array(
                        ['-'] + btype[btype != '-'].tolist())
                    sites[-1][2] = 2
                    bases.append(bidx[1:])
                else:
                    # Plain bases are stored as character codes.
                    bases.append(
                        np.array(list(map(ord, btype)),
                                 dtype=np.uint8)[bidx[1:]])

    bases, weights, sites = np.vstack(bases), np.array(weights), np.array(
        sites, dtype=object)

    # Sort the sites so that identical patterns are adjacent, then merge them
    # into a single pattern with summed weight.
    indices = np.lexsort(bases.T)
    snps = []
    for idx in indices:
        s, b, w = sites[idx], bases[idx], weights[idx]
        if not snps or np.any(b != snps[-1][2]):
            snps.append([len(snps), w, b, s[2]])
        else:
            snps[-1][1] += w
        # From here on the third field of a site is its pattern id.
        s[2] = snps[-1][0]

    # One constant pattern per base listed under '## Constant_bases'.
    for inv in invariant.items():
        b_key = np.array([inv[0]] * len(names), dtype=np.uint8)
        snps.append([len(snps), float(inv[1]), b_key, 0])
    for snp in snps:
        snp[1] = np.ceil(snp[1])
    return names, sites, np.array(snps, dtype=object), np.array(
        seqLens, dtype=object), np.array(missing, dtype=object)
def filt_genes(prefix, groups, global_file, conflicts, first_classes=None):
    # Select pan genes from candidate groups and write the accepted regions
    # to '<prefix>.Prediction'.
    #
    # prefix        -- prefix of the output file.
    # groups        -- dict: gene -> array of candidate regions; it is
    #                  re-sorted in place and emptied as genes are processed.
    # global_file   -- passed through to filt_per_group.
    # conflicts     -- array of rows (id, id, code); converted below into a
    #                  symmetric nested dict.
    # first_classes -- passed through to get_gene.
    # returns       -- name of the written file.
    #
    # NOTE(review): indentation was lost in this file; the nesting below is
    # reconstructed from the syntax and should be confirmed against upstream
    # wherever a note says so.

    # Columns of an alignment row that are written to the output.
    outPos = np.ones(16, dtype=bool)
    outPos[[3, 4, 5, 10, 15]] = False
    c2 = {c: {} for c in np.unique(conflicts.T[:2])}
    for c in conflicts:
        c2[c[0]][c[1]] = c2[c[1]][c[0]] = c[2]
    conflicts = c2
    clust_ref = readFasta(params['clust'])
    for gene, g in groups.items():
        # Column 2 becomes column 2 * column 3; rows are sorted by it in
        # decreasing order (stable sort).
        g.T[2] *= g.T[3]
        g[:] = g[np.argsort(-g.T[2], kind='mergesort')]
    used, results, run = {}, {}, {}
    group_id = 0
    with open('{0}.Prediction'.format(prefix), 'w') as fout:
        while len(groups) > 0:
            # Next batch of candidates; rows are unpacked below as
            # (gene, score, rank).
            genes = get_gene(groups, first_classes, cnt=50)
            if len(genes) <= 0:
                continue
            to_run, to_run_id, min_score, min_rank = [], [], genes[-1][
                1], genes[0][2]
            genes = {gene: score for gene, score, min_rank in genes}
            if params['orthology'] in ('ml', 'nj'):
                for gene, score in genes.items():
                    if gene not in run:
                        mat = groups[gene]
                        # Score of every region relative to the first (best)
                        # region with the same value in column 1.
                        _, bestPerGenome, matInGenome = np.unique(
                            mat.T[1], return_index=True, return_inverse=True)
                        region_score = mat.T[2] / mat[
                            bestPerGenome[matInGenome], 2]
                        if region_score.size >= bestPerGenome.size * 2:
                            # Drop regions that conflict with a better-ranked
                            # region of the same gene.
                            used2, kept = set([]), np.ones(mat.shape[0], dtype=bool)
                            for id, m in enumerate(mat):
                                if m[5] in used2:
                                    kept[id] = False
                                else:
                                    used2.update(conflicts.get(m[5], {}))
                            mat = mat[kept]
                            _, bestPerGenome, matInGenome = np.unique(
                                mat.T[1], return_index=True, return_inverse=True)
                            region_score = mat.T[2] / mat[
                                bestPerGenome[matInGenome], 2]
                        if region_score.size > bestPerGenome.size * 3 and len(
                                region_score) > 500:
                            # Large groups: keep only the regions whose
                            # relative score reaches a cut-off.
                            region_score2 = sorted(region_score, reverse=True)
                            cut = region_score2[bestPerGenome.size * 3 - 1]
                            if cut >= params['clust_identity']:
                                cut = min(
                                    region_score2[bestPerGenome.size * 5]
                                    if len(region_score) > bestPerGenome.size * 5
                                    else params['clust_identity'],
                                    1.0 - 0.6 * (1.0 - params['clust_identity']))
                            mat = mat[region_score >= cut]
                        to_run.append([mat, clust_ref[mat[0][0]], global_file])
                        to_run_id.append(gene)
                # `pool` is a name defined outside this function.
                working_groups = pool.map(filt_per_group, to_run)
                #working_groups = [filt_per_group(d) for d in to_run]
                for gene, working_group in zip(to_run_id, working_groups):
                    groups[gene] = working_group
                    run[gene] = 1
            else:
                # NOTE(review): this `else` is assumed to pair with the
                # `params['orthology']` test above.  Read that way, `mat` is
                # not assigned in this branch (it would be unbound on the
                # first pass) and the results computed here are not used
                # afterwards -- confirm against upstream.
                _, bestPerGenome, matInGenome = np.unique(mat.T[1], return_index=True, return_inverse=True)
                region_score = mat.T[2] / mat[bestPerGenome[matInGenome], 2]
                mat[:] = mat[region_score >= params['clust_identity']]
                used2, kept = set([]), np.ones(mat.shape[0], dtype=bool)
                for id, m in enumerate(mat):
                    for mmm in m[6]:
                        if mmm[15] in used2:
                            kept[id] = False
                            break
                    if kept[id]:
                        used2 |= {mmm[15] for mmm in m[6]}
                mat = mat[kept]
                _, bestPerGenome, matInGenome = np.unique(mat.T[1], return_index=True, return_inverse=True)
            while len(genes):
                # Pick the candidate with the highest summed score, counting
                # one region per distinct value in column 1.
                score, gene = max([[
                    np.sum(groups[gene][np.unique(groups[gene].T[1], return_index=True)[1]].T[2]), gene
                ] for gene in genes])
                if score < min_score:
                    break
                mat = groups.pop(gene, [])
                genes.pop(gene)
                paralog, paralog2 = 0, 0
                supergroup = {}
                used2 = {}
                for m in mat:
                    gid = m[5]
                    conflict = used.get(gid, None)
                    if conflict is not None:
                        if not isinstance(conflict, int):
                            # The region was claimed via another gene: count
                            # the pan gene recorded for that gene.
                            superC = results[conflict]
                            supergroup[superC] = supergroup.get(superC, 0) + 1
                        elif conflict > 0:
                            if m[6].shape[0] <= 1 and m[3] >= params[
                                    'clust_identity']:
                                paralog = 1
                                break
                            else:
                                paralog2 += 1
                        # NOTE(review): assumed to apply to every region
                        # already present in `used` -- confirm the nesting.
                        m[3] = -1
                    else:
                        for g2, gs in conflicts.get(gid, {}).items():
                            if gs == 1:
                                if g2 not in used:
                                    used2[g2] = m[0]
                            elif gs == 2:
                                used2[g2] = 1
                            else:
                                used[g2] = 0
                # Skip the gene when it hit a definite paralog or when at
                # least a third of its regions were counted in paralog2.
                if paralog or paralog2 * 3 >= mat.shape[0]:
                    continue
                else:
                    used.update(used2)
                pangene = mat[0][0]
                if len(supergroup):
                    # Reuse the most frequent existing pan gene when enough
                    # regions point to it.
                    pg, pid = max(supergroup.items(), key=itemgetter(1))
                    if pid * 3 >= mat.shape[0] or (pid * 5 >= mat.shape[0] and pid > 1):
                        pangene = pg
                results[mat[0][0]] = pangene
                logger(
                    '{4} / {5}: pan gene "{3}" : "{0}" picked from rank {1} and score {2}'
                    .format(mat[0][0], min_rank, score, pangene, len(results), len(groups) + len(results)))
                # Regions marked with -1 in column 3 are not written.
                for grp in mat[mat.T[3] > 0]:
                    group_id += 1
                    for g in grp[6]:
                        fout.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            pangene, min_rank, group_id, grp[1], '\t'.join(g[outPos].astype(str).tolist())))
    return '{0}.Prediction'.format(prefix)
def runDiamond(self, ref, qry, nhits=10, frames='7'):
    """Align translated references and queries with DIAMOND.

    ref, qry -- sequence files, loaded with ``readFastq`` only when
                ``self.refSeq`` / ``self.qrySeq`` are still empty.
    nhits    -- value of DIAMOND's ``-k`` option.
    frames   -- frame selection passed to ``transeq`` for the references.

    Returns the stacked arrays of all result files reported by
    ``parseDiamond``.
    """
    logger('Run diamond starts')
    ref_aa_base = os.path.join(self.dirPath, 'refAA')
    qry_aa_file = os.path.join(self.dirPath, 'qryAA')
    match_base = os.path.join(self.dirPath, 'aaMatch')
    n_chunk = 5

    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    # One translation per query: the frame that splits into the fewest
    # pieces at 'X' (its last character ignored); ties go to the lowest
    # frame, then to the smallest sequence.
    qry_proteins = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
    with open(qry_aa_file, 'w') as fout:
        for name, translations in sorted(qry_proteins.items()):
            candidates = [(len(aa[:-1].split('X')), frame_no, aa)
                          for frame_no, aa in enumerate(translations)]
            _, frame_no, best = min(candidates)
            fout.write('>{0}:{1}\n{2}\n'.format(name, frame_no + 1, best))

    makedb_cmd = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
        diamond=diamond, qryAA=qry_aa_file)
    Popen(makedb_cmd.split(), stderr=PIPE, stdout=PIPE,
          universal_newlines=True).communicate()

    # References are cut into pieces of at least 1000 residues ending at an
    # 'X' (the remainder forms the last piece); every piece records its
    # offset within the translation.
    ref_proteins = transeq(self.refSeq, frames, transl_table=self.table_id)
    records = []
    for name, translations in sorted(ref_proteins.items()):
        for frame_no, aa in enumerate(translations):
            pieces = re.findall('.{1000,}?X|.{1,1000}$', aa + 'X')
            pieces[-1] = pieces[-1][:-1]
            offsets = np.cumsum([0] + [len(piece) for piece in pieces[:-1]])
            for offset, piece in zip(offsets, pieces):
                if len(piece):
                    records.append('>{0}:{1}:{2}\n{3}\n'.format(
                        name, frame_no + 1, offset, piece))

    # The pieces are spread round-robin over the chunks, one run per chunk.
    for chunk in xrange(n_chunk):
        chunk_file = '{0}.{1}'.format(ref_aa_base, chunk)
        with open(chunk_file, 'w') as fout:
            for record in records[chunk::n_chunk]:
                fout.write(record)
        diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
            diamond=diamond,
            refAA=chunk_file,
            qryAA=qry_aa_file,
            aaMatch='{0}.{1}'.format(match_base, chunk),
            n_thread=self.n_thread,
            min_id=self.min_id * 100.,
            nhits=nhits,
            min_ratio=self.min_ratio * 100.)
        Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE,
              universal_newlines=True).communicate()

    jobs = [['{0}.{1}'.format(match_base, chunk), self.refSeq, self.qrySeq,
             self.min_id, self.min_cov, self.min_ratio]
            for chunk in xrange(n_chunk)]
    tables = []
    for result_file in self.pool.imap_unordered(parseDiamond, jobs):
        if result_file is not None:
            tables.append(np.load(result_file, allow_pickle=True))
            os.unlink(result_file)
    blastab = np.vstack(tables)
    logger('Run diamond finishes. Got {0} alignments'.format(
        blastab.shape[0]))
    return blastab
def init_cleanup(self, reads):
    """Merge the runs of every library and trim them with BBDuk.

    reads -- list of libraries; each library is a list of runs and each run
             holds one file name (single-end) or two (paired-end).

    For every library the runs are concatenated into
    ``<prefix>.0.<lib>.{1,2}.fastq.gz``.  Unless ``parameters['noTrim']`` is
    set, BBDuk is run on them (killed after one hour) and writes
    ``<prefix>.1.<lib>.{1,2,3}.fastq.gz``.  When BBDuk fails, the merged but
    untrimmed files are used instead.

    Returns one list of file names per library.
    """
    def remove_quietly(fnames):
        # Best-effort removal: a file that cannot be deleted is not fatal.
        for fname in fnames:
            try:
                os.unlink(fname)
            except OSError:
                pass

    prefix = parameters['prefix']
    new_reads = []
    for lib_id, library in enumerate(reads):
        library_file = ['{0}.0.{1}.1.fastq.gz'.format(prefix, lib_id)]
        Popen('cat {0} > {1}'.format(' '.join([run[0] for run in library]),
                                     library_file[0]), shell=True).wait()
        if len(library[0]) > 1:
            library_file.append('{0}.0.{1}.2.fastq.gz'.format(
                prefix, lib_id))
            Popen('cat {0} > {1}'.format(
                ' '.join([run[1] for run in library]), library_file[1]),
                shell=True).wait()

        if len(library_file) == 1:
            bb_input = 'in=' + library_file[0]
            library_file2 = ['{0}.1.{1}.1.fastq.gz'.format(prefix, lib_id)]
            outputs = 'out=' + library_file2[0]
        else:
            bb_input = 'in=' + library_file[0] + ' in2=' + library_file[1]
            library_file2 = [
                '{0}.1.{1}.1.fastq.gz'.format(prefix, lib_id),
                '{0}.1.{1}.2.fastq.gz'.format(prefix, lib_id),
                '{0}.1.{1}.3.fastq.gz'.format(prefix, lib_id)
            ]
            outputs = 'out=' + library_file2[0] + ' out2=' + library_file2[
                1] + ' outs=' + library_file2[2]

        if parameters['noTrim'] == False:
            # universal_newlines=True makes communicate() return text, so
            # the str regular expressions below can match the BBDuk report.
            # Without it stderr is bytes on Python 3 and the statistics were
            # silently dropped.
            bb_run = Popen('{bbduk} -Xmx{memory} threads=8 rref={adapters} overwrite=t qout=33 k=23 mink=13 minlength=23 tbo=t entropy=0.75 entropywindow=25 mininsert=23 maxns=2 ktrim=r trimq={read_qual} {read} {outputs}'.format(
                read=bb_input, outputs=outputs, **parameters).split(),
                stdout=PIPE, stderr=PIPE, universal_newlines=True)
            timer = Timer(3600, kill_child_proc, [bb_run])
            try:
                timer.start()
                bb_out = bb_run.communicate()
            finally:
                timer.cancel()

            if bb_run.returncode == 0:
                new_reads.append(library_file2)
                remove_quietly(library_file)
                stat = re.findall(
                    r'Result:\s+(\d+) reads .+\s+(\d+) bases', bb_out[1])
                if stat:
                    logger('Obtained {1} bases in {0} reads after BBDuk2'.
                           format(*stat[0]))
            else:
                new_reads.append(library_file)
                stat = re.findall(
                    r'Input:\s+(\d+) reads .+\s+(\d+) bases', bb_out[1])
                if stat:
                    logger(
                        'BBDuk2 failed! Use original reads with {1} bases in {0} reads'
                        .format(*stat[0]))
                # Partial outputs are removed even when the report could not
                # be parsed.
                remove_quietly(library_file2)
        else:
            new_reads.append(library_file)
    return new_reads
def hierCC(args):
    """Assign hierarchical clusters (hierCC) to allelic profiles.

    ``args`` is parsed with ``get_args``; the fields read here are
    ``profile``, ``output``, ``incremental``, ``immutable`` and ``delta``.

    The profile table is loaded into the module-level global ``mat`` (first
    column = type id, remaining columns = loci whose title does not start
    with '#') and ``n_loci`` is set accordingly.  Types already present in
    the ``incremental`` file keep their stored rows; the remaining types are
    processed in batches of 100 with ``get_distance``/``assignment`` (or the
    ``...2`` variants when ``params.immutable`` is set).

    Output files:
      <output>.npz       -- array ``hierCC``
      <output>.hierCC.gz -- tab-delimited text, all levels or only those
                            listed in ``params.delta``
    """
    global mat, n_loci
    params = get_args(args)
    ot = time.time()
    profile_file, cluster_file, old_cluster = \
        params.profile, params.output + '.npz', params.incremental

    mat = pd.read_csv(profile_file, sep='\t', header=None, dtype=str).values
    allele_columns = np.array(
        [i == 0 or (not h.startswith('#')) for i, h in enumerate(mat[0])])
    mat = mat[1:, allele_columns].astype(int)
    n_loci = mat.shape[1] - 1
    logger(
        '{0}: Loaded in allelic profiles with dimension: {1} and {2}. The first column is assumed to be type id.'
        .format(time.time() - ot, *mat.shape))

    if not params.immutable:
        # Process the most complete profiles first: rows are ordered by
        # their number of non-positive entries (stable sort).
        absence = np.sum(mat <= 0, 1)
        mat = mat[np.argsort(absence, kind='mergesort')]

    if os.path.isfile(old_cluster):
        od = np.load(old_cluster, allow_pickle=True)
        cls = od['hierCC']
        typed = {c[0]: i for i, c in enumerate(cls) if c[0] > 0}
        if len(typed) > 0:
            logger('{0}: Loaded in {1} old hierCC assignments.'.format(
                time.time() - ot, len(typed)))
            # Move the previously assigned types to the front.
            mat_idx = np.array([t in typed for t in mat.T[0]])
            mat[:] = np.vstack([mat[mat_idx], mat[(mat_idx) == False]])
    else:
        typed = {}

    logger('{0}: Start hierCC assignments'.format(time.time() - ot))
    # The pool is created after ``mat`` is set; the workers presumably read
    # it as a global -- confirm in get_distance.
    pool = Pool(10)

    res = np.repeat(mat.T[0], mat.shape[1]).reshape(mat.shape)
    res[1:, 0] = n_loci + 1
    for index in xrange(0, mat.shape[0], 100):
        to_run = []
        for idx in np.arange(index, index + 100):
            if idx < mat.shape[0]:
                if mat[idx, 0] in typed:
                    # Known type: reuse its stored assignment.
                    res[idx, :] = cls[typed[mat[idx, 0]], :]
                else:
                    to_run.append(idx)
        if len(to_run) == 0:
            continue
        if not params.immutable:
            dists = np.vstack(pool.map(get_distance, to_run))
            assignment(dists, res)
        else:
            dists = np.vstack(pool.map(get_distance2, to_run))
            assignment2(dists, res)
        logger('{0}: Assigned {1} of {2} types into hierCC.'.format(
            time.time() - ot, index, mat.shape[0]))

    # The first column served as work space above; restore the type ids.
    res.T[0] = mat.T[0]
    np.savez_compressed(cluster_file, hierCC=res)

    if not params.delta:
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(i) for i in np.arange(n_loci)])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(rr) for rr in r]) + '\n')
    else:
        # A list, not a bare map(): the levels are iterated once for the
        # header and again for every row, and a Python 3 map object is
        # exhausted after the first pass (every row used to be empty).
        deltas = [int(d) for d in params.delta.split(',')]
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(
                ['d' + str(i) for i in deltas])))
            # NOTE(review): the header announces an ST_id column, but the
            # rows hold only the selected levels -- confirm whether r[0]
            # should be written as well.
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(r[i + 1]) for i in deltas]) + '\n')
    del res
    logger('NUMPY clustering result (for incremental hierCC): {0}.npz'.format(
        params.output))
    logger('TEXT clustering result (for visual inspection): {0}.hierCC.gz'.
           format(params.output))
def do_polish(self, reference, reads, reassemble=False, onlySNP=False):
    """Polish ``reference`` with Pilon using the mapped ``reads``.

    When ``parameters['SNP']`` is set the work is delegated to
    ``do_polish_with_SNPs``.  Otherwise the reads are mapped, Pilon is run on
    the contigs that received any coverage, and the accepted changes are
    applied to the sequences.

    reassemble -- also ask Pilon to fix 'breaks'.
    onlySNP    -- accept only single-base substitutions.

    Returns the path of the polished FASTA file (``<prefix>.fasta``).
    """
    if parameters.get('SNP', None) is not None:
        return self.do_polish_with_SNPs(reference, parameters['SNP'])

    if parameters['mapper'] == 'minimap2':
        bams = self.__run_minimap(prefix, reference, reads)
    elif parameters['mapper'] != 'bwa':
        bams = self.__run_bowtie(prefix, reference, reads)
    else:
        bams = self.__run_bwa(prefix, reference, reads)

    # Names of the contigs with at least one covered site.
    covered = {}
    for bam in bams:
        if bam is None:
            continue
        depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
            bam=bam, **parameters).split(),
            stdout=PIPE, universal_newlines=True)
        for line in depth.stdout:
            fields = line.strip().split()
            if len(fields) > 2 and float(fields[2]) > 0:
                covered[fields[0]] = 1

    sequence = readFasta(reference)
    sequence = {name: seq for name, seq in sequence.items()
                if name in covered}

    vcf_file = '{0}.mapping.vcf'.format(prefix)
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for name, seq in sorted(sequence.items()):
            wrapped = '\n'.join([seq[start:(start + 100)]
                                 for start in xrange(0, len(seq), 100)])
            fout.write('>{0}\n{1}\n'.format(name, wrapped))

    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    if reassemble:
        pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
    else:
        pilon_cmd = '{pilon} --fix all --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
    Popen(pilon_cmd.split(), stdout=PIPE, stderr=PIPE,
          universal_newlines=True).communicate()
    if not os.path.isfile(vcf_file):
        # No VCF was produced: retry with an explicit list of fixes.
        pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
            bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(), stdout=PIPE, stderr=PIPE,
              universal_newlines=True).communicate()

    # Collect the accepted changes; each one is also copied to the
    # .mapping.changes file.
    snps = []
    with open(vcf_file) as fin, \
            open('{0}.mapping.changes'.format(prefix), 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if part[-1] == '0/0':
                continue
            try:
                trusted = part[6] == 'PASS' or float(part[7][-4:]) >= 0.75
                if not (trusted and re.match(r'^[ACGTN]+$', part[4])):
                    continue
                if onlySNP and not (len(part[3]) == 1 and len(part[4]) == 1):
                    continue
                snps.append([part[0], int(part[1]) - 1, part[3], part[4]])
                fout.write(line)
            except:
                pass
    os.unlink(vcf_file)

    # Apply the changes from the last to the first, so that an earlier
    # coordinate is not shifted by a length-changing edit behind it.
    sequence = {name: list(seq) for name, seq in sequence.items()}
    for name, site, ori, alt in reversed(snps):
        bases = sequence[name]
        bases[site:site + len(ori)] = alt
    logger('Observed and corrected {0} changes using PILON'.format(len(snps)))

    with open('{0}.fasta'.format(prefix), 'w') as fout:
        for name, bases in sorted(sequence.items()):
            seq = ''.join(bases)
            wrapped = '\n'.join([seq[start:(start + 100)]
                                 for start in xrange(0, len(seq), 100)])
            fout.write('>{0}\n{1}\n'.format(name, wrapped))
    return '{0}.fasta'.format(prefix)
def runUBlast(self, ref, qry, nhits=6, frames='7'):
    """Align translated query sequences against translated references with uBLAST.

    The query is translated (``transeq(..., frame='F')``) and, per sequence,
    the frame with the fewest 'X'-separated segments is written to the query
    file. The reference is translated in `frames`, split into batches, and
    each batch is searched separately with usearch. Amino-acid coordinates
    in the results are converted back to nucleotide coordinates.

    Parameters:
        ref: reference file, read with ``readFastq`` if ``self.refSeq`` is empty.
        qry: query file, read with ``readFastq`` if ``self.qrySeq`` is empty.
        nhits: value passed to usearch ``-maxhits``.
        frames: frame specification passed to ``transeq`` for the reference.

    Returns:
        A pandas DataFrame that concatenates the parsed hits of all batches.
    """
    logger('Run uBLAST starts')

    def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        # Parse one usearch -userout table and rescale it to nucleotides.
        blastab = pd.read_csv(fin, sep='\t', header=None)
        blastab[2] /= 100.
        blastab = blastab[blastab[2] >= min_id]
        blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3
        # Names were written as '<name>:<frame>'; split the frame off again.
        qf, rf = blastab[0].str.rsplit(
            ':', 1, expand=True), blastab[1].str.rsplit(':', 1, expand=True)
        if np.all(qf[0].str.isdigit()):
            qf[0] = qf[0].astype(int)
        if np.all(rf[0].str.isdigit()):
            rf[0] = rf[0].astype(int)
        blastab[0], qf = qf[0], qf[1].astype(int)
        blastab[1], rf = rf[0], rf[1].astype(int)
        blastab[6], blastab[7] = blastab[6] * 3 + qf - 3, blastab[7] * 3 + qf - 1
        blastab[14] = [[[3 * vv[0], vv[1]] for vv in v]
                       for v in map(getCIGAR, zip(blastab[15], blastab[14]))]
        blastab[12], blastab[13] = blastab[0].apply(
            lambda x: len(qryseq[str(x)])), blastab[1].apply(
                lambda x: len(refseq[str(x)]))
        # Frames 1-3 are handled as forward, larger frame numbers as reverse.
        rf3 = (rf <= 3)
        blastab.loc[rf3, 8], blastab.loc[rf3, 9] = blastab.loc[rf3, 8] * 3 + rf[
            rf3] - 3, blastab.loc[rf3, 9] * 3 + rf[rf3] - 1
        blastab.loc[~rf3, 8], blastab.loc[~rf3, 9] = blastab.loc[~rf3, 13] - (
            blastab.loc[~rf3, 8] * 3 + rf[~rf3] - 3 - 3) + 1, blastab.loc[
                ~rf3, 13] - (blastab.loc[~rf3, 9] * 3 + rf[~rf3] - 3 - 1) + 1
        # Trim alignments that run past the sequence ends after rescaling.
        d = np.max([
            blastab[7] - blastab[12], blastab[9] - blastab[13],
            1 - blastab[9],
            np.zeros(blastab.shape[0], dtype=int)
        ], axis=0)
        blastab[7] -= d

        def ending(x, y):
            x[-1][0] -= y

        np.vectorize(ending)(blastab[14], d)
        d[~rf3] *= -1
        blastab[9] -= d
        blastab = blastab[
            (blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12])
            & (blastab[7] - blastab[6] + 1 >= min_cov)]
        return blastab.drop(columns=[15, 16])

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    qryAASeq = transeq(self.qrySeq, frame='F')
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            _, id, s = min([(len(s[:-1].split('X')), id, s)
                            for id, s in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, id + 1, s))

    refAASeq = transeq(self.refSeq, frames)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for id, s in enumerate(ss):
            toWrite.append('>{0}:{1}\n{2}\n'.format(n, id + 1, s))

    # The slice stride must equal the number of batches. A stride of 4 with
    # 5 batches searched records 4, 8, 12, ... twice and duplicated their hits.
    n_batch = 5
    blastab = []
    for id in xrange(n_batch):
        with open(refAA, 'w') as fout:
            for line in toWrite[id::n_batch]:
                fout.write(line)
        ublast_cmd = '{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} -mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} -userout {aaMatch} -ka_dbsize 5000000 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(
            usearch=usearch,
            refAA=refAA,
            qryAA=qryAA,
            aaMatch=aaMatch,
            n_thread=self.n_thread,
            min_id=self.min_id * 100.,
            nhits=nhits,
            min_ratio=self.min_ratio)
        Popen(ublast_cmd.split(), stderr=PIPE, stdout=PIPE,
              universal_newlines=True).communicate()
        if os.path.getsize(aaMatch) > 0:
            # Close the result file deterministically after parsing.
            with open(aaMatch) as fin:
                blastab.append(
                    parseUBlast(fin, self.refSeq, self.qrySeq, self.min_id,
                                self.min_cov, self.min_ratio))
    blastab = pd.concat(blastab)
    logger('Run uBLAST finishes. Got {0} alignments'.format(
        blastab.shape[0]))
    return blastab
def get_quality(self, reference, reads):
    """Map `reads` to `reference`, run Pilon and write a FASTQ with per-site qualities.

    Reads are mapped with bowtie or bwa depending on ``parameters['mapper']``,
    per-contig depth is taken from ``samtools depth``, Pilon is run with
    ``--fix all,breaks --vcf``, and the resulting VCF is used to set per-base
    quality characters. Contigs below the lower ``cont_depth`` bound are not
    written.

    Returns:
        The path ``'{prefix}.result.fastq'``.

    NOTE(review): this block uses Python 2 idioms (``dict.iteritems``,
    indexing ``dict.values()`` through ``np.array``) — confirm the target
    interpreter before reuse.
    """
    if parameters['mapper'] != 'bwa':
        bams = self.__run_bowtie(reference, reads)
    else:
        bams = self.__run_bwa(reference, reads)
    # Each entry is used below as [sequence, quality]; the quality part is
    # turned into a list so single positions can be overwritten.
    sequence = readFasta(filename=reference, qual=0)
    for n, s in sequence.iteritems():
        s[1] = list(s[1])
    # One depth counter per position of every contig.
    sites = {
        n: np.array([0 for ss in s[1]])
        for n, s in sequence.iteritems()
    }
    for bam in bams:
        if bam is not None:
            depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **parameters).split(), stdout=PIPE).communicate()[0]
            for line in depth.split('\n'):
                part = line.strip().split()
                if len(part) > 2 and float(part[2]) > 0:
                    sites[part[0]][int(part[1]) - 1] += float(part[2])
    # Per contig: [length, mean depth, relative depth (filled in below)].
    sites = {n: [s.size, np.mean(s), 0.] for n, s in sites.iteritems()}
    depth = np.array(sites.values())
    # Longest contigs first.
    depth = depth[np.argsort(-depth.T[0])]
    size = np.sum(depth.T[0])
    # Length-weighted mean depth over the longest contigs that together
    # cover at least half of the total length.
    acc = [0, 0]
    for d in depth:
        acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0] * d[1]
        if acc[0] * 2 >= size:
            break
    ave_depth = acc[1] / acc[0]
    exp_mut_depth = max(ave_depth * 0.2, 1.)
    for n, s in sites.iteritems():
        s[2] = s[1] / ave_depth
    logger('Average read depth: {0}'.format(ave_depth))
    logger('Sites with over {0} or 15% unsupported reads is not called'.
           format(exp_mut_depth))
    # Drop contigs without any mapped read.
    sequence = {n: s for n, s in sequence.iteritems() if sites[n][1] > 0.}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([
                    s[0][site:(site + 100)]
                    for site in range(0, len(s[0]), 100)
                ])))
    bam_opt = ' '.join(
        ['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        bam_opt=bam_opt, **parameters)
    Popen(pilon_cmd.split(), stdout=PIPE).communicate()
    # Two comma-separated relative-depth bounds: [lower, upper].
    cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
    logger(
        'Contigs with less than {0} depth will be removed from the assembly'
        .format(cont_depth[0] * ave_depth))
    logger(
        'Contigs with more than {0} depth will be treated as duplicates'.
        format(cont_depth[1] * ave_depth))
    with open('{0}.mapping.vcf'.format(prefix)) as fin, open(
            '{0}.mapping.difference'.format(prefix), 'w') as fout:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            # Skip contigs outside the accepted relative-depth window.
            if sites[part[0]][2] < cont_depth[0] or sites[
                    part[0]][2] >= cont_depth[1]:
                continue
            # NOTE(review): the if/else nesting below was reconstructed from
            # whitespace-collapsed source — confirm against the original file.
            try:
                if part[-1] == '0/0' and len(part[3]) == 1 and len(
                        part[4]) == 1:
                    # Depth is read from the first INFO field, the allele
                    # fraction from the last four characters of INFO.
                    dp, af = float(part[7].split(';', 1)[0][3:]), float(
                        part[7][-4:])
                    if af < 0.15 and dp >= 3 and dp * af <= exp_mut_depth:
                        if part[6] == 'PASS' or (part[6] == 'LowCov' and
                                                 parameters['metagenome']):
                            site = int(part[1]) - 1
                            # Fifth INFO field as a Phred+33 character.
                            qual = chr(int(part[7].split(';')[4][3:]) + 33)
                            sequence[part[0]][1][site] = qual
                        else:
                            fout.write(line)
                    else:
                        fout.write(line)
            except:
                # Any record that cannot be parsed is logged as a difference.
                fout.write(line)
    if self.snps is not None:
        # Raise the quality around known SNPs to at least Phred 40.
        for n, snvs in self.snps.iteritems():
            for site, snv in snvs:
                if snv.find('N') >= 0: continue
                if snv.startswith('+'):
                    s, e = site - 4, site + 3 + len(snv)
                else:
                    s, e = site - 4, site + 4
                for k in range(s, e):
                    sequence[n][1][k] = max(chr(40 + 33), sequence[n][1][k])
    with open('{0}.result.fastq'.format(prefix), 'w') as fout:
        for n, (s, q) in sequence.iteritems():
            if sites[n][2] >= cont_depth[0]:
                fout.write('@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format(
                    n, s, ''.join(q), *sites[n]))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    logger('Final result is written into {0}'.format(
        '{0}.result.fastq'.format(prefix)))
    return '{0}.result.fastq'.format(prefix)
def read_matrix(fname) :
    """Read a tab-delimited SNP matrix and collapse identical site patterns.

    Header lines starting with '##' may carry constant-base counts,
    sequence lengths and missing regions. The first line starting with a
    single '#' (or the first non-comment line) defines the columns. The
    remaining rows are read in chunks of 10000.

    Returns:
        A tuple ``(names, sites, patterns, seqLens, missing)`` where
        ``sites`` holds ``[col0, col1, pattern_id, categories]`` per row and
        ``patterns`` is a sorted list of
        ``[pattern_id, ceil(weight), uint8 pattern, type_flag]``.
    """
    sites, snps = [], {}
    invariant = []
    seqLens, missing = [], []

    # Lookup table over byte values: anything not listed maps to 45 ('-').
    # 'A', 'C', 'G', 'T' and '-' map to themselves; '.', '+', '*' and the
    # empty string map to the empty string's code.
    validate = np.repeat(45, 256).astype(np.uint8)
    validate[np.array(['A', 'C', 'G', 'T', '.', '+', '-', '', '*']).view(asc2int)] = np.array(['A', 'C', 'G', 'T', '', '', '-', '', '']).view(asc2int)

    with uopen(fname) as fin :
        for line_id, line in enumerate(fin) :
            if line.startswith('##'):
                if line.startswith('## Constant_bases') :
                    part = line[2:].strip().split()
                    # Pair the ASCII codes of A, C, G, T with their counts.
                    invariant = list(zip([65, 67, 71, 84], [float(v) for v in part[1:]]))
                elif line.startswith('## Sequence_length:') :
                    part = line[2:].strip().split()
                    seqLens.append([part[1], int(part[2])])
                elif line.startswith('## Missing_region:') :
                    part = line[2:].strip().split()
                    missing.append([part[1], int(part[2]), int(part[3])])
            elif line.startswith('#') :
                part = np.array(line.strip().split('\t'))
                # Sample columns are those whose header does not start
                # with '#'; weight columns start with '#!W'.
                cols = np.where((1 - np.char.startswith(part, '#')).astype(bool))[0]
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
            else :
                # NOTE(review): in this branch cols and w_cols are boolean
                # masks rather than index arrays — confirm that the
                # usecols and indexing below behave as intended here.
                part = np.array(line.strip().split('\t'))
                cols = np.ones(part.shape, dtype=bool)
                cols[:2] = False
                w_cols = np.char.startswith(part, '#!W')
                names = part[cols]
                break
        for mat in pd.read_csv(fin, header=None, sep='\t', usecols=cols.tolist() + w_cols.tolist() + [0,1], chunksize=10000) :
            mat = mat.values
            # Progress line: first two columns of the chunk, peak RSS, counts.
            logger('{0}\t{1}\t{2}\t{3}\t{4}'.format(\
                mat[0, 0], mat[0, 1], \
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, len(sites), len(snps)))

            # View each cell's characters as integers and validate them.
            bk = validate[mat[:, cols].astype('str').view(asc2int)].reshape(mat.shape[0], -1, cols.shape[0])
            bk = np.moveaxis(bk, 1, 2)
            if bk.shape[2] > 1 :
                # Cells longer than one character whose first character
                # validated to '-' are flagged with 0.
                bk[(bk[:, :, 1] != 0) & (bk[:, :, 0] == 45), 0] = 0
            b_keys = bk[:, :, 0]
            weights = mat[:, w_cols].astype(float).prod(1) if w_cols.size else np.ones(mat.shape[0], dtype=float)
            for (b_key, site, w) in zip(b_keys, mat, weights) :
                b_key = tuple(b_key)
                if min(b_key) == 0 :
                    # Row has flagged cells: re-encode it as indices into
                    # its sorted unique values, with '' appended so the
                    # missing category is always present.
                    bk2 = np.concatenate([site[cols], ['']])
                    bk2[bk2 == '-'] = ''
                    category, b_key = np.unique(bk2, return_inverse=True)
                    if category[0] == '' :
                        category[0] = '-'
                    b_key = tuple(b_key[:-1].tolist())
                else :
                    category = []
                if b_key in snps :
                    snps[b_key][2] += w
                elif min(b_key) >= 45 :
                    snps[b_key] = [len(snps), 1, w]
                else :
                    snps[b_key] = [len(snps), 2, w]
                # NOTE(review): nesting reconstructed from whitespace-
                # collapsed source — confirm this check runs for every row.
                if snps[b_key][1] > 0 :
                    sites.append([ site[0], site[1], snps[b_key][0], np.array(category) ])

    # Add the constant (invariant) sites as patterns with type flag 0.
    for inv in invariant :
        b_key = tuple([inv[0]] * len(names))
        if b_key not in snps :
            snps[b_key] = [len(snps), 0, float(inv[1])]
        else :
            snps[b_key][2] += float(inv[1])
    return names, sites, sorted([[info[0], int(math.ceil(info[2])), np.array(line, dtype=np.uint8), info[1]] for line, info in snps.items() ]), seqLens, missing
def buildReference(targets, sources, max_iden=0.9, min_iden=0.6, coverage=0.7, paralog=0.1, relaxEnd=False) :
    """Select exemplar alleles from `targets` using minimap and MMseqs2 searches.

    `targets` and `sources` are iterables of dicts with the keys
    'fieldname', 'value_id' and 'value'; each is written as a FASTA record
    named '<fieldname>_<value_id>'. All intermediate files live in a
    temporary 'NS_*' directory that is always removed.

    Returns:
        The selected FASTA lines joined with newlines.

    NOTE(review): any exception inside the pipeline is swallowed by the
    bare ``except`` below, in which case whatever is in ``refsets`` at that
    point (usually nothing) is returned — confirm this best-effort
    behaviour is intended.
    """
    # Locus name -> index of its first occurrence in `sources`
    # (iterating in reverse lets earlier entries overwrite later ones).
    orderedLoci = { t['fieldname']:i for i, t in reversed(list(enumerate(sources))) }
    refsets = []
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    try:
        tmpDir = os.path.join(dirPath, 'tmp')
        sourceFna = os.path.join(dirPath, 'sourceFna')
        sourceFaa = os.path.join(dirPath, 'sourceFaa')
        targetFna = os.path.join(dirPath, 'targetFna')
        targetFiltFna = os.path.join(dirPath, 'targetFiltFna')
        targetFiltFaa = os.path.join(dirPath, 'targetFiltFaa')
        targetClsFna = os.path.join(dirPath, 'targetClsFna')
        targetResFna = os.path.join(dirPath, 'targetResFna')
        alnFaa = os.path.join(dirPath, 'alnFaa')
        with open(sourceFna+'.fas', 'w') as fout :
            fout.write('\n'.join(['>{fieldname}_{value_id}\n{value}'.format(**s) for s in sources]))
        with open(targetFna+'.fas', 'w') as fout :
            fout.write('\n'.join(['>{fieldname}_{value_id}\n{value}'.format(**t) for t in targets]))
        # Nucleotide-level pre-filter.
        targetFiltFna, goodCandidates, crossSites = minimapFilter(sourceFna+'.fas', targetFna+'.fas', targetFiltFna, max_iden, min_iden, coverage, paralog, relaxEnd, orderedLoci)
        logger('identifed {0} good exemplar alleles after nucleic search'.format(len(goodCandidates)))
        # Build MMseqs2 databases and translate both sets to amino acids.
        subprocess.Popen('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, sourceFna).split(), stdout = subprocess.PIPE).communicate()
        subprocess.Popen('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, targetFiltFna).split(), stdout = subprocess.PIPE).communicate()
        subprocess.Popen('{0} translatenucs {1} {2}'.format(mmseqs, sourceFna, sourceFaa).split(), stdout = subprocess.PIPE).communicate()
        subprocess.Popen('{0} translatenucs {1} {2}'.format(mmseqs, targetFiltFna, targetFiltFaa).split(), stdout = subprocess.PIPE).communicate()
        # Retry the search up to 9 times, lowering the thread count
        # (9, 5, 1) every three attempts.
        for ite in range(9) :
            if os.path.isdir(tmpDir) :
                shutil.rmtree(tmpDir)
            p=subprocess.Popen('{0} search {2} {1} {3} {4} -c {6} --min-seq-id {5} --threads {t}'.format(mmseqs, sourceFaa, targetFiltFaa, alnFaa, tmpDir, min_iden, coverage, t=9-4*int(ite/3) ).split(), stdout = subprocess.PIPE)
            p.communicate()
            if p.returncode == 0 :
                break
            time.sleep(1)
        subprocess.Popen('{0} convertalis {2} {1} {3} {3}.tab --format-mode 2 --threads 8'.format(mmseqs, sourceFaa, targetFiltFaa, alnFaa).split(), stdout = subprocess.PIPE).communicate()
        with open(alnFaa + '.tab') as fin :
            for line in fin :
                part = line.strip().split('\t')
                qLoc, rLoc = part[0].rsplit('_', 1)[0], part[1].rsplit('_', 1)[0]
                # NOTE(review): the elif is attached to the locus
                # comparison; reconstructed from collapsed source — confirm.
                if qLoc == rLoc:
                    # Same locus: accept when ends need not match or when
                    # both alignment ends line up.
                    if relaxEnd or (int(part[8]) == int(part[6]) and int(part[12]) - int(part[7]) == int(part[13]) - int(part[9])) :
                        goodCandidates[part[0]] = max(goodCandidates.get(part[0], 0), float(part[2]))
                elif orderedLoci[qLoc] > orderedLoci[rLoc] and crossSites.get(part[0], 0) < float(part[2]) :
                    # Hit to a different, earlier-ordered locus: keep the
                    # highest identity seen.
                    crossSites[part[0]] = float(part[2])
        logger('identifed a total of {0} good exemplar alleles after amino search'.format(len(goodCandidates)))
        nLoci = len(orderedLoci)
        for s in sources :
            key = '{0}_{1}'.format(s['fieldname'], s['value_id'])
            if crossSites.get(key, 0) > 1-paralog :
                orderedLoci.pop(s['fieldname'], None)
                #logger(key)
        if nLoci > len(orderedLoci) :
            logger('Total of {0} loci are not suitable for MLST scheme [due to paralog setting]. There are {1} left'.format(nLoci - len(orderedLoci), len(orderedLoci)))

        # Keep only targets of remaining loci whose same-locus identity
        # exceeds their cross-locus identity by more than `paralog`.
        with open(targetFna+'.fas') as fin, open(targetClsFna+'.fas', 'w') as fout :
            writable = False
            for line in fin :
                if line.startswith('>') :
                    name = line[1:].strip().split()[0]
                    locus, id = name.rsplit('_', 1)
                    writable = True if locus in orderedLoci and goodCandidates.get(name, 0) - crossSites.get(name, 0) > paralog else False
                if writable :
                    fout.write(line)
        subprocess.Popen('{0} createdb {1}.fas {1} --dont-split-seq-by-len'.format(mmseqs, targetClsFna).split(), stdout = subprocess.PIPE).communicate()
        # Cluster the retained targets, with the same retry scheme as above.
        for ite in range(9) :
            if os.path.isdir(tmpDir) :
                shutil.rmtree(tmpDir)
            p = subprocess.Popen('{0} cluster {1} {2} {3} -c {5} --min-seq-id {4} --threads {t}'.format(mmseqs, targetClsFna, targetResFna, tmpDir, max_iden, max_iden, t=9-4*int(ite/3)).split(), stdout = subprocess.PIPE)
            p.communicate()
            if p.returncode == 0 :
                break
            time.sleep(1)
        subprocess.Popen('{0} createtsv {1} {2} {3} {3}.tab'.format(mmseqs, targetClsFna, targetClsFna, targetResFna).split(), stdout = subprocess.PIPE).communicate()
        # The first column of the cluster table names the retained alleles.
        goodCandidates = {}
        with open(targetResFna + '.tab') as fin :
            for line in fin :
                goodCandidates[line.split('\t', 1)[0]] = 1
        logger('There are {0} good exemplar alleles left after final clustering'.format(len(goodCandidates)))
        with open(targetClsFna+'.fas') as fin:
            writable = False
            for line in fin :
                if line.startswith('>') :
                    name = line[1:].strip().split()[0]
                    writable = True if name in goodCandidates else False
                if writable :
                    refsets.append(line.strip())
    except :
        pass
    finally:
        shutil.rmtree(dirPath)
    return '\n'.join(refsets)
def loadBam(prefix, reference, bams, sequences, snps):
    """Call a consensus with per-base qualities from BAM files via Pilon.

    Contigs of `reference` that appear in ``samtools depth`` output are
    passed to Pilon (``--fix snps,indels,gaps --vcf``). The VCF is read
    twice: first to estimate the fraction of reads supporting the major
    base at uncertain sites, then to assign a base and a quality value to
    every single-base record.

    Parameters:
        prefix: prefix of all files written by this function.
        reference: FASTA file of the reference.
        bams: BAM file paths; ``None`` entries are skipped.
        sequences: mapping indexed as ``sequences[contig][position]``; only
            positions with a value >= 0 contribute to the first pass.
        snps: not used in this function.

    Returns:
        The path ``'{prefix}.metaCaller.fastq'``.
    """
    sequence = readFasta(reference)
    # Per contig: [sequence, list of per-base qualities initialised to 0].
    sequence = {n: [s, [0] * len(s)] for n, s in sequence.items()}
    sites = {}
    for bam in bams:
        if bam is not None:
            depth = subprocess.Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(
                bam=bam, **externals).split(), stdout=subprocess.PIPE,
                                     universal_newlines=True)
            try:
                d = pd.read_csv(depth.stdout, sep='\t').values
                # Record every contig name seen in the depth table.
                sites.update({cName: 1 for cName in np.unique(d.T[0])})
            except:
                # NOTE(review): presumably guards against empty depth
                # output; every other error is silently ignored as well.
                pass
    sequence = {n: s for n, s in sequence.items() if n in sites}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(
                n, '\n'.join([
                    s[0][site:(site + 100)]
                    for site in xrange(0, len(s[0]), 100)
                ])))
    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        prefix=prefix, bam_opt=bam_opt, **externals)
    subprocess.Popen(pilon_cmd.split(), stdout=subprocess.PIPE,
                     universal_newlines=True).communicate()

    # First pass: at sites with depth >= 3 that are heterozygous ('0/1')
    # or have a fifth INFO value below 10, collect
    # [major base count, sum of the other base counts].
    uncertains = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if sequences[part[0]][int(part[1]) - 1] >= 0:
                if len(part[3]) == 1 and len(part[4]) == 1:
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    if dp >= 3:
                        qd = int(pp[4][3:])
                        if part[-1] == '0/1' or qd < 10:
                            bcs = sorted(
                                [float(bc) for bc in pp[5][3:].split(',')])
                            uncertains.append([bcs[-1], np.sum(bcs[:-1])])
    uncertains = np.array(uncertains)
    # Fraction of reads that support the major base at uncertain sites,
    # and the log-odds score (times 10) contributed by a single read.
    # NOTE(review): with no uncertain site the array is empty and the
    # indexing below fails — confirm whether that can happen.
    p = np.sum(uncertains.T[0]) / np.sum(uncertains)
    qPerRead = 10 * (np.log10(p) - np.log10(1 - p))
    for n in sequence:
        sequence[n][0] = list(sequence[n][0])

    # Second pass: assign bases and qualities.
    # NOTE(review): the nesting of this loop was reconstructed from
    # whitespace-collapsed source — confirm against the original file.
    highQ, lowQ, lowC = 0, 0, 0
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'): continue
            part = line.strip().split('\t')
            if len(part[3]) == 1 and len(part[4]) == 1:
                s = int(part[1]) - 1
                pp = part[7].split(';')
                dp = float(pp[0][3:])
                qd = int(pp[4][3:])
                if part[-1] == '0/1' or qd < 10:
                    # Uncertain site: take the most frequent base, or the
                    # reference base when there are no base counts.
                    bcs = np.array([int(bc) for bc in pp[5][3:].split(',')])
                    if np.sum(bcs) > 0:
                        sequence[part[0]][0][s] = ['A', 'C', 'G',
                                                   'T'][np.argmax(bcs)]
                    else:
                        sequence[part[0]][0][s] = part[3]
                    if dp < 3:
                        lowC += 1
                    else:
                        bcs.sort()
                        bcs = [bcs[-1], np.sum(bcs[:-1])]
                        # Binomial test of the major count against p.
                        q1 = binom.cdf(bcs[0], bcs[0] + bcs[1], p)
                        q2 = qPerRead * (bcs[0] - bcs[1]) if q1 >= 0.05 else 1
                        if q2 >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        # Clamp the quality to the range 1..40.
                        sequence[part[0]][1][s] = min(40, max(1, int(q2)))
                else:
                    if dp < 3:
                        lowC += 1
                    else:
                        if qd >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = qd
                    if part[-1] == '1/1':
                        sequence[part[0]][0][s] = part[4]
    logger(
        '{0}: Expected mix-up: {1} {2} ; Got highQ {3} ; lowQ {4} ; lowC {5}'.
        format(prefix, uncertains.shape[0], p, highQ, lowQ, lowC))
    with open('{0}.metaCaller.fastq'.format(prefix), 'w') as fout:
        # Record names are prefixed with the basename of `prefix`.
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(
                p + '_' + n, ''.join(s), ''.join([chr(qq + 33) for qq in q])))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    os.unlink('{0}.mapping.fasta'.format(prefix))
    os.unlink('{0}.mapping.reference.fasta'.format(prefix))
    return '{0}.metaCaller.fastq'.format(prefix)
def runDiamond(self, ref, qry, nhits=10, frames='7'):
    """Align translated query sequences against translated references with diamond.

    The query is translated (``frame='F'``) and, per sequence, the frame
    with the fewest 'X'-separated segments is written to the query file.
    The reference is translated in `frames`, cut into pieces of roughly
    1000 residues ending at 'X', and searched in 5 batches. SAM-formatted
    diamond output (``--outfmt 101``) is parsed and amino-acid coordinates
    are converted back to nucleotide coordinates.

    Parameters:
        ref: reference file, read with ``readFastq`` if ``self.refSeq`` is empty.
        qry: query file, read with ``readFastq`` if ``self.qrySeq`` is empty.
        nhits: value passed to diamond ``-k``.
        frames: frame specification passed to ``transeq`` for the reference.

    Returns:
        A pandas DataFrame that concatenates the parsed hits of all batches.
    """
    logger('Run diamond starts')

    def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        # Parse one SAM file; return a DataFrame, or None when no record
        # passes the filters (the caller skips None).
        blastab = []
        for line in fin:
            if line.startswith('@'):
                continue
            part = line.strip().split('\t')
            if part[2] == '*':
                continue
            # Query names are '<name>:<frame>'; reference names are
            # '<name>:<frame>:<offset of the piece>'.
            qn, qf = part[0].rsplit(':', 1)
            rn, rf, rx = part[2].rsplit(':', 2)
            rs = int(part[3]) + int(rx)
            ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
            qm = len(part[9])
            if qm * 3 < min_cov:
                continue
            cov_ratio = qm * 3. / ql
            if cov_ratio < min_ratio:
                continue
            cigar = [[int(n) * 3, t]
                     for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
            cl = np.sum([c[0] for c in cigar])
            variation = float(part[12][5:]) * 3 if part[12].startswith(
                'NM:') else float(re.findall(r'NM:i:(\d+)', line)[0]) * 3
            iden = 1 - round(variation / cl, 3)
            if iden < min_id:
                continue
            qf, rf = int(qf), int(rf)
            qs = int(part[18][5:]) if part[18].startswith('ZS:') else int(
                re.findall(r'ZS:i:(\d+)', line)[0])
            rm = int(
                np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)
            # Frames 1-3 are handled as forward, larger ones as reverse.
            if rf <= 3:
                rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
            else:
                rs, r_e = rl - (rs * 3 + rf - 6) + 1, rl - (
                    (rs + rm - 1) * 3 + rf - 4) + 1
            if qf <= 3:
                qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
            else:
                qs, qe = ql - (qs * 3 + qf - 6) + 1, ql - (
                    (qs + qm - 1) * 3 + qf - 4) + 1
                # Report the query in forward orientation.
                qs, qe, rs, r_e = qe, qs, r_e, rs
                cigar = list(reversed(cigar))
            cd = [c[0] for c in cigar if c[1] != 'M']
            score = int(
                part[14][5:]) if part[14].startswith('ZR:') else int(
                    re.findall(r'ZR:i:(\d+)', line)[0])
            blastab.append([
                qn, rn, iden, cl,
                int(variation - sum(cd)),
                len(cd), qs, qe, rs, r_e, 0.0, score, ql, rl, cigar
            ])
        if not blastab:
            # An empty frame has no columns 0/1 and would raise KeyError.
            return None
        blastab = pd.DataFrame(blastab)
        blastab[[0, 1]] = blastab[[0, 1]].astype(str)
        return blastab

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            _, id, s = min([(len(s[:-1].split('X')), id, s)
                            for id, s in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, id + 1, s))
    diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
        diamond=diamond, qryAA=qryAA)
    Popen(diamond_fmt.split(), stderr=PIPE, stdout=PIPE,
          universal_newlines=True).communicate()

    refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for id, s in enumerate(ss):
            # Cut each translation into pieces of at least 1000 residues
            # that end at an 'X', plus a final remainder.
            cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
            cdss[-1] = cdss[-1][:-1]
            cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
            for ci, cs in zip(cdsi, cdss):
                if len(cs):
                    toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(
                        n, id + 1, ci, cs))

    blastab = []
    for id in xrange(5):
        with open(refAA, 'w') as fout:
            for line in toWrite[id::5]:
                fout.write(line)
        diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
            diamond=diamond,
            refAA=refAA,
            qryAA=qryAA,
            aaMatch=aaMatch,
            n_thread=self.n_thread,
            min_id=self.min_id * 100.,
            nhits=nhits,
            min_ratio=self.min_ratio * 100.)
        Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE,
              universal_newlines=True).communicate()
        # Reset per batch so an empty output file can neither reuse the
        # previous batch's table nor hit an undefined name.
        tab = None
        if os.path.getsize(aaMatch) > 0:
            with open(aaMatch) as fin:
                tab = parseDiamond(fin, self.refSeq, self.qrySeq,
                                   self.min_id, self.min_cov, self.min_ratio)
        os.unlink(aaMatch)
        if tab is not None:
            blastab.append(tab)
    blastab = pd.concat(blastab)
    logger('Run diamond finishes. Got {0} alignments'.format(
        blastab.shape[0]))
    return blastab
def runMMseq(self, ref, qry):
    """Align translated query sequences against the reference with MMseqs2.

    Databases are created for `ref` and `qry`, the query is translated, and
    ``mmseqs search`` is retried up to 9 times. After the third failed
    attempt each retry additionally tries a search against ORFs extracted
    from the reference. The alignment is exported with ``convertalis`` and
    parsed into a table.

    Returns:
        A pandas DataFrame of the alignments that pass the coverage filters.
    """
    logger('Run MMSeqs starts')

    def parseMMSeq(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        # Parse the convertalis table and rescale amino-acid coordinates
        # of the query-database side by 3.
        blastab = pd.read_csv(fin, sep='\t', header=None)
        blastab = blastab[blastab[2] >= min_id]
        qlen = blastab[0].apply(lambda r: len(qryseq[r]))
        rlen = blastab[1].apply(lambda r: len(refseq[r]))
        cigar = blastab[14].apply(lambda x: [[int(n) * 3, t] for n, t in re.findall(r'(\d+)([A-Z])', x)])
        ref_sites = pd.concat([3 * (blastab[6] - 1) + 1, 3 * blastab[7]],
                              keys=[0, 1], axis=1)
        # Amount by which the rescaled end runs past the sequence length.
        d = ref_sites[1] - qlen
        d[d < 0] = 0

        def ending(x, y):
            x[-1][0] -= y

        # Shorten the last CIGAR element and the end coordinate by d.
        np.vectorize(ending)(cigar, d)
        ref_sites[1] -= d
        # The other coordinate pair is trimmed at its end or its start,
        # depending on the alignment direction.
        direction = (blastab[8] < blastab[9])
        qry_sites = pd.concat([blastab[8], blastab[9] - d], axis=1)
        qry_sites[~direction] = pd.concat([blastab[8] - d, blastab[9]],
                                          axis=1)[~direction]
        blastab = pd.DataFrame(
            np.hstack([
                blastab[[0, 1, 2]],
                np.apply_along_axis(lambda x: x[1] - x[0] + 1, 1,
                                    ref_sites.values)[:, np.newaxis],
                pd.DataFrame(np.zeros([blastab.shape[0], 2], dtype=int)),
                ref_sites, qry_sites, blastab[[10, 11]],
                qlen[:, np.newaxis], rlen[:, np.newaxis],
                cigar[:, np.newaxis]
            ]))
        return blastab[(blastab[3] >= min_cov)
                       & (blastab[3] >= blastab[12] * min_ratio)]

    tmpDir = os.path.join(self.dirPath, 'tmp')
    refNA = os.path.join(self.dirPath, 'refNA')
    qryNA = os.path.join(self.dirPath, 'qryNA')
    refCDS = os.path.join(self.dirPath, 'refCDS')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch2')
    Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
        mmseqs, ref, refNA).split(), stdout=PIPE).communicate()
    Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
        mmseqs, qry, qryNA).split(), stdout=PIPE).communicate()
    Popen('{0} translatenucs {1} {2}'.format(mmseqs, qryNA, qryAA).split(),
          stdout=PIPE).communicate()
    # NOTE(review): loop nesting reconstructed from whitespace-collapsed
    # source — confirm against the original file.
    for ite in range(9):
        if os.path.isdir(tmpDir):
            shutil.rmtree(tmpDir)
        p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(\
            mmseqs, qryAA, refNA, aaMatch, tmpDir, self.n_thread, self.min_id, self.min_ratio).split(), stdout=PIPE)
        p.communicate()
        if p.returncode == 0:
            break
        if ite > 2:
            # Fallback: search against ORFs extracted from the reference.
            Popen('{0} extractorfs {2} {3}'.format(mmseqs, qryAA, refNA, refCDS).split(), stdout=PIPE).communicate()
            p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(\
                mmseqs, qryAA, refCDS, aaMatch, tmpDir, self.n_thread, self.min_id, self.min_ratio).split(), stdout=PIPE)
            p.communicate()
            if p.returncode == 0:
                break
        time.sleep(1)
    # The field list is appended as a single argument so it is not split.
    Popen('{0} convertalis {1} {2} {3} {3}.tab --threads {4} --format-output'.format(\
        mmseqs, qryAA, refNA, aaMatch, self.n_thread).split() + ['query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,raw,qlen,tlen,cigar'], stdout=PIPE).communicate()
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)
    blastab = parseMMSeq(open(aaMatch + '.tab'), self.refSeq, self.qrySeq,
                         self.min_id, self.min_cov, self.min_ratio)
    logger('Run MMSeqs finishes. Got {0} alignments'.format(
        blastab.shape[0]))
    return blastab
def buildReference(alleles, references, max_iden=0.9, min_iden=0.6,
                   coverage=0.7, paralog=0.1, relaxEnd=False):
    """Filter `alleles` against `references` and pick exemplar alleles.

    `alleles` and `references` are iterables of dicts with the keys
    'fieldname', 'value_id' and 'value'; each is written as a FASTA record
    named '<fieldname>_<value_id>'. Alleles are clustered with ``clust``
    and the exemplars are compared with the references through
    ``uberBlast``. Alleles that hit a different locus, and all alleles of
    loci flagged as paralogous, are removed.

    Note: the 'fieldname' of removed entries in `alleles` is set to ''
    in place, so the caller's dicts are modified.

    Returns:
        A tuple ``(alleles_fasta, refsets_fasta)`` of newline-joined FASTA
        records: the retained alleles and the selected reference alleles.

    Raises:
        Whatever ``clust``, ``uberBlast`` or the file operations raise. The
        temporary directory is removed in every case.
    """
    # Locus name -> index of its first occurrence in `references`.
    orderedLoci = {
        t['fieldname']: i
        for i, t in reversed(list(enumerate(references)))
    }
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    # try/finally without a blanket except: errors used to be swallowed
    # here and then resurfaced as an unrelated NameError on `blastab`.
    try:
        sourceFna = os.path.join(dirPath, 'sourceFna')
        clsFna = os.path.join(dirPath, 'clsFna')
        targetFna = os.path.join(dirPath, 'targetFna')
        with open(sourceFna, 'w') as fout:
            fout.write('\n'.join([
                '>{fieldname}_{value_id}\n{value}'.format(**s)
                for s in alleles
            ]))
        with open(targetFna, 'w') as fout:
            fout.write('\n'.join([
                '>{fieldname}_{value_id}\n{value}'.format(**t)
                for t in references
            ]))
        # get cluster
        exampler, cluster = clust('-i {0} -p {1} -d {2} -c 1 -t 8'.format(\
            sourceFna, clsFna, max_iden).split())
        tooClose, goodCandidates, crossSites = {}, {}, {}
        with open(cluster) as fin:
            for line in fin:
                part = line.strip().split()
                locus = [p.rsplit('_', 1)[0] for p in part]
                # Cluster members from different loci are cross-locus hits.
                if locus[0] != locus[1]:
                    crossSites[part[0]] = locus[1]
                    crossSites[part[1]] = locus[0]
        # compare with references
        blastab = uberBlast('-r {0} -q {1} -f --blastn --diamondSELF --min_id {2} --min_ratio {3} -t 8 -p -s 1 -e 0,3'.format(\
            targetFna, exampler, min_iden, coverage ).split())
    finally:
        shutil.rmtree(dirPath)

    for tab in blastab:
        locus = [p.rsplit('_', 1)[0] for p in tab[:2]]
        # Fraction of the query that is aligned.
        c = (tab[7] - tab[6] + 1) / tab[12]
        # Largest mismatch between the unaligned ends of query and hit.
        e = max(abs(tab[8] - tab[6]),
                abs(tab[12] - tab[7] - (tab[13] - tab[9])))
        if c >= coverage and tab[2] >= min_iden:
            if locus[0] != locus[1]:
                crossSites[tab[0]] = locus[1]
                crossSites[tab[1]] = locus[0]
            elif e <= 0:
                if tab[2] >= max_iden and tab[0] != tab[1]:
                    tooClose[tab[0]] = 1
                else:
                    goodCandidates[tab[0]] = tab[2]

    # A locus is paralogous when one of its reference alleles hits a locus
    # that comes later in the reference order.
    paralogous_loci = {}
    for ref in references:
        key = '{0}_{1}'.format(ref['fieldname'], ref['value_id'])
        if key in crossSites and orderedLoci[ref['fieldname']] < orderedLoci[
                crossSites[key]]:
            paralogous_loci[ref['fieldname']] = 1

    refsets = []
    for allele in alleles:
        if allele['fieldname'] in paralogous_loci:
            allele['fieldname'] = ''
        else:
            key = '{0}_{1}'.format(allele['fieldname'], allele['value_id'])
            if key in crossSites:
                allele['fieldname'] = ''
            elif key in goodCandidates and key not in tooClose:
                refsets.append(
                    '>{fieldname}_{value_id}\n{value}'.format(**allele))
    alleles = [
        '>{fieldname}_{value_id}\n{value}'.format(**allele)
        for allele in alleles if allele['fieldname'] != ''
    ]
    logger('removed {0} paralogous sites.'.format(len(paralogous_loci)))
    logger('obtained {0} alleles and {1} references alleles'.format(
        len(alleles), len(refsets)))
    return '\n'.join(alleles), '\n'.join(refsets)
def reduce_depth(self, reads):
    """Subsample read files to ``parameters['max_base']`` bases and trim barcode-like prefixes.

    For every file the number of bases is counted and the base composition
    of the first 10 positions is sampled to detect a low-complexity prefix.
    Each file is then rewritten through an awk pipeline that subsamples the
    reads, optionally renames them and optionally drops the prefix.

    Parameters:
        reads: list of libraries, each a list of gzip-compressed FASTQ paths.

    Returns:
        A list with one list of new file names per input library.

    NOTE(review): file names are interpolated into ``shell=True`` command
    lines without quoting — confirm that inputs are trusted.
    """
    encode = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    read_stats = [[] for library in reads]
    new_reads = [[] for library in reads]
    for lib_id, (library, stat, new_lib) in enumerate(
            zip(reads, read_stats, new_reads)):
        for fname in library:
            # wc on the sequence lines only; the difference between the
            # third and second count is used as the number of bases.
            p = Popen("zcat {0}|awk 'NR%4==2'|wc".format(fname), shell=True,
                      stdout=PIPE).communicate()[0].strip().split()
            n_base = int(p[2]) - int(p[1])
            # Counts of A, C, G, T and other at each of the first 10 positions.
            bcomp = [[0, 0, 0, 0, 0] for i in range(10)]
            # Sample every fifth read among the first 100000 reads.
            p = Popen("zcat {0}|head -400000|awk 'NR%20==2'".format(fname),
                      shell=True, stdout=PIPE, stderr=PIPE)
            # NOTE(review): presumably expects str lines (Python 2); with
            # bytes the lookups in `encode` would all miss — confirm.
            for line in p.stdout:
                for b, bc in zip(line[:10], bcomp):
                    bc[encode.get(b, 4)] += 1
            # Scan from position 10 back to 1: the last position where one
            # base makes up at least 80% of the sample (or, within the
            # first two positions, more than 10% are non-ACGT) ends the
            # prefix to remove.
            seq_start = 0
            for c in range(9, -1, -1):
                bc = bcomp[c]
                if max(bc) / 0.8 >= sum(bc) or (c < 2
                                                and bc[4] > 0.1 * sum(bc)):
                    seq_start = c + 1
                    break
            stat.append([n_base, seq_start])
        n_base = sum([s[0] for s in stat])
        sample_freq = float(parameters['max_base']) / n_base if parameters[
            'max_base'] > 0 else 1.
        if sample_freq >= 1 or len(stat) < 3:
            sample_freqs = [sample_freq for s in stat]
        else:
            # Three files: fill the budget from the first two files first
            # and use the third only for what is left.
            n_base2 = sum([s[0] for s in stat[:2]])
            if float(parameters['max_base']) <= n_base2:
                sample_freqs = [
                    float(parameters['max_base']) / n_base2,
                    float(parameters['max_base']) / n_base2, 0.
                ]
            else:
                sample_freqs = [
                    1., 1.,
                    (float(parameters['max_base']) - n_base2) / stat[2][0]
                ]
        if sample_freqs[0] < 1 and sample_freqs[0] > 0:
            logger('Read depth too high. Subsample to every {0:.2f} read'.
                   format(1. / sample_freqs[0]))
        for f_id, (lib, s, sample_freq) in enumerate(
                zip(library, stat, sample_freqs)):
            if sample_freq > 0:
                new_lib.append('{0}.2.{1}.{2}.fastq.gz'.format(
                    parameters['prefix'], lib_id, f_id + 1))
                # The four awk variants differ only in whether reads are
                # renamed to '@<lib_id>_<n>' and whether the leading bases
                # are cut from sequence and quality lines.
                if parameters['noRename'] == False:
                    if s[1] > 0:
                        logger(
                            'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                            .format(s[1], lib))
                        Popen(
                            "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print \"@{4}_\"nr}} else {{print \"+\"}} }} }}'|gzip > {1}"
                            .format(lib, new_lib[-1], min(sample_freq, 1.),
                                    s[1] + 1, lib_id), shell=True).wait()
                    else:
                        Popen(
                            "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print \"@{4}_\"nr}} else {{print \"+\"}} }} }}'|gzip > {1}"
                            .format(lib, new_lib[-1], min(sample_freq, 1.),
                                    s[1] + 1, lib_id), shell=True).wait()
                else:
                    if s[1] > 0:
                        logger(
                            'Remove potential barcode bases at the beginning {0} bps of reads in {1}'
                            .format(s[1], lib))
                        Popen(
                            "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print substr($0, {3}, 9999999)}} else {{if(id==0) {{print $0}} else {{print \"+\"}} }} }}'|gzip > {1}"
                            .format(lib, new_lib[-1], min(sample_freq, 1.),
                                    s[1] + 1, lib_id), shell=True).wait()
                    else:
                        Popen(
                            "zcat {0}|awk '{{nr = int((NR-1)/4)}} {{id=(NR-1)%4}} int(nr*{2}) > int((nr-1)*{2}) {{if (id==1 || id == 3) {{print $0}} else {{ if(id==0){{print $0}} else {{print \"+\"}} }} }}'|gzip > {1}"
                            .format(lib, new_lib[-1], min(sample_freq, 1.),
                                    s[1] + 1, lib_id), shell=True).wait()
            # NOTE(review): placement reconstructed from whitespace-
            # collapsed source; here every input file is deleted, including
            # files that were not resampled — confirm.
            os.unlink(lib)
    return new_reads
def _infer_gene_tree(tags):
    """Run the configured external tree builder on the tagged sequences.

    The sequences are written as FASTA records named ``X<tag>`` to a
    temporary file in the working directory.  The command template is
    ``params[params['orthology']]`` for fewer than 500 sequences and
    ``params['nj']`` otherwise; its stdout is parsed as a tree by ete3.
    Up to three attempts are made, each adding one more blank line after
    every record.

    Returns:
        The parsed ``ete3.Tree``, or ``None`` when all attempts failed.
    """
    for ite in xrange(3):
        tmp_name = None
        try:
            # The context manager closes the handle even if a write fails;
            # delete=False keeps the file on disk for the external tool.
            with tempfile.NamedTemporaryFile(dir='.', delete=False) as tmpFile:
                tmp_name = tmpFile.name
                for n, s in tags.items():
                    tmpFile.write('>X{0}\n{1}\n{2}'.format(
                        n, s, '\n' * ite).encode('utf-8'))
            if len(tags) < 500:
                cmd = params[params['orthology']].format(tmp_name, **params)
            else:
                cmd = params['nj'].format(tmp_name, **params)
            phy_run = subprocess.Popen(shlex.split(cmd),
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)
            return ete3.Tree(phy_run.communicate()[0].replace("'", ''))
        except Exception:
            # Deliberate best effort: any failure of the tool or of the tree
            # parser triggers another attempt.  KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            continue
        finally:
            # Only unlink a file that was actually created.
            if tmp_name is not None:
                os.unlink(tmp_name)
    return None


def filt_per_group(data):
    """Drop rows of a hit matrix that are too divergent to belong together.

    Sequences of all rows plus the reference are compared pairwise.  Pairs
    whose mismatch rate exceeds the expected divergence of their two sources
    are marked incompatible.  If any exist, rows are clustered greedily, a
    tree of cluster representatives plus the reference is inferred with an
    external tool, and the tree is cut repeatedly at the branch separating
    the most incompatible pairs, always keeping the side that holds the
    reference.  Rows whose cluster was cut away are removed.

    Args:
        data: a ``(mat, ref, global_file)`` triple.  ``mat`` is a 2-D array
            of rows; assumes column 4 holds each sequence as an array of
            byte codes and column 1 identifies the source of the row --
            TODO confirm against the caller.  ``ref`` is the reference
            sequence as a string.  ``global_file`` is loaded with
            ``np.load`` and used as a mapping from sorted pairs of column-1
            values to ``(rate, factor)``.

    Returns:
        ``mat`` unchanged when no pair is incompatible or when no tree could
        be inferred; otherwise the subset of rows that survived pruning.
    """
    mat, ref, global_file = data
    global_differences = dict(np.load(global_file))
    nMat = mat.shape[0]
    # One row per sequence in `mat`, with the reference appended last.
    seqs = np.vstack([
        np.vstack(mat.T[4]),
        np.array(list(ref)).view(asc2int).astype(np.uint8)[np.newaxis, :]
    ])
    # Everything that is not A/C/G/T (65/67/71/84) becomes '-' (45).
    seqs[np.in1d(seqs, [65, 67, 71, 84], invert=True).reshape(seqs.shape)] = 45
    # diff[i, j] is read below as (mismatches, aligned sites) for pair i, j.
    diff = compare_seq(
        seqs,
        np.zeros(shape=[seqs.shape[0], seqs.shape[0], 2], dtype=int)).astype(float)

    incompatible = {}
    for i1, m1 in enumerate(mat):
        for i2 in xrange(i1 + 1, nMat):
            m2 = mat[i2]
            mut, aln = diff[i1, i2]
            if aln > 0:
                gd = global_differences.get(tuple(sorted([m1[1], m2[1]])), (0.01, 4))
                difference = mut / aln / gd[0] / gd[1]
            else:
                # Nothing aligned: always treated as incompatible.
                difference = 1.5
            if difference > 1.:
                incompatible[(i1, i2)] = 1

    if len(incompatible) > 0:
        # Greedy clustering: a row joins the first group whose founder it
        # matches within 0.6 * (1 - clust_identity) mismatches per site.
        groups = []
        for j, m in enumerate(mat):
            novel = 1
            for g in groups:
                if diff[g[0], j, 0] <= 0.6 * (1.0 - params['clust_identity']) * diff[g[0], j, 1]:
                    g.append(j)
                    novel = 0
                    break
            if novel:
                groups.append([j])
        group_tag = {gg: g[0] for g in groups for gg in g}
        # tobytes() replaces the deprecated ndarray.tostring() alias.
        try:
            tags = {g[0]: mat[g[0]][4].tobytes().decode('ascii') for g in groups}
        except UnicodeDecodeError:
            tags = {g[0]: mat[g[0]][4].tobytes() for g in groups}
        tags.update({'REF': ref})

        # Lift row-level incompatibilities to group founders, keyed by the
        # founder index as a string so they match the tree's leaf names.
        ic2 = {}
        for i1, i2 in incompatible:
            t1, t2 = group_tag[i1], group_tag[i2]
            if t1 != t2:
                t1, t2 = str(t1), str(t2)
                if t1 not in ic2:
                    ic2[t1] = {}
                if t2 not in ic2:
                    ic2[t2] = {}
                ic2[t1][t2] = ic2[t2][t1] = 1
        incompatible = ic2

        gene_phy = _infer_gene_tree(tags)
        if gene_phy is None:
            return mat
        # Strip the 'X' prefix that was added to the FASTA record names.
        for n in gene_phy.get_leaves():
            if len(n.name):
                n.name = n.name[1:]
        node = gene_phy.get_midpoint_outgroup()
        if node is not None:
            gene_phy.set_outgroup(node)

        for ite in xrange(3000):
            gene_phy.ic, gene_phy.dist = {}, 0.
            # Both root children get the full length of the root branch.
            rdist = sum([c.dist for c in gene_phy.get_children()])
            for c in gene_phy.get_children():
                c.dist = rdist
            # node.ic collects the incompatible pairs with exactly one
            # member below the node: pairs seen in two children cancel out.
            for node in gene_phy.iter_descendants('postorder'):
                if node.is_leaf():
                    node.ic = {
                        tuple(sorted([node.name, n2])): 1
                        for n2 in incompatible.get(node.name, {})
                    }
                else:
                    node.ic = {}
                    for c in node.get_children():
                        for x in c.ic:
                            if x in node.ic:
                                node.ic.pop(x)
                            else:
                                node.ic[x] = 1
            # Cut the branch splitting the most pairs; longer branch wins ties.
            cut_node = max([[len(n.ic), n.dist, n]
                            for n in gene_phy.iter_descendants('postorder')],
                           key=lambda x: (x[0], x[1]))
            if cut_node[0] > 0:
                cut_node = cut_node[2]
                prev_node = cut_node.up
                cut_node.detach()
                if 'REF' in cut_node.get_leaf_names():
                    # The reference is in the detached part: keep that part.
                    gene_phy = cut_node
                elif prev_node.is_root():
                    gene_phy = gene_phy.get_children()[0]
                else:
                    prev_node.delete(preserve_branch_length=True)

                # Forget incompatibilities that involve removed tips.
                tips = set(gene_phy.get_leaf_names())
                for r1 in list(incompatible.keys()):
                    if r1 not in tips:
                        rr = incompatible.pop(r1, None)
                        for r2 in rr:
                            incompatible.get(r2, {}).pop(r1, None)
                for r1 in list(incompatible.keys()):
                    if len(incompatible[r1]) == 0:
                        incompatible.pop(r1, None)
                if len(incompatible) == 0:
                    break
                logger(' Iteration {0}. Remains {1} tips.'.format(
                    ite + 1, len(gene_phy.get_leaf_names())))
            else:
                break

        if len(gene_phy.get_leaf_names()) < len(tags):
            # Expand the surviving founders back to all rows of their
            # groups; 'REF' has no group and contributes nothing.
            members = {str(g[0]): g for g in groups}
            tips = sorted([
                nn for n in gene_phy.get_leaf_names()
                for nn in members.get(n, [])
            ])
            mat = mat[tips]
    return mat