def __readAssembly(self, assembly) :
    # Read an assembly in either FASTQ or FASTA format; FASTQ input is
    # converted into a companion FASTA file for downstream tools.
    seq = {}
    with uopen(assembly) as fin :
        header = fin.read(1)
    with uopen(assembly) as fin :
        if header == '@' :
            # FASTQ: name, sequence, '+', qualities in 4-line records
            for id, line in enumerate(fin) :
                if id % 4 == 0 :
                    part = line[1:].strip().split()
                    name = part[0]
                    seq[name] = [0, float(part[2]) if len(part) > 2 else 0., None, None]
                elif id % 4 == 1 :
                    seq[name][2] = line.strip()
                    seq[name][0] = len(seq[name][2])
                elif id % 4 == 3 :
                    seq[name][3] = np.array(list(line.strip()))
            fasfile = assembly.rsplit('.', 1)[0] + '.fasta'
            logger('Write fasta sequences into {0}'.format(fasfile))
            with open(fasfile, 'w') as fout :
                for n, s in sorted(seq.items()) :
                    fout.write('>{0}\n{1}\n'.format(n, '\n'.join([ s[2][site:(site+100)] for site in range(0, len(s[2]), 100) ])))
        else :
            fasfile = assembly
            for id, line in enumerate(fin) :
                if line.startswith('>') :
                    name = line[1:].strip().split()[0]
                    seq[name] = [0, 0., []]
                else :
                    seq[name][2].extend( line.strip().split() )
            for n, s in seq.items() :
                s[2] = ''.join(s[2])
                s[0] = len(s[2])
    return seq, fasfile

def readMap(data) :
    mTag, mFile = data
    presences, absences, mutations = [], [], []
    aligns = ['', 0, 0]
    miss = -1
    with uopen(mFile) as fin :
        for line in fin :
            if line.startswith('##') :
                if line.startswith('## Reference: ') :
                    ref = line.split(' ')[-1]
                elif line.startswith('## Query: ') :
                    qry = line.split(' ')[-1]
                    if ref == qry :
                        miss = -99999999
            else :
                break
    with uopen(mFile) as fin :
        for line in fin :
            if line.startswith('#') :
                continue
            part = line.strip().split('\t')
            part[3:5] = [int(part[3]), int(part[4])]
            if part[2] == 'misc_feature' :
                if len(presences) == 0 or presences[-1][0] != part[0] or presences[-1][2] < part[3] :
                    presences.append([part[0], part[3], part[4]])
                elif presences[-1][2] < part[4] :
                    presences[-1][2] = part[4]
            elif part[2] == 'unsure' :
                absences.append([part[0], part[3], part[4], miss])
            elif part[2] == 'variation' :
                alt = re.findall(r'replace="([^"]+)"', part[8])
                ori = re.findall(r'origin="([^"]+)"', part[8])
                if len(alt) and len(ori) :
                    mutations.append([mTag, part[0], part[3], ori[0], alt[0]])
    return presences, absences, mutations

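# The map files parsed by readMap() are the GFF-like outputs of write_down()
# further down in this section; a minimal illustrative sketch (all names and
# coordinates are invented):
#
#   ##gff-version 3
#   ## Reference: ref.fasta
#   ## Query: query.fasta
#   contig_1  refMapper  misc_feature  1     48230  ...  /inference="Aligned%20with%20ref_1:1-48610"
#   contig_1  refMapper  unsure        1201  1263   ...  /inference="Repetitive%20region"
#   contig_1  refMapper  variation     5012  5012   ...  /replace="A";/compare="...";/origin="G"
#
# misc_feature rows are merged into "presences", unsure rows become
# "absences", and variation rows yield [tag, contig, site, origin, alt]
# mutation records.
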
def readXFasta(fasta_file, core=0.8):
    seqs = []
    nameMap = {}
    contLens, missingLens = [], []
    with uopen(fasta_file) as fin, open(fasta_file + '.tmp', 'w') as fout:
        for line in fin:
            if line.startswith('>'):
                name = line[1:].strip().split()[0]
                seqName = name.split(':', 1)[0]
                contName = name.split(':', 1)[-1]
                if seqName in nameMap:
                    seqId = nameMap[seqName]
                    seqs[seqId] = [seqName, contName, []]
                else:
                    seqId = nameMap[seqName] = len(seqs)
                    seqs.append([seqName, contName, []])
            elif line.startswith('='):
                # end of an alignment block: join, pad absentees, and call SNPs
                for seq in seqs:
                    seq[2] = ''.join(seq[2]).upper()
                contName = [seq[1] for seq in seqs if seq[2] != ''][0]
                contLen = [len(seq[2]) for seq in seqs if seq[2] != ''][0]
                for seq in seqs:
                    if seq[2] == '':
                        seq[2] = '-' * contLen
                cLen, mLen = parseSNP(fout, seqs, core)
                contLens.append(cLen)
                missingLens.extend(mLen)
                seqs = [[n, '', []] for n in nameMap]
            else:
                seqs[seqId][2].extend(line.strip().split())
        if len(seqs[0][2]) > 0:
            cLen, mLen = parseSNP(fout, seqs, core)
            contLens.append(cLen)
            missingLens.extend(mLen)
    sys.stdout.write('## Constant_bases: {0} {1} {2} {3}\n'.format(
        conservedSites[ord('A')], conservedSites[ord('C')],
        conservedSites[ord('G')], conservedSites[ord('T')]))
    for cLen in contLens:
        sys.stdout.write('## Sequence_length: {0} {1}\n'.format(*cLen))
    for mLen in missingLens:
        sys.stdout.write('## Missing_region: {0} {1} {2}\n'.format(*mLen))
    sys.stdout.write('#Seq\t#Site\t{0}\n'.format('\t'.join(
        [n for n, i in sorted(nameMap.items(), key=lambda n: n[1])])))
    with uopen(fasta_file + '.tmp') as fin:
        for line in fin:
            sys.stdout.write(line)
    os.unlink(fasta_file + '.tmp')
    return

def MLSTdb(args):
    params = getParams(args)
    database, refset, alleleFasta, refstrain, max_iden, min_iden, coverage, paralog, relaxEnd = \
        params['database'], params['refset'], params['alleleFasta'], params['refstrain'], \
        params['max_iden'], params['min_iden'], params['coverage'], params['paralog'], params['relaxEnd']
    if os.path.isfile(alleleFasta):
        alleles = readFasta(uopen(alleleFasta))
    else:
        alleles = readFasta(StringIO(alleleFasta))
    alleles = [allele for allele in alleles
               if allele['value_id'].isdigit() and int(allele['value_id']) > 0 and allele['fieldname'].find('/') < 0]
    # initialise both return values so the function cannot raise NameError
    # when refset is None
    allele_text, refAlleles = '', ''
    if refset is not None:
        if refstrain:
            if os.path.isfile(refstrain):
                references = readFasta(uopen(refstrain))
            else:
                references = readFasta(StringIO(refstrain))
        else:
            # default references: the first allele seen for each locus
            loci, references = {}, []
            for allele in alleles:
                if allele['fieldname'] not in loci:
                    loci[allele['fieldname']] = 1
                    references.append(allele)
        allele_text, refAlleles = buildReference(alleles, references, max_iden, min_iden, coverage, paralog, relaxEnd)
        if refset:
            with open(str(refset), 'w') as fout:
                fout.write(refAlleles + '\n')
            logger('A file of reference alleles has been generated: {0}'.format(refset))
    if database:
        conversion = [[], []]
        with open(database, 'w') as fout:
            for allele in alleles:
                conversion[0].append(get_md5(allele['value']))
                conversion[1].append([allele['fieldname'], int(allele['value_id'])])
            conversion = pd.DataFrame(conversion[1], index=conversion[0])
            conversion.to_csv(database, header=False)
        logger('A lookup table of all alleles has been generated: {0}'.format(database))
    return allele_text, refAlleles

def xFasta2Matrix(prefix, fasta_file, core=0.95):
    seqs = []
    snp_data = []
    nameMap = {}
    with uopen(fasta_file) as fin:
        for line in fin:
            if line.startswith('>'):
                name = line[1:].strip().split()[0]
                seqName = name.split(':', 1)[0]
                contName = name.split(':', 1)[-1]
                if seqName in nameMap:
                    seqId = nameMap[seqName]
                    seqs[seqId] = [seqName, contName, []]
                else:
                    seqId = nameMap[seqName] = len(seqs)
                    seqs.append([seqName, contName, []])
            elif line.startswith('='):
                if fillMissingSeq(seqs, len(snp_data)):
                    snp_data.append(parse_snps(prefix, len(snp_data), seqs, core))
                seqs = [[n] for n, i in sorted(nameMap.items(), key=lambda x: x[1])]
            else:
                seqs[seqId][2].extend(line.strip().split())
    if fillMissingSeq(seqs, len(snp_data)):
        snp_data.append(parse_snps(prefix, len(snp_data), seqs, core))
    const_sites = np.sum([snp[2] for snp in snp_data], axis=0)
    names = [n for n, i in sorted(nameMap.items(), key=lambda x: x[1])]
    with uopen(prefix + '.matrix.gz', 'w') as fout:
        fout.write('## Constant_bases: ' + ' '.join(const_sites.astype(str)) + '\n')
        for snp in snp_data:
            fout.write('## Sequence_length: {0} {1}\n'.format(*snp[:2]))
        for snp in snp_data:
            for s, e in snp[3]:
                fout.write('## Missing_region: {0} {1} {2}\n'.format(snp[0], s, e))
        fout.write('#seq\t#site\t' + '\t'.join(names) + '\n')
        for snp in snp_data:
            d = np.load(snp[4])
            sites, sv = d['sites'], d['snps']
            for s in sites:
                fout.write('{0}\t{1}\t{2}\n'.format(snp[0], s[0], '\t'.join(sv[s[1]].astype(str))))
            os.unlink(snp[4])
    return prefix + '.matrix.gz'

def phylo(args) :
    args = add_args(args)
    global pool
    pool = Pool(args.n_proc)
    if 'matrix' in args.tasks :
        assert os.path.isfile(args.alignment)
        seq = readXFasta(args.alignment)
        names, sites, snps, seqLens, missing = parse_snps(seq, args.core)
        args.snp = write_matrix(args.prefix + '.matrix.gz', names, sites, snps, seqLens, missing)
        if len(names) < 4 :
            raise ValueError('Taxa too few.')
        snp_list = sorted([[info[0], int(math.ceil(info[2])), list(line), info[1]] for line, info in snps.items()])
    elif 'phylogeny' in args.tasks or 'ancestral' in args.tasks or 'ancestral_proportion' in args.tasks :
        assert os.path.isfile(args.snp)
        names, sites, snps, seqLens, missing = read_matrix(args.snp)
        if len(names) < 4 :
            raise ValueError('Taxa too few.')
        snp_list = sorted([[info[0], int(math.ceil(info[2])), list(line), info[1]] for line, info in snps.items()])
    # build tree
    if 'phylogeny' in args.tasks :
        phy, weights, asc = write_phylip(args.prefix + '.tre', names, snp_list)
        if phy != '' :
            args.tree = run_raxml(args.prefix + '.tre', phy, weights, asc, 'CAT', args.n_proc)
        else :
            args.tree = args.prefix + '.tre'
            with open(args.tree, 'w') as fout :
                fout.write('({0}:0.0);'.format(':0.0,'.join(names)))
        args.tree = get_root(args.prefix, args.tree)
    elif 'ancestral' in args.tasks or 'ancestral_proportion' in args.tasks :
        tree = Tree(args.tree, format=1)
    # map snp
    if 'ancestral' in args.tasks :
        final_tree, node_names, states = infer_ancestral(args.tree, names, snp_list, sites, infer='viterbi')
        states = np.array(states)
        final_tree.write(format=1, outfile=args.prefix + '.labelled.nwk')
        write_states(args.prefix + '.ancestral_states.gz', node_names, states, sites, seqLens, missing)
    elif 'mutation' in args.tasks :
        final_tree = Tree(args.tree, format=1)
        node_names, states, sites = read_states(args.ancestral)
    if 'ancestral_proportion' in args.tasks :
        final_tree, node_names, states = infer_ancestral(args.tree, names, snp_list, sites, infer='margin')
        final_tree.write(format=1, outfile=args.prefix + '.labelled.nwk')
        write_ancestral_proportion(args.prefix + '.ancestral_proportion.gz', node_names, states, sites, seqLens, missing)
    if 'mutation' in args.tasks :
        mutations = get_mut(final_tree, node_names, states, sites)
        with uopen(args.prefix + '.mutations.gz', 'w') as fout :
            for sl in seqLens :
                fout.write('## Sequence_length: {0} {1}\n'.format(*sl))
            for ms in missing :
                fout.write('## Missing_region: {0} {1} {2}\n'.format(*ms))
            fout.write('#Node\t#Seq\t#Site\t#Homoplasy\t#Mutation\n')
            for mut in mutations :
                fout.write('\t'.join([str(m) for m in mut]) + '\n')

def readXFasta(fasta_file) :
    seqs = [[]]
    nameMap = {}
    with uopen(fasta_file) as fin :
        for line in fin :
            if line.startswith('>') :
                name = line[1:].strip().split()[0]
                seqName = name.split(':', 1)[0]
                contName = name.split(':', 1)[-1]
                if seqName in nameMap :
                    seqId = nameMap[seqName]
                    seqs[-1][seqId] = [seqName, contName, []]
                else :
                    seqId = nameMap[seqName] = len(seqs[-1])
                    seqs[-1].append([seqName, contName, []])
            elif line.startswith('=') :
                seqs.append([[seqName, str(len(seqs)), []] for seqName, contName, _ in seqs[0]])
            else :
                seqs[-1][seqId][2].extend(line.strip().split())
    res = []
    for blocks in seqs :
        seqLen = 0
        for block in blocks :
            block[2] = ''.join(block[2])
            seqLen = max(seqLen, len(block[2]))
        for block in blocks :
            if len(block[2]) < seqLen :
                block[2] += '-' * (seqLen - len(block[2]))
        if seqLen > 0 :
            res.append(blocks)
    return res

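# Both readXFasta() variants parse the "xFasta" layout that getMatrix()
# (at the end of this section) writes: one aligned block per reference
# contig, a ">sample:contig" header per sequence, and blocks separated by
# "=" lines. A minimal invented example:
#
#   >sampleA:contig_1
#   ACGTACGT
#   >sampleB:contig_1
#   ACGTACGA
#   =
#   >sampleA:contig_2
#   TTGACCAT
#
# This variant pads short or absent records with '-' so every returned
# block is rectangular.
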
def write_down(filename, regions, repeats, mutations, reference, query, tag) :
    with uopen(filename, 'w') as fout :
        fout.write('##gff-version 3\n')
        fout.write('## Reference: {0}\n'.format(reference))
        fout.write('## Query: {0}\n'.format(query))
        fout.write('## Tag: {0}\n'.format(tag))
        fout.write('\n'.join([
            '{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t{5}'.format(
                r[1], r[2], r[3], r[0], r[10],
                '/inference="Aligned%20with%20{0}:{1}-{2}"'.format(*r[7:10]))
            for r in regions]) + '\n')
        fout.write('\n'.join([
            '{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="{3}"'.format(
                r[0], r[1], r[2],
                'Repetitive%20region' if r[3] == 0 else 'Uncertain%20base%20calling%20or%20ambiguous%20alignment')
            for r in repeats]) + '\n')
        for contig, variation in sorted(mutations.items()) :
            for site, alters in sorted(variation.items()) :
                for alter, source in alters.items() :
                    if source[6][0] == '-' :
                        difference = '+{0}'.format(source[7])
                        origin = '.'
                    elif source[7][0] == '-' :
                        difference = '-{0}'.format(source[6])
                        origin = source[6]
                    else :
                        difference = source[7]
                        origin = source[6]
                    compare = ''
                    for id in range(0, len(source), 9) :
                        compare += '{0}:{1}-{2}:{3};'.format(source[id+0], abs(source[id+4]), abs(source[id+5]), source[id+1])
                    fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t{3}\n'.format(
                        contig, source[2], source[3],
                        '/replace="{0}";/compare="{1}";/origin="{2}"'.format(difference, compare[:-1], origin)))

def main(mgs):
    for mg in mgs:
        res = []
        with uopen(mg) as fin:
            for line in fin:
                logp = re.findall(r'logp:\t([-eE\d\.]+)', line)
                if len(logp):
                    logp = float(logp[0])
                    res.append([logp])
                else:
                    genotype = re.findall(
                        r'Genotype (\d+):\tMean proportion:\t([eE\d\.]+)\tCI95%:\t(\[ [eE\d\.]+ - [eE\d\.]+ \])', line)
                    if len(genotype):
                        res[-1].append([genotype[0][1], genotype[0][2], '', '', ''])
                    elif len(res) and len(res[-1]) > 1 and res[-1][-1][-1] == '':
                        part = line.strip().split('\t')
                        res[-1][-1][2:] = [part[0], part[1], part[3] + ' ' + part[5]]
        try:
            # keep the run with the best log-probability; report its genotypes
            # ordered by decreasing mean proportion
            res = max(res)
            res[1:] = sorted(res[1:], key=lambda x: -float(x[0]))
            for i in range(1, len(res)):
                r = res[i]
                print('{0}\t{1}\t{2}'.format(mg, i, '\t'.join(r)))
        except Exception:
            pass

def loadMatrix(fname):
    sequences, snps = {}, {}
    with uopen(fname) as fin:
        for line in fin:
            if line.startswith('##'):
                part = line.strip().split()
                if line.startswith('## Sequence_length'):
                    sequences[part[2]] = np.zeros(int(part[3]), dtype=np.int8)
                elif line.startswith('## Missing_region'):
                    sequences[part[2]][int(part[3]) - 1:int(part[4])] = -1
            else:
                headers = line.strip().split('\t')
                break
        matrix = pd.read_csv(fin, header=None, sep='\t').values
    encode = {'A': 1, 'C': 2, 'G': 4, 'T': 8}
    matrix.T[4] = list(map(lambda d: encode.get(d[0], -9999) + encode.get(d[-1], -9999), matrix.T[4].tolist()))
    matrix = matrix[matrix.T[4] > 0]
    matrix.T[2] -= 1
    for m in matrix:
        sequences[m[1]][m[2]] |= m[4]
        if (m[1], m[2]) not in snps:
            snps[(m[1], m[2])] = {}
        if m[4] not in snps[(m[1], m[2])]:
            snps[(m[1], m[2])][m[4]] = []
        snps[(m[1], m[2])][m[4]].append(m[0])
    return sequences, snps

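# loadMatrix() packs each "X->Y" mutation string into a per-base bitmask
# (A=1, C=2, G=4, T=8) and ORs the masks observed at each site. A minimal
# decoding sketch (the helper name is ours, not part of the module):
def _decodeMask(mask):
    # Recover the set of bases folded into one bitmask, e.g. 5 -> ['A', 'G'],
    # since an 'A->G' record is stored as 1 + 4 = 5.
    return [base for bit, base in {1: 'A', 2: 'C', 4: 'G', 8: 'T'}.items() if mask & bit]
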
def iter_readGFF(fname):
    seq, cds = {}, {}
    names = {}
    with uopen(fname) as fin:
        sequenceMode = False
        for line in fin:
            if line.startswith('#'):
                continue
            elif line.startswith('>'):
                sequenceMode = True
                name = line[1:].strip().split()[0]
                assert name not in seq, logger('Error: duplicated sequence name {0}'.format(name))
                seq[name] = [fname, []]
            elif sequenceMode:
                seq[name][1].extend(line.strip().split())
            else:
                part = line.strip().split('\t')
                if len(part) > 2:
                    name = re.findall(r'locus_tag=([^;]+)', part[8])
                    if len(name) == 0:
                        parent = re.findall(r'Parent=([^;]+)', part[8])
                        if len(parent) and parent[0] in names:
                            name = names[parent[0]]
                    if len(name) == 0:
                        name = re.findall(r'Name=([^;]+)', part[8])
                    if len(name) == 0:
                        name = re.findall(r'ID=([^;]+)', part[8])
                    if part[2] == 'CDS':
                        assert len(name) > 0, logger('Error: CDS has no name. {0}'.format(line))
                        # source_file, seqName, Start, End, Direction, hash, Sequences
                        cds[name[0]] = [fname, part[0], int(part[3]), int(part[4]), part[6], 0, '']
                    else:
                        ids = re.findall(r'ID=([^;]+)', part[8])
                        if len(ids):
                            names[ids[0]] = name
    for n in seq:
        seq[n][1] = ''.join(seq[n][1]).upper()
    for n in cds:
        c = cds[n]
        try:
            c[6] = seq[c[1]][1][(c[2] - 1):c[3]]
            if c[4] == '-':
                c[6] = rc(c[6])
            if not checkCDS(n, c[6]):
                c[6] = ''
            else:
                c[5] = int(hashlib.sha1(c[6].encode('utf-8')).hexdigest(), 16)
        except:
            c[6] = ''
    return seq, cds

def prepReference(prefix, ref_tag, reference, aligner, pilercr, trf, **args) :
    def mask_tandem(fasta_file) :
        # run TRF and collect tandem-repeat regions (with 2 bp flanks)
        cmd = '{0} {1} 2 4 7 80 10 60 2000 -d -h -ngs'.format(trf, fasta_file)
        trf_run = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, universal_newlines=True)
        region = []
        for line in iter(trf_run.stdout.readline, '') :
            if line[0] == '@' :
                cont_name = line[1:].strip().split()[0]
            else :
                part = line.split(' ', 2)[:2]
                region.append([cont_name, int(part[0])-2, int(part[1])+2])
        return region

    def mask_crispr(fasta_file, prefix) :
        # run PILER-CR and collect CRISPR arrays from its positional summary
        cmd = '{0} -in {1} -out {2}.crispr'.format(pilercr, fasta_file, prefix)
        subprocess.Popen(cmd.split(), stderr=subprocess.PIPE).communicate()
        summary_trigger = 0
        region = []
        with open('{0}.crispr'.format(prefix)) as fin :
            for line in fin :
                if line.startswith('SUMMARY BY POSITION') :
                    summary_trigger = 1
                elif summary_trigger :
                    if line[0] == '>' :
                        cont_name = line[1:].strip().split()[0]
                    elif len(line) > 10 and line.strip()[0] in '0123456789' :
                        part = line[24:].strip().split()
                        region.append([cont_name, int(part[0]), int(part[0]) + int(part[1]) - 1])
        os.unlink('{0}.crispr'.format(prefix))
        return region

    # prepare reference
    if reference :
        if not isinstance(aligner, list) :
            subprocess.Popen('{0} -k15 -w5 -d {2}.mmi {1}'.format(aligner, reference, prefix).split(),
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
        else :
            subprocess.Popen('{0} -cR01 {2}.mmi {1}'.format(aligner[0], reference, prefix).split()).communicate()
        import tempfile
        with tempfile.NamedTemporaryFile(dir='.') as tf :
            seq, _ = readFastq(reference)
            tf_fas = '{0}.fasta'.format(tf.name)
            with open(tf_fas, 'wt') as fout :
                for n, s in seq.items() :
                    fout.write('>{0}\n{1}\n'.format(n, s))
            repeats = mask_tandem(tf_fas) + mask_crispr(tf_fas, tf.name)
            os.unlink(tf_fas)
        alignments = alignAgainst([prefix + '.' + ref_tag.rsplit('.', 1)[0] + '.0', aligner, prefix + '.mmi',
                                   [ref_tag, reference], [ref_tag, reference]])
        with uopen(alignments[1], 'a') as fout :
            for r in repeats :
                fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="repetitive_regions"\n'.format(r[0], r[1], r[2]))
        return alignments

def write_states(fname, names, states, sites, seqLens, missing) :
    with uopen(fname, 'w') as fout :
        for sl in seqLens :
            fout.write('## Sequence_length: {0} {1}\n'.format(*sl))
        for ms in missing :
            fout.write('## Missing_region: {0} {1} {2}\n'.format(*ms))
        fout.write('#Seq\t#Site\t' + '\t'.join(names) + '\n')
        for site in sites :
            fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(states[site[2]])))

def read_matrix(fname) :
    sites, snps = [], {}
    invariant = []
    seqLens, missing = [], []
    with uopen(fname) as fin :
        for line_id, line in enumerate(fin) :
            if line.startswith('##') :
                if line.startswith('## Constant_bases') :
                    part = line[2:].strip().split()
                    invariant = list(zip(['A', 'C', 'G', 'T'], [float(v) for v in part[1:]]))
                elif line.startswith('## Sequence_length:') :
                    part = line[2:].strip().split()
                    seqLens.append([part[1], int(part[2])])
                elif line.startswith('## Missing_region:') :
                    part = line[2:].strip().split()
                    missing.append([part[1], int(part[2]), int(part[3])])
            elif line.startswith('#') :
                part = np.array(line.strip().split('\t'))
                cols = np.where((1 - np.char.startswith(part, '#')).astype(bool))[0]
                w_cols = np.where(np.char.startswith(part, '#!W'))[0]
                names = part[cols]
                break
            else :
                part = np.array(line.strip().split('\t'))
                cols = np.ones(part.shape, dtype=bool)
                cols[:2] = False
                w_cols = np.char.startswith(part, '#!W')
                names = part[cols]
                break
        mat = pd.read_csv(fin, header=None, sep='\t', usecols=cols.tolist() + w_cols.tolist() + [0, 1]).values
    val = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', '-': '-', 'N': '-', '.': '.'}
    validate = np.vectorize(lambda b: b if len(b) > 1 else val.get(b, '-'))
    for p2 in mat :
        part = validate(np.char.upper(p2[cols].astype(str)))
        b_key = tuple(part)
        w = np.multiply.reduce(p2[w_cols].astype(float)) if w_cols.size else 1.
        if b_key in snps :
            snps[b_key][2] += w
        else :
            types = dict(zip(*np.unique(part, return_index=True)))
            types.pop('-', None)
            snps[b_key] = [len(snps), len(types) - 1, w]
        if snps[b_key][1] > 0 :
            sites.append([p2[0], int(p2[1]), snps[b_key][0]])
    for inv in invariant :
        b_key = tuple([inv[0]] * len(names))
        if b_key not in snps :
            snps[b_key] = [len(snps), 0, float(inv[1])]
        else :
            snps[b_key][2] += float(inv[1])
    return names, sites, snps, seqLens, missing

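# read_matrix() consumes the tab-delimited SNP matrix that write_matrix()
# and xFasta2Matrix() produce. A minimal invented example, including an
# optional "#!W" weight column of the kind write_filtered_matrix() emits:
#
#   ## Constant_bases: 1203 980 1010 1187
#   ## Sequence_length: contig_1 48610
#   ## Missing_region: contig_1 1201 1263
#   #seq      #site   sampleA   sampleB   #!W[RecFilter]
#   contig_1  5012    A         G         0.83215
#
# Identical base patterns are collapsed into a single entry of `snps`,
# keyed by the tuple of bases, with their (weighted) counts accumulated.
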
def readRec(fname):
    rec = {}
    with uopen(fname) as fin:
        for line in fin:
            part = line.strip().split('\t')
            key = tuple([part[0], part[1]])
            if key not in rec:
                rec[key] = []
            rec[key].append([int(part[2]), int(part[3])])
    return rec

def write_ancestral_proportion(fname, names, states, sites, seqLens, missing) :
    with uopen(fname, 'w') as fout :
        for sl in seqLens :
            fout.write('## Sequence_length: {0} {1}\n'.format(*sl))
        for ms in missing :
            fout.write('## Missing_region: {0} {1} {2}\n'.format(*ms))
        fout.write('#Seq\t#Site\t#Type:Proportion\n')
        for c, p, i in sites :
            tag, state = states[i]
            for n, ss in zip(names, state) :
                fout.write('{0}\t{1}\t{2}\t{3}\n'.format(
                    c, p, n, '\t'.join(['{0}:{1:.5f}'.format(t, s) for t, s in zip(tag, ss)])))

def RecHMM(args) :
    args = parse_arg(args)
    global pool, verbose
    pool = Pool(args.n_proc)
    verbose = not args.clean
    model = recHMM(prefix=args.prefix, mode=args.task)
    if not args.report or not args.model :
        sequences, missing = [], []
        with uopen(args.data) as fin :
            for line in fin :
                if line.startswith('##') :
                    if line.startswith('## Sequence_length:') :
                        part = line[2:].strip().split()
                        sequences.append([part[1], int(part[2])])
                    elif line.startswith('## Missing_region:') :
                        part = line[2:].strip().split()
                        missing.append([part[1], int(part[2]), int(part[3])])
                else :
                    break
            data = pd.read_csv(fin, sep='\t', dtype=str, header=None).values
        branches, mutations = {}, []
        seqLens = {seqName: [seqId, seqLen] for seqId, (seqName, seqLen) in enumerate(sequences)}
        for d in data :
            if re.findall(r'^[ACGTacgt]->[ACGTacgt]$', d[4]) :
                if d[1] not in seqLens :
                    seqLens[d[1]] = [len(seqLens), int(d[2])]
                if seqLens[d[1]][1] < int(d[2]) :
                    seqLens[d[1]][1] = int(d[2])
                if d[0] not in branches :
                    branches[d[0]] = len(branches)
                brId, seqId = branches[d[0]], seqLens[d[1]][0]
                mutations.append([brId, seqId, int(d[2]), int(d[3])])
        missing = np.array([[seqLens.get(m[0], [-1])[0], m[1], m[2]] for m in missing])
        sequences = [[n, i[1]] for n, i in sorted(seqLens.items(), key=lambda x: x[1][0])]
        branches = np.array([br for br, id in sorted(branches.items(), key=lambda x: x[1])])
        mutations = np.array(mutations)
        # reorder branches by decreasing mutation count
        reorder = np.argsort(-np.bincount(mutations.T[0]))
        branches = branches[reorder]
        reorder = np.array([i1 for i1, i2 in sorted(enumerate(reorder), key=lambda x: x[1])])
        mutations.T[0] = reorder[mutations.T[0]]
    if args.model :
        model.load(open(args.model, 'r'))
    else :
        model.fit(mutations, branches=branches, sequences=sequences, missing=missing,
                  categories=args.categories, init=args.init, cool_down=args.cool_down)
        model.save(open(args.prefix + '.best.model.json', 'w'))
        print('Best HMM model is saved in {0}'.format(args.prefix + '.best.model.json'))
    model.report(args.bootstrap)
    if not args.report :
        model.predict(mutations, branches=branches, sequences=sequences, missing=missing,
                      marginal=args.marginal, tree=args.tree)

def readFasta(fasta):
    sequence = []
    with uopen(fasta) as fin:
        for line in fin:
            if line.startswith('>'):
                name = line[1:].strip().split()[0]
                sequence.append([name, []])
            elif len(line) > 0 and not line.startswith('#'):
                sequence[-1][1].extend(line.strip().split())
    for s in sequence:
        s[1] = (''.join(s[1])).upper()
    return sequence

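# Minimal usage sketch (the file name is invented): each record comes back
# as a [name, sequence] pair with whitespace removed and bases upper-cased.
#
#   for name, seq in readFasta('assembly.fasta'):
#       print(name, len(seq))
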
def write_filtered_matrix(fname, names, sites, snps, masks, m_weight):
    bases = {65: 0, 67: 0, 71: 0, 84: 0}
    for snp in snps:
        for n, c in zip(*np.unique(snp[2], return_counts=True)):
            if n in bases:
                bases[n] += c * snp[1]
    name_map = {name: id for id, name in enumerate(names)}
    sss = []
    for site in sites:
        if site[1] not in m_weight or not len(m_weight[site[1]]):
            continue
        weight = np.mean(list(m_weight[site[1]].values()))
        if site[3].size == 0:
            snvs = np.frompyfunc(chr, 1, 1)(snps[site[2]][2])
        else:
            x = snps[site[2]][2]
            if 45 in x:
                if '-' not in site[3]:
                    site[3] = np.concatenate([site[3], ['-']])
                x[x == 45] = np.where(site[3] == '-')[0][0]
            snvs = site[3][x]
        snv_x = []
        p = np.zeros(snvs.shape, dtype=bool)
        for m in masks.get(site[1], []):
            pp = np.ones(snvs.shape, dtype=bool)
            pp[[name_map[mm] for mm in m]] = False
            p = (p | (~pp))
            snv_x.append(np.copy(snvs))
            snv_x[-1][pp] = '-'
        snv_x.append(snvs)
        snv_x[-1][p] = '-'
        for snv in snv_x:
            snv_type, snv_cnt = np.unique(snv, return_counts=True)
            keep = ~np.in1d(snv_type, ['-', 'N', 'n'])
            snv_type, snv_cnt = snv_type[keep], snv_cnt[keep]
            if snv_type.size > 1:
                for k, v in zip(snv_type, snv_cnt):
                    if k in bases:
                        bases[k] -= v * weight
                sss.append(['\t'.join(snv.tolist()), weight, site[:2]])
    with uopen(fname, 'w') as fout:
        fout.write('## Constant_bases: ' + ' '.join([
            str(int(inv[1] / names.size + 0.5) if inv[1] > 0 else 0.)
            for inv in sorted(bases.items())]) + '\n')
        fout.write('#seq\t#site\t' + '\t'.join(names) + '\t#!W[RecFilter]\n')
        for snv, weight, site in sss:
            fout.write('{2}\t{3}\t{0}\t{1:.5f}\n'.format(snv, weight, *site))
    return fname

def readFasta(fasta, filter=None) :
    sequence = {}
    with uopen(fasta) as fin :
        for line in fin :
            if line.startswith('>') :
                name = line[1:].strip().split()[0]
                if not filter or name in filter :
                    sequence[name] = []
            elif len(line) > 0 and not line.startswith('#') and name in sequence :
                sequence[name].extend(line.strip().split())
    for s in sequence :
        sequence[s] = (''.join(sequence[s])).upper()
    return sequence

def read_states(fname) :
    names, ss, sites = [], {}, []
    with uopen(fname) as fin :
        names = fin.readline().strip().split('\t')[2:]
        for line in fin :
            seq, site, snp_str = line.strip().split('\t', 2)
            if snp_str not in ss :
                ss[snp_str] = len(ss)
            sites.append([seq, int(site), ss[snp_str]])
    states = []
    for s, id in sorted(ss.items(), key=lambda x: x[1]) :
        states.append(s.split('\t'))
    return names, np.array(states), sites

def evaluate(profile, cluster, stepwise, ave_gene_length=1000.) :
    with uopen(profile) as fin :
        logger('Loading profiles ...')
        profile_header = fin.readline().strip().split('\t')
        ST_col = np.where([p.find('#ST') >= 0 for p in profile_header])[0].tolist()
        if len(ST_col) <= 0 :
            ST_col = [0]
        cols = ST_col + np.where([not h.startswith('#') for h in profile_header])[0].tolist()
        profile = pd.read_csv(fin, sep='\t', header=None, index_col=0, usecols=cols)
        profile_names = profile.index.values
        profile = profile.values
    with uopen(cluster) as fin :
        logger('Loading hierCC ...')
        cluster_header = fin.readline().strip().split('\t')
        cols = [0] + np.where([not h.startswith('#') for h in cluster_header])[0].tolist()
        cluster = pd.read_csv(fin, sep='\t', header=None, index_col=0, usecols=cols)
        cluster_names = cluster.index.values
        cluster = cluster.values
    s = np.arange(0, cluster.shape[1], stepwise)
    cluster = cluster[:, s]
    presence = np.in1d(cluster_names, profile_names)
    cluster, cluster_names = cluster[presence], cluster_names[presence]
    order = {n: id for id, n in enumerate(cluster_names)}
    profile_order = np.array([[id, order[n]] for id, n in enumerate(profile_names) if n in order])
    profile_order = profile_order[np.argsort(profile_order.T[1]), 0]
    profile_names = profile_names[profile_order]
    profile = profile[profile_order]
    shannon = shannon_index(cluster)
    similarity = get_similarity('adjusted_rand_score', cluster, stepwise)
    silhouette = get_silhouette(profile, cluster, stepwise, ave_gene_length)
    np.savez_compressed('evalHCC.npz', shannon=shannon, similarity=similarity, silhouette=silhouette)
    logger('Done. Results saved in evalHCC.npz')

def readRecRegions(recFile, seqRange):
    recBlocks = []
    with uopen(recFile) as fin:
        for line in fin:
            part = line.strip().split()
            if part[0] == 'Importation':
                if part[2] not in seqRange:
                    continue
                acc = seqRange[part[2]][0]
                recBlocks.append([part[1], acc + int(part[3]), acc + int(part[4])])
            elif len(part) == 3:
                try:
                    recBlocks.append([part[0], int(part[1]), int(part[2])])
                except:
                    pass
    return recBlocks

def write_matrix(fname, names, sites, snps, seqNames, missing) :
    invariants = {snp[0]: [base, snp[2]] for base, snp in snps.items() if snp[1] == 0 and base[0] != '-'}
    bases = {}
    for inv in invariants.values() :
        bases[inv[0]] = bases.get(inv[0], 0) + inv[1]
    sv = {ss[0]: '\t'.join(s) for s, ss in snps.items()}
    with uopen(fname, 'w') as fout :
        fout.write('## Constant_bases: ' + ' '.join([str(inv[1]) for inv in sorted(bases.items())]) + '\n')
        for n, l in seqNames :
            fout.write('## Sequence_length: {0} {1}\n'.format(n, l))
        for n, s, e in missing :
            fout.write('## Missing_region: {0} {1} {2}\n'.format(n, s, e))
        fout.write('#seq\t#site\t' + '\t'.join(names) + '\n')
        for site in sites :
            fout.write('{1}\t{2}\t{0}\n'.format(sv[site[2]], *site[:2]))
    return fname

def write_filtered_matrix(fname, names, sites, snps, masks, m_weight):
    bases = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    for base, snp in snps.items():
        for n, c in zip(*np.unique(base, return_counts=True)):
            if n in bases:
                bases[n] += c * snp[2]
    sv = {ss[0]: s for s, ss in snps.items()}
    name_map = {name: id for id, name in enumerate(names)}
    sss = []
    for site in sites:
        if not len(m_weight[site[1]]):
            continue
        weight = np.mean(list(m_weight[site[1]].values()))
        snvs = np.array(sv[site[2]])
        snv_x = []
        p = np.zeros(snvs.shape, dtype=bool)
        for m in masks.get(site[1], []):
            pp = np.ones(snvs.shape, dtype=bool)
            pp[[name_map[mm] for mm in m]] = False
            p = (p | (~pp))
            snv_x.append(np.copy(snvs))
            snv_x[-1][pp] = '-'
        snv_x.append(snvs)
        snv_x[-1][p] = '-'
        for snv in snv_x:
            snv_type, snv_cnt = np.unique(snv, return_counts=True)
            keep = ~np.in1d(snv_type, ['-', 'N', 'n'])
            snv_type, snv_cnt = snv_type[keep], snv_cnt[keep]
            if snv_type.size > 1:
                for k, v in zip(snv_type, snv_cnt):
                    if k in bases:
                        bases[k] -= v * weight
                sss.append(['\t'.join(snv.tolist()), weight, site[:2]])
    with uopen(fname, 'w') as fout:
        fout.write('## Constant_bases: ' + ' '.join([
            str(int(inv[1] / names.size + 0.5) if inv[1] > 0 else 0.)
            for inv in sorted(bases.items())]) + '\n')
        fout.write('#seq\t#site\t' + '\t'.join(names) + '\t#!W[RecFilter]\n')
        for snv, weight, site in sss:
            fout.write('{2}\t{3}\t{0}\t{1:.5f}\n'.format(snv, weight, *site))
    return fname

def read_states(fname):
    names, ss, sites = [], {}, []
    with uopen(fname) as fin:
        for line in fin:
            if line.startswith('##'):
                continue
            else:
                names = line.strip().split('\t')[2:]
                break
        for line in fin:
            seq, site, snp_str = line.strip().split('\t', 2)
            if snp_str not in ss:
                ss[snp_str] = len(ss)
            sites.append([seq, int(site), ss[snp_str]])
    states = []
    for s, id in sorted(ss.items(), key=lambda x: x[1]):
        states.append(np.array(s.split('\t')).view(asc2int))
    return names, np.array(states), sites

def readAncestral(fname, nSample) :
    sites = []
    with uopen(fname) as fin :
        for line in fin :
            if line.startswith('##') :
                pass
            elif line.startswith('#') :
                header = line.strip().split('\t')
                nodes = np.array([not h.startswith('#') for h in header], dtype=bool)
                nodeNames = np.array(header)[nodes]
                mat = pd.read_csv(fin, sep='\t', header=None, dtype=str).values
                break
    mat = mat[np.in1d(mat.T[nodes][0], ['A', 'C', 'G', 'T'])]
    conv = np.bincount([67, 71, 71, 84, 84, 84])
    data = conv[mat[:, nodes].astype(bytes).view(np.int8)]
    for id, (cont, site) in enumerate(mat[:, :2]) :
        sites.append([(cont, int(site))] + [1.] * nSample)
    return np.array(sites), nodeNames, np.ascontiguousarray(data.T, dtype=np.int8)

def readMutations(mutationFile):
    accLength = 0
    seqRange = {}
    missingBlocks = []
    with uopen(mutationFile) as fin:
        for line in fin:
            if line.startswith('##'):
                if line.startswith('## Sequence_length:'):
                    part = line.strip().split()
                    seqRange[part[2]] = [accLength, accLength + int(part[3])]
                    accLength += int(part[3])
                elif line.startswith('## Missing_region:'):
                    part = line.strip().split()
                    acc = seqRange[part[2]][0]
                    s, e = int(part[3]) + acc, int(part[4]) + acc
                    if [s - 1, e] == seqRange[part[2]]:
                        # the whole sequence is missing: drop it and shift
                        # every downstream coordinate accordingly
                        diff = e - s + 1
                        seqRange.pop(part[2])
                        for n in seqRange:
                            if seqRange[n][0] >= e:
                                seqRange[n][0] -= diff
                            if seqRange[n][1] >= e:
                                seqRange[n][1] -= diff
                        for blk in missingBlocks:
                            if blk[0] >= e:
                                blk[0] -= diff
                            if blk[1] >= e:
                                blk[1] -= diff
                    else:
                        if e - s + 1 >= 500:
                            missingBlocks.append([s, e])
            elif line.startswith('#'):
                mat = pd.read_csv(fin, sep='\t', header=None, dtype=str).values
                break
    mat = mat[np.vectorize(lambda x: len(x))(mat.T[4]) == 4]
    sites = np.vstack([np.vectorize(lambda x: seqRange.get(x, [-1])[0])(mat.T[1]), mat.T[2].astype(int)])
    mat = mat[sites[0] >= 0]
    sites = np.sum(sites[:, sites[0] >= 0], 0)
    mutBlocks = list(zip(mat.T[0], sites))
    return seqRange, missingBlocks, mutBlocks

def readAncestral(fname):
    sites = []
    data = []
    conv = np.bincount([65, 67, 67, 71, 71, 71, 84, 84, 84, 84]) - 1
    sys.stderr.write('Start reading Matrix: \n')
    with uopen(fname) as fin:
        for line in fin:
            if line.startswith('##'):
                pass
            elif line.startswith('#'):
                header = line.strip().split('\t')
                nodes = np.array([not h.startswith('#') for h in header], dtype=bool)
                nodeNames = np.array(header)[nodes]
                break
        for i, mat in enumerate(pd.read_csv(fin, sep='\t', header=None, dtype=str, chunksize=20000)):
            sys.stderr.write('Reading Matrix - Line: {0} \r'.format(i * 20000))
            mat = mat.values
            mat = mat[np.in1d(mat[:, np.where(nodes)[0][0]], ['A', 'C', 'G', 'T'])]
            data.append(conv[np.vectorize(ord)(mat[:, nodes])])
            for id, (cont, site) in enumerate(mat[:, :2]):
                sites.append([(cont, int(site)), 1.])
    sys.stderr.write('Read Matrix DONE. Total SNP sites: {0} \n'.format(len(sites)))
    return np.array(sites), nodeNames, np.ascontiguousarray(np.vstack(data).T, dtype=np.uint8)

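# Both readAncestral() variants build their base encoder with np.bincount:
# np.bincount([65, 67, 67, 71, 71, 71, 84, 84, 84, 84]) - 1 is a lookup
# table mapping ord('A')=65 -> 0, ord('C')=67 -> 1, ord('G')=71 -> 2 and
# ord('T')=84 -> 3 (the earlier variant without the "- 1" maps 'A' and any
# unlisted code to 0, relying on the prior A/C/G/T filter). A standalone
# sketch of the trick:
#
#   import numpy as np
#   conv = np.bincount([65, 67, 67, 71, 71, 71, 84, 84, 84, 84]) - 1
#   conv[np.frombuffer(b'GATTACA', dtype=np.int8)]
#   # -> array([2, 0, 3, 3, 0, 1, 0])
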
def getMatrix(prefix, reference, alignments, core, matrixOut, alignmentOut) :
    refSeq, refQual = readFastq(reference)
    coreSites = {n: np.zeros(len(refSeq[n]), dtype=int) for n in refSeq}
    matSites = {n: np.zeros(len(refSeq[n]), dtype=int) for n in refSeq}
    alnId = {aln[0]: id for id, aln in enumerate(alignments)}
    res = pool.map(readMap, alignments)
    matrix = {}
    # matrix[site] = [SNP calls, indel/complex calls], one slot per alignment
    for presences, absences, mutations in res :
        for mut in mutations :
            j = alnId[mut[0]]
            site = tuple(mut[1:3])
            if site not in matrix :
                matrix[site] = [[], []]
            matSites[mut[1]][mut[2]-1] = mut[2]
            if len(mut[4]) == 1 :
                if len(matrix[site][0]) == 0 :
                    matrix[site][0] = ['-' for id in alnId]
                matrix[site][0][j] = mut[4]
            else :
                if len(matrix[site][1]) == 0 :
                    matrix[site][1] = ['-' for id in alnId]
                matrix[site][1][j] = mut[4]
    for (mTag, mFile), (presences, absences, mutations) in zip(alignments, res) :
        j = alnId[mTag]
        for n, s, e in presences :
            coreSites[n][s-1:e] += 1
            muts = matSites[n][s-1:e]
            for kk in muts[muts > 0] :
                k = (n, kk)
                if len(matrix[k][0]) and matrix[k][0][j] == '-' :
                    matrix[k][0][j] = '.'
                if len(matrix[k][1]) and matrix[k][1][j] == '-' :
                    matrix[k][1][j] = '.'
        for n, s, e, m in absences :
            coreSites[n][s-1:e] -= 1
            muts = matSites[n][s-1:e]
            for kk in muts[muts > 0] :
                k = (n, kk)
                if len(matrix[k][0]) and matrix[k][0][j] == '.' :
                    matrix[k][0][j] = '-'
                if len(matrix[k][1]) and matrix[k][1][j] == '.' :
                    matrix[k][1][j] = '-'
    pres = np.unique(np.concatenate(list(coreSites.values())), return_counts=True)
    pres = [pres[0][pres[0] > 0], pres[1][pres[0] > 0]]
    coreNum = len(alignments) * core
    for p, n in zip(*pres) :
        sys.stderr.write('#{2} {0} {1}\n'.format(p, n, '' if p > coreNum else '#'))
    missings = []
    coreBases = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    for n in sorted(coreSites) :
        sites = coreSites[n]
        for site, num in enumerate(sites) :
            cSite = (n, site+1)
            if num < coreNum and cSite in matrix and len(matrix[cSite][1]) > 0 :
                # convert the list to an array so the comparison is elementwise
                num = np.sum(np.asarray(matrix[cSite][1]) != '-')
                matrix[cSite][0] = []
            if num < coreNum :
                matrix.pop(cSite, None)
                if len(missings) == 0 or missings[-1][0] != n or missings[-1][2] + 1 < cSite[1] :
                    missings.append([n, cSite[1], cSite[1]])
                else :
                    missings[-1][2] = cSite[1]
            else :
                b = refSeq[n][cSite[1]-1]
                if cSite in matrix and len(matrix[cSite][0]) :
                    matrix[cSite][0] = [(b if s == '.' else s) for s in matrix[cSite][0]]
                else :
                    coreBases[b] = coreBases.get(b, 0) + 1
    outputs = {}
    if matrixOut :
        outputs['matrix'] = prefix + '.matrix.gz'
        with uopen(prefix + '.matrix.gz', 'w') as fout :
            fout.write('## Constant_bases: {A} {C} {G} {T}\n'.format(**coreBases))
            for n in refSeq :
                fout.write('## Sequence_length: {0} {1}\n'.format(n, len(refSeq[n])))
            for region in missings :
                fout.write('## Missing_region: {0} {1} {2}\n'.format(*region))
            fout.write('\t'.join(['#Seq', '#Site'] + [mTag for mTag, mFile in alignments]) + '\n')
            for site in sorted(matrix) :
                bases = matrix[site]
                if len(bases[0]) :
                    fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(bases[0])))
                if len(bases[1]) :
                    fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(bases[1])))
    if alignmentOut :
        outputs['alignment'] = prefix + '.fasta.gz'
        sequences = []
        for (mTag, mFile), (presences, absences, mutations) in zip(alignments, res) :
            j = alnId[mTag]
            seq = {n: ['-'] * len(s) for n, s in refSeq.items()} if j > 0 else {n: list(s) for n, s in refSeq.items()}
            if j :
                for n, s, e in presences :
                    seq[n][s-1:e] = refSeq[n][s-1:e]
            for n, s, e, c in absences :
                seq[n][s-1:e] = '-' * (e-s+1)
            for site in matrix :
                bases = matrix[site]
                if len(bases[0]) :
                    seq[site[0]][site[1]-1] = bases[0][j]
            sequences.append(seq)
        with uopen(prefix + '.fasta.gz', 'w') as fout :
            for id, n in enumerate(sorted(refSeq)) :
                if id :
                    fout.write('=\n')
                for (mTag, mFile), seq in zip(alignments, sequences) :
                    fout.write('>{0}:{1}\n{2}\n'.format(mTag, n, ''.join(seq[n])))
    return outputs