def make_single_genes(seq_lists, wd='src'):
    """Generator yielding one concatenated single-gene fasta per gene index.

    For each gene index i (0 .. number of genes in the first file), walk
    every fasta file in seq_lists, log the gene's accession to
    sampling/accs<i>.txt and build a multi-fasta string holding that
    gene from each input file.

    seq_lists - sequence of file paths to conserved gene lists; assumed
                to contain the same number of genes in the same
                (ortholog) order -- TODO confirm with callers
    wd        - working directory containing the sampling/ folder

    Yields (i, gene_list) where gene_list is a fasta-formatted string
    with one record per input file, titled by the file's basename.
    """
    # read the first file only to learn how many genes there are
    n_genes = len(fasta.fasta_read(seq_lists[0], generator=False))
    for i in xrange(n_genes):
        # collect one record per input file, then join once
        # (avoids repeated string concatenation)
        records = []
        for faa in seq_lists:
            title = os.path.basename(faa)
            # NOTE(review): every file is re-read for every gene index;
            # O(files * genes) reads overall -- fine for small inputs
            seqs = fasta.fasta_read(faa, generator=False)
            # accession = token between '>' and the first space
            acc = seqs[i].name[1:seqs[i].name.find(' ')]
            # append-mode log of which accession came from which file
            with open(join(wd, 'sampling/accs%d.txt' % i), 'a') as s:
                s.write(title + '\t' + acc + '\n')
            records.append('>' + title + '\n' + seqs[i].seq + '\n')
        yield (i, ''.join(records))
def run(n1, n2, n1_nam='genome1', n2_nam='genome2', k=19, mp=''):
    '''Return MUMi value for 2 genomes.

    n1, n2           - paths to the two genome fasta files
    n1_nam, n2_nam   - labels used in the returned line
    k                - minimum match length passed to mummer via -l
    mp               - path to the mummer binary ('' -> "mummer" on PATH)

    Returns "name1<TAB>name2<TAB>mumi" with the MUMi formatted to 5 dp.
    '''
    if not mp:
        mp = 'mummer'
    # seq generators
    n1_s = fasta.fasta_read(n1)
    n2_s = fasta.fasta_read(n2)
    n1_l = 0
    n2_l = 0
    # concatenate files, because mumi cannot process multiple contigs
    with tempfile.NamedTemporaryFile(delete=True) as f1:
        with tempfile.NamedTemporaryFile(delete=True) as f2:
            f1.write('>temp_seq_1\n')
            for seq in n1_s:
                f1.write(seq.seq)
            f2.write('>temp_seq_2\n')
            for seq in n2_s:
                f2.write(seq.seq)
            # BUG FIX: flush buffered writes so mummer (which opens the
            # files by name) sees the complete sequences, not a prefix
            f1.flush()
            f2.flush()
            cmd = [mp, '-mum', '-b', '-c', '-l', '%d' % k, f1.name, f2.name]
            # run MUMmer
            output = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
            o, e = output.communicate()
            # get genome lengths: mummer reports each input's length on
            # stderr as "... <file> ... length <n>"; take the last token
            for line in e.splitlines():
                if not n1_l:
                    if f1.name in line and 'length' in line:
                        n1_l = int(line[line.rfind(' ') + 1:])
                if not n2_l:
                    if f2.name in line and 'length' in line:
                        n2_l = int(line[line.rfind(' ') + 1:])
            # get MUMi
            m = '%.5f' % give_mumi.get(from_text=o, l1=n1_l, l2=n2_l)
            return '\t'.join((n1_nam, n2_nam, m))
def run_new(start='', d='blast_data', cov=65, wd='', **kw):
    """Iteratively refine a set of reference genes against every blast db.

    start - fasta file of initial reference genes
    d     - directory holding pre-built blast databases (*.phr files)
    cov   - minimum coverage/similarity cutoff forwarded to comps()
    wd    - working directory; surviving genes are written to wd/ref_dbs/
    kw    - extra keywords; num_threads (default 7) sets block size and
            worker count for comps()

    Genes that fail to match a database in a block are dropped before the
    next block is processed, so ref_genes shrinks monotonically.
    """
    num_threads = kw.pop('num_threads', 7)
    # one db path per .phr file, with the '.phr' extension stripped
    dbs = [str(d + '/' + f[:-4]) for f in os.listdir(d) if f.endswith('.phr')]
    ref_genes = fasta.fasta_read(start, generator=False)
    # matches[gene_index][db_path] -> matched sequence object
    matches = {}
    if not os.path.isdir(os.path.join(wd, 'ref_dbs')):
        os.mkdir(os.path.join(wd, 'ref_dbs'))
    # segment dbs into blocks (size = num_threads) - blocks don't have to be full
    # process blocks
    # refine
    # repeat until all blocks completed
    blocks = []
    for i in xrange(0, len(dbs), num_threads):
        blocks.append(dbs[i:i + num_threads])
    for block in blocks:
        args = zip(range(len(block)), block)
        to_remove = []
        print len(ref_genes)
        for i, res in enumerate(
            comps(
                arg_list=args,
                cov=cov,
                ref=ref_genes,
                cores=num_threads)):
            for r in res:
                # r is presumably (gene_index, hit-or-None) -- TODO confirm
                # against comps()'s return shape
                if r[1] is None:
                    to_remove.append(r[0])
                else:
                    # nested-dict insert: first hit for this gene creates
                    # its per-db mapping
                    try:
                        matches[r[0]][block[i]] = r[1]
                    except KeyError:
                        matches[r[0]] = {block[i]: r[1]}
        # remove 0 score proteins from master list
        # NOTE(review): dbs[5] is a hard-coded "representative" database;
        # this raises IndexError with fewer than 6 dbs, and matches.pop(r)
        # raises KeyError if gene r never matched anything -- confirm intent
        for r in set(to_remove):
            print matches.pop(r)[dbs[5]].index, 'Removed'
        ref_genes = [matches[g][dbs[5]] for g in matches]
    # write the surviving matches, one fasta file per database
    for d in dbs:
        with open(os.path.join(wd, 'ref_dbs', os.path.basename(d)), 'w') as s:
            for g in matches:
                s.write(matches[g][d].fasta)
                s.write('\n')
def compare(database, file_path=None, from_text=None, from_list=None, cov=65, remove=[], **kw): removed = 0 # implemented for multi blast prot = _faa_dict.faa(database) if not from_list: g = fasta.fasta_read(file_path=file_path, from_text=from_text, generator=False) else: g = from_list print len(g) ref = [] # create blast query with open('src/tools/blast/query.fasta', 'w') as q: q.write('\n'.join(gene.fasta for gene in g)) # run (multi) blast search aln = local_blast.run(db_path=database, mr=1, make_db=False, r_a=True, outfmt='details', join_hsps=True, **kw) for index, gene in enumerate(g): try: # grab top alignment (position 0) % similarity a = aln[index][0] sim = a.psim except IndexError: # if no record, move onto the next gene for j in xrange(len(remove)): del remove[j][index - removed] removed += 1 continue if sim < cov: # if psim lower than cut off value, move onto next gene for j in xrange(len(remove)): del remove[j][index - removed] removed += 1 continue else: print index, gene.name[1:gene.name.find(' ')], a.h_acc, sim # matched lists ref.append(fasta.sequence(a.h_def, prot[a.h_acc])) # keep track of genes to keep from previous queries #indices.append(index) remove.append(ref) return remove
def fasta_chunks(fname, chunk_size=5000):
    """
    Split fasta into chunks and save (free up ram during run).

    For continuity, a list of contig_ids and chunk numbers are pickled
    for back-checking at the end of the process with checkout.py.

    fname      - path of the fasta file to split
    chunk_size - maximum number of records per chunk file

    Returns the number of chunk files written to temp/.
    """
    # NOTE(review): fasta_read is expected to return a dict here (the
    # code below uses .items()) -- confirm against its definition
    fas = fasta_read(fname)
    # BUG FIX: force float division so the partial final chunk is still
    # counted under Python 2 integer division, and cast to int so the
    # value is usable with range()
    chunk_num = int(math.ceil(len(fas) / float(chunk_size)))
    # Iterate over chunk numbers (file names are 1-based)
    for i in range(1, chunk_num + 1):
        if len(fas) > chunk_size:
            # Take the first chunk_size records, keep the rest for later
            chunk = dict(list(fas.items())[:chunk_size])
            fas = dict(list(fas.items())[chunk_size:])
        else:
            # Else take all remaining records
            chunk = fas
        fasta_write(chunk, "temp/chunk_%s.fas" % i)
        log("%s written to /temp" % i)
    return chunk_num
def make_gene_list(seq_lists, size=10, repeats=1000, wd='src'): ''' Generator Function to create samples seq_lists - sequence of file paths to conserved gene lists size - size of sample repeats - number of samples to take ''' # TODO: add check to see if samples dir is empty for rep in xrange(repeats): gene_list = '' start = True max_l = 0 for faa in seq_lists: title = os.path.basename(faa) seqs = fasta.fasta_read(faa, generator=False) if not start: # should always be true - if not, something has gone # wrong somewhere assert len(seqs) == max_l else: max_l = len(seqs) if start: start = False # get some random indices indices = [i for i in np.random.choice(range(len(seqs)), size, replace=False)] for i in indices: # write sample genes to file seq = seqs[i] #print(seq.name) with open(join(wd, 'sampling/accs%d.txt' % rep), 'a') as s: s.write(title + '\t' + seq.name[1:seq.name.find(' ')] + '\n') gene_list += '>' + title + '\n' gene_list += ''.join(seqs[i].seq for i in indices) gene_list += '\n' yield (rep, gene_list)