def score_seq(known, guess, gapopen=10, gapextend=1):
    """Score the alignment of two sequences with the EMBOSS ``needle`` tool.

    Each sequence is written to its own temporary FASTA file, ``needle`` is
    run with ``-aformat score`` and the score is read back from the
    temporary output file.

    Args:
        known: sequence written as record ``SeqA`` (``-asequence`` input).
        guess: sequence written as record ``Seq1`` (``-bsequence`` input).
        gapopen: value passed to ``-gapopen`` (formatted with ``%f``).
        gapextend: value passed to ``-gapextend``.

    Returns:
        The score as a float, parsed from the first output line that has
        exactly four whitespace-separated fields, or ``None`` when no such
        line is present.

    Raises:
        Whatever ``check_call`` raises when ``needle`` fails (presumably
        ``subprocess.CalledProcessError`` -- confirm against the import).
    """
    cmd = 'needle -asequence %(cb)s -bsequence %(seq)s -aformat score -gapopen %(go)f -gapextend %(ge)s -outfile %(out)s'

    with NamedTemporaryFile() as conb_handle, \
            NamedTemporaryFile() as seq_handle, \
            NamedTemporaryFile() as out_handle:

        inputs = ((conb_handle, ('SeqA', known)),
                  (seq_handle, ('Seq1', guess)))
        for handle, record in inputs:
            fasta_writer(handle, [record])
            # needle opens the file by name, so the data must be on disk
            handle.flush()
            os.fsync(handle.fileno())

        param_dict = {
            'cb': conb_handle.name,
            'seq': seq_handle.name,
            'out': out_handle.name,
            'go': gapopen,
            'ge': gapextend,
        }
        check_call(shlex.split(cmd % param_dict))

        for line in out_handle:
            parts = line.split()
            if len(parts) == 4:
                # The last field looks like "(score)".  split() has already
                # removed the newline, so only the parentheses are stripped;
                # slicing with [1:-2] would also drop the last digit.
                return float(parts[-1].strip('()'))
    return None
def map_seqs_to_ref(input_seqs, retry=0):
    """Maps a set of (name, seq) pairs to HXB2 using LANL.

    The sequences are rendered as FASTA text, pasted into the ``SEQ`` field
    of the second form on the LANL LOCATE page and submitted.  The response
    is parsed and one dict per region row is collected, each tagged with the
    name of the sequence it belongs to under the ``'Name'`` key.

    If parsing the response raises ``IncompleteRead`` the whole submission
    is repeated with ``retry + 1``; once ``retry`` exceeds 5 a bare
    ``ValueError`` is raised instead.

    Returns:
        list of row dicts as produced by ``yield_row_vals``.
    """
    fasta_buffer = StringIO()
    fasta_writer(fasta_buffer, input_seqs)
    fasta_text = fasta_buffer.getvalue()

    browser = build_browser()
    browser.open('http://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html')
    logging.debug('Opened Browser to LANL')

    browser.select_form(nr=1)
    browser.form['SEQ'] = fasta_text
    response = browser.submit()
    logging.info('Submitted Seqs to LANL')

    try:
        soup = BeautifulSoup(response)
    except IncompleteRead:
        if retry <= 5:
            return map_seqs_to_ref(input_seqs, retry=retry+1)
        raise ValueError

    per_sequence = zip(yield_seq_names(soup),
                       yield_query_seqs(soup),
                       yield_seq_tables(soup))
    rows = []
    count = 0
    for count, (name, seq, table) in enumerate(per_sequence, 1):
        for row in yield_row_vals(table, seq):
            row['Name'] = name
            rows.append(row)

    logging.info('LANL returned %i regions for %i patients' % (len(rows), count))
    return rows
def _write_seqs(self, X, handle):
    """Write every row of ``X`` to ``handle`` as FASTA and sync it to disk.

    Row ``i`` becomes the record ``Seq-00i``; its items are joined into one
    string and every non-alphabetic character is dropped before writing.
    The handle is flushed and fsync'ed afterwards.
    """
    records = []
    for idx in range(X.shape[0]):
        joined = ''.join(X[idx])
        letters_only = ''.join(ch for ch in joined if ch.isalpha())
        records.append(('Seq-%03i' % idx, letters_only))

    fasta_writer(handle, records)
    handle.flush()
    os.fsync(handle.fileno())
def run_FastTree(seqs, alphabet=generic_protein, tmp_path=None, uniq_seqs=False):
    """Build a tree from (name, seq) pairs with the FastTree executable.

    The executable is looked up next to this module
    (``os.path.dirname(__file__)``).

    Args:
        seqs: iterable of ``(name, sequence)`` pairs.
        alphabet: when equal to ``generic_dna`` FastTree is run with ``-nt``.
        tmp_path: directory for the temporary FASTA file (``None`` uses the
            default of the temp-file helper).
        uniq_seqs: when true, identical sequences are collapsed to a single
            placeholder record before running FastTree; afterwards the
            placeholder leaf is relabelled with the first original name and
            every further duplicate is added as a sibling leaf with the same
            edge length.

    Returns:
        ``dendropy.Tree`` parsed from FastTree's newick output.
    """
    if uniq_seqs:
        # Group the original names by sequence and give each distinct
        # sequence a placeholder name (the last index seen wins).
        names_by_seq = defaultdict(list)
        placeholder_by_seq = {}
        for num, (name, seq) in enumerate(seqs):
            names_by_seq[seq].append(name)
            placeholder_by_seq[seq] = 'Seq-%i' % num

        uni_seqs = []
        originals_by_placeholder = {}
        for seq, placeholder in placeholder_by_seq.items():
            uni_seqs.append((placeholder, seq))
            originals_by_placeholder[placeholder] = names_by_seq[seq]

        out_tree = run_FastTree(uni_seqs, alphabet=alphabet, tmp_path=tmp_path)
        tax_set = out_tree.taxon_set

        for placeholder, originals in originals_by_placeholder.items():
            node = out_tree.find_node_with_taxon_label(placeholder)
            if node:
                names = iter(originals)
                # the existing leaf takes the first original name ...
                node.taxon.label = next(names)
                parent = node.parent_node
                edge_dist = node.edge.length
                # ... and each remaining duplicate becomes a sibling leaf
                for name in names:
                    parent.new_child(taxon=tax_set.new_taxon(label=name),
                                     edge_length=edge_dist)
        return out_tree

    base_path = os.path.dirname(__file__)
    with NTF(dir=tmp_path, suffix='.fasta') as handle:
        fasta_writer(handle, seqs)
        # FastTree reads the file by name, so force the data to disk first
        handle.flush()
        os.fsync(handle.fileno())

        # Build argv directly instead of formatting a string and splitting
        # it: a module directory or tmp_path containing whitespace or quotes
        # would otherwise be broken into several arguments.
        cmd_list = [os.path.join(base_path, 'FastTree')]
        if alphabet == generic_dna:
            cmd_list.append('-nt')
        cmd_list.extend(['-quiet', handle.name])

        tree_str = check_output(cmd_list)
    return dendropy.Tree(stream=StringIO(tree_str), schema='newick')
def testSeqTransformer_from_fasta():
    """get_from_fasta_handle returns the record names and '-'-padded sequences."""
    inseqs = [('Seq1', 'ATGTCG'),
              ('Seq2', 'ATGG'),
              ('Seq3', 'ATGTAHYTD')]
    expected_names = ['Seq1', 'Seq2', 'Seq3']
    # shorter sequences are expected to be padded to the longest one
    expected_seqs = np.array(['ATGTCG---',
                              'ATGG-----',
                              'ATGTAHYTD'])

    handle = StringIO()
    fasta_writer(handle, inseqs)
    handle.seek(0)

    names, out = HIVAlignTools.SeqTransformer.get_from_fasta_handle(handle)

    ok_(np.all(out == expected_seqs))
    ok_(all(got == want for got, want in zip(names, expected_names)))
def align_with_lastz(input_seqs, ref_seqs):
    """Aligns set of query sequences with a reference.

    Both sequence sets are written as FASTA files inside a temporary
    directory, ``call_lastz`` is run on them, and the result file is parsed
    with ``SAMreader``.

    Returns:
        list of the items yielded by ``SAMreader`` for the result file.
    """
    with tmp_directory() as tmp_dir:
        seq_file, ref_file, out_file = [
            os.path.join(tmp_dir, fname)
            for fname in ("query.fasta", "ref.fasta", "res.fasta")
        ]

        for fasta_path, records in ((seq_file, input_seqs),
                                    (ref_file, ref_seqs)):
            with open(fasta_path, "w") as handle:
                fasta_writer(handle, records)

        call_lastz(seq_file, ref_file, out_file)

        # materialise the results before the temporary directory goes away
        with open(out_file) as handle:
            return list(SAMreader(handle))
def blast_all_v_all(seqsA, seqsB, block_size=20):
    """BLAST blocks of ``seqsB`` against a database built from ``seqsA``.

    ``seqsA`` is written to a FASTA file in a fixed scratch directory and
    turned into a nucleotide database with ``makeblastdb``.  Blocks of
    ``seqsB`` are then mapped through ``check_seqs`` in a pool of five
    worker processes, and the index and length of each result block are
    printed.  Nothing is returned.

    NOTE(review): ``block_size`` is not used anywhere in the body -- the
    block length (200) and the number of blocks processed (first 20) are
    hard-coded below.  Confirm which of the two it was meant to control.
    """
    dpath = '/home/will/tmpstuf/haptest/tmpseqs/'

    # delete=False: the database FASTA has to outlive this handle
    with NTF(suffix='.fa', dir=dpath, delete=False) as db_handle:
        fasta_writer(db_handle, seqsA)
        db_handle.flush()
        os.fsync(db_handle.fileno())

        makedb_cmd = 'makeblastdb -in %s -dbtype nucl' % db_handle.name
        check_call(shlex.split(makedb_cmd))

        align_func = partial(check_seqs, db_handle.name)
        blocks = yield_blocks(iter(seqsB), 200)
        check_iterable = islice(blocks, 20)

        with ProcessPoolExecutor(max_workers=5) as pool:
            for num, block in enumerate(pool.map(align_func, check_iterable)):
                print('%s %s' % (num, len(block)))
def check_seqs(db_path, seqs):
    """Run ``blastn`` for ``seqs`` against ``db_path`` and return the hits.

    The sequences are written to a temporary FASTA file in a fixed scratch
    directory, ``blastn`` is run with CSV output (``-outfmt 10``) limited to
    one target per query, and the output is parsed into dicts.

    Args:
        db_path: value passed to ``blastn -db``.
        seqs: sequences handed to ``fasta_writer`` for the query file.

    Returns:
        list of dicts with the keys ``'SeqA'``, ``'SeqB'``, ``'pident'`` and
        ``'nident'`` (all values are strings as read by ``csv``).

    Raises:
        Whatever ``check_output`` raises when ``blastn`` fails.
    """
    cmd = "blastn -db %(db)s -query %(q)s -outfmt '10 qseqid sseqid pident nident' -num_threads 20 -max_target_seqs 1"
    fields = ['SeqA', 'SeqB', 'pident', 'nident']
    dpath = '/home/will/tmpstuf/haptest/tmpseqs/'

    # delete=False lets the file be closed before blastn reads it; it is
    # removed explicitly below so that calls do not pile up query files.
    check_handle = NTF(suffix='.fa', dir=dpath, delete=False)
    try:
        with check_handle:
            fasta_writer(check_handle, seqs)
            check_handle.flush()
            os.fsync(check_handle.fileno())

        tdict = {
            'db': db_path,
            'q': check_handle.name,
        }
        out = check_output(shlex.split(cmd % tdict))
    finally:
        os.remove(check_handle.name)

    reader = csv.DictReader(StringIO(out), fieldnames=fields)
    return list(reader)
sys.path.append('/home/will/PySeqUtils/')

# <codecell>

from GeneralSeqTools import fasta_reader, fasta_writer, WebPSSM_V3_series
import glob

# <codecell>

# Copy each '.old' FASTA file to its new name, dropping the first and the
# last character of every sequence on the way.
files = [('x4_seqs.fasta.old', 'x4_seqs.fasta'),
         ('r5_seqs.fasta.old', 'r5_seqs.fasta')]
for ifile, ofile in files:
    with open(ifile) as handle, open(ofile, 'w') as ohandle:
        for name, seq in fasta_reader(handle):
            fasta_writer(ohandle, [(name, seq[1:-1])])

# <codecell>

# Collect (GI, subtype) pairs: the GI is the file name up to the first '.',
# the subtype is the first line of the file; 'Unk' entries are skipped.
subtype_files = glob.glob('/home/will/WLAHDB_data/SubtypeGuess/*.gb')
subtypes = []
for f in subtype_files:
    gb = f.rsplit(os.sep, 1)[-1].split('.')[0]
    with open(f) as handle:
        subtype = next(handle).strip()
    if subtype == 'Unk':
        continue
    subtypes.append((int(gb), subtype))

subtype_df = pd.DataFrame(subtypes, columns=['GI', 'Subtype'])
subtype_ser = subtype_df.groupby('GI')['Subtype'].first()
'Tat-1-seq-align', 'Tat-2-seq-align', 'LTR-seq-align'] four = wanted_data[fourkb_cols].dropna() wseqs = set() with open('/home/will/Dropbox/HIVseqs/BensTropismLabels.csv') as handle: for row in csv.DictReader(handle, delimiter=','): wseqs.add(row['Patient ID']) for col in four.columns: found = set() prot = col.rsplit('-', 2)[0] fname = 'AlignForBenj/fourKB_%s.fasta' % prot with open(fname, 'w') as handle: for seq, name in zip(four[col], four.index): if name in wseqs and name not in found: fasta_writer(handle, [(name+'-'+trop_dict[name], ''.join(seq))]) found.add(name) print prot, len(found) # <codecell> foukb_lanl = ['AB078005', 'AB221126', 'AB253432', 'AB286955', 'AB287365', 'AB287367', 'AB287368', 'AB287369', 'AB480695', 'AB485642', 'AB565479', 'AB565496', 'AB565497', 'AB565499', 'AB565500', 'AB565502', 'AB604946', 'AB604948', 'AB604950', 'AB604951', 'AB641836', 'AF003887', 'AF003888', 'AF004394', 'AF042100', 'AF042101', 'AF538302', 'AF538303', 'AF538307', 'AJ271445', 'AY173953', 'AY352275', 'AY835748', 'AY835754', 'AY835759', 'AY835762', 'AY835766', 'AY835769', 'AY835770', 'AY835774',
# NOTE(review): this append presumably belongs to a loop whose header lies
# above this view -- confirm its indentation against the full notebook.
out_seqs.append({
    'Accession': name,
    'PSSM': tdata.ix[name]['PSSMScore'],
    'Seq': seq,
})
out_df = pd.DataFrame(out_seqs)

# <codecell>

# Write each distinct sequence of a wanted accession once.
with open('extra_brain.fasta', 'w') as handle:
    found = set()
    for name, seq in aa_seq_list:
        if name not in wanted_acc or seq in found:
            continue
        fasta_writer(handle, [(name+'NEWSEQS!!!!!!', seq)])
        found.add(seq)

# <codecell>

out_df.to_csv('brain_x4.tsv', sep='\t')

# <codecell>

ax = trim_lanl.boxplot(column='PSSMScore',
                       by='STissue',
                       vert=False,
                       figsize=(10, 10))
# extend the lower y limit to -1 so the labels drawn at y = -0.5 fit
ax.set_ylim([-1, ax.get_ylim()[1]])

order = ['R5-E', 'R5-1', 'R5-2', 'R5-3', 'R5-P', 'X4-P', 'X4']
for line, name in zip(pssm_bins, order):
    ax.annotate(name, (line-1.5, -0.5), fontsize=10)
ax.annotate('X4', (-1.5, -0.5), fontsize=10)
from Bio.SeqIO.AbiIO import AbiIterator files = glob.glob('../Wigdahl Trace files/2:11:11/*.ab1') seqs = [] for f in files: rec = AbiIterator(open(f, mode = 'rb'), trim = True).next() seqs.append( (rec.id, rec.seq.tostring()) ) # <codecell> !/home/will/staden-2.0.0b9.x86_64/bin/convert_trace --help # <codecell> res = call_muscle(seqs) with open('align_data.fasta', 'w') as handle: fasta_writer(handle, res) # <codecell> from HIVTransTool import process_seqs results = list(process_seqs(seqs[:50], extract_regions = True, known_names = 50)) # <codecell> for row in results: if row['RegionName'] == 'LTR5': print row['Name'], row['QueryNuc'] # <codecell>
stop = -1
path = 'HIV1_ALL_2012_env_PRO.fasta'
outpath = 'HIV1_ALL_2012_gp41_PRO.fasta'

# Preview: first and last five characters of the slice for the first 20 records.
with open(path) as handle:
    for name, seq in islice(fasta_reader(handle), 20):
        tseq = seq[start:stop]
        print('%s %s' % (tseq[:5], tseq[-5:]))

# <codecell>

# Slice every record to [start:stop] and write the result to outpath.
with open(path) as handle:
    seqs = [(name, seq[start:stop]) for name, seq in fasta_reader(handle)]

with open(outpath, 'w') as handle:
    fasta_writer(handle, seqs)

# <codecell>

from Bio import Entrez
from Bio import SeqIO

# Fetch the listed nucleotide records from Entrez as GenBank text and parse them.
ids = '544451412,544451410,544451408,544451406,544451404,544451402,544451400,544451398,544451396'
fetch_handle = Entrez.efetch(db="nucleotide",
                             id=ids,
                             rettype="gb",
                             retmode="text")
records = list(SeqIO.parse(fetch_handle, "gb"))

# <codecell>

rec = records[0]