def add_seq_to_reads(read_file, out_file):
    """Copy a FASTA file, appending each record's sequence to its name.

    read_file -- path of the FASTA file to read.
    out_file -- path of the FASTA file to write (opened with mode 'w').

    Every record is written back with the header ``<name>;<sequence>`` and
    an unchanged sequence.  All records are collected before any is written.
    """
    with open(read_file) as handle, open(out_file, 'w') as ohandle:
        records = GeneralSeqTools.fasta_reader(handle)
        tagged = [(name + ';' + seq, seq) for name, seq in records]
        GeneralSeqTools.fasta_writer(ohandle, tagged)
def test_fasta_writer():
    """fasta_writer should emit a '>name' line then a sequence line per record."""
    records = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
               ('test2', 'ATCGATGC')]
    # Expected text: header and sequence on separate lines, newline-terminated.
    expected = ''.join('>%s\n%s\n' % record for record in records)

    buf = StringIO()
    GeneralSeqTools.fasta_writer(buf, records)
    buf.seek(0)
    written = buf.read()

    eq_(expected, written)
def run_mafft(inseqs):
    """Align sequences with the external ``mafft`` program.

    inseqs -- iterable of (name, sequence) tuples.  It is materialised once,
              so a generator is accepted as well as a list.

    Returns a list of (name, aligned_sequence) tuples in the same order as
    the input, regardless of the order mafft reports them in.

    Raises whatever check_output raises when mafft fails, and KeyError if a
    name from the input is missing from mafft's output.
    """
    # The input is walked twice (names, then writing); a one-shot iterator
    # would be empty by the time it reached fasta_writer.
    inseqs = list(inseqs)
    orig_order = [name for name, _ in inseqs]

    with NTF(suffix='.fasta') as handle:
        GeneralSeqTools.fasta_writer(handle, inseqs)
        # Make sure the data is on disk before another process reads it.
        handle.flush()
        os.fsync(handle.fileno())
        # Argument list instead of a formatted string: a temp path that
        # contains spaces or quotes is then passed through untouched.
        cmd = ['mafft', '--quiet', '--op', '10', '--ep', '0.123', handle.name]
        # Must run inside the ``with``: handle.name has to exist while mafft
        # reads it (NTF is presumably NamedTemporaryFile, which removes the
        # file on close -- confirm against the import).
        out = check_output(cmd)

    out_dict = dict(GeneralSeqTools.fasta_reader(StringIO(out)))
    return [(name, out_dict[name]) for name in orig_order]
def test_seq_map_to_ref():
    """seq_map_to_ref should return the query without the reference-gap columns.

    The expected string is ``seq_align`` with the two columns removed where
    ``ref_align`` has '-'.
    """
    ref_align = 'ATCTCT--ATCT'
    seq_align = 'A-CCCT-AATCT'
    expected = 'A-CCCTATCT'

    mapped = GeneralSeqTools.seq_map_to_ref(seq_align, ref_align)

    eq_(mapped, expected)
def test_convert_seqs_to_dataframe():
    """convert_seqs_to_dataframe should give one row per sequence, one column per position."""
    inseqs = [('seq1', 'ATCGATTGC'), ('seq2', 'ATCGATTGC')]
    # Reference frame: names as columns of per-letter lists, then transposed
    # so that each sequence becomes a row.
    letters_by_name = dict((name, list(seq)) for name, seq in inseqs)
    expected = DataFrame(letters_by_name).T

    result = GeneralSeqTools.convert_seqs_to_dataframe(inseqs)

    ok_((result == expected).all().all())
def test_convert_seqDF_to_list():
    """convert_seqDF_to_list should turn a per-letter frame back into (name, seq) tuples."""
    expected = [('seq1', 'ATCGATTGC'), ('seq2', 'ATCGATTGC')]
    # Input frame: one row per sequence, one column per position.
    letters_by_name = dict((name, list(seq)) for name, seq in expected)
    frame = DataFrame(letters_by_name).T

    result = GeneralSeqTools.convert_seqDF_to_list(frame)

    eq_(result, expected)
def test_fasta_reader():
    """fasta_reader should join wrapped sequence lines and yield (name, seq) pairs.

    The first record is split over two lines in the input and must come
    back as one sequence; the input has no trailing newline.
    """
    fasta_text = '\n'.join([
        '>test1',
        'ATCTGCTAGTCGA',
        'ATCGAGTAGT',
        '>test2',
        'ATCGATGC',
    ])
    expected = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
                ('test2', 'ATCGATGC')]

    records = list(GeneralSeqTools.fasta_reader(StringIO(fasta_text)))

    eq_(len(records), 2)
    for idx, (name, seq) in enumerate(expected):
        eq_(records[idx][0], name)
        eq_(records[idx][1], seq)
def test_seq_align_to_ref_multi():
    """seq_align_to_ref with max_workers=5 should align ten identical queries, in order."""
    ref_seq = 'ATCGATTGC'
    n_copies = 10
    queries = [('test1', 'ATCGATGC')] * n_copies
    expected = [('test1', 'ATCGA-TGC')] * n_copies

    aligned = list(GeneralSeqTools.seq_align_to_ref(queries, ref_seq, max_workers=5))

    eq_(aligned, expected)
def align_to_ref(ref_seq, base_seq):
    """Align a query sequence against a reference using call_muscle.

    ref_seq -- The reference sequence to use as a guide.
    base_seq -- The query sequence.

    Returns:
    query_aln -- The aligned query sequence.
    ref_aln -- The aligned reference sequence.

    Note the return order is (query, reference), the reverse of the
    argument order.  Nothing in this body caches results; any caching would
    have to come from a decorator outside this block.
    """
    labelled = [('query', base_seq), ('ref', ref_seq)]
    by_name = dict(GeneralSeqTools.call_muscle(labelled))
    query_aln = by_name['query']
    ref_aln = by_name['ref']
    return query_aln, ref_aln
def GetConSeq(region, alphabet=generic_dna, subtype='B', drop_gaps=True):
    """Return the consensus sequence of a region for one subtype.

    region -- region name, passed to get_region_file / get_region_span.
    alphabet -- 'dna' or 'pro' (any case), or the generic_dna /
                generic_protein alphabet objects.
    subtype -- subtype label; the record named 'CONSENSUS_<subtype>' (text
               before any '(' in the FASTA name) is used.
    drop_gaps -- when True '-' characters are removed from the result.

    '$' characters are always removed from the returned sequence.

    When get_region_file returns None the sequence is cut out of the region
    reported by get_region_span: start and stop are counted along the
    non-gap letters of the subtype-B consensus (1-based), translated to
    alignment columns, and the same columns are sliced from the requested
    subtype.  The stop column itself is excluded.

    Raises:
    TypeError -- alphabet is not one of the accepted values.
    ValueError -- the region file has no CONSENSUS_<subtype> record.
    """
    # Alphabet objects have no .lower(); only lower-case values that do, so
    # that passing generic_protein reaches the protein branch.
    kind = alphabet.lower() if hasattr(alphabet, 'lower') else None
    if (alphabet == generic_dna) or (kind == 'dna'):
        path = get_region_file(region, 'dna')
    elif (alphabet == generic_protein) or (kind == 'pro'):
        path = get_region_file(region, 'pro')
    else:
        # Instantiate the exception so the message actually reaches the caller.
        raise TypeError('alphabet must be: "dna", "pro", generic_dna, generic_protein')

    if path is None:
        new_region, start, stop = get_region_span(region, alphabet)
        conB_seq = GetConSeq(new_region, subtype='B', alphabet=alphabet, drop_gaps=False)
        sub_seq = GetConSeq(new_region, subtype=subtype, alphabet=alphabet, drop_gaps=False)
        # Diagnostic output kept from the original implementation.
        print(conB_seq)
        print(sub_seq)

        nstart, nstop = (None, None)
        conb_pos = 0
        for aln_pos, l in enumerate(conB_seq):
            if l == '-':
                # Gap columns do not advance the consensus-B coordinate and
                # must not move an already recorded boundary.
                continue
            conb_pos += 1
            if conb_pos == start:
                nstart = aln_pos
            if conb_pos == stop:
                nstop = aln_pos
                break
        # A boundary that was never reached stays None, i.e. an open slice end.
        seq = sub_seq[nstart:nstop]
    else:
        wanted_key = 'CONSENSUS_' + subtype
        with open(path) as handle:
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                if name.split('(')[0] == wanted_key:
                    break
            else:
                # Without this the last record of the file (or None for an
                # empty file) would be returned as if it were the consensus.
                raise ValueError('no %s record found in %s' % (wanted_key, path))

    if drop_gaps:
        return seq.replace('-', '').replace('$', '')
    return seq.replace('$', '')
def get_region(seq, reference, regions = None):
    """Yield, for each region, the piece of seq between two reference positions.

    seq -- the sequence to cut.
    reference -- the reference sequence the region coordinates refer to.
    regions -- iterable of (start, stop) or (name, start, stop) tuples; only
               the last two items of each are used.  Defaults to
               [(300, 400)].

    seq and reference are aligned with call_muscle.  start and stop are
    counted along the non-gap letters of the aligned reference (1-based) and
    converted to alignment columns; the stop column is excluded.  A boundary
    that is never reached leaves that end of the slice open.

    This is a generator: the alignment only runs once iteration starts.
    """
    if regions is None:
        regions = [(300, 400)]

    tmp_seqs = [('conc', reference), ('guess', seq)]
    aligned = dict(GeneralSeqTools.call_muscle(tmp_seqs))
    ref_aln = aligned['conc']

    for region in regions:
        # Accept both 2-tuples (the default) and named 3-tuples.
        start, stop = region[-2:]

        conc_pos = 0
        # Reset both ends for every region so nothing carries over from the
        # previous one.
        align_start = None
        align_stop = None
        for align_pos, l in enumerate(ref_aln):
            if l == '-':
                # Gaps in the reference do not advance its coordinate.
                continue
            conc_pos += 1
            if conc_pos == start:
                align_start = align_pos
            if conc_pos == stop:
                align_stop = align_pos
                break

        # NOTE(review): the columns are alignment coordinates but they are
        # applied to the unaligned ``seq``; aligned['guess'] is never used.
        # This looks like it was meant to slice aligned['guess'] -- confirm
        # with the callers before changing it.
        yield seq[align_start:align_stop].replace('-', '')
# <codecell>

# Make the local PySeqUtils checkout importable before the imports below.
import sys
sys.path.append('/home/will/PySeqUtils/')

# <codecell>

import TreeingTools
import GeneralSeqTools
import dendropy

# <codecell>

# Load every (name, sequence) record of the alignment into memory.
with open('/home/will/SubCData/mafft_ep.fasta') as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))

# <codecell>

import os
import os.path
import csv
from itertools import product
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from operator import methodcaller
from itertools import groupby
from Bio.Seq import Seq
from Bio import Motif
from Bio.Alphabet import IUPAC
# <codecell>

# Outer-join the two tables on SingleID, then keep the first row per SingleID.
pat_data = (
    pd.merge(redcap_data, df,
             left_on='SingleID', right_on='SingleID',
             how='outer')
    .groupby('SingleID')
    .first()
)

# <codecell>

import glob

ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = {}
for f in ltr_files:
    # Only the first record of each file is used.
    with open(f) as handle:
        _, seq = next(GeneralSeqTools.fasta_reader(handle))
    # Key: the file's base name up to its last '-'.
    fname = os.path.basename(f).rsplit('-', 1)[0]
    ltr_seqs[fname] = seq

# <codecell>

ltr_df = pd.DataFrame({'LTR': pd.Series(ltr_seqs)})
ltr_df.head()

# <codecell>

conb_ltr = ConSeqs.GetConSeq('ltr')
conb_ltr
writer.writerow((gbm, acc)) # <codecell> files = [('C', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/C_*'))), ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*')))] seqs = [] for sub, sfiles in files: for f in sfiles: with open(f) as handle: base_name = f.rsplit(os.sep,1)[1].rsplit('.',1)[0] prot = base_name.split('_')[1] for name, seq in GeneralSeqTools.fasta_reader(handle): seqs.append({ 'Seq':seq, 'ID':gi_to_acc_dict[name], 'Prot':prot, 'Subtype':sub }) seqdf = pd.DataFrame(seqs) # <codecell> pseqdf = pd.pivot_table(seqdf, rows = ['Subtype', 'ID'], cols = 'Prot', values = 'Seq',
# Record the accession for every gi and mirror each pair to the csv writer.
for num, (gbm, acc) in enumerate(imap(get_gi_acc, gb_files)):
    # Progress marker: at item 100 and at every multiple of 50000 (incl. 0).
    if (num == 100) or (num % 50000 == 0):
        print(num)
    gi_to_acc_dict[gbm] = acc
    writer.writerow((gbm, acc))

# <codecell>

files = [
    ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*'))),
]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        with open(f) as handle:
            # File name without directory and extension; the protein name
            # is the second '_'-separated field.
            base_name = f.rsplit(os.sep, 1)[1].rsplit('.', 1)[0]
            prot = base_name.split('_')[1]
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                seqs.append({
                    'Seq': seq,
                    'ID': gi_to_acc_dict[name],
                    'Prot': prot,
                })

seqdf = pd.DataFrame(seqs)

# <codecell>

# One row per ID, one column per protein, first sequence seen for each pair.
# NOTE(review): rows=/cols= look like the old pandas spelling of
# index=/columns= -- confirm against the pandas version in use.
pseqdf = pd.pivot_table(
    seqdf,
    rows='ID',
    cols='Prot',
    values='Seq',
    aggfunc='first',
)
# <codecell>

import GeneralSeqTools
import glob

# <codecell>

import pandas as pd

files = sorted(glob.glob('/home/will/HIVTropism/LANLdata/SubB*.fasta'))
seqs = []
for f in files:
    # Protein name: second '-'-separated field of the file name, taken
    # before its first '.'.
    prot_name = f.split('/')[-1].split('.')[0].split('-')[1]
    print(prot_name)
    with open(f) as handle:
        for name, seq in GeneralSeqTools.fasta_reader(handle):
            # Sequences are stored ungapped and upper-cased.
            seqs.append({
                'GI': name,
                'Seq': seq.replace('-', '').upper(),
                'Prot': prot_name,
            })

# <codecell>

# One row per GI, one column per protein, first sequence seen for each pair.
# NOTE(review): rows=/cols= look like the old pandas spelling of
# index=/columns= -- confirm against the pandas version in use.
seq_df = pd.pivot_table(
    pd.DataFrame(seqs),
    rows='GI',
    cols='Prot',
    values='Seq',
    aggfunc='first',
)
except ValueError: print fname seqs.append((pid, vn, prot, 1)) df = pd.DataFrame(seqs, columns=["Patient ID", "VisitNum", "Prot", "HasSeq"]) has_seq = pd.pivot_table(df, rows=["Patient ID", "VisitNum"], cols="Prot", values="HasSeq") # <codecell> import sys sys.path.append("/home/will/PySeqUtils/") import GeneralSeqTools with open("/home/will/DrugStuff/pat_data.fasta") as handle: seqs = list(GeneralSeqTools.fasta_reader(handle)) out = GeneralSeqTools.WebPSSM_V3_fasta(seqs) # <codecell> tmp = [] for row in out: parts = row[0].split("-") if len(parts) == 2: pat, vnum = parts else: pat, vnum, _ = parts tmp.append({"Patient ID": pat, "VisitNum": vnum, "IsR5": row[2] == "0", "IsX4": row[2] == "1"}) tropism = pd.DataFrame(tmp).groupby(["Patient ID", "VisitNum"]).first()
def test_muscle_basic_call():
    """call_muscle should gap the shorter sequence and keep the input order."""
    unaligned = [('test1', 'ATCGATTGC'), ('test2', 'ATCGATGC')]
    expected = [('test1', 'ATCGATTGC'), ('test2', 'ATCGA-TGC')]

    aligned = list(GeneralSeqTools.call_muscle(unaligned))

    eq_(aligned, expected)