def add_seq_to_reads(read_file, out_file):
    """Copy a FASTA file, appending every record's sequence to its name.

    Each ``(name, seq)`` record read from *read_file* is written to
    *out_file* as ``(name + ';' + seq, seq)``.  All records are collected
    before anything is written.
    """
    with open(read_file) as src, open(out_file, 'w') as dst:
        tagged = [
            (name + ';' + seq, seq)
            for name, seq in GeneralSeqTools.fasta_reader(src)
        ]
        GeneralSeqTools.fasta_writer(dst, tagged)
def test_fasta_reader():
    # Two records; the first one has its sequence wrapped over two lines,
    # which the reader is expected to join back into a single string.
    fasta_text = '\n'.join([
        '>test1',
        'ATCTGCTAGTCGA',
        'ATCGAGTAGT',
        '>test2',
        'ATCGATGC',
    ])
    expected = [
        ('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
        ('test2', 'ATCGATGC'),
    ]

    records = list(GeneralSeqTools.fasta_reader(StringIO(fasta_text)))

    eq_(len(records), 2)
    for got, (want_name, want_seq) in zip(records, expected):
        eq_(got[0], want_name)
        eq_(got[1], want_seq)
def run_mafft(inseqs):
    """Align sequences with the ``mafft`` command-line tool.

    Parameters:
        inseqs: iterable of ``(name, sequence)`` pairs.  Any iterable is
            accepted, including one-shot generators.

    Returns:
        A list of ``(name, aligned_sequence)`` pairs in the same order as
        the input.  An empty input yields an empty list without invoking
        mafft.

    Raises:
        subprocess.CalledProcessError: if mafft exits with a non-zero status.
        KeyError: if a name from the input is missing from mafft's output.
    """
    # The input is walked twice (once for the order, once for writing), so
    # a generator must be materialised or the writer would see nothing.
    inseqs = list(inseqs)
    if not inseqs:
        return []

    orig_order = [name for name, _ in inseqs]
    with NTF(suffix='.fasta') as handle:
        GeneralSeqTools.fasta_writer(handle, inseqs)
        # Make sure the data is on disk before the child process reads it.
        handle.flush()
        os.fsync(handle.fileno())
        # Pass an argument list rather than splitting a formatted string so
        # a temp path containing whitespace is not broken into pieces.
        cmd = ['mafft', '--quiet', '--op', '10', '--ep', '0.123', handle.name]
        out = check_output(cmd)

    # mafft output is re-keyed by name so the caller's ordering is restored.
    out_dict = dict(GeneralSeqTools.fasta_reader(StringIO(out)))
    return [(name, out_dict[name]) for name in orig_order]
def GetConSeq(region, alphabet=generic_dna, subtype='B', drop_gaps=True):
    """Return the consensus sequence of *region* for *subtype*.

    Parameters:
        region: region name, handed to ``get_region_file`` and, when no
            file exists for it, to ``get_region_span``.
        alphabet: ``generic_dna``, ``generic_protein`` or one of the
            strings ``'dna'`` / ``'pro'`` (case-insensitive).
        subtype: suffix of the wanted ``CONSENSUS_<subtype>`` record.
        drop_gaps: when true, ``'-'`` characters are removed from the
            result.  ``'$'`` characters are always removed.

    Returns:
        The consensus sequence as a string.

    Raises:
        TypeError: if *alphabet* is none of the accepted values.
        ValueError: if the region file holds no ``CONSENSUS_<subtype>``
            record.
    """
    # Compare against the alphabet objects first; only fall back to string
    # handling afterwards, since those objects need not offer ``lower()``.
    if alphabet == generic_dna:
        kind = 'dna'
    elif alphabet == generic_protein:
        kind = 'pro'
    else:
        kind = alphabet.lower() if hasattr(alphabet, 'lower') else None
        if kind not in ('dna', 'pro'):
            raise TypeError('alphabet must be: "dna", "pro", generic_dna, generic_protein')
    path = get_region_file(region, kind)

    seq = None
    if path is None:
        # No file for this region: slice it out of another region's
        # alignment, using positions counted on the subtype-B consensus.
        new_region, start, stop = get_region_span(region, alphabet)
        conB_seq = GetConSeq(new_region, subtype='B',
                             alphabet=alphabet, drop_gaps=False)
        sub_seq = GetConSeq(new_region, subtype=subtype,
                            alphabet=alphabet, drop_gaps=False)
        # Diagnostic output carried over from the original implementation.
        print(conB_seq)
        print(sub_seq)

        nstart, nstop = None, None
        conb_pos = 0
        for aln_pos, letter in enumerate(conB_seq):
            if letter == '-':
                # Gap columns do not advance the consensus-B coordinate.
                continue
            conb_pos += 1
            if conb_pos == start:
                nstart = aln_pos
            if conb_pos == stop:
                nstop = aln_pos
                break
        # A bound that was never reached stays None, i.e. an open slice end.
        seq = sub_seq[nstart:nstop]
    else:
        wanted_key = 'CONSENSUS_' + subtype
        with open(path) as handle:
            for name, candidate in GeneralSeqTools.fasta_reader(handle):
                # Record names may carry a parenthesised suffix; ignore it.
                if name.split('(')[0] == wanted_key:
                    seq = candidate
                    break
        if seq is None:
            # Without this check the last record of the file would be
            # returned silently for an unknown subtype.
            raise ValueError('no %s record found in %s' % (wanted_key, path))

    seq = seq.replace('$', '')
    if drop_gaps:
        return seq.replace('-', '')
    return seq
# <codecell>

# Outer-join the two tables on SingleID and keep the first row per id.
merged = pd.merge(redcap_data, df,
                  left_on='SingleID', right_on='SingleID',
                  how='outer')
pat_data = merged.groupby('SingleID').first()

# <codecell>

import glob

ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = {}
for f in ltr_files:
    with open(f) as handle:
        # Only the first record of each file is used.
        _, seq = next(GeneralSeqTools.fasta_reader(handle))
    # Key: the file's base name up to (not including) its last '-'.
    fname = os.path.basename(f).rsplit('-', 1)[0]
    ltr_seqs[fname] = seq

# <codecell>

ltr_df = pd.DataFrame({'LTR': pd.Series(ltr_seqs)})
ltr_df.head()

# <codecell>

conb_ltr = ConSeqs.GetConSeq('ltr')
conb_ltr
# NOTE(review): this span is a fragment. The leading writer.writerow((gbm, acc))
# uses names bound outside the visible text (presumably the tail of a loop that
# starts earlier), and the trailing pd.pivot_table(...) call is cut off before
# its closing parenthesis.
#
# The cell in between globs the 'C_*' and 'B_*' files under
# /home/will/WLAHDB_data/SeqDump, reads every FASTA record from each file and
# appends one dict per record to `seqs`:
#   'Seq'     - the record's sequence
#   'ID'      - gi_to_acc_dict[name] for the record's name (KeyError if absent)
#   'Prot'    - second '_'-separated field of the file's base name, taken
#               after dropping the directory and the last '.'-suffix
#   'Subtype' - 'C' or 'B', according to which glob matched the file
# `seqdf` is the DataFrame built from those dicts.
writer.writerow((gbm, acc)) # <codecell> files = [('C', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/C_*'))), ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*')))] seqs = [] for sub, sfiles in files: for f in sfiles: with open(f) as handle: base_name = f.rsplit(os.sep,1)[1].rsplit('.',1)[0] prot = base_name.split('_')[1] for name, seq in GeneralSeqTools.fasta_reader(handle): seqs.append({ 'Seq':seq, 'ID':gi_to_acc_dict[name], 'Prot':prot, 'Subtype':sub }) seqdf = pd.DataFrame(seqs) # <codecell> pseqdf = pd.pivot_table(seqdf, rows = ['Subtype', 'ID'], cols = 'Prot', values = 'Seq',
# NOTE(review): this span is a fragment. It opens with the `except ValueError`
# clause of a `try` whose body is not visible; on that error the current
# `fname` is printed. How the following seqs.append(...) nests relative to the
# handler cannot be told from the visible text.
#
# Visible steps:
#   * `df` is built from `seqs` with columns Patient ID / VisitNum / Prot /
#     HasSeq, then pivoted into `has_seq` (rows: Patient ID + VisitNum,
#     columns: Prot, values: HasSeq).
#   * /home/will/PySeqUtils/ is appended to sys.path, GeneralSeqTools is
#     imported, all records of /home/will/DrugStuff/pat_data.fasta are read
#     into `seqs` (rebinding that name) and passed to WebPSSM_V3_fasta.
#   * For every row of `out`, row[0] is split on '-': two parts are taken as
#     (patient, visit); otherwise exactly three parts are expected and the
#     third is discarded. IsR5 is row[2] == "0" and IsX4 is row[2] == "1".
#   * `tropism` keeps the first row per (Patient ID, VisitNum) pair.
except ValueError: print fname seqs.append((pid, vn, prot, 1)) df = pd.DataFrame(seqs, columns=["Patient ID", "VisitNum", "Prot", "HasSeq"]) has_seq = pd.pivot_table(df, rows=["Patient ID", "VisitNum"], cols="Prot", values="HasSeq") # <codecell> import sys sys.path.append("/home/will/PySeqUtils/") import GeneralSeqTools with open("/home/will/DrugStuff/pat_data.fasta") as handle: seqs = list(GeneralSeqTools.fasta_reader(handle)) out = GeneralSeqTools.WebPSSM_V3_fasta(seqs) # <codecell> tmp = [] for row in out: parts = row[0].split("-") if len(parts) == 2: pat, vnum = parts else: pat, vnum, _ = parts tmp.append({"Patient ID": pat, "VisitNum": vnum, "IsR5": row[2] == "0", "IsX4": row[2] == "1"}) tropism = pd.DataFrame(tmp).groupby(["Patient ID", "VisitNum"]).first()
for num, (gbm, acc) in enumerate(imap(get_gi_acc, gb_files)):
    # Progress output: once at record 100, then every 50000 records
    # (record 0 included).
    if (num == 100) or (num % 50000 == 0):
        print(num)
    # Keep the mapping in memory and mirror it to the CSV writer.
    gi_to_acc_dict[gbm] = acc
    writer.writerow((gbm, acc))

# <codecell>

files = [
    ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*'))),
]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        with open(f) as handle:
            # Base name without directory and last '.'-suffix; its second
            # '_'-separated field is used as the protein label.
            base_name = f.rsplit(os.sep, 1)[1].rsplit('.', 1)[0]
            prot = base_name.split('_')[1]
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                record = {
                    'Seq': seq,
                    'ID': gi_to_acc_dict[name],
                    'Prot': prot,
                }
                seqs.append(record)
seqdf = pd.DataFrame(seqs)

# <codecell>

# One row per ID, one column per protein, first sequence seen for each pair.
pseqdf = pd.pivot_table(
    seqdf,
    rows='ID',
    cols='Prot',
    values='Seq',
    aggfunc='first',
)