def main(input_files, out_csv_file, threads=5, extract_regions=True, known_names=None):
    """Process FASTA sequences and append the result rows to a tab-delimited file.

    The output file is opened in append mode so an interrupted run can be
    resumed: every name already present in the ``Name`` column of an existing
    output file is skipped.

    Parameters
    ----------
    input_files : passed straight to ``FileInput``.
    out_csv_file : path of the tab-delimited output file.
    threads, extract_regions, known_names : forwarded unchanged to
        ``process_seqs``.
    """
    fields = ['Name', 'RegionName',
              'QueryNucStart', 'QueryNucStop', 'QueryNuc',
              'RegionNucStart', 'RegionNucStop',
              'RegionAAStart', 'RegionAAStop', 'QueryAA']

    seq_iter = fasta_reader(FileInput(input_files))

    found_names = set()
    need_header = True
    if os.path.exists(out_csv_file):
        # A zero-byte file (e.g. left by a run that died before the header
        # was written) still needs a header, otherwise the first data row
        # would be read back as the header on the next resume.
        need_header = os.path.getsize(out_csv_file) == 0
        with open(out_csv_file) as handle:
            for row in csv.DictReader(handle, delimiter='\t'):
                found_names.add(row['Name'])

    wanted_seqs = ((name, seq) for name, seq in seq_iter
                   if name not in found_names)
    result_iter = process_seqs(wanted_seqs,
                               threads=threads,
                               extract_regions=extract_regions,
                               known_names=known_names)

    # The context manager guarantees the rows are flushed and the handle is
    # closed even if processing raises part-way through.
    with open(out_csv_file, 'a') as csv_handle:
        csv_writer = csv.DictWriter(csv_handle, fields, delimiter='\t')
        if need_header:
            csv_writer.writeheader()
        for chunk in yield_chunks(result_iter, 1000):
            # Progress indicator: first row of every chunk written.
            print(chunk[0])
            csv_writer.writerows(chunk)
def large_cluster(iseqs, cluster_size=1000):
    """Align a large set of sequences in chunks of ``cluster_size``.

    The first chunk is aligned with ``muscle_align`` into a running profile.
    Every later chunk is aligned on its own and merged into that profile
    with ``muscle_join``. Finally ``muscle -refine`` is run on the merged
    profile and the refined alignment is read back.

    Parameters
    ----------
    iseqs : source of sequences; it is drained through repeated
        ``take(cluster_size, iseqs)`` calls.
        NOTE(review): presumably this must be an iterator (not a list) for
        successive ``take`` calls to advance - confirm against ``take``.
    cluster_size : number of sequences requested per chunk.

    Returns
    -------
    list
        ``list(fasta_reader(...))`` of the refined alignment file, or an
        empty list when ``iseqs`` yields nothing.

    Side effects: writes ``running_profile.fasta``,
    ``running_profile.fasta.tmp`` and ``adding_fasta.fasta`` in the current
    working directory.
    """
    running_align = 'running_profile.fasta'
    tmp_align = running_align + '.tmp'
    ifile = 'adding_fasta.fasta'

    # do base alignment
    chunk_seqs = take(cluster_size, iseqs)
    if not chunk_seqs:
        # Nothing to align; avoid invoking the aligner on an empty input.
        return []
    muscle_align(chunk_seqs, running_align)
    count = len(chunk_seqs)
    print('Seqs Processed: %d' % count)

    chunk_seqs = take(cluster_size, iseqs)
    while chunk_seqs:
        count += len(chunk_seqs)
        muscle_align(chunk_seqs, ifile)
        # The join reads the previous profile and writes the new one, so the
        # previous profile has to be moved out of the way first.
        shutil.move(running_align, tmp_align)
        muscle_join(tmp_align, ifile, running_align)
        chunk_seqs = take(cluster_size, iseqs)
        print('Seqs Processed: %d' % count)

    shutil.move(running_align, tmp_align)
    print('refining!')
    # argv list instead of a formatted string: no quoting/splitting involved.
    check_call(['muscle', '-in', tmp_align, '-out', running_align, '-refine'])

    with open(running_align) as handle:
        return list(fasta_reader(handle))
def filter_seq(handle, trans):
    """Yield ``(name, sequence)`` for records of 101-119 letters.

    Non-alphabetic characters are stripped from each sequence before its
    length is checked. When ``trans`` is true the stripped sequence is
    translated (as ``generic_dna``) and non-alphabetic characters are
    stripped from the translation as well; otherwise the stripped
    sequence itself is yielded.
    """
    for name, seq in fasta_reader(handle):
        letters = ''.join(ch for ch in seq if ch.isalpha())
        if not 100 < len(letters) < 120:
            continue
        if not trans:
            yield name, letters
            continue
        protein = Seq(letters, generic_dna).translate()
        yield name, ''.join(ch for ch in protein.tostring() if ch.isalpha())
def trans_seq(handle, wanted_seqs, trans=True):
    """Yield ``(name, sequence)`` for the records whose name is in ``wanted_seqs``.

    Non-alphabetic characters are removed from every selected sequence.
    With ``trans`` true (the default) the cleaned sequence is translated as
    ``generic_dna`` and the translation is cleaned the same way; with
    ``trans`` false the cleaned sequence is yielded untranslated.
    """
    for name, seq in fasta_reader(handle):
        if name in wanted_seqs:
            cleaned = ''.join(ch for ch in seq if ch.isalpha())
            if trans:
                translated = Seq(cleaned, generic_dna).translate().tostring()
                yield name, ''.join(ch for ch in translated if ch.isalpha())
            else:
                yield name, cleaned
def get_from_fasta_handle(handle, letters_only=True):
    """Read a FASTA handle into names plus a transformed sequence collection.

    Returns a 2-tuple: the list of record names (in file order) and the
    result of ``SeqTransformer().fit_transform`` on the matching sequences.
    When ``letters_only`` is true, non-alphabetic characters are dropped
    from each sequence before it is handed to the transformer.
    """
    names, seqs = [], []
    for name, seq in fasta_reader(handle):
        names.append(name)
        cleaned = ''.join(ch for ch in seq if ch.isalpha()) if letters_only else seq
        seqs.append(cleaned)
    transformed = SeqTransformer().fit_transform(seqs)
    return names, transformed
def test_known_mappings():
    """Generator test: compare ``process_seqs`` output with stored results.

    Expected rows come from ``TestData/LocatorRes.tsv`` and the input
    sequences from ``TestData/testSeqs.fasta``. For every field of every
    expected row one ``eq_`` check is yielded; ``None`` values in the
    produced row are replaced by ``''`` before the comparison.
    """
    with open('TestData/LocatorRes.tsv') as handle:
        cor_res = list(csv.DictReader(handle, delimiter='\t'))

    with open('TestData/testSeqs.fasta') as handle:
        test_seqs = list(fasta_reader(handle))

    results = HIVTransTool.process_seqs(test_seqs, extract_regions=True)
    # NOTE(review): zip stops at the shorter input, so a difference in row
    # count between results and expectations goes unnoticed - confirm that
    # this is intended.
    for row, crow in zip(results, cor_res):
        for field, expected in crow.items():
            if row[field] is None:
                row[field] = ''
            yield eq_, str(row[field]), expected, field
from GeneralSeqTools import fasta_reader
import csv

# <codecell>

from HIVTransTool import map_seqs_to_ref, process_seqs
import csv

ref_path = 'HIVDBFiles/HXB2Sequence.fasta'

# Expected genome coordinates keyed by sequence name:
# SeqName -> (GenomeStart, GenomeStop), both converted to int.
cor_res = {}
with open('TestData/test_mapping.csv') as handle:
    for row in csv.DictReader(handle, delimiter='\t'):
        span = (int(row['GenomeStart']), int(row['GenomeStop']))
        cor_res[row['SeqName']] = span

# Test sequences as returned by fasta_reader, fully loaded into memory.
with open('TestData/testSeqs.fasta') as handle:
    input_seqs = list(fasta_reader(handle))

#with open('/home/will/HIVRate/hiv-db.fasta') as handle:
#    seqs = list(fasta_reader(handle))

# <codecell>

from itertools import product

list(product('abcdefg', range(5)))

# <codecell>

# Column names of the result table.
fields = [
    'Name', 'RegionName',
    'QueryNucStart', 'QueryNucStop', 'QueryNuc',
    'RegionNucStart', 'RegionNucStop',
    'RegionAAStart', 'RegionAAStop',
    'QueryAA',
]
os.chdir('/home/will/SadiVariation/')
sys.path.append('/home/will/PySeqUtils/')

# <codecell>

from GeneralSeqTools import fasta_reader, fasta_writer, WebPSSM_V3_series
import glob

# <codecell>

# Rewrite each '.old' FASTA file with the first and last character of every
# sequence removed.
files = [('x4_seqs.fasta.old', 'x4_seqs.fasta'),
         ('r5_seqs.fasta.old', 'r5_seqs.fasta')]
for ifile, ofile in files:
    with open(ifile) as handle, open(ofile, 'w') as ohandle:
        for name, seq in fasta_reader(handle):
            fasta_writer(ohandle, [(name, seq[1:-1])])

# <codecell>

# Build a GI -> subtype lookup from the one-line '*.gb' guess files.
# The GI is the part of the file name before the first '.'.
subtype_files = glob.glob('/home/will/WLAHDB_data/SubtypeGuess/*.gb')
subtypes = []
for f in subtype_files:
    gb = os.path.basename(f).split('.')[0]
    with open(f) as handle:
        # next() works on Python 2.6+ and 3, unlike the .next() method.
        subtype = next(handle).strip()
    # Files whose guess is 'Unk' are left out of the table.
    if subtype != 'Unk':
        subtypes.append((int(gb), subtype))

subtype_df = pd.DataFrame(subtypes, columns=['GI', 'Subtype'])
# One subtype per GI: the first one encountered.
subtype_ser = subtype_df.groupby('GI')['Subtype'].first()
import os, os.path import sys import numpy as np sys.path.append('/home/will/HIVReportGen/AnalysisCode/') sys.path.append('/home/will/PySeqUtils/') os.chdir('/home/will/HIVVariation/') from GeneralSeqTools import call_muscle # <codecell> from GeneralSeqTools import fasta_reader seq_data = [] with open('PBMC_analyzed.clean.fasta') as handle: for name, seq in fasta_reader(handle): try: pid, vn = name.split('-')[0:2] except ValueError: print name raise ValueError seq_data.append((pid, vn, seq)) seq_df = DataFrame(seq_data, columns=['Patient ID', 'VisitNum', 'Seq']) # <codecell> wanted_seq_cols = [340, 381] #1-based!! hxb2_ltr = """TGGAAGGGCTAATTTACTCCCAAAAAAGACAAGATATCCTTGATCTGTGGGTC TACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGG GATCAGATATCCACTGACCTTTGGATGGTGCTTCAAGCTAGTACCAGTTGAGC
# ax.set_title(col + ' pval:%f' % pval) ax.set_ylim([0, ax.get_ylim()[1]]) plt.tight_layout() plt.savefig(base_path + 'corrected_cyto_data.png') # <codecell> import glob ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta')) ltr_seqs = [] for f in ltr_files: with open(f) as handle: ltr_seqs += list(fasta_reader(handle)) print len(ltr_seqs) # <codecell> conb_ltr = """TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAA GGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGC TAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCA TGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAG CTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGC GTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTC TCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCC TTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTG TGGAAAATCTCT""".replace('\n', '') ltr_align = list(seq_align_to_ref(ltr_seqs, conb_ltr, max_workers = 20))
def yield_lets(infile):
    """Yield every non-gap character of a FASTA file, upper-cased.

    The file at ``infile`` is read with ``fasta_reader``; each character of
    each sequence is upper-cased and yielded unless it is ``'-'``.
    """
    with open(infile) as handle:
        for name, seq in fasta_reader(handle):
            for letter in seq:
                # Upper-case once; the original did it twice per character.
                upper = letter.upper()
                if upper != '-':
                    yield upper
def filter_seq(handle, trans): for name, seq in fasta_reader(handle): tseq = ''.join(l for l in seq if l.isalpha()) l = len(tseq) if (l == 105): if trans: rseq = Seq(tseq, generic_dna).translate() yield name, rseq.tostring() else: yield name, tseq with open('V3filter.nt.fasta.raln') as handle: seq_list = list(fasta_reader(handle)) with open('V3filter.aa.fasta.raln') as handle: aa_seq_list = list(fasta_reader(handle)) # <codecell> import numpy as np from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin from itertools import product from scipy.sparse import csr_matrix, eye class BioTransformer(BaseEstimator, TransformerMixin):
sys.path.append('/home/will/HIVReportGen/AnalysisCode/') sys.path.append('/home/will/PySeqUtils/') # <codecell> from GeneralSeqTools import fasta_reader, # <codecell> tat_ex1_pos = (5830, 6044) #0 based pos_data = read_csv('simple_results.txt', sep = '\t') with open('Tat1-AB1_passed-cleaned.fasta') as handle: seq_data = list(fasta_reader(handle)) # <codecell> from Bio.Seq import Seq from Bio.Alphabet import generic_dna, generic_protein def translate_to_tat(inseq, start_pos, end_pos, rep = 0): if rep == 2: print 'ESCAPED!!!' return nstart = tat_ex1_pos[0] - start_pos nend = tat_ex1_pos[1] - end_pos
def load_fasta_to_db(fasta_files, source, is_nuc=True, RegionName='Genome'):
    """Load the sequences of one or more FASTA files via ``load_raw_seqs_to_db``.

    ``fasta_files`` is wrapped in ``FileInput`` and parsed by
    ``fasta_reader``; ``source``, ``is_nuc`` and ``RegionName`` are passed
    through unchanged.
    """
    load_raw_seqs_to_db(fasta_reader(FileInput(fasta_files)),
                        source,
                        is_nuc=is_nuc,
                        RegionName=RegionName)
print num, len(block) # <codecell> blast_all_v_all(sA, sB) # <codecell> with open('/home/will/tmpstuf/haptest/DrexelMed.A0107.R02.fa') as handle: sA = list(fasta_reader(handle)) with open('/home/will/tmpstuf/haptest/DrexelMed.A0107.fa') as handle: sB = list(fasta_reader(handle)) # <codecell> sA[:5] # <codecell>
import sys
sys.path.append('/home/will/PySeqUtils/')
from GeneralSeqTools import fasta_reader, fasta_writer
import os
os.chdir('/home/will/PySeqUtils/TransToolStuff/')

# <codecell>

from itertools import islice

# Slice applied to every sequence: seq[start:stop].
start = 806
stop = -1
path = 'HIV1_ALL_2012_env_PRO.fasta'
outpath = 'HIV1_ALL_2012_gp41_PRO.fasta'

# Preview: first and last five characters of the slice for the first 20
# records.
with open(path) as handle:
    for name, seq in islice(fasta_reader(handle), 20):
        tseq = seq[start:stop]
        # Formatted single argument prints the same text on Python 2 and 3.
        print('%s %s' % (tseq[:5], tseq[-5:]))

# <codecell>

# Slice every record and write the result to outpath.
with open(path) as handle:
    seqs = [(name, seq[start:stop]) for name, seq in fasta_reader(handle)]

with open(outpath, 'w') as handle:
    fasta_writer(handle, seqs)

# <codecell>

from Bio import Entrez