예제 #1
0
def main(input_files, out_csv_file,
         threads=5, extract_regions=True, known_names=None):
    """Process FASTA sequences and append the result rows to a tab-separated file.

    When ``out_csv_file`` already exists, the ``Name`` column of its rows is
    read first and any input sequence whose name is already present is
    skipped; new rows are appended after the existing ones. The header row is
    written only when the file did not exist beforehand.

    Args:
        input_files: Passed unchanged to ``FileInput``; the resulting stream
            is parsed with ``fasta_reader``.
        out_csv_file: Path of the tab-delimited output file (opened in append
            mode).
        threads: Forwarded to ``process_seqs``.
        extract_regions: Forwarded to ``process_seqs``.
        known_names: Forwarded to ``process_seqs``.
    """

    seq_iter = fasta_reader(FileInput(input_files))
    found_names = set()
    need_header = True
    if os.path.exists(out_csv_file):
        need_header = False
        with open(out_csv_file) as handle:
            for row in csv.DictReader(handle, delimiter='\t'):
                found_names.add(row['Name'])

    fields = ['Name', 'RegionName', 'QueryNucStart', 'QueryNucStop', 'QueryNuc',
              'RegionNucStart', 'RegionNucStop', 'RegionAAStart', 'RegionAAStop', 'QueryAA']

    # Lazily drop sequences that a previous run already wrote out.
    wanted_seqs = ((name, seq) for name, seq in seq_iter if name not in found_names)
    result_iter = process_seqs(wanted_seqs,
                               threads=threads,
                               extract_regions=extract_regions,
                               known_names=known_names)

    # The context manager guarantees the output is flushed and closed even if
    # processing raises part-way through (the handle used to be left open).
    with open(out_csv_file, 'a') as csv_handle:
        csv_writer = csv.DictWriter(csv_handle, fields, delimiter='\t')
        if need_header:
            csv_writer.writeheader()

        for chunk in yield_chunks(result_iter, 1000):
            # Progress indicator: show the first row of every chunk written.
            print(chunk[0])
            csv_writer.writerows(chunk)
예제 #2
0
def large_cluster(iseqs, cluster_size=1000):
    """Align a stream of sequences in batches and return the refined alignment.

    The first ``cluster_size`` sequences are aligned with ``muscle_align`` into
    ``running_profile.fasta``. Every following batch is aligned on its own into
    ``adding_fasta.fasta`` and merged into the running profile with
    ``muscle_join``. Once ``take`` returns an empty batch the profile is
    refined with ``muscle -refine`` and read back.

    Args:
        iseqs: Source of sequences handed to ``take``.
        cluster_size: Number of sequences requested from ``take`` per batch.

    Returns:
        A list of the records ``fasta_reader`` yields from the refined
        alignment file.
    """
    running_align = 'running_profile.fasta'
    ifile = 'adding_fasta.fasta'
    previous_align = running_align + '.tmp'

    # Seed the running profile with the first batch.
    batch = take(cluster_size, iseqs)
    muscle_align(batch, running_align)

    count = len(batch)
    print('Seqs Processed: %s' % count)

    batch = take(cluster_size, iseqs)
    while batch:
        count += len(batch)
        muscle_align(batch, ifile)
        # Move the current profile aside so the join can write a fresh one.
        shutil.move(running_align, previous_align)
        muscle_join(previous_align, ifile, running_align)
        batch = take(cluster_size, iseqs)
        print('Seqs Processed: %s' % count)

    shutil.move(running_align, previous_align)
    print('refining!')
    cmd = 'muscle -in %s -out %s -refine' % (previous_align, running_align)
    check_call(shlex.split(cmd))

    with open(running_align) as handle:
        return list(fasta_reader(handle))
예제 #3
0
def filter_seq(handle, trans):
    """Yield ``(name, sequence)`` for records whose letter count is 101-119.

    Non-alphabetic characters are stripped from each sequence before its
    length is checked. When ``trans`` is true the stripped sequence is
    translated as ``generic_dna`` and non-alphabetic characters are removed
    from the translation as well; otherwise the stripped sequence is yielded.
    """
    for name, seq in fasta_reader(handle):
        letters = ''.join(ch for ch in seq if ch.isalpha())
        if not 100 < len(letters) < 120:
            continue
        if not trans:
            yield name, letters
            continue
        protein = Seq(letters, generic_dna).translate()
        yield name, ''.join(aa for aa in protein.tostring() if aa.isalpha())
예제 #4
0
def trans_seq(handle, wanted_seqs, trans=True):
    """Yield ``(name, sequence)`` for the records whose name is in ``wanted_seqs``.

    Non-alphabetic characters are stripped from every selected sequence. With
    ``trans`` true the stripped sequence is translated as ``generic_dna`` and
    the translation is filtered to letters only; with ``trans`` false the
    stripped sequence itself is yielded.
    """
    for name, seq in fasta_reader(handle):
        if name in wanted_seqs:
            letters = ''.join(ch for ch in seq if ch.isalpha())
            if not trans:
                yield name, letters
            else:
                protein = Seq(letters, generic_dna).translate()
                yield name, ''.join(aa for aa in protein.tostring() if aa.isalpha())
예제 #5
0
    def get_from_fasta_handle(handle, letters_only=True):
        """Read a FASTA handle and return its names plus the transformed sequences.

        Args:
            handle: Stream parsed with ``fasta_reader``.
            letters_only: When true, non-alphabetic characters are removed
                from each sequence before it is transformed.

        Returns:
            A ``(names, transformed)`` pair, where ``transformed`` is the
            result of ``SeqTransformer().fit_transform`` on the collected
            sequences.
        """
        names, seqs = [], []
        for name, seq in fasta_reader(handle):
            names.append(name)
            seqs.append(''.join(ch for ch in seq if ch.isalpha())
                        if letters_only else seq)
        return names, SeqTransformer().fit_transform(seqs)
예제 #6
0
def test_known_mappings():
    """Generator test: compare ``process_seqs`` output with the stored results.

    For every field of every expected row in ``TestData/LocatorRes.tsv`` a
    ``(eq_, actual, expected, field)`` check is yielded. ``None`` values in the
    computed row are replaced by ``''`` before they are stringified.
    """
    with open('TestData/LocatorRes.tsv') as handle:
        cor_res = list(csv.DictReader(handle, delimiter='\t'))

    with open('TestData/testSeqs.fasta') as handle:
        test_seqs = list(fasta_reader(handle))

    computed = HIVTransTool.process_seqs(test_seqs, extract_regions=True)
    for row, crow in zip(computed, cor_res):
        for field, expected in crow.items():
            if row[field] is None:
                row[field] = ''
            yield eq_, str(row[field]), expected, field
from GeneralSeqTools import fasta_reader
import csv

# <codecell>

from HIVTransTool import map_seqs_to_ref, process_seqs
import csv

# Path to a reference FASTA file (not read in this cell).
ref_path = 'HIVDBFiles/HXB2Sequence.fasta'
# Map each SeqName in the tab-delimited mapping file to its
# (GenomeStart, GenomeStop) pair, both converted to int.
cor_res = {}
with open('TestData/test_mapping.csv') as handle:
    for row in csv.DictReader(handle, delimiter='\t'):
        cor_res[row['SeqName']] = (int(row['GenomeStart']), int(row['GenomeStop']))

# Load every record of the test FASTA file into memory.
with open('TestData/testSeqs.fasta') as handle:
    input_seqs = list(fasta_reader(handle))

# Disabled alternative input kept from the original notebook.
#with open('/home/will/HIVRate/hiv-db.fasta') as handle:
#    seqs = list(fasta_reader(handle))

# <codecell>

from itertools import product

# Scratch cell: builds every (letter, number) pair of 'a'-'g' with 0-4.
# The result is not stored anywhere.
list(product('abcdefg', range(5)))

# <codecell>


# Column names for the tab-delimited result rows.
fields = ['Name','RegionName', 'QueryNucStart','QueryNucStop','QueryNuc',
'RegionNucStart','RegionNucStop','RegionAAStart', 'RegionAAStop', 'QueryAA']
예제 #8
0
os.chdir('/home/will/SadiVariation/')
sys.path.append('/home/will/PySeqUtils/')

# <codecell>

from GeneralSeqTools import fasta_reader, fasta_writer, WebPSSM_V3_series
import glob

# <codecell>

# (input, output) file pairs: each sequence is copied from the ``.old`` file
# to the new file with its first and last character removed.
files = [
    ('x4_seqs.fasta.old', 'x4_seqs.fasta'),
    ('r5_seqs.fasta.old', 'r5_seqs.fasta'),
]
for ifile, ofile in files:
    with open(ifile) as handle, open(ofile, 'w') as ohandle:
        for name, seq in fasta_reader(handle):
            fasta_writer(ohandle, [(name, seq[1:-1])])

# <codecell>

# Collect (GI, subtype) pairs from the per-record subtype-guess files. The
# numeric file stem is used as the GI and the first line of the file as the
# subtype label.
subtype_files = glob.glob('/home/will/WLAHDB_data/SubtypeGuess/*.gb')
subtypes = []
for f in subtype_files:
    gb = os.path.basename(f).split('.')[0]
    with open(f) as handle:
        # The builtin next() works on Python 2.6+ and 3; the '' default keeps
        # an empty file from aborting the whole scan with StopIteration.
        subtype = next(handle, '').strip()
    # Skip files with no usable label as well as explicit 'Unk' entries.
    if subtype and subtype != 'Unk':
        subtypes.append((int(gb), subtype))
subtype_df = pd.DataFrame(subtypes, columns = ['GI', 'Subtype'])

# One subtype per GI: the first one seen for each group.
subtype_ser = subtype_df.groupby('GI')['Subtype'].first()
예제 #9
0
import os, os.path
import sys
import numpy as np

sys.path.append('/home/will/HIVReportGen/AnalysisCode/')
sys.path.append('/home/will/PySeqUtils/')
os.chdir('/home/will/HIVVariation/')
from GeneralSeqTools import call_muscle

# <codecell>

from GeneralSeqTools import fasta_reader

# Split each FASTA header on '-' and keep its first two parts as
# (Patient ID, VisitNum) alongside the sequence.
seq_data = []
with open('PBMC_analyzed.clean.fasta') as handle:
    for name, seq in fasta_reader(handle):
        try:
            pid, vn = name.split('-')[0:2]
        except ValueError:
            # Show the offending header, then re-raise the original error so
            # its message and traceback are preserved.
            print(name)
            raise
        seq_data.append((pid, vn, seq))

seq_df = DataFrame(seq_data, columns=['Patient ID', 'VisitNum', 'Seq'])

# <codecell>

wanted_seq_cols = [340, 381] #1-based!!
hxb2_ltr = """TGGAAGGGCTAATTTACTCCCAAAAAAGACAAGATATCCTTGATCTGTGGGTC
TACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGG
GATCAGATATCCACTGACCTTTGGATGGTGCTTCAAGCTAGTACCAGTTGAGC
    # 
    ax.set_title(col + ' pval:%f' % pval)
    ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path + 'corrected_cyto_data.png')


# <codecell>

import glob

# Gather the records of every per-patient LTR FASTA file, in sorted file
# order, into one list and report how many were read.
ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = []
for f in ltr_files:
    with open(f) as handle:
        ltr_seqs.extend(list(fasta_reader(handle)))
print(len(ltr_seqs))

# <codecell>

# Reference nucleotide sequence used as the alignment target. The literal is
# wrapped over several lines for readability; the newlines are stripped so the
# value is one contiguous string.
# NOTE(review): the name suggests a subtype-B consensus LTR -- confirm the source.
conb_ltr = """TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAA
GGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGC
TAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCA
TGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAG
CTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGC
GTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTC
TCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCC
TTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTG
TGGAAAATCTCT""".replace('\n', '')

# Align every loaded LTR record against the reference and materialise the
# results as a list.
# NOTE(review): max_workers presumably sets the parallelism -- confirm in seq_align_to_ref.
ltr_align = list(seq_align_to_ref(ltr_seqs, conb_ltr, max_workers = 20))
예제 #11
0
def yield_lets(infile):
    """Yield each character of every sequence in a FASTA file, upper-cased.

    Gap characters (``'-'``) are skipped.

    Args:
        infile: Path of the FASTA file to read.

    Yields:
        One upper-cased character at a time, in file order.
    """
    with open(infile) as handle:
        for _, seq in fasta_reader(handle):
            for let in seq:
                # Upper-case once; the previous imap/lambda plus a second
                # .upper() on the result did the same work twice.
                let = let.upper()
                if let != '-':
                    yield let
예제 #12
0
def filter_seq(handle, trans):
    """Yield ``(name, sequence)`` for records with exactly 105 letters.

    Non-alphabetic characters are removed from each sequence before it is
    measured. With ``trans`` true the stripped sequence is translated as
    ``generic_dna`` and the translation string is yielded as-is; otherwise the
    stripped sequence is yielded.
    """
    for name, seq in fasta_reader(handle):
        letters = ''.join(ch for ch in seq if ch.isalpha())
        if len(letters) != 105:
            continue
        if not trans:
            yield name, letters
        else:
            protein = Seq(letters, generic_dna).translate()
            yield name, protein.tostring()



# Load all records of the nucleotide file into memory.
# NOTE(review): the '.raln' suffix presumably marks a refined alignment -- confirm.
with open('V3filter.nt.fasta.raln') as handle:
    seq_list = list(fasta_reader(handle))
                

# Load all records of the matching amino-acid file.
with open('V3filter.aa.fasta.raln') as handle:
    aa_seq_list = list(fasta_reader(handle))
    

# <codecell>

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from itertools import product
from scipy.sparse import csr_matrix, eye

class BioTransformer(BaseEstimator, TransformerMixin):
    
예제 #13
0
sys.path.append('/home/will/HIVReportGen/AnalysisCode/')
sys.path.append('/home/will/PySeqUtils/')


# <codecell>

# The trailing comma after the last imported name was a SyntaxError.
from GeneralSeqTools import fasta_reader

# <codecell>

# (start, stop) coordinate pair, 0-based.
# NOTE(review): presumably Tat exon 1 on the reference genome -- confirm.
tat_ex1_pos = (5830, 6044) #0 based

# Tab-separated table of results.
pos_data = read_csv('simple_results.txt', sep = '\t')

# Load every record of the cleaned FASTA file into memory.
with open('Tat1-AB1_passed-cleaned.fasta') as handle:
    seq_data = list(fasta_reader(handle))

# <codecell>

from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein

def translate_to_tat(inseq, start_pos, end_pos, rep = 0):
    
    if rep == 2:
        print 'ESCAPED!!!'
        
        return 
    nstart = tat_ex1_pos[0] - start_pos
    nend = tat_ex1_pos[1] - end_pos
    
예제 #14
0
def load_fasta_to_db(fasta_files, source, is_nuc=True, RegionName='Genome'):
    """Parse FASTA input and hand the records to ``load_raw_seqs_to_db``.

    Args:
        fasta_files: Passed unchanged to ``FileInput``; the resulting stream
            is parsed with ``fasta_reader``.
        source: Forwarded to ``load_raw_seqs_to_db``.
        is_nuc: Forwarded to ``load_raw_seqs_to_db``.
        RegionName: Forwarded to ``load_raw_seqs_to_db``.
    """
    load_raw_seqs_to_db(fasta_reader(FileInput(fasta_files)),
                        source,
                        is_nuc=is_nuc,
                        RegionName=RegionName)
예제 #15
0
                print num, len(block)
        
        
    

    
    




# <codecell>

# NOTE(review): sA and sB are assigned in the cell below; run top-to-bottom
# this call would need them to exist already -- confirm the intended cell order.
blast_all_v_all(sA, sB)

# <codecell>

# Load both FASTA files fully into memory.
with open('/home/will/tmpstuf/haptest/DrexelMed.A0107.R02.fa') as handle:
    sA = list(fasta_reader(handle))
    
with open('/home/will/tmpstuf/haptest/DrexelMed.A0107.fa') as handle:
    sB = list(fasta_reader(handle))

# <codecell>

# Notebook preview of the first five records of sA.
sA[:5]

# <codecell>


예제 #16
0
import sys
sys.path.append('/home/will/PySeqUtils/')
from GeneralSeqTools import fasta_reader, fasta_writer
import os
os.chdir('/home/will/PySeqUtils/TransToolStuff/')

# <codecell>

from itertools import islice
# Slice bounds applied to every sequence, plus the input and output paths.
start, stop = 806, -1
path = 'HIV1_ALL_2012_env_PRO.fasta'
outpath = 'HIV1_ALL_2012_gp41_PRO.fasta'

# Preview: show the first and last five characters of the slice for the
# first 20 records.
with open(path) as handle:
    for name, seq in islice(fasta_reader(handle), 20):
        tseq = seq[start:stop]
        print('%s %s' % (tseq[:5], tseq[-5:]))

# <codecell>

# Slice every record with the [start:stop] bounds and write the results to
# the output FASTA file.
with open(path) as handle:
    seqs = [(name, seq[start:stop]) for name, seq in fasta_reader(handle)]
with open(outpath, 'w') as handle:
    fasta_writer(handle, seqs)

# <codecell>

from Bio import Entrez