def score_seq(known, guess, gapopen=10, gapextend=1):
    """Score the alignment of ``guess`` against ``known`` with ``needle``.

    Both sequences are written to temporary FASTA files (as 'SeqA' and
    'Seq1'), the external ``needle`` program is run with ``-aformat score``,
    and the score is parsed from its output file.

    Parameters
    ----------
    known, guess : sequences accepted by ``fasta_writer``.
    gapopen : gap-open penalty, formatted into the command line as a float.
    gapextend : gap-extension penalty, formatted into the command line as-is.

    Returns
    -------
    float or None
        The score from the first output line made of four
        whitespace-separated fields, or ``None`` when no such line exists.

    Raises
    ------
    subprocess.CalledProcessError
        If ``needle`` exits with a non-zero status.
    """

    cmd = 'needle -asequence %(cb)s -bsequence %(seq)s -aformat score -gapopen %(go)f -gapextend %(ge)s -outfile %(out)s'
    with NamedTemporaryFile() as conb_handle:
        fasta_writer(conb_handle, [('SeqA', known)])
        # Push the data to disk so the external process sees the full file.
        conb_handle.flush()
        os.fsync(conb_handle.fileno())
        with NamedTemporaryFile() as seq_handle:
            fasta_writer(seq_handle, [('Seq1', guess)])
            seq_handle.flush()
            os.fsync(seq_handle.fileno())
            with NamedTemporaryFile() as out_handle:
                param_dict = {
                    'cb': conb_handle.name,
                    'seq': seq_handle.name,
                    'out': out_handle.name,
                    'go': gapopen,
                    'ge': gapextend,
                }
                cmd_list = shlex.split(cmd % param_dict)
                check_call(cmd_list)
                for line in out_handle:
                    parts = line.split()
                    if len(parts) == 4:
                        # The last field is the score wrapped in parentheses,
                        # e.g. "(12.5)".  Strip exactly one character from
                        # each end; the previous [1:-2] slice also removed the
                        # final digit, turning "(12.5)" into 12.0.
                        return float(parts[-1][1:-1])
    # No score line was found in the output.
    return None
Example #2
0
def map_seqs_to_ref(input_seqs, retry=0):
    """Map a set of (name, seq) pairs to HXB2 using the LANL locator page.

    The sequences are rendered as FASTA text, submitted through the second
    form on the LANL page, and the response is parsed into one dict per
    region row, each tagged with the query name under the 'Name' key.

    If parsing the response raises ``IncompleteRead`` the whole submission
    is repeated with ``retry`` incremented; once ``retry`` exceeds 5 a
    ``ValueError`` is raised instead.
    """

    fasta_buffer = StringIO()
    fasta_writer(fasta_buffer, input_seqs)
    fasta_text = fasta_buffer.getvalue()

    br = build_browser()
    br.open('http://www.hiv.lanl.gov/content/sequence/LOCATE/locate.html')
    logging.debug('Opened Browser to LANL')

    br.select_form(nr=1)
    br.form['SEQ'] = fasta_text
    resp = br.submit()
    logging.info('Submitted Seqs to LANL')

    try:
        soup = BeautifulSoup(resp)
    except IncompleteRead:
        if retry <= 5:
            return map_seqs_to_ref(input_seqs, retry=retry+1)
        raise ValueError

    # Names, query sequences and result tables are paired up positionally.
    per_query = zip(yield_seq_names(soup),
                    yield_query_seqs(soup),
                    yield_seq_tables(soup))
    rows = []
    count = 0
    for count, (name, seq, table) in enumerate(per_query, 1):
        for row in yield_row_vals(table, seq):
            row['Name'] = name
            rows.append(row)

    logging.info('LANL returned %i regions for %i patients' % (len(rows), count))
    return rows
 def _write_seqs(self, X, handle):
     """Write the rows of ``X`` to ``handle`` as FASTA records.

     Each row is joined into a string, non-alphabetic characters are
     removed, and the record is named 'Seq-NNN' after its zero-padded row
     index.  The handle is flushed and fsync'ed before returning.
     """

     records = []
     for idx in range(X.shape[0]):
         letters = [ch for ch in ''.join(X[idx]) if ch.isalpha()]
         records.append(('Seq-%03i' % idx, ''.join(letters)))

     fasta_writer(handle, records)
     handle.flush()
     os.fsync(handle.fileno())
Example #4
0
def run_FastTree(seqs, alphabet=generic_protein, tmp_path=None, uniq_seqs=False):
    """Build a tree from (name, seq) pairs by running the FastTree binary.

    Parameters
    ----------
    seqs : iterable of (name, seq) pairs.
    alphabet : when equal to ``generic_dna`` FastTree is run with ``-nt``;
        any other value runs it without that flag.
    tmp_path : directory for the temporary FASTA file (``None`` uses the
        ``tempfile`` default).
    uniq_seqs : when true, identical sequences are collapsed to a single
        placeholder record before running FastTree, and the original names
        are re-attached to the resulting tree afterwards.

    Returns
    -------
    dendropy.Tree parsed from FastTree's newick output.

    Raises
    ------
    subprocess.CalledProcessError
        If FastTree exits with a non-zero status.
    """

    if uniq_seqs:

        # Group the original names by sequence and give every distinct
        # sequence one placeholder name.
        trans_names = defaultdict(list)
        norm_seq_names = {}
        for num, (name, seq) in enumerate(seqs):
            trans_names[seq].append(name)
            norm_seq_names[seq] = 'Seq-%i' % num

        uni_seqs = []
        name_defs = {}
        for seq, new_name in norm_seq_names.items():
            uni_seqs.append((new_name, seq))
            name_defs[new_name] = trans_names[seq]

        out_tree = run_FastTree(uni_seqs, alphabet=alphabet, tmp_path=tmp_path)

        # Put the original names back: the first one relabels the placeholder
        # leaf, the others become siblings with the same edge length.
        tax_set = out_tree.taxon_set
        for placeholder, orig_names in name_defs.items():
            node = out_tree.find_node_with_taxon_label(placeholder)
            if node:
                names = iter(orig_names)
                node.taxon.label = next(names)
                parent = node.parent_node
                edge_dist = node.edge.length
                for name in names:
                    parent.new_child(taxon=tax_set.new_taxon(label=name),
                                     edge_length=edge_dist)
        return out_tree

    # The FastTree executable is looked up next to this module.
    base_path = os.path.dirname(__file__)

    with NTF(dir=tmp_path, suffix='.fasta') as handle:
        fasta_writer(handle, seqs)
        handle.flush()
        os.fsync(handle.fileno())

        # Build argv as a list rather than formatting and re-splitting a
        # string, so directories containing spaces are passed through intact.
        cmd_list = [os.path.join(base_path, 'FastTree')]
        if alphabet == generic_dna:
            cmd_list.append('-nt')
        cmd_list.extend(['-quiet', handle.name])
        tree_str = check_output(cmd_list)
    return dendropy.Tree(stream=StringIO(tree_str), schema='newick')
Example #5
0
def testSeqTransformer_from_fasta():
    """get_from_fasta_handle returns names and '-'-padded sequence rows."""

    inseqs = [('Seq1', 'ATGTCG'),
              ('Seq2', 'ATGG'),
              ('Seq3', 'ATGTAHYTD')]
    expected_names = ['Seq1', 'Seq2', 'Seq3']
    # Shorter sequences are expected to be padded to the longest length.
    expected_rows = np.array(['ATGTCG---',
                              'ATGG-----',
                              'ATGTAHYTD'])

    handle = StringIO()
    fasta_writer(handle, inseqs)
    handle.seek(0)

    names, out = HIVAlignTools.SeqTransformer.get_from_fasta_handle(handle)
    ok_(np.all(out == expected_rows))
    ok_(all(got == want for got, want in zip(names, expected_names)))
Example #6
0
def align_with_lastz(input_seqs, ref_seqs):
    """Align a set of query sequences against a reference with lastz.

    Query and reference sequences are written as FASTA files inside a
    temporary directory, ``call_lastz`` is run on them, and the records
    read from its output file by ``SAMreader`` are returned as a list.
    """

    with tmp_directory() as tmp_dir:
        seq_file, ref_file, out_file = [
            os.path.join(tmp_dir, fname)
            for fname in ("query.fasta", "ref.fasta", "res.fasta")
        ]

        for fasta_path, records in ((seq_file, input_seqs),
                                    (ref_file, ref_seqs)):
            with open(fasta_path, "w") as handle:
                fasta_writer(handle, records)

        call_lastz(seq_file, ref_file, out_file)

        # Materialise the results before the temporary directory goes away.
        with open(out_file) as handle:
            alignments = list(SAMreader(handle))
        return alignments
Example #7
0
def blast_all_v_all(seqsA, seqsB, block_size=20):
        
    dpath = '/home/will/tmpstuf/haptest/tmpseqs/'
    with NTF(suffix='.fa', dir=dpath, delete=False) as db_handle:
        fasta_writer(db_handle, seqsA)
        db_handle.flush()
        os.fsync(db_handle.fileno())
        
        cmd = 'makeblastdb -in %s -dbtype nucl' % db_handle.name
        cmd_list = shlex.split(cmd)
        check_call(cmd_list)
        
        align_func = partial(check_seqs, db_handle.name)
        check_iterable = islice(yield_blocks(iter(seqsB), 200), 20)
        with ProcessPoolExecutor(max_workers=5) as pool:
            res_iter = pool.map(align_func, check_iterable)
            for num, block in enumerate(res_iter):
                print num, len(block)
Example #8
0
def check_seqs(db_path, seqs):
    """Run blastn for ``seqs`` against the database at ``db_path``.

    The query sequences are written to a FASTA file (kept on disk,
    ``delete=False``) and blastn is asked for CSV output with one target per
    query.  Returns a list of dicts keyed by 'SeqA', 'SeqB', 'pident' and
    'nident', one per output row.
    """
    fields = ['SeqA', 'SeqB', 'pident', 'nident']
    dpath =  '/home/will/tmpstuf/haptest/tmpseqs/'
    cmd = "blastn -db %(db)s -query %(q)s -outfmt '10 qseqid sseqid pident nident' -num_threads 20 -max_target_seqs 1"

    with NTF(suffix='.fa', dir=dpath, delete=False) as check_handle:

        fasta_writer(check_handle, seqs)
        check_handle.flush()
        os.fsync(check_handle.fileno())

        argv = shlex.split(cmd % {'db': db_path, 'q': check_handle.name})
        out = check_output(argv)
        return list(csv.DictReader(StringIO(out), fieldnames=fields))
sys.path.append('/home/will/PySeqUtils/')

# <codecell>

from GeneralSeqTools import fasta_reader, fasta_writer, WebPSSM_V3_series
import glob

# <codecell>

# Re-write each FASTA file with the first and last character of every
# sequence removed.
files = [('x4_seqs.fasta.old', 'x4_seqs.fasta'),
         ('r5_seqs.fasta.old', 'r5_seqs.fasta')]
for ifile, ofile in files:
    with open(ifile) as handle, open(ofile, 'w') as ohandle:
        for name, seq in fasta_reader(handle):
            trimmed = seq[1:-1]
            fasta_writer(ohandle, [(name, trimmed)])

# <codecell>

# Collect (GI, subtype) pairs from the subtype-guess files: the GI is the
# file name up to its first '.', the subtype is the file's first line.
subtype_files = glob.glob('/home/will/WLAHDB_data/SubtypeGuess/*.gb')
subtypes = []
for f in subtype_files:
    gb = f.rsplit(os.sep, 1)[-1].split('.')[0]
    with open(f) as handle:
        subtype = next(handle).strip()
        if subtype == 'Unk':
            # Files whose guess is 'Unk' are left out of the table.
            continue
        subtypes.append((int(gb), subtype))
subtype_df = pd.DataFrame(subtypes, columns = ['GI', 'Subtype'])

# One subtype per GI: the first one encountered.
subtype_ser = subtype_df.groupby('GI')['Subtype'].first()
                 'Tat-1-seq-align', 'Tat-2-seq-align', 'LTR-seq-align']
four = wanted_data[fourkb_cols].dropna()
wseqs = set()
with open('/home/will/Dropbox/HIVseqs/BensTropismLabels.csv') as handle:
    for row in csv.DictReader(handle, delimiter=','):
        wseqs.add(row['Patient ID'])

        
for col in four.columns:
    found = set()
    prot = col.rsplit('-', 2)[0]
    fname = 'AlignForBenj/fourKB_%s.fasta' % prot
    with open(fname, 'w') as handle:
        for seq, name in zip(four[col], four.index):
            if name in wseqs and name not in found:
                fasta_writer(handle, [(name+'-'+trop_dict[name], ''.join(seq))])
                found.add(name)
    print prot, len(found)

# <codecell>

foukb_lanl = ['AB078005', 'AB221126', 'AB253432', 'AB286955', 
              'AB287365', 'AB287367', 'AB287368', 'AB287369', 
              'AB480695', 'AB485642', 'AB565479', 'AB565496', 
              'AB565497', 'AB565499', 'AB565500', 'AB565502', 
              'AB604946', 'AB604948', 'AB604950', 'AB604951', 
              'AB641836', 'AF003887', 'AF003888', 'AF004394', 
              'AF042100', 'AF042101', 'AF538302', 'AF538303', 
              'AF538307', 'AJ271445', 'AY173953', 'AY352275', 
              'AY835748', 'AY835754', 'AY835759', 'AY835762', 
              'AY835766', 'AY835769', 'AY835770', 'AY835774', 
        out_seqs.append({
                         'Accession':name,
                         'PSSM':tdata.ix[name]['PSSMScore'],
                         'Seq':seq
                         })
out_df = pd.DataFrame(out_seqs)
        

# <codecell>

# Write each distinct sequence from a wanted accession once, with a marker
# appended to its name.
with open('extra_brain.fasta', 'w') as handle:
    found = set()
    for name, seq in aa_seq_list:
        if name not in wanted_acc or seq in found:
            continue
        fasta_writer(handle, [(name+'NEWSEQS!!!!!!', seq)])
        found.add(seq)

# <codecell>

# Save the collected table as a tab-separated file.
out_df.to_csv('brain_x4.tsv', sep='\t')

# <codecell>

# Horizontal box plot of PSSMScore grouped by STissue.
ax = trim_lanl.boxplot(column='PSSMScore', by = 'STissue', vert=False, figsize=(10,10))
# Lower the bottom axis limit to -1 (presumably to make room for the labels
# drawn at y = -0.5 below); the upper limit is kept as-is.
ax.set_ylim([-1, ax.get_ylim()[1]])
order = ['R5-E', 'R5-1', 'R5-2', 'R5-3', 'R5-P', 'X4-P', 'X4']
# Label each value in pssm_bins, offset 1.5 to the left; zip stops at the
# shorter of the two sequences.
for line, name in zip(pssm_bins, order):
    ax.annotate(name, (line-1.5, -0.5), fontsize=10)
# NOTE(review): an extra 'X4' label is placed at x = -1.5 regardless of
# pssm_bins - confirm this position is intended.
ax.annotate('X4', (-1.5, -0.5), fontsize=10)
from Bio.SeqIO.AbiIO import AbiIterator
# Read the first (trimmed) record from every ABI trace file and keep its
# id and sequence as a (name, seq) pair.
files = glob.glob('../Wigdahl Trace files/2:11:11/*.ab1')
seqs = []
for f in files:
    # Open each trace in a context manager so the file is closed as soon as
    # its record has been read instead of leaking one handle per file.
    with open(f, mode = 'rb') as trace_handle:
        rec = next(AbiIterator(trace_handle, trim = True))
    # str() is the supported way to get the plain sequence string;
    # Seq.tostring() is deprecated in Biopython.
    seqs.append( (rec.id, str(rec.seq)) )

# <codecell>

!/home/will/staden-2.0.0b9.x86_64/bin/convert_trace --help

# <codecell>

# Pass the collected sequences through call_muscle and save what it returns
# as a FASTA file.
res = call_muscle(seqs)
with open('align_data.fasta', 'w') as handle:
    fasta_writer(handle, res)

# <codecell>

from HIVTransTool import process_seqs

# Run process_seqs on the first 50 sequences and materialise its output.
results = list(process_seqs(seqs[:50], extract_regions = True, known_names = 50))

# <codecell>

# Show the name and query nucleotides of every row in the 'LTR5' region.
for row in results:
    if row['RegionName'] == 'LTR5':
        print row['Name'], row['QueryNuc']

# <codecell>
Example #13
0
# Slice end used for every sequence; -1 drops the final character.
# NOTE(review): `start` is not assigned in this cell - it must already be
# defined by an earlier cell.
stop = -1
path = 'HIV1_ALL_2012_env_PRO.fasta'
outpath = 'HIV1_ALL_2012_gp41_PRO.fasta'
# Preview the first and last five characters of the slice for the first
# 20 records.
with open(path) as handle:
    for name, seq in islice(fasta_reader(handle), 20):
        tseq = seq[start:stop] 
        print tseq[:5], tseq[-5:]

# <codecell>

# Cut the [start:stop] window out of every record in `path` and write the
# shortened records to `outpath`.
with open(path) as handle:
    seqs = [(name, seq[start:stop]) for name, seq in fasta_reader(handle)]
with open(outpath, 'w') as handle:
    fasta_writer(handle, seqs)

# <codecell>

from Bio import Entrez
from Bio import SeqIO
# Comma-separated identifiers of the nucleotide records to download.
ids = '544451412,544451410,544451408,544451406,544451404,544451402,544451400,544451398,544451396'

# Fetch the records as GenBank text and parse them all into memory, then
# close the handle so the connection is released even if parsing fails.
fetch_handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text",
                             id=ids)
try:
    records = list(SeqIO.parse(fetch_handle, "gb"))
finally:
    fetch_handle.close()

# <codecell>

# First downloaded record, kept for inspection.
rec = records[0]