Esempio n. 1
0
def add_seq_to_reads(read_file, out_file):
    """Copy a FASTA file, appending each sequence to its own record name.

    Every record ``(name, seq)`` read from ``read_file`` is written to
    ``out_file`` under the name ``name + ';' + seq`` with the sequence
    itself unchanged.

    read_file -- Path of the FASTA file to read.
    out_file -- Path of the FASTA file to write (opened with mode 'w').
    """
    with open(read_file) as src, open(out_file, 'w') as dst:
        # All records are collected before anything is written.
        renamed = [(label + ';' + bases, bases)
                   for label, bases in GeneralSeqTools.fasta_reader(src)]
        GeneralSeqTools.fasta_writer(dst, renamed)
Esempio n. 2
0
def test_fasta_writer():
    """fasta_writer should emit a '>name' line then a sequence line per record."""
    records = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
               ('test2', 'ATCGATGC')]
    # Each record is expected as two newline-terminated lines.
    expected = ''.join('>%s\n%s\n' % rec for rec in records)

    buf = StringIO()
    GeneralSeqTools.fasta_writer(buf, records)

    eq_(expected, buf.getvalue())
def run_mafft(inseqs):
    """Align sequences by running the external ``mafft`` program.

    The records are written to a temporary FASTA file, ``mafft`` is invoked
    on that file, and its standard output is parsed back into records.

    inseqs -- Iterable of (name, sequence) pairs to align.

    Returns:
    A list of (name, aligned_sequence) pairs in the same order as ``inseqs``.

    Raises:
    KeyError -- a name from ``inseqs`` is missing from the parsed output.
    Whatever ``check_output`` raises when ``mafft`` cannot be started or
    fails (presumably subprocess.check_output -- confirm against the
    file's imports).
    """

    # inseqs is walked twice (names, then writing); materialise it so a
    # generator argument is not exhausted before the FASTA file is written.
    inseqs = list(inseqs)
    orig_order = [name for name, _ in inseqs]
    with NTF(suffix='.fasta') as handle:
        GeneralSeqTools.fasta_writer(handle, inseqs)
        # mafft opens the file by name, so the data must be on disk first.
        handle.flush()
        os.fsync(handle.fileno())

        # Pass argv as a list: formatting the path into a string and then
        # shlex-splitting it breaks when the temp path has spaces or quotes.
        cmd = ['mafft', '--quiet', '--op', '10', '--ep', '0.123', handle.name]
        out = check_output(cmd)

    out_dict = dict(GeneralSeqTools.fasta_reader(StringIO(out)))

    return [(name, out_dict[name]) for name in orig_order]
Esempio n. 4
0
def test_seq_map_to_ref():
    """seq_map_to_ref should return the query without the reference-gap columns.

    The expected string is the aligned query with the two columns where the
    aligned reference has '-' removed.
    """
    aligned_ref = 'ATCTCT--ATCT'
    aligned_query = 'A-CCCT-AATCT'
    expected = 'A-CCCTATCT'

    eq_(GeneralSeqTools.seq_map_to_ref(aligned_query, aligned_ref), expected)
Esempio n. 5
0
def test_convert_seqs_to_dataframe():
    """convert_seqs_to_dataframe should give one row per name, one column per position."""
    bases = 'ATCGATTGC'
    names = ('seq1', 'seq2')

    # Reference frame: names as rows after the transpose, letters as cells.
    expected = DataFrame(dict((name, list(bases)) for name in names)).T

    observed = GeneralSeqTools.convert_seqs_to_dataframe(
        [(name, bases) for name in names])
    ok_((observed == expected).all().all())
Esempio n. 6
0
def test_convert_seqDF_to_list():
    """convert_seqDF_to_list should turn a per-position frame into (name, sequence) pairs."""
    bases = 'ATCGATTGC'
    expected = [('seq1', bases), ('seq2', bases)]

    # Input frame: one row per name (after the transpose), one letter per cell.
    frame = DataFrame(dict((name, list(seq)) for name, seq in expected)).T

    eq_(GeneralSeqTools.convert_seqDF_to_list(frame), expected)
Esempio n. 7
0
def test_fasta_reader():
    """fasta_reader should join wrapped sequence lines and drop the '>' from names."""
    fasta_text = '\n'.join([
        '>test1',
        'ATCTGCTAGTCGA',
        'ATCGAGTAGT',
        '>test2',
        'ATCGATGC',
    ])
    expected = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
                ('test2', 'ATCGATGC')]

    records = list(GeneralSeqTools.fasta_reader(StringIO(fasta_text)))

    eq_(len(records), 2)
    # Check name then sequence of each record, in input order.
    for record, (name, seq) in zip(records, expected):
        eq_(record[0], name)
        eq_(record[1], seq)
Esempio n. 8
0
def test_seq_align_to_ref_multi():
    """seq_align_to_ref with max_workers=5 should align ten identical queries."""
    reference = 'ATCGATTGC'
    n_copies = 10
    queries = [('test1', 'ATCGATGC')] * n_copies
    expected = [('test1', 'ATCGA-TGC')] * n_copies

    observed = list(GeneralSeqTools.seq_align_to_ref(queries, reference,
                                                     max_workers=5))
    eq_(observed, expected)
Esempio n. 9
0
def align_to_ref(ref_seq, base_seq):
    """Align a query sequence to a reference via GeneralSeqTools.call_muscle.

     ref_seq -- The reference sequence to use as a guide.
     base_seq -- The query sequence.

     Returns:
     query_aln -- The aligned query sequence.
     ref_aln -- The aligned reference sequence.

     No caching happens inside this function itself; any memoization would
     have to come from a decorator or from call_muscle.
    """

    # Label both sequences so they can be picked out of the result by name.
    by_label = dict(GeneralSeqTools.call_muscle([('query', base_seq),
                                                 ('ref', ref_seq)]))
    query_aln = by_label['query']
    ref_aln = by_label['ref']
    return query_aln, ref_aln
Esempio n. 10
0
def GetConSeq(region, alphabet=generic_dna, subtype='B', drop_gaps=True):
    """Return the consensus sequence of ``region`` for one subtype.

    region -- Region name passed to get_region_file / get_region_span.
    alphabet -- generic_dna, generic_protein, or the strings 'dna' / 'pro'
                (strings are compared case-insensitively).
    subtype -- Subtype whose 'CONSENSUS_<subtype>' record is wanted.
    drop_gaps -- When true, '-' characters are removed from the result.

    Returns:
    The consensus sequence with '$' removed, and with '-' removed as well
    when drop_gaps is true.

    Raises:
    TypeError -- alphabet is not one of the accepted values.
    KeyError -- the region file has no 'CONSENSUS_<subtype>' record.
    """

    # Only strings offer .lower(); calling it on an alphabet object such as
    # generic_protein would raise AttributeError before the comparison.
    alpha_name = alphabet.lower() if hasattr(alphabet, 'lower') else None
    if (alphabet == generic_dna) or (alpha_name == 'dna'):
        path = get_region_file(region, 'dna')
    elif (alphabet == generic_protein) or (alpha_name == 'pro'):
        path = get_region_file(region, 'pro')
    else:
        # raise(TypeError, msg) would raise a tuple and lose the message.
        raise TypeError('alphabet must be: "dna", "pro", generic_dna, generic_protein')

    if path is None:
        # No file for this region: cut it out of the enclosing region that
        # get_region_span reports, using subtype B as the coordinate system.
        new_region, start, stop = get_region_span(region, alphabet)
        conB_seq = GetConSeq(new_region,
                             subtype='B',
                             alphabet=alphabet,
                             drop_gaps=False)
        sub_seq = GetConSeq(new_region,
                            subtype=subtype,
                            alphabet=alphabet,
                            drop_gaps=False)
        # If start/stop are never reached these stay None and the slice
        # below is open-ended on that side.
        nstart, nstop = (None, None)
        # conb_pos counts the non-gap characters of conB_seq seen so far.
        conb_pos = 0
        for aln_pos, l in enumerate(conB_seq):
            if l != '-':
                conb_pos += 1
            # Re-assigned on every column while the count equals start, so
            # nstart ends on the last such column (gap columns included).
            if conb_pos == start:
                nstart = aln_pos
            # The first column at which the count equals stop ends the scan.
            if conb_pos == stop:
                nstop = aln_pos
                break
        seq = sub_seq[nstart:nstop]
    else:
        wanted_key = 'CONSENSUS_' + subtype
        with open(path) as handle:
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # Record names are compared up to their first '('.
                if name.split('(')[0] == wanted_key:
                    break
            else:
                # Without this the last record read (or None for an empty
                # file) would be returned as if it were the consensus.
                raise KeyError('%s not found in %s' % (wanted_key, path))

    if drop_gaps:
        return seq.replace('-', '').replace('$', '')
    else:
        return seq.replace('$', '')
Esempio n. 11
0
def get_region(seq, reference, regions = None):
    """Yield the parts of ``seq`` that align to given spans of ``reference``.

    ``reference`` and ``seq`` are aligned with GeneralSeqTools.call_muscle;
    for every region the alignment columns matching the reference positions
    are located and the corresponding slice of the aligned ``seq`` is
    yielded with gap characters removed.

    seq -- The sequence to extract from.
    reference -- The sequence whose positions ``start`` / ``stop`` refer to.
    regions -- Iterable of (label, start, stop) triples; plain (start, stop)
               pairs are accepted as well. Defaults to the span (300, 400).

    Yields:
    One ungapped string per region, in the order of ``regions``.
    """

    if regions is None:
        regions = [(300, 400)]

    tmp_seqs = [('conc', reference),
                ('guess', seq)]
    aligned = dict(GeneralSeqTools.call_muscle(tmp_seqs))
    ref_aln = aligned['conc']
    seq_aln = aligned['guess']
    for region in regions:
        # The last two items are the bounds, so both the 2-tuple default and
        # labelled 3-tuples unpack correctly.
        start, stop = region[-2:]
        # conc_pos counts the non-gap reference characters seen so far.
        conc_pos = 0
        # Reset both bounds per region: a bound that is never reached leaves
        # the slice open-ended instead of reusing a stale or unbound value.
        align_start = None
        align_stop = None
        for align_pos, l in enumerate(ref_aln):
            if l != '-':
                conc_pos += 1
            if conc_pos == start:
                align_start = align_pos
            if conc_pos == stop:
                align_stop = align_pos
                break
        # The bounds are alignment columns, so they must index the aligned
        # sequence rather than the raw input.
        yield seq_aln[align_start:align_stop].replace('-', '')
# <codecell>

import sys
sys.path.append('/home/will/PySeqUtils/')

# <codecell>

import TreeingTools
import GeneralSeqTools
import dendropy

# <codecell>

# Read every record that fasta_reader yields from mafft_ep.fasta into a list.
with open('/home/will/SubCData/mafft_ep.fasta') as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))

# <codecell>

import os, os.path
import csv
from itertools import product
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from operator import methodcaller
from itertools import groupby
from Bio.Seq import Seq
from Bio import Motif
from Bio.Alphabet import IUPAC
Esempio n. 13
0
# <codecell>

# Outer-join redcap_data with df on 'SingleID', then collapse each
# 'SingleID' group with .first() so the result is indexed by SingleID.
pat_data = pd.merge(redcap_data, df,
                    left_on ='SingleID',
                    right_on = 'SingleID',
                    how = 'outer').groupby('SingleID').first()

# <codecell>

import glob
# Build ltr_seqs: for each *LTR.fasta file, map the file's base name (up to
# its last '-') to the first sequence in that file.
ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = {}
for f in ltr_files:
    with open(f) as handle:
        # The next() builtin works on Python 2.6+ and 3.x; the .next()
        # method exists on Python 2 iterators only.
        _, seq = next(GeneralSeqTools.fasta_reader(handle))
        fname = os.path.basename(f).rsplit('-', 1)[0]
        ltr_seqs[fname] = seq

# <codecell>

# Single-column frame: the index comes from the keys of ltr_seqs and the
# 'LTR' column holds the corresponding sequences.
ltr_df = pd.DataFrame({
                       'LTR':pd.Series(ltr_seqs)
                       })
ltr_df.head()

# <codecell>

# Fetch the 'ltr' consensus via ConSeqs.GetConSeq, leaving every other
# argument at its default.
conb_ltr = ConSeqs.GetConSeq('ltr')
conb_ltr
Esempio n. 14
0
            writer.writerow((gbm, acc))


# <codecell>



# Collect one dict per FASTA record from the subtype 'C' and 'B' dump files,
# then turn the list of dicts into a DataFrame.
files = [('C', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/C_*'))),
         ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*')))]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        with open(f) as handle:
            # File name without directory and without its last extension.
            base_name = f.rsplit(os.sep,1)[1].rsplit('.',1)[0]
            # Second '_'-separated field of the base name.
            prot = base_name.split('_')[1]
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                seqs.append({
                             'Seq':seq,
                             # Record name translated through gi_to_acc_dict
                             # (defined outside this block).
                             'ID':gi_to_acc_dict[name],
                             'Prot':prot,
                             'Subtype':sub
                             })
            
seqdf = pd.DataFrame(seqs)

# <codecell>

pseqdf = pd.pivot_table(seqdf, 
                        rows = ['Subtype', 'ID'], 
                        cols = 'Prot', 
                        values = 'Seq', 
Esempio n. 15
0
        for num, (gbm, acc) in enumerate(imap(get_gi_acc, gb_files)):
            if (num == 100) or (num % 50000 == 0):
                print num
            gi_to_acc_dict[gbm] = acc
            writer.writerow((gbm, acc))

# <codecell>

# Collect one dict per FASTA record from the subtype 'B' dump files, then
# pivot so each ID is a row and each Prot a column.
files = [('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*')))]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        with open(f) as handle:
            # File name without directory and without its last extension.
            base_name = f.rsplit(os.sep,1)[1].rsplit('.',1)[0]
            # Second '_'-separated field of the base name.
            prot = base_name.split('_')[1]
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                seqs.append({
                             'Seq':seq,
                             # Record name translated through gi_to_acc_dict.
                             'ID':gi_to_acc_dict[name],
                             'Prot':prot,
                             })
            
seqdf = pd.DataFrame(seqs)

# <codecell>

# When an (ID, Prot) pair occurs more than once, aggfunc='first' picks one.
# NOTE(review): rows=/cols= are the older pandas spellings of
# index=/columns= -- confirm against the pandas version in use.
pseqdf = pd.pivot_table(seqdf, 
                        rows = 'ID', 
                        cols = 'Prot', 
                        values = 'Seq', 
                        aggfunc = 'first')
Esempio n. 16
0
# <codecell>

import GeneralSeqTools
import glob

# <codecell>

import pandas as pd
# Gather every record of the SubB*.fasta files as a dict holding the record
# name, the ungapped upper-cased sequence, and a label parsed from the file name.
files = sorted(glob.glob('/home/will/HIVTropism/LANLdata/SubB*.fasta'))

seqs = []
for f in files:
    # Last path component, up to its first '.', second '-'-separated field.
    prot_name = f.split('/')[-1].split('.')[0].split('-')[1]
    print prot_name
    with open(f) as handle:
        for name, seq in GeneralSeqTools.fasta_reader(handle):
            seqs.append({
                         'GI':name,
                         'Seq':seq.replace('-', '').upper(),
                         'Prot':prot_name
                         })
            

# <codecell>

# One row per GI and one column per Prot; aggfunc='first' picks one sequence
# when a (GI, Prot) pair occurs more than once.
# NOTE(review): rows=/cols= are the older pandas spellings of
# index=/columns= -- confirm against the pandas version in use.
seq_df = pd.pivot_table(pd.DataFrame(seqs),
                        rows = 'GI',
                        cols = 'Prot',
                        values = 'Seq',
                        aggfunc = 'first')
Esempio n. 17
0
    except ValueError:
        print fname
    seqs.append((pid, vn, prot, 1))

# seqs holds (patient id, visit number, protein, 1) tuples; pivot them so
# each (Patient ID, VisitNum) is a row and each Prot a column of HasSeq values.
df = pd.DataFrame(seqs, columns=["Patient ID", "VisitNum", "Prot", "HasSeq"])
has_seq = pd.pivot_table(df, rows=["Patient ID", "VisitNum"], cols="Prot", values="HasSeq")

# <codecell>

import sys

sys.path.append("/home/will/PySeqUtils/")
import GeneralSeqTools

# Load every record of pat_data.fasta and hand the list to
# GeneralSeqTools.WebPSSM_V3_fasta.
# NOTE(review): presumably this submits the sequences to the WebPSSM V3
# tropism predictor -- confirm against GeneralSeqTools.
with open("/home/will/DrugStuff/pat_data.fasta") as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))
    out = GeneralSeqTools.WebPSSM_V3_fasta(seqs)


# <codecell>

# Turn each row of `out` into a per-visit record and index by patient/visit.
tmp = []
for row in out:
    # row[0] is a '-'-separated id: either "patient-visit" or
    # "patient-visit-<extra>"; any other number of parts raises ValueError.
    parts = row[0].split("-")
    if len(parts) == 2:
        pat, vnum = parts
    else:
        pat, vnum, _ = parts
    # row[2] is compared as a string: "0" sets IsR5, "1" sets IsX4.
    tmp.append({"Patient ID": pat, "VisitNum": vnum, "IsR5": row[2] == "0", "IsX4": row[2] == "1"})
# Collapse duplicate (Patient ID, VisitNum) pairs with .first().
tropism = pd.DataFrame(tmp).groupby(["Patient ID", "VisitNum"]).first()
Esempio n. 18
0
def test_muscle_basic_call():
    """call_muscle should return both records, with a gap added to the shorter one."""
    unaligned = [('test1', 'ATCGATTGC'), ('test2', 'ATCGATGC')]
    expected = [('test1', 'ATCGATTGC'), ('test2', 'ATCGA-TGC')]

    eq_(list(GeneralSeqTools.call_muscle(unaligned)), expected)