Esempio n. 1
0
def add_seq_to_reads(read_file, out_file):
    """Copy a FASTA file, appending each record's sequence to its name.

    Every record ``(name, seq)`` read from ``read_file`` is written to
    ``out_file`` as ``(name + ';' + seq, seq)``.

    Args:
        read_file: Path of the FASTA file to read.
        out_file: Path of the FASTA file to create (overwritten if present).
    """
    with open(read_file) as src, open(out_file, 'w') as dst:
        # All records are collected before anything is written.
        renamed = [(name + ';' + seq, seq)
                   for name, seq in GeneralSeqTools.fasta_reader(src)]
        GeneralSeqTools.fasta_writer(dst, renamed)
Esempio n. 2
0
def test_fasta_reader():
    """Check fasta_reader on a two-record input with one wrapped sequence.

    The first record's sequence spans two lines and must come back joined;
    names must come back without the leading '>'.
    """
    fasta_text = '\n'.join(
        ['>test1', 'ATCTGCTAGTCGA', 'ATCGAGTAGT', '>test2', 'ATCGATGC'])
    expected = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
                ('test2', 'ATCGATGC')]

    records = list(GeneralSeqTools.fasta_reader(StringIO(fasta_text)))

    eq_(len(records), 2)
    # Index rather than unpack so only fields 0 and 1 of each record are
    # inspected.
    for idx, (want_name, want_seq) in enumerate(expected):
        eq_(records[idx][0], want_name)
        eq_(records[idx][1], want_seq)
def run_mafft(inseqs):
    """Align sequences with the external ``mafft`` program.

    The sequences are written to a temporary FASTA file, ``mafft`` is run on
    that file with ``--quiet --op 10 --ep 0.123``, and its standard output is
    parsed back as FASTA.

    Args:
        inseqs: Iterable of ``(name, sequence)`` pairs. Any iterable is
            accepted, including one-shot generators.

    Returns:
        A list of ``(name, aligned_sequence)`` pairs in the same order as
        ``inseqs``.

    Raises:
        KeyError: If a name from ``inseqs`` is absent from mafft's output.
        Errors raised by ``check_output`` (presumably
        ``subprocess.check_output`` -- confirm against the file's imports)
        propagate unchanged.
    """
    # Materialise once: the records are walked twice (name order, then
    # writing), which would leave nothing to write for a generator.
    inseqs = list(inseqs)
    orig_order = [name for name, _ in inseqs]

    with NTF(suffix='.fasta') as handle:
        GeneralSeqTools.fasta_writer(handle, inseqs)
        # mafft opens the file by name, so the data must be on disk first.
        handle.flush()
        os.fsync(handle.fileno())

        # Pass argv as a list so a temp path containing whitespace stays a
        # single argument (string formatting + shlex.split would break it).
        cmd = ['mafft', '--quiet', '--op', '10', '--ep', '0.123', handle.name]
        out = check_output(cmd)

    out_dict = dict(GeneralSeqTools.fasta_reader(StringIO(out)))

    # mafft output is re-ordered to match the caller's input order.
    return [(name, out_dict[name]) for name in orig_order]
Esempio n. 4
0
def GetConSeq(region, alphabet=generic_dna, subtype='B', drop_gaps=True):
    """Return the consensus sequence of ``region`` for ``subtype``.

    Args:
        region: Region name passed to ``get_region_file`` and, when that
            returns ``None``, to ``get_region_span``.
        alphabet: ``generic_dna``, ``generic_protein`` or one of the strings
            ``'dna'`` / ``'pro'`` (compared case-insensitively).
        subtype: Subtype label; the record looked up is the one whose name
            (up to the first ``'('``) equals ``'CONSENSUS_' + subtype``.
        drop_gaps: When true, ``'-'`` characters are removed from the result.
            ``'$'`` characters are always removed.

    Returns:
        The consensus sequence as a string.

    Raises:
        TypeError: If ``alphabet`` is not one of the accepted values.
        ValueError: If the region file contains no record for ``subtype``.
    """
    # Lower-case only objects that offer .lower() (i.e. strings); alphabet
    # objects are not guaranteed to have it, and calling it unconditionally
    # would fail before the protein branch could be reached.
    alpha_name = alphabet.lower() if hasattr(alphabet, 'lower') else None

    if alphabet == generic_dna or alpha_name == 'dna':
        path = get_region_file(region, 'dna')
    elif alphabet == generic_protein or alpha_name == 'pro':
        path = get_region_file(region, 'pro')
    else:
        raise TypeError(
            'alphabet must be: "dna", "pro", generic_dna, generic_protein')

    if path is None:
        # No file for this region: cut it out of the enclosing region that
        # get_region_span reports, using the subtype-B consensus as the
        # coordinate reference.
        new_region, start, stop = get_region_span(region, alphabet)
        conB_seq = GetConSeq(new_region,
                             subtype='B',
                             alphabet=alphabet,
                             drop_gaps=False)
        sub_seq = GetConSeq(new_region,
                            subtype=subtype,
                            alphabet=alphabet,
                            drop_gaps=False)
        # Existing diagnostic output, kept as-is (call form prints the same
        # text on Python 2).  NOTE(review): consider removing or logging.
        print(conB_seq)
        print(sub_seq)

        # Translate start/stop, counted in non-gap characters of the
        # subtype-B consensus, into alignment column indices.  If a position
        # is never reached its bound stays None and the slice is open-ended.
        nstart, nstop = None, None
        conb_pos = 0
        for aln_pos, letter in enumerate(conB_seq):
            if letter != '-':
                conb_pos += 1
            if conb_pos == start:
                nstart = aln_pos
            if conb_pos == stop:
                nstop = aln_pos
                break
        seq = sub_seq[nstart:nstop]
    else:
        wanted_key = 'CONSENSUS_' + subtype
        with open(path) as handle:
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # Record names may carry a '(...)' suffix; compare the part
                # before it.
                if name.split('(')[0] == wanted_key:
                    break
            else:
                # Without this the last record in the file (or None for an
                # empty file) would be used as if it were the consensus.
                raise ValueError(
                    'no %s record found in %s' % (wanted_key, path))

    if drop_gaps:
        return seq.replace('-', '').replace('$', '')
    return seq.replace('$', '')
Esempio n. 5
0
# <codecell>

# Outer-join redcap_data with df on SingleID, then collapse every SingleID
# group with GroupBy.first().
pat_data = (
    pd.merge(redcap_data, df,
             how='outer',
             left_on='SingleID',
             right_on='SingleID')
    .groupby('SingleID')
    .first()
)

# <codecell>

import glob
# Map each patient LTR FASTA file to its first sequence.  The key is the
# file name with everything from the last '-' onwards removed.
ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = {}
for f in ltr_files:
    with open(f) as handle:
        # Only the first record is used.  The builtin next() works on
        # Python 2.6+ and 3, unlike the Python-2-only .next() method.
        _, seq = next(GeneralSeqTools.fasta_reader(handle))
    fname = os.path.basename(f).rsplit('-', 1)[0]
    ltr_seqs[fname] = seq

# <codecell>

# Single-column frame of LTR sequences, indexed by the keys of ltr_seqs.
ltr_df = pd.DataFrame({'LTR': pd.Series(ltr_seqs)})
ltr_df.head()

# <codecell>

# Consensus sequence for the 'ltr' region, using GetConSeq's default
# arguments.  The bare name on the next line only echoes the value
# (presumably as notebook cell output).
conb_ltr = ConSeqs.GetConSeq('ltr')
conb_ltr
Esempio n. 6
0
            writer.writerow((gbm, acc))


# <codecell>



# Collect every sequence from the subtype C and subtype B dump files into
# one table with the columns Seq, ID, Prot and Subtype.
files = [
    ('C', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/C_*'))),
    ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*'))),
]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        # File name without directory and extension; the token after the
        # first '_' is stored as the protein label.
        base_name = os.path.basename(f).rsplit('.', 1)[0]
        prot = base_name.split('_')[1]
        with open(f) as handle:
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # Record names are translated through gi_to_acc_dict.
                seqs.append(dict(Seq=seq,
                                 ID=gi_to_acc_dict[name],
                                 Prot=prot,
                                 Subtype=sub))

seqdf = pd.DataFrame(seqs)

# <codecell>

pseqdf = pd.pivot_table(seqdf, 
                        rows = ['Subtype', 'ID'], 
                        cols = 'Prot', 
                        values = 'Seq', 
Esempio n. 7
0
    except ValueError:
        print fname
    seqs.append((pid, vn, prot, 1))

# Table of HasSeq values: one row per (Patient ID, VisitNum), one column
# per Prot value.
# NOTE(review): rows=/cols= are the keyword names used by older pandas
# releases (later ones call them index=/columns=) -- confirm the pandas
# version this runs against.
df = pd.DataFrame(
    seqs,
    columns=["Patient ID", "VisitNum", "Prot", "HasSeq"],
)
has_seq = pd.pivot_table(
    df,
    values="HasSeq",
    rows=["Patient ID", "VisitNum"],
    cols="Prot",
)

# <codecell>

import sys

sys.path.append("/home/will/PySeqUtils/")
import GeneralSeqTools

# Read every record up front; the file is no longer needed once the list
# exists, so WebPSSM_V3_fasta is called after it has been closed.
with open("/home/will/DrugStuff/pat_data.fasta") as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))
out = GeneralSeqTools.WebPSSM_V3_fasta(seqs)


# <codecell>

# Turn each row of `out` into one record per patient visit, then keep one
# entry per (Patient ID, VisitNum) via GroupBy.first().
tmp = []
for row in out:
    parts = row[0].split("-")
    # row[0] has either two '-'-separated fields (patient, visit) or three,
    # in which case the last one is discarded.  Any other field count
    # raises ValueError on unpacking.
    if len(parts) != 2:
        pat, vnum, _ = parts
    else:
        pat, vnum = parts
    # row[2] is compared against the strings "0" and "1" to set the flags.
    tmp.append(
        {
            "Patient ID": pat,
            "VisitNum": vnum,
            "IsR5": row[2] == "0",
            "IsX4": row[2] == "1",
        }
    )
tropism = pd.DataFrame(tmp).groupby(["Patient ID", "VisitNum"]).first()
Esempio n. 8
0
        for num, (gbm, acc) in enumerate(imap(get_gi_acc, gb_files)):
            if (num == 100) or (num % 50000 == 0):
                print num
            gi_to_acc_dict[gbm] = acc
            writer.writerow((gbm, acc))

# <codecell>

# Collect every sequence from the subtype B dump files into one table with
# the columns Seq, ID and Prot.
files = [
    ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*'))),
]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        # File name without directory and extension; the token after the
        # first '_' is stored as the protein label.
        base_name = os.path.basename(f).rsplit('.', 1)[0]
        prot = base_name.split('_')[1]
        with open(f) as handle:
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # Record names are translated through gi_to_acc_dict.
                seqs.append(dict(Seq=seq,
                                 ID=gi_to_acc_dict[name],
                                 Prot=prot))

seqdf = pd.DataFrame(seqs)

# <codecell>

# Wide table: one row per ID, one column per Prot, each cell holding the
# first Seq found for that pair.
# NOTE(review): rows=/cols= are the keyword names used by older pandas
# releases (later ones call them index=/columns=) -- confirm the pandas
# version this runs against.
pseqdf = pd.pivot_table(
    seqdf,
    values='Seq',
    rows='ID',
    cols='Prot',
    aggfunc='first',
)