Esempi in Python per GeneralSeqTools

Esempio n. 1

0

Mostra file

File: FixVispa.py Progetto: JudoWill/ResearchNotebooks

def add_seq_to_reads(read_file, out_file):
    """Copy a FASTA file, appending each record's sequence to its name.

    Every record read from ``read_file`` is written to ``out_file`` under
    the name ``<name>;<sequence>`` with the sequence itself unchanged.

    read_file -- path of the FASTA file to read.
    out_file -- path of the FASTA file to write; opened in 'w' mode, so an
        existing file is truncated.
    """
    with open(read_file) as src, open(out_file, 'w') as dst:
        # Build the complete list first: fasta_writer is handed a list,
        # not a lazy iterator.
        tagged = [(label + ';' + bases, bases)
                  for label, bases in GeneralSeqTools.fasta_reader(src)]
        GeneralSeqTools.fasta_writer(dst, tagged)

Esempio n. 2

0

Mostra file

File: testGeneralSeqTools.py Progetto: JudoWill/PySeqUtils

def test_fasta_writer():
    """Check fasta_writer output for two short records against the expected text."""
    records = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
               ('test2', 'ATCGATGC')]

    # Expected layout: a '>name' line followed by the sequence line,
    # every line terminated by a newline.
    expected_lines = []
    for name, seq in records:
        expected_lines.extend(['>' + name, seq])
    expected = '\n'.join(expected_lines) + '\n'

    buf = StringIO()
    GeneralSeqTools.fasta_writer(buf, records)
    eq_(expected, buf.getvalue())

Esempio n. 3

0

Mostra file

File: SpeedingTreeStats.py Progetto: JudoWill/ResearchNotebooks

def run_mafft(inseqs):
    """Align sequences with the external `mafft` program, keeping input order.

    inseqs -- sequence of (name, sequence) pairs. It is iterated twice
        (once for the names, once by fasta_writer), so it must not be a
        one-shot iterator.

    Returns a list of (name, aligned_sequence) pairs in the same order as
    `inseqs`.

    Raises KeyError if a name from `inseqs` is absent from the mafft output.
    Errors raised by check_output (presumably subprocess.check_output --
    confirm against the file's imports) propagate unchanged.
    """
    orig_order = [name for name, _ in inseqs]
    with NTF(suffix='.fasta') as handle:
        GeneralSeqTools.fasta_writer(handle, inseqs)
        # Push the data to disk before mafft opens the file by name.
        handle.flush()
        os.fsync(handle)

        # Pass argv as a list: formatting the path into a string and
        # re-splitting it with shlex breaks on temp paths that contain
        # whitespace, quotes or backslashes.
        cmd = ['mafft', '--quiet', '--op', '10', '--ep', '0.123', handle.name]
        out = check_output(cmd)

    out_dict = dict(GeneralSeqTools.fasta_reader(StringIO(out)))

    return [(name, out_dict[name]) for name in orig_order]

Esempio n. 4

0

Mostra file

File: testGeneralSeqTools.py Progetto: JudoWill/PySeqUtils

def test_seq_map_to_ref():
    """Check seq_map_to_ref on one gapped query/reference alignment pair."""
    reference = 'ATCTCT--ATCT'
    query = 'A-CCCT-AATCT'
    expected = 'A-CCCTATCT'

    # Argument order is (aligned query, aligned reference).
    eq_(GeneralSeqTools.seq_map_to_ref(query, reference), expected)

Esempio n. 5

0

Mostra file

File: testGeneralSeqTools.py Progetto: JudoWill/PySeqUtils

def test_convert_seqs_to_dataframe():
    """Check that convert_seqs_to_dataframe matches a hand-built frame.

    The expected frame has one row per sequence name and one column per
    sequence position.
    """
    inseqs = [('seq1', 'ATCGATTGC'), ('seq2', 'ATCGATTGC')]
    letters_by_name = dict((name, list(seq)) for name, seq in inseqs)
    expected = DataFrame(letters_by_name).T

    observed = GeneralSeqTools.convert_seqs_to_dataframe(inseqs)
    cell_matches = (observed == expected)
    ok_(cell_matches.all().all())

Esempio n. 6

0

Mostra file

File: testGeneralSeqTools.py Progetto: JudoWill/PySeqUtils

def test_convert_seqDF_to_list():
    """Check that convert_seqDF_to_list turns a per-position frame back into pairs."""
    inseqs = [('seq1', 'ATCGATTGC'), ('seq2', 'ATCGATTGC')]
    # One row per sequence name, one column per position.
    letters_by_name = dict((name, list(seq)) for name, seq in inseqs)
    frame = DataFrame(letters_by_name).T

    observed = GeneralSeqTools.convert_seqDF_to_list(frame)
    eq_(observed, inseqs)

Esempio n. 7

0

Mostra file

File: testGeneralSeqTools.py Progetto: JudoWill/PySeqUtils

def test_fasta_reader():
    """Check fasta_reader on two records, the first split over two lines."""
    fasta_text = '\n'.join(['>test1', 'ATCTGCTAGTCGA', 'ATCGAGTAGT',
                            '>test2', 'ATCGATGC'])
    expected = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
                ('test2', 'ATCGATGC')]

    records = list(GeneralSeqTools.fasta_reader(StringIO(fasta_text)))
    eq_(len(records), 2)
    # Compare field by field so the check does not depend on whether the
    # reader yields tuples or lists.
    for record, (name, seq) in zip(records, expected):
        eq_(record[0], name)
        eq_(record[1], seq)

Esempio n. 8

0

Mostra file

File: testGeneralSeqTools.py Progetto: JudoWill/PySeqUtils

def test_seq_align_to_ref_multi():
    """Check seq_align_to_ref with max_workers=5 on ten identical queries."""
    n_copies = 10
    reference = 'ATCGATTGC'
    query = 'ATCGATGC'
    aligned_query = 'ATCGA-TGC'

    batch = [('test1', query)] * n_copies
    expected = [('test1', aligned_query)] * n_copies

    observed = list(GeneralSeqTools.seq_align_to_ref(batch, reference, max_workers=5))
    eq_(observed, expected)

Esempio n. 9

0

Mostra file

File: TFSeqTools.py Progetto: JudoWill/PySeqUtils

def align_to_ref(ref_seq, base_seq):
    """Align a query sequence to a reference via GeneralSeqTools.call_muscle.

     NOTE(review): the previous docstring said the result is cached for
     fast lookup; no caching is visible in this function body -- confirm
     whether a decorator outside this view provides it.

     ref_seq -- The reference sequence to use as a guide.
     base_seq -- The query sequence.

     Returns:
     query_aln -- The aligned query sequence.
     ref_aln -- The aligned reference sequence.
    """

    muscle_out = GeneralSeqTools.call_muscle([('query', base_seq),
                                              ('ref', ref_seq)])
    by_name = dict(muscle_out)
    query_aln = by_name['query']
    ref_aln = by_name['ref']
    return query_aln, ref_aln

Esempio n. 10

0

Mostra file

File: ConSeqs.py Progetto: JudoWill/PySeqUtils

def GetConSeq(region, alphabet=generic_dna, subtype='B', drop_gaps=True):
    """Return the consensus sequence of `region` for the given subtype.

    region -- region name, passed to get_region_file / get_region_span.
    alphabet -- generic_dna, generic_protein, or the strings 'dna' / 'pro'
        (compared case-insensitively).
    subtype -- subtype label; the record named 'CONSENSUS_<subtype>' is used.
    drop_gaps -- when True, '-' characters are removed from the result.

    Returns the sequence as a string with '$' characters removed (and '-'
    removed as well when drop_gaps is True).

    Raises TypeError for an unrecognised alphabet, and ValueError when the
    region file contains no record for the requested subtype.
    """

    # Only lower-case real strings. Calling .lower() unconditionally raised
    # AttributeError for alphabet objects that lack it (e.g. passing
    # generic_protein, presumably a Bio.Alphabet instance -- TODO confirm).
    alpha_name = alphabet.lower() if hasattr(alphabet, 'lower') else None

    if (alphabet == generic_dna) or (alpha_name == 'dna'):
        path = get_region_file(region, 'dna')
    elif (alphabet == generic_protein) or (alpha_name == 'pro'):
        path = get_region_file(region, 'pro')
    else:
        # `raise(TypeError, msg)` raised a tuple and dropped the message.
        raise TypeError('alphabet must be: "dna", "pro", generic_dna, generic_protein')

    seq = None
    if path is None:
        # get_region_file returned None: cut the region out of the enclosing
        # region reported by get_region_span. start/stop count ungapped
        # letters of the subtype-B consensus, so walk its alignment columns
        # to translate them into column indexes.
        new_region, start, stop = get_region_span(region, alphabet)
        conB_seq = GetConSeq(new_region,
                             subtype='B',
                             alphabet=alphabet,
                             drop_gaps=False)
        sub_seq = GetConSeq(new_region,
                            subtype=subtype,
                            alphabet=alphabet,
                            drop_gaps=False)
        nstart, nstop = (None, None)
        conb_pos = 0
        for aln_pos, l in enumerate(conB_seq):
            if l != '-':
                conb_pos += 1
            if conb_pos == start:
                nstart = aln_pos
            if conb_pos == stop:
                nstop = aln_pos
                break
        # A start/stop that is never reached leaves None, i.e. an open slice.
        seq = sub_seq[nstart:nstop]
    else:

        wanted_key = 'CONSENSUS_'+subtype
        with open(path) as handle:
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # Record names may carry a '(...)' suffix; compare the prefix.
                name = name.split('(')[0]
                if name == wanted_key:
                    break
            else:
                # Previously the last record in the file (or None for an
                # empty file) was silently used when no name matched.
                raise ValueError('no %s record found in %s' % (wanted_key, path))

    if drop_gaps:
        return seq.replace('-', '').replace('$', '')
    else:
        return seq.replace('$', '')

Esempio n. 11

0

Mostra file

File: SubCAnalysis.py Progetto: JudoWill/ResearchNotebooks

def get_region(seq, reference, regions=None):
    """Yield the part of `seq` that aligns to each requested reference span.

    seq -- the query sequence.
    reference -- the reference sequence; start/stop count its ungapped letters.
    regions -- iterable of (start, stop) or (name, start, stop) tuples.
        Defaults to [(300, 400)].

    Yields one gap-free string per region, in the order given. A start or
    stop that is never reached while walking the aligned reference leaves
    that side of the slice open.

    This is a generator: the call to GeneralSeqTools.call_muscle only
    happens once iteration starts.
    """
    if regions is None:
        regions = [(300, 400)]

    tmp_seqs = [('conc', reference),
                ('guess', seq)]
    aligned = dict(GeneralSeqTools.call_muscle(tmp_seqs))
    for region in regions:
        # Accept 2-tuples as well as 3-tuples: the old loop unpacked three
        # values and therefore failed on the function's own default.
        start, stop = region[-2:]
        conc_pos = 0
        # Reset both bounds for every region so that a stop that is never
        # reached cannot reuse the previous region's value (or be unbound).
        align_start = None
        align_stop = None
        for align_pos, l in enumerate(aligned['conc']):
            if l != '-':
                conc_pos += 1
            if conc_pos == start:
                align_start = align_pos
            if conc_pos == stop:
                align_stop = align_pos
                break
        # align_start/align_stop are alignment column indexes, so slice the
        # aligned query rather than the raw input and then drop its gaps.
        yield aligned['guess'][align_start:align_stop].replace('-', '')

Esempio n. 12

0

Mostra file

File: SpeedingTreeStats.py Progetto: JudoWill/ResearchNotebooks

# <codecell>

import sys
sys.path.append('/home/will/PySeqUtils/')

# <codecell>

import TreeingTools
import GeneralSeqTools
import dendropy

# <codecell>

# Read the whole alignment file into memory as a list of the records
# yielded by fasta_reader.
with open('/home/will/SubCData/mafft_ep.fasta') as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))

# <codecell>

import os, os.path
import csv
from itertools import product
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from operator import methodcaller
from itertools import groupby
from Bio.Seq import Seq
from Bio import Motif
from Bio.Alphabet import IUPAC

Esempio n. 13

0

Mostra file

File: LTRgraphs.py Progetto: JudoWill/ResearchNotebooks

# <codecell>

# Outer-join the two tables on 'SingleID', then keep the first row of each
# 'SingleID' group.
pat_data = (
    pd.merge(redcap_data, df,
             how='outer',
             left_on='SingleID',
             right_on='SingleID')
    .groupby('SingleID')
    .first()
)

# <codecell>

import glob
# Collect the first record of every patient LTR FASTA file, keyed by the
# file's base name with everything after its last '-' removed.
ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = {}
for f in ltr_files:
    with open(f) as handle:
        # Use the next() builtin rather than the Python-2-only .next() method.
        _, seq = next(GeneralSeqTools.fasta_reader(handle))
    # The key only depends on the path, so compute it after the file is closed.
    fname = os.path.basename(f).rsplit('-', 1)[0]
    ltr_seqs[fname] = seq

# <codecell>

# Wrap the {file key: sequence} mapping in a one-column ('LTR') DataFrame.
ltr_df = pd.DataFrame({
                       'LTR':pd.Series(ltr_seqs)
                       })
# Bare expression: presumably shown as the notebook cell's output.
ltr_df.head()

# <codecell>

# Fetch the 'ltr' consensus using GetConSeq's default arguments.
conb_ltr = ConSeqs.GetConSeq('ltr')
# Bare expression: presumably shown as the notebook cell's output.
conb_ltr

Esempio n. 14

0

Mostra file

File: SubCAnalysis.py Progetto: JudoWill/ResearchNotebooks

            writer.writerow((gbm, acc))


# <codecell>



# Build one record per sequence from the 'C_*' and 'B_*' dump files.
files = [
    ('C', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/C_*'))),
    ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*'))),
]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        # The protein label is the second '_'-separated field of the file
        # name once its directory and final extension are stripped.
        base_name = f.rsplit(os.sep, 1)[1].rsplit('.', 1)[0]
        prot = base_name.split('_')[1]
        with open(f) as handle:
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # Record names are looked up in gi_to_acc_dict; an unknown
                # name raises KeyError.
                seqs.append(dict(Seq=seq,
                                 ID=gi_to_acc_dict[name],
                                 Prot=prot,
                                 Subtype=sub))

seqdf = pd.DataFrame(seqs)

# <codecell>

pseqdf = pd.pivot_table(seqdf, 
                        rows = ['Subtype', 'ID'], 
                        cols = 'Prot', 
                        values = 'Seq',

Esempio n. 15

0

Mostra file

File: NewCoEvo.py Progetto: JudoWill/ResearchNotebooks

        for num, (gbm, acc) in enumerate(imap(get_gi_acc, gb_files)):
            if (num == 100) or (num % 50000 == 0):
                print num
            gi_to_acc_dict[gbm] = acc
            writer.writerow((gbm, acc))

# <codecell>

# Build one record per sequence from the 'B_*' dump files.
files = [
    ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*'))),
]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        # The protein label is the second '_'-separated field of the file
        # name once its directory and final extension are stripped.
        base_name = f.rsplit(os.sep, 1)[1].rsplit('.', 1)[0]
        prot = base_name.split('_')[1]
        with open(f) as handle:
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # Record names are looked up in gi_to_acc_dict; an unknown
                # name raises KeyError.
                seqs.append(dict(Seq=seq,
                                 ID=gi_to_acc_dict[name],
                                 Prot=prot))

seqdf = pd.DataFrame(seqs)

# <codecell>

# Pivot to one row per 'ID' and one column per 'Prot', filling cells from
# 'Seq' with the 'first' aggregation.
# NOTE(review): `rows`/`cols` are the legacy pivot_table keyword names; newer
# pandas releases use `index`/`columns` -- confirm the pinned pandas version.
pseqdf = pd.pivot_table(
    seqdf,
    values='Seq',
    rows='ID',
    cols='Prot',
    aggfunc='first',
)

Esempio n. 16

0

Mostra file

File: Untitled3.py Progetto: JudoWill/ResearchNotebooks

# <codecell>

import GeneralSeqTools
import glob

# <codecell>

import pandas as pd
# Build one record per sequence from every 'SubB*.fasta' file.
files = sorted(glob.glob('/home/will/HIVTropism/LANLdata/SubB*.fasta'))

seqs = []
for f in files:
    # Protein label: the second '-'-separated field of the file name, taken
    # from the part before its first '.'.
    prot_name = f.split('/')[-1].split('.')[0].split('-')[1]
    # Call form prints the same text on Python 2 and is valid on Python 3.
    print(prot_name)
    with open(f) as handle:
        for name, seq in GeneralSeqTools.fasta_reader(handle):
            seqs.append({
                         'GI':name,
                         # Stored without gap characters and upper-cased.
                         'Seq':seq.replace('-', '').upper(),
                         'Prot':prot_name
                         })
            

# <codecell>

# Pivot to one row per 'GI' and one column per 'Prot', filling cells from
# 'Seq' with the 'first' aggregation.
# NOTE(review): `rows`/`cols` are the legacy pivot_table keyword names; newer
# pandas releases use `index`/`columns` -- confirm the pinned pandas version.
seq_df = pd.pivot_table(
    pd.DataFrame(seqs),
    values='Seq',
    rows='GI',
    cols='Prot',
    aggfunc='first',
)

Esempio n. 17

0

Mostra file

File: PickNonUsers.py Progetto: JudoWill/ResearchNotebooks

    except ValueError:
        print fname
    seqs.append((pid, vn, prot, 1))

# Long table: one row per entry of `seqs`, with the four named columns.
df = pd.DataFrame(
    seqs,
    columns=["Patient ID", "VisitNum", "Prot", "HasSeq"],
)
# Wide table: one row per (Patient ID, VisitNum), one column per Prot.
# NOTE(review): `rows`/`cols` are the legacy pivot_table keyword names; newer
# pandas releases use `index`/`columns` -- confirm the pinned pandas version.
has_seq = pd.pivot_table(
    df,
    values="HasSeq",
    rows=["Patient ID", "VisitNum"],
    cols="Prot",
)

# <codecell>

import sys

sys.path.append("/home/will/PySeqUtils/")
import GeneralSeqTools

# Read every record into a list first; the file is only needed for the read.
with open("/home/will/DrugStuff/pat_data.fasta") as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))
# Hand the in-memory records to WebPSSM_V3_fasta.
out = GeneralSeqTools.WebPSSM_V3_fasta(seqs)


# <codecell>

# Turn each row of `out` into a per-visit record and index by patient/visit.
tmp = []
for row in out:
    # row[0] is '<patient>-<visit>' with an optional third '-' field, which
    # is ignored; any other field count raises ValueError on unpacking.
    parts = row[0].split("-")
    if len(parts) != 2:
        pat, vnum, _ = parts
    else:
        pat, vnum = parts
    # row[2] is compared as a string: "0" sets IsR5, "1" sets IsX4.
    entry = {"Patient ID": pat, "VisitNum": vnum}
    entry["IsR5"] = row[2] == "0"
    entry["IsX4"] = row[2] == "1"
    tmp.append(entry)
tropism = pd.DataFrame(tmp).groupby(["Patient ID", "VisitNum"]).first()

Esempio n. 18

0

Mostra file

File: testGeneralSeqTools.py Progetto: JudoWill/PySeqUtils

def test_muscle_basic_call():

    seqs = [('test1', 'ATCGATTGC'), ('test2', 'ATCGATGC')]
    aln = [('test1', 'ATCGATTGC'), ('test2', 'ATCGA-TGC')]
    res = list(GeneralSeqTools.call_muscle(seqs))
    eq_(res, aln)