Python GeneralSeqTools 예제들, GeneralSeqTools Python 예제들

예제 #1

0

파일 보기

파일: FixVispa.py 프로젝트: JudoWill/ResearchNotebooks

def add_seq_to_reads(read_file, out_file):
    """Copy a FASTA file, appending every record's sequence to its name.

    Each ``(name, seq)`` record read from ``read_file`` is written to
    ``out_file`` as ``(name + ';' + seq, seq)``.

    read_file -- path of the FASTA file to read.
    out_file -- path of the FASTA file to write (opened with mode 'w').
    """
    with open(read_file) as source, open(out_file, 'w') as sink:
        # All records are collected before anything is written.
        tagged = [(header + ';' + bases, bases)
                  for header, bases in GeneralSeqTools.fasta_reader(source)]
        GeneralSeqTools.fasta_writer(sink, tagged)

예제 #2

0

파일 보기

파일: testGeneralSeqTools.py 프로젝트: JudoWill/PySeqUtils

def test_fasta_writer():
    """fasta_writer output must be a '>name' line then a sequence line per record."""
    expected_lines = ['>test1', 'ATCTGCTAGTCGAATCGAGTAGT', '>test2', 'ATCGATGC']
    expected = ''.join(line + '\n' for line in expected_lines)

    records = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
               ('test2', 'ATCGATGC')]
    buf = StringIO()
    GeneralSeqTools.fasta_writer(buf, records)

    # Rewind and read back everything that was written to the buffer.
    buf.seek(0)
    eq_(expected, buf.read())

예제 #3

0

파일 보기

파일: SpeedingTreeStats.py 프로젝트: JudoWill/ResearchNotebooks

def run_mafft(inseqs):
    """Align sequences with the external ``mafft`` program.

    inseqs -- iterable of (name, sequence) pairs.

    Returns a list of (name, aligned_sequence) pairs in the same order as
    ``inseqs``.  The aligned sequences are looked up by name, so names are
    expected to be unique and to come back from mafft unchanged; a name
    missing from the mafft output raises KeyError.
    """
    # Materialise once: the records are walked twice (names, then writing),
    # which would silently write an empty file for a one-shot generator.
    inseqs = list(inseqs)
    orig_order = [name for name, _ in inseqs]

    with NTF(suffix='.fasta') as handle:
        GeneralSeqTools.fasta_writer(handle, inseqs)
        # Make sure the data is on disk before mafft opens the file by name.
        handle.flush()
        os.fsync(handle.fileno())

        # Pass an argument list so a temp path containing spaces or quotes
        # is not re-tokenised by a shell-style split.
        cmd = ['mafft', '--quiet', '--op', '10', '--ep', '0.123', handle.name]
        out = check_output(cmd)

    out_dict = dict(GeneralSeqTools.fasta_reader(StringIO(out)))

    return [(name, out_dict[name]) for name in orig_order]

예제 #4

0

파일 보기

파일: testGeneralSeqTools.py 프로젝트: JudoWill/PySeqUtils

def test_seq_map_to_ref():
    """seq_map_to_ref on a fixed aligned pair must give the known mapping.

    The expected string equals the aligned query with the two columns
    removed where the aligned reference has '-'.
    """
    reference_aln = 'ATCTCT--ATCT'
    query_aln = 'A-CCCT-AATCT'
    expected = 'A-CCCTATCT'

    eq_(GeneralSeqTools.seq_map_to_ref(query_aln, reference_aln), expected)

예제 #5

0

파일 보기

파일: testGeneralSeqTools.py 프로젝트: JudoWill/PySeqUtils

def test_convert_seqs_to_dataframe():
    """convert_seqs_to_dataframe must give one row per name, one column per position."""
    bases = 'ATCGATTGC'
    records = [('seq1', bases), ('seq2', bases)]

    # Reference frame: built column-wise from the letters, then transposed
    # so that each sequence name becomes a row.
    expected = DataFrame({'seq1': list(bases), 'seq2': list(bases)}).T

    observed = GeneralSeqTools.convert_seqs_to_dataframe(records)
    all_equal = (observed == expected).all().all()
    ok_(all_equal)

예제 #6

0

파일 보기

파일: testGeneralSeqTools.py 프로젝트: JudoWill/PySeqUtils

def test_convert_seqDF_to_list():
    """convert_seqDF_to_list must turn a per-position frame back into (name, seq) pairs."""
    bases = 'ATCGATTGC'
    expected = [('seq1', bases), ('seq2', bases)]

    # Input frame: one row per sequence name, one column per position.
    frame = DataFrame({'seq1': list(bases), 'seq2': list(bases)}).T

    eq_(GeneralSeqTools.convert_seqDF_to_list(frame), expected)

예제 #7

0

파일 보기

파일: testGeneralSeqTools.py 프로젝트: JudoWill/PySeqUtils

def test_fasta_reader():
    """fasta_reader must join multi-line sequences and strip the '>' from names."""
    raw = '\n'.join(['>test1', 'ATCTGCTAGTCGA', 'ATCGAGTAGT', '>test2', 'ATCGATGC'])
    expected = [('test1', 'ATCTGCTAGTCGAATCGAGTAGT'),
                ('test2', 'ATCGATGC')]

    records = list(GeneralSeqTools.fasta_reader(StringIO(raw)))

    eq_(len(records), 2)
    # Compare name then sequence for each record, in file order.
    for index, (name, sequence) in enumerate(expected):
        eq_(records[index][0], name)
        eq_(records[index][1], sequence)

예제 #8

0

파일 보기

파일: testGeneralSeqTools.py 프로젝트: JudoWill/PySeqUtils

def test_seq_align_to_ref_multi():
    """seq_align_to_ref with max_workers=5 must align ten identical queries, in order."""
    reference = 'ATCGATTGC'
    query = 'ATCGATGC'
    aligned_query = 'ATCGA-TGC'
    copies = 10

    batch = [('test1', query)] * copies
    expected = [('test1', aligned_query)] * copies

    observed = list(GeneralSeqTools.seq_align_to_ref(batch, reference, max_workers=5))
    eq_(observed, expected)

예제 #9

0

파일 보기

파일: TFSeqTools.py 프로젝트: JudoWill/PySeqUtils

def align_to_ref(ref_seq, base_seq):
    """Align a query sequence against a reference with call_muscle.

     ref_seq -- The reference sequence to use as a guide.
     base_seq -- The query sequence.

     Returns a tuple (query_aln, ref_aln):
     query_aln -- The aligned query sequence.
     ref_aln -- The aligned reference sequence.

     NOTE(review): no caching is done inside this function; any result
     caching would have to come from a wrapper that is not visible here.
    """

    # The two inputs are labelled so the aligned rows can be picked out
    # by name regardless of the order in which they come back.
    labelled = [('query', base_seq), ('ref', ref_seq)]
    by_label = {}
    for label, aligned_seq in GeneralSeqTools.call_muscle(labelled):
        by_label[label] = aligned_seq
    query_aln = by_label['query']
    ref_aln = by_label['ref']
    return query_aln, ref_aln

예제 #10

0

파일 보기

파일: ConSeqs.py 프로젝트: JudoWill/PySeqUtils

def GetConSeq(region, alphabet=generic_dna, subtype='B', drop_gaps=True):
    """Return the consensus sequence for a region and subtype.

    region -- region identifier handed to get_region_file / get_region_span.
    alphabet -- generic_dna, generic_protein, or the strings 'dna' / 'pro'
                (case-insensitive).
    subtype -- subtype label; the record named 'CONSENSUS_' + subtype is used.
    drop_gaps -- when true, '-' characters are removed from the result.
                 '$' characters are always removed.

    When get_region_file has no file for the region, the region is cut out
    of a larger one: get_region_span supplies the enclosing region and the
    start/stop positions, which are counted along the non-gap characters of
    the subtype-B consensus and then applied to the requested subtype's
    consensus (the two are assumed to share alignment columns -- TODO confirm).

    Raises:
    TypeError -- alphabet is not one of the accepted values.
    KeyError -- the region file holds no record for the requested subtype.
    """

    # Resolve the alphabet to the tag used by get_region_file.  Only call
    # .lower() on objects that have it, so alphabet objects other than
    # generic_dna (e.g. generic_protein) do not fail with AttributeError.
    if alphabet == generic_dna:
        kind = 'dna'
    elif alphabet == generic_protein:
        kind = 'pro'
    else:
        kind = alphabet.lower() if hasattr(alphabet, 'lower') else None
        if kind not in ('dna', 'pro'):
            raise TypeError('alphabet must be: "dna", "pro", generic_dna, generic_protein')
    path = get_region_file(region, kind)

    if path is None:
        new_region, start, stop = get_region_span(region, alphabet)
        conB_seq = GetConSeq(new_region,
                             subtype='B',
                             alphabet=alphabet,
                             drop_gaps=False)
        sub_seq = GetConSeq(new_region,
                            subtype=subtype,
                            alphabet=alphabet,
                            drop_gaps=False)
        # None bounds leave the corresponding end of the slice open.
        nstart, nstop = (None, None)
        # conb_pos counts non-gap characters seen so far in the subtype-B
        # consensus; aln_pos is the column index in the gapped sequence.
        conb_pos = 0
        for aln_pos, l in enumerate(conB_seq):
            if l != '-':
                conb_pos += 1
            if conb_pos == start:
                nstart = aln_pos
            if conb_pos == stop:
                nstop = aln_pos
                break
        seq = sub_seq[nstart:nstop]
    else:
        wanted_key = 'CONSENSUS_' + subtype
        seq = None
        with open(path) as handle:
            for name, candidate in GeneralSeqTools.fasta_reader(handle):
                # Record names may carry a '(...)' suffix; compare the part
                # before the first '('.
                if name.split('(')[0] == wanted_key:
                    seq = candidate
                    break
        if seq is None:
            # Previously the last record of the file was returned here.
            raise KeyError('%s not found in %s' % (wanted_key, path))

    if drop_gaps:
        return seq.replace('-', '').replace('$', '')
    else:
        return seq.replace('$', '')

예제 #11

0

파일 보기

파일: SubCAnalysis.py 프로젝트: JudoWill/ResearchNotebooks

def get_region(seq, reference, regions=None):
    """Yield the part of ``seq`` that aligns to each region of ``reference``.

    seq -- the query sequence.
    reference -- the reference sequence the region positions refer to.
    regions -- iterable of (start, stop) or (label, start, stop) tuples;
               only the last two items are used.  Defaults to [(300, 400)].

    ``seq`` and ``reference`` are aligned with call_muscle.  For every
    region, start/stop are counted along the non-gap characters of the
    aligned reference, and the matching columns of the aligned query are
    yielded with gap characters removed.  A bound that is never reached
    leaves that end of the slice open.
    """

    if regions is None:
        regions = [(300, 400)]

    tmp_seqs = [('conc', reference),
                ('guess', seq)]
    aligned = dict(GeneralSeqTools.call_muscle(tmp_seqs))
    conc_aln = aligned['conc']
    guess_aln = aligned['guess']

    for region in regions:
        # Accept both 2-tuples (the default) and labelled 3-tuples.
        start, stop = region[-2:]
        # Reset both bounds for every region so nothing leaks over from
        # the previous one.
        align_start = None
        align_stop = None
        conc_pos = 0
        for align_pos, l in enumerate(conc_aln):
            if l != '-':
                conc_pos += 1
            if conc_pos == start:
                align_start = align_pos
            if conc_pos == stop:
                align_stop = align_pos
                break
        # The bounds are alignment columns, so slice the aligned query
        # (not the raw input) before dropping its gap characters.
        yield guess_aln[align_start:align_stop].replace('-', '')

예제 #12

0

파일 보기

파일: SpeedingTreeStats.py 프로젝트: JudoWill/ResearchNotebooks

# <codecell>

# Make the local PySeqUtils checkout importable before the project imports below.
import sys
sys.path.append('/home/will/PySeqUtils/')

# <codecell>

import TreeingTools
import GeneralSeqTools
import dendropy

# <codecell>

# Load every record of the alignment file into memory as a list.
with open('/home/will/SubCData/mafft_ep.fasta') as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))

# <codecell>

import os, os.path
import csv
from itertools import product
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from operator import methodcaller
from itertools import groupby
from Bio.Seq import Seq
from Bio import Motif
from Bio.Alphabet import IUPAC

예제 #13

0

파일 보기

파일: LTRgraphs.py 프로젝트: JudoWill/ResearchNotebooks

# <codecell>

# Outer-join redcap_data with df on 'SingleID', then collapse the result to
# one row per SingleID with groupby(...).first().
pat_data = pd.merge(redcap_data, df,
                    left_on ='SingleID',
                    right_on = 'SingleID',
                    how = 'outer').groupby('SingleID').first()

# <codecell>

import glob
ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = {}
for f in ltr_files:
    with open(f) as handle:
        # Only the first record of each file is used.  The next() builtin
        # works on both Python 2.6+ and Python 3, unlike the .next() method.
        _, seq = next(GeneralSeqTools.fasta_reader(handle))
        # Key: file name up to (not including) its last '-'.
        fname = os.path.basename(f).rsplit('-', 1)[0]
        ltr_seqs[fname] = seq

# <codecell>

# One-column frame ('LTR') indexed by the keys of ltr_seqs.
ltr_df = pd.DataFrame({
                       'LTR':pd.Series(ltr_seqs)
                       })
ltr_df.head()

# <codecell>

conb_ltr = ConSeqs.GetConSeq('ltr')
conb_ltr

예제 #14

0

파일 보기

파일: SubCAnalysis.py 프로젝트: JudoWill/ResearchNotebooks

            writer.writerow((gbm, acc))


# <codecell>



# Pair each subtype label with the sorted list of dump files matching its prefix.
files = [('C', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/C_*'))),
         ('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*')))]
# One dict per FASTA record, later turned into the rows of seqdf.
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        with open(f) as handle:
            # File name without its directory and without its last extension.
            base_name = f.rsplit(os.sep,1)[1].rsplit('.',1)[0]
            # Second '_'-separated field of the file name is stored as 'Prot'.
            prot = base_name.split('_')[1]
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # 'ID' is looked up from the record name; a name missing
                # from gi_to_acc_dict raises KeyError.
                seqs.append({
                             'Seq':seq,
                             'ID':gi_to_acc_dict[name],
                             'Prot':prot,
                             'Subtype':sub
                             })
            
seqdf = pd.DataFrame(seqs)

# <codecell>

pseqdf = pd.pivot_table(seqdf, 
                        rows = ['Subtype', 'ID'], 
                        cols = 'Prot', 
                        values = 'Seq',

예제 #15

0

파일 보기

파일: NewCoEvo.py 프로젝트: JudoWill/ResearchNotebooks

        # For every (gbm, acc) pair that get_gi_acc yields over gb_files,
        # store the mapping in gi_to_acc_dict and write it out through writer.
        for num, (gbm, acc) in enumerate(imap(get_gi_acc, gb_files)):
            # Progress output: at item 100 and at every multiple of 50000
            # (which includes item 0).
            if (num == 100) or (num % 50000 == 0):
                print num
            gi_to_acc_dict[gbm] = acc
            writer.writerow((gbm, acc))

# <codecell>

# Pair the subtype label with the sorted list of dump files matching its prefix.
files = [('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*')))]
# One dict per FASTA record, later turned into the rows of seqdf.
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        with open(f) as handle:
            # File name without its directory and without its last extension.
            base_name = f.rsplit(os.sep,1)[1].rsplit('.',1)[0]
            # Second '_'-separated field of the file name is stored as 'Prot'.
            prot = base_name.split('_')[1]
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                # 'ID' is looked up from the record name; a name missing
                # from gi_to_acc_dict raises KeyError.
                seqs.append({
                             'Seq':seq,
                             'ID':gi_to_acc_dict[name],
                             'Prot':prot,
                             })

seqdf = pd.DataFrame(seqs)

# <codecell>

# One row per ID, one column per Prot, keeping the first sequence of each
# (ID, Prot) pair.  pivot_table takes index=/columns=; the older rows=/cols=
# keyword names are no longer accepted by pandas.
pseqdf = pd.pivot_table(seqdf,
                        index = 'ID',
                        columns = 'Prot',
                        values = 'Seq',
                        aggfunc = 'first')

예제 #16

0

파일 보기

파일: Untitled3.py 프로젝트: JudoWill/ResearchNotebooks

# <codecell>

import GeneralSeqTools
import glob

# <codecell>

import pandas as pd
files = sorted(glob.glob('/home/will/HIVTropism/LANLdata/SubB*.fasta'))

# One dict per FASTA record, later pivoted into seq_df.
seqs = []
for f in files:
    # From the file name: drop the directory, keep the text before the first
    # '.', and take the second '-'-separated field of that.
    prot_name = f.split('/')[-1].split('.')[0].split('-')[1]
    # Call form prints the same text on Python 2 and Python 3.
    print(prot_name)
    with open(f) as handle:
        for name, seq in GeneralSeqTools.fasta_reader(handle):
            # Sequences are stored ungapped and upper-cased.
            seqs.append({
                         'GI':name,
                         'Seq':seq.replace('-', '').upper(),
                         'Prot':prot_name
                         })


# <codecell>

# One row per GI, one column per Prot, keeping the first sequence of each
# (GI, Prot) pair.  pivot_table takes index=/columns=; the older rows=/cols=
# keyword names are no longer accepted by pandas.
seq_df = pd.pivot_table(pd.DataFrame(seqs),
                        index = 'GI',
                        columns = 'Prot',
                        values = 'Seq',
                        aggfunc = 'first')

예제 #17

0

파일 보기

파일: PickNonUsers.py 프로젝트: JudoWill/ResearchNotebooks

    except ValueError:
        print fname
    seqs.append((pid, vn, prot, 1))

df = pd.DataFrame(seqs, columns=["Patient ID", "VisitNum", "Prot", "HasSeq"])
# One row per (Patient ID, VisitNum), one column per Prot, aggregated with
# pivot_table's default aggfunc.  pivot_table takes index=/columns=; the older
# rows=/cols= keyword names are no longer accepted by pandas.
has_seq = pd.pivot_table(df, index=["Patient ID", "VisitNum"], columns="Prot", values="HasSeq")

# <codecell>

import sys

sys.path.append("/home/will/PySeqUtils/")
import GeneralSeqTools

# Read every record of the patient FASTA file and pass the list to
# WebPSSM_V3_fasta; its result rows are consumed in the next cell.
with open("/home/will/DrugStuff/pat_data.fasta") as handle:
    seqs = list(GeneralSeqTools.fasta_reader(handle))
    out = GeneralSeqTools.WebPSSM_V3_fasta(seqs)


# <codecell>

tmp = []
for row in out:
    # row[0] is split on '-' into patient and visit; a third field, when
    # present, is ignored.  Any other field count raises ValueError.
    parts = row[0].split("-")
    if len(parts) == 2:
        pat, vnum = parts
    else:
        pat, vnum, _ = parts
    # row[2] is compared as a string: "0" sets IsR5, "1" sets IsX4.
    tmp.append({"Patient ID": pat, "VisitNum": vnum, "IsR5": row[2] == "0", "IsX4": row[2] == "1"})
tropism = pd.DataFrame(tmp).groupby(["Patient ID", "VisitNum"]).first()

예제 #18

0

파일 보기

파일: testGeneralSeqTools.py 프로젝트: JudoWill/PySeqUtils

def test_muscle_basic_call():
    """call_muscle on two short sequences must return the known gapped alignment."""
    unaligned = [('test1', 'ATCGATTGC'), ('test2', 'ATCGATGC')]
    expected = [('test1', 'ATCGATTGC'), ('test2', 'ATCGA-TGC')]

    observed = list(GeneralSeqTools.call_muscle(unaligned))
    eq_(observed, expected)