def random_selecet(a):
    """Print one randomly chosen record from a gzipped FASTQ file.

    Args:
        a: A ``(fastq, number)`` pair. ``fastq`` is the path of a
            gzip-compressed FASTQ file; ``number`` is the threshold the
            per-record random draw is compared against.

    For every record an integer is drawn uniformly from
    ``[0, length)``, where ``length`` is the number of *lines* in the
    file (not the number of records). The first record whose draw is
    below ``number`` is printed and iteration stops; if no draw falls
    below ``number`` nothing is printed.
    """
    fastq, number = a

    # Count lines in-process instead of shelling out: the former
    # ``zcat | parallel | awk`` command could not be built with
    # ``str.format`` (the awk braces are format fields) and interpolated
    # the path into a shell string.
    with gzip.open(fastq, 'rt') as handle:
        length = sum(1 for _ in handle)

    # Text mode is required by the FASTQ parser; ``with`` closes the
    # handle even when we break out early.
    with gzip.open(fastq, 'rt') as handle:
        for record in SeqIO.parse(handle, 'fastq'):
            num = random.randrange(length)
            if num < number:
                print(record)
                break
Example #2
0
# coding: utf-8
# Compute per-peptide sequence features, save them as CSV and plot a
# 2-D t-SNE embedding of the feature table.
from Bio import SeqIO
import pandas as pd
from protein_sequence_features import protein_features
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns  # unused below; kept from the original session

# Read every FASTA record exactly once.
peptides = list(SeqIO.parse('Data/anti_microbial_peptide.fasta', 'fasta'))

# A list (not a lazy ``map``) so the features can be indexed and reused
# on Python 3 as well.
# NOTE(review): assumes protein_features returns one row-like object
# (dict or sequence) per record -- confirm against that module.
features = [protein_features(seq) for seq in peptides]

# One row per peptide, indexed by the record id.
peptides_dataset = pd.DataFrame(features,
                                index=[seq.id for seq in peptides])
peptides_dataset.to_csv('Data/anti_microbial_peptide_features.csv')

# fit_transform both fits the model and returns the embedding, so a
# separate fit() call beforehand is unnecessary.
tsne = TSNE()
X = tsne.fit_transform(peptides_dataset.values)

# Scatter the two embedding columns against each other.
plt.scatter(X[:, 0], X[:, 1])
plt.show()
Example #3
0
def sss(par):
    """Select, subsample, split and/or shuffle the records of a FASTA file.

    Records are read from ``par['inp_f']`` and every record that passes
    all enabled filters is written in FASTA format to the output
    stream(s).

    Keys read from ``par``:
        inp_f: Input handed to ``utils.openr``.
        out_f: Output path. When falsy, records go to ``sys.stdout``.
        split: Number of output files (only read when ``out_f`` is set).
            With ``1`` the file is ``out_f`` itself; otherwise file ``r``
            is named ``out_f + zero-padded r + ".fna"``, plus ``".bz2"``
            when ``out_f`` ends with ``".bz2"``.
        min_len, max_len: Length bounds on ``len(record.seq)``; a falsy
            value disables the respective bound.
        select: Truthy to filter by record id.
        ids: Only read when ``select`` is truthy. Either the path of an
            existing file whose first tab-separated column holds the ids,
            or a ``":::"``-separated string of ids (for items containing
            ``"$"`` the field after the first ``"$"`` is used).
        reverse: Only read when ``select`` is truthy. Truthy to drop the
            listed ids instead of keeping them.
        subsample: Probability of keeping a record; falsy disables
            subsampling.
        randomize: Truthy to shuffle the kept records before writing.

    All output streams (including ``sys.stdout``) are closed before
    returning, also when an error occurs.
    """
    subsample = bool(par['subsample'])
    select = bool(par['select'])
    randomize = bool(par['randomize'])

    if par['out_f']:
        n = par['split']
        if n == 1:
            out_stream = [utils.openw(par['out_f'])]
        else:
            suffix = ".fna" + (".bz2" if par['out_f'].endswith(".bz2") else "")
            width = len(str(n))
            out_stream = [
                utils.openw(par['out_f'] + str(r).zfill(width) + suffix)
                for r in range(n)
            ]
    else:
        out_stream = [sys.stdout]  # larger buffer?

    try:
        if select:
            if os.path.exists(par['ids']):
                es = [s.strip().split('\t')[0]
                      for s in utils.openr(par['ids'])]
            else:
                es = [(s.split("$")[1] if s.count("$") else s)
                      for s in par['ids'].split(":::")]
            es = set(es)

        all_reads = []
        nstreams = len(out_stream)

        p = par['subsample']
        cind = 0
        lmin, lmax = par['min_len'], par['max_len']
        for r in SeqIO.parse(utils.openr(par['inp_f']), "fasta"):
            if lmin and len(r.seq) < lmin:
                continue
            if lmax and len(r.seq) > lmax:
                continue
            if select:
                if par['reverse']:
                    if r.id in es:
                        continue
                elif r.id not in es:
                    continue
            if subsample and rnd.random() > p:
                continue
            if randomize:
                # Shuffling needs the full set, so defer writing.
                all_reads.append(r)
                continue
            # Unshuffled output is dealt round-robin over the streams.
            SeqIO.write(r, out_stream[cind], "fasta")
            cind = (cind + 1) % nstreams

        if randomize:
            rnd.shuffle(all_reads)
            # Integer chunk size; at least 1 so the modulo below never
            # divides by zero when there are fewer reads than streams.
            step = max(1, len(all_reads) // nstreams)
            for i, r in enumerate(all_reads):
                SeqIO.write(r, out_stream[cind], "fasta")
                # Move to the next stream (wrapping around) whenever i is
                # a multiple of step, including i == 0.
                if not i % step:
                    cind = (cind + 1) % nstreams
    finally:
        for o in out_stream:
            o.close()