Beispiel #1
0
import argparse
from metaSeq import io as seqIO
from metaSeq import bead
import json

# Command-line interface: input beadJson file, output FASTQ base name and
# the barcode-position option.
parser = argparse.ArgumentParser()
# -i and -o are required: without them the script would only fail later
# with an opaque TypeError from open(None) / None + '.r1.fq'.
parser.add_argument('-i', required=True, help='Input beadJson format file.')
parser.add_argument('-o', required=True, help='The output FASTQ file base. Barcode will be put at -2, while /1 and /2 at -1.')
parser.add_argument('-bp', default=-1, type=int, help='The position of barcode in beadJson. usually -1 or -2')
args = parser.parse_args()

inputFile = args.i
outputFileBase = args.o
# NOTE(review): -bp is parsed but never consulted below; the header rewrite
# always exchanges the 2nd and 3rd '/'-separated fields. Confirm whether
# input with the barcode already at -2 should be left untouched.
barcodePosition = args.bp


def _swap_second_and_third(header):
    """Return header with its 2nd and 3rd '/'-separated fields exchanged.

    Only the first three fields are kept, e.g. 'a/b/c' -> 'a/c/b'.
    Raises IndexError when the header has fewer than three fields.
    """
    fields = header.split('/')
    return fields[0] + '/' + fields[2] + '/' + fields[1]


with open(inputFile, 'r') as f:
    # One JSON document per line; each one is wrapped as a bead object.
    for line in f:
        currentBead = bead.beadSequence(json.loads(line.strip('\n')))
        outputR1 = []
        outputR2 = []
        for record in currentBead.fragments:
            # Items 0-3 of a fragment go to R1 and items 4-7 to R2;
            # item 0 of each half is the header that gets rewritten.
            r1 = record[0:4]
            r2 = record[4:8]
            r1[0] = _swap_second_and_third(r1[0])
            r2[0] = _swap_second_and_third(r2[0])
            outputR1.append(r1)
            outputR2.append(r2)
        # mode='a' is passed for every bead, so reads presumably accumulate
        # in the two output files (including across repeated runs).
        seqIO.write_seqs(outputR1, outputFileBase + '.r1.fq', fastx='q', mode='a')
        seqIO.write_seqs(outputR2, outputFileBase + '.r2.fq', fastx='q', mode='a')
Beispiel #2
0
 def fastaWrite(self, folder=''):
     """Write this bead's FASTA sequences to '<barcode>.fa' inside folder.

     folder: directory for the output file; the default '' keeps the file
         name relative to the current directory. A trailing path separator
         is optional.

     Returns whatever seqIO.write_seqs returns (presumably the number of
     sequences written -- confirm against metaSeq.io).
     """
     import os  # local import so the method does not rely on a file-level one

     fasta = self.fastaSequences()
     # os.path.join inserts the separator only when it is missing, so both
     # 'out' and 'out/' resolve to 'out/<barcode>.fa'.
     fileName = os.path.join(folder, self.barcode + '.fa')
     return seqIO.write_seqs(fasta, fileName)
Beispiel #3
0
    # Walk the paired reads, tag both headers with the decoded barcode and
    # append each pair to the output file selected by that barcode.
    error_count = 0
    seqs = seqIO.sequence_twin(r1File, r2File)
    for r1, r2 in seqs:
        count += 1
        # Report once per million pairs. The previous test
        # (count // 1000000 >= 1) was true for every pair after the first
        # million, and the '{8.2f}' field raised IndexError instead of
        # formatting the number.
        if count % 1000000 == 0:
            f1.write('\tProcessed {:8.2f} M reads.'.format(count / 1000000))
        # The barcode is decoded from the full R2 sequence, before the
        # truncation below.
        bead = number_tuple(barcode_set(r2[1]), NoSnpDict, OneSnpDict)
        if bead:
            b = '_'.join([str(i) for i in bead])
            target = outputFileDict[number2ord(bead)]
        else:
            # Undecodable barcode: count it and route the pair to the
            # (0, 0, 0) output with the placeholder label.
            error_count += 1
            b = '0_0_0'
            target = outputFileDict[(0, 0, 0)]
        # Drop the last two header characters (presumably the '/1' or '/2'
        # mate suffix) and re-append them after the barcode label. R2 is
        # labelled '/2' in both branches; the failure branch used to write
        # '/1' for both mates.
        r1[0] = r1[0][:-2] + '/' + b + '/1'
        r2[0] = r2[0][:-2] + '/' + b + '/2'
        # Keep only the first 100 characters of the R2 sequence and quality.
        r2[1] = r2[1][:100]
        r2[3] = r2[3][:100]
        seqIO.write_seqs([r1, r2],
                         target,
                         fastx='q',
                         mode='a',
                         gz=False)
t2 = time.time()
with open('stlfr_split_sm.log', 'a') as f2:
Beispiel #4
0
    # Turn the current record into a (header, sequence) pair:
    # header = '<accession>.<version>|<organism>|<taxonomy string>'.
    seq = str(record.seq)
    annot = record.annotations
    organism = annot['organism'].replace(' ', '_')
    # Second word of the organism name is used as the species epithet.
    species = organism.split('_')[1]
    taxa = parse_level(annot['taxonomy'], string_level)
    ranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus')
    lineage = [taxa[rank] for rank in ranks]
    lineage.append(taxa['genus'] + '_' + species)
    taxonomy = 'k__{0};p__{1};c__{2};o__{3};f__{4};g__{5};s__{6}'.format(*lineage)
    accession = annot['accessions'][0]
    version = str(annot['sequence_version'])
    head = accession + '.' + version + '|' + organism + '|' + taxonomy
    gb_record.append((head, seq))
    count += 1
print(count)

#%% Write
# All collected (header, sequence) pairs go into one FASTA file.
count = metaSeqIO.write_seqs(
    gb_record, 'ncbi_targeted_refseq_16s.fa', fastx='a', gz=False)
print(count)
#%% Also write to the format of Burst
# The header is '<id>|<organism>|<taxonomy>': part1 collects the
# '<id>|<organism>\t<taxonomy>' lines, part2 the sequences keyed by
# '<id>|<organism>'.
part1 = []
part2 = []
for entry in gb_record:
    fields = entry[0].split('|')
    short_id = fields[0] + '|' + fields[1]
    part1.append('{0}\t{1}\n'.format(short_id, fields[2]))
    part2.append((short_id, entry[1]))
count = metaSeqIO.write_seqs(
    part2, 'ncbi_targeted_refseq_16s_burst.fa', fastx='a', gz=False)
with open('ncbi_targeted_refseq_16s_burst.txt', 'w') as f:
    for item in part1:
Beispiel #5
0
            f.write('{0}\t{1}\t{2}\n'.format(item[0], item[1], item[2]))
'''

#%%
''' Extract bead sequences by module number '''
from metaSeq import io as seqIO
from metaSeq import bead

# Column 1 of the tab-separated module table is used as the key (looked up
# by bead barcode below), column 2 as the module label; the first line is
# skipped as a header.
with open('kmer.jcd.0.02.module.txt', 'r') as f:
    f.readline()
    rows = (raw.strip('\n').split('\t') for raw in f)
    module = {row[0]: row[1] for row in rows}
print(len(module))

# One empty sequence list per distinct module label.
cluster = {label: [] for label in set(module.values())}
print(len(cluster))

beads = seqIO.beadJson('CL100077200_L01.json')
for entry in beads:
    current = bead.beadSequence(entry)
    label = module.get(current.barcode, False)
    if not label:
        # Barcode is not assigned to any module.
        continue
    cluster[label].extend(current.fastaSequences())
print(len(cluster))

# Write each module's sequences to its own FASTA file under cluster/.
for label, sequences in cluster.items():
    seqIO.write_seqs(sequences, 'cluster/{0}.fa'.format(label), fastx='a', mode='w')
Beispiel #6
0
parser.add_argument('-l', help='Minimum length to keep, >=')
parser.add_argument('-t', default=100000, help='Trunk size. By default is 100,000')
args = parser.parse_args()
input_file = args.i
output_file = args.o
rate = float(args.r)
ml = int(args.l)
trunk = int(args.t)
# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
t1 = time.perf_counter()
print('Reading {0} by trunk ({1}) ...'.format(input_file, trunk))
#%% Truncate filter at maxE rate 0.01
input_seq = io.sequence_trunk(input_file, fastx='q', trunk_size=trunk)
p_dict = qc.qual_score()

i = 0  # records read
j = 0  # records kept after truncation
c = 0  # chunks processed
# The loop variable is named 'chunk' so it no longer overwrites the integer
# chunk size stored in 'trunk'.
for chunk in input_seq:
    c += 1
    content = []
    for record in chunk:
        i += 1
        filtered = qc.trunc_ee_rate(record, p_dict, rate=rate)
        # Keep the record only if item 1 of the truncated result
        # (presumably the sequence) is still at least ml long.
        if len(filtered[1]) >= ml:
            content.append(filtered)
            j += 1
    # Each chunk is written with mode='a' so the output file grows chunk
    # by chunk.
    io.write_seqs(content, output_file, fastx='q', mode='a')
print('Expected error rate = {0}, Minimum length = {1}'.format(rate, ml))
print('Filtered {0}, kept {1}'.format(i, j))
t2 = time.perf_counter()
print('Use {0} s.'.format(t2-t1))
Beispiel #7
0
        count[1] += 1
# Summarise the split: which barcodes reach the threshold and how many
# reads they hold.
total = sum(count)
passed = [(key, value) for key, value in split.items()
          if len(value) >= threshold]
s = len(passed)
scount = sum(len(value) for _, value in passed)
# Guard the percentages so an input with no counted reads reports 0.00%
# instead of raising ZeroDivisionError.
eligible_pct = count[0] / total * 100 if total else 0.0
passed_pct = scount / total * 100 if total else 0.0
print('Finished splitting:')
print('\tTotal reads:\t\t{0}'.format(total))
print('\tEligible reads:\t\t{0}\t{1:3.2f}%.'.format(count[0], eligible_pct))
print('\tReads passed threshold:\t{0}\t{1:3.2f}%'.format(scount, passed_pct))
print('\tFound {0} barcodes with more than {1} reads.'.format(s, threshold))

# Write to files for barcodes pass the threshold
for i, (key, value) in enumerate(passed, start=1):
    # Element 0 of every stored pair goes to the .r1 file and element 1 to
    # the .r2 file; both files share the pair number i in the log line.
    for mate, suffix in enumerate(('.r1.fq.gz', '.r2.fq.gz')):
        output = outputPrefix + '_' + str(key) + suffix
        c = seqIO.write_seqs([r[mate] for r in value],
                             output,
                             fastx='q',
                             gz=True)
        print('\tPair{2}: {0} reads wrote to {1}.'.format(c, output, i))
print('Finished processing {0}.'.format(' '.join(inputFile)))