# Convert a beadJson file (one JSON document per line) into two FASTQ files,
# <base>.r1.fq and <base>.r2.fq, rewriting every read header to the layout
# <name>/<barcode>/<mate>.
import argparse
from metaSeq import io as seqIO
from metaSeq import bead
import json

parser = argparse.ArgumentParser()
# -i and -o are mandatory: without them the script previously failed later
# with a TypeError from open(None) / None + '.r1.fq'.
parser.add_argument('-i', required=True, help='Input beadJson format file.')
parser.add_argument('-o', required=True,
                    help='The output FASTQ file base. Barcode will be put at -2, while /1 and /2 at -1.')
parser.add_argument('-bp', default=-1, type=int,
                    help='The position of barcode in beadJson. usually -1 or -2')
args = parser.parse_args()
inputFile = args.i
outputFileBase = args.o
barcodePosition = args.bp


def _fastq_label(label, barcode_position):
    """Return `label` rearranged as '<name>/<barcode>/<mate>'.

    `label` is split on '/' and its first three fields are used.

    barcode_position -- where the barcode sits in the incoming label.
        -2 (or 1) means the label is already name/barcode/mate and the
        field order is kept.  Any other value (including the default -1)
        treats the third field as the barcode and swaps it with the
        second, which is what the script always did before -bp was
        honoured.

    Raises IndexError when the label has fewer than three '/'-separated
    fields, as the original inline code did.
    """
    fields = label.split('/')
    name, second, third = fields[0], fields[1], fields[2]
    if barcode_position in (-2, 1):
        barcode, mate = second, third
    else:
        barcode, mate = third, second
    return name + '/' + barcode + '/' + mate


with open(inputFile, 'r') as f:
    for line in f:
        currentBead = bead.beadSequence(json.loads(line.strip('\n')))
        outputR1 = []
        outputR2 = []
        for record in currentBead.fragments:
            # Each fragment carries two 4-field records back to back:
            # items 0-3 go to the r1 file, items 4-7 to the r2 file.
            r1 = record[0:4]
            r2 = record[4:8]
            r1[0] = _fastq_label(r1[0], barcodePosition)
            r2[0] = _fastq_label(r2[0], barcodePosition)
            outputR1.append(r1)
            outputR2.append(r2)
        # Append per bead so the whole input never has to be held in memory.
        seqIO.write_seqs(outputR1, outputFileBase + '.r1.fq', fastx='q', mode='a')
        seqIO.write_seqs(outputR2, outputFileBase + '.r2.fq', fastx='q', mode='a')
def fastaWrite(self, folder=''):
    """Write this object's FASTA sequences to '<folder><barcode>.fa'.

    folder -- prefix prepended verbatim to the file name; no path
              separator is inserted, so pass e.g. 'out/' rather than 'out'.

    Returns whatever seqIO.write_seqs returns for the written file.
    """
    sequences = self.fastaSequences()
    target = ''.join((folder, self.barcode, '.fa'))
    return seqIO.write_seqs(sequences, target)
error_count = 0 seqs = seqIO.sequence_twin(r1File, r2File) for r1, r2 in seqs: count += 1 if count // 1000000 >= 1: f1.write('\tProcessed {8.2f} M reads.'.format(count // 1000000)) bead = number_tuple(barcode_set(r2[1]), NoSnpDict, OneSnpDict) if bead: b = '_'.join([str(i) for i in bead]) r1[0] = r1[0][:-2] + '/' + b + '/1' r2[0] = r2[0][:-2] + '/' + b + '/2' r2[1] = r2[1][:100] r2[3] = r2[3][:100] seqIO.write_seqs([r1, r2], outputFileDict[number2ord(bead)], fastx='q', mode='a', gz=False) else: error_count += 1 r1[0] = r1[0][:-2] + '/' + '0_0_0' + '/1' r2[0] = r2[0][:-2] + '/' + '0_0_0' + '/1' r2[1] = r2[1][:100] r2[3] = r2[3][:100] seqIO.write_seqs([r1, r2], outputFileDict[(0, 0, 0)], fastx='q', mode='a', gz=False) t2 = time.time() with open('stlfr_split_sm.log', 'a') as f2:
# NOTE(review): the statements down to `count += 1` look like the tail of a
# per-record loop whose header lies above this chunk -- confirm their
# indentation against the full file.
# Sequence of the current record as a plain string.
seq = record.seq.__str__()
annot = record.annotations
# Organism name with spaces replaced by underscores.
organism = '_'.join(annot['organism'].split(' '))
# The second word of the organism name is used as the species part.
species = organism.split('_')[1]
taxa = parse_level(annot['taxonomy'], string_level)
# Seven-rank lineage string; the s__ rank is built as <genus>_<species>.
taxonomy = 'k__{0};p__{1};c__{2};o__{3};f__{4};g__{5};s__{6}'.format(taxa['superkingdom'], taxa['phylum'], taxa['class'], \
            taxa['order'], taxa['family'], taxa['genus'], taxa['genus'] + '_' + species)
# Header layout: <accession>.<sequence_version>|<organism>|<taxonomy>
head = annot['accessions'][0] + '.' + str(
    annot['sequence_version']) + '|' + organism + '|' + taxonomy
gb_record.append((head, seq))
count += 1
print(count)
#%% Write
# All collected (header, sequence) pairs go to one uncompressed FASTA file.
count = metaSeqIO.write_seqs(gb_record, 'ncbi_targeted_refseq_16s.fa', fastx='a', gz=False)
print(count)
#%% Also write to the format of Burst
# part1: tab-separated lines "<accession>|<organism>\t<taxonomy>\n"
# part2: (shortened header, sequence) pairs for the Burst FASTA file
part1 = []
part2 = []
for record in gb_record:
    # head[0] = accession.version, head[1] = organism, head[2] = taxonomy
    head = record[0].split('|')
    part1.append('{0}\t{1}\n'.format(head[0] + '|' + head[1], head[2]))
    part2.append((head[0] + '|' + head[1], record[1]))
count = metaSeqIO.write_seqs(part2, 'ncbi_targeted_refseq_16s_burst.fa', fastx='a', gz=False)
with open('ncbi_targeted_refseq_16s_burst.txt', 'w') as f:
    for item in part1:
f.write('{0}\t{1}\t{2}\n'.format(item[0], item[1], item[2])) ''' #%% ''' Extract bead sequences by module number ''' from metaSeq import io as seqIO from metaSeq import bead module = {} with open('kmer.jcd.0.02.module.txt', 'r') as f: f.readline() for line in f: line = line.strip('\n').split('\t') module[line[0]] = line[1] print(len(module)) cluster = {} for item in list(set(module.values())): cluster[item] = [] print(len(cluster)) beads = seqIO.beadJson('CL100077200_L01.json') for item in beads: b = bead.beadSequence(item) classNumber = module.get(b.barcode, False) if classNumber: cluster[classNumber] += b.fastaSequences() print(len(cluster)) for key, value in cluster.items(): seqIO.write_seqs(value, 'cluster/{0}.fa'.format(key), fastx='a', mode='w')
parser.add_argument('-l', help='Minimum length to keep, >=')
parser.add_argument('-t', default=100000, help='Trunk size. By default is 100,000')
args = parser.parse_args()
input_file = args.i
output_file = args.o
rate = float(args.r)
ml = int(args.l)
trunk = int(args.t)

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# clock for measuring elapsed time.
t1 = time.perf_counter()
print('Reading {0} by trunk ({1}) ...'.format(input_file, trunk))
#%% Truncate filter at maxE rate 0.01
# Read the input in chunks of `trunk` records so memory use stays bounded.
input_seq = io.sequence_trunk(input_file, fastx='q', trunk_size=trunk)
p_dict = qc.qual_score()
i = 0  # records examined
j = 0  # records kept
# The loop variable is named `chunk` so it no longer overwrites the `trunk`
# size read from the command line.
for chunk in input_seq:
    content = []
    for record in chunk:
        i += 1
        filtered = qc.trunc_ee_rate(record, p_dict, rate=rate)
        # Keep the truncated record only if field 1 is still long enough.
        if len(filtered[1]) >= ml:
            content.append(filtered)
            j += 1
    # Append each filtered chunk to the output as soon as it is ready.
    io.write_seqs(content, output_file, fastx='q', mode='a')
print('Expected error rate = {0}, Minimum length = {1}'.format(rate, ml))
print('Filtered {0}, kept {1}'.format(i, j))
t2 = time.perf_counter()
print('Use {0} s.'.format(t2-t1))
# NOTE(review): this first statement looks like the tail of a loop/branch
# whose header lies above this chunk -- confirm its indentation.
count[1] += 1
# Summarise the split, then write one gzipped FASTQ pair per barcode that
# holds at least `threshold` read pairs.
s = sum(len(i) >= threshold for i in split.values())
scount = sum(len(i) for i in split.values() if len(i) >= threshold)
total_reads = sum(count)
# With no reads at all, report 0.00% instead of raising ZeroDivisionError.
eligible_pct = count[0] / total_reads * 100 if total_reads else 0.0
passed_pct = scount / total_reads * 100 if total_reads else 0.0
print('Finished splitting:')
print('\tTotal reads:\t\t{0}'.format(total_reads))
print('\tEligible reads:\t\t{0}\t{1:3.2f}%.'.format(count[0], eligible_pct))
print('\tReads passed threshold:\t{0}\t{1:3.2f}%'.format(scount, passed_pct))
print('\tFound {0} barcodes with more than {1} reads.'.format(s, threshold))
# Write to files for barcodes pass the threshold
i = 1  # running number of the pair being written, used only in the messages
for key, value in split.items():
    if len(value) < threshold:
        continue
    outputR1 = outputPrefix + '_' + str(key) + '.r1.fq.gz'
    outputR2 = outputPrefix + '_' + str(key) + '.r2.fq.gz'
    # Each element of `value` is a pair: item 0 goes to r1, item 1 to r2.
    c = seqIO.write_seqs([r[0] for r in value], outputR1, fastx='q', gz=True)
    print('\tPair{2}: {0} reads wrote to {1}.'.format(c, outputR1, i))
    c = seqIO.write_seqs([r[1] for r in value], outputR2, fastx='q', gz=True)
    print('\tPair{2}: {0} reads wrote to {1}.'.format(c, outputR2, i))
    i += 1
print('Finished processing {0}.'.format(' '.join(inputFile)))