def test_split_fastq(self):
    ''' Checks that splitting 'data.fastq' into 1, 10 and 100 pieces
        partitions it: grouping the reads of all pieces by name must give
        exactly the same mapping as grouping the reads of the whole file.
    '''
    def collect_by_name(source, groups):
        # Append every read from source to the list stored under its name.
        for record in fastq.reads(source):
            groups[record.name].append(record)
        return groups

    fastq_fn = 'data.fastq'
    expected = collect_by_name(fastq_fn, defaultdict(list))

    for piece_count in (1, 10, 100):
        observed = defaultdict(list)
        for piece_index in range(piece_count):
            chunk = split_file.piece(fastq_fn, piece_count, piece_index, 'fastq')
            collect_by_name(chunk, observed)

        # A read missing from every piece, or present in more than one,
        # makes the two mappings differ.
        self.assertEqual(
            expected,
            observed,
            msg='Splitting did not partition',
        )
def get_reads(self):
    ''' Generator that yields the reads from this instance's piece
        (self.which_piece of self.num_pieces) of every file in self.data_fns.

        Reads are requested from fastq.reads with standardized names and
        Sanger quality encoding; per the original author, this copes with
        different fastq encodings across (but not within) files.

        Once a file is exhausted, a ('Reads in <file name>', count) entry is
        appended to self.summary; after the last file a ('Total reads', count)
        entry is appended. Progress is logged every 10000 reads.
    '''
    grand_total = 0

    for data_fn in self.data_fns:
        piece = split_file.piece(
            data_fn,
            self.num_pieces,
            self.which_piece,
            'fastq',
        )
        reads_in_piece = fastq.reads(
            piece,
            standardize_names=True,
            ensure_sanger_encoding=True,
        )

        # Stays 0 if the piece contains no reads.
        reads_in_file = 0
        for reads_in_file, read in enumerate(reads_in_piece, 1):
            yield read
            grand_total += 1
            if grand_total % 10000 == 0:
                logging.info('{0:,} reads processed'.format(grand_total))

        # Only the final path component is used to label the summary entry.
        short_name = os.path.basename(data_fn)
        self.summary.append(('Reads in {0}'.format(short_name), reads_in_file))

    logging.info('{0:,} total reads processed'.format(grand_total))
    self.summary.append(('Total reads', grand_total))
def align_reads_michelle(fastq_fn, target_fasta_fn, bam_fn):
    ''' Passes the reads from fastq_fn to align_reads, together with
        target_fasta_fn and bam_fn, using alignment_type='local', and
        iterates over the result while discarding every item.

        NOTE(review): presumably align_reads is lazy and must be consumed
        for the alignment to actually happen - confirm against align_reads.
    '''
    alignment_results = align_reads(
        target_fasta_fn,
        fastq.reads(fastq_fn),
        bam_fn,
        alignment_type='local',
    )
    # The items themselves are not needed; just run the iteration to the end.
    for _ignored in alignment_results:
        pass
def examine_locii(self):
    ''' For each (gene name, codon number) pair in self.codons_to_examine,
        counts triplets over the preprocessed reads with
        examine_specific_codon.count_triplets, then writes the results, keyed
        by that pair, via self.write_file('codons_to_examine', ...).
    '''
    all_CDSs, _ = self.get_CDSs(force_all=True)
    CDS_by_name = {CDS.name: CDS for CDS in all_CDSs}

    triplets_by_locus = {}
    for gene_name, codon_number in self.codons_to_examine:
        CDS = CDS_by_name[gene_name]
        # A fresh pass over the reads file is started for every locus.
        preprocessed_reads = fastq.reads(self.file_names['preprocessed_reads'])
        triplets_by_locus[gene_name, codon_number] = examine_specific_codon.count_triplets(
            preprocessed_reads,
            CDS,
            codon_number,
        )

    self.write_file('codons_to_examine', triplets_by_locus)
def test_new_synth(): import trim from Sequencing import fasta sfn = '/home/jah/projects/ribosomes/data/stephanie_markers/stephanie_markers.fa' synthetics = [read.seq for read in fasta.reads(sfn)] reads = fastq.reads( '/home/jah/projects/ribosomes/experiments/belgium_2014_08_07/WT_1_FP/data/WT_1_FP.140731.MiSeq.FCA.lane1.R1.fastq' ) for read in reads: trim_at = trim.trim_by_local_alignment(read.seq) trimmed_seq = read.seq[:trim_at] trimmed_read = fasta.Read(read.name, trimmed_seq) old = is_synthetic(trimmed_read, synthetics) new = is_synthetic_new(trimmed_read, synthetics) if old and not new and trimmed_seq != '': print 'old is', old print 'new is', new print trimmed_seq raw_input()
def test_new_synth(): import trim from Sequencing import fasta sfn = "/home/jah/projects/ribosomes/data/stephanie_markers/stephanie_markers.fa" synthetics = [read.seq for read in fasta.reads(sfn)] reads = fastq.reads( "/home/jah/projects/ribosomes/experiments/belgium_2014_08_07/WT_1_FP/data/WT_1_FP.140731.MiSeq.FCA.lane1.R1.fastq" ) for read in reads: trim_at = trim.trim_by_local_alignment(read.seq) trimmed_seq = read.seq[:trim_at] trimmed_read = fasta.Read(read.name, trimmed_seq) old = is_synthetic(trimmed_read, synthetics) new = is_synthetic_new(trimmed_read, synthetics) if old and not new and trimmed_seq != "": print "old is", old print "new is", new print trimmed_seq raw_input()
def get_reads(self):
    """ Generator that yields the reads from this instance's piece
        (self.which_piece of self.num_pieces) of every file in self.data_fns.

        Reads are requested from fastq.reads with standardized names and
        Sanger quality encoding; per the original author, this copes with
        different fastq encodings across (but not within) files.

        Once a file is exhausted, a ("Reads in <file name>", count) entry is
        appended to self.summary; after the last file a ("Total reads", count)
        entry is appended. Progress is logged every 10000 reads.
    """
    reads_seen = 0

    for data_fn in self.data_fns:
        # Remember the running total so the per-file count can be derived
        # once this file is exhausted.
        seen_before_file = reads_seen

        piece = split_file.piece(data_fn, self.num_pieces, self.which_piece, "fastq")
        for read in fastq.reads(piece, standardize_names=True, ensure_sanger_encoding=True):
            yield read
            reads_seen += 1
            if reads_seen % 10000 == 0:
                logging.info("{0:,} reads processed".format(reads_seen))

        _, short_name = os.path.split(data_fn)
        reads_in_file = reads_seen - seen_before_file
        self.summary.append(("Reads in {0}".format(short_name), reads_in_file))

    logging.info("{0:,} total reads processed".format(reads_seen))
    self.summary.append(("Total reads", reads_seen))
def get_reads():
    ''' Returns an iterator over at most the first 1000 items produced by
        fastq.reads(R1_fn).
    '''
    all_reads = fastq.reads(R1_fn)
    first_reads = islice(all_reads, 1000)
    return first_reads
updated_cigar = soft_clipped_block + trimmed_cigar else: # Remove blocks from the end. trimmed_cigar = sam.truncate_cigar_blocks_up_to( mapping.cigar, trimmed_length) updated_cigar = trimmed_cigar + soft_clipped_block mapping.cigar = updated_cigar if mapping.tags: # Clear the MD tag since the possible removal of bases to the # alignment may have made it inaccurate. # TODO: now have machinery to make it accurate. filtered_tags = filter(lambda t: t[0] != 'MD', mapping.tags) mapping.tags = filtered_tags set_nongenomic_length(mapping, bases_to_trim) return mapping if __name__ == '__main__': fastq_fn = '/home/jah/projects/ribosomes/experiments/guydosh_cell/dom34KO_CHX/data/SRR1042854.fastq' seqs = [r.seq for _, r in zip(xrange(100000), fastq.reads(fastq_fn))] seqs = utilities.progress_bar(len(seqs), seqs) adapter = full_linker count = 0 counts = Counter() for seq in seqs: counts[trim_by_local_alignment(adapter, seq)] += 1
def get_R1_reads():
    ''' Returns an iterator over at most the first 100 items produced by
        fastq.reads(R1_fn).
    '''
    all_R1_reads = fastq.reads(R1_fn)
    first_R1_reads = islice(all_R1_reads, 100)
    return first_R1_reads
def length_from_file_name(file_name):
    ''' Returns the length of the sequence of the first read that
        fastq.reads(file_name) produces.

        Only the first read is looked at. If fastq.reads produces no reads,
        the StopIteration raised by next() propagates to the caller.
    '''
    # The builtin next() works on both Python 2.6+ and Python 3, unlike the
    # Python-2-only iterator.next() method.
    first_read = next(fastq.reads(file_name))
    return len(first_read.seq)
fqual = ''.join(fqualList) count = count + 1 nRead = fastq.Read(oldRead.name, r.seq, fqual) collapsedReads[nseq] = [nRead, count] else: nRead = fastq.Read(r.name, r.seq, r.qual) #[rSlice]) collapsedReads[nseq] = [nRead, 1] counter = counter + 1 fh = open(outfile, 'w') for i in collapsedReads: [r, count] = collapsedReads[i] #n = r.name.split(' ') fh.write(str(fastq.Read(r.name + "_" + str(count), r.seq, r.qual))) fh.close() if __name__ == '__main__': import itertools parser = argparse.ArgumentParser() parser.add_argument('R1', help='input Reads fastq file name (can ge gzip\'ed)') parser.add_argument('outfileCollapsed', help='output fastq of collapsed reads') args = parser.parse_args() reads = fastq.reads(args.R1) collapse_fastq(reads, args.outfileCollapsed)
if counter % 1000 == 0: print(str(counter) + " groups processed...") fh.close() print("# of cell-UMI groups = " + str(len(UMIGrps))) print("# reads qual <20 (filtered) = " + str(numReadsQualFilt)) print("# grps w/ reads<" + str(readThres) + " = " + str(numBelowReadThres)) print("# grps singles = " + str(numSingles)) print("# grps >0.5 = " + str(numMaj)) print("# grps concensus = " + str(numCon)) if __name__ == '__main__': t0 = time.time() parser = argparse.ArgumentParser() parser.add_argument('fq', help='fastq of reads collapsed by sequence') parser.add_argument( 'readThres', help='UMIs with <readThres will be thrown out; default=3', default=3) parser.add_argument('outfile', help='collapsedFastqTable.txt') args = parser.parse_args() reads = fastq.reads(args.fq) collapseUMIs(reads, int(args.readThres), args.outfile) print("Final Time: " + str(time.time() - t0))
# Remove blocks from the beginning. trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(mapping.cigar, trimmed_length) updated_cigar = soft_clipped_block + trimmed_cigar else: # Remove blocks from the end. trimmed_cigar = sam.truncate_cigar_blocks_up_to(mapping.cigar, trimmed_length) updated_cigar = trimmed_cigar + soft_clipped_block mapping.cigar = updated_cigar if mapping.tags: # Clear the MD tag since the possible removal of bases to the # alignment may have made it inaccurate. # TODO: now have machinery to make it accurate. filtered_tags = filter(lambda t: t[0] != 'MD', mapping.tags) mapping.tags = filtered_tags set_nongenomic_length(mapping, bases_to_trim) return mapping if __name__ == '__main__': fastq_fn = '/home/jah/projects/ribosomes/experiments/guydosh_cell/dom34KO_CHX/data/SRR1042854.fastq' seqs = [r.seq for _, r in zip(xrange(100000), fastq.reads(fastq_fn))] seqs = utilities.progress_bar(len(seqs), seqs) adapter = full_linker count = 0 counts = Counter() for seq in seqs: counts[trim_by_local_alignment(adapter, seq)] += 1