Example #1
    def test_split_fastq(self):
        ''' Tests whether every read in a fastq file shows up exactly once in
        a piece of a split.
        '''
        fn = 'data.fastq'

        whole = defaultdict(list)

        for read in fastq.reads(fn):
            whole[read.name].append(read)

        num_pieces_list = [1, 10, 100]
        from_pieces = {n: defaultdict(list) for n in num_pieces_list}

        for num_pieces in num_pieces_list:
            for which_piece in range(num_pieces):
                piece = split_file.piece(fn, num_pieces, which_piece, 'fastq')

                for read in fastq.reads(piece):
                    from_pieces[num_pieces][read.name].append(read)

            self.assertEqual(
                whole,
                from_pieces[num_pieces],
                msg='Splitting did not partition',
            )
Example #2
    def get_reads(self):
        ''' A generator over the reads in a piece of each data file.
            Can handle a mixture of different fastq encodings across (but not
            within) files.
        '''
        total_reads = 0
        for file_name in self.data_fns:
            total_reads_from_file = 0
            file_piece = split_file.piece(
                file_name,
                self.num_pieces,
                self.which_piece,
                'fastq',
            )
            for read in fastq.reads(file_piece,
                                    standardize_names=True,
                                    ensure_sanger_encoding=True):
                yield read

                total_reads += 1
                total_reads_from_file += 1
                if total_reads % 10000 == 0:
                    logging.info('{0:,} reads processed'.format(total_reads))

            tail = os.path.basename(file_name)
            self.summary.append(
                ('Reads in {0}'.format(tail), total_reads_from_file))

        logging.info('{0:,} total reads processed'.format(total_reads))

        self.summary.append(('Total reads', total_reads))
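Because get_reads is a generator, each per-file summary row is appended only once that file is exhausted. A hypothetical consumer (mapper and handle are placeholders, not library names):

# 'mapper' stands in for whatever object defines get_reads();
# 'handle' is placeholder per-read processing.
for read in mapper.get_reads():
    handle(read)
# mapper.summary is fully populated only after the loop finishes.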
Example #3
def align_reads_michelle(fastq_fn, target_fasta_fn, bam_fn):
    reads = fastq.reads(fastq_fn)

    # align_reads is a generator; iterate over it just to drive the alignment
    # and write bam_fn, discarding the yielded values.
    for _ in align_reads(target_fasta_fn,
                         reads,
                         bam_fn,
                         alignment_type='local'):
        pass
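An equivalent drain idiom, offered as an alternative sketch, discards the yields with collections.deque instead of an explicit loop:

from collections import deque

# maxlen=0 makes deque discard every yielded item immediately.
deque(align_reads(target_fasta_fn, reads, bam_fn, alignment_type='local'),
      maxlen=0)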
Example #4
    def examine_locii(self):
        locii = {}
        
        CDSs, _ = self.get_CDSs(force_all=True)
        CDSs = {c.name: c for c in CDSs}

        for gene_name, codon_number in self.codons_to_examine:
            gene = CDSs[gene_name]
            # fastq.reads is a generator, so it has to be re-created for each
            # codon; a single generator would be exhausted after the first pass.
            reads = fastq.reads(self.file_names['preprocessed_reads'])
            triplets = examine_specific_codon.count_triplets(reads, gene, codon_number)

            locii[gene_name, codon_number] = triplets

        self.write_file('codons_to_examine', locii)
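Re-creating reads inside the loop matters: a generator can only be consumed once. A minimal illustration of the pitfall this avoids (the file name here is hypothetical):

reads = fastq.reads('preprocessed.fastq')  # hypothetical path
first_pass = sum(1 for _ in reads)   # counts every read
second_pass = sum(1 for _ in reads)  # 0: the generator is already exhausted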
Example #5
def test_new_synth():
    import trim
    from Sequencing import fasta
    sfn = '/home/jah/projects/ribosomes/data/stephanie_markers/stephanie_markers.fa'
    synthetics = [read.seq for read in fasta.reads(sfn)]

    reads = fastq.reads(
        '/home/jah/projects/ribosomes/experiments/belgium_2014_08_07/WT_1_FP/data/WT_1_FP.140731.MiSeq.FCA.lane1.R1.fastq'
    )
    for read in reads:
        trim_at = trim.trim_by_local_alignment(read.seq)
        trimmed_seq = read.seq[:trim_at]
        trimmed_read = fasta.Read(read.name, trimmed_seq)
        old = is_synthetic(trimmed_read, synthetics)
        new = is_synthetic_new(trimmed_read, synthetics)
        if old and not new and trimmed_seq != '':
            # Pause at each read where the two implementations disagree.
            print('old is', old)
            print('new is', new)
            print(trimmed_seq)
            input()
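Scanning a whole MiSeq lane while pausing at every disagreement is slow. During debugging, the pass can be capped with itertools.islice; a suggestion, not part of the original:

from itertools import islice

reads = fastq.reads(fastq_path)  # fastq_path: placeholder for the path above
for read in islice(reads, 10000):  # inspect only the first 10,000 reads
    ...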
Example #6
def get_reads():
    # Cap the scan at the first 1000 reads; islice is lazy, so the rest of
    # R1_fn is never read.
    return islice(fastq.reads(R1_fn), 1000)
Example #7
            # Remove blocks from the beginning.
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block

        mapping.cigar = updated_cigar

    if mapping.tags:
        # Clear the MD tag, since removing bases from the alignment may have
        # made it inaccurate.
        # TODO: now have machinery to make it accurate.
        filtered_tags = [t for t in mapping.tags if t[0] != 'MD']
        mapping.tags = filtered_tags

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping


if __name__ == '__main__':
    fastq_fn = '/home/jah/projects/ribosomes/experiments/guydosh_cell/dom34KO_CHX/data/SRR1042854.fastq'
    seqs = [r.seq for _, r in zip(range(100000), fastq.reads(fastq_fn))]
    seqs = utilities.progress_bar(len(seqs), seqs)
    adapter = full_linker
    count = 0
    counts = Counter()
    for seq in seqs:
        counts[trim_by_local_alignment(adapter, seq)] += 1
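After the loop, counts maps each trim position to the number of reads trimmed there. A hypothetical way to summarize it, using Counter.most_common:

# Report the ten most frequent trim positions (sketch, not in the original).
for position, n in counts.most_common(10):
    print('{0}: {1:,} reads'.format(position, n))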
Example #8
def get_R1_reads():
    return islice(fastq.reads(R1_fn), 100)
Example #9
def length_from_file_name(file_name):
    # Read length is inferred from the first record in the file.
    length = len(next(fastq.reads(file_name)).seq)
    return length
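next() raises StopIteration on an empty fastq file. A defensive variant, sketched here rather than taken from the source, supplies a default:

def length_from_file_name_safe(file_name):
    # Returns None for an empty file instead of raising StopIteration.
    first = next(fastq.reads(file_name), None)
    return len(first.seq) if first is not None else None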
Example #10
            fqual = ''.join(fqualList)
            count += 1
            nRead = fastq.Read(oldRead.name, r.seq, fqual)
            collapsedReads[nseq] = [nRead, count]
        else:
            nRead = fastq.Read(r.name, r.seq, r.qual)
            collapsedReads[nseq] = [nRead, 1]
        counter += 1

    # Append the collapse count to each read name and write one fastq record
    # per unique sequence.
    with open(outfile, 'w') as fh:
        for r, count in collapsedReads.values():
            fh.write(str(fastq.Read(r.name + "_" + str(count), r.seq, r.qual)))


if __name__ == '__main__':
    import itertools
    parser = argparse.ArgumentParser()
    parser.add_argument('R1',
                        help='input reads fastq file name (can be gzip\'ed)')
    parser.add_argument('outfileCollapsed',
                        help='output fastq of collapsed reads')

    args = parser.parse_args()
    reads = fastq.reads(args.R1)

    collapse_fastq(reads, args.outfileCollapsed)
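The write loop relies on str() of a fastq.Read producing a complete record; the four-line '@name / seq / + / qual' layout is inferred from that usage, not from library docs. A minimal sketch with made-up values:

read = fastq.Read('read1_5', 'ACGT', 'IIII')  # illustrative values only
with open('collapsed.fastq', 'w') as fh:      # hypothetical output path
    fh.write(str(read))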
Example #11
        if counter % 1000 == 0:
            print(str(counter) + " groups processed...")

    fh.close()

    print("# of cell-UMI groups = " + str(len(UMIGrps)))
    print("# reads qual <20 (filtered) = " + str(numReadsQualFilt))
    print("# grps w/ reads<" + str(readThres) + " = " + str(numBelowReadThres))
    print("# grps singles = " + str(numSingles))
    print("# grps >0.5 = " + str(numMaj))
    print("# grps consensus = " + str(numCon))


if __name__ == '__main__':

    t0 = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument('fq', help='fastq of reads collapsed by sequence')
    parser.add_argument(
        'readThres',
        nargs='?',
        type=int,
        default=3,
        help='UMIs with fewer than readThres reads will be thrown out; default=3')
    parser.add_argument('outfile', help='collapsedFastqTable.txt')

    args = parser.parse_args()

    reads = fastq.reads(args.fq)

    collapseUMIs(reads, args.readThres, args.outfile)
    print("Final Time: " + str(time.time() - t0))