Esempio n. 1
0
def trim_polyA_ends(fastq_filename,
                    output_dir,
                    compressed=False,
                    min_polyA_len=3,
                    min_read_len=22):
    """
    Trim polyA ends from reads.

    Only reads whose sequence ends in at least ``min_polyA_len`` As are
    written to the output; every other read is dropped. For the reads
    that are kept, the trailing As are removed with ``rstrip_stretch``
    and the quality string is truncated to the trimmed length.

    Parameters:
      - fastq_filename: path of the FASTQ file to read.
      - output_dir: directory for the output file (handed to
        ``utils.make_dir`` before use).
      - compressed: currently unused by this function; kept so that
        existing callers keep working.
      - min_polyA_len: minimum number of trailing As a read must have
        to be kept. Values below 1 are treated as 1, since a read
        must end in at least one A to be trimmed at all.
      - min_read_len: reads shorter than this after trimming are
        dropped.

    Returns the output filename. If that file already exists, nothing
    is processed and the existing filename is returned.
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension
    output_basename = \
        ".".join(os.path.basename(fastq_filename).split(".")[0:-1])
    output_basename = "%s.trimmed_polyA.fastq.gz" % (output_basename)
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print("  - Outputting trimmed sequences to: %s" % (output_filename))
    # Tail every kept read must end with. Built once, outside the loop.
    # Using endswith() rather than seq[-n:] avoids the n == 0 pitfall,
    # where seq[-0:] is the whole read and would never match.
    required_tail = "A" * max(min_polyA_len, 1)
    # NOTE(review): assumes the handle returned by read_open_fastq has a
    # close() method, like the write handle does -- confirm.
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    try:
        output_file = fastq_utils.write_open_fastq(output_filename)
        try:
            t1 = time.time()
            for header, seq, header2, qual in \
                    fastq_utils.read_fastq(input_file):
                # Skip sequences that do not end with at least N
                # many As
                if not seq.endswith(required_tail):
                    continue
                # Get sequence stripped of contiguous stretch of polyAs
                stripped_seq = rstrip_stretch(seq, "A")
                if len(stripped_seq) < min_read_len:
                    # Skip altogether reads that are shorter than
                    # the required length after trimming
                    continue
                # Strip the quality scores to match trimmed sequence
                new_qual = qual[0:len(stripped_seq)]
                new_rec = (header, stripped_seq, header2, new_qual)
                # Write the record with trimmed sequence back out to file
                fastq_utils.write_fastq(output_file, new_rec)
            t2 = time.time()
        finally:
            # Close the output even if reading or writing failed
            output_file.close()
    finally:
        input_file.close()
    print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    return output_filename
Esempio n. 2
0
def trim_polyA_ends(fastq_filename,
                    output_dir,
                    compressed=False,
                    min_polyA_len=3,
                    min_read_len=22):
    """
    Trim polyA ends from reads.

    Only reads whose sequence ends in at least ``min_polyA_len`` As are
    written to the output; every other read is dropped. For the reads
    that are kept, the trailing As are removed with ``rstrip_stretch``
    and the quality string is truncated to the trimmed length.

    Parameters:
      - fastq_filename: path of the FASTQ file to read.
      - output_dir: directory for the output file (handed to
        ``utils.make_dir`` before use).
      - compressed: currently unused by this function; kept so that
        existing callers keep working.
      - min_polyA_len: minimum number of trailing As a read must have
        to be kept. Values below 1 are treated as 1, since a read
        must end in at least one A to be trimmed at all.
      - min_read_len: reads shorter than this after trimming are
        dropped.

    Returns the output filename. If that file already exists, nothing
    is processed and the existing filename is returned.
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension
    output_basename = \
        ".".join(os.path.basename(fastq_filename).split(".")[0:-1])
    output_basename = "%s.trimmed_polyA.fastq.gz" % (output_basename)
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print("  - Outputting trimmed sequences to: %s" % (output_filename))
    # Tail every kept read must end with. Built once, outside the loop.
    # Using endswith() rather than seq[-n:] avoids the n == 0 pitfall,
    # where seq[-0:] is the whole read and would never match.
    required_tail = "A" * max(min_polyA_len, 1)
    # NOTE(review): assumes the handle returned by read_open_fastq has a
    # close() method, like the write handle does -- confirm.
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    try:
        output_file = fastq_utils.write_open_fastq(output_filename)
        try:
            t1 = time.time()
            for header, seq, header2, qual in \
                    fastq_utils.read_fastq(input_file):
                # Skip sequences that do not end with at least N
                # many As
                if not seq.endswith(required_tail):
                    continue
                # Get sequence stripped of contiguous stretch of polyAs
                stripped_seq = rstrip_stretch(seq, "A")
                if len(stripped_seq) < min_read_len:
                    # Skip altogether reads that are shorter than
                    # the required length after trimming
                    continue
                # Strip the quality scores to match trimmed sequence
                new_qual = qual[0:len(stripped_seq)]
                new_rec = (header, stripped_seq, header2, new_qual)
                # Write the record with trimmed sequence back out to file
                fastq_utils.write_fastq(output_file, new_rec)
            t2 = time.time()
        finally:
            # Close the output even if reading or writing failed
            output_file.close()
    finally:
        input_file.close()
    print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    return output_filename
Esempio n. 3
0
 def get_seq_cycle_profile(self, fastq_filename,
                           first_n_seqs=None):
     """
     Compute the fraction of 'N' bases (unable to sequence)
     as a function of the position in the read.

     Parameters:
       - fastq_filename: path of the FASTQ file to read.
       - first_n_seqs: if not None, only the first ``first_n_seqs``
         entries of the file are examined.

     Returns a list with one float per read position (index 0 is the
     first base), each being the number of reads with an 'N' at that
     position divided by the number of reads that reach that
     position. Returns an empty list if no reads were examined.
     """
     # Mapping from position in read to number of Ns
     num_n_bases = defaultdict(int)
     # Mapping from position in read to total number of
     # reads in that position
     num_reads = defaultdict(int)
     num_entries = 0
     # NOTE(review): assumes the handle returned by read_open_fastq has
     # a close() method -- confirm.
     fastq_file = fastq_utils.read_open_fastq(fastq_filename)
     try:
         fastq_entries = fastq_utils.read_fastq(fastq_file)
         print("Computing sequence cycle profile for: %s" % (fastq_filename))
         if first_n_seqs is not None:
             print("Looking at first %d sequences only" % (first_n_seqs))
         for entry in fastq_entries:
             # Stop at requested number of entries if asked to
             if first_n_seqs is not None and num_entries >= first_n_seqs:
                 break
             header1, seq, header2, qual = entry
             for pos, base in enumerate(seq):
                 if base == "N":
                     # Record occurrences of N
                     num_n_bases[pos] += 1
                 num_reads[pos] += 1
             num_entries += 1
     finally:
         fastq_file.close()
     # Compute fraction of N along each position. Every read adds the
     # positions 0..len(seq)-1, so the keys of num_reads are exactly
     # 0..len(num_reads)-1. Iterating over that full range includes the
     # last position and yields an empty list when no reads were seen.
     percent_n = []
     for base_pos in range(len(num_reads)):
         curr_percent_n = float(num_n_bases[base_pos]) / num_reads[base_pos]
         percent_n.append(curr_percent_n)
     return percent_n