Beispiel #1
0
def trim_polyA_ends(fastq_filename,
                    output_dir,
                    compressed=False,
                    min_polyA_len=3,
                    min_read_len=22):
    """
    Trim polyA ends from reads.

    Reads the FASTQ records of ``fastq_filename`` and writes the reads
    that end in a polyA stretch, with that stretch removed, to
    ``<output_dir>/<name>.trimmed_polyA.fastq.gz``, where ``<name>`` is
    the input file name without its last extension.

    A read is written only if:
      - it ends with at least ``min_polyA_len`` consecutive "A" bases
        (reads that do not end with "A" at all are never written), and
      - it is at least ``min_read_len`` bases long after trimming.

    Parameters:
      - fastq_filename: path of the FASTQ file to read.
      - output_dir: output directory; passed to utils.make_dir first.
      - compressed: currently unused, kept for interface compatibility.
      - min_polyA_len: minimum length of the trailing run of "A"s.
        Values <= 0 impose no minimum beyond ending with one "A".
      - min_read_len: minimum length of a read after trimming.

    Returns the output filename. If that file already exists, nothing
    is processed and the existing path is returned.

    NOTE(review): if processing fails midway, a partial output file is
    left behind and later calls will skip it because it exists.
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension
    output_basename = \
        ".".join(os.path.basename(fastq_filename).split(".")[0:-1])
    output_basename = "%s.trimmed_polyA.fastq.gz" % (output_basename)
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print("  - Outputting trimmed sequences to: %s" % (output_filename))
    # Required polyA suffix; built once since it does not change per read
    required_tail = "A" * min_polyA_len
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    try:
        output_file = fastq_utils.write_open_fastq(output_filename)
        try:
            t1 = time.time()
            for header, seq, header2, qual in \
                    fastq_utils.read_fastq(input_file):
                # Only reads with a polyA tail are candidates for output
                if not seq.endswith("A"):
                    continue
                # Skip sequences that do not end with at least N
                # many As
                if not seq.endswith(required_tail):
                    continue
                # Get sequence stripped of contiguous stretch of polyAs
                stripped_seq = rstrip_stretch(seq, "A")
                if len(stripped_seq) < min_read_len:
                    # Skip altogether reads that are shorter than
                    # the required length after trimming
                    continue
                # Strip the quality scores to match trimmed sequence
                new_qual = qual[:len(stripped_seq)]
                new_rec = (header, stripped_seq, header2, new_qual)
                # Write the record with trimmed sequence back out to file
                fastq_utils.write_fastq(output_file, new_rec)
            t2 = time.time()
            print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
        finally:
            # Close the output even if reading or writing failed
            output_file.close()
    finally:
        # The input handle was previously never closed
        input_file.close()
    return output_filename
Beispiel #2
0
def trim_polyA_ends(fastq_filename,
                    output_dir,
                    compressed=False,
                    min_polyA_len=3,
                    min_read_len=22):
    """
    Trim polyA ends from reads.

    Reads the FASTQ records of ``fastq_filename`` and writes the reads
    that end in a polyA stretch, with that stretch removed, to
    ``<output_dir>/<name>.trimmed_polyA.fastq.gz``, where ``<name>`` is
    the input file name without its last extension.

    A read is written only if:
      - it ends with at least ``min_polyA_len`` consecutive "A" bases
        (reads that do not end with "A" at all are never written), and
      - it is at least ``min_read_len`` bases long after trimming.

    Parameters:
      - fastq_filename: path of the FASTQ file to read.
      - output_dir: output directory; passed to utils.make_dir first.
      - compressed: currently unused, kept for interface compatibility.
      - min_polyA_len: minimum length of the trailing run of "A"s.
        Values <= 0 impose no minimum beyond ending with one "A".
      - min_read_len: minimum length of a read after trimming.

    Returns the output filename. If that file already exists, nothing
    is processed and the existing path is returned.

    NOTE(review): if processing fails midway, a partial output file is
    left behind and later calls will skip it because it exists.
    """
    print("Trimming polyA trails from: %s" %(fastq_filename))
    # Strip the trailing extension
    output_basename = \
        ".".join(os.path.basename(fastq_filename).split(".")[0:-1])
    output_basename = "%s.trimmed_polyA.fastq.gz" %(output_basename)
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" %(output_filename))
        return output_filename
    print("  - Outputting trimmed sequences to: %s" %(output_filename))
    # Required polyA suffix; built once since it does not change per read
    required_tail = "A" * min_polyA_len
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    try:
        output_file = fastq_utils.write_open_fastq(output_filename)
        try:
            t1 = time.time()
            for header, seq, header2, qual in \
                    fastq_utils.read_fastq(input_file):
                # Only reads with a polyA tail are candidates for output
                if not seq.endswith("A"):
                    continue
                # Skip sequences that do not end with at least N
                # many As
                if not seq.endswith(required_tail):
                    continue
                # Get sequence stripped of contiguous stretch of polyAs
                stripped_seq = rstrip_stretch(seq, "A")
                if len(stripped_seq) < min_read_len:
                    # Skip altogether reads that are shorter than
                    # the required length after trimming
                    continue
                # Strip the quality scores to match trimmed sequence
                new_qual = qual[:len(stripped_seq)]
                new_rec = (header, stripped_seq, header2, new_qual)
                # Write the record with trimmed sequence back out to file
                fastq_utils.write_fastq(output_file, new_rec)
            t2 = time.time()
            print("Trimming took %.2f mins." %((t2 - t1)/60.))
        finally:
            # Close the output even if reading or writing failed
            output_file.close()
    finally:
        # The input handle was previously never closed
        input_file.close()
    return output_filename
Beispiel #3
0
def get_fastx_entries(fastx_filename,
                      fasta=False,
                      fastq=False):
    """
    Get entries of FASTQ/FASTA file.

    if fasta=True, read file as fasta regardless of extension.
    if fastq=True, read file as fastq regardless of extension.
    If both flags are set, fasta takes precedence.

    When neither flag is set, the type reported by get_fastx_type
    decides which reader is used.

    Returns the value produced by the selected reader, or an empty
    list when the file is neither forced nor detected as fasta/fastq.
    """
    # Explicit overrides win over detection, so fastq=True is honoured
    # even for a file that would be detected as fasta.
    if fasta:
        fastx_type = "fasta"
    elif fastq:
        fastx_type = "fastq"
    else:
        fastx_type = get_fastx_type(fastx_filename)

    if fastx_type == "fasta":
        # It's a FASTA file
        return fasta_utils.read_fasta(fastx_filename)
    if fastx_type == "fastq":
        # It's a FASTQ file
        # NOTE(review): read_fastq receives the path itself here, not
        # an opened handle -- confirm that is what it expects.
        return fastq_utils.read_fastq(fastx_filename)
    # Unrecognized type: nothing to read
    return []
Beispiel #4
0
 def get_seq_cycle_profile(self, fastq_filename,
                           first_n_seqs=None):
     """
     Compute the average 'N' bases (unable to sequence)
     as a function of the position of the read.

     Parameters:
       - fastq_filename: path of the FASTQ file to profile.
       - first_n_seqs: if not None, only this many leading entries
         are examined.

     Returns a list indexed by 0-based read position. Each element is
     the fraction (0.0 to 1.0) of the examined reads covering that
     position whose base there is "N". The list spans every position
     of the longest read seen and is empty if no reads were examined.
     """
     # Mapping from position in read to number of Ns
     num_n_bases = defaultdict(int)
     # Mapping from position in read to total number of
     # reads in that position
     num_reads = defaultdict(int)
     num_entries = 0
     fastq_file = fastq_utils.read_open_fastq(fastq_filename)
     try:
         fastq_entries = fastq_utils.read_fastq(fastq_file)
         print("Computing sequence cycle profile for: %s" %(fastq_filename))
         if first_n_seqs is not None:
             print("Looking at first %d sequences only" %(first_n_seqs))
         for entry in fastq_entries:
             # Stop at requested number of entries if asked to
             if first_n_seqs is not None and num_entries >= first_n_seqs:
                 break
             header1, seq, header2, qual = entry
             for pos, base in enumerate(seq):
                 if base == "N":
                     # Record occurrences of N
                     num_n_bases[pos] += 1
                 num_reads[pos] += 1
             num_entries += 1
     finally:
         # The input handle was previously never closed
         fastq_file.close()
     # Compute fraction of N along each position. Every read touches
     # positions 0..len-1, so the keys of num_reads are contiguous and
     # len(num_reads) covers all of them, including the last position
     # (range(max(keys)) skipped it and failed on empty input).
     percent_n = []
     for base_pos in range(len(num_reads)):
         curr_percent_n = float(num_n_bases[base_pos]) / num_reads[base_pos]
         percent_n.append(curr_percent_n)
     return percent_n