Python parseFastq Exemples, hts_waterworks.utils.common.parseFastq Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : preprocessing.py Projet : hjanime/HTS-waterworks

def read_length_histogram(in_fastq, out_hist):
    """Draw a bar-chart histogram of the read lengths in a gzipped FASTQ.

    in_fastq -- path handed to gzip.open; its records are read with
                parseFastq, which yields (header, seq, qual) tuples.
    out_hist -- destination handed to pyplot.savefig.

    Lengths from 1 up to the longest observed read are plotted; lengths
    with no reads get a zero-height bar.  An input without any reads
    produces an empty figure instead of raising.
    """
    length_count = defaultdict(int)
    # Use a context manager so the gzip handle is closed even when parsing
    # fails part-way through the file.
    with gzip.open(in_fastq) as infile:
        for header, seq, qual in parseFastq(infile):
            length_count[len(seq)] += 1
    # max() raises ValueError on an empty mapping, so guard the no-reads case.
    longest = max(length_count) if length_count else 0
    lengths = range(1, longest + 1)
    pyplot.figure()
    # .get() avoids inserting a key for every length that was never seen.
    pyplot.bar(lengths,
               [length_count.get(l, 0) for l in lengths],
               align='center')
    pyplot.savefig(out_hist)

Exemple #2

0

Afficher le fichier

Fichier : preprocessing.py Projet : jakebiesinger/HTS-waterworks

def read_length_histogram(in_fastq, out_hist):
    """Draw a bar-chart histogram of the read lengths in a gzipped FASTQ.

    in_fastq -- path handed to gzip.open; its records are read with
                parseFastq, which yields (header, seq, qual) tuples.
    out_hist -- destination handed to pyplot.savefig.

    Lengths from 1 up to the longest observed read are plotted; lengths
    with no reads get a zero-height bar.  An input without any reads
    produces an empty figure instead of raising.
    """
    length_count = defaultdict(int)
    # Close the gzip handle deterministically, including on parse errors.
    with gzip.open(in_fastq) as infile:
        for header, seq, qual in parseFastq(infile):
            length_count[len(seq)] += 1
    # An empty mapping would make max() raise ValueError; plot nothing then.
    if length_count:
        longest = max(length_count)
    else:
        longest = 0
    lengths = range(1, longest + 1)
    # Look counts up with .get() so unseen lengths are not added as keys.
    heights = [length_count.get(l, 0) for l in lengths]
    pyplot.figure()
    pyplot.bar(lengths, heights, align='center')
    pyplot.savefig(out_hist)

Exemple #3

0

Afficher le fichier

Fichier : preprocessing.py Projet : hjanime/HTS-waterworks

def trim_regex(in_fastq, out_fastq, trim_pattern):
    """Trim each read at the right-most match of a regex and write the result.

    Reads come from the gzipped file in_fastq (via parseFastq) and the kept
    reads are written to the gzipped file out_fastq.  For a read whose
    sequence matches trim_pattern, everything from the start of the
    right-most match onwards is cut from both sequence and quality, and the
    removed sequence is prepended to the read ID followed by '_'.

    Reads without a match are dropped when the 'filtering' / 'require_regex'
    config option is true.  Any read shorter than 10 after trimming is
    dropped as well.
    """
    regex = re.compile(trim_pattern)
    with gzip.open(in_fastq) as infile, gzip.open(out_fastq, 'w') as outfile:
        for header, seq, qual in parseFastq(infile):
            hit_starts = [hit.start() for hit in regex.finditer(seq)]
            if hit_starts:
                # cut at the right-most hit; keep the removed tail in the ID
                cut = hit_starts[-1]
                header = seq[cut:] + '_' + header
                seq, qual = seq[:cut], qual[:cut]
            elif cfg.getboolean('filtering', 'require_regex'):
                # no hit, and hits are mandatory -- skip this read
                continue
            if len(seq) < 10:  # TODO: add adjustable min length
                continue
            record = (header, seq, header, qual)
            outfile.write('@%s\n%s\n+%s\n%s\n' % record)

Exemple #4

0

Afficher le fichier

Fichier : preprocessing.py Projet : jakebiesinger/HTS-waterworks

def trim_regex(in_fastq, out_fastq, trim_pattern):
    """Cut reads at the last regex hit and write the survivors to out_fastq.

    in_fastq and out_fastq are both opened with gzip; records are read with
    parseFastq as (header, seq, qual).  When trim_pattern matches a read's
    sequence, the sequence and quality are truncated at the start of the
    right-most match and the removed sequence plus '_' is put in front of
    the read ID.

    A read with no match is only kept if the 'filtering' / 'require_regex'
    config option is false.  Reads shorter than 10 after trimming are never
    written.
    """
    compiled = re.compile(trim_pattern)
    with gzip.open(in_fastq) as infile, gzip.open(out_fastq, 'w') as outfile:
        for header, seq, qual in parseFastq(infile):
            spans = [found.span() for found in compiled.finditer(seq)]
            matched = len(spans) > 0
            if matched:
                # right-most hit decides where the read is cut; the trimmed
                # sequence is preserved as a prefix of the read ID
                trim_at = spans[-1][0]
                header = seq[trim_at:] + '_' + header
                seq = seq[:trim_at]
                qual = qual[:trim_at]
            # the config is only consulted for reads that had no hit
            if not matched and cfg.getboolean('filtering', 'require_regex'):
                continue
            if len(seq) >= 10:  # TODO: add adjustable min length
                fields = (header, seq, header, qual)
                outfile.write('@%s\n%s\n+%s\n%s\n' % fields)