Example #1
def combine_quiver_results(split_dirs, output_dir, hq_filename, lq_filename, tofu_prefix=''):
    """
    For each size bin result, ex: clusterOut/0to2k/all.quiveredXXX.fastq
    combine it together, remember to add a prefix (ex: i0|c12, i1|c13/....)
    """
    prefix_dict_hq  = {}
    prefix_dict_lq = {}
    fout_hq = FastqWriter(os.path.join(output_dir, 'all_sizes.quivered_hq.fastq'))
    fout_lq = FastqWriter(os.path.join(output_dir, 'all_sizes.quivered_lq.fastq'))
    for i,d in enumerate(split_dirs):
        file_hq = os.path.join(d, hq_filename) #'all_quivered_hq.100_30_0.99.fastq')
        file_lq = os.path.join(d, lq_filename) #'all_quivered_lq.fastq')
        print >> sys.stderr, "Adding prefix i{0}| to {1},{2}...".format(i, file_hq, file_lq)
        prefix_dict_hq["i{i}HQ_{p}".format(i=i,p=tofu_prefix)] = os.path.abspath(d)
        prefix_dict_lq["i{i}LQ_{p}".format(i=i,p=tofu_prefix)] = os.path.abspath(d)
        for r in FastqReader(file_hq):
            _name_ = "i{i}HQ_{p}|{n}".format(p=tofu_prefix, i=i, n=r.name)
            fout_hq.writeRecord(_name_, r.sequence, r.quality)
        for r in FastqReader(file_lq):
            _name_ = "i{i}LQ_{p}|{n}".format(p=tofu_prefix, i=i, n=r.name)
            fout_lq.writeRecord(_name_, r.sequence, r.quality)
    fout_hq.close()
    fout_lq.close()
    print >> sys.stderr, "HQ quivered output combined to:", fout_hq.file.name
    print >> sys.stderr, "LQ quivered output combined to:", fout_lq.file.name
    return fout_hq.file.name, fout_lq.file.name, prefix_dict_hq, prefix_dict_lq
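A minimal usage sketch (the paths and the '54b0ca' run prefix are illustrative, taken from the comments in these snippets, not from the original caller):

split_dirs = ['clusterOut/0to2k', 'clusterOut/2to4k', 'clusterOut/4to6k']
hq, lq, prefix_hq, prefix_lq = combine_quiver_results(
    split_dirs, output_dir='clusterOut',
    hq_filename='all_quivered_hq.100_30_0.99.fastq',
    lq_filename='all_quivered_lq.fastq',
    tofu_prefix='54b0ca')
# prefix_hq now maps e.g. 'i0HQ_54b0ca' -> absolute path of clusterOut/0to2k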
Example #2
def trim_fastq(fastq_file, output_file, window=WINDOW):
    with FastqWriter(output_file) as writer:
        for record in FastqReader(fastq_file):
            start = _find_start(record, window)
            end = _find_end(record, window)
            trimmed_record = _trim_fastq(record, start, end)
            writer.writeRecord(trimmed_record)
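The _find_start, _find_end, and _trim_fastq helpers are not shown in this snippet. A minimal sketch of one plausible implementation, assuming a windowed-mean QV rule with a hypothetical min_qv cutoff (not the original logic):

def _find_start(record, window, min_qv=20):
    # Hypothetical: first index whose QV window clears the cutoff
    quals = record.quality
    for i in range(len(quals) - window + 1):
        if sum(quals[i:i + window]) / float(window) >= min_qv:
            return i
    return 0

def _find_end(record, window, min_qv=20):
    # Hypothetical: one past the last index whose QV window clears the cutoff
    quals = record.quality
    for i in range(len(quals), window - 1, -1):
        if sum(quals[i - window:i]) / float(window) >= min_qv:
            return i
    return len(quals)

def _trim_fastq(record, start, end):
    # Hypothetical: slice sequence and QVs to [start, end)
    return FastqRecord(record.name, record.sequence[start:end],
                       record.quality[start:end])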
Example #3
    def __init__(self,
                 transfrag_filename,
                 fsm_maps,
                 cov_threshold=2,
                 min_aln_coverage=.99,
                 min_aln_identity=.85,
                 is_fq=False):
        self.contiVec = None  # current ContiVec object
        self.exons = None
        #self.MIN_EXON_SIZE = max_fuzzy_junction

        self.transfrag_filename = transfrag_filename
        if is_fq:
            self.transfrag_len_dict = dict(
                (r.name.split()[0], len(r.sequence))
                for r in FastqReader(transfrag_filename))
        else:
            self.transfrag_len_dict = dict(
                (r.name.split()[0], len(r.sequence))
                for r in FastaReader(transfrag_filename))

        self.fsm_maps = fsm_maps

        self.cov_threshold = cov_threshold  # only output GTF records if >= this many GMAP records support it (this matters when running non-clustered fasta through GMAP)

        self.min_aln_coverage = min_aln_coverage
        self.min_aln_identity = min_aln_identity

        self.cuff_index = 1
Example #4
    def precache_fastq(self, fastq_filename):
        """
        Cache each sequence in the FASTQ file into self.qv
        """
        for r in FastqReader(fastq_filename):
            seqid = r.name.split()[0]
            self.qv[seqid] = {}
            c_basQV.fastq_precache_helper(seqid, r.quality, self.qv)
Example #5
    def parseSequenceData(self):
        self.sequenceData = {}
        self.qualityData = {}
        for record in FastqReader(self.ccsFile):
            zmw = get_zmw(record.name)
            new_record = FastqRecord(zmw, record.sequence, record.quality)
            self.sequenceData[zmw] = new_record
            self.qualityData[zmw] = meanPQv(new_record)
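meanPQv is defined elsewhere; a plausible sketch (my assumption, not the original code) converts each Phred QV to an error probability, averages, and converts back to a QV:

import math

def meanPQv(record):
    # Hypothetical: mean predicted QV via averaged error probability,
    # i.e. QV = -10 * log10(mean(10^(-qv/10)))
    errs = [10.0 ** (-qv / 10.0) for qv in record.quality]
    return -10.0 * math.log10(sum(errs) / len(errs))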
Example #6
def _parse_exon_records(exon_file, output_type):
    if output_type == 'fasta':
        return list(FastaReader(exon_file))
    elif output_type == 'fastq':
        return list(FastqReader(exon_file))
    msg = 'Exon data must be in either Fasta or Fastq format'
    log.error(msg)
    raise TypeError(msg)
Example #7
def rename_resequencing(input_fastq, output_fastq):
    """
    Rename resequenced Fastq to have an AA-style NumReads tag
    """
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            new_name = get_new_name(record.name)
            new_record = FastqRecord(new_name, record.sequence, record.quality)
            writer.writeRecord(new_record)
Example #8
    def parseFastqData(self):
        self.sequenceData = {}
        log.info('Reading QV data from "%s"...' % self.fastq)
        counter = 0
        for record in FastqReader(self.fastq):
            zmw = self.getZmw(record)
            self.sequenceData[zmw] = record
            counter += 1
        log.info('A total of %s Fastq records were read into memory' % counter)
Example #9
def read_names(sequence_file):
    # Open the sequence file with the appropriate reader
    if is_fasta(sequence_file):
        reader = FastaReader(sequence_file)
    elif is_fastq(sequence_file):
        reader = FastqReader(sequence_file)
    else:
        raise ValueError('Sequence file must be either Fasta or Fastq')

    # Extract and return the sequence names
    return [r.name.strip().split()[0] for r in reader]
Example #10
def make_current_fastq(icec_obj, flnc_filename, root_dir):
    """
    current fasta will consists of all ids

    however --- if this was a already finished run and we are adding more input,
        then newids is empty, in this case we set newids = everything that
        has no affiliation or more than one affiliated cluster in d
    """
    with FastqWriter(os.path.join(root_dir, 'current.fastq')) as f:
        for r in FastqReader(flnc_filename):
            f.writeRecord(r)
Example #11
def read_sequences(sequence_file):
    """
    Parse a list of records from either a Fasta or Fastq file
    """
    if is_fasta(sequence_file):
        return list(FastaReader(sequence_file))
    elif is_fastq(sequence_file):
        return list(FastqReader(sequence_file))
    else:
        msg = 'Sequence file must be either Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
Example #12
def combine_fastq(sequence_files, output_file):
    """
    Combine a series of sequence files into one Fastq
    """
    with FastqWriter(output_file) as handle:
        for filename in sequence_files:
            try:
                for record in FastqReader(filename):
                    handle.writeRecord(record)
            except Exception:
                log.warn('Could not open "%s" as Fastq' % filename)
    check_output_file(output_file)
    return output_file
Example #13
def _parse_input_records( input_file ):
    """
    Parse the input sequence records with the appropriate pbcore Reader
    """
    input_type = get_file_type( input_file )
    if input_type == 'fasta':
        return list( FastaReader( input_file ))
    elif input_type == 'fastq':
        return list( FastqReader( input_file ))
    else:
        msg = 'Input file must be either Fasta or Fastq'
        log.error( msg )
        raise TypeError( msg )
Example #14
    def separate_sequences(self):
        # Open the appropriate sequence reader
        if self.filetype == 'fasta':
            reader = FastaReader(self.input_file)
        elif self.filetype == 'fastq':
            reader = FastqReader(self.input_file)
        # Iterate through records, writing out those that belong to a known group
        for record in reader:
            try:
                group = self.groups[record.name]
            except KeyError:
                continue
            self.write_record(record, group)
Example #15
def check_ids_unique(fa_or_fq_filename, is_fq=False):
    """
    Confirm that a FASTA/FASTQ file has all unique IDs
    (probably used by the collapse or fusion finding scripts)
    """
    if is_fq:
        reader = FastqReader(fa_or_fq_filename)
    else:
        reader = FastaReader(fa_or_fq_filename)
    seen = set()
    for r in reader:
        if r.id in seen:
            raise Exception("Duplicate id {0} detected. Abort!".format(r.id))
        seen.add(r.id)
Example #16
def combine_amp_analysis( input_dir, output_file ):
    """
    Combine all AmpAnalysis results into a single Fastq file
    """
    log.info("Combining AmpliconAnalysis outputs")
    record_counter = 0
    file_counter = 0
    with FastqWriter( output_file ) as writer:
        for result in find_amp_analysis_results(input_dir):
            file_counter += 1
            for record in FastqReader( result ):
                record_counter += 1
                writer.writeRecord( record )
    log.info("Found {0} consensus sequences in {1} outputs".format(record_counter,
                                                                   file_counter))
    return output_file
Example #17
def combine_fastq( input_files, output_file ):
    """
    Combine sequences from multiple Fastq files into one
    """
    log.info("Combining multiple Fastq outputs")
    record_counter = 0
    file_counter = 0
    with FastqWriter( output_file ) as writer:
        for filename in input_files:
            file_counter += 1
            for record in FastqReader( filename ):
                record_counter += 1
                writer.writeRecord( record )
    log.info("Found {0} consensus sequences in {1} outputs".format(record_counter,
                                                                   file_counter))
    return output_file
Example #18
def quality_filter(input_fastq, output_fastq, min_accuracy=ACCURACY):
    """
    Filter out sequences below a threshold of predicted accuracy
    """
    log.info("Filtering sequences below {0}% predicted accuracy".format(
        100 * min_accuracy))
    seq_count = 0
    pass_count = 0
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            if predicted_accuracy(record) >= min_accuracy:
                pass_count += 1
                writer.writeRecord(record)
    percentage = round(100.0 * pass_count / seq_count, 4)
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(
        pass_count, seq_count, percentage))
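predicted_accuracy is not shown above; one common definition (assumed here, not confirmed by the snippet) is the mean per-base accuracy implied by the record's Phred QVs:

def predicted_accuracy(record):
    # Hypothetical: 1 - mean per-base error probability, in [0, 1]
    errs = [10.0 ** (-qv / 10.0) for qv in record.quality]
    return 1.0 - sum(errs) / len(errs)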
Example #19
def filter_fastq(input_fastq,
                 output_fastq,
                 min_length=None,
                 min_num_reads=None):
    """
    Filter a Fastq file based on various criteria
    """
    kept = 0
    total = 0
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            total += 1
            if min_length and len(record.sequence) < min_length:
                continue
            if min_num_reads and get_num_reads(record) < min_num_reads:
                continue
            kept += 1
            writer.writeRecord(record)
    log.info("Kept %s of %s consensus sequences" % (kept, total))
Example #20
    def convert_to_dazz_fasta(self):
        """
        Convert input fasta/fastq file to daligner-compatible fasta with ids:
        <prefix>/<index>/0_<seqlen>

        Also write out mappings to pickle
        """
        i = 1
        reader = FastaReader(self.input_filename) if self.filetype == 'fasta' else \
            FastqReader(self.input_filename)

        f = FastaWriter(self.dazz_filename)

        for r in reader:
            f.writeRecord("{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=i, len=len(r.sequence)), r.sequence)
            self.dazz_mapping[i] = r.id
            i += 1

        f.close()

        with open(self.dazz_filename + '.pickle', 'w') as f:
            dump(self.dazz_mapping, f)
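The mapping pickled above can be loaded later to translate daligner read indices back to original sequence ids. A minimal sketch, with dazz_filename standing in for self.dazz_filename:

from pickle import load

with open(dazz_filename + '.pickle') as f:
    dazz_mapping = load(f)
original_id = dazz_mapping[1]  # id of the first record written above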
Example #21
def snr_filter(input_fastq, raw_data_file, output_fastq, min_snr=SNR):
    """
    Filter out sequences below a threshold of predicted accuracy
    """
    log.info(
        "Filtering sequences below {0} Signal-To-Noise Ratio".format(min_snr))
    seq_count = 0
    pass_count = 0
    raw_data = BasH5Collection(raw_data_file)
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            zmw_name = '/'.join(record.name.strip().split('/')[:2])
            zmw = raw_data[zmw_name]
            zmw_snr = min(zmw.zmwMetric("HQRegionSNR"))
            print zmw_name, zmw_snr  # debug output
            if zmw_snr >= min_snr:
                pass_count += 1
                writer.writeRecord(record)
    percentage = round(100.0 * pass_count / seq_count)
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(
        pass_count, seq_count, percentage))
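For reference, the name surgery above assumes PacBio read names of the form movie/holeNumber/start_end; keeping the first two '/'-separated fields yields the ZMW name used to index the BasH5Collection (the movie name below is made up):

name = 'm140101_010101_42175_c100000000000000000001_s1_p0/12345/0_1000'
zmw_name = '/'.join(name.strip().split('/')[:2])
# zmw_name == 'm140101_010101_42175_c100000000000000000001_s1_p0/12345'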
Example #22
#!/usr/bin/env python

from pbcore.io.FastqIO import FastqReader, FastqWriter, FastqRecord
import shlex
import sys
import subprocess
import os
import re

usage = "usage: circulization.py initial_contigs.fastq 20000 /tmp circularized_contigs.fastq"

try:
    fastq_f = FastqReader(sys.argv[1])
    prepostfix_size = int(sys.argv[2])
    tmp_dir = sys.argv[3]
    output_fn = sys.argv[4]
except:
    print usage
    sys.exit(1)

prefix_N = re.compile("^[Nn]+")
postfix_N = re.compile("[Nn]+$")

prefix_fn = os.path.join(tmp_dir, "prefix.fa")
postfix_fn = os.path.join(tmp_dir, "postfix.fa")

with FastqWriter(open(output_fn, "w")) as output_fh:

    for r in fastq_f:
        r_id = r.name
        r_seq = r.sequence
Example #23
    def __init__(self, input_fastq, output_fastq, min_accuracy=MIN_ACCURACY):
        self.input_reader = FastqReader(input_fastq)
        self.output_writer = FastqWriter(output_fastq)
        self.min_accuracy = min_accuracy
Example #24
    def open_reader(self):
        if self.filetype == 'fasta':
            self.reader = FastaReader(self.input_file)
        elif self.filetype == 'fastq':
            self.reader = FastqReader(self.input_file)
Example #25
def filter_by_count(input_prefix, output_prefix, min_count):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    # read group
    group_max_count_fl = {}
    group_max_count_p = {}
    f = open(group_filename)
    for line in f:
        #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
        pbid, members = line.strip().split('\t')
        group_max_count_fl[pbid] = 0
        group_max_count_p[pbid] = 0
        members = members.split(',')
        for m in members:
            tmp = m.split('|')[1].split('/')[1]  #ex: tmp = f30p16
            fl_count, p_count = tmp.split('p')
            fl_count = int(fl_count[1:])
            p_count = int(p_count)
            group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
            group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.iteritems():
        print k, v  # debug output
    f.close()

    # group_max_count_p NOT used for now
    good = filter(
        lambda x: int(d[x]['count_fl']) >= min_count and
        group_max_count_fl[x] >= min_count and
        group_max_count_p[x] >= 0, d)

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = FastqWriter(output_prefix + '.rep.fq')
    for r in FastqReader(rep_filename):
        if r.name.split('|')[0] in good:
            f.writeRecord(r)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()
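A worked example of the member-name parsing in the group-reading loop above, using the format from its own comment (fXXpYY is read here as full-length and partial counts, which matches how the fields are used):

m = 'i0HQ_54b0ca|c58773/f30p16/700'  # format from the comment above
tmp = m.split('|')[1].split('/')[1]  # 'f30p16'
fl_count, p_count = tmp.split('p')   # ('f30', '16')
fl_count = int(fl_count[1:])         # 30
p_count = int(p_count)               # 16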
Example #26
def _get_stats(fastq_file_name):
    raw_qvs = np.array([r.quality for r in FastqReader(fastq_file_name)])
    qvs = np.hstack(raw_qvs)
    reads = np.array([len(r.sequence) for r in FastqReader(fastq_file_name)])
    return qvs, reads
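A minimal usage sketch (hypothetical file name; assumes the enclosing module imports numpy as np):

qvs, read_lengths = _get_stats('all_sizes.quivered_hq.fastq')
print 'reads: %d  mean QV: %.2f  mean length: %.1f' % \
    (len(read_lengths), qvs.mean(), read_lengths.mean())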
Example #27
    def add_seqs_from_fastq(self, fastq_filename, smooth=True):
        """Add sequence ids from a fastq file."""
        self.qver.precache_fastq(fastq_filename)
        newids = [r.name.split()[0] for r in FastqReader(fastq_filename)]
        self.qver.presmooth(newids, self.window_size)
Example #28
    def parseFastqData(self):
        self.fastqData = []
        log.info('Reading Fastq data into memory from %s...' % self.fastq)
        for fastqRecord in FastqReader(self.fastq):
            self.fastqData.append(fastqRecord)
Example #29
def _parse_fastq(filename):
    return set(rec.name.split()[0] for rec in FastqReader(filename))
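Since the result is a set, comparing the ids in two files is a one-liner (file names illustrative):

missing = _parse_fastq('all.fastq') - _parse_fastq('filtered.fastq')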