Example #1
def combine_quiver_results(split_dirs, output_dir, hq_filename, lq_filename, tofu_prefix=''):
    """
    For each size bin result, ex: clusterOut/0to2k/all.quiveredXXX.fastq
    combine it together, remember to add a prefix (ex: i0|c12, i1|c13/....)
    """
    prefix_dict_hq  = {}
    prefix_dict_lq = {}
    fout_hq = FastqWriter(os.path.join(output_dir, 'all_sizes.quivered_hq.fastq'))
    fout_lq = FastqWriter(os.path.join(output_dir, 'all_sizes.quivered_lq.fastq'))
    for i,d in enumerate(split_dirs):
        file_hq = os.path.join(d, hq_filename) #'all_quivered_hq.100_30_0.99.fastq')
        file_lq = os.path.join(d, lq_filename) #'all_quivered_lq.fastq')
        print >> sys.stderr, "Adding prefix i{0}| to {1},{2}...".format(i, file_hq, file_lq)
        prefix_dict_hq["i{i}HQ_{p}".format(i=i,p=tofu_prefix)] = os.path.abspath(d)
        prefix_dict_lq["i{i}LQ_{p}".format(i=i,p=tofu_prefix)] = os.path.abspath(d)
        for r in FastqReader(file_hq):
            _name_ = "i{i}HQ_{p}|{n}".format(p=tofu_prefix, i=i, n=r.name)
            fout_hq.writeRecord(_name_, r.sequence, r.quality)
        for r in FastqReader(file_lq):
            _name_ = "i{i}LQ_{p}|{n}".format(p=tofu_prefix, i=i, n=r.name)
            fout_lq.writeRecord(_name_, r.sequence, r.quality)
    fout_hq.close()
    fout_lq.close()
    print >> sys.stderr, "HQ quivered output combined to:", fout_hq.file.name
    print >> sys.stderr, "LQ quivered output combined to:", fout_lq.file.name
    return fout_hq.file.name, fout_lq.file.name, prefix_dict_hq, prefix_dict_lq
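All of the examples on this page lean on the same small pbcore.io API: FastqReader iterates FastqRecord objects (name, sequence, quality), and FastqWriter.writeRecord accepts either a whole record or the three fields separately, as Example #1 does above. A minimal round-trip sketch of that shared pattern (assuming pbcore is installed; the function name is ours):

from pbcore.io import FastqReader, FastqWriter

def copy_fastq(input_fastq, output_fastq):
    # FastqWriter also works as a context manager, closing the file on exit
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            # equivalent to writer.writeRecord(record.name, record.sequence, record.quality)
            writer.writeRecord(record)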
Example #3
 def open_writer(self):
     if self.filetype == 'fasta':
         output_file = '%s.trim.fasta' % self.prefix
         self.writer = FastaWriter(output_file)
     elif self.filetype == 'fastq':
         output_file = '%s.trim.fastq' % self.prefix
         self.writer = FastqWriter(output_file)
Example #4
def pick_rep(fa_fq_filename,
             gff_filename,
             group_filename,
             output_filename,
             is_fq=False,
             pick_least_err_instead=True):
    """
    For each group, select the representative record

    If is FASTA file (is_fa False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then 
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = FastqWriter(output_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = FastaWriter(output_filename)

    coords = {}
    for line in open(gff_filename):
        # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
        raw = line.strip().split('\t')
        if raw[2] == 'transcript':
            tid = raw[-1].split('; ')[1].split()[1][1:-2]
            coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4],
                                                    raw[6])

    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print >> sys.stderr, "Picking representative sequence for", pb_id
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members.split(','):
            if is_fq and pick_least_err_instead:
                # expected base errors: per-base error probability is 10^(-QV/10)
                err = sum(10 ** -(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or (
                (not is_fq or not pick_least_err_instead)
                    and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if is_fq:
            fout.writeRecord(_id_, _seq_, best_qual)
        else:
            fout.writeRecord(_id_, _seq_)
    fout.close()
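The pick_least_err_instead branch above ranks candidates by expected base errors: a base with Phred quality value q has error probability 10**(-q/10), so the expected number of errors in a read is the sum of those probabilities. A standalone sketch of the computation (the function name is ours):

def expected_base_errors(quality_values):
    # Phred QV q => per-base error probability 10^(-q/10);
    # summing over the read gives the expected number of erroneous bases
    return sum(10 ** (-q / 10.0) for q in quality_values)

# ex: two Q20 bases (1% error each) -> 0.02 expected errors
assert abs(expected_base_errors([20, 20]) - 0.02) < 1e-9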
Example #5
class QualityFilter(object):
    """
    A tool for filtering out low-quality fastq files
    """
    def __init__(self, input_fastq, output_fastq, min_accuracy=MIN_ACCURACY):
        self.input_reader = FastqReader(input_fastq)
        self.output_writer = FastqWriter(output_fastq)
        self.min_accuracy = min_accuracy

    def __call__(self):
        for fastq in self.input_reader:
            if predicted_accuracy(fastq) >= self.min_accuracy:
                self.output_writer.writeRecord(fastq)
Example #8
class BarcodeTrimmer(object):
    def __init__(self, input_file, barcode_file, prefix=None, filetype=None):
        self.input_file = input_file
        self.barcode_file = barcode_file
        self.prefix = prefix or get_prefix(input_file)
        self.filetype = filetype or get_filetype(input_file)
        self.positions = {}

    def run(self):
        self.parse_barcode_data()
        self.open_reader()
        self.open_writer()
        self.trim_sequences()

    def parse_barcode_data(self):
        with open(self.barcode_file) as handle:
            # 'barcode' is a namedtuple with fields id/end5/end3, defined elsewhere in the module
            for entry in map(barcode._make,
                             csv.reader(handle, delimiter='\t')):
                if entry.id == 'ID':
                    continue
                start = None if entry.end5 == 'NA' else int(entry.end5)
                end = None if entry.end3 == 'NA' else int(entry.end3)
                self.positions[entry.id] = (start, end)

    def open_reader(self):
        if self.filetype == 'fasta':
            self.reader = FastaReader(self.input_file)
        elif self.filetype == 'fastq':
            self.reader = FastqReader(self.input_file)

    def open_writer(self):
        if self.filetype == 'fasta':
            output_file = '%s.trim.fasta' % self.prefix
            self.writer = FastaWriter(output_file)
        elif self.filetype == 'fastq':
            output_file = '%s.trim.fastq' % self.prefix
            self.writer = FastqWriter(output_file)

    def trim_sequences(self):
        for record in self.reader:
            try:
                start, end = self.positions[record.name]
            except KeyError:
                msg = 'Unknown sequence record "%s"!' % record.name
                log.error(msg)
                raise ValueError(msg)
            trimmed_record = trim_record(record, start, end)
            self.writer.writeRecord(trimmed_record)
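The trim_record helper is not shown in this listing. A minimal sketch of what it presumably does -- slicing the record down to the barcode-free interior -- where the implementation details are assumptions, not the original code:

from pbcore.io import FastaRecord, FastqRecord

def trim_record(record, start, end):
    # a None position means no barcode was found on that end, so keep that side
    start = 0 if start is None else start
    end = len(record.sequence) if end is None else end
    if isinstance(record, FastqRecord):
        return FastqRecord(record.name, record.sequence[start:end],
                           record.quality[start:end])
    return FastaRecord(record.name, record.sequence[start:end])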
Example #9
def pick_rep(fa_fq_filename, gff_filename, group_filename, output_filename, is_fq=False, pick_least_err_instead=False, bad_gff_filename=None):
    """
    For each group, select the representative record

    If is FASTA file (is_fa False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then 
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = FastqWriter(output_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = FastaWriter(output_filename)

    coords = {}
    for line in open(gff_filename):
        # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PB.1"; transcript_id "PB.1.1";
        raw = line.strip().split('\t')
        if raw[2] == 'transcript': 
            tid = raw[-1].split('; ')[1].split()[1][1:-2]
            coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

    if bad_gff_filename is not None:
        for line in open(bad_gff_filename):
            raw = line.strip().split('\t')
            if raw[2] == 'transcript':
                tid = raw[-1].split('; ')[1].split()[1][1:-2]
                coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print >> sys.stderr, "Picking representative sequence for", pb_id
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members.split(','):
            if is_fq and pick_least_err_instead:
                # expected base errors: per-base error probability is 10^(-QV/10)
                err = sum(10 ** -(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if is_fq:
            fout.writeRecord(_id_, _seq_, best_qual)
        else:
            fout.writeRecord(_id_, _seq_)
    fout.close()
Example #10
def trim_fastq(fastq_file, output_file, window=WINDOW):
    with FastqWriter(output_file) as writer:
        for record in FastqReader(fastq_file):
            start = _find_start(record, window)
            end = _find_end(record, window)
            trimmed_record = _trim_fastq(record, start, end)
            writer.writeRecord(trimmed_record)
Example #11
def writerProcess(outDir):
    # makes output directories
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    fastOutDir = os.path.join(outDir, "Demultiplexed/")
    if not os.path.exists(fastOutDir):
        os.makedirs(fastOutDir)

    # opens files
    csvOut = open(os.path.join(outDir, "Report.csv"), "w")
    csvOut.write("Name,Barcode,NumPasses,Coverage,AvgConfidence,MinConfidence,TrimFail,MappingFail\n")
    writers = {}
    # totalNumber, resultQueue, and args are defined elsewhere in the original script
    for writecount in range(totalNumber):
        result = resultQueue.get()
        csvOut.write("%s,%s,%d,%d,%0.6f,%0.6f,%s,%s\n" % (
            result.name, result.barcode, result.numPasses, result.coverage, result.predictedAccuracy,
            result.minConfidence, result.trimFail, result.mappingFail))
        if result.barcode not in writers:
            if args.fastq:
                writers[result.barcode] = FastqWriter(os.path.join(fastOutDir, result.barcode + ".fastq"))
            else:
                writers[result.barcode] = FastaWriter(os.path.join(fastOutDir, result.barcode + ".fasta"))
        if not any((result.minNumPassesFail, result.mappingFail, result.trimFail, result.minCoverageFail,
                    result.minAvgConfidenceFail, result.minConfidenceFail)):
            if args.fastq:
                writers[result.barcode].writeRecord(result.name, result.seq, result.qual)
            else:
                writers[result.barcode].writeRecord(result.name, result.seq)
Example #12
def main(parser):

    args = parser.parse_args()

    bam    = BamReader(args.ccsBAM)
    bcFofn = BarcodeH5Fofn(args.barcodeFofn)

    oFiles = {bc: FastqWriter('{dir}/{bc}.fastq'.format(dir=args.outDir, bc=bc)) for bc in bcFofn.barcodeLabels}
    for rec in bam:
        try:
            lZmw = bcFofn.labeledZmwFromName(rec.readName)
        except KeyError:
            #catch zmws with no barcode and skip
            continue
        if       rec.readScore     >= args.minPredictedAccuracy \
             and lZmw.averageScore >= args.minAvgBarcodeScore \
             and rec.numPasses     >= args.minNumPasses:
            header = rec.readName
            if args.extendedHeader:
                header +=  ' predictedAccuracy={predAcc} numPasses={numPasses} barcodeScore={bcScore}'\
                           .format(predAcc=rec.readScore, numPasses=rec.numPasses, bcScore=lZmw.averageScore)
            qual = [ ord(q)-33  for q in rec.peer.qual ]
            writer = oFiles[bcFofn.barcodeLabels[lZmw.bestIdx]]
            writer.writeRecord(header, rec.read(aligned=False), qual)
    
    for f in oFiles.values():
        f.close()
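The ord(q) - 33 expression in Example #12 decodes a Sanger/Phred+33 ASCII quality string (the encoding used in BAM and FASTQ) into the integer quality values that FastqWriter.writeRecord expects. For example:

def decode_phred33(qual_string):
    # ASCII '!' (33) encodes Q0, '5' (53) encodes Q20, 'I' (73) encodes Q40
    return [ord(ch) - 33 for ch in qual_string]

assert decode_phred33('!5I') == [0, 20, 40]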
Example #13
 def add_writer(self, group):
     if self.filetype == 'fasta':
         output_file = '%s.g%s.fasta' % (self.prefix, group)
         self.writers[group] = FastaWriter(output_file)
     if self.filetype == 'fastq':
         output_file = '%s.g%s.fastq' % (self.prefix, group)
         self.writers[group] = FastqWriter(output_file)
Example #14
def _write_output(records, output_file, output_type):
    """Write the records out to file"""
    if output_type == 'fasta':
        write_fasta(records, output_file)
    else:
        with FastqWriter(output_file) as writer:
            for record in records:
                writer.writeRecord(record)
        check_output_file(output_file)
Example #15
def rename_resequencing(input_fastq, output_fastq):
    """
    Rename resequenced Fastq to have an AA-style NumReads tag
    """
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            new_name = get_new_name(record.name)
            new_record = FastqRecord(new_name, record.sequence, record.quality)
            writer.writeRecord(new_record)
Example #16
def write_fastq(records, output_file):
    """
    Write a FastqRecord, or a list of FastqRecords, out to file
    """
    with FastqWriter(output_file) as handle:
        for record in records:
            assert isinstance(record, FastqRecord)
            handle.writeRecord(record)
    check_output_file(output_file)
    return output_file
Example #17
def _open_output_handle(output_file, output_type):
    """
    Open an appropriate output handle to record the exon sequences
    """
    if output_type == 'fasta':
        return FastaWriter(output_file)
    elif output_type == 'fastq':
        return FastqWriter(output_file)
    msg = 'Output type must be Fasta or Fastq'
    log.error(msg)
    raise TypeError(msg)
Example #18
def make_current_fastq(icec_obj, flnc_filename, root_dir):
    """
    current fasta will consists of all ids

    however --- if this was a already finished run and we are adding more input,
        then newids is empty, in this case we set newids = everything that
        has no affiliation or more than one affiliated cluster in d
    """
    with FastqWriter(os.path.join(root_dir, 'current.fastq')) as f:
        for r in FastqReader(flnc_filename):
            f.writeRecord(r)
Example #19
def combine_fastq(sequence_files, output_file):
    """
    Combine a series of sequence files into one Fastq
    """
    with FastqWriter(output_file) as handle:
        for filename in sequence_files:
            try:
                for record in FastqReader(filename):
                    handle.writeRecord(record)
            except Exception:
                log.warn('Could not open "%s" as Fastq' % filename)
    check_output_file(output_file)
    return output_file
Example #20
def combine_fastq( input_files, output_file ):
    """
    Combine sequences from multiple Fastq files into one
    """
    log.info("Combining multiple Fastq outputs")
    record_counter = 0
    file_counter = 0
    with FastqWriter( output_file ) as writer:
        for filename in input_files:
            file_counter += 1
            for record in FastqReader( filename ):
                record_counter += 1
                writer.writeRecord( record )
    log.info("Found {0} consensus sequences in {1} outputs".format(record_counter,
                                                                   file_counter))
    return output_file
Example #21
def combine_amp_analysis( input_dir, output_file ):
    """
    Combine all AmpAnalysis results into a single Fastq file
    """
    log.info("Combining AmpliconAnalysis outputs")
    record_counter = 0
    file_counter = 0
    with FastqWriter( output_file ) as writer:
        for result in find_amp_analysis_results(input_dir):
            file_counter += 1
            for record in FastqReader( result ):
                record_counter += 1
                writer.writeRecord( record )
    log.info("Found {0} consensus sequences in {1} outputs".format(record_counter,
                                                                   file_counter))
    return output_file
Example #22
def quality_filter(input_fastq, output_fastq, min_accuracy=ACCURACY):
    """
    Filter out sequences below a threshold of predicted accuracy
    """
    log.info("Filtering sequences below {0}% predicted accuracy".format(
        100 * min_accuracy))
    seq_count = 0
    pass_count = 0
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            if predicted_accuracy(record) >= min_accuracy:
                pass_count += 1
                writer.writeRecord(record)
    percentage = round(100.0 * pass_count / seq_count, 4)
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(
        pass_count, seq_count, percentage))
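The predicted_accuracy helper is defined elsewhere in these tools. A plausible sketch (an assumption, not the original implementation) derives it from the mean expected per-base error of the record's quality values:

def predicted_accuracy(record):
    # mean per-base error probability from the Phred QVs;
    # e.g. an all-Q30 read gives 1 - 0.001 = 0.999
    errors = sum(10 ** (-q / 10.0) for q in record.quality)
    return 1.0 - errors / len(record.quality)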
Example #23
def filter_fastq(input_fastq,
                 output_fastq,
                 min_length=None,
                 min_num_reads=None):
    """
    Filter a Fastq file based on various criteria
    """
    kept = 0
    total = 0
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            total += 1
            if min_length and len(record.sequence) < min_length:
                continue
            if min_num_reads and get_num_reads(record) < min_num_reads:
                continue
            kept += 1
            writer.writeRecord(record)
    log.info("Kept %s of %s consensus sequences" % (kept, total))
Example #24
def extract_ccs_fastq(collection, output_file, min_length, min_snr):
    log.info('Extracting fastq CCS reads from input files')
    log.debug('    min_length: %s' % min_length)
    log.debug('    min_snr: %s' % min_snr)
    ccs_total = 0
    pass_total = 0
    with FastqWriter(output_file) as writer:
        for movie in collection.movieNames:
            log.info('Extracting fastq CCS reads from %s' %
                     os.path.basename(movie))
            ccs_count = 0
            pass_count = 0
            for well in collection[movie].sequencingZmws:
                zmw = collection[movie][well]

                # Skip non-CCS ZMWs
                if not zmw.ccsRead:
                    continue
                ccs_count += 1

                # Skip short and low-SNR sequences
                basecalls = zmw.ccsRead.basecalls()
                if len(basecalls) < min_length:
                    continue
                zmw_snr = min(zmw.zmwMetric("HQRegionSNR"))
                if zmw_snr < min_snr:
                    continue
                pass_count += 1

                # Finally write the CCS Fastq to file
                record = FastqRecord(zmw.ccsRead.readName, basecalls,
                                     zmw.ccsRead.QualityValue())
                writer.writeRecord(record)
            percentage = round(100.0 * pass_count / ccs_count)
            log.info(
                "Identified {0} CCS reads, of which {1} ({2}%) passed filter".
                format(ccs_count, pass_count, percentage))
            ccs_total += ccs_count
            pass_total += pass_count
    percentage = round(100.0 * pass_total / ccs_total)
    log.info(
        'Found a total of {0} CCS reads, of which {1} ({2}%) passed filter'.
        format(ccs_total, pass_total, percentage))
Example #25
def snr_filter(input_fastq, raw_data_file, output_fastq, min_snr=SNR):
    """
    Filter out sequences below a threshold of predicted accuracy
    """
    log.info(
        "Filtering sequences below {0} Signal-To-Noise Ratio".format(min_snr))
    seq_count = 0
    pass_count = 0
    raw_data = BasH5Collection(raw_data_file)
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            zmw_name = '/'.join(record.name.strip().split('/')[:2])
            zmw = raw_data[zmw_name]
            zmw_snr = min(zmw.zmwMetric("HQRegionSNR"))
            print zmw_name, zmw_snr
            if zmw_snr >= min_snr:
                pass_count += 1
                writer.writeRecord(record)
    percentage = round(100.0 * pass_count / seq_count)
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(
        pass_count, seq_count, percentage))
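The name surgery above relies on the PacBio read-naming convention movieName/holeNumber/qualifier: keeping only the first two '/'-separated fields recovers the ZMW name used to index the BasH5Collection. A worked example (the movie name is invented for illustration):

read_name = 'm140101_010101_42175_c100000000000000000001_s1_p0/14352/ccs'
zmw_name = '/'.join(read_name.strip().split('/')[:2])
assert zmw_name == 'm140101_010101_42175_c100000000000000000001_s1_p0/14352'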
Example #26
def pick_rep(fa_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             is_fq=False,
             pick_least_err_instead=False):
    """
    For each group, select the representative record

    If is FASTA file (is_fa False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then 
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = FastqWriter(output_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = FastaWriter(output_filename)


#    for line in open(gff_filename):
#        # ex: chr1    PacBio  transcript      27567   29336   .       -       .       gene_id "PBfusion.1"; transcript_id "PBfusion.1.1";
#        raw = line.strip().split('\t')
#        if raw[2] == 'transcript':
#            # check if this is first or 2+ part of fusion
#            tid = raw[-1].split('; ')[1].split()[1][1:-2] # ex: tid = PBfusion.1.1
#            gid = tid[:tid.rfind('.')] # ex: gid = PBfusion.1
#            if tid.endswith('.1'):
#                coords[gid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])
#            else:
#                assert gid in coords
#                coords[gid] += "+{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print >> sys.stderr, "Picking representative sequence for", pb_id
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members.split(','):
            if is_fq and pick_least_err_instead:
                # expected base errors: per-base error probability is 10^(-QV/10)
                err = sum(10 ** -(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or (
                (not is_fq or not pick_least_err_instead)
                    and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    record_storage = {}  # temporary storage for the .1 record, to write together with the second record
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            best_id, best_seq, best_qual = rep_info[pb_id]

            # make coordinates & write the SAM file
            if r.qID not in coords:
                # this is the .1 portion
                coords[r.qID] = "{0}:{1}-{2}({3})".format(
                    r.sID, r.sStart, r.sEnd, r.flag.strand)
                isoform_index = 1
                record_storage[pb_id] = r
            else:
                # this is the .2 portion
                coords[r.qID] += "+{0}:{1}-{2}({3})".format(
                    r.sID, r.sStart, r.sEnd, r.flag.strand)
                isoform_index = 1

                old_r = record_storage[pb_id]
                f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                    chr=old_r.sID, s=old_r.segments[0].start+1, e=old_r.segments[-1].end, pi=pb_id, j=isoform_index, strand=old_r.flag.strand))
                for s in old_r.segments:
                    f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                        chr=old_r.sID, s=s.start+1, e=s.end, pi=pb_id, j=isoform_index, strand=old_r.flag.strand))
                isoform_index = 2
                f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                    chr=r.sID, s=r.segments[0].start+1, e=r.segments[-1].end, pi=pb_id, j=isoform_index, strand=r.flag.strand))
                for s in r.segments:
                    f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(\
                        chr=r.sID, s=s.start+1, e=s.end, pi=pb_id, j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, coords[best_id], best_id)
        _seq_ = best_seq
        if is_fq:
            fout.writeRecord(_id_, _seq_, best_qual)
        else:
            fout.writeRecord(_id_, _seq_)
Example #27
 def writeFastqData(self):
     log.info('Writing the masked Fastq data out to "%s"...' % self.output)
     with FastqWriter(self.output) as writer:
         for fastqRecord in self.maskedFastqs:
             writer.writeRecord(fastqRecord)
Example #29
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix")
    parser.add_argument("output_prefix", help="Output prefix")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()

    #group_filename = args.input_prefix + '.group.txt'
    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(args.output_prefix + '.gff', 'w')
    keys = recs.keys()
    keys.sort()
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k,v in d.iteritems():
        print k,v
    f.close()

    # write output rep.fq
    f = FastqWriter(args.output_prefix + '.rep.fq')
    for r in FastqReader(rep_filename):
        if r.name.split('|')[0] in good:
            f.writeRecord(r)
    f.close()

    # write output to .abundance.txt
    f = open(args.output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()
Example #30
 def writeFastqData(self):
     log.info('Writing aligned Fastq data out to "%s"' % self.output)
     with FastqWriter(self.output) as handle:
         for alignedFastq in self.alignedFastqs:
             handle.writeRecord(alignedFastq)
Example #31
def write_fastq_records(records, filename):
    log.info("Writing {0} FastqRecords to {1}".format(len(records), filename))
    with FastqWriter(filename) as handle:
        for record in records:
            handle.writeRecord(record)
    check_output_file(filename)
Example #32
def filter_by_count(input_prefix, output_prefix, min_count):

    group_filename = input_prefix + ".group.txt"
    count_filename = input_prefix + ".abundance.txt"
    gff_filename = input_prefix + ".gff"
    rep_filename = input_prefix + ".rep.fq"

    # read group
    group_max_count_fl = {}
    group_max_count_p = {}
    f = open(group_filename)
    for line in f:
        # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
        pbid, members = line.strip().split("\t")
        group_max_count_fl[pbid] = 0
        group_max_count_p[pbid] = 0
        members = members.split(",")
        for m in members:
            tmp = m.split("|")[1].split("/")[1]  # ex: tmp = f30p16
            fl_count, p_count = tmp.split("p")
            fl_count = int(fl_count[1:])
            p_count = int(p_count)
            group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
            group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ""
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith("#"):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r["pbid"], r) for r in DictReader(f, delimiter="\t"))
    for k, v in d.iteritems():
        print k, v
    f.close()

    # group_max_count_p NOT used for now
    good = filter(
        lambda x: int(d[x]["count_fl"]) >= min_count and group_max_count_fl[x] >= min_count and group_max_count_p[x] >= 0,
        d,
    )

    # write output GFF
    f = open(output_prefix + ".gff", "w")
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good:
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = FastqWriter(output_prefix + ".rep.fq")
    for r in FastqReader(rep_filename):
        if r.name.split("|")[0] in good:
            f.writeRecord(r)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + ".abundance.txt", "w")
    f.write(count_header)
    writer = DictWriter(
        f,
        fieldnames=["pbid", "count_fl", "count_nfl", "count_nfl_amb", "norm_fl", "norm_nfl", "norm_nfl_amb"],
        delimiter="\t",
        lineterminator="\n",
    )
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()
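The member-name parsing in filter_by_count assumes cluster ids shaped like the comment's example, i0HQ_54b0ca|c58773/f30p16/700, where the middle field encodes the full-length and partial read counts. A worked example of that split logic:

member = "i0HQ_54b0ca|c58773/f30p16/700"
tmp = member.split("|")[1].split("/")[1]   # "f30p16"
fl_count, p_count = tmp.split("p")         # "f30", "16"
fl_count = int(fl_count[1:])               # 30 full-length reads
p_count = int(p_count)                     # 16 partial reads
assert (fl_count, p_count) == (30, 16)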
Example #33
 def outputSubreadFastq(self):
     log.info('Parsing Fastq subreads from input BAS.H5 files')
     with FastqWriter(self.output) as writer:
         for reader in self.bash5_readers:
             self.writeSubreadFastq(reader, writer)
Example #38
try:
    fastq_f = FastqReader(sys.argv[1])
    prepostfix_size = int(sys.argv[2])
    tmp_dir = sys.argv[3]
    output_fn = sys.argv[4]
except (IndexError, ValueError):
    print usage
    sys.exit(1)

prefix_N = re.compile("^[Nn]+")
postfix_N = re.compile("[Nn]+$")

prefix_fn = os.path.join(tmp_dir, "prefix.fa")
postfix_fn = os.path.join(tmp_dir, "postfix.fa")

with FastqWriter(open(output_fn, "w")) as output_fh:

    for r in fastq_f:
        r_id = r.name
        r_seq = r.sequence
        r_qv = r.quality
        m = prefix_N.search(r_seq)
        if m:
            prefix_trim = m.end()
        else:
            prefix_trim = 0
        m = postfix_N.search(r_seq)
        if m:
            postfix_trim = m.start()
        else:
            postfix_trim = len(r_seq)
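        # The original snippet is truncated here. A plausible completion
        # (an assumption, following the FastqWriter API used above) writes
        # out the record with leading/trailing Ns trimmed away:
        output_fh.writeRecord(r_id, r_seq[prefix_trim:postfix_trim],
                              r_qv[prefix_trim:postfix_trim])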