def test_write_pbcore_records(self):
     """
     Write two Fasta records with FastaWriter and check the raw file lines.
     """
     # Only the *name* of the temporary file is kept; the
     # NamedTemporaryFile object itself is discarded right away.
     fasta_path = tempfile.NamedTemporaryFile(suffix=".fasta").name
     expected_lines = [">chr1", "acgt", ">chr2", "tgca"]
     to_write = [FastaRecord("chr1", "acgt"), FastaRecord("chr2", "tgca")]
     write_pbcore_records(FastaWriter, to_write, fasta_path)
     with open(fasta_path) as handle:
         observed_lines = handle.read().splitlines()
     assert observed_lines == expected_lines
 def test_write_contigset_records(self):
     """
     Write two records as a ContigSet and check they read back unchanged.
     """
     xml_path = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
     to_write = [FastaRecord("chr1", "acgt"), FastaRecord("chr2", "tgca")]
     write_contigset_records(FastaWriter, to_write, xml_path)
     with ContigSet(xml_path) as dataset:
         observed = []
         for rec in dataset:
             observed.append((rec.id, rec.sequence))
         assert observed == [("chr1", "acgt"), ("chr2", "tgca")]
 def trimSequenceData(self, sequenceData, blasrHits):
     """
     Cut the region covered by a Blasr hit out of each record.

     sequenceData maps record ids to records; blasrHits maps record ids to
     hits exposing ``qstart`` and ``qend``.  Records without a hit are kept
     unchanged.  For a record with a hit, the sequence before ``qstart`` is
     kept as "<name>_5p" and the sequence after ``qend`` as "<name>_3p",
     each only if it is longer than ``self.minLength``.

     Returns the list of kept and trimmed records.
     """
     # Single-argument print() and dict.items() behave the same on
     # Python 2 and Python 3, unlike the print statement / iteritems().
     print("Trimming out vector sequence...")
     trimmedSeqData = []
     for rec_id, record in sequenceData.items():
         try:
             hit = blasrHits[rec_id]
         except KeyError:
             # No hit for this record: nothing to trim, keep it as-is
             trimmedSeqData.append(record)
             continue
         # For records with hits, cut out and keep the flanking sequence
         start = int(hit.qstart)
         end = int(hit.qend)
         sequence = record.sequence
         if start > self.minLength:
             trimmedSeqData.append(
                 FastaRecord(record.name + '_5p', sequence[:start]))
         if len(sequence) - end > self.minLength:
             trimmedSeqData.append(
                 FastaRecord(record.name + '_3p', sequence[end:]))
     return trimmedSeqData
Beispiel #4
0
def _extract_from_bash5(bash5_file, min_length, max_length, min_score, min_snr,
                        white_list):
    """
    Collect the subreads of a BasH5 or BaxH5 file that pass every filter.

    A ZMW is skipped when a non-empty white_list does not contain its
    "<movie>/<hole>" name, when its read score is below min_score, or when
    its lowest HQRegionSNR value is below min_snr.  From the remaining
    ZMWs, every subread whose length lies within [min_length, max_length]
    is returned, as a list of FastaRecords.
    """
    log.info("Extracting subreads from %s" % os.path.basename(bash5_file))

    passing = []
    for zmw in BasH5Reader(bash5_file):
        zmw_id = '%s/%s' % (zmw.baxH5.movieName, zmw.holeNumber)
        # The three ZMW-level checks short-circuit in this order
        rejected = ((white_list and zmw_id not in white_list)
                    or zmw.readScore < min_score
                    or min(zmw.zmwMetric('HQRegionSNR')) < min_snr)
        if rejected:
            continue
        for subread in zmw.subreads:
            if min_length <= len(subread) <= max_length:
                passing.append(
                    FastaRecord(subread.readName, subread.basecalls()))

    log.info('Found %s subreads that passed filters' % len(passing))
    return passing
Beispiel #5
0
def reverse_complement(fasta_record):
    """
    Return a new FastaRecord with the reverse-complemented sequence.

    The sequence is reversed and then translated through COMPLEMENT; the
    record name is carried over unchanged.
    """
    complemented = fasta_record.sequence[::-1].translate(COMPLEMENT)
    return FastaRecord(fasta_record.name, complemented)
 def __getitem__(self, k):
     """
     Return the record stored under key k as a FastaRecord.

     self.d maps each key to an offset in self.f.  The sequence is read
     from that offset up to (not including) the next line starting with
     '>' or the end of the file, with surrounding whitespace stripped from
     every line.

     Raises Exception if k is not in self.d.
     """
     if k not in self.d:
         # Call-style raise parses on both Python 2 and Python 3
         raise Exception("key {0} not in dictionary!".format(k))
     self.f.seek(self.d[k])
     parts = []
     for line in self.f:
         if line.startswith('>'):
             break
         parts.append(line.strip())
     # Join once instead of growing a string line by line
     return FastaRecord(k, ''.join(parts))
 def _get_record(self, k):
     """
     Read the sequence for key k and return it as a FastaRecord.

     self.d[k] is an (index, offset) pair: the index selects a handle from
     self.fhandlers and the offset is where reading starts.  Lines are
     stripped and concatenated until a line starting with '>' or the end
     of the file is reached.
     """
     handler_index, offset = self.d[k]
     handle = self.fhandlers[handler_index]
     handle.seek(offset)
     chunks = []
     for line in handle:
         if line.startswith('>'):
             break
         chunks.append(line.strip())
     return FastaRecord(header=k, sequence=''.join(chunks))
Beispiel #8
0
def trim_fasta_record(record, start, end):
    """
    Return a new FastaRecord containing record.sequence[start:end].

    Either bound may be None, meaning "do not trim on that side".  A slice
    already treats a None bound as open-ended, so no special-casing of the
    None combinations is needed.
    """
    return FastaRecord(record.name, record.sequence[start:end])
Beispiel #9
0
 def outputClusterFasta(self, reads, count):
     """
     Write the reads of a cluster to 'cluster<count>.fasta' as Fasta.

     Each read contributes only its name and sequence.  If the file
     already exists nothing is written.  Returns the file name.
     """
     fastaFile = 'cluster%s.fasta' % count
     if not os.path.exists(fastaFile):
         with FastaWriter(fastaFile) as handle:
             for read in reads:
                 handle.writeRecord(FastaRecord(read.name, read.sequence))
     return fastaFile
Beispiel #10
0
def apply_trims( records, trims ):
    """
    Apply per-record (start, end) trims and return the resulting list.

    trims is keyed by the first whitespace-delimited token of a record's
    name.  A record with an entry is replaced by a new FastaRecord that is
    named by that token and holds sequence[start:end]; a record without
    an entry is passed through unchanged.
    """
    result = []
    for rec in records:
        short_name = rec.name.split()[0]
        if short_name not in trims:
            result.append( rec )
            continue
        first, last = trims[short_name]
        result.append( FastaRecord( short_name, rec.sequence[first:last] ) )
    return result
Beispiel #11
0
 def __getitem__(self, args):
     """
     Return a FastaAlignment for a two-part (records, positions) selection.

     slice_2d(args) supplies one slice that is applied to self.records and
     one that is applied to each selected record's sequence.  Sliced
     records whose sequence has fewer than two distinct characters are
     dropped from the result.
     """
     rec_slice, seq_slice = slice_2d(args)
     sliced = []
     for rec in self.records[rec_slice]:
         sliced.append(FastaRecord(rec.name, rec.sequence[seq_slice]))
     kept = [rec for rec in sliced if len(set(rec.sequence)) > 1]
     return FastaAlignment(kept)
Beispiel #12
0
 def outputReferenceFasta(self, reference, count):
     """
     Write a cluster's reference sequence to 'cluster<count>_ref.fasta'.

     The record is named 'cluster<count>_reference', a tab, and the
     original reference name.  An already-existing file is left untouched.
     Returns the file name.
     """
     # Single-argument print() gives the same output on Python 2 and 3
     print("Creating reference sequence for Cluster #%s" % count)
     referenceFile = 'cluster%s_ref.fasta' % count
     reference_desc = 'cluster{0}_reference\t{1}'.format(
         count, reference.name)
     if os.path.exists(referenceFile):
         return referenceFile
     with FastaWriter(referenceFile) as handle:
         handle.writeRecord(FastaRecord(reference_desc, reference.sequence))
     return referenceFile
Beispiel #13
0
 def __getitem__(self, k):
     """
     Return the record stored under key k as a FastaRecord.

     self.d maps each key to an offset in self.f; lines are read from that
     offset, stripped and concatenated until a line starting with '>' or
     the end of the file is reached.

     Raises ValueError (naming the key and the file) if k is not in self.d.
     """
     if k not in self.d:
         raise ValueError("key {k} not in {f}!".format(k=k, f=self.f.name))
     self.f.seek(self.d[k])
     pieces = []
     for line in self.f:
         if line.startswith('>'):
             break
         pieces.append(line.strip())
     return FastaRecord(name=k, sequence=''.join(pieces))
Beispiel #14
0
def get_temp_fasta_record(record):
    """
    Return record as a FastaRecord.

    A FastaRecord is returned unchanged.  Anything else is converted by
    building a FastaRecord from its whitespace-stripped ``name`` and
    ``sequence`` attributes.

    Raises TypeError if that conversion fails.
    """
    if isinstance(record, FastaRecord):
        return record
    try:
        return FastaRecord(record.name.strip(), record.sequence.strip())
    # Exception rather than a bare except, so KeyboardInterrupt and
    # SystemExit are not turned into a TypeError
    except Exception:
        msg = 'Unrecognized sequence record type'
        log.error(msg)
        raise TypeError(msg)
Beispiel #15
0
 def reverseComplement(cls, record):
     """
     Reverse-complement a plain string, a FastaRecord or a FastqRecord.

     Strings are reversed and translated through cls.DNA_TRANSLATOR.
     Records yield a new record of the same kind with the same name; for
     a FastqRecord the quality values are reversed as well.

     Raises ValueError for any other input type.
     """
     if isinstance(record, str):
         reversed_seq = record[::-1]
         return reversed_seq.translate(cls.DNA_TRANSLATOR)
     if isinstance(record, FastaRecord):
         return FastaRecord(record.name,
                            cls.reverseComplement(record.sequence))
     if isinstance(record, FastqRecord):
         return FastqRecord(record.name,
                            cls.reverseComplement(record.sequence),
                            record.quality[::-1])
     raise ValueError("Record must be either FASTA or FASTQ")
Beispiel #16
0
def _slice_record(record, slice):
    """
    Return a new record of the same kind covering only the given slice.

    The slice is applied to the sequence and, for a FastqRecord, to the
    quality values as well; the name is carried over unchanged.

    Raises TypeError if record is neither a FastaRecord nor a FastqRecord.
    """
    sub_sequence = record.sequence[slice]
    if isinstance(record, FastaRecord):
        return FastaRecord(record.name, sub_sequence)
    if isinstance(record, FastqRecord):
        sub_quality = record.quality[slice]
        return FastqRecord(record.name, sub_sequence, sub_quality)
    msg = 'Invalid sequence record type'
    log.error(msg)
    raise TypeError(msg)
Beispiel #17
0
def _extract_exon_record(record, exon_num, start, end):
    """
    Build an exon record from the [start:end] region of a sequence record.

    The exon is named "<record name>_exon<exon_num>".  The result is a
    FastaRecord or FastqRecord matching the input; for a FastqRecord the
    quality string is sliced with the same coordinates.

    Raises TypeError for any other record type.
    """
    exon_name = '%s_exon%s' % (record.name, exon_num)
    exon_sequence = record.sequence[start:end]
    is_fasta = isinstance(record, FastaRecord)
    if not is_fasta and not isinstance(record, FastqRecord):
        msg = 'Record must be either FastaRecord or FastqRecord'
        log.error(msg)
        raise TypeError(msg)
    if is_fasta:
        return FastaRecord(exon_name, exon_sequence)
    return FastqRecord(exon_name, exon_sequence,
                       qualityString=record.qualityString[start:end])
Beispiel #18
0
def write_temp_fasta(record):
    """
    Write a sequence record to a temporary Fasta file.

    A FastaRecord is written as-is; a FastqRecord is first reduced to a
    FastaRecord of its name and sequence.  The file is created with
    delete=False, so it stays on disk until the caller removes it.

    Returns the NamedTemporaryFile object (its path is ``.name``).
    Raises TypeError for any other record type.
    """
    # Validate and convert before creating the file, so that a rejected
    # record does not leave an orphaned temp file behind.
    if isinstance(record, FastaRecord):
        fasta = record
    elif isinstance(record, FastqRecord):
        fasta = FastaRecord(record.name, record.sequence)
    else:
        msg = 'Sequence record must be either Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    write_fasta(fasta, temp.name)
    return temp
Beispiel #19
0
def _write_temp_fasta(record):
    """
    Write a sequence record out to a temporary Fasta file.

    A FastaRecord is written as-is; a FastqRecord is first reduced to a
    FastaRecord of its name and sequence.  The file is created with
    delete=False, so it stays on disk until the caller removes it.

    Returns the path of the temporary file.
    Raises TypeError for any other record type.
    """
    # Validate and convert before creating the file, so that a rejected
    # record does not leave an orphaned temp file behind.
    if isinstance(record, FastaRecord):
        fasta = record
    elif isinstance(record, FastqRecord):
        fasta = FastaRecord(record.name, record.sequence)
    else:
        msg = 'Record must be either FastaRecord or FastqRecord'
        log.error(msg)
        raise TypeError(msg)
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    # Only the path is handed back, so release the open handle now;
    # write_fasta reopens the file by name.
    temp.close()
    write_fasta([fasta], temp.name)
    return temp.name
Beispiel #20
0
def _combine_records(records):
    """
    Concatenate an ordered series of exon records into one cDNA record.

    The cDNA name is the first record's name with its last '_'-separated
    field removed.  Quality strings are concatenated from those records
    that have one; if the combined quality is exactly as long as the
    combined sequence a FastqRecord is returned, otherwise a FastaRecord.
    """
    name = '_'.join(records[0].name.split('_')[:-1])
    cDNA_sequence = ''.join([rec.sequence for rec in records])
    cDNA_quality = ''.join([rec.qualityString for rec in records
                            if hasattr(rec, 'qualityString')])
    if len(cDNA_sequence) != len(cDNA_quality):
        return FastaRecord(name, cDNA_sequence)
    return FastqRecord(name, cDNA_sequence, qualityString=cDNA_quality)
Beispiel #21
0
def _extract_from_bash5( bash5_file, min_length, min_score ):
    """
    Yield the longest qualifying subread of each ZMW in a BasH5/BaxH5 file.

    ZMWs whose read score is below min_score are skipped.  Within a ZMW
    only subreads with more than min_length basecalls are considered, and
    the longest of them (the first one on ties) is yielded as a
    FastaRecord.  Once the reader is exhausted, the number of yielded
    records is logged.
    """
    filename = os.path.basename( bash5_file )
    log.info("Extracting subreads from %s" % filename)

    # Counted explicitly: records are yielded rather than collected, so
    # there is no list whose length could be reported at the end.
    passed = 0
    for zmw in BasH5Reader( bash5_file ):
        if zmw.readScore < min_score:
            continue
        long_subreads = [s for s in zmw.subreads if len(s.basecalls()) > min_length]
        if not long_subreads:
            continue
        # max() returns the first maximal element, which is the same
        # subread a stable descending sort would place first.
        subread = max( long_subreads, key=lambda s: len(s.basecalls()) )
        passed += 1
        yield FastaRecord( subread.readName, subread.basecalls() )
    log.info('Found %s subreads that passed filters' % passed)
Beispiel #22
0
def _multislice_record(record, slices):
    """
    Cut several regions out of a Fasta or Fastq record and join them.

    slices is an iterable of (name, slice) pairs; only the slice part is
    used.  Each region is extracted with _slice_record and the pieces are
    concatenated in the given order into one record that keeps the
    original name.

    Raises TypeError if record is neither a FastaRecord nor a FastqRecord.
    """
    pieces = []
    for _, region in slices:
        pieces.append(_slice_record(record, region))
    joined_sequence = ''.join([piece.sequence for piece in pieces])
    if isinstance(record, FastaRecord):
        return FastaRecord(record.name, joined_sequence)
    if isinstance(record, FastqRecord):
        joined_quality = ''.join([piece.qualityString for piece in pieces])
        return FastqRecord(record.name, joined_sequence,
                           qualityString=joined_quality)
    msg = 'Invalid sequence record type'
    log.error(msg)
    raise TypeError(msg)
Beispiel #23
0
def _trim_sequences(records, trim):
    """
    Trim `trim` bases from each end of each sequence.

    Returns a list of new FastaRecord / FastqRecord objects with the same
    names; for FastqRecords the quality values are trimmed identically.
    A trim of 0 leaves every sequence at full length.

    Raises TypeError for any record that is neither a FastaRecord nor a
    FastqRecord.
    """
    # A stop index of -0 equals 0 and would empty the sequence, so use an
    # open end (None) when there is nothing to trim.
    window = slice(trim, -trim if trim else None)
    trimmed = []
    for record in records:
        if isinstance(record, FastaRecord):
            trimmed_record = FastaRecord(record.name,
                                         record.sequence[window])
        elif isinstance(record, FastqRecord):
            trimmed_record = FastqRecord(record.name,
                                         record.sequence[window],
                                         record.quality[window])
        else:
            raise TypeError(
                "Only FastaRecord and FastqRecords support, not  '%s'" %
                type(record))
        trimmed.append(trimmed_record)
    return trimmed
Beispiel #24
0
def reverse_complement(record):
    """
    Return the reverse complement of a Fasta or Fastq record.

    The sequence is reversed and translated through COMPLEMENT.  For a
    FastqRecord the quality string is reversed as well.  The record name
    is carried over unchanged.

    Raises TypeError for any other record type.
    """
    rev_com_seq = record.sequence[::-1].translate(COMPLEMENT)
    if isinstance(record, FastaRecord):
        return FastaRecord(record.name, rev_com_seq)
    if isinstance(record, FastqRecord):
        reversed_quality = record.qualityString[::-1]
        return FastqRecord(record.name,
                           rev_com_seq,
                           qualityString=reversed_quality)
    msg = 'Record must be either Fasta or Fastq'
    log.error(msg)
    raise TypeError(msg)
Beispiel #25
0
def rename_fasta( input_file, output_file, name_key ):
    """
    Copy a Fasta file, replacing each record name via a lookup table.

    The table is loaded from name_key with read_dict_file and is looked up
    by the first whitespace-delimited token of the original record name.

    Raises KeyError if a record's name is missing from the table.
    Returns output_file after check_output_file has been called on it.
    """
    name_map = read_dict_file( name_key )
    with FastaWriter( output_file ) as writer:
        for record in FastaReader( input_file ):
            lookup_key = record.name.split()[0]
            try:
                replacement = name_map[lookup_key]
            except KeyError:
                msg = "Sequence name not found!"
                log.error( msg )
                raise KeyError( msg )
            writer.writeRecord( FastaRecord( replacement, record.sequence ) )
    check_output_file( output_file )
    return output_file
Beispiel #26
0
 def __getitem__(self, k):
     """
     Return the records for a subread or ZMW key, always as a list.

     A key with exactly two '/' characters (<movie>/<zmw>/<start_end>) is
     looked up in self.d and yields a one-element list.  Any other key
     (<movie>/<zmw>) is looked up in self.zmw_d and yields one record per
     stored location.  Each location is a (seqid, offset) pair; the
     sequence is read from self.f at that offset up to the next line
     starting with '>' or the end of the file.

     Raises ValueError if the key is missing from the relevant table.
     """
     is_subread = k.count('/') == 2
     table = self.d if is_subread else self.zmw_d
     if k not in table:
         raise ValueError("key {0} not in dictionary!".format(k))
     # A subread entry is a single location; wrap it so both cases iterate
     locations = [table[k]] if is_subread else table[k]
     output = []
     for seqid, offset in locations:
         self.f.seek(offset)
         chunks = []
         for line in self.f:
             if line.startswith('>'):
                 break
             chunks.append(line.strip())
         output.append(FastaRecord(name=seqid, sequence=''.join(chunks)))
     return output
Beispiel #27
0
 def createUnalignedRecord(cls, seqParts, zmw):
     """
     Join the string elements of seqParts into a FastaRecord named zmw.

     Elements whose type is not exactly str are skipped.
     """
     # Exact-type check: instances of str subclasses are skipped as well
     sequence = ''.join([part for part in seqParts if type(part) is str])
     return FastaRecord(zmw, sequence)
Beispiel #28
0
 def convertFastqToFasta(cls, fastqRecord):
     """
     Build a FastaRecord from the name and sequence of fastqRecord.
     """
     name, sequence = fastqRecord.name, fastqRecord.sequence
     return FastaRecord(name, sequence)
def _extract_fasta_region(records, region):
    name, start, end = region
    print name, start, end
    for record in records:
        yield FastaRecord(record.name, record.sequence[start:end])