Example #1
    def test_contigset_consolidate_int_names(self):
        # build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        shutil.copyfile(
            ReferenceSet(data.getXml(8)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)

        double = 'B.cereus.1'
        exp_double = rs1.get_contig(double)

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord('5141', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord('5142', exp_double.sequence)

        exp_double_seqs = [exp_double.sequence, exp_double.sequence]
        exp_names = ['5141', '5142']

        obs_file = ContigSet(outFas1, outFas2)
        log.debug(obs_file.toExternalFiles())
        obs_file.consolidate()
        log.debug(obs_file.toExternalFiles())

        # open obs and compare to exp
        for name, seq in zip(exp_names, exp_double_seqs):
            assert obs_file.get_contig(name).sequence[:] == seq
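For reference, a minimal sketch of the write/read round trip this test exercises, assuming only pbcore.io's FastaWriter and FastaReader; the path, record name, and sequence are placeholders:

import os
import tempfile
from pbcore.io import FastaReader, FastaWriter

fn = os.path.join(tempfile.mkdtemp(suffix="fasta-demo"), 'demo.fasta')
with FastaWriter(fn) as writer:
    # writeRecord accepts a (name, sequence) pair, as in the test above
    writer.writeRecord('contig1', 'ACGTACGT')
with FastaReader(fn) as reader:
    assert [rec.sequence for rec in reader] == ['ACGTACGT']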
Example #2
    def test_contigset_consolidate(self):
        # build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        backticks('cp {i} {o}'.format(
            i=ReferenceSet(data.getXml(9)).toExternalFiles()[0], o=inFas))
        rs1 = ContigSet(inFas)

        singletons = ['A.baumannii.1', 'A.odontolyticus.1']
        double = 'B.cereus.1'
        reader = rs1.resourceReaders()[0]
        exp_double = rs1.get_contig(double)
        exp_singles = [rs1.get_contig(name) for name in singletons]

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord(exp_singles[0])
            writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord(exp_double.name + '_0_10',
                               exp_double.sequence + 'ATCGATCGATCG')
            writer.writeRecord(exp_singles[1])

        exp_double_seq = ''.join(
            [exp_double.sequence, 'ATCGATCGATCG', exp_double.sequence])
        exp_single_seqs = [rec.sequence for rec in exp_singles]

        acc_file = ContigSet(outFas1, outFas2)
        acc_file.induceIndices()
        log.debug(acc_file.toExternalFiles())
        self.assertEqual(len(acc_file), 4)
        self.assertEqual(len(list(acc_file)), 4)
        acc_file.consolidate()
        log.debug(acc_file.toExternalFiles())

        # open acc and compare to exp
        for name, seq in zip(singletons, exp_single_seqs):
            self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
        self.assertEqual(
            acc_file.get_contig(double).sequence[:], exp_double_seq)

        self.assertEqual(len(acc_file._openReaders), 1)
        self.assertEqual(len(acc_file.index), 3)
        self.assertEqual(len(acc_file._indexMap), 3)
        self.assertEqual(len(acc_file), 3)
        self.assertEqual(len(list(acc_file)), 3)

        # test merge:
        acc1 = ContigSet(outFas1)
        acc2 = ContigSet(outFas2)
        acc3 = acc1 + acc2
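The snippet ends at the merge; a plausible continuation (not part of the original test) would assert that the merged set exposes all four records before any consolidation:

        # hedged continuation: outFas1 and outFas2 each hold two records,
        # so the merged ContigSet should report four
        self.assertEqual(len(list(acc3)), 4)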
Example #3
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) files and save to combined_dir.
    Dumping hq|lq prefix dictionary to pickle.
    Return an instance of CombinedFiles.
    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_hq_fns -- hq files, #['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, #['all_quivered_lq.fastq', ...]
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all([f.endswith(".fastq") for f in split_hq_fns + split_lq_fns])

    hq_pre_dict, lq_pre_dict = {}, {}

    hq_fa_writer = FastaWriter(combined_hq_fa)
    hq_fq_writer = FastqWriter(combined_hq_fq)
    lq_fa_writer = FastaWriter(combined_lq_fa)
    lq_fq_writer = FastqWriter(combined_lq_fq)

    for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
        logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
        hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                    sample_name=sample_name)
        lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                    sample_name=sample_name)

        hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
        lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

        with FastqReader(split_hq) as reader:
            for read in reader:
                name = combined_cid_hq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                hq_fa_writer.writeRecord(name, read.sequence[:])
                hq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

        with FastqReader(split_lq) as reader:
            for read in reader:
                name = combined_cid_lq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                lq_fa_writer.writeRecord(name, read.sequence[:])
                lq_fq_writer.writeRecord(name, read.sequence[:], read.quality)
    hq_fa_writer.close()
    hq_fq_writer.close()
    lq_fa_writer.close()
    lq_fq_writer.close()
    logging.info("HQ polished output combined to:%s", combined_hq_fq)
    logging.info("LQ polished output combined to:%s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to:%s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
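A minimal sketch of reading the dumped prefix dictionary back, assuming the same Python 2 environment (cPickle) and the pickle path used above:

import cPickle

with open(hq_lq_prefix_dict_pickle, 'rb') as reader:
    prefix_dict = cPickle.load(reader)
hq_dirs = prefix_dict['HQ']  # {hq_prefix: dirname of the split hq file}
lq_dirs = prefix_dict['LQ']  # {lq_prefix: dirname of the split lq file}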
Example #4
    def _updateChimeraInfo(self,
                           suspicous_hits,
                           in_read_fn,
                           out_nc_fn,
                           out_c_fn,
                           primer_report_fn,
                           write_report_header=True):
        """
        in_read_fn --- a fasta of full-length reads or a fasta of
                       non-full-length reads.
        For each full-length read in in_read_fn FASTA file, detect whether
        it is chimeric or not, and write its annotation to
        primer_report_fn.
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        logging.debug(
            "Update chimera info for reads in {f} ".format(f=in_read_fn))
        logging.debug(
            "Write primer report to {rpt}".format(rpt=primer_report_fn))

        out_nc_fn_fasta, out_c_fn_fasta = out_nc_fn, out_c_fn
        if out_nc_fn.endswith(".xml"):
            out_nc_fn_fasta = out_nc_fn[:-4] + ".fasta"
        if out_c_fn.endswith(".xml"):
            out_c_fn_fasta = out_c_fn[:-4] + ".fasta"
        num_nc, num_c, num_nc_bases, num_c_bases = 0, 0, 0, 0
        with ContigSetReaderWrapper(in_read_fn) as reader, \
                FastaWriter(out_nc_fn_fasta) as writer, \
                FastaWriter(out_c_fn_fasta) as writer_chimera, \
                open(primer_report_fn, 'w') as reporter:
            if write_report_header:
                reporter.write(ReadAnnotation.header(delimiter=",") + "\n")
            for r in reader:
                # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                readid = r.name.split()[0]
                annotation = ReadAnnotation.fromString(
                    r.name, ignore_polyA=self.ignore_polyA)
                if readid not in suspicous_hits:  # Non-chimeric reads
                    # Primer of a primer-trimmed read can not be None.
                    # assert(annotation.primer is not None)
                    annotation.chimera = 0
                    num_nc += 1
                    num_nc_bases += len(r.sequence)
                    writer.writeRecord(annotation.toAnnotation(),
                                       r.sequence[:])
                else:  # chimeric reads
                    annotation.chimera = 1
                    num_c += 1
                    num_c_bases += len(r.sequence)
                    writer_chimera.writeRecord(annotation.toAnnotation(),
                                               r.sequence[:])

                reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
        return (num_nc, num_c, num_nc_bases, num_c_bases)
Example #5
def dumpEvidence(evidenceDumpBaseDirectory, refWindow, refSequence, alns,
                 quiverConsensus):
    """This will import h5py at runtime.
    """
    # Format of evidence dump:
    # evidence_dump/
    #   ref000001/
    #     0-1005/
    #       reference.fa
    #       reads.fa
    #       consensus.fa
    #       quiver-scores.h5
    #     995-2005/
    #       ...
    join = os.path.join
    refId, refStart, refEnd = refWindow
    refName = reference.idToName(refId)
    windowDirectory = join(evidenceDumpBaseDirectory, refName,
                           "%d-%d" % (refStart, refEnd))
    logging.info("Dumping evidence to %s" % (windowDirectory, ))

    if os.path.exists(windowDirectory):
        raise Exception(
            "Evidence dump does not expect directory %s to exist." %
            windowDirectory)
    os.makedirs(windowDirectory)
    refFasta = FastaWriter(join(windowDirectory, "reference.fa"))
    readsFasta = FastaWriter(join(windowDirectory, "reads.fa"))
    consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))

    windowName = refName + (":%d-%d" % (refStart, refEnd))
    refFasta.writeRecord(windowName, refSequence)
    refFasta.close()

    consensusFasta.writeRecord(windowName + "|quiver",
                               quiverConsensus.sequence)
    consensusFasta.close()

    rowNames, columnNames, baselineScores, scores = scoreMatrix(
        quiverConsensus.mms)
    import h5py
    quiverScoreFile = h5py.File(join(windowDirectory, "quiver-scores.h5"), "w")
    quiverScoreFile.create_dataset("Scores", data=scores)
    vlen_str = h5py.special_dtype(vlen=str)
    quiverScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str)
    quiverScoreFile.create_dataset("ColumnNames",
                                   data=columnNames,
                                   dtype=vlen_str)
    quiverScoreFile.create_dataset("BaselineScores", data=baselineScores)
    quiverScoreFile.close()
    for aln in alns:
        readsFasta.writeRecord(str(aln.rowNumber),
                               aln.read(orientation="genomic", aligned=False))
    readsFasta.close()
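A minimal sketch of inspecting the dump afterwards, assuming h5py is available and windowDirectory is the directory written above:

import os
import h5py

with h5py.File(os.path.join(windowDirectory, "quiver-scores.h5"), "r") as f:
    scores = f["Scores"][:]              # full score matrix
    rowNames = list(f["RowNames"][:])
    columnNames = list(f["ColumnNames"][:])
    baselineScores = f["BaselineScores"][:]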
Example #6
def write_temporary_fasta(record_list):
    temp_fasta = tempfile.NamedTemporaryFile(suffix=".fasta", delete=False)
    with FastaWriter(temp_fasta.name) as handle:
        for record in record_list:
            rec = FastaRecord(record.name, record.sequence)
            handle.writeRecord(rec)
    return temp_fasta
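Since the NamedTemporaryFile is created with delete=False, the caller owns the file; a hypothetical call site that cleans up after itself (record_list is assumed to hold objects with .name and .sequence attributes):

import os
from pbcore.io import FastaReader

temp = write_temporary_fasta(record_list)
try:
    names = [rec.name for rec in FastaReader(temp.name)]
finally:
    os.remove(temp.name)  # caller's responsibility, not the function's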
Example #7
    def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
        """
        Reconstruct ref_fa of the cluster in the new tmp_dir
        e.g.,
            self.g_consensus_ref_fa_of_cluster(cid)

        cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
        refs --- dict{int(cid): ref_fa of cluster(cid)}
        """
        # Check existence the first time it is read.
        if not nfs_exists(self.final_consensus_fa):
            raise IOError("Final consensus FASTA file {f} "
                          "does not exist.".format(f=self.final_consensus_fa))

        self.add_log("Reconstructing g consensus files for clusters "
                     "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                     level=logging.INFO)

        final_consensus_d = FastaRandomReader(self.final_consensus_fa)
        for ref_id in final_consensus_d.d.keys():
            cid = int(ref_id.split('/')[0].replace('c', ''))
            # e.g., ref_id = c103/1/3708, cid = 103,
            #       refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
            if cid in cids:
                mkdir(self.cluster_dir(cid))
                ref_fa = op.join(self.cluster_dir(cid), op.basename(refs[cid]))
                refs[cid] = ref_fa
                with FastaWriter(ref_fa) as writer:
                    self.add_log("Writing ref_fa %s" % refs[cid])
                    writer.writeRecord(ref_id,
                                       final_consensus_d[ref_id].sequence[:])

        self.add_log("Reconstruct of g consensus files completed.",
                     level=logging.INFO)
Example #8
    def Write(self):
        """Clean-up the sequences and write out a Genomic Fasta"""

        sets = []
        writers = []

        for allele, seq in self._dict.iteritems():
            exons = seq.split("|")

            while len(writers) < len(exons):
                fasta = "{0}_exon{1}.fasta".format(self._locus,
                                                   len(writers) + 1)
                writers.append(FastaWriter(fasta))
                sets.append(set())

            for i, exon in enumerate(exons):
                exon = re.sub("[.|*]", "", exon)
                if len(exon) == 0:
                    continue

                if exon in sets[i]:
                    continue
                record = FastaRecord(allele, exon)
                writers[i].writeRecord(record)
                sets[i].add(exon)
Example #9
    def __init__(self, isoseq_output_fn, reference_transcripts_fn,
                 output_analysis_fn, min_true_positive, max_false_positive,
                 min_seq_similarity, max_fuzzy_junction):
        self.isoseq_output_fn = isoseq_output_fn
        self.reference_transcripts_fn = reference_transcripts_fn
        self.output_analysis_fn = output_analysis_fn

        if isoseq_output_fn.endswith(".fasta") or isoseq_output_fn.endswith(
                ".fa"):
            self.isoforms = [r for r in FastaReader(isoseq_output_fn)]
            self.isoseq_output_fa = self.isoseq_output_fn
        elif isoseq_output_fn.endswith(".fastq") or isoseq_output_fn.endswith(
                ".fq"):
            self.isoforms = [r for r in FastqReader(isoseq_output_fn)]
            self.isoseq_output_fa = self.output_analysis_fn + ".isoseq.fa"
            with FastaWriter(self.isoseq_output_fa) as writer:
                for r in self.isoforms:
                    writer.writeRecord(r.name, r.sequence)

        self.reference_transcripts = [
            r for r in FastaReader(reference_transcripts_fn)
        ]

        self.min_true_positive = min_true_positive
        self.max_false_positive = max_false_positive
        self.min_seq_similarity = min_seq_similarity if min_seq_similarity <= 1 \
                                  else min_seq_similarity / 100.0
        self.max_fuzzy_junction = max_fuzzy_junction

        self.alns = self.filter_alns(
            self.map_isoforms_to_reference_transcripts())
Example #10
    def convert_to_dazz_fasta(self):
        """
        Convert input fasta/fastq file to daligner-compatible fasta with ids:
        <prefix>/<index>/0_<seqlen>

        Also write out the id-to-name mapping to a pickle.
        """
        log.debug("Converting %s to daligner compatible fasta %s.",
                  self.input_filename, self.dazz_filename)
        reader = ContigSetReaderWrapper(self.input_filename)

        with FastaWriter(self.dazz_filename) as f:
            i = 1
            for r in reader:
                f.writeRecord(
                    "{p}/{i}/0_{len}".format(p=self.dazz_movie_name,
                                             i=i,
                                             len=len(r.sequence)),
                    r.sequence[:])
                self.dazz_mapping[i] = r.name
                i += 1

        reader.close()

        with open(self.pickle_filename, 'w') as f:
            dump(self.dazz_mapping, f)
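A hedged sketch of the reverse lookup, mapping a daligner-style id back to the original read name via the pickle written above; converter stands for a hypothetical instance of this class, and the example id is a placeholder:

from pickle import load

with open(converter.pickle_filename) as f:   # text mode matches the dump above
    dazz_mapping = load(f)
# ids look like <prefix>/<index>/0_<seqlen>; the middle field keys the map
index = int("movie/7/0_1523".split("/")[1])
original_name = dazz_mapping[index]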
Example #11
    def run(self):
        """Subset reads based on read annotation and subset rules."""
        infoMsg = "Extracting reads from {f} based on ".format(f=self.inFN)
        infoMsg += "rules(FullLength={fl}, nonChimeric={nc}).".format(
            fl="true" if self.rules.FL != 0 else "false",
            nc="true" if self.rules.nonChimeric != 0 else "false")
        logging.info(infoMsg)

        if not self.printReadLengthOnly:
            with FastaReader(self.inFN) as reader, \
                    FastaWriter(self.outFN) as writer:
                for r in reader:
                    #print >> sys.stderr, r.name, self.ignore_polyA
                    annotation = ReadAnnotation.fromString(
                        r.name, self.ignore_polyA)
                    if self.satisfy(annotation, self.rules):
                        writer.writeRecord(r.name, r.sequence)
        else:  # print read lengths only; don't print read names or sequences
            with FastaReader(self.inFN) as reader, \
                    open(self.outFN, 'w') as writer:
                for r in reader:
                    annotation = ReadAnnotation.fromString(
                        r.name, self.ignore_polyA)
                    if self.satisfy(annotation, self.rules):
                        writer.write("{rl}\n".format(rl=len(r.sequence)))
Example #12
def run_fasta_filter(fasta_in, fasta_out, min_seq_length):
    with FastaWriter(fasta_out) as w:
        with FastaReader(fasta_in) as r:
            for record in r:
                if len(record.sequence) > min_seq_length:
                    w.writeRecord(record)

    return 0
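Usage is a single call; note the strict > comparison, so records exactly min_seq_length long are dropped (paths below are placeholders):

run_fasta_filter('reads.fasta', 'reads.filtered.fasta', min_seq_length=50)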
Example #13
    def split(self):
        """Split `input_fasta` into smaller files each containing
        `reads_per_split` reads. Return splitted fasta."""
        split_index = 0
        self.out_fns = []
        writer = FastaWriter(self._out_fn(split_index))
        self.out_fns.append(self._out_fn(split_index))
        with FastaReader(self.input_fasta) as reader:
            for ridx, r in enumerate(reader):
                if ridx % self.reads_per_split == 0 and ridx != 0:
                    split_index += 1
                    writer.close()
                    writer = FastaWriter(self._out_fn(split_index))
                    self.out_fns.append(self._out_fn(split_index))
                writer.writeRecord(r.name, r.sequence)

        writer.close()
        return list(self.out_fns)
Example #14
    def Write(self):
        """Clean-up the sequences and write out a Genomic Fasta"""
        filename = "{0}_genomic.fasta".format(self._locus)
        with FastaWriter(filename) as handle:
            for allele, seq in self._dict.iteritems():
                # Remove inserts, exon/intron boundaries, and trimmed regions
                seq = re.sub("[.|*]", "", seq)
                record = FastaRecord(allele, seq)
                handle.writeRecord(record)
Example #15
def _fastq_to_fasta(fastq_path, fasta_path):
    """Convert a fastq file to  fasta file"""
    with FastqReader(fastq_path) as r:
        with FastaWriter(fasta_path) as w:
            for fastq_record in r:
                fasta_record = FastaRecord(fastq_record.name, fastq_record.sequence)
                w.writeRecord(fasta_record)

    log.info("Completed converting {q} to {f}".format(q=fastq_path, f=fasta_path))
    return 0
Example #16
def write_temp_fasta(fastq_file):
    """
    Write a temporary Fasta file from a Fastq
    """
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    with FastaWriter(temp.name) as handle:
        for record in FastqReader(fastq_file):
            temp_record = FastaRecord(record.name, record.sequence)
            handle.writeRecord(temp_record)
    return temp
Example #17
def _writeFasta(filepath, records):
    """
    Attempt to write a list of records to a new reference FASTA
    """
    try:
        with FastaWriter(filepath) as handle:
            for record in records:
                handle.writeRecord(record)
    except:
        raise ReferenceIOException(
            'Unable to write reference FASTA "{0}"'.format(filepath))
Example #18
    def split(self, reads_in_first_split=None):
        """Split `input_fasta` into smaller files each containing
        `reads_per_split` reads. Return splitted fasta."""
        split_index = 0
        self.out_fns = []
        writer = FastaWriter(self._out_fn(split_index))
        self.out_fns.append(self._out_fn(split_index))
        if reads_in_first_split is None:
            reads_in_first_split = self.reads_per_split
        with ContigSetReaderWrapper(self.input_fasta) as reader:
            for ridx, r in enumerate(reader):
                if ((split_index == 0 and ridx == reads_in_first_split) or
                        (split_index > 0 and ridx % self.reads_per_split == 0)) \
                    and ridx != 0:
                    split_index += 1
                    writer.close()
                    writer = FastaWriter(self._out_fn(split_index))
                    self.out_fns.append(self._out_fn(split_index))
                writer.writeRecord(r.name, r.sequence[:])

        writer.close()
        return list(self.out_fns)
Example #19
def rename_imgt_fasta(input_file, output_file):
    with FastaWriter(output_file) as handle:
        for record in FastaReader(input_file):
            # Check that this is an IMGT-formatted FASTA record
            assert record.header.startswith('HLA:')

            # Extract the header and replace spaces with underscores
            new_header = record.header.strip().replace(' ', '_')

            # Create a new record with the same sequence and the type
            #    in place of its id.
            new_record = FastaRecord(new_header, record.sequence)
            handle.writeRecord(new_record)
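A hypothetical before/after for one record, following the transformations above; the header shown is illustrative, not taken from a real IMGT release:

# input header : HLA:HLA00001 A*01:01:01:01 3503 bp
# output header: HLA:HLA00001_A*01:01:01:01_3503_bp
rename_imgt_fasta('hla_gen.fasta', 'hla_gen.renamed.fasta')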
Example #20
    def setUpClass(cls):
        with FastaWriter(cls.REFERENCE) as fasta_out:
            with FastaReader(TestCoverageRpt.REFERENCE) as fasta_in:
                for rec in fasta_in:
                    header = rec.id + "|quiver"
                    fasta_out.writeRecord(header, rec.sequence)
        with GffWriter(cls.GFF) as gff_out:
            with GffReader(TestCoverageRpt.GFF) as gff_in:
                for header in gff_in.headers:
                    gff_out.writeHeader(header)
                for rec in gff_in:
                    rec.seqid += "|quiver"
                    gff_out.writeRecord(rec)
Example #21
    def test_contigset_write(self):
        fasta = upstreamData.getLambdaFasta()
        ds = ContigSet(fasta)
        assert isinstance(ds.resourceReaders()[0], IndexedFastaReader)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'test.fasta')
        w = FastaWriter(outfn)
        for rec in ds:
            w.writeRecord(rec)
        w.close()
        fas = FastaReader(outfn)
        for rec in fas:
            # make sure a __repr__ didn't slip through:
            assert not rec.sequence.startswith('<')
Example #22
def resolved_tool_contract_to_args(resolved_tool_contract):
    """Convert resolved tool contract to args."""
    rtc = resolved_tool_contract
    args = [
        "--verbose",
        "classify",
        resolved_tool_contract.task.input_files[0],
        resolved_tool_contract.task.output_files[0],
        "--flnc",
        resolved_tool_contract.task.output_files[1],
        "--nfl",
        resolved_tool_contract.task.output_files[2],
        "--summary",
        resolved_tool_contract.task.output_files[3],  # JSON
        "--report",
        resolved_tool_contract.task.output_files[4],  # CSV
        "--min_seq_len",
        str(rtc.task.options[Constants.MIN_SEQ_LEN_ID]),
        "--cpus",
        str(resolved_tool_contract.task.nproc),
        "--outDir",
        op.dirname(rtc.task.output_files[0]),
        "--ignore-empty-output",
    ]
    if rtc.task.options[Constants.IGNORE_POLYA_ID]:
        args.append("--ignore_polyA")

    primers_str_obj = rtc.task.options[Constants.PRIMER_SEQUENCES_ID]
    primers_str = str(primers_str_obj).strip().translate(None, '\'\" ')
    if primers_str_obj is not None and primers_str not in ('None', ''):
        logging.info("Detected customer primer: %s", primers_str)
        # Save primer sequences to a fasta file under output dir
        primer_fasta_records = parse_primer_sequences(primers_str=primers_str)
        d = op.dirname(resolved_tool_contract.task.output_files[2])
        mkdir(d)
        primer_fn = op.join(d, "customer_primers.fasta")
        with FastaWriter(primer_fn) as writer:
            for record in primer_fasta_records:
                writer.writeRecord(record)
        logging.info("Customer primer sequences written to file %s", primer_fn)
        args.append("-p")
        args.append("%s" % primer_fn)
    else:
        logging.info("No customer primer detected.")

    return get_argument_parser().parse_args(args)
Example #23
    def onStart(self):
        self.referenceBasesProcessedById = OrderedDict()
        for refId in reference.byName:
            self.referenceBasesProcessedById[refId] = 0
        self.variantsByRefId             = defaultdict(list)
        self.consensusChunksByRefId      = defaultdict(list)

        # open file writers
        self.fastaWriter = self.fastqWriter = self.gffWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())
Example #24
def writeSequenceRecords(filename, records, filetype=None):
    """
    Write the records out to file
    """
    fileType = filetype or getFileType(filename)
    if fileType == 'fasta':
        with FastaWriter(filename) as writer:
            for record in records:
                writer.writeRecord(record)
    elif fileType == 'fastq':
        with FastqWriter(filename) as writer:
            for record in records:
                writer.writeRecord(record)
    else:
        msg = 'Output filetype must be either FASTA or FASTQ'
        log.error(msg)
        raise TypeError(msg)
    return filename
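A hypothetical call, assuming pbcore.io's FastaRecord; the explicit filetype argument overrides detection by extension:

from pbcore.io import FastaRecord

records = [FastaRecord('seq1', 'ACGTACGT'), FastaRecord('seq2', 'GGCCTTAA')]
writeSequenceRecords('out.fasta', records)                 # type from extension
writeSequenceRecords('out.fa', records, filetype='fasta')  # type forced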
Example #25
def run_main(input_file, output_file, min_sequence_length):
    """
    Main function entry point to your application (this should be imported
    from your library code)

    :rtype int:
    """
    _d = dict(i=input_file, a=min_sequence_length, o=output_file)
    msg = "Running dev_app task. with input:{i} output:{o} and min-length={a}".format(
        **_d)
    log.info(msg)
    with FastaWriter(output_file) as w:
        with FastaReader(input_file) as r:
            for record in r:
                if len(record.sequence) > min_sequence_length:
                    w.writeRecord(record)
    log.debug("completed running main.")
    return 0
Example #26
    def save(self, dir):
        """
        Save this ArrowEvidence to a directory.  The directory will be
        *created* by this method.

        Format of evidence dump:
        evidence_dump/
          ref000001/
            0-1005/
              consensus.fa
              arrow-scores.h5
            995-2005/
            ...
        """
        logging.info("Dumping evidence to %s" % (dir, ))
        join = os.path.join
        if os.path.exists(dir):
            raise Exception(
                "Evidence dump does not expect directory %s to exist." % dir)
        os.makedirs(dir)
        #refFasta       = FastaWriter(join(dir, "reference.fa"))
        #readsFasta     = FastaWriter(join(dir, "reads.fa"))
        consensusFasta = FastaWriter(join(dir, "consensus.fa"))
        windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
        #refFasta.writeRecord(windowName, self.refSequence)
        #refFasta.close()

        consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
        consensusFasta.close()

        import h5py
    arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"), "w")
        arrowScoreFile.create_dataset("Scores", data=self.scores)
        vlen_str = h5py.special_dtype(vlen=str)
        arrowScoreFile.create_dataset("RowNames",
                                      data=self.rowNames,
                                      dtype=vlen_str)
        arrowScoreFile.create_dataset("ColumnNames",
                                      data=self.colNames,
                                      dtype=vlen_str)
        arrowScoreFile.create_dataset("BaselineScores",
                                      data=self.baselineScores)
        arrowScoreFile.close()
Example #27
def main():
    id2seq = {}
    parser = argparse.ArgumentParser()
    parser.add_argument("-b",
                        "--breakpoint",
                        help="file containing breakpoints")
    parser.add_argument("-a",
                        "--assembly",
                        help="fasta file containing contigs")
    parser.add_argument("-o", "--outfile", help="new assembly file")
    parser.add_argument("-l", "--lenfile", help="length of contigs")

    args = parser.parse_args()

    lenfile = open(args.lenfile, 'w')

    lenmap = {}
    for record in FastaReader(args.assembly):
        id2seq[record.id] = record.sequence[0:-10]

    # read the breakpoint file once, after all contigs are loaded
    new_seq = {}
    with open(args.breakpoint, 'r') as f:
        for line in f:
            attrs = line.split()
            if len(attrs) == 1:
                curr_contig = attrs[0]
                seq = id2seq[curr_contig]
            else:
                start = long(attrs[0])
                end = long(attrs[1])
                new_id = curr_contig + '_' + attrs[0] + '_' + attrs[1]
                new_seq[new_id] = seq[start:end]
                lenmap[new_id] = end - start + 1

    writer = FastaWriter(args.outfile)
    for key in new_seq:
        writer.writeRecord(key, new_seq[key])
    writer.close()

    for key in lenmap:
        lenfile.write(key + "\t" + str(lenmap[key]) + '\n')
    lenfile.close()
Example #28
def main(parser):
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    if args.fasta:
        outh = FastaWriter(outfile)
    else:
        outh = FastqWriter(outfile)

    for ccsFile in ccsFofn:
        ccsH5 = BasH5Reader(ccsFile)
        for ccsRead in ccsH5.ccsReads():
            if ccsRead.zmw.zmwName in no_barcode:
                basecalls = ccsRead.basecalls()
                if len(basecalls) >= args.minMaxInsertLength:
                    if args.fasta:
                        outh.writeRecord(
                            FastaRecord(ccsRead.zmw.zmwName,
                                        ccsRead.basecalls()))
                    else:
                        outh.writeRecord(
                            FastqRecord(ccsRead.zmw.zmwName,
                                        ccsRead.basecalls(),
                                        ccsRead.QualityValue()))
    outh.close()
Example #29
def combine_consensus_isoforms(split_indices, split_files,
                               combined_consensus_isoforms_fa,
                               sample_name):
    """
    Parameters:
      split_indices -- indices of split cluster bins.
      split_files -- consensus isoforms in each split cluster bin.
    """
    assert len(split_indices) == len(split_files)
    writer = FastaWriter(combined_consensus_isoforms_fa)
    for i, split_fn in zip(split_indices, split_files):
        logging.debug("Adding prefix i%s to %s.", str(i), split_fn)
        with ContigSetReaderWrapper(split_fn) as reader:
            for read in reader:
                name = combined_cid_ice_name(name=read.name, cluster_bin_index=i,
                                             sample_name=sample_name)
                writer.writeRecord(name, read.sequence[:])
    writer.close()
    logging.info("Consensus isoforms output combined to:%s",
                 combined_consensus_isoforms_fa)
Example #30
def split_results(amp_analysis):
    """Split the output of an Amplicon Analysis job by Barcode"""
    assert os.path.isdir(amp_analysis)
    sequence_path = os.path.join(amp_analysis, 'amplicon_analysis.fasta')
    check_output_file(sequence_path)
    print "Analyzing %s output sequences" % fasta_size(sequence_path)
    barcode_path = os.path.join(amp_analysis, 'by_barcode')
    create_directory(barcode_path)

    records = list(FastaReader(sequence_path))
    barcodes = {get_barcode(r): [] for r in records}
    for r in records:
        barcodes[get_barcode(r)].append(r)
    barcode_files = {}
    for barcode, records in barcodes.iteritems():
        barcode_file = barcode + '.fasta'
        sample_path = os.path.join(barcode_path, barcode_file)
        with FastaWriter(sample_path) as handle:
            for record in records:
                handle.writeRecord(record)
        barcode_files[barcode] = sample_path
    return barcode_files