Beispiel #1
0
def normalize_fasta(fastaFile, refFile, outFile):
    """Rename, upper-case and strand-normalize the records of a FASTA file.

    Every record of ``fastaFile`` is written to ``outFile`` with its
    sequence upper-cased and its name replaced by the hex Adler-32 checksum
    of the original name plus sequence.  The renamed reads are then aligned
    to ``refFile`` with ``blasr -bestn 1 -m 1`` and ``outFile`` is rewritten
    in place: a read whose third and fourth blasr columns differ is replaced
    by its reversed sequence mapped through the module-level ``rmap`` table
    (presumably a base-complement map -- confirm where ``rmap`` is defined).
    Reads that blasr does not report are left in their original orientation.

    Args:
        fastaFile: path of the input FASTA file.
        refFile: path of the reference passed to blasr.
        outFile: path of the output FASTA file (written twice).

    Raises:
        subprocess.CalledProcessError: if blasr exits with a non-zero status.
    """
    with FastaReader(fastaFile) as reader, open(outFile, "w") as of:
        for r in reader:
            r_id = hex(zlib.adler32(r.name + r.sequence) & 0xffffffff)
            of.write(">" + r_id + "\n")
            of.write(r.sequence.upper() + "\n")

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from being split or interpreted.
    output = subprocess.check_output(
        ["blasr", "-bestn", "1", "-m", "1", outFile, refFile])

    direction = {}
    for line in output.strip().split("\n"):
        fields = line.split()
        if not fields:
            # blasr reported no alignment at all (or emitted a blank line).
            continue
        # blasr query names look like "<name>/<suffix>"; keep only <name>.
        rId = fields[0].split("/")[0]
        direction[rId] = "+" if fields[2] == fields[3] else "-"

    outData = []
    # The reader must be fully consumed before outFile is reopened for
    # writing, hence the buffering in outData.
    with FastaReader(outFile) as reader:
        for r in reader:
            seq = r.sequence.upper()
            if direction.get(r.name, "+") != "+":
                seq = "".join([rmap[c] for c in seq[::-1]])
            outData.append(">" + r.name)
            outData.append(seq)
    with open(outFile, "w") as of:
        of.write("\n".join(outData) + "\n")
Beispiel #2
0
    def test_runner(self):
        """Test CombineRunner."""
        ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)
        d = op.join(SIV_DATA_DIR, "test_tool_contract_chunks")
        split_dirs = [op.join(d, b, "cluster_out") for b in
                      ("0to1kb_part0", "1to2kb_part0", "2to3kb_part0", "3to4kb_part0", "4to5kb_part0")]
        print split_dirs
        out_combined_dir = op.join(OUT_DIR, "test_CombineUtils", "combined_dir")
        rmpath(out_combined_dir)
        mkdir(out_combined_dir)
        obj = CombineRunner(combined_dir=out_combined_dir,
                            sample_name="mysample",
                            split_dirs=split_dirs,
                            ipq_opts=ipq_opts)
        obj.run()

        expected_out_fns = (obj.all_hq_fa, obj.all_hq_fq, obj.all_lq_fa, obj.all_lq_fq,
                            obj.all_consensus_isoforms_fa,
                            obj.all_cluster_report_fn, obj.all_cluster_summary_fn)
        self.assertTrue(all([op.exists(f) for f in expected_out_fns]))

        expected_hq_isoforms = ['i1_HQ_mysample|c0/f2p16/1826', 'i2_HQ_mysample|c2/f9p14/2470',
                                'i2_HQ_mysample|c5/f7p19/2472', 'i2_HQ_mysample|c10/f8p16/2457',
                                'i2_HQ_mysample|c98/f2p10/2081', 'i2_HQ_mysample|c108/f23p28/2471']
        self.assertEqual([r.name.split(' ')[0] for r in FastaReader(obj.all_hq_fa)], expected_hq_isoforms)
        self.assertEqual([r.name.split(' ')[0] for r in FastqReader(obj.all_hq_fq)], expected_hq_isoforms)

        expected_lq_isoforms_num = 73
        self.assertEqual(len([r for r in FastaReader(obj.all_lq_fa)]), expected_lq_isoforms_num)

        expected_consensus_isoforms_num = 79
        self.assertEqual(len([r for r in FastaReader(obj.all_consensus_isoforms_fa)]), expected_consensus_isoforms_num)
Beispiel #3
0
    def writeSummary(self, fa, summary_fn, hq_fa=None, lq_fa=None):
        """Tally isoform statistics into self.summary and write summary_fn.

        For every record in fa, self.summary.numConsensusIsoforms is
        incremented and the record's sequence length is added to
        self.summary.numTotalBases.

        If hq_fa (polished high-quality isoforms) is not None, its record
        count is stored in self.summary.num_polished_hq_isoforms.
        If lq_fa (polished low-quality isoforms) is not None, its record
        count is stored in self.summary.num_polished_lq_isoforms.

        A ZeroDivisionError raised along the way is logged and re-raised as
        ClusterException.
        """
        def count_records(fasta_fn):
            # Number of records in a FASTA file.
            with FastaReader(fasta_fn) as records:
                return sum(1 for _ in records)

        try:
            with FastaReader(fa) as consensus:
                for rec in consensus:
                    self.summary.numConsensusIsoforms += 1
                    self.summary.numTotalBases += len(rec.sequence)
            if hq_fa is not None:
                self.summary.num_polished_hq_isoforms = count_records(hq_fa)
            if lq_fa is not None:
                self.summary.num_polished_lq_isoforms = count_records(lq_fa)
            self.summary.write(summary_fn)
        except ZeroDivisionError:
            errMsg = "No consensus isoforms predicted."
            self.add_log(errMsg, level=logging.ERROR)
            raise ClusterException(errMsg)
Beispiel #4
0
    def run(self):
        """Write the reads of self.inFN that satisfy self.rules to self.outFN.

        Each record name is parsed with ReadAnnotation.fromString and tested
        with self.satisfy.  When self.printReadLengthOnly is set, only the
        sequence length of each accepted read is written (one per line);
        otherwise the accepted records are written as FASTA.
        """
        infoMsg = "Extracting reads from {f} based on ".format(f=self.inFN) + \
            "rules(FullLength={fl}, nonChimeric={nc}).".format(
                fl="false" if self.rules.FL == 0 else "true",
                nc="false" if self.rules.nonChimeric == 0 else "true")
        logging.info(infoMsg)

        if self.printReadLengthOnly:
            # Lengths only: no read names or sequences in the output.
            with FastaReader(self.inFN) as reader, \
                    open(self.outFN, 'w') as out:
                for rec in reader:
                    parsed = ReadAnnotation.fromString(
                        rec.name, self.ignore_polyA)
                    if self.satisfy(parsed, self.rules):
                        out.write("{rl}\n".format(rl=len(rec.sequence)))
            return

        with FastaReader(self.inFN) as reader, \
                FastaWriter(self.outFN) as out:
            for rec in reader:
                parsed = ReadAnnotation.fromString(
                    rec.name, self.ignore_polyA)
                if self.satisfy(parsed, self.rules):
                    out.writeRecord(rec.name, rec.sequence)
Beispiel #5
0
    def write_summary(self, summary_fn, isoforms_fa, hq_fa=None, lq_fa=None):
        """Build a ClusterSummary from FASTA files and write it to summary_fn.

        The number of records in isoforms_fa and the sum of their sequence
        lengths are stored as numConsensusIsoforms and numTotalBases.

        If hq_fa (polished high-quality isoforms) is not None, its record
        count is stored as num_polished_hq_isoforms.
        If lq_fa (polished low-quality isoforms) is not None, its record
        count is stored as num_polished_lq_isoforms.

        A ZeroDivisionError raised along the way is logged and re-raised as
        RuntimeError.
        """
        def count_records(fasta_fn):
            # Number of records in a FASTA file.
            with FastaReader(fasta_fn) as records:
                return sum(1 for _ in records)

        self.add_log("Writing a summary to {f}".format(f=summary_fn),
                     level=logging.INFO)
        try:
            summary = ClusterSummary()

            with FastaReader(isoforms_fa) as isoforms:
                for rec in isoforms:
                    summary.numConsensusIsoforms += 1
                    summary.numTotalBases += len(rec.sequence)

            if hq_fa is not None:
                summary.num_polished_hq_isoforms = count_records(hq_fa)
            if lq_fa is not None:
                summary.num_polished_lq_isoforms = count_records(lq_fa)
            summary.write(summary_fn)
        except ZeroDivisionError:
            errMsg = "No consensus isoforms predicted."
            self.add_log(errMsg, level=logging.ERROR)
            raise RuntimeError(errMsg)
Beispiel #6
0
    def __init__(self, isoseq_output_fn, reference_transcripts_fn,
                 output_analysis_fn, min_true_positive, max_false_positive,
                 min_seq_similarity, max_fuzzy_junction):
        """Load isoforms and reference transcripts, then compute alignments.

        Args:
            isoseq_output_fn: isoform file; must end in .fasta/.fa or
                .fastq/.fq.  FASTQ input is additionally converted to
                ``<output_analysis_fn>.isoseq.fa``.
            reference_transcripts_fn: FASTA file of reference transcripts.
            output_analysis_fn: output path; also the prefix of the
                converted FASTA written for FASTQ input.
            min_true_positive: stored as-is.
            max_false_positive: stored as-is.
            min_seq_similarity: values greater than 1 are treated as
                percentages and divided by 100.
            max_fuzzy_junction: stored as-is.

        Raises:
            ValueError: if isoseq_output_fn has an unsupported extension.
        """
        self.isoseq_output_fn = isoseq_output_fn
        self.reference_transcripts_fn = reference_transcripts_fn
        self.output_analysis_fn = output_analysis_fn

        if isoseq_output_fn.endswith((".fasta", ".fa")):
            self.isoforms = [r for r in FastaReader(isoseq_output_fn)]
            self.isoseq_output_fa = self.isoseq_output_fn
        elif isoseq_output_fn.endswith((".fastq", ".fq")):
            self.isoforms = [r for r in FastqReader(isoseq_output_fn)]
            self.isoseq_output_fa = self.output_analysis_fn + ".isoseq.fa"
            with FastaWriter(self.isoseq_output_fa) as writer:
                for r in self.isoforms:
                    writer.writeRecord(r.name, r.sequence)
        else:
            # Without this branch self.isoforms / self.isoseq_output_fa are
            # never set and the failure surfaces later as an AttributeError.
            raise ValueError(
                "Unsupported isoform file extension (expected .fasta, .fa, "
                ".fastq or .fq): %s" % isoseq_output_fn)

        self.reference_transcripts = [
            r for r in FastaReader(reference_transcripts_fn)
        ]

        self.min_true_positive = min_true_positive
        self.max_false_positive = max_false_positive
        self.min_seq_similarity = min_seq_similarity if min_seq_similarity <= 1 \
                                  else min_seq_similarity / 100.0
        self.max_fuzzy_junction = max_fuzzy_junction

        self.alns = self.filter_alns(
            self.map_isoforms_to_reference_transcripts())
Beispiel #7
0
def _firstFastaRecord(fastaPath):
    """Return the first record in fastaPath, or None if it holds no records."""
    with FastaReader(fastaPath) as reader:
        return next(iter(reader), None)


def gconFunc(tp):
    """Build a consensus for one barcode with gcon and, optionally, quiver.

    Args:
        tp: a ``(rootDir, barcode)`` tuple; all work happens in
            ``rootDir/barcode``.

    Returns:
        A FastaRecord named after the barcode that carries the first
        consensus sequence, or None when gcon produced no (or an empty)
        consensus or the final consensus file is missing or has no records.
    """
    # Takes a single tuple argument because it is dispatched through
    # multiprocessing.
    rootDir, barcode = tp
    bcdir = "/".join((rootDir, barcode))

    ## call gcon
    logging.info("In gconFunc for: %s" % barcode)

    cmd = "gcon.py r --min_cov 3 %s/subreads.fasta %s/seed_read.fasta -d %s" % \
        (bcdir, bcdir, bcdir)
    subprocess.call(cmd, shell=True)

    ## check to see if the file is empty: either no record at all, or a
    ## record with an empty sequence
    firstRecord = _firstFastaRecord("%s/g_consensus.fa" % bcdir)
    if firstRecord is None or not firstRecord.sequence:
        return None

    ## check to see if we are going to run quiver
    if not runner.args.noQuiver:
        # setup the blasr / sam / quiver stuff.
        logging.info("Setup regions file, now running blasr through quiver.")

        quiverCmds = [
            ('blasr %s %s/g_consensus.fa -nproc 1 -sam -regionTable %s/region.fofn -out '
             '%s/aligned_reads.sam') % (runner.args.inputFofn, bcdir, bcdir, bcdir),
            'samtoh5 %s/aligned_reads.sam %s/g_consensus.fa %s/aligned_reads.cmp.h5' %
            (bcdir, bcdir, bcdir),
            ('loadPulses %s %s/aligned_reads.cmp.h5 -byread -metrics '
             'QualityValue,InsertionQV,MergeQV,DeletionQV,DeletionTag,SubstitutionTag,'
             'SubstitutionQV') % (runner.args.inputFofn, bcdir),
            'cmph5tools.py sort --inPlace %s/aligned_reads.cmp.h5' % bcdir,
            ('quiver -vv --algorithm quiver -p P4-C2.AllQVsMergingByChannelModel '
             '%s/aligned_reads.cmp.h5 --outputFilename %s/q_consensus.fasta '
             '--referenceFilename %s/g_consensus.fa') % (bcdir, bcdir, bcdir),
        ]
        for cmd in quiverCmds:
            logging.debug(cmd)
            subprocess.call(cmd, shell=True)
        cFilename = 'q_consensus.fasta'
    else:
        cFilename = 'g_consensus.fa'

    ## append results to output file.
    bcCons = "%s/%s/%s" % (rootDir, barcode, cFilename)
    if not os.path.exists(bcCons):
        return None
    consensus = _firstFastaRecord(bcCons)
    if consensus is None:
        # quiver can leave an empty file behind; treat it like "no result".
        return None
    return FastaRecord(barcode, consensus.sequence)
Beispiel #8
0
 def __init__(self, file_name):
     """Open file_name with the reader that matches its extension.

     .fa/.fasta files are read with FastaReader, .bam files with
     openDataFile, and anything else must be a .xml dataset opened with
     openDataSet.  self._is_fasta is True for FASTA files and for XML
     datasets that are ContigSet instances.
     """
     self.file_name = file_name
     self._is_fasta = False
     self.ext = op.splitext(file_name)[1].upper()
     if self.ext == ".BAM":
         self._dataset = openDataFile(file_name)
     elif self.ext in (".FA", ".FASTA"):
         self._dataset = FastaReader(file_name)
         self._is_fasta = True
     else:
         # Only contigset.xml or consensusreadset.xml are expected here.
         assert self.ext == ".XML"
         self._dataset = openDataSet(file_name)
         self._is_fasta = isinstance(self._dataset, ContigSet)
Beispiel #9
0
def create_chimeras(input_file,
                    output=None,
                    reference_file=None,
                    alignment_file=None):
    """Create chimeric sequences from input_file and write them as FASTA.

    The alignments are grouped by locus and filtered, and the resulting
    groups together with the sequences of input_file are handed to
    ``_create_chimeras``; whatever it yields is written with ``write_fasta``.

    Args:
        input_file: FASTA file holding the sequences to work on.
        output: destination FASTA; defaults to input_file with its
            extension replaced by ``.chimeras.fasta``.
        reference_file: reference used to align input_file when no
            alignment_file is supplied.
        alignment_file: precomputed alignment readable by BlasrReader.

    Returns:
        The path of the written output file.

    Raises:
        IOError: if neither an alignment nor a reference is supplied.
    """
    import os.path

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error(msg)
            raise IOError(msg)
        alignment_file = align_best_reference(input_file, reference_file)
    # Set the output file if not specified.  splitext only strips a real
    # extension, so dots in directory names or extension-less inputs are safe.
    if output is None:
        basename = os.path.splitext(input_file)[0]
        output = '%s.chimeras.fasta' % basename
    # Parse the alignment data and extract the target sequences
    alignments = list(BlasrReader(alignment_file))
    groups = _group_by_locus(alignments)
    groups = _filter_groups(groups)
    sequences = list(FastaReader(input_file))
    chimeras = list(_create_chimeras(groups, sequences))
    write_fasta(chimeras, output)
    return output
Beispiel #10
0
def makeBarcodeH5FromBasH5(basH5):
    """Label the ZMWs of a base H5 file and write a barcode H5 file.

    All options come from ``runner.args``.  The output file name is the base
    name of ``basH5.filename`` with BAS_PLS_REGEX replaced by BARCODE_EXT,
    placed under ``runner.args.outDir``.

    Returns the path of the file that was written.
    """
    scorer = BarcodeScorer(basH5,
                           FastaReader(runner.args.barcodeFile),
                           runner.args.adapterSidePad,
                           runner.args.insertSidePad,
                           scoreMode=runner.args.scoreMode,
                           maxHits=runner.args.maxAdapters,
                           scoreFirst=runner.args.scoreFirst,
                           startTimeCutoff=runner.args.startTimeCutoff)

    # A negative nZmws means "use every sequencing ZMW".
    if runner.args.nZmws >= 0:
        zmws = basH5.sequencingZmws[0:runner.args.nZmws]
    else:
        zmws = basH5.sequencingZmws

    logging.debug("Labeling %d ZMWs from: %s" % (len(zmws), basH5.filename))
    labeledZmws = scorer.labelZmws(zmws)
    logging.debug("Labeled %d ZMWs" % len(labeledZmws))

    inputBase = os.path.basename(basH5.filename)
    outBase = re.sub(BAS_PLS_REGEX, BARCODE_EXT, inputBase)
    outFile = '/'.join([runner.args.outDir, outBase])
    logging.debug("Writing to: %s" % outFile)

    writeBarcodeH5(labeledZmws, scorer, outFile, runner.args.saveExtendedInfo)
    return outFile
Beispiel #11
0
def fasta_to_plot_group(fasta_file, output_dir):
    """Plot a histogram of sequence lengths and wrap it in a PlotGroup.

    The figure is saved as ``sequence_length_hist.png`` inside output_dir;
    the returned PlotGroup references the image by file name only.
    """
    with FastaReader(fasta_file) as reader:
        lengths = [len(rec.sequence) for rec in reader]

    from pbreports.plot.helper import get_fig_axes  #pylint: disable=import-error
    from pbcommand.models.report import PlotGroup, Plot
    fig, ax = get_fig_axes()

    hist_kwargs = {}
    if len(lengths) == 1:
        # A single value gets an explicit one-unit window on either side.
        only = lengths[0]
        hist_kwargs["range"] = (only - 1, only + 1)
    ax.hist(lengths, **hist_kwargs)

    ax.set_title("Sequence Length Histogram")
    ax.set_xlabel("Sequence Length")

    name = "sequence_length_hist.png"
    fig.savefig(os.path.join(output_dir, name))
    return PlotGroup("reference_hist", "Sequence Lengths",
                     plots=[Plot("sequence_lengths", name)])
Beispiel #12
0
def _extract_sequences(project, contigs):
    """Yield records of the project's final sequences named in contigs.

    Records are read from
    ``<project>/results/AmpliconAssembly/Final_Sequences.fasta``; a record is
    yielded when the first whitespace-delimited token of its name is in
    contigs.
    """
    fasta_path = os.path.join(project, 'results', 'AmpliconAssembly',
                              'Final_Sequences.fasta')
    for rec in FastaReader(fasta_path):
        if rec.name.split()[0] not in contigs:
            continue
        yield rec
Beispiel #13
0
    def run_after(self, rtc, output_dir):
        self.assertTrue(op.exists(rtc.task.output_files[0]))

        out_dir = op.join(OUT_DIR, "test_gather_polished_isoforms_in_each_bin")
        cluster_out_dirs = [
            op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES
        ]
        out_hq_fns = [
            op.join(d, fn) for d in cluster_out_dirs for fn in HQ_ISOFORMS_FNS
        ]
        print "out_hq_fns %s" % out_hq_fns
        self.assertTrue(all([op.exists(f) for f in out_hq_fns]))

        out_lq_fns = [
            op.join(d, fn) for d in cluster_out_dirs for fn in LQ_ISOFORMS_FNS
        ]
        print "out_lq_fns %s" % out_lq_fns
        self.assertTrue(all([op.exists(f) for f in out_lq_fns]))

        print "out_lq_fa %s is not empty" % out_lq_fns[0]
        n = len([r for r in FastaReader(out_lq_fns[0])])
        self.assertTrue(n > 0)

        out_logs = [
            IceFiles(prog_name="", root_dir=d).submitted_quiver_jobs_log
            for d in cluster_out_dirs
        ]
        print "out_logs %s" % out_logs
        self.assertTrue(all([op.exists(f) for f in out_logs]))
Beispiel #14
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--assembly", help="assembled contigs")
    #parser.add_argument("-m","--mapping", help="mapping of read to contigs in bam format")
    #parser.add_argument("-d","--dir",help="output directory for results",default='out')
    args = parser.parse_args()
    RF = 'AAGCTT'
    f = FastaReader(args.assembly)
    for record in f:

        id, seq = record.id, str(record.sequence)

        pos = [m.start(0) for m in re.finditer(RF, seq)]

        length = len(seq)

        left_count = 0
        rigt_count = 0
        for each in pos:
            if each < length / 2:
                left_count += 1
            else:
                rigt_count += 1

        print id, left_count, rigt_count
Beispiel #15
0
def run_fasta_filter(fasta_in, fasta_out, min_seq_length):
    """Copy records longer than min_seq_length from fasta_in to fasta_out.

    Records whose sequence length is exactly min_seq_length are dropped.
    Always returns 0.
    """
    with FastaWriter(fasta_out) as writer, FastaReader(fasta_in) as reader:
        for rec in reader:
            if len(rec.sequence) <= min_seq_length:
                continue
            writer.writeRecord(rec)

    return 0
Beispiel #16
0
    def testSplit(self):
        """Test FastaSplitter.split().

        The concatenation of all split files must reproduce the 22 reads of
        the input FASTA in the same order.
        """
        splitter = FastaSplitter(self.input_fasta, 2, self.out_dir,
                                 "testFastaSplitter_split_")
        splitter.split()

        split_reads = []
        for out_fn in splitter.out_fns:
            self.assertTrue(op.exists(out_fn))
            with FastaReader(out_fn) as reader:
                split_reads += [(rec.name, rec.sequence) for rec in reader]
        splitter.rmOutFNs()

        with FastaReader(self.input_fasta) as reader:
            original_reads = [(rec.name, rec.sequence) for rec in reader]
        self.assertTrue(len(original_reads) == 22)
        self.assertTrue(split_reads == original_reads)
Beispiel #17
0
def isValidFasta(filename):
    """Return True if filename is a FASTA file that parses completely.

    The file must pass ``isValidFile`` and ``isFastaFile`` and FastaReader
    must be able to iterate over every record without raising.
    """
    if not isValidFile(filename) or not isFastaFile(filename):
        return False
    try:
        # Stream through the records instead of materialising them; only the
        # absence of a parse error matters.
        with FastaReader(filename) as reader:
            for _ in reader:
                pass
    except Exception:
        # Any parse/IO failure means "not valid".  KeyboardInterrupt and
        # SystemExit are deliberately allowed to propagate.
        return False
    return True
Beispiel #18
0
def get_the_only_fasta_record(fa):
    """Return the single record of FASTA file fa.

    Raises ValueError when the file holds zero records or more than one.
    """
    records = list(FastaReader(fa))
    if len(records) == 1:
        return records[0]
    raise ValueError(
        "Cluster fasta file {fa} must contain only one read.".format(fa=fa))
 def test_readFasta(self):
     """Check the record count and the first record of the test FASTA."""
     expected_sequence = (
         "TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTT"
         "TGAAGATCATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCC"
         "TTGGGAATTTGGAAATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTA"
         "AAGGTTGGTGACTTTGATTTTCCT")
     records = list(FastaReader(data.getFasta()))
     assert len(records) == 48
     assert records[0].header == "ref000001|EGFR_Exon_2"
     assert records[0].sequence == expected_sequence
Beispiel #20
0
    def _findCliques(self, alignGraph, readsFa):
        """
        Find all mutually exclusive cliques within the graph, with decreased
        size.

        alignGraph - a graph, each node represent a read and each edge
        represents an alignment between two end points.  Nodes that end up
        in a clique are removed from alignGraph (it is modified in place).

        readsFa - FASTA file of reads; the first whitespace-delimited token
        of each record name is used as the read id.

        Return a dictionary of clique indices and nodes.
            key = index of a clique
            value = nodes within a clique
        Cliques are ordered by their size descendingly: index up, size down
        Reads which are not included in any cliques will be added as cliques
        of size 1.
        """
        uc = {}  # To keep cliques found
        # Nodes within any clique.  Only membership is ever tested, so a set
        # keeps the final pass over the reads linear instead of quadratic.
        used = set()
        ind = 0  # index of clique to discover

        deg = alignGraph.degree().items()
        # Sort tuples of (node, degree) by degree, descendingly
        deg.sort(key=lambda x: x[1], reverse=True)
        for d in deg:
            node = d[0]  # node which has the largest degree in alignGraph
            if node not in alignGraph:
                # Already removed as a member of an earlier clique.
                continue
            # just get the immediate neighbors since we're looking for perfect
            # cliques
            subGraph = alignGraph.subgraph([node] + alignGraph.neighbors(node))
            subNodes = subGraph.nodes()
            # Convert from networkx.Graph to a sparse matrix
            S, H = pClique.convert_graph_connectivity_to_sparse(
                subGraph, subNodes)
            # index of the 'node' in the sub-graph
            seed_i = subNodes.index(node)
            # Grasp a clique from subGraph, and return indices of clique nodes
            # setting gamma=0.8 means to find quasi-0.8-cliques!
            tQ = pClique.grasp(S,
                               H,
                               gamma=0.8,
                               maxitr=5,
                               given_starting_node=seed_i)
            if len(tQ) > 0:
                c = [subNodes[i] for i in tQ]  # nodes in the clique
                uc[ind] = c  # Add the clique to uc
                ind += 1
                used.update(c)  # Add clique nodes to used
                # Remove clique nodes from alignGraph and continue
                alignGraph.remove_nodes_from(c)

        # Every read that is not part of a clique becomes a singleton clique.
        with FastaReader(readsFa) as reader:
            for r in reader:
                rid = r.name.split()[0]
                if rid not in used:
                    uc[ind] = [rid]
                    ind += 1
        return uc
Beispiel #21
0
def get_fasta_stats(fasta, genome_size):
    """Calculate basic length statistics for the sequences in a FASTA file.

    Args:
        fasta: path of the FASTA file.
        genome_size: expected genome size, or None.  When given, the
            NG50/NG90/NG95 values are added to the result.

    Returns:
        A dict with the keys asm_contigs, asm_total_bp, asm_esize, asm_min,
        asm_max, asm_mean, asm_median, asm_n50, asm_n90 and asm_n95, plus
        asm_ng50, asm_ng90 and asm_ng95 when genome_size is not None.  An
        N*/NG* value is None when the threshold is never reached.

    Raises:
        IndexError: if the FASTA file contains no records.
    """
    lengths = [len(record.sequence) for record in FastaReader(fasta)]
    lengths.sort(reverse=True)
    asm_contigs = len(lengths)
    asm_total_bp = sum(lengths)

    def get_nstat(lens, stat, genome_size=None):
        """Return the length at which the remaining bases drop to
        ``stat`` * total; ``lens`` must be sorted in descending order."""
        total = sum(lens) if genome_size is None else genome_size
        limit = total * stat
        for num in lens:
            total -= num
            if total <= limit:
                return num
        return None

    # N50 means half of the bases remain, N90 means 10% remain, and so on.
    asm_n50 = get_nstat(lengths, 0.50)
    asm_n90 = get_nstat(lengths, 0.10)
    asm_n95 = get_nstat(lengths, 0.05)

    asm_min = lengths[-1]
    asm_max = lengths[0]
    asm_mean = asm_total_bp / asm_contigs
    # Median: the middle element for odd counts, the mean of the two middle
    # elements for even counts (both indices coincide when the count is odd).
    lower_mid = lengths[(asm_contigs - 1) // 2]
    upper_mid = lengths[asm_contigs // 2]
    asm_median = int((lower_mid + upper_mid) / 2)
    asm_esize = sum([x * x for x in lengths]) / asm_total_bp

    fasta_stats = {
        'asm_contigs': asm_contigs,
        'asm_total_bp': asm_total_bp,
        'asm_esize': asm_esize,
        'asm_min': asm_min,
        'asm_max': asm_max,
        'asm_mean': asm_mean,
        'asm_median': asm_median,
        'asm_n50': asm_n50,
        'asm_n90': asm_n90,
        'asm_n95': asm_n95
    }

    if genome_size is not None:
        asm_ng50 = get_nstat(lengths, 0.50, genome_size)
        asm_ng90 = get_nstat(lengths, 0.10, genome_size)
        asm_ng95 = get_nstat(lengths, 0.05, genome_size)

        fasta_stats.update({
            'asm_ng50': asm_ng50,
            'asm_ng90': asm_ng90,
            'asm_ng95': asm_ng95
        })

    return fasta_stats
Beispiel #22
0
def fasta_movie_counts( fasta ):
    """Count the records of a FASTA file per movie.

    The movie is the part of the record name before the first underscore.
    The returned dict maps each movie to its record count and additionally
    holds the total number of records under the key 'all'.
    """
    tally = {'all': 0}
    for rec in FastaReader( fasta ):
        movie_name = rec.name.split('_')[0]
        tally['all'] += 1
        tally[movie_name] = tally.get(movie_name, 0) + 1
    return tally
 def _createEntryFromFile(self):
     """Populate this entry from the FASTA file at self._path.

     The entry id is the file's base name without its extension; one
     ReferenceContig is created per record, named and identified by the
     record's tag.
     """
     file_base = os.path.basename(self._path)
     self._id = os.path.splitext(file_base)[0]
     self._info = ReferenceInfo(self)
     self._info._file = self._path
     self._contigs = []
     for record in FastaReader(self._path):
         new_contig = ReferenceContig(self)
         new_contig._name = record.getTag()
         new_contig._id = new_contig._name
         self._contigs.append(new_contig)
Beispiel #24
0
def main():
    """Cut contigs at breakpoints and write the pieces plus their lengths.

    The breakpoint file is a sequence of sections: a line with a single
    token names a contig, and each following line with two or more tokens
    gives the ``start end`` coordinates of one piece of that contig.  Every
    piece is written to the --outfile FASTA under the name
    ``<contig>_<start>_<end>`` and its length to --lenfile as
    ``name<TAB>length``.

    Raises:
        KeyError: if the breakpoint file names a contig that is not in the
            assembly.
        ValueError: if coordinates appear before any contig name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-b",
                        "--breakpoint",
                        help="file containing breakpoints")
    parser.add_argument("-a",
                        "--assembly",
                        help="fasta file containing contigs")
    parser.add_argument("-o", "--outfile", help="new assembly file")
    parser.add_argument("-l", "--lenfile", help="length of contigs")

    args = parser.parse_args()

    # Load every contig first so that the breakpoint file may refer to the
    # contigs in any order.
    id2seq = {}
    with FastaReader(args.assembly) as reader:
        for record in reader:
            # NOTE(review): the last 10 bases of every contig are dropped
            # here, as in the original code -- confirm this is intended.
            id2seq[record.id] = record.sequence[0:-10]

    new_seq = {}
    lenmap = {}
    curr_contig = None
    seq = None
    with open(args.breakpoint, 'r') as breakpoints:
        for line in breakpoints:
            attrs = line.split()
            if not attrs:
                continue  # tolerate blank lines
            if len(attrs) == 1:
                curr_contig = attrs[0]
                seq = id2seq[curr_contig]
                continue
            if curr_contig is None:
                raise ValueError(
                    "Breakpoint coordinates found before any contig name: %r"
                    % line)
            start = int(attrs[0])
            end = int(attrs[1])
            new_id = curr_contig + '_' + attrs[0] + '_' + attrs[1]
            new_seq[new_id] = seq[start:end]
            # NOTE(review): recorded as end - start + 1 although the slice
            # above holds end - start bases; kept as in the original.
            lenmap[new_id] = end - start + 1

    # The parser defines --outfile; there is no "scaffold" option.
    with FastaWriter(args.outfile) as writer:
        for key in new_seq:
            writer.writeRecord(key, new_seq[key])

    with open(args.lenfile, 'w') as lenfile:
        for key in lenmap:
            lenfile.write(key + "\t" + str(lenmap[key]) + '\n')
Beispiel #25
0
def fasta_to_report(fasta_file, output_json):
    """Build a "fasta_report" Report for fasta_file.

    The report carries one attribute, the number of records in the file,
    and the plot groups returned by try_fasta_to_plot_group.
    """
    with FastaReader(fasta_file) as reader:
        nrecords = sum(1 for _ in reader)

    return Report(
        "fasta_report",
        attributes=[Attribute("num_records", nrecords, "Number of Records")],
        plotgroups=try_fasta_to_plot_group(fasta_file, output_json))
Beispiel #26
0
def get_fasta_readlengths(fasta_file):
    """
    Get the sequence lengths of a FASTA file, sorted ascending
    :return: (list) one length per record
    """
    with FastaReader(fasta_file) as reader:
        return sorted(len(rec.sequence) for rec in reader)
    def test_dosLineEndingsFasta(self):
        """FastaReader and IndexedFastaReader must agree on a DOS-formatted
        FASTA: same record count, headers and sequences."""
        plain_entries = list(FastaReader(data.getDosFormattedFasta()))
        indexed_entries = list(IndexedFastaReader(data.getDosFormattedFasta()))

        assert_equal(len(plain_entries), len(indexed_entries))
        for plain, indexed in zip(plain_entries, indexed_entries):
            assert_equal(plain.header, indexed.header)
            # Slice the indexed record's sequence before comparing it.
            assert_equal(plain.sequence, indexed.sequence[:])
Beispiel #28
0
 def test_readFasta(self):
     """Check the record count and the first record's name, sequence and
     md5 in the test FASTA."""
     expected_sequence = (
         "TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTT"
         "TGAAGATCATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCC"
         "TTGGGAATTTGGAAATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTA"
         "AAGGTTGGTGACTTTGATTTTCCT")
     records = list(FastaReader(data.getFasta()))
     assert_equal(48, len(records))
     assert_equal("ref000001|EGFR_Exon_2", records[0].name)
     assert_equal(expected_sequence, records[0].sequence)
     assert_equal("e3912e9ceacd6538ede8c1b2adda7423", records[0].md5)
Beispiel #29
0
 def __len__(self):
     """Return the number of records in the wrapped input.

     Non-FASTA datasets report their own length.  A FASTA file is counted
     by streaming through a fresh reader (closed afterwards), so the
     iteration state of self._dataset is left untouched.  A ContigSet is
     counted across all of its resource readers.
     """
     if not self._is_fasta:
         return len(self._dataset)
     if self.ext in [".FA", ".FASTA"]:
         with FastaReader(self.file_name) as reader:
             return sum(1 for _ in reader)
     # contigset
     return sum(sum(1 for _ in rr)
                for rr in self._dataset.resourceReaders())
Beispiel #30
0
def get_subset_reads(fasta_fn, cluster_dict, cluster_index, out_file_name):
    """Write the reads that belong to one cluster to a FASTA file.

    Args:
        fasta_fn: input FASTA file.
        cluster_dict: maps a cluster index to a container of read names.
        cluster_index: key of the cluster to extract.
        out_file_name: output file; a record is written (name and sequence
            unchanged) when its name is in ``cluster_dict[cluster_index]``.
    """
    with FastaReader(fasta_fn) as reader, open(out_file_name, "w") as out_f:
        for r in reader:
            if r.name in cluster_dict[cluster_index]:
                out_f.write(">" + r.name + "\n")
                out_f.write(r.sequence + "\n")
Beispiel #31
0
 def __init__(self, file_name):
     """Pick a reader for file_name based on its upper-cased extension.

     FASTA (.fa/.fasta) goes through FastaReader, .bam through
     openDataFile, and everything else is asserted to be .xml and opened
     with openDataSet.  self._is_fasta ends up True for FASTA files and for
     XML datasets that are ContigSet instances, False otherwise.
     """
     extension = op.splitext(file_name)[1].upper()
     self.file_name = file_name
     self._is_fasta = False
     self.ext = extension
     if extension in (".FA", ".FASTA"):
         self._dataset = FastaReader(file_name)
         self._is_fasta = True
         return
     if extension == ".BAM":
         self._dataset = openDataFile(file_name)
         return
     # either contigset.xml or consensusreadset.xml
     assert self.ext == ".XML"
     self._dataset = openDataSet(file_name)
     if isinstance(self._dataset, ContigSet):
         self._is_fasta = True
Beispiel #32
0
class CCSInput(object):
    """
    Wrapper class for handling multiple formats specifying CCS sequences.
    The old convention was to use .fasta, but we would like to be able to pass
    the classifier a ConsensusReadSet (i.e. .bam files) instead for use within
    pbsmrtpipe.
    """
    def __init__(self, file_name):
        self.file_name = file_name
        self._is_fasta = False
        self.ext = op.splitext(file_name)[1].upper()
        if self.ext in [".FA", ".FASTA"]:
            self._dataset = FastaReader(file_name)
            self._is_fasta = True
        elif self.ext == ".BAM":
            self._dataset = openDataFile(file_name)
        else: # either contigset.xml or consensusreadset.xml
            assert self.ext == ".XML"
            self._dataset = openDataSet(file_name)
            if isinstance(self._dataset, ContigSet):
                self._is_fasta = True

    def __iter__(self):
        for rec in self._dataset:
            if not self._is_fasta:
                rec = CCSBamSequence(rec.peer)
            yield rec

    def close(self):
        """Close all datasets."""
        self._dataset.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __len__(self):
        if not self._is_fasta:
            return len(self._dataset)
        else:
            if self.ext in [".FA", ".FASTA"]:
                return len([r for r in FastaReader(self.file_name)])
            else: # contigset
                n = 0
                for rr in self._dataset.resourceReaders():
                    n += len([r for r in rr])
                return n

    def __delitem__(self, dummy_name):
        raise NotImplementedError("%s.%s" % (self.__class__.__name__,
                                             "__delitem__"))

    def __setitem__(self, dummy_index, dummy_name):
        raise NotImplementedError("%s.%s" % (self.__class__.__name__,
                                             "__setitem__"))

    def __getitem__(self, key):
        raise NotImplementedError("%s.%s" % (self.__class__.__name__,
                                             "__getitem__"))
Beispiel #33
0
#! /usr/bin/env python
"""Write the longest N-free stretch of a FASTA record to human_chr14.fa.

Usage: script.py <input.fasta>

Only the LAST record of the input is analysed.  The length of the longest
stretch is printed and the stretch itself is written, without a header or
trailing newline, to ``human_chr14.fa`` in the current directory.
"""
import sys
from pbcore.io import FastaReader

# Walk through the file keeping only the final record.
last_record = None
with FastaReader(sys.argv[1]) as reader:
    for record in reader:
        last_record = record

if last_record is None:
    sys.exit("No FASTA records found in %s" % sys.argv[1])

# Split on upper-case 'N' only; max() keeps the first of equally long
# stretches, matching a strict "longer than" scan.
longest_stretch = max(last_record.sequence.split('N'), key=len)

print(len(longest_stretch))

with open("human_chr14.fa", "w") as out_file:
    out_file.write(longest_stretch)