Code example #1
File: align_pairs.py Project: santayana/cgat
def iterate_list(infile, idx1, idx2=None):

    fasta1 = IndexedFasta.IndexedFasta(idx1)
    if idx2 is None:
        fasta2 = fasta1
    else:
        fasta2 = IndexedFasta.IndexedFasta(idx2)

    first = True
    for line in infile:
        if line[0] == "#":
            continue
        id1, id2 = line[:-1].split("\t")[:2]

        try:
            yield AlignedPairs.UnalignedPair(
                token1=id1,
                sequence1=fasta1.getSequence(id1),
                token2=id2,
                sequence2=fasta2.getSequence(id2))
        except KeyError as msg:
            if first:
                first = False
                continue
            raise KeyError(msg)
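
A minimal sketch of the IndexedFasta round trip this example depends on, using only calls that appear elsewhere on this page (createDatabase, getSequence); the temporary path and toy sequence are made up for illustration:

from CGAT import IndexedFasta
import tempfile

# build an index from an iterator of (title, sequence) tuples
tmp = tempfile.NamedTemporaryFile()
tmp.close()
IndexedFasta.createDatabase(tmp.name, iter([("chr1", "ACGT" * 100)]))

# open the index and fetch a forward-strand slice
fasta = IndexedFasta.IndexedFasta(tmp.name)
print(fasta.getSequence("chr1", "+", 0, 10))
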
Code example #2
File: chain2stats.py Project: jmadzo/cgat
class CounterOfErrors(ChainCounter):

    """class for reporting invalid contig sizes in chains"""

    header = "Contig size validation report"

    def __init__(self, options):
        self.tdb = IndexedFasta(options.dbpath + options.targetgenome)
        self.qdb = IndexedFasta(options.dbpath + options.querygenome)
        self.tcontigs = self.tdb.getContigSizes()
        self.qcontigs = self.qdb.getContigSizes()
        self.badchains = []

    def add(self, c):
        db_tsize = self.tcontigs[c.tname]
        db_qsize = self.qcontigs[c.qname]
        if c.tsize != db_tsize:
            self.badchains.append("\t".join([str(x) for x in c.atts] + [" #bad target contigsize"]))
        if c.qsize != db_qsize:
            self.badchains.append("\t".join([str(x) for x in c.atts] + [" #bad query contigsize"]))

    def report(self, options):
        report = self._wrap_header()
        if len(self.badchains) == 0:
            report.append("All chains passed validation")
        else:
            report = report + self.badchains
        self._write_report(options, report)

    def tabbed_report(self, options, E):
        if len(self.badchains) > 0:
            lines = self.badchains
        else:
            lines = ["#no bad chains found"]
        self._write_tabbed("bad_contig_sizes", lines, E)
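
The add() method above reduces to comparing the size declared in each chain against the size recorded in the index. A standalone sketch of that check, assuming a hypothetical chain-like object with tname/tsize attributes and an already opened IndexedFasta tdb:

def target_size_is_valid(chain, tdb):
    # contig sizes reported by the indexed genome, keyed by name
    db_sizes = tdb.getContigSizes()
    # a chain is flagged when its declared size disagrees with the index
    return chain.tsize == db_sizes[chain.tname]
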
Code example #3
File: snp2counts_test.py Project: logust79/cgat-apps
    def setUp(self):

        self.mExons = []

        self.mSplitCodonsNext = {}
        self.mSplitCodonsPrev = {}

        self.mSpliceSize = 4
        self.mExonSize = 100
        self.mIntronSize = 900
        self.strand = "+"
        self.mNExons = 9
        self.mOffset = 1000
        length = 0
        self.frame = 0
        self.mIncrement = self.mIntronSize + self.mExonSize

        seq = list("123" * int((self.mNExons * self.mExonSize) / 3))

        exon_id = 0

        start = self.mOffset
        for x in range(self.mNExons):

            e = GTF.Entry()
            e.contig, e.strand, e.gene_id, e.transcript_id = "chr1", "+", "gene1", "trans1"
            e.start, e.end = start, start + self.mExonSize
            e.frame = (3 - (length % 3)) % 3
            length += e.end - e.start
            self.mExons.append(e)
            if e.frame != 0:
                for y in range(0, e.frame):
                    self.mSplitCodonsPrev[start + y] = start - self.mIntronSize
                for y in range(0, 3 - e.frame):
                    self.mSplitCodonsNext[
                        start - self.mIntronSize - y - 1] = start

            exon_id += 1
            if exon_id < self.mNExons:
                p = exon_id * self.mExonSize + self.mIntronSize * (exon_id - 1)
                seq[p:p] = list("AG")
                seq[p:p] = list("T" * (self.mIntronSize - 4))
                seq[p:p] = list("GT")

            start += self.mIncrement
            # print str(e)
        # print self.mSplitCodonsNext
        # print self.mSplitCodonsPrev
        seq[0:0] = "C" * self.mOffset
        seq.append("G" * self.mOffset)
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.close()

        seq = "".join(seq)
        self.mSequence = seq
        self.contigSize = len(seq)
        IndexedFasta.createDatabase(tmpfile.name, iter([("chr1", seq), ]))
        self.mFasta = IndexedFasta.IndexedFasta(tmpfile.name)
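
The three seq[p:p] insertions above all target the same index, so each later piece lands in front of the earlier ones and the finished intron reads GT, then the T run, then AG. A tiny self-contained demonstration of that ordering:

seq = list("EXON1EXON2")
p = 5
seq[p:p] = list("AG")    # inserted first, ends up last
seq[p:p] = list("TTTT")  # pushed in front of "AG"
seq[p:p] = list("GT")    # inserted last, ends up first
print("".join(seq))      # EXON1GTTTTTAGEXON2
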
Code example #4
File: gtf2fasta_test.py Project: yangjl/cgat
    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()

        self.outfile_genome = os.path.join(self.tmpdir, "genome_in")
        self.outfile_gtf = os.path.join(self.tmpdir, "exons.gtf")
        self.outfile_output = os.path.join(self.tmpdir, "output")

        self.length = 1000

        genome = iter((("chr1", "A" * self.length), ))

        IndexedFasta.createDatabase(self.outfile_genome, genome)
        self.reference = ["g"] * self.length
Code example #5
File: gtf2fasta_test.py Project: Charlie-George/cgat
    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()

        self.outfile_genome = os.path.join(self.tmpdir, "genome_in")
        self.outfile_gtf = os.path.join(self.tmpdir, "exons.gtf")
        self.outfile_output = os.path.join(self.tmpdir, "output")

        self.length = 1000

        genome = iter((("chr1", "A" * self.length), ))

        IndexedFasta.createDatabase(self.outfile_genome, genome)
        self.reference = ["g"] * self.length
Code example #6
def buildContigSizes(infile, outfile):
    '''
    Get contig sizes from an indexed genome :term:`fasta` file and
    output them to a text file.

    Parameters
    ----------
    infile : str
      infile is constructed from the `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    Returns
    -------
    outfile : str
      outfile is a text format file that contains two columns, contig
      name and contig size (in nucleotides).  The output file name is
      defined in `PARAMS: interface_contigs`.
    '''

    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    contigs = []

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        contigs.append([contig, size])
    df_contig = pd.DataFrame(contigs, columns=['contigs', 'size'])
    df_contig.sort_values('contigs', inplace=True)
    df_contig.to_csv(outfile, sep="\t", header=False, index=False)
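
A short usage sketch for the table written above, assuming pandas and the tab-separated, header-less layout produced by to_csv; the file name is hypothetical:

import pandas as pd

# column names mirror the order written by buildContigSizes
df = pd.read_csv("contigs.tsv", sep="\t", header=None,
                 names=["contigs", "size"])
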
Code example #7
def extractSequence(infile, outfile):
    '''extract genomic sequence to be aligned against.'''

    fasta = IndexedFasta.IndexedFasta(infile[:-len(".fasta")])
    outs = open(outfile, "w")
    outs.write(">%s\n%s\n" % (CONTIG, fasta.getSequence(CONTIG, "+", START, END)))
    outs.close()
Code example #8
File: patch_translations.py Project: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/patch_translations.py 1841 2008-05-08 12:07:13Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(genome_file=None, )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    entry = PredictionParser.PredictionParserEntry()

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput = 0, 0
    for line in sys.stdin:
        if line[0] == "#":
            print line[:-1]
            continue

        entry.Read(line)

        ninput += 1

        # get genomic sequence
        genomic_sequence = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
            entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence)

        options.stdout.write(str(entry) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i\n" % (ninput, noutput))

    E.Stop()
Code example #9
File: fasta2gff.py Project: gsc0107/cgat
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: fasta2gff.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option("-f", "--fragment-size", dest="fragment_size", type="int",
                      help="fixed size of fragments [default=%default].")

    parser.add_option("-s", "--sample-size", dest="sample_size", type="int",
                      help="fixed size of fragments.")

    parser.set_defaults(
        as_gtf=False,
        genome_file=None,
        fragment_size=1000,
        sample_size=10000,
        pattern_id="%08i",
    )

    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    # both branches construct the same entry type; gtf-specific fields
    # (gene_id, transcript_id) are only set below when --as-gtf is given
    entry = GTF.Entry()

    n = 0
    entry.feature = "exon"
    entry.source = "random"

    for x in range(options.sample_size):

        entry.contig, entry.strand, entry.start, entry.end = fasta.getRandomCoordinates(
            options.fragment_size)

        if entry.strand == "-":
            l = contigs[entry.contig]
            entry.start, entry.end = l - entry.end, l - entry.start

        if options.as_gtf:
            entry.gene_id = options.pattern_id % n
            entry.transcript_id = entry.gene_id

        options.stdout.write(str(entry) + "\n")
        n += 1

    E.Stop()
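
The negative-strand branch above mirrors the sampled interval through the contig length so that coordinates stay forward-facing. A worked sketch of that transform with made-up numbers:

l = 1000               # contig length
start, end = 100, 200  # interval in reverse-strand coordinates
# mirror through the contig: the endpoints flip and swap
start, end = l - end, l - start
assert (start, end) == (800, 900)
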
Code example #10
File: pipeline_chains.py Project: logust79/cgat-flow
def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.
    '''

    outf = IOTools.openFile(outfile, "w")
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], genome))
    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outf.write("%s\t%i\n" % (contig, size))
    outf.close()
Code example #11
File: wig2wig.py Project: zpeng1989/cgat
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("merge", "filter-genome", "bins", "block",
                               "sanitize-genome", "shift", "extend"),
                      help="method to apply [default=%default]")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        genome_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000)

    (options, args) = E.Start(parser, add_pipe_options=True)

    contigs = None

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    for method in options.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(options.stdin, options.stdout, contigs)

    E.Stop()
Code example #12
File: chain2stats.py Project: lesheng/cgat
class CounterPercentIdentify(ChainCounter):

    header = "Report on Percent Indentities"

    def __init__(self, tname, qname):
        self.tfasta = IndexedFasta(tname)
        self.qfasta = IndexedFasta(qname)
        self.pids = []
        self.stats = 0

    def _get_pid(self, x, y):
        z = zip(x, y)
        pid = (float(len([x for x, y in z if x == y])) / float(len(z)) * 100)
        return(pid)

    def add(self, c):
        nreg = len(c.tugr)
        for i in range(0, nreg):
            tseq = self.tfasta.getSequence(
                c.tname, "+", c.tugr[i][0], (sum(c.tugr[i])))
            qseq = self.qfasta.getSequence(
                c.qname, c.qstrand, c.qugr[i][0], (sum(c.qugr[i])))
            pid = self._get_pid(tseq.lower(), qseq.lower())
        self.pids.append(pid)

    def _get_stats(self):
        if self.stats == 0:
            self.stats = self._get_basic_stats(self.pids, string="{:.2f}")

    def report(self, options):
        self._get_stats()
        report = self._wrap_header()
        report.append(self._wrap_basic_stats(self.stats))
        self._write_report(options, report)

    def tabbed_report(self, options, E):
        self._get_stats()
        lines = ["mean\tmedian\tmax\tmin"]
        lines.append(self._wrap_basic_stats(self.stats, tabbed=True))
        self._write_tabbed("pids", lines, E)
Code example #13
class CounterPercentIdentify(ChainCounter):

    header = "Report on Percent Indentities"

    def __init__(self, tname, qname):
        self.tfasta = IndexedFasta(tname)
        self.qfasta = IndexedFasta(qname)
        self.pids = []
        self.stats = 0

    def _get_pid(self, x, y):
        z = zip(x, y)
        pid = (float(len([x for x, y in z if x == y])) / float(len(z)) * 100)
        return (pid)

    def add(self, c):
        nreg = len(c.tugr)
        for i in range(0, nreg):
            tseq = self.tfasta.getSequence(c.tname, "+", c.tugr[i][0],
                                           (sum(c.tugr[i])))
            qseq = self.qfasta.getSequence(c.qname, c.qstrand, c.qugr[i][0],
                                           (sum(c.qugr[i])))
            pid = self._get_pid(tseq.lower(), qseq.lower())
        self.pids.append(pid)

    def _get_stats(self):
        if self.stats == 0:
            self.stats = self._get_basic_stats(self.pids, string="{:.2f}")

    def report(self, options):
        self._get_stats()
        report = self._wrap_header()
        report.append(self._wrap_basic_stats(self.stats))
        self._write_report(options, report)

    def tabbed_report(self, options, E):
        self._get_stats()
        lines = ["mean\tmedian\tmax\tmin"]
        lines.append(self._wrap_basic_stats(self.stats, tabbed=True))
        self._write_tabbed("pids", lines, E)
Code example #14
File: gtf2fasta.py Project: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-i", "--ignore-missing", dest="ignore_missing", action="store_true",
                      help="Ignore transcripts on contigs that are not in the genome-file [default=%default].")

    parser.add_option("--min-intron-length", dest="min_intron_length", type="int",
                      help="minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("full", ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if not options.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(iterator, fasta, options)

    # write footer and output benchmark information.
    E.Stop()
Code example #15
File: chain2stats.py Project: lesheng/cgat
class CounterOfErrors(ChainCounter):

    '''class for reporting invalid contig sizes in chains'''

    header = "Contig size validation report"

    def __init__(self, options):
        self.tdb = IndexedFasta(options.dbpath + options.targetgenome)
        self.qdb = IndexedFasta(options.dbpath + options.querygenome)
        self.tcontigs = self.tdb.getContigSizes()
        self.qcontigs = self.qdb.getContigSizes()
        self.badchains = []

    def add(self, c):
        db_tsize = self.tcontigs[c.tname]
        db_qsize = self.qcontigs[c.qname]
        if c.tsize != db_tsize:
            self.badchains.append(
                '\t'.join([str(x) for x in c.atts] + [" #bad target contigsize"]))
        if c.qsize != db_qsize:
            self.badchains.append(
                '\t'.join([str(x) for x in c.atts] + [" #bad query contigsize"]))

    def report(self, options):
        report = self._wrap_header()
        if len(self.badchains) == 0:
            report.append("All chains passed validation")
        else:
            report = report + self.badchains
        self._write_report(options, report)

    def tabbed_report(self, options, E):
        if len(self.badchains) > 0:
            lines = self.badchains
        else:
            lines = ["#no bad chains found"]
        self._write_tabbed("bad_contig_sizes", lines, E)
Code example #16
File: PipelineMotifs.py Project: yangjl/cgat
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals; only the top 10% of intervals (by peakval) are
    used. Also, only the 200 bp segment around the peak is used, not
    the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "meme",
                               outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
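
The hardmasking step named in the docstring (softmasked lower-case bases become Ns so MEME does not report repeat-derived motifs) reduces to a one-liner. A minimal sketch, independent of the maskers actually wired into this pipeline:

def hardmask(sequence):
    # replace every softmasked (lower-case) base with N
    return "".join("N" if c.islower() else c for c in sequence)

assert hardmask("ACGtacgT") == "ACGNNNNT"
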
Code example #17
def getContigSizes(infile, outfile):

    from CGAT import IndexedFasta

    try:
        prefix = P.snip(infile, ".fasta")
    except ValueError:
        prefix = P.snip(infile, ".fa")

    fasta = IndexedFasta.IndexedFasta(prefix)
    outs = IOTools.openFile(outfile, "w")

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outs.write("%s\t%i\n" % (contig, size))

    outs.close()
Code example #18
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: index2gff.py 2880 2010-04-07 08:44:13Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.set_defaults(genome_file=None, )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    entry = GTF.Entry()
    entry.start = 0
    entry.feature = "contig"
    entry.source = "genome"

    for contig, size in fasta.getContigSizes(with_synonyms=False).iteritems():
        ninput += 1
        entry.contig = contig
        entry.end = int(size)
        options.stdout.write("%s\n" % str(entry))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
Code example #19
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))
    outs = IOTools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(IOTools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()
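
In "leftright" mode the function derives two flanking windows of the same width as the interval, clamping the left window at zero and the right one at the contig length. A worked sketch with hypothetical numbers:

lcontig = 10000
start, end = 500, 700  # bed interval, width l = 200
l = end - start

left = (max(0, start - l), end - l)
right = (start + l, min(lcontig, end + l))
assert left == (300, 500) and right == (700, 900)
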
Code example #20
def buildContigBed(infile, outfile):
    '''
    Gets the contig sizes and coordinates from an indexed genome
    :term:`fasta` file and outputs them in :term:`BED` format.

    Parameters
    ----------
    infile : str
      infile is constructed from the `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    Returns
    -------
    outfile : str
      :term:`BED` format file containing contig name, value (0) and contig
      size in nucleotides.  The output file name is defined in
      `PARAMS: interface_contigs_bed`
    '''
    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs = IOTools.open_file(outfile, "w")

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outs.write("%s\t%i\t%i\n" % (contig, 0, size))

    outs.close()
Code example #21
File: gff2table.py Project: kathrinjansen/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome (indexed).")

    parser.add_option("-w",
                      "--windows-bed-file",
                      dest="filename_windows",
                      type="string",
                      help="gff file with windows to use.")

    parser.add_option("-d",
                      "--filename-data",
                      dest="filename_data",
                      type="string",
                      help="gff file with data to use.")

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="filename-data is gtf file [default=%default.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="choice",
                      action="append",
                      choices=("GC", ),
                      help="features to compute.")

    parser.add_option("-c",
                      "--decorator",
                      dest="decorator",
                      type="choice",
                      choices=("counts", "gc", "gc3", "mean-length",
                               "median-length", "percent-coverage",
                               "median-score", "mean-score", "stddev-score",
                               "min-score", "max-score"),
                      help="decorators to use.")

    parser.add_option("-e",
                      "--skip-empty",
                      dest="skip_empty",
                      action="store_true",
                      help="skip empty windows.")

    parser.add_option(
        "-t",
        "--transform=",
        dest="transform",
        type="choice",
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (options, args) = E.Start(parser)

    #    test_transform_third_codon()

    if not options.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if options.loglevel >= 1:
        options.stdlog.write("# reading windows...")
        options.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(IOTools.openFile(options.filename_windows, "r")))

    if options.loglevel >= 1:
        options.stdlog.write("done\n")
        options.stdlog.flush()

    if options.filename_data:
        if options.loglevel >= 1:
            options.stdlog.write("# reading data...")
            options.stdlog.flush()

        if options.is_gtf:
            gff_data = GTF.readFromFile(
                IOTools.openFile(options.filename_data, "r"))
        else:
            gff_data = GTF.readFromFile(
                IOTools.openFile(options.filename_data, "r"))

        if options.loglevel >= 1:
            options.stdlog.write("done\n")
            options.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        options.transform = "complement"

    map_contig2size = {}

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            map_contig2size[contig] = max(x[1] for x in values)
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    options.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1",
                  "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]], fasta,
                options)
        else:
            annotateWindows(contig, windows[contig], [], fasta, options)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i"
        % (len(windows), noutput_contigs, len(contigs),
           ncontigs_skipped_windows, ncontigs_skipped_data))

    E.Stop()
Code example #22
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-a",
                      "--aggregate-by",
                      dest="aggregate",
                      type="choice",
                      choices=("name", "contig", "track", "none"),
                      help="aggregate counts by feature [default=%default].")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentages [default=%default].")

    parser.set_defaults(
        genome_file=None,
        aggregate="none",
        add_percent=False,
    )

    (options, args) = E.Start(parser, argv)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        if options.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if options.add_percent and not options.aggregate == "contig":
        raise NotImplementedError(
            "--add-percent option requires --aggregate=contig")

    counts = collections.defaultdict(Counter)
    total = Counter()
    output_totals = True

    if options.aggregate == "track":
        keyf = lambda x: x.track
    elif options.aggregate == "name":
        keyf = lambda x: x.name
    elif options.aggregate == "contig":
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"
        output_totals = False

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add(bed)
        total.add(bed)

    outf = options.stdout

    key = "track"
    if options.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in sorted(counts.items()):
        if options.add_percent:
            total_bases += fasta.getLength(key)
            count.setSize(fasta.getLength(key))

        outf.write("%s\t%s\n" % (key, str(count)))

    if output_totals:
        if options.add_percent:
            total.setSize(total_bases)
        outf.write("%s\t%s\n" % ("total", str(total)))
    E.Stop()
Code example #23
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None,
                               stranded=False):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and saved to
    *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with
    the shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to avoid creating jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of
        * dust, dustmasker: apply dustmasker
        * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.

    '''
    cc = dbhandle.cursor()

    orderby = ""
    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    elif order != "random":
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, score, strand, peakcenter 
                       FROM %(tablename)s 
                       ''' % locals() + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    E.debug("Got %s intervals for track %s" % (len(data), track))
    if len(data) == 0:
        P.touch(filename)
        return

    data = truncateList(data, track, proportion, min_sequences, num_sequences,
                        order == "random")

    beds = bedsFromList(data)

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # At the moment the pipeline retrieves the bed regions from the DB and
    # they will always be on the positive strand, but if this were to
    # change, regions retrieved from the negative strand would be counted
    # from the end of the chromosome rather than the beginning without
    # this conversion. This should be tested.
    fasta.setConverter(IndexedFasta.getConverter("zero-single-open"))

    # modify the ranges
    if shift == "leftright":
        beds = shitfBeds(beds)

    if halfwidth and not full:
        beds = centreAndCrop(beds, halfwidth)

    sequences = getFASTAFromBed(beds, fasta, stranded, offset, maxsize)

    if shuffled:
        sequences = shuffleFasta(sequences)

    c = E.Counter()
    # apply each requested masker in turn; the output file is opened once
    # via the context manager below
    for mask in masker:
        if mask not in ("unmasked", "none", None):
            ids, sequences = zip(*[(x.title, x.sequence) for x in sequences])
            sequences = maskSequences(sequences, mask)
            sequences = (FastaRecord(id, seq)
                         for id, seq in zip(ids, sequences))

    with IOTools.open_file(filename, "w") as outs:

        for sequence in sequences:
            c.input += 1
            if len(sequence.sequence) == 0:
                c.empty += 1
                continue
            # note: a sequence length can never be negative, so this
            # guard never fires as written
            if len(sequence.sequence) < 0:
                c.too_short += 1
                continue

            outs.write(">%s\n%s\n" % (sequence.title, sequence.sequence))
            c.output += 1

    E.info("%s" % c)

    return c.output
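
The setConverter call above decides how getSequence interprets coordinates. A minimal sketch of the same pattern, assuming a hypothetical index prefix "genome"; the "zero-single-open" converter name is the one this function installs:

from CGAT import IndexedFasta

fasta = IndexedFasta.IndexedFasta("genome")  # hypothetical index prefix
fasta.setConverter(IndexedFasta.getConverter("zero-single-open"))
seq = fasta.getSequence("chr1", "+", 1, 100)
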
Code example #24
File: gff2coverage.py Project: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f", "--features", dest="features", type="string",
                      action="append", help="features to collect "
                      "[default=%default]")

    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size.  "
                      "[default=%default]")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("genomic", "histogram", ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    (options, args) = E.Start(parser, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)

        gff.sort(lambda x, y: cmp((x.contig, x.start), (y.contig, y.start)))

        chunk = []
        last_contig = None

        for entry in gff:

            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []

            chunk.append(entry)

        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        total = 0
        for entry in GTF.iterator(options.stdin):
            intervals[(entry.contig, entry.source, entry.feature)] += 1
            bases[(entry.contig, entry.source, entry.feature)
                  ] += entry.end - entry.start
            total += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
        else:
            options.stdout.write("\n")

        if fasta:
            total_genome_size = sum(
                fasta.getContigSizes(with_synonyms=False).values())

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(("\t".join(key),
                                            str(nintervals),
                                            str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) / fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.Stop()
Code example #25
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed)." )

    parser.add_option("--coordinate-format", dest="coordinate_format", type="string",
                      help="input type of coordinates." )

    parser.add_option("--forward-coordinates", dest="forward_coordinates", action="store_true",
                      help="output forward coordinates." )

    parser.add_option("-e", "--extract-id", dest="extract_id", type="string",
                      help="""regular expression to extract id from id column, e.g. 'transcript_id "(\S+)"'.""" )

    parser.set_defaults(
        coordinate_format = "zero-forward",
        forward_coordinates = False,
        genome_file = None,
        extract_id = None )

    (options, args) = E.Start( parser )
    
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile( options.extract_id )
    else:
        extract_id = None

    converter = IndexedFasta.getConverter( options.coordinate_format )

    exons = Exons.ReadExonBoundaries( sys.stdin,
                                      contig_sizes = contig_sizes,
                                      converter = converter,
                                      do_invert = True,
                                      format = "gtf",
                                      gtf_extract_id = extract_id )

    ntranscripts, nexons, nerrors = 0, 0, 0
    for id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            if options.forward_coordinates and e.mSbjctToken in contig_sizes and \
                    e.mSbjctStrand == "-":
                l = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = l - e.mGenomeTo, l - e.mGenomeFrom

            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write( "# Error: %s\n" % str(e) )
                break

            options.stdout.write( str(e) + "\n" )
            nexons += 1
                
        if has_error:
            nerrors += 1
            continue
    
    if options.loglevel >= 1:
        options.stdlog.write("# ntranscripts=%i, nexons=%i, nerrors=%i\n" % (ntranscripts, nexons, nerrors))
    
    E.Stop()
Code example #26
File: chain2stats.py Project: lesheng/cgat
    def __init__(self, tname, qname):
        self.tfasta = IndexedFasta(tname)
        self.qfasta = IndexedFasta(qname)
        self.pids = []
        self.stats = 0
Code example #27
File: bed2fasta.py Project: CGATOxford/cgat
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
Code example #28
File: extractseq.py Project: SCV/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: extractseq.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option("-d", "--identifier", dest="identifier", type="string",
                      help="identifier(s).")

    parser.add_option("-o", "--output-coordinate-format", dest="output_coordinate_format", type="choice",
                      choices=("full", "long"),
                      help="""output format of coordinates. Output format is contig:strand:from:to in zero based
/forward/reverse strand coordinates in open/closed notation. 'long' includes the contig length as fifth field"""  )

    parser.add_option("--input-format", dest="input_format", type="choice",
                      choices=("list", "id-compressed"),
                      help="input format.")

    parser.add_option("-i", "--input-coordinate-format", dest="input_coordinate_format", type="choice",
                      choices=("zero-both", "zero-forward"),
                      help="coordinate format.")

    parser.add_option("-e", "--extend-region", dest="extend_region", type="int",
                      help="regions are extended by this margin at either end.")

    parser.add_option("-r", "--shorten-region", dest="shorten_region", type="int",
                      help="regions are shortened by this margin at either end.")

    parser.set_defaults(
        genome_file=None,
        identifier=None,
        input_coordinate_format="zero-both",
        output_coordinate_format="full",
        input_format="list",
        extend_region=0,
        shorten_region=0,
    )

    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    lines = []
    if options.identifier:
        lines += map(lambda x: x.split(":"), options.identifier.split(","))

    if args:
        lines += map(lambda x: x.split(":"), args)

    if len(lines) == 0:
        lines = map(lambda x: x[:-1].split("\t"),
                    filter(lambda x: x[0] != "#", options.stdin.readlines()))

    ninput, nskipped, noutput = 0, 0, 0
    for data in lines:

        if options.input_format == "list":
            if len(data) < 4:
                sbjct_token = data[0]
                sbjct_from, sbjct_to = "0", "0"
                sbjct_strand = "+"
            else:
                sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[:4]
            id = None
        elif options.input_format == "id-compressed":
            id = data[0]
            sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[
                1].split(":")

        ninput += 1

        try:
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
        except ValueError:
            E.warn("skipping line %s" % data)
            nskipped += 1
            continue

        sbjct_from -= (options.extend_region - options.shorten_region)
        sbjct_from = max(0, sbjct_from)
        lcontig = fasta.getLength(sbjct_token)
        if sbjct_to != 0:
            sbjct_to += (options.extend_region - options.shorten_region)
            sbjct_to = min(sbjct_to, lcontig)
        else:
            sbjct_to = lcontig

        if sbjct_to - sbjct_from <= 0:
            nskipped += 1
            continue

        sequence = fasta.getSequence(sbjct_token, sbjct_strand,
                                     sbjct_from, sbjct_to,
                                     converter=IndexedFasta.getConverter(options.input_coordinate_format))

        if options.output_coordinate_format == "full":
            coordinates = "%s:%s:%i:%i" % (sbjct_token,
                                           sbjct_strand,
                                           sbjct_from,
                                           sbjct_to)

        elif options.output_coordinate_format == "long":
            coordinates = "%s:%s:%i:%i:%i" % (sbjct_token,
                                              sbjct_strand,
                                              sbjct_from,
                                              sbjct_to,
                                              lcontig)

        if id:
            options.stdout.write(">%s %s\n%s\n" % (id, coordinates, sequence))
        else:
            options.stdout.write(">%s\n%s\n" % (coordinates, sequence))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
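
The region arithmetic above widens each end by the net margin (extend minus shorten) and clamps the result to the contig, with an end of 0 meaning "to the contig end". A compact sketch of the same rules with hypothetical values:

def adjust_region(start, end, lcontig, extend=0, shorten=0):
    margin = extend - shorten
    start = max(0, start - margin)
    # an end of 0 means "up to the end of the contig"
    end = min(lcontig, end + margin) if end != 0 else lcontig
    return start, end

assert adjust_region(100, 200, 1000, extend=50) == (50, 250)
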
Code example #29
    def __init__(self, options):
        self.tdb = IndexedFasta(options.dbpath + options.targetgenome)
        self.qdb = IndexedFasta(options.dbpath + options.querygenome)
        self.tcontigs = self.tdb.getContigSizes()
        self.qcontigs = self.qdb.getContigSizes()
        self.badchains = []
Code example #30
File: index_fasta.py Project: Charlie-George/cgat
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--extract", dest="extract", type="string",
                      help="extract region for testing purposes. Format is "
                      "contig:strand:from:to. "
                      "The default coordinates are 0-based "
                      "open/closed coordinates on both strands. "
                      "For example, chr1:+:10:12 will return "
                      "bases 11 to 12 on chr1.")

    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    parser.add_option("-c", "--compression", dest="compression", type="choice",
                      choices=compression_choices,
                      help="compress database, using specied compression. "
                      "Valid choices are %s. "
                      "[default=%%default]." % ", ".join(compression_choices))

    parser.add_option("--random-access-points", dest="random_access_points",
                      type="int",
                      help="save random access points every # number "
                      "of nucleotides [default=%default].")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option("-s", "--synonyms", dest="synonyms", type="string",
                      help="list of synonyms, comma separated with =, "
                      "for example, chr1=chr1b [default=%default]")

    parser.add_option("-b", "--benchmark", dest="benchmark",
                      action="store_true",
                      help="benchmark time for read access "
                      "[default=%default].")

    parser.add_option("--benchmark-num-iterations",
                      dest="benchmark_num_iterations",
                      type="int",
                      help="number of iterations for benchmark "
                      "[default=%default].")

    parser.add_option("--benchmark-fragment-size",
                      dest="benchmark_fragment_size",
                      type="int",
                      help="benchmark: fragment size [default=%default].")

    parser.add_option("--verify", dest="verify", type="string",
                      help="verify against other database [default=%default].")

    parser.add_option("--verify-iterations", dest="verify_num_iterations",
                      type="int",
                      help="number of iterations for verification "
                      "[default=%default].")

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--compress-index", dest="compress_index",
                      action="store_true",
                      help="compress index [default=%default].")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator", type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    (options, args) = E.Start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand,
                                     start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" %
                             (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment( fasta = fasta, size = %i)" % (
                options.benchmark_fragment_size),
            setup="""from __main__ import IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" % (args[0] ) )

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" % (
            options.benchmark_num_iterations, options.benchmark_fragment_size, t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()
        if len(args) < 2:
            print globals()["__doc__"]
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
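For reference, a minimal usage sketch for the database built by the script above. The index name "genome" and the CGAT import path are illustrative assumptions, not part of the example itself.

# Hedged usage sketch, assuming an index built as "genome" and that
# IndexedFasta is importable from the CGAT package (both assumptions).
from CGAT import IndexedFasta

fasta = IndexedFasta.IndexedFasta("genome")

# 0-based, half-open coordinates on the forward strand, matching --extract:
# chr1:+:10:12 returns bases 11 and 12 in 1-based terms.
sequence = fasta.getSequence("chr1", "+", 10, 12)
print sequence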
Code example #31
File: index2bed.py Project: zpeng1989/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome [default=%default].")

    parser.add_option(
        "--remove-regex", dest="remove_regex",
        type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.add_option(
        "-e", "--gff-file", dest="gff_file", type="string",
        help="gff file to use for getting contig sizes.")

    parser.add_option(
        "-f", "--fixed-width-windows",
        dest="fixed_width_windows", type="string",
        help="fixed width windows. Supply the window size as a "
        "parameter. Optionally supply an offset.")

    parser.set_defaults(
        genome_file=None,
        remove_regex=None,
        fixed_width_windows=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.remove_regex:
        remove_regex = re.compile(options.remove_regex)
    else:
        remove_regex = None

    if options.fixed_width_windows:
        v = map(int, options.fixed_width_windows.split(","))
        if len(v) == 2:
            window_size, window_increment = v
        elif len(v) == 1:
            window_size, window_increment = v[0], v[0]
        else:
            raise ValueError(
                "could not parse window size '%s': should be size[,increment]" % options.fixed_width_windows)

    map_contig2size = None

    if options.gff_file:
        map_contig2size = {}
        infile = open(options.gff_file, "r")
        gff = GTF.readFromFile(infile)
        infile.close()
        for g in gff:
            try:
                # keep the largest end coordinate seen per contig
                map_contig2size[g.mName] = max(map_contig2size[g.mName], g.end)
            except KeyError:
                map_contig2size[g.mName] = g.end

    else:
        gff = None

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        map_contig2size = fasta.getContigSizes(with_synonyms=False)
    else:
        fasta = None

    if map_contig2size is None:
        raise ValueError("no source of contig sizes supplied")

    # do sth
    counter = E.Counter()

    for contig, size in map_contig2size.items():
        size = int(size)
        counter.input += 1

        if remove_regex and remove_regex.search(contig):
            counter.skipped += 1
            continue

        if options.fixed_width_windows:
            for x in range(0, size, window_increment):
                if x + window_size > size:
                    continue
                options.stdout.write(
                    "%s\t%i\t%i\n" % (contig, x, min(size, x + window_size)))
                counter.windows += 1
        else:
            options.stdout.write("%s\t%i\t%i\n" % (contig, 0, size))
            counter.windows += 1

        counter.output += 1

    E.info(str(counter))

    # write footer and output benchmark information.
    E.Stop()
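The fixed-width-windows branch above tiles each contig and silently drops a trailing partial window. A standalone sketch of that logic, with illustrative numbers:

def iterate_windows(contig, size, window_size, window_increment):
    # emit (contig, start, end) windows; a trailing partial window is dropped
    for x in range(0, size, window_increment):
        if x + window_size > size:
            continue
        yield contig, x, min(size, x + window_size)

# a 1000 bp contig, 300 bp windows every 250 bp -> 0-300, 250-550, 500-800
for window in iterate_windows("chr1", 1000, 300, 250):
    print "%s\t%i\t%i" % window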
Code example #32
File: gff2psl.py Project: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries", dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""  )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                                                 feature="exon"),
                                           strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
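The alignment loop above turns genomic exon intervals into ungapped blocks whose query coordinate is the running exon length. A sketch of the same bookkeeping without alignlib_lite, using made-up intervals:

intervals = [(100, 150), (200, 260)]  # illustrative exon coordinates

xstart = 0
blocks = []  # (query_start, sbjct_start, length), PSL-style blocks
for start, end in intervals:
    xend = xstart + end - start
    blocks.append((xstart, start, end - start))
    xstart = xend

print blocks  # [(0, 100, 50), (50, 200, 60)]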
Code example #33
File: gff2fasta.py Project: gsc0107/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand, ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
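A worked sketch of the negative-strand handling above: forward intervals are mirrored with (lcontig - end, lcontig - start) and their order reversed so they remain ascending. Numbers are illustrative:

lcontig = 1000
intervals = [(100, 200), (300, 400)]  # forward-strand coordinates

# mirror each interval and reverse the order, as in the example above
flipped = [(lcontig - end, lcontig - start) for start, end in intervals[::-1]]

print flipped  # [(600, 700), (800, 900)]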
Code example #34
    def __init__(self, tpath, qpath):
        self.tfasta = IndexedFasta.IndexedFasta(tpath)
        self.qfasta = IndexedFasta.IndexedFasta(qpath)
        self.pids = []
        self.stats = 0
Code example #35
File: gff2predictions.py Project: radaniba/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"],
    )

    parser.add_option("-t", "--trans", dest="trans", help="input is translated DNA.", action="store_true")

    parser.add_option(
        "-f", "--format", dest="format", help="input format.", type="choice", choices=("exons", "psl", "gff")
    )

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        help="output format",
        type="choice",
        choices=("exontable", "exons", "predictions", "cds", "fasta"),
    )

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string", help="filename with genomic data (indexed)."
    )

    parser.add_option(
        "--predictions-file",
        dest="predictions_file",
        type="string",
        help="filename with predictions. Use gene structures from this file if available.",
    )

    parser.add_option(
        "-i",
        "--gff-field-id",
        dest="gff_field_id",
        type="string",
        help="field for the feature id in the gff info section.",
    )

    parser.add_option(
        "-p",
        "--filename-peptides",
        dest="filename_peptides",
        type="string",
        help="Filename with peptide sequences. If given, it is used to check the predicted translated sequences.",
    )

    parser.add_option(
        "--no-realignment",
        dest="do_realignment",
        action="store_false",
        help="do not re-align entries that do not parse correctly.",
    )

    parser.add_option(
        "--remove-unaligned",
        dest="remove_unaligned",
        action="store_true",
        help="remove entries that have not been aligned correctly.",
    )

    parser.add_option(
        "--input-coordinates",
        dest="input_coordinates",
        type="string",
        help="specify input format for input coordinates [forward|both-zero|one-closed|open].",
    )

    parser.set_defaults(
        trans=False,
        output_format="predictions",
        format="psl",
        gff_field_id="id",
        input_coordinates="both-zero-open",
        filename_peptides=None,
        genome_file=None,
        do_realignment=True,
        predictions_file=None,
        remove_unaligned=False,
    )

    (options, args) = E.Start(parser)

    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    converter = IndexedFasta.getConverter(options.input_coordinates)

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line,))
                except PredictionParser.AlignmentError, e:
                    print "# %s" % str(e)
                    print "#", line[:-1]
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                print str(entries)

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(contig_sizes=contig_sizes)
        else:
            raise "unknown format %s for output option %s" % (options.format, options.output_format)

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n"
                % (parser.GetNumInput(), parser.GetNumOutput(), parser.GetNumErrors())
            )

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n"
                        % (
                            entry.mPredictionId,
                            entry.mQueryToken,
                            entry.mSbjctToken,
                            entry.mSbjctStrand,
                            ninput,
                            len(results),
                        )
                    )
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons just before the stop.
                    # See for example the chicken sequence ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken,
                        entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome),
                    )

                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" % (entry.mPredictionId, entry.mSbjctToken)
                        )
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n"
                            % (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand)
                        )
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                exons = Exons.Alignment2Exons(entry.mMapPeptide2Genome, 0, entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = Genomics.Alignment2PeptideAlignment(
                    Genomics.String2Alignment(entry.mAlignmentString), entry.mQueryFrom, 0, genomic_sequence
                )

                entry.score = entry.mMapPeptide2Translation.getColTo() - entry.mMapPeptide2Translation.getColFrom() + 1

                (
                    entry.mNIntrons,
                    entry.mNFrameShifts,
                    entry.mNGaps,
                    entry.mNSplits,
                    entry.mNStopCodons,
                    entry.mNDisruptions,
                ) = Genomics.CountGeneFeatures(0, entry.mMapPeptide2Genome, genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[str(entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n"
                                        % (entry.mPredictionId, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo)
                                    )
                                    options.stdlog.flush()

                                result = predictor(
                                    entry.mPredictionId,
                                    reference,
                                    entry.mSbjctToken,
                                    genomic_sequence,
                                    "--subopt FALSE --score '%s'" % str(80),
                                )
                                # "--exhaustive --subopt FALSE --score '%s'" % str(80) )

                                if result:
                                    translation = result[0].mTranslation
                                    is_identical, nmismatches = checkIdentity(reference, translation, options)
                                else:
                                    if options.loglevel >= 2:
                                        options.stdlog.write(
                                            "# %s: realignment returned empty result\n" % (entry.mPredictionId)
                                        )
                                        options.stdlog.flush()
                                    is_identical = False

                                if is_identical:
                                    naligned += 1
                                    prediction_id = entry.mPredictionId
                                    sbjct_genome_from = entry.mSbjctGenomeFrom
                                    entry = result[0]
                                    entry.mPredictionId = prediction_id
                                    entry.mSbjctGenomeFrom += sbjct_genome_from
                                else:
                                    nunaligned += 1
                                    if options.loglevel >= 1:
                                        options.stdlog.write(
                                            "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n"
                                            % (
                                                entry.mPredictionId,
                                                entry.mSbjctToken,
                                                entry.mSbjctStrand,
                                                entry.mSbjctGenomeFrom,
                                                entry.mSbjctGenomeTo,
                                                reference,
                                                entry.mTranslation,
                                                translation,
                                            )
                                        )
                                        options.stdlog.flush()
                                    if options.remove_unaligned:
                                        nskipped += 1
                                        continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n"
                                        % (entry.mPredictionId, entry.mSbjctToken)
                                    )
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n"
                                            % (entry.mPredictionId, reference, translation)
                                        )
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")
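The example relies on a checkIdentity() helper that is not shown. A minimal hypothetical stand-in might compare the two peptides case-insensitively while tolerating a trailing stop; the real cgat helper may behave differently:

def checkIdentity(reference, translation, options=None):
    # hypothetical sketch: the actual cgat implementation is not shown above
    ref = reference.upper().rstrip("X*")
    trans = translation.upper().rstrip("X*")
    nmismatches = sum(1 for a, b in zip(ref, trans) if a != b)
    nmismatches += abs(len(ref) - len(trans))
    return nmismatches == 0, nmismatches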
Code example #36
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-t",
        "--tablename",
        dest="tablename",
        type="string",
        help=
        "tablename to get variants from (in samtools pileup format) [default=%default]."
    )
    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f",
        "--exons-file",
        dest="filename_exons",
        type="string",
        help=
        "filename with transcript model information (gtf formatted file)  [default=%default]."
    )
    parser.add_option(
        "-r",
        "--filename-reference",
        dest="filename_reference",
        type="string",
        help=
        "filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default]."
    )
    parser.add_option(
        "--vcf-file",
        dest="filename_vcf",
        type="string",
        help=
        "filename with variants in VCF format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--pileup-file",
        dest="filename_pileup",
        type="string",
        help=
        "filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--vcf-sample",
        dest="vcf_sample",
        type="string",
        help=
        "sample id for species of interest in vcf formatted file [default=%default]."
    )
    parser.add_option(
        "-s",
        "--seleno-tsv-file",
        dest="filename_seleno",
        type="string",
        help=
        "filename of a list of transcript ids that are selenoproteins [default=%default]."
    )
    parser.add_option("-m",
                      "--module",
                      dest="modules",
                      type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option(
        "-k",
        "--with-knockouts",
        dest="with_knockouts",
        action="store_true",
        help=
        "add alleles that are knocked out to fasta and gtf files [default=%default]."
    )

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(("gene_id", "transcript_id",
                                         "allele_id", "contig", "strand",
                                         "is_wildtype",
                                         ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:], allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % ("\t".join(
                        (gene_id, transcript_id, allele_id, contig, strand,
                         "%i" % is_wildtype)), "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
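The tabular output step flattens each Allele namedtuple by joining its coordinate fields. A toy sketch of that _replace() reformatting; the real Allele in gtf2alleles carries more fields:

import collections

# toy Allele with only the three fields that get flattened above
Allele = collections.namedtuple("Allele", ("cds_starts", "exon_starts", "frames"))

allele = Allele(cds_starts=(0, 120), exon_starts=(0, 1020), frames=(0, 0))
allele = allele._replace(
    cds_starts=",".join(map(str, allele.cds_starts)),
    exon_starts=",".join(map(str, allele.exon_starts)),
    frames=",".join(map(str, allele.frames)))

# prints the three comma-joined columns separated by tabs
print "\t".join(map(str, allele))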
Code example #37
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-b", "--output-filename-pattern", dest="output_filename", type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    typecode = options.typecode

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        counts = {}
        contig_sizes = fasta.getContigSizes(with_synonyms=False)
        E.info("allocating memory for %i contigs and %i bytes" %
               (len(contig_sizes), sum(contig_sizes.values()) * typecode().itemsize))
        for contig, size in contig_sizes.items():
            E.debug("allocating %s: %i bases" % (contig, size))
            counts[contig] = numpy.zeros(size, typecode)

        E.info("allocated memory for %i contigs" % len(fasta))

    else:
        fasta = None
        contig_sizes = {}

    if options.output_format in ("bigwig", "bigbed"):

        if not options.genome_file:
            raise ValueError(
                "please supply genome file for bigwig/bigbed computation.")

        if not options.output_filename:
            raise ValueError(
                "please supply an output file for bigwig/bigbed computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        elif options.output_format == "bigbed":
            executable_name = "bedToBigBed"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        outfile_size = open(tmpfile_sizes, "w")
        for contig, size in contig_sizes.items():
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        outfile = open(tmpfile_wig, "w")

    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        for start, length in zip(match.mSbjctBlockStarts, match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    if options.output_format in ("wig", "bigwig"):
        E.info("starting wig output")

        for contig, vals in counts.items():

            E.debug("output for %s" % contig)
            for val, iter in itertools.groupby(enumerate(vals), lambda x: x[1]):
                l = list(iter)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, end - start + 1))
                    outfile.write("%i\t%i\n" % (start, val))

            ncontigs += 1
    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in counts.items():
            E.debug("output for %s" % contig)
            for val, iter in itertools.groupby(enumerate(vals), lambda x: x[1]):
                l = list(iter)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("%s\t%i\t%i\t%i\n" %
                                  (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if options.output_format in ("bigwig", "bigbed"):
        outfile.close()

        E.info("starting bigwig conversion")
        try:
            retcode = subprocess.call(" ".join((executable,
                                                tmpfile_wig,
                                                tmpfile_sizes,
                                                os.path.abspath(options.output_filename)), ),
                                      shell=True)
            if retcode < 0:
                warn("wigToBigWig terminated with signal: %i" % -retcode)
                return -retcode
        except OSError, msg:
            warn("Error while executing bigwig: %s" % e)
            return 1

        shutil.rmtree(tmpdir)

        E.info("finished bigwig conversion")
Code example #38
File: chain2stats.py Project: jmadzo/cgat
def __init__(self, options):
    self.tdb = IndexedFasta(options.dbpath + options.targetgenome)
    self.qdb = IndexedFasta(options.dbpath + options.querygenome)
    self.tcontigs = self.tdb.getContigSizes()
    self.qcontigs = self.qdb.getContigSizes()
    self.badchains = []
Code example #39
File: bed2fasta.py Project: Charlie-George/cgat
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no, 3', 5' or both ends. If 3only or 5only are set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--use-strand", dest="ignore_strand", action="store_false",
                      help="use strand information and return reverse complement [default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))
    else:
        # fasta is required below to fetch sequences; fail early
        raise ValueError("please supply a genome file (--genome-file)")

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            length = bed.end - bed.start

            # flank of equal length to the left, clipped at the contig start
            start, end = max(0, bed.start - length), bed.start
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            # flank of equal length to the right, clipped at the contig end
            start, end = bed.end, min(lcontig, bed.end + length)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    if options.masker and options.masker != "none":
        masked = Masker.maskSequences(seqs, options.masker)
    else:
        # no masker given: pass sequences through unchanged
        masked = seqs
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
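
The leftright arithmetic above amounts to taking flanks of the interval's own length on either side, clipped to the contig. A small sketch under that reading (flanks is an illustrative name, not part of the script):

def flanks(start, end, lcontig):
    # return (left, right) half-open flank intervals of equal length,
    # clipped to [0, lcontig)
    length = end - start
    return (max(0, start - length), start), (end, min(lcontig, end + length))

# an interval near the contig start: the left flank is clipped at 0
print(flanks(30, 80, 1000))  # ((0, 30), (80, 130))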
Code example #40
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: bed2annotator2tsv.py 2885 2010-04-07 08:46:50Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      help="feature to collect [default=None].")

    parser.add_option("-i",
                      "--files",
                      dest="files",
                      action="append",
                      help="use multiple annotations [default=None].")

    parser.add_option(
        "-a",
        "--annotations",
        dest="annotations",
        type="string",
        help=
        "aggregate name for annotations if only single file is provided from STDIN [default=None]."
    )

    parser.add_option(
        "--input-filename-map",
        dest="input_filename_map",
        type="string",
        help="filename with a map of gene_ids to categories [default=None].")

    parser.add_option("-l",
                      "--max-length",
                      dest="max_length",
                      type="string",
                      help="maximum segment length [default=None].")

    parser.add_option(
        "-m",
        "--merge",
        dest="merge",
        action="store_true",
        help="merge overlapping bed segments [default=%default].")

    parser.add_option("-s",
                      "--section",
                      dest="section",
                      type="choice",
                      choices=("segments", "annotations", "workspace"),
                      help="annotator section [default=None].")

    parser.add_option(
        "--subset",
        dest="subsets",
        type="string",
        action="append",
        help=
        "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."
    )

    parser.set_defaults(
        genome_file=None,
        features=None,
        remove_random=True,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        merge=False,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0: options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section == "annotations":
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    if options.max_length:
        max_length = options.max_length
    else:
        max_length = 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if options.section == "annotations":
        contigs = set()
        it = itertools.groupby(Bed.iterator(options.stdin),
                               key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if options.merge: beds = Bed.merge(beds)

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if options.remove_random and "random" in contig: continue

                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append(nsegments)
                options.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                     (prefix, nsegments, contig, start, end))
                nsegments += 1

            options.stdout.write("##Ann\t%s\t%s\n" % (track, "\t".join(
                ["%i" % x for x in range(first_segment, nsegments)])))
            E.info("track %s: annotated with %i segments" %
                   (track, nsegments - first_segment))

        ncontigs = len(contigs)
        E.info(
            "ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
            (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.Stop()
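
The grouping step above leans on a property of itertools.groupby that is easy to trip over: it only merges *adjacent* records with the same key, so the bed stream must already arrive blocked by track. A minimal sketch with a stand-in record type (Record and the sample data are illustrative):

import collections
import itertools

Record = collections.namedtuple("Record", "track contig start end")

records = [
    Record("trackA", "chr1", 0, 10),
    Record("trackA", "chr1", 20, 30),
    Record("trackB", "chr2", 5, 15),
]

# adjacent records with the same track name form one group
for track, beds in itertools.groupby(records, key=lambda x: x.track):
    beds = list(beds)
    print("%s: %i segments" % (track, len(beds)))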
Code example #41
File: quality2fasta.py Project: siping/cgat
        alphabet = "fastq",
        encoding = "phred",
        default_value = None,
        )
    
    (options, args) = E.Start( parser )

    ninput, noutput = 0, 0
    
    if options.format == "fasta":
        iterator = FromFastaIterator( sys.stdin, alphabet = options.alphabet, default = options.default_value )

    if options.output_format == "fasta":

        if options.build_index:
            IndexedFasta.createDatabase( options.build_index,
                                         iterator )
        else:
            while 1:
                try:
                    r = iterator.next()
                except StopIteration:
                    break
                t,s = r
                options.stdout.write( ">%s\n%s\n" % (t,s))

    elif options.output_format == "fastq":
        
        if not options.filename_sequences:
            raise "please supply a filename with sequences."

        iterator_sequence = FastaIterator.FastaIterator( open( options.filename_sequences, "r" ) )
Code example #42
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: quality2fasta.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("fasta", ),
                      help="input format [%default]."  )

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("fasta", "fastq" ),
                      help="output format - if fastq is chosen, also supply a sequence file [%default]."  )
    
    parser.add_option("-a", "--alphabet", dest="alphabet", type="choice",
                      choices=("fastq", "solexa", "printable" ),
                      help="characters to use for quality scores [%default]."  )

    parser.add_option("-e", "--encoding", dest="encoding", type="choice",
                      choices=("phred", "solexa" ),
                      help="encoding of quality scores [%default]."  )
    
    parser.add_option("-i", "--build-index", dest="build_index", type="string",
                      help="build an index. Supply the database name [%default]."  )

    parser.add_option("-s", "--filename-sequences", dest="filename_sequences", type="string",
                      help="input filename with file of sequences in fasta format - sorted in the same way as the quality file [%default]."  )


    parser.add_option( "-d", "--set-to-default", dest="default_value", type="int",
                       help="set all quality codes to the default value. Supply the fasta sequence instead of the quality codes [%default]." )

    parser.set_defaults(
        format = "fasta",
        output_format = "fasta",
        build_index = None,
        filename_sequences = None,
        alphabet = "fastq",
        encoding = "phred",
        default_value = None,
        )
    
    (options, args) = E.Start( parser )

    ninput, noutput = 0, 0
    
    if options.format == "fasta":
        iterator = FromFastaIterator( sys.stdin, alphabet = options.alphabet, default = options.default_value )

    if options.output_format == "fasta":

        if options.build_index:
            IndexedFasta.createDatabase( options.build_index,
                                         iterator )
        else:
            while 1:
                try:
                    r = iterator.next()
                except StopIteration:
                    break
                t,s = r
                options.stdout.write( ">%s\n%s\n" % (t,s))

    elif options.output_format == "fastq":
        
        if not options.filename_sequences:
            raise "please supply a filename with sequences."

        iterator_sequence = FastaIterator.FastaIterator( open( options.filename_sequences, "r" ) )
        
        while 1:
            qual, seq = None, None
            try:
                qual = iterator.next()
                seq = iterator_sequence.next()
            except StopIteration:
                if qual and not seq:
                    options.stdlog.write( "# sequence file incomplete\n" )
                elif seq and not qual:
                    options.stdlog.write( "# quality file incomplete\n" )
                # stop once either stream is exhausted
                break

            qt, qs = qual
            st, ss = seq.title, seq.sequence
            assert qt == st, "sequence and quality identifiers incongruent: %s != %s" % (qt, st)
            options.stdout.write( "@%s\n%s\n+\n%s\n" % (qt, ss, qs))

    if options.loglevel >= 1:
        options.stdlog.write( "# ninput=%i, noutput=%i, noverflow=%i, nunderflow=%i\n" % \
                                  (iterator.mNInput, 
                                   iterator.mNOutput, 
                                   iterator.mNOverFlow, 
                                   iterator.mNUnderFlow ))

    E.Stop()
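
The pairing loop above walks two record streams in lock-step and emits FASTQ records (@id, sequence, +, quality). A compact sketch of the same idea on plain tuples (to_fastq is an illustrative name); note that zip silently truncates at the shorter stream, which is why the script checks for incomplete files explicitly:

def to_fastq(quals, seqs):
    # quals: iterable of (title, quality string); seqs: iterable of (title, sequence)
    records = []
    for (qt, qs), (st, ss) in zip(quals, seqs):
        assert qt == st, "identifiers incongruent: %s != %s" % (qt, st)
        records.append("@%s\n%s\n+\n%s" % (qt, ss, qs))
    return "\n".join(records) + "\n"

print(to_fastq([("read1", "IIII")], [("read1", "ACGT")]))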
Code example #43
File: gtf2exons.py Project: siping/cgat
        extract_id = None )

    (options, args) = E.Start( parser )
    
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile( options.extract_id )
    else:
        extract_id = None

    converter = IndexedFasta.getConverter( options.coordinate_format )

    exons = Exons.ReadExonBoundaries( sys.stdin,
                                      contig_sizes = contig_sizes,
                                      converter = converter,
                                      do_invert = True,
                                      format = "gtf",
                                      gtf_extract_id = extract_id )

    ntranscripts, nexons, nerrors = 0, 0, 0
    for id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            if options.forward_coordinates and e.mSbjctToken in contig_sizes and \
                    e.mSbjctStrand == "-":
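
The fragment above breaks off mid-statement; judging from the forward_coordinates option, it is about to map minus-strand coordinates onto the forward strand. A hedged sketch of the usual transformation for half-open coordinates (to_forward is an illustrative name, not a cgat helper):

def to_forward(start, end, lcontig):
    # map half-open (start, end) given on the minus strand to forward-strand
    # coordinates; the interval length is preserved
    return lcontig - end, lcontig - start

# positions 990-1000 in minus-strand coordinates of a 1000 bp contig
# correspond to the first 10 bp on the forward strand
print(to_forward(990, 1000, 1000))  # (0, 10)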
Code example #44
def main(argv=None):

    if argv is None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: IndexedFasta.py 2801 2009-10-22 13:40:39Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--extract",
        dest="extract",
        type="string",
        help=
        "extract region for testing purposes. Format is contig:strand:from:to. "
        "The default coordinates are 0-based open/closed coordinates on both strands. "
        "For example, chr1:+:10:12 will return bases 11 to 12 on chr1.")

    parser.add_option("-c",
                      "--compression",
                      dest="compression",
                      type="choice",
                      choices=("lzo", "zlib", "gzip", "dictzip", "bzip2",
                               "debug"),
                      help="compress database [default=%default].")

    parser.add_option(
        "--random-access-points",
        dest="random_access_points",
        type="int",
        help=
        "save random access points every # number of nucleotides [default=%default]."
    )

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("one-forward-open", "zero-both-open"),
                      help="coordinate format of input [default=%default].")

    parser.add_option(
        "-s",
        "--synonyms",
        dest="synonyms",
        type="string",
        help=
        "list of synonyms, comma separated with =, for example, chr1=chr1b [default=%default]"
    )

    parser.add_option(
        "-b",
        "--benchmark",
        dest="benchmark",
        action="store_true",
        help="benchmark time for read access [default=%default].")

    parser.add_option(
        "--benchmark-num-iterations",
        dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark [default=%default].")

    parser.add_option("--benchmark-fragment-size",
                      dest="benchmark_fragment_size",
                      type="int",
                      help="benchmark: fragment size [default=%default].")

    parser.add_option("--verify",
                      dest="verify",
                      type="string",
                      help="verify against other database [default=%default].")

    parser.add_option(
        "--file-format",
        dest="file_format",
        type="choice",
        choices=("fasta", "auto", "fasta.gz", "tar", "tar.gz"),
        help=
        "file format of input. Supply if data comes from stdin [default=%default]."
    )

    parser.add_option(
        "-a",
        "--clean-sequence",
        dest="clean_sequence",
        action="store_true",
        help=
        "remove X/x from DNA sequences - they cause errors in exonerate [default=%default]."
    )

    parser.add_option(
        "--allow-duplicates",
        dest="allow_duplicates",
        action="store_true",
        help=
        "allow duplicate identifiers. Further occurances of an identifier are suffixed by an '_%i' [default=%default]."
    )

    parser.add_option(
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help=
        "regular expression for extracting the identifier from fasta description line [default=%default]."
    )

    parser.add_option("--compress-index",
                      dest="compress_index",
                      action="store_true",
                      help="compress index [default=%default].")

    parser.add_option(
        "--force",
        dest="force",
        action="store_true",
        help="force overwriting of existing files [default=%default].")

    parser.add_option(
        "-t",
        "--translator",
        dest="translator",
        type="choice",
        choices=("solexa", "phred", "bytes", "range200"),
        help="translate numerical quality scores [default=%default].")

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.Start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms: synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    if options.translator:
        if options.translator == "phred":
            options.translator = TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = TranslatorBytes()
        elif options.translator == "range200":
            options.translator = TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write( ">%s\n%s\n" % \
                              ( options.extract, sequence ) )
    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" %
            (options.benchmark_fragment_size),
            setup=
            """from __main__ import benchmarkRandomFragment,IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )"""
            % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))
    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = verify(fasta1,
                          fasta2,
                          options.verify_num_iterations,
                          options.verify_fragment_size,
                          stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %\
                                 (" \n# ".join( args[1:] ) ))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()
        if len(args) < 2:
            print globals()["__doc__"]
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
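
For reference, the --extract argument handled above uses the format described in its help text, contig:strand:from:to. A minimal sketch of that parsing (parse_extract is an illustrative stand-in for IndexedFasta.parseCoordinates):

def parse_extract(spec):
    # split 'contig:strand:from:to' into typed fields
    contig, strand, start, end = spec.split(":")
    return contig, strand, int(start), int(end)

# 0-based half-open coordinates: chr1:+:10:12 selects bases 11-12 in 1-based terms
print(parse_extract("chr1:+:10:12"))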
Code example #45
File: gff2predictions.py Project: siping/cgat
    
    fasta = IndexedFasta.IndexedFasta( options.genome_file ) 
    contig_sizes = fasta.getContigSizes()
    
    ninput, noutput, nskipped = 0,0,0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0,0,0,0,0,0
    
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( IOTools.openFile( options.filename_peptides, "r"))
        predictor = PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None
        
    converter = IndexedFasta.getConverter( options.input_coordinates )

    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions( IOTools.openFile( options.predictions_file, "r") )
        for p in parser:
            predictions[p.mPredictionId] = p
        
    if options.output_format == "predictions":
        
        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()