Example #1
0
def iterator_sorted(gff_iterator, sort_order="gene"):
    '''Yield all entries of *gff_iterator* ordered according to *sort_order*.

    The input is read completely into a list before anything is yielded.

    Recognised values of *sort_order* and the attributes used as sort key:

    gene
        gene_id, contig, start
    gene+transcript
        gene_id, transcript_id, contig, start
    contig+gene
        contig, gene_id, transcript_id, start
    transcript
        transcript_id, contig, start
    position
        contig, start
    position+gene
        entries are sorted by gene_id and start, grouped with
        ``flat_gene_iterator``, the groups are ordered by contig and
        start of their first entry and flattened again.
    gene+exon
        gene_id, exon_number

    Any other value leaves the entries in input order.
    '''
    # key functions for the orders that need a single sort only
    simple_keys = {
        "gene": lambda e: (e.gene_id, e.contig, e.start),
        "gene+transcript": lambda e: (
            e.gene_id, e.transcript_id, e.contig, e.start),
        "contig+gene": lambda e: (
            e.contig, e.gene_id, e.transcript_id, e.start),
        "transcript": lambda e: (e.transcript_id, e.contig, e.start),
        "position": lambda e: (e.contig, e.start),
        "gene+exon": lambda e: (e.gene_id, e.exon_number),
    }

    records = list(gff_iterator)

    if sort_order == "position+gene":
        # group by gene first, then order whole genes by position
        records.sort(key=lambda e: (e.gene_id, e.start))
        gene_groups = list(flat_gene_iterator(records))
        gene_groups.sort(key=lambda g: (g[0].contig, g[0].start))
        records = IOTools.flatten(gene_groups)
    elif sort_order in simple_keys:
        records.sort(key=simple_keys[sort_order])

    for record in records:
        yield record
Example #2
0
File: GTF.py Project: sudlab/cgat
def iterator_sorted(gff_iterator, sort_order="gene"):
    '''Collect *gff_iterator* into a list, sort it and yield the entries.

    *sort_order* selects the attributes that make up the sort key:

    * ``gene``: gene_id, contig, start
    * ``gene+transcript``: gene_id, transcript_id, contig, start
    * ``contig+gene``: contig, gene_id, transcript_id, start
    * ``transcript``: transcript_id, contig, start
    * ``position``: contig, start
    * ``gene+exon``: gene_id, exon_number
    * ``position+gene``: entries are sorted by gene_id and start,
      grouped via ``flat_gene_iterator``; the groups are then ordered
      by contig and start of their first entry and flattened with
      ``IOTools.flatten``.

    With any other *sort_order* the entries are yielded unsorted.
    '''
    # orders that are a plain sort on a tuple of attributes
    key_for_order = {
        "gene": lambda rec: (rec.gene_id, rec.contig, rec.start),
        "gene+transcript": lambda rec: (
            rec.gene_id, rec.transcript_id, rec.contig, rec.start),
        "contig+gene": lambda rec: (
            rec.contig, rec.gene_id, rec.transcript_id, rec.start),
        "transcript": lambda rec: (rec.transcript_id, rec.contig, rec.start),
        "position": lambda rec: (rec.contig, rec.start),
        "gene+exon": lambda rec: (rec.gene_id, rec.exon_number),
    }

    collected = list(gff_iterator)

    if sort_order in key_for_order:
        collected.sort(key=key_for_order[sort_order])
    elif sort_order == "position+gene":
        # two-stage sort: within genes first, then genes by position
        collected.sort(key=lambda rec: (rec.gene_id, rec.start))
        by_gene = list(flat_gene_iterator(collected))
        by_gene.sort(key=lambda grp: (grp[0].contig, grp[0].start))
        collected = IOTools.flatten(by_gene)

    for item in collected:
        yield item
Example #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The positional arguments are filenames. After a header line, one
    tab-separated row ``set1, set2, <counts>`` is written to
    ``options.stdout`` per comparison:

    * with ``--tracks``, every file after the first is compared against
      each track reported by ``CounterTracks`` for the first file;
    * otherwise every file is compared against each file that precedes
      it on the command line.

    If ``--update`` names a file with previous results, rows found there
    are copied to the output instead of being recomputed.

    Raises
    ------
    ValueError
        if fewer than two positional arguments are given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-u", "--update", dest="filename_update", type="string",
                      help="if filename is given, previous results will be read from there and only changed sets will be computed [default=%default].")

    parser.add_option("-p", "--pattern-identifier", dest="pattern_id", type="string",
                      help="pattern to convert a filename to an id [default=%default].")

    parser.add_option("-t", "--tracks", dest="tracks", action="store_true",
                      help="compare files against all tracks in the first file [default=%default]")

    parser.set_defaults(
        filename_update=None,
        pattern_id="(.*).bed",
        tracks=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    # previous_results[a][b] holds the already formatted result columns
    # for the comparison of set a against set b
    previous_results = {}
    if options.filename_update:
        infile = IOTools.openFile(options.filename_update, "r")
        # try/finally guarantees the handle is released even if a line
        # cannot be parsed
        try:
            for line in infile:
                # skip comments and the header row
                if line.startswith(("#", "set1")):
                    continue
                # strip only the newline: slicing off the last character
                # would truncate a final line without trailing newline
                data = line.rstrip("\n").split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})
                previous_results.setdefault(set2, {})

                previous_results[set1][set2] = "\t".join(data[2:])
                # the reverse comparison swaps each pair of adjacent
                # result columns
                # NOTE(review): presumably the columns come in
                # (set1, set2) pairs - confirm against the counter output
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results[set2][set1] = "\t".join(
                    IOTools.flatten(rev))
        finally:
            infile.close()

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        # fall back to the full filename if the pattern does not match
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0

    if options.tracks:
        counter = CounterTracks(args[0])
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())
        for filename in args[1:]:
            title1 = getTitle(filename)
            for title2 in counter.getTracks():

                if previous_results:
                    try:
                        prev = previous_results[title1][title2]
                    except KeyError:
                        pass
                    else:
                        # reuse the stored result
                        options.stdout.write(
                            "%s\t%s\t%s\n" % ((title1, title2, prev)))
                        nupdated += 1
                        continue

                counter.count(filename, title2)
                options.stdout.write(
                    "%s\t%s\t%s\n" % ((title1, title2, str(counter))))
                ncomputed += 1
    else:
        counter = Counter()
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

        for x, filename1 in enumerate(args):

            title1 = getTitle(filename1)

            # compare against every file listed before this one
            for filename2 in args[:x]:
                title2 = getTitle(filename2)
                if previous_results:
                    try:
                        prev = previous_results[title1][title2]
                    except KeyError:
                        pass
                    else:
                        # reuse the stored result
                        options.stdout.write(
                            "%s\t%s\t%s\n" % ((title1, title2, prev)))
                        nupdated += 1
                        continue

                counter.count(filename1, filename2)
                options.stdout.write(
                    "%s\t%s\t%s\n" % ((title1, title2, str(counter))))
                ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))
    E.Stop()
Example #4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The positional arguments are filenames. After a header line, one
    tab-separated row ``set1, set2, <counts>`` is written to
    ``options.stdout`` per comparison:

    * with ``--tracks``, every file after the first is compared against
      each track reported by ``CounterTracks`` for the first file;
    * otherwise every file is compared against each file that precedes
      it on the command line.

    If ``--update`` names a file with previous results, rows found there
    are copied to the output instead of being recomputed.

    Raises
    ------
    ValueError
        if fewer than two positional arguments are given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-u",
        "--update",
        dest="filename_update",
        type="string",
        help=
        "if filename is given, previous results will be read from there and only changed sets will be computed [default=%default]."
    )

    parser.add_option(
        "-p",
        "--pattern-identifier",
        dest="pattern_id",
        type="string",
        help="pattern to convert a filename to an id [default=%default].")

    parser.add_option(
        "-t",
        "--tracks",
        dest="tracks",
        action="store_true",
        help=
        "compare files against all tracks in the first file [default=%default]"
    )

    parser.set_defaults(
        filename_update=None,
        pattern_id="(.*).bed",
        tracks=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    # previous_results[a][b] holds the already formatted result columns
    # for the comparison of set a against set b
    previous_results = {}
    if options.filename_update:
        infile = IOTools.openFile(options.filename_update, "r")
        # try/finally guarantees the handle is released even if a line
        # cannot be parsed
        try:
            for line in infile:
                # skip comments and the header row
                if line.startswith(("#", "set1")):
                    continue
                # strip only the newline: slicing off the last character
                # would truncate a final line without trailing newline
                data = line.rstrip("\n").split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})
                previous_results.setdefault(set2, {})

                previous_results[set1][set2] = "\t".join(data[2:])
                # the reverse comparison swaps each pair of adjacent
                # result columns
                # NOTE(review): presumably the columns come in
                # (set1, set2) pairs - confirm against the counter output
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results[set2][set1] = "\t".join(
                    IOTools.flatten(rev))
        finally:
            infile.close()

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        # fall back to the full filename if the pattern does not match
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0

    if options.tracks:
        counter = CounterTracks(args[0])
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())
        for filename in args[1:]:
            title1 = getTitle(filename)
            for title2 in counter.getTracks():

                if previous_results:
                    try:
                        prev = previous_results[title1][title2]
                    except KeyError:
                        pass
                    else:
                        # reuse the stored result
                        options.stdout.write("%s\t%s\t%s\n" %
                                             ((title1, title2, prev)))
                        nupdated += 1
                        continue

                counter.count(filename, title2)
                options.stdout.write("%s\t%s\t%s\n" %
                                     ((title1, title2, str(counter))))
                ncomputed += 1
    else:
        counter = Counter()
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

        for x, filename1 in enumerate(args):

            title1 = getTitle(filename1)

            # compare against every file listed before this one
            for filename2 in args[:x]:
                title2 = getTitle(filename2)
                if previous_results:
                    try:
                        prev = previous_results[title1][title2]
                    except KeyError:
                        pass
                    else:
                        # reuse the stored result
                        options.stdout.write("%s\t%s\t%s\n" %
                                             ((title1, title2, prev)))
                        nupdated += 1
                        continue

                counter.count(filename1, filename2)
                options.stdout.write("%s\t%s\t%s\n" %
                                     ((title1, title2, str(counter))))
                ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))
    E.Stop()
Example #5
0
File: diff_gtf.py Project: SCV/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The positional arguments are filenames. After a header line, one
    tab-separated row ``set1, set2, <counts>`` is written to
    ``options.stdout`` for every file compared against each file that
    precedes it on the command line. ``--output-only-genes`` selects
    ``CounterGenes`` instead of ``Counter`` for the comparison.

    If ``--update`` names a file with previous results, rows found there
    are copied to the output instead of being recomputed.

    Raises
    ------
    ValueError
        if fewer than two positional arguments are given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--ignore-strand", dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.add_option(
        "-u", "--update", dest="filename_update", type="string",
        help="if filename is given, previous results will be read "
        "from there and only changed sets will be computed "
        "[default=%default].")

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern_id", type="string",
        help="pattern to convert a filename to an id "
        "[default=%default].")

    parser.add_option(
        "-g", "--output-only-genes", dest="output_only_genes",
        action="store_true",
        help="only output gene stats (includes gene lists)"
        " [default=%default].")

    parser.set_defaults(
        ignore_strand=False,
        filename_update=None,
        pattern_id="(.*).gtf",
        output_only_genes=False,
    )

    # pass the resolved argument vector on so that an explicit *argv*
    # is honoured instead of silently falling back to sys.argv
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        # parenthesised single argument prints identically on Python 2
        print(USAGE)
        raise ValueError("at least two arguments required")

    # previous_results[a][b] holds the already formatted result columns
    # for the comparison of set a against set b
    previous_results = {}
    if options.filename_update:
        # the with-block closes the file even if a line cannot be parsed
        with open(options.filename_update, "r") as infile:
            for line in infile:
                # skip comments and the header row
                if line.startswith(("#", "set1")):
                    continue
                # strip only the newline: slicing off the last character
                # would truncate a final line without trailing newline
                data = line.rstrip("\n").split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})
                previous_results.setdefault(set2, {})

                previous_results[set1][set2] = "\t".join(data[2:])
                # the reverse comparison swaps each pair of adjacent
                # result columns
                # NOTE(review): presumably the columns come in
                # (set1, set2) pairs - confirm against the counter output
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results[set2][set1] = "\t".join(
                    IOTools.flatten(rev))

    if options.output_only_genes:
        counter = CounterGenes()
    else:
        counter = Counter()

    options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        # fall back to the full filename if the pattern does not match
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0
    for x, filename1 in enumerate(args):
        title1 = getTitle(filename1)
        # compare against every file listed before this one
        for filename2 in args[:x]:
            title2 = getTitle(filename2)
            if previous_results:
                try:
                    prev = previous_results[title1][title2]
                except KeyError:
                    pass
                else:
                    # reuse the stored result
                    options.stdout.write(
                        "%s\t%s\t%s\n" % ((title1, title2, prev)))
                    nupdated += 1
                    continue

            counter.count(filename1, filename2)
            options.stdout.write(
                "%s\t%s\t%s\n" % ((title1, title2, str(counter))))
            ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))

    E.Stop()
    # NOTE(review): the two statements below open the database named in
    # PARAMS['database'] and return the handle; they look unrelated to
    # the comparison above - confirm they belong to this function.
    dbh = sqlite3.connect(PARAMS['database'])
    return dbh


@transform(INPUT_FORMATS, regex("(.*)"), r"\1")
def unprocessReads(infiles, outfiles):
    """Placeholder task: the reads are not processed.

    The body is intentionally empty; neither *infiles* nor *outfiles*
    is touched.
    """


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
                    "specifying 'auto_remove'" % f)

        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads,
                   regex(SEQUENCEFILES_REGEX),
                   r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            '''
Example #7
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The positional arguments are filenames. After a header line, one
    tab-separated row ``set1, set2, <counts>`` is written to
    ``options.stdout`` for every file compared against each file that
    precedes it on the command line. ``--output-only-genes`` selects
    ``CounterGenes`` instead of ``Counter`` for the comparison.

    If ``--update`` names a file with previous results, rows found there
    are copied to the output instead of being recomputed.

    Raises
    ------
    ValueError
        if fewer than two positional arguments are given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.add_option(
        "-u",
        "--update",
        dest="filename_update",
        type="string",
        help="if filename is given, previous results will be read "
        "from there and only changed sets will be computed "
        "[default=%default].")

    parser.add_option("-p",
                      "--pattern-identifier",
                      dest="pattern_id",
                      type="string",
                      help="pattern to convert a filename to an id "
                      "[default=%default].")

    parser.add_option("-g",
                      "--output-only-genes",
                      dest="output_only_genes",
                      action="store_true",
                      help="only output gene stats (includes gene lists)"
                      " [default=%default].")

    parser.set_defaults(
        ignore_strand=False,
        filename_update=None,
        pattern_id="(.*).gtf",
        output_only_genes=False,
    )

    # pass the resolved argument vector on so that an explicit *argv*
    # is honoured instead of silently falling back to sys.argv
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        # parenthesised single argument prints identically on Python 2
        print(USAGE)
        raise ValueError("at least two arguments required")

    # previous_results[a][b] holds the already formatted result columns
    # for the comparison of set a against set b
    previous_results = {}
    if options.filename_update:
        # the with-block closes the file even if a line cannot be parsed
        with open(options.filename_update, "r") as infile:
            for line in infile:
                # skip comments and the header row
                if line.startswith(("#", "set1")):
                    continue
                # strip only the newline: slicing off the last character
                # would truncate a final line without trailing newline
                data = line.rstrip("\n").split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})
                previous_results.setdefault(set2, {})

                previous_results[set1][set2] = "\t".join(data[2:])
                # the reverse comparison swaps each pair of adjacent
                # result columns
                # NOTE(review): presumably the columns come in
                # (set1, set2) pairs - confirm against the counter output
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results[set2][set1] = "\t".join(
                    IOTools.flatten(rev))

    if options.output_only_genes:
        counter = CounterGenes()
    else:
        counter = Counter()

    options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        # fall back to the full filename if the pattern does not match
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0
    for x, filename1 in enumerate(args):
        title1 = getTitle(filename1)
        # compare against every file listed before this one
        for filename2 in args[:x]:
            title2 = getTitle(filename2)
            if previous_results:
                try:
                    prev = previous_results[title1][title2]
                except KeyError:
                    pass
                else:
                    # reuse the stored result
                    options.stdout.write("%s\t%s\t%s\n" %
                                         ((title1, title2, prev)))
                    nupdated += 1
                    continue

            counter.count(filename1, filename2)
            options.stdout.write("%s\t%s\t%s\n" %
                                 ((title1, title2, str(counter))))
            ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))

    E.Stop()
    # NOTE(review): the two statements below open the database named in
    # PARAMS['database'] and return the handle; they look unrelated to
    # the comparison above - confirm they belong to this function.
    dbh = sqlite3.connect(PARAMS['database'])
    return dbh


@transform(INPUT_FORMATS, regex("(.*)"), r"\1")
def unprocessReads(infiles, outfiles):
    """Placeholder task: the reads are not processed.

    Neither *infiles* nor *outfiles* is touched.
    """
    return None


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # check if fastqc has been run
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "file %s missing, "
                    "you need to run the pipeline once before "
                    "specifying 'auto_remove'" % f)

        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads,
                   regex(SEQUENCEFILES_REGEX),
                   r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            '''