예제 #1
0
def iterator_sorted(gff_iterator, sort_order="gene"):
    """Read all entries from *gff_iterator* and yield them in sorted order.

    The whole input is collected into a list before anything is yielded.

    *sort_order* selects the sort key:

    ``gene``
        gene_id, contig, start
    ``gene+transcript``
        gene_id, transcript_id, contig, start
    ``contig+gene``
        contig, gene_id, transcript_id, start
    ``transcript``
        transcript_id, contig, start
    ``position``
        contig, start
    ``position+gene``
        entries are sorted by gene_id and start, grouped with
        ``flat_gene_iterator``, the groups are ordered by the contig and
        start of their first entry and then flattened again
    ``gene+exon``
        gene_id, exon_number

    Any other value leaves the entries in their input order.
    """
    # sort orders that amount to a single sort with a fixed key
    key_functions = {
        "gene": lambda e: (e.gene_id, e.contig, e.start),
        "gene+transcript": lambda e: (
            e.gene_id, e.transcript_id, e.contig, e.start),
        "contig+gene": lambda e: (
            e.contig, e.gene_id, e.transcript_id, e.start),
        "transcript": lambda e: (e.transcript_id, e.contig, e.start),
        "position": lambda e: (e.contig, e.start),
        "gene+exon": lambda e: (e.gene_id, e.exon_number),
    }

    records = list(gff_iterator)

    if sort_order in key_functions:
        records.sort(key=key_functions[sort_order])
    elif sort_order == "position+gene":
        # two-stage sort: order within genes first, then order the genes
        # themselves by the location of their first entry
        records.sort(key=lambda e: (e.gene_id, e.start))
        gene_groups = sorted(
            flat_gene_iterator(records),
            key=lambda group: (group[0].contig, group[0].start))
        records = IOTools.flatten(gene_groups)

    for record in records:
        yield record
예제 #2
0
    dbh = sqlite3.connect(PARAMS['database'])
    return dbh


# The pattern "(.*)" captures the complete input name and the replacement
# reproduces it, so each input is mapped onto an output of the same name.
@transform(INPUT_FORMATS, regex("(.*)"), r"\1")
def unprocessReads(infiles, outfiles):
    """Placeholder task that leaves the reads unprocessed.

    The body is intentionally empty: nothing is read or written here.
    The task only provides a named stage that other tasks can take as
    their input.
    """


# If preprocessing tools are specified, preprocessing is done on output that
# has already been generated in a first run of the pipeline.
if PARAMS.get("preprocessors", None):
    if PARAMS["auto_remove"]:
        # Check that fastqc has been run: every file found through the
        # INPUT_FORMATS glob patterns needs a companion path named
        # "<group 1 of REGEX_TRACK>.fastqc".
        # NOTE(review): re.match() returns None for a file name that does
        # not match REGEX_TRACK, which would surface as an AttributeError
        # on .group(1) rather than the ValueError below - confirm that all
        # INPUT_FORMATS hits match REGEX_TRACK.
        for x in IOTools.flatten([glob.glob(y) for y in INPUT_FORMATS]):
            f = re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError("file %s missing, "
                                 "you need to run the pipeline once before "
                                 "specifying 'auto_remove'" % f)

        # This task is only defined when both "preprocessors" and
        # "auto_remove" are set. Outputs are written to fasta.dir/, which
        # is created first.
        @follows(mkdir("fasta.dir"))
        @transform(unprocessReads, regex(SEQUENCEFILES_REGEX),
                   r"fasta.dir/\1.fasta")
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file for each sample of all contaminant adaptor
            sequences for removal
            '''

            print(infile)
예제 #3
0
def main(argv=None):
    """Script main: run the counter over every pair of input files.

    Parses command line options in sys.argv, unless *argv* is given.

    At least two positional arguments (file names) are required. For
    every pair ``(args[x], args[y])`` with ``y < x`` one tab-separated
    row ``title1, title2, <counter output>`` is written to
    ``options.stdout``. Titles are derived from the file names with the
    ``--pattern-identifier`` regular expression; a file name that does
    not match the pattern is used as its own title.

    If ``--update`` names a file holding previous output, rows whose
    pair of titles is present in that file are copied from it instead
    of being recomputed.

    Raises:
        ValueError: if fewer than two positional arguments are given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.add_option(
        "-u",
        "--update",
        dest="filename_update",
        type="string",
        help="if filename is given, previous results will be read "
        "from there and only changed sets will be computed "
        "[default=%default].")

    parser.add_option("-p",
                      "--pattern-identifier",
                      dest="pattern_id",
                      type="string",
                      help="pattern to convert a filename to an id "
                      "[default=%default].")

    parser.add_option("-g",
                      "--output-only-genes",
                      dest="output_only_genes",
                      action="store_true",
                      help="only output gene stats (includes gene lists)"
                      " [default=%default].")

    parser.set_defaults(
        ignore_strand=False,
        filename_update=None,
        pattern_id="(.*).gtf",
        output_only_genes=False,
    )

    # hand the argument vector on, so that an explicitly supplied *argv*
    # is honoured instead of always falling back to sys.argv
    (options, args) = E.start(parser, argv=argv)

    if len(args) < 2:
        print(USAGE)
        raise ValueError("at least two arguments required")

    # previous_results[set1][set2] holds the tab-joined result columns of
    # an earlier run; both orientations of each pair are stored.
    previous_results = {}
    if options.filename_update:
        # the context manager closes the file even if parsing fails
        with IOTools.open_file(options.filename_update, "r") as infile:
            for line in infile:
                # strip only the line terminator; slicing off the last
                # character would corrupt a final line without newline
                line = line.rstrip("\n")
                # skip blank lines, comments and the header row
                if not line or line.startswith(("#", "set1")):
                    continue
                data = line.split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})[set2] = \
                    "\t".join(data[2:])
                # result columns come in adjacent pairs; swapping each
                # pair gives the row for the reversed orientation
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results.setdefault(set2, {})[set1] = \
                    "\t".join(IOTools.flatten(rev))

    counter = CounterGenes() if options.output_only_genes else Counter()

    options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        """Return the first group of pattern_id in *x*, or *x* itself."""
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            # search() returned None: the file name does not match
            return x

    ncomputed, nupdated = 0, 0
    for x, filename1 in enumerate(args):
        title1 = getTitle(filename1)
        # pair each file with all files listed before it
        for filename2 in args[:x]:
            title2 = getTitle(filename2)

            # stored values are always strings, so None means "not cached"
            prev = previous_results.get(title1, {}).get(title2)
            if prev is not None:
                options.stdout.write("%s\t%s\t%s\n" %
                                     (title1, title2, prev))
                nupdated += 1
                continue

            counter.count(filename1, filename2)
            options.stdout.write("%s\t%s\t%s\n" %
                                 (title1, title2, str(counter)))
            ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))

    E.stop()
예제 #4
0
def main(argv=None):
    """Script main: run the counter over pairs of input files or tracks.

    Parses command line options in sys.argv, unless *argv* is given.

    At least two positional arguments (file names) are required. Two
    modes exist:

    * default: for every pair ``(args[x], args[y])`` with ``y < x`` the
      counter is run on the two files.
    * ``--tracks``: a ``CounterTracks`` is built from the first file and
      every remaining file is processed against each track returned by
      its ``getTracks()``.

    Each result is written to ``options.stdout`` as a tab-separated row
    ``title1, title2, <counter output>``. File titles are derived from
    the file names with the ``--pattern-identifier`` regular expression;
    a file name that does not match the pattern is used as its own
    title.

    If ``--update`` names a file holding previous output, rows whose
    pair of titles is present in that file are copied from it instead
    of being recomputed.

    Raises:
        ValueError: if fewer than two positional arguments are given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: "
        "$Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-u", "--update", dest="filename_update", type="string",
        help="if filename is given, previous results will be read "
        "from there and only changed sets will be computed "
        "[default=%default].")

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern_id", type="string",
        help="pattern to convert a filename to an id [default=%default].")

    parser.add_option(
        "-t", "--tracks", dest="tracks", action="store_true",
        help="compare files against all tracks in the first file "
        "[default=%default]")

    parser.set_defaults(
        filename_update=None,
        pattern_id="(.*).bed",
        tracks=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    # previous_results[set1][set2] holds the tab-joined result columns of
    # an earlier run; both orientations of each pair are stored.
    previous_results = {}
    if options.filename_update:
        # the context manager closes the file even if parsing fails
        with IOTools.open_file(options.filename_update, "r") as infile:
            for line in infile:
                # strip only the line terminator; slicing off the last
                # character would corrupt a final line without newline
                line = line.rstrip("\n")
                # skip blank lines, comments and the header row
                if not line or line.startswith(("#", "set1")):
                    continue
                data = line.split("\t")
                set1, set2 = data[0], data[1]

                previous_results.setdefault(set1, {})[set2] = \
                    "\t".join(data[2:])
                # result columns come in adjacent pairs; swapping each
                # pair gives the row for the reversed orientation
                rev = [(data[x + 1], data[x])
                       for x in range(2, len(data), 2)]
                previous_results.setdefault(set2, {})[set1] = \
                    "\t".join(IOTools.flatten(rev))

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        """Return the first group of pattern_id in *x*, or *x* itself."""
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            # search() returned None: the file name does not match
            return x

    def getPrevious(title1, title2):
        """Return the cached result columns for the pair, or None."""
        # stored values are always strings, so None means "not cached"
        return previous_results.get(title1, {}).get(title2)

    ncomputed, nupdated = 0, 0

    if options.tracks:
        counter = CounterTracks(args[0])
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())
        for filename in args[1:]:
            title1 = getTitle(filename)
            for title2 in counter.getTracks():
                prev = getPrevious(title1, title2)
                if prev is not None:
                    options.stdout.write(
                        "%s\t%s\t%s\n" % (title1, title2, prev))
                    nupdated += 1
                    continue

                counter.count(filename, title2)
                options.stdout.write(
                    "%s\t%s\t%s\n" % (title1, title2, str(counter)))
                ncomputed += 1
    else:
        counter = Counter()
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

        for x, filename1 in enumerate(args):
            title1 = getTitle(filename1)
            # pair each file with all files listed before it
            for filename2 in args[:x]:
                title2 = getTitle(filename2)
                prev = getPrevious(title1, title2)
                if prev is not None:
                    options.stdout.write(
                        "%s\t%s\t%s\n" % (title1, title2, prev))
                    nupdated += 1
                    continue

                counter.count(filename1, filename2)
                options.stdout.write(
                    "%s\t%s\t%s\n" % (title1, title2, str(counter)))
                ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))
    E.stop()