Example 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.read_map(
            IOTools.open_file(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
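
At its core the script is a read/filter/write loop over BAM files. The following minimal sketch (not part of the original script) shows that pysam pattern in isolation; the file names and the single unmapped-read filter are placeholders for the much richer criteria that bams2bam_filter applies:

import pysam

# Stream reads from one BAM into another, reusing the input header via
# template=, exactly as the script does for its output files. "input.bam"
# and "filtered.bam" are illustrative names.
with pysam.AlignmentFile("input.bam", "rb") as infile, \
        pysam.AlignmentFile("filtered.bam", "wb", template=infile) as outfile:
    kept = skipped = 0
    for read in infile:
        # a stand-in filter; the real script delegates to bams2bam_filter
        if read.is_unmapped:
            skipped += 1
            continue
        outfile.write(read)
        kept += 1

print("kept=%i skipped=%i" % (kept, skipped))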
Example 2
def test_cmdline():
    '''test style of scripts
    '''

    # start script in order to build the command line parser
    global ORIGINAL_START
    if ORIGINAL_START is None:
        ORIGINAL_START = E.start

    # read the first two columns
    map_option2action = IOTools.read_map(
        IOTools.open_file(FILENAME_OPTIONLIST),
        columns=(0, 1),
        has_header=True)

    files = []
    for label, expression in EXPRESSIONS:
        f = glob.glob(expression)
        files.extend(sorted(f))

    files = filter_files(files)

    # make sure to use the current working directory as
    # primary lookup.
    sys.path.insert(0, ".")

    # files = [
    #    'scripts/check_db.py',
    #    'scripts/cgat_build_report_page.py']

    for f in files:
        if os.path.isdir(f):
            continue
        if os.path.basename(f) in EXCLUDE:
            continue

        script_name = os.path.abspath(f)
        pyxfile = (os.path.join(os.path.dirname(f), "_") +
                   os.path.basename(f) + "x")

        fail_.description = script_name
        # check if script contains getopt
        with IOTools.open_file(script_name) as inf:
            if "getopt" in inf.read():
                yield (fail_, "script uses getopt directly: %s" % script_name)
                continue

        module, modulename = load_script(script_name)
        if module is None:
            yield (fail_, "module could not be imported: %s\n" % script_name)
            continue
        E.start = LocalStart

        try:
            module.main(argv=["dummy", "--help"])
        except AttributeError:
            yield (fail_, "no main method in %s\n" % script_name)
            ok_(False, "no main method in %s" % script_name)
        except SystemExit:
            yield (fail_, "script does not use E.start() %s\n" % script_name)
        except DummyError:
            pass

        for option in PARSER.option_list:
            # ignore options added by optparse
            if option.dest is None:
                continue

            optstring = option.get_opt_string()
            if optstring.startswith("--"):
                optstring = optstring[2:]

            check_option.description = script_name + ":" + optstring

            yield (check_option, optstring, os.path.abspath(f),
                   map_option2action)

        # clear up
        del sys.modules[modulename]

        # scripts with pyximport need special handling.
        #
        # Multiple imports of pyximport seems to create
        # some confusion - here, clear up sys.meta_path after
        # each script
        if os.path.exists(pyxfile):
            sys.meta_path = []
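
The trick that makes this test work is swapping E.start for a stub that records the parser and raises, so each script's main() stops right after building its options. Below is a self-contained sketch of that pattern; LocalStart, DummyError and PARSER mirror the harness's names, and a plain optparse parser stands in for a real script:

import optparse

class DummyError(Exception):
    """Raised by the stub so main() aborts right after option parsing."""

PARSER = None

def LocalStart(parser, **kwargs):
    """Stand-in for E.start(): capture the parser, then bail out."""
    global PARSER
    PARSER = parser
    raise DummyError()

def main(argv=None):
    parser = optparse.OptionParser()
    parser.add_option("--gtf-file", dest="filename_gtf")
    start(parser, argv=argv)  # a real script would parse argv and run here

start = LocalStart  # the harness does the same with E.start = LocalStart

try:
    main(argv=["dummy", "--help"])
except DummyError:
    pass

# the captured parser can now be inspected, as the test above does
print([option.get_opt_string() for option in PARSER.option_list])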
Example 3
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "map-identifier", "nop", "remove-stops", "upper", "lower",
                 "reverse-complement", "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x",
                      "--ignore-errors",
                      dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion",
                      dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern",
                      dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern",
                      dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      type="string",
                      action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="type",
        type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.add_option(
        "--map-tsv-file",
        dest="map_tsv_file",
        type="string",
        help="input filename with map for identifiers. "
        "The first row is a header.")

    parser.add_option(
        "--fold-width",
        dest="fold_width",
        type="int",
        help="fold width for sequence output. 0 is unfolded [%default]")

    parser.set_defaults(methods=[],
                        parameters="",
                        type="na",
                        aa_mask_chars="xX",
                        aa_mask_char="x",
                        na_mask_chars="nN",
                        na_mask_char="n",
                        gap_chars="-.",
                        gap_char="-",
                        template_identifier="ID%06i",
                        ignore_errors=False,
                        exclude_pattern=None,
                        include_pattern=None,
                        sample_proportion=None,
                        filter_methods=[],
                        input_filename_fasta="-",
                        input_filename_map=None,
                        fold_width=80)

    (options, args) = E.start(parser)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    map_seq2nid = {}

    map_identifier = ("apply-map" in options.methods
                      or "map-identifier" in options.methods)
    if map_identifier:
        if options.input_filename_map is None:
            raise ValueError("for method=map-identifier use --map-tsv-file")
        with IOTools.open_file(options.input_filename_map) as infile:
            map_identifier = IOTools.read_map(infile, has_header=True)

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        map_codon2code = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:

        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [
                line[:-1] for line in IOTools.open_file(f.split("=")[1])
            ]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    iterator = pysam.FastxFile(options.input_filename_fasta)

    c = E.Counter()

    fold_width = options.fold_width

    def fold(s, w):
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        c.nseq += 1
        c.input += 1

        sequence = re.sub(" ", "", record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(record.name):
            c.skipped += 1
            continue

        if rx_exclude and rx_exclude.search(record.name):
            c.skipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or record.name in filter_id_list):
            c.skipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        record.name, ls)
                    c.errors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" % (
                        record.name, other_record.title))

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        codon = options.gap_char * 3
                    else:
                        codon = other_sequence[x:x + 3]
                        x += 3
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for x in c:
                                if x in options.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (record.name))

                # Check if hard masked seq contains repeat (N); if so, replace
                # N with lowercase sequence from the unmasked version. Only
                # rebuild the sequence when the two versions differ, otherwise
                # joining the empty buffer would wipe the sequence out.
                if sequence != hm_sequence:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in options.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match("^(\S+)", record.name).groups()[0]
                if id in map_seq2nid:
                    rest = record.name[len(id):]
                    record.name = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match("^(\S+)", record.name).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    raise "duplicate fasta entries - can't map those: %s" % id
                map_seq2nid[id] = new_id
                record.name = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([x for x in seq[x:x + 3] if x in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        %
                        (record.name, len(other_sequence) * 3, len(sequence)))

                seq = list(sequence)
                # track the codon offset in `pos` rather than `c`, which
                # holds the E.Counter
                pos = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        if x.isupper():
                            seq[pos:pos + 3] = \
                                [options.na_mask_char.upper()] * 3
                        else:
                            seq[pos:pos + 3] = \
                                [options.na_mask_char.lower()] * 3
                    pos += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            c.skipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            c.skipped += 1
            continue

        record.sequence = sequence
        if fold_width >= 0:
            if record.comment:
                options.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                options.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            options.stdout.write(str(record) + "\n")

        c.output += 1

    if "build-map" in options.methods:
        p = options.parameters[0]
        if p:
            outfile = IOTools.open_file(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info(c)
    E.stop()
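
Most of the per-sequence methods above share one idiom: slice the sequence into codons with [sequence[x:x + 3] for x in range(0, l, 3)] and transform each chunk. A tiny, self-contained illustration of that idiom for truncate-at-stop follows; the stop-codon set is the standard genetic-code one, and Genomics.MapCodon2AA is deliberately avoided so the sketch runs without CGAT:

STOP_CODONS = {"TAA", "TAG", "TGA"}

def truncate_at_stop(sequence):
    """Return the sequence up to (excluding) the first stop codon."""
    codons = [sequence[x:x + 3] for x in range(0, len(sequence), 3)]
    out = []
    for codon in codons:
        if codon.upper() in STOP_CODONS:
            break
        out.append(codon)
    return "".join(out)

print(truncate_at_stop("ATGGCCTAAGGG"))  # -> "ATGGCC"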
Example 4
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--fasta",
                      dest="input_filename_fasta",
                      type="string",
                      help="filename with fasta sequences. ")

    parser.add_option("-o",
                      "--output-filename-sequences",
                      dest="output_filename_sequences",
                      type="string",
                      help="output per sequence information to filename")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      action="append",
                      type="choice",
                      choices=["map-identifier", "nop"],
                      help="method to apply")

    parser.add_option(
        "--input-filename-map",
        dest="input_filename_map",
        type="string",
        help="input filename with map for identifiers. "
        "The first row is a header.")

    parser.add_option(
        "--fold-width",
        dest="fold_width",
        type="int",
        help="fold width for sequence output. 0 is unfolded [%default]")

    parser.set_defaults(input_filename_fasta="-",
                        methods=[],
                        input_filename_map=None,
                        fold_width=80)

    (options, args) = E.start(parser, argv=argv)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    if not options.methods:
        raise ValueError("please specify at least one method")

    map_identifier = "map-identifier" in options.methods
    if map_identifier:
        if options.input_filename_map is None:
            raise ValueError(
                "for method=map-identifier use --input-filename-map")
        with IOTools.open_file(options.input_filename_map) as infile:
            map_identifier = IOTools.read_map(infile, has_header=True)

    iterator = pysam.FastxFile(options.input_filename_fasta)

    c = E.Counter()

    fold_width = options.fold_width

    def fold(s, w):
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        c.input += 1

        if map_identifier:
            record.name = map_identifier.get(record.name, record.name)

        if fold_width >= 0:
            if record.comment:
                options.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                options.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            options.stdout.write(str(record) + "\n")

        c.output += 1

    E.info(c)
    E.stop()
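
The renaming above relies on IOTools.read_map turning a two-column TSV (with a header row) into a dict. Here is a stand-alone approximation so the lookup logic can be tried without CGAT installed; this read_map is a simplified assumption, not the CGAT implementation:

import io

def read_map(infile, has_header=True):
    # simplified stand-in for IOTools.read_map: first column -> second column
    if has_header:
        next(infile)
    return dict(line.rstrip("\n").split("\t")[:2]
                for line in infile if line.strip())

tsv = io.StringIO("old\tnew\nseq1\tgeneA\nseq2\tgeneB\n")
id_map = read_map(tsv)

for name in ("seq1", "seq3"):
    # unmapped identifiers pass through unchanged, as in the loop above
    print(id_map.get(name, name))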
Example 5
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use [default=%default].")

    parser.add_option("-i",
                      "--slims",
                      dest="filename_slims",
                      type="string",
                      help="filename with GO SLIM categories "
                      "[default=%default].")

    parser.add_option("-g",
                      "--genes-tsv-file",
                      dest="filename_genes",
                      type="string",
                      help="filename with genes to analyse "
                      "[default=%default].")

    parser.add_option("-b",
                      "--background-tsv-file",
                      dest="filename_background",
                      type="string",
                      help="filename with background genes to analyse "
                      "[default=%default].")

    parser.add_option("-m",
                      "--min-counts",
                      dest="minimum_counts",
                      type="int",
                      help="minimum count - ignore all categories that have "
                      "fewer than # number of genes"
                      " [default=%default].")

    parser.add_option("-o",
                      "--sort-order",
                      dest="sort_order",
                      type="choice",
                      choices=("fdr", "pvalue", "ratio"),
                      help="output sort order [default=%default].")

    parser.add_option("--ontology",
                      dest="ontology",
                      type="string",
                      action="append",
                      help="go ontologies to analyze. Ontologies are tested "
                      "separately [default=%default].")

    parser.add_option(
        "-t",
        "--threshold",
        dest="threshold",
        type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option("--filename-dump",
                      dest="filename_dump",
                      type="string",
                      help="dump GO category assignments into a flatfile "
                      "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file",
        dest="filename_gene2name",
        type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology",
        dest="filename_ontology",
        type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option("--filename-input",
                      dest="filename_input",
                      type="string",
                      help="read GO category assignments from a flatfile "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample",
                      type="int",
                      help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option("--fdr",
                      dest="fdr",
                      action="store_true",
                      help="calculate and filter by FDR default=%default].")

    parser.add_option(
        "--go2goslim",
        dest="go2goslim",
        action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option("--gene-pattern",
                      dest="gene_pattern",
                      type="string",
                      help="pattern to transform identifiers to GO gene names "
                      "[default=%default].")

    parser.add_option("--filename-map-slims",
                      dest="filename_map_slims",
                      type="string",
                      help="write mapping between GO categories and GOSlims "
                      "[default=%default].")

    parser.add_option(
        "--get-genes",
        dest="get_genes",
        type="string",
        help="list all genes in the with a certain GOID [default=%default].")

    parser.add_option(
        "--strict",
        dest="strict",
        action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q",
        "--fdr-method",
        dest="qvalue_method",
        type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise",
        dest="compute_pairwise",
        action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    # help="fdr computation: method for estimating pi0 [default=%default]."  )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None)

    (options, args) = E.start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies to all GO namespaces
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"
            ]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = Database.connect(url=options.database_url)

        outfile = IOTools.open_file(options.filename_dump,
                                    "w",
                                    create_dir=True)
        GO.DumpGOFromDatabase(outfile, dbhandle, options)
        outfile.close()
        E.stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = IOTools.open_file(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = IOTools.open_file(options.filename_gene2name)
        gene2name = IOTools.read_map(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())), len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = IOTools.open_file(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)

        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(go.mId,
                                                        go_type=go.mNameSpace,
                                                        description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes, gene_pattern=options.gene_pattern)

    E.info("read %i genes for forground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:

        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background, gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join(
        ("genelist", "ontology", "significant", "threshold", "ngenes",
         "ncategories", "nmaps", "nforegound", "nforeground_mapped",
         "nbackground", "nbackground_mapped", "nsample_counts",
         "nbackground_counts", "psample_assignments",
         "pbackground_assignments", "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)
        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle.Connect(options)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle, test_ontology, options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn("could not find information for terms - "
                   "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" % (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set([
                x for x, y in counts_per_category.items()
                if y < options.minimum_counts
            ])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" % (ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug-fix: background included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    IOTools.open_file(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims, len(go_slims), len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = IOTools.open_file(options.filename_map_slims,
                                                    "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = GO.MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" %
                        (ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write("# genes in GO category %s\n" %
                                     options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % (x, "fg"))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % (x, "bg"))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % (x, "ng"))

                E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng)))

                E.stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results, foreground,
                                                       background, options,
                                                       test_ontology, gene2go,
                                                       go2info)
                for go_id, result in pairs:
                    result.mQValue = fdrs[go_id][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            #############################################################
            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            #############################################################
            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write("mapped_categories\t%i\tmapped categories\n" %
                          ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n"
                % (len(go_results.mSampleGenes)))
            outfile.write("genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n"
                % (len(go_results.mBackgroundGenes)))
            outfile.write("associations_in_fg\t%i\tassociations in sample\n" %
                          go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n"
                % (IOTools.pretty_percent(len(go_results.mSampleGenes),
                                          len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n"
                % (IOTools.pretty_percent(len(go_results.mBackgroundGenes),
                                          nbackground, "%5.2f")))
            outfile.write("significant\t%i\tsignificant results reported\n" %
                          nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n"
                % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant down-regulated results reported\n"
                % nselected_down)
            outfile.write("threshold\t%6.4f\tsignificance threshold\n" %
                          options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(
                map(str, (genelist_name, test_ontology, nselected,
                          options.threshold, ngenes, ncategories, nmaps,
                          len(foreground), len(go_results.mSampleGenes),
                          nbackground, len(go_results.mBackgroundGenes),
                          go_results.mSampleCountsTotal,
                          go_results.mBackgroundCountsTotal,
                          IOTools.pretty_percent(len(go_results.mSampleGenes),
                                                 len(foreground), "%5.2f"),
                          IOTools.pretty_percent(
                              len(go_results.mBackgroundGenes), nbackground,
                              "%5.2f"), ",".join(msgs)))) + "\n")

            #############################################################
            #############################################################
            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files
            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology, go2info, options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.stop()
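
GO.AnalyseGO and GO.computeFDRs encapsulate the statistics here. The per-category test at the heart of this kind of enrichment analysis is a one-sided Fisher's exact test on a 2x2 foreground/background table; the sketch below uses made-up counts and scipy.stats as a stand-in for the CGAT internals:

from scipy.stats import fisher_exact

# hypothetical counts for one GO category
fg_in, fg_total = 30, 200      # foreground genes in / total
bg_in, bg_total = 300, 10000   # background genes in / total

# 2x2 table: rows = foreground/background, columns = in/out of category
table = [[fg_in, fg_total - fg_in],
         [bg_in, bg_total - bg_in]]

# one-sided test for over-representation in the foreground
odds_ratio, pvalue = fisher_exact(table, alternative="greater")
print("ratio=%.2f pvalue=%.2e" % (odds_ratio, pvalue))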