Code example #1
File: farm.py  Project: logust79/cgat-core
def main(argv=None):

    parser = get_option_parser()

    (options, args) = E.start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:

        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    P.get_parameters()
    P.start_session()

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        statements = [build_command(x) for x in data]
        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.stop()
            sys.exit(0)

        P.run(statements)
    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:

            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests, options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = IOTools.open_file(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".png"):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = IOTools.open_file(options.output_pattern % filename,
                                            "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests), len(started_requests) -
            len(failed_requests), len(failed_requests), niterations))

    E.stop()
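The chunking helpers referenced above (chunk_iterator_lines, chunk_iterator_column, ...) are defined elsewhere in farm.py; each splits the input stream into temporary files under prefix and yields the chunk filenames that build_command later wraps. A minimal sketch of a line-based splitter following that contract (the helper name _write_chunk and the exact file naming are assumptions, not the farm.py original):

import os
import tempfile


def chunk_iterator_lines(infile, args, prefix=None, use_header=False):
    """Split infile into chunks of args[0] lines and yield chunk filenames.

    Hypothetical sketch of the contract used by main() above; the real
    farm.py implementation differs in details.
    """
    chunk_size = args[0]
    header = infile.readline() if use_header else None
    lines, nchunk = [], 0
    for line in infile:
        lines.append(line)
        if len(lines) == chunk_size:
            yield _write_chunk(lines, header, prefix, nchunk)
            lines, nchunk = [], nchunk + 1
    if lines:
        yield _write_chunk(lines, header, prefix, nchunk)


def _write_chunk(lines, header, prefix, nchunk):
    # hypothetical helper: write one chunk to its own file,
    # repeating the header line if one was read
    filename = os.path.join(prefix or tempfile.gettempdir(),
                            "chunk_%06i.in" % nchunk)
    with open(filename, "w") as outfile:
        if header is not None:
            outfile.write(header)
        outfile.writelines(lines)
    return filename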
Code example #2
def AnalyseGO(gene2go, genes, genes_background=None, do_probabilities=True):
    """analyse GO ids.

    gene2go: map of gene ids to GO ids
    genes: sample set of genes
    genes_background: background set of genes (default: all genes in gene2go)
    """
    if genes_background is None:
        genes_background = list(gene2go.keys())

    result = GOResults()

    # get background frequencies
    (background_counts_total, background_counts, background_genes) = \
        GetGOFrequencies(gene2go,
                         genes_background)

    result.mBackgroundCountsTotal = background_counts_total
    result.mBackgroundNumCategories = len(background_counts)
    result.mBackgroundGenes = background_genes

    # get sample frequencies
    (sample_counts_total, sample_counts, sample_genes) = \
        GetGOFrequencies(gene2go,
                         genes)

    result.mNumGenes = len(genes)

    result.mSampleCountsTotal = sample_counts_total
    result.mSampleNumCategories = len(sample_counts)
    result.mSampleGenes = sample_genes

    # test for over or underrepresented categories in the slims
    # report results for all go categories in the background
    # so that also categories completely absent in the foreground (sample)
    # are considered.
    for go_id in list(background_counts.keys()):

        result_go = GOResult(go_id)

        # use gene counts
        result_go.mSampleCountsCategory = sample_counts.get(go_id, 0)
        result_go.mSampleCountsTotal = len(sample_genes)
        result_go.mBackgroundCountsTotal = len(background_genes)
        result_go.mBackgroundCountsCategory = background_counts[go_id]

        E.debug(
            "processing %s: genes in foreground=%i, genes in backgound=%i, sample_counts=%i, background_counts=%i"
            % (
                go_id,
                len(sample_genes),
                len(background_genes),
                sample_counts.get(go_id, 0),
                background_counts.get(go_id, 0),
            ))

        if do_probabilities:
            try:
                result_go.UpdateProbabilities()
            except AssertionError as msg:
                print(msg)
                print("# error while calculating probabilities for %s" % go_id)
                print("# genes in sample", sample_genes)
                print("# counts in sample: %i out of %i total" %
                      (result_go.mSampleCountsCategory,
                       result_go.mSampleCountsTotal))
                print("# counts in background %i out of %i total" %
                      (result_go.mBackgroundCountsCategory,
                       result_go.mBackgroundCountsTotal))
                for x in list(sample_genes.keys()):
                    for y in gene2go[x]:
                        print(x, str(y))

                sys.exit(0)

        result.mResults[go_id] = result_go

    return result
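A minimal usage sketch of AnalyseGO with toy data; the shape of gene2go (gene id mapped to a collection of GO annotations) is an assumption based on how it is consumed above:

# toy annotation map (hypothetical GO ids); the value type only needs
# to be compatible with GetGOFrequencies
gene2go = {
    "geneA": ["GO:0008150"],
    "geneB": ["GO:0008150", "GO:0003674"],
    "geneC": ["GO:0003674"],
}

# foreground of two genes against the default background (all of gene2go)
result = AnalyseGO(gene2go, genes=["geneA", "geneB"], do_probabilities=False)

for go_id, result_go in result.mResults.items():
    print(go_id,
          result_go.mSampleCountsCategory,
          result_go.mBackgroundCountsCategory)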
Code example #3
File: run_function.py  Project: logust79/cgat-flow
def main(argv=None):

    # Parse the options
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-p",
        "--params",
        dest="params",
        type="string",
        help="comma separated list of addtional parameter strings")

    parser.add_option("-m",
                      "--module",
                      dest="module",
                      type="string",
                      help="the full path to the module file",
                      default=None)

    parser.add_option("-i",
                      "--input",
                      dest="input_filenames",
                      type="string",
                      action="append",
                      help="input filename")

    parser.add_option("-o",
                      "--output-section",
                      dest="output_filenames",
                      type="string",
                      action="append",
                      help="output filename")

    parser.add_option("-f",
                      "--function",
                      dest="function",
                      type="string",
                      help="the module function",
                      default=None)

    parser.set_defaults(input_filenames=[], output_filenames=[], params=None)

    (options, args) = E.Start(parser)

    # Check a module and function have been specified
    if not options.module or not options.function:
        raise ValueError("Both a function and Module must be specified")

    # If a full path was given, add this path to the system path
    location = os.path.dirname(options.module)
    if location != "":
        sys.path.append(location)

    # Establish the module name, accommodating cases where the
    # .py extension has been included in the module name
    module_name = os.path.basename(options.module)
    if module_name.endswith(".py"):
        module_base_name = module_name[:-3]
    else:
        module_base_name = module_name

    # Import the specified module and map the specified function
    E.info("importing module '%s' " % module_base_name)
    E.debug("sys.path is: %s" % sys.path)

    module = importlib.import_module(module_base_name)
    try:
        function = getattr(module, options.function)
    except AttributeError as msg:
        raise AttributeError(
            "%s: unknown function, available functions are: %s" %
            (msg, ",".join([x for x in dir(module) if not x.startswith("_")])))

    if options.input_filenames and not options.input_filenames == ["None"]:
        infiles = options.input_filenames
    else:
        infiles = False

    if options.output_filenames and not options.output_filenames == ["None"]:
        outfiles = options.output_filenames
    else:
        outfiles = False

    # Parse the parameters into an array
    if options.params:
        params = [param.strip() for param in options.params.split(",")]
    else:
        params = False

    # deal with single file case
    if infiles and len(infiles) == 1:
        infiles = infiles[0]
    if outfiles and len(outfiles) == 1:
        outfiles = outfiles[0]

    # Make the function call
    if infiles and outfiles and params:
        function(infiles, outfiles, params)
    elif infiles and outfiles and not params:
        function(infiles, outfiles)
    elif params:
        function(params)
    else:
        raise ValueError(
            "Expecting infile+outfile+params or infile+outfile or params")

    E.Stop()
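The call logic above implies a contract for the target function: it is invoked as function(infiles, outfiles, params), function(infiles, outfiles) or function(params), with single-element lists collapsed to a bare filename. A hypothetical module satisfying the infiles+outfiles form, saved e.g. as mymodule.py and run with --module=/path/to/mymodule.py --function=count_lines --input=in.txt --output-section=out.txt:

# mymodule.py -- hypothetical target module for run_function.py
import gzip


def count_lines(infile, outfile):
    """Count lines in infile and write the count to outfile.

    With one --input and one --output-section, run_function.py
    passes bare filename strings, so this receives two strings.
    """
    opener = gzip.open if infile.endswith(".gz") else open
    with opener(infile, "rt") as inf:
        nlines = sum(1 for _ in inf)
    with open(outfile, "w") as outf:
        outf.write("%i\n" % nlines)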
Code example #4
File: Execution.py  Project: CGATOxford/CGATCore
def run(**kwargs):
    """run a command line statement.

    The method runs a single or multiple statements on the cluster
    using drmaa. The cluster is bypassed if:

        * ``to_cluster`` is set to None in the context of the
          calling function.

        * ``--local`` has been specified on the command line
          and the option ``without_cluster`` has been set as
          a result.

        * no libdrmaa is present

        * the global session is not initialized (GLOBAL_SESSION is
          None)

    To decide which statement to run, the method works by examining
    the context of the calling function for a variable called
    ``statement`` or ``statements``.

    If ``statements`` is defined, multiple job scripts are created and
    sent to the cluster. If ``statement`` is defined, a single job
    script is created and sent to the cluster. Additionally, if
    ``job_array`` is defined, the single statement will be submitted
    as an array job.

    Troubleshooting:

       1. DRMAA creates sessions and there is a limited number
          of sessions available. If there are too many, or sessions
          become unavailable after failed jobs, use ``qconf -secl``
          to list sessions and ``qconf -kec #`` to delete sessions.

       2. Memory: 1G of free memory can be requested using the job_memory
          variable: ``job_memory = "1G"``
          If there are error messages like "no available queue", then the
          problem could be that a particular complex attribute has
          not been defined (the code should be ``hc`` for ``host:complex``
          and not ``hl`` for ``host:local``). Note that qrsh/qsub still
          work when invoked directly.

    """

    # combine options using correct preference
    options = dict(list(PARAMS.items()))
    options.update(list(getCallerLocals().items()))
    options.update(list(kwargs.items()))

    # insert legacy synonyms
    options['without_cluster'] = options.get('without_cluster')
    getParallelEnvironment(options)

    # enforce highest priority for cluster options in command-line
    if "cli_cluster_memory_default" in PARAMS:
        options["cluster_memory_default"] = PARAMS["cli_cluster_memory_default"]
    if "cli_cluster_memory_resource" in PARAMS:
        options["cluster_memory_resource"] = PARAMS["cli_cluster_memory_resource"]
    if "cli_cluster_num_jobs" in PARAMS:
        options["cluster_num_jobs"] = PARAMS["cli_cluster_num_jobs"]
    if "cli_cluster_options" in PARAMS:
        options["cluster_options"] = PARAMS["cli_cluster_options"]
    if "cli_cluster_parallel_environment" in PARAMS:
        options["cluster_parallel_environment"] = PARAMS["cli_cluster_parallel_environment"]
    if "cli_cluster_priority" in PARAMS:
        options["cluster_priority"] = PARAMS["cli_cluster_priority"]
    if "cli_cluster_queue" in PARAMS:
        options["cluster_queue"] = PARAMS["cli_cluster_queue"]
    if "cli_cluster_queue_manager" in PARAMS:
        options["cluster_queue_manager"] = PARAMS["cli_cluster_queue_manager"]

    # if the command-line has not been used
    # get information from the legacy job_options
    if options["cluster_options"] == "":
        options["cluster_options"] = options.get("job_options", options["cluster_options"])

    # get the memory requirement for the job
    job_memory = getJobMemory(options, PARAMS)

    # get the queue manager
    queue_manager = PARAMS["cluster_queue_manager"]

    shellfile = os.path.join(PARAMS["workingdir"], "shell.log")

    pid = os.getpid()
    E.debug('task: pid = %i' % pid)

    # connect to global session
    session = GLOBAL_SESSION
    E.debug('task: pid %i: sge session = %s' % (pid, str(session)))

    ignore_pipe_errors = options.get('ignore_pipe_errors', False)
    ignore_errors = options.get('ignore_errors', False)

    # run on cluster if:
    # * to_cluster is not defined or set to True
    # * command line option without_cluster is set to False
    # * an SGE session is present
    run_on_cluster = ("to_cluster" not in options or
                      options.get("to_cluster")) and \
        not options["without_cluster"] and \
        GLOBAL_SESSION is not None

    # SGE compatible job_name
    job_name = re.sub(
        "[:]", "_",
        os.path.basename(options.get("outfile", "ruffus")))

    def _writeJobScript(statement, job_memory, job_name, shellfile):
        # disabled - problems with quoting
        # tmpfile.write( '''echo 'statement=%s' >> %s\n''' %
        # (shellquote(statement), shellfile) )
        # module list outputs to stderr, so merge stderr and stdout

        script = '''#!/bin/bash -e \n
                    echo "%(job_name)s : START -> ${0}" >> %(shellfile)s
                    set | sed 's/^/%(job_name)s : /' &>> %(shellfile)s
                    set +o errexit
                    module list 2>&1 | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    set -o errexit
                    hostname | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    cat /proc/meminfo | sed 's/^/%(job_name)s: /' &>> %(shellfile)s
                    echo "%(job_name)s : END -> ${0}" >> %(shellfile)s
                 ''' % locals()

        # restrict virtual memory
        # Note that there are resources in SGE which could do this directly
        # such as v_hmem.
        # Note that limiting resident set sizes (RSS) with ulimit is not
        # possible in newer kernels.
        script += "ulimit -v %i\n" % IOTools.human2bytes(job_memory)
        script += expandStatement(statement,
                                  ignore_pipe_errors=ignore_pipe_errors)
        script += "\n"

        job_path = getTempFilename(dir=PARAMS["workingdir"])

        with open(job_path, "w") as script_file:
            script_file.write(script)

        return(job_path)

    if run_on_cluster:
        # run multiple jobs
        if options.get("statements"):

            statement_list = []
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_ids, filenames = [], []

            for statement in statement_list:
                E.info("running statement:\n%s" % statement)

                job_path = _writeJobScript(statement, job_memory, job_name, shellfile)

                jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

                job_id = session.runJob(jt)

                job_ids.append(job_id)
                filenames.append((job_path, stdout_path, stderr_path))

                E.debug("job has been submitted with job_id %s" % str(job_id))

            E.debug("waiting for %i jobs to finish " % len(job_ids))

            session.synchronize(job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER,
                                False)

            # collect and clean up
            for job_id, statement, paths in zip(job_ids, statement_list,
                                                filenames):
                job_path, stdout_path, stderr_path = paths
                collectSingleJobFromCluster(session, job_id,
                                            statement,
                                            stdout_path,
                                            stderr_path,
                                            job_path,
                                            ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)

        # run single job on cluster - this can be an array job
        else:

            statement = buildStatement(**options)
            E.info("running statement:\n%s" % statement)

            if options.get("dryrun", False):
                return

            jt = setupDrmaaJobTemplate(session, options, job_name, job_memory)
            E.debug("Job spec is: %s" % jt.nativeSpecification)

            job_path = _writeJobScript(statement, job_memory, job_name, shellfile)
            jt, stdout_path, stderr_path = setDrmaaJobPaths(jt, job_path)

            if "job_array" in options and options["job_array"] is not None:
                # run an array job
                start, end, increment = options.get("job_array")
                E.debug("starting an array job: %i-%i,%i" %
                        (start, end, increment))
                # sge works with 1-based, closed intervals
                job_ids = session.runBulkJobs(jt, start + 1, end, increment)
                E.debug("%i array jobs have been submitted as job_id %s" %
                        (len(job_ids), job_ids[0]))
                retval = session.synchronize(
                    job_ids, drmaa.Session.TIMEOUT_WAIT_FOREVER, True)

                stdout, stderr = getStdoutStderr(stdout_path, stderr_path)

            else:
                # run a single job
                job_id = session.runJob(jt)
                E.debug("job has been submitted with job_id %s" % str(job_id))

                collectSingleJobFromCluster(session, job_id,
                                            statement,
                                            stdout_path,
                                            stderr_path,
                                            job_path,
                                            ignore_errors=ignore_errors)

            session.deleteJobTemplate(jt)
    else:
        # run jobs locally
        statement_list = []
        if options.get("statements"):
            for statement in options.get("statements"):
                options["statement"] = statement
                statement_list.append(buildStatement(**options))
        else:
            statement_list.append(buildStatement(**options))

        if options.get("dryrun", False):
            return

        for statement in statement_list:
            E.info("running statement:\n%s" % statement)

            # process substitution <() and >() does not
            # work through subprocess directly. Thus,
            # the statement needs to be wrapped in
            # /bin/bash -c '...' in order for bash
            # to interpret the substitution correctly.
            if "<(" in statement or ">(" in statement:
                shell = os.environ.get('SHELL', "/bin/bash")
                if "bash" not in shell:
                    raise ValueError(
                        "require bash for advanced shell syntax: <()")
                # Note: pipes.quote is deprecated in Py3, use shlex.quote
                # (not present in Py2.7).
                statement = pipes.quote(statement)
                statement = "%s -c %s" % (shell, statement)

            process = subprocess.Popen(
                expandStatement(
                    statement,
                    ignore_pipe_errors=ignore_pipe_errors),
                cwd=PARAMS["workingdir"],
                shell=True,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)

            # process.stdin.close()
            stdout, stderr = process.communicate()

            if process.returncode != 0 and not ignore_errors:
                raise OSError(
                    "---------------------------------------\n"
                    "Child was terminated by signal %i: \n"
                    "The stderr was: \n%s\n%s\n"
                    "-----------------------------------------" %
                    (-process.returncode, stderr, statement))
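As the docstring explains, run() inspects the caller's local variables, so pipeline tasks communicate by convention rather than by argument. A hypothetical task following that convention (the names to_cluster, job_memory and statement are the ones the docstring and the code above actually read; the task body itself is made up):

# hypothetical pipeline task: run() picks up these locals
# via getCallerLocals()
def count_reads(infile, outfile):
    to_cluster = True   # set to None to bypass the cluster (see docstring)
    job_memory = "1G"   # per-job memory request, parsed by getJobMemory()
    statement = "gzip -dc %(infile)s | wc -l > %(outfile)s" % locals()
    run()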
Code example #5
File: gff2gff.py  Project: logust79/cgat-apps
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: gff2gff.py$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features", "sanitize",
                 "to-forward-coordinates", "to-forward-strand", "rename-chr"),
        help="method to apply [%default]")

    parser.add_option("--ignore-strand",
                      dest="ignore_strand",
                      help="ignore strand information.",
                      action="store_true")

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option("-c",
                      "--contigs-tsv-file",
                      dest="input_filename_contigs",
                      type="string",
                      help="filename with contig lengths.")

    parser.add_option(
        "--agp-file",
        dest="input_filename_agp",
        type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--crop-gff-file",
                      dest="filename_crop_gff",
                      type="string",
                      help="GFF/GTF file to crop against.")

    parser.add_option(
        "--group-field",
        dest="group_field",
        type="string",
        help="gff field/attribute to group by, such as gene_id, "
        "transcript_id, ... [%default].")

    parser.add_option(
        "--filter-range",
        dest="filter_range",
        type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_option("--sanitize-method",
                      dest="sanitize_method",
                      type="choice",
                      choices=("ucsc", "ensembl", "genome"),
                      help="method to use for sanitizing chromosome names. "
                      "[%default].")

    parser.add_option(
        "--flank-method",
        dest="flank_method",
        type="choice",
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        "[%default].")

    parser.add_option("--skip-missing",
                      dest="skip_missing",
                      action="store_true",
                      help="skip entries on missing contigs. Otherwise an "
                      "exception is raised [%default].")

    parser.add_option(
        "--contig-pattern",
        dest="contig_pattern",
        type="string",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report",
        dest="assembly_report",
        type="string",
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report-hasids",
        dest="assembly_report_hasIDs",
        type="int",
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report-ucsccol",
        dest="assembly_report_ucsccol",
        type="int",
        help="column in the assembly report containing ucsc contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-report-ensemblcol",
        dest="assembly_report_ensemblcol",
        type="int",
        help="column in the assembly report containing ensembl contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-extras",
        dest="assembly_extras",
        type="str",
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome [%default].")

    parser.add_option("--extension-upstream",
                      dest="extension_upstream",
                      type="float",
                      help="extension for upstream end [%default].")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream",
                      type="float",
                      help="extension for downstream end [%default].")

    parser.add_option(
        "--min-distance",
        dest="min_distance",
        type="int",
        help="minimum distance of features to merge/join [%default].")

    parser.add_option(
        "--max-distance",
        dest="max_distance",
        type="int",
        help="maximum distance of features to merge/join [%default].")

    parser.add_option(
        "--min-features",
        dest="min_features",
        type="int",
        help="minimum number of features to merge/join [%default].")

    parser.add_option(
        "--max-features",
        dest="max_features",
        type="int",
        help="maximum number of features to merge/join [%default].")

    parser.add_option(
        "--rename-chr-file",
        dest="rename_chr_file",
        type="string",
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (options, args) = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None

    if options.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            IOTools.open_file(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.rename_chr_file:
        chr_map = {}
        with open(options.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not chr_map:
            raise ValueError("Empty mapping dictionary")

    if options.assembly_report:
        df = pd.read_csv(options.assembly_report,
                         comment="#",
                         header=None,
                         sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if options.assembly_report_hasIDs == 1:
            ucsccol = options.assembly_report_ucsccol
            ensemblcol = options.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if options.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif options.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(
                    "when using an assembly report, please specify "
                    "--sanitize-method=ucsc or --sanitize-method=ensembl "
                    "to set the direction of conversion")
        else:
            assembly_dict = {}
        if options.assembly_extras is not None:
            assembly_extras = options.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    if options.method in ("forward_coordinates", "forward_strand",
                          "add-flank", "add-upstream-flank",
                          "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.open_file(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.method in ("add-upstream-flank", "add-downstream-flank",
                          "add-flank"):

        add_upstream_flank = "add-upstream-flank" == options.method
        add_downstream_flank = "add-downstream-flank" == options.method
        if options.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(options.extension_upstream)
        downstream_flank = int(options.extension_downstream)
        extend_flank = options.flank_method == "extend"

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(0,
                                             chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(0,
                                             chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=options.group_field)

        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=options.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=False,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=True,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop":
        for gff in cropGFF(gffs, options.filename_crop_gff):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)", options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, sep, end = re.match(r"(\d+)(\.\.|,|-)(\d+)",
                                           options.filter_range).groups()
            except AttributeError:
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs,
                                         contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "sanitize":

        def assemblyReport(id):
            if id in assembly_dict:
                id = assembly_dict[id]
            # if not in the dict, the contig name is forced
            # into the desired convention; this is helpful for
            # user-modified gff files that contain additional contigs
            elif options.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif options.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id

        if options.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if options.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.contig_pattern:
                to_remove = [
                    re.compile(x) for x in options.contig_pattern.split(",")
                ]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())), str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(list(
                       outofrange_contigs.keys())), str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())), str(filtered_contigs)))

    elif options.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            options.stdout.write(str(gff) + "\n")

    else:

        for gff in gffs:

            if options.method == "forward_coordinates":
                gff.invert(contigs[gff.contig])

            if options.method == "forward_strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            options.stdout.write(str(gff) + "\n")

    E.stop()
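The --rename-chr-file option above expects a plain two-column, tab-separated table of old and new chromosome names. A minimal sketch that writes such a file (the name pairs are hypothetical):

import csv

# hypothetical Ensembl-style -> UCSC-style renaming pairs
mapping = [("1", "chr1"), ("2", "chr2"), ("MT", "chrM")]

with open("rename_chr.tsv", "w", newline="") as outfile:
    writer = csv.writer(outfile, delimiter="\t")
    writer.writerows(mapping)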
Code example #6
File: collect.py  Project: logust79/cgat-flow
    for glob_expression, template, dest in dirs:

        if not os.path.exists(dest):
            os.mkdir(dest)

        files = glob.glob(os.path.abspath(glob_expression))

        for filename in files:
            dirname, name = os.path.split(filename)
            prefix = name[:-3]

            # if os.path.exists( os.path.join( dirname, "_%s.pyx" % prefix )):
            #     E.warn( "ignoring pyximport file _%s.pyx" % prefix )
            #     continue

            filename = os.path.join(os.path.abspath(dest), "%s.rst" % prefix)
            if os.path.exists(filename):
                nskipped += 1
                continue

            E.debug("adding %s" % filename)
            outfile = open(filename, "w")
            outfile.write(template % locals())
            outfile.close()

            ncreated += 1

    E.info("ncreated=%i, nskipped=%i" % (ncreated, nskipped))

    E.Stop()
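For context, the excerpt above expects dirs to hold (glob_expression, template, dest) triples and the counters ncreated/nskipped to be initialized beforehand; the template is filled from locals(), so it can reference names bound in the loop such as prefix. A hypothetical setup (the real collect.py defines its own templates):

# hypothetical setup for the loop above
ncreated, nskipped = 0, 0

# one rst stub per script; %(prefix)s is bound inside the loop
template = """.. automodule:: %(prefix)s
   :members:
"""

dirs = [("scripts/*.py", template, "modules")]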
Code example #7
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: quality2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--quality-threshold",
        dest="quality_threshold",
        type="int",
        help="quality threshold for masking positions [default=%default]")

    parser.add_option(
        "--random",
        dest="random",
        action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.add_option(
        "--map-tsv-file",
        dest="filename_map",
        type="string",
        help="filename in psl format mapping entries in the multiple "
        "alignment to the genome [default=%default]")

    parser.add_option(
        "-q",
        "--quality-file",
        dest="quality_file",
        type="string",
        help="filename with genomic base quality information "
        "[default=%default].")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.start(parser)

    ##################################################
    ##################################################
    ##################################################
    # read map
    ##################################################
    infile = IOTools.open_file(options.filename_map)
    map_genes2genome = {}
    for match in Blat.iterator(infile):
        assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
        map_genes2genome[match.mQueryId] = match
    infile.close()

    ##################################################
    ##################################################
    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    ##################################################
    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        try:
            quality_scores = quality.getSequence(match.mSbjctId, "+",
                                                 match.mSbjctFrom,
                                                 match.mSbjctTo)
        except ValueError as msg:
            nmissed += 1
            E.warn("could not retrieve quality scores for %s:%i-%i: %s" %
                   (match.mSbjctId, match.mSbjctFrom, match.mSbjctTo, msg))
            continue

        # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2genome))
        # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2mali))
        # print quality_scores

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome, map_gene2mali,
                                          map_gene2genome, alignlib_lite.py_RR)
        # print str(alignlib_lite.py_AlignmentFormatEmissions(
        # map_mali2genome))

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        # negative strand
        to_mask = []
        # reverse position
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i"
                    % (cluster_id, p, c, match.mSbjctId, match.strand,
                       map_mali2genome.mapRowToCol(fp), quality_scores[y]))
                if options.frame > 1:
                    start = (p // options.frame) * options.frame
                    to_mask.extend(list(range(start, start + options.frame)))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.stop()
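Iterators.group_by_distance above collapses the sorted list of masked positions into contiguous intervals. A hedged sketch of such a helper, assuming half-open (start, end) intervals and a maximum gap of 1 (the real Iterators module may use different conventions):

def group_by_distance(positions, distance=1):
    """Yield half-open (start, end) intervals over sorted positions.

    Positions no more than `distance` apart fall into the same
    interval. Sketch only; the actual Iterators.group_by_distance
    may differ.
    """
    positions = iter(positions)
    try:
        start = last = next(positions)
    except StopIteration:
        return
    for p in positions:
        if p - last > distance:
            yield start, last + 1
            start = p
        last = p
    yield start, last + 1


# example: masked positions 3,4,5 and 10 give (3, 6) and (10, 11)
print(list(group_by_distance([3, 4, 5, 10])))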