Code Example #1
File: cgat_cwd2list.py Project: logust79/cgat-flow
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    dir2files = {}
    for root, directories, files in os.walk("."):
        dir2files[root] = files

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    filename = "CWD_%s" % st
    E.info("outputting directory state to %s" % filename)
    with IOTools.openFile(filename, "w") as outf:
        outf.write("##contents of cwd on %s\n\n" % st)
        for directory, files in dir2files.items():
            for file in files:
                path = os.path.join(directory, file)
                outf.write(path + "\n")

    # write footer and output benchmark information.
    E.Stop()
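
For orientation, every example on this page follows the same CGAT entry-point pattern: build an E.OptionParser, hand it to E.Start() (which adds the common options and writes a log header), do the work, then call E.Stop() to write the footer and benchmark information. A minimal sketch of that skeleton, assuming the Experiment module is importable as E (the -n/--name option and the greeting are purely illustrative):

import sys

import CGAT.Experiment as E


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # hypothetical option, for illustration only
    parser.add_option("-n", "--name", dest="name", type="string",
                      help="name to greet [%default]")

    parser.set_defaults(name="world")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    E.info("hello %s" % options.name)

    # write footer and output benchmark information.
    E.Stop()


if __name__ == "__main__":
    sys.exit(main(sys.argv))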
Code Example #2
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # stop parsing options at the first argument
    parser.disable_interspersed_args()

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:

        cmd = args[0]
        if len(args) > 1:
            cmd += " '" + "' '".join(args[1:]) + "'"

        s = subprocess.Popen(cmd, shell=True, cwd=os.getcwd(), close_fds=True)

        # no pipes were attached, so communicate() simply waits for the child
        s.communicate()
        returncode = s.returncode
    else:
        returncode = 0

    E.Stop()

    sys.exit(returncode)
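
Note that the single-quote wrapping above breaks if an argument itself contains a single quote. A short sketch of the same re-quoting idea using the standard library's shlex.quote, which handles embedded quotes (the argument list is illustrative):

import shlex
import subprocess

# illustrative argument list containing an embedded single quote
args = ["grep", "-c", "it's a test", "input.txt"]
cmd = " ".join(shlex.quote(a) for a in args)
returncode = subprocess.call(cmd, shell=True)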
Code Example #3
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--filename-genome-sizes",
                      dest="filename_genome_sizes",
                      type="string",
                      help="Filename with chromosome sizes.")

    parser.add_option("-i",
                      "--num-iterations",
                      dest="num_iterations",
                      type="int",
                      help="Number of iterations [%default].")

    parser.set_defaults(filename_genome_sizes=None, num_iterations=1000)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("expected two arguments: tagfile and bedfile")
    tagfile, bedfile = args

    if options.filename_genome_sizes is None:
        raise ValueError("please specify a filename with genome sizes")

    E.info("splitting filename %s into sections" % bedfile)

    # ff = IOTools.FilePool(output_pattern="%s.bed.gz")
    # for bed in Bed.iterator(IOTools.openFile(bedfile)):
    #     ff.write(bed.name, str(bed) + "\n")
    # ff.close()
    ff = glob.glob("*.bed.gz")
    iterations = options.num_iterations
    genomefile = options.filename_genome_sizes
    for testfile in ff:
        E.info("working on %s" % testfile)
        statement = """
        bits_test -a %(testfile)s
                  -b %(tagfile)s
                  -n %(iterations)i
                  -g %(genomefile)s
        """ % locals()

        result = E.run(statement, return_stdout=True)
        print(testfile, result)

    # write footer and output benchmark information.
    E.Stop()
Code Example #4
File: setup_test.py Project: jscaber/cgat-proj057
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("script", "module"),
                      help="type of tests to create [%default].")

    parser.set_defaults(method="script")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        raise ValueError(
            "setup_test.py requires one or more command line arguments")

    targetdir = os.path.dirname(__file__)

    counter = E.Counter()

    for arg in args:
        counter.input += 1
        script_dirname, basename = os.path.split(arg)

        dirname = os.path.join(targetdir, basename)

        if os.path.exists(dirname):
            E.warn("%s already exists - skipping" % basename)
            counter.skipped += 1
            continue

        os.mkdir(dirname)

        with open(os.path.join(dirname, "tests.yaml"), "w") as outf:
            outf.write(YAML_TEMPLATE)

        counter.created += 1

    E.info("%s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
Code Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filenames = args

    c = E.Counter()
    for filename in filenames:
        c.checked += 1
        if os.path.exists(filename + ".log"):
            if IOTools.isComplete(filename + ".log"):
                c.complete += 1
                continue

        if IOTools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        E.info('deleting %s' % filename)
        if options.dry_run:
            continue
        os.unlink(filename)
        c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
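
IOTools.isComplete() comes from CGAT's IOTools module and is not shown here; judging from the footers that E.Stop() writes, it presumably checks for a completion marker at the end of the file. A rough, purely hypothetical stand-in for such a check:

def is_complete(filename):
    # hypothetical: treat a file as complete if its last
    # non-empty line is a "job finished" footer
    with open(filename) as inf:
        lines = [x for x in inf if x.strip()]
    return bool(lines) and lines[-1].startswith("# job finished")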
Code Example #6
File: nofarm.py Project: logust79/cgat-flow
def main(argv=None):

    parser = farm.getOptionParser()

    (options, args) = E.Start(parser,
                              add_cluster_options=True)

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    cmd = re.sub("%DIR%", "", cmd)
    retcode = subprocess.call(cmd,
                              shell=True,
                              stdin=sys.stdin,
                              stdout=sys.stdout,
                              cwd=os.getcwd(),
                              close_fds=True)
    E.Stop()
Code Example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g",
        "--glob",
        dest="glob_pattern",
        type="string",
        help="glob pattern to use for collecting cluster jobs descriptions "
        "[%default]")

    parser.add_option(
        "-i",
        "--input-pattern",
        dest="input_pattern",
        type="string",
        help="regular expression to extract job id from filename [%default].")

    parser.add_option(
        "-o",
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="string to convert a job id to a filename [%default].")

    parser.set_defaults(
        glob_pattern="job*.qsub",
        input_pattern=r"(\S+)\.qsub",
        output_pattern="%s.stdout",
        remove_old=True,
        force=False,
        check_completeness="python",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if args:
        filenames = args
    elif options.glob_pattern:
        filenames = glob.glob(options.glob_pattern)

    ninput, nrun, nskipped, nerrors = 0, 0, 0, 0
    ndeleted = 0

    if options.check_completeness == "python":
        isComplete = checkPythonRuns
    else:
        raise ValueError("unknown completeness check '%s'" %
                         options.check_completeness)

    ##############################################################
    ##############################################################
    ##############################################################
    # decide what to do
    ##############################################################
    jobs = []
    files_to_delete = []

    for filename in filenames:

        ninput += 1
        try:
            job_name = re.search(options.input_pattern, filename).groups()[0]
        except AttributeError:
            options.stderr.write(
                "# could not extract invariant job name from %s\n" % filename)
            nerrors += 1
            continue

        result_filename = options.output_pattern % job_name

        do = False
        status = "up-to-date"

        if options.force:
            status = "force"
            do = True

        if not do:
            if os.path.exists(result_filename):
                if isNewer(filename, result_filename):
                    status = "newer"
                    do = True
                    if options.remove_old:
                        files_to_delete.append(result_filename)
                if not do and not isComplete(result_filename):
                    status = "incomplete"
                    do = True
                    if options.remove_old:
                        files_to_delete.append(result_filename)
            else:
                status = "missing"
                do = True

        E.info("%s->%s (%s)\n" % (filename, result_filename, status))

        if not do:
            nskipped += 1
            continue

        jobs.append(filename)

    ##############################################################
    ##############################################################
    ##############################################################
    # delete old files
    ##############################################################
    for filename in files_to_delete:
        if os.path.exists(filename):
            os.remove(filename)
            ndeleted += 1

    ##############################################################
    ##############################################################
    ##############################################################
    # start jobs
    ##############################################################
    for filename in jobs:

        cmd = "qsub %s" % filename
        try:
            retcode = subprocess.call(cmd, shell=True)
            if retcode != 0:
                if options.loglevel >= 1:
                    options.stdlog.write("# ERROR: failed to execute %s\n" %
                                         cmd)
                nerrors += 1
                continue
        except OSError as e:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# ERROR: failed to execute %s with msg %s\n" % (cmd, e))
            nerrors += 1
            continue
        nrun += 1

    E.info("ninput=%i, nrun=%i, nskipped=%i, ndeleted=%i, nerrors=%i" %
           (ninput, nrun, nskipped, ndeleted, nerrors))

    E.Stop()
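
isNewer() and checkPythonRuns() are helpers defined elsewhere in this script and are not shown. A plausible mtime-based sketch of isNewer (an assumption, not the original implementation):

import os


def isNewer(filename, result_filename):
    # True if the job description was modified after its result was written
    return os.path.getmtime(filename) > os.path.getmtime(result_filename)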
Code Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--path", dest="path", type="string",
                      help="path to scan for files [%default]")

    parser.add_option("-d", "--destination", dest="destination", type="string",
                      help="path to deposit files into [%defaul]")

    parser.set_defaults(path='/ifs/projects/sftp',
                        url='http://www.cgat.org/downloads/',
                        dest='/ifs/projects/overview')

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    statement = "find %s -name 'index.html'" % options.path

    process = subprocess.Popen(statement,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    stdout, stderr = process.communicate()

    # communicate() returns bytes; decode before splitting into filenames
    files = stdout.decode("utf-8").split('\n')
    files.sort()

    outfile = IOTools.openFile(os.path.join(options.dest, "index.html"), "w")

    outfile.write('''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>CGAT project reports</title>
    <link rel="stylesheet" href="cgat.css" type="text/css" />
    <link rel="stylesheet" href="pygments.css" type="text/css" />
    <link rel="shortcut icon" href="http://cgatwiki.anat.ox.ac.uk/favicon.ico">
    <script type="text/javascript" src="sorttable.js"></script>
</head>

  <body>
    <div class="related">
      <h3>Navigation</h3>
      <ul>
        <li><a href="index.html">CGAT Projects Overview</a> &raquo;</li>
      </ul>
    </div>

    <div class="document">
      <div class="documentwrapper">
        <div class="bodywrapper">
          <div class="body">
 <div class="section" id="cgat-pipelines">
<H1>CGAT exported project pages</H1>

<p> 
This page is for internal use only. Do not distribute outside of
CGAT and do not make this page available on the world wide web.
</p>

<table class="sortable">\n''')

    outfile.write(
        '''<tr><th>Project</th><th>Report</th><th>Title</th></tr>\n''')

    for f in files:
        if f == '':
            continue

        proj = re.search(r'(proj\d+)', f).groups()[0]
        relpath = re.sub(r'.*proj\d+/', '', f)
        report = re.sub(r'^[^/]*/', '', os.path.dirname(relpath))

        lines = IOTools.openFile(f).readlines()
        titles = [x for x in lines if "<title>" in x]
        if titles:
            title = re.search("<title>(.*)</title>", titles[0]).groups()[0]
        else:
            title = "NA"

        if title.endswith("documentation"):
            title = title[:-len("documentation")]

        url = os.path.join(options.url, relpath)
        outfile.write(
            '<tr><td>%(proj)s</td><td><a href="%(url)s">%(report)s</a></td><td>%(title)s</td></tr>\n' % locals())

    outfile.write('''
</table>

</div>
</div>

          </div>
        </div>
      </div>
      <div class="sphinxsidebar">
        <div class="sphinxsidebarwrapper">
            <p class="logo"><a href="contents.html">
              <img class="logo" src="cgat_logo.png" alt="Logo"/>
            </a></p>
        </div>
      </div>

</body>
</html>\n''')

    outfile.close()

    E.info('created output file %s' % outfile.name)
    # write footer and output benchmark information.
    E.Stop()
Code Example #9
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="Supply database name")

    parser.add_option("-u",
                      "--indivfile",
                      dest="indivfile",
                      type="string",
                      help="Supply input bed file name for individual utrons")

    parser.add_option("-p",
                      "--partfile",
                      dest="partfile",
                      type="string",
                      help="Supply input bed file name for partnered utrons")

    parser.add_option("-n",
                      "--novelfile",
                      dest="novelfile",
                      type="string",
                      help="Supply input bed file name for novel utrons")

    parser.add_option("-t",
                      "--targetfile",
                      dest="targetfile",
                      type="string",
                      help="Supply input bed file name for miRNA TSs")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="Supply output csv file name")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    global db
    db = options.database

    # get expression data
    expressions = PUtils.fetch_DataFrame(
        "SELECT track, match_gene_id, transfrag_id, fpkm FROM agg_agg_agg_cuffcompare_transcripts CROSS JOIN agg_agg_agg_class WHERE transfrag_id = agg_agg_agg_class.transcript_id AND fpkm > 0",
        options.database)
    expressions = expressions.set_index(
        ["track", "match_gene_id", "transfrag_id"])

    grouped_expression = expressions["fpkm"].groupby(
        level=["track", "match_gene_id"])
    ex_fracts = grouped_expression.apply(lambda x: x / x.sum())
    ex_fracts.to_csv("pruned_expressionfractions.csv")
    ex_sums = grouped_expression.apply(lambda x: x.sum())
    ex_sums.to_csv("pruned_expressionsums.csv")

    ex_sums = pd.read_csv("pruned_expressionsums.csv",
                          names=['track', 'match_gene_id', 'exp_sum'])
    ex_sums = ex_sums.set_index(['match_gene_id', 'track'])
    ex_fracts = pd.read_csv(
        "pruned_expressionfractions.csv",
        names=['track', 'match_gene_id', 'transfrag_id', 'exp_fract'])
    ex_fracts = ex_fracts.set_index(['track', 'match_gene_id', 'transfrag_id'])
    fpkm_ex_fracts = ex_fracts.join(expressions, how='inner')
    fpkm_ex_fracts = fpkm_ex_fracts.reset_index()
    fpkm_ex_fracts = fpkm_ex_fracts.set_index(['match_gene_id', 'track'])
    ex_all = fpkm_ex_fracts.join(ex_sums, how='inner')
    ex_all = ex_all.reset_index()
    ex_all.to_csv("pruned_expression_all.csv")
    ex_all = pd.read_csv("pruned_expression_all.csv")
    ex_all = ex_all.set_index('transfrag_id')

    # stop distances
    ind_utrons = pd.read_table(
        options.indivfile,
        header=0,
        sep='\t',
        names=["chrom", "start", "end", "name", "score", "strand", "stop"],
        usecols=["start", "end", "name", "strand", "stop"],
        compression='gzip')

    ind_utrons['dist'] = ind_utrons.apply(lambda row: getStopDistdf(row),
                                          axis=1)
    ind_utrons = ind_utrons.set_index('name')
    grouped_stopdist = ind_utrons.groupby(level='name')
    transcript_dist = grouped_stopdist.apply(lambda group: group['dist'].max())
    transcript_dist.name = 'dist'
    transcript_over_under_50 = transcript_dist.apply(
        lambda row: getOverUnder50(row))
    transcript_over_under_50.name = 'over_under_50'

    ex_all_dist = ex_all.join(transcript_over_under_50, how='left')
    ex_all_dist = ex_all_dist.join(transcript_dist, how='left')
    ex_all_dist['utron'] = ex_all_dist.apply(lambda row: isUtron(row), axis=1)

    # novel utrons

    novel_utrons = pd.read_table(options.novelfile,
                                 header=0,
                                 sep='\t',
                                 names=[
                                     "chrom", "start", "end", "name", "score",
                                     "strand", "a", "b", "c", "d", "e", "f"
                                 ],
                                 usecols=["start", "end", "name"],
                                 compression='gzip')
    novel_utrons = novel_utrons.set_index(novel_utrons["name"])
    # exclude entries with different start/end utron coordinates
    # in the same transcript
    novel_utrons = novel_utrons.drop_duplicates(subset="name")

    novel_utrons['novel_utron'] = novel_utrons.apply(
        lambda row: insertYesCol(row), axis=1)
    novel_utrons = novel_utrons.drop(['start', 'end', 'name'], axis=1)
    ex_all_dist_nov = ex_all_dist.join(novel_utrons, how='left')

    # miRNA target sites (TSs)

    utron_TSs = pd.read_table(
        options.targetfile,
        header=0,
        sep='\t',
        names=["chrom", "start", "end", "name", "score", "strand", "stop"],
        usecols=["start", "end", "name", "strand", "stop"],
        compression='gzip')
    utron_TSs['miRNA_TS'] = utron_TSs.apply(lambda row: insertYesCol(row),
                                            axis=1)
    utron_TSs = utron_TSs.drop(["start", "end", "strand", "stop"],
                               axis=1).drop_duplicates()
    utron_TSs = utron_TSs.set_index(["name"])
    ex_all_dist_nov_TS = ex_all_dist_nov.join(utron_TSs, how='left')

    # extra utrons

    tcons_ens = pd.read_table(options.partfile,
                              header=0,
                              sep='\t',
                              names=[
                                  "chrom", "start", "end", "name", "score",
                                  "strand", "a", 'b', 'c', 'd', 'e', 'f'
                              ],
                              usecols=["start", "end", "name", "strand"],
                              compression='gzip')
    tcons_ens['TCONS_id'] = tcons_ens.apply(lambda row: get_tcons(row), axis=1)
    tcons_ens['partner_id'] = tcons_ens.apply(lambda row: get_enst(row),
                                              axis=1)
    tcons_ens = tcons_ens.set_index('TCONS_id')

    tcons_ens['partner_id_TCONS'] = tcons_ens.apply(
        lambda row: get_tcons_from_ens(row), axis=1)
    tcons_ens = tcons_ens.drop_duplicates()
    tcons_ens['extra_utron'] = tcons_ens.apply(lambda row: insertYesCol(row),
                                               axis=1)

    partners = tcons_ens[['name', 'partner_id_TCONS']]
    partners = partners[partners['partner_id_TCONS'] != 'No_id']
    partners = partners.set_index('partner_id_TCONS')
    utrons_and_partners = tcons_ens.append(partners)
    utrons_and_partners = utrons_and_partners.join(ex_all_dist_nov,
                                                   how='inner')
    utrons_and_partners = utrons_and_partners.reset_index().drop_duplicates(
        subset=['match_gene_id', 'track', 'index'])
    utrons_and_partners = utrons_and_partners.set_index(
        ['match_gene_id', 'track'])
    groups = utrons_and_partners.groupby(level=['match_gene_id', 'track'])
    sums = groups.apply(lambda group: sum(group['fpkm']))
    utrons_and_partners['partner_exp_sum'] = sums
    utrons_and_partners['partner_exp_fract'] = utrons_and_partners.apply(
        lambda row: row['fpkm'] / row['partner_exp_sum'], axis=1)
    only_utrons = utrons_and_partners[utrons_and_partners['extra_utron'] ==
                                      'Yes']
    only_utrons = only_utrons[[
        'index', 'extra_utron', 'partner_exp_sum', 'partner_exp_fract',
        'partner_id_TCONS', 'partner_id'
    ]]
    only_utrons = only_utrons.reset_index()
    only_utrons = only_utrons.dropna(
        subset=['match_gene_id', 'track', 'index'])
    only_utrons = only_utrons.set_index(['match_gene_id', 'track', 'index'])

    ex_all_dist_nov_TS = ex_all_dist_nov_TS.reset_index()
    ex_all_dist_nov_TS = ex_all_dist_nov_TS.set_index(
        ['match_gene_id', 'track', 'index'])
    ex_all_dist_nov_TS_ext = ex_all_dist_nov_TS.join(only_utrons, how='left')

    # patients and treatment
    final = ex_all_dist_nov_TS_ext.reset_index()
    final['treatment'] = final.apply(lambda row: label_treatment(row), axis=1)
    final['patient'] = final.apply(lambda row: label_patient(row), axis=1)

    final.to_csv(options.outfile)

    # write footer and output benchmark information.
    E.Stop()
Code Example #10
File: run_function.py Project: logust79/cgat-flow
def main(argv=None):

    # Parse the options
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-p",
        "--params",
        dest="params",
        type="string",
        help="comma separated list of addtional parameter strings")

    parser.add_option("-m",
                      "--module",
                      dest="module",
                      type="string",
                      help="the full path to the module file",
                      default=None)

    parser.add_option("-i",
                      "--input",
                      dest="input_filenames",
                      type="string",
                      action="append",
                      help="input filename")

    parser.add_option("-o",
                      "--output-section",
                      dest="output_filenames",
                      type="string",
                      action="append",
                      help="output filename")

    parser.add_option("-f",
                      "--function",
                      dest="function",
                      type="string",
                      help="the module function",
                      default=None)

    parser.set_defaults(input_filenames=[], output_filenames=[], params=None)

    (options, args) = E.Start(parser)

    # Check that both a module and a function have been specified
    if not options.module or not options.function:
        raise ValueError("both a module and a function must be specified")

    # If a full path was given, add this path to the system path
    location = os.path.dirname(options.module)
    if location != "":
        sys.path.append(location)

    # Establish the module name, accommodating cases where the
    # .py extension has been included in the module name
    module_name = os.path.basename(options.module)
    if module_name.endswith(".py"):
        module_base_name = module_name[:-3]
    else:
        module_base_name = module_name

    # Import the specified module and map the specified function
    E.info("importing module '%s' " % module_base_name)
    E.debug("sys.path is: %s" % sys.path)

    module = importlib.import_module(module_base_name)
    try:
        function = getattr(module, options.function)
    except AttributeError as msg:
        raise AttributeError(
            str(msg) + ": unknown function, available functions are: %s" %
            ",".join([x for x in dir(module) if not x.startswith("_")]))

    if options.input_filenames and not options.input_filenames == ["None"]:
        infiles = options.input_filenames
    else:
        infiles = False

    if options.output_filenames and not options.output_filenames == ["None"]:
        outfiles = options.output_filenames
    else:
        outfiles = False

    # Parse the parameters into an array
    if options.params:
        params = [param.strip() for param in options.params.split(",")]
    else:
        params = False

    # deal with single file case
    if infiles and len(infiles) == 1:
        infiles = infiles[0]
    if outfiles and len(outfiles) == 1:
        outfiles = outfiles[0]

    # Make the function call
    if infiles and outfiles and params:
        function(infiles, outfiles, params)
    elif infiles and outfiles and not params:
        function(infiles, outfiles)
    elif params:
        function(params)
    else:
        raise ValueError(
            "Expecting infile+outfile+params or infile+outfile or params")

    E.Stop()
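
For context, here is a hypothetical target module and invocation for run_function.py (module path, function name and filenames are all illustrative). With one input and one output, the lists collapse to single filenames and the call is function(infile, outfile):

# mymodule.py -- hypothetical target for run_function.py
def count_lines(infile, outfile):
    with open(infile) as inf, open(outfile, "w") as outf:
        outf.write("%i\n" % sum(1 for line in inf))

# invoked as:
#   python run_function.py --module=/path/to/mymodule.py \
#       --function=count_lines --input=in.txt --output-section=out.txt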
Code Example #11
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern", type="string",
        help="jobs matching `pattern` in their job "
        "description will be killed [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run", action="store_true",
                      help="do dry run, do not kill [default=%default].")

    parser.set_defaults(
        pattern=None,
        dry_run=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # qstat output arrives as bytes; decode it and wrap it for ElementTree
    output = io.StringIO(
        subprocess.Popen(["qstat", "-xml"],
                         stdout=subprocess.PIPE).communicate()[0].decode())

    tree = xml.etree.ElementTree.ElementTree(file=output)

    ntested = 0
    to_kill = set()

    if options.pattern:
        pattern = re.compile(options.pattern)
    else:
        pattern = None

    for x in tree.iter("job_list"):
        ntested += 1
        job_id = x.find("JB_job_number").text
        name = x.find("JB_name").text
        if pattern and pattern.search(name):
            to_kill.add(job_id)

    nkilled = len(to_kill)
    if to_kill and not options.dry_run:
        p = subprocess.Popen(
            ["qdel", ",".join(to_kill)], stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()

    E.info("ntested=%i, nkilled=%i" % (ntested, nkilled))

    # write footer and output benchmark information.
    E.Stop()
Code Example #12
File: makeGeneset.py Project: logust79/cgat-apps
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--gtf", dest="gtf",
                      type="string",
                      help="path to input gtf")

    parser.add_option("-p", "--gtfpath", dest="gtfpath",
                      type="string",
                      help="path to online gtf")

    parser.add_option("-c", "--remove-contigs", dest="remove_contigs",
                      type="string",
                      help="contigs to remove, delimited by |")

    parser.add_option("-k", "--keep-contigs", dest="keep_contigs",
                      type="string",
                      help="""all contigs to keep, delimited by |.  Contigs
                      specified in --remove-contigs will still be removed""")

    parser.add_option("-o", "--outfile", dest="outfile",
                      type="string",
                      help="path to processed output gtf")

    parser.add_option("-f", "--filter", dest="filters",
                      type="string",
                      action="append",
                      help="""List of filters to apply to your GTF""")

    parser.set_defaults(
        remove_contigs=None,
        keep_contigs=None,
    )

    (options, args) = E.Start(parser)

    if options.gtf:
        gtf = options.gtf
    elif options.gtfpath:
        getGTF(options.gtfpath)
        gtf = options.gtfpath.split("/")[-1]
    else:
        raise ValueError("Please provide a GTF or the path to an online GTF")

    if not options.outfile:
        raise ValueError("Please provide an output file name")

    d = 0
    if options.remove_contigs or options.keep_contigs:
        d += 1
        statement = 'zcat %s |' % gtf

        if options.remove_contigs:
            statement += removeNamedContigs(options.remove_contigs)

        if options.keep_contigs:
            statement += keepOnlyNamedContigs(options.keep_contigs)

        if options.outfile.endswith(".gz"):
            outfile = options.outfile
        else:
            outfile = options.outfile + ".gz"

        statement += "gzip > %s " % outfile

        os.system(statement)

    T1 = gtf
    if options.filters:
        d += 1
        for filterstring in options.filters:
            T2 = P.getTempFilename(".")
            T2 = T2 + ".gtf"
            filterGTF(T1, filterstring, T2)
            T1 = T2

        shutil.move(T2, options.outfile)

    if d == 0:
        raise ValueError("No filters provided")
Code Example #13
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-o",
        "--output_file",
        type="string",
        default=None,
        help="[Optional] Filename to output results to. [default=STDOUT]")
    parser.add_option(
        "-u",
        "--url",
        type="string",
        default="http://www.cbioportal.org/public-portal/webservice.do",
        help="[Optional] Url to the cBioPortal webservice [default=%default]")

    cqueryopts = optparse.OptionGroup(parser, "Common parameters",
                                      "Common arguments to the query")
    cqueryopts.add_option(
        "-s",
        "--study_id",
        dest="study_id",
        type="string",
        default=None,
        help=
        "[Required/Optional] cBioPortal ID for study [default=%default].\n This or study_name required for: getGeneticProfiles, getCaseLists, getProteinArrayInfo, getLink, getOncoprintHTML, getPercentAltered, getTotalAltered"
    )
    cqueryopts.add_option(
        "-n",
        "--study_name",
        dest="study_name",
        type="string",
        default=None,
        help=
        "[Required/Optional] cBioPortal name for study [default=%default].\n See above for which commands require this."
    )
    cqueryopts.add_option(
        "-c",
        "--case_set_id",
        dest="case_set_id",
        type="string",
        default=None,
        help=
        "[Required for some] cBioPortal case_set_id specifying the case list to use.\nRequired for getProfileData, getMutationData, getClinicalData, getProteinArrayData, getPercentAltered, getTotalAltered. Default is case_set_id for case list 'All Tumours'"
    )
    cqueryopts.add_option(
        "-g",
        "--gene_list",
        dest="gene_list",
        type="string",
        default=None,
        help=
        "[Required for some] Comma separated list of HUGO gene symbols or Entrez gene IDs.\nRequired for getProfileData, getMutationData, getLink, getOncoprintHTML"
    )
    cqueryopts.add_option("-f",
                          "--gene_list_file",
                          dest="gene_list_file",
                          type="string",
                          default=None,
                          help="[Optional] Filename to read in gene_list from")
    cqueryopts.add_option(
        "-p",
        "--profile_id",
        dest="profile_id",
        type="string",
        help=
        "[Optional] Comma separated list of cBioPortal genetic_profile_ids. If none are specified then the list of profiles for the study where display in analysis is True is used."
    )

    squeryopts = optparse.OptionGroup(
        parser, "Query specific parameters",
        "Arguments specific to a particular query")
    squeryopts.add_option(
        "--protein_array_type",
        dest="protein_array_type",
        type="string",
        default="protein_level",
        help=
        "[Optional] Either protein_level or phosphorylation [default=%default]"
    )
    squeryopts.add_option(
        "--protein_array_id",
        dest="protein_array_id",
        type="string",
        help=
        "[Required for some] comma separated list of one or more protein array IDs"
    )
    squeryopts.add_option(
        "--array_info",
        dest="protein_array_info",
        type="int",
        default=0,
        help=
        "[Optional] If 1, antibody information will also be exported in a getProteinArrayData query [default=%default]"
    )
    squeryopts.add_option(
        "--output-report",
        dest="report",
        type="string",
        default="full",
        help=
        "[Optional] Report type to display for getLink. Either full or oncoprint_html [default=%default] "
    )
    squeryopts.add_option(
        "--threshold",
        dest="threshold",
        type="int",
        default=2,
        help=
        "[Optional] Threshold for deciding if an alteration is significant for continuous metrics [default=%default]"
    )

    parser.add_option_group(cqueryopts)
    parser.add_option_group(squeryopts)

    (options, args) = E.Start(parser,
                              add_pipe_options=False,
                              add_output_options=False,
                              argv=argv)

    portal = CBioPortal(url=options.url,
                        study=options.study_id,
                        study_name=options.study_name,
                        case_list_id=options.case_set_id)

    results = []

    if options.gene_list_file:
        infile = IOTools.openFile(options.gene_list_file)
        gene_list = [x.strip() for x in infile]
    elif options.gene_list:
        gene_list = options.gene_list.split(",")

    if options.profile_id:
        profile_id = options.profile_id.split(",")
    else:
        profile_id = None

    if "getCancerStudies" in args:
        results.append(portal.getCancerStudies())

    if "getGeneticProfiles" in args:
        results.append(portal.getGeneticProfiles())

    if "getCaseLists" in args:
        results.append(portal.getCaseLists())

    if "getProfileData" in args:
        results.append(
            portal.getProfileData(gene_list=gene_list,
                                  genetic_profile_id=profile_id))

    if "getMutationData" in args:
        results.append(
            portal.getMutationData(gene_list=gene_list,
                                   genetic_profile_id=profile_id))

    if "getClinicalData" in args:
        results.append(portal.getClinicalData())

    if "getProteinArrayInfo" in args:
        results.append(
            portal.getProteinArrayInfo(
                gene_list=gene_list,
                protein_array_type=options.protein_array_type))

    if "getProteinArrayData" in args:
        results.append(
            portal.getProteinArrayData(
                protein_array_id=options.protein_array_id,
                array_info=options.protein_array_info))

    if "getPercentAltered" in args:
        results.append(
            portal.getPercentAltered(gene_list=gene_list,
                                     genetic_profile_id=profile_id,
                                     threshold=options.threshold))

    if "getLink" in args:
        results.append(
            portal.getLink(gene_list=gene_list, report=options.report))

    if "getOncoprintHTML" in args:
        results.append(portal.getOncoprintHTML(gene_list=gene_list))

    if len(results) == 0:
        sys.stderr.write("No recognised query commands provided")
        sys.exit()

    if options.output_file:
        outf = IOTools.openFile(options.output_file, "w")
    else:
        outf = sys.stdout

    for result in results:
        try:
            outf.write(tableToString(result))
        except Exception:
            # fall back to the raw result if it cannot be tabulated
            outf.write(result)

    E.Stop()
Code Example #14
File: WrapperIDR.py Project: logust79/cgat-apps
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--output-prefix",
                      dest="output_prefix",
                      type="string",
                      help="output filename prefix [default=%default].")

    parser.add_option(
        "-c",
        "--chromosome-table",
        dest="filename_chromosome_table",
        type="string",
        help=
        "filename with tab separated list of chromosome names [default=%default]."
    )

    parser.add_option("--action",
                      dest="action",
                      type="choice",
                      choices=("plot", "run"),
                      help="action to perform [default=%default]")

    parser.add_option(
        "-s",
        "--signal-value",
        dest="signal_value",
        type="string",
        help=
        "use either p.value or sig.value as ranking measure [default=%default]"
    )

    parser.add_option(
        "-r",
        "--overlap-ratio",
        dest="overlap_ratio",
        type="int",
        help=
        "a value between 0 and 1 that controls how much two peaks have to overlap to be called as the same [default=%default]"
    )

    parser.set_defaults(
        action="plot",
        output_prefix="output",
        half_width=None,
        overlap_ratio=0,
        is_broadpeak=False,
        signal_value="signal.value",
        filename_chromosome_table="genome_table.txt",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.action == "plot":
        plotIDR(options.output_prefix + ".pdf", args)
    elif options.action == "run":
        if len(args) != 2:
            raise ValueError("require exactly two replicates")
        runIDR(options, args[0], args[1])

    # write footer and output benchmark information.
    E.Stop()
Code Example #15
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--glob", dest="glob_pattern", type="string",
        help="glob pattern to use for collecting files [%default].")

    parser.add_option(
        "-f", "--file-pattern", dest="file_pattern", type="string",
        help="only check files matching this pattern [%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("file", "node"),
                      help="analysis mode [%default].")

    parser.add_option(
        "-r", "--recursive", action="store_true",
        help="recursively look for logfiles from current directory "
        "[%default].")

    parser.set_defaults(
        truncate_sites_list=0,
        glob_pattern="*.log",
        mode="file",
        recursive=False,
    )

    (options, args) = E.Start(parser)

    if args:
        filenames = args
    elif options.glob_pattern:
        filenames = glob.glob(options.glob_pattern)

    if len(filenames) == 0:
        raise ValueError("no files to analyse")

    if options.mode == "file":
        totals = Logfile.LogFileData()

        options.stdout.write("file\t%s\n" % totals.getHeader())

        for filename in filenames:
            if filename == "-":
                infile = sys.stdin
            elif filename.endswith(".gz"):
                # read in text mode so lines are str rather than bytes
                infile = gzip.open(filename, "rt")
            else:
                infile = open(filename, "r")

            subtotals = Logfile.LogFileData()
            for line in infile:
                subtotals.add(line)

            infile.close()

            options.stdout.write("%s\t%s\n" % (filename, str(subtotals)))
            totals += subtotals

        options.stdout.write("%s\t%s\n" % ("total", str(totals)))

    elif options.mode == "node":

        chunks_per_node = {}

        rx_node = re.compile(r"# job started at .* \d+ on (\S+)")

        for filename in filenames:
            if filename == "-":
                infile = sys.stdin
            elif filename.endswith(".gz"):
                infile = gzip.open(filename, "rt")
            else:
                infile = open(filename, "r")

            data = Logfile.LogFileDataLines()

            for line in infile:

                if rx_node.match(line):
                    node_id = rx_node.match(line).groups()[0]
                    data = Logfile.LogFileDataLines()
                    if node_id not in chunks_per_node:
                        chunks_per_node[node_id] = []
                    chunks_per_node[node_id].append(data)
                    continue

                data.add(line)

        options.stdout.write("node\t%s\n" % data.getHeader())
        total = Logfile.LogFileDataLines()

        for node, data in sorted(chunks_per_node.items()):
            subtotal = Logfile.LogFileDataLines()
            for d in data:
                # options.stdout.write( "%s\t%s\n" % (node, str(d) ) )
                subtotal += d

            options.stdout.write("%s\t%s\n" % (node, str(subtotal)))

            total += subtotal

        options.stdout.write("%s\t%s\n" % ("total", str(total)))

    E.Stop()
Code Example #16
File: Control.py Project: k3yavi/CGATCore
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--pipeline-action",
                      dest="pipeline_action",
                      type="choice",
                      choices=("make", "show", "plot", "dump", "config",
                               "clone", "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format",
                      dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f",
                      "--force-output",
                      dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommited changes "
                      "in the repository [default=%default].")

    parser.add_option("-p",
                      "--multiprocess",
                      dest="multiprocess",
                      type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e",
                      "--exceptions",
                      dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i",
                      "--terminate",
                      dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d",
                      "--debug",
                      dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s",
                      "--set",
                      dest="variables_to_set",
                      type="string",
                      action="append",
                      help="explicitly set paramater values "
                      "[default=%default].")

    parser.add_option("-c",
                      "--checksums",
                      dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums"
                      "[default=%default].")

    parser.add_option("-t",
                      "--is-test",
                      dest="is_test",
                      action="store_true",
                      help="this is a test run"
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange",
                      dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host",
                      dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation",
                      dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(pipeline_action=None,
                        pipeline_format="svg",
                        pipeline_targets=[],
                        multiprocess=40,
                        logfile="pipeline.log",
                        dry_run=False,
                        force=False,
                        log_exceptions=False,
                        exceptions_terminate_immediately=False,
                        debug=False,
                        variables_to_set=[],
                        is_test=False,
                        ruffus_checksums_level=0,
                        rabbitmq_host="saruman",
                        rabbitmq_exchange="ruffus_pipelines",
                        input_validation=False)

    (options, args) = E.Start(parser, add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))
    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.

    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS[
            "cli_cluster_parallel_environment"] = options.cluster_parallel_environment
        PARAMS[
            "cluster_parallel_environment"] = options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        variable, value = variables.split("=")
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot", "touch",
                                     "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile, mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'
            ))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()

                #
                #   make sure we are not logging at the same time in
                #   different processes
                #
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=options.
                    exceptions_terminate_immediately,
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(options.pipeline_targets,
                             touch_files_only=True,
                             verbose=options.loglevel,
                             checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(options.pipeline_targets,
                             touch_files_only=options.ruffus_checksums_level,
                             verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub(r"__main__\.", "", task)
                        job = re.sub(r"\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the stdout
                # handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError("pipeline failed with %i errors" %
                                 len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))

    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
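
The "make" and "show" actions above are thin wrappers around ruffus. As a
minimal sketch of the underlying calls (the task, file name and suffix here
are invented for illustration and are not part of cgat-flow):

import sys
from ruffus import transform, suffix, pipeline_run, pipeline_printout

@transform("input.txt", suffix(".txt"), ".out")
def count_lines(infile, outfile):
    # toy task: record the number of lines in infile
    with open(infile) as inf, open(outfile, "w") as outf:
        outf.write("%i\n" % sum(1 for _ in inf))

if __name__ == "__main__":
    if sys.argv[1:] == ["show"]:
        # "show": print what would be run, without running it
        pipeline_printout(sys.stdout, [count_lines], verbose=5)
    else:
        # "make": run tasks up to the given target
        pipeline_run([count_lines], multiprocess=1)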
Code example #17
File: cgat_zap.py Project: logust79/cgat-flow
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="do dry run, do not kill [default=%default].")

    parser.add_option("-l",
                      "--ignore-links",
                      dest="ignore_links",
                      action="store_true",
                      help="do not zap symbolic links [default=%default].")

    parser.set_defaults(
        dry_run=False,
        ignore_links=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    outfile = options.stdout

    fields = ('st_atime', 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev',
              'st_gid', 'st_ino', 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    outfile.write("filename\tlinkdest\t%s\n" % "\t".join(fields))

    # remove any duplicates and sort
    args = sorted(set(args))

    for fn in args:

        # os.stat follows symbolic links
        original = os.stat(fn)

        if os.path.islink(fn):
            if not options.ignore_links:
                linkdest = os.readlink(fn)
                E.info('breaking link from %s to %s' % (fn, linkdest))
                if not options.dry_run:
                    os.unlink(fn)
                    f = open(fn, "w")
                    f.close()
        else:
            E.info('truncating file %s' % fn)
            linkdest = ""
            if not options.dry_run:
                f = open(fn, "w")
                f.truncate()
                f.close()

        outfile.write("%s\t%s\t%s\n" % (fn, linkdest, "\t".join(
            [str(getattr(original, x)) for x in fields])))

        if not options.dry_run:
            # Set original times
            os.utime(fn, (original.st_atime, original.st_mtime))
            os.chmod(fn, original.st_mode)

    # write footer and output benchmark information.
    E.Stop()
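
The zapping idiom above truncates files while preserving their metadata, so
that timestamp-based tools still consider them up to date. A condensed
sketch of the core steps, using only the standard library:

import os

def zap(path):
    # capture times and mode before touching the file
    original = os.stat(path)
    # opening with mode "w" truncates the file to zero bytes
    with open(path, "w"):
        pass
    # restore the original access/modification times and permissions
    os.utime(path, (original.st_atime, original.st_mtime))
    os.chmod(path, original.st_mode)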
Code example #18
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s",
        "--scratch-dir",
        dest="scratchdir",
        type="string",
        help="the scratch directory on the nodes [default=%default].")

    parser.add_option(
        "-c",
        "--collection",
        dest="collection",
        type="string",
        help="files will be put into collection. This is a directory that "
        "will be created just below the scratch directory [default=%default].")

    parser.set_defaults(
        scratchdir="/scratch",
        collection="",
        nodes=[],
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        raise ValueError("please specify a collection of files/directories "
                         "that should be mirrored.")

    targetdir = os.path.join(options.scratchdir, options.collection)

    nodes = getNodes(options.nodes)

    E.info("copying to %s on nodes %s" % (targetdir, ",".join(nodes)))

    ninput, noutput, nskipped = 0, 0, 0

    filenames = " ".join(args)

    for node in nodes:
        E.info("copying to node %s" % node)
        ninput += 1
        statement = '''
               ssh %(node)s mkdir %(targetdir)s >& /dev/null;
               rsync --progress -az %(filenames)s %(node)s:%(targetdir)s
        ''' % locals()
        E.run(statement)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
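
The statement built above relies on "%(name)s" placeholders filled from
locals(). A sketch of what E.run() does with such a statement, reduced to
the standard library (error handling in the real helper is more thorough):

import subprocess

def run(statement, **kwargs):
    # fill the %(name)s placeholders, then hand the command to a shell
    cmd = statement % kwargs
    retcode = subprocess.call(cmd, shell=True)
    if retcode != 0:
        raise OSError("command failed with %i: %s" % (retcode, cmd))

# hypothetical usage:
# run("rsync --progress -az %(filenames)s %(node)s:%(targetdir)s",
#     node="node01", targetdir="/scratch/data", filenames="a.txt b.txt")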
Code example #19
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-l",
                      "--logfile",
                      dest="logfile",
                      type="string",
                      help="name of logfile [default=%default]")

    parser.add_option("-t",
                      "--time",
                      dest="time",
                      type="choice",
                      choices=("seconds", "milliseconds"),
                      help="time to show [default=%default]")

    parser.add_option(
        "--no-reset",
        dest="reset",
        action="store_false",
        help="do not reset counters when a new pipeline run started "
        "The default is to reset so that only the counts from the latest "
        "pipeline execution are show "
        "[default=%default]")

    parser.add_option("-f",
                      "--filter-method",
                      dest="filter",
                      type="choice",
                      choices=("unfinished", "running", "completed", "all"),
                      help="apply filter to output [default=%default]")

    parser.add_option("-i",
                      "--ignore-errors",
                      dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default=%default]")

    parser.set_defaults(sections=[],
                        logfile="pipeline.log",
                        filter="all",
                        reset=True,
                        time="seconds")

    (options, args) = E.Start(parser, argv)

    rx = re.compile("^[0-9]+")

    if options.sections:
        profile_sections = options.sections
    else:
        profile_sections = ("task", "job")

    counts = {}
    for section in profile_sections:
        counts[section] = collections.defaultdict(Counter)

    rootpath = os.path.abspath(".")

    infile = IOTools.openFile(options.logfile)

    for line in infile:
        if not rx.match(line):
            continue
        data = line[:-1].split()
        if len(data) < 5:
            continue
        date, time, level, source = data[:4]

        if re.search("output generated by", line):
            if options.reset:
                E.info("resetting counts at line=%s" % line[:-1])
                for section in profile_sections:
                    counts[section] = collections.defaultdict(Counter)
            continue

        if not re.match("task\.", source):
            continue

        dt = datetime.datetime.strptime(" ".join((date, time)),
                                        "%Y-%m-%d %H:%M:%S,%f")

        msg = "".join(data[4:])

        started_task, completed_task, started_job, completed_job = \
            (None, None, None, None)

        if re.search("task.log_at_level.\d+Task=(\S+)", msg):
            checked_task = re.search("task.log_at_level.\d+Task=(\S+)",
                                     msg).groups()[0]
        elif re.search("Job=\[(\S+)->(\S+)\]Missingfile[s]*\[(\S+)\]", msg):
            started_infiles, started_job, missing = re.search(
                "Job=\[(\S+)->(\S+)\]Missingfile[s]*\[(\S+)\]", msg).groups()
        elif re.search("Job=\[(\S+)->(\S+)\]Missingfile[s]*", msg):
            started_infiles, started_job = re.search(
                "Job=\[(\S+)->(\S+)\]Missingfile[s]*", msg).groups()
        elif re.search("Job=\[(\S+)->(\S+)\]\s*\.\.\.", msg):
            # multi-line log messages
            started_infiles, started_job = re.search(
                "Job=\[(\S+)->(\S+)\]\s*\.\.\.", msg).groups()
        elif re.search("Taskentersqueue=(\S+)", msg):
            started_task = re.search("Taskentersqueue=(\S+)", msg).groups()[0]
        elif re.search("Job=\[(\S+)->(\S+)\]completed", msg):
            completed_infiles, completed_job = re.search(
                "Job=\[(\S+)->(\S+)\]completed", msg).groups()
        elif re.search("CompletedTask=(\S+)", msg):
            completed_task = re.search("CompletedTask=(\S+)", msg).groups()[0]
        elif re.search("UptodateTask=(\S+)", msg):
            completed_task = re.search("UptodateTask=(\S+)", msg).groups()[0]
        else:
            continue

        try:
            if started_task:
                counts["task"][started_task].add(True, dt, started_task)
            elif completed_task:
                counts["task"][completed_task].add(False, dt, completed_task)
            elif started_job:
                counts["job"][started_job].add(True, dt, started_job)
            elif completed_job:
                counts["job"][completed_job].add(False, dt, completed_job)
            else:
                raise ValueError("unknown action")
        except ValueError as msg:
            if not options.ignore_errors:
                raise ValueError(str(msg) + "\nat line %s" % line)

    if options.time == "milliseconds":
        f = lambda d: d.seconds + d.microseconds / 1000
    elif options.time == "seconds":
        f = lambda d: d.seconds + d.microseconds / 1000000

    for section in profile_sections:
        options.stdout.write("\t".join(("section", "object", "ncalls",
                                        "duration", "percall", "running")) +
                             "\n")

        running = []
        for objct, c in counts[section].items():

            # apply filters
            if options.filter in ("unfinished", "running") and c.running == 0:
                continue

            d = f(c.duration)
            if c.calls > 0:
                percall = "%6.3f" % (d / float(c.calls))
            else:
                percall = "na"

            options.stdout.write("\t".join((list(
                map(str, (
                    section,
                    objct,
                    c.calls,
                    d,
                    percall,
                    c.running,
                ))))) + "\n")

            running.extend([x for x, y in c._started.items() if y != 0])

        options.stdout.write("#//\n\n")

        if running:
            options.stdout.write("# running %ss\n" % section)
            options.stdout.write("\n".join(map(str, running)) + "\n")
            options.stdout.write("#//\n\n")

    E.Stop()
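
The profiler relies on a Counter class that is not part of this excerpt. A
minimal sketch that is consistent with how it is used above (an add() method
taking a started/completed flag, plus duration, calls, running and _started
attributes):

import datetime

class Counter(object):
    def __init__(self):
        self.duration = datetime.timedelta()
        self.calls = 0
        self.running = 0
        # maps object name -> start time while running, 0 once finished
        self._started = {}

    def add(self, started, dt, name):
        if started:
            self._started[name] = dt
            self.running += 1
        elif self._started.get(name):
            self.duration += dt - self._started[name]
            self._started[name] = 0
            self.running -= 1
            self.calls += 1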
Code example #20
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # the opening of this example was truncated in the source; the option
    # flag "-t" below is reconstructed to make the fragment parse and
    # mirrors the "-T"/"--filename-output-tree" option that follows
    parser.add_option("-t",
                      "--filename-input-tree",
                      dest="filename_input_tree",
                      type="string",
                      help="filename with tree information.")
    parser.add_option("-T",
                      "--filename-output-tree",
                      dest="filename_output_tree",
                      type="string",
                      help="output filename with tree information.")
    parser.add_option("-p",
                      "--program",
                      dest="program",
                      type="string",
                      help="program to use.")
    parser.add_option("-o",
                      "--options",
                      dest="options",
                      type="string",
                      help="input options.")

    parser.set_defaults(
        filename_input_tree=None,
        filename_output_tree=None,
        program=None,
        options="",
    )

    (options, args) = Experiment.Start(parser)

    Experiment.Stop()
Code example #21
def main(argv=None):

    parser = getOptionParser()

    (options, args) = E.Start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:

        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.Stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.Stop()
            sys.exit(0)

        if options.method == "multiprocessing":
            pool = Pool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)
        elif options.method == "drmaa":
            results = []
            runDRMAA(data, environment=options.environment)
        elif options.method == "threads":
            pool = ThreadPool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)

        niterations = 0
        for retcode, filename, cmd, logfile, iterations in results:
            niterations += iterations
            if not hasFinished(retcode, filename, options.output_tag, logfile):
                failed_requests.append((filename, cmd))

    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:

            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                    break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests, options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = (re.search(r"'--log=(\S+)'", cmd) or
              re.search(r"'--L\s+(\S+)'", cmd))
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = IOTools.openFile(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".psl"):
                    builder = ResultBuilderPSL(mapper=mapper)
                elif filetype in (".gtf", ".gff"):
                    builder = ResultBuilderGFF(mapper=mapper,
                                               field_index=index,
                                               field_name=name)
                elif filetype in (".png"):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = IOTools.openFile(options.output_pattern % filename,
                                           "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests), len(started_requests) -
            len(failed_requests), len(failed_requests), niterations))

    E.Stop()
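
The chunk iterators used above (chunk_iterator_lines and friends) are not
shown in this excerpt. A simplified sketch of the line-based variant,
consistent with how it is called: it writes successive slices of the input
stream to numbered files below prefix and yields each filename.

import os

def chunk_iterator_lines(infile, args, prefix, use_header=False):
    chunk_size = args[0]
    header, lines, nchunk = None, [], 0

    def flush(lines, nchunk):
        # write one chunk, repeating the header if one was captured
        filename = os.path.join(prefix, "chunk_%06i" % nchunk)
        with open(filename, "w") as outf:
            if header:
                outf.write(header)
            outf.writelines(lines)
        return filename

    for i, line in enumerate(infile):
        if i == 0 and use_header:
            header = line
            continue
        lines.append(line)
        if len(lines) == chunk_size:
            yield flush(lines, nchunk)
            lines, nchunk = [], nchunk + 1
    if lines:
        yield flush(lines, nchunk)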
Code example #22
File: collect.py Project: logust79/cgat-flow
'''

TEMPLATE_PIPELINEMODULE = '''
.. automodule:: %(prefix)s
   :members:
   :show-inheritance:
'''

import glob
import os
import CGATCore.Experiment as E

if __name__ == "__main__":

    E.Start()

    dirs = (("../scripts/*.py", TEMPLATE_SCRIPT,
             'scripts'), ("../CGAT/*.py", TEMPLATE_MODULE, 'modules'),
            ("../CGATPipelines/pipeline*.py", TEMPLATE_PIPELINE,
             'pipelines'), ("../CGATPipelines/[A-Z]*.py",
                            TEMPLATE_PIPELINEMODULE, 'pipelinemodules'))

    ncreated, nskipped = 0, 0

    for glob_expression, template, dest in dirs:

        if not os.path.exists(dest):
            os.mkdir(dest)

        files = glob.glob(os.path.abspath(glob_expression))