def main(file_list, jar, xml, db, out, queue, walltime, engine, num_jobs, vmem, log_level, port, local):
    """
    Specify the path to a .json file as created by the fetch_runs.py script via the FILE_LIST argument.
    num_jobs jobs will be created and executed on the cluster.
    """
    level = logging.INFO
    if log_level is "DEBUG":
        level = logging.DEBUG
    elif log_level is "WARN":
        level = logging.WARN
    elif log_level is "INFO":
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format=("%(asctime)s - %(name)s - %(levelname)s - " + "%(message)s"), level=level)

    df = pd.read_json(file_list)
    logger.info("Read {} runs from .json file".format(len(df)))

    # get data files
    jarpath = path.abspath(jar)
    xmlpath = path.abspath(xml)
    db_path = path.abspath(db)
    outpath = path.abspath(out)
    output_directory = path.dirname(outpath)
    # create dir if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    logger.info("Writing output and temporary data to {}".format(output_directory))

    job_list = make_jobs(jarpath, xmlpath, db_path, output_directory, df, engine, queue, vmem, num_jobs, walltime)
    job_outputs = gridmap.process_jobs(job_list, max_processes=num_jobs, local=local)
    erna.collect_output(job_outputs, out, df)
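
The string-by-string level checks above (repeated in several of the examples below) can be collapsed into a single lookup, because the logging module exposes its level constants as attributes. A minimal sketch, assuming the log_level option is restricted to valid level names; resolve_log_level is a hypothetical helper, not part of the original script:

import logging

def resolve_log_level(log_level, default=logging.INFO):
    # logging.DEBUG, logging.WARN and logging.INFO are module attributes,
    # so a name such as "DEBUG" can be mapped to its constant with getattr.
    return getattr(logging, log_level.upper(), default)

level = resolve_log_level("DEBUG")  # logging.DEBUG
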
Example #2
def main(earliest_night, latest_night, data_dir, jar, xml, db, out, queue, walltime, engine, num_runs, vmem, log_level, port, source, conditions, max_delta_t, local, password):

    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +  '%(message)s'), level=level)

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    db_path = os.path.abspath(db)
    output_directory = os.path.dirname(outpath)
    # create dir if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    logger.info("Writing output data  to {}".format(out))
    factdb = sqlalchemy.create_engine("mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))
    data_conditions = dcc.conditions[conditions]
    df_runs = erna.load(earliest_night, latest_night, data_dir, source_name=source, timedelta_in_minutes=max_delta_t, factdb=factdb, data_conditions=data_conditions)

    logger.info("Would process {} jobs with {} runs per job".format(len(df_runs)//num_runs, num_runs))
    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    job_list = make_jobs(jarpath, xmlpath, db_path, output_directory, df_runs, engine, queue, vmem, num_runs, walltime)
    job_outputs = gridmap.process_jobs(job_list, max_processes=len(job_list), local=local)
    erna.collect_output(job_outputs, out, df_runs)
Example #3
def check_process_jobs(wait_sec, local):
    inputs = [(1, wait_sec), (2, wait_sec), (4, wait_sec), (8, wait_sec),
              (16, wait_sec)]
    expected = list(map(compute_factorial, inputs))
    function_jobs = make_jobs(inputs, compute_factorial)
    outputs = process_jobs(function_jobs, quiet=False, local=local)
    eq_(expected, outputs)
Example #4
def check_process_jobs(wait_sec, local):
    inputs = [(1, wait_sec), (2, wait_sec), (4, wait_sec), (8, wait_sec), (16,
              wait_sec)]
    expected = list(map(compute_factorial, inputs))
    function_jobs = make_jobs(inputs, compute_factorial)
    outputs = process_jobs(function_jobs, quiet=False, local=local)
    eq_(expected, outputs)
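
The two test variants above rely on helpers that are not shown. A minimal sketch of what compute_factorial and make_jobs could look like, assuming each input is a (number, wait_sec) pair and that one gridmap Job is created per input; the real helpers in the test suite may differ:

from math import factorial
from time import sleep

from gridmap import Job


def compute_factorial(number_and_wait):
    # Simulate some work, then return the factorial of the number.
    number, wait_sec = number_and_wait
    sleep(wait_sec)
    return factorial(number)


def make_jobs(inputs, function):
    # One Job per input tuple; process_jobs() runs them and returns
    # their results in the same order.
    return [Job(function, [arg]) for arg in inputs]
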
Example #5
def main():
    """
    run a set of jobs on cluster
    """

    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=logging.INFO)

    print("=====================================")
    print("========   Submit and Wait   ========")
    print("=====================================")
    print("")

    functionJobs = make_jobs()

    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(functionJobs, max_processes=4)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
Example #6
def check_idle_parent_process(wait_sec):
    '''
    Make sure that we don't kill idle parents that have active children.
    '''
    inputs = [(1, wait_sec), (2, wait_sec), (4, wait_sec), (8, wait_sec),
              (16, wait_sec)]
    expected = list(map(compute_factorial, inputs))
    outputs = process_jobs([Job(pool_map_factorial, [inputs])], quiet=False)[0]
    eq_(expected, outputs)
Example #7
def check_idle_parent_process(wait_sec):
    '''
    Make sure that we don't kill idle parents that have active children.
    '''
    inputs = [(1, wait_sec), (2, wait_sec), (4, wait_sec), (8, wait_sec),
              (16, wait_sec)]
    expected = list(map(compute_factorial, inputs))
    outputs = process_jobs([Job(pool_map_factorial, [inputs])], quiet=False)[0]
    eq_(expected, outputs)
Example #8
def RunPerPool(vcfFile, id, sampledir, count, args):
    """This will run the pool to be analyzed.


    :param str vcfFile: str of vcf file name
    :param str id: str of sample id
    :param Namespace args: Namespace of args to get other variables
    :return: None
    :rtype: None

    """
    jobs = []
    if(os.path.isfile(vcfFile)):
        jobId = "run_ppg_" + str(count) + "_" + str(id)
        cmdList = []
        cmd = args.python + " " + args.ppg + " " + vcfFile + " " + sampledir + " -s " + id + " --iAnnotateSV " + args.ias + " --genome hg19"
        # cmd = str(cmd)
        threads = int(args.threads)
        threads = threads + 1
        qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId + " -o " + jobId + ".stdout" + " -e " + jobId + ".stderr" + \
            " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + sampledir + " -sync y " + " -b y " + cmd
        print "qsub_cmd:", qsub_cmd, "\n"
        cmdList.append(qsub_cmd)
        job = Job(
            RunJob,
            cmdList,
            kwlist=None,
            cleanup=True,
            mem_free="2G",
            name=jobId,
            num_slots=1,
            queue=args.queue)
        jobs.append(job)
    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(
        jobs,
        max_processes=10,
        temp_dir='/dmp/analysis/SCRATCH/',
        white_list=None,
        quiet=False,
        local=False)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))

    return
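
The qsub command above is built by concatenating many string fragments, which makes quoting mistakes easy to introduce. A minimal alternative sketch that assembles the same options from a list of tokens (build_qsub_cmd is a hypothetical helper; shlex.quote requires Python 3):

import shlex


def build_qsub_cmd(qsub, queue, job_id, threads, workdir, payload_cmd):
    # Collect the qsub options as separate tokens, quote each one, and
    # append the payload command that qsub should submit with -b y.
    tokens = [
        qsub, "-q", queue, "-N", job_id,
        "-o", job_id + ".stdout", "-e", job_id + ".stderr",
        "-V", "-l", "h_vmem=6G,virtual_free=6G",
        "-pe", "smp", str(threads),
        "-wd", workdir, "-sync", "y", "-b", "y",
    ]
    return " ".join(shlex.quote(t) for t in tokens) + " " + payload_cmd
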
Example #9
def RunPerPool(vcfFile, id, sampledir, count, args):
    """
    This will run the pool to be analyzed.

    :param str vcfFile: str of vcf file name
    :param str id: str of sample id
    :param Namespace args: Namespace of args to get other variables
    :return: None
    :rtype: None

    """
    jobs = []
    if (os.path.isfile(vcfFile)):
        jobId = "run_dhs_" + str(count) + "_" + str(id)
        cmdList = []
        cmd = args.python + " " + args.dhs + " " + vcfFile + " " + sampledir + \
        " --iAnnotateSV " + args.ias + " --genome hg19" + " -hsl " + args.hotspotFile + " -bl " + args.blackListGenes + " -kgl " + args.genesToKeep
        # cmd = str(cmd)
        threads = int(args.threads)
        threads = threads + 1
        qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId + " -o " + jobId + ".stdout" + " -e " + jobId + ".stderr" + \
            " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + sampledir + " -sync y " + " -b y " + cmd
        print "qsub_cmd:", qsub_cmd, "\n"
        cmdList.append(qsub_cmd)
        job = Job(RunJob,
                  cmdList,
                  kwlist=None,
                  cleanup=True,
                  mem_free="2G",
                  name=jobId,
                  num_slots=1,
                  queue=args.queue)
        jobs.append(job)
    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(jobs,
                               max_processes=10,
                               temp_dir='/dmp/analysis/SCRATCH/',
                               white_list=None,
                               quiet=False,
                               local=False)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))

    return
Example #10
def main(jar, xml, out, mc_path, queue, walltime, engine, num_jobs, vmem, log_level, port, local):
    '''
    Script to execute fact-tools on MonteCarlo files. Use the MC_PATH argument to specify the folders containing the MC
    '''
    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +  '%(message)s'), level=level)

    erna.ensure_output(out)
    jarpath = path.abspath(jar)
    xmlpath = path.abspath(xml)
    drspath = erna.mc_drs_file()
    logger.info('Using drs file at {}'.format(drspath))

    # get data files
    files = []
    for folder in tqdm(mc_path):
        # print("Entering folder {}".format(folder))
        pattern = path.join(folder, '**/*_Events.fit*')
        f = glob.glob(pattern, recursive=True)
        files = files + f

    num_files = len(files)
    logger.info("Found {} files.".format(num_files))
    if num_files <= 1:
        logger.error("Need more than one file to work with.")
        return
    if num_jobs > num_files:
        logger.error("You specified more jobs than files. This doesn't make sense.")
        return

    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    mc_paths_array = np.array(files)
    drs_paths_array = np.repeat(np.array(drspath), len(mc_paths_array))

    job_list = make_jobs(jarpath, xmlpath, mc_paths_array, drs_paths_array, engine, queue, vmem, num_jobs, walltime)

    job_outputs = gridmap.process_jobs(job_list, max_processes=num_jobs, local=local)
    erna.collect_output(job_outputs, out)
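
make_jobs is not shown here, but the call pattern suggests the MC and DRS path arrays are split into num_jobs roughly equal chunks, one per cluster job. A minimal sketch of that splitting with numpy's array_split (split_into_jobs is a hypothetical helper; the real make_jobs also attaches the jar, xml and resource settings to each Job):

import numpy as np


def split_into_jobs(mc_paths_array, drs_paths_array, num_jobs):
    # array_split tolerates lengths that are not divisible by num_jobs,
    # so every file ends up in exactly one chunk.
    mc_chunks = np.array_split(mc_paths_array, num_jobs)
    drs_chunks = np.array_split(drs_paths_array, num_jobs)
    return list(zip(mc_chunks, drs_chunks))
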
Example #11
def main():
    """
    run a set of jobs on cluster
    """

    print("=====================================")
    print("========   Submit and Wait   ========")
    print("=====================================")
    print("")

    functionJobs = make_jobs()

    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(functionJobs)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
Example #12
def main():
    """
    run a set of jobs on cluster
    """

    args = parser.parse_args()
    engine = args.engine
    queue = args.queue
    vmem = args.vmem
    port = args.port
    local = args.local
    level = args.logging

    if level == 'DEBUG':
        level = logging.DEBUG
    elif level == 'WARN':
        level = logging.WARN
    elif level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +  '%(message)s'), level=level)

    print("=====================================")
    print("========   Submit and Wait   ========")
    print("=====================================\n")


    functionJobs = make_jobs(engine, queue, vmem)
    if local:
        print('Running jobs locally')
    else:
        print("Sending function jobs to cluster engine: {}. Into queue: {} \n".format(engine, queue))


    job_outputs = process_jobs(functionJobs, max_processes=4, port=port, local=local)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
Example #13
def main():
    """
    run a set of jobs on cluster
    """

    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=logging.INFO)

    print("=====================================")
    print("========   Submit and Wait   ========")
    print("=====================================")
    print("")

    functionJobs = make_jobs()

    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(functionJobs, max_processes=4)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))
Example #14
def RunPerPool(titleFile, outdir, HSmetricsFileList, bamFileList, args):
    """This will run the pool to be analyzed.


    :param str titleFile: str of meta information file
    :param str outdir: str of output directory
    :param list HSmetricsFileList: list of picard hsmetrics files
    :param list bamFileList: list of bam files
    :param Namespace args: Namespace of args to get other variables
    :return: None
    :rtype: None

    """
    # Run Preprocess
    titleFileDF = pd.read_csv(titleFile, sep='\t', header=0, keep_default_na='True')
    groupByPatientId = titleFileDF.groupby('Patient_ID')
    baseNames = {}
    jobs = []
    poolidRegXcompile = re.compile('.*(PoolNormal|PooledNormal).*')
    poolHsmetricsFile = filter(poolidRegXcompile.match, HSmetricsFileList).pop()
    poolbamFile = filter(poolidRegXcompile.match, bamFileList).pop()
    for patientID, group in groupByPatientId:
        print patientID, ":"
        tsampleId = ''
        tBamFile = ''
        nBamFile = ''
        basename = ''
        nsampleId = ''
        if(os.path.isdir(outdir)):
            if(args.verbose):
                print "Pool Output Dir:", outdir, "exists!!!"
        else:
            os.mkdir(outdir)
            os.chmod(outdir, 0o755)
        for count, row in group.iterrows():
            sampleId = row.loc['Sample_ID']
            patientId = row.loc['Patient_ID']
            sampleClass = row.loc['Class']
            idRegXcompile = re.compile('.*' + sampleId + '.*')
            if(sampleClass == "Tumor"):
                basename = sampleId
                tBamFile = filter(idRegXcompile.match, bamFileList).pop()
                os.symlink(tBamFile, os.path.join(outdir, os.path.basename(tBamFile)))
                tBamFile = os.path.join(outdir, os.path.basename(tBamFile))
                tsampleId = sampleId
            if(sampleClass == "Normal"):
                nBamFile = filter(idRegXcompile.match, bamFileList).pop()
                os.symlink(nBamFile, os.path.join(outdir, os.path.basename(nBamFile)))
                nBamFile = os.path.join(outdir, os.path.basename(nBamFile))
                nsampleId = sampleId
                nHSmetricsFile = filter(idRegXcompile.match, HSmetricsFileList).pop()
                (decision) = SelectNormal(nHSmetricsFile, poolHsmetricsFile)
                if(decision == 'UnMatched'):
                    nBamFile = poolbamFile
                else:
                    if(args.verbose):
                        print "Matched Sample\n"
        if(os.path.isfile(tBamFile) and (os.path.isfile(nBamFile))):
            jobId = "iCallSV_" + str(count) + "_" + str(basename)
            cmdList = []
            cmd = args.python + " " + args.icsv + " -sc " + args.conf + " -bbam " + nBamFile + " -abam " + \
                tBamFile + " -aId " + tsampleId + " -bId " + nsampleId + " -op " + tsampleId + " -o " + outdir + " -v"
            # cmd = str(cmd)
            threads = int(args.threads)
            threads = threads + 1
            qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId + " -o " + jobId + ".stdout" + " -e " + jobId + ".stderr" + \
                " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + outdir + " -sync y " + " -b y " + cmd
            print "qsub_cmd:", qsub_cmd, "\n"
            cmdList.append(qsub_cmd)
            job = Job(
                RunJob,
                cmdList,
                kwlist=None,
                cleanup=True,
                mem_free="2G",
                name=jobId,
                num_slots=1,
                queue=args.queue)
            jobs.append(job)
    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(
        jobs,
        max_processes=10,
        temp_dir='/dmp/analysis/SCRATCH/',
        white_list=None,
        quiet=False,
        local=False)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))

    return
Example #15
def RunPerPool(titleFile, outdir, HSmetricsFileList, bamFileList, args):
    """This will run the pool to be analyzed.


    :param str titleFile: str of meta information file
    :param str outdir: str of output directory
    :param list HSmetricsFileList: list of picard hsmetrics files
    :param list bamFileList: list of bam files
    :param Namespace args: Namespace of args to get other variables
    :return: None
    :rtype: None

    """
    # Run Preprocess
    titleFileDF = pd.read_csv(titleFile,
                              sep='\t',
                              header=0,
                              keep_default_na='True')
    groupByPatientId = titleFileDF.groupby('Patient_ID')
    baseNames = {}
    jobs = []
    poolidRegXcompile = re.compile('.*(PoolNormal|PooledNormal).*')
    poolHsmetricsFile = filter(poolidRegXcompile.match,
                               HSmetricsFileList).pop()
    poolbamFile = filter(poolidRegXcompile.match, bamFileList).pop()
    for patientID, group in groupByPatientId:
        print patientID, ":"
        tsampleId = ''
        tBamFile = ''
        nBamFile = ''
        basename = ''
        nsampleId = ''
        if (os.path.isdir(outdir)):
            if (args.verbose):
                print "Pool Output Dir:", outdir, "exists!!!"
        else:
            os.mkdir(outdir)
            os.chmod(outdir, 0o755)
        for count, row in group.iterrows():
            sampleId = row.loc['Sample_ID']
            patientId = row.loc['Patient_ID']
            sampleClass = row.loc['Class']
            idRegXcompile = re.compile('.*' + sampleId + '.*')
            if (sampleClass == "Tumor"):
                basename = sampleId
                tBamFile = filter(idRegXcompile.match, bamFileList).pop()
                os.symlink(tBamFile,
                           os.path.join(outdir, os.path.basename(tBamFile)))
                tBamFile = os.path.join(outdir, os.path.basename(tBamFile))
                tsampleId = sampleId
            if (sampleClass == "Normal"):
                nBamFile = filter(idRegXcompile.match, bamFileList).pop()
                os.symlink(nBamFile,
                           os.path.join(outdir, os.path.basename(nBamFile)))
                nBamFile = os.path.join(outdir, os.path.basename(nBamFile))
                nsampleId = sampleId
                nHSmetricsFile = filter(idRegXcompile.match,
                                        HSmetricsFileList).pop()
                (decision) = SelectNormal(nHSmetricsFile, poolHsmetricsFile)
                if (decision == 'UnMatched'):
                    nBamFile = poolbamFile
                else:
                    if (args.verbose):
                        print "Matched Sample\n"
        if (os.path.isfile(tBamFile) and (os.path.isfile(nBamFile))):
            jobId = "iCallSV_" + str(count) + "_" + str(basename)
            cmdList = []
            cmd = args.python + " " + args.icsv + " -sc " + args.conf + " -bbam " + nBamFile + " -abam " + \
                tBamFile + " -aId " + tsampleId + " -bId " + nsampleId + " -op " + tsampleId + " -o " + outdir + " -v"
            # cmd = str(cmd)
            threads = int(args.threads)
            threads = threads + 1
            qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId + " -o " + jobId + ".stdout" + " -e " + jobId + ".stderr" + \
                " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + outdir + " -sync y " + " -b y " + cmd
            print "qsub_cmd:", qsub_cmd, "\n"
            cmdList.append(qsub_cmd)
            job = Job(RunJob,
                      cmdList,
                      kwlist=None,
                      cleanup=True,
                      mem_free="2G",
                      name=jobId,
                      num_slots=1,
                      queue=args.queue)
            jobs.append(job)
    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(jobs,
                               max_processes=10,
                               temp_dir='/dmp/analysis/SCRATCH/',
                               white_list=None,
                               quiet=False,
                               local=False)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))

    return
Example #16
def run_configuration(config_file,
                      local=False,
                      overwrite=True,
                      queue='all.q',
                      hosts=None,
                      write_summary=True,
                      quiet=False,
                      ablation=0,
                      resume=False,
                      log_level=logging.INFO):
    """
    Takes a configuration file and runs the specified jobs on the grid.

    Parameters
    ----------
    config_file : str
        Path to the configuration file we would like to use.
    local : bool, optional
        Should this be run locally instead of on the cluster?
        Defaults to ``False``.
    overwrite : bool, optional
        If the model files already exist, should we overwrite
        them instead of re-using them?
        Defaults to ``True``.
    queue : str, optional
        The DRMAA queue to use if we're running on the cluster.
        Defaults to ``'all.q'``.
    hosts : list of str, optional
        If running on the cluster, these are the machines we should use.
        Defaults to ``None``.
    write_summary : bool, optional
        Write a TSV file with a summary of the results.
        Defaults to ``True``.
    quiet : bool, optional
        Suppress printing of "Loading..." messages.
        Defaults to ``False``.
    ablation : int, optional
        Number of features to remove when doing an ablation
        experiment. If positive, we will perform repeated ablation
        runs for all combinations of features removing the
        specified number at a time. If ``None``, we will use all
        combinations of all lengths. If 0, the default, no
        ablation is performed. If negative, a ``ValueError`` is
        raised.
        Defaults to 0.
    resume : bool, optional
        If result files already exist for an experiment, do not
        overwrite them. This is very useful when doing a large
        ablation experiment and part of it crashes.
        Defaults to ``False``.
    log_level : str, optional
        The level for logging messages.
        Defaults to ``logging.INFO``.

    Returns
    -------
    result_json_paths : list of str
        A list of paths to .json results files for each variation in the
        experiment.

    Raises
    ------
    ValueError
        If value for ``"ablation"`` is not a positive int or ``None``.
    OSError
        If the length of the ``FeatureSet`` name > 210.
    """

    try:

        # Read configuration
        (experiment_name, task, sampler, fixed_sampler_parameters,
         feature_hasher, hasher_features, id_col, label_col, train_set_name,
         test_set_name, suffix, featuresets, do_shuffle, model_path,
         do_grid_search, grid_objectives, probability, pipeline, results_path,
         pos_label_str, feature_scaling, min_feature_count, folds_file,
         grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
         save_cv_models, use_folds_file_for_grid_search, do_stratified_folds,
         fixed_parameter_list, param_grid_list, featureset_names, learners,
         prediction_dir, log_path, train_path, test_path, ids_to_floats,
         class_map, custom_learner_path, custom_metric_path,
         learning_curve_cv_folds_list, learning_curve_train_sizes,
         output_metrics) = parse_config_file(config_file, log_level=log_level)

        # get the main experiment logger that will already have been
        # created by the configuration parser so we don't need anything
        # except the name `experiment`.
        logger = get_skll_logger('experiment')

        # Check if we have gridmap
        if not local and not _HAVE_GRIDMAP:
            local = True
            logger.warning('gridmap 0.10.1+ not available. Forcing local '
                           'mode.  To run things on a DRMAA-compatible '
                           'cluster, install gridmap>=0.10.1 via pip.')

        # No grid search or ablation for learning curve generation
        if task == 'learning_curve':
            if ablation is None or ablation > 0:
                ablation = 0
                logger.warning("Ablating features is not supported during "
                               "learning curve generation. Ignoring.")

        # if we just had a train file and a test file, there are no real featuresets
        # in which case there are no features to ablate
        if len(featuresets) == 1 and len(featuresets[0]) == 1:
            if ablation is None or ablation > 0:
                ablation = 0
                logger.warning(
                    "Not enough featuresets for ablation. Ignoring.")

        # if performing ablation, expand featuresets to include combinations of
        # features within those sets
        if ablation is None or ablation > 0:
            # Make new feature set lists so that we can iterate without issue
            expanded_fs = []
            expanded_fs_names = []
            for features, featureset_name in zip(featuresets,
                                                 featureset_names):
                features = sorted(features)
                featureset = set(features)
                # Expand to all feature combinations if ablation is None
                if ablation is None:
                    for i in range(1, len(features)):
                        for excluded_features in combinations(features, i):
                            expanded_fs.append(
                                sorted(featureset - set(excluded_features)))
                            expanded_fs_names.append(
                                featureset_name + '_minus_' +
                                _munge_featureset_name(excluded_features))
                # Otherwise, just expand removing the specified number at a time
                else:
                    for excluded_features in combinations(features, ablation):
                        expanded_fs.append(
                            sorted(featureset - set(excluded_features)))
                        expanded_fs_names.append(
                            featureset_name + '_minus_' +
                            _munge_featureset_name(excluded_features))
                # Also add version with nothing removed as baseline
                expanded_fs.append(features)
                expanded_fs_names.append(featureset_name + '_all')

            # Replace original feature set lists
            featuresets = expanded_fs
            featureset_names = expanded_fs_names
        elif ablation < 0:
            raise ValueError('Value for "ablation" argument must be either '
                             'positive integer or None.')

        # the list of jobs submitted (if running on grid)
        if not local:
            jobs = []

        # the list to hold the paths to all the result json files
        result_json_paths = []

        # check if the length of the featureset_name exceeds the maximum length
        # allowed
        for featureset_name in featureset_names:
            if len(featureset_name) > 210:
                raise OSError(
                    'System generated file length "{}" exceeds the '
                    'maximum length supported.  Please specify names of '
                    'your datasets with "featureset_names".  If you are '
                    'running ablation experiment, please reduce the '
                    'length of the features in "featuresets" because the'
                    ' auto-generated name would be longer than the file '
                    'system can handle'.format(featureset_name))

        # if the task is learning curve, and ``metrics`` was specified, then
        # assign the value of ``metrics`` to ``grid_objectives`` - this lets
        # us piggyback on the parallelization of the objectives that is already
        # set up for us to use
        if task == 'learning_curve' and len(output_metrics) > 0:
            grid_objectives = output_metrics

        # if there were no grid objectives provided, just set it to
        # a list containing a single None so as to allow the parallelization
        # to proceed and to pass the correct default value of grid_objective
        # down to _classify_featureset().
        if not grid_objectives:
            grid_objectives = [None]

        # Run each featureset-learner-objective combination
        for featureset, featureset_name in zip(featuresets, featureset_names):
            for learner_num, learner_name in enumerate(learners):
                for grid_objective in grid_objectives:

                    # for the individual job name, we need to add the feature set name
                    # and the learner name
                    if grid_objective is None or len(grid_objectives) == 1:
                        job_name_components = [
                            experiment_name, featureset_name, learner_name
                        ]
                    else:
                        job_name_components = [
                            experiment_name, featureset_name, learner_name,
                            grid_objective
                        ]

                    job_name = '_'.join(job_name_components)

                    # change the prediction prefix to include the feature set
                    prediction_prefix = join(prediction_dir, job_name)

                    # the log file that stores the actual output of this script (e.g.,
                    # the tuned parameters, what kind of experiment was run, etc.)
                    logfile = join(log_path, '{}.log'.format(job_name))

                    # Figure out result json file path
                    result_json_path = join(results_path,
                                            '{}.results.json'.format(job_name))

                    # save the path to the results json file that will be written
                    result_json_paths.append(result_json_path)

                    # If result file already exists and we're resuming, move on
                    if resume and (exists(result_json_path)
                                   and getsize(result_json_path)):
                        logger.info(
                            'Running in resume mode and %s exists, '
                            'so skipping job.', result_json_path)
                        continue

                    # create job if we're doing things on the grid
                    job_args = {}
                    job_args["experiment_name"] = experiment_name
                    job_args["task"] = task
                    job_args["sampler"] = sampler
                    job_args["feature_hasher"] = feature_hasher
                    job_args["hasher_features"] = hasher_features
                    job_args["job_name"] = job_name
                    job_args["featureset"] = featureset
                    job_args["featureset_name"] = featureset_name
                    job_args["learner_name"] = learner_name
                    job_args["train_path"] = train_path
                    job_args["test_path"] = test_path
                    job_args["train_set_name"] = train_set_name
                    job_args["test_set_name"] = test_set_name
                    job_args["shuffle"] = do_shuffle
                    job_args["model_path"] = model_path
                    job_args["prediction_prefix"] = prediction_prefix
                    job_args["grid_search"] = do_grid_search
                    job_args["grid_objective"] = grid_objective
                    job_args['output_metrics'] = output_metrics
                    job_args["suffix"] = suffix
                    job_args["log_file"] = logfile
                    job_args["log_level"] = log_level
                    job_args["probability"] = probability
                    job_args["pipeline"] = pipeline
                    job_args["results_path"] = results_path
                    job_args["sampler_parameters"] = (
                        fixed_sampler_parameters
                        if fixed_sampler_parameters else dict())
                    job_args["fixed_parameters"] = (
                        fixed_parameter_list[learner_num]
                        if fixed_parameter_list else dict())
                    job_args["param_grid"] = (param_grid_list[learner_num]
                                              if param_grid_list else None)
                    job_args["pos_label_str"] = pos_label_str
                    job_args["overwrite"] = overwrite
                    job_args["feature_scaling"] = feature_scaling
                    job_args["min_feature_count"] = min_feature_count
                    job_args["grid_search_jobs"] = grid_search_jobs
                    job_args["grid_search_folds"] = grid_search_folds
                    job_args["folds_file"] = folds_file
                    job_args["cv_folds"] = cv_folds
                    job_args["save_cv_folds"] = save_cv_folds
                    job_args["save_cv_models"] = save_cv_models
                    job_args[
                        "use_folds_file_for_grid_search"] = use_folds_file_for_grid_search
                    job_args["do_stratified_folds"] = do_stratified_folds
                    job_args["label_col"] = label_col
                    job_args["id_col"] = id_col
                    job_args["ids_to_floats"] = ids_to_floats
                    job_args["quiet"] = quiet
                    job_args["class_map"] = class_map
                    job_args["custom_learner_path"] = custom_learner_path
                    job_args["custom_metric_path"] = custom_metric_path
                    job_args[
                        "learning_curve_cv_folds"] = learning_curve_cv_folds_list[
                            learner_num]
                    job_args[
                        "learning_curve_train_sizes"] = learning_curve_train_sizes

                    if not local:
                        jobs.append(
                            Job(_classify_featureset, [job_args],
                                num_slots=(MAX_CONCURRENT_PROCESSES if
                                           (do_grid_search or task
                                            == 'learning_curve') else 1),
                                name=job_name,
                                queue=queue))
                    else:
                        _classify_featureset(job_args)

        # Call get_skll_logger again after _classify_featureset
        # calls are finished so that any warnings that may
        # happen after this point get correctly logged to the
        # main logger
        logger = get_skll_logger('experiment')

        # submit the jobs (if running on grid)
        if not local and _HAVE_GRIDMAP:
            if log_path:
                job_results = process_jobs(jobs,
                                           white_list=hosts,
                                           temp_dir=log_path)
            else:
                job_results = process_jobs(jobs, white_list=hosts)
            _check_job_results(job_results)

        # write out the summary results file
        if (task == 'cross_validate' or task == 'evaluate') and write_summary:
            summary_file_name = experiment_name + '_summary.tsv'
            with open(join(results_path, summary_file_name), 'w',
                      newline='') as output_file:
                _write_summary_file(result_json_paths,
                                    output_file,
                                    ablation=ablation)
        elif task == 'learning_curve':
            output_file_name = experiment_name + '_summary.tsv'
            output_file_path = join(results_path, output_file_name)
            with open(output_file_path, 'w', newline='') as output_file:
                _write_learning_curve_file(result_json_paths, output_file)

            # generate the actual plot if we have the requirements installed
            generate_learning_curve_plots(experiment_name, results_path,
                                          output_file_path)

    finally:

        # Close/remove any logger handlers
        close_and_remove_logger_handlers(get_skll_logger('experiment'))

    return result_json_paths
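
The ablation branch above expands every feature set into the subsets obtained by removing ``ablation`` features at a time (or every possible subset when ``ablation`` is ``None``), plus the untouched set as a baseline. A small worked sketch of the positive-integer case with hypothetical feature names:

from itertools import combinations

features = sorted(["length", "ngrams", "pos"])
featureset = set(features)
ablation = 1

expanded = []
for excluded in combinations(features, ablation):
    # Drop `ablation` features at a time and keep the rest, sorted.
    expanded.append(sorted(featureset - set(excluded)))
expanded.append(features)  # baseline with nothing removed

# expanded == [['ngrams', 'pos'], ['length', 'pos'], ['length', 'ngrams'],
#              ['length', 'ngrams', 'pos']]
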
Example #17
def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
                      hosts=None, write_summary=True, quiet=False,
                      ablation=0, resume=False):
    """
    Takes a configuration file and runs the specified jobs on the grid.

    :param config_file: Path to the configuration file we would like to use.
    :type config_file: str
    :param local: Should this be run locally instead of on the cluster?
    :type local: bool
    :param overwrite: If the model files already exist, should we overwrite
                      them instead of re-using them?
    :type overwrite: bool
    :param queue: The DRMAA queue to use if we're running on the cluster.
    :type queue: str
    :param hosts: If running on the cluster, these are the machines we should
                  use.
    :type hosts: list of str
    :param write_summary: Write a tsv file with a summary of the results.
    :type write_summary: bool
    :param quiet: Suppress printing of "Loading..." messages.
    :type quiet: bool
    :param ablation: Number of features to remove when doing an ablation
                     experiment. If positive, we will perform repeated ablation
                     runs for all combinations of features removing the
                     specified number at a time. If ``None``, we will use all
                     combinations of all lengths. If 0, the default, no
                     ablation is performed. If negative, a ``ValueError`` is
                     raised.
    :type ablation: int or None
    :param resume: If result files already exist for an experiment, do not
                   overwrite them. This is very useful when doing a large
                   ablation experiment and part of it crashes.
    :type resume: bool

    :return: A list of paths to .json results files for each variation in the
             experiment.
    :rtype: list of str

    """
    # Initialize logger
    logger = logging.getLogger(__name__)

    # Read configuration
    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map,
     custom_learner_path) = _parse_config_file(config_file)

    # Check if we have gridmap
    if not local and not _HAVE_GRIDMAP:
        local = True
        logger.warning('gridmap 0.10.1+ not available. Forcing local '
                       'mode.  To run things on a DRMAA-compatible '
                       'cluster, install gridmap>=0.10.1 via pip.')

    # if performing ablation, expand featuresets to include combinations of
    # features within those sets
    if ablation is None or ablation > 0:
        # Make new feature set lists so that we can iterate without issue
        expanded_fs = []
        expanded_fs_names = []
        for features, featureset_name in zip(featuresets, featureset_names):
            features = sorted(features)
            featureset = set(features)
            # Expand to all feature combinations if ablation is None
            if ablation is None:
                for i in range(1, len(features)):
                    for excluded_features in combinations(features, i):
                        expanded_fs.append(sorted(featureset -
                                                  set(excluded_features)))
                        expanded_fs_names.append(
                            featureset_name +
                            '_minus_' +
                            _munge_featureset_name(excluded_features))
            # Otherwise, just expand removing the specified number at a time
            else:
                for excluded_features in combinations(features, ablation):
                    expanded_fs.append(sorted(featureset -
                                              set(excluded_features)))
                    expanded_fs_names.append(
                        featureset_name +
                        '_minus_' +
                        _munge_featureset_name(excluded_features))
            # Also add version with nothing removed as baseline
            expanded_fs.append(features)
            expanded_fs_names.append(featureset_name + '_all')

        # Replace original feature set lists
        featuresets = expanded_fs
        featureset_names = expanded_fs_names
    elif ablation < 0:
        raise ValueError('Value for "ablation" argument must be either '
                         'positive integer or None.')

    # the list of jobs submitted (if running on grid)
    if not local:
        jobs = []

    # the list to hold the paths to all the result json files
    result_json_paths = []

    # check if the length of the featureset_name exceeds the maximum length
    # allowed
    for featureset_name in featureset_names:
        if len(featureset_name) > 210:
            raise OSError('System generated file length "{}" exceeds the '
                          'maximum length supported.  Please specify names of '
                          'your datasets with "featureset_names".  If you are '
                          'running ablation experiment, please reduce the '
                          'length of the features in "featuresets" because the'
                          ' auto-generated name would be longer than the file '
                          'system can handle'.format(featureset_name))

    # Run each featureset-learner combination
    for featureset, featureset_name in zip(featuresets, featureset_names):
        for learner_num, learner_name in enumerate(learners):

            # for the individual job name, we need to add the feature set name
            # and the learner name
            job_name_components = [experiment_name, featureset_name,
                                   learner_name]
            job_name = '_'.join(job_name_components)

            # change the prediction prefix to include the feature set
            prediction_prefix = join(prediction_dir, job_name)

            # the log file that stores the actual output of this script (e.g.,
            # the tuned parameters, what kind of experiment was run, etc.)
            temp_logfile = join(log_path, '{}.log'.format(job_name))

            # Figure out result json file path
            result_json_path = join(results_path,
                                    '{}.results.json'.format(job_name))

            # save the path to the results json file that will be written
            result_json_paths.append(result_json_path)

            # If result file already exists and we're resuming, move on
            if resume and (exists(result_json_path) and
                           os.path.getsize(result_json_path)):
                logger.info('Running in resume mode and %s exists, so skipping'
                            ' job.', result_json_path)
                continue

            # create job if we're doing things on the grid
            job_args = {}
            job_args["experiment_name"] = experiment_name
            job_args["task"] = task
            job_args["sampler"] = sampler
            job_args["feature_hasher"] = feature_hasher
            job_args["hasher_features"] = hasher_features
            job_args["job_name"] = job_name
            job_args["featureset"] = featureset
            job_args["featureset_name"] = featureset_name
            job_args["learner_name"] = learner_name
            job_args["train_path"] = train_path
            job_args["test_path"] = test_path
            job_args["train_set_name"] = train_set_name
            job_args["test_set_name"] = test_set_name
            job_args["shuffle"] = do_shuffle
            job_args["model_path"] = model_path
            job_args["prediction_prefix"] = prediction_prefix
            job_args["grid_search"] = do_grid_search
            job_args["grid_objective"] = grid_objective
            job_args["suffix"] = suffix
            job_args["log_path"] = temp_logfile
            job_args["probability"] = probability
            job_args["results_path"] = results_path
            job_args["sampler_parameters"] = (fixed_sampler_parameters
                                              if fixed_sampler_parameters
                                              else dict())
            job_args["fixed_parameters"] = (fixed_parameter_list[learner_num]
                                            if fixed_parameter_list
                                            else dict())
            job_args["param_grid"] = (param_grid_list[learner_num]
                                      if param_grid_list else None)
            job_args["pos_label_str"] = pos_label_str
            job_args["overwrite"] = overwrite
            job_args["feature_scaling"] = feature_scaling
            job_args["min_feature_count"] = min_feature_count
            job_args["grid_search_jobs"] = grid_search_jobs
            job_args["grid_search_folds"] = grid_search_folds
            job_args["cv_folds"] = cv_folds
            job_args["do_stratified_folds"] = do_stratified_folds
            job_args["label_col"] = label_col
            job_args["id_col"] = id_col
            job_args["ids_to_floats"] = ids_to_floats
            job_args["quiet"] = quiet
            job_args["class_map"] = class_map
            job_args["custom_learner_path"] = custom_learner_path

            if not local:
                jobs.append(Job(_classify_featureset, [job_args],
                                num_slots=(MAX_CONCURRENT_PROCESSES if
                                           do_grid_search else 1),
                                name=job_name, queue=queue))
            else:
                _classify_featureset(job_args)
    test_set_name = basename(test_path)

    # submit the jobs (if running on grid)
    if not local and _HAVE_GRIDMAP:
        if log_path:
            job_results = process_jobs(jobs, white_list=hosts,
                                       temp_dir=log_path)
        else:
            job_results = process_jobs(jobs, white_list=hosts)
        _check_job_results(job_results)

    # write out the summary results file
    if (task == 'cross_validate' or task == 'evaluate') and write_summary:
        summary_file_name = experiment_name + '_summary.tsv'
        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
        with open(join(results_path, summary_file_name),
                  file_mode) as output_file:
            _write_summary_file(result_json_paths, output_file,
                                ablation=ablation)

    return result_json_paths
Example #18
def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
                      hosts=None, write_summary=True, quiet=False,
                      ablation=0, resume=False):
    """
    Takes a configuration file and runs the specified jobs on the grid.

    :param config_file: Path to the configuration file we would like to use.
    :type config_file: str
    :param local: Should this be run locally instead of on the cluster?
    :type local: bool
    :param overwrite: If the model files already exist, should we overwrite
                      them instead of re-using them?
    :type overwrite: bool
    :param queue: The DRMAA queue to use if we're running on the cluster.
    :type queue: str
    :param hosts: If running on the cluster, these are the machines we should
                  use.
    :type hosts: list of str
    :param write_summary: Write a tsv file with a summary of the results.
    :type write_summary: bool
    :param quiet: Suppress printing of "Loading..." messages.
    :type quiet: bool
    :param ablation: Number of features to remove when doing an ablation
                     experiment. If positive, we will perform repeated ablation
                     runs for all combinations of features removing the
                     specified number at a time. If ``None``, we will use all
                     combinations of all lengths. If 0, the default, no
                     ablation is performed. If negative, a ``ValueError`` is
                     raised.
    :type ablation: int or None
    :param resume: If result files already exist for an experiment, do not
                   overwrite them. This is very useful when doing a large
                   ablation experiment and part of it crashes.
    :type resume: bool

    :return: A list of paths to .json results files for each variation in the
             experiment.
    :rtype: list of str

    """
    # Initialize logger
    logger = logging.getLogger(__name__)

    # Read configuration
    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objectives,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
     do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names,
     learners, prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_file)

    # Check if we have gridmap
    if not local and not _HAVE_GRIDMAP:
        local = True
        logger.warning('gridmap 0.10.1+ not available. Forcing local '
                       'mode.  To run things on a DRMAA-compatible '
                       'cluster, install gridmap>=0.10.1 via pip.')

    # if performing ablation, expand featuresets to include combinations of
    # features within those sets
    if ablation is None or ablation > 0:
        # Make new feature set lists so that we can iterate without issue
        expanded_fs = []
        expanded_fs_names = []
        for features, featureset_name in zip(featuresets, featureset_names):
            features = sorted(features)
            featureset = set(features)
            # Expand to all feature combinations if ablation is None
            if ablation is None:
                for i in range(1, len(features)):
                    for excluded_features in combinations(features, i):
                        expanded_fs.append(sorted(featureset -
                                                  set(excluded_features)))
                        expanded_fs_names.append(
                            featureset_name +
                            '_minus_' +
                            _munge_featureset_name(excluded_features))
            # Otherwise, just expand removing the specified number at a time
            else:
                for excluded_features in combinations(features, ablation):
                    expanded_fs.append(sorted(featureset -
                                              set(excluded_features)))
                    expanded_fs_names.append(
                        featureset_name +
                        '_minus_' +
                        _munge_featureset_name(excluded_features))
            # Also add version with nothing removed as baseline
            expanded_fs.append(features)
            expanded_fs_names.append(featureset_name + '_all')

        # Replace original feature set lists
        featuresets = expanded_fs
        featureset_names = expanded_fs_names
    elif ablation < 0:
        raise ValueError('Value for "ablation" argument must be either '
                         'positive integer or None.')

    # the list of jobs submitted (if running on grid)
    if not local:
        jobs = []

    # the list to hold the paths to all the result json files
    result_json_paths = []

    # check if the length of the featureset_name exceeds the maximum length
    # allowed
    for featureset_name in featureset_names:
        if len(featureset_name) > 210:
            raise OSError('System generated file length "{}" exceeds the '
                          'maximum length supported.  Please specify names of '
                          'your datasets with "featureset_names".  If you are '
                          'running ablation experiment, please reduce the '
                          'length of the features in "featuresets" because the'
                          ' auto-generated name would be longer than the file '
                          'system can handle'.format(featureset_name))

    # Run each featureset-learner combination
    for featureset, featureset_name in zip(featuresets, featureset_names):
        for learner_num, learner_name in enumerate(learners):
            for grid_objective in grid_objectives:

                # for the individual job name, we need to add the feature set name
                # and the learner name
                if len(grid_objectives) == 1:
                    job_name_components = [experiment_name, featureset_name,
                                           learner_name]
                else:
                    job_name_components = [experiment_name, featureset_name,
                                           learner_name, grid_objective]

                job_name = '_'.join(job_name_components)

                # change the prediction prefix to include the feature set
                prediction_prefix = join(prediction_dir, job_name)

                # the log file that stores the actual output of this script (e.g.,
                # the tuned parameters, what kind of experiment was run, etc.)
                temp_logfile = join(log_path, '{}.log'.format(job_name))

                # Figure out result json file path
                result_json_path = join(results_path,
                                        '{}.results.json'.format(job_name))

                # save the path to the results json file that will be written
                result_json_paths.append(result_json_path)

                # If result file already exists and we're resuming, move on
                if resume and (exists(result_json_path) and
                               os.path.getsize(result_json_path)):
                    logger.info('Running in resume mode and %s exists, '
                                'so skipping job.', result_json_path)
                    continue

                # create job if we're doing things on the grid
                job_args = {}
                job_args["experiment_name"] = experiment_name
                job_args["task"] = task
                job_args["sampler"] = sampler
                job_args["feature_hasher"] = feature_hasher
                job_args["hasher_features"] = hasher_features
                job_args["job_name"] = job_name
                job_args["featureset"] = featureset
                job_args["featureset_name"] = featureset_name
                job_args["learner_name"] = learner_name
                job_args["train_path"] = train_path
                job_args["test_path"] = test_path
                job_args["train_set_name"] = train_set_name
                job_args["test_set_name"] = test_set_name
                job_args["shuffle"] = do_shuffle
                job_args["model_path"] = model_path
                job_args["prediction_prefix"] = prediction_prefix
                job_args["grid_search"] = do_grid_search
                job_args["grid_objective"] = grid_objective
                job_args["suffix"] = suffix
                job_args["log_path"] = temp_logfile
                job_args["probability"] = probability
                job_args["results_path"] = results_path
                job_args["sampler_parameters"] = (fixed_sampler_parameters
                                                  if fixed_sampler_parameters
                                                  else dict())
                job_args["fixed_parameters"] = (fixed_parameter_list[learner_num]
                                                if fixed_parameter_list
                                                else dict())
                job_args["param_grid"] = (param_grid_list[learner_num]
                                          if param_grid_list else None)
                job_args["pos_label_str"] = pos_label_str
                job_args["overwrite"] = overwrite
                job_args["feature_scaling"] = feature_scaling
                job_args["min_feature_count"] = min_feature_count
                job_args["grid_search_jobs"] = grid_search_jobs
                job_args["grid_search_folds"] = grid_search_folds
                job_args["cv_folds"] = cv_folds
                job_args["save_cv_folds"] = save_cv_folds
                job_args["do_stratified_folds"] = do_stratified_folds
                job_args["label_col"] = label_col
                job_args["id_col"] = id_col
                job_args["ids_to_floats"] = ids_to_floats
                job_args["quiet"] = quiet
                job_args["class_map"] = class_map
                job_args["custom_learner_path"] = custom_learner_path

                if not local:
                    jobs.append(Job(_classify_featureset, [job_args],
                                    num_slots=(MAX_CONCURRENT_PROCESSES if
                                               do_grid_search else 1),
                                    name=job_name, queue=queue))
                else:
                    _classify_featureset(job_args)

    # submit the jobs (if running on grid)
    if not local and _HAVE_GRIDMAP:
        if log_path:
            job_results = process_jobs(jobs, white_list=hosts,
                                       temp_dir=log_path)
        else:
            job_results = process_jobs(jobs, white_list=hosts)
        _check_job_results(job_results)

    # write out the summary results file
    if (task == 'cross_validate' or task == 'evaluate') and write_summary:
        summary_file_name = experiment_name + '_summary.tsv'
        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
        with open(join(results_path, summary_file_name),
                  file_mode) as output_file:
            _write_summary_file(result_json_paths, output_file,
                                ablation=ablation)

    return result_json_paths
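The example above follows the basic gridmap pattern: wrap each unit of work in a Job, collect the jobs in a list, and hand the whole list to process_jobs, which returns one result per job. A minimal sketch of that pattern (the worker run_one and the wrapper submit_all are hypothetical stand-ins, not part of the example above):

from gridmap import Job, process_jobs


def run_one(job_args):
    # hypothetical stand-in for a worker such as _classify_featureset;
    # it must return something picklable so gridmap can ship it back
    return {'name': job_args['job_name'], 'status': 'done'}


def submit_all(all_job_args, queue='all.q', local=False):
    # one Job per argument dict, mirroring how the example builds its job list
    jobs = [Job(run_one, [job_args], name=job_args['job_name'],
                num_slots=1, queue=queue)
            for job_args in all_job_args]
    # process_jobs blocks until every job has finished and returns their outputs
    return process_jobs(jobs, local=local)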
Example #19
0
def RunPerPool(titleFile, outdir, HSmetricsFileList, bamFileList, segmentFileList, args, jobqueue):
    # Run Preprocess
    titleFileDF = pd.read_csv(titleFile, sep='\t', header=0, keep_default_na=True)
    groupByPatientId = titleFileDF.groupby('Patient_ID')
    baseNames = {}
    jobs = []
    poolidRegXcompile = re.compile('.*(PoolNormal|PooledNormal).*')
    poolHsmetricsFile = filter(poolidRegXcompile.match, HSmetricsFileList).pop()
    poolbamFile = filter(poolidRegXcompile.match, bamFileList).pop()
    for patientID, group in groupByPatientId:
        print patientID, ":"
        outTargetFile = ''
        tBamFile = ''
        nBamFile = ''
        basename = ''
        toutdir = ''
        if(os.path.isdir(outdir)):
            if(args.verbose):
                print "Pool Output Dir:", outdir, "exists!!!"
        else:
            os.mkdir(outdir)
            os.chmod(outdir, 0o755)
        for count, row in group.iterrows():
            bcId = row.loc['Barcode']
            poolId = row.loc['Pool']
            sampleId = row.loc['Sample_ID']
            patientId = row.loc['Patient_ID']
            sampleClass = row.loc['Class']
            idRegXcompile = re.compile('.*' + sampleId + '.*')
            if(sampleClass == "Tumor"):
                toutdir = outdir + "/" + sampleId
                if(os.path.isdir(toutdir)):
                    if(args.verbose):
                        print "Output Dir:", toutdir, "exists!!!"
                else:
                    os.mkdir(toutdir)
                    os.chmod(toutdir, 0o755)
                outTargetFile = toutdir + "/" + sampleId + "_targetRegion.bed"
                txt_fh = open(outTargetFile, "wb")
                txt_fh.write("chrom\tloc.start\tloc.end\n")
                basename = sampleId
                tBamFile = filter(idRegXcompile.match, bamFileList).pop()
                segfile = filter(idRegXcompile.match, segmentFileList).pop()
                if(segfile):
                    segFileDF = pd.read_csv(segfile, sep=' ', header=0, keep_default_na=True)
                    for segcount, segrow in segFileDF.iterrows():
                        chrom = segrow.loc['chrom']
                        start = segrow.loc['loc.start']
                        end = segrow.loc['loc.end']
                        txt_fh.write(str(chrom) + "\t" + str(start) + "\t" + str(end) + "\n")
                    txt_fh.close()
            if(sampleClass == "Normal"):
                nBamFile = filter(idRegXcompile.match, bamFileList).pop()
                nHSmetricsFile = filter(idRegXcompile.match, HSmetricsFileList).pop()
                decision = SelectNormal(nHSmetricsFile, poolHsmetricsFile)
                if(decision == 'UnMatched'):
                    nBamFile = poolbamFile
                else:
                    if(args.verbose):
                        print "Matched Sample\n"

        if(os.path.isfile(tBamFile) and os.path.isfile(nBamFile) and os.path.isfile(outTargetFile)):
            # Soft-link the tumor BAM and its index into the per-sample output
            # directory; the source index is "sample.bai", the destination link
            # is named "sample.bam.bai"
            file_dir, this_filename = os.path.split(tBamFile)
            destTBamFile = toutdir + "/" + this_filename
            tBaiFile = this_filename[:-1] + "i"
            destTBaiFile = toutdir + "/" + tBaiFile[:-4] + ".bam.bai"
            tBaiFile = file_dir + "/" + tBaiFile
            if(os.path.isfile(destTBamFile)):
                print destTBamFile, "File already exists!!"
            else:
                os.symlink(tBamFile, destTBamFile)
            if(os.path.isfile(destTBaiFile)):
                print destTBaiFile, "File already exists!!"
            else:
                os.symlink(tBaiFile, destTBaiFile)
            tBamFile = destTBamFile
            # Soft-link the normal BAM and its index in the same way
            file_dir, this_filename = os.path.split(nBamFile)
            destNBamFile = toutdir + "/" + this_filename
            nBaiFile = this_filename[:-1] + "i"
            destNBaiFile = toutdir + "/" + nBaiFile[:-4] + ".bam.bai"
            nBaiFile = file_dir + "/" + nBaiFile
            # print destNBamFile,"\n",destNBaiFile,"\n",nBamFile,"\n",nBaiFile
            if(os.path.isfile(destNBamFile)):
                print destNBamFile, "File already exists!!"
            else:
                os.symlink(nBamFile, destNBamFile)
            if(os.path.isfile(destNBaiFile)):
                print destNBaiFile, "File already exists!!"
            else:
                os.symlink(nBaiFile, destNBaiFile)
            nBamFile = destNBamFile
            jobId_preprocess = "Preprocess_" + str(count) + "_" + str(basename)
            baseNames[basename] = toutdir + "#" + jobId_preprocess
            outpklFile = toutdir + "/" + basename + '.MixClone.input.pkl'
            if(os.path.isfile(outpklFile)):
                continue
            else:
                cmdList = []
                cmd = args.python + " " + args.mixclone + " preprocess " + args.ref + " " + outTargetFile + " " + nBamFile + " " + tBamFile + " " + basename + \
                    " --min_depth " + args.minDepth + " --min_base_qual " + args.minBQ + " --min_map_qual " + args.minMQ + " --process_num " + args.threads
                # request one more SGE slot than MixClone's --process_num
                threads = int(args.threads) + 1
                qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId_preprocess + " -o " + jobId_preprocess + ".stdout" + " -e " + \
                    jobId_preprocess + ".stderr" + " -V -l h_vmem=6G,virtual_free=6G -pe smp " + str(threads) + " -wd " + toutdir + " -sync y " + " -b y " + cmd
                print "qsub_cmd:", qsub_cmd, "\n"
                cmdList.append(qsub_cmd)
                job = Job(
                    RunJob,
                    cmdList,
                    kwlist=None,
                    cleanup=True,
                    mem_free="2G",
                    name=jobId_preprocess,
                    num_slots=1,
                    queue=args.queue)
                jobs.append(job)
    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(
        jobs,
        max_processes=10,
        temp_dir='/dmp/analysis/SCRATCH/',
        white_list=None,
        quiet=False,
        local=False)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))

    # RunModel
    count = 0
    jobs = []
    for basename, jdata in baseNames.iteritems():
        (toutdir, jobId_preprocess) = jdata.split('#', 1)
        jobId_runmodel = "RunModel_" + str(count) + "_" + str(basename)
        outputbasename = basename + '_output'
        outpklFilePattern = toutdir + "/" + basename + '*.MixClone.output.pkl'
        outpklFiles = glob.glob(outpklFilePattern)
        if(outpklFiles):
            continue
        else:
            cmdList = []
            cmd = args.python + " " + args.mixclone + " run_model " + basename + " " + \
                outputbasename + " --max_copynumber 8 --subclone_num 3 --max_iters 30 --stop_value 1e-6"
            qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId_runmodel + " -o " + jobId_runmodel + ".stdout" + " -e " + \
                jobId_runmodel + ".stderr" + " -V -l h_vmem=6G,virtual_free=6G -pe smp 1" + " -wd " + toutdir + " -sync y " + "-b y " + cmd
            print "qsub_cmd:", qsub_cmd, "\n"
            cmdList.append(qsub_cmd)
            job = Job(
                RunJob,
                cmdList,
                kwlist=None,
                cleanup=True,
                mem_free="2G",
                name=jobId_runmodel,
                num_slots=1,
                queue=args.queue)
            jobs.append(job)
            count = count + 1

    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(
        jobs,
        max_processes=10,
        temp_dir='/dmp/analysis/SCRATCH/',
        white_list=None,
        quiet=False,
        local=False)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))

    # Run PostProcess
    count = 0
    jobs = []
    for basename, jdata in baseNames.iteritems():
        (toutdir, jobId_preprocess) = jdata.split('#', 1)
        outputbasename = basename + '_output'
        jobId_PP = "PostPorcess_" + str(count) + "_" + str(basename)
        cmdList = []
        cmd = args.python + " " + args.mixclone + " postprocess " + outputbasename
        qsub_cmd = args.qsub + " -q " + args.queue + " -N " + jobId_PP + " -o " + jobId_PP + ".stdout" + " -e " + jobId_PP + \
            ".stderr" + " -V -l h_vmem=6G,virtual_free=6G -pe smp 1" + " -wd " + toutdir + " -sync y " + " -b y " + cmd
        print "qsub_cmd:", qsub_cmd, "\n"
        cmdList.append(qsub_cmd)
        job = Job(
            RunJob,
            cmdList,
            kwlist=None,
            cleanup=True,
            mem_free="2G",
            name=jobId_PP,
            num_slots=1,
            queue=args.queue)
        jobs.append(job)
        count = count + 1

    print("sending function jobs to cluster")
    print("")

    job_outputs = process_jobs(
        jobs,
        max_processes=10,
        temp_dir='/dmp/analysis/SCRATCH/',
        white_list=None,
        quiet=False,
        local=False)

    print("results from each job")
    for (i, result) in enumerate(job_outputs):
        print("Job {0}- result: {1}".format(i, result))

    return
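RunJob itself is not shown in this example; it only needs to be a picklable function that executes the qsub command string it is given and returns something the result loop above can print. A minimal stand-in (an assumption, not the original implementation) might look like this:

import subprocess


def RunJob(cmd):
    # hypothetical helper: run the qsub command in a shell; because the
    # command uses "-sync y", qsub blocks until the cluster job finishes,
    # so the return code reflects the submitted job's exit status
    retcode = subprocess.call(cmd, shell=True)
    return (cmd, retcode)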