def run_overlap_evaluation(args, logger):
    """Compute pairwise overlap statistics per chromosome plus a
    genome-wide ('wg') summary and dump everything as JSON.

    :param args: namespace with workers and outputfile attributes
    :param logger: module logger
    :return: 0 on success
    """
    arglist = assemble_overlap_params(args)
    logger.debug('Assembled list of size {} for processing'.format(len(arglist)))
    results = []
    # accumulate per-comparison counts across all chromosomes ("whole genome")
    wg_counts = col.defaultdict(col.Counter)
    with mp.Pool(args.workers) as pool:
        resit = pool.imap_unordered(compute_pairwise_overlap, arglist, chunksize=1)
        for res in resit:
            logger.debug('Received results for {}'.format(res['chrom']))
            results.append(res)
            for comp, stats in res['stats'].items():
                wg_counts[comp].update(stats)
    logger.debug('All statistics collected')
    norm_dict = dict()
    for comp, stats in wg_counts.items():
        # Guard against empty comparisons: a union coverage of 0 bp would
        # otherwise raise ZeroDivisionError; define the Jaccard index as 0.
        union_bp = stats['union_cov_bp']
        stats['jaccard'] = stats['ovl_cov_bp'] / union_bp if union_bp > 0 else 0.
        norm_dict[comp] = dict(stats)
    results.append({'chrom': 'wg', 'roi_info': {}, 'stats': norm_dict})
    results = sorted(results, key=lambda x: x['chrom'])
    create_filepath(args.outputfile, logger)
    logger.debug('Dumping output json')
    with open(args.outputfile, 'w', encoding='ascii') as outf:
        json.dump(results, outf, indent=1, ensure_ascii=True)
    return 0
def run_compute_features(args):
    """Dispatch feature computation/sampling depending on the chosen task.

    Supported tasks: 'regress' (signal regression sampling, not actively
    supported), 'groups' (pos/neg HDF group features, not actively
    supported) and 'classify' (region features).

    :param args: namespace with task, selectchroms, outputfile and
        module_logger attributes
    :return: 0 on success
    :raises AssertionError: if task 'groups' lacks pos/neg group names
    :raises NotImplementedError: for unknown task names
    """
    logger = args.module_logger
    # shell quoting may leave literal double quotes around the pattern
    setattr(args, 'selectchroms', args.selectchroms.strip('"'))
    _ = create_filepath(args.outputfile, logger)
    logger.debug('Chromosome select pattern: {}'.format(args.selectchroms))
    if args.task == 'regress':
        logger.debug('Computing features/sampling data for task {}'.format(
            args.task))
        logger.warning('=== This run mode is not actively supported ===')
        logger.warning('=== Unexpected behavior / failure is likely ===')
        # "magic number" following common limits, e.g., in ChromImpute
        chromlim = CHROMOSOME_BOUNDARY
        _ = prepare_regsig_samples(args, chromlim, logger)
    elif args.task == 'groups':
        logger.debug('Computing features for task {}'.format(args.task))
        logger.warning('=== This run mode is not actively supported ===')
        logger.warning('=== Unexpected behavior / failure is likely ===')
        # Explicit check instead of a bare assert: asserts are stripped when
        # Python runs with -O, which would silently skip this validation.
        if not (args.posingroup and args.negingroup):
            raise AssertionError('Need to specify HDF groups for positive and negative class')
        _ = prepare_clsreg_samples(args, logger)
    elif args.task == 'classify':
        logger.debug('Computing region features for task: {}'.format(
            args.task))
        _ = prepare_scnreg_samples(args, logger)
    else:
        raise NotImplementedError('Task unknown: {}'.format(args.task))
    return 0
def prepare_clsreg_samples(args, logger):
    """Collect per-chromosome training data and store the positive and
    negative sample groups plus metadata in the output HDF file.

    :param args: namespace with outputfile, filemode and workers attributes
    :param logger: module logger
    :return: 0 on success
    """
    work_items = assemble_clsreg_args(args, logger)
    logger.debug('Argument list of size {} to process'.format(len(work_items)))
    _ = create_filepath(args.outputfile, logger)
    store_opts = {'complib': 'blosc', 'complevel': 9, 'encoding': 'utf-8'}
    with pd.HDFStore(args.outputfile, args.filemode, **store_opts) as hdfout:
        metadata = pd.DataFrame(columns=MD_FEATDATA_COLDEFS)
        with mp.Pool(args.workers) as pool:
            result_iter = pool.imap_unordered(get_region_traindata, work_items, chunksize=1)
            for chrom, pos_samples, posgrp, neg_samples, neggrp in result_iter:
                # positive and negative class samples are stored symmetrically
                for group_name, samples in [(posgrp, pos_samples), (neggrp, neg_samples)]:
                    grp, dataobj, metadata = gen_obj_and_md(
                        metadata, group_name, chrom, args, samples)
                    hdfout.put(grp, dataobj, format='fixed')
                hdfout.flush()
                logger.debug('Processed chromosome {}'.format(chrom))
        hdfout.put('metadata', metadata, format='table')
        hdfout.flush()
    logger.debug('Collecting training data done')
    return 0
def prepare_regsig_samples(args, chromlim, logger):
    """Sample signal training data per chromosome and persist the samples
    together with the accumulated metadata table in the output HDF file.

    :param args: namespace with outputfile, filemode, outputgroup, workers
    :param chromlim: chromosome boundary limit handed to the sampler
    :param logger: module logger
    :return: 0 on success
    """
    work_items = assemble_regsig_args(chromlim, args)
    logger.debug('Collecting training data')
    _ = create_filepath(args.outputfile, logger)
    with pd.HDFStore(args.outputfile, args.filemode, complevel=9,
                     complib='blosc', encoding='utf-8') as hdfout:
        with mp.Pool(args.workers) as pool:
            result_iter = pool.imap_unordered(sample_signal_traindata, work_items)
            metadata = pd.DataFrame(columns=MD_FEATDATA_COLDEFS)
            for chrom, chrom_samples in result_iter:
                logger.debug('Processed chromosome {}'.format(chrom))
                grp, dataobj, metadata = gen_obj_and_md(
                    metadata, args.outputgroup, chrom, args, chrom_samples)
                hdfout.put(grp, dataobj, format='fixed')
                hdfout.flush()
        hdfout.put('metadata', metadata, format='table')
    return 0
def run_bedgraph_conversion(args, logger):
    """Convert bedGraph signal input into per-chromosome HDF datasets,
    logging available system memory along the way.

    :param args: namespace with chromsizes, selectchroms, inputfiles,
        outputfile, outputgroup, filemode and workers attributes
    :param logger: module logger
    :return: 0 on success
    """
    def _avail_gb():
        # available system memory, rounded, for progress logging
        return round(psu.virtual_memory().available / DIV_B_TO_GB, 2)

    csizes = read_chromosome_sizes(args.chromsizes, args.selectchroms)
    logger.debug('Processing {} chromosome(s)'.format(len(csizes)))
    worker_args = assemble_worker_args(csizes, args)
    logger.debug('Start processing, available memory: {}GB'.format(_avail_gb()))
    create_filepath(args.outputfile, logger)
    with pd.HDFStore(args.outputfile, args.filemode,
                     complevel=9, complib='blosc') as hdfout:
        with mp.Pool(args.workers) as pool:
            # resume support: reuse metadata table if the file already has one
            if 'metadata' in hdfout:
                metadata = hdfout['metadata']
            else:
                metadata = pd.DataFrame(columns=MD_SIGNAL_COLDEFS)
            result_iter = pool.imap_unordered(process_signal, worker_args, chunksize=1)
            logger.debug('Start processing chromosomes...')
            for chrom, valobj in result_iter:
                logger.debug('Chromosome {} completed'.format(chrom))
                if valobj is None:
                    logger.warning(
                        'No data (or all zero) for chromosome {} in input file(s)'
                        .format(chrom))
                    continue
                grp, valobj, metadata = gen_obj_and_md(
                    metadata, args.outputgroup, chrom, args.inputfiles, valobj)
                hdfout.put(grp, valobj, format='fixed')
                hdfout.flush()
                logger.debug(
                    'Processed chromosome {} - available memory: {}'.format(
                        chrom, _avail_gb()))
        hdfout.put('metadata', metadata, format='table')
        hdfout.flush()
    logger.debug('HDF file closed: {}'.format(args.outputfile))
    logger.debug('Available memory: {}'.format(_avail_gb()))
    return 0
def check_input_files(filepaths, old_ext, new_ext, outdir, overwrite, logger):
    """Validate input HDF files for multi-sample normalization and build a
    per-chromosome annotation of input/output path information.

    :param filepaths: iterable of paths to input HDF files
    :param old_ext: file extension to replace in the output file name
    :param new_ext: replacement extension for the output file name
    :param outdir: output directory, or 'as_input' to reuse the input's dir
    :param overwrite: True if the output filemode would truncate files ([w])
    :param logger: module logger
    :return: dict mapping chromosome -> list of tuples
        (file name, file path, HDF load group, chromosome, new file path)
    :raises AssertionError: if a file stores several groups per chromosome,
        or if writing would overwrite an original input file
    """
    annotation = col.defaultdict(list)
    for fp in filepaths:
        fn = os.path.basename(fp)
        try:
            # fails (AssertionError) for files with more than one group
            # per chromosome - unsupported for this normalization mode
            root = get_default_group(fp)
        except AssertionError as ae:
            logger.error('File {} contains several groups per chromosome - '
                         'multi-sample normalization supports only one group '
                         'per chromosome per file.'.format(fp))
            raise ae
        else:
            chroms_in_file = get_chrom_list(fp, verify=False)
            new_fn = fn.replace(old_ext, new_ext)
            if outdir == 'as_input':
                outpath = os.path.dirname(fp)
            else:
                outpath = outdir
                create_filepath(outpath, logger)
            new_filepath = os.path.join(outpath, new_fn)
            if fn == new_fn:
                # extension replacement was a no-op; refuse to proceed if the
                # output path equals the input path and the mode truncates
                if (fp == new_filepath) and overwrite:
                    logger.error(
                        'Path to new file {} is identical to old one {} and filemode is'
                        ' set to overwrite [w] - this will result in data loss,'
                        ' cannot proceed.'.format(new_filepath, fp))
                    raise AssertionError(
                        'Cannot overwrite original file with new file')
            for chrom in chroms_in_file:
                load_group = os.path.join(root, chrom)
                infos = (fn, fp, load_group, chrom, new_filepath)
                annotation[chrom].append(infos)
    return annotation
def read_split_map(args, trgchroms, qrychroms, logger):
    """Read a target/query coordinate map file, split it per chromosome
    pair and store splits plus conservation masks in the output HDF file.

    :param args: namespace with inputfiles, indexcol, outputfile, filemode
        and minmap attributes
    :param trgchroms: dict chromosome name -> size for the target assembly
    :param qrychroms: dict chromosome name -> size for the query assembly
    :param logger: module logger
    :return: tuple (target chroms in map, query chroms in map, chrom pairs)
    """
    names = [
        'tchrom', 'tstart', 'tend', 'tstrand',
        'qchrom', 'qstart', 'qend', 'qstrand'
    ]
    # the position of the index column is configurable
    names.insert(args.indexcol, 'index')
    datatypes = {
        'tchrom': str, 'tstart': np.int32, 'tend': np.int32, 'tstrand': str,
        'qchrom': str, 'qstart': np.int32, 'qend': np.int32, 'qstrand': str,
        'index': np.int32
    }
    logger.debug('Reading map file...')
    mapdf = pd.read_csv(args.inputfiles[0], sep='\t', names=names,
                        index_col=args.indexcol, low_memory=True,
                        dtype=datatypes, compression='infer',
                        encoding='utf-8')
    size_in_mem = mapdf.values.nbytes
    logger.debug('Reading done - full map size: ~{}GiB with {} rows'.format(
        round(size_in_mem / DIV_B_TO_GB, 2), mapdf.shape[0]))
    # encode query strand as +1 / -1 for numeric processing downstream
    mapdf.replace({'qstrand': {'+': 1, '-': -1}}, inplace=True)
    # restrict the chromosome size dicts to chromosomes present in the map;
    # raises KeyError if the map references an unknown chromosome
    tchroms = dict((k, trgchroms[k]) for k in mapdf.tchrom.unique())
    logger.debug(
        'Identified {} chromosomes for target assembly in map file'.format(
            len(tchroms)))
    qchroms = dict((k, qrychroms[k]) for k in mapdf.qchrom.unique())
    logger.debug(
        'Identified {} chromosomes for query assembly in map file'.format(
            len(qchroms)))
    _ = create_filepath(args.outputfile, logger)
    with pd.HDFStore(args.outputfile, args.filemode, complevel=9,
                     complib='blosc', encoding='utf-8') as hdfout:
        chrompairs = save_splits(hdfout, tchroms, qchroms, mapdf, logger)
        save_conservation_masks(hdfout, tchroms, chrompairs, args.minmap,
                                'target', logger)
        save_conservation_masks(hdfout, qchroms, chrompairs, args.minmap,
                                'query', logger)
    return tchroms, qchroms, chrompairs
def run_compute_correlation(args):
    """Compute correlation statistics between two signal tracks per
    chromosome and write the results as JSON.

    :param args: namespace with task, measure, input/output file and group
        names, roifile/roilimit, mapfile/mapreference and workers attributes
    :return: 0 on success
    """
    logger = args.module_logger
    if args.task == 'cons':
        assert os.path.isfile(
            args.mapfile
        ), 'No valid path to map file for task "cons": {}'.format(args.mapfile)
    # dispatch table: task name -> worker function
    run_funcs = {
        'cons': compute_corr_cons,
        'full': compute_corr_full,
        'active': compute_corr_active,
        'roi': compute_corr_roi
    }
    exec_fun = run_funcs[args.task]
    logger.debug('Statistics to compute: {}'.format(args.measure))
    arglist = assemble_worker_params(args)
    output = {
        'file_A': os.path.basename(args.inputfilea),
        'file_B': os.path.basename(args.inputfileb),
        # guard like map_file below: roifile is optional for non-roi tasks
        # and os.path.basename(None) would raise a TypeError
        'roi_file': 'None' if not args.roifile else os.path.basename(args.roifile),
        'roi_limit': args.roilimit,
        'map_file': 'None' if not args.mapfile else os.path.basename(args.mapfile),
        'map_reference': 'None' if not args.mapfile else args.mapreference,
        'group_A': 'default' if not args.inputgroupa else args.inputgroupa,
        'group_B': 'default' if not args.inputgroupb else args.inputgroupb,
        'task': args.task,
        'measure': args.measure,
        'correlations': []
    }
    logger.debug('Initializing worker pool')
    with mp.Pool(args.workers) as pool:
        resit = pool.imap_unordered(exec_fun, arglist, chunksize=1)
        for chrom, results in resit:
            logger.debug(
                'Computed correlation for chromosome {}'.format(chrom))
            output['correlations'].append((chrom, results))
    logger.debug('Finished computation')
    fpath = create_filepath(args.outputfile, logger)
    with open(fpath, 'w') as outfile:
        json.dump(output, outfile, indent=1, sort_keys=True)
    logger.debug('Output written to file {}'.format(args.outputfile))
    return 0
def run_classification(args, model, modelmd, loadgroups, logger):
    """Apply a trained model to a dataset - either for testing against
    known outputs or for prediction on new data - and write run metadata.

    :param args: namespace with task, input/output file and group attributes
    :param model: trained model object
    :param modelmd: model metadata dict (feature order, preprocessing info)
    :param loadgroups: HDF groups to load from the input dataset
    :param logger: module logger
    :return: 0 on success
    :raises ValueError: for unknown task names
    """
    _ = create_filepath(args.outputfile, logger)
    logger.debug('Loading dataset')
    dataset, output, dtinfo, sminfo, ftinfo = load_ml_dataset(
        args.inputfile, loadgroups, modelmd['feature_info']['order'],
        args, logger)
    orig_shape = dataset.shape
    if 'preprocess_info' in modelmd:
        logger.debug('Preprocessing data')
        dataset, _ = apply_preprocessor(
            dataset, modelmd['preprocess_info'], 'test')
        # preprocessing must not add or drop samples/features
        assert dataset.shape == orig_shape, 'Shape mismatch: {} {}'.format(
            orig_shape, dataset.shape)
    if args.task == 'test':
        out_md = run_classification_testdata(
            args, model, modelmd, dataset, output, logger)
    elif args.task == 'est':
        assert output is None, 'Loaded sample outputs from dataset for prediction task'
        out_md = run_classification_newdata(model, dataset, logger)
    else:
        raise ValueError('Unknown task for classification: {}'.format(
            args.task))
    out_md['model_info'] = modelmd['model_info']
    out_md['run_info'] = {
        'task': args.task,
        'model_file': os.path.basename(args.modelfile),
        'data_file': os.path.basename(args.inputfile),
        'data_group': args.inputgroup,
    }
    out_md['sample_info'] = sminfo
    out_md['feature_info'] = ftinfo
    out_md['dataset_info'] = dtinfo
    logger.debug('Writing metadata of run...')
    with open(args.outputfile, 'w') as outfile:
        _ = json.dump(out_md, outfile, indent=1, sort_keys=True)
    logger.debug('Metadata saved')
    return 0
def run_merge_datasets(args):
    """Merge or stack several region datasets - optionally extended with
    value information from additional files - into one HDF output file.

    :param args: namespace with inputfiles, valfile, outputfile, filemode,
        outputgroup, workers and stack attributes
    :return: 0 on success
    """
    logger = args.module_logger
    num_inputs = len(args.inputfiles)
    num_valfiles = len(args.valfile)
    if num_inputs > 1:
        assert num_inputs < 27, 'Merging of more than 26 datasets not supported at the moment'
        logger.debug('Preparing merge of datasets...')
    if num_valfiles > 0:
        logger.debug('Extending the final dataset with infos'
                     ' from {} additional file(s)'.format(num_valfiles))
    if num_inputs < 2 and num_valfiles == 0:
        # nothing to merge and nothing to extend with - bail out early
        logger.warning('No merging possible and no additional datasets specified.'
                       ' What do you want me to do, human?')
        return 0
    worker_params = assemble_worker_params(args)
    logger.debug('Assembled a parameter list of size {} to process'.format(len(worker_params)))
    _ = create_filepath(args.outputfile, logger)
    with pd.HDFStore(args.outputfile, args.filemode,
                     complevel=9, complib='blosc') as hdf:
        # resume support: reuse metadata table if the file already has one
        metadata = hdf['metadata'] if 'metadata' in hdf \
            else pd.DataFrame(columns=MD_REGION_COLDEFS)
        with mp.Pool(args.workers) as pool:
            logger.debug('Initialized {} worker process(es)'.format(args.workers))
            merge_fun = stack_datasets if args.stack else merge_extend_datasets
            result_iter = pool.imap_unordered(merge_fun, worker_params, chunksize=1)
            for chrom, dataobj in result_iter:
                logger.debug('Received data for chromosome {}'.format(chrom))
                grp, dataobj, metadata = region_generate(
                    metadata, args.outputgroup, chrom,
                    [args.inputfiles, args.valfile], dataobj)
                hdf.put(grp, dataobj, format='fixed')
                hdf.flush()
                logger.debug('Flushed data to file')
        hdf.put('metadata', metadata, format='table')
        hdf.flush()
    logger.debug('Merging complete')
    return 0
def run_map_signal(args):
    """Map signal data from a target assembly onto query assembly
    coordinates and store the mapped per-chromosome signal in HDF.

    Worker processes fill per-chromosome arrays in the module-global
    shared-memory cache ``_shm_carr`` in place; this relies on process
    forking and hence does not work on Windows (see comment below).

    :param args: namespace with inputfile/-group, outputfile/-group,
        mapfile, selectchroms, allocate, filemode and workers attributes
    :return: 0 on success
    """
    # baseline to report RAM growth relative to the start of this run
    baseline_mem = round(psu.virtual_memory().active / DIV_B_TO_GB, 2)
    logger = args.module_logger
    # shell quoting may leave literal double quotes around the pattern
    setattr(args, 'selectchroms', args.selectchroms.strip('"'))
    logger.debug('Chromosome select pattern for query [map to]: {}'.format(
        args.selectchroms))
    _, ingroup, infile = check_path_infos(args.inputfile, args.inputgroup)
    _, outgroup, outfile = check_path_infos(args.outputfile, args.outputgroup)
    qchroms = extract_chromsizes_from_map(args.mapfile, 'query',
                                          args.selectchroms)
    num_qchroms = len(qchroms)
    tchroms = get_chrom_list(infile, verify=True)
    logger.debug(
        'Chromosomes in target data file [map from]: {}'.format(tchroms))
    meminfo = round(psu.virtual_memory().active / DIV_B_TO_GB - baseline_mem, 2)
    logger.debug('Occupied RAM: {}GB'.format(meminfo))
    _ = create_filepath(args.outputfile, logger)
    logger.debug('Processing {} query chromosomes at a time'.format(
        args.allocate))
    meminfo = round(psu.virtual_memory().active / DIV_B_TO_GB - baseline_mem, 2)
    logger.debug('Start processing - occupied RAM: {}GB'.format(meminfo))
    global _shm_carr
    with pd.HDFStore(outfile, args.filemode, complib='blosc',
                     complevel=9, encoding='utf-8') as hdf:
        if '/metadata' in hdf:
            metadata = hdf['metadata']
        else:
            metadata = pd.DataFrame(columns=MD_SIGNAL_COLDEFS)
        # process query chromosomes in batches of size args.allocate to
        # bound the amount of shared memory held at any one time
        while len(qchroms) > 0:
            logger.debug('Query chromosomes left: {}'.format(len(qchroms)))
            indexlists = assemble_worker_params(infile, ingroup, args.mapfile,
                                                args.allocate, tchroms,
                                                qchroms, _shm_carr)
            logger.debug('Processing query chromosomes: {}'.format(
                sorted(_shm_carr.keys())))
            logger.debug('Parameter list of size {} created'.format(
                len(indexlists)))
            meminfo = round(
                psu.virtual_memory().active / DIV_B_TO_GB - baseline_mem, 2)
            logger.debug(
                'Parameter list assembled - occupied RAM: {}GB'.format(
                    meminfo))
            # sanity counter: mapped positions per query chromosome as
            # reported by the workers
            check_coverage = col.defaultdict(int)
            # the following only works on fork platforms (no Windows support)
            with mp.Pool(args.workers) as pool:
                resit = pool.imap_unordered(map_signal_data, indexlists,
                                            chunksize=1)
                for res in resit:
                    for item in res:
                        qchrom, tchrom, qcov = item
                        check_coverage[qchrom] += qcov
                logger.debug('Worker pool finished')
            for chrom, valobj in _shm_carr.items():
                grp, valobj, metadata = gen_obj_and_md(
                    metadata, outgroup, chrom, infile,
                    pd.Series(valobj[:], dtype=np.float64))
                # valobj is a pandas.Series at this point
                actual_cov = (valobj > 0).sum()
                logger.debug(
                    'Non-zero coverage in chrom {}: {} (mapped pos.: {})'.
                    format(chrom, actual_cov, check_coverage[chrom]))
                hdf.put(grp, valobj, format='fixed')
                hdf.flush()
                logger.debug('Stored data for chromosome {}'.format(chrom))
                meminfo = round(
                    psu.virtual_memory().active / DIV_B_TO_GB - baseline_mem,
                    2)
                logger.debug('Occupied RAM: {}GB'.format(meminfo))
            # drop the shared arrays of the finished batch before the next
            for k in list(_shm_carr.keys()):
                del _shm_carr[k]
        assert len(
            hdf.keys()) >= num_qchroms, 'Signal mapping incomplete: {}'.format(
                hdf.keys())
        hdf.put('metadata', metadata, format='table')
        hdf.flush()
    logger.debug('Metadata saved')
    return 0
def run_train_model(args):
    """Train a model - optionally with grid-search CV tuning - and save
    the fitted model (pickle) plus run metadata (JSON).

    :param args: namespace with modelspec, modelout, metadataout,
        inputfile, inputgroup, notuning, cvfolds, workers and calcweights
    :return: 0 on success
    :raises NotImplementedError: if args.calcweights is set
    """
    logger = args.module_logger
    _ = create_filepath(args.modelout, logger)
    logger.debug('Loading model specification from {}'.format(args.modelspec))
    # context manager so the spec file handle is closed deterministically
    # (the original json.load(open(...)) leaked the handle)
    with open(args.modelspec) as specfile:
        model_spec = json.load(specfile)
    model = load_model(model_spec['module_path'], model_spec['model_name'])
    load_groups = get_valid_hdf5_groups(args.inputfile, args.inputgroup)
    traindata, targets, dtinfo, sminfo, ftinfo = load_ml_dataset(
        args.inputfile, load_groups, None, args, logger)
    assert traindata.shape[0] > 1, 'No samples (rows) in training data'
    assert traindata.shape[1] > 1, 'No features (columns) in training data'
    if 'preprocess' in model_spec and model_spec['preprocess']:
        logger.debug('Preprocessing dataset with method: {}'.format(
            model_spec['preprocessor']['preprocessor_name']))
        traindata, prepinfo = apply_preprocessor(
            traindata, model_spec['preprocessor'], 'train')
    else:
        prepinfo = None
    if targets is not None:
        assert targets.size == traindata.shape[0], \
            'Mismatch num targets {} and num samples {}'.format(
                targets.size, traindata.shape[0])
    run_metadata = {'dataset_info': dtinfo, 'sample_info': sminfo,
                    'feature_info': ftinfo, 'model_info': dict()}
    if prepinfo is not None:
        run_metadata['preprocess_info'] = prepinfo
    logger.debug('Training model')
    # Always create the training_info record: the 'class_order' entry below
    # is written for classifiers on BOTH paths and previously raised a
    # KeyError when training with --no-tuning (dict only existed after CV).
    run_metadata['training_info'] = dict()
    if args.notuning:
        params = model_spec['default']
        model = train_nocv(model, params, traindata, targets,
                           sminfo['weights'])
        run_metadata['model_info']['params'] = params
        run_metadata['model_info']['tuned'] = False
    else:
        params = model_spec['cvtune']
        tune_info = train_gridcv(model, params, traindata, targets,
                                 args.cvfolds, args.workers,
                                 sminfo['weights'])
        model = tune_info.best_estimator_
        run_metadata['model_info']['params'] = tune_info.best_params_
        run_metadata['model_info']['tuned'] = True
        run_metadata['training_info']['cv_scores'] = \
            simplify_cv_scores(tune_info.cv_results_)
        run_metadata['training_info']['best_score'] = tune_info.best_score_
        run_metadata['training_info']['best_index'] = int(tune_info.best_index_)
        run_metadata['training_info']['scoring'] = params['scoring']
    run_metadata['model_info']['name'] = model_spec['model_name']
    run_metadata['model_info']['type'] = model_spec['model_type']
    if model_spec['model_type'] == 'classifier':
        run_metadata['training_info']['class_order'] = \
            list(map(int, model.classes_))
    logger.debug('Training finished')
    if 'store_attributes' in model_spec:
        logger.debug('Storing user requested model attributes')
        attribs = extract_model_attributes(
            model, model_spec['store_attributes'], logger)
        run_metadata['attribute_info'] = attribs
    if args.calcweights:
        raise NotImplementedError('Currently not functional')
    logger.debug('Saving model and metadata')
    run_metadata['run_info'] = dict()
    run_metadata['run_info']['model_spec'] = os.path.basename(args.modelspec)
    run_metadata['run_info']['model_file'] = os.path.basename(args.modelout)
    run_metadata['run_info']['train_data'] = os.path.basename(args.inputfile)
    run_metadata['run_info']['train_group'] = args.inputgroup
    logger.debug('Writing model file...')
    with open(args.modelout, 'wb') as outfile:
        pck.dump(model, outfile)
    # metadata lands next to the model file unless a path was given
    if not args.metadataout:
        mdout = args.modelout.rsplit('.', 1)[0] + '.json'
    else:
        mdout = args.metadataout
    _ = create_filepath(mdout, logger)
    logger.debug('Writing model metadata...')
    with open(mdout, 'w') as outfile:
        _ = json.dump(run_metadata, outfile, indent=1, sort_keys=True)
    logger.debug('Done')
    return 0
def run_region_conversion(args, logger):
    """Convert BED-like region files into per-chromosome HDF datasets,
    merging overlapping intervals when several input files are given.

    :param args: namespace with inputfiles, useheader, nameidx, outputfile,
        outputgroup, filemode and workers attributes
    :param logger: module logger
    :return: 0 on success
    :raises AssertionError: if useheader is set with more than one input file
    """
    if args.useheader:
        assert len(args.inputfiles) == 1, \
            'Too many input files. Cannot use header information when merging' \
            ' several input files (since merging only works on overlapping' \
            ' intervals defined by start and end coordinate).'
    arglist = assemble_worker_args(args, logger)
    logger.debug('Start processing {} region file(s)'.format(
        len(args.inputfiles)))
    _ = create_filepath(args.outputfile, logger)
    with pd.HDFStore(args.outputfile, args.filemode, complevel=9,
                     complib='blosc') as hdfout:
        # resume support: reuse metadata table if the file already has one
        if 'metadata' in hdfout:
            metadata = hdfout['metadata']
        else:
            metadata = pd.DataFrame(columns=MD_REGION_COLDEFS)
        with mp.Pool(args.workers) as pool:
            all_chroms = set()
            all_regions = None
            logger.debug('Iterating results')
            resit = pool.imap_unordered(process_regions, arglist, chunksize=1)
            for regobj, chroms in resit:
                logger.debug('Received {} regions'.format(regobj.shape[0]))
                # collect all chromosomes in dataset(s)
                all_chroms |= chroms
                if all_regions is None:
                    all_regions = regobj
                else:
                    # note to self: concat does not accept aliases 'index' and 'columns' for parameter axis
                    all_regions = pd.concat([all_regions, regobj], axis=0,
                                            ignore_index=True, join='inner')
            logger.debug('All files processed...')
            if len(args.inputfiles) > 1:
                logger.debug('Merging {} files...'.format(len(
                    args.inputfiles)))
                all_regions = merge_overlapping_regions(
                    all_regions, all_chroms)
                logger.debug('Merging resulted in {} regions'.format(
                    all_regions.shape[0]))
            # note here that if the file contains a "name" field in the header
            # the user does not need to specify name-idx
            if args.nameidx == -1 and 'name' not in all_regions.columns:
                # synthesize region names from the row index
                all_regions = all_regions.assign(
                    name=lambda x: ['region_' + str(idx) for idx in x.index])
            logger.debug('Identified {} chromosomes in dataset(s)'.format(
                len(all_chroms)))
            for chrom in sorted(all_chroms):
                chrom_regions = all_regions[all_regions.chrom == chrom]
                logger.debug('Collected {} regions from chromosome {}'.format(
                    chrom_regions.shape[0], chrom))
                # chrom is implicit in the HDF group path; drop the column
                chrom_regions = chrom_regions.drop(['chrom'], axis='columns',
                                                   inplace=False)
                if chrom_regions.empty:
                    continue
                grp, valobj, metadata = gen_obj_and_md(metadata,
                                                       args.outputgroup,
                                                       chrom,
                                                       args.inputfiles,
                                                       chrom_regions)
                hdfout.put(grp, valobj, format='fixed')
                hdfout.flush()
                logger.debug('Processed chromosome {}'.format(chrom))
        hdfout.put('metadata', metadata, format='table')
    logger.debug('HDF file closed: {}'.format(args.outputfile))
    return 0