def split_bam(donor_id, analysis_id, use_cntl, specimen_map, data_path,
              outpath, clean_dir, tmp_dir, num_processes=8):
    """Processes a sample-level BAM by read group (RG) to create unaligned BAMs."""
    output_dir = utils.make_new_dir(os.path.join(outpath, donor_id, analysis_id))
    metadata = header_utils.parse_cghub_metadata(analysis_id)
    metadata['use_cntl'] = use_cntl
    bam_filename = bam_utils.get_bam_file(os.path.join(data_path, analysis_id))
    logger.info('Starting gen_unaligned_bam: %s' % analysis_id)
    bam_utils.gen_unaligned_bam(bam_filename, analysis_id, metadata, specimen_map,
                                output_dir, clean_dir, tmp_dir,
                                num_processes=num_processes)
    logger.info('Completed gen_unaligned_bam: %s' % analysis_id)
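# A minimal usage sketch for split_bam. All IDs and paths below are
# hypothetical placeholders, and the one-entry specimen_map is illustrative
# only; a real caller would pass the mapping loaded from the specimen table.
specimen_map = {'01': 'Primary tumour - solid tissue'}
split_bam(donor_id='DO1234',
          analysis_id='0a1b2c3d-analysis-uuid',
          use_cntl='N/A',
          specimen_map=specimen_map,
          data_path='/data/cghub',
          outpath='/results/unaligned',
          clean_dir='/results/clean',
          tmp_dir='/tmp/split_bam',
          num_processes=4)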
def classif_experiment(features_file_name, labels_file_name,
                       train_percent_list=[1, 5, 10, 20], n_rep=5,
                       results_files_prefix='',
                       results_dir=svm_results_dir()):
    (features, labels, n_samples) = extract_training_and_test(
        features_file_name, labels_file_name)
    results_dir = utils.make_new_dir(results_dir, 'svm_experiment')
    with open(path.join(results_dir, 'description.txt'), 'w') as desc_file:
        desc_file.write('features file: %s\n' % features_file_name)
        desc_file.write('labels file: %s\n' % labels_file_name)
        desc_file.write('n samples: %d\n' % n_samples)
        desc_file.write('n positives: %d\n' % np.sum(labels))
    for train_percent in train_percent_list:
        file_name_prefix = '%s_train_%d_percent' % (results_files_prefix,
                                                    train_percent)
        file_name_prefix = utils.find_good_name(results_dir, file_name_prefix)
        fp_file_name = utils.find_good_name(
            results_dir, '%s_false_positives.txt' % file_name_prefix)
        fn_file_name = utils.find_good_name(
            results_dir, '%s_false_negatives.txt' % file_name_prefix)
        tp_file_name = utils.find_good_name(
            results_dir, '%s_true_positives.txt' % file_name_prefix)
        tn_file_name = utils.find_good_name(
            results_dir, '%s_true_negatives.txt' % file_name_prefix)
        with open(fp_file_name, 'a') as fp_file, \
             open(fn_file_name, 'a') as fn_file, \
             open(tp_file_name, 'a') as tp_file, \
             open(tn_file_name, 'a') as tn_file:
            for rep in xrange(n_rep):
                print('%d percent of train, repetition %d' % (train_percent, rep))
                train_indices, test_indices = choose_train_indices(
                    n_samples, train_percent)
                classif = train_classif(
                    features[train_indices, :], labels[train_indices])
                perf = test_classif(
                    classif, features[test_indices, :], labels[test_indices])
                fp_file.write('%d\n' % perf['false_positives'])
                fn_file.write('%d\n' % perf['false_negatives'])
                tp_file.write('%d\n' % perf['true_positives'])
                tn_file.write('%d\n' % perf['true_negatives'])
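# Hypothetical invocation of classif_experiment; the file names and counts
# below are made up for illustration. Each (train_percent, repetition) pair
# appends one integer line to each of the four outcome files, so summary
# statistics can later be computed with np.loadtxt on those files.
classif_experiment('features.npy', 'labels.npy',
                   train_percent_list=[5, 10],
                   n_rep=3,
                   results_files_prefix='demo')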
def create_dl_dir(leaf_name_root, extradir=None):
    # Structure of the directories created, shown by example:
    # 1) no extradir:
    #    2013
    #    +-10
    #      +-leaf_name_root05_120101_55abbbc
    #      ...
    #
    # 2) with extradir:
    #    2013
    #    +-10
    #      +-extradir
    #        +-leaf_name_root05_120102_44deeef
    #        ...
    root_dl_dir = DownloadManagerController.Instance().get_download_dir()
    st_time = time.gmtime()
    yr_name = str(st_time.tm_year)
    mon_name = str(st_time.tm_mon)
    leaf_dir_name = mkFname(leaf_name_root)
    if extradir:
        path = (yr_name, mon_name, extradir, leaf_dir_name)
    else:
        path = (yr_name, mon_name, leaf_dir_name)
    rel_path = os.path.join(*path)
    full_path = root_dl_dir
    for p in path[:-1]:
        full_path = os.path.join(full_path, p)
        check_or_make_dir(full_path, logger)
    full_path = os.path.join(full_path, leaf_dir_name)
    make_new_dir(full_path, logger)
    return full_path, rel_path
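# Sketch of how create_dl_dir is expected to be used ('sentinel2' is a
# hypothetical extradir). Both return values matter: full_path is absolute,
# rooted at the configured download dir, while rel_path is the same leaf
# expressed relative to it, e.g. for storage in a database.
full_path, rel_path = create_dl_dir('scene', extradir='sentinel2')
logger.info('downloading into %s (relative: %s)' % (full_path, rel_path))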
def gosta_experiment(nb_repetitions=1, n_iter=10000, trace_period=10,
                     data_files=['wine.csv'],
                     graph_type_list=['watts', 'complete', 'grid'],
                     f_to_avg=pf.within_clust_scatter,
                     averaging_function=gosta.neighbourhood_avg,
                     root_results_folder=path_to_results_folder()):
    for data_src in data_files:
        data_name = re.match(r'([^\.]+)\..*', data_src).group(1)
        print '\n', data_name
        data = gc.parse(data_src)
        print data.shape
        true_mean = compute_truth_all_pairs(data, f_to_avg)
        for graph_type in graph_type_list:
            print '\n', graph_type, '\n'
            results_folder_name = '%s_%s_%s' % (
                function_names[averaging_function], data_name, graph_type)
            results_folder = utils.make_new_dir(
                root_results_folder, results_folder_name)
            graph = gc.build_graph(data, graph_type)
            traces = []
            error_traces = []
            for rep in range(nb_repetitions):
                print '%s, %s, repetition %d' % (data_name, graph_type, rep)
                trace = gosta.gosta_sync(
                    graph, f_to_avg, n_iter=n_iter, trace_period=trace_period,
                    averaging_function=averaging_function,
                    log_filename=path.join(results_folder, 'log.npy'),
                    log_period=1000)
                traces.append(trace)
                error_traces.append(np.abs((np.array(trace) - true_mean)
                                           / true_mean))
            traces = np.array(traces)
            error_traces = np.atleast_2d(np.array(error_traces))
            err_mean = np.mean(error_traces, axis=0)
            traces_file_name = path.join(
                results_folder, '%s_%s_traces.npy' % (data_name, graph_type))
            mean_file_name = path.join(
                results_folder, '%s_%s_mean.npy' % (data_name, graph_type))
            # np.save writes binary data, so the files must be opened in 'wb'
            # mode (the original 'w' is not portable).
            with open(traces_file_name, 'wb') as traces_file:
                np.save(traces_file, traces)
            with open(mean_file_name, 'wb') as mean_file:
                np.save(mean_file, err_mean)
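# Illustrative call of gosta_experiment; the argument values here are
# assumptions, and 'wine.csv' is already the default dataset in the
# signature. Each run writes <data>_<graph>_traces.npy and
# <data>_<graph>_mean.npy into a fresh results folder created by
# utils.make_new_dir.
gosta_experiment(nb_repetitions=5,
                 n_iter=20000,
                 trace_period=100,
                 graph_type_list=['complete'])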
def main():
    tweets = _read_tweets_to_dataframe("data/tweet_data/", True, 2000)
    make_new_dir("data/datasets")
    save_to_csv(tweets, "data/datasets/individual_tweets.csv", "tweet_id")
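# Standard entry-point guard, assuming this module is intended to be run
# directly as a script.
if __name__ == '__main__':
    main()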
import pickle

segmented_dir_template = "../outputs/check_segmentation_fovea/{}/"
ori_img_dir = "../data/merged_training_set/"

# load features, labels
csv_file = "../outputs/dme_features.csv"
df = pd.read_csv(csv_file)
df_mat = df.as_matrix()
n_total = len(df)
ratio_val = 0.1
n_val = int(n_total * ratio_val)

# set outdir
out_dir = "../dme_xgb_models"
utils.make_new_dir(out_dir)

# run xgboost
min_child_weight = 1
subsample = 0.2
colsample_bytree = 0.2
colsample_bylevel = 0.2
lambda_val = 3
alpha = 5
depth = 8
train_accs, val_accs = [], []
for i in range(10):
    # set training and validation dataset
    train_X = np.concatenate([
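# The loop above is truncated mid-statement in the source. Below is a hedged
# sketch of how one fold plausibly proceeds: a random train/validation split
# followed by an XGBClassifier fit with the hyperparameters defined above.
# The column layout of df_mat (features first, label last) and the use of the
# sklearn-style xgboost API are assumptions, not taken from the source.
import xgboost as xgb

perm = np.random.permutation(n_total)
val_idx, train_idx = perm[:n_val], perm[n_val:]
X, y = df_mat[:, :-1], df_mat[:, -1]  # assumed: label stored in last column

model = xgb.XGBClassifier(max_depth=depth,
                          min_child_weight=min_child_weight,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          colsample_bylevel=colsample_bylevel,
                          reg_lambda=lambda_val,
                          reg_alpha=alpha)
model.fit(X[train_idx], y[train_idx])
train_accs.append(model.score(X[train_idx], y[train_idx]))
val_accs.append(model.score(X[val_idx], y[val_idx]))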
parser = argparse.ArgumentParser(prog='pcap_split.py',
                                 description='Create unaligned BAM files')
parser.add_argument('--bam_path', type=str,
                    help='path/to/tcga/data/labeled_by_analysis_id',
                    required=True)
parser.add_argument('--normal_id', type=str,
                    help='UUID for normal analysis (analysis_id)',
                    default=None)
parser.add_argument('--tumor_id', type=str,
                    help='Comma-separated list of tumor analysis UUIDs (analysis_id(s))',
                    default=None)
parser.add_argument('--work_dir', type=str,
                    help='path/to/work/directory',
                    default=None)
parser.add_argument('--output_dir', type=str,
                    help='path/to/output/directory',
                    required=True)
parser.add_argument('--specimen_map', type=str,
                    default=os.path.join(basedir, 'tcga_dcc_specimen_type.txt'),
                    help='path/to/tcga/icgc/sample_code_specimen_mapping')
args = parser.parse_args()

specimen_dict = parse_specimen_dict(args.specimen_map)
if args.work_dir is None:
    args.work_dir = args.output_dir

exit_code = 0
output_dir = utils.make_new_dir(args.output_dir)
work_dir = utils.make_new_dir(args.work_dir)
try:
    if args.tumor_id is None and args.normal_id is not None:
        metadata = header_utils.parse_cghub_metadata(args.normal_id)
        metadata['use_cntl'] = 'N/A'
        exit_code = bam_utils.gen_unaligned_bam(args.bam_path, args.normal_id,
                                                metadata, specimen_dict,
                                                work_dir, output_dir)
    elif args.tumor_id is not None and args.normal_id is not None:
        metadata = header_utils.parse_cghub_metadata(args.tumor_id)
        metadata['use_cntl'] = args.normal_id
        exit_code = bam_utils.gen_unaligned_bam(args.bam_path, args.tumor_id,
                                                metadata, specimen_dict,
                                                work_dir, output_dir)
    else:
        print "Please define --normal_id or (--normal_id and --tumor_id)"
        sys.exit(1)
except Exception:  # a bare 'except:' would also swallow the SystemExit above
    print "PCAP SPLIT Failure!!!"
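# Example command line for the script above (all paths and UUIDs are
# placeholders):
#
#   python pcap_split.py --bam_path /data/tcga \
#       --normal_id 1111-2222-3333 --tumor_id 4444-5555-6666 \
#       --output_dir /results/pcap_split
#
# With only --normal_id given, the normal BAM is split and use_cntl is set to
# 'N/A'; with both IDs, the tumor BAM is split and use_cntl records the
# normal UUID so the pair can be matched downstream.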