def main():
	"""Parse the command line and compute summary statistics across cell types.

	Expected arguments (sys.argv[1:]), in order:
	  1. cg_dir: input directory, checked with helper.check_dir_exist.
	  2. out_dir: output directory, prepared with helper.make_dir.
	  3. num_chromHMM_model: integer, parsed by helper.get_command_line_integer.
	  4. num_score_bins: integer, parsed by helper.get_command_line_integer.
	  5. cell_type_list_fn: file read with
	     helper.get_list_from_line_seperated_file to obtain the cell types.

	usage() is called when the number of arguments is not exactly five.
	"""
	if len(sys.argv) != 6:
		usage()
	cg_dir = sys.argv[1]
	helper.check_dir_exist(cg_dir)
	out_dir = sys.argv[2]
	helper.make_dir(out_dir)
	num_chromHMM_model = helper.get_command_line_integer(sys.argv[3])
	num_score_bins = helper.get_command_line_integer(sys.argv[4])
	cell_type_list_fn = sys.argv[5]
	# Validate the list file BEFORE reading it, so that a missing file is
	# reported by the helper's check rather than by a failure while reading.
	helper.check_file_exist(cell_type_list_fn)
	ct_list = helper.get_list_from_line_seperated_file(cell_type_list_fn)
	print("Done getting command line arguments")
	calculate_summary_staistics_across_ct(cg_dir, out_dir, num_chromHMM_model, num_score_bins, ct_list)
	print("Done!")
def main():
    """Train a model on sampled segmentation data and predict a cell type.

    Expected arguments (sys.argv[1:]), in order:
      1. train_segment_fn: training data file, checked with
         helper.check_file_exist.
      2. all_ct_segment_folder: existing directory holding the combined
         segmentation data of all cell types.
      3. predict_outDir: output directory, prepared with helper.make_dir.
      4. response_ct: the cell type to predict.
      5. num_chromHMM_state: positive integer.
      6. num_train_ct: positive integer; must equal the number of trailing
         cell-type arguments.
      7. train_mode: passed through to the data/training/prediction steps.
      8+. the cell types used to train the model.

    usage() is called whenever the arguments are invalid.
    """
    num_mandatory_args = 8
    if len(sys.argv) < num_mandatory_args:
        usage()
    train_segment_fn = sys.argv[1]
    helper.check_file_exist(train_segment_fn)
    # Where the segmentation data of all cell types are combined, and stored
    # in files corresponding to different regions in the genome.
    all_ct_segment_folder = sys.argv[2]
    if not os.path.isdir(all_ct_segment_folder):
        print("all_ct_segment_folder IS NOT VALID: " + all_ct_segment_folder)
        usage()
    predict_outDir = sys.argv[3]
    helper.make_dir(predict_outDir)
    response_ct = sys.argv[4]
    # Validate with explicit checks rather than `assert` (asserts are removed
    # under `python -O`) and catch only ValueError so that SystemExit and
    # KeyboardInterrupt are never swallowed.
    try:
        num_chromHMM_state = int(sys.argv[5])
        num_train_ct = int(sys.argv[6])
        if num_chromHMM_state <= 0:
            raise ValueError("num_chromHMM_state needs to be positive")
        if num_train_ct <= 0:
            raise ValueError("num_train_ct needs to be positive")
    except ValueError:
        print("num_chromHMM_state or num_train_ct is not valid")
        usage()
    train_mode = sys.argv[7]
    if len(sys.argv) != (num_train_ct + num_mandatory_args):
        print("num_train_ct is different from the number of arguments passed into the program")
        usage()
    print("Done getting command line arguments")
    # The rest of the arguments are the cell types that we use to train the model.
    train_cell_types = sys.argv[num_mandatory_args:]
    # 1. Get the data of predictors and response for training
    Xtrain_segment_df, Y_df = get_XY_segmentation_data(train_cell_types,
                                                       response_ct,
                                                       num_chromHMM_state,
                                                       train_segment_fn,
                                                       train_mode)
    print("Done getting one hot data")
    print(Xtrain_segment_df.head())
    print("")  # blank separator line between the two previews
    print(Y_df.head())
    # 2. Get the regression machine
    regression_machine = train_model(Xtrain_segment_df, Y_df,
                                     num_chromHMM_state, train_mode)
    print("Done training")
    # 3. Based on the machine just created, process training data and then
    # predict the segmentation at each position for the response_ct
    predict_segmentation(all_ct_segment_folder, regression_machine,
                         predict_outDir, train_cell_types, response_ct,
                         num_chromHMM_state, train_mode)
    print("Done predicting whole genome")
def main():
    """Sample genome positions for the cell types of interest.

    Expected arguments (sys.argv[1:]), in order:
      1. cell_type_folder: existing directory with the cell-type data.
      2. ct_fn: file read by get_cell_types_of_interest to obtain the list
         of cell types (example from the original author: ['E003', 'E004']).
      3. output_fn: output file path handed to sample_genome_positions.

    usage() is called when the argument count is wrong or the folder is
    not a directory.
    """
    if len(sys.argv) != 4:
        usage()
    cell_type_folder = sys.argv[1]
    if not os.path.isdir(cell_type_folder):
        print("cell_type_folder DOES NOT EXIST")
        usage()
    ct_fn = sys.argv[2]
    helper.check_file_exist(ct_fn)
    ct_list = get_cell_types_of_interest(ct_fn)
    output_fn = sys.argv[3]
    # Prepare the folder of the OUTPUT file. The original passed ct_fn here,
    # an input file that was just verified to exist, so the output folder
    # was never prepared.
    helper.create_folder_for_file(output_fn)
    print("Done getting command line arguments")
    # Select regions on the genome that we will sample from. Per the original
    # author's note the call yields a dataframe of 3 columns ("chromosome",
    # "start_bp", "end_bp"); the return value is not used here.
    sample_genome_positions(cell_type_folder, ct_list, output_fn)
def main():
    """Train a multinomial logistic regression and predict a cell type.

    Expected arguments (sys.argv[1:]), in order:
      1. train_segment_fn: training data file, checked with
         helper.check_file_exist.
      2. all_ct_posterior_folder: directory checked with
         helper.check_dir_exist and later passed to predict_segmentation.
      3. predict_outDir: output directory, prepared with helper.make_dir.
      4. response_ct: the cell type to predict.
      5. num_chromHMM_state: positive integer.
      6. num_train_ct: positive integer; must equal the number of trailing
         cell-type arguments.
      7+. the cell types used to train the model.

    usage() is called whenever the arguments are invalid.
    """
    num_mandatory_args = 7
    if len(sys.argv) < num_mandatory_args:
        usage()
    train_segment_fn = sys.argv[1]
    helper.check_file_exist(train_segment_fn)
    # Folder holding the per-cell-type input data used for genome-wide
    # prediction (named "posterior" here; presumably posterior probabilities
    # -- confirm against predict_segmentation).
    all_ct_posterior_folder = sys.argv[2]
    helper.check_dir_exist(all_ct_posterior_folder)
    predict_outDir = sys.argv[3]
    helper.make_dir(predict_outDir)
    response_ct = sys.argv[4]
    # Validate with explicit checks rather than `assert` (asserts are removed
    # under `python -O`) and catch only ValueError so that SystemExit and
    # KeyboardInterrupt are never swallowed.
    try:
        num_chromHMM_state = int(sys.argv[5])
        num_train_ct = int(sys.argv[6])
        if num_chromHMM_state <= 0:
            raise ValueError("num_chromHMM_state needs to be positive")
        if num_train_ct <= 0:
            raise ValueError("num_train_ct needs to be positive")
    except ValueError:
        print("num_chromHMM_state or num_train_ct is not valid")
        usage()
    if len(sys.argv) != (num_train_ct + num_mandatory_args):
        print("num_train_ct is different from the number of arguments passed into the program")
        usage()
    print("Done getting command line arguments")
    # The rest of the arguments are the cell types that we use to train the model.
    train_cell_types = sys.argv[num_mandatory_args:]
    # 1. Get the data of predictors and response for training.
    # Per the original author's notes:
    #   Xtrain_segment_df: example colnames 'E047_S16', 'E047_S17' --> posterior
    #     probabilities of each state in each cell type used to train.
    #   Y_df: example colname 'E047' --> state numbers 1 --> 18 of each position
    #     used to train data for the response cell type.
    Xtrain_segment_df, Y_df = get_XY_segmentation_data(
        train_cell_types, response_ct, num_chromHMM_state, train_segment_fn)
    print("Done getting one hot data")
    print(Xtrain_segment_df.head())
    print("")  # blank separator line between the two previews
    print(Y_df.head())
    # 2. Get the regression machine
    regression_machine = train_multinomial_logistic_regression(
        Xtrain_segment_df, Y_df, num_chromHMM_state)
    print("Done training")
    # 3. Based on the machine just created, process training data and then
    # predict the segmentation at each position for the response_ct
    predict_segmentation(all_ct_posterior_folder, regression_machine,
                         predict_outDir, train_cell_types, response_ct,
                         num_chromHMM_state)
    print("Done predicting whole genome")
# Example #5
# 0
def main():
	"""Build the average state-assignment matrix for a group of cell types.

	Expected arguments (sys.argv[1:]), in order:
	  1. input directory, checked with helper.check_dir_exist.
	  2. output directory, prepared with helper.make_dir.
	  3. state annotation file, loaded with read_state_annot_fn.
	  4. file listing the cell types, one per line.
	  5. number of chromHMM states (integer).
	  6. IGV track name (currently only needed by the disabled IGV export).

	usage() is called when the number of arguments is not exactly six.
	"""
	expected_argc = 7  # program name + 6 arguments
	if len(sys.argv) != expected_argc:
		usage()

	input_dir = sys.argv[1]
	helper.check_dir_exist(input_dir)

	output_dir = sys.argv[2]
	helper.make_dir(output_dir)

	annotation_path = sys.argv[3]
	helper.check_file_exist(annotation_path)
	# Only consumed by the disabled IGV export below.
	annotation_table = read_state_annot_fn(annotation_path)

	cell_type_list_path = sys.argv[4]
	helper.check_file_exist(cell_type_list_path)
	cell_types = helper.get_list_from_line_seperated_file(cell_type_list_path)

	state_count = helper.get_command_line_integer(sys.argv[5])
	track_name = sys.argv[6]
	print("Done getting command line arguments")

	get_average_state_assign_matrix(input_dir, cell_types, state_count, output_dir)
	print("Done getting the representative state semgentation for the cellg group")

	# Genome windows intended for the IGV export, which is currently disabled:
	# create_igv_format_bed(output_dir, annotation_table, positions_to_draw, track_name)
	positions_to_draw = ['chr5_15']
	print("Done!")
def main():
    """Run cross-validation for one held-out cell type on posterior data.

    Expected arguments (sys.argv[1:]), in order:
      1. train_sampled_data_fn: sampled training data file.
      2. outDir: output directory, prepared with helper.make_dir.
      3. all_ct_posterior_folder: directory checked with
         helper.check_dir_exist.
      4. num_chromHMM_state: integer, parsed by
         helper.get_command_line_integer.
      5. validate_ct: the cell type used for validation.
      6. all_ct_list_fn: file from which get_all_train_ct_list builds the
         list of training cell types.

    usage() is called when the number of arguments is not exactly six.
    """
    if len(sys.argv) != 7:
        usage()
    train_sampled_data_fn = sys.argv[1]
    helper.check_file_exist(train_sampled_data_fn)
    outDir = sys.argv[2]
    helper.make_dir(outDir)
    all_ct_posterior_folder = sys.argv[3]
    helper.check_dir_exist(all_ct_posterior_folder)
    num_chromHMM_state = helper.get_command_line_integer(sys.argv[4])
    validate_ct = sys.argv[5]
    all_ct_list_fn = sys.argv[6]
    # Validate the cell-type list file up front, like every other input path,
    # so a bad path is reported before any work starts.
    helper.check_file_exist(all_ct_list_fn)
    print("Done getting command line arguments")
    # get all cell types
    ct_list = get_all_train_ct_list(all_ct_list_fn, validate_ct)
    print(ct_list)
    # call all cell types
    call_cross_validation_functions(validate_ct, ct_list, outDir,
                                    train_sampled_data_fn,
                                    all_ct_posterior_folder,
                                    num_chromHMM_state)
# Example #7
# 0
def main():
    """Run cross-validation for one held-out cell type on segmentation data.

    Expected arguments (sys.argv[1:]), in order:
      1. train_sampled_data_fn: sampled training data file.
      2. outDir: output directory, prepared with helper.make_dir.
      3. all_ct_segment_folder: directory checked with
         helper.check_dir_exist.
      4. num_chromHMM_state: integer, parsed by
         helper.get_command_line_integer.
      5. validate_ct: the cell type used for validation.
      6. train_mode: passed through to call_cross_validation_functions.
      7. all_ct_list_fn: file from which get_all_train_ct_list builds the
         list of training cell types.

    usage() is called when the number of arguments is not exactly seven.
    """
    if len(sys.argv) != 8:
        usage()
    train_sampled_data_fn = sys.argv[1]
    helper.check_file_exist(train_sampled_data_fn)
    outDir = sys.argv[2]
    helper.make_dir(outDir)
    all_ct_segment_folder = sys.argv[3]
    helper.check_dir_exist(all_ct_segment_folder)
    num_chromHMM_state = helper.get_command_line_integer(sys.argv[4])
    validate_ct = sys.argv[5]
    train_mode = sys.argv[6]
    all_ct_list_fn = sys.argv[7]
    # Validate the cell-type list file up front, like every other input path,
    # so a bad path is reported before any work starts.
    helper.check_file_exist(all_ct_list_fn)
    print("Done getting command line arguments")
    # Get the list of all genomic positions used to segment the genome for our
    # model training (the original author notes chromosome Y is excluded in
    # all analysis).
    gen_pos_list = get_genomic_positions_list(all_ct_segment_folder)
    # get all cell types
    ct_list = get_all_train_ct_list(all_ct_list_fn, validate_ct)
    # call all cell types
    call_cross_validation_functions(validate_ct, ct_list, outDir,
                                    train_sampled_data_fn,
                                    all_ct_segment_folder, num_chromHMM_state,
                                    gen_pos_list, train_mode)