def main(args):
    """Run one simulation job and write its summary statistics.

    Steps, in order:
      1. Parse ``args`` with ``process_args`` and read the model/param files.
      2. Build the sequence objects and simulate them (``run_macs``) or load
         a static macs output file, repeating until the number of array SNPs
         is smaller than the number of available simulated sites.
      3. Compute genome summary statistics and, when a pseudo array is in
         use, array statistics, ped/map files and Germline IBD statistics.
      4. Write everything with ``write_sim_results_file``.

    Args:
        args: Raw arguments handed to ``process_args``. The processed mapping
            is read with the keys ``'job'``, ``'profile'``, ``'sim option'``,
            ``'path'``, ``'param file'``, ``'model file'``, ``'pedmap'``,
            ``'germline'`` and (pseudo array only) ``'SNP file'``.

    Raises:
        ValueError: If ``args['sim option']`` is neither ``'macs'`` nor
            ``'macs_file'``.
        AssertionError: If the simulation yields 10 or fewer sites.
    """
    chr_number = 1

    # Use dictionary keys instead of index keys for args
    args = process_args(args)

    job = str(args['job'])  # must be a number
    print('JOB {}'.format(job))

    prof_option = args['profile']
    sim_option = args['sim option']
    path = args['path']

    def mark(label):
        """Forward ``label`` to profile() with this job's fixed arguments."""
        profile(prof_option, path, job, label)

    [sim_data_dir, germline_out_dir, sim_results_dir] = create_sim_directories(path)

    processedData = process_input_files(args['param file'], args['model file'], args)

    # The pseudo array is skipped only when none of the three entries is set.
    using_pseudo_array = True
    if (not processedData.get('discovery')
            and not processedData.get('sample')
            and not processedData.get('daf')):
        using_pseudo_array = False

    debugPrint(3, "Finished processing input\nprocessedData: ", processedData)

    ### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
    sequences = create_sequences(processedData)
    names = [seq.name for seq in sequences]
    n_d = sum(1 for seq in sequences if seq.type == 'discovery')

    debugPrint(1, 'name\ttotal\tpanel\tgenotyped')
    for seq in sequences:
        debugPrint(1, '{}\t{}\t{}\t{}'.format(seq.name, seq.tot, seq.panel, seq.genotyped))

    debugPrint(1, 'total samples: {}'.format(
        sum([seq.genotyped for seq in sequences if seq.type == 'discovery']
            + [seq.tot for seq in sequences if seq.type == 'sample'])))

    ### Define simulation size
    length = processedData['length']

    debugPrint(1, 'Perform simulation and get sequences')
    pedmap = args['pedmap']
    germline = args['germline']

    ##########################################################################
    ################## Perform simulation and get sequences ##################
    ##########################################################################

    ### Flag to check if the simulation works: the loop repeats the
    ### simulation while the array has at least as many SNPs as there are
    ### available sites.
    SNPs_exceed_available_sites = True
    while SNPs_exceed_available_sites:
        macs_args = processedData['macs_args']

        if sim_option == 'macs':
            ### Run macs and make bitarray
            mark("start_run_macs")
            [sequences, position] = run_macs(macs_args, sequences)
            mark("end_run_macs")
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions(position, nbss, length)

        elif sim_option == 'macs_file':
            ### Using a static sim output rather than generating from seed
            seq_alleles = AllelesMacsFile('tests/test_data/sites1000000.txt')
            set_seq_bits(sequences, seq_alleles)
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions_old(seq_alleles, nbss, length)

        else:
            # Previously this fell through and surfaced later as a NameError
            # on `nbss`; fail with an explicit message instead.
            raise ValueError(
                "Unknown sim option {!r}; expected 'macs' or 'macs_file'".format(sim_option))

        mark("start_set_discovery_bits")
        set_discovery_bits(sequences)
        mark("end_set_discovery_bits")

        debugPrint(1, 'Number of sites in simulation: {}'.format(nbss))
        assert nbss > 10, "Number of sites is less than 10: {}".format(nbss)

        ##########################################################################
        ### Create pseudo array according to ascertainment scheme and template ###
        ##########################################################################
        if using_pseudo_array:
            SNPs = get_SNP_sites(args['SNP file'])
            debugPrint(1, 'Number of SNPs in Array: {}'.format(len(SNPs)))

            mark("start_set_panel_bits")
            asc_panel_bits = set_panel_bits(nbss, sequences)
            mark("end_set_panel_bits")
            debugPrint(1, 'Number of chromosomes in asc_panel: {}'.format(asc_panel_bits.length()/nbss))

            ### Get pseudo array sites
            debugPrint(2, 'Making pseudo array')
            mark("start_pseudo_array_bits")
            [pos_asc, nbss_asc, avail_site_indices, avail_sites] = pseudo_array_bits(
                asc_panel_bits, processedData['daf'], sim_positions, SNPs)
            mark("end_pseudo_array_bits")
            nb_avail_sites = len(avail_sites)
            SNPs_exceed_available_sites = (len(SNPs) >= nb_avail_sites)
        else:
            SNPs = []
            SNPs_exceed_available_sites = False

    if using_pseudo_array:
        mark("start_set_asc_bits")
        set_asc_bits(sequences, nbss_asc, pos_asc, avail_site_indices)
        mark("end_set_asc_bits")

    debugPrint(1, 'Calculating summary statistics')

    ##########################################################################
    ###################### Calculate summary statistics ######################
    ##########################################################################
    res, head = [], []

    ### Calculate summary stats from genomes
    if nbss > 0:  # Simulations must contain at least one segregating site
        mark("start_store_segregating_site_stats")
        stat_tools.store_segregating_site_stats(sequences, res, head)
        mark("end_store_segregating_site_stats")
        mark("start_store_pairwise_FSTs")
        stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)
        mark("end_store_pairwise_FSTs")

    ### Calculate summary stats from the ascertained SNPs
    # NOTE(review): the original indentation was lost. The ped/map, Germline
    # and IBD steps are kept inside this branch because they read `pos_asc`
    # and `avail_sites`, which only exist when the pseudo array was built.
    # Confirm against the intended nesting.
    if using_pseudo_array:
        if nbss_asc > 0:
            mark("start_store_array_segregating_site_stats")
            stat_tools.store_array_segregating_site_stats(sequences, res, head)
            mark("end_store_array_segregating_site_stats")
            mark("start_store_array_FSTs")
            stat_tools.store_array_FSTs(sequences, res, head)
            mark("end_store_array_FSTs")

        debugPrint(2, 'Making ped and map files')
        ped_file_name = '{0}/macs_asc_{1}_chr{2}.ped'.format(sim_data_dir, job, str(chr_number))
        map_file_name = '{0}/macs_asc_{1}_chr{2}.map'.format(sim_data_dir, job, str(chr_number))
        out_file_name = '{0}/macs_asc_{1}_chr{2}'.format(germline_out_dir, job, str(chr_number))

        if os.path.isfile(out_file_name + '.match'):  # Maybe remove if statement
            os.remove(ped_file_name)
            os.remove(map_file_name)

        # The original condition `using_pseudo_array and pedmap or germline`
        # parses as `(using_pseudo_array and pedmap) or germline`, which let
        # `germline` alone reach make_map_file with `pos_asc` unbound. Inside
        # this branch the pseudo array is guaranteed, so only the two flags
        # remain.
        if pedmap or germline:
            mark("start_make_ped_file")
            make_ped_file(ped_file_name, sequences)
            mark("end_make_ped_file")
            mark("start_make_map_file")
            make_map_file(map_file_name, pos_asc, chr_number, avail_sites)
            mark("end_make_map_file")

        ### Use Germline to find IBD on pseudo array ped and map files
        do_i_run_germline = int(args['germline'])
        debugPrint(1, 'run germline? {}'.format("True" if do_i_run_germline else "False"))

        # `== 1` matches the original `== True` comparison on an int.
        if do_i_run_germline == 1:
            ########################### <CHANGE THIS LATER> ###########################
            ### Germline seems to be outputting in the wrong unit - so I am putting the min at 3000000 so that it is 3Mb, but should be the default.
            # The end marker used to be emitted before run_germline was
            # called; it now brackets the call.
            mark("start_run_germline")
            run_germline(ped_file_name, map_file_name, out_file_name, min_m=300)
            mark("end_run_germline")
            ########################### </CHANGE THIS LATER> ##########################

        ### Get IBD stats from Germline output
        if os.path.isfile(out_file_name + '.match'):
            print('Reading Germline IBD output')
            mark("start_process_germline_file")
            [IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)
            mark("end_process_germline_file")

            print('Calculating summary stats')
            stats = OrderedDict([('num', len), ('mean', np.mean), ('med', np.median), ('var', np.var)])
            mark("start_store_IBD_stats")
            stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
            stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head, min_val=30)
            mark("end_store_IBD_stats")

    debugPrint(1, 'finished calculating ss')

    write_sim_results_file(sim_results_dir, job, processedData['param_dict'], res, head)

    print('')
    print('#########################')
    print('### PROGRAM COMPLETED ###')
    print('#########################')
    print('')

    mark("COMPLETE")
def main(args):
    """Compute summary statistics from real genome (and optional array) data.

    Parses ``args`` with ``processArgs``, reads the model/param files, loads
    alleles from ``<genome file>.tped`` (and ``<array file>.tped`` when a
    pseudo array is in use), stores the summary statistics, optionally adds
    IBD statistics from an existing Germline ``.match`` file, and writes the
    result with ``write_stats_file``.

    Args:
        args: Raw arguments handed to ``processArgs``. The processed mapping
            is read with the keys ``'model file'``, ``'param file'``,
            ``'output'``, ``'genome file'`` and (pseudo array only)
            ``'array file'``.
    """
    args = processArgs(args)
    model_file = args['model file']
    param_file = args['param file']
    out_root = args['output']
    [sim_data_dir, germline_out_dir, sim_results_dir] = create_sim_directories(out_root)

    processedData = processInputFiles(param_file, model_file, args)
    debugPrint(3, "Finished processing input\nprocessedData: ", processedData)

    # A pseudo array is used as soon as any one of these entries is set.
    using_pseudo_array = any(
        processedData.get(key) for key in ('discovery', 'sample', 'daf'))

    ### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
    sequences = create_sequences(processedData, args)
    names = [entry.name for entry in sequences]
    n_d = sum(1 for entry in sequences if entry.type == 'discovery')

    row_template = '{}\t{}\t{}\t{}'
    debugPrint(1, 'name\ttotal\tpanel\tgenotyped')
    for entry in sequences:
        debugPrint(1, row_template.format(entry.name, entry.tot, entry.panel, entry.genotyped))

    total = sum(entry.tot for entry in sequences)
    discovery_genotyped = [entry.genotyped for entry in sequences if entry.type == 'discovery']
    sample_totals = [entry.tot for entry in sequences if entry.type == 'sample']
    debugPrint(1, 'total samples: {}'.format(sum(discovery_genotyped + sample_totals)))

    ##########################################################################
    ####################### Read Data from tped files ########################
    ##########################################################################
    genome_file = args['genome file']
    job = os.path.basename(genome_file)
    seq_alleles_genome = AllelesReal(str(genome_file) + '.tped')
    set_real_genome_bits(sequences, seq_alleles_genome)

    if using_pseudo_array:
        array_file = args['array file']
        # The job label becomes "<genome basename>_<array basename>".
        job = '_'.join([str(job), str(os.path.basename(array_file))])
        seq_alleles_array = AllelesReal(str(array_file) + '.tped')
        set_real_array_bits(sequences, seq_alleles_array)

    ##########################################################################
    ###################### Calculate summary statistics ######################
    ##########################################################################
    res, head = [], []

    ### Calculate summary stats from genomes
    stat_tools.store_segregating_site_stats(sequences, res, head)
    stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)

    ### Calculate summary stats from the ascertained SNPs
    if using_pseudo_array:
        stat_tools.store_array_segregating_site_stats(sequences, res, head)
        stat_tools.store_array_FSTs(sequences, res, head)

    debugPrint(1, 'Make ped and map files')
    ped_file_name = '{0}/{1}.ped'.format(sim_data_dir, job)
    map_file_name = '{0}/{1}.map'.format(sim_data_dir, job)
    out_file_name = '{0}/{1}'.format(germline_out_dir, job)

    ### Use Germline to find IBD on pseudo array ped and map files
    # NOTE(review): the flag is hard-coded to 1 while the branch below tests
    # for 0, so run_germline is never reached here - marked "fix this later"
    # by the original author; left untouched.
    do_i_run_germline = 1  # fix this later
    debugPrint(1, 'run germline? ' + str(do_i_run_germline))

    if do_i_run_germline == 0:
        ########################### <CHANGE THIS LATER> ###########################
        ### Germline seems to be outputting in the wrong unit - so I am putting the min at 3000000 so that it is 3Mb, but should be the default.
        germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m=300)
        ########################### </CHANGE THIS LATER> ##########################

    ### Get IBD stats from Germline output
    match_file = out_file_name + '.match'
    if os.path.isfile(match_file):
        debugPrint(1, 'Reading Germline IBD output')
        [IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)

        debugPrint(1, 'Calculating summary stats')
        stats = OrderedDict([
            ('num', len),
            ('mean', np.mean),
            ('med', np.median),
            ('var', np.var),
        ])
        # Stored twice: once with defaults, once with min_val=30.
        stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
        stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head, min_val=30)

    write_stats_file(sim_results_dir, job, res, head)

    banner = (
        '',
        '##########################',
        '### PROGRAM COMPLETED ###',
        '##########################',
        '',
    )
    for banner_line in banner:
        print(banner_line)