def test_alleles_bitarray(self):
    """Compare the first item returned by ``make_bitarray`` with a fixed bit pattern.

    The input is the fixture file ``tests/test_data/sites.txt``.
    """
    # No cap on the diff length unittest prints when the comparison fails.
    self.maxDiff = None

    macs_file = AllelesMacsFile('tests/test_data/sites.txt')
    actual_bits = macs_file.make_bitarray()[0]

    expected_bits = bitarray(
        '0000000000000010000000100100001000000000000000000000000000110100000000010000100000000100000000000000000000001000100000000000000000000000010000000000000100000000110000111101111011110011110000100001001111000011110111101100001111000010000100110000001001000000000000111100001000010000001111000010000100001100001111011110111111000011110111101111110000111101111011110011110000100001000000111100001000010000110000111101111011110011110000100001000011000011110111101111001111000010000100000011110000100001000011000011110111101111000000001000000000000011110000100001000011000011110111101111010000110101001010000000000100000000000000111100001000010000110000111101111011110000000000000100000011000011110111101111010000000000000010000000000000001000000011000011010111101100'
    )

    self.assertEqual(actual_bits, expected_bits)
def test_seq_bitarray(self):
    """Compare ``make_bitarray_seq`` output with fixed bit patterns.

    Two calls are checked against the fixture file
    ``tests/test_data/sites.txt``: one with arguments ``(0, 10)`` and one
    with ``(10, 20)``.
    """
    # No cap on the diff length unittest prints when a comparison fails.
    self.maxDiff = None

    macs_file = AllelesMacsFile('tests/test_data/sites.txt')
    first_bits = macs_file.make_bitarray_seq(0, 10)
    second_bits = macs_file.make_bitarray_seq(10, 20)

    expected_first = bitarray('00000000000010010000000000000001000000000000010000000000001000000000000000000000110000111100111100001100001111001111000000000010010011110000001111000011000011111100001111110000111100111100000011110000110000111100111100001100001111001111000000111100001100001111000000001000111100001100001111010000110100000001000011110000110000111100000000001100001111010000000000000000001100001101')
    expected_second = bitarray('00001000001000000000000000001101000010000000000000001000000000000001000100000000011110111110000100110111101100100001001100000000001000010000100001000001111011110111101111011110111110000100001000010000011110111110000100000111101111100001000010000100000111101111000000000010000100000111101111010010100000000000001000010000011110111100010000000111101111000000100000100000000111101100')

    self.assertEqual(first_bits, expected_first)
    self.assertEqual(second_bits, expected_second)
def main(args):
    """Run one simulation job end to end and write its summary statistics.

    Steps, in order:

    1. Process the arguments and the parameter/model input files.
    2. Build the sequence objects.
    3. Obtain simulated sites, repeating the simulation step until the
       pseudo array offers more available sites than there are SNPs
       (the loop runs once when no pseudo array is used).
    4. Compute summary statistics on the sequences and, when a pseudo
       array is used, on the ascertained SNPs.
    5. Optionally write ped/map files and run Germline, then collect IBD
       statistics if a ``.match`` file exists.
    6. Write the results file.

    Args:
        args: Raw arguments. They are passed to ``process_args``; the
            result is read with the keys 'job', 'profile', 'sim option',
            'path', 'param file', 'model file', 'pedmap', 'germline' and
            'SNP file'.

    Raises:
        ValueError: If 'sim option' is neither 'macs' nor 'macs_file'.
        AssertionError: If the simulation yields 10 or fewer sites.
    """
    chr_number = 1

    # Use dictionary keys instead of index keys for args
    args = process_args(args)
    job = str(args['job'])  # must be a number
    print('JOB {}'.format(job))

    prof_option = args['profile']
    sim_option = args['sim option']
    path = args['path']
    [sim_data_dir, germline_out_dir, sim_results_dir] = create_sim_directories(path)

    processedData = process_input_files(args['param file'], args['model file'], args)

    # The pseudo array is skipped only when none of these entries is set.
    using_pseudo_array = any(
        processedData.get(key) for key in ('discovery', 'sample', 'daf')
    )

    debugPrint(3, "Finished processing input\nprocessedData: ", processedData)

    ### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
    sequences = create_sequences(processedData)
    names = [seq.name for seq in sequences]
    n_d = sum(1 for seq in sequences if seq.type == 'discovery')

    debugPrint(1, 'name\ttotal\tpanel\tgenotyped')
    for seq in sequences:
        debugPrint(1, '{}\t{}\t{}\t{}'.format(seq.name, seq.tot, seq.panel, seq.genotyped))

    debugPrint(1, 'total samples: {}'.format(
        sum([seq.genotyped for seq in sequences if seq.type == 'discovery']
            + [seq.tot for seq in sequences if seq.type == 'sample'])))

    ### Define simulation size
    length = processedData['length']
    debugPrint(1, 'Perform simulation and get sequences')
    pedmap = args['pedmap']
    germline = args['germline']

    ##########################################################################
    ################## Perform simulation and get sequences ##################
    ##########################################################################

    ### Flag to check if the simulation works
    SNPs_exceed_available_sites = True
    while SNPs_exceed_available_sites:
        # add genetic map to macs_args list
        macs_args = processedData['macs_args']

        if sim_option == 'macs':
            ### Run macs and make bitarray
            profile(prof_option, path, job, "start_run_macs")
            [sequences, position] = run_macs(macs_args, sequences)
            profile(prof_option, path, job, "end_run_macs")
            # NOTE(review): `/` is true division on Python 3, so nbss is a
            # float there - confirm downstream helpers accept that.
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions(position, nbss, length)

        elif sim_option == 'macs_file':
            ### Using a static sim output rather than generating from seed
            seq_alleles = AllelesMacsFile('tests/test_data/sites1000000.txt')
            set_seq_bits(sequences, seq_alleles)
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions_old(seq_alleles, nbss, length)

        else:
            # Without this branch an unknown option would only surface later
            # as an unbound `nbss`.
            raise ValueError("Unknown sim option: {!r}".format(sim_option))

        profile(prof_option, path, job, "start_set_discovery_bits")
        set_discovery_bits(sequences)
        profile(prof_option, path, job, "end_set_discovery_bits")

        debugPrint(1, 'Number of sites in simulation: {}'.format(nbss))
        assert nbss > 10, "Number of sites is less than 10: {}".format(nbss)

        ##########################################################################
        ### Create pseudo array according to ascertainment scheme and template ###
        ##########################################################################

        if using_pseudo_array:
            SNPs = get_SNP_sites(args['SNP file'])
            debugPrint(1, 'Number of SNPs in Array: {}'.format(len(SNPs)))

            profile(prof_option, path, job, "start_set_panel_bits")
            asc_panel_bits = set_panel_bits(nbss, sequences)
            profile(prof_option, path, job, "end_set_panel_bits")
            debugPrint(1, 'Number of chromosomes in asc_panel: {}'.format(asc_panel_bits.length()/nbss))

            ### Get pseudo array sites
            debugPrint(2, 'Making pseudo array')
            profile(prof_option, path, job, "start_pseudo_array_bits")
            [pos_asc, nbss_asc, avail_site_indices, avail_sites] = pseudo_array_bits(
                asc_panel_bits, processedData['daf'], sim_positions, SNPs)
            profile(prof_option, path, job, "end_pseudo_array_bits")
            nb_avail_sites = len(avail_sites)
            # Go round again unless there are strictly more available sites than SNPs.
            SNPs_exceed_available_sites = (len(SNPs) >= nb_avail_sites)
        else:
            SNPs = []
            SNPs_exceed_available_sites = False

    if using_pseudo_array:
        profile(prof_option, path, job, "start_set_asc_bits")
        set_asc_bits(sequences, nbss_asc, pos_asc, avail_site_indices)
        profile(prof_option, path, job, "end_set_asc_bits")

    debugPrint(1, 'Calculating summary statistics')

    ##########################################################################
    ###################### Calculate summary statistics ######################
    ##########################################################################

    res, head = [], []

    ### Calculate summary stats from genomes
    if nbss > 0:  # Simulations must contain at least one segregating site
        profile(prof_option, path, job, "start_store_segregating_site_stats")
        stat_tools.store_segregating_site_stats(sequences, res, head)
        profile(prof_option, path, job, "end_store_segregating_site_stats")
        profile(prof_option, path, job, "start_store_pairwise_FSTs")
        stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)
        profile(prof_option, path, job, "end_store_pairwise_FSTs")

    ### Calculate summary stats from the ascertained SNPs
    if using_pseudo_array:
        if nbss_asc > 0:
            profile(prof_option, path, job, "start_store_array_segregating_site_stats")
            stat_tools.store_array_segregating_site_stats(sequences, res, head)
            profile(prof_option, path, job, "end_store_array_segregating_site_stats")
            profile(prof_option, path, job, "start_store_array_FSTs")
            stat_tools.store_array_FSTs(sequences, res, head)
            profile(prof_option, path, job, "end_store_array_FSTs")

    debugPrint(2, 'Making ped and map files')
    ped_file_name = '{0}/macs_asc_{1}_chr{2}.ped'.format(sim_data_dir, job, str(chr_number))
    map_file_name = '{0}/macs_asc_{1}_chr{2}.map'.format(sim_data_dir, job, str(chr_number))
    out_file_name = '{0}/macs_asc_{1}_chr{2}'.format(germline_out_dir, job, str(chr_number))

    if os.path.isfile(out_file_name + '.match'):  # Maybe remove if statement
        os.remove(ped_file_name)
        os.remove(map_file_name)

    # `and` binds tighter than `or`, so the unparenthesised form would enter
    # this branch whenever germline is set, even though pos_asc/avail_sites
    # are only assigned when the pseudo array is built.
    # NOTE(review): with germline requested and no pseudo array, Germline
    # below still runs, now without freshly written ped/map files - confirm
    # whether that combination should be rejected earlier.
    if using_pseudo_array and (pedmap or germline):
        profile(prof_option, path, job, "start_make_ped_file")
        make_ped_file(ped_file_name, sequences)
        profile(prof_option, path, job, "end_make_ped_file")
        profile(prof_option, path, job, "start_make_map_file")
        make_map_file(map_file_name, pos_asc, chr_number, avail_sites)
        profile(prof_option, path, job, "end_make_map_file")

    ### Use Germline to find IBD on pseduo array ped and map files
    do_i_run_germline = int(args['germline'])

    debugPrint(1, 'run germline? {}'.format("True" if do_i_run_germline else "False"))

    if do_i_run_germline == 1:
        ########################### <CHANGE THIS LATER> ###########################
        ### Germline seems to be outputting in the wrong unit. A minimum of
        ### 3000000 (3Mb) was tried before; min_m=300 is what is used now,
        ### but it should be the default.
        # The end marker is recorded after the run so the profile brackets
        # the actual Germline call.
        profile(prof_option, path, job, "start_run_germline")
        run_germline(ped_file_name, map_file_name, out_file_name, min_m=300)
        profile(prof_option, path, job, "end_run_germline")
        ########################### </CHANGE THIS LATER> ##########################

    ### Get IBD stats from Germline output
    if os.path.isfile(out_file_name + '.match'):
        print('Reading Germline IBD output')
        profile(prof_option, path, job, "start_process_germline_file")
        [IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)
        profile(prof_option, path, job, "end_process_germline_file")

        print('Calculating summary stats')
        stats = OrderedDict([('num', len), ('mean', np.mean), ('med', np.median), ('var', np.var)])
        profile(prof_option, path, job, "start_store_IBD_stats")
        stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
        stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head, min_val=30)
        profile(prof_option, path, job, "end_store_IBD_stats")

    debugPrint(1, 'finished calculating ss')

    write_sim_results_file(sim_results_dir, job, processedData['param_dict'], res, head)

    print('')
    print('#########################')
    print('### PROGRAM COMPLETED ###')
    print('#########################')
    print('')

    profile(prof_option, path, job, "COMPLETE")
def test_alleles(self):
    """Compare the first item returned by ``make_lists`` with fixed rows.

    The input is the fixture file ``tests/test_data/sites.txt``. Each
    string in ``row_patterns`` is expanded into a list of one-character
    strings, giving 38 rows of 20 entries.

    Returns:
        The allele lists read from the fixture file.
    """
    # No cap on the diff length unittest prints when the comparison fails.
    self.maxDiff = None

    macs_file = AllelesMacsFile('tests/test_data/sites.txt')
    alleles = macs_file.make_lists()[0]

    row_patterns = (
        '00000000000000100000',
        '00100100001000000000',
        '00000000000000000011',
        '01000000000100001000',
        '00000100000000000000',
        '00000000100010000000',
        '00000000000000000100',
        '00000000000100000000',
        '11000011110111101111',
        '00111100001000010011',
        '11000011110111101100',
        '00111100001000010011',
        '00000010010000000000',
        '00111100001000010000',
        '00111100001000010000',
        '11000011110111101111',
        '11000011110111101111',
        '11000011110111101111',
        '00111100001000010000',
        '00111100001000010000',
        '11000011110111101111',
        '00111100001000010000',
        '11000011110111101111',
        '00111100001000010000',
        '00111100001000010000',
        '11000011110111101111',
        '00000000100000000000',
        '00111100001000010000',
        '11000011110111101111',
        '01000011010100101000',
        '00000001000000000000',
        '00111100001000010000',
        '11000011110111101111',
        '00000000000001000000',
        '11000011110111101111',
        '01000000000000001000',
        '00000000000010000000',
        '11000011010111101100',
    )
    expected_alleles = [list(pattern) for pattern in row_patterns]

    self.assertListEqual(alleles, expected_alleles)
    return alleles
def test_make_lists(self):
    """Compare the full ``make_lists`` result with a fixed two-item list.

    The input is the fixture file ``tests/test_data/sites.txt``. The
    expected value holds 38 rows of 20 one-character strings, followed by
    38 strings that each start with a space.
    """
    # No cap on the diff length unittest prints when the comparison fails.
    self.maxDiff = None

    macs_file = AllelesMacsFile('tests/test_data/sites.txt')
    parsed = macs_file.make_lists()

    row_patterns = (
        '00000000000000100000',
        '00100100001000000000',
        '00000000000000000011',
        '01000000000100001000',
        '00000100000000000000',
        '00000000100010000000',
        '00000000000000000100',
        '00000000000100000000',
        '11000011110111101111',
        '00111100001000010011',
        '11000011110111101100',
        '00111100001000010011',
        '00000010010000000000',
        '00111100001000010000',
        '00111100001000010000',
        '11000011110111101111',
        '11000011110111101111',
        '11000011110111101111',
        '00111100001000010000',
        '00111100001000010000',
        '11000011110111101111',
        '00111100001000010000',
        '11000011110111101111',
        '00111100001000010000',
        '00111100001000010000',
        '11000011110111101111',
        '00000000100000000000',
        '00111100001000010000',
        '11000011110111101111',
        '01000011010100101000',
        '00000001000000000000',
        '00111100001000010000',
        '11000011110111101111',
        '00000000000001000000',
        '11000011110111101111',
        '01000000000000001000',
        '00000000000010000000',
        '11000011010111101100',
    )
    expected_rows = [list(pattern) for pattern in row_patterns]

    # The leading space in every entry is part of the expected value.
    expected_positions = [
        ' 0.0705276367',
        ' 0.0773083074',
        ' 0.141827334',
        ' 0.144695839',
        ' 0.148809003',
        ' 0.169434171',
        ' 0.181985945',
        ' 0.193180539',
        ' 0.21844322',
        ' 0.244852453',
        ' 0.256794131',
        ' 0.298170765',
        ' 0.344656974',
        ' 0.372631514',
        ' 0.468767313',
        ' 0.491370823',
        ' 0.497166281',
        ' 0.542687155',
        ' 0.567144847',
        ' 0.567368208',
        ' 0.573822118',
        ' 0.575989391',
        ' 0.581127352',
        ' 0.602843268',
        ' 0.633125154',
        ' 0.645520699',
        ' 0.660214656',
        ' 0.683393244',
        ' 0.708882773',
        ' 0.746408379',
        ' 0.803594527',
        ' 0.80459505',
        ' 0.811848332',
        ' 0.862597683',
        ' 0.874138479',
        ' 0.933816125',
        ' 0.968021389',
        ' 0.982665522',
    ]

    check = [expected_rows, expected_positions]
    self.assertEqual(parsed, check)