def populateFlags(variables, modelData):
    '''
    Build an ordered mapping from each model flag to the list of argument
    lists it was given, in the order the flags appear in modelData.

    Each entry of modelData is a comma-separated string whose first field is
    the flag. For event flags of the form "-e..._<n>" the time field (field 1)
    is resolved through getUnscaledValue and the "_<n>" suffix is removed
    before the flag is stored. Every entry of modelData is rewritten in place
    with the processed fields.

    NOTE(review): `times` is not defined in this function; it is expected to
    be a list available at module level - confirm.

    Parameters:
        variables: mapping of parameter names, as consumed by getUnscaledValue
        modelData: list of comma-separated flag strings (modified in place)
    Returns:
        OrderedDict mapping flag -> list of stripped argument lists
    Raises:
        ValueError: if a -F flag does not name an existing file
    '''
    debugPrint(2, "Starting: populateFlags ")
    flags = OrderedDict()
    lowTime = False

    for i, line in enumerate(modelData):
        lineSplit = line.split(',')
        flag = lineSplit[0]

        # A flag starting with -e and carrying an "_<n>" suffix is an event
        # flag; its order must be preserved.
        if flag.startswith("-e") and "_" in flag:
            if len(lineSplit) > 1:
                # strip any stray whitespace around the time field
                lineSplit[1] = lineSplit[1].strip()
            if int(flag.split("_")[1]) > 1:
                # later events are resolved relative to the previous line's time
                lowTime = modelData[i-1].split(',')[1]
                if lineSplit[1] in variables:
                    lineSplit[1] = getUnscaledValue(variables, lineSplit[1], lowTime)
            else:
                if lineSplit[1] in variables:
                    lineSplit[1] = getUnscaledValue(variables, lineSplit[1])

            if lineSplit[1] not in times and "inst" not in lineSplit[1]:
                if lowTime:
                    lineSplit[1] = getUnscaledValue(variables, lineSplit[1], lowTime)
                else:
                    lineSplit[1] = getUnscaledValue(variables, lineSplit[1])
            else:
                # NOTE(review): the result is unused; the call is kept in case
                # findScaleValue/getUnscaledValue have side effects - confirm.
                Ne = findScaleValue(flags, variables)
                lastTime = getUnscaledValue(variables, modelData[i-1].split(',')[1])
                # Pick the first time after the previous event that is not
                # taken yet. The arithmetic is done on a float and converted
                # to str only for the lookup/storage (the previous code tried
                # to add 1 to a str).
                nextTime = float(lastTime) + 1
                while str(nextTime) in times:
                    nextTime += 1
                lineSplit[1] = str(nextTime)
            times.append(lineSplit[1])
            flag = lineSplit[0].split("_")[0]

        if flag == "-t":
            # str.replace returns a new string, so the result has to be stored;
            # the named values are substituted in the argument fields.
            lineSplit[1:] = [
                field.replace("Nachman", "2.5e-8").replace("Other", '1.65e-8')
                for field in lineSplit[1:]
            ]

        if flag == "-F":
            # Path() needs a single path string, not the list of fields.
            file_name = lineSplit[1].strip() if len(lineSplit) > 1 else ""
            if not file_name or not Path(file_name).is_file():
                raise ValueError("The file for -F is not in your file path.")

        arguments = [x.strip() for x in lineSplit[1:] if x]
        if flag not in flags:
            flags[flag] = [arguments]
        else:
            flags[flag].append(arguments)
        modelData[i] = ",".join(lineSplit)
    return flags
def find_scale_value(flags):
    """
    Return the scaling value taken from the flags.

    The first argument of the first "-Ne" entry wins; otherwise the second
    argument of the first "-n" entry is used; otherwise 10000.

    :param flags: mapping of flag -> list of argument lists
    :return: the scaling value (float when read from flags, else int 10000)
    """
    debugPrint(2, "Finding scaling value")
    if "-Ne" in flags:
        ne = float(flags['-Ne'][0][0])
    elif "-n" in flags:
        ne = float(flags["-n"][0][1])
    else:
        ne = 10000
    debugPrint(2, "Scaling factor found: {0}".format(ne))
    return ne
def findScaleValue(flags=None, variables=None):
    """
    Return the scaling value taken from the flags, resolving it through
    getUnscaledValue first.

    The first argument of the first "-Ne" entry wins; otherwise the second
    argument of the first "-n" entry is used; otherwise 10000.

    :param flags: mapping of flag -> list of argument lists (default: empty)
    :param variables: parameter mapping handed to getUnscaledValue (default: empty)
    :return: the scaling value (float when read from flags, else int 10000)
    """
    # None defaults replace the former mutable `{}` defaults, which are shared
    # between calls.
    if flags is None:
        flags = {}
    if variables is None:
        variables = {}

    debugPrint(2, "Finding scaling value")
    Ne = 10000
    if "-Ne" in flags:
        Ne = float(getUnscaledValue(variables, flags['-Ne'][0][0]))
    elif "-n" in flags:
        Ne = float(getUnscaledValue(variables, flags["-n"][0][1]))
    debugPrint(2, "Scaling factor found: {0}".format(Ne))
    return Ne
def populate_macs_args(macs_args, scaled_flags):
    """
    Append every flag in scaled_flags, each followed by its arguments, to
    macs_args (which is modified in place).

    :param macs_args: list of command-line tokens to extend
    :param scaled_flags: mapping of flag -> list of argument lists (strings)
    :return: None; exits the program if an IndexError is raised while
        processing a flag
    """
    for flag, occurrences in scaled_flags.items():
        debugPrint(3, "FLAG: {}: {}".format(flag, occurrences))
        # one flag may appear several times; emit it once per occurrence
        for values in occurrences:
            try:
                debugPrint(3, flag + ": " + str(values))
                macs_args.append(flag.strip())
                for value in values:
                    macs_args.append(value.strip())
            except IndexError:
                print("There was an index error!\nThis most likely means your input file has a malformed flag.")
                print("Try running with -vv argument for last flag ran")
                sys.exit()
def scale_flags(flags_raw):
    """
    Return a new dict in which every argument list of every flag has been
    passed through generate_argument together with the scale value.

    :param flags_raw: mapping of flag -> list of raw argument lists
    :return: dict of flag -> list of generated arguments
    """
    # the scale value is looked up once and shared by all flags
    ne = find_scale_value(flags_raw)
    scaled = {}
    for flag, raw_arguments in flags_raw.items():
        debugPrint(3, "FLAG: {}: {}".format(flag, raw_arguments))
        scaled[flag] = [generate_argument(raw, flag, ne) for raw in raw_arguments]
    return scaled
def create_sequences(processedData, args):
    '''
    Build one SeqInfo per population from the processed model data.

    Parameters:
        processedData: dict of processed input. Keys read here: 'macs_args',
            'I', 'name', and (optionally) 'discovery', 'sample', 'daf'.
            Names are popped from the front of processedData['name'], so that
            list is consumed by this call.
        args: not used by this function; kept for interface compatibility.
    Returns:
        list of SeqInfo instances. When 'discovery', 'sample' and 'daf' are
        all present: the discovery populations followed by the sample
        populations. Otherwise one SeqInfo of type 'discovery' per population.
    '''
    debugPrint(2,"Running create_sequences:")
    sequences = []
    if 'discovery' in processedData and 'sample' in processedData and 'daf' in processedData:
        ### Initialize all discovery type sequence data
        # `ind` is a 1-based population number here.
        for ind in processedData.get('discovery'):
            tot_index = processedData['macs_args'].index("-I") + 1 + ind
            tot = int(processedData['macs_args'][tot_index])  # total number of individuals used in simulation
            name = processedData.get('name').pop(0)
            seq = SeqInfo(name, tot, seq_type = 'discovery')
            seq.genotyped = processedData['I'][ind - 1]
            seq.panel = seq.tot - seq.genotyped
            sequences.append(seq)

        ### Initialize all sample type sequence data
        for ind in processedData.get('sample'):
            tot = processedData['I'][ind - 1]
            name = processedData.get('name').pop(0)
            seq = SeqInfo(name, tot, seq_type = 'sample')
            seq.panel = seq.tot
            seq.genotyped = seq.tot
            sequences.append(seq)
    else:
        # range() is 0-based, so the size list is indexed with `ind` directly.
        # The previous `ind-1` started at index -1 and paired every population
        # with the size of the preceding one.
        for ind in range(int(processedData['macs_args'][4])):
            tot = processedData['I'][ind]
            name = processedData.get('name').pop(0)
            seq = SeqInfo(name, tot, seq_type = 'discovery')
            seq.genotyped = seq.tot
            sequences.append(seq)
    return sequences
def run_macs(macs_args, sequences):
    '''
    Run the macs simulator and load the simulated alleles into the sequences.

    Parameters:
        macs_args: command line as a list of strings, e.g.
            ['./bin/macs', '166.0', '1000000', '-I', '2', '26', '140',
             '-t', '0.00444997180488', '-s', '1231', ...]
        sequences: list of sequence objects; each must provide `tot` and a
            `bits` container supporting extend(). They are filled in order:
            for every site the first seq.tot alleles go to the first
            sequence, the next ones to the second, and so on.
    Returns:
        [sequences, position] where position is the list of third
        tab-separated fields of the "SITE:" lines, as bytes, in output order.

    Reading stops at the first line that is empty after stripping trailing
    whitespace (which includes end of output).
    '''
    debugPrint(2, "running macs simulation:")
    position = []
    # stderr of the simulator is discarded; the context manager makes sure the
    # devnull handle is closed again.
    with open(os.devnull, 'w') as null:
        proc = subprocess.Popen(macs_args, stdout=subprocess.PIPE, stderr=null)
        try:
            while True:
                line = proc.stdout.readline().rstrip()
                if line == b'':
                    break
                if not line.startswith(b"SITE:"):
                    continue
                columns = line.split(b'\t')
                site_alleles = columns[4].strip()
                position.append(columns[2])
                seq_loc = 0
                for seq in sequences:
                    seq.bits.extend(site_alleles[seq_loc:seq_loc + seq.tot])
                    seq_loc += seq.tot
        finally:
            # release the pipe and reap the child so no process is left behind
            proc.stdout.close()
            proc.wait()
    return [sequences, position]
def processArgs(arguments):
    """
    Parse the command line and return the file locations as a dict.

    parse_args() is called without arguments, so the values come from the
    process command line; the `arguments` parameter is not read. As a side
    effect global_vars is initialised and the -v count is stored in
    global_vars.verbos.

    :param arguments: unused
    :return: dict with keys 'param file', 'model file', 'genome file',
        'array file' and 'output'
    """
    required_options = (
        ("-p", "--param", "REQUIRED!: The location of the parameter file"),
        ("-m", "--model", "REQUIRED!: The location of the model file"),
        ("-o", "--out", "REQUIRED!: The location of the output dir"),
        ("-g", "--genome", "The location of the genome file"),
        ("-a", "--array", "The location of the array file"),
    )

    parser = argparse.ArgumentParser()
    for short_name, long_name, help_text in required_options:
        parser.add_argument(short_name, long_name, help=help_text, required=True)
    parser.add_argument("-v", help="increase output verbosity", action="count", default=0)
    parsed = parser.parse_args()

    # verbosity must be set before the first debugPrint call below
    global_vars.init()
    global_vars.verbos = parsed.v
    debugPrint(1, "Debug on: Level " + str(global_vars.verbos))

    return {
        'param file': parsed.param,
        'model file': parsed.model,
        'genome file': parsed.genome,
        'array file': parsed.array,
        'output': parsed.out,
    }
def populate_flags(model_data_raw):
    """
    Group the model lines by flag.

    Each entry of model_data_raw is a comma-separated string; the first field
    is the flag, the remaining non-empty fields (stripped) are its arguments.

    :param model_data_raw: iterable of comma-separated flag strings
    :return: OrderedDict of flag -> list of argument lists, one list per
        occurrence of the flag, in input order
    """
    debugPrint(2, "Starting: populateFlags ")
    flags = OrderedDict()
    for entry in model_data_raw:
        fields = entry.split(',')
        values = [field.strip() for field in fields[1:] if field]
        # setdefault creates the list on the first occurrence of a flag
        flags.setdefault(fields[0], []).append(values)
    return flags
def main(args):
    """
    Run one simulation job: process the input files, simulate sequences,
    optionally build a pseudo array, compute summary statistics, optionally
    write ped/map files and run Germline, and write the results file.

    :param args: raw arguments; converted to a dict by process_args. Keys read
        here: 'job', 'profile', 'sim option', 'path', 'param file',
        'model file', 'pedmap', 'germline' and, when a pseudo array is used,
        'SNP file'.
    :return: None
    """
    chr_number = 1

    # Use dictionary keys instead of index keys for args
    args = process_args(args)
    job = str(args['job'])  # must be a number
    print('JOB {}'.format(job))
    prof_option = args['profile']
    sim_option = args['sim option']
    path = args['path']
    [sim_data_dir, germline_out_dir, sim_results_dir] = create_sim_directories(path)

    processedData = process_input_files(args['param file'], args['model file'], args)

    # The pseudo array is skipped only when discovery, sample and daf are all absent/empty.
    using_pseudo_array = True
    if not processedData.get('discovery') and not processedData.get('sample') and not processedData.get('daf'):
        using_pseudo_array = False

    debugPrint(3, "Finished processing input\nprocessedData: ", processedData)

    ### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
    # NOTE(review): called with a single argument; confirm this matches the
    # signature of the create_sequences that is in scope for this function.
    sequences = create_sequences(processedData)
    names = [seq.name for seq in sequences]
    n_d = sum([1 for seq in sequences if seq.type == 'discovery'])

    debugPrint(1,'name\ttotal\tpanel\tgenotyped')
    for seq in sequences:
        debugPrint(1,'{}\t{}\t{}\t{}'.format(seq.name, seq.tot, seq.panel, seq.genotyped))

    # NOTE(review): `total` is not read again in this function.
    total = sum([seq.tot for seq in sequences])
    debugPrint(1, 'total samples: {}'.format(sum([seq.genotyped for seq in sequences if seq.type=='discovery'] + [seq.tot for seq in sequences if seq.type=='sample'])))

    ### Define simulation size
    length = processedData['length']

    debugPrint(1, 'Perform simulation and get sequences')
    pedmap = args['pedmap']
    germline = args['germline']

    ##########################################################################
    ################## Perform simulation and get sequences ##################
    ##########################################################################

    ### Flag to check if the simulation works
    # The loop repeats the simulation while the array has at least as many
    # SNPs as there are available simulated sites.
    SNPs_exceed_available_sites = True
    while SNPs_exceed_available_sites:

        # add genetic map to macs_args list
        # (the empty list is overwritten immediately by the processed arguments)
        macs_args = []
        macs_args = processedData['macs_args']

        # NOTE(review): nbss is only assigned in the two branches below; any
        # other sim_option value leaves it undefined for the code that follows.
        if sim_option == 'macs':
            ### Run macs and make bitarray
            profile(prof_option, path, job, "start_run_macs")
            [sequences,position] = run_macs(macs_args, sequences)
            profile(prof_option, path, job, "end_run_macs")
            # number of sites = total bits of the first sequence / its number of chromosomes
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions(position, nbss, length)

        elif sim_option == 'macs_file':
            ### Using a static sim output rather than generating from seed
            seq_alleles = AllelesMacsFile('tests/test_data/sites1000000.txt')
            set_seq_bits(sequences, seq_alleles)
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions_old(seq_alleles, nbss, length)

        profile(prof_option, path, job, "start_set_discovery_bits")
        set_discovery_bits(sequences)
        profile(prof_option, path, job, "end_set_discovery_bits")

        debugPrint(1, 'Number of sites in simulation: {}'.format(nbss))
        # NOTE(review): assert statements are skipped when Python runs with -O.
        assert nbss > 10, "Number of sites is less than 10: {}".format(nbss)

        ##########################################################################
        ### Create pseudo array according to ascertainment scheme and template ###
        ##########################################################################

        if using_pseudo_array:
            SNPs = get_SNP_sites(args['SNP file'])
            debugPrint(1, 'Number of SNPs in Array: {}'.format(len(SNPs)))

            profile(prof_option, path, job, "start_set_panel_bits")
            asc_panel_bits = set_panel_bits(nbss, sequences)
            profile(prof_option, path, job, "end_set_panel_bits")
            debugPrint(1,'Number of chromosomes in asc_panel: {}'.format(asc_panel_bits.length()/nbss))

            ### Get pseudo array sites
            debugPrint(2,'Making pseudo array')
            profile(prof_option, path, job, "start_pseudo_array_bits")
            [pos_asc, nbss_asc, avail_site_indices, avail_sites] = pseudo_array_bits(asc_panel_bits, processedData['daf'], sim_positions, SNPs)
            profile(prof_option, path, job, "end_pseudo_array_bits")
            nb_avail_sites = len(avail_sites)
            SNPs_exceed_available_sites = ( len(SNPs) >= nb_avail_sites )
        else:
            SNPs = []
            SNPs_exceed_available_sites = False

    if using_pseudo_array:
        profile(prof_option, path, job, "start_set_asc_bits")
        set_asc_bits(sequences, nbss_asc, pos_asc, avail_site_indices)
        profile(prof_option, path, job, "end_set_asc_bits")

    debugPrint(1, 'Calculating summary statistics')

    ##########################################################################
    ###################### Calculate summary statistics ######################
    ##########################################################################

    # res and head are filled in place by the stat_tools.store_* calls below
    res, head = [], []

    ### Calculate summary stats from genomes
    if nbss > 0:  # Simulations must contain at least one segregating site
        profile(prof_option, path, job, "start_store_segregating_site_stats")
        stat_tools.store_segregating_site_stats(sequences, res, head)
        profile(prof_option, path, job, "end_store_segregating_site_stats")
        profile(prof_option, path, job, "start_store_pairwise_FSTs")
        stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)
        profile(prof_option, path, job, "end_store_pairwise_FSTs")

    ### Calculate summary stats from the ascertained SNPs
    if using_pseudo_array:
        if nbss_asc > 0:
            profile(prof_option, path, job, "start_store_array_segregating_site_stats")
            stat_tools.store_array_segregating_site_stats(sequences, res, head)
            profile(prof_option, path, job, "end_store_array_segregating_site_stats")
            profile(prof_option, path, job, "start_store_array_FSTs")
            stat_tools.store_array_FSTs(sequences, res, head)
            profile(prof_option, path, job, "end_store_array_FSTs")

    debugPrint(2,'Making ped and map files')
    ped_file_name = '{0}/macs_asc_{1}_chr{2}.ped'.format(sim_data_dir, job, str(chr_number))
    map_file_name = '{0}/macs_asc_{1}_chr{2}.map'.format(sim_data_dir, job, str(chr_number))
    out_file_name = '{0}/macs_asc_{1}_chr{2}'.format(germline_out_dir, job, str(chr_number))

    # If a Germline .match file already exists, the ped and map files of the
    # same job are removed. NOTE(review): os.remove raises if a file is missing.
    if os.path.isfile(out_file_name + '.match'):  # Maybe remove if statement
        os.remove(ped_file_name)
        os.remove(map_file_name)

    # NOTE(review): `and` binds tighter than `or`, so this reads
    # (using_pseudo_array and pedmap) or germline. With germline truthy and no
    # pseudo array, pos_asc and avail_sites were never assigned - confirm intent.
    if using_pseudo_array and pedmap or germline:
        profile(prof_option, path, job, "start_make_ped_file")
        make_ped_file(ped_file_name, sequences)
        profile(prof_option, path, job, "end_make_ped_file")
        profile(prof_option, path, job, "start_make_map_file")
        make_map_file(map_file_name, pos_asc, chr_number, avail_sites)
        profile(prof_option, path, job, "end_make_map_file")

    ### Use Germline to find IBD on pseduo array ped and map files
    do_i_run_germline = int(args['germline'])

    debugPrint(1,'run germline? {}'.format("True" if do_i_run_germline else "False"))

    # NOTE(review): comparing an int with True only matches the value 1.
    if (do_i_run_germline == True):
        ########################### CHANGE THIS LATER (begin) ###########################
        ### Germline seems to be outputting in the wrong unit - so I am putting the min at 3000000 so that it is 3Mb, but should be the default.
        # NOTE(review): the start/end profile markers enclose only the
        # commented-out call; the active run_germline call comes after "end".
        profile(prof_option, path, job, "start_run_germline")
        # germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m = 3000000)
        profile(prof_option, path, job, "end_run_germline")
        germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m=300)
        ########################### CHANGE THIS LATER (end) #############################

    ### Get IBD stats from Germline output
    if os.path.isfile(out_file_name + '.match'):
        print('Reading Germline IBD output')
        profile(prof_option, path, job, "start_process_germline_file")
        [IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)
        profile(prof_option, path, job, "end_process_germline_file")

        print('Calculating summary stats')
        # name -> function pairs handed to store_IBD_stats
        stats = OrderedDict([('num', len), ('mean', np.mean), ('med', np.median), ('var', np.var)])
        profile(prof_option, path, job, "start_store_IBD_stats")
        stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
        stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head, min_val=30)
        profile(prof_option, path, job, "end_store_IBD_stats")

    debugPrint(1,'finished calculating ss')

    write_sim_results_file(sim_results_dir, job, processedData['param_dict'], res, head)

    print('')
    print('#########################')
    print('### PROGRAM COMPLETED ###')
    print('#########################')
    print('')

    profile(prof_option, path, job, "COMPLETE")
def process_input_files(param_file, model_file, args):
    """
    Read the parameter file and the model file and return a dictionary
    (processed_data) with all the (useful) data in the two files.

    Besides whatever process_type1_flags returns, the result carries:
    'seed' (only when a "-s" flag is present), 'name', 'macs_args' and
    'param_dict'.

    :param param_file: location of the parameter file
    :param model_file: location of the model file
    :param args: dict of program arguments; only the genetic map entry is
        read here
    :return: processed_data dict
    """
    debugPrint(2, "Starting processInputFiles")
    model_data_raw = read_model_file(model_file)
    debugPrint(2, "Finished reading " + str(model_file))
    debugPrint(3, "Raw input data into make_args", model_data_raw)

    model_params_dict_raw = read_params_file(param_file)
    debugPrint(2, "Finished reading " + str(param_file))
    debugPrint(3, "Raw Output for modelParamsDict", model_params_dict_raw)

    # defining and replacing the variables from the param file
    model_params_variables = define_priors(model_params_dict_raw, model_data_raw)
    model_data = substitute_variables(model_params_variables, model_data_raw)

    flags = populate_flags(model_data)
    macs_args = generate_macs_args(flags)

    # find and add sizes to macs_args: the total goes to position 1, the
    # individual sizes are appended at the end
    sizes = populate_sizes(flags)
    total = float(sum(sizes))
    macs_args.insert(1, str(total))
    macs_args.extend([str(size) for size in sizes])

    debugPrint(3, "Processing flags in for macs_args")

    # take out ignored flags
    flags = remove_ignored_flags(flags)

    # take out the type 1 flags (processed data)
    processed_data = process_type1_flags(flags)
    flags = filter_out_type1(flags)

    # scale values if needed
    scaled_flags = scale_flags(flags)

    # pull out seed
    seed = scaled_flags.get("-s", None)
    if seed:
        processed_data['seed'] = seed

    # seasons is all the time based events
    seasons = add_events_to_seasons(scaled_flags)
    macs_args_flags = filter_out_events(scaled_flags)

    # add to macs_args
    # TODO: This needs to be done explicitly
    populate_macs_args(macs_args, macs_args_flags)

    pop_names = gather_pop_names(model_data_raw)
    processed_data['name'] = pop_names

    # discovery, sample and daf must be given together or not at all
    present = [bool(processed_data.get(key)) for key in ('discovery', 'sample', 'daf')]
    if not any(present):
        debugPrint(2, "discovery, sample, and daf are all missing")
    elif not all(present):
        print("discovery, sample, or daf is missing")
        quit()

    debugPrint(2, "Adding events data back to flag pool")
    # Sort the events numerically by their time field (index 1); the field is
    # converted to float for sorting and back to str afterwards.
    for season in seasons:
        season[1] = float(season[1])
    seasons = sorted(seasons, key=itemgetter(1))
    for season in seasons:
        season[1] = str(season[1])
        macs_args.extend(season)

    processed_data["macs_args"] = macs_args
    debugPrint(3, "printing model_params_variables:", model_params_variables)
    processed_data['param_dict'] = model_params_variables

    # The previous check tested the key 'genetic_map' but read 'genetic map',
    # which either raised KeyError or never added the map. Both spellings are
    # accepted now and a missing entry is simply skipped.
    genetic_map = args.get('genetic map') or args.get('genetic_map')
    if genetic_map:
        processed_data['macs_args'].extend(['-R', genetic_map])

    return processed_data
def pseudo_array(asc_panel, daf, pos, snps):
    """
    Choose simulated sites that imitate the SNPs of an array.

    A site is "available" when the frequency of '1' across the ascertainment
    panel lies in [daf, 1 - daf]. When there are more available sites than
    array SNPs, each SNP is matched to an available site via find2, duplicates
    are resolved, and missing picks are filled with add_snps and, as a last
    resort, random available sites.

    :param asc_panel: sequence of chromosomes, each a sequence of allele
        characters ('1' is counted)
    :param daf: frequency cut-off
    :param pos: position of every simulated site
    :param snps: positions of the array SNPs
    :return: (pos_asc, nbss_asc, index_avail_sites, avail_sites)
    """
    # zip() is a lazy iterator on Python 3; materialise it so it can be
    # measured and indexed.
    Tasc_panel = list(zip(*asc_panel))
    print( 'number of sites in Tasc_panel:', len(Tasc_panel))
    print( 'number of chromosomes in Tasc_panel:', len(Tasc_panel[0]) if Tasc_panel else 0)

    nb_chromosomes = float(len(asc_panel))
    avail_sites = []        ##positions of the sites that pass the frequency cut-off
    index_avail_sites = []  ##indexes of those sites
    for n, site in enumerate(Tasc_panel):
        freq_site = float(site.count('1')) / nb_chromosomes
        if freq_site >= daf and freq_site <= 1 - daf:
            avail_sites.append(pos[n])
            index_avail_sites.append(n)

    nb_avail_sites = len(avail_sites)
    # needed by add_snps below; it was never assigned in this function before
    nb_array_snps = len(snps)

    if nb_avail_sites < nb_array_snps:
        # Same guard as in pseudo_array_bits; without it pos_asc is never
        # assigned and the return statement fails.
        print( "Error: There are not enough simulated sites in the discovery panel with allele frequency >=",daf,"and <=",1 - daf)
        sys.exit(1)

    if nb_avail_sites == nb_array_snps:
        debugPrint(3,"number of avail_sites is equal to the number of Array snps")
        pos_asc = index_avail_sites
        nbss_asc = len(pos_asc)
    else:
        debugPrint(3,"number of avail_sites greater than number of Array snps")
        pos_asc = [None] * nb_array_snps  ##indexes of the SNPs that pass the frequency cut-off and position
        for i in range(nb_array_snps):  # each snp on the snp array on a chromosome
            ##find the closest available site for the array SNP at snps[i]
            closestleft = find2(avail_sites, snps[i])
            if (i > 0 and pos_asc[i - 1] == closestleft and closestleft + 1 < nb_avail_sites):
                ##avoid duplicates: move one position to the right
                closestleft = closestleft + 1
            elif (i > 0 and pos_asc[i - 1] > closestleft and pos_asc[i - 1] + 1 < nb_avail_sites):
                closestleft = pos_asc[i - 1] + 1
            ###duplicates left at this point mean there were no more sites to choose from
            pos_asc[i] = closestleft

        #####smoothing
        ##check if there is another position that might work better; the
        ##condition only ever matches the second-to-last index
        last = len(pos_asc) - 1
        for j in range(0, last):
            if (j == last - 1 and pos_asc[j] + 1 < pos_asc[j + 1]
                    and pos_asc[j] < (nb_avail_sites - 1) and (j + 1) < nb_avail_sites):
                d1 = abs(snps[j] - avail_sites[pos_asc[j]])
                d2 = abs(snps[j] - avail_sites[pos_asc[j] + 1])
                if (d2 < d1):
                    pos_asc[j] = pos_asc[j] + 1

        ##removes duplicates
        pos_asc = sorted(set(pos_asc))
        nbss_asc = len(pos_asc)

        if nb_array_snps == nbss_asc:
            debugPrint(3,'Number of asc snps equal to nb array snps')
        else:
            flag_nb_asc_snps = 0
            debugPrint(3,'Number of asc snps not equal to nb array snps')
            diff = int(nb_array_snps - len(pos_asc))
            for _ in range(diff):
                pos_asc = add_snps(avail_sites, nb_avail_sites, pos_asc, nbss_asc, nb_array_snps)
                nbss_asc = len(pos_asc)
                if nbss_asc == nb_array_snps:
                    flag_nb_asc_snps = 1
                    break

            if flag_nb_asc_snps == 0:
                ##add_snps could not fill the gap: top up with random unused sites
                while (len(pos_asc) != nb_array_snps):
                    rand_numb = random.randint(0, nb_avail_sites - 1)
                    if rand_numb not in pos_asc:
                        pos_asc.append(rand_numb)
                pos_asc.sort()
                nbss_asc = len(pos_asc)

    print( 'finished making pseudo array')
    return pos_asc, nbss_asc, index_avail_sites, avail_sites
def pseudo_array_bits(asc_panel_bits, daf, pos, snps):
    '''
    Choose simulated sites that imitate the SNPs of an array, reading the
    ascertainment panel from a flat bit container.

    Parameters:
        asc_panel_bits: bit container providing length(), slicing and count();
            it is read in consecutive chunks of n = length()/len(pos) bits,
            one chunk per site
        daf: frequency cut-off; a site is available when its frequency of
            1-bits lies in [daf, 1 - daf]
        pos: positions of the simulated sites, one per chunk
        snps: positions of the array SNPs

    Returns:
        pos_asc: sorted list of chosen indexes
        nbss_asc: len(pos_asc)
        index_avail_sites: indexes (into pos) of the available sites
        avail_sites: positions of the available sites

    Errors:
        - exits the program when there are fewer available sites than SNPs
        - len(pos) == 0 raises ZeroDivisionError
        - NOTE(review): length() is assumed to be a multiple of len(pos); confirm with callers
    '''
    # number of bits per site
    n = asc_panel_bits.length()/len(pos)
    n = int(n)

    #######Array with the available sites given the frequency cut off
    ##array with the frequency of all the simulated snps
    sites_freq = []
    ##array with the available sites, that pass the frequency cut-off
    avail_sites = []  ##this one has the positions of the snps
    index_avail_sites = []  ##this one has the indexes of the snps

    # i counts sites while `site` is the bit offset of the current chunk
    i = 0
    for site in range(0, asc_panel_bits.length(), int(n)):
        freq_site = float(asc_panel_bits[site:site + n].count(1) / float(n))
        if freq_site >= daf and freq_site <= 1 - daf:
            sites_freq.append(freq_site)
            avail_sites.append(pos[i])
            index_avail_sites.append(i)
        i=i+1

    nb_avail_sites = len(avail_sites)

    if (len(avail_sites) < len(snps)):
        print( "Error: There are not enough simulated sites in the discovery panel with allele frequency >=",daf,"and <=",1 - daf)
        # NOTE(review): sys.exit() without an argument exits with status 0.
        sys.exit()

    if (len(avail_sites) == len(snps)):
        #debugPrint(3,"Number of avail_sites is equal to the number of Array snps")
        # every available site is used
        pos_asc = []
        pos_asc = index_avail_sites
        nbss_asc = len(pos_asc)
        flag_nb_asc_snps = 1

    elif (len(avail_sites) > len(snps)):
        #debugPrint(3,"Number of avail_sites greater than number of Array snps")
        pos_asc = [None] * int(len(snps))  ##indexes of the SNPs that pass the frequency cut-off and position
        for i in range(len(snps)):  # each snp on the snp array on a chromosome
            ## y is the position of the SNPs in the array
            y = snps[i]
            ##find the closest SNP in the array
            closestleft = find2(avail_sites, y)

            if (i > 0 and pos_asc[i - 1] == closestleft and closestleft + 1 < len(avail_sites)):  ##avoid duplicates
                closestleft = closestleft + 1  ##move one position to the right
                pos_asc[i] = closestleft
            elif (i > 0 and pos_asc[i - 1] > closestleft and pos_asc[i - 1] + 1 < len(avail_sites)):
                # the previous pick already passed this site: take the one after it
                closestleft = pos_asc[i - 1] + 1
                pos_asc[i] = closestleft
            else:
                pos_asc[i] = closestleft
                ###if I have duplicates at this point, it means that there were not any more snps to choose from
                ###closestleft+1 or pos_asc[i-1]+1 == len(avail_sites)

        #####smoothing
        ##last index of the pos_asc
        i = len(pos_asc) - 1

        ##check if there is another position that might work better
        # NOTE(review): because of `j == i - 1` only the second-to-last entry
        # can ever be adjusted by this loop.
        for j in range(0, i):
            if (j == i - 1 and pos_asc[j] + 1 < pos_asc[j + 1] and pos_asc[j] < (len(avail_sites) - 1) and (
                    j + 1) < len(avail_sites)):
                d1 = abs(snps[j] - avail_sites[pos_asc[j]])
                d2 = abs(snps[j] - avail_sites[pos_asc[j] + 1])
                if (d2 < d1):
                    pos_asc[j] = pos_asc[j] + 1

        ##removes duplicates
        pos_asc = (list(set(pos_asc)))
        pos_asc.sort()

        nbss_asc = len(pos_asc)
        nb_array_snps = len(snps)

        if (len(snps) == nbss_asc):
            flag_nb_asc_snps = 1
            debugPrint(3,'nb of asc snps equal to nb array snps')

        if (len(snps) != len(pos_asc)):
            flag_nb_asc_snps = 0
            #debugPrint(3,'nb of asc snps not equal to nb array snps')
            # removing duplicates left fewer picks than SNPs: call add_snps
            # up to `diff` times to fill the gap
            diff = int(len(snps) - len(pos_asc))

            for m in range(1, diff + 1):
                pos_asc2 = []
                pos_asc2 = add_snps(avail_sites, nb_avail_sites, pos_asc, nbss_asc, nb_array_snps)
                pos_asc = pos_asc2
                nbss_asc = len(pos_asc)
                if nbss_asc == len(snps):
                    flag_nb_asc_snps = 1
                    break
                else:
                    flag_nb_asc_snps = 0

            if (flag_nb_asc_snps == 0):  ##it means that the 1st index in pos_asc is 0; and the last is len(avail_sites)-1
                diff = int(len(snps) - len(pos_asc))
                # top up with random indexes that are not chosen yet
                while (len(pos_asc) != len(snps)):
                    rand_numb = random.randint(0, len(avail_sites) - 1)
                    # print( 'random',rand_numb)
                    if rand_numb not in pos_asc:
                        pos_asc.append(rand_numb)
                pos_asc.sort()
                nbss_asc = len(pos_asc)

    #debugPrint(2,'finished making pseudo array')
    return pos_asc, nbss_asc, index_avail_sites, avail_sites
def main(args):
    """
    Compute summary statistics for real data: process the input files, load
    the genome (and, when a pseudo array is used, the array) .tped data into
    the sequences, calculate the statistics and write the stats file.

    :param args: raw arguments; converted to a dict by processArgs. Keys read
        here: 'model file', 'param file', 'output', 'genome file' and, when a
        pseudo array is used, 'array file'.
    :return: None
    """
    args = processArgs(args)
    model_file = args['model file']
    param_file = args['param file']
    path = args['output']
    [sim_data_dir, germline_out_dir, sim_results_dir] = create_sim_directories(path)

    processedData = processInputFiles(param_file, model_file, args)
    debugPrint(3, "Finished processing input\nprocessedData: ", processedData)

    # The pseudo array is skipped only when discovery, sample and daf are all absent/empty.
    using_pseudo_array = True
    if not processedData.get('discovery') and not processedData.get(
            'sample') and not processedData.get('daf'):
        using_pseudo_array = False

    ### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
    sequences = create_sequences(processedData, args)
    names = [seq.name for seq in sequences]
    n_d = sum([1 for seq in sequences if seq.type == 'discovery'])

    debugPrint(1, 'name\ttotal\tpanel\tgenotyped')
    for seq in sequences:
        debugPrint(
            1, '{}\t{}\t{}\t{}'.format(seq.name, seq.tot, seq.panel,
                                       seq.genotyped))

    # NOTE(review): `total` is not read again in this function.
    total = sum([seq.tot for seq in sequences])
    debugPrint(
        1, 'total samples: {}'.format(
            sum([
                seq.genotyped for seq in sequences if seq.type == 'discovery'
            ] + [seq.tot for seq in sequences if seq.type == 'sample'])))

    ##########################################################################
    ####################### Read Data from tped files ########################
    ##########################################################################

    # The job name is derived from the input file names and is used for the
    # output file names below.
    genome_file = args['genome file']
    job = os.path.basename(genome_file)
    seq_alleles_genome = AllelesReal(str(genome_file) + '.tped')
    set_real_genome_bits(sequences, seq_alleles_genome)

    if using_pseudo_array == True:
        array_file = args['array file']
        job = str(job) + '_' + str(os.path.basename(array_file))
        seq_alleles_array = AllelesReal(str(array_file) + '.tped')
        set_real_array_bits(sequences, seq_alleles_array)

    ##########################################################################
    ###################### Calculate summary statistics ######################
    ##########################################################################

    # res and head are filled in place by the stat_tools.store_* calls below
    res, head = [], []

    ### Calculate summary stats from genomes
    stat_tools.store_segregating_site_stats(sequences, res, head)
    stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)

    ### Calculate summary stats from the ascertained SNPs
    if using_pseudo_array:
        stat_tools.store_array_segregating_site_stats(sequences, res, head)
        stat_tools.store_array_FSTs(sequences, res, head)

    debugPrint(1, 'Make ped and map files')
    ped_file_name = '{0}/{1}.ped'.format(sim_data_dir, job)
    map_file_name = '{0}/{1}.map'.format(sim_data_dir, job)
    out_file_name = '{0}/{1}'.format(germline_out_dir, job)

    ### Use Germline to find IBD on pseduo array ped and map files
    # NOTE(review): the flag is hard-coded to 1 while the condition below
    # tests for 0, so run_germline is never called here - confirm intent.
    do_i_run_germline = 1  #fix this later

    debugPrint(1, 'run germline? ' + str(do_i_run_germline))

    if (do_i_run_germline == 0):
        ########################### CHANGE THIS LATER (begin) ###########################
        ### Germline seems to be outputting in the wrong unit - so I am putting the min at 3000000 so that it is 3Mb, but should be the default.
        # germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m = 3000000)
        germline = run_germline(ped_file_name, map_file_name, out_file_name,
                                min_m=300)
        ########################### CHANGE THIS LATER (end) #############################

    ### Get IBD stats from Germline output
    # An existing .match file is processed even though Germline was not run above.
    if os.path.isfile(out_file_name + '.match'):
        debugPrint(1, 'Reading Germline IBD output')
        [IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)

        debugPrint(1, 'Calculating summary stats')
        # name -> function pairs handed to store_IBD_stats
        stats = OrderedDict([('num', len), ('mean', np.mean),
                             ('med', np.median), ('var', np.var)])
        stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
        stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head,
                                   min_val=30)

    # print 'finished calculating ss'

    write_stats_file(sim_results_dir, job, res, head)

    print('')
    print('##########################')
    print('### PROGRAM COMPLETED ###')
    print('##########################')
    print('')
def processInputFiles(paramFile, modelFile, args):
    '''
    Read the parameter file and the model file and return a dictionary
    (processedData) with all the (useful) data in the two files.

    Parameters:
        paramFile: location of the parameter file
        modelFile: location of the model file
        args: dict of program arguments; only the optional 'genetic map'
            entry is read here
    Returns:
        the dict built by processModelData, extended with 'param_dict' (the
        variables read from paramFile). When a genetic map is given,
        '-R <map>' is appended to processedData['macs_args'].
    '''
    debugPrint(2, "Starting processInputFiles")

    modelData = readModelFile(modelFile)
    debugPrint(2, "Finished reading " + str(modelFile))
    debugPrint(3, "Raw input data into make_args", modelData)

    variables = readParamsFile(paramFile)
    debugPrint(2, "Finished reading " + str(paramFile))
    debugPrint(3,"Raw Output for variables", variables)

    processedData = processModelData(variables, modelData)  # creates the input for macsSwig

    debugPrint(3,"Priting variables:", variables)
    processedData['param_dict'] = variables

    # The genetic map is optional: processArgs does not set this key, so a
    # plain args['genetic map'] lookup raised KeyError.
    genetic_map = args.get('genetic map')
    if genetic_map:
        processedData['macs_args'].extend(['-R', genetic_map])

    return processedData
def processModelData(variables, modelData):
    """
    Turn the model lines into the processedData dict, including the
    command-line list stored under 'macs_args'.

    Flags are grouped with populateFlags. Bookkeeping flags (-discovery,
    -sample, -daf, -length, -macs, -I, -macsswig) are stored in processedData
    and not forwarded. The remaining flags are resolved through
    getUnscaledValue, scaled with Ne where the code below says so, and either
    appended to macs_args directly or, for flags starting with "-e", collected
    as events and appended afterwards sorted by their time.

    :param variables: parameter mapping handed to getUnscaledValue
    :param modelData: list of comma-separated flag strings
    :return: processedData dict
    """
    debugPrint(2, "Starting: processModelData")
    processedData = {}

    flags = populateFlags(variables, modelData)

    # Start of the command line: executable, length, "-I", number of populations.
    # NOTE(review): if none of the three flags is present, macs_args is never
    # assigned and the insert() further down fails.
    # NOTE(review): the -macs_file branch uses [0] (a list) where the other
    # two branches use [0][0] (a string) - confirm.
    if '-macs_file' in flags:
        macs_args = [flags['-macs_file'][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    elif '-macsswig' in flags:
        macs_args = [flags['-macsswig'][0][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    elif '-macs' in flags:
        macs_args = [flags['-macs'][0][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]

    # population sizes follow the population count in the -I arguments
    sizes = map(int, flags["-I"][0][1:])
    if (sys.version_info > (3, 0)):
        sizes = list(sizes)

    # Discovery populations (1-based numbers) are enlarged: by a random amount
    # between 2 and their size when -random_discovery contains "True",
    # otherwise they are doubled.
    # NOTE(review): flags['-random_discovery'] raises KeyError when that flag is absent.
    if '-discovery' in flags:
        for discovery_pop_str in flags["-discovery"][0]:
            discovery_pop = int(discovery_pop_str)-1
            if "True" in flags['-random_discovery'][0]:
                sizes[discovery_pop] += random.randint(2, sizes[discovery_pop])
            else:
                sizes[discovery_pop] += sizes[discovery_pop]

    # the total goes to position 1, the individual sizes to the end
    total = float(sum(sizes))
    macs_args.insert(1,str(total))
    sizes_str = map(str, sizes)
    if (sys.version_info > (3, 0)):
        sizes_str = list(sizes_str)
    macs_args.extend(sizes_str)

    # seasons is all the time based events
    seasons = []

    Ne = findScaleValue(flags, variables)

    # processOrderedSeasons(flags, variables)

    debugPrint(3,"Processing flags in for macs_args")
    for flag in flags.keys():
        debugPrint(3," {}: {}".format(flag,flags[flag]))

        for tempLine in flags[flag]:
            try:
                # debugPrint(3,flag + ": " + str(tempLine))
                # Bookkeeping flags: store the value and skip forwarding.
                if flag == "-discovery":
                    processedData['discovery'] = [int(s.strip()) for s in tempLine if s]
                    continue
                if flag == "-sample":
                    processedData['sample'] = [int(s.strip()) for s in tempLine if s]
                    continue
                # -s is stored and, having no continue, also forwarded below
                if flag == "-s":
                    processedData['seed'] = tempLine[0]
                if flag == "-daf":
                    processedData['daf'] = float(getUnscaledValue(variables, tempLine[0]))
                    continue
                if flag == "-length":
                    processedData['length'] = tempLine[0]
                    continue
                if flag == "-macs":
                    processedData['macs'] = tempLine[0]
                    continue
                if flag == "-I":
                    processedData["I"] = [int(s.strip()) for s in tempLine[1:] if s]
                    continue
                if flag == "-macsswig":
                    processedData['macsswig'] = tempLine[0]
                    continue
                # -n: the second argument is recorded in the 'name' list
                # before it is scaled further down
                if flag == "-n":
                    tmp = processedData.get('name', [])
                    tmp.append(tempLine[1])
                    processedData['name'] = tmp

                #----------------------- For Added Arguments from Model_CSV
                ignoredFlags = ["-germline", "-array", "-nonrandom_discovery", "-random_discovery", "-pedmap"]
                if flag in ignoredFlags:
                    continue

                if flag == "-Ne":
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                # Scaling: the branches below either multiply the resolved
                # value by 4*Ne or divide it by Ne, as written per flag.
                if flag == "-em":
                    tempLine[3] = getUnscaledValue(variables, tempLine[3])
                    tempLine[3] = str(float(4*(float(tempLine[3])*Ne)))
                elif flag == "-eM" or flag == "-g":
                    tempLine[1] = getUnscaledValue(variables, tempLine[1])
                    tempLine[1] = str(float(4*(float(tempLine[1])*Ne)))
                elif flag == "-ema":
                    for i in range(2,len(tempLine)):
                        tempLine[i] = getUnscaledValue(variables, tempLine[i])
                        tempLine[i] = str(float(4*(float(tempLine[i])*Ne)))
                elif flag == "-eN" or flag == "-n":
                    tempLine[1] = getUnscaledValue(variables, tempLine[1])
                    tempLine[1] = str(float((float(tempLine[1])/Ne)))
                elif flag == "-en":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float((float(tempLine[2])/Ne)))
                elif flag == "-eg":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float(4*(float(tempLine[2])*Ne)))
                elif flag == "-es":
                    # resolved but not scaled
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                elif flag == "-m":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float(4*(float(tempLine[2])*Ne)))
                elif flag == "-ma":
                    for i in range(len(tempLine)):
                        tempLine[i] = getUnscaledValue(variables, tempLine[i])
                        tempLine[i]=str(float(4*(float(tempLine[i])*Ne)))
                elif flag == "-t" or flag == "-r" or flag == "-G":
                    # both m, r and alpha have the same scaling factor
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                    tempLine[0] = str(float(4*(float(tempLine[0])*Ne)))

                if flag.startswith('-e'):
                    # all event times are scaled: rounded first, then divided by 4*Ne
                    pass
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                    tempLine[0]=str(round(float(tempLine[0]))/(4*Ne))
                    # events are collected and appended later in time order
                    seasons.append([flag] + tempLine)
                else:
                    macs_args.append(flag.strip())
                    for subLine in tempLine:
                        macs_args.append(subLine.strip())
            except IndexError as e:
                print("There was an index error!\nThis most likely means your input file has a malformed flag.")
                print("Try running with -vv argument for last flag ran")
                sys.exit()

    # without -n flags the populations are named 1..number of populations
    if '-n' not in flags:
        tmp = list(range(1,int(flags['-I'][0][0])+1))
        processedData['name'] = tmp

    # discovery, sample and daf must be given together or not at all
    if not processedData.get('discovery') or not processedData.get('sample') or not processedData.get('daf'):
        if not processedData.get('discovery') and not processedData.get('sample') and not processedData.get('daf'):
            debugPrint(2, "discovery, sample, and daf are all missing")
        else:
            print("discovery, sample, or daf is missing")
            quit()

    debugPrint(2, "Adding events data back to flag pool")
    # Sort the events numerically by their time (index 1): convert to float,
    # sort, convert back to str, then append them to the command line.
    for i in range(len(seasons)):
        seasons[i][1] = float(seasons[i][1])
    seasons = sorted(seasons, key=itemgetter(1))
    for i in range(len(seasons)):
        seasons[i][1] = str(seasons[i][1])
    for season in seasons:
        macs_args.extend(season)

    processedData["macs_args"] = macs_args
    return processedData