def init_gerp_dic(motif_dic):
    '''
    Build the skeleton dictionary of genomic positions that will need a
    conservation score looked up.

    motif_dic is expected to be of form:
        {miso_event: {region: {subkey1: subval1 ...}}}
    where the 'genomic_coordinate' subkey holds a list whose entries are
    either None or colon-separated strings 'chromosome:start:end'.

    Returns:
        gerp_dic of form:
            {chr1: {coord1: None, coord2: None, ...}, ..., chrY: {...}}
        with one (possibly empty) sub-dictionary for every chromosome
        returned by get_chr_list(). Coordinates are ints running from
        start (inclusive) up to, but not including, end.

    Raises:
        KeyError if a non-empty coordinate range names a chromosome
        that is not in get_chr_list().

    Side effect: prints the number of coordinates written. Positions
    covered by more than one range are counted once per range.
    '''
    # Subkey under which each region stores its coordinate strings.
    # Should match get_dic_subkeys().
    coord_key = 'genomic_coordinate'

    # One empty position table per known chromosome.
    gerp_dic = dict((chrom, {}) for chrom in get_chr_list())

    n_stored = 0
    for regions in motif_dic.values():
        for region_info in regions.values():
            # Usually a single entry, but handle any number of them.
            for locus in region_info[coord_key]:
                if locus is None:
                    continue
                fields = locus.split(':')
                chromo = fields[0]
                first = int(fields[1])
                stop = int(fields[2])
                # start is inclusive, end is exclusive: exactly the
                # half-open interval that range() produces.
                positions = range(first, stop)
                # Only touch gerp_dic[chromo] when there is something
                # to store, so an empty range on an unknown chromosome
                # does not raise.
                if positions:
                    gerp_dic[chromo].update(dict.fromkeys(positions))
                n_stored += len(positions)

    print('%s coordinates stored into gerp_dic.' % n_stored)
    return gerp_dic
def init_gerp_dic(motif_dic):
    '''
    Build the skeleton dictionary of genomic positions that will need a
    conservation score looked up.

    motif_dic is expected to be of form:
        {miso_event: {region: {subkey1: subval1 ...}}}
    where the 'genomic_coordinate' subkey holds a list whose entries are
    either None or colon-separated strings 'chromosome:start:end'.

    Returns:
        gerp_dic of form:
            {chr1: {coord1: None, coord2: None, ...}, ..., chrY: {...}}
        with one (possibly empty) sub-dictionary for every chromosome
        returned by get_chr_list(). Coordinates are ints running from
        start (inclusive) up to, but not including, end.

    Raises:
        KeyError if a non-empty coordinate range names a chromosome
        that is not in get_chr_list().

    Side effect: prints the number of coordinates written. Positions
    covered by more than one range are counted once per range.
    '''
    # Subkey under which each region stores its coordinate strings.
    # Should match get_dic_subkeys().
    coord_key = 'genomic_coordinate'

    # One empty position table per known chromosome.
    gerp_dic = dict((chrom, {}) for chrom in get_chr_list())

    n_stored = 0
    for regions in motif_dic.values():
        for region_info in regions.values():
            # Usually a single entry, but handle any number of them.
            for locus in region_info[coord_key]:
                if locus is None:
                    continue
                fields = locus.split(':')
                chromo = fields[0]
                first = int(fields[1])
                stop = int(fields[2])
                # start is inclusive, end is exclusive: exactly the
                # half-open interval that range() produces.
                positions = range(first, stop)
                # Only touch gerp_dic[chromo] when there is something
                # to store, so an empty range on an unknown chromosome
                # does not raise.
                if positions:
                    gerp_dic[chromo].update(dict.fromkeys(positions))
                n_stored += len(positions)

    print('%s coordinates stored into gerp_dic.' % n_stored)
    return gerp_dic
def main():
    '''
    Command-line entry point: attach conservation scores to a motif
    summary dictionary and write the result out.

    Positional arguments (all three are required):
        1) Path to the pickle from summarize_meme_results.py
        2) Directory containing the per-chromosome score text files
        3) Output file to which results will be written

    Steps:
        * Load the motif dictionary from the pickle.
        * Obtain a score dictionary, either by launching one process
          per chromosome (default) or, when -l/--presaved_gerp_dic_path
          is given, by loading a previously saved pickle.
        * Merge the scores into the motif dictionary, pickle the
          updated dictionary next to the input pickle, and write it
          to the output file.

    Exits with status 1 when too few positional arguments are given or
    when the conservation type is not "rs" or "phastcons".
    '''
    usage = ('usage: %prog pickle_filepath gerp_directory output_file\n'
             'Three args must be specified in commandline: \n'
             '1) Path to pickle from summarize_meme_results.py\n'
             '2) Directory containing GERP RS score text files by chromosome\n'
             '3) Output file to which results will be written.\n')

    parser = OptionParser(usage=usage)
    parser.add_option('-p', '--gerp_pickle_fname', dest='gerp_pickle_fname',
                      default='gerp_pickle.pkl',
                      help='gerp scores pickle filename.\n'
                           'Default gerp_pickle.pkl')
    parser.add_option('-l', '--presaved_gerp_dic_path',
                      dest='gerp_presaved_path',
                      default=None,
                      help='If a gerp dic has been presaved, use this flag to '
                           'indicate the file path to directly open the gerp '
                           'file. Reduces need for multiprocessing.')
    parser.add_option('-o', '--updated_dic_fname', dest='updated_dic_fname',
                      default='meme_summary.gerp_updated.pkl',
                      help='Filename to updated meme_summary.pkl'
                           ' with gerp scores,\n'
                           'Default "meme_summary.gerp_updated.pkl"')
    parser.add_option('-t', '--conservation_type', dest='cons_type',
                      default='rs',
                      help='Either "rs" or "phastcons"')
    # NOTE: this option is accepted for command-line compatibility but
    # its value is not consulted anywhere in this function.
    parser.add_option('-m', '--multiprocessing', dest='multiprocessing',
                      default=True,
                      help='True or False. Sets multiprocessing or not.')
    (options, args) = parser.parse_args()

    # All three positional arguments are read below, so require three.
    if len(args) < 3:
        print('Incorrect number of parameters specified.')
        print(usage)
        sys.exit(1)
    motif_pickle_path = args[0]
    gerp_dir = args[1]
    output_path = args[2]

    # parse options
    gerp_pickle_fname = options.gerp_pickle_fname
    gerp_presaved_pkl_path = options.gerp_presaved_path
    updated_dic_fname = options.updated_dic_fname
    cons_type = options.cons_type

    if cons_type not in ['rs', 'phastcons']:
        print('Conservation type must be "rs" or "phastcons". %s found.'
              % cons_type)
        sys.exit(1)

    # Load motif dic, obtained from summarize_meme_results.
    # NOTE: unpickling executes arbitrary code from the file; only use
    # pickles from a trusted source.
    with open(motif_pickle_path, 'rb') as pickle_file:
        motif_dic = pickle.load(pickle_file)

    # Compute scores with one process per chromosome only if no
    # presaved score dictionary was supplied.
    if gerp_presaved_pkl_path is None:
        # Get chromosome list
        chr_list = get_chr_list()

        # Create dic containing chromosomes and genomic coordinates
        # relevant to our motifs.
        gerp_dic = init_gerp_dic(motif_dic)

        # BEGIN: MULTIPROCESSING
        print('Beginning multiprocessing.')
        q = Queue()
        process_list = []
        # For each chromosome, start a worker that opens the relevant
        # score file and retrieves scores for the coordinates.
        for chromosome in chr_list:
            p = Process(target=add_rs_scores_to_gerp_dic,
                        args=(gerp_dic, chromosome, gerp_dir, q, cons_type))
            process_list.append(p)
            p.start()
            print('Calculating RS scores for %s' % chromosome)
        print('Finsihed calculating RS scores.')

        # Collect exactly one result per started process. The loop
        # variable is irrelevant; results may arrive in any order and
        # each one carries its own chromosome name.
        # Presumably every worker puts a single
        # (scores_for_chromosome, chromosome) tuple on the queue --
        # confirm against add_rs_scores_to_gerp_dic.
        for _ in chr_list:
            (gerp_dic_chromo, chromo) = q.get()
            print('Updating dictionary for chromo: %s' % chromo)
            gerp_dic[chromo].update(gerp_dic_chromo)

        # Join only after the queue has been drained: a process that
        # still has items buffered for a queue may not terminate until
        # they are consumed.
        for p in process_list:
            p.join()
        # END: MULTIPROCESSING
        print('Done multiprocessing.')

        # Save gerp dic as pickle next to the input pickle, using the
        # highest available pickle protocol (-1).
        gerp_pickle_dir = os.path.dirname(motif_pickle_path)
        gerp_pickle_fpath = os.path.join(gerp_pickle_dir, gerp_pickle_fname)
        with open(gerp_pickle_fpath, 'wb') as gerp_pickle_file:
            pickle.dump(gerp_dic, gerp_pickle_file, -1)
        print('Saved pickle object to: %s' % gerp_pickle_fpath)
    else:
        # Use the presaved score dictionary instead of recomputing it.
        # NOTE: same trust caveat as above applies to this pickle.
        with open(gerp_presaved_pkl_path, 'rb') as presaved_pkl:
            gerp_dic = pickle.load(presaved_pkl)
        print('Loaded presaved pickle from %s' % gerp_presaved_pkl_path)

    # Update motif_dic with gerp_dic
    motif_dic = update_motif_dic_with_gerp_dic(motif_dic, gerp_dic)

    # Save updated motif dic to pickle
    updated_dic_dir = os.path.dirname(motif_pickle_path)
    updated_dic_path = os.path.join(updated_dic_dir, updated_dic_fname)
    with open(updated_dic_path, 'wb') as updated_dic_file:
        pickle.dump(motif_dic, updated_dic_file, -1)
    print('Updated dic saved to: %s' % updated_dic_path)

    # Add the score subkey to the list of subkeys to be written.
    subkeys_list = get_dic_subkeys()
    rs_score_subkey = get_rs_score_subkey()
    subkeys_list.append(rs_score_subkey)

    # Write updated motif dic to file
    write_outdic_to_file(motif_dic, output_path, subkeys_list)
def main():
    '''
    Command-line entry point: attach conservation scores to a motif
    summary dictionary and write the result out.

    Positional arguments (all three are required):
        1) Path to the pickle from summarize_meme_results.py
        2) Directory containing the per-chromosome score text files
        3) Output file to which results will be written

    Steps:
        * Load the motif dictionary from the pickle.
        * Obtain a score dictionary, either by launching one process
          per chromosome (default) or, when -l/--presaved_gerp_dic_path
          is given, by loading a previously saved pickle.
        * Merge the scores into the motif dictionary, pickle the
          updated dictionary next to the input pickle, and write it
          to the output file.

    Exits with status 1 when too few positional arguments are given or
    when the conservation type is not "rs" or "phastcons".
    '''
    usage = ('usage: %prog pickle_filepath gerp_directory output_file\n'
             'Three args must be specified in commandline: \n'
             '1) Path to pickle from summarize_meme_results.py\n'
             '2) Directory containing GERP RS score text files by chromosome\n'
             '3) Output file to which results will be written.\n')

    parser = OptionParser(usage=usage)
    parser.add_option('-p', '--gerp_pickle_fname', dest='gerp_pickle_fname',
                      default='gerp_pickle.pkl',
                      help='gerp scores pickle filename.\n'
                           'Default gerp_pickle.pkl')
    parser.add_option('-l', '--presaved_gerp_dic_path',
                      dest='gerp_presaved_path',
                      default=None,
                      help='If a gerp dic has been presaved, use this flag to '
                           'indicate the file path to directly open the gerp '
                           'file. Reduces need for multiprocessing.')
    parser.add_option('-o', '--updated_dic_fname', dest='updated_dic_fname',
                      default='meme_summary.gerp_updated.pkl',
                      help='Filename to updated meme_summary.pkl'
                           ' with gerp scores,\n'
                           'Default "meme_summary.gerp_updated.pkl"')
    parser.add_option('-t', '--conservation_type', dest='cons_type',
                      default='rs',
                      help='Either "rs" or "phastcons"')
    # NOTE: this option is accepted for command-line compatibility but
    # its value is not consulted anywhere in this function.
    parser.add_option('-m', '--multiprocessing', dest='multiprocessing',
                      default=True,
                      help='True or False. Sets multiprocessing or not.')
    (options, args) = parser.parse_args()

    # All three positional arguments are read below, so require three.
    if len(args) < 3:
        print('Incorrect number of parameters specified.')
        print(usage)
        sys.exit(1)
    motif_pickle_path = args[0]
    gerp_dir = args[1]
    output_path = args[2]

    # parse options
    gerp_pickle_fname = options.gerp_pickle_fname
    gerp_presaved_pkl_path = options.gerp_presaved_path
    updated_dic_fname = options.updated_dic_fname
    cons_type = options.cons_type

    if cons_type not in ['rs', 'phastcons']:
        print('Conservation type must be "rs" or "phastcons". %s found.'
              % cons_type)
        sys.exit(1)

    # Load motif dic, obtained from summarize_meme_results.
    # NOTE: unpickling executes arbitrary code from the file; only use
    # pickles from a trusted source.
    with open(motif_pickle_path, 'rb') as pickle_file:
        motif_dic = pickle.load(pickle_file)

    # Compute scores with one process per chromosome only if no
    # presaved score dictionary was supplied.
    if gerp_presaved_pkl_path is None:
        # Get chromosome list
        chr_list = get_chr_list()

        # Create dic containing chromosomes and genomic coordinates
        # relevant to our motifs.
        gerp_dic = init_gerp_dic(motif_dic)

        # BEGIN: MULTIPROCESSING
        print('Beginning multiprocessing.')
        q = Queue()
        process_list = []
        # For each chromosome, start a worker that opens the relevant
        # score file and retrieves scores for the coordinates.
        for chromosome in chr_list:
            p = Process(target=add_rs_scores_to_gerp_dic,
                        args=(gerp_dic, chromosome, gerp_dir, q, cons_type))
            process_list.append(p)
            p.start()
            print('Calculating RS scores for %s' % chromosome)
        print('Finsihed calculating RS scores.')

        # Collect exactly one result per started process. The loop
        # variable is irrelevant; results may arrive in any order and
        # each one carries its own chromosome name.
        # Presumably every worker puts a single
        # (scores_for_chromosome, chromosome) tuple on the queue --
        # confirm against add_rs_scores_to_gerp_dic.
        for _ in chr_list:
            (gerp_dic_chromo, chromo) = q.get()
            print('Updating dictionary for chromo: %s' % chromo)
            gerp_dic[chromo].update(gerp_dic_chromo)

        # Join only after the queue has been drained: a process that
        # still has items buffered for a queue may not terminate until
        # they are consumed.
        for p in process_list:
            p.join()
        # END: MULTIPROCESSING
        print('Done multiprocessing.')

        # Save gerp dic as pickle next to the input pickle, using the
        # highest available pickle protocol (-1).
        gerp_pickle_dir = os.path.dirname(motif_pickle_path)
        gerp_pickle_fpath = os.path.join(gerp_pickle_dir, gerp_pickle_fname)
        with open(gerp_pickle_fpath, 'wb') as gerp_pickle_file:
            pickle.dump(gerp_dic, gerp_pickle_file, -1)
        print('Saved pickle object to: %s' % gerp_pickle_fpath)
    else:
        # Use the presaved score dictionary instead of recomputing it.
        # NOTE: same trust caveat as above applies to this pickle.
        with open(gerp_presaved_pkl_path, 'rb') as presaved_pkl:
            gerp_dic = pickle.load(presaved_pkl)
        print('Loaded presaved pickle from %s' % gerp_presaved_pkl_path)

    # Update motif_dic with gerp_dic
    motif_dic = update_motif_dic_with_gerp_dic(motif_dic, gerp_dic)

    # Save updated motif dic to pickle
    updated_dic_dir = os.path.dirname(motif_pickle_path)
    updated_dic_path = os.path.join(updated_dic_dir, updated_dic_fname)
    with open(updated_dic_path, 'wb') as updated_dic_file:
        pickle.dump(motif_dic, updated_dic_file, -1)
    print('Updated dic saved to: %s' % updated_dic_path)

    # Add the score subkey to the list of subkeys to be written.
    subkeys_list = get_dic_subkeys()
    rs_score_subkey = get_rs_score_subkey()
    subkeys_list.append(rs_score_subkey)

    # Write updated motif dic to file
    write_outdic_to_file(motif_dic, output_path, subkeys_list)