def _create_row_feature(self, feature_mapping, collapsed_view):
    """Build the output row that carries ``feature_mapping``.

    When ``collapsed_view`` is truthy the row is simply a clone of the
    feature mapping. Otherwise a clone of an empty ``MappingResult`` is
    created and the feature mapping is attached to it via ``set_feature``.

    Returns the newly created row object.
    """
    if not collapsed_view:
        # Expanded view: empty mapping row with the feature attached to it
        row = MappingResult.get_empty().clone()
        row.set_feature(feature_mapping)
        return row

    # Collapsed view: the feature mapping itself (cloned) is the row
    return feature_mapping.clone()
def _parse_index_file_by_id(self, query_ids_dict, index_path, data_path, map_config,
                            chrom_dict, multiple_param, dataset_synonyms, test_set):
    """Fetch the mapping records of the ids in ``test_set`` through an index.

    The pickled index at ``index_path`` is loaded and used to look up, for
    each queried id, the offset handed to ``seek`` on the data file. One
    line is then read from ``data_path`` at each offset and turned into a
    ``MappingResult``.

    query_ids_dict -- updated in place: ids found in the index are set to 1
    multiple_param -- when equal to False, results reporting multiple
                      positions are left out
    dataset_synonyms -- currently not used by this method (see TODO)

    Returns the list of ``MappingResult`` objects that were kept.
    """
    ## TODO: DO SOMETHING WITH THE SYNONYMS!!!

    results = []

    map_name = map_config.get_name()
    map_has_cm_pos = map_config.has_cm_pos()
    map_has_bp_pos = map_config.has_bp_pos()
    map_is_physical = map_config.as_physical()

    sys.stderr.write("MappingsParser: loading index " + str(index_path) + "...\n")

    # NOTE(review): unpickling runs arbitrary code if the index file is not
    # trusted -- confirm index files only come from a trusted source.
    with open(index_path, 'r') as index_file:
        index = cPickle.load(index_file)

    sys.stderr.write("MappingsParser: loaded index with " + str(len(index)) + " entries.\n")
    sys.stderr.write("MappingsParser: obtaining index of queries...\n")

    # Collect the data-file offset of every query present in the index
    offsets = []
    for query_id in test_set:
        if query_id not in index:
            continue
        offset = index[query_id]
        query_ids_dict[query_id] = 1  # found
        offsets.append(offset)

    with open(data_path, 'r') as data_file:
        for offset in offsets:
            data_file.seek(offset)
            fields = data_file.readline().strip().split("\t")

            result = MappingResult.init_from_data(
                fields, map_name, chrom_dict,
                map_is_physical, map_has_cm_pos, map_has_bp_pos)

            # Multiple-position results are dropped when not requested
            if result.has_multiple_pos() and multiple_param == False:
                continue

            results.append(result)

    return results
def parse_mapping_file(self, data_path, map_config, chrom_dict):
    """Read every mapping record stored in ``data_path``.

    Lines starting with ">" or "#" are skipped. Every other line is
    stripped, split on tabs and converted with
    ``MappingResult.init_from_data`` using the settings read from
    ``map_config`` (name, physical flag, cM/bp position flags).

    data_path -- path of the tab-separated mappings file
    map_config -- object providing get_name(), has_cm_pos(), has_bp_pos()
                  and as_physical()
    chrom_dict -- passed through to ``MappingResult.init_from_data``

    Returns the list of results in file order.
    """
    map_name = map_config.get_name()
    map_has_cm_pos = map_config.has_cm_pos()
    map_has_bp_pos = map_config.has_bp_pos()
    map_is_physical = map_config.as_physical()

    mapping_results_list = []

    # The context manager closes the file as soon as parsing ends (or
    # fails); iterating over a bare open() left the handle open.
    with open(data_path, 'r') as data_file:
        for hit in data_file:
            if hit.startswith((">", "#")):
                continue

            hit_data = hit.strip().split("\t")
            mapping_results_list.append(
                MappingResult.init_from_data(
                    hit_data, map_name, chrom_dict,
                    map_is_physical, map_has_cm_pos, map_has_bp_pos))

    return mapping_results_list
def _createPositions(self, markers_positions, multiple_param, chrom_dict, map_name):
    """Turn per-marker position records into ``MappingResult`` objects.

    markers_positions -- dict keyed by marker id; each value holds a
                         "positions" list and a "hits_no_position" list
    multiple_param -- when falsy, markers with more than one position are
                      discarded (reported on stderr if ``self._verbose``)
    chrom_dict -- chromosome name -> numeric order used for sorting;
                  positions on chromosomes missing from it are skipped
    map_name -- map name stored in every result

    Returns the list of created ``MappingResult`` objects.
    """
    positions_list = []

    for marker_id in markers_positions:
        marker_entry = markers_positions[marker_id]
        marker_positions = marker_entry["positions"]
        position_count = len(marker_positions)

        # Contigs without position: nothing to report for this marker
        if position_count == 0:
            continue

        is_multiple = position_count > 1

        if is_multiple and not multiple_param:
            if self._verbose:
                sys.stderr.write(
                    "Mappers: discarded multiple pos marker: " + str(marker_id) + "\n")
            continue

        has_hits_without_pos = len(marker_entry["hits_no_position"]) > 0

        for pos in marker_positions:
            chrom = pos["chr"]

            # Alignments to chromosomes outside the genome are ignored
            if chrom not in chrom_dict:
                continue

            # Result fields: marker - chr - chr order - cm start/end -
            # bp start/end - strand - multiple - contigs w/o pos - map name
            positions_list.append(
                MappingResult(
                    marker_id, chrom, chrom_dict[chrom],
                    pos["cm_pos"], pos["cm_end_pos"],
                    pos["bp_pos"], pos["bp_end_pos"],
                    pos["strand"],
                    is_multiple, has_hits_without_pos, map_name))

    return positions_list
def _parse_mapping_file_by_id(self, query_ids_dict, data_path, map_config, chrom_dict,
                              multiple_param, dataset_synonyms=None, test_set=None):
    """Scan ``data_path`` and collect the mapping records of the queried ids.

    Lines starting with ">" or "#" are skipped; the first tab-separated
    field of every other line is the record id.

    query_ids_dict -- updated in place: every id that is found is set to 1
    data_path -- path of the tab-separated mappings file
    map_config -- object providing get_name(), has_cm_pos(), has_bp_pos()
                  and as_physical()
    chrom_dict -- passed through to ``MappingResult.init_from_data``
    multiple_param -- when equal to False, records reporting multiple
                      positions are left out of the result
    dataset_synonyms -- optional dict: record id -> iterable of synonyms.
                        For a record listed here, the ids searched for are
                        its synonyms, and the result's marker id becomes
                        the matched synonyms joined with "|"
    test_set -- set of ids to look for, or None to retrieve every record.
                It is MODIFIED in place: ids whose record has a single
                position are removed once found, and the scan stops as
                soon as the set is empty. Ids with multiple positions stay
                in the set, since more lines for them may follow.

    Returns the list of ``MappingResult`` objects that were kept.

    Fixes over the previous version: with ``test_set=None`` the method
    failed (the record id was never read in that branch and ``len(None)``
    was evaluated); the data file is now closed deterministically; the
    mutable ``{}`` default was replaced by ``None``.
    """
    if dataset_synonyms is None:
        dataset_synonyms = {}

    map_name = map_config.get_name()
    map_has_cm_pos = map_config.has_cm_pos()
    map_has_bp_pos = map_config.has_bp_pos()
    map_is_physical = map_config.as_physical()

    def build_result(hit_data):
        # Single place where a record is turned into a MappingResult
        return MappingResult.init_from_data(
            hit_data, map_name, chrom_dict,
            map_is_physical, map_has_cm_pos, map_has_bp_pos)

    # None means "no filter"; an (initially) empty set means "nothing to find"
    retrieve_all = test_set is None

    mapping_results_list = []

    with open(data_path, 'r') as data_file:
        for hit in data_file:
            if hit.startswith((">", "#")):
                continue

            hit_data = hit.strip().split("\t")
            hit_query = hit_data[0]

            if retrieve_all:
                mapping_result = build_result(hit_data)
                query_ids_dict[hit_query] = 1  # found
                if mapping_result.has_multiple_pos() and multiple_param == False:
                    continue
                mapping_results_list.append(mapping_result)
                continue  # no early termination without a query set

            if hit_query in dataset_synonyms:
                synonyms_found = test_set.intersection(dataset_synonyms[hit_query])

                if len(synonyms_found) > 0:
                    mapping_result = build_result(hit_data)

                    for synonym in synonyms_found:  # all found
                        query_ids_dict[synonym] = 1

                    if mapping_result.has_multiple_pos():
                        if multiple_param == False:
                            continue
                    else:
                        # Single position: no further lines expected for
                        # these ids, so stop looking for them
                        test_set.difference_update(synonyms_found)

                    mapping_result.set_marker_id("|".join(synonyms_found))
                    mapping_results_list.append(mapping_result)

            elif hit_query in test_set:
                mapping_result = build_result(hit_data)
                query_ids_dict[hit_query] = 1  # found

                if mapping_result.has_multiple_pos():
                    if multiple_param == False:
                        continue
                else:
                    # Single position: this id is done
                    test_set.remove(hit_query)

                mapping_results_list.append(mapping_result)

            # Every queried id has been resolved: stop reading the file
            if len(test_set) == 0:
                break

    return mapping_results_list
def parse_mapping_file_on_pos(self, map_intervals, data_path, chrom_dict, map_config, map_sort_by,
                              dataset, dataset_name, feature_type):
    """Attach the records of ``data_path`` to the intervals they overlap.

    Each element of ``map_intervals`` provides ``get_map_interval()`` and
    ``get_features()``. For every record of the file that overlaps the
    current interval, a feature built by ``FeaturesFactory.get_feature`` is
    appended to the feature list of that interval and of each directly
    following interval it also overlaps.

    The single forward pass over intervals and records presumably relies
    on both being sorted the same way -- confirm with the callers.

    map_intervals -- list of featured intervals, modified in place
    data_path -- path of the tab-separated mappings file
    chrom_dict -- passed through to ``MappingResult.init_from_data``
    map_config -- object providing get_name(), has_cm_pos(), has_bp_pos()
                  and as_physical()
    map_sort_by -- selects which position (get_sort_pos/get_sort_end_pos)
                   of each record is compared against the intervals
    dataset, dataset_name, feature_type -- forwarded to
                   ``FeaturesFactory.get_feature``

    Returns ``map_intervals`` (the same object that was passed in).
    """
    # Nothing to annotate; previously this raised IndexError on [0]
    if len(map_intervals) == 0:
        return map_intervals

    map_name = map_config.get_name()
    map_is_physical = map_config.as_physical()
    map_has_cm_pos = map_config.has_cm_pos()
    map_has_bp_pos = map_config.has_bp_pos()

    current_interval_pos = 0
    featured_current_interval = map_intervals[current_interval_pos]
    current_interval = featured_current_interval.get_map_interval()
    current_features = featured_current_interval.get_features()

    # Find all the hits for this map.
    # The context manager guarantees the data file gets closed.
    with open(data_path, 'r') as data_file:
        for hit in data_file:
            if hit.startswith((">", "#")):
                continue

            hit_data = hit.strip().split("\t")

            mapping_result = MappingResult.init_from_data(
                hit_data, map_name, chrom_dict,
                map_is_physical, map_has_cm_pos, map_has_bp_pos)

            chrom_name = mapping_result.get_chrom_name()
            if chrom_name != current_interval.get_chrom():
                continue

            # Record ends before the current interval starts
            map_end_pos = mapping_result.get_sort_end_pos(map_sort_by)
            if float(map_end_pos) < float(current_interval.get_ini_pos()):
                continue

            marker_id = mapping_result.get_marker_id()
            map_pos = mapping_result.get_sort_pos(map_sort_by)

            dataset_interval = MapInterval(chrom_name, map_pos, map_end_pos)
            does_overlap = MapInterval.intervals_overlap(
                dataset_interval, current_interval)

            # This if-else could be unnecessary, but hopefully is useful
            # to read the code
            if does_overlap:
                # Look ahead from the current interval without moving it:
                # the next record may still overlap the current interval.
                next_interval_pos = current_interval_pos
                next_features = current_features

                while does_overlap:
                    feature = FeaturesFactory.get_feature(
                        marker_id, dataset, dataset_name, feature_type,
                        mapping_result)
                    next_features.append(feature)

                    next_interval_pos += 1
                    if next_interval_pos >= len(map_intervals):
                        break

                    featured_next_interval = map_intervals[next_interval_pos]
                    next_interval = featured_next_interval.get_map_interval()
                    next_features = featured_next_interval.get_features()
                    does_overlap = MapInterval.intervals_overlap(
                        dataset_interval, next_interval)
            else:
                # Record starts past the current interval: advance the
                # current interval until it reaches the record's start.
                # NOTE(review): the record that triggers the advance is
                # not re-tested against the new current interval, so it
                # is never added even if it overlaps it -- confirm this
                # is intended.
                while float(map_pos) > float(current_interval.get_end_pos()):
                    current_interval_pos += 1
                    if current_interval_pos >= len(map_intervals):
                        break

                    featured_current_interval = map_intervals[current_interval_pos]
                    current_interval = featured_current_interval.get_map_interval()
                    current_features = featured_current_interval.get_features()

    return map_intervals
def parse_mapping_file_by_pos(self, map_intervals, data_path, chrom_dict, map_config, map_sort_by):
    """Collect the records of ``data_path`` that overlap any given interval.

    Note: hits/map features are pre-computed & sorted along chroms, chroms
    are in chrom_dict order. Intervals are pre-sorted as well. The method
    therefore walks both in a single forward pass.

    map_intervals -- list of intervals providing get_chrom(),
                     get_ini_pos() and get_end_pos()
    data_path -- path of the tab-separated mappings file
    chrom_dict -- chromosome name -> value convertible to int (chromosome
                  order); also passed to ``MappingResult.init_from_data``
    map_config -- object providing get_name(), has_cm_pos(), has_bp_pos()
                  and as_physical()
    map_sort_by -- selects which position (get_sort_pos/get_sort_end_pos)
                   of each record is compared against the intervals

    Returns the list of overlapping ``MappingResult`` objects, in file
    order; an empty list when ``map_intervals`` is empty.
    """
    mapping_results_list = []

    # No intervals, no overlaps; previously this raised IndexError on [0]
    if len(map_intervals) == 0:
        return mapping_results_list

    map_name = map_config.get_name()
    map_is_physical = map_config.as_physical()
    map_has_cm_pos = map_config.has_cm_pos()
    map_has_bp_pos = map_config.has_bp_pos()

    num_intervals = len(map_intervals)
    current_interval_pos = 0
    current_interval = map_intervals[current_interval_pos]

    # Find all the hits for this map within intervals of interest.
    # The context manager guarantees the data file gets closed, also when
    # the scan stops early.
    with open(data_path, 'r') as data_file:
        for hit in data_file:
            if hit.startswith((">", "#")):
                continue

            hit_data = hit.strip().split("\t")

            mapping_result = MappingResult.init_from_data(
                hit_data, map_name, chrom_dict,
                map_is_physical, map_has_cm_pos, map_has_bp_pos)

            chrom_name = mapping_result.get_chrom_name()

            # move to next interval to match chrom (only if previous
            # mappings exist)
            # Note: needed when last mapping matched exactly the last gene
            # of a chrom
            while (len(mapping_results_list) > 0 and
                   current_interval_pos < num_intervals - 1 and
                   int(chrom_dict[chrom_name]) >
                   int(chrom_dict[current_interval.get_chrom()])):
                current_interval_pos += 1
                current_interval = map_intervals[current_interval_pos]

            if chrom_name != current_interval.get_chrom():
                continue

            # Record ends before the current interval starts
            map_end_pos = mapping_result.get_sort_end_pos(map_sort_by)
            if float(map_end_pos) < float(current_interval.get_ini_pos()):
                continue

            map_pos = mapping_result.get_sort_pos(map_sort_by)

            # Record starts past the current interval: advance until an
            # interval reaches the record's start, the chromosome changes
            # or the intervals run out
            while float(map_pos) > float(current_interval.get_end_pos()):
                current_interval_pos += 1
                if current_interval_pos >= num_intervals:
                    break
                current_interval = map_intervals[current_interval_pos]
                if current_interval.get_chrom() != chrom_name:
                    break

            # All intervals consumed: the rest of the file is not read
            if current_interval_pos >= num_intervals:
                break

            # Re-check against the (possibly new) current interval
            if chrom_name != current_interval.get_chrom():
                continue
            if float(map_end_pos) < float(current_interval.get_ini_pos()):
                continue

            dataset_interval = MapInterval(chrom_name, map_pos, map_end_pos)

            # Check if alignment overlaps with some mapping interval
            if MapInterval.intervals_overlap(dataset_interval, current_interval):
                mapping_results_list.append(mapping_result)

    return mapping_results_list
def get_empty():
    """Return a placeholder ``GeneMapping`` flagged as empty.

    The four leading arguments are the "-" placeholder, the mapping result
    is ``MappingResult.get_empty()``, the row type is
    ``FeatureMapping.ROW_TYPE_MAPPING_RESULT`` and there are no annotations.
    """
    placeholder = "-"
    return GeneMapping(
        placeholder, placeholder, placeholder, placeholder,
        MappingResult.get_empty(),
        FeatureMapping.ROW_TYPE_MAPPING_RESULT,
        empty=True,
        annots=[])
def get_empty():
    """Return a placeholder ``MarkerMapping`` flagged as empty.

    The four leading arguments are the "-" placeholder, the mapping result
    is ``MappingResult.get_empty()`` and the row type is
    ``FeatureMapping.ROW_TYPE_MAPPING_RESULT``.
    """
    placeholder = "-"
    return MarkerMapping(
        placeholder, placeholder, placeholder, placeholder,
        MappingResult.get_empty(),
        FeatureMapping.ROW_TYPE_MAPPING_RESULT,
        empty=True)
def parse_mapping_file_by_pos(self, map_intervals, data_path, chrom_dict, map_config, map_sort_by):
    """Collect the records of ``data_path`` that overlap any given interval.

    Records and intervals are walked together in a single forward pass,
    which presumably relies on both being sorted the same way -- confirm
    with the callers.

    map_intervals -- list of intervals providing get_chrom(),
                     get_ini_pos() and get_end_pos()
    data_path -- path of the tab-separated mappings file
    chrom_dict -- passed through to ``MappingResult.init_from_data``
    map_config -- object providing get_name(), has_cm_pos(), has_bp_pos()
                  and as_physical()
    map_sort_by -- selects which position (get_sort_pos/get_sort_end_pos)
                   of each record is compared against the intervals

    Returns the list of overlapping ``MappingResult`` objects, in file
    order; an empty list when ``map_intervals`` is empty.
    """
    mapping_results_list = []

    # No intervals, no overlaps; previously this raised IndexError on [0]
    if len(map_intervals) == 0:
        return mapping_results_list

    map_name = map_config.get_name()
    map_is_physical = map_config.as_physical()
    map_has_cm_pos = map_config.has_cm_pos()
    map_has_bp_pos = map_config.has_bp_pos()

    num_intervals = len(map_intervals)
    current_interval_pos = 0
    current_interval = map_intervals[current_interval_pos]

    # Find all the hits for this map.
    # The context manager guarantees the data file gets closed, also when
    # the scan stops early.
    with open(data_path, 'r') as data_file:
        for hit in data_file:
            if hit.startswith((">", "#")):
                continue

            hit_data = hit.strip().split("\t")

            mapping_result = MappingResult.init_from_data(
                hit_data, map_name, chrom_dict,
                map_is_physical, map_has_cm_pos, map_has_bp_pos)

            chrom_name = mapping_result.get_chrom_name()
            if chrom_name != current_interval.get_chrom():
                continue

            # Record ends before the current interval starts
            map_end_pos = mapping_result.get_sort_end_pos(map_sort_by)
            if float(map_end_pos) < float(current_interval.get_ini_pos()):
                continue

            map_pos = mapping_result.get_sort_pos(map_sort_by)

            # Record starts past the current interval: advance until an
            # interval reaches the record's start, the chromosome changes
            # or the intervals run out
            while float(map_pos) > float(current_interval.get_end_pos()):
                current_interval_pos += 1
                if current_interval_pos >= num_intervals:
                    break
                current_interval = map_intervals[current_interval_pos]
                if current_interval.get_chrom() != chrom_name:
                    break

            # All intervals consumed: the rest of the file is not read
            if current_interval_pos >= num_intervals:
                break

            # Re-check against the (possibly new) current interval
            if chrom_name != current_interval.get_chrom():
                continue
            if float(map_end_pos) < float(current_interval.get_ini_pos()):
                continue

            dataset_interval = MapInterval(chrom_name, map_pos, map_end_pos)

            # Check if alignment overlaps with some mapping interval
            if MapInterval.intervals_overlap(dataset_interval, current_interval):
                mapping_results_list.append(mapping_result)

    return mapping_results_list