def get_feature(marker_id, dataset_id, dataset_name, feature_type, mapping_result):
    """Build the feature-mapping object matching feature_type.

    Returns a MarkerMapping, GeneMapping or AnchoredMapping tagged as an
    enrichment row; raises m2pException for an unknown feature_type.
    """
    common_args = (marker_id, dataset_id, dataset_name, feature_type,
                   mapping_result, FeatureMapping.ROW_TYPE_ENRICHMENT)
    
    if feature_type == DatasetsConfig.DATASET_TYPE_GENETIC_MARKER:
        return MarkerMapping(*common_args)
    
    if feature_type == DatasetsConfig.DATASET_TYPE_GENE:
        # genes additionally carry a list of annotations (empty on creation)
        return GeneMapping(*common_args, annots=[])
    
    if feature_type == DatasetsConfig.DATASET_TYPE_ANCHORED:
        return AnchoredMapping(*common_args)
    
    raise m2pException("Unrecognized feature type " + str(feature_type) + ".")
def check_sort_param(self, map_config, sort_param, DEFAULT_SORT_PARAM):
    """Resolve the effective sort field (cM or bp) for a map.

    The requested sort_param wins when it equals the map default or is
    compatible with the positions the map actually has; otherwise the map
    default is used (with a warning for explicit, non-default requests).
    Raises m2pException when the map default itself is incompatible.
    """
    map_name = map_config.get_name()
    has_cm = map_config.has_cm_pos()
    has_bp = map_config.has_bp_pos()
    default_sort = map_config.get_default_sort_by()
    
    # sort_param has priority when compatible with this map
    if sort_param == default_sort:
        return sort_param
    if sort_param == MapTypes.MAP_SORT_PARAM_CM and has_cm:
        return sort_param
    if sort_param == MapTypes.MAP_SORT_PARAM_BP and has_bp:
        return sort_param
    
    # else, fall back to the map default; warn only for explicit choices
    if sort_param != DEFAULT_SORT_PARAM:
        sys.stderr.write("WARNING: the sort parameter "+sort_param+" is not compatible with map "+map_name+". Using default map sort parameter...\n")
    
    if default_sort == MapTypes.MAP_SORT_PARAM_CM and has_cm:
        return default_sort
    if default_sort == MapTypes.MAP_SORT_PARAM_BP and has_bp:
        return default_sort
    
    raise m2pException("Map default sort configure as \""+default_sort+"\" assigned to a map which has not such kind of position.")
def load_conf(conf_file, verbose=False):
    """Read a "#"-commented configuration file into a list of token rows.

    Comment lines (leading "#") and empty lines are skipped; every other
    line is stripped and split on single spaces. Returns the list of
    per-line token lists. Raises m2pException if the file cannot be read.
    """
    conf_rows = []
    
    if verbose:
        sys.stderr.write("Loading configuration file " + conf_file + "...\n")
    
    try:
        # "with" guarantees the handle is closed even on errors
        # (the original left the file open until garbage collection).
        with open(conf_file, 'r') as conf_fh:
            for line in conf_fh:
                if line.startswith("#") or not line.strip():
                    continue  # line.strip() is False if is an empty line "^$"
                if verbose:
                    sys.stderr.write("\t conf line: " + line.strip() + "\n")
                conf_rows.append(line.strip().split(" "))
    except Exception:
        raise m2pException("Error loading configuration file " + conf_file + ".")
    
    return conf_rows
## END
def _load_config(self, config_file):
    """Parse the datasets configuration file into this object's registry.

    Fills self._config_dict (dataset_id -> DatasetConfig) and
    self._config_list (dataset ids in file order). Raises m2pException
    when a dataset id appears twice in the file.
    """
    self._config_dict = {}
    self._config_list = []
    
    conf_rows = load_conf(config_file, self._verbose) # data_utils.load_conf
    #self._config_dict = load_maps(self._config_file, self._verbose) # data_utils.load_maps
    
    for conf_row in conf_rows:
        # Column positions are declared as constants on DatasetsConfig.
        dataset_name = conf_row[DatasetsConfig.DATASET_NAME]
        dataset_id = conf_row[DatasetsConfig.DATASET_ID]
        dataset_type = conf_row[DatasetsConfig.DATASET_TYPE]
        file_path = conf_row[DatasetsConfig.FILE_PATH]
        file_type = conf_row[DatasetsConfig.FILE_TYPE]
        # comma-separated fields are split into lists
        databases = conf_row[DatasetsConfig.DATABASES].strip().split(",")
        synonyms = conf_row[DatasetsConfig.SYNONYMS]
        prefixes = conf_row[DatasetsConfig.PREFIXES].strip().split(",")
        
        dataset = DatasetConfig(dataset_name, dataset_id, dataset_type, file_path, file_type, databases, synonyms, prefixes)
        
        # A leading ">" on the name marks datasets to skip when building.
        if dataset_name.startswith(">"):
            dataset.set_dataset_name(dataset_name[1:]) # remove the ">" from the name
            dataset.set_ignore_build(True) # mark the dataset as to be ignored in the build datasets script
        
        if dataset_id in self._config_dict:
            raise m2pException("Duplicated dataset "+dataset_id+" in configuration file "+config_file+".")
        else:
            self._config_dict[dataset_id] = dataset
            self._config_list.append(dataset_id)
    
    return
def align(self, fasta_path, db, ref_type, threshold_id, threshold_cov):
    """Align the sequences in fasta_path against db and record the hits.

    Selects gmap or gmapl depending on ref_type, stores the best-score
    hits and the unaligned query headers on self, and returns
    self.get_hits(). Raises m2pException for an unknown ref_type.
    """
    sys.stderr.write("\n")
    fasta_headers = alignment_utils.get_fasta_headers(fasta_path)
    sys.stderr.write("GMAPAligner: DB --> "+str(db)+"\n")
    sys.stderr.write("GMAPAligner: to align "+str(len(fasta_headers))+"\n")
    
    # use GMAP or GMAPL
    if ref_type == REF_TYPE_STD:
        app_path = self._app_path
    elif ref_type == REF_TYPE_BIG:
        app_path = self._gmapl_app_path
    else:
        raise m2pException("GMAPAligner: Unrecognized ref type "+ref_type+".")
    
    # get_hits from m2p_gmap.py
    hits = m2p_gmap.get_best_score_hits(app_path, self._n_threads, fasta_path,
                                        self._dbs_path, db, threshold_id,
                                        threshold_cov, self._verbose)
    self._results_hits = hits
    
    query_list = [hit.get_query_id() for hit in hits]
    aligned_queries = set([query.split(" ")[0] for query in query_list])
    sys.stderr.write("GMAPAligner: aligned "+str(len(aligned_queries))+"\n")
    
    self._results_unaligned = alignment_utils.filter_list(fasta_headers, query_list)
    sys.stderr.write("GMAPAligner: no hits "+str(len(self._results_unaligned))+"\n")
    
    return self.get_hits()
def _map_intervals(self, sorted_map, map_sort_by, extend_window):
    """Create one interval per marker position (no merging of overlaps).

    Sets self.MAP_UNIT from map_sort_by (bp -> physical, cM -> genetic),
    builds an interval around every position via _get_new_interval and
    collects them with _append_interval. Raises m2pException on an
    unknown sort unit.
    """
    map_intervals = []
    
    if self._verbose:
        sys.stderr.write("MarkerEnricher: creating intervals on markers\n")
        sys.stderr.write("MarkerEnricher: map sort by "+str(map_sort_by)+"\n")
    
    if map_sort_by == MapTypes.MAP_SORT_PARAM_BP:
        self.MAP_UNIT = self.MAP_UNIT_PHYSICAL
    elif map_sort_by == MapTypes.MAP_SORT_PARAM_CM:
        self.MAP_UNIT = self.MAP_UNIT_GENETIC
    else:
        raise m2pException("Unrecognized map sort unit "+str(map_sort_by)+".")
    
    # Unlike MapEnricher, markers are not merged into shared intervals,
    # so no previous-position bookkeeping is needed (dead locals
    # prev_position/prev_interval and unused pos_marker removed).
    for map_position in sorted_map:
        pos_chr = map_position.get_chrom_name()
        pos_pos = map_position.get_sort_pos(map_sort_by)
        pos_end_pos = map_position.get_sort_end_pos(map_sort_by)
        
        interval = self._get_new_interval(map_position, pos_chr, pos_pos, pos_end_pos, extend_window)
        
        self._append_interval(map_intervals, interval)
    
    sys.stderr.write("MapEnricher: "+str(len(map_intervals))+" intervals created.\n")
    
    return map_intervals
def get_search_engine(search_type, maps_path, best_score_param, databases_config, aligner_list, threshold_id, threshold_cov, n_threads, verbose = False):
    """Factory for search engines.

    Greedy and exhaustive searches switch to best-score alignment when
    best_score_param is set; hierarchical search always uses the greedy
    engine with hierarchical alignment. Raises m2pException for an
    unknown search_type.
    """
    if search_type == MapsConfig.SEARCH_TYPE_GREEDY:
        engine_class = SearchEngineGreedy
        alignment_type = ALIGNMENT_TYPE_BEST_SCORE if best_score_param else ALIGNMENT_TYPE_GREEDY
    elif search_type == MapsConfig.SEARCH_TYPE_HIERARCHICAL:
        engine_class = SearchEngineGreedy
        alignment_type = ALIGNMENT_TYPE_HIERARCHICAL
    elif search_type == MapsConfig.SEARCH_TYPE_EXHAUSTIVE:
        engine_class = SearchEngineExhaustive
        alignment_type = ALIGNMENT_TYPE_BEST_SCORE if best_score_param else ALIGNMENT_TYPE_GREEDY
    else:
        raise m2pException("Unrecognized search type "+search_type+".")
    
    return engine_class(maps_path, best_score_param, databases_config, aligner_list,
                        threshold_id, threshold_cov, n_threads, alignment_type, verbose)
def get_alignment_engine(search_type, aligner_list, paths_config, ref_type_param, n_threads, verbose):
    """Factory returning the alignment engine for search_type.

    Raises m2pException when search_type matches no known alignment type.
    """
    if search_type == ALIGNMENT_TYPE_GREEDY:
        return GreedyEngine(aligner_list, paths_config, ref_type_param, n_threads, verbose)
    if search_type == ALIGNMENT_TYPE_HIERARCHICAL:
        return HierarchicalEngine(aligner_list, paths_config, ref_type_param, n_threads, verbose)
    if search_type == ALIGNMENT_TYPE_BEST_SCORE:
        return BestScoreEngine(aligner_list, paths_config, ref_type_param, n_threads, verbose)
    raise m2pException("Unrecognized search type " + search_type + ".")
def _map_intervals(self, sorted_map, map_sort_by, extend_window):
    """Merge sorted map positions into (possibly extended) intervals.

    Walks sorted_map in order; consecutive positions on the same
    chromosome whose extended intervals overlap are merged into one
    interval, otherwise the previous interval is flushed to the result.
    Also sets self.MAP_UNIT from map_sort_by (bp -> physical,
    cM -> genetic). Raises m2pException on an unknown sort unit.
    Relies on sorted_map being sorted by chromosome and position.
    """
    map_intervals = []
    
    if self._verbose:
        sys.stderr.write("MapEnricher: creating intervals around markers\n")
        sys.stderr.write("MapEnricher: map sort by "+str(map_sort_by)+", extend interval "+str(extend_window)+"\n")
    
    if map_sort_by == MapTypes.MAP_SORT_PARAM_BP:
        self.MAP_UNIT = self.MAP_UNIT_PHYSICAL
    elif map_sort_by == MapTypes.MAP_SORT_PARAM_CM:
        self.MAP_UNIT = self.MAP_UNIT_GENETIC
    else:
        raise m2pException("Unrecognized map sort unit "+str(map_sort_by)+".")
    
    # Loop over consecutive positions to compare them and create intervals
    prev_position = None
    prev_interval = None
    for map_position in sorted_map:
        pos_marker = map_position.get_marker_id() #position[MapFields.MARKER_NAME_POS]
        pos_chr = map_position.get_chrom_name() #position[MapFields.MARKER_CHR_POS]
        pos_pos = map_position.get_sort_pos(map_sort_by) #float(position[map_sort_by])
        pos_end_pos = map_position.get_sort_end_pos(map_sort_by)
        
        #if self._verbose: sys.stderr.write("\tMap position: "+str(map_position)+"\n")
        
        interval = self._get_new_interval(map_position, pos_chr, pos_pos, pos_end_pos, extend_window)
        
        ## check whether intervals overlap to each other
        if prev_position:
            prev_chr = prev_position.get_chrom_name() #prev_position[MapFields.MARKER_CHR_POS]
            # A chromosome change always closes the previous interval.
            if pos_chr != prev_chr:
                self._append_interval(map_intervals, prev_interval)
            # The same chromosome...
            else:
                # Check if there is overlap
                if MapInterval.intervals_overlap(prev_interval, interval):
                    # Grow the previous interval instead of opening a new one.
                    self._add_position_to_interval(prev_interval, map_position, pos_end_pos, extend_window)
                    interval = prev_interval
                    #if self._verbose: sys.stdout.write("\t\toverlap --> Updated interval "+str(prev_interval)+"\n")
                else:
                    self._append_interval(map_intervals, prev_interval)
        # If first interval
        # else: DO NOTHING
        
        prev_position = map_position
        prev_interval = interval
    
    # Append the last interval
    if prev_interval:
        self._append_interval(map_intervals, prev_interval)
    
    sys.stderr.write("MapEnricher: "+str(len(map_intervals))+" intervals created.\n")
    
    return map_intervals
def create_map(self, query_path, query_sets_ids, map_config, facade, sort_param, multiple_param, tmp_files_dir=None):
    """Abstract hook: concrete subclasses build the mapping results here.

    Always raises m2pException when invoked on the base class.
    """
    raise m2pException("To be implemented in child classes.")
def __gmap(gmap_app_path, n_threads, threshold_id, threshold_cov, query_fasta_path, gmap_dbs_path, db_name, verbose = False):
    """Run GMAP on query_fasta_path against db_name and parse its output.

    threshold_id/threshold_cov are percentages and are converted to the
    0..1 fractions GMAP expects. Returns the records produced by
    __compress(). Raises m2pException when the DB is missing and a plain
    Exception when GMAP exits with a non-zero status.
    """
    # CPCantalapiedra 201701
    ###### Check that DB is available for this aligner
    dbpath = gmap_dbs_path + "/" + db_name
    # the .ref153positions file acts as a witness of a built GMAP DB
    dbpathfile = dbpath + "/" + db_name + ".ref153positions"
    sys.stderr.write("Checking database: "+dbpath+" DB exists for "+ALIGNER+".\n")
    if not (os.path.exists(dbpathfile) and os.path.isfile(dbpathfile)):
        raise m2pException("DB path "+dbpath+" for "+ALIGNER+" aligner NOT FOUND.")
    
    # GMAP
    __command = "".join([gmap_app_path, \
                        " -t ", str(n_threads), \
                        " -B 0 -n ", str(MAX_NUMBER_PATHS_PER_QUERY)])
    
    # thresholds come in as percentages; GMAP takes fractions
    gmap_thres_id = float(threshold_id) / 100.0
    gmap_thres_cov = float(threshold_cov) / 100.0
    
    if verbose: sys.stderr.write("m2p_gmap: Thresholds: ID="+str(gmap_thres_id)+"; COV="+str(gmap_thres_cov)+"\n")
    
    __filter_id = "--min-identity="+str(gmap_thres_id)
    __filter_cov = "--min-trimmed-coverage="+str(gmap_thres_cov)
    
    __db = "".join([" -d ", db_name])
    __db_dir = "".join([" -D ", gmap_dbs_path])
    
    gmap_cmd = " ".join([__command, __filter_id, __filter_cov, __db, __db_dir, query_fasta_path])
    
    if verbose: sys.stderr.write("m2p_gmap: Executing '"+gmap_cmd+"'\n")
    
    # FIX: removed FNULL = open(os.devnull, 'w') — it was never used nor
    # closed, leaking a file handle on every call.
    if verbose:
        p = Popen(gmap_cmd, shell=True, stdout=PIPE, stderr=sys.stderr)
    else:
        p = Popen(gmap_cmd, shell=True, stdout=PIPE, stderr=PIPE)
    
    com_list = p.communicate()
    output = com_list[0]
    output_err = com_list[1]
    retValue = p.returncode
    
    if retValue != 0:
        if verbose:
            # stderr already went to sys.stderr in verbose mode
            raise Exception("m2p_gmap: return != 0. "+gmap_cmd+"\n")
        else:
            raise Exception("m2p_gmap: return != 0. "+gmap_cmd+"\nError: "+str(output_err)+"\n")
    
    if verbose: sys.stderr.write("m2p_gmap: GMAP return value "+str(retValue)+"\n"+str(output_err)+"\n")
    
    results = __compress(output, db_name)
    
    return results
def get_results(self):
    """Return the retrieved results, failing fast when none are loaded.

    Raises m2pException if no retrieve method has populated
    self._results yet.
    """
    # idiom fix: identity comparison with None (PEP 8 E711) instead of !=
    if self._results is None:
        raise m2pException("DatasetsRetriever: error obtaining unloaded results. Call a retrieve method first.")
    
    return self._results
def get_unmapped(self):
    """Return the list of unmapped markers, failing fast when not loaded.

    Raises m2pException if no retrieve method has populated
    self._unmapped yet.
    """
    # idiom fix: identity comparison with None (PEP 8 E711) instead of !=
    if self._unmapped is None:
        raise m2pException("DatasetsRetriever: error obtaining unloaded list of unmapped markers. Call a retrieve method first.")
    
    return self._unmapped
def output_features_header(self, map_as_physical, map_has_cm_pos, map_has_bp_pos, multiple_param, load_annot=False, annotator=None):
    """Abstract printer hook; OutputPrinter subclasses emit the header row.

    Always raises m2pException on the base class.
    """
    raise m2pException(
        "Method has to be implemented in child class inheriting from OutputPrinter"
    )
def __hs_blast(hsblastn_app_path, n_threads, query_fasta_path, hsblastn_dbs_path, db_name, verbose = False):
    """Run HS-Blastn (tabular -outfmt 6) and return its output lines.

    Checks that the DB index exists, executes "hsblastn align" through
    the shell, and returns the non-empty stdout lines. When the output
    contains an error marker, a warning is written to stderr and an
    empty list is returned. Raises m2pException when the DB is missing
    and a plain Exception on a non-zero exit status.
    """
    results = []
    
    # CPCantalapiedra 201701
    ###### Check that DB is available for this aligner
    dbpath = hsblastn_dbs_path + db_name
    dbpathfile = dbpath + ".bwt"  # witness file of a built HS-Blastn index
    sys.stderr.write("Checking database: "+dbpath+" DB exists for "+ALIGNER+".\n")
    if not (os.path.exists(dbpathfile) and os.path.isfile(dbpathfile)):
        raise m2pException("DB path "+dbpath+" for "+ALIGNER+" aligner NOT FOUND.")
    
    ###### HS-Blastn
    blast_command = " ".join([hsblastn_app_path, " align ", \
                        " -num_threads ", str(n_threads), \
                        "-dust no ", \
                        '-outfmt 6'])
    #'-outfmt \"6 qseqid qlen sseqid slen length qstart qend sstart send bitscore evalue pident mismatch gapopen\"'])
    
    blast_db = "".join(["-db ", dbpath])
    blast_query = " ".join(["-query ", query_fasta_path])
    
    blast_cmd = " ".join([blast_command, blast_db, blast_query])
    
    if verbose: sys.stderr.write(os.path.basename(__file__)+": Running '"+blast_cmd+"'\n")
    
    # FIX: removed FNULL = open(os.devnull, 'w') — never used nor closed
    # (leaked a file handle per call).
    if verbose:
        p = Popen(blast_cmd, shell=True, stdout=PIPE, stderr=sys.stderr)
    else:
        p = Popen(blast_cmd, shell=True, stdout=PIPE, stderr=PIPE)
    
    com_list = p.communicate()
    output = com_list[0]
    output_err = com_list[1]
    retValue = p.returncode
    
    if retValue != 0:
        if verbose:
            raise Exception(os.path.basename(__file__)+": HS-Blastn return != 0. "+blast_cmd+"\n"+str(output)+"\n")
        else:
            raise Exception(os.path.basename(__file__)+": HS-Blastn return != 0. "+blast_cmd+"\n"+str(output)+"\n"+str(output_err)+"\n")
    
    if "error" in output or "Error" in output or "ERROR" in output:
        sys.stderr.write("m2p_hs_blast: error in hs-blastn output. We will report 0 results for this alignment.\n")
        sys.stderr.write(output+"\n")
        sys.stderr.write(str(output_err)+"\n")
        results = []
    else:
        if verbose: sys.stderr.write(os.path.basename(__file__)+": HS-Blastn return value "+str(retValue)+"\n")
        # FIX: proper comprehension instead of a side-effect list comprehension
        results = [line for line in output.strip().split("\n") if line != ""]
    
    return results
def get_search_engine(search_type, maps_path, best_score_param, databases_config, aligner_list, threshold_id, threshold_cov, n_threads, verbose=False):
    """Factory for search engines.

    Greedy and exhaustive searches use best-score alignment when
    best_score_param is set, otherwise greedy alignment; hierarchical
    search always uses the greedy engine with hierarchical alignment.
    Raises m2pException for an unknown search_type.
    """
    if search_type == MapsConfig.SEARCH_TYPE_GREEDY:
        engine_class = SearchEngineGreedy
        alignment_type = ALIGNMENT_TYPE_BEST_SCORE if best_score_param else ALIGNMENT_TYPE_GREEDY
    elif search_type == MapsConfig.SEARCH_TYPE_HIERARCHICAL:
        engine_class = SearchEngineGreedy
        alignment_type = ALIGNMENT_TYPE_HIERARCHICAL
    elif search_type == MapsConfig.SEARCH_TYPE_EXHAUSTIVE:
        engine_class = SearchEngineExhaustive
        alignment_type = ALIGNMENT_TYPE_BEST_SCORE if best_score_param else ALIGNMENT_TYPE_GREEDY
    else:
        raise m2pException("Unrecognized search type " + search_type + ".")
    
    return engine_class(maps_path, best_score_param, databases_config,
                        aligner_list, threshold_id, threshold_cov, n_threads,
                        alignment_type, verbose)
def get_map_enricher(show_how, enricher, mapping_results, verbose):
    """Factory: interval-based (SHOW_ON_INTERVALS) or marker-based
    (SHOW_ON_MARKERS) map enricher. Raises m2pException otherwise."""
    if show_how == SHOW_ON_INTERVALS:
        return MapEnricher(enricher, mapping_results, verbose)
    if show_how == SHOW_ON_MARKERS:
        return MarkerEnricher(enricher, mapping_results, verbose)
    raise m2pException("Unrecognized show_how parameter "+str(show_how)+".")
def get_enricher_factory(show_how):
    """Factory of enricher factories, keyed by the show_how display mode.

    Raises m2pException for an unknown show_how value.
    """
    if show_how == SHOW_ON_INTERVALS:
        return EnricherFactory()
    if show_how == SHOW_ON_MARKERS:
        return MarkerEnricherFactory()
    raise m2pException("Unrecognized show_how parameter "+str(show_how)+".")
def load_synonyms(self, synonyms):
    """Load a synonyms file into {first_token: full token list}.

    synonyms may be "" or DatasetsConfig.SYNONYMS_NO to mean "no
    synonyms file"; an empty dict is returned in that case. Raises
    m2pException on a repeated synonyms key.
    """
    dataset_synonyms = {}
    
    if synonyms != "" and synonyms != DatasetsConfig.SYNONYMS_NO:
        # FIX: "with" closes the file even when a duplicated entry aborts
        # the loop (the original never closed the handle).
        with open(synonyms, 'r') as synonyms_file:
            for syn_line in synonyms_file:
                syn_data = syn_line.strip().split()
                syn_key = syn_data[0]
                if syn_key in dataset_synonyms:
                    raise m2pException("Repeated synonyms entry for marker "+syn_key+".")
                else:
                    dataset_synonyms[syn_key] = syn_data
    
    return dataset_synonyms
def output_results(self, aligned, databases_ids = None):
    """Write alignment records to stdout, one ">db_name" section per DB.

    Raises m2pException when databases_ids is missing or empty.
    """
    if not databases_ids:
        raise m2pException("AlignmentsGreedyPrinter needs a list of DBs.")
    
    for db_id in databases_ids:
        name = self._databases_config.get_database_name(db_id)
        sys.stdout.write(">"+str(name)+"\n")
        self.print_header()
        self.print_records_db(aligned, db_id, name)
    
    return
def __process_id(gtf_data, feature_type, file_type):
    """Extract and clean the feature id from a parsed GTF row.

    feature_type selects the transcript or gene id column; for GTF
    files the '"' and ';' decoration is stripped from the id. GFF3 is
    not supported yet. Raises m2pException on unknown feature or file
    types.
    """
    new_id = ""
    
    if feature_type == GTF_TYPE_TRANSCRIPT:
        new_id = gtf_data[GTF_TRANSCRIPT_ID_COL]
    elif feature_type == GTF_TYPE_GENE:
        new_id = gtf_data[GTF_GENE_ID_COL]
    else:
        raise m2pException("Unrecognized GTF type "+str(feature_type)+".")
    
    if file_type == FILE_TYPE_GTF:
        # Remove " and ; from the string.
        # FIX: str.translate(None, '";') is Python-2-only; the replace()
        # chain is equivalent there and also works on Python 3.
        new_id = new_id.replace('"', '').replace(';', '')
    elif file_type == FILE_TYPE_GFF3:
        raise m2pException("GFF3 file type is not supported yet.")
    else:
        raise m2pException("Unrecognized file type "+file_type+".")
    
    return new_id
## END
def get_alignments_printer(search_type, databases_config):
    """Factory for the printer matching the alignment search type.

    Raises m2pException for an unknown search_type.
    """
    if search_type == ALIGNMENT_TYPE_GREEDY:
        return AlignmentsGreedyPrinter(databases_config)
    if search_type == ALIGNMENT_TYPE_HIERARCHICAL:
        return AlignmentsHierarchicalPrinter(databases_config)
    if search_type == ALIGNMENT_TYPE_BEST_SCORE:
        return AlignmentsBestScorePrinter(databases_config)
    raise m2pException("Unrecognized search type "+search_type+".")
def __process_id(gtf_data, feature_type, file_type):
    """Extract and clean the feature id from a parsed GTF row.

    feature_type selects the transcript or gene id column; for GTF
    files the '"' and ';' decoration is stripped from the id. GFF3 is
    not supported yet. Raises m2pException on unknown feature or file
    types.
    """
    new_id = ""
    
    if feature_type == GTF_TYPE_TRANSCRIPT:
        new_id = gtf_data[GTF_TRANSCRIPT_ID_COL]
    elif feature_type == GTF_TYPE_GENE:
        new_id = gtf_data[GTF_GENE_ID_COL]
    else:
        raise m2pException("Unrecognized GTF type " + str(feature_type) + ".")
    
    if file_type == FILE_TYPE_GTF:
        # Remove " and ; from the string.
        # FIX: str.translate(None, '";') is Python-2-only; the replace()
        # chain is equivalent there and also works on Python 3.
        new_id = new_id.replace('"', '').replace(';', '')
    elif file_type == FILE_TYPE_GFF3:
        raise m2pException("GFF3 file type is not supported yet.")
    else:
        raise m2pException("Unrecognized file type " + file_type + ".")
    
    return new_id
## END
def __run_command(cmd):
    """Execute cmd through the shell; raise m2pException on non-zero exit."""
    sys.stderr.write(_SCRIPT+": running command:\n")
    sys.stderr.write("\t"+cmd+"\n")
    
    #p = Popen(cmd, shell=True, stdout=PIPE, stderr=sys.stderr)
    process = Popen(cmd, shell=True)
    process.communicate()
    
    if process.returncode != 0:
        raise m2pException(_SCRIPT+": return != 0. "+cmd+"\n")
    
    sys.stderr.write(_SCRIPT+": return value "+str(process.returncode)+"\n")
    
    return
def output_results(self, aligned, databases_ids=None):
    """Print alignment records grouped per database.

    Each section starts with a ">db_name" line, then the header and the
    records for that DB. Raises m2pException when databases_ids is
    missing or empty.
    """
    if not databases_ids:
        raise m2pException("AlignmentsGreedyPrinter needs a list of DBs.")
    
    for db_id in databases_ids:
        name = self._databases_config.get_database_name(db_id)
        sys.stdout.write(">" + str(name) + "\n")
        self.print_header()
        self.print_records_db(aligned, db_id, name)
    
    return
def get_enricher_factory(show_how):
    """Return the enricher factory for the given display mode.

    SHOW_ON_INTERVALS -> EnricherFactory, SHOW_ON_MARKERS ->
    MarkerEnricherFactory; anything else raises m2pException.
    """
    if show_how == SHOW_ON_INTERVALS:
        return EnricherFactory()
    if show_how == SHOW_ON_MARKERS:
        return MarkerEnricherFactory()
    raise m2pException("Unrecognized show_how parameter " + str(show_how) + ".")
def get_map_enricher(show_how, enricher, mapping_results, verbose):
    """Return the map enricher for the given display mode.

    SHOW_ON_INTERVALS -> MapEnricher, SHOW_ON_MARKERS -> MarkerEnricher;
    anything else raises m2pException.
    """
    if show_how == SHOW_ON_INTERVALS:
        return MapEnricher(enricher, mapping_results, verbose)
    if show_how == SHOW_ON_MARKERS:
        return MarkerEnricher(enricher, mapping_results, verbose)
    raise m2pException("Unrecognized show_how parameter " + str(show_how) + ".")
def get_alignments_printer(search_type, databases_config):
    """Return the alignments printer matching search_type.

    Raises m2pException for an unknown search_type.
    """
    if search_type == ALIGNMENT_TYPE_GREEDY:
        return AlignmentsGreedyPrinter(databases_config)
    if search_type == ALIGNMENT_TYPE_HIERARCHICAL:
        return AlignmentsHierarchicalPrinter(databases_config)
    if search_type == ALIGNMENT_TYPE_BEST_SCORE:
        return AlignmentsBestScorePrinter(databases_config)
    raise m2pException("Unrecognized search type " + search_type + ".")
def __run_command(cmd):
    """Run cmd through the shell, echoing it to stderr.

    Raises m2pException when the command exits with a non-zero status.
    """
    sys.stderr.write(_SCRIPT + ": running command:\n")
    sys.stderr.write("\t" + cmd + "\n")
    
    #p = Popen(cmd, shell=True, stdout=PIPE, stderr=sys.stderr)
    process = Popen(cmd, shell=True)
    process.communicate()
    
    if process.returncode != 0:
        raise m2pException(_SCRIPT + ": return != 0. " + cmd + "\n")
    
    sys.stderr.write(_SCRIPT + ": return value " + str(process.returncode) + "\n")
    
    return
def get_empty_feature(feature_type):
    """Return an empty mapping object of the class matching feature_type.

    Raises m2pException for an unknown feature_type.
    """
    if feature_type == DatasetsConfig.DATASET_TYPE_GENETIC_MARKER:
        return MarkerMapping.get_empty()
    if feature_type == DatasetsConfig.DATASET_TYPE_GENE:
        return GeneMapping.get_empty()
    if feature_type == DatasetsConfig.DATASET_TYPE_ANCHORED:
        return AnchoredMapping.get_empty()
    raise m2pException("Unrecognized feature type "+str(feature_type)+".")
def get_empty_feature(feature_type):
    """Return an empty mapping object of the class matching feature_type.

    Raises m2pException for an unknown feature_type.
    """
    if feature_type == DatasetsConfig.DATASET_TYPE_GENETIC_MARKER:
        return MarkerMapping.get_empty()
    if feature_type == DatasetsConfig.DATASET_TYPE_GENE:
        return GeneMapping.get_empty()
    if feature_type == DatasetsConfig.DATASET_TYPE_ANCHORED:
        return AnchoredMapping.get_empty()
    raise m2pException("Unrecognized feature type " + str(feature_type) + ".")
def load_conf(conf_file, verbose = False):
    """Read a "#"-commented configuration file into a list of token rows.

    Comment lines (leading "#") and empty lines are skipped; every other
    line is stripped and split on single spaces. Returns the list of
    per-line token lists. Raises m2pException if the file cannot be read.
    """
    conf_rows = []
    
    if verbose: sys.stderr.write("Loading configuration file "+conf_file+"...\n")
    
    try:
        # "with" guarantees the handle is closed even on errors
        # (the original left the file open until garbage collection).
        with open(conf_file, 'r') as conf_fh:
            for line in conf_fh:
                if line.startswith("#") or not line.strip():
                    continue  # line.strip() is False if is an empty line "^$"
                if verbose: sys.stderr.write("\t conf line: "+line.strip()+"\n")
                conf_rows.append(line.strip().split(" "))
    except Exception:
        raise m2pException("Error loading configuration file "+conf_file+".")
    
    return conf_rows
## END
def get_alignment_engine(search_type, aligner_list, paths_config, ref_type_param, n_threads, verbose):
    """Return the alignment engine for search_type.

    Raises m2pException when search_type matches no known alignment type.
    """
    if search_type == ALIGNMENT_TYPE_GREEDY:
        return GreedyEngine(aligner_list, paths_config, ref_type_param,
                            n_threads, verbose)
    if search_type == ALIGNMENT_TYPE_HIERARCHICAL:
        return HierarchicalEngine(aligner_list, paths_config, ref_type_param,
                                  n_threads, verbose)
    if search_type == ALIGNMENT_TYPE_BEST_SCORE:
        return BestScoreEngine(aligner_list, paths_config, ref_type_param,
                               n_threads, verbose)
    raise m2pException("Unrecognized search type "+search_type+".")
def _map_intervals(self, sorted_map, map_sort_by, extend_window):
    """Create one interval per marker position (no merging of overlaps).

    Sets self.MAP_UNIT from map_sort_by (bp -> physical, cM -> genetic),
    builds an interval around every position via _get_new_interval and
    collects them with _append_interval. Raises m2pException on an
    unknown sort unit.
    """
    map_intervals = []
    
    if self._verbose:
        sys.stderr.write("MarkerEnricher: creating intervals on markers\n")
        sys.stderr.write("MarkerEnricher: map sort by " + str(map_sort_by) + "\n")
    
    if map_sort_by == MapTypes.MAP_SORT_PARAM_BP:
        self.MAP_UNIT = self.MAP_UNIT_PHYSICAL
    elif map_sort_by == MapTypes.MAP_SORT_PARAM_CM:
        self.MAP_UNIT = self.MAP_UNIT_GENETIC
    else:
        raise m2pException("Unrecognized map sort unit " + str(map_sort_by) + ".")
    
    # Unlike MapEnricher, markers are not merged into shared intervals,
    # so no previous-position bookkeeping is needed (dead locals
    # prev_position/prev_interval and unused pos_marker removed).
    for map_position in sorted_map:
        pos_chr = map_position.get_chrom_name()
        pos_pos = map_position.get_sort_pos(map_sort_by)
        pos_end_pos = map_position.get_sort_end_pos(map_sort_by)
        
        interval = self._get_new_interval(map_position, pos_chr, pos_pos,
                                          pos_end_pos, extend_window)
        
        self._append_interval(map_intervals, interval)
    
    sys.stderr.write("MapEnricher: " + str(len(map_intervals)) + " intervals created.\n")
    
    return map_intervals
def _load_config(self, config_file):
    """Parse the datasets configuration file into this object's registry.

    Fills self._config_dict (dataset_id -> DatasetConfig) and
    self._config_list (dataset ids in file order). Raises m2pException
    when a dataset id appears twice in the file.
    """
    self._config_dict = {}
    self._config_list = []
    
    conf_rows = load_conf(config_file, self._verbose)  # data_utils.load_conf
    #self._config_dict = load_maps(self._config_file, self._verbose) # data_utils.load_maps
    
    for conf_row in conf_rows:
        # Column positions are declared as constants on DatasetsConfig.
        dataset_name = conf_row[DatasetsConfig.DATASET_NAME]
        dataset_id = conf_row[DatasetsConfig.DATASET_ID]
        dataset_type = conf_row[DatasetsConfig.DATASET_TYPE]
        file_path = conf_row[DatasetsConfig.FILE_PATH]
        file_type = conf_row[DatasetsConfig.FILE_TYPE]
        # comma-separated fields are split into lists
        databases = conf_row[DatasetsConfig.DATABASES].strip().split(",")
        synonyms = conf_row[DatasetsConfig.SYNONYMS]
        prefixes = conf_row[DatasetsConfig.PREFIXES].strip().split(",")
        
        dataset = DatasetConfig(dataset_name, dataset_id, dataset_type,
                                file_path, file_type, databases, synonyms,
                                prefixes)
        
        # A leading ">" on the name marks datasets to skip when building.
        if dataset_name.startswith(">"):
            dataset.set_dataset_name(
                dataset_name[1:])  # remove the ">" from the name
            dataset.set_ignore_build(
                True
            )  # mark the dataset as to be ignored in the build datasets script
        
        if dataset_id in self._config_dict:
            raise m2pException("Duplicated dataset " + dataset_id +
                               " in configuration file " + config_file + ".")
        else:
            self._config_dict[dataset_id] = dataset
            self._config_list.append(dataset_id)
    
    return
def get_aligner(aligner_list, n_threads, paths_config, verbose = False): # This is an AlignerFactory
    """AlignerFactory entry point.

    With several aligner names, each one is built recursively (skipping,
    with a warning, any that fails with m2pException) and the set is
    wrapped in a ListAligner. With a single name, the matching concrete
    aligner is returned. Raises m2pException for an unknown aligner name.
    """
    tmp_files_dir = paths_config.get_tmp_files_path()
    
    if len(aligner_list) > 1:
        sub_aligners = []
        for name in aligner_list:
            try:
                sub_aligners.append(AlignersFactory.get_aligner([name], n_threads, paths_config, verbose))
            except m2pException:
                sys.stderr.write("WARNING: exception obtaining "+name+".\nSkipping to next aligner.\n")
        return ListAligner(sub_aligners, tmp_files_dir)
    
    name = aligner_list[0]
    if name == ALIGNER_BLASTN:
        return AlignersFactory.get_aligner_blastn(paths_config, n_threads, verbose)
    if name == ALIGNER_GMAP:
        return AlignersFactory.get_aligner_gmap(paths_config, n_threads, verbose)
    if name == ALIGNER_HSBLASTN:
        return AlignersFactory.get_aligner_hsblastn(paths_config, n_threads, verbose)
    raise m2pException("Unknown aligner type "+str(name)+" when requesting aligner.")
def get_feature(marker_id, dataset_id, dataset_name, feature_type, mapping_result):
    """Build the feature-mapping object matching feature_type.

    Returns a MarkerMapping, GeneMapping or AnchoredMapping tagged as an
    enrichment row; raises m2pException for an unknown feature_type.
    """
    common_args = (marker_id, dataset_id, dataset_name, feature_type,
                   mapping_result, FeatureMapping.ROW_TYPE_ENRICHMENT)
    
    if feature_type == DatasetsConfig.DATASET_TYPE_GENETIC_MARKER:
        return MarkerMapping(*common_args)
    
    if feature_type == DatasetsConfig.DATASET_TYPE_GENE:
        # genes additionally carry a list of annotations (empty on creation)
        return GeneMapping(*common_args, annots=[])
    
    if feature_type == DatasetsConfig.DATASET_TYPE_ANCHORED:
        return AnchoredMapping(*common_args)
    
    raise m2pException("Unrecognized feature type "+str(feature_type)+".")
def __gmap(gmap_app_path, n_threads, threshold_id, threshold_cov, query_fasta_path, gmap_dbs_path, db_name, verbose=False):
    """Run GMAP on query_fasta_path against db_name and parse its output.

    threshold_id/threshold_cov are percentages and are converted to the
    0..1 fractions GMAP expects. Returns the records produced by
    __compress(). Raises m2pException when the DB is missing and a plain
    Exception when GMAP exits with a non-zero status.
    """
    # CPCantalapiedra 201701
    ###### Check that DB is available for this aligner
    dbpath = gmap_dbs_path + "/" + db_name
    # the .ref153positions file acts as a witness of a built GMAP DB
    dbpathfile = dbpath + "/" + db_name + ".ref153positions"
    sys.stderr.write("Checking database: " + dbpath + " DB exists for " + ALIGNER + ".\n")
    if not (os.path.exists(dbpathfile) and os.path.isfile(dbpathfile)):
        raise m2pException("DB path " + dbpath + " for " + ALIGNER + " aligner NOT FOUND.")
    
    # GMAP
    __command = "".join([gmap_app_path, \
                        " -t ", str(n_threads), \
                        " -B 0 -n ", str(MAX_NUMBER_PATHS_PER_QUERY)])
    
    # thresholds come in as percentages; GMAP takes fractions
    gmap_thres_id = float(threshold_id) / 100.0
    gmap_thres_cov = float(threshold_cov) / 100.0
    
    if verbose:
        sys.stderr.write("m2p_gmap: Thresholds: ID=" + str(gmap_thres_id) + "; COV=" + str(gmap_thres_cov) + "\n")
    
    __filter_id = "--min-identity=" + str(gmap_thres_id)
    __filter_cov = "--min-trimmed-coverage=" + str(gmap_thres_cov)
    
    __db = "".join([" -d ", db_name])
    __db_dir = "".join([" -D ", gmap_dbs_path])
    
    gmap_cmd = " ".join([
        __command, __filter_id, __filter_cov, __db, __db_dir, query_fasta_path
    ])
    
    if verbose:
        sys.stderr.write("m2p_gmap: Executing '" + gmap_cmd + "'\n")
    
    # FIX: removed FNULL = open(os.devnull, 'w') — it was never used nor
    # closed, leaking a file handle on every call.
    if verbose:
        p = Popen(gmap_cmd, shell=True, stdout=PIPE, stderr=sys.stderr)
    else:
        p = Popen(gmap_cmd, shell=True, stdout=PIPE, stderr=PIPE)
    
    com_list = p.communicate()
    output = com_list[0]
    output_err = com_list[1]
    retValue = p.returncode
    
    if retValue != 0:
        if verbose:
            # stderr already went to sys.stderr in verbose mode
            raise Exception("m2p_gmap: return != 0. " + gmap_cmd + "\n")
        else:
            raise Exception("m2p_gmap: return != 0. " + gmap_cmd + "\nError: " + str(output_err) + "\n")
    
    if verbose:
        sys.stderr.write("m2p_gmap: GMAP return value " + str(retValue) + "\n" + str(output_err) + "\n")
    
    results = __compress(output, db_name)
    
    return results
def _map_intervals(self, sorted_map, map_sort_by, extend_window):
    """Merge sorted map positions into (possibly extended) intervals.

    Walks sorted_map in order; consecutive positions on the same
    chromosome whose extended intervals overlap are merged into one
    interval, otherwise the previous interval is flushed to the result.
    Also sets self.MAP_UNIT from map_sort_by (bp -> physical,
    cM -> genetic). Raises m2pException on an unknown sort unit.
    Relies on sorted_map being sorted by chromosome and position.
    """
    map_intervals = []
    
    if self._verbose:
        sys.stderr.write(
            "MapEnricher: creating intervals around markers\n")
        sys.stderr.write("MapEnricher: map sort by " + str(map_sort_by) +
                         ", extend interval " + str(extend_window) + "\n")
    
    if map_sort_by == MapTypes.MAP_SORT_PARAM_BP:
        self.MAP_UNIT = self.MAP_UNIT_PHYSICAL
    elif map_sort_by == MapTypes.MAP_SORT_PARAM_CM:
        self.MAP_UNIT = self.MAP_UNIT_GENETIC
    else:
        raise m2pException("Unrecognized map sort unit " +
                           str(map_sort_by) + ".")
    
    # Loop over consecutive positions to compare them and create intervals
    prev_position = None
    prev_interval = None
    for map_position in sorted_map:
        pos_marker = map_position.get_marker_id(
        )  #position[MapFields.MARKER_NAME_POS]
        pos_chr = map_position.get_chrom_name(
        )  #position[MapFields.MARKER_CHR_POS]
        pos_pos = map_position.get_sort_pos(
            map_sort_by)  #float(position[map_sort_by])
        pos_end_pos = map_position.get_sort_end_pos(map_sort_by)
        
        #if self._verbose: sys.stderr.write("\tMap position: "+str(map_position)+"\n")
        
        interval = self._get_new_interval(map_position, pos_chr, pos_pos,
                                          pos_end_pos, extend_window)
        
        ## check whether intervals overlap to each other
        if prev_position:
            prev_chr = prev_position.get_chrom_name(
            )  #prev_position[MapFields.MARKER_CHR_POS]
            # A chromosome change always closes the previous interval.
            if pos_chr != prev_chr:
                self._append_interval(map_intervals, prev_interval)
            # The same chromosome...
            else:
                # Check if there is overlap
                if MapInterval.intervals_overlap(prev_interval, interval):
                    # Grow the previous interval instead of opening a new one.
                    self._add_position_to_interval(prev_interval,
                                                   map_position, pos_end_pos,
                                                   extend_window)
                    interval = prev_interval
                    #if self._verbose: sys.stdout.write("\t\toverlap --> Updated interval "+str(prev_interval)+"\n")
                else:
                    self._append_interval(map_intervals, prev_interval)
        # If first interval
        # else: DO NOTHING
        
        prev_position = map_position
        prev_interval = interval
    
    # Append the last interval
    if prev_interval:
        self._append_interval(map_intervals, prev_interval)
    
    sys.stderr.write("MapEnricher: " + str(len(map_intervals)) +
                     " intervals created.\n")
    
    return map_intervals
def __split_blast(split_blast_path, blast_app_path, n_threads, query_fasta_path, blast_dbs_path, db_name, verbose = False):
    """
    Run split_blast.pl to align the sequences of query_fasta_path against the
    BLAST DB blast_dbs_path+db_name, splitting the query into bins aligned
    with n_threads in parallel.

    Returns a list of tab-separated BLAST result lines (custom outfmt 6),
    or an empty list when the BLAST output itself reports an error.
    Raises m2pException when the DB files are missing; raises Exception when
    the blast command exits with a non-zero status.
    """
    results = []

    # CPCantalapiedra 201701
    ###### Check that DB is available for this aligner
    # A BLAST nucleotide DB is either a single volume (.nsq) or an alias (.nal)
    dbpath = blast_dbs_path + db_name
    dbpathfile = dbpath + ".nsq"
    dbpathfile2 = dbpath + ".nal"

    sys.stderr.write("Checking database: "+dbpath+" DB exists for "+ALIGNER+".\n")

    if not ((os.path.exists(dbpathfile) or os.path.exists(dbpathfile2)) \
            and (os.path.isfile(dbpathfile) or os.path.isfile(dbpathfile2))):
        raise m2pException("DB path "+dbpath+" for "+ALIGNER+" aligner NOT FOUND.")

    ###### Split blast bins
    ###### Retrieve num of fasta seqs to calculate necessary bins
    # Counts FASTA headers. NOTE(review): shell=True with interpolated paths
    # is shell-injection prone if query_fasta_path ever comes from user input.
    p = Popen(" ".join(["cat", query_fasta_path, " | grep -c \"^>\""]), \
              shell=True, stdout=PIPE, stderr=sys.stderr)
    output = p.communicate()[0]
    retValue = p.returncode

    if retValue == 0:
        num_of_seqs = int(output)
        # "//" keeps integer bins on both Python 2 and 3 (plain "/" yields a
        # float on py3, which would be passed to split_blast.pl as e.g. "2.0")
        split_blast_bins = (num_of_seqs // 50) + 1
    else:
        split_blast_bins = 100 # fallback bin count when counting sequences failed

    ###### Split blast
    blast_command = " ".join([split_blast_path+"split_blast.pl", str(n_threads), str(split_blast_bins), blast_app_path, \
                    "-dust no -soft_masking false -task megablast", \
                    '-outfmt \\"6 qseqid qlen sseqid slen length qstart qend sstart send bitscore evalue pident mismatch gapopen\\"'])

    blast_db = "".join(["-db ", dbpath])
    # blast_db = "".join(["-db ", blast_dbs_path, db_name , ".fa"])

    blast_query = " ".join(["-query ", query_fasta_path])

    #blast_cmd = " ".join([ResourcesMng.get_deploy_dir()+blast_command, blast_db, blast_query])
    blast_cmd = " ".join([blast_command, blast_db, blast_query])

    if verbose: sys.stderr.write("m2p_split_blast: Executing '"+blast_cmd+"'\n")

    # In verbose mode the aligner stderr is echoed to ours; otherwise captured.
    # (Removed an os.devnull handle that was opened here but never used/closed.)
    if verbose:
        p = Popen(blast_cmd, shell=True, stdout=PIPE, stderr=sys.stderr)
    else:
        p = Popen(blast_cmd, shell=True, stdout=PIPE, stderr=PIPE)

    com_list = p.communicate()
    output = com_list[0]
    output_err = com_list[1] # None in verbose mode (stderr was not piped)
    retValue = p.returncode

    if retValue != 0:
        if verbose:
            raise Exception("m2p_split_blast: Blast return != 0. "+blast_cmd+"\n"+str(output)+"\n")
        else:
            raise Exception("m2p_split_blast: Blast return != 0. "+blast_cmd+"\n"+str(output)+"\n"+str(output_err)+"\n")

    if "error" in output or "Error" in output or "ERROR" in output:
        sys.stderr.write("m2p_split_blast: error in blast output. We will report 0 results for this alignment.\n")
        sys.stderr.write(output+"\n")
        sys.stderr.write(str(output_err)+"\n")
        results = []
    else:
        if verbose: sys.stderr.write("m2p_split_blast: Blast return value "+str(retValue)+"\n")

        # split_blast.pl prints comments/warnings on stdout: skip "#" lines
        results = [line for line in output.strip().split("\n")
                   if line != "" and not line.startswith("#")]

    return results
def align(self, fasta_path, db, ref_type, threshold_id, threshold_cov): raise m2pException("BaseAligner is an abstract class. 'align' has to be implemented in child class.")
def __split_blast(split_blast_path, blast_app_path, n_threads, query_fasta_path, blast_dbs_path, db_name, verbose=False):
    """
    Run split_blast.pl to align the sequences of query_fasta_path against the
    BLAST DB blast_dbs_path+db_name, splitting the query into bins aligned
    with n_threads in parallel.

    Returns a list of tab-separated BLAST result lines (custom outfmt 6),
    or an empty list when the BLAST output itself reports an error.
    Raises m2pException when the DB files are missing; raises Exception when
    the blast command exits with a non-zero status.
    """
    results = []

    # CPCantalapiedra 201701
    ###### Check that DB is available for this aligner
    # A BLAST nucleotide DB is either a single volume (.nsq) or an alias (.nal)
    dbpath = blast_dbs_path + db_name
    dbpathfile = dbpath + ".nsq"
    dbpathfile2 = dbpath + ".nal"

    sys.stderr.write("Checking database: " + dbpath + " DB exists for " + ALIGNER + ".\n")

    if not ((os.path.exists(dbpathfile) or os.path.exists(dbpathfile2)) \
            and (os.path.isfile(dbpathfile) or os.path.isfile(dbpathfile2))):
        raise m2pException("DB path " + dbpath + " for " + ALIGNER + " aligner NOT FOUND.")

    ###### Split blast bins
    ###### Retrieve num of fasta seqs to calculate necessary bins
    # Counts FASTA headers. NOTE(review): shell=True with interpolated paths
    # is shell-injection prone if query_fasta_path ever comes from user input.
    p = Popen(" ".join(["cat", query_fasta_path, " | grep -c \"^>\""]), \
              shell=True, stdout=PIPE, stderr=sys.stderr)
    output = p.communicate()[0]
    retValue = p.returncode

    if retValue == 0:
        num_of_seqs = int(output)
        # "//" keeps integer bins on both Python 2 and 3 (plain "/" yields a
        # float on py3, which would be passed to split_blast.pl as e.g. "2.0")
        split_blast_bins = (num_of_seqs // 50) + 1
    else:
        split_blast_bins = 100 # fallback bin count when counting sequences failed

    ###### Split blast
    blast_command = " ".join([split_blast_path+"split_blast.pl", str(n_threads), str(split_blast_bins), blast_app_path, \
                    "-dust no -soft_masking false -task megablast", \
                    '-outfmt \\"6 qseqid qlen sseqid slen length qstart qend sstart send bitscore evalue pident mismatch gapopen\\"'])

    blast_db = "".join([ "-db ", dbpath ])
    # blast_db = "".join(["-db ", blast_dbs_path, db_name , ".fa"])

    blast_query = " ".join(["-query ", query_fasta_path])

    #blast_cmd = " ".join([ResourcesMng.get_deploy_dir()+blast_command, blast_db, blast_query])
    blast_cmd = " ".join([blast_command, blast_db, blast_query])

    if verbose: sys.stderr.write("m2p_split_blast: Executing '" + blast_cmd + "'\n")

    # In verbose mode the aligner stderr is echoed to ours; otherwise captured.
    # (Removed an os.devnull handle that was opened here but never used/closed.)
    if verbose:
        p = Popen(blast_cmd, shell=True, stdout=PIPE, stderr=sys.stderr)
    else:
        p = Popen(blast_cmd, shell=True, stdout=PIPE, stderr=PIPE)

    com_list = p.communicate()
    output = com_list[0]
    output_err = com_list[1] # None in verbose mode (stderr was not piped)
    retValue = p.returncode

    if retValue != 0:
        if verbose:
            raise Exception("m2p_split_blast: Blast return != 0. " + blast_cmd + "\n" + str(output) + "\n")
        else:
            raise Exception("m2p_split_blast: Blast return != 0. " + blast_cmd + "\n" + str(output) + "\n" + str(output_err) + "\n")

    if "error" in output or "Error" in output or "ERROR" in output:
        sys.stderr.write(
            "m2p_split_blast: error in blast output. We will report 0 results for this alignment.\n"
        )
        sys.stderr.write(output + "\n")
        sys.stderr.write(str(output_err) + "\n")
        results = []
    else:
        if verbose:
            sys.stderr.write("m2p_split_blast: Blast return value " + str(retValue) + "\n")

        # split_blast.pl prints comments/warnings on stdout: skip "#" lines
        results = [line for line in output.strip().split("\n")
                   if line != "" and not line.startswith("#")]

    return results
def output_features_pos(self, pos, map_as_physical, map_has_cm_pos, map_has_bp_pos, multiple_param, load_annot = False, annotator = None): raise m2pException("Method has to be implemented in child class inheriting from OutputPrinter")
sys.stdout.write(_SCRIPT+": dataset "+dataset_name+" with id "+dataset_id+" created.\n") ### 2) GTF FILES ### elif dataset_file_type == DatasetsConfig.FILE_TYPE_GTF: ### Create the new directory _create_dir(dataset_path) maps_conf_file = __app_path+ConfigBase.MAPS_CONF maps_config = MapsConfig(maps_conf_file, verbose = verbose_param) # align to all the maps if (len(dataset_db_list)==1) and (dataset_db_list[0] == DatasetsConfig.DATABASES_ANY): raise m2pException("GTF files have to be associated to a single database in datasets configuration.") # align to maps which are associated to databases also associated to this dataset else: #paths_conf_file = __app_path+"/"+PATHS_CONF #config_path_dict = read_paths(paths_conf_file) # data_utils.read_paths #__app_path = config_path_dict["app_path"] maps_path = paths_config.get_maps_path() parsed_gtf = False for map_id in maps_config.get_maps(): map_config = maps_config.get_map_config(map_id) map_db_list = map_config.get_db_list()
def create_map(self, query_path, query_sets_ids, map_config, facade, sort_param, multiple_param, tmp_files_dir = None): raise m2pException("To be implemented in child classes.")
### elif dataset_file_type == DatasetsConfig.FILE_TYPE_GTF: ### Create the new directory _create_dir(dataset_path) maps_conf_file = __app_path + ConfigBase.MAPS_CONF maps_config = MapsConfig(maps_conf_file, verbose=verbose_param) # align to all the maps if (len(dataset_db_list) == 1) and (dataset_db_list[0] == DatasetsConfig.DATABASES_ANY): raise m2pException( "GTF files have to be associated to a single database in datasets configuration." ) # align to maps which are associated to databases also associated to this dataset else: #paths_conf_file = __app_path+"/"+PATHS_CONF #config_path_dict = read_paths(paths_conf_file) # data_utils.read_paths #__app_path = config_path_dict["app_path"] maps_path = paths_config.get_maps_path() parsed_gtf = False for map_id in maps_config.get_maps(): map_config = maps_config.get_map_config(map_id) map_db_list = map_config.get_db_list()
def perform_alignment(self, query_fasta_path, dbs_list, databases_config, threshold_id, threshold_cov): raise m2pException("SearchEngine is an abstract class. 'perform_alignment' must be implemented in a child class.")
def perform_alignment(self, query_fasta_path, dbs_list, databases_config, threshold_id, threshold_cov): raise m2pException( "SearchEngine is an abstract class. 'perform_alignment' must be implemented in a child class." )