def closeFile(self, file):
    """Close the given file object, logging (not raising) any IOError.

    :param file: an open file object to close
    """
    try:
        file.close()
    except IOError as exce:
        # str( file) is required here: in the original the file object was
        # concatenated directly into the message, which raises a TypeError
        # and masks the real close error. Also fixed "Enable" -> "Unable".
        Log.log("MAFProcessor.closeFile : Unable to close file '" + str(file) + "'. From:\n\t--> " + str(exce))
def createLogos(self, input_commstruct):
    """Create a sequence logo for every motif listed in the input statistics.

    Motif definitions are looked up in the configured database files; each
    found definition is dumped to a <motif>.tf file in the logos output
    directory and handed to RSAT to build the logo. Motifs with no
    definition are only logged.

    :param input_commstruct: CommStruct whose motifStatistics keys are the motif names
    """
    db_file_path = [os.path.join(self.dbPath, db_file) for db_file in self.dbFiles]
    motif_name_list = input_commstruct.motifStatistics.keys()
    motif_definition = MotifUtils.getMotifsDefinitionFromTF(motif_name_list, db_file_path)
    logos_path = os.path.join(self.outPath, FinalOutputProcessor.LOGOS_DIR_NAME)
    FileUtils.createDirectory(logos_path)
    for motif_name in motif_name_list:
        if motif_name in motif_definition:
            file_name = motif_name + ".tf"
            def_file_path = os.path.join(logos_path, file_name)
            def_file = open(def_file_path, "w")
            try:
                for line in motif_definition[motif_name]:
                    def_file.write(line)
                # The original called 'def_file.flush' without parentheses,
                # which is a silent no-op; the file must be flushed and
                # closed before RSAT reads it.
                def_file.flush()
            finally:
                def_file.close()
            RSATUtils.createLogoFromTF(logos_path, file_name, motif_name)
        else:
            Log.log("FinalOutputProcessor.createLogos : No definition found to create logo for motif : " + motif_name)
def toXMLFile( outpath, pipelines):
    """Serialize the given pipelines to XML.

    Builds a <pipelines> root holding one <pipeline> element per pipeline,
    each with its <component> children and their <param> children, then
    writes the tree to '<outpath>/<pipeline.name>.xml'.

    NOTE(review): the whole 'pipelines_element' (i.e. ALL pipelines) is
    written into each per-pipeline file, and the write happens once per
    pipeline — confirm this duplication is intended.
    """
    # Root element that will hold one <pipeline> child per pipeline
    pipelines_element = Element( PipelineXMLParser.PIPELINES_TAG)
    for pipeline in pipelines:
        pipeline_element = Element( PipelineXMLParser.PIPELINE_TAG)
        pipelines_element.append( pipeline_element)
        pipeline_element.attrib[ PipelineXMLParser.PIPELINE_NAME_ATT] = pipeline.name
        # One <component> element per pipeline component
        for component in pipeline.componentList:
            component_element = Element( PipelineXMLParser.COMPONENT_TAG)
            pipeline_element.append( component_element)
            component_element.attrib[ PipelineXMLParser.COMPONENT_PROCESSOR_ATT] = component.processorName
            # One <param> element per component parameter (name/value stringified)
            for param_name, param_value in component.parameters.iteritems():
                param_element = Element( PipelineXMLParser.PARAM_TAG)
                component_element.append( param_element)
                param_element.attrib[ PipelineXMLParser.PARAM_NAME_ATT] = str( param_name)
                param_element.attrib[ PipelineXMLParser.PARAM_VALUE_ATT] = str( param_value)
        try:
            # Pretty-print indentation, then write the XML file named after the pipeline
            PipelineXMLParser.indent( pipelines_element, 0)
            outfile = os.path.join( outpath, pipeline.name + ".xml")
            ElementTree( pipelines_element).write( outfile)
        except IOError, exce:
            # Write failures are logged, not propagated
            Log.log( "PipelineXMLParser.toXMLFile : Unable to write Pipelines to XML file. From:\n\t---> " + str( exce))
def fromXMLFile( input_filepath):
    """Load a BedSeqAlignmentStatsCommStruct from the given XML file.

    Returns the parsed CommStruct, or None if parsing fails (the failure
    is logged rather than propagated).
    """
    try:
        comm_struct = BedSeqAlignmentStatsCommStruct.getCommStructFromXML( input_filepath)
    except ParsingException as par_exce:
        Log.log( "BedSeqAlignmentStatsCommStruct.fromXMLFile : Unable to get CommStruct from XML file '" + input_filepath + "'. From:\n\t---> " + str( par_exce))
        return None
    return comm_struct
def toXMLFile( self, output_filepath):
    """Write this CommStruct to the given XML file path.

    IOErrors are caught and logged instead of being raised.
    """
    try:
        tree_root = self.convertCommStructToElementTree()
        self.indent( tree_root, 0)
        ElementTree( tree_root).write( output_filepath)
    except IOError as exce:
        Log.log( "BedSeqAlignmentStatsCommStruct.toXMLFile : Unable to write CommStruct to XML file. From:\n\t---> " + str( exce))
def addSequence(self, species, sequence):
    """Register *sequence* (a mutable list of characters) under *species*.

    The first sequence added fixes the alignment length. A later sequence
    whose length differs is logged and padded in place with insertion
    characters up to the alignment length (a longer sequence is only
    logged). Calls with a None species or sequence are ignored.
    """
    if species == None or sequence == None:
        return
    self.sequences[species] = sequence
    seq_length = len(sequence)
    if self.totalLength == 0:
        # First sequence defines the alignment length
        self.totalLength = seq_length
        return
    if seq_length != self.totalLength:
        Log.log( "SequenceAlignment.addSequence : Added sequence does not have the right lenght for this alignment : Alignement length = " + str(self.totalLength) + " DNA sequence length = " + str(seq_length))
        # Pad short sequences in place; range() is empty for longer ones
        missing_count = self.totalLength - len(sequence)
        for _ in range(missing_count):
            sequence.append(Constants.SEQUENCE_INSERTION_CHAR)
def outputClassification(self, input_commstruct, analysis, limit_value, parameter_dic):
    """Export the motif classification to an XML file and install the
    XSL/JS/CSS resources required to render it next to the XML.

    The XML gets an xml-stylesheet processing instruction pointing at
    'classification.xsl', so the resource files are copied into the same
    output directory for the reference to resolve.

    IOErrors are caught and logged rather than raised.
    """
    try:
        # Create and write to file the XML element
        root_element = self.toXML(input_commstruct, analysis, limit_value, parameter_dic)
        self.indent(root_element, 0)
        doc = ET.ElementTree(root_element)
        classification_file_path = os.path.join(
            self.outPath,
            self.component.pipelineName + "_MotifClassification.xml")
        outfile = open(classification_file_path, 'w')
        try:
            outfile.write('<?xml version="1.0" encoding="utf-8"?>\n')
            outfile.write('<?xml-stylesheet type="text/xsl" href="classification.xsl"?>\n')
            doc.write(outfile)
        finally:
            # Ensure the handle is released even if a write fails
            # (the original leaked it on error).
            outfile.close()
        # Copy the rendering resources next to the XML. The original
        # repeated five near-identical shutil.copy calls; a loop over the
        # resource names is equivalent and removes the duplication.
        install_dir = self.component.getParameter(Constants.INSTALL_DIR_PARAM)
        for resource_name in ("classification.xsl",
                              "RSAT_menu.js",
                              "jquery.dataTables.js",
                              "results.css",
                              "peak-footprints.css"):
            shutil.copy(
                os.path.join(install_dir,
                             "resources/xsl/classification/" + resource_name),
                self.outPath)
    except IOError as exce:
        Log.log(
            "ClassificationProcessor.outputClassification : Unable to write classification to XML file. From:\n\t---> "
            + str(exce))
def getMotifsDetailsFromJaspar():
    """Parse the JASPAR flat-file database (MATRIX.txt and
    MATRIX_ANNOTATION.txt) into lookup dictionaries.

    'names' maps matrix number -> "name.version" motif name; 'id' maps
    motif name -> accession; 'family'/'classe'/'type' map motif name ->
    the corresponding annotation value.

    NOTE(review): no return statement is visible in this function, so the
    five dictionaries built here appear to be discarded — confirm whether
    a trailing 'return' was lost. Also note the locals 'id' and 'type'
    shadow Python builtins.
    """
    matrix_path = os.path.join( MotifUtils.JASPAR_FLAT_DB_PATH, "MATRIX.txt")
    matrix_annotation_path = os.path.join( MotifUtils.JASPAR_FLAT_DB_PATH, "MATRIX_ANNOTATION.txt")
    names = {}       # matrix number -> motif name
    id = {}          # motif name -> accession id
    family = {}      # motif name -> family annotation
    type = {}        # motif name -> type annotation
    classe = {}      # motif name -> class annotation
    try:
        matrix_file = open( matrix_path, "r")
        matrix_annotation_file = open( matrix_annotation_path, "r")
        # First pass: read matrix numbers, names and accessions
        for line in matrix_file:
            tokens = line.split()
            if len( tokens) >= 5:
                current_num = tokens[ 0]
                # Motif name is "<name>.<version>"
                current_name = tokens[ 2] + "." + tokens[3]
                current_id = "".join( tokens[ 4:])
                names[ current_num] = current_name
                id[ current_name] = current_id
            else:
                raise ParsingException( "MotifUtils.getMotifsDetailsFromJaspar : Matrix file is not correctly formatted: 5 columns required while " + str( len( tokens)) + " columns are found")
        # Second pass: attach family/class/type annotations to known motifs
        for line in matrix_annotation_file:
            tokens = line.split()
            current_num = tokens[ 0]
            if current_num in names.keys():
                current_key = tokens[ 1]
                current_value = "".join( tokens[2:])
                if current_key == "family":
                    family[ names[ current_num]] = current_value
                elif current_key == "class":
                    classe[ names[ current_num]] = current_value
                elif current_key == "type":
                    type[ names[ current_num]] = current_value
            else:
                Log.log( "MotifUtils.getMotifsDetailsFromJaspar : Motif number was not detected in matrix file : " + current_num)
        matrix_annotation_file.close()
        matrix_file.close()
    except (IOError, ParsingException), exce:
        # Both I/O and format errors are logged, not propagated
        Log.log( "MotifUtils.getMotifsDetailsFromJaspar : unable to read motifs definition. From:\n\t---> " + str( exce))
def fixIndex(self, text_index):
    """Convert an index in the aligned reference text into an index in the
    ungapped reference sequence.

    Counts the insertion characters occurring up to (and including) the
    given position in the reference sequence and subtracts that count.
    Negative indexes, and alignments whose reference species is missing
    (logged), are returned unchanged.
    """
    if text_index < 0:
        return text_index
    if self.referenceSpecies not in self.sequences.keys():
        Log.log( "SequenceAlignement.fixIndex : Reference species is not set for Sequence Alignement : " + self.name)
        return text_index
    ref_sequence = self.sequences[self.referenceSpecies]
    # Do not scan past the end of the reference sequence
    scan_limit = min(text_index + 1, len(ref_sequence))
    insertion_count = 0
    for position in range(scan_limit):
        if ref_sequence[position] == Constants.SEQUENCE_INSERTION_CHAR:
            insertion_count += 1
    return text_index - insertion_count
def getBEDSequenceDictionnary(species, bed_filepath, extension_5p, extension_3p):
    """Parse a BED file into a dictionary of BEDSequence lists keyed by
    BEDSequence.getKey().

    Each region's coordinates are extended by extension_5p/extension_3p
    (start clipped at 0). Lines whose chromosome token starts with '#' are
    skipped (logged), chromosome names shorter than 4 characters get a
    'chr' prefix, and lines with start >= end are logged and skipped.

    :return: dict mapping sequence key -> list of BEDSequence
    :raises ParsingException: when a coordinate field is not a number
    """
    sequence_dic = {}
    input_file = open(bed_filepath)
    try:
        for line in input_file:
            tokens = line.split()
            if len(tokens) > BEDParser._endindex_col:
                chrom = tokens[BEDParser._chrom_col].lower()
                #if chrom[ 0:3] == "chr":
                if chrom[0:1] != "#":
                    # Normalize short chromosome names to 'chrN'
                    if len(chrom) < 4:
                        chrom = "chr" + chrom
                    start = BEDParser.getTokenAsint(tokens[BEDParser._startindex_col])
                    end = BEDParser.getTokenAsint(tokens[BEDParser._endindex_col])
                    if start < end:
                        # Extend the region on both sides, clipping start at 0
                        start = start - extension_5p
                        if start < 0:
                            start = 0
                        end = end + extension_3p
                        bedsequence = BEDSequence(species, chrom, start, end)
                        if len(tokens) > BEDParser._id_col:
                            bedsequence.id = tokens[BEDParser._id_col]
                        bedsequence_key = bedsequence.getKey()
                        # 'in' replaces the deprecated dict.has_key()
                        if bedsequence_key not in sequence_dic:
                            sequence_dic[bedsequence_key] = []
                        sequence_dic[bedsequence_key].append(bedsequence)
                    else:
                        Log.log("BEDParser.getBEDSequenceDictionnary : A sequence has inversed start and end coordinates : " + line)
                else:
                    # NOTE(review): this branch is reached for '#' comment
                    # lines; the message wording is historical.
                    Log.log("No 'chr' in line :" + line)
    except ParsingException as par_exce:
        # Message typo fixed ("mor" -> "not")
        raise ParsingException("BEDParser.getBEDSequenceDictionnary : Some attributes are not numbers. From:\n\t--> " + str(par_exce))
    finally:
        # The original leaked the file handle; always close it.
        input_file.close()
    return sequence_dic
def getMotifStatistics(node_motif, motif):
    """Build a MotifStatistics instance from the <param> children of the
    given motif XML node.

    Recognized parameter names set the chi2 value, histogram graph path,
    histogram or null histogram; unknown names are logged.

    :raises ParsingException: when a parameter has no name or no value
    """
    statistics = MotifStatistics()
    for node_param in node_motif:
        if node_param.tag.lower() != MotifStatisticsCommStruct.PARAM_TAG:
            continue
        param_name = MotifStatisticsCommStruct.getAttribute( node_param, MotifStatisticsCommStruct.PARAM_NAME_ATT, False)
        param_value = MotifStatisticsCommStruct.getAttribute( node_param, MotifStatisticsCommStruct.PARAM_VALUE_ATT, False)
        # Guard clauses: a parameter must carry both a name and a value
        if param_name == None or len(param_name) == 0:
            raise ParsingException( "MotifStatisticsCommStruct.getMotifAttributes : Malformed parameter - unable to retrieve parameter name in motif '" + motif.name + "'")
        if param_value == None or len(param_value) == 0:
            raise ParsingException( "MotifStatisticsCommStruct.getMotifAttributes : Malformed parameter - unable to retrieve parameter value in motif '" + motif.name + "'")
        # Dispatch on the parameter name
        if param_name == MotifStatisticsCommStruct.CHI2_PARAM_NAME:
            statistics.chi2 = MotifStatisticsCommStruct.getTokenAsfloat( param_value, False)
        elif param_name == MotifStatisticsCommStruct.HISTOGRAM_GRAPH_PATH_PARAM_NAME:
            statistics.histogramGraphPath = param_value
        elif param_name == MotifStatisticsCommStruct.HISTOGRAM_PARAM_NAME:
            statistics.histogram = param_value.split( MotifStatisticsCommStruct.HISTOGRAM_ENTRY_SEPARATOR_CHAR)
        elif param_name == MotifStatisticsCommStruct.NULL_HISTOGRAM_PARAM_NAME:
            statistics.nullHistogram = param_value.split( MotifStatisticsCommStruct.HISTOGRAM_ENTRY_SEPARATOR_CHAR)
        else:
            Log.log( "MotifStatisticsCommStruct.getMotifAttributes : Unknown attribute name : " + param_name)
    return statistics
def generateRandomSites(self, motif, motif_file_path, site_number):
    """Generate random binding sites for a motif using the RSAT
    'random-sites' command, then read them back from the FASTA output.

    NOTE(review): the 'sites' list is built from the output file but no
    return statement is visible — confirm whether a trailing
    'return sites' was lost.

    :param motif: motif name, used to derive the output file name
    :param motif_file_path: path to the motif definition passed to -m
    :param site_number: number of sites to generate (-n)
    :raises ExecutionException: if the command fails or the output cannot be read
    """
    # Retrieve method required parameters
    RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
    dir_path = os.path.join(self.component.outputDir,
                            self.component.getComponentPrefix())
    output_path = os.path.join(dir_path, motif + "_sites.fasta")
    # Compose the RSAT random-sites command line
    cmd = os.path.join(RSAT_PATH, "python-scripts/random-sites")
    cmd += " -m " + motif_file_path
    cmd += " -n " + str(site_number)
    cmd += " -o " + output_path
    # Execute the command
    cmd_result = commands.getstatusoutput(cmd)
    if cmd_result[0] != 0:
        Log.log(
            "ImplantSitesProcessor.generateSites : status returned is :"
            + str(cmd_result[0]) + " for command '" + cmd + "'")
        Log.log(
            "ImplantSitesProcessor.generateSites : command output is = \n"
            + str(cmd_result[1]))
        raise ExecutionException(
            "ImplantSitesProcessor.generateSites : Cannot execute random-sites commands. See logs for more details"
        )
    # Parse the result of the command: keep the first token of every
    # non-blank, non-header FASTA line, upper-cased
    sites = []
    try:
        site_file = open(output_path, "r")
        for line in site_file:
            if not line.isspace() and line[0] != ">":
                sites.append(line.split()[0].upper())
        # NOTE(review): close is not in a finally block, so the handle
        # leaks if an IOError occurs mid-read
        site_file.close()
    except IOError, io_exce:
        raise ExecutionException(
            "ImplantSitesProcessor.generateSites : Unable to read motif sites from file '"
            + output_path + "'. From:\n\t---> " + str(io_exce))
class MotifStatisticsCommStruct(CommStruct):
    """Communication structure holding a list of motifs together with
    their per-motif statistics, serializable to an XML file."""

    # --------------------------------------------------------------------------------------
    def __init__(self):
        CommStruct.__init__(self)
        # Ordered list of motifs, and motif -> statistics lookup
        self.motifList = []
        self.motifToStatistics = {}

    # --------------------------------------------------------------------------------------
    def addMotif(self, motif):
        """Append the motif to the list; None is silently ignored."""
        if motif != None:
            self.motifList.append(motif)

    # --------------------------------------------------------------------------------------
    def addMotifStatistics(self, motif, statistics):
        """Associate statistics with a motif; ignored if either is None."""
        if motif != None and statistics != None:
            self.motifToStatistics[motif] = statistics

    # --------------------------------------------------------------------------------------
    def toXMLFile(self, output_filepath):
        """Serialize this CommStruct to the given XML file path, logging
        (not raising) IO and parsing failures."""
        try:
            xml_root = self.convertCommStructToElementTree()
            self.indent(xml_root, 0)
            ElementTree(xml_root).write(output_filepath)
        except IOError as exce:
            Log.log(
                "MotifStatisticsCommStruct.toXMLFile : Unable to write CommStruct to XML file. From:\n\t---> "
                + str(exce))
        except ParsingException as par_exce:
            Log.log(
                "MotifStatisticsCommStruct.toXMLFile : Unable to save CommStruct to XML file. From:\n\t---> "
                + str(par_exce))
def outputProgression(pipeline):
    """Write the progression state of the given pipeline to its
    progression XML file.

    Builds a <pipeline> element carrying status/start/end/elapsed
    attributes, one <component> child per component progression (with
    per-task <task> children while a component is running), then writes
    the tree with an xml-stylesheet processing instruction.

    IOErrors are logged, not raised.
    """
    try:
        # create the pipeline element and set its attributes
        pipeline_element = Element(ProgressionManager.PIPELINE_TAG)
        pipeline_element.attrib[
            ProgressionManager.NAME_ATT] = pipeline.name
        pipeline_prog = ProgressionManager.instance.pipelinesProgressions[
            pipeline]
        pipeline_element.attrib[
            ProgressionManager.STATUS_ATT] = pipeline_prog.status
        pipeline_element.attrib[
            ProgressionManager.START_TIME_ATT] = time.strftime(
                "%b %d %Y %H:%M:%S", time.localtime(pipeline_prog.startTime))
        # End time is "0" until the pipeline has actually finished
        if pipeline_prog.status == ProgressionManager.RUNNING_STATUS or pipeline_prog.status == ProgressionManager.NOT_STARTED_STATUS:
            pipeline_element.attrib[ProgressionManager.END_TIME_ATT] = "0"
        else:
            pipeline_element.attrib[
                ProgressionManager.END_TIME_ATT] = time.strftime(
                    "%b %d %Y %H:%M:%S",
                    time.localtime(pipeline_prog.endTime))
        pipeline_elapsed_time = pipeline_prog.getElapsedTime()
        if pipeline_elapsed_time > 0:
            pipeline_element.attrib[
                ProgressionManager.ELAPSED_TIME_ATT] = str(
                    pipeline_elapsed_time)
        # Parse the component list to create the component elements and set their attributes
        for component_prog in pipeline_prog.componentProgressions:
            component_element = Element(ProgressionManager.COMPONENT_TAG)
            pipeline_element.append(component_element)
            component_element.attrib[
                ProgressionManager.
                NAME_ATT] = component_prog.component.processorShortName
            component_element.attrib[
                ProgressionManager.
                DISPLAY_NAME_ATT] = component_prog.component.processorDisplayName
            component_element.attrib[
                ProgressionManager.
                BRANCH_ATT] = component_prog.component.branch
            # NOTE(review): unlike the other attributes, 'rank' is not
            # passed through str() — confirm it is always a string, since
            # ElementTree serialization expects string attribute values.
            component_element.attrib[
                ProgressionManager.
                RANK_ATT] = component_prog.component.rank
            component_element.attrib[
                ProgressionManager.STATUS_ATT] = component_prog.status
            component_elapsed_time = component_prog.getElapsedTime()
            if component_elapsed_time >= 0:
                component_element.attrib[
                    ProgressionManager.
                    ELAPSED_TIME_ATT] = ProgressionManager.convertTime(
                        component_elapsed_time)
            if component_prog.status == ProgressionManager.RUNNING_STATUS:
                # If component is running, look for tasks to create corresponding elements and attributes
                if len(component_prog.tasks) > 0:
                    for task in component_prog.tasks:
                        task_element = Element(ProgressionManager.TASK_TAG)
                        component_element.append(task_element)
                        task_element.attrib[
                            ProgressionManager.NAME_ATT] = task
                        # Progression rendered as a percentage with one decimal
                        task_element.attrib[
                            ProgressionManager.
                            PROGRESSION_VALUE_ATT] = str(
                                int(
                                    math.ceil(component_prog.
                                              taskProgression[task] *
                                              1000.0)) / float(10)) + "%"
                else:
                    # If no task exists, set the progression attribute at the component level
                    component_element.attrib[
                        ProgressionManager.PROGRESSION_VALUE_ATT] = str(
                            int(
                                math.ceil(component_prog.getProgression() *
                                          1000.0)) / float(10)) + "%"
            # If component is not running and is not 'not started', set the output result
            elif component_prog.status != ProgressionManager.NOT_STARTED_STATUS:
                component_element.attrib[
                    ProgressionManager.
                    RESULT_ATT] = component_prog.component.getOutputFilePath(
                    )
        ProgressionManager.indent(pipeline_element, 0)
        doc = ET.ElementTree(pipeline_element)
        pipeline_output_dir = os.path.join(
            ProgressionManager.instance.outputPath, pipeline.name)
        progression_file = os.path.join(pipeline_output_dir,
                                        Constants.PROGRESSION_XML_FILE)
        #ElementTree( pipeline_element).write( progression_file)
        # Write the XML manually so the declaration and stylesheet
        # processing instruction precede the tree content
        outfile = FileUtils.openFile(progression_file, 'w')
        outfile.write('<?xml version="1.0" encoding="utf-8"?>\n')
        outfile.write('<?xml-stylesheet type="text/xsl" href="' +
                      ProgressionManager.instance.stylesheetPath + '"?>\n')
        doc.write(outfile)
        outfile.close()
    except IOError, exce:
        Log.log(
            "ProgressionManager.outputProgression : Unable to write progresssion to XML file. From:\n\t---> "
            + str(exce))
def compareMotifs(self, reference_motifs, identified_motifs):
    """Compare every reference motif against every identified motif using
    the RSAT 'compare-matrices' command.

    Each motif is dumped to a TRANSFAC file; command failures are logged
    and the loop continues. Task progression is updated every 10 pairs.

    :param reference_motifs: iterable of reference motif objects (with .name)
    :param identified_motifs: dict mapping motif name -> list of motif objects
    """
    # Retrieve required parameters
    RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
    # Prepare output dir (recreated from scratch) and work inside it
    dir_path = os.path.join(self.component.outputDir,
                            self.component.getComponentPrefix())
    shutil.rmtree(dir_path, True)
    os.mkdir(dir_path)
    old_working_dir = os.getcwd()
    os.chdir(dir_path)
    try:
        # Establish the progression
        total_length = len(reference_motifs) * len(identified_motifs.keys())
        ProgressionManager.setTaskProgression("Comparing motifs",
                                              self.component, 0.0)
        progress = 0
        for reference_motif in reference_motifs:
            ref_file_info = self.outputMotifToTransfacFile(
                reference_motif, dir_path)
            for identified_motif_name in identified_motifs.keys():
                progress += 1
                # A motif is never compared against itself
                if reference_motif.name != identified_motif_name:
                    count = 0
                    identified_motif_list = identified_motifs[
                        identified_motif_name]
                    for identified_motif in identified_motif_list:
                        count += 1
                        ident_file_info = self.outputMotifToTransfacFile(
                            identified_motif, dir_path)
                        # Compose the compare-matrices command line with all required options
                        cmd = os.path.join(RSAT_PATH,
                                           "perl-scripts/compare-matrices")
                        cmd += " -file1 " + ref_file_info[1] + " -format1 tf"
                        cmd += " -file2 " + ident_file_info[1] + " -format2 tf"
                        cmd += " -mode matches"
                        cmd += " -return all"
                        if len(identified_motif_list) > 1:
                            # str( count) is required: the original
                            # concatenated the int directly, raising a
                            # TypeError on this path (it also incremented
                            # count a second time here, which is dropped
                            # so output files are numbered sequentially).
                            cmd += " -o " + reference_motif.name + "_" + identified_motif_name + "_" + str(count)
                        else:
                            cmd += " -o " + reference_motif.name + "_" + identified_motif_name
                        # Execute the command
                        cmd_result = commands.getstatusoutput(cmd)
                        if cmd_result[0] != 0:
                            Log.log(
                                "CompareIdentifiedMotifsProcessor.compareMotifs : status returned is :"
                                + str(cmd_result[0]) + " for command '" + cmd
                                + "'")
                            Log.log(
                                "CompareIdentifiedMotifsProcessor.compareMotifs : command output is = \n"
                                + str(cmd_result[1]))
                            continue
                if progress % 10 == 0:
                    ProgressionManager.setTaskProgression(
                        "Identifying motifs", self.component,
                        progress / float(total_length))
    finally:
        # Always return to the initial working dir, even on error
        # (the original could leave the process in dir_path).
        os.chdir(old_working_dir)
def executeClustalW(self, input_commstruct):
    """Realign every multiple alignment of the input CommStruct with
    ClustalW, replacing each BED sequence's alignment list with the
    parsed ClustalW results.

    Alignments are exported to a FASTA file (overwritten per alignment),
    the ClustalW command — composed once and reused for every run — is
    executed, and its output file is parsed. Command failures are logged
    and that alignment is skipped.
    """
    # Retrieve the method parameters
    desired_species_line = self.getParameter(
        MSAProcessor.DESIRED_SPECIES_LIST_PARAM, False)
    if desired_species_line != None:
        desired_species_list = desired_species_line.split()
    else:
        desired_species_list = []
    command_options_line = self.getParameter(
        MSAProcessor.COMMAND_OPTIONS_PARAM, False)
    if command_options_line == None:
        command_options = ""
    else:
        command_options = command_options_line
    # Prepare the outputdir for FASTA file export
    file_info = self.prepareOutputDir()
    dir_path = file_info[0]
    file_name = file_info[1]
    file_path = os.path.join(dir_path, file_name + ".fasta")
    # Change directory to output dir
    working_dir = os.getcwd()
    os.chdir(dir_path)
    command = self.component.getParameter(Constants.CLUSTALW_COMMAND_PARAM)
    # Compose the ClustalW command line with all required options.
    # NOTE(review): output_filepath is file_path + "result.txt" with no
    # separator, yielding "<name>.fastaresult.txt" — confirm intended.
    output_filepath = file_path + "result.txt"
    cmd = command
    cmd += " -INFILE=" + file_path
    cmd += " -ALIGN"
    cmd += " -TYPE=DNA"
    cmd += " -OUTFILE=" + output_filepath
    cmd += " " + command_options
    for bed_sequence in input_commstruct.bedToMA.keys():
        final_result = []
        for alignment in input_commstruct.bedToMA[bed_sequence]:
            # output the alignment to FASTA file (reused input path)
            self.outputAlignmentToFASTAFile(alignment, file_path,
                                            desired_species_list)
            # Execute the command
            cmd_result = commands.getstatusoutput(cmd)
            if cmd_result[0] != 0:
                Log.log(
                    "MSAProcessor.executeClustalW : status returned is :" +
                    str(cmd_result[0]) + " for command '" + cmd + "'")
                Log.log(
                    "MSAProcessor.executeClustalW : command output is = \n" +
                    str(cmd_result[1]))
                continue
            # Parse the result of the compare-matrices command to get the result list
            final_result.append(
                self.parseClustalWResult(output_filepath,
                                         desired_species_list))
        # NOTE(review): final_result is assigned [] just above, so this
        # condition is always true — kept for fidelity.
        if final_result != None:
            input_commstruct.bedToMA[bed_sequence] = final_result
    # Change dir to previous working dir
    os.chdir(working_dir)
def buildHistogramsAndGraphs(self, input_commstruct, histogram_interval):
    """For every motif with hit statistics, build its hit-distance
    histogram (RSAT classfreq), compare it to a homogeneous null
    distribution via a chi2 test, render PNG and PDF graphs (RSAT
    XYgraph), and optionally output a peak-score histogram.

    All produced file paths and chi2 results are stored as attributes on
    the motif's MotifStatistics entry. Individual RSAT command failures
    are logged and that motif is skipped.

    :param input_commstruct: CommStruct holding motifStatistics per motif
    :param histogram_interval: class interval passed to classfreq / histograms
    :raises ExecutionException: on IOError while building the outputs
    """
    # Retrieve the algorithm parameters
    RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
    # Compute the statistics of the motifs
    Log.info(
        "HistogramProcessor.buildHistogramsAndGraphs : collecting motifs statistics"
    )
    statistics = self.computeMotifStatistics(input_commstruct, )
    hits_distances = statistics[0]
    motif_size_min = statistics[1]
    motif_size_max = statistics[2]
    hits_peakscore = statistics[3]
    #print "motif_size_max = " + str( motif_size_max)
    # Compute the uniform distribution probabilities
    Log.info(
        "HistogramProcessor.buildHistogramsAndGraphs : computing uniform distribution"
    )
    uniform_distributions = self.computeUniformDistributions(
        input_commstruct, histogram_interval, motif_size_min, motif_size_max)
    # Build the output CommStruct
    Log.info(
        "HistogramProcessor.buildHistogramsAndGraphs : building histogram and graphs"
    )
    # Execute the RSAT commands and computations
    try:
        # Prepare the output directories
        dir_path = os.path.join(self.component.outputDir,
                                self.component.getComponentPrefix())
        shutil.rmtree(dir_path, True)
        FileUtils.createDirectory(dir_path, 0777)
        # Parse the motif list and execute the computations and commands for each of them
        ProgressionManager.setTaskProgression("Building motifs histogram",
                                              self.component, 0.0)
        total_motif_number = len(hits_distances.keys())
        count_motif = 0
        for motif_name in hits_distances.keys():
            count_motif += 1
            motif_stats = input_commstruct.motifStatistics[motif_name]
            motif_id = motif_stats.motifID
            motif_size = motif_stats.motifSize
            hit_number = motif_stats.getAttributeAsint(
                MotifStatistics.MOTIF_HIT_SCORE)
            # Initialize the motif prefix ID (motif_id-based prefix disabled)
            #if motif_id != None and len( motif_id) > 0:
            #    prefix_id = "_" + motif_id
            #else:
            #    prefix_id = ""
            prefix_id = ""
            # save the stats to a tabbed file for classfreq command
            input_path = os.path.join(
                dir_path, motif_name + prefix_id + "_Distances.tab")
            self.outputMotifStatistics(hits_distances[motif_name],
                                       input_path)
            # execute the classfreq command
            histo_path = os.path.join(
                dir_path,
                motif_name + prefix_id + "_Distances_histogram.tab")
            cmd = os.path.join(RSAT_PATH, "perl-scripts/classfreq")
            cmd += " -i '" + input_path + "'"
            cmd += " -col 1"
            cmd += " -ci " + str(histogram_interval)
            cmd += " -o '" + histo_path + "'"
            cmd_result = commands.getstatusoutput(cmd)
            if cmd_result[0] != 0:
                Log.log(
                    "HistogramProcessor.buildHistogramsAndGraphs : status returned is :"
                    + str(cmd_result[0]) + " for command '" + cmd + "'")
                Log.log(" Command output is = \n" + str(cmd_result[1]))
                continue
            # retrieve the classfreq results from output file
            motif_distribution = self.parseClassfreqResults(histo_path)
            # compute the homogen distribution for the current motif
            null_distribution = self.computeMotifNullDistribution(
                uniform_distributions[motif_size], hit_number)
            # Save both histograms to same file to create a common graph
            all_histo_path = os.path.join(
                dir_path,
                motif_name + prefix_id + "_Distances_histograms.tab")
            label1 = motif_name
            label2 = "Homogeneous model"
            self.outputAllHistograms(motif_distribution, label1,
                                     null_distribution, label2,
                                     histogram_interval, all_histo_path)
            motif_stats.setAttribute(
                MotifStatistics.MOTIF_DISTANCE_HISTOGRAM, all_histo_path)
            # Execute a chi2 test on the motif distribution against the motif homogen distribution
            chi2_test = RSATUtils.executeChi2Test(all_histo_path, 4, 5)
            if chi2_test != None:
                motif_stats.setAttribute(MotifStatistics.MOTIF_CHI2,
                                         chi2_test[0])
                motif_stats.setAttribute(MotifStatistics.MOTIF_CHI2_PVALUE,
                                         chi2_test[1])
            else:
                # Neutral defaults when the chi2 test could not be run
                motif_stats.setAttribute(MotifStatistics.MOTIF_CHI2, "0.0")
                motif_stats.setAttribute(MotifStatistics.MOTIF_CHI2_PVALUE,
                                         "1.0")
            # Build the PNG graph corresponding to all histograms using RSAT XYGraph command
            graph_path = os.path.join(
                dir_path, motif_name + prefix_id + "_Distances.png")
            cmd = os.path.join(RSAT_PATH, "perl-scripts/XYgraph")
            cmd += " -i '" + all_histo_path + "'"
            cmd += " -title1 '" + self.component.pipelineName + "'"
            cmd += " -title2 ''"
            #cmd += " -xcol 3 -ycol 4,5"
            cmd += " -xcol 3 -ycol 4"
            cmd += " -xleg1 'Distance to peak maximum'"
            cmd += " -yleg1 'Number of motif hits'"
            cmd += " -legend -header -format png -fhisto"
            cmd += " -o '" + graph_path + "'"
            cmd_result = commands.getstatusoutput(cmd)
            if cmd_result[0] != 0:
                Log.log(
                    "HistogramProcessor.buildHistogramsAndGraphs : status returned is :"
                    + str(cmd_result[0]) + " for command '" + cmd + "'")
                Log.log(" Command output is = \n" + str(cmd_result[1]))
                continue
            motif_stats.setAttribute(
                MotifStatistics.MOTIF_DISTANCE_HISTOGRAM_GRAPH, graph_path)
            # Build the PDF graph corresponding to all histograms using RSAT XYGraph command
            graph_path_pdf = os.path.join(
                dir_path, motif_name + prefix_id + "_Distances.pdf")
            cmd = os.path.join(RSAT_PATH, "perl-scripts/XYgraph")
            cmd += " -i '" + all_histo_path + "'"
            cmd += " -title1 '" + self.component.pipelineName + "'"
            cmd += " -title2 ''"
            #cmd += " -xcol 3 -ycol 4,5"
            cmd += " -xcol 3 -ycol 4"
            cmd += " -xleg1 'Distance to peak maximum'"
            cmd += " -yleg1 'Number of motif hits'"
            cmd += " -legend -header -format pdf -fhisto"
            cmd += " -o '" + graph_path_pdf + "'"
            cmd_result = commands.getstatusoutput(cmd)
            if cmd_result[0] != 0:
                Log.log(
                    "HistogramProcessor.buildHistogramsAndGraphs : status returned is :"
                    + str(cmd_result[0]) + " for command '" + cmd + "'")
                Log.log(" Command output is = \n" + str(cmd_result[1]))
                continue
            motif_stats.setAttribute(
                MotifStatistics.MOTIF_DISTANCE_HISTOGRAM_GRAPH_PDF,
                graph_path_pdf)
            # Output the histogram of motif peak scores, but only when
            # there is more than one score and at least one non-zero value
            if len(hits_peakscore[motif_name]) > 1:
                valuable = False
                for value in hits_peakscore[motif_name]:
                    if value != 0:
                        valuable = True
                        break
                if valuable:
                    score_histo_prefix = motif_name + prefix_id + "_PeakScores"
                    title1 = self.component.pipelineName
                    title2 = "Distribution of peak score for " + motif_name + prefix_id
                    legendx = "Peak Score"
                    legendy = "Number of occurence"
                    pathes = RSATUtils.outputHistogram(
                        hits_peakscore[motif_name], histogram_interval,
                        dir_path, score_histo_prefix, title1, title2,
                        legendx, legendy, None, True)
                    motif_stats.setAttribute(
                        MotifStatistics.MOTIF_PEAK_SCORE_HISTOGRAM,
                        pathes[0])
                    motif_stats.setAttribute(
                        MotifStatistics.MOTIF_PEAK_SCORE_HISTOGRAM_GRAPH,
                        pathes[1])
            # Update the progression
            if count_motif % 10 == 0:
                ProgressionManager.setTaskProgression(
                    "Building motifs histogram", self.component,
                    count_motif / float(total_motif_number))
    except IOError, io_exce:
        raise ExecutionException(
            "HistogramProcessor.buildHistogramsAndGraphs : Unable to build histogram and graph. From:\n\t---> "
            + str(io_exce))
def testFinalMSA(self, final_seq_align, maf_blocks, bed_sequence):
    """Self-check of the composed MSA: rebuild the reference-species
    sequence directly from the succession of MAF blocks and compare it to
    the reference sequence stored in final_seq_align.

    Mismatches are logged (never raised); a secondary substring search
    distinguishes a real composition error from an index-computation
    error.
    """
    # Note: all the indexes in this first part are coordinates in the genome
    # initialize the list that represents the succession of sequence from the MAF blocks
    long_seq = []
    # if necessary, add dots at the beginning of the long_seq representing missing information
    seq_start = maf_blocks[0].sequences[self.referenceSpecies].indexStart
    if seq_start > bed_sequence.indexStart:
        long_seq.extend([Constants.SEQUENCE_INIT_CHAR] *
                        (seq_start - bed_sequence.indexStart))
    previous_end = seq_start
    # Compose the long_seq by the succession of sequence from MAF blocks
    for maf_block in maf_blocks:
        current_start = maf_block.sequences[
            self.referenceSpecies].indexStart
        # inserts dots between two sequences if they are not successive
        long_seq.extend(['.'] * (current_start - previous_end))
        # create a list from the MAF sequence text
        text_list = list(maf_block.sequences[self.referenceSpecies].text)
        # localize the insertion characters at the beginning of the list
        begin = 0
        for i in range(len(text_list)):
            if text_list[i] == Constants.SEQUENCE_INSERTION_CHAR:
                begin += 1
            else:
                break
        # localize the insertion characters at the end of the list
        end = len(text_list)
        for i in range(len(text_list) - 1):
            if text_list[-i - 1] == Constants.SEQUENCE_INSERTION_CHAR:
                end = end - 1
            else:
                break
        # insert the MAF sequence in the long_seq ignoring insertion character at the beginning and at the end
        long_seq.extend(text_list[begin:end])
        previous_end = current_start + maf_block.sequences[
            self.referenceSpecies].textLength
    # if necessary, add dots at the end of the long_seq representing missing information
    if previous_end < bed_sequence.indexEnd:
        long_seq.extend([Constants.SEQUENCE_INIT_CHAR] *
                        (bed_sequence.indexEnd - previous_end))
    # compute the index at which the BED sequence may start in the long_seq index coordinates
    if bed_sequence.indexStart < maf_blocks[0].sequences[
            self.referenceSpecies].indexStart:
        index_seq_start = 0
    else:
        index_seq_start = bed_sequence.indexStart - maf_blocks[
            0].sequences[self.referenceSpecies].indexStart
    # compute the index at which the BED sequence may end in the long_seq index coordinates
    index_seq_end = index_seq_start + bed_sequence.indexEnd - bed_sequence.indexStart
    # Note : in this second part, we have to consider that insertion characters exists in the sequence text
    # to compute the true start and end index of the BED sequence in the long_seq
    # modify the start index according to the number of insertion characters
    indice = 0
    index = 0
    count = 0
    while indice <= index_seq_start:
        if long_seq[index] != Constants.SEQUENCE_INSERTION_CHAR:
            indice += 1
        else:
            count += 1
        index += 1
    seq_index_start = index_seq_start + count
    # modify the end index according to the number of insertion characters
    indice = 0
    index = 0
    count = 0
    while indice < index_seq_end:
        if long_seq[index] != Constants.SEQUENCE_INSERTION_CHAR:
            indice += 1
        else:
            count += 1
        index += 1
    seq_index_end = index_seq_end + count
    # retrieve the sub-string of long_seq that should represent the BED sequence
    result = long_seq[seq_index_start:seq_index_end]
    # compare the string obtained above with the one of the MSA composed in the previous method
    str_result = "".join(result)
    str_final = "".join(final_seq_align.sequences[self.referenceSpecies])
    # If the results are not equal, something is wrong
    if str_result != str_final:
        Log.log(
            "MAFProcessor.testFinalMSA: an error has been detected on the recomposed sequence"
        )
        Log.log("Composed MSA sequence = " + str_final)
        Log.log("Test MSA sequence = " + str_result)
        str_long = "".join(long_seq)
        # try to find the MSA sequence in the long_seq directly
        index_test = str_long.find(str_final)
        # if the MSA sequence is not found, we are facing a true issue
        if index_test < 0:
            Log.log(
                " The error is confirmed since the composed MSA sequence does not appear in the string composed by the succession of sequences from MAF file : "
            )
            Log.log(" Succession sequence = " + str_long)
            Log.log(" Bed seq start = " + str(bed_sequence.indexStart))
            Log.log(" Bed seq end = " + str(bed_sequence.indexEnd))
            Log.log(" Associated MAF blocks : ")
            for maf_block in maf_blocks:
                Log.log(maf_block.toString())
        # if the MSA sequence is found, the error comes from an index computation issue
        else:
            Log.log(
                "The error is not confirmed since the composed MSA sequence appears at index "
                + str(index_test) +
                " in the string composed by the succession of sequences from MAF file:"
            )
            Log.log(str_long)
def generateRandomMSA(self, msa_length, bedseq_number, max_length, output_commstruct): # Retrieve method required parameters RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM) dir_path = os.path.join(self.component.outputDir, self.component.getComponentPrefix()) file_path = os.path.join(dir_path, "random_sequences.txt") try: # Execute the RSAT random-seq command cmd = os.path.join(RSAT_PATH, "perl-scripts/random-seq") cmd += " -l " + str(int(max_length * 1.5)) cmd += " -n " + str(bedseq_number) cmd += " -a a:t 0.3 c:g 0.2" cmd += " -type DNA" cmd += " -format multi" cmd += " -o " + file_path Log.info( "GenerateMSAProcessor.generateMSA : starting random sequence generation. Command used is : " + cmd) # Execute the command cmd_result = commands.getstatusoutput(cmd) if cmd_result[0] != 0: Log.log( "GenerateMSAProcessor.generateMSA : status returned is :" + str(cmd_result[0]) + " for command '" + cmd + "'") Log.log( "GenerateMSAProcessor.generateMSA : command output is = \n" + str(cmd_result[1])) raise ExecutionException( "GenerateMSAProcessor.generateMSA : Cannot execute random-seq commands. 
See logs for more details" ) # Read the output file to get the random sequences sequence_list = [] sequence_file = open(file_path, "r") for line in sequence_file: sequence_list.append(line.split()[0]) # Generate the species list species_list = [] species_list.append(output_commstruct.baseSpecies) for index in range(msa_length - 1): species_list.append("Species" + str(index + 1)) # Create and fill the MSA for each BED sequence count_seq = 0 for chrom in output_commstruct.bedSequencesDict.keys(): for bedseq in output_commstruct.bedSequencesDict[chrom]: msa = SequenceAlignment() msa.name = bedseq.name + "_1" msa.referenceSpecies = output_commstruct.baseSpecies seq_length = bedseq.indexEnd - bedseq.indexStart sequence = list(sequence_list[count_seq][:seq_length]) for index in range(msa_length): msa.addSequence(species_list[index], sequence) #msa.addSequence( species_list[index], list(['.'] * len( sequence))) msa.finalizeSequences() output_commstruct.addSequenceAlignment(bedseq, msa) count_seq += 1 except IOError, io_exce: raise ExecutionException( "GenerateMSAProcessor.generateMSA : Unable to save/read random sequences file. From:\n\t---> " + str(io_exce))
def parseBlock(self, input_file, indexed=False):
    """
    Parse one MSA block from a MAF file starting at the current file position.

    The first 's' line of the block must belong to the reference species; the
    block is kept only if it intersects at least one BED sequence. When kept,
    the remaining 's' lines of the block are stored (filtered by
    desiredSpeciesList when that list is non-empty).

    input_file : open MAF file, positioned at (or before) the block to read.
    indexed    : True when the caller jumped here through an index file; in
                 that case a non-matching block is an error (returns False).

    Returns True when parsing may continue, False when it must stop.
    """
    new_block = None
    # Search for the first sequence line of the block and verify that the
    # block matches at least one of the BED sequences
    while 1:
        line = input_file.readline()
        if len(line) == 0:
            # End of file reached
            break
        # Check if the line is not void
        elif not line.isspace():
            tokens = line.split()
            # Check if the line contains enough tokens to be a sequence line
            if tokens != None and len(tokens) > MAFProcessor._text_col:
                if tokens[MAFProcessor._lineType_col] == 's':
                    # Verify if current sequence species match with reference species
                    spec_chrom = SequenceUtils.getSpeciesAndChrom(tokens[MAFProcessor._speciesChrom_col])
                    species = spec_chrom[0]
                    chromosom = spec_chrom[1]
                    if species == self.referenceSpecies:
                        # Search for BED Sequences having the same <species>.<chromosom>
                        bed_sequences = self.getAssociatedBEDSequences(species + "." + chromosom)
                        if bed_sequences != None and len(bed_sequences) > 0:
                            strand = tokens[MAFProcessor._strand_col]
                            bp_start = self.computeStartIndex(tokens, strand)
                            text_length = self.getIntValue(tokens[MAFProcessor._textlength_col])
                            # Search for BEDSequences intersecting the current sequence
                            new_block = self.findMatchingBEDSequences(bed_sequences, bp_start, text_length, strand)
                            if new_block != None:
                                text = tokens[MAFProcessor._text_col]
                                new_block.addSequence(species, chromosom, bp_start, text_length, text)
                                break
                            else:
                                # This block does not intersect any BEDSequence.
                                # If indexation is used, something is wrong
                                if indexed == True:
                                    return False
                                else:
                                    return True
                        else:
                            # This block does not match the chromosom of any BED Sequences
                            # Parsing must be stopped if the file contains only information of one chromosom
                            # Alert is raised in case an index file is used
                            if indexed == True:
                                Log.log("MAFProcessor.parseBlock : No BED sequences corresponds to this MSA Block")
                            return False
                    else:
                        Log.log("MAFProcessor.parseBlock : The first sequence of the parsed block does not correspond to the reference species : " + line)
                        return False
            else:
                # This block is void but we have to continue the parsing
                return True
    # If the block matches with at least one of the BED sequences,
    # parse the rest of the block and store the information
    if new_block != None:
        while 1:
            line = input_file.readline()
            if len(line) == 0:
                break
            elif not line.isspace():
                tokens = line.split()
                if tokens != None and len(tokens) > MAFProcessor._text_col:
                    if tokens[MAFProcessor._lineType_col] == 's':
                        spec_chrom = SequenceUtils.getSpeciesAndChrom(tokens[MAFProcessor._speciesChrom_col])
                        species = spec_chrom[0]
                        chromosom = spec_chrom[1]
                        # Keep only the desired species (empty list means keep all)
                        if len(self.desiredSpeciesList) == 0 or (len(self.desiredSpeciesList) > 0 and species in self.desiredSpeciesList):
                            bp_start = self.getIntValue(tokens[MAFProcessor._startindex_col])
                            text_length = self.getIntValue(tokens[MAFProcessor._textlength_col])
                            text = tokens[MAFProcessor._text_col]
                            new_block.addSequence(species, chromosom, bp_start, text_length, text)
                            # Remember every species actually seen during parsing
                            if not species in self.parsedSpeciesList:
                                self.parsedSpeciesList.append(species)
            # Block ends at the first empty line
            else:
                break
    else:
        Log.log("MAFProcessor.parseBlock : The parsed block does not contains any sequence (line starting with 's')")
        return False
    return True
def parseBlockListWithIndex(self, index_file, input_file):
    """
    Walk an index file and parse, from the MAF file, only the MSA blocks
    whose coordinates intersect at least one BED sequence.

    index_file : open index file; its header declares whether the index is
                 chromosome-specialized and/or ordered by coordinates.
    input_file : open MAF file; seeked to each matching block position.

    Raises ExecutionException when an indexed block fails to parse
    (indicating stale indexes).
    """
    is_chrom_file = False
    ordered = False
    spec_chrom = None
    # Read the index file header to know if the file is chromosom specialized and ordered
    while 1:
        line = index_file.readline()
        if len(line) == 0:
            Log.log("MAFProcessor.parseBlockListWithIndex : index file '" + index_file.name + "' has no header line : skipping it")
            return
        else:
            tokens = line.split()
            if tokens != None and tokens[0] == Constants.COMMENT_CHAR:
                # Header token 1: chromosome name, or MIXED when not specialized
                if len(tokens) > 1 and tokens[1] != Constants.MIXED:
                    is_chrom_file = True
                    spec_chrom = tokens[1]
                # Header token 2: ORDERED flag
                if len(tokens) > 2 and tokens[2] == Constants.ORDERED:
                    ordered = True
            break
    # If the file is specialized by chromosom, get once for all the BED sequences
    # concerned by this species and chromosom
    if is_chrom_file == True:
        bed_sequences = self.getAssociatedBEDSequences(spec_chrom)
        if bed_sequences == None or len(bed_sequences) == 0:
            Log.info("MAFProcessor.parseBlockListWithIndex : No BED sequences matching for file :" + index_file.name)
            return
    else:
        bed_sequences = None
    # If file is ordered, compute the peaks extremum in order to optimize the parsing
    # NOTE(review): if the header says ORDERED but the file is NOT chromosome
    # specialized, bed_sequences is still None here and this loop would raise
    # a TypeError — presumably ordered index files are always specialized;
    # confirm against the index writer.
    if ordered == True:
        min_start = 1000000000
        max_end = 0
        for bed_sequence in bed_sequences:
            if bed_sequence.indexStart < min_start:
                min_start = bed_sequence.indexStart
            if bed_sequence.indexEnd > max_end:
                max_end = bed_sequence.indexEnd
    # Parse the index file: one data line = <spec_chrom> <start> <end> <file position>
    while 1:
        line = index_file.readline()
        if len(line) == 0:
            break
        else:
            tokens = line.split()
            if tokens != None and len(tokens) == 4:
                # retrieve the index information
                spec_chrom = tokens[0]
                start = self.getIntValue(tokens[1])
                end = self.getIntValue(tokens[2])
                position = self.getIntValue(tokens[3])
                if ordered == True:
                    # If the file is ordered and the indexes are less than the BED indexes, skip the line
                    if end <= min_start:
                        continue
                    # If the file is ordered and the indexes are greater than the BED indexes, skip the file
                    elif start >= max_end:
                        break
                    # If the indexes are at least in one of the BED sequences index range,
                    # the corresponding MSA block is parsed
                    else:
                        for bed_sequence in bed_sequences:
                            if end > bed_sequence.indexStart and start < bed_sequence.indexEnd:
                                input_file.seek(position, 0)
                                result = self.parseBlock(input_file, True)
                                if result == False:
                                    raise ExecutionException("MAFFile.parseBlockListWithIndex : Indexed MSA block seems not correct. You should have not updated indexes. Please see logs for more information")
                                break
                else:
                    # If the file is not chromosom specialized, the bed sequence list must be
                    # retrieved for each new index line
                    if is_chrom_file == False:
                        bed_sequences = self.getAssociatedBEDSequences(spec_chrom)
                        if (bed_sequences == None or len(bed_sequences) == 0):
                            continue
                    # If the indexes are at least in one of the BED sequences index range,
                    # the corresponding MSA block is parsed
                    for bed_sequence in bed_sequences:
                        if end > bed_sequence.indexStart and start < bed_sequence.indexEnd:
                            input_file.seek(position, 0)
                            result = self.parseBlock(input_file, True)
                            if result == False:
                                Log.log("MAFFile.parseBlockListWithIndex : Indexed MSA block seems not correct. You should have not updated indexes")
                                raise ExecutionException("MAFFile.parseBlockListWithIndex : Indexed MSA block seems not correct. You should have not updated indexes. Please, see logs for more information")
                            break
# -------------------------------------------------------------------------------------- # Write the CommStruct to the given XML file def toXMLFile(self, output_filepath): try: root_element = self.convertCommStructToElementTree() self.indent(root_element, 0) ElementTree(root_element).write(output_filepath) except IOError, exce: Log.log( "BedSeqAlignmentStatsCommStruct.toXMLFile : Unable to write CommStruct to XML file. From:\n\t---> " + str(exce)) except ParsingException, par_exce: Log.log( "BedSeqAlignmentStatsCommStruct.toXMLFile : Unable to save CommStruct to XML file. From:\n\t---> " + str(par_exce)) # ############################# # METHODS TO READ THE XLM FILE # ############################# # -------------------------------------------------------------------------------------- @staticmethod def getCommStructFromXML(commstruct_filepath): commstruct_file = None root_element = None try: commstruct_file = open(commstruct_filepath, "r")
pipeline_element.append( component_element) component_element.attrib[ PipelineXMLParser.COMPONENT_PROCESSOR_ATT] = component.processorName for param_name, param_value in component.parameters.iteritems(): param_element = Element( PipelineXMLParser.PARAM_TAG) component_element.append( param_element) param_element.attrib[ PipelineXMLParser.PARAM_NAME_ATT] = str( param_name) param_element.attrib[ PipelineXMLParser.PARAM_VALUE_ATT] = str( param_value) try: PipelineXMLParser.indent( pipelines_element, 0) outfile = os.path.join( outpath, pipeline.name + ".xml") ElementTree( pipelines_element).write( outfile) except IOError, exce: Log.log( "PipelineXMLParser.toXMLFile : Unable to write Pipelines to XML file. From:\n\t---> " + str( exce)) except ParsingException, par_exce: Log.log( "PipelineXMLParser.toXMLFile : Unable to save Pipelines to XML file. From:\n\t---> " + str( par_exce)) # -------------------------------------------------------------------------------------- # Add indentation to the ElementTree in order to have a pretty print # in the XML file (used by subclasses) @staticmethod def indent( elem, level=0): i = "\n" + level*" " if len(elem): if not elem.text or not elem.text.strip(): elem.text = i + " " if not elem.tail or not elem.tail.strip(): elem.tail = i for elem in elem:
def getRequiredParameters():
    """Abstract stub: log a reminder that subclasses must override this, and return no parameter list."""
    message = "The method 'getRequiredParameters' must be implemented at the inherited class level"
    Log.log(message)
    return None
def getOutputCommStructClass():
    """Abstract stub: log a reminder that subclasses must override this, and return a placeholder tuple."""
    message = "The method 'getOutputCommStructClass' must be implemented at the inherited class level"
    Log.log(message)
    return ("Not defined", )
Log.trace("##") Log.trace("## Starting component '" + current_component.getComponentPrefix() + "'") if resume: Log.trace("## Resume mode") else: Log.trace("## Forcing mode") Log.trace("##") # Execute the component try: executed = current_component.start( pipeline, pipeline_output, self.config, resume) except ExecutionException, exe_exce: Log.log( "PipelineManager.executePipelines : Aborting execution of component : '" + current_component.getComponentPrefix() + "' . From:\n\t---> " + str(exe_exce)) # If the component was not correctly executed, its next component are not passed to queue if not executed: Log.log( "PipelineManager.executePipelines : Component : '" + current_component.getComponentPrefix + "' was not executed. See logs for more details" ) else: # remove the component from queue and add its following component to the queue start (depth-first execution) component_queue_list.remove(current_component) components_to_add = [] for next_component in current_component.nextComponents: if not next_component in component_queue_list:
def getDisplayName():
    """
    Fallback display name for processors that do not override this method.

    Logs a reminder and returns a generic placeholder built from the
    Processor class name.
    """
    # Bug fix: the logged message previously referenced
    # 'getOutputCommStructClass' (copy-paste from a sibling stub)
    # instead of 'getDisplayName'.
    Log.log("The method 'getDisplayName' must be implemented at the inherited class level")
    # Bug fix: Processor.__class__.__name__ is the metaclass name ("type"),
    # not the processor class name; Processor.__name__ is what was intended.
    return Processor.__name__ + " (no display name defined)"
def compareMotifHistogram(self, input_commstructs):
    """
    For each motif common to all input CommStructs, merge the per-input motif
    histograms (and null-model histograms) into one tab file sorted by x
    value, then plot the comparison with the RSAT XYgraph tool.

    input_commstructs : list of previous-output CommStructs to compare.

    Raises ExecutionException when a histogram tab file cannot be written.
    """
    # Retrieve the required parameters
    RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
    # List the motifs that are common to all input CommStructs
    common_motifs = self.getCommonMotifs(input_commstructs)
    # List the label of each origin (one label per input)
    labels = self.getLabels(input_commstructs)
    number_inputs = len(input_commstructs)
    # Prepare the output directory (wiped and recreated)
    dir_path = os.path.join(self.component.outputDir, self.component.getComponentPrefix())
    shutil.rmtree(dir_path, True)
    FileUtils.createDirectory(dir_path, 0777)
    for motif_name in common_motifs.keys():
        motifs = common_motifs[motif_name]
        # Maps x value (as string) -> list of y values, one slot per input
        full_histogram = {}
        full_null_histogram = {}
        for index in range(len(motifs)):
            motif = motifs[index]
            if motif != None:
                # Add entries for the motif histogram for the current previous output
                motif_histogram = input_commstructs[index].motifToStatistics[motif].histogram
                if motif_histogram != None:
                    for token in motif_histogram:
                        # Each token is "<x><sep><y>"
                        xy = token.split(MotifStatisticsCommStruct.HISTOGRAM_VALUE_SEPARATOR_CHAR)
                        x = xy[0]
                        y = self.getTokenAsfloat(xy[1])
                        if not x in full_histogram.keys():
                            full_histogram[x] = [0.0] * number_inputs
                        full_histogram[x][index] = y
                # Add entries for the motif null histogram for the current previous output
                motif_null_histogram = input_commstructs[index].motifToStatistics[motif].nullHistogram
                if motif_null_histogram != None:
                    for token in motif_null_histogram:
                        xy = token.split(MotifStatisticsCommStruct.HISTOGRAM_VALUE_SEPARATOR_CHAR)
                        x = xy[0]
                        y = self.getTokenAsfloat(xy[1])
                        if not x in full_null_histogram.keys():
                            full_null_histogram[x] = [0.0] * number_inputs
                        full_null_histogram[x][index] = y
        if len(full_histogram) > 0:
            try:
                # Output the histogram values to file
                # NOTE(review): 'file' shadows the builtin of the same name.
                file_name = motif_name + "_full_histogram"
                file_path = os.path.join(dir_path, file_name + ".tab")
                file = open(file_path, "w")
                # Write the headers in the file
                file.write("# x")
                for label in labels:
                    file.write("\t" + motif_name + " (" + label + ")")
                for label in labels:
                    file.write("\tHomogeneous model (" + label + ")")
                file.write("\n")
                # Write the data in the file, draining full_histogram in
                # ascending x order (selection of the minimum each pass)
                while len(full_histogram) > 0:
                    # Search for the smallest x in the dictionary keys
                    # NOTE(review): x_min stays unbound if every x value is
                    # >= 10000 — presumably x values are peak-relative
                    # positions well below that; confirm.
                    x_val_min = 10000
                    for x in full_histogram.keys():
                        x_val = self.getTokenAsfloat(x)
                        if x_val < x_val_min:
                            x_min = x
                            x_val_min = x_val
                    file.write(x_min)
                    # write the y values of motif histograms corresponding to the min x found
                    for y in full_histogram[x_min]:
                        file.write("\t" + str(y))
                    del full_histogram[x_min]
                    # write the y values of null histograms corresponding to the min x found
                    if x_min in full_null_histogram.keys():
                        for y in full_null_histogram[x_min]:
                            file.write("\t" + str(y))
                        del full_null_histogram[x_min]
                    else:
                        # No null-model data for this x: pad with zeros
                        for y in range(number_inputs):
                            file.write("\t0.0")
                    file.write("\n")
                file.flush()
                file.close()
                # Draw the histogram graph with RSAT XYgraph
                graph_path = os.path.join(dir_path, file_name + "_graph.png")
                # Columns 2..N+1 hold motif values, N+2..2N+1 hold null values
                value_cols = ""
                for index in range(number_inputs):
                    value_cols += str(index + 2) + ","
                for index in range(number_inputs):
                    value_cols += str(number_inputs + index + 2) + ","
                value_cols = value_cols[:-1]
                cmd = os.path.join(RSAT_PATH, "perl-scripts/XYgraph")
                cmd += " -i " + file_path
                cmd += " -title1 'Global distribution over peaks for " + motif_name + "'"
                cmd += " -xcol 1 -ycol " + value_cols
                cmd += " -xleg1 'Position against peak maximum' -lines"
                cmd += " -yleg1 'Number of occurence'"
                cmd += " -legend -header -format png -histo"
                cmd += " -o " + graph_path
                cmd_result = commands.getstatusoutput(cmd)
                if cmd_result[0] != 0:
                    # Plotting failure is logged but does not abort the run
                    Log.log("CompareStatisticsProcessor.compareMotifHistogram : status returned is :" + str(cmd_result[0]) + " for command '" + cmd + "'")
                    Log.log(" Command output is = \n" + str(cmd_result[1]))
                    continue
            except IOError, io_exce:
                raise ExecutionException("CompareStatisticsProcessor.compareMotifHistogram : Unable to save histogram to tab file : '" + file_path + "'. From:\n\t---> " + str(io_exce))
def execute(self, comm_struct, pipeline):
    """Abstract stub: log a reminder that subclasses must override this, and produce no result."""
    message = "The method 'execute' must be implemented at the inherited class level"
    Log.log(message)
    return None
def execute(self, input_commstructs):
    """
    Write every recognized motif of the first input CommStruct to a BED file
    (one line per motif hit, colored by score or by motif family), then sort
    the BED file, fetch chromosome sizes and convert it to a bigBed file.

    input_commstructs : list of CommStructs; only the first one is used.

    Returns the (mutated) input CommStruct; its paramStatistics receive the
    BED output path and, on full success, the bigBed output path.

    Raises ExecutionException when no input is provided.
    """
    if input_commstructs == None or len(input_commstructs) == 0:
        raise ExecutionException("BEDOutputProcessor.execute : No inputs")
    input_commstruct = input_commstructs[0]
    # Retrieve the processor parameters
    reference_motif = self.getParameter(BEDOutputProcessor.REFERENCE_MOTIF)
    # Color method is optional; defaults to score-based coloring and falls
    # back to it on any unrecognized value
    color_method = self.getParameter(BEDOutputProcessor.COLOR_METHOD, False)
    if color_method == None:
        color_method = BEDOutputProcessor.COLOR_METHOD_SCORE
    else:
        color_method = color_method.lower()
        if color_method != BEDOutputProcessor.COLOR_METHOD_SCORE and color_method != BEDOutputProcessor.COLOR_METHOD_FAMILY:
            color_method = BEDOutputProcessor.COLOR_METHOD_SCORE
    score_min = self.getParameterAsfloat(BEDOutputProcessor.SCORE_MIN)
    score_max = self.getParameterAsfloat(BEDOutputProcessor.SCORE_MAX)
    # Prepare the processor output dir (wiped and recreated)
    out_path = os.path.join(self.component.outputDir, self.component.getComponentPrefix())
    shutil.rmtree(out_path, True)
    FileUtils.createDirectory( out_path, 0777)
    # Retrieve the JASPAR motifs details: [0] = name->ID map, [1] = name->family map
    motif_details = MotifUtils.getMotifsDetailsFromJaspar()
    motif_id = motif_details[ 0]
    motif_family = motif_details[ 1]
    family_rgb = {}
    # build the bed output file path
    bed_file_path = os.path.join(out_path, self.component.pipelineName + "_Motifs.bed")
    try:
        bed_file = open(bed_file_path, "w")
        #bed_file.write("track name='" + self.component.pipelineName + "' visibility=3 itemRgb='On' use_score=1\n")
        #bed_file.write("browser dense RSAT\n")
        #bed_file.write("browser dense\n")
        #bed_file.write("## seq_name start end feature_name score strand thickStart thickEnd itemRgb blockCount blockSizes blckStarts\n")
        current_color = None
        bedseq_list = input_commstruct.bedToMA.keys()
        bedseq_list.sort(BEDSequence.compare)
        previous_line_start = 0
        previous_line_key = ""
        for bed_seq in bedseq_list:
            for msa in input_commstruct.bedToMA[ bed_seq]:
                for motif in msa.motifs:
                    motif_name = motif.name
                    # Skip motifs without computed statistics
                    if not input_commstruct.motifStatistics.has_key(motif_name):
                        continue
                    if motif_name in motif_id.keys():
                        out_name = motif_id[ motif_name]
                        chromosom = bed_seq.chromosom
                        # Genomic coordinates: BED start + alignment-corrected motif offsets
                        start_position = bed_seq.indexStart + msa.fixIndex(motif.indexStart)
                        end_position = bed_seq.indexStart + msa.fixIndex(motif.indexEnd)
                        score = motif.score
                        # Commented : Black is assigned to the reference motif
                        #if motif_name == reference_motif:
                        #    item_rgb = "0,0,0"
                        # for the other motif, color depends on the chosen method
                        #else:
                        if color_method == BEDOutputProcessor.COLOR_METHOD_FAMILY:
                            if motif_name in motif_family.keys():
                                # One color per motif family, assigned lazily
                                family_rgb = self.updateFamilyRGB(motif_family[ motif_name], family_rgb, current_color)
                                item_rgb = family_rgb[ motif_family[ motif_name]]
                                current_color = item_rgb
                            else:
                                item_rgb = BEDOutputProcessor.COLORS[ 0]
                        else:
                            item_rgb = self.getColorForScore(score, score_min, score_max)
                        # Write the lines to output file; short names get the "chr" prefix
                        if len( chromosom) <4:
                            line_out = "chr" + chromosom
                        else:
                            line_out = chromosom
                        line_out += "\t" + str(start_position)
                        line_out += "\t" + str(end_position)
                        line_out += "\t" + out_name
                        line_out += "\t" + str(int(score * 1000))
                        line_out += "\t" + motif.strand
                        line_out += "\t" + str(start_position)  # ThickStart
                        line_out += "\t" + str(end_position)  # ThickEnd
                        line_out += "\t" + item_rgb  # itemRGB
                        #line_out += "\t" + "0" # BlockCount
                        #line_out += "\t" + "0" # BlockSizes
                        #line_out += "\t" + "0" # BlockStarts
                        # Build a key that represents the motif chrom, name and positions
                        line_key = chromosom + ":" + str(start_position) + ":" + str(end_position) + ":" + out_name
                        # If the new line has the same key as the previous one, we must keep only one
                        # of the two lines, i.e. the one with the highest score (the tell() and seek()
                        # methods permit to overwrite the old line if required).
                        # If the new line and the previous one have different keys the new line is simply written
                        if previous_line_key != line_key:
                            previous_line_start = bed_file.tell()
                            bed_file.write(line_out)
                            bed_file.write("\n")
                            # NOTE(review): missing parentheses — this is an
                            # attribute access, flush() is never called.
                            bed_file.flush
                            previous_line_key = line_key
                            previous_score = score
                        else:
                            if score > previous_score:
                                bed_file.seek(previous_line_start)
                                bed_file.write(line_out)
                                bed_file.write("\n")
                                # NOTE(review): same missing-parentheses issue.
                                bed_file.flush
                                previous_score = score
        bed_file.close()
        input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.BED_OUTPUT_PATH] = bed_file_path
        # Sort bed_file (used for bigBed conversion)
        sorted_bed_file_path = os.path.join(out_path, self.component.pipelineName + "_Motifs_sorted.bed")
        cmd = "sort -k1,1 -k2,2n"
        cmd += " " + bed_file_path
        cmd += " > " + sorted_bed_file_path
        Log.info( "BEDOuputProcessor.execute : Sorting BED file")
        Log.info( "BEDOuputProcessor.execute : command used is : " + cmd)
        cmd_result = commands.getstatusoutput( cmd)
        Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
        if cmd_result[0] != 0:
            # Sorting failed: keep the plain BED output and stop here
            Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
            Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
            return input_commstruct
        # Fetch the chrom sizes that will be used to convert BED file to bigBed file
        chrom_sizes_path = os.path.join(out_path, self.component.pipelineName + "_chrom_size.txt")
        RSAT_PATH = self.component.getParameter( Constants.RSAT_DIR_PARAM)
        cmd = os.path.join( RSAT_PATH , "contrib/peak-footprints/tools/fetchChromSizes")
        cmd += " " + input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES]
        cmd += " > " + chrom_sizes_path
        Log.info( "BEDOuputProcessor.execute : Fetching Chrom sizes for species : " + input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES])
        Log.info( "BEDOuputProcessor.execute : command used is : " + cmd)
        cmd_result = commands.getstatusoutput( cmd)
        Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
        if cmd_result[0] != 0:
            # Chrom sizes fetch failed: keep the plain BED output and stop here
            Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
            Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
            return input_commstruct
        # Build the bigBed file
        # sudo ln -s /lib/x86_64-linux-gnu/libssl.so.1.0.0 /usr/lib/libssl.so.10
        # sudo ln -s /lib/x86_64-linux-gnu/libcrypto.so.1.0.0 /usr/lib/libcrypto.so.10
        big_bed_path = os.path.join(out_path, self.component.pipelineName + "_Motifs.bb")
        RSAT_PATH = self.component.getParameter( Constants.RSAT_DIR_PARAM)
        cmd = os.path.join( RSAT_PATH , "contrib/peak-footprints/tools/bedToBigBed")
        cmd += " " + sorted_bed_file_path
        cmd += " " + chrom_sizes_path
        cmd += " " + big_bed_path
        Log.info( "BEDOuputProcessor.execute : Converting BED file to bigBed file")
        Log.info( "BEDOuputProcessor.execute : command used is : " + cmd)
        cmd_result = commands.getstatusoutput( cmd)
        Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
        if cmd_result[0] != 0:
            # Conversion failed: keep the plain BED output and stop here
            Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
            Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
            return input_commstruct
        input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.BIGBED_OUTPUT_PATH] = big_bed_path
    except IOError, io_exce:
        Log.log("BEDOutputProcessor.execute : Unable to save the BED file of recognized motifs : " + str(io_exce))