def verifyConfig(self):

    previous_config = self.getConfigFromFile()
    if previous_config == None:
        Log.trace("Component.verifyConfig : No config file found for processor '" + self.processorName + "'. Executing it")
        return False

    same = False
    if len(previous_config) == len(self.parameters):
        if len(previous_config) == 0:
            same = True
        else:
            for param_name in previous_config.keys():
                if param_name in self.parameters.keys():
                    if previous_config[param_name] == self.parameters[param_name]:
                        same = True
                    else:
                        same = False
                        break
                else:
                    same = False
                    break

    if not same:
        Log.trace("Component.verifyConfig : Configuration of processor '" + self.processorName + "' has changed since previous run: Previous = " + str(previous_config) + " while current = " + str(self.parameters) + ". Executing processor")

    return same
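# Note: previous_config and self.parameters are both plain dicts, so the
# key-by-key comparison above is behaviorally equivalent to a direct dict
# equality test. A minimal sketch of the same check (illustration only):
#
#   same = (previous_config == self.parameters)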
def computeStartIndex(self, tokens, strand):

    # If the sequence lies on the reverse strand, coordinates must be transformed
    if strand != Constants.POSITIVE_STRAND:
        Log.trace("MAFIndexer : Negative strand detected")
        source_size = self.getIntValue(tokens[MAFIndexerProcessor._source_size_col])
        text_length = self.getIntValue(tokens[MAFIndexerProcessor._textlength_col])
        rev_start = self.getIntValue(tokens[MAFIndexerProcessor._startindex_col])
        bp_start = source_size + 1 - (text_length + rev_start)
    else:
        bp_start = self.getIntValue(tokens[MAFIndexerProcessor._startindex_col])

    return bp_start
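# Worked example (illustrative values, not taken from a real MAF file): for a
# reverse-strand block with source_size = 100, text_length = 10 and
# rev_start = 20, bp_start = 100 + 1 - (10 + 20) = 71, i.e. the block covers
# positions 71..80 in 1-based forward-strand coordinates.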
def parseFile(self, file_name, is_chrom_file):

    try:
        input_file = open(file_name, 'r')
        # Verify that the token '##maf' indicating a MAF file is found in the first lines
        is_maf_file = False
        while 1:
            line = input_file.readline()
            if len(line) == 0:
                break
            elif not line.isspace():
                tokens = line.split()
                if tokens != None and len(tokens) > 0 and tokens[0] == "##maf":
                    is_maf_file = True
                    break

        # If it is a MAF file, verify whether an index file exists
        if is_maf_file == True:
            indexed = False
            try:
                # The index file is expected beside the MAF file, with the same
                # name suffixed by 'index' (as written by MAFIndexerProcessor.parseFile)
                index_path = file_name + "index"
                input_index_file = open(index_path, "r")
                indexed = True
            except IOError:
                pass

            if indexed == True:
                Log.trace("MAFProcessor.parseFile : parsing file '" + file_name + "' using index '" + index_path + "'")
                self.parseBlockListWithIndex(input_index_file, input_file)
                self.closeFile(input_index_file)
            else:
                Log.trace("MAFProcessor.parseFile : parsing file '" + file_name + "'")
                self.parseBlockListWithoutIndex(input_file, is_chrom_file)
            self.closeFile(input_file)
            return
        else:
            self.closeFile(input_file)
            raise ParsingException("MAFProcessor.parseFile : The file '" + file_name + "' is not a MAF file")
    except IOError, io_exec:
        raise ParsingException("MAFProcessor.parseFile : Unable to open file '" + file_name + "'. From:\n\t---> " + str(io_exec))
def startNewThread(self, file_queue, specialized_file, thread_list):

    if not file_queue.empty():
        file = file_queue.get()
        # Thread positional arguments are (group, target, name, args):
        # the file name is used as the thread name
        my_thread = threading.Thread(None, self.parseFile, file, (file, specialized_file,))
        thread_list.append(my_thread)
        Log.trace("MAFProcessor.startNewThread : Starting new thread to parse file : '" + file + "'. Number of active Thread = " + str(len(thread_list)))
        my_thread.start()
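# Usage sketch (mirroring the driver loop in MAFProcessor.execute): the queue
# is filled once, up to thread_number threads are started, and each finished
# thread is replaced until the queue drains.
#
#   file_queue = Queue.Queue(len(maf_file_list))
#   for maf_file in maf_file_list:
#       file_queue.put(maf_file)
#   thread_list = []
#   while not file_queue.empty() and len(thread_list) < thread_number:
#       self.startNewThread(file_queue, specialized_file, thread_list)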
def addSites(self, output_commstruct):

    # Retrieve the algorithm parameters
    site_number = self.getParameterAsint(ImplantSitesProcessor.SITE_NUMBER_PARAM)
    if site_number <= 0:
        Log.trace("ImplantSitesProcessor.addSites : Motif sites implantation not requested")
        return

    motif_list_line = self.getParameter(ImplantSitesProcessor.MOTIF_LIST_PARAM)
    motif_name_list = motif_list_line.split()

    optimize_motif = (self.getParameter(ImplantSitesProcessor.OPTIMIZE_MOTIF_PARAM).lower() == "true")

    database_file_path = self.getParameter(ImplantSitesProcessor.DATABASE_FILE_PATH_PARAM)

    distribution_mode = self.getParameter(ImplantSitesProcessor.DISTRIBUTION_MODE_PARAM)
    distribution_mode = distribution_mode.lower()

    # Retrieve the motifs PWM
    motif_def_list = self.getMotifDefinitions(motif_name_list, database_file_path)

    # Prepare the output directory
    dir_path = os.path.join(self.component.outputDir, self.component.getComponentPrefix())
    shutil.rmtree(dir_path, True)
    os.mkdir(dir_path)

    # Generate the motif sites
    motif_sites = {}
    for motif in motif_def_list:
        if optimize_motif == False:
            motif_file_path = self.outputMotifDefinition(motif, dir_path)
            motif_sites[motif] = self.generateRandomSites(motif, motif_file_path, site_number)
        else:
            motif_sites[motif] = self.generateOptimalSites(motif, site_number)

    # Implant the sites in the MSA
    self.implantSites(motif_sites, distribution_mode, output_commstruct, dir_path)
def execute(self, input_commstructs): if input_commstructs == None or len(input_commstructs) == 0: raise ExecutionException( "ImplantSitesProcessor.execute : No inputs") input_commstruct = input_commstructs[0] # Implant TF Motif binding sites in mSA Sequences Log.trace("ImplantSitesProcessor.execute : Implanting motif sites") ProgressionManager.setTaskProgression("Implanting motif sites", self.component, 0.0) self.addSites(input_commstruct) ProgressionManager.setTaskProgression("Implanting motif sites", self.component, 1.0) return input_commstruct
def execute(self, input_commstructs): if input_commstructs == None or len(input_commstructs) == 0: raise ExecutionException("BlockProcessor.execute : No inputs") input_commstruct = input_commstructs[0] # retrieve the processor parameters self.windowSize = self.getParameterAsint( BlockProcessor.WINDOW_SIZE_PARAM) self.residuConservationLimit = self.getParameterAsfloat( BlockProcessor.RESIDU_CONSERVATION_LIMIT_PARAM) self.windowConservationLimit = self.getParameterAsfloat( BlockProcessor.WINDOW_CONSERVATION_LIMIT_PARAM) algo = self.getParameter(BlockProcessor.ALGORITHM_PARAM, False) if algo != None: self.algorithm = algo.lower() referenceSpecies = self.getParameter( BlockProcessor.REFERENCE_SPECIES_PARAM) desired_species_line = self.getParameter( BlockProcessor.DESIRED_SPECIES_LIST_PARAM, False) Log.trace("BlockProcessor.execute : Chosen Algorithm is '" + self.algorithm + "'") self.desiredSpeciesList = [] self.desiredSpeciesList.append(referenceSpecies) if desired_species_line != None: self.desiredSpeciesList.extend(desired_species_line.split()) # Analyze the conserved region in each MSA # If 'None' algorithm is chosen, the entire MSA is considered as conserved for bed_seq in input_commstruct.bedToMA.keys(): for alignment in input_commstruct.bedToMA[bed_seq]: pwm = PWM() pwm.initFromAlignment(alignment, self.desiredSpeciesList) if self.algorithm != BlockProcessor.ALGORITHM_NONE_VALUE: self.analyzeConservedBlocks(pwm, alignment) else: new_block = Motif(0, alignment.totalLength, "", pwm) new_block.composeName(alignment.name) alignment.addMotif(new_block, True) return input_commstruct
def parseBlockListWithoutIndex(self, input_file, is_chrom_file):

    # Search for the next line starting with 'a' (meaning a new alignment block)
    counter = 0
    while 1:
        line = input_file.readline()
        if len(line) == 0:
            break
        elif not line.isspace():
            tokens = line.split()
            if tokens != None and len(tokens) > 0 and tokens[MAFProcessor._lineType_col] == "a":
                counter += 1
                if counter % 100000 == 0:
                    Log.trace("MAFProcessor.parseBlockListWithoutIndex : Number of MSA already parsed : " + str(counter))
                parsed = self.parseBlock(input_file)
                if not parsed and is_chrom_file:
                    return
def executePipelines(self):

    result = True
    while len(self.serverQueue) > 0:
        params = self.serverQueue[0]
        pipelines_filepath = params[0]
        pipeline_options = params[1]
        try:
            verbosity = int(params[2])
        except ValueError:
            verbosity = 1
        resume = (params[3].lower() == "true")
        working_dir = params[4]

        # Modify the config if required and initialize logs and output directory
        if working_dir != None and len(working_dir) > 0:
            self.config[PFConstants.BASE_OUTPUT_DIR_PARAM] = working_dir

        # Verify the base output dir and the output dir are created, and create them if not
        FileUtils.createDirectory(self.config[PFConstants.BASE_OUTPUT_DIR_PARAM], 0777)
        self.config[PFConstants.OUTPUT_DIR_PARAM] = os.path.join(self.getParameter(PFConstants.BASE_OUTPUT_DIR_PARAM), PFConstants.OUTPUT_DIR_NAME)
        FileUtils.createDirectory(self.config[PFConstants.OUTPUT_DIR_PARAM], 0777)

        # Switch log location
        Log.switchFiles(self.getParameter(PFConstants.OUTPUT_DIR_PARAM), verbosity)

        # Parse the XML file to retrieve the pipelines definition
        Log.trace("#################################################################################")
        Log.trace("# PipelineManager.executePipelines : Reading pipelines from : " + pipelines_filepath)
        Log.trace("#################################################################################")
        try:
            pipelines = PipelineXMLParser.getPipelines(pipelines_filepath)
            OptionManager.applyOptions(pipelines, pipeline_options)
            PipelineXMLParser.toXMLFile(self.config[PFConstants.OUTPUT_DIR_PARAM], pipelines)
        except SyntaxError, syn_exce:
            raise ParsingException("PipelineManager.executePipelines : Unable to read definition of pipelines from XML file: '" + pipelines_filepath + "'. From:\n\t---> " + str(syn_exce))
        except ParsingException, par_exce:
            raise ParsingException("PipelineManager.executePipelines : Unable to read definition of pipelines from XML file: '" + pipelines_filepath + "'. From:\n\t---> " + str(par_exce))
def getInputCommStructs(self):

    authorized_input_classes = self.getAuthorizedInputClasses()
    input_commstructs = []

    if authorized_input_classes != None:
        input_file = self.getParameter(Component.INPUT_FILE_PARAM, False)
        if input_file == None:
            # Compare the list of authorized inputs to the outputs of previous components
            for component in self.previousComponents:
                previous_result_class = component.resultClass
                if previous_result_class in authorized_input_classes:
                    input_commstruct = previous_result_class.fromXMLFile(component.getOutputFilePath())
                    if input_commstruct != None:
                        input_commstructs.append(input_commstruct)
                else:
                    raise ExecutionException("Component.getInputCommStructs : input is not of the right class. Class is '" + str(previous_result_class) + "' but expected classes are " + str(authorized_input_classes))
        else:
            # Try to read the input file using the classes authorized as input
            for input_class in authorized_input_classes:
                try:
                    Log.trace("Component.getInputCommStructs : Trying to load data from file : " + input_file)
                    input_commstruct = input_class.fromXMLFile(input_file)
                    if input_commstruct != None:
                        input_commstructs.append(input_commstruct)
                        Log.trace("Component.getInputCommStructs : Data correctly loaded")
                except Exception, exce:
                    Log.trace("Component.getInputCommStructs : Data not loaded using class '" + str(input_class) + "' : " + str(exce))
                    pass
            if len(input_commstructs) == 0:
                raise ExecutionException("Component.getInputCommStructs : The provided input file does not contain information the processor '" + self.processorName + "' can manage : " + input_file)

    return input_commstructs
def start(self, pipeline, pipeline_out, runtime_params, resume=False):

    self.outputDir = pipeline_out
    self.runtimeParameters = runtime_params

    if resume == True:
        # Test if the previous components were all resumed
        if self.canResume():
            self.resumed = False
            # Test if the Component parameters have changed since the previous run.
            # If so, the processor cannot be resumed and must be re-run
            if self.verifyConfig():
                # Test if an output file of a previous run of the associated processor can be retrieved.
                # If so (or if the processor outputs no files), the Component is declared as resumed and returns True
                try:
                    output_filepath = self.getOutputFilePath()
                    if os.path.isfile(output_filepath):
                        authorized_output_classes = self.getAuthorizedOutputClasses()
                        if authorized_output_classes != None:
                            for output_class in authorized_output_classes:
                                try:
                                    output_commstruct = output_class.fromXMLFile(output_filepath)
                                    if output_commstruct != None:
                                        self.resultClass = output_class
                                        self.resumed = True
                                        self.executed = False
                                        ProgressionManager.setComponentStatus(self, ProgressionManager.RESUMED_STATUS)
                                        Log.trace("Component.execute : Resuming data from file : " + output_filepath)
                                        output_commstruct = None
                                        gc.collect()
                                        return True
                                except BaseException, exce:
                                    Log.info("Component.execute : Tried to resume output file with class '" + str(output_class) + "' : " + str(exce))
                                    pass
                    else:
                        self.resumed = True
                        self.executed = False
                        ProgressionManager.setComponentStatus(self, ProgressionManager.RESUMED_STATUS)
                        return True
                except IOError, io_exce:
                    Log.trace("Component.execute : Unable to open output file to resume processor '" + self.processorName + "'. From\n\t---> " + str(io_exce))

                # Here, the processor cannot be resumed, for any reason linked to output files
                Log.trace("Component.execute : No output file found for processor '" + self.processorName + "': executing it")
                self.removePreviousOutputs()
        # If the processor does not have to be resumed because previous components were not resumed,
        # remove all old output files and execute the processor
        else:
            Log.trace("Component.execute : Processor '" + self.processorName + "' cannot be resumed since previous components have been executed.")
            self.removePreviousOutputs()
def execute(self, input_commstructs): if input_commstructs == None or len(input_commstructs) == 0: raise ExecutionException( "ImplantSitesProcessor.execute : No inputs") input_commstruct = input_commstructs[0] # Retrieve the processor parameters bedseq_number = self.getParameterAsint( GenerateMSAProcessor.PEAK_NUMBER_PARAM) insertion_number = self.getParameterAsint( GenerateMSAProcessor.INSERTION_NUMBER_PARAM) bedseq_medium_length = self.getParameterAsint( GenerateMSAProcessor.PEAK_MEDIUM_SIZE_PARAM, False) msa_length = self.getParameterAsint( GenerateMSAProcessor.MSA_SIZE_PARAM) trivial_msa = self.getParameter( GenerateMSAProcessor.TRIVIAL_SEQUENCES_PARAM, False) if trivial_msa == None: trivial_msa = False else: trivial_msa = (trivial_msa.lower() == "true") # Prepare the processor output dir out_path = os.path.join(self.component.outputDir, self.component.getComponentPrefix()) shutil.rmtree(out_path, True) os.mkdir(out_path) # Build the output CommStruct output_commstruct = BedSeqAlignmentStatsCommStruct() output_commstruct.baseSpecies = input_commstruct.baseSpecies output_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct. REFERENCE_SPECIES] = output_commstruct.baseSpecies output_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.ALIGNED_SPECIES] = "" output_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.BEDSEQUENCE_NUMBER] = bedseq_number output_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct. BEDSEQUENCE_WITH_MSA_NUMBER] = bedseq_number if bedseq_medium_length == None: # Get the required number of sequence from BED sequence list count_peak = 0 max_length = 0 for chrom in input_commstruct.bedSequencesDict: output_commstruct.bedSequencesDict[chrom] = [] for bedseq in input_commstruct.bedSequencesDict[chrom]: output_commstruct.bedSequencesDict[chrom].append(bedseq) length = bedseq.indexEnd - bedseq.indexStart if length > max_length: max_length = length count_peak += 1 if count_peak >= bedseq_number: break if count_peak >= bedseq_number: break else: # Generate random BED sequences self.generateBedSequences(bedseq_number, bedseq_medium_length, output_commstruct) # Export the new bedsequence size histogram and graph self.outputSequenceSizeHistogram(output_commstruct) # Generate MSA for each BED Sequence Log.trace("GenerateMSAProcessor.execute : Generating MSA") ProgressionManager.setTaskProgression("Generating MSA", self.component, 0.0) if trivial_msa: self.generateTrivialMSA(msa_length, bedseq_number, output_commstruct) else: self.generateRandomMSA(msa_length, bedseq_number, bedseq_medium_length, output_commstruct) ProgressionManager.setTaskProgression("Generating MSA", self.component, 1.0) # Implant insertion characters into the MSA Sequences Log.trace("GenerateMSAProcessor.execute : Implanting insertions") ProgressionManager.setTaskProgression("Implanting insertions", self.component, 0.0) self.implantInsertions(output_commstruct, insertion_number) ProgressionManager.setTaskProgression("Implanting insertions", self.component, 1.0) # Export the new bedsequence size histogram and graph self.outputMSALenghtHistogram(output_commstruct) return output_commstruct
        if pipelines == None or len(pipelines) == 0:
            raise ParsingException("PipelineManager.executePipelines : No pipeline defined in the given definition file : " + pipelines_filepath)

        # Verify that the definition of the pipelines is correct
        Log.trace("PipelineManager.executePipelines : Verifying pipelines")
        try:
            self.verifyPipelinesDefinition(pipelines)
        except ParsingException, exe_exce:
            raise ParsingException("PipelineManager.executePipelines : Canceling execution of pipelines. From:\n\t---> " + str(exe_exce))

        # Initialize the ProgressionManager
        ProgressionManager.initialize(pipelines, self.getParameter(PFConstants.OUTPUT_DIR_PARAM), self.getParameter(PFConstants.INSTALL_DIR_PARAM))

        # Execute the pipelines
        Log.trace("**************************************************")
        Log.trace("# Starting Pipelines")
class MAFIndexerProcessor(Processor):

    INPUT_MAF_FILE_PARAM = "MAFFile"
    REFERENCE_SPECIES_PARAM = "ReferenceSpecies"

    _lineType_col = 0
    _speciesChrom_col = 1
    _startindex_col = 2
    _textlength_col = 3
    _strand_col = 4
    _source_size_col = 5
    _text_col = 6

    # --------------------------------------------------------------------------------------
    def __init__(self):
        Processor.__init__(self)
        self.referenceSpecies = ""

    # --------------------------------------------------------------------------------------
    # Returns the name of the CommStruct class used as input
    # (None if no input CommStruct)
    @staticmethod
    def getInputCommStructClass():
        return None

    # --------------------------------------------------------------------------------------
    # Returns the name of the CommStruct class used as output
    # (None if no output CommStruct)
    @staticmethod
    def getOutputCommStructClass():
        return None

    # --------------------------------------------------------------------------------------
    # Returns a name that will be used as display name in the user friendly outputs
    @staticmethod
    def getDisplayName():
        return "Indexation of MAF files"

    # ------------------------------------------------------------------------------------
    # Returns a list of parameter names that are required by the corresponding processor
    @staticmethod
    def getRequiredParameters():
        return (MAFIndexerProcessor.INPUT_MAF_FILE_PARAM, MAFIndexerProcessor.REFERENCE_SPECIES_PARAM)

    # --------------------------------------------------------------------------------------
    # Execute the processor
    def execute(self, input_commstructs):

        source_maffile = self.getParameter(MAFIndexerProcessor.INPUT_MAF_FILE_PARAM)
        self.referenceSpecies = self.getParameter(MAFIndexerProcessor.REFERENCE_SPECIES_PARAM)

        # Look for MAF files to parse
        maf_file_list = FileUtils.getFileList(source_maffile, "maf", self.referenceSpecies)
        if maf_file_list == None:
            raise ExecutionException("MAFIndexerProcessor.execute : The path '" + source_maffile + "' does not point to a MAF file or a directory containing MAF files and does not contain a subdirectory '" + self.referenceSpecies + "' containing MAF files.")

        count_file = 0
        for maf_file_path in maf_file_list:
            Log.trace("MAFIndexerProcessor.execute : Indexing " + maf_file_path)
            self.parseFile(maf_file_path)
            count_file += 1
            ProgressionManager.setComponentProgression(self.component, count_file / float(len(maf_file_list)))

    # --------------------------------------------------------------------------------------
    # Parse the MAF file
    def parseFile(self, maf_file_path):

        try:
            input_file = open(maf_file_path, "r")
        except IOError, io_exec:
            raise ParsingException("MAFIndexerProcessor.parseFile : Unable to open file '" + maf_file_path + "'. From:\n\t---> " + str(io_exec))
        if input_file != None:
            # Verify that the token '##maf' indicating a MAF file is found in the first lines
            is_maf_file = False
            while 1:
                line = input_file.readline()
                if len(line) == 0:
                    break
                elif not line.isspace():
                    tokens = line.split()
                    if tokens != None and len(tokens) > 0 and tokens[0] == "##maf":
                        is_maf_file = True
                        break

            if is_maf_file == True:
                output = []
                try:
                    # Search for the next line starting with 'a' (meaning a new alignment block)
                    counter = 0
                    specialized = True
                    ordered = True
                    previous_indexing = None
                    while 1:
                        line = input_file.readline()
                        if len(line) == 0:
                            break
                        elif not line.isspace():
                            tokens = line.split()
                            if tokens != None and len(tokens) > 0 and tokens[MAFIndexerProcessor._lineType_col] == "a":
                                counter += 1
                                if counter % 100000 == 0:
                                    Log.trace("MAFIndexerProcessor.execute : Number of MSA already indexed : " + str(counter))
                                # Byte offset of the block content, taken right after the 'a' line
                                line_number = input_file.tell()
                                indexing = self.indexBlock(input_file, previous_indexing)
                                if indexing != None:
                                    if previous_indexing != None:
                                        specialized = specialized and (indexing[1] == previous_indexing[1])
                                        ordered = ordered and (indexing[2] >= previous_indexing[3])
                                    output.append(indexing[0] + "\t" + str(line_number))
                                    previous_indexing = indexing

                    # Write the result of the indexing to file
                    output_path = maf_file_path + "index"
                    output_file = open(output_path, "w")
                    output_file.write(Constants.COMMENT_CHAR)
                    if specialized == True:
                        output_file.write("\t" + previous_indexing[1])
                    if ordered == True:
                        output_file.write("\t" + Constants.ORDERED)
                    else:
                        output_file.write("\t" + Constants.MIXED)
                    output_file.write("\n")
                    for indexing in output:
                        output_file.write(indexing + "\n")
                    output_file.flush()
                    self.closeFile(input_file)
                    self.closeFile(output_file)
                    return
                except IOError, io_exec:
                    raise ParsingException("MAFIndexerProcessor.parseFile : Unable to create/write file '" + output_path + "'. From:\n\t---> " + str(io_exec))
            else:
                self.closeFile(input_file)
                raise ParsingException("MAFIndexerProcessor.parseFile : The file '" + maf_file_path + "' is not a MAF file")
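    # Resulting index file layout (illustrative; the exact key format in
    # indexing[0] is produced by indexBlock(), which is not shown here):
    #
    #   <COMMENT_CHAR>[\t<species.chrom, if shared by all blocks>]\t<ORDERED|MIXED>
    #   <block key>\t<byte offset of the block in the MAF file>
    #   ...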
def chooseBindingPoints(self, motif, sites, distribution_mode, implantations, output_commstruct, dir_path):

    bedseq_list = output_commstruct.bedToMA.keys()
    bedseq_list_length = len(bedseq_list)
    chosen_distances_signed = []

    # Case of normal distribution
    # ...........................
    if distribution_mode == ImplantSitesProcessor.CENTERED_DISTRIBUTION_MODE_VALUE:
        for site in sites:
            tries = 0
            while True:
                # Draw a bedseq with uniform probability
                chosen_bedseq = bedseq_list[int(random.uniform(0, bedseq_list_length))]
                # Choose the start index using a normal distribution around the peak reference index
                chosen_middle_index = int(random.normalvariate(chosen_bedseq.referenceIndex, 30.0))
                chosen_start_index = chosen_middle_index - int(len(site) / float(2))
                chosen_end_index = chosen_middle_index + int(math.ceil(len(site) / float(2)))
                chosen_distances_signed.append(chosen_middle_index - chosen_bedseq.referenceIndex)

                # Test if any site previously placed intersects the chosen one
                intersect = False
                if chosen_bedseq in implantations.keys():
                    for previous_indexes in implantations[chosen_bedseq]:
                        if chosen_start_index < previous_indexes[2] and chosen_end_index > previous_indexes[1]:
                            intersect = True
                            break

                # If the position is free, add the chosen position to the list of implantations
                if not intersect:
                    if not chosen_bedseq in implantations.keys():
                        implantations[chosen_bedseq] = []
                    implantations[chosen_bedseq].append((site, chosen_start_index, chosen_end_index))
                    break
                else:
                    tries += 1
                    if tries > 50:
                        Log.trace("ImplantSitesProcessor.chooseBindingPoints : No place found for site : " + site + ". Bypassing site")
                        break

    # Case of uniform distribution
    # ............................
    elif distribution_mode == ImplantSitesProcessor.UNIFORM_DISTRIBUTION_MODE_VALUE:
        # Build a lookup table of cumulative lengths so a drawn index can easily be mapped back to a bedseq
        total_length = 0
        bedseq_limits = []
        for bedseq in bedseq_list:
            length = bedseq.getLength()
            bedseq_limits.append(total_length + length)
            total_length += length

        for site in sites:
            tries = 0
            while True:
                # Draw a number (uniformly) over all BED sequence indexes
                drawn_index = random.randint(0, total_length - 1)
                # Find which BED sequence and which index the drawn number corresponds to
                chosen_bedseq = None
                for index in range(bedseq_list_length):
                    if drawn_index < bedseq_limits[index]:
                        chosen_bedseq = bedseq_list[index]
                        if index == 0:
                            chosen_middle_index = chosen_bedseq.indexStart + drawn_index + int(len(site) / float(2))
                        else:
                            chosen_middle_index = chosen_bedseq.indexStart + drawn_index - bedseq_limits[index - 1]
                        break

                # If the BED sequence is correctly found, check if the place suits the site
                if chosen_bedseq != None:
                    # If the index is too near to the sequence endpoints, draw a new index
                    if chosen_middle_index < (chosen_bedseq.indexStart + int(len(site) / float(2))) or chosen_middle_index > (chosen_bedseq.indexEnd - int(math.ceil(len(site) / float(2)))):
                        continue
                    chosen_start_index = chosen_middle_index - int(len(site) / float(2))
                    chosen_end_index = chosen_middle_index + int(math.ceil(len(site) / float(2)))
                    chosen_distances_signed.append(chosen_middle_index - chosen_bedseq.referenceIndex)

                    # Test if any site previously placed intersects the chosen one
                    intersect = False
                    if chosen_bedseq in implantations.keys():
                        for previous_indexes in implantations[chosen_bedseq]:
                            if chosen_start_index < previous_indexes[2] and chosen_end_index > previous_indexes[1]:
                                intersect = True
                                break

                    # If the position is free, add the chosen position to the list of implantations
                    if not intersect:
                        if not chosen_bedseq in implantations.keys():
                            implantations[chosen_bedseq] = []
                        implantations[chosen_bedseq].append((site, chosen_start_index, chosen_end_index))
                        break
                    else:
                        tries += 1
                        if tries > 50:
                            Log.trace("ImplantSitesProcessor.chooseBindingPoints : No place found for site : " + site + ". Bypassing site")
                            break
                else:
                    print "No bedseq found"

    # Case of unknown distribution
    # ............................
    else:
        raise ExecutionException("ImplantSitesProcessor.chooseBindingPoints : The chosen distribution mode is unknown : '" + distribution_mode + "'")

    # Compute the histogram of site distances and graph it
    RSATUtils.outputHistogram(chosen_distances_signed, 5, dir_path, motif.name + "Sites", self.component.pipelineName, "Global distribution of " + motif.name + " sites over peaks", "Distance from peak maximum", "Number of occurrences", None)
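# Illustrative example of the cumulative-limits lookup above (hypothetical
# lengths): for three BED sequences of lengths 100, 50 and 200,
# bedseq_limits = [100, 150, 350] and total_length = 350. A drawn index of
# 120 is not below 100 but is below 150, so it falls in the second sequence
# at local offset 120 - bedseq_limits[0] = 20.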
            max_size = bedseq_length
        total_size += bedseq_length

    mean_size = int(total_size / float(bedseq_number))

    output_commstruct.paramStatistics[BedSeqCommStruct.BEDSEQUENCES_NUMBER] = bedseq_number
    output_commstruct.paramStatistics[BedSeqCommStruct.BEDSEQUENCES_MIN_SIZE] = min_size
    output_commstruct.paramStatistics[BedSeqCommStruct.BEDSEQUENCES_MAX_SIZE] = max_size
    output_commstruct.paramStatistics[BedSeqCommStruct.BEDSEQUENCES_MEAN_SIZE] = mean_size
    output_commstruct.paramStatistics[BedSeqCommStruct.BEDSEQUENCES_TOTAL_SIZE] = total_size

    Log.trace("BEDProcessor.execute : Total number of BED Sequences = " + str(bedseq_number))
    Log.trace("BEDProcessor.execute : Minimum size of BED Sequences = " + str(min_size))
    Log.trace("BEDProcessor.execute : Maximum size of BED Sequences = " + str(max_size))
    Log.trace("BEDProcessor.execute : Mean size of BED Sequences = " + str(mean_size))
    Log.trace("BEDProcessor.execute : Total size of BED Sequences = " + str(total_size))

    # Output the sequence size histogram
    self.outputSequenceSizeHistogram(bedseq_dictionnary, output_commstruct)

    return output_commstruct

# --------------------------------------------------------------------------------------
def execute(self, input_commstructs): if input_commstructs == None or len(input_commstructs) == 0: raise ExecutionException("MAFProcessor.execute : No inputs") input_commstruct = input_commstructs[0] # retrieve processor parameters source_maffile = self.getParameter(MAFProcessor.INPUT_MAF_FILE_PARAM) specialized_file_line = self.getParameter( MAFProcessor.SPECIALIZED_MAF_FILE_PARAM, False) if specialized_file_line == None: specialized_file = False else: specialized_file = (specialized_file_line.lower() == "true") desired_species_line = self.getParameter( MAFProcessor.DESIRED_SPECIES_LIST_PARAM, False) if desired_species_line != None: self.desiredSpeciesList = desired_species_line.split() self.referenceSpecies = self.getParameter( MAFProcessor.REFERENCE_SPECIES_PARAM) thread_number = self.getParameterAsint( MAFProcessor.THREAD_NUMBER_PARAM, False) if thread_number == None or thread_number < 0: thread_number = 1 keep_gaps = self.getParameter(MAFProcessor.KEEP_GAPS, False) if keep_gaps == None: keep_gaps = False # Retrieve BED sequences from the input CommStruct self.bedSequencesDict = input_commstruct.bedSequencesDict if self.bedSequencesDict == None or len(self.bedSequencesDict) == 0: raise ExecutionException( "MAFProcessor.execute : No BEDSequence provided as input") # Look for MAF files to parse maf_file_list = FileUtils.getFileList(source_maffile, "maf", self.referenceSpecies) if maf_file_list == None: raise ExecutionException( "MAFProcessor.execute : The path '" + source_maffile + "' does not point to a MAF file or a directory containing MAF files and does not contain a subdirectory '" + self.referenceSpecies + "' containing MAF files.") # Retrieve the alignement blocks of the MAF files corresponding to each BED sequence, # managing the parsing of MAF file list according to the chosen number of threads count_parsed_files = 0 ProgressionManager.setTaskProgression("Parsing MAF Files", self.component, 0.0) if thread_number > 1: # store the list of MAF file in a queue file_queue = Queue.Queue(len(maf_file_list)) for file in maf_file_list: file_queue.put(file) # Create a Thread lock used in several methods to avoid possible Thread conflicts self.threadLock = threading.Lock() # Launch the firsts Threads thread_list = [] while not file_queue.empty() and len(thread_list) < thread_number: self.startNewThread(file_queue, specialized_file, thread_list) # Manage the Threads list in order to empty the file queue while not file_queue.empty() or len(thread_list) > 0: for threads in thread_list: if not threads.is_alive(): thread_list.remove(threads) count_parsed_files += 1 ProgressionManager.setTaskProgression( "Parsing MAF Files", self.component, count_parsed_files / float(len(maf_file_list))) self.startNewThread(file_queue, specialized_file, thread_list) time.sleep(MAFProcessor.THREAD_CHECK_DELAY) else: # Parse the files in the MAF file list for file in maf_file_list: self.parseFile(file, specialized_file) count_parsed_files += 1 ProgressionManager.setTaskProgression( "Parsing MAF Files", self.component, count_parsed_files / float(len(maf_file_list))) # Assign the whole list of parsed species if a desired list was not set if len(self.desiredSpeciesList) == 0: self.desiredSpeciesList = self.parsedSpeciesList # create the output CommStruct output_commstruct = BedSeqAlignmentStatsCommStruct() output_commstruct.processorName = self.component.processorName output_commstruct.baseSpecies = input_commstruct.baseSpecies output_commstruct.bedSequencesDict = input_commstruct.bedSequencesDict 
    output_commstruct.paramStatistics = input_commstruct.paramStatistics
    output_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES] = self.referenceSpecies
    output_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.ALIGNED_SPECIES] = ", ".join(self.desiredSpeciesList)

    # Compose the MSA corresponding to each BED sequence from the MAF blocks
    ProgressionManager.setTaskProgression("Building MSA", self.component, 0.0)
    count = 0
    total_number_bed = len(self.mafBlockDic.keys())
    min_size = 100000000
    max_size = -1
    total_size = 0
    msa_lengths = []
    for bed_sequence in self.mafBlockDic.keys():
        count += 1
        if count % 100 == 0:
            ProgressionManager.setTaskProgression("Building MSA", self.component, count / float(total_number_bed))
        alignment = self.composeSequenceAlignment(bed_sequence, keep_gaps)
        align_length = alignment.totalLength
        msa_lengths.append(align_length)
        if align_length < min_size:
            min_size = align_length
        if align_length > max_size:
            max_size = align_length
        total_size += align_length
        output_commstruct.addSequenceAlignment(bed_sequence, alignment)
    ProgressionManager.setTaskProgression("Building MSA", self.component, 1.0)

    mean_size = int(total_size / float(total_number_bed))

    output_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.MSA_NUMBER] = count
    output_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.MSA_MIN_SIZE] = min_size
    output_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.MSA_MAX_SIZE] = max_size
    output_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.MSA_MEAN_SIZE] = mean_size
    output_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.MSA_TOTAL_SIZE] = total_size

    Log.trace("MAFProcessor.execute : Total number of BEDsequence with associated MSA = " + str(count))
    Log.trace("MAFProcessor.execute : Minimal size of BEDsequence with associated MSA = " + str(min_size))
    Log.trace("MAFProcessor.execute : Maximal size of BEDsequence with associated MSA = " + str(max_size))
    Log.trace("MAFProcessor.execute : Mean size of BEDsequence with associated MSA = " + str(mean_size))
    Log.trace("MAFProcessor.execute : Total size of BEDsequence with associated MSA = " + str(total_size))

    # Output the MSA lengths histogram and graph
    self.outputMSALenghtHistogram(msa_lengths, output_commstruct)

    return output_commstruct
def execute(self, input_commstructs): if input_commstructs == None or len(input_commstructs) == 0: raise ExecutionException("BEDOutputProcessor.execute : No inputs") input_commstruct = input_commstructs[0] # Retrieve the processor parameters reference_motif = self.getParameter(BEDOutputProcessor.REFERENCE_MOTIF) color_method = self.getParameter(BEDOutputProcessor.COLOR_METHOD, False) if color_method == None: color_method = BEDOutputProcessor.COLOR_METHOD_SCORE else: color_method = color_method.lower() if color_method != BEDOutputProcessor.COLOR_METHOD_SCORE and color_method != BEDOutputProcessor.COLOR_METHOD_FAMILY: color_method = BEDOutputProcessor.COLOR_METHOD_SCORE score_min = self.getParameterAsfloat(BEDOutputProcessor.SCORE_MIN) score_max = self.getParameterAsfloat(BEDOutputProcessor.SCORE_MAX) # Prepare the processor output dir out_path = os.path.join(self.component.outputDir, self.component.getComponentPrefix()) shutil.rmtree(out_path, True) FileUtils.createDirectory( out_path, 0777) # Retrieve the JASPAR motifs details motif_details = MotifUtils.getMotifsDetailsFromJaspar() motif_id = motif_details[ 0] motif_family = motif_details[ 1] family_rgb = {} # build the bed output file path bed_file_path = os.path.join(out_path, self.component.pipelineName + "_Motifs.bed") try: bed_file = open(bed_file_path, "w") #bed_file.write("track name='" + self.component.pipelineName + "' visibility=3 itemRgb='On' use_score=1\n") #bed_file.write("browser dense RSAT\n") #bed_file.write("browser dense\n") #bed_file.write("## seq_name start end feature_name score strand thickStart thickEnd itemRgb blockCount blockSizes blckStarts\n") current_color = None bedseq_list = input_commstruct.bedToMA.keys() bedseq_list.sort(BEDSequence.compare) previous_line_start = 0 previous_line_key = "" for bed_seq in bedseq_list: for msa in input_commstruct.bedToMA[ bed_seq]: for motif in msa.motifs: motif_name = motif.name if not input_commstruct.motifStatistics.has_key(motif_name): continue if motif_name in motif_id.keys(): out_name = motif_id[ motif_name] chromosom = bed_seq.chromosom start_position = bed_seq.indexStart + msa.fixIndex(motif.indexStart) end_position = bed_seq.indexStart + msa.fixIndex(motif.indexEnd) score = motif.score # Commented : Black is assigned to the reference motif #if motif_name == reference_motif: # item_rgb = "0,0,0" # for the other motif, color depends on the chosen method #else: if color_method == BEDOutputProcessor.COLOR_METHOD_FAMILY: if motif_name in motif_family.keys(): #print("-----------------------------") #print "Current color = " + str(current_color) #print "Motif name=" + motif_name #print "Motif family=" + motif_family[ motif_name] family_rgb = self.updateFamilyRGB(motif_family[ motif_name], family_rgb, current_color) #print "Family RGB = " + str(family_rgb) item_rgb = family_rgb[ motif_family[ motif_name]] #print "Item rgb = ", str(item_rgb) current_color = item_rgb else: item_rgb = BEDOutputProcessor.COLORS[ 0] else: item_rgb = self.getColorForScore(score, score_min, score_max) # Write the lines to output file if len( chromosom) <4: line_out = "chr" + chromosom else: line_out = chromosom line_out += "\t" + str(start_position) line_out += "\t" + str(end_position) line_out += "\t" + out_name line_out += "\t" + str(int(score * 1000)) line_out += "\t" + motif.strand line_out += "\t" + str(start_position) # ThickStart line_out += "\t" + str(end_position) # ThickEnd line_out += "\t" + item_rgb # itemRGB #line_out += "\t" + "0" # BlockCount #line_out += "\t" + "0" # BlockSizes 
                        #line_out += "\t" + "0"                 # blockStarts

                        # Build a key that represents the motif chrom, name and positions
                        line_key = chromosom + ":" + str(start_position) + ":" + str(end_position) + ":" + out_name

                        # If the new line has the same key as the previous one, only one of the two
                        # lines must be kept, i.e. the one with the highest score (the tell() and
                        # seek() methods permit to overwrite the old line if required).
                        # If the new line and the previous one have different keys, the new line is
                        # simply written
                        if previous_line_key != line_key:
                            previous_line_start = bed_file.tell()
                            bed_file.write(line_out)
                            bed_file.write("\n")
                            bed_file.flush()
                            previous_line_key = line_key
                            previous_score = score
                        else:
                            if score > previous_score:
                                bed_file.seek(previous_line_start)
                                bed_file.write(line_out)
                                bed_file.write("\n")
                                bed_file.flush()
                                previous_score = score

        bed_file.close()
        input_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.BED_OUTPUT_PATH] = bed_file_path

        # Sort the BED file (used for the bigBed conversion)
        sorted_bed_file_path = os.path.join(out_path, self.component.pipelineName + "_Motifs_sorted.bed")
        cmd = "sort -k1,1 -k2,2n"
        cmd += " " + bed_file_path
        cmd += " > " + sorted_bed_file_path

        Log.info("BEDOutputProcessor.execute : Sorting BED file")
        Log.info("BEDOutputProcessor.execute : command used is : " + cmd)

        cmd_result = commands.getstatusoutput(cmd)
        Log.trace("BEDOutputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str(cmd_result[0]))
        if cmd_result[0] != 0:
            Log.log("BEDOutputProcessor.execute : status returned is :" + str(cmd_result[0]) + " for command '" + cmd + "'")
            Log.log("BEDOutputProcessor.execute : command output is = \n" + str(cmd_result[1]))
            return input_commstruct

        # Fetch the chrom sizes that will be used to convert the BED file to a bigBed file
        chrom_sizes_path = os.path.join(out_path, self.component.pipelineName + "_chrom_size.txt")
        RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
        cmd = os.path.join(RSAT_PATH, "contrib/peak-footprints/tools/fetchChromSizes")
        cmd += " " + input_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES]
        cmd += " > " + chrom_sizes_path

        Log.info("BEDOutputProcessor.execute : Fetching Chrom sizes for species : " + input_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES])
        Log.info("BEDOutputProcessor.execute : command used is : " + cmd)

        cmd_result = commands.getstatusoutput(cmd)
        Log.trace("BEDOutputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str(cmd_result[0]))
        if cmd_result[0] != 0:
            Log.log("BEDOutputProcessor.execute : status returned is :" + str(cmd_result[0]) + " for command '" + cmd + "'")
            Log.log("BEDOutputProcessor.execute : command output is = \n" + str(cmd_result[1]))
            return input_commstruct

        # Build the bigBed file
        # sudo ln -s /lib/x86_64-linux-gnu/libssl.so.1.0.0 /usr/lib/libssl.so.10
        # sudo ln -s /lib/x86_64-linux-gnu/libcrypto.so.1.0.0 /usr/lib/libcrypto.so.10
        big_bed_path = os.path.join(out_path, self.component.pipelineName + "_Motifs.bb")
        RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
        cmd = os.path.join(RSAT_PATH, "contrib/peak-footprints/tools/bedToBigBed")
        cmd += " " + sorted_bed_file_path
        cmd += " " + chrom_sizes_path
        cmd += " " + big_bed_path

        Log.info("BEDOutputProcessor.execute : Converting BED file to bigBed file")
        Log.info("BEDOutputProcessor.execute : command used is : " + cmd)

        cmd_result = commands.getstatusoutput(cmd)
        Log.trace("BEDOutputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str(cmd_result[0]))
        if cmd_result[0] != 0:
            Log.log("BEDOutputProcessor.execute : status returned is :" + str(cmd_result[0]) + " for command '" + cmd + "'")
            Log.log("BEDOutputProcessor.execute : command output is = \n" + str(cmd_result[1]))
            return input_commstruct

        input_commstruct.paramStatistics[BedSeqAlignmentStatsCommStruct.BIGBED_OUTPUT_PATH] = big_bed_path

    except IOError, io_exce:
        Log.log("BEDOutputProcessor.execute : Unable to save the BED file of recognized motifs : " + str(io_exce))

    return input_commstruct