Exemple #1
0
    def verifyConfig(self):
        """Compare the current parameters with the configuration saved by a
        previous run of this processor.

        Returns True when a saved configuration exists and is identical to
        self.parameters (so the processor may be resumed), False otherwise.
        """

        previous_config = self.getConfigFromFile()

        if previous_config is None:
            Log.trace(
                "Component.verifyConfig : No config file found for processor '"
                + self.processorName + "'. Executing it")
            return False

        # The configurations match only when they hold exactly the same
        # parameter names mapped to exactly the same values -- which is
        # plain mapping equality.
        same = (previous_config == self.parameters)

        if not same:
            Log.trace("Component.verifyConfig : Configuration of processor '" +
                      self.processorName +
                      "' has changed since previous run: Previous = " +
                      str(previous_config) + " while current = " +
                      str(self.parameters) + ". Executing processor")

        return same
    def computeStartIndex(self, tokens, strand):
        """Return the forward-strand start position of the aligned sequence
        described by 'tokens'.

        On the positive strand the MAF start index is used as-is; otherwise
        the reverse-strand coordinate is converted back to a forward-strand
        position using the source sequence size.
        """

        # Fast path: forward strand needs no coordinate transformation
        if strand == Constants.POSITIVE_STRAND:
            return self.getIntValue(tokens[MAFIndexerProcessor._startindex_col])

        # Reversed sequence: coordinates must be transformed
        Log.trace( "MAFIndexer : Negative strand detected")
        total_size = self.getIntValue(tokens[MAFIndexerProcessor._source_size_col])
        aligned_len = self.getIntValue(tokens[MAFIndexerProcessor._textlength_col])
        rev_pos = self.getIntValue(tokens[MAFIndexerProcessor._startindex_col])
        return total_size + 1 - (aligned_len + rev_pos)
Exemple #3
0
    def parseFile(self, file_name, is_chrom_file):
        """Parse the MAF file at 'file_name'.

        The file must contain a '##maf' header token. When a companion index
        file exists it drives the parsing; otherwise the block list is
        scanned sequentially.

        Raises a ParsingException when the file cannot be opened or is not
        a MAF file.
        """

        try:
            input_file = open(file_name, 'r')

            # Verify if the token '##maf' indicating a MAF file is found in
            # the first non-blank lines
            is_maf_file = False
            while True:
                line = input_file.readline()
                if len(line) == 0:
                    # End of file: no '##maf' token found
                    break
                elif not line.isspace():
                    tokens = line.split()
                    # str.split() never returns None, so only emptiness matters
                    if len(tokens) > 0 and tokens[0] == "##maf":
                        is_maf_file = True
                        break

            # if it is a maf file, verify if an index file exists
            if is_maf_file:
                indexed = False
                # NOTE(review): the index suffix is 'index' with no separator
                # (e.g. 'foo.maf' -> 'foo.mafindex') -- confirm this matches
                # the name produced by the indexer.
                index_path = file_name + "index"
                try:
                    input_index_file = open(index_path, "r")
                    indexed = True
                except IOError:
                    # No index available: fall back to a sequential parse
                    pass

                if indexed:
                    Log.trace("MAFProcessor.parseFile : parsing file '" +
                              file_name + "' using index '" + index_path + "'")
                    self.parseBlockListWithIndex(input_index_file, input_file)
                    self.closeFile(input_index_file)
                else:
                    Log.trace("MAFProcessor.parseFile : parsing file '" +
                              file_name + "'")
                    self.parseBlockListWithoutIndex(input_file, is_chrom_file)

                self.closeFile(input_file)
                return

            else:
                self.closeFile(input_file)
                raise ParsingException("MAFProcessor.parseFile : The file '" +
                                       file_name + "' is not a MAF file")
        except IOError as io_exec:
            # 'Unable' fixes the garbled 'Enable' in the original message
            raise ParsingException(
                "MAFProcessor.parseFile : Unable to open file '" + file_name +
                "'. From:\n\t---> " + str(io_exec))
Exemple #4
0
    def startNewThread(self, file_queue, specialized_file, thread_list):
        """Pop the next file from 'file_queue' and parse it on a new thread.

        The thread is registered in 'thread_list' before being started.
        Nothing happens when the queue is empty.
        """

        if file_queue.empty():
            return

        next_file = file_queue.get()
        # The thread is named after the file it parses
        worker = threading.Thread(group=None,
                                  target=self.parseFile,
                                  name=next_file,
                                  args=(next_file, specialized_file))
        thread_list.append(worker)
        Log.trace(
            "MAFProcessor.startNewThread : Starting new thread to parse file : '"
            + next_file + "'. Number of active Thread = " +
            str(len(thread_list)))
        worker.start()
    def addSites(self, output_commstruct):
        """Implant transcription-factor motif binding sites into the MSA.

        Reads the processor parameters (site number, motif list, database
        path, optimization flag, distribution mode), builds the motif
        definitions, generates the requested sites and implants them into
        'output_commstruct'. Does nothing when the requested site number is
        not strictly positive.
        """

        # Retrieve the algorithm parameters
        site_number = self.getParameterAsint(
            ImplantSitesProcessor.SITE_NUMBER_PARAM)

        if site_number <= 0:
            Log.trace(
                "ImplantSitesProcessor.addSites : Motif sites implantation not requested"
            )
            return

        motif_list_line = self.getParameter(
            ImplantSitesProcessor.MOTIF_LIST_PARAM)
        motif_name_list = motif_list_line.split()

        optimize_motif = (self.getParameter(
            ImplantSitesProcessor.OPTIMIZE_MOTIF_PARAM).lower() == "true")

        database_file_path = self.getParameter(
            ImplantSitesProcessor.DATABASE_FILE_PATH_PARAM)

        distribution_mode = self.getParameter(
            ImplantSitesProcessor.DISTRIBUTION_MODE_PARAM).lower()

        # Retrieve the motifs PWM
        motif_def_list = self.getMotifDefinitions(motif_name_list,
                                                  database_file_path)

        # Prepare a clean output directory for this component
        dir_path = os.path.join(self.component.outputDir,
                                self.component.getComponentPrefix())
        shutil.rmtree(dir_path, True)
        os.mkdir(dir_path)

        # Generate the motif sites, either optimal or randomly sampled
        motif_sites = {}
        for motif in motif_def_list:
            if optimize_motif:
                motif_sites[motif] = self.generateOptimalSites(
                    motif, site_number)
            else:
                motif_file_path = self.outputMotifDefinition(motif, dir_path)
                motif_sites[motif] = self.generateRandomSites(
                    motif, motif_file_path, site_number)

        # Implant sites in the MSA
        self.implantSites(motif_sites, distribution_mode, output_commstruct,
                          dir_path)
 def execute( self, input_commstructs):
     """Index all MAF files designated by the processor parameters.

     Locates the MAF file (or directory of MAF files) given by the input
     parameter, parses each file and reports progression after each one.

     Raises an ExecutionException when no MAF file can be located.
     """

     source_maffile = self.getParameter( MAFIndexerProcessor.INPUT_MAF_FILE_PARAM)
     self.referenceSpecies = self.getParameter( MAFIndexerProcessor.REFERENCE_SPECIES_PARAM)

     # look for MAF files to parse
     maf_file_list = FileUtils.getFileList( source_maffile, "maf", self.referenceSpecies)
     if maf_file_list is None:
         raise ExecutionException( "MAFIndexerProcessor.execute : The path '" + source_maffile + "' does not point to a MAF file or a directory containing MAF files and does not contain a subdirectory '" + self.referenceSpecies + "' containing MAF files.")

     count_file = 0
     for maf_file_path in maf_file_list:
         Log.trace( "MAFIndexerProcessor.execute : Indexing " + maf_file_path)
         self.parseFile( maf_file_path)
         count_file += 1
         # float() keeps the ratio fractional under Python 2 integer division
         ProgressionManager.setComponentProgression( self.component, count_file/float( len( maf_file_list)))
    def execute(self, input_commstructs):
        """Run the processor: implant motif sites into the first input.

        Raises an ExecutionException when no input CommStruct is provided.
        Returns the (modified) input CommStruct.
        """

        if not input_commstructs:
            raise ExecutionException(
                "ImplantSitesProcessor.execute : No inputs")

        input_commstruct = input_commstructs[0]

        # Implant TF Motif binding sites in MSA Sequences
        Log.trace("ImplantSitesProcessor.execute : Implanting motif sites")
        ProgressionManager.setTaskProgression("Implanting motif sites",
                                              self.component, 0.0)
        self.addSites(input_commstruct)
        ProgressionManager.setTaskProgression("Implanting motif sites",
                                              self.component, 1.0)

        return input_commstruct
Exemple #8
0
    def execute(self, input_commstructs):
        """Detect conserved blocks in every MSA of the input CommStruct.

        Reads the window/conservation parameters and the species list, then
        for each alignment either runs the conserved-block analysis or, with
        the 'None' algorithm, keeps the whole alignment as a single block.

        Raises an ExecutionException when no input CommStruct is provided.
        Returns the (modified) input CommStruct.
        """

        if not input_commstructs:
            raise ExecutionException("BlockProcessor.execute : No inputs")

        input_commstruct = input_commstructs[0]

        # retrieve the processor parameters
        self.windowSize = self.getParameterAsint(
            BlockProcessor.WINDOW_SIZE_PARAM)
        self.residuConservationLimit = self.getParameterAsfloat(
            BlockProcessor.RESIDU_CONSERVATION_LIMIT_PARAM)
        self.windowConservationLimit = self.getParameterAsfloat(
            BlockProcessor.WINDOW_CONSERVATION_LIMIT_PARAM)
        # NOTE(review): when the algorithm parameter is absent, self.algorithm
        # must already exist (e.g. a class-level default), otherwise the
        # Log.trace below raises AttributeError -- confirm a default is set.
        algo = self.getParameter(BlockProcessor.ALGORITHM_PARAM, False)
        if algo is not None:
            self.algorithm = algo.lower()

        referenceSpecies = self.getParameter(
            BlockProcessor.REFERENCE_SPECIES_PARAM)

        desired_species_line = self.getParameter(
            BlockProcessor.DESIRED_SPECIES_LIST_PARAM, False)
        Log.trace("BlockProcessor.execute : Chosen Algorithm is '" +
                  self.algorithm + "'")

        # The reference species always leads the desired species list
        self.desiredSpeciesList = [referenceSpecies]
        if desired_species_line is not None:
            self.desiredSpeciesList.extend(desired_species_line.split())

        # Analyze the conserved region in each MSA
        # If 'None' algorithm is chosen, the entire MSA is considered as conserved
        for bed_seq in input_commstruct.bedToMA.keys():
            for alignment in input_commstruct.bedToMA[bed_seq]:
                pwm = PWM()
                pwm.initFromAlignment(alignment, self.desiredSpeciesList)
                if self.algorithm != BlockProcessor.ALGORITHM_NONE_VALUE:
                    self.analyzeConservedBlocks(pwm, alignment)
                else:
                    new_block = Motif(0, alignment.totalLength, "", pwm)
                    new_block.composeName(alignment.name)
                    alignment.addMotif(new_block, True)

        return input_commstruct
Exemple #9
0
    def parseBlockListWithoutIndex(self, input_file, is_chrom_file):
        """Sequentially parse every alignment block of an open MAF file.

        Scans 'input_file' for lines whose first token is 'a' (start of a
        new alignment block) and parses each block. For chromosome files
        the scan stops as soon as a block fails to parse.
        """

        # search for the next line starting with 'a' (meaning new alignment block)
        counter = 0
        while True:
            line = input_file.readline()
            if len(line) == 0:
                # End of file reached
                break
            elif not line.isspace():
                tokens = line.split()
                # str.split() never returns None; only check for content
                if len(tokens) > 0 and tokens[
                        MAFProcessor._lineType_col] == "a":
                    counter += 1
                    if counter % 100000 == 0:
                        Log.trace(
                            "MAFIndexerProcessor.execute : Number of MSA already parsed : "
                            + str(counter))
                    parsed = self.parseBlock(input_file)
                    if not parsed and is_chrom_file:
                        return
    def executePipelines(self):
        """Execute every pipeline definition queued in self.serverQueue.

        Each queue entry packs (pipelines_filepath, pipeline_options,
        verbosity, resume_flag, working_dir). Output directories and log
        files are (re)initialized before the pipeline XML definition is
        parsed and persisted.

        Raises a ParsingException when the pipeline XML cannot be read.

        NOTE(review): this excerpt appears truncated -- 'result',
        'pipelines' and 'resume' are prepared but the code that consumes
        them is not visible here.
        """

        result = True

        while len(self.serverQueue) > 0:

            # Unpack the queued request
            params = self.serverQueue[0]
            pipelines_filepath = params[0]
            pipeline_options = params[1]
            try:
                verbosity = int(params[2])
            except ValueError:
                # Fall back to the default verbosity on a malformed value
                verbosity = 1
            resume = (params[3].lower() == "true")
            working_dir = params[4]

            # Modifies the config if required and initialize logs and output directory
            if working_dir != None and len(working_dir) > 0:
                self.config[PFConstants.BASE_OUTPUT_DIR_PARAM] = working_dir

            # Verify the base output dir and the output dir are created and create them if not
            FileUtils.createDirectory(
                self.config[PFConstants.BASE_OUTPUT_DIR_PARAM], 0777)
            self.config[PFConstants.OUTPUT_DIR_PARAM] = os.path.join(
                self.getParameter(PFConstants.BASE_OUTPUT_DIR_PARAM),
                PFConstants.OUTPUT_DIR_NAME)
            FileUtils.createDirectory(
                self.config[PFConstants.OUTPUT_DIR_PARAM], 0777)

            # Switch log location
            Log.switchFiles(self.getParameter(PFConstants.OUTPUT_DIR_PARAM),
                            verbosity)

            # Parse the XML file to retrieve the pipelines definition
            Log.trace(
                "#################################################################################"
            )
            Log.trace(
                "# PipelineManager.executePipelines : Reading pipelines from : "
                + pipelines_filepath)
            Log.trace(
                "#################################################################################"
            )

            try:
                pipelines = PipelineXMLParser.getPipelines(pipelines_filepath)
                OptionManager.applyOptions(pipelines, pipeline_options)
                # Persist the applied pipeline definition next to the outputs
                PipelineXMLParser.toXMLFile(
                    self.config[PFConstants.OUTPUT_DIR_PARAM], pipelines)
            except SyntaxError, syn_exce:
                raise ParsingException(
                    "PipelineManager.executePipelines : Unable to read definition of pipelines from XML file: '"
                    + pipelines_filepath + "'. From:\n\t---> " + str(syn_exce))
            except ParsingException, par_exce:
                raise ParsingException(
                    "PipelineManager.executePipelines : Unable to read definition of pipelines from XML file: '"
                    + pipelines_filepath + "'. From:\n\t---> " + str(par_exce))
Exemple #11
0
    def getInputCommStructs(self):
        """Collect the input CommStructs for this component.

        When no input-file parameter is given, the outputs of previous
        components are reloaded, each checked against the authorized input
        classes. Otherwise the given input file is tried against each
        authorized class until one succeeds.

        Raises an ExecutionException when a previous component's result is
        of an unexpected class, or when the input file cannot be read by
        any authorized class. Returns the list of loaded CommStructs.
        """

        authorized_input_classes = self.getAuthorizedInputClasses()

        input_commstructs = []
        if authorized_input_classes is not None:
            input_file = self.getParameter(Component.INPUT_FILE_PARAM, False)
            if input_file is None:
                # Compares the list of authorized inputs to outputs of previous components
                for component in self.previousComponents:
                    previous_result_class = component.resultClass
                    if previous_result_class in authorized_input_classes:
                        input_commstruct = previous_result_class.fromXMLFile(
                            component.getOutputFilePath())
                        if input_commstruct is not None:
                            input_commstructs.append(input_commstruct)
                    else:
                        # str() is required here: concatenating the class
                        # object itself with a string raises a TypeError
                        raise ExecutionException(
                            "Component.getInputCommStructs : input is not of the right class. Class is '"
                            + str(previous_result_class) +
                            "' but waited classes are " +
                            str(authorized_input_classes))
            else:
                # Try to read the input file using classes authorized as input
                for input_class in authorized_input_classes:
                    try:
                        Log.trace(
                            "Component.getInputCommStructs : Trying to load data from file : "
                            + input_file)
                        input_commstruct = input_class.fromXMLFile(input_file)
                        if input_commstruct is not None:
                            input_commstructs.append(input_commstruct)
                        Log.trace(
                            "Component.getInputCommStructs : Data correctly loaded"
                        )
                    except Exception as exce:
                        Log.trace(
                            "Component.getInputCommStructs : Data not loaded using class '"
                            + str(input_class) + "' : " + str(exce))
                if len(input_commstructs) == 0:
                    raise ExecutionException(
                        "Component.getInputCommStructs : The provided input file does not contain information the processor '"
                        + self.processorName + "' can manage : " + input_file)

        # The original dropped the collected list on the floor; callers need it
        return input_commstructs
Exemple #12
0
    def start(self, pipeline, pipeline_out, runtime_params, resume=False):
        """Prepare the component for execution, resuming a previous run if possible.

        When 'resume' is True and every previous component was itself
        resumed, the saved configuration is compared with the current one
        and an existing output file is reloaded; on success the component
        is flagged as resumed and True is returned. In every other case the
        old output files are removed so the processor gets re-executed.

        NOTE(review): this excerpt returns True only on the resume paths;
        any non-resume return value is not visible here.
        """

        self.outputDir = pipeline_out
        self.runtimeParameters = runtime_params

        if resume == True:
            # Test if the previous component were all resumed
            if self.canResume():
                self.resumed = False
                # test if the Component parameters have changed since the previous run. If so, the processor cannot
                # be resumed and must be re-run
                if self.verifyConfig():
                    # Test if an output file of a previous run of the associated processor can be retrieved
                    # If so (or if the processor has output no files), the Component is declared as resumed and returns True
                    try:
                        output_filepath = self.getOutputFilePath()
                        if os.path.isfile(output_filepath):
                            authorized_output_classes = self.getAuthorizedOutputClasses(
                            )
                            if authorized_output_classes != None:
                                # Try each authorized class until one can reload the file
                                for output_class in authorized_output_classes:
                                    try:
                                        output_commstruct = output_class.fromXMLFile(
                                            output_filepath)
                                        if output_commstruct != None:
                                            self.resultClass = output_class
                                            self.resumed = True
                                            self.executed = False
                                            ProgressionManager.setComponentStatus(
                                                self, ProgressionManager.
                                                RESUMED_STATUS)
                                            Log.trace(
                                                "Component.execute : Resuming data from file : "
                                                + output_filepath)
                                            # Only resultClass is kept; drop the loaded
                                            # data immediately to release the memory
                                            output_commstruct = None
                                            gc.collect()
                                            return True
                                    except BaseException, exce:
                                        Log.info(
                                            "Component.execute : Tried to resume output file with class '"
                                            + str(output_class) + "' : " +
                                            str(exce))
                                        pass
                            else:
                                # No authorized output classes: the processor outputs
                                # no files, so it resumes without reloading anything
                                self.resumed = True
                                self.executed = False
                                ProgressionManager.setComponentStatus(
                                    self, ProgressionManager.RESUMED_STATUS)
                                return True
                    except IOError, io_exce:
                        Log.trace(
                            "Component.execute : Unable to open output file to resume processor '"
                            + self.processorName + "'. From\n\t---> " +
                            str(io_exce))

                    # Here, the processor cannot be resumed, for any reason linked to outfiles,
                    Log.trace(
                        "Component.execute : No output file found for processor '"
                        + self.processorName + "': executing it")

                self.removePreviousOutputs()

            # If the processor does not have to be resumed because previous components were not resumed,
            # removes all old output files and the processor is executed
            else:
                Log.trace(
                    "Component.execute : Processor '" + self.processorName +
                    "' cannot be resumed since previous components have been executed."
                )
                self.removePreviousOutputs()
Exemple #13
0
    def execute(self, input_commstructs):
        """Build a synthetic BED-sequence / MSA dataset.

        Either reuses the first 'bedseq_number' BED sequences from the
        input CommStruct or, when a medium length is supplied, generates
        random ones; then generates trivial or random MSAs of the requested
        length, implants insertion characters and exports size histograms.

        Raises an ExecutionException when no input CommStruct is provided.
        Returns the newly built BedSeqAlignmentStatsCommStruct.
        """

        if not input_commstructs:
            # Message fixed: this is GenerateMSAProcessor, not
            # ImplantSitesProcessor (copy-paste error in the original)
            raise ExecutionException(
                "GenerateMSAProcessor.execute : No inputs")

        input_commstruct = input_commstructs[0]

        # Retrieve the processor parameters
        bedseq_number = self.getParameterAsint(
            GenerateMSAProcessor.PEAK_NUMBER_PARAM)

        insertion_number = self.getParameterAsint(
            GenerateMSAProcessor.INSERTION_NUMBER_PARAM)

        # Optional: when absent, real BED sequences are reused instead of
        # random ones being generated
        bedseq_medium_length = self.getParameterAsint(
            GenerateMSAProcessor.PEAK_MEDIUM_SIZE_PARAM, False)

        msa_length = self.getParameterAsint(
            GenerateMSAProcessor.MSA_SIZE_PARAM)

        trivial_msa = self.getParameter(
            GenerateMSAProcessor.TRIVIAL_SEQUENCES_PARAM, False)
        if trivial_msa is None:
            trivial_msa = False
        else:
            trivial_msa = (trivial_msa.lower() == "true")

        # Prepare a clean processor output dir
        out_path = os.path.join(self.component.outputDir,
                                self.component.getComponentPrefix())
        shutil.rmtree(out_path, True)
        os.mkdir(out_path)

        # Build the output CommStruct
        output_commstruct = BedSeqAlignmentStatsCommStruct()
        output_commstruct.baseSpecies = input_commstruct.baseSpecies

        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.
            REFERENCE_SPECIES] = output_commstruct.baseSpecies
        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.ALIGNED_SPECIES] = ""
        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.BEDSEQUENCE_NUMBER] = bedseq_number
        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.
            BEDSEQUENCE_WITH_MSA_NUMBER] = bedseq_number

        if bedseq_medium_length is None:
            # Get the required number of sequences from the input BED
            # sequence list, tracking the maximum sequence length seen
            count_peak = 0
            max_length = 0
            for chrom in input_commstruct.bedSequencesDict:
                output_commstruct.bedSequencesDict[chrom] = []
                for bedseq in input_commstruct.bedSequencesDict[chrom]:
                    output_commstruct.bedSequencesDict[chrom].append(bedseq)
                    length = bedseq.indexEnd - bedseq.indexStart
                    if length > max_length:
                        max_length = length
                    count_peak += 1
                    if count_peak >= bedseq_number:
                        break
                if count_peak >= bedseq_number:
                    break
        else:
            # Generate random BED sequences
            self.generateBedSequences(bedseq_number, bedseq_medium_length,
                                      output_commstruct)

        # Export the new bedsequence size histogram and graph
        self.outputSequenceSizeHistogram(output_commstruct)

        # Generate MSA for each BED Sequence
        Log.trace("GenerateMSAProcessor.execute : Generating MSA")
        ProgressionManager.setTaskProgression("Generating MSA", self.component,
                                              0.0)
        if trivial_msa:
            self.generateTrivialMSA(msa_length, bedseq_number,
                                    output_commstruct)
        else:
            self.generateRandomMSA(msa_length, bedseq_number,
                                   bedseq_medium_length, output_commstruct)
        ProgressionManager.setTaskProgression("Generating MSA", self.component,
                                              1.0)

        # Implant insertion characters into the MSA Sequences
        Log.trace("GenerateMSAProcessor.execute : Implanting insertions")
        ProgressionManager.setTaskProgression("Implanting insertions",
                                              self.component, 0.0)
        self.implantInsertions(output_commstruct, insertion_number)
        ProgressionManager.setTaskProgression("Implanting insertions",
                                              self.component, 1.0)

        # Export the new bedsequence size histogram and graph
        self.outputMSALenghtHistogram(output_commstruct)

        return output_commstruct
            except SyntaxError, syn_exce:
                raise ParsingException(
                    "PipelineManager.executePipelines : Unable to read definition of pipelines from XML file: '"
                    + pipelines_filepath + "'. From:\n\t---> " + str(syn_exce))
            except ParsingException, par_exce:
                raise ParsingException(
                    "PipelineManager.executePipelines : Unable to read definition of pipelines from XML file: '"
                    + pipelines_filepath + "'. From:\n\t---> " + str(par_exce))

            if pipelines == None or len(pipelines) == 0:
                raise ParsingException(
                    "PipelineManager.executePipelines : No pipeline defined in the given definition file : "
                    + pipelines_filepath)

            # Verify if the definition of pipelines is correct
            Log.trace("PipelineManager.executePipelines : Verifying pipelines")
            try:
                self.verifyPipelinesDefinition(pipelines)
            except ParsingException, exe_exce:
                raise ParsingException(
                    "PipelineManager.executePipelines : Canceling execution of pipelines. From:\n\t---> "
                    + str(exe_exce))

            # Initialize the ProgressionManager
            ProgressionManager.initialize(
                pipelines, self.getParameter(PFConstants.OUTPUT_DIR_PARAM),
                self.getParameter(PFConstants.INSTALL_DIR_PARAM))

            # Execute the pipelines
            Log.trace("**************************************************")
            Log.trace("# Starting Pipelines")
class MAFIndexerProcessor( Processor):
    """Processor that indexes MAF (Multiple Alignment Format) files.

    For each alignment block of a MAF file, a key and the file offset of the
    block are written to a companion index file so that blocks can later be
    retrieved without re-parsing the whole file.
    """
    
    # Names of the configuration parameters read by this processor
    INPUT_MAF_FILE_PARAM = "MAFFile"
    REFERENCE_SPECIES_PARAM = "ReferenceSpecies"
    
    # Column indexes of the whitespace-split tokens of a MAF 's' (sequence) line
    _lineType_col = 0
    _speciesChrom_col = 1
    _startindex_col = 2
    _textlength_col = 3
    _strand_col = 4
    _source_size_col = 5
    _text_col = 6


    # --------------------------------------------------------------------------------------
    def __init__(self):
        """Initialize the base Processor state and the reference species name."""
        Processor.__init__( self)
        # Species whose coordinates the index refers to; filled from parameters in execute()
        self.referenceSpecies = ""


    # --------------------------------------------------------------------------------------
    # Returns the name of the CommStruct class used as input 
    # (None if no input CommStruct)
    @staticmethod
    def getInputCommStructClass():
        """Return the CommStruct class consumed as input (None: no input CommStruct needed)."""
        return None


    # --------------------------------------------------------------------------------------
    # Returns the name of the CommStruct class used as output
    # (None if no output CommStruct)
    @staticmethod
    def getOutputCommStructClass():
        """Return the CommStruct class produced as output (None: no output CommStruct)."""
        return None



    # --------------------------------------------------------------------------------------
    # Returns a name that will be used as display name in the user friendly outputs
    @staticmethod
    def getDisplayName():
        """Return the human-friendly name shown in user-facing outputs."""
        return "Indexation of MAF files"
        


    #------------------------------------------------------------------------------------
    # Returns a list of parameters names that are required parameters for the corresponding processor
    @staticmethod
    def getRequiredParameters():
        """Return the parameter names that are mandatory for this processor."""
        return (MAFIndexerProcessor.INPUT_MAF_FILE_PARAM,
                MAFIndexerProcessor.REFERENCE_SPECIES_PARAM)
        


    # --------------------------------------------------------------------------------------
    # Execute the processor
    def execute( self, input_commstructs):
        """Locate the MAF files designated by the processor parameters and index each of them.

        Reads INPUT_MAF_FILE_PARAM (file or directory path) and
        REFERENCE_SPECIES_PARAM, then calls parseFile() on every MAF file found.
        Raises an ExecutionException when no MAF file can be located.
        """
        source_maffile = self.getParameter( MAFIndexerProcessor.INPUT_MAF_FILE_PARAM)
        self.referenceSpecies = self.getParameter( MAFIndexerProcessor.REFERENCE_SPECIES_PARAM)
        
        # look for MAF files to parse
        maf_file_list = FileUtils.getFileList( source_maffile, "maf", self.referenceSpecies)
        # 'is None' instead of '== None': identity test is the correct idiom
        if maf_file_list is None:
            raise ExecutionException( "MAFIndexerProcessor.execute : The path '" + source_maffile + "' does not point to a MAF file or a directory containing MAF files and does not contain a subdirectory '" + self.referenceSpecies + "' containing MAF files.")
        
        count_file = 0
        for maf_file_path in maf_file_list:
            Log.trace( "MAFIndexerProcessor.execute : Indexing " + maf_file_path)
            self.parseFile( maf_file_path)
            count_file += 1
            # Report progression as the fraction of files already indexed
            ProgressionManager.setComponentProgression( self.component, count_file/float( len( maf_file_list)))


    # --------------------------------------------------------------------------------------
    # Parse the MAF file
    def parseFile(self, maf_file_path):
        """Parse one MAF file and write a companion index file ('<maf_file_path>index').

        The index holds one line per alignment block: a key built by
        indexBlock() plus the file offset where the block body starts. The
        header line flags whether the file is specialized to a single source
        (all blocks share indexing[1]) and whether the blocks appear in
        ascending order. Raises a ParsingException if the file cannot be
        read, is not a MAF file, or the index file cannot be written.
        """

        try:
            input_file = open( maf_file_path, "r")
        except IOError,  io_exec:
            raise ParsingException( "MAFIndexerProcessor.parseFile : Unable to open file '" + maf_file_path + "'. From:\n\t---> " + str(io_exec))
         
        if input_file != None:
            
            # Verify if the token '##maf' indicating a MAF file is found in the first lines
            is_maf_file = False
            while 1:
                line = input_file.readline()
                if len( line) == 0:
                    # readline() returns '' only at end-of-file
                    break
                elif not line.isspace():
                    tokens = line.split()
                    if tokens != None and len( tokens) > 0 and tokens[0] == "##maf":
                        is_maf_file = True
                        break
            
            if is_maf_file == True:
                output = []
                try:

                    # search for the next line starting with 'a' (meaning new alignment block)
                    counter = 0
                    specialized = True
                    ordered = True
                    previous_indexing = None
                    while 1:
                        line = input_file.readline()
                        if len( line) == 0:
                            break
                        elif not line.isspace():
                            tokens = line.split()
                            if tokens != None and len( tokens) > 0 and tokens[ MAFIndexerProcessor._lineType_col] == "a":
                                counter += 1
                                if counter % 100000 == 0:
                                    Log.trace( "MAFIndexerProcessor.execute : Number of MSA already indexed : " + str( counter))
                                # Offset of the block body; stored with the key so the block can be seek()'d later
                                line_number = input_file.tell()
                                indexing = self.indexBlock( input_file, previous_indexing)
                                if indexing != None:
                                    if previous_indexing != None:
                                        # specialized: all blocks share the same indexing[1] (presumably the source/chromosome key — TODO confirm against indexBlock)
                                        specialized = specialized and (indexing[1] == previous_indexing[1])
                                        # ordered: each block starts at or after the previous block's end
                                        ordered = ordered and (indexing[2] >= previous_indexing[3])
                                    output.append( indexing[0] + "\t" + str( line_number))
                                    previous_indexing = indexing
                    
                    #Write the result of indexing in file
                    output_path = maf_file_path + "index"
                    output_file = open( output_path, "w")
                    output_file.write( Constants.COMMENT_CHAR)
                    # NOTE(review): if the file contains no alignment block, previous_indexing
                    # is still None here and writing the header below raises a TypeError — confirm
                    # whether empty MAF files can occur
                    if specialized == True:
                        output_file.write( "\t" + previous_indexing[ 1])
                        if ordered == True:
                            output_file.write( "\t" + Constants.ORDERED)
                    else:
                        output_file.write( "\t" + Constants.MIXED)
                    output_file.write( "\n")
                    for indexing in output:
                        output_file.write( indexing + "\n")
                    output_file.flush()
                    self.closeFile( input_file)
                    self.closeFile( output_file)
                    return
                except IOError, io_exec:
                    # NOTE(review): message typo "Enable" -> "Unable"; also input_file/output_file
                    # are not closed on this path
                    raise ParsingException( "MAFIndexerProcessor.parseFile : Enable to create/write file '" + output_path + "'. From:\n\t---> " + str( io_exec))
                
                
            else:
                self.closeFile( input_file)
                raise ParsingException( "MAFIndexerProcessor.parseFile : The file '" + maf_file_path + "' is not a MAF file")
    def chooseBindingPoints(self, motif, sites, distribution_mode,
                            implantations, output_commstruct, dir_path):
        """Draw an implantation position for each site of the given motif.

        Positions are drawn over the BED sequences of output_commstruct either
        with a normal law centered on each peak reference index
        (CENTERED_DISTRIBUTION_MODE_VALUE) or uniformly over the cumulated
        sequence lengths (UNIFORM_DISTRIBUTION_MODE_VALUE). Accepted
        (site, start, end) tuples are appended to 'implantations'
        (dict: BED sequence -> list of tuples); a site is bypassed after 50
        draws that overlap previously implanted sites. Finally the histogram
        of the signed distances between accepted positions and peak maxima is
        written to dir_path.

        Raises an ExecutionException when distribution_mode is unknown.
        """

        bedseq_list = output_commstruct.bedToMA.keys()
        bedseq_list_length = len(bedseq_list)

        chosen_distances_signed = []

        # Case of normal distribution
        # ...........................
        if distribution_mode == ImplantSitesProcessor.CENTERED_DISTRIBUTION_MODE_VALUE:
            for site in sites:
                tries = 0
                while True:
                    # Draw a bedseq with uniform probability
                    chosen_bedseq = bedseq_list[int(
                        random.uniform(0, bedseq_list_length))]
                    # Choose start index using normal distribution around peak reference index
                    chosen_middle_index = int(
                        random.normalvariate(chosen_bedseq.referenceIndex,
                                             30.0))
                    chosen_start_index = chosen_middle_index - int(
                        len(site) / float(2))
                    chosen_end_index = chosen_middle_index + int(
                        math.ceil(len(site) / float(2)))
                    # Test if any other site previously placed intersects the chosen one.
                    # (A leftover debug override that forced 'intersect = False' and
                    # silently disabled this test has been removed.)
                    intersect = False
                    if chosen_bedseq in implantations.keys():
                        for previous_indexes in implantations[chosen_bedseq]:
                            if chosen_start_index < previous_indexes[
                                    2] and chosen_end_index > previous_indexes[
                                        1]:
                                intersect = True
                                break
                    # If position is free, add the chosen position to the list of implantations
                    if not intersect:
                        if not chosen_bedseq in implantations.keys():
                            implantations[chosen_bedseq] = []
                        implantations[chosen_bedseq].append(
                            (site, chosen_start_index, chosen_end_index))
                        # Record the distance only for accepted positions so the
                        # histogram does not count rejected draws
                        chosen_distances_signed.append(
                            chosen_middle_index - chosen_bedseq.referenceIndex)
                        break
                    else:
                        tries += 1
                        if tries > 50:
                            Log.trace(
                                "GenerateMSAProcessor.chooseBindingPoints : No place found for site : "
                                + site + ". Bypassing site")
                            break

        # Case of uniform distribution
        # ............................
        elif distribution_mode == ImplantSitesProcessor.UNIFORM_DISTRIBUTION_MODE_VALUE:

            # Build a table that will permit to easily find a bedseq when drawing will be done
            total_length = 0
            bedseq_limits = []
            for bedseq in bedseq_list:
                length = bedseq.getLength()
                bedseq_limits.append(total_length + length)
                total_length += length

            for site in sites:
                tries = 0
                while True:
                    # draw a number (uniform) over all BED Sequence indexes
                    drawen_index = random.randint(0, total_length - 1)
                    # find to which BED sequence and to which index correspond the drawed number
                    chosen_bedseq = None
                    for index in range(bedseq_list_length):
                        if drawen_index < bedseq_limits[index]:
                            chosen_bedseq = bedseq_list[index]
                            if index == 0:
                                chosen_middle_index = chosen_bedseq.indexStart + drawen_index + int(
                                    len(site) / float(2))
                            else:
                                chosen_middle_index = chosen_bedseq.indexStart + drawen_index - bedseq_limits[
                                    index - 1]
                            break
                    # if the BEDseq is correctly found, check if the place is good for the site
                    if chosen_bedseq is not None:
                        # if the index is too near from the sequence endpoints, draw a new index
                        if chosen_middle_index < (
                                chosen_bedseq.indexStart +
                                int(len(site) /
                                    float(2))) or chosen_middle_index > (
                                        chosen_bedseq.indexEnd -
                                        int(math.ceil(len(site) / float(2)))):
                            continue
                        chosen_start_index = chosen_middle_index - int(
                            len(site) / float(2))
                        chosen_end_index = chosen_middle_index + int(
                            math.ceil(len(site) / float(2)))
                        # Test if any other site previously placed intersects the chosen one
                        intersect = False
                        if chosen_bedseq in implantations.keys():
                            for previous_indexes in implantations[
                                    chosen_bedseq]:
                                if chosen_start_index < previous_indexes[
                                        2] and chosen_end_index > previous_indexes[
                                            1]:
                                    intersect = True
                                    break
                        # If position is free, add the chosen position to the list of implantations
                        if not intersect:
                            if not chosen_bedseq in implantations.keys():
                                implantations[chosen_bedseq] = []
                            implantations[chosen_bedseq].append(
                                (site, chosen_start_index, chosen_end_index))
                            # Record the distance only for accepted positions
                            chosen_distances_signed.append(
                                chosen_middle_index -
                                chosen_bedseq.referenceIndex)
                            break
                        else:
                            tries += 1
                            if tries > 50:
                                Log.trace(
                                    "ImplantSitesProcessor.chooseBindingPoints : No place found for site : "
                                    + site + ". Bypassing site")
                                break
                    else:
                        print("No bedseq found")

        # Case of unknown distribution
        # ............................
        else:
            # Fixed: the original message lacked the closing quote after the mode name
            raise ExecutionException(
                "ImplantSitesProcessor.chooseBindingPoint : The chosen distribution mode is unknown '"
                + distribution_mode + "'")

        # Compute the histogram of sites distances and graph it
        RSATUtils.outputHistogram(
            chosen_distances_signed, 5, dir_path, motif.name + "Sites",
            self.component.pipelineName,
            "Global distribution of " + motif.name + " sites over peaks",
            "Distance from peak maximum", "Number of occurence", None)
# Exemple #17
# 0
                    max_size = bedseq_length
                total_size += bedseq_length

        mean_size = (int)(total_size / float(bedseq_number))

        output_commstruct.paramStatistics[
            BedSeqCommStruct.BEDSEQUENCES_NUMBER] = bedseq_number
        output_commstruct.paramStatistics[
            BedSeqCommStruct.BEDSEQUENCES_MIN_SIZE] = min_size
        output_commstruct.paramStatistics[
            BedSeqCommStruct.BEDSEQUENCES_MAX_SIZE] = max_size
        output_commstruct.paramStatistics[
            BedSeqCommStruct.BEDSEQUENCES_MEAN_SIZE] = mean_size
        output_commstruct.paramStatistics[
            BedSeqCommStruct.BEDSEQUENCES_TOTAL_SIZE] = total_size
        Log.trace("BEDProcessor.execute : Total number of BED Sequences = " +
                  str(bedseq_number))
        Log.trace("BEDProcessor.execute : Minimum size of BED Sequences = " +
                  str(min_size))
        Log.trace("BEDProcessor.execute : Maximum size of BED Sequences = " +
                  str(max_size))
        Log.trace("BEDProcessor.execute : Mean size of BED Sequences = " +
                  str(mean_size))
        Log.trace("BEDProcessor.execute : Total size of BED Sequences = " +
                  str(total_size))

        # output the sequences size histogram
        self.outputSequenceSizeHistogram(bedseq_dictionnary, output_commstruct)

        return output_commstruct

    # --------------------------------------------------------------------------------------
# Exemple #18
# 0
    def execute(self, input_commstructs):
        """Build the MSA associated with each BED sequence from MAF file blocks.

        Parses the MAF files pointed to by the processor parameters
        (optionally with several threads), composes one sequence alignment per
        BED sequence, stores size statistics and returns the resulting
        BedSeqAlignmentStatsCommStruct.
        Raises an ExecutionException when inputs or MAF files are missing.
        """

        if input_commstructs is None or len(input_commstructs) == 0:
            raise ExecutionException("MAFProcessor.execute : No inputs")

        input_commstruct = input_commstructs[0]

        # retrieve processor parameters
        source_maffile = self.getParameter(MAFProcessor.INPUT_MAF_FILE_PARAM)

        specialized_file_line = self.getParameter(
            MAFProcessor.SPECIALIZED_MAF_FILE_PARAM, False)
        if specialized_file_line is None:
            specialized_file = False
        else:
            specialized_file = (specialized_file_line.lower() == "true")

        desired_species_line = self.getParameter(
            MAFProcessor.DESIRED_SPECIES_LIST_PARAM, False)
        if desired_species_line is not None:
            self.desiredSpeciesList = desired_species_line.split()

        self.referenceSpecies = self.getParameter(
            MAFProcessor.REFERENCE_SPECIES_PARAM)

        thread_number = self.getParameterAsint(
            MAFProcessor.THREAD_NUMBER_PARAM, False)
        if thread_number is None or thread_number < 0:
            thread_number = 1

        keep_gaps = self.getParameter(MAFProcessor.KEEP_GAPS, False)
        if keep_gaps is None:
            keep_gaps = False

        # Retrieve BED sequences from the input CommStruct
        self.bedSequencesDict = input_commstruct.bedSequencesDict

        if self.bedSequencesDict is None or len(self.bedSequencesDict) == 0:
            raise ExecutionException(
                "MAFProcessor.execute : No BEDSequence provided as input")

        # Look for MAF files to parse
        maf_file_list = FileUtils.getFileList(source_maffile, "maf",
                                              self.referenceSpecies)
        if maf_file_list is None:
            raise ExecutionException(
                "MAFProcessor.execute : The path '" + source_maffile +
                "' does not point to a MAF file or a directory containing MAF files and does not contain a subdirectory '"
                + self.referenceSpecies + "' containing MAF files.")

        # Retrieve the alignement blocks of the MAF files corresponding to each BED sequence,
        # managing the parsing of MAF file list according to the chosen number of threads
        count_parsed_files = 0
        ProgressionManager.setTaskProgression("Parsing MAF Files",
                                              self.component, 0.0)
        if thread_number > 1:
            # store the list of MAF file in a queue
            file_queue = Queue.Queue(len(maf_file_list))
            for file in maf_file_list:
                file_queue.put(file)

            # Create a Thread lock used in several methods to avoid possible Thread conflicts
            self.threadLock = threading.Lock()

            # Launch the firsts Threads
            thread_list = []
            while not file_queue.empty() and len(thread_list) < thread_number:
                self.startNewThread(file_queue, specialized_file, thread_list)

            # Manage the Threads list in order to empty the file queue
            while not file_queue.empty() or len(thread_list) > 0:
                # Iterate over a snapshot of the list: removing an element from
                # the list being iterated would silently skip the next thread
                for threads in thread_list[:]:
                    if not threads.is_alive():
                        thread_list.remove(threads)
                        count_parsed_files += 1
                        ProgressionManager.setTaskProgression(
                            "Parsing MAF Files", self.component,
                            count_parsed_files / float(len(maf_file_list)))
                        self.startNewThread(file_queue, specialized_file,
                                            thread_list)
                time.sleep(MAFProcessor.THREAD_CHECK_DELAY)
        else:
            # Parse the files in the MAF file list
            for file in maf_file_list:
                self.parseFile(file, specialized_file)
                count_parsed_files += 1
                ProgressionManager.setTaskProgression(
                    "Parsing MAF Files", self.component,
                    count_parsed_files / float(len(maf_file_list)))

        # Assign the whole list of parsed species if a desired list was not set
        if len(self.desiredSpeciesList) == 0:
            self.desiredSpeciesList = self.parsedSpeciesList

        # create the output CommStruct
        output_commstruct = BedSeqAlignmentStatsCommStruct()
        output_commstruct.processorName = self.component.processorName
        output_commstruct.baseSpecies = input_commstruct.baseSpecies
        output_commstruct.bedSequencesDict = input_commstruct.bedSequencesDict
        output_commstruct.paramStatistics = input_commstruct.paramStatistics

        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.
            REFERENCE_SPECIES] = self.referenceSpecies
        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.ALIGNED_SPECIES] = ", ".join(
                self.desiredSpeciesList)

        # Compose the MSA corresponding to each BED sequence from the MAF blocks
        ProgressionManager.setTaskProgression("Building MSA", self.component,
                                              0.0)
        count = 0
        total_number_bed = len(self.mafBlockDic.keys())
        min_size = 100000000
        max_size = -1
        total_size = 0
        msa_lenghts = []
        for bed_sequence in self.mafBlockDic.keys():
            count += 1
            if count % 100 == 0:
                ProgressionManager.setTaskProgression(
                    "Building MSA", self.component,
                    count / float(total_number_bed))
            alignment = self.composeSequenceAlignment(bed_sequence, keep_gaps)
            align_length = alignment.totalLength
            msa_lenghts.append(align_length)
            if align_length < min_size:
                min_size = align_length
            if align_length > max_size:
                max_size = align_length
            total_size += align_length

            output_commstruct.addSequenceAlignment(bed_sequence, alignment)

        ProgressionManager.setTaskProgression("Building MSA", self.component,
                                              1.0)

        # Guard against an empty result set to avoid a division by zero
        if total_number_bed > 0:
            mean_size = (int)(total_size / float(total_number_bed))
        else:
            mean_size = 0

        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.MSA_NUMBER] = count
        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.MSA_MIN_SIZE] = min_size
        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.MSA_MAX_SIZE] = max_size
        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.MSA_MEAN_SIZE] = mean_size
        output_commstruct.paramStatistics[
            BedSeqAlignmentStatsCommStruct.MSA_TOTAL_SIZE] = total_size
        Log.trace(
            "MAFProcessor.execute : Total number of BEDsequence with associated MSA = "
            + str(count))
        Log.trace(
            "MAFProcessor.execute : Minimal size of BEDsequence with associated MSA = "
            + str(min_size))
        Log.trace(
            "MAFProcessor.execute : Maximal size of BEDsequence with associated MSA = "
            + str(max_size))
        Log.trace(
            "MAFProcessor.execute : Mean size of BEDsequence with associated MSA = "
            + str(mean_size))
        Log.trace(
            "MAFProcessor.execute : Total size of BEDsequence with associated MSA = "
            + str(total_size))

        # Output the MSA lengths histogram and graph
        self.outputMSALenghtHistogram(msa_lenghts, output_commstruct)

        return output_commstruct
    def execute(self, input_commstructs):
        
        if input_commstructs == None or len(input_commstructs) == 0:
            raise ExecutionException("BEDOutputProcessor.execute : No inputs")
        
        input_commstruct = input_commstructs[0]
        
        # Retrieve the processor parameters
        reference_motif = self.getParameter(BEDOutputProcessor.REFERENCE_MOTIF)
                
        color_method = self.getParameter(BEDOutputProcessor.COLOR_METHOD, False)
        if color_method == None:
            color_method = BEDOutputProcessor.COLOR_METHOD_SCORE
        else:
            color_method = color_method.lower()
            if color_method != BEDOutputProcessor.COLOR_METHOD_SCORE and color_method != BEDOutputProcessor.COLOR_METHOD_FAMILY:
                color_method = BEDOutputProcessor.COLOR_METHOD_SCORE
                
        score_min = self.getParameterAsfloat(BEDOutputProcessor.SCORE_MIN)
        score_max = self.getParameterAsfloat(BEDOutputProcessor.SCORE_MAX)
        
        # Prepare the processor output dir
        out_path = os.path.join(self.component.outputDir, self.component.getComponentPrefix())
        shutil.rmtree(out_path, True)
        FileUtils.createDirectory( out_path, 0777)

        # Retrieve the JASPAR motifs details
        motif_details = MotifUtils.getMotifsDetailsFromJaspar()
        motif_id = motif_details[ 0]
        motif_family = motif_details[ 1]
        family_rgb = {}

        # build the bed output file path
        bed_file_path = os.path.join(out_path, self.component.pipelineName + "_Motifs.bed")

        try:
            bed_file = open(bed_file_path, "w")

            #bed_file.write("track name='" + self.component.pipelineName + "' visibility=3 itemRgb='On' use_score=1\n")
            #bed_file.write("browser dense RSAT\n")
            #bed_file.write("browser dense\n") 
            #bed_file.write("## seq_name	start	end	feature_name	score	strand	thickStart	thickEnd	itemRgb	blockCount	blockSizes	blckStarts\n")

            current_color = None
            bedseq_list = input_commstruct.bedToMA.keys()
            bedseq_list.sort(BEDSequence.compare)
            previous_line_start = 0
            previous_line_key = ""
            for bed_seq in bedseq_list:
                for msa in input_commstruct.bedToMA[ bed_seq]:
                    for motif in msa.motifs:
                        motif_name = motif.name
                        if not input_commstruct.motifStatistics.has_key(motif_name):
                            continue
                        if motif_name in motif_id.keys():
                            out_name = motif_id[ motif_name]
                            chromosom = bed_seq.chromosom
                            start_position = bed_seq.indexStart + msa.fixIndex(motif.indexStart)
                            end_position = bed_seq.indexStart + msa.fixIndex(motif.indexEnd)
                            score = motif.score
                            
                            # Commented : Black is assigned to the reference motif
                            #if motif_name == reference_motif:
                            #    item_rgb = "0,0,0"
                            # for the other motif, color depends on the chosen method
                            #else:
                            if color_method == BEDOutputProcessor.COLOR_METHOD_FAMILY:
                                if motif_name in motif_family.keys():
                                    #print("-----------------------------")
                                    #print "Current color = " + str(current_color)
                                    #print "Motif name=" + motif_name
                                    #print "Motif family=" + motif_family[ motif_name]
                                    family_rgb = self.updateFamilyRGB(motif_family[ motif_name], family_rgb, current_color)
                                    #print "Family RGB = " + str(family_rgb)
                                    item_rgb = family_rgb[ motif_family[ motif_name]]
                                    #print "Item rgb = ", str(item_rgb)
                                    current_color = item_rgb
                                else:
                                    item_rgb = BEDOutputProcessor.COLORS[ 0]
                            else:
                                item_rgb = self.getColorForScore(score, score_min, score_max)
                            
                            # Write the lines to output file
                            if len( chromosom) <4:
                                line_out = "chr" + chromosom
                            else:
                                line_out = chromosom
                            line_out += "\t" + str(start_position)
                            line_out += "\t" + str(end_position)
                            line_out += "\t" + out_name
                            line_out += "\t" + str(int(score * 1000))
                            line_out += "\t" + motif.strand
                            line_out += "\t" + str(start_position)           # ThickStart
                            line_out += "\t" + str(end_position)            # ThickEnd
                            line_out += "\t" + item_rgb        # itemRGB
                            #line_out += "\t" + "0"            # BlockCount
                            #line_out += "\t" + "0"            # BlockSizes
                            #line_out += "\t" + "0"            # BlockStarts
                            
                            # Build a key that represent the motif chrom,  name and positions
                            line_key = chromosom + ":" + str(start_position) + ":" + str(end_position) + ":" + out_name
                            
                            # If the new line has the same key has the previous one, we must keep only one of the two lines
                            # i.e. the one with the highest score (the tell() and seek() method permits to overwrite the old line
                            # line if required.
                            # If the new line and the previous one has different keys the new line is simply written
                            if previous_line_key != line_key:
                                previous_line_start = bed_file.tell()
                                bed_file.write(line_out)
                                bed_file.write("\n")
                                bed_file.flush
                                previous_line_key = line_key
                                previous_score = score
                            else:
                                if score > previous_score:
                                    bed_file.seek(previous_line_start)
                                    bed_file.write(line_out)
                                    bed_file.write("\n")
                                    bed_file.flush
                                    previous_score = score     

            bed_file.close()
            input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.BED_OUTPUT_PATH] = bed_file_path
            
            # Sort bed_file (used for bigBed conversion)
            sorted_bed_file_path = os.path.join(out_path, self.component.pipelineName + "_Motifs_sorted.bed")
            cmd = "sort -k1,1 -k2,2n"
            cmd += " " + bed_file_path
            cmd += " > " + sorted_bed_file_path
            
            Log.info( "BEDOuputProcessor.execute : Sorting BED file")
            Log.info( "BEDOuputProcessor.execute  : command used is : " + cmd)
            
            cmd_result = commands.getstatusoutput( cmd)
            Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
            if cmd_result[0] != 0:
                Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
                Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
                return input_commstruct
                        
            # Fetch the chrom sizes that will be use to convert BED file to bigBed file
            chrom_sizes_path = os.path.join(out_path, self.component.pipelineName + "_chrom_size.txt")
            
            RSAT_PATH = self.component.getParameter( Constants.RSAT_DIR_PARAM)
            cmd = os.path.join( RSAT_PATH , "contrib/peak-footprints/tools/fetchChromSizes")
            cmd += " " + input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES]
            cmd += " > " + chrom_sizes_path
            
            Log.info( "BEDOuputProcessor.execute : Fetching Chrom sizes for species : " + input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES])
            Log.info( "BEDOuputProcessor.execute  : command used is : " + cmd)
            
            cmd_result = commands.getstatusoutput( cmd)
            Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
            if cmd_result[0] != 0:
                Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
                Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
                return input_commstruct
            
            # Build the bigBed file
            # sudo ln -s /lib/x86_64-linux-gnu/libssl.so.1.0.0 /usr/lib/libssl.so.10
            # sudo ln -s /lib/x86_64-linux-gnu/libcrypto.so.1.0.0 /usr/lib/libcrypto.so.10
            
            big_bed_path = os.path.join(out_path, self.component.pipelineName + "_Motifs.bb")
            
            RSAT_PATH = self.component.getParameter( Constants.RSAT_DIR_PARAM)
            cmd = os.path.join( RSAT_PATH , "contrib/peak-footprints/tools/bedToBigBed")
            cmd += " " + sorted_bed_file_path
            cmd += " " + chrom_sizes_path
            cmd += " " + big_bed_path
            
            Log.info( "BEDOuputProcessor.execute : Converting BED file to bigBed file")
            Log.info( "BEDOuputProcessor.execute  : command used is : " + cmd)
            
            cmd_result = commands.getstatusoutput( cmd)
            Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
            if cmd_result[0] != 0:
                Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
                Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
                return input_commstruct
                        
            input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.BIGBED_OUTPUT_PATH] = big_bed_path
            
        except IOError, io_exce:
            Log.log("BEDOutputProcessor.execute : Unable to save the BED file of recognized motifs : " + str(io_exce))