Beispiel #1
0
 def __init__(self):
     """Set up the file and folder locations used by the motif analyses.

     Each location is Constants.PATH_HOME followed by a value looked up
     through PropertyManager. The protein list file is parsed right away
     with FileParser.read_file and kept in self.protein_list.
     """
     def lookup(key):
         # Every property here is fetched with the same (key, True) call
         return PropertyManager.get_instance().get_property(key, True)

     home = Constants.PATH_HOME
     self.path_home = home

     # Protein list: resolve the path first, then load its content
     self.protein_list_file = home + lookup(DataConstants.MOTIFS_PROTEIN_FILE_PROPERTY)
     self.protein_list = FileParser.read_file(self.protein_list_file)

     # Motif folder and domain-region file
     self.motif_folder = home + lookup(DataConstants.MOTIFS_FOLDER_PROPERTY)
     self.domain_region_file = home + lookup(DataConstants.MOTIFS_DOMAIN_REGION_FILE_PROPERTY)
Beispiel #2
0
    def dictionary_identifier(self):

        Logger.get_instance().info(
            " Creation of a dictionary for novel gene of dataset 2\
The dictionary structure is : \n \
{gene = [ isoform1, isoform2,...isoformN]}")

        self.ensembl_path_output = Constants.PATH_HOME + PropertyManager.get_instance(
        ).get_property(DataConstants.ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY,
                       True)
        self.ensembl_output_dataset2 = self.ensembl_path_output + PropertyManager.get_instance(
        ).get_property(DataConstants.ENSEMBL_FILE_SEQUENCES_2_PROPERTY, True)

        self.dictionary_output = Constants.PATH_HOME + PropertyManager.get_instance(
        ).get_property(DataConstants.DICTIONARY_PATH_OUTPUT_PROPERTY, True)
        self.dictionary_namefile = self.dictionary_output + PropertyManager.get_instance(
        ).get_property(DataConstants.DICTIONARY_NAME_FILE_PROPERTY, True)

        dict_identifier = InfoFasta.make_dictionary(
            self.ensembl_output_dataset2)

        file_dict = FileUtils.open_text_w(self.dictionary_namefile)

        pickle.dump(dict_identifier, file_dict)

        Logger.get_instance().info(
            " The creation of a dictionary for novel gene in dataset 2 is completed \n\n"
        )
Beispiel #3
0
 def iupred_motifs(self):
     """Run the IUPred overlap analysis once per configured threshold.

     The thresholds come from Constants.MOTIFS_THRESHOLD_1 and
     Constants.MOTIFS_THRESHOLD_2 (the timer messages label them 0.4 and
     0.5). Each pass writes to its own output folder and is bracketed by
     Timer steps.
     """
     def lookup(key):
         # Every property here is fetched with the same (key, True) call
         return PropertyManager.get_instance().get_property(key, True)

     def overlap(output_folder, threshold):
         # One analysis pass; only the output folder and threshold vary
         GlobalOverlapRegionAnalysis.iupred_overlap_analysis(
             self.protein_list, self.iupred_folder, output_folder, threshold,
             self.motif_folder, self.domain_region_file)

     Logger.get_instance().info("        .....Start of IUPred motifs analysis.....\n")

     self.iupred_folder = self.path_home + lookup(DataConstants.MOTIFS_IUPRED_FOLDER_PROPERTY)

     # First pass (first threshold)
     Timer.get_instance().step(" Start of IUPred motifs analysis - threshold = 0.4 \n")
     self.threshold_1 = Constants.MOTIFS_THRESHOLD_1
     self.output_folder_1 = self.path_home + lookup(DataConstants.MOTIFS_IUP_OUTPUT_FOLDER_1_PROPERTY)
     overlap(self.output_folder_1, self.threshold_1)
     Timer.get_instance().step(" End of IUPred motifs analysis - threshold = 0.4 \n")

     # Second pass (second threshold)
     Timer.get_instance().step(" Start of IUPred motifs analysis - threshold = 0.5 \n")
     self.threshold_2 = Constants.MOTIFS_THRESHOLD_2
     self.output_folder_2 = self.path_home + lookup(DataConstants.MOTIFS_IUP_OUTPUT_FOLDER_2_PROPERTY)
     overlap(self.output_folder_2, self.threshold_2)
     Timer.get_instance().step(" End of IUPred motifs analysis - threshold = 0.5 \n")

     Logger.get_instance().info("        .....End of IUPred motifs analysis.....\n")
Beispiel #4
0
    def make_dictionary(self):

        Logger.get_instance().info(
            " Creation of a dictionary for novel gene of dataset 2\
The dictionary structure is : \n \
{gene = [ isoform1, isoform2,...isoformN]}")

        self.path_home = Constants.PATH_HOME
        self.path_input_file = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_DICTIONARY_INPUT_FILE_PROPERTY,
                       True)

        self.dictionary_output_path = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_DICTIONARY_OUTPUT_PATH_PROPERTY,
                       True)
        self.output_file_path = self.dictionary_output_path + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_DICTIONARY_FILE_OUTPUT_PROPERTY,
                       True)

        dict_identifier = InfoFasta.make_dictionary(self.path_input_file)

        self.dict_file = FileUtils.open_text_w(self.output_file_path)

        pickle.dump(dict_identifier, self.dict_file)

        Logger.get_instance().info(
            " The creation of a dictionary is completed \n\n")
Beispiel #5
0
    def delet_append_file(self):
        """Delete the configured ensembl, longest/isoform and fusion files.

        All file paths are built from Constants.PATH_HOME, a folder property
        and a file-name property, then handed one by one to
        RemoveFile.delete_file.
        """
        def lookup(key):
            # Every property here is fetched with the same (key, True) call
            return PropertyManager.get_instance().get_property(key, True)

        # Ensembl files
        self.del_ensembl_input = Constants.PATH_HOME + lookup(DataConstants.DEL_ENSEMBL_PATH_PROPERTY)
        self.del_ensembl_file1 = self.del_ensembl_input + lookup(DataConstants.DEL_ENSEMBL_FILE1_PROPERTY)
        self.del_ensembl_file2 = self.del_ensembl_input + lookup(DataConstants.DEL_ENSEMBL_FILE2_PROPERTY)

        # Longest / isoform / random-isoform files
        self.del_longest_input = Constants.PATH_HOME + lookup(DataConstants.DEL_LONGEST_PATH_PROPERTY)
        self.del_longest_file = self.del_longest_input + lookup(DataConstants.DEL_LONGEST_FILE_PROPERTY)
        self.del_isoform_file = self.del_longest_input + lookup(DataConstants.DEL_ISOFORM_FILE_PROPERTY)
        self.del_random_isoform_file = self.del_longest_input + lookup(
            DataConstants.DEL_RANDOM_ISOFORM_FILE_PROPERTY)

        # Fusion files
        self.del_fusion_path = Constants.PATH_HOME + lookup(DataConstants.DEL_FUSION_PATH_PROPERTY)
        self.del_fusion_file_longest = self.del_fusion_path + lookup(
            DataConstants.DEL_FUSION_DATASET_LONGEST_PROPERTY)
        self.del_fusion_file_dataset12 = self.del_fusion_path + lookup(
            DataConstants.DEL_FUSION_DATASET12_PROPERTY)

        # Removal happens in the same order the paths were resolved
        targets = [
            self.del_ensembl_file1,
            self.del_ensembl_file2,
            self.del_longest_file,
            self.del_isoform_file,
            self.del_random_isoform_file,
            self.del_fusion_file_longest,
            self.del_fusion_file_dataset12,
        ]
        for target in targets:
            RemoveFile.delete_file(target)
Beispiel #6
0
    def isoform_sequences(self):
        """Pick one isoform per gene at random and append it to the output fasta.

        Headers are read from the isoform file with InfoFasta.get_header.
        The gene key of a header is header[1:16]; headers sharing that key
        are treated as isoforms of one gene. For each gene exactly one header
        is chosen with random.choice, its sequence is fetched with
        InfoFasta.get_seq (identifier = header[33:48]) and the fasta record
        built by SeqToFasta.give_fasta is appended to the selected-isoform
        file.
        """

        Logger.get_instance().info(
            " Starting the random selection of isoforms with same length \n")
        Logger.get_instance().info(
            " The following headers are the proteins randomly selected \n")

        self.path_output_longest = Constants.PATH_HOME + PropertyManager.get_instance(
        ).get_property(DataConstants.LONGEST_PATH_OUTPUT_PROPERTY, True)

        self.path_file_isoform = self.path_output_longest + PropertyManager.get_instance(
        ).get_property(DataConstants.ISOFORM_FILE_PROPERTY, True)
        self.path_file_selected_isoform = self.path_output_longest + PropertyManager.get_instance(
        ).get_property(DataConstants.RANDOM_ISOFORM_SEQ_PROPERTY, True)

        # text=False because the input is a file path and not a list of lines
        self.headers = InfoFasta.get_header(self.path_file_isoform, text=False)

        # Group the headers by gene, remembering the order in which genes
        # first appear. Grouping explicitly (instead of walking an unordered
        # set of genes with a running offset into the header list) guarantees
        # that the header drawn for a gene really belongs to that gene.
        gene_order = []
        headers_by_gene = {}
        for header in self.headers:
            gene = header[1:16]
            if gene not in headers_by_gene:
                headers_by_gene[gene] = []
                gene_order.append(gene)
            headers_by_gene[gene].append(header)

        # One random isoform header per gene
        random_header = [
            random.choice(headers_by_gene[gene]) for gene in gene_order
        ]

        # NOTE(review): assumes FileUtils.open_text_a returns a file-like
        # object exposing close() - confirm against FileUtils.
        self.file_random_seq = FileUtils.open_text_a(
            self.path_file_selected_isoform)
        try:
            # Extract the sequence of every selected header from the isoform file
            for header in random_header:
                Logger.get_instance().info('Header selected : ' + header)
                identifier = header[33:48]
                sequence = InfoFasta.get_seq(self.path_file_isoform, identifier)
                fasta_seq = SeqToFasta.give_fasta(header, sequence)
                self.file_random_seq.write(fasta_seq)
        finally:
            # Flush the appended records and release the descriptor
            self.file_random_seq.close()

        Logger.get_instance().info(" End of selection random sequences \n ")
Beispiel #7
0
 def iupred_analysis(self):
     """Run Iupred.global_iupred_analysis on the configured input/output paths.

     Both paths are Constants.PATH_HOME joined with a PropertyManager value;
     the run is bracketed by two Timer steps.
     """
     def lookup(key):
         # Every property here is fetched with the same (key, True) call
         return PropertyManager.get_instance().get_property(key, True)

     Timer.get_instance().step(" Start of Iupred analysis...")

     home = Constants.PATH_HOME
     self.tool_path_input = home + lookup(DataConstants.TOOL_PATH_INPUT_PROPERTY)
     self.iupred_path_output = home + lookup(DataConstants.IUPRED_PATH_OUTPUT_PROPERTY)

     Iupred.global_iupred_analysis(self.tool_path_input, self.iupred_path_output)

     Timer.get_instance().step(" End of Iupred analysis")
Beispiel #8
0
 def anchor_analysis(self):
     """Run Anchor.global_anchor_analysis on the configured paths.

     The tool input path, the output path and the motif list path are each
     Constants.PATH_HOME joined with a PropertyManager value; the run is
     bracketed by two Timer steps.
     """
     def lookup(key):
         # Every property here is fetched with the same (key, True) call
         return PropertyManager.get_instance().get_property(key, True)

     Timer.get_instance().step(" Start of Anchor analysis...")

     home = Constants.PATH_HOME
     self.tool_path_input = home + lookup(DataConstants.TOOL_PATH_INPUT_PROPERTY)
     self.anchor_path_output = home + lookup(DataConstants.ANCHOR_PATH_OUTPUT_PROPERTY)
     self.motif_list_path = home + lookup(DataConstants.ANCHOR_MOTIF_PATH_PROPERTY)

     Anchor.global_anchor_analysis(
         self.motif_list_path, self.tool_path_input, self.anchor_path_output)

     Timer.get_instance().step(" End of Anchor analysis")
Beispiel #9
0
 def disordpbind_analysis(self):
     """Run DisoRDPbind.make_disordp_file with the configured settings.

     The input file and output folder are located under Constants.PATH_HOME.
     The binding partner and the amino-acid number are read as properties
     and converted with int() when the analysis is invoked; the dataset type
     is passed through unchanged.
     """
     def lookup(key):
         # Every property here is fetched with the same (key, True) call
         return PropertyManager.get_instance().get_property(key, True)

     Timer.get_instance().step(" Start of DisoRDPbind output analysis.. ")

     self.path_home = Constants.PATH_HOME
     self.input_file = self.path_home + lookup(DataConstants.DISO_INPUT_FILE_PROPERTY)
     self.ouput_path = self.path_home + lookup(DataConstants.DISO_OUTPUT_FOLDER_PROPERTY)
     self.binding_partner = lookup(DataConstants.DISO_BINDING_PARTNER_PROPERTY)
     self.num_aa_diso = lookup(DataConstants.DISO_NUM_AA_PROPERTY)
     self.dataset_type = lookup(DataConstants.DISO_DATASET_TYPE_PROPERTY)

     DisoRDPbind.make_disordp_file(
         self.input_file,
         self.ouput_path,
         int(self.binding_partner),
         int(self.num_aa_diso),
         self.dataset_type)

     Timer.get_instance().step(" End of DisoRDPbind output analysis")
Beispiel #10
0
 def anchor_motifs(self):
     """Run the ANCHOR overlap analysis on the protein list.

     The ANCHOR input folder and output folder are self.path_home joined
     with PropertyManager values. The analysis itself is delegated to
     GlobalOverlapRegionAnalysis.anchor_overlap_analysis and is bracketed by
     Logger messages and Timer steps.
     """

     # The opening log and the closing timer step previously said "IUPred"
     # (copy-paste from iupred_motifs); they now name the tool actually run.
     Logger.get_instance().info( "        .....Start of ANCHOR motifs analysis.....\n")

     self.anchor_folder = self.path_home +  PropertyManager.get_instance().get_property( DataConstants.MOTIFS_ANCHOR_FOLDER_PROPERTY, True)
     self.anchor_output_folder = self.path_home +  PropertyManager.get_instance().get_property( DataConstants.MOTIFS_ANCHOR_OUTPUT_FOLDER_PROPERTY, True)

     Timer.get_instance().step(" Start of ANCHOR motifs analysis \n")

     GlobalOverlapRegionAnalysis.anchor_overlap_analysis(self.protein_list,self.anchor_folder,self.anchor_output_folder,
                                                         self.motif_folder,self.domain_region_file)

     Timer.get_instance().step(" End of ANCHOR motifs analysis \n")

     Logger.get_instance().info( "        .....End of ANCHOR motifs analysis.....\n")
Beispiel #11
0
 def split_dataset(self):
     """Split the configured fasta dataset through SplitSeq.split_seq.

     The input fasta file and the output folder are located under
     Constants.PATH_HOME. The start and end header indexes are read as
     properties and converted with int() when SplitSeq.split_seq is called.
     """
     def lookup(key):
         # Every property here is fetched with the same (key, True) call
         return PropertyManager.get_instance().get_property(key, True)

     Logger.get_instance().info(" Division of Dataset in many fasta file each containing one protein sequence")

     self.path_home = Constants.PATH_HOME

     # Input fasta file
     self.split_path_input = self.path_home + lookup(DataConstants.SPLIT_PATH_INPUT_PROPERTY)
     self.split_file_fasta = self.split_path_input + lookup(DataConstants.SPLIT_DATASET_PROPERTY)
     # Output folder
     self.split_path_output = Constants.PATH_HOME + lookup(DataConstants.SPLIT_PATH_OUTPUT_PROPERTY)
     # Header slice boundaries (kept as read; converted at call time)
     self.split_start_index = lookup(DataConstants.SPLIT_START_HEADER_PROPERTY)
     self.split_end_index = lookup(DataConstants.SPLIT_END_HEADER_PROPERTY)

     SplitSeq.split_seq(
         self.split_file_fasta,
         self.split_path_output,
         int(self.split_start_index),
         int(self.split_end_index))

     Logger.get_instance().info(" The dataset has been split in many fasta files ")
Beispiel #12
0
    def disordp_motifs(self):
        """Run the DisoRDPbind overlap analysis on the protein list.

        The DisoRDPbind input folder and output folder are self.path_home
        joined with PropertyManager values; the file name is used as read.
        The work is delegated to
        GlobalOverlapRegionAnalysis.disordp_overlap_analysis.
        """
        def lookup(key):
            # Every property here is fetched with the same (key, True) call
            return PropertyManager.get_instance().get_property(key, True)

        Logger.get_instance().info("        .....Start of DisoRDPbind motifs analysis.....\n")

        self.disordp_folder = self.path_home + lookup(DataConstants.MOTIFS_DISORDP_FOLDER_PROPERTY)
        self.disordp_output_folder = self.path_home + lookup(
            DataConstants.MOTIFS_DISORDP_OUTPUT_FOLDER_PROPERTY)
        self.filename = lookup(DataConstants.MOTIFS_DISORDP_FILE_PROPERTY)

        Timer.get_instance().step(" Start of DisoRDPbind motifs analysis \n")

        GlobalOverlapRegionAnalysis.disordp_overlap_analysis(
            self.protein_list,
            self.disordp_folder,
            self.filename,
            self.motif_folder,
            self.domain_region_file,
            self.disordp_output_folder)

        Timer.get_instance().step(" End of DisoRDPbind motifs analysis \n")

        Logger.get_instance().info("        .....End of DisoRDPbind motifs analysis.....\n")
Beispiel #13
0
    def download_product_gene_seq(self):
        """Download sequences for the configured gene list via Ensembl.list_get_seq.

        The gene list file and the output sequence file are located under
        Constants.PATH_HOME. The query type is read as a property and
        converted with int() at call time.
        """
        def lookup(key):
            # Every property here is fetched with the same (key, True) call
            return PropertyManager.get_instance().get_property(key, True)

        Logger.get_instance().info(" Start of sequences download from Ensembl..")

        self.path_home = Constants.PATH_HOME
        self.gene_list_input = self.path_home + lookup(
            DataConstants.DOWNLOAD_ENSEMBL_FILE_INPUT_LIST_PROPERTY)
        self.ensembl_seq_output = self.path_home + lookup(
            DataConstants.DOWNLOAD_ENSEMBL_FILE_OUPUT_SEQ_PROPERTY)
        self.type_query = lookup(DataConstants.DOWNLOAD_ENSEMBL_TYPE_QUERY_PROPERTY)

        query_type = int(self.type_query)
        Ensembl.list_get_seq(
            self.gene_list_input, query_type, path_output=self.ensembl_seq_output)

        Logger.get_instance().info(" End of sequences download from Ensembl..")
Beispiel #14
0
    def merger_sequences(self):
        """Merge the longest-sequence file and the random-isoform file.

        The two input files and the final dataset file are located under
        Constants.PATH_HOME; the merge is done by FileParser.merge_file.
        """
        def lookup(key):
            # Every property here is fetched with the same (key, True) call
            return PropertyManager.get_instance().get_property(key, True)

        Logger.get_instance().info(
            " Union of the longest sequences and the random selected isoform ")

        # Inputs: longest sequences and randomly selected isoforms; output: final dataset
        self.path_home = Constants.PATH_HOME
        self.path_file_longest = self.path_home + lookup(
            DataConstants.DOWNLOAD_FUSION_FILE_LONGEST_PROPERTY)
        self.path_file_random = self.path_home + lookup(
            DataConstants.DOWNLOAD_FUSION_FILE_RANDOM_PROPERTY)
        self.path_final_file = self.path_home + lookup(
            DataConstants.DOWNLOAD_FUSION_FINAL_DATASET_PROPERTY)

        FileParser.merge_file(
            self.path_file_longest, self.path_file_random, self.path_final_file)

        Logger.get_instance().info(" The Final Proteome Dataset has been created\n ")
Beispiel #15
0
 def whole_procedure():
     """Run the three motif analyses (IUPred, ANCHOR, DisoRDPbind) in sequence.

     The dataset type read from PropertyManager is only used to label the
     start log message and the final chrono message.
     """
     manager = PropertyManager.get_instance()
     dataset_type = manager.get_property(DataConstants.MOTIFS_DATASET_TYPE_PROPERTY, True)

     # Start the chronometer before announcing the run
     Timer.get_instance().start_chrono()
     start_message = "        ........Start of " + dataset_type + " Motifs Analysis.....\n "
     Logger.get_instance().info(start_message)

     analysis = MotifsAnalysis()
     analysis.iupred_motifs()
     analysis.anchor_motifs()
     analysis.disordp_motifs()

     end_message = " End of " + dataset_type + " Motifs Analysis"
     Timer.get_instance().stop_chrono(end_message)
Beispiel #16
0
    def get_longest_seq(self):

        Logger.get_instance().info(
            " Start of the selection of longest sequences of dataset \n")

        self.path_home = Constants.PATH_HOME

        self.file_sequences = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_LONGEST_SEQ_INPUT_FILE_PROPERTY,
                       True)
        self.output_path = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_LONGEST_SEQ_OUTPUT_PATH_PROPERTY,
                       True)

        self.dictionary_file = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_LONGEST_DICTIONARY_PROPERTY,
                       True)

        self.longest_file = self.output_path + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_LONGEST_SEQ_FILE_PROPERTY, True)
        self.isoform_file = self.output_path + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_LONGEST_ISOFORM_FILE, True)

        # Extraction the longest sequences from dataset sequences (isoforms)
        self.file_seq = open(self.file_sequences)
        self.seq_obj = self.file_seq.readlines()

        LengthSeq.longest_seq(self.seq_obj,
                              self.dictionary_file,
                              self.longest_file,
                              self.isoform_file,
                              type_obj='list')

        Logger.get_instance().info(
            " End of selection of the longest sequences: \n \
two file have been generated one with longest sequences and the other one containing the isoform with same length  "
        )
Beispiel #17
0
    def comparison_dataset(self):
        """Compare the two configured datasets via InfoDataset.global_analysis_dataset.

        File names, index columns, lengths and the output location are read
        from PropertyManager. Index columns and lengths are converted with
        int() and packed into two-element tuples; the input and output
        locations are prefixed with Constants.PATH_HOME at call time.
        """
        def lookup(key):
            # Every property here is fetched with the same (key, True) call
            return PropertyManager.get_instance().get_property(key, True)

        # Announce the start of the comparison
        Logger.get_instance().info(" Start of comparison datasets:...")

        # Raw property values (kept as read)
        self.dataset_input_path = lookup(DataConstants.DATASET_INPUT_PATH_PROPERTY)
        self.dataset_1_file = lookup(DataConstants.DATASET_1_FILE_PROPERTY)
        self.dataset_2_file = lookup(DataConstants.DATASET_2_FILE_PROPERTY)
        self.dataset_1_index_col = lookup(DataConstants.DATASET_1_INDEX_COL_PROPERTY)
        self.dataset_2_index_col = lookup(DataConstants.DATASET_2_INDEX_COL_PROPERTY)
        self.dataset_output = lookup(DataConstants.DATASET_OUTPUT_PROPERTY)
        self.dataset_1_length = lookup(DataConstants.DATASET_1_LENGTH_PROPERTY)
        self.dataset_2_length = lookup(DataConstants.DATASET_2_LENGTH_PROPERTY)

        # Numeric pairs: (dataset 1, dataset 2)
        first_col = int(self.dataset_1_index_col)
        second_col = int(self.dataset_2_index_col)
        self.index_col = (first_col, second_col)
        first_len = int(self.dataset_1_length)
        second_len = int(self.dataset_2_length)
        self.dataset_length = (first_len, second_len)

        self.path_home = Constants.PATH_HOME
        input_location = self.path_home + self.dataset_input_path
        dataset_files = (self.dataset_1_file, self.dataset_2_file)
        output_location = self.path_home + self.dataset_output
        InfoDataset.global_analysis_dataset(
            input_location,
            dataset_files,
            self.index_col,
            output_location,
            length=self.dataset_length)

        completion_message = (
            " The comparison of datasets is completed : "
            "two file  with the common and difference "
            "items has been generated in \n\n ")
        Logger.get_instance().info(completion_message + self.dataset_output)
Beispiel #18
0
 def analysis_tools_output(self):
     """Post-process the IUPred and ANCHOR outputs into analysis files.

     First Iupred.make_iupred_file is called with the two thresholds
     (converted with float()) and the amino-acid number (converted with
     int()); then Anchor.make_anchor_file is called with its own input path
     and amino-acid number. Both write to the same analysis output path.
     """
     def lookup(key):
         # Every property here is fetched with the same (key, True) call
         return PropertyManager.get_instance().get_property(key, True)

     Timer.get_instance().step(" Start of tools analysis.. ")

     self.path_home = Constants.PATH_HOME
     self.input_path_iupred = self.path_home + lookup(DataConstants.ANALYSIS_INPUT_PATH_IUPRED_PROPERTY)
     self.output_path_analysis = self.path_home + lookup(DataConstants.ANALYSIS_OUTPUT_PATH_TOOLS_PROPERTY)

     # IUPred settings (kept as read; converted at call time)
     self.threshold_1 = lookup(DataConstants.ANALYSIS_THRESHOLD_1_PROPERTY)
     self.threshold_2 = lookup(DataConstants.ANALYSIS_THRESHOLD_2_PROPERTY)
     self.number_aa_iupred = lookup(DataConstants.ANALYSIS_AMINOACID_NUMBER_IUPRED_PROPERTY)
     self.dataset_type = lookup(DataConstants.ANALYSIS_DATASET_TYPE_PROPERTY)

     Iupred.make_iupred_file(
         self.input_path_iupred,
         self.output_path_analysis,
         float(self.threshold_1),
         float(self.threshold_2),
         int(self.number_aa_iupred),
         self.dataset_type)

     # ANCHOR settings are resolved only after the IUPred step has run
     self.input_path_anchor = Constants.PATH_HOME + lookup(DataConstants.ANALYSIS_INPUT_PATH_ANCHOR_PROPERTY)
     self.num_aa_anchor = lookup(DataConstants.ANALYSIS_AMINOACID_NUMBER_ANCHOR_PROPERTY)

     Anchor.make_anchor_file(
         self.input_path_anchor,
         self.output_path_analysis,
         int(self.num_aa_anchor),
         self.dataset_type)

     Timer.get_instance().step(" End of tools analysis")
Beispiel #19
0
 def change_header(self):
     """Rewrite the headers of the configured sequence file via HeaderParser.change_header.

     Input and output file paths are built from Constants.PATH_HOME, a folder
     property and a file-name property. The source and type-id properties are
     converted with int() and passed as keyword arguments.
     """
     def lookup(key):
         # Every property here is fetched with the same (key, True) call
         return PropertyManager.get_instance().get_property(key, True)

     self.path_home = Constants.PATH_HOME

     # Input file
     self.path_input = self.path_home + lookup(DataConstants.HEADER_INPUT_SEQ_PROPERTY)
     self.namefile = lookup(DataConstants.HEADER_FILE_SEQ_PROPERTY)
     self.path_file_input = self.path_input + self.namefile

     # Output file
     self.path_output = self.path_home + lookup(DataConstants.HEADER_OUTPUT_SEQ_PROPERTY)
     self.path_file_output = self.path_output + lookup(DataConstants.HEADER_FILE_OUTPUT_PROPERTY)

     # Parser options (kept as read; converted at call time)
     self.source = lookup(DataConstants.HEADER_SOURCE_PROPERTY)
     self.type_id = lookup(DataConstants.HEADER_TYPE_ID_PROPERTY)

     HeaderParser.change_header(
         self.path_file_input,
         self.path_file_output,
         source=int(self.source),
         type_id=int(self.type_id))
Beispiel #20
0
    def longest_sequence(self):
        """Select the longest sequences of the novel dataset via LengthSeq.longest_seq.

        The sequence file, the dictionary file and the two output files
        (longest sequences and isoforms) are each built from
        Constants.PATH_HOME, a folder property and a file-name property.
        """
        def lookup(key):
            # Every property here is fetched with the same (key, True) call
            return PropertyManager.get_instance().get_property(key, True)

        Logger.get_instance().info(
            " Start of the selection of longest sequences of novel dataset \n")

        # Input sequences
        self.path_sequences = Constants.PATH_HOME + lookup(
            DataConstants.LONGEST_PATH_SEQUENCE_PROPERTY)
        self.file_sequences = self.path_sequences + lookup(
            DataConstants.LONGEST_PROT_FILE_SEQUENCES_2_PROPERTY)

        # Identifier dictionary
        self.path_dictionary_identifier = Constants.PATH_HOME + lookup(
            DataConstants.LONGEST_PATH_DICTIONARY_PROPERTY)
        self.file_dictionary = self.path_dictionary_identifier + lookup(
            DataConstants.LONGEST_DICTIONARY_NAME_FILE_PROPERTY)

        # Output files
        self.path_output_longest = Constants.PATH_HOME + lookup(
            DataConstants.LONGEST_PATH_OUTPUT_PROPERTY)
        self.path_file_longest = self.path_output_longest + lookup(
            DataConstants.LONGEST_FILE_PROPERTY)
        self.path_file_isoform = self.path_output_longest + lookup(
            DataConstants.ISOFORM_FILE_PROPERTY)

        # Extract the longest sequences from the dataset 2 sequences (isoforms)
        LengthSeq.longest_seq(
            self.file_sequences,
            self.file_dictionary,
            self.path_file_longest,
            self.path_file_isoform)

        Timer.get_instance().step(
            " End of selection of the longest sequences in dataset 2  \n")

        closing_message = (
            " End of selection of the longest sequences: \n "
            "two file have been generated one with longest sequences and the other one containing the isoform with same length  ")
        Logger.get_instance().info(closing_message)
Beispiel #21
0
    def creation_list(self):
        """Write the gene column and the protein column of dataset 1 to two files.

        Dataset 1 is loaded with FileParser.make_table. The gene and protein
        columns are extracted with TableWrapper.get_column (start=1) using
        the configured column indexes converted with int(), and each column
        is written with FileWriter.write_table.
        """
        def lookup(key):
            # Every property here is fetched with the same (key, True) call
            return PropertyManager.get_instance().get_property(key, True)

        Logger.get_instance().info(" Creation of gene and protein list \n ")

        # Raw property values
        self.dataset_input_path = lookup(DataConstants.DATASET_INPUT_PATH_PROPERTY)
        self.file_dataset_1 = lookup(DataConstants.DATASET_1_FILE_PROPERTY)
        self.gene_index_col = lookup(DataConstants.LIST_GENE_INDEX_COL_PROPERTY)
        self.protein_index_col = lookup(DataConstants.LIST_PROTEIN_INDEX_COL_PROPERTY)
        self.list_gene_dataset_1 = lookup(DataConstants.LIST_FILE_GENE_DATASET_1_PROPERTY)
        self.list_protein_dataset_1 = lookup(DataConstants.LIST_FILE_PROTEIN_DATASET_1_PROPERTY)

        # Output files for the two lists
        self.path_gene_dataset_1 = (
            Constants.PATH_HOME + self.dataset_input_path + self.list_gene_dataset_1)
        self.path_protein_dataset_1 = (
            Constants.PATH_HOME + self.dataset_input_path + self.list_protein_dataset_1)

        # Input table
        self.path_home = Constants.PATH_HOME
        self.path_dataset_1 = self.path_home + self.dataset_input_path + self.file_dataset_1

        table = FileParser.make_table(self.path_dataset_1)

        gene_index = int(self.gene_index_col)
        genes = TableWrapper.get_column(table, gene_index, start=1)
        protein_index = int(self.protein_index_col)
        proteins = TableWrapper.get_column(table, protein_index, start=1)

        FileWriter.write_table(self.path_gene_dataset_1, genes)
        FileWriter.write_table(self.path_protein_dataset_1, proteins)

        completion_message = (
            " The genes and proteins file of dataset 1 have been created "
            "in the following path  \n\n ")
        Logger.get_instance().info(completion_message + self.dataset_input_path)
Beispiel #22
0
    def domain_classification(self):

        self.path_home = Constants.PATH_HOME
        self.file_domain = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_LIST_FILE_PROPERTY, True)
        self.file_jprot_information = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_LIST_JPROTEOMICS_PROPERTY, True)
        self.file_pfamid = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FILE_PFAM_PROPERTY, True)

        # reading of pfam table in particular of the pfam id and the domain name
        table_pfam = FileParser.make_table(self.file_pfamid)
        pfamid = TableWrapper.get_column(table_pfam, 0)
        domain_name = TableWrapper.get_column(table_pfam, 3)

        # dictionary pfamid and domain name
        dict_pfam = {row[3]: row[0] for row in table_pfam}
        dict_pfam['DUF1785'] = '-'
        dict_pfam['DUF1898'] = '-'

        # reading of domain classification provided by dataset 2
        # dictionary motifs--> class

        table_domain = FileParser.make_table(self.file_domain)
        dict_type_domain = TableWrapper.make_dictionary(table_domain)

        # make a inverse dictionary class--> motifs
        inverse_table_domain = TableWrapper.inv_column(table_domain, 0, 1)
        dict_class_domain = TableWrapper.make_dictionary(inverse_table_domain)

        # reading of domain information of dataset 2
        jprot_table = FileParser.make_table(self.file_jprot_information,
                                            skip=1)

        for i, item in enumerate(jprot_table):
            if len(item) == 1:
                jprot_table[i] = [item[0], '.', '.']

        # Jproteomics
        # ===================================

        # extraction of domain for each gene
        # extraction of gene id
        domain_column_jprot = TableWrapper.get_column(jprot_table, 1)
        genes_jprot = TableWrapper.get_column(jprot_table, 0)
        pfam_id_jprot = TableWrapper.get_column(jprot_table, 2)

        # This part reads the type of domain for each gene and creates a new column with the type of domain
        # at the end returns a new table with namegene, domain name, pfam id, class of domain
        #
        # One gene/protein can have more than one domain
        # For each gene the following part checks if the classification of protein domains by comparing the domain with dict_class_domains:
        #
        #

        count_classical = 0
        count_nonclassic = 0
        count_unclissified = 0
        count_other_class = 0
        count_no_domain = 0
        new_table_jprot = []
        for i, gene in enumerate(genes_jprot):
            row = []
            row.append(gene)
            row.append(domain_column_jprot[i])
            row.append(pfam_id_jprot[i])
            string_domains = ''
            # If the protein hasn't any domain, add static 'no-domains' information
            if domain_column_jprot[i] == '.':
                string_domains += Constants.DOMAIN_NONE + Constants.DOMAIN_COMMA
                count_no_domain += 1
            # If the protein contains some domains checks the class and
            # makes a string containing the class domains separated by a comma
            else:
                domains = domain_column_jprot[i].split(',')
                for type_domain in domains:
                    print type_domain
                    if type_domain in dict_type_domain:
                        class_domain = dict_type_domain[type_domain][0]
                        if class_domain == Constants.DOMAIN_CLASSICAL:
                            string_domains += Constants.DOMAIN_CLASSICAL + Constants.DOMAIN_COMMA
                            count_classical += 1
                        elif class_domain == Constants.DOMAIN_NONCLASSICAL:
                            string_domains += Constants.DOMAIN_NONCLASSICAL + Constants.DOMAIN_COMMA
                            count_nonclassic += 1
                        elif class_domain == Constants.DOMAIN_UNKNOWN:
                            string_domains += Constants.DOMAIN_UNKNOWN + Constants.DOMAIN_COMMA
                            count_unclissified += 1
                    elif type_domain not in dict_type_domain:
                        string_domains += Constants.DOMAIN_OTHER + Constants.DOMAIN_COMMA
                        count_other_class += 1
                    else:
                        Logger.get_instance().info('unexpected case',
                                                   type_domain)
            # -1 allows to delete the last comma in string
            row.append(string_domains[0:len(string_domains) - 1])
            new_table_jprot.append(row)

        # print of proteins number for each domain class
        Logger.get_instance().info(str(count_classical))
        Logger.get_instance().info(str(count_nonclassic))
        Logger.get_instance().info(str(count_unclissified))
        Logger.get_instance().info(str(count_other_class))
        Logger.get_instance().info(str(count_no_domain))

        self.path_ouput_file_jprot = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FINAL_TABLE_JPROT_PROPERTY, True)
        FileWriter.write_table(self.path_ouput_file_jprot, new_table_jprot)

        # NatRevGenetics
        # =========================================================
        # this part classifies the gene in according to the type of domain and creates a new table with
        # name gene, type domain, pfam id, clas of domain

        self.input_file_nrgenetics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_LIST_NATREVGENETICS_PROPERTY, True)

        # reading of domain information of dataset 2
        nrgenetics_table = FileParser.make_table(self.input_file_nrgenetics,
                                                 skip=1)

        # extraction of domain domain for each gene
        # extraction of gene id
        domain_column_nrgenetics = TableWrapper.get_column(nrgenetics_table, 2)
        genes_nrgenetics = TableWrapper.get_column(nrgenetics_table, 0)

        # This part reads the type of domain for each gene and creates a new columns with the type of domain
        # at the end returns a new table with namegene, domain name, pfam id , class of domain
        #
        # One gene/protein can have more than one domain
        # For each gene the following part checks if the classification of protein domains by comparing the domain with dict_class_domains:
        #
        #
        count_classical = 0
        count_nonclassic = 0
        count_unclissified = 0
        count_other_class = 0
        count_no_domain = 0
        new_table_nrgenetics = []
        for i, gene in enumerate(genes_nrgenetics):
            row = []
            row.append(gene)
            row.append(domain_column_nrgenetics[i])
            string_domains = ''
            string_pfamid = ''
            # If the protein hasn't any domain, add static 'no-domains' information
            if domain_column_nrgenetics[i] == '.':
                string_domains += Constants.DOMAIN_NONE + Constants.DOMAIN_COMMA
                string_pfamid += '.,'
                count_no_domain += 1
            # If the protein contains some domains checks the class and
            # akes a string containing the class domains separated by a comma
            else:
                # the domains are separated by a comma
                domains = domain_column_nrgenetics[i].split(',')
                for type_domain in domains:
                    # the domain present also the number information
                    # X-domain[n] for this reason in order to take only the domain name
                    # the domain is split to '['
                    type_domain = type_domain.split('[')[0]
                    if type_domain in dict_pfam:
                        string_pfamid += dict_pfam[type_domain] + ','
                    else:
                        string_pfamid += '-,'
                    print type_domain
                    if type_domain in dict_type_domain:
                        class_domain = dict_type_domain[type_domain][0]
                        if class_domain == Constants.DOMAIN_CLASSICAL:
                            string_domains += Constants.DOMAIN_CLASSICAL + Constants.DOMAIN_COMMA
                            count_classical += 1
                        elif class_domain == Constants.DOMAIN_NONCLASSICAL:
                            string_domains += Constants.DOMAIN_NONCLASSICAL + Constants.DOMAIN_COMMA
                            count_nonclassic += 1
                        elif class_domain == Constants.DOMAIN_UNKNOWN:
                            string_domains += Constants.DOMAIN_UNKNOWN + Constants.DOMAIN_COMMA
                            count_unclissified += 1
                    elif type_domain not in dict_type_domain:
                        string_domains += Constants.DOMAIN_OTHER + Constants.DOMAIN_COMMA
                        count_other_class += 1
                    else:
                        print 'unexpected case', type_domain
            row.append(string_pfamid[0:len(string_pfamid) - 1])
            row.append(string_domains[0:len(string_domains) - 1])
            new_table_nrgenetics.append(row)

        # print of proteins number for each domain class
        Logger.get_instance().info(str(count_classical))
        Logger.get_instance().info(str(count_nonclassic))
        Logger.get_instance().info(str(count_unclissified))
        Logger.get_instance().info(str(count_other_class))
        Logger.get_instance().info(str(count_no_domain))

        self.path_ouput_file_nrgenetics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FINAL_TABLE_NATREVGENETICS, True)
        FileWriter.write_table(self.path_ouput_file_nrgenetics,
                               new_table_nrgenetics)

        # reading of sequences file in order to take the headers
        # reading of gene and protein id correspondences for different genes

        self.file_sequences = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FILE_SEQ_PROPERTY, True)
        self.header = InfoFasta.get_header(self.file_sequences, text=False)
        gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
        prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]

        # Construction of table containing the RBP protein and the corresponding class domain and pfam id
        count_j = 0
        count_n = 0
        final_table = []
        title = [
            'gene id', 'type_domain', 'pfam id', 'class domain', 'prot id'
        ]
        final_table.append(title)
        # table construction
        for n, gene in enumerate(gene_seq):
            Logger.get_instance().info(str(n + 1))
            Logger.get_instance().info(gene)
            # if gene is in jprot and in nrgenetics
            if gene in genes_jprot and gene in genes_nrgenetics:
                count_n += 1
                ind_gene = genes_nrgenetics.index(gene)
                row = new_table_nrgenetics[ind_gene]
                row.append(prot_seq[n])
            # if gene is in jprot and not in nrgenetics
            elif gene in genes_jprot and gene not in genes_nrgenetics:
                count_j += 1
                ind_gene = genes_jprot.index(gene)
                row = new_table_jprot[ind_gene]
                row.append(prot_seq[n])
            # if gene not in jprot and gene in nrgenetics
            elif gene not in genes_jprot and gene in genes_nrgenetics:
                count_n += 1
                ind_gene = genes_nrgenetics.index(gene)
                row = new_table_nrgenetics[ind_gene]
                row.append(prot_seq[n])
            # if the gene is not in both dataset: Error
            else:
                Logger.get_instance().info('Error' + gene + prot_seq[n])
            final_table.append(row)

        sort_table = TableWrapper.inv_column(final_table, 0, 4)

        sort_table = TableWrapper.inv_column(sort_table, 1, 4)
        sort_table = TableWrapper.inv_column(sort_table, 2, 4)
        sort_table = TableWrapper.inv_column(sort_table, 3, 4)

        self.final_file_table = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FINALE_TABLE_RBP_DATASET_PROPERTY,
                       True)
        FileWriter.write_table(self.file_final_table, sort_table)

        # Counting the number of protein with Classical domain, Non-classical, unclassified or combinations of this class
        prot_name = TableWrapper.get_column(sort_table, 1, 1)
        class_domain = TableWrapper.get_column(sort_table, 4, 1)
        classical = []
        nonclassical = []
        unclissified = []
        otherclass = []
        nodomain = []

        # there are several combinations of class domain in a protein
        # the list creation in according to class domain has been performed by the filters of spreadsheet (see documentation)
        for n, domain in enumerate(class_domain):
            diff_domain = domain.split(',')
            for item in diff_domain:
                if item == Constants.DOMAIN_CLASSICAL:
                    classical.append(prot_name[n])
                elif item == Constants.DOMAIN_NONCLASSICAL:
                    nonclassical.append(prot_name[n])
                elif item == Constants.DOMAIN_UNKNOWN:
                    unclissified.append(prot_name[n])
                elif item == Constants.DOMAIN_OTHER:
                    otherclass.append(prot_name[n])
                elif item == Constants.DOMAIN_NONE:
                    nodomain.append(prot_name[n])
                else:
                    # many others
                    pass
Beispiel #23
0
    @staticmethod
    def whole_procedure():
        """Run the IUPred, ANCHOR and DisoRDPbind motifs steps, timing the whole run.

        The dataset type is read from the MOTIFS_DATASET_TYPE_PROPERTY property
        and is only used to label the log and timer messages.
        """
        dataset_label = PropertyManager.get_instance().get_property(
            DataConstants.MOTIFS_DATASET_TYPE_PROPERTY, True)

        # The chronometer brackets everything that follows
        Timer.get_instance().start_chrono()

        start_message = "        ........Start of " + dataset_label + " Motifs Analysis.....\n "
        Logger.get_instance().info(start_message)

        # The three analyses run one after the other on the same object
        analysis = MotifsAnalysis()
        analysis.iupred_motifs()
        analysis.anchor_motifs()
        analysis.disordp_motifs()

        end_message = " End of " + dataset_label + " Motifs Analysis"
        Timer.get_instance().stop_chrono(end_message)




if __name__ == '__main__':

    # The options must be initialized before any of them can be read
    OptionManager.get_instance().initialize()

    # Set the level of verbosity requested through the options
    logger = Logger.get_instance()
    verbosity = OptionManager.get_instance().get_option(
        OptionConstants.OPTION_VERBOSITY)
    logger.set_level(verbosity)

    # Load the properties file whose path is given through the options
    property_manager = PropertyManager.get_instance()
    properties_path = OptionManager.get_instance().get_option(
        OptionConstants.OPTION_MOTIFS_ANALYSIS_PROPERTY_PATH, True)
    property_manager.read_properties(properties_path)

    MotifsAnalysis.whole_procedure()
    
Beispiel #24
0
        # Start the chronometer: it is stopped on the last line of this body,
        # so it covers every step executed in between.
        Timer.get_instance().start_chrono()
        Logger.get_instance().info(
            "Start of the creation of RBP dataset.....\n ")

        # Object exposing the individual steps of the RBP dataset creation
        M = MakeDatasetRbp()

        # The steps below are disabled; only split_dataset() is executed.
        # NOTE(review): presumably they are toggled by hand and must run in
        # this order - confirm before re-enabling any of them.
        #M.delet_append_file()

        #M.comparison_dataset()
        #M.creation_list()
        #M.connection_to_ensembl()
        #M.dictionary_identifier()
        #M.longest_sequence()
        #M.isoform_sequences()
        #M.merger_sequences()
        M.split_dataset()

        # Stop the chronometer, passing the closing message to the timer
        Timer.get_instance().stop_chrono(' End of the creation of RBP dataset')


if __name__ == '__main__':

    # The options must be initialized before any of them can be read
    OptionManager.get_instance().initialize()

    # Retrieve the MakeDatasetRbp properties from the file given in the options
    property_manager = PropertyManager.get_instance()
    properties_path = OptionManager.get_instance().get_option(
        OptionConstants.OPTION_MAKEDATASETRBP_PROPERTIES_PATH, True)
    property_manager.read_properties(properties_path)

    MakeDatasetRbp.whole_procedure()
Beispiel #25
0
    def connection_to_ensembl(self):
        """Fetch from Ensembl the sequences of dataset 1 and of the novel genes of dataset 2.

        Every path and query type is read from the properties and stored on
        the instance. The download itself is delegated to
        Ensembl.list_get_seq, called once per dataset:

        - dataset 1 is queried with a gene list and a protein list;
        - dataset 2 is queried with the Constants.FILE_DIFF gene list only.
        """

        def required(property_name):
            # Every lookup passes True as second argument, like the rest of
            # this file. NOTE(review): presumably this flag makes a missing
            # property an explicit error instead of returning None - confirm
            # against PropertyManager.get_property.
            return PropertyManager.get_instance().get_property(
                property_name, True)

        Logger.get_instance().info(" Connection to Ensembl: Starting...\n")

        # DATASET 1
        # =============================================

        # Collection of sequences for dataset 1
        Logger.get_instance().info(" Dataset 1 : Extraction of sequences...\n")

        # Timer step
        Timer.get_instance().step(" Start of sequences extraction \n")

        # Definition of Ensembl list_get_seq arguments
        self.path_home = Constants.PATH_HOME
        self.list_path = Constants.PATH_HOME + required(
            DataConstants.DATASET_INPUT_PATH_PROPERTY)

        self.gene_list_1 = required(
            DataConstants.LIST_FILE_GENE_DATASET_1_PROPERTY)
        self.protein_list = required(
            DataConstants.LIST_FILE_PROTEIN_DATASET_1_PROPERTY)

        self.ensembl_gene_list_1_path = self.list_path + self.gene_list_1
        self.ensembl_protein_list_1_path = self.list_path + self.protein_list
        self.type_query1_ensembl = required(
            DataConstants.ENSEMBL_TYPE_QUERY_DATASET_1_PROPERTY)

        # Output folder shared by both datasets: computed once and reused below
        self.ensembl_path_output = Constants.PATH_HOME + required(
            DataConstants.ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY)
        # This lookup was the only one made without the True flag; it is now
        # consistent with all the others.
        self.ensembl_output_dataset1 = self.ensembl_path_output + required(
            DataConstants.ENSEMBL_FILE_SEQUENCES_1_PROPERTY)

        # Calling Ensembl.list_get_seq (the query type is stored as text)
        Ensembl.list_get_seq(
            self.ensembl_gene_list_1_path,
            int(self.type_query1_ensembl),
            path_protein_list=self.ensembl_protein_list_1_path,
            path_output=self.ensembl_output_dataset1)

        # Timer step
        Timer.get_instance().step(" End of Dataset 1 Sequences Extraction\n")

        Logger.get_instance().info(
            " Extraction of sequences for the dataset 1 has been completed \n\n"
        )

        # END DATASET 1
        # =====================================================

        # DATASET 2
        # =====================================================

        # Collection of sequences for dataset 2
        Logger.get_instance().info(
            " Dataset 2 : Extraction of sequences ....\n")

        # Definition of Ensembl list_get_seq arguments
        self.ensembl_input_list_2 = Constants.PATH_HOME + required(
            DataConstants.DATASET_OUTPUT_PROPERTY)
        self.gene_list_2 = Constants.FILE_DIFF
        self.ensembl_gene_list_2_path = self.ensembl_input_list_2 + self.gene_list_2
        self.type_query2_ensembl = required(
            DataConstants.ENSEMBL_TYPE_QUERY_DATASET_2_PROPERTY)

        self.ensembl_output_dataset2 = self.ensembl_path_output + required(
            DataConstants.ENSEMBL_FILE_SEQUENCES_2_PROPERTY)

        # Calling Ensembl.list_get_seq: no protein list is given for dataset 2
        Ensembl.list_get_seq(self.ensembl_gene_list_2_path,
                             int(self.type_query2_ensembl),
                             path_protein_list=None,
                             path_output=self.ensembl_output_dataset2)

        # Timer step
        Timer.get_instance().step(" End of Dataset 2 Sequences Extraction\n")

        Logger.get_instance().info(
            " Extraction of sequences for the dataset 2 has been completed \n\n"
        )

        # END DATASET 2
        # =====================================================

        Logger.get_instance().info(
            " The sequences file of dataset 1 and the novel gene in dataset 2 "
            "have been created in the following path  \n" +
            self.ensembl_path_output)
Beispiel #26
0
    @staticmethod
    def whole_procedure():
        """Run the Ensembl sequence-download pipeline, timing the whole run.

        The method takes no ``self``, so it is declared static: without the
        decorator, calling it on an instance (as the script entry point does)
        passes the instance as an unexpected argument and raises TypeError.
        As a static method it can be called on the class or on an instance.

        All the pipeline steps are currently disabled, so only the timer and
        log messages are produced.
        """

        # start chrono
        Timer.get_instance().start_chrono()
        Logger.get_instance().info("Start of the sequences extraction.....\n ")

        # Kept even though every step is disabled: the constructor may have
        # side effects, and the steps below need the object when re-enabled.
        D = DownloadEnsemblSeq()

        #D.download_product_gene_seq()

        #D.make_dictionary()
        #D.get_longest_seq()
        #D.isoform_sequences()
        #D.merger_sequences()

        Timer.get_instance().stop_chrono(' End of the sequences extraction')


if __name__ == '__main__':

    # The options must be initialized before any of them can be read
    OptionManager.get_instance().initialize()

    # Retrieve the DownloadEnsemblSeq properties from the file given in the options
    property_manager = PropertyManager.get_instance()
    properties_path = OptionManager.get_instance().get_option(
        OptionConstants.OPTION_DOWNLOADENSEMBLSEQ_PROPERTY_PATH, True)
    property_manager.read_properties(properties_path)

    # The procedure is invoked through an instance
    D = DownloadEnsemblSeq()
    D.whole_procedure()
Beispiel #27
0
 def particular_analysis(self):
     """Write, for each protein list, the matching rows of the disorder-tool outputs.

     Seven tables are read once (first line skipped): the ANCHOR, IUPred
     (two of them) and DisoRDPbind region tables, and the ANCHOR, IUPred and
     DisoRDPbind fraction tables. Then, for every file named in the
     SPECIFIC_LIST_NAMEFILE_PROPERTY property (comma separated), the protein
     ids are taken from the configured column of that file, and every table
     is filtered down to the rows whose first column is one of those ids.
     The filtered tables are written with the file name (up to its first
     dot) as prefix.
     """

     def required(property_name):
         return PropertyManager.get_instance().get_property(property_name, True)

     def select_rows(table, protein_ids, wanted):
         # Keep the rows whose protein id (same position in protein_ids) is wanted
         return [row for n, row in enumerate(table) if protein_ids[n] in wanted]

     Timer.get_instance().step(" Start of tools analysis for specific protein ")

     self.path_home = Constants.PATH_HOME
     self.path_input_anchor_file = self.path_home + required(DataConstants.SPECIFIC_INPUT_ANCHOR_FILE_PROPERTY)
     self.path_input_iupred_file = self.path_home + required(DataConstants.SPECIFIC_INPUT_IUPRED_FILE_PROPERTY)
     self.path_input_disordp_file = self.path_home + required(DataConstants.SPECIFIC_INPUT_DISORDP_FILE_PROPERTY)
     self.path_input_reg_anchor = self.path_home + required(DataConstants.SPECIFIC_INPUT_REG_ANCHOR_FILE_PROPERTY)
     self.path_input_reg_iupred_1 = self.path_home + required(DataConstants.SPECIFIC_INPUT_REG_IUPRED_1_FILE_PROPERTY)
     self.path_input_reg_iupred_2 = self.path_home + required(DataConstants.SPECIFIC_INPUT_REG_IUPRED_2_FILE_PROPERTY)
     self.path_input_reg_diso = self.path_home + required(DataConstants.SPECIFIC_INPUT_REG_DISO_FILE_PROPERTY)
     self.input_files = self.path_home + required(DataConstants.SPECIFIC_INPUT_DIR_FILE_PROPERTY)
     self.list_namefiles = required(DataConstants.SPECIFIC_LIST_NAMEFILE_PROPERTY)
     self.path_output_dir = self.path_home + required(DataConstants.SPECIFIC_OUTPUT_DIR_PROPERTY)
     self.path_output_dir_diso = self.path_home + required(DataConstants.SPECIFIC_OUTPUT_DIR_DISO_PROPERTY)

     # This parameter represents the column of protein id in the classification files
     #
     # In Domain Class  files the column of protein id is the 2 ( that is 1 for python)
     # In RNA target files the column of protein id is the 1 (that is 0 for python)
     #

     #self.protein_column_rna =  PropertyManager.get_instance().get_property( DataConstants.SPECIFIC_PROTEIN_LIST_COLUMN_RNA_PROPERTY, True)
     self.protein_column_class = required(DataConstants.SPECIFIC_PROTEIN_LIST_COLUMN_CLASS_PROPERTY)
     # The property is stored as text; convert it once instead of once per file
     protein_column = int(self.protein_column_class)

     # region file
     anchor_table = FileParser.make_table(self.path_input_reg_anchor, skip=1)
     iupred_table_1 = FileParser.make_table(self.path_input_reg_iupred_1, skip=1)
     iupred_table_2 = FileParser.make_table(self.path_input_reg_iupred_2, skip=1)
     disordp_table = FileParser.make_table(self.path_input_reg_diso, skip=1)

     # table file (fraction)
     anchor_t = FileParser.make_table(self.path_input_anchor_file, skip=1)
     iupred_t = FileParser.make_table(self.path_input_iupred_file, skip=1)
     disordp_t = FileParser.make_table(self.path_input_disordp_file, skip=1)

     # The protein id columns do not depend on the classification file being
     # processed, so they are extracted once, before the loop.
     prot_id_anchor = TableWrapper.get_column(anchor_table, 0)
     prot_id_iupred_1 = TableWrapper.get_column(iupred_table_1, 0)
     prot_id_iupred_2 = TableWrapper.get_column(iupred_table_2, 0)
     prot_id_disordp = TableWrapper.get_column(disordp_table, 0)

     prot_id_anchor_t = TableWrapper.get_column(anchor_t, 0)
     prot_id_iupred_t = TableWrapper.get_column(iupred_t, 0)
     prot_id_disordp_t = TableWrapper.get_column(disordp_t, 0)

     list_filenames = self.list_namefiles.split(',')

     for filename in list_filenames:
         feature = filename.split('.')[0]
         table_domain = FileParser.make_table(self.input_files + str(filename))
         # A set gives constant-time membership tests for the seven filters below.
         # Assumes the column values are hashable (strings read from the file).
         wanted_proteins = set(TableWrapper.get_column(table_domain, protein_column))

         # region file
         new_table_anchor = select_rows(anchor_table, prot_id_anchor, wanted_proteins)
         new_table_iupred_1 = select_rows(iupred_table_1, prot_id_iupred_1, wanted_proteins)
         new_table_iupred_2 = select_rows(iupred_table_2, prot_id_iupred_2, wanted_proteins)
         new_table_disordp = select_rows(disordp_table, prot_id_disordp, wanted_proteins)
         anchor_output_file_path = self.path_output_dir + feature + '_AnchorRegion.txt'
         iupred1_output_file_path = self.path_output_dir + feature + '_IUPredRegion_0.4.txt'
         iupred2_output_file_path = self.path_output_dir + feature + '_IUPredRegion_0.5.txt'
         disordp_output_file_path = self.path_output_dir_diso + feature + '_DisoRDPRegion.txt'

         # Table file (fraction)
         new_table_a = select_rows(anchor_t, prot_id_anchor_t, wanted_proteins)
         new_table_i = select_rows(iupred_t, prot_id_iupred_t, wanted_proteins)
         new_table_d = select_rows(disordp_t, prot_id_disordp_t, wanted_proteins)
         anchor_output_table = self.path_output_dir + feature + '_AnchorTable.txt'
         iupred_output_table = self.path_output_dir + feature + '_IUPredTable_0.4_0.5.txt'
         disordp_output_table = self.path_output_dir_diso + feature + '_DisoRDPTable.txt'

         # file writing

         # Region file
         FileWriter.write_table(anchor_output_file_path, new_table_anchor)
         FileWriter.write_table(iupred1_output_file_path, new_table_iupred_1)
         FileWriter.write_table(iupred2_output_file_path, new_table_iupred_2)
         FileWriter.write_table(disordp_output_file_path, new_table_disordp)

         # Table file
         FileWriter.write_table(anchor_output_table, new_table_a)
         FileWriter.write_table(iupred_output_table, new_table_i)
         FileWriter.write_table(disordp_output_table, new_table_d)

     Timer.get_instance().step(" End of tools analysis for specific protein ")
Beispiel #28
0
    def merger_sequences(self):
        """Merge the sequence files into the new RBP dataset and report lost genes.

        Three steps are performed:

        1. the longest-sequence file and the selected-isoform file are merged
           into the dataset 2 sequence file;
        2. the dataset 1 file and the dataset 2 sequence file are merged into
           the final dataset file;
        3. the original gene list (Constants.FILE_DIFF) is compared with the
           genes found in the headers of the dataset 2 sequence file, and the
           second element of the comparison result is logged as the list of
           genes lost during the request to Ensembl.
        """

        def required_property(key):
            return PropertyManager.get_instance().get_property(key, True)

        def log_info(message):
            Logger.get_instance().info(message)

        log_info(
            " Union of the longest sequences and the random selected isoform ")

        # Step 1: longest novel sequences + random selected isoforms of dataset 2
        longest_dir = Constants.PATH_HOME + required_property(
            DataConstants.FUSION_PATH_INPUT_PROPERTY)
        self.path_input_longest = longest_dir
        self.path_file_longest = longest_dir + required_property(
            DataConstants.LONGEST_FILE_PROPERTY)
        self.path_file_isoform = longest_dir + required_property(
            DataConstants.SELECTED_ISOFORM_FILE_PROPERTY)

        output_dir = Constants.PATH_HOME + required_property(
            DataConstants.FUSION_PATH_OUTPUT_PROPERTY)
        self.path_output_seq = output_dir
        self.path_file_seq_dataset_2 = output_dir + required_property(
            DataConstants.FUSION_FILE_SEQ_DATASET_2_PROPERTY)

        FileParser.merge_file(self.path_file_longest,
                              self.path_file_isoform,
                              self.path_file_seq_dataset_2)

        # Step 2: sequences of dataset 1 + novel sequences of dataset 2
        dataset1_dir = Constants.PATH_HOME + required_property(
            DataConstants.FUSION_PATH_INPUT_DATASET_1_PROPERTY)
        self.path_input_seq_dataset1 = dataset1_dir
        self.path_file_dataset1 = dataset1_dir + required_property(
            DataConstants.FUSION_FILE_DATASET_1_PROPERTY)

        self.path_file_dataset12 = self.path_output_seq + required_property(
            DataConstants.FUSION_DATASET_12_PROPERTY)

        log_info(
            " Union of sequences respectively of dataset 1 and the novel dataset 2 proteins \n "
        )

        FileParser.merge_file(self.path_file_dataset1,
                              self.path_file_seq_dataset_2,
                              self.path_file_dataset12)

        log_info(" The New RBP Dataset has been created\n ")

        # Step 3: compare the genes given as input with the genes present
        # after the connection to Ensembl, to spot genes that were lost
        # (no longer available, or pseudogenes).
        log_info(" Comparison between original genes and Ensembl output ")

        self.path_home = Constants.PATH_HOME
        original_dir = required_property(DataConstants.DATASET_OUTPUT_PROPERTY)
        self.path_input_original_file = original_dir
        self.dataset_output = original_dir
        self.original_file = (self.path_home + self.path_input_original_file +
                              Constants.FILE_DIFF)

        original_genes = FileParser.read_file(self.original_file)

        # The output paths are read again from the properties before use
        self.path_output_seq = Constants.PATH_HOME + required_property(
            DataConstants.FUSION_PATH_OUTPUT_PROPERTY)
        self.path_file_seq_dataset_2 = self.path_output_seq + required_property(
            DataConstants.FUSION_FILE_SEQ_DATASET_2_PROPERTY)

        final_headers = InfoFasta.get_header(self.path_file_seq_dataset_2)
        # Characters 1 to 15 of each header are taken as the gene id
        # (presumably the id that follows the leading '>' - verify).
        final_genes = [header[1:16] for header in final_headers]

        comparison = InfoDataset.comparison_dataset(original_genes,
                                                    final_genes,
                                                    header=False)

        lost_genes = '\n'.join(comparison[1])

        log_info(
            " The genes lost during the request to Ensembl are : \n" +
            lost_genes)
Beispiel #29
0
    def mrna_protein(self):
        """Assign a putative RNA target to each J proteomics gene and write the result.

        The info table (tab separated, first line skipped) gives, for each
        gene, a flag per dataset (columns 2 to 6: castello, baltz, esc, rbpdb,
        rnacompete). For every gene found in the headers of the sequence file:

        - at least one 'Y' among castello/baltz/esc -> target 'mRNA';
        - otherwise, rbpdb 'N' and rnacompete 'Y'   -> target 'unknown';
        - otherwise, rbpdb 'Y' and rnacompete 'N'   -> listed as 'RBPDB'.

        The 'mRNA' and 'unknown' rows go to the putative RNA target file, the
        'RBPDB' rows to the RBPDB file; both are written tab separated.

        Raises:
            ValueError: if a gene of the sequence file is absent from the info
                table (raised by ``list.index``).
        """

        self.path_home = Constants.PATH_HOME
        self.jproteomics_seq = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVERNA_JPROTEOMICS_SEQ_PROPERTY,
                       True)
        self.jproteomics_info = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVERNA_JPROTEOMICS_INFO_PROPERTY,
                       True)

        # Reading J proteomics file containing information about the protein attendance in others datasets
        info_table = FileParser.make_table(self.jproteomics_info, '\t', skip=1)

        # column Extraction
        gene_info = TableWrapper.get_column(info_table, 1)
        castello = TableWrapper.get_column(info_table, 2)
        baltz = TableWrapper.get_column(info_table, 3)
        esc = TableWrapper.get_column(info_table, 4)
        rbpdb = TableWrapper.get_column(info_table, 5)
        rnacompete = TableWrapper.get_column(info_table, 6)

        # One (castello, baltz, esc) tuple per gene. list() keeps the result
        # indexable whether zip returns a list or an iterator.
        mrna = list(zip(castello, baltz, esc))

        self.header = InfoFasta.get_header(self.jproteomics_seq, text=False)

        # Protein and gene of dataset 2 attend in final RBP dataset:
        # the header is split as '>gene|...|protein'
        gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
        prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]

        # Construction of column containing the putative RNA target for each gene
        putative_rna_target = []
        rbpdb_target = []
        for n, gene_id in enumerate(gene_seq):
            Logger.get_instance().info(gene_id)
            ind_gene_id = gene_info.index(gene_id)
            # if the gene have at least one Y in this array can be considered to have mRNA target
            if 'Y' in mrna[ind_gene_id]:
                rna_target = 'mRNA'
                Logger.get_instance().info(
                    'The putative Rna target of this gene is ' + rna_target)
                putative_rna_target.append([gene_id, prot_seq[n], rna_target])
            elif 'N' in mrna[ind_gene_id]:
                # if the gene attend in rnacompete means that the RNA target is unkown
                if 'N' in rbpdb[ind_gene_id] and 'Y' in rnacompete[ind_gene_id]:
                    rna_target = 'unknown'
                    Logger.get_instance().info(
                        'The putative Rna target of this gene is ' +
                        rna_target)
                    putative_rna_target.append(
                        [gene_id, prot_seq[n], rna_target])
                # if the gene attend in rbpdb means just it is a RBP protein (no information about RNA target)
                elif 'Y' in rbpdb[ind_gene_id] and 'N' in rnacompete[
                        ind_gene_id]:
                    rna_target = 'RBPDB'
                    Logger.get_instance().info('The gene is in ' + rna_target)
                    rbpdb_target.append([gene_id, prot_seq[n], rna_target])
                # NOTE(review): any other rbpdb/rnacompete combination is
                # silently left out of both output files - confirm intended.
            else:
                # The row is a table line, not a string: convert it before
                # concatenating, otherwise this branch raises TypeError.
                Logger.get_instance().info(' Check this line' +
                                           str(info_table[ind_gene_id]))

        self.file_rna_target_jeproteomics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_MRNA_GENE_JPROTEOMICS_PROPERTY,
                       True)
        self.file_rbpdb_jproteomics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_RBPDB_GENE_JPROTEOMICS_PROPERTY,
                       True)

        # File Writing
        # The putative RNA target table is written to the path computed just
        # above; the write previously read a differently named attribute
        # that this method never sets.
        FileWriter.write_table(self.file_rna_target_jeproteomics,
                               putative_rna_target,
                               symbol='\t')

        FileWriter.write_table(self.file_rbpdb_jproteomics,
                               rbpdb_target,
                               symbol='\t')
Beispiel #30
0
    @staticmethod
    def whole_procedure():
        """Entry point of the disorder analysis, timed from start to end.

        Every analysis step is currently disabled, so the method only starts
        the chronometer, logs the start message, builds the analysis object
        and stops the chronometer.
        """
        start_message = "Start of Disorder Analysis.....\n "
        end_message = ' End of Disorder Analysis'

        # start chrono
        Timer.get_instance().start_chrono()
        Logger.get_instance().info(start_message)

        analysis = DisorderAnalysis()
        # Disabled steps:
        #analysis.change_header()
        #analysis.split_dataset()
        #analysis.anchor_analysis()
        #analysis.iupred_analysis()
        #analysis.analysis_tools_output()
        #analysis.disordpbind_analysis()
        #analysis.particular_analysis()

        Timer.get_instance().stop_chrono(end_message)
        
     


if __name__ == '__main__':

    # The options must be initialized before any of them can be read
    OptionManager.get_instance().initialize()

    # Load the disorder analysis properties from the file given in the options
    property_manager = PropertyManager.get_instance()
    properties_path = OptionManager.get_instance().get_option(
        OptionConstants.OPTION_DISORDER_ANALYSIS_PATH, True)
    property_manager.read_properties(properties_path)

    DisorderAnalysis.whole_procedure()