def __init__(self):
    """Read the input locations shared by the motif analyses.

    Sets ``path_home`` from ``Constants.PATH_HOME`` and resolves, relative to
    it, the protein list file, the motif folder and the domain region file
    named in the properties. The protein list file is read immediately with
    ``FileParser.read_file`` and stored in ``protein_list``.
    """
    self.path_home = Constants.PATH_HOME

    def _home_path(key):
        # Property value appended to the home folder; the property is looked
        # up with the same ``True`` flag for every key.
        return self.path_home + PropertyManager.get_instance().get_property(key, True)

    self.protein_list_file = _home_path(DataConstants.MOTIFS_PROTEIN_FILE_PROPERTY)
    self.protein_list = FileParser.read_file(self.protein_list_file)
    self.motif_folder = _home_path(DataConstants.MOTIFS_FOLDER_PROPERTY)
    self.domain_region_file = _home_path(
        DataConstants.MOTIFS_DOMAIN_REGION_FILE_PROPERTY)
def dictionary_identifier(self):
    """Build the gene -> isoforms dictionary of dataset 2 and pickle it.

    The dictionary is produced by ``InfoFasta.make_dictionary`` from the
    dataset 2 sequence file and dumped with ``pickle`` into the file named by
    the dictionary properties. All resolved paths are stored on the instance.
    The output file is always closed, even when pickling fails, so the data
    is flushed to disk before the method returns.
    """
    Logger.get_instance().info(
        " Creation of a dictionary for novel gene of dataset 2\
 The dictionary structure is : \n \
 {gene = [ isoform1, isoform2,...isoformN]}")

    self.ensembl_path_output = Constants.PATH_HOME + PropertyManager.get_instance(
    ).get_property(DataConstants.ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY, True)
    self.ensembl_output_dataset2 = self.ensembl_path_output + PropertyManager.get_instance(
    ).get_property(DataConstants.ENSEMBL_FILE_SEQUENCES_2_PROPERTY, True)

    self.dictionary_output = Constants.PATH_HOME + PropertyManager.get_instance(
    ).get_property(DataConstants.DICTIONARY_PATH_OUTPUT_PROPERTY, True)
    self.dictionary_namefile = self.dictionary_output + PropertyManager.get_instance(
    ).get_property(DataConstants.DICTIONARY_NAME_FILE_PROPERTY, True)

    dict_identifier = InfoFasta.make_dictionary(self.ensembl_output_dataset2)

    # NOTE(review): assumes FileUtils.open_text_w returns a regular file
    # object (pickle.dump already requires it to be writable) - confirm.
    file_dict = FileUtils.open_text_w(self.dictionary_namefile)
    try:
        pickle.dump(dict_identifier, file_dict)
    finally:
        # The original never closed the handle, leaving the flush to the
        # garbage collector.
        file_dict.close()

    Logger.get_instance().info(
        " The creation of a dictionary for novel gene in dataset 2 is completed \n\n"
    )
def iupred_motifs(self):
    """Run the IUPred overlap analysis once for each of the two thresholds.

    Uses ``path_home``, ``protein_list``, ``motif_folder`` and
    ``domain_region_file``, which must already be set on the instance. The
    IUPred folder, both thresholds and both output folders are stored on the
    instance as they are resolved.
    """
    def _home_path(key):
        # Property value appended to the home folder.
        return self.path_home + PropertyManager.get_instance().get_property(key, True)

    Logger.get_instance().info(" .....Start of IUPred motifs analysis.....\n")
    self.iupred_folder = _home_path(DataConstants.MOTIFS_IUPRED_FOLDER_PROPERTY)

    # First pass: threshold 1 (reported as 0.4 in the timer messages).
    Timer.get_instance().step(" Start of IUPred motifs analysis - threshold = 0.4 \n")
    self.threshold_1 = Constants.MOTIFS_THRESHOLD_1
    self.output_folder_1 = _home_path(
        DataConstants.MOTIFS_IUP_OUTPUT_FOLDER_1_PROPERTY)
    GlobalOverlapRegionAnalysis.iupred_overlap_analysis(
        self.protein_list,
        self.iupred_folder,
        self.output_folder_1,
        self.threshold_1,
        self.motif_folder,
        self.domain_region_file)
    Timer.get_instance().step(" End of IUPred motifs analysis - threshold = 0.4 \n")

    # Second pass: threshold 2 (reported as 0.5 in the timer messages).
    Timer.get_instance().step(" Start of IUPred motifs analysis - threshold = 0.5 \n")
    self.threshold_2 = Constants.MOTIFS_THRESHOLD_2
    self.output_folder_2 = _home_path(
        DataConstants.MOTIFS_IUP_OUTPUT_FOLDER_2_PROPERTY)
    GlobalOverlapRegionAnalysis.iupred_overlap_analysis(
        self.protein_list,
        self.iupred_folder,
        self.output_folder_2,
        self.threshold_2,
        self.motif_folder,
        self.domain_region_file)
    Timer.get_instance().step(" End of IUPred motifs analysis - threshold = 0.5 \n")

    Logger.get_instance().info(" .....End of IUPred motifs analysis.....\n")
def make_dictionary(self):
    """Build a dictionary from the configured input file and pickle it.

    The dictionary is produced by ``InfoFasta.make_dictionary`` and dumped
    with ``pickle`` into the output file named by the download-dictionary
    properties. The resolved paths and the output handle (``dict_file``) are
    stored on the instance. The handle is closed before returning, even when
    pickling fails, so the data is flushed to disk.
    """
    Logger.get_instance().info(
        " Creation of a dictionary for novel gene of dataset 2\
 The dictionary structure is : \n \
 {gene = [ isoform1, isoform2,...isoformN]}")

    self.path_home = Constants.PATH_HOME
    self.path_input_file = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOWNLOAD_DICTIONARY_INPUT_FILE_PROPERTY, True)

    self.dictionary_output_path = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOWNLOAD_DICTIONARY_OUTPUT_PATH_PROPERTY, True)
    self.output_file_path = self.dictionary_output_path + PropertyManager.get_instance(
    ).get_property(DataConstants.DOWNLOAD_DICTIONARY_FILE_OUTPUT_PROPERTY, True)

    dict_identifier = InfoFasta.make_dictionary(self.path_input_file)

    # NOTE(review): assumes FileUtils.open_text_w returns a regular file
    # object (pickle.dump already requires it to be writable) - confirm.
    self.dict_file = FileUtils.open_text_w(self.output_file_path)
    try:
        pickle.dump(dict_identifier, self.dict_file)
    finally:
        # The original never closed the handle; ``self.dict_file`` is still
        # set afterwards but refers to a closed file.
        self.dict_file.close()

    Logger.get_instance().info(
        " The creation of a dictionary is completed \n\n")
def delet_append_file(self):
    """Delete the intermediate files named in the ``DEL_*`` properties.

    Resolves the Ensembl, longest-sequence and fusion folders relative to
    ``Constants.PATH_HOME``, stores every resolved folder and file path on
    the instance, then passes each file to ``RemoveFile.delete_file``.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    # Ensembl sequence files
    self.del_ensembl_input = Constants.PATH_HOME + _prop(
        DataConstants.DEL_ENSEMBL_PATH_PROPERTY)
    self.del_ensembl_file1 = self.del_ensembl_input + _prop(
        DataConstants.DEL_ENSEMBL_FILE1_PROPERTY)
    self.del_ensembl_file2 = self.del_ensembl_input + _prop(
        DataConstants.DEL_ENSEMBL_FILE2_PROPERTY)

    # Longest / isoform / random isoform files
    self.del_longest_input = Constants.PATH_HOME + _prop(
        DataConstants.DEL_LONGEST_PATH_PROPERTY)
    self.del_longest_file = self.del_longest_input + _prop(
        DataConstants.DEL_LONGEST_FILE_PROPERTY)
    self.del_isoform_file = self.del_longest_input + _prop(
        DataConstants.DEL_ISOFORM_FILE_PROPERTY)
    self.del_random_isoform_file = self.del_longest_input + _prop(
        DataConstants.DEL_RANDOM_ISOFORM_FILE_PROPERTY)

    # Fusion (merged dataset) files
    self.del_fusion_path = Constants.PATH_HOME + _prop(
        DataConstants.DEL_FUSION_PATH_PROPERTY)
    self.del_fusion_file_longest = self.del_fusion_path + _prop(
        DataConstants.DEL_FUSION_DATASET_LONGEST_PROPERTY)
    self.del_fusion_file_dataset12 = self.del_fusion_path + _prop(
        DataConstants.DEL_FUSION_DATASET12_PROPERTY)

    doomed_files = [
        self.del_ensembl_file1,
        self.del_ensembl_file2,
        self.del_longest_file,
        self.del_isoform_file,
        self.del_random_isoform_file,
        self.del_fusion_file_longest,
        self.del_fusion_file_dataset12,
    ]
    for doomed in doomed_files:
        RemoveFile.delete_file(doomed)
def isoform_sequences(self):
    """Pick one random isoform per gene and append it to the output fasta.

    Headers are read from the isoform fasta file, grouped by the gene
    identifier found at ``header[1:16]``, and one header per gene is drawn
    with ``random.choice``. For each selected header the sequence is looked
    up with the identifier at ``header[33:48]`` and the fasta record is
    appended to the file of selected isoforms.

    Changes with respect to the previous implementation:

    * headers are grouped by gene explicitly. The old code iterated a
      ``set`` of genes and sliced the header list by cumulative counts, so
      the slices only matched the genes if the set happened to iterate in
      file order;
    * grouping is a single pass instead of one ``list.count`` per gene;
    * the output file is closed (and therefore flushed) before returning.
    """
    Logger.get_instance().info(
        " Starting the random selection of isoforms with same length \n")
    Logger.get_instance().info(
        " The following headers are the proteins randomly selected \n")

    self.path_output_longest = Constants.PATH_HOME + PropertyManager.get_instance(
    ).get_property(DataConstants.LONGEST_PATH_OUTPUT_PROPERTY, True)

    self.path_file_isoform = self.path_output_longest + PropertyManager.get_instance(
    ).get_property(DataConstants.ISOFORM_FILE_PROPERTY, True)
    self.path_file_selected_isoform = self.path_output_longest + PropertyManager.get_instance(
    ).get_property(DataConstants.RANDOM_ISOFORM_SEQ_PROPERTY, True)

    # text=False because the input is a file path and not a list of lines.
    self.headers = InfoFasta.get_header(self.path_file_isoform, text=False)

    # Group the headers by gene, remembering the order in which each gene
    # first appears so the output order is deterministic for a given input.
    gene_order = []
    headers_by_gene = {}
    for header in self.headers:
        gene = header[1:16]
        if gene not in headers_by_gene:
            headers_by_gene[gene] = []
            gene_order.append(gene)
        headers_by_gene[gene].append(header)

    # One randomly chosen isoform header per gene.
    random_header = [
        random.choice(headers_by_gene[gene]) for gene in gene_order
    ]

    # NOTE(review): assumes FileUtils.open_text_a returns a regular file
    # object opened for appending - confirm.
    self.file_random_seq = FileUtils.open_text_a(
        self.path_file_selected_isoform)
    try:
        # Extract the sequence of every selected header from the isoform file.
        for header in random_header:
            Logger.get_instance().info('Header selected : ' + header)
            identifier = header[33:48]
            sequence = InfoFasta.get_seq(self.path_file_isoform, identifier)
            fasta_seq = SeqToFasta.give_fasta(header, sequence)
            self.file_random_seq.write(fasta_seq)
    finally:
        self.file_random_seq.close()

    Logger.get_instance().info(" End of selection random sequences \n ")
def iupred_analysis(self):
    """Run ``Iupred.global_iupred_analysis`` on the configured folders.

    The tool input folder and the IUPred output folder are resolved relative
    to ``Constants.PATH_HOME`` and stored on the instance; a timer step is
    recorded before and after the analysis.
    """
    def _home_path(key):
        # Property value appended to the project home folder.
        return Constants.PATH_HOME + PropertyManager.get_instance().get_property(key, True)

    Timer.get_instance().step(" Start of Iupred analysis...")

    self.tool_path_input = _home_path(DataConstants.TOOL_PATH_INPUT_PROPERTY)
    self.iupred_path_output = _home_path(DataConstants.IUPRED_PATH_OUTPUT_PROPERTY)

    Iupred.global_iupred_analysis(self.tool_path_input, self.iupred_path_output)

    Timer.get_instance().step(" End of Iupred analysis")
def anchor_analysis(self):
    """Run ``Anchor.global_anchor_analysis`` on the configured folders.

    The tool input folder, the ANCHOR output folder and the motif list path
    are resolved relative to ``Constants.PATH_HOME`` and stored on the
    instance; a timer step is recorded before and after the analysis.
    """
    def _home_path(key):
        # Property value appended to the project home folder.
        return Constants.PATH_HOME + PropertyManager.get_instance().get_property(key, True)

    Timer.get_instance().step(" Start of Anchor analysis...")

    self.tool_path_input = _home_path(DataConstants.TOOL_PATH_INPUT_PROPERTY)
    self.anchor_path_output = _home_path(DataConstants.ANCHOR_PATH_OUTPUT_PROPERTY)
    self.motif_list_path = _home_path(DataConstants.ANCHOR_MOTIF_PATH_PROPERTY)

    Anchor.global_anchor_analysis(
        self.motif_list_path,
        self.tool_path_input,
        self.anchor_path_output)

    Timer.get_instance().step(" End of Anchor analysis")
def disordpbind_analysis(self):
    """Post-process the DisoRDPbind output with ``DisoRDPbind.make_disordp_file``.

    The input file and output folder are resolved relative to
    ``Constants.PATH_HOME``; the binding partner and the amino-acid number
    are read as properties and converted with ``int`` at call time; the
    dataset type is passed through unchanged. Every value is stored on the
    instance.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    Timer.get_instance().step(" Start of DisoRDPbind output analysis.. ")

    self.path_home = Constants.PATH_HOME
    self.input_file = self.path_home + _prop(DataConstants.DISO_INPUT_FILE_PROPERTY)
    self.ouput_path = self.path_home + _prop(DataConstants.DISO_OUTPUT_FOLDER_PROPERTY)
    self.binding_partner = _prop(DataConstants.DISO_BINDING_PARTNER_PROPERTY)
    self.num_aa_diso = _prop(DataConstants.DISO_NUM_AA_PROPERTY)
    self.dataset_type = _prop(DataConstants.DISO_DATASET_TYPE_PROPERTY)

    DisoRDPbind.make_disordp_file(
        self.input_file,
        self.ouput_path,
        int(self.binding_partner),
        int(self.num_aa_diso),
        self.dataset_type)

    Timer.get_instance().step(" End of DisoRDPbind output analysis")
def anchor_motifs(self):
    """Run the ANCHOR overlap analysis on the protein list.

    Uses ``path_home``, ``protein_list``, ``motif_folder`` and
    ``domain_region_file``, which must already be set on the instance. The
    ANCHOR input and output folders are resolved relative to ``path_home``
    and stored on the instance.

    The opening log message and the closing timer message used to say
    "IUPred" (copied from the IUPred method); they now name ANCHOR, matching
    the other two messages of this method.
    """
    Logger.get_instance().info(
        " .....Start of ANCHOR motifs analysis.....\n")

    self.anchor_folder = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.MOTIFS_ANCHOR_FOLDER_PROPERTY, True)
    self.anchor_output_folder = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.MOTIFS_ANCHOR_OUTPUT_FOLDER_PROPERTY, True)

    Timer.get_instance().step(" Start of ANCHOR motifs analysis \n")

    GlobalOverlapRegionAnalysis.anchor_overlap_analysis(
        self.protein_list, self.anchor_folder, self.anchor_output_folder,
        self.motif_folder, self.domain_region_file)

    Timer.get_instance().step(" End of ANCHOR motifs analysis \n")

    Logger.get_instance().info(
        " .....End of ANCHOR motifs analysis.....\n")
def split_dataset(self):
    """Split the configured fasta dataset with ``SplitSeq.split_seq``.

    The input fasta file and output folder are resolved relative to
    ``Constants.PATH_HOME``; the start and end header indexes are read as
    properties and converted with ``int`` at call time. Every value is
    stored on the instance.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    Logger.get_instance().info(
        " Division of Dataset in many fasta file each containing one protein sequence")

    self.path_home = Constants.PATH_HOME
    self.split_path_input = self.path_home + _prop(
        DataConstants.SPLIT_PATH_INPUT_PROPERTY)
    self.split_file_fasta = self.split_path_input + _prop(
        DataConstants.SPLIT_DATASET_PROPERTY)
    self.split_path_output = Constants.PATH_HOME + _prop(
        DataConstants.SPLIT_PATH_OUTPUT_PROPERTY)
    self.split_start_index = _prop(DataConstants.SPLIT_START_HEADER_PROPERTY)
    self.split_end_index = _prop(DataConstants.SPLIT_END_HEADER_PROPERTY)

    first_index = int(self.split_start_index)
    last_index = int(self.split_end_index)
    SplitSeq.split_seq(
        self.split_file_fasta, self.split_path_output, first_index, last_index)

    Logger.get_instance().info(
        " The dataset has been split in many fasta files ")
def disordp_motifs(self):
    """Run the DisoRDPbind overlap analysis on the protein list.

    Uses ``path_home``, ``protein_list``, ``motif_folder`` and
    ``domain_region_file``, which must already be set on the instance. The
    DisoRDPbind input folder, output folder and file name are read from the
    properties and stored on the instance.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    Logger.get_instance().info(
        " .....Start of DisoRDPbind motifs analysis.....\n")

    self.disordp_folder = self.path_home + _prop(
        DataConstants.MOTIFS_DISORDP_FOLDER_PROPERTY)
    self.disordp_output_folder = self.path_home + _prop(
        DataConstants.MOTIFS_DISORDP_OUTPUT_FOLDER_PROPERTY)
    self.filename = _prop(DataConstants.MOTIFS_DISORDP_FILE_PROPERTY)

    Timer.get_instance().step(" Start of DisoRDPbind motifs analysis \n")
    GlobalOverlapRegionAnalysis.disordp_overlap_analysis(
        self.protein_list,
        self.disordp_folder,
        self.filename,
        self.motif_folder,
        self.domain_region_file,
        self.disordp_output_folder)
    Timer.get_instance().step(" End of DisoRDPbind motifs analysis \n")

    Logger.get_instance().info(
        " .....End of DisoRDPbind motifs analysis.....\n")
def download_product_gene_seq(self):
    """Download sequences through ``Ensembl.list_get_seq``.

    The gene list input file and the sequence output path are resolved
    relative to ``Constants.PATH_HOME``; the query type is read as a
    property and converted with ``int`` at call time. Every value is stored
    on the instance.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    Logger.get_instance().info(
        " Start of sequences download from Ensembl..")

    self.path_home = Constants.PATH_HOME
    self.gene_list_input = self.path_home + _prop(
        DataConstants.DOWNLOAD_ENSEMBL_FILE_INPUT_LIST_PROPERTY)
    self.ensembl_seq_output = self.path_home + _prop(
        DataConstants.DOWNLOAD_ENSEMBL_FILE_OUPUT_SEQ_PROPERTY)
    self.type_query = _prop(DataConstants.DOWNLOAD_ENSEMBL_TYPE_QUERY_PROPERTY)

    query_type = int(self.type_query)
    Ensembl.list_get_seq(
        self.gene_list_input, query_type, path_output=self.ensembl_seq_output)

    Logger.get_instance().info(" End of sequences download from Ensembl..")
def merger_sequences(self):
    """Merge the longest-sequence file and the random-isoform file.

    The two input files and the final dataset file are resolved relative to
    ``Constants.PATH_HOME``, stored on the instance, and handed to
    ``FileParser.merge_file``.
    """
    Logger.get_instance().info(
        " Union of the longest sequences and the random selected isoform ")

    self.path_home = Constants.PATH_HOME

    def _home_path(key):
        # Property value appended to the home folder.
        return self.path_home + PropertyManager.get_instance().get_property(key, True)

    # Inputs and output of the merge
    self.path_file_longest = _home_path(
        DataConstants.DOWNLOAD_FUSION_FILE_LONGEST_PROPERTY)
    self.path_file_random = _home_path(
        DataConstants.DOWNLOAD_FUSION_FILE_RANDOM_PROPERTY)
    self.path_final_file = _home_path(
        DataConstants.DOWNLOAD_FUSION_FINAL_DATASET_PROPERTY)

    FileParser.merge_file(
        self.path_file_longest,
        self.path_file_random,
        self.path_final_file)

    Logger.get_instance().info(
        " The Final Proteome Dataset has been created\n ")
def whole_procedure():
    """Run the IUPred, ANCHOR and DisoRDPbind motif analyses in sequence.

    The dataset type is read from the properties and only used in the start
    and end messages; the whole run is timed with the ``Timer`` chronometer.
    """
    dataset_type = PropertyManager.get_instance().get_property(
        DataConstants.MOTIFS_DATASET_TYPE_PROPERTY, True)

    start_message = " ........Start of " + dataset_type + " Motifs Analysis.....\n "
    end_message = " End of " + dataset_type + " Motifs Analysis"

    # start chrono
    Timer.get_instance().start_chrono()
    Logger.get_instance().info(start_message)

    analysis = MotifsAnalysis()
    analysis.iupred_motifs()
    analysis.anchor_motifs()
    analysis.disordp_motifs()

    Timer.get_instance().stop_chrono(end_message)
def get_longest_seq(self):
    """Select the longest sequence of each gene from the dataset file.

    The sequence file is read into a list of lines and handed to
    ``LengthSeq.longest_seq`` (``type_obj='list'``) together with the
    dictionary file and the two output files (longest sequences, isoforms).
    All paths are resolved from the properties and stored on the instance.

    The sequence file is now closed as soon as its lines have been read; the
    previous implementation left the handle open.
    """
    Logger.get_instance().info(
        " Start of the selection of longest sequences of dataset \n")

    self.path_home = Constants.PATH_HOME
    self.file_sequences = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOWNLOAD_LONGEST_SEQ_INPUT_FILE_PROPERTY, True)
    self.output_path = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOWNLOAD_LONGEST_SEQ_OUTPUT_PATH_PROPERTY, True)
    self.dictionary_file = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOWNLOAD_LONGEST_DICTIONARY_PROPERTY, True)
    self.longest_file = self.output_path + PropertyManager.get_instance(
    ).get_property(DataConstants.DOWNLOAD_LONGEST_SEQ_FILE_PROPERTY, True)
    self.isoform_file = self.output_path + PropertyManager.get_instance(
    ).get_property(DataConstants.DOWNLOAD_LONGEST_ISOFORM_FILE, True)

    # Read every line of the sequence file, then release the handle.
    # ``self.file_seq`` is still set afterwards but refers to a closed file.
    self.file_seq = open(self.file_sequences)
    try:
        self.seq_obj = self.file_seq.readlines()
    finally:
        self.file_seq.close()

    # Extraction of the longest sequences from the dataset sequences (isoforms)
    LengthSeq.longest_seq(
        self.seq_obj,
        self.dictionary_file,
        self.longest_file,
        self.isoform_file,
        type_obj='list')

    Logger.get_instance().info(
        " End of selection of the longest sequences: \n \
 two file have been generated one with longest sequences and the other one containing the isoform with same length "
    )
def comparison_dataset(self):
    """Compare the two configured datasets with ``InfoDataset.global_analysis_dataset``.

    Reads the input folder, the two file names, their index columns, their
    lengths and the output location from the properties, storing each on the
    instance. Index columns and lengths are converted with ``int`` and
    passed as 2-tuples; the input and output locations are prefixed with
    ``Constants.PATH_HOME``.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    # Announce the start of the comparison between the two datasets
    Logger.get_instance().info(" Start of comparison datasets:...")

    # Arguments of InfoDataset.global_analysis_dataset
    self.dataset_input_path = _prop(DataConstants.DATASET_INPUT_PATH_PROPERTY)
    self.dataset_1_file = _prop(DataConstants.DATASET_1_FILE_PROPERTY)
    self.dataset_2_file = _prop(DataConstants.DATASET_2_FILE_PROPERTY)
    self.dataset_1_index_col = _prop(DataConstants.DATASET_1_INDEX_COL_PROPERTY)
    self.dataset_2_index_col = _prop(DataConstants.DATASET_2_INDEX_COL_PROPERTY)
    self.dataset_output = _prop(DataConstants.DATASET_OUTPUT_PROPERTY)
    self.dataset_1_length = _prop(DataConstants.DATASET_1_LENGTH_PROPERTY)
    self.dataset_2_length = _prop(DataConstants.DATASET_2_LENGTH_PROPERTY)

    self.index_col = (
        int(self.dataset_1_index_col),
        int(self.dataset_2_index_col),
    )
    self.dataset_length = (
        int(self.dataset_1_length),
        int(self.dataset_2_length),
    )

    self.path_home = Constants.PATH_HOME
    input_folder = self.path_home + self.dataset_input_path
    dataset_files = (self.dataset_1_file, self.dataset_2_file)
    output_location = self.path_home + self.dataset_output

    InfoDataset.global_analysis_dataset(
        input_folder,
        dataset_files,
        self.index_col,
        output_location,
        length=self.dataset_length)

    Logger.get_instance().info(
        " The comparison of datasets is completed : \
 two file with the common and difference \
 items has been generated in \n\n " + self.dataset_output)
def analysis_tools_output(self):
    """Summarise the IUPred and ANCHOR outputs into the analysis folder.

    First calls ``Iupred.make_iupred_file`` with the two thresholds
    (converted with ``float``) and the amino-acid number (converted with
    ``int``), then ``Anchor.make_anchor_file`` with its own input folder and
    amino-acid number. Both write to the same output folder and receive the
    same dataset type. Every property value is stored on the instance.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    Timer.get_instance().step(" Start of tools analysis.. ")

    self.path_home = Constants.PATH_HOME

    # --- IUPred ---
    self.input_path_iupred = self.path_home + _prop(
        DataConstants.ANALYSIS_INPUT_PATH_IUPRED_PROPERTY)
    self.output_path_analysis = self.path_home + _prop(
        DataConstants.ANALYSIS_OUTPUT_PATH_TOOLS_PROPERTY)
    self.threshold_1 = _prop(DataConstants.ANALYSIS_THRESHOLD_1_PROPERTY)
    self.threshold_2 = _prop(DataConstants.ANALYSIS_THRESHOLD_2_PROPERTY)
    self.number_aa_iupred = _prop(
        DataConstants.ANALYSIS_AMINOACID_NUMBER_IUPRED_PROPERTY)
    self.dataset_type = _prop(DataConstants.ANALYSIS_DATASET_TYPE_PROPERTY)

    Iupred.make_iupred_file(
        self.input_path_iupred,
        self.output_path_analysis,
        float(self.threshold_1),
        float(self.threshold_2),
        int(self.number_aa_iupred),
        self.dataset_type)

    # --- ANCHOR ---
    self.input_path_anchor = Constants.PATH_HOME + _prop(
        DataConstants.ANALYSIS_INPUT_PATH_ANCHOR_PROPERTY)
    self.num_aa_anchor = _prop(
        DataConstants.ANALYSIS_AMINOACID_NUMBER_ANCHOR_PROPERTY)

    Anchor.make_anchor_file(
        self.input_path_anchor,
        self.output_path_analysis,
        int(self.num_aa_anchor),
        self.dataset_type)

    Timer.get_instance().step(" End of tools analysis")
def change_header(self):
    """Rewrite the headers of a sequence file with ``HeaderParser.change_header``.

    The input and output file paths are built from the header properties
    relative to ``Constants.PATH_HOME``; ``source`` and ``type_id`` are read
    as properties and converted with ``int`` at call time. Every value is
    stored on the instance.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    self.path_home = Constants.PATH_HOME

    # Input file
    self.path_input = self.path_home + _prop(DataConstants.HEADER_INPUT_SEQ_PROPERTY)
    self.namefile = _prop(DataConstants.HEADER_FILE_SEQ_PROPERTY)
    self.path_file_input = self.path_input + self.namefile

    # Output file
    self.path_output = self.path_home + _prop(DataConstants.HEADER_OUTPUT_SEQ_PROPERTY)
    self.path_file_output = self.path_output + _prop(
        DataConstants.HEADER_FILE_OUTPUT_PROPERTY)

    # Header conversion options
    self.source = _prop(DataConstants.HEADER_SOURCE_PROPERTY)
    self.type_id = _prop(DataConstants.HEADER_TYPE_ID_PROPERTY)

    HeaderParser.change_header(
        self.path_file_input,
        self.path_file_output,
        source=int(self.source),
        type_id=int(self.type_id))
def longest_sequence(self):
    """Select the longest sequences of dataset 2 with ``LengthSeq.longest_seq``.

    The sequence file, the dictionary file and the two output files (longest
    sequences, isoforms) are resolved relative to ``Constants.PATH_HOME``
    and stored on the instance. A timer step is recorded once the selection
    is done.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    Logger.get_instance().info(
        " Start of the selection of longest sequences of novel dataset \n")

    # Input sequences
    self.path_sequences = Constants.PATH_HOME + _prop(
        DataConstants.LONGEST_PATH_SEQUENCE_PROPERTY)
    self.file_sequences = self.path_sequences + _prop(
        DataConstants.LONGEST_PROT_FILE_SEQUENCES_2_PROPERTY)

    # Gene/isoform dictionary
    self.path_dictionary_identifier = Constants.PATH_HOME + _prop(
        DataConstants.LONGEST_PATH_DICTIONARY_PROPERTY)
    self.file_dictionary = self.path_dictionary_identifier + _prop(
        DataConstants.LONGEST_DICTIONARY_NAME_FILE_PROPERTY)

    # Output files
    self.path_output_longest = Constants.PATH_HOME + _prop(
        DataConstants.LONGEST_PATH_OUTPUT_PROPERTY)
    self.path_file_longest = self.path_output_longest + _prop(
        DataConstants.LONGEST_FILE_PROPERTY)
    self.path_file_isoform = self.path_output_longest + _prop(
        DataConstants.ISOFORM_FILE_PROPERTY)

    # Extract the longest sequences from the dataset 2 sequences (isoforms)
    LengthSeq.longest_seq(
        self.file_sequences,
        self.file_dictionary,
        self.path_file_longest,
        self.path_file_isoform)

    Timer.get_instance().step(
        " End of selection of the longest sequences in dataset 2 \n")

    Logger.get_instance().info(
        " End of selection of the longest sequences: \n \
 two file have been generated one with longest sequences and the other one containing the isoform with same length "
    )
def creation_list(self):
    """Write the gene column and the protein column of dataset 1 to two files.

    Dataset 1 is loaded with ``FileParser.make_table``; the gene and protein
    columns are extracted with ``TableWrapper.get_column`` (``start=1``) at
    the indexes given in the properties and written with
    ``FileWriter.write_table`` into the dataset input folder. Every property
    value and resolved path is stored on the instance.
    """
    def _prop(key):
        # Raw property value, looked up with the same ``True`` flag each time.
        return PropertyManager.get_instance().get_property(key, True)

    Logger.get_instance().info(" Creation of gene and protein list \n ")

    # Configuration of the two list files derived from dataset 1
    self.dataset_input_path = _prop(DataConstants.DATASET_INPUT_PATH_PROPERTY)
    self.file_dataset_1 = _prop(DataConstants.DATASET_1_FILE_PROPERTY)
    self.gene_index_col = _prop(DataConstants.LIST_GENE_INDEX_COL_PROPERTY)
    self.protein_index_col = _prop(DataConstants.LIST_PROTEIN_INDEX_COL_PROPERTY)
    self.list_gene_dataset_1 = _prop(
        DataConstants.LIST_FILE_GENE_DATASET_1_PROPERTY)
    self.list_protein_dataset_1 = _prop(
        DataConstants.LIST_FILE_PROTEIN_DATASET_1_PROPERTY)

    self.path_gene_dataset_1 = (
        Constants.PATH_HOME + self.dataset_input_path + self.list_gene_dataset_1)
    self.path_protein_dataset_1 = (
        Constants.PATH_HOME + self.dataset_input_path + self.list_protein_dataset_1)

    self.path_home = Constants.PATH_HOME
    self.path_dataset_1 = (
        self.path_home + self.dataset_input_path + self.file_dataset_1)

    table_1 = FileParser.make_table(self.path_dataset_1)
    gene_column = TableWrapper.get_column(
        table_1, int(self.gene_index_col), start=1)
    protein_column = TableWrapper.get_column(
        table_1, int(self.protein_index_col), start=1)

    FileWriter.write_table(self.path_gene_dataset_1, gene_column)
    FileWriter.write_table(self.path_protein_dataset_1, protein_column)

    Logger.get_instance().info(
        " The genes and proteins file of dataset 1 have been created \
 in the following path \n\n " + self.dataset_input_path)
def domain_classification(self):
    """Classify the protein domains of two source tables and merge the result.

    Steps:

    1. Read the pfam table (domain name -> pfam id) and the domain
       classification table (domain name -> class).
    2. For the JProteomics table and for the NatRevGenetics table, build a
       new table ``[gene, domains, pfam ids, domain classes]`` and write it
       to the file named by the matching property. The number of domains
       found per class is logged after each table.
    3. For every sequence header of the sequence file, take the row of its
       gene (NatRevGenetics first, otherwise JProteomics), append the
       protein id and collect it in the final table, which is then
       column-swapped and written to the final RBP table file.

    Fixes with respect to the previous implementation:

    * the final table was written to ``self.file_final_table`` although only
      ``self.final_file_table`` is assigned, which raised AttributeError;
    * a gene found in neither table used to re-append the row left over from
      a previous iteration; it is now logged and skipped;
    * rows of the per-source tables are copied before the protein id is
      appended, so a gene occurring in several headers no longer accumulates
      protein ids in a shared row;
    * the unreachable "unexpected case" branches were dropped (one of them
      called ``Logger.info`` with two arguments);
    * the duplicated classification code is shared through private helpers
      and gene lookups use dictionaries instead of ``list.index``.
    """
    self.path_home = Constants.PATH_HOME
    self.file_domain = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOMAIN_LIST_FILE_PROPERTY, True)
    self.file_jprot_information = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOMAIN_LIST_JPROTEOMICS_PROPERTY, True)
    self.file_pfamid = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOMAIN_FILE_PFAM_PROPERTY, True)

    # pfam table: column 3 holds the domain name, column 0 the pfam id
    table_pfam = FileParser.make_table(self.file_pfamid)
    dict_pfam = {row[3]: row[0] for row in table_pfam}
    dict_pfam['DUF1785'] = '-'
    dict_pfam['DUF1898'] = '-'

    # Domain classification table: domain name -> class
    table_domain = FileParser.make_table(self.file_domain)
    dict_type_domain = TableWrapper.make_dictionary(table_domain)

    # NOTE(review): the inverse dictionary (class -> domains) is not used
    # below. It is kept because it cannot be verified from here whether
    # TableWrapper.inv_column has side effects on table_domain.
    inverse_table_domain = TableWrapper.inv_column(table_domain, 0, 1)
    dict_class_domain = TableWrapper.make_dictionary(inverse_table_domain)

    # JProteomics
    # ===================================
    jprot_table = FileParser.make_table(self.file_jprot_information, skip=1)
    # Rows holding only the gene get placeholder domain and pfam columns
    for i, item in enumerate(jprot_table):
        if len(item) == 1:
            jprot_table[i] = [item[0], '.', '.']

    domain_column_jprot = TableWrapper.get_column(jprot_table, 1)
    genes_jprot = TableWrapper.get_column(jprot_table, 0)
    pfam_id_jprot = TableWrapper.get_column(jprot_table, 2)

    counts = self._new_class_counts()
    new_table_jprot = []
    for gene, domains, pfam_id in zip(genes_jprot, domain_column_jprot,
                                      pfam_id_jprot):
        if domains == '.':
            # No domain for this protein: static 'no domain' class
            labels = [Constants.DOMAIN_NONE]
            counts[Constants.DOMAIN_NONE] += 1
        else:
            labels = self._classify_domains(
                domains.split(','), dict_type_domain, counts)
        new_table_jprot.append(
            [gene, domains, pfam_id, Constants.DOMAIN_COMMA.join(labels)])
    self._log_class_counts(counts)

    self.path_ouput_file_jprot = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOMAIN_FINAL_TABLE_JPROT_PROPERTY, True)
    FileWriter.write_table(self.path_ouput_file_jprot, new_table_jprot)

    # NatRevGenetics
    # =========================================================
    self.input_file_nrgenetics = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOMAIN_LIST_NATREVGENETICS_PROPERTY, True)
    nrgenetics_table = FileParser.make_table(self.input_file_nrgenetics, skip=1)

    domain_column_nrgenetics = TableWrapper.get_column(nrgenetics_table, 2)
    genes_nrgenetics = TableWrapper.get_column(nrgenetics_table, 0)

    counts = self._new_class_counts()
    new_table_nrgenetics = []
    for gene, domains in zip(genes_nrgenetics, domain_column_nrgenetics):
        if domains == '.':
            pfam_ids = ['.']
            labels = [Constants.DOMAIN_NONE]
            counts[Constants.DOMAIN_NONE] += 1
        else:
            # Domains are written as X-domain[n]; keep only the name
            names = [item.split('[')[0] for item in domains.split(',')]
            pfam_ids = [dict_pfam.get(name, '-') for name in names]
            labels = self._classify_domains(names, dict_type_domain, counts)
        new_table_nrgenetics.append([
            gene,
            domains,
            ','.join(pfam_ids),
            Constants.DOMAIN_COMMA.join(labels),
        ])
    self._log_class_counts(counts)

    self.path_ouput_file_nrgenetics = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOMAIN_FINAL_TABLE_NATREVGENETICS, True)
    FileWriter.write_table(self.path_ouput_file_nrgenetics, new_table_nrgenetics)

    # Gene and protein ids taken from the headers of the sequence file
    self.file_sequences = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOMAIN_FILE_SEQ_PROPERTY, True)
    self.header = InfoFasta.get_header(self.file_sequences, text=False)
    gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
    prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]

    # First row of each gene per source table (same row list.index would give)
    rows_nrgenetics = {}
    for gene, source_row in zip(genes_nrgenetics, new_table_nrgenetics):
        rows_nrgenetics.setdefault(gene, source_row)
    rows_jprot = {}
    for gene, source_row in zip(genes_jprot, new_table_jprot):
        rows_jprot.setdefault(gene, source_row)

    # Final table: one row per sequence header, NatRevGenetics has priority
    final_table = [[
        'gene id', 'type_domain', 'pfam id', 'class domain', 'prot id'
    ]]
    for n, (gene, prot) in enumerate(zip(gene_seq, prot_seq)):
        Logger.get_instance().info(str(n + 1))
        Logger.get_instance().info(gene)
        if gene in rows_nrgenetics:
            source_row = rows_nrgenetics[gene]
        elif gene in rows_jprot:
            source_row = rows_jprot[gene]
        else:
            # The gene is in neither dataset: report it and add no row
            Logger.get_instance().info('Error' + gene + prot)
            continue
        # Copy so the per-source tables are not modified
        final_table.append(list(source_row) + [prot])

    sort_table = TableWrapper.inv_column(final_table, 0, 4)
    sort_table = TableWrapper.inv_column(sort_table, 1, 4)
    sort_table = TableWrapper.inv_column(sort_table, 2, 4)
    sort_table = TableWrapper.inv_column(sort_table, 3, 4)

    self.final_file_table = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.DOMAIN_FINALE_TABLE_RBP_DATASET_PROPERTY, True)
    FileWriter.write_table(self.final_file_table, sort_table)

    # Proteins grouped by the domain classes they contain.
    # NOTE(review): as in the previous implementation this grouping is
    # neither returned nor written anywhere; the original comment says the
    # lists were produced with spreadsheet filters instead.
    prot_name = TableWrapper.get_column(sort_table, 1, 1)
    class_domain = TableWrapper.get_column(sort_table, 4, 1)
    proteins_by_class = {
        Constants.DOMAIN_CLASSICAL: [],
        Constants.DOMAIN_NONCLASSICAL: [],
        Constants.DOMAIN_UNKNOWN: [],
        Constants.DOMAIN_OTHER: [],
        Constants.DOMAIN_NONE: [],
    }
    for name, classes in zip(prot_name, class_domain):
        for item in classes.split(','):
            if item in proteins_by_class:
                proteins_by_class[item].append(name)


def _new_class_counts(self):
    """Return a zeroed counter for each of the five domain classes."""
    return dict.fromkeys((
        Constants.DOMAIN_CLASSICAL,
        Constants.DOMAIN_NONCLASSICAL,
        Constants.DOMAIN_UNKNOWN,
        Constants.DOMAIN_OTHER,
        Constants.DOMAIN_NONE,
    ), 0)


def _classify_domains(self, domain_names, dict_type_domain, counts):
    """Return the class label of each domain name, updating counts in place.

    A domain missing from ``dict_type_domain`` is labelled
    ``Constants.DOMAIN_OTHER``. A known domain whose class is not one of
    classical / non-classical / unknown contributes no label, as before.
    """
    known_classes = (
        Constants.DOMAIN_CLASSICAL,
        Constants.DOMAIN_NONCLASSICAL,
        Constants.DOMAIN_UNKNOWN,
    )
    labels = []
    for name in domain_names:
        # Single-argument form prints the same text under Python 2 and 3
        print(name)
        if name not in dict_type_domain:
            labels.append(Constants.DOMAIN_OTHER)
            counts[Constants.DOMAIN_OTHER] += 1
            continue
        class_domain = dict_type_domain[name][0]
        if class_domain in known_classes:
            labels.append(class_domain)
            counts[class_domain] += 1
    return labels


def _log_class_counts(self, counts):
    """Log the counters: classical, non-classical, unknown, other, none."""
    for label in (
            Constants.DOMAIN_CLASSICAL,
            Constants.DOMAIN_NONCLASSICAL,
            Constants.DOMAIN_UNKNOWN,
            Constants.DOMAIN_OTHER,
            Constants.DOMAIN_NONE):
        Logger.get_instance().info(str(counts[label]))
@staticmethod
def whole_procedure():
    """Run the IUPred, ANCHOR and DisoRDPbind motif analyses in sequence.

    The dataset type is read from the MOTIFS_DATASET_TYPE_PROPERTY property
    and is only used to label the start/end messages. The whole run is
    bracketed by the Timer chrono.
    """
    dataset_label = PropertyManager.get_instance().get_property(
        DataConstants.MOTIFS_DATASET_TYPE_PROPERTY, True)
    start_message = " ........Start of " + dataset_label + " Motifs Analysis.....\n "
    end_message = " End of " + dataset_label + " Motifs Analysis"

    # The chrono is started before the first message is logged
    Timer.get_instance().start_chrono()
    Logger.get_instance().info(start_message)

    # One analysis object drives the three tools, in this fixed order
    analysis = MotifsAnalysis()
    analysis.iupred_motifs()
    analysis.anchor_motifs()
    analysis.disordp_motifs()

    Timer.get_instance().stop_chrono(end_message)


if __name__ == '__main__':
    option_manager = OptionManager.get_instance()
    option_manager.initialize()

    # Verbosity comes from the command-line options
    verbosity = option_manager.get_option(OptionConstants.OPTION_VERBOSITY)
    Logger.get_instance().set_level(verbosity)

    # Load the properties file of the motifs analysis before running it
    properties_path = option_manager.get_option(
        OptionConstants.OPTION_MOTIFS_ANALYSIS_PROPERTY_PATH, True)
    PropertyManager.get_instance().read_properties(properties_path)

    MotifsAnalysis.whole_procedure()
Timer.get_instance().start_chrono() Logger.get_instance().info( "Start of the creation of RBP dataset.....\n ") M = MakeDatasetRbp() #M.delet_append_file() #M.comparison_dataset() #M.creation_list() #M.connection_to_ensembl() #M.dictionary_identifier() #M.longest_sequence() #M.isoform_sequences() #M.merger_sequences() M.split_dataset() Timer.get_instance().stop_chrono(' End of the creation of RBP dataset') if __name__ == '__main__': OptionManager.get_instance().initialize() # Retrieve the MakeDatasetRbp properties PropertyManager.get_instance().read_properties( OptionManager.get_instance().get_option( OptionConstants.OPTION_MAKEDATASETRBP_PROPERTIES_PATH, True)) MakeDatasetRbp.whole_procedure()
def connection_to_ensembl(self):
    """Download the sequences of dataset 1 and of the novel genes of dataset 2.

    For each dataset the input list path, the query type and the output
    file are read from the properties and handed to Ensembl.list_get_seq.
    Dataset 1 is queried with a gene list and a protein list; dataset 2
    (the Constants.FILE_DIFF gene list) is queried without a protein list.
    Every resolved path is stored on the instance.
    """
    Logger.get_instance().info(" Connection to Ensembl: Starting...\n")

    # DATASET 1
    # =============================================

    # Collection of sequences for dataset 1
    Logger.get_instance().info(" Dataset 1 : Extraction of sequences...\n")

    # Timer step
    Timer.get_instance().step(" Start of sequences extraction \n")

    # Definition of Ensembl list_get_seq arguments
    self.path_home = Constants.PATH_HOME
    self.list_path = Constants.PATH_HOME + PropertyManager.get_instance().get_property(
        DataConstants.DATASET_INPUT_PATH_PROPERTY, True)
    self.gene_list_1 = PropertyManager.get_instance().get_property(
        DataConstants.LIST_FILE_GENE_DATASET_1_PROPERTY, True)
    self.protein_list = PropertyManager.get_instance().get_property(
        DataConstants.LIST_FILE_PROTEIN_DATASET_1_PROPERTY, True)
    self.ensembl_gene_list_1_path = self.list_path + self.gene_list_1
    self.ensembl_protein_list_1_path = self.list_path + self.protein_list
    self.type_query1_ensembl = PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_TYPE_QUERY_DATASET_1_PROPERTY, True)

    # Both datasets are written under the same output folder, so it is
    # resolved once here and reused for dataset 2.
    self.ensembl_path_output = Constants.PATH_HOME + PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY, True)
    # The flag is passed as for every other lookup in this method.
    # NOTE(review): presumably it makes a missing property fail at lookup
    # time rather than at the string concatenation - confirm against
    # PropertyManager.get_property.
    self.ensembl_output_dataset1 = self.ensembl_path_output + PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_FILE_SEQUENCES_1_PROPERTY, True)

    # Calling Ensembl.list_get_seq
    Ensembl.list_get_seq(
        self.ensembl_gene_list_1_path,
        int(self.type_query1_ensembl),
        path_protein_list=self.ensembl_protein_list_1_path,
        path_output=self.ensembl_output_dataset1)

    # Timer step
    Timer.get_instance().step(" End of Dataset 1 Sequences Extraction\n")

    Logger.get_instance().info(
        " Extraction of sequences for the dataset 1 has been completed \n\n")

    # END DATASET 1
    # =====================================================

    # DATASET 2
    # =====================================================

    # Collection of sequences for dataset 2
    Logger.get_instance().info(" Dataset 2 : Extraction of sequences ....\n")

    # Definition of Ensembl list_get_seq arguments
    self.ensembl_input_list_2 = Constants.PATH_HOME + PropertyManager.get_instance().get_property(
        DataConstants.DATASET_OUTPUT_PROPERTY, True)
    self.gene_list_2 = Constants.FILE_DIFF
    self.ensembl_gene_list_2_path = self.ensembl_input_list_2 + self.gene_list_2
    self.type_query2_ensembl = PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_TYPE_QUERY_DATASET_2_PROPERTY, True)
    self.ensembl_output_dataset2 = self.ensembl_path_output + PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_FILE_SEQUENCES_2_PROPERTY, True)

    # Calling Ensembl.list_get_seq (no protein list for dataset 2)
    Ensembl.list_get_seq(
        self.ensembl_gene_list_2_path,
        int(self.type_query2_ensembl),
        path_protein_list=None,
        path_output=self.ensembl_output_dataset2)

    # Timer step
    Timer.get_instance().step(" End of Dataset 2 Sequences Extraction\n")

    Logger.get_instance().info(
        " Extraction of sequences for the dataset 2 has been completed \n\n")

    # END DATASET 2
    # =====================================================

    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which would embed the source indentation in the message.
    Logger.get_instance().info(
        " The sequences file of dataset 1 and the novel gene in dataset 2 "
        "have been created in the following path \n" + self.ensembl_path_output)
@staticmethod
def whole_procedure():
    """Entry point of the Ensembl sequence download pipeline.

    Starts the Timer chrono, builds a DownloadEnsemblSeq object and stops
    the chrono. The individual pipeline steps are currently disabled and
    are re-enabled by uncommenting them.

    The method takes no ``self``; it is declared static so that it can be
    called both on the class and on an instance (the main guard below
    calls it on an instance).
    """
    # start chrono
    Timer.get_instance().start_chrono()
    Logger.get_instance().info("Start of the sequences extraction.....\n ")

    D = DownloadEnsemblSeq()
    #D.download_product_gene_seq()
    #D.make_dictionary()
    #D.get_longest_seq()
    #D.isoform_sequences()
    #D.merger_sequences()

    Timer.get_instance().stop_chrono(' End of the sequences extraction')


if __name__ == '__main__':
    OptionManager.get_instance().initialize()

    # Retrieve the properties DownloadEnsemblSeq
    PropertyManager.get_instance().read_properties(
        OptionManager.get_instance().get_option(
            OptionConstants.OPTION_DOWNLOADENSEMBLSEQ_PROPERTY_PATH, True))

    D = DownloadEnsemblSeq()
    D.whole_procedure()
def particular_analysis(self):
    """Split the disorder tool outputs by protein classification file.

    Seven input tables are read once (skipping one header line each): the
    region files of ANCHOR, IUPred (two files, written out with the 0.4
    and 0.5 suffixes) and DisoRDPbind, plus the fraction tables of ANCHOR,
    IUPred and DisoRDPbind. For every classification file named in the
    SPECIFIC_LIST_NAMEFILE_PROPERTY property (comma separated), the rows
    whose first column is a protein id listed in that classification file
    are written to ``<output dir><feature><suffix>``, where ``feature`` is
    the file name up to its first dot.
    """
    Timer.get_instance().step(" Start of tools analysis for specific protein ")

    self.path_home = Constants.PATH_HOME
    self.path_input_anchor_file = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_INPUT_ANCHOR_FILE_PROPERTY, True)
    self.path_input_iupred_file = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_INPUT_IUPRED_FILE_PROPERTY, True)
    self.path_input_disordp_file = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_INPUT_DISORDP_FILE_PROPERTY, True)
    self.path_input_reg_anchor = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_INPUT_REG_ANCHOR_FILE_PROPERTY, True)
    self.path_input_reg_iupred_1 = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_INPUT_REG_IUPRED_1_FILE_PROPERTY, True)
    self.path_input_reg_iupred_2 = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_INPUT_REG_IUPRED_2_FILE_PROPERTY, True)
    self.path_input_reg_diso = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_INPUT_REG_DISO_FILE_PROPERTY, True)
    self.input_files = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_INPUT_DIR_FILE_PROPERTY, True)
    self.list_namefiles = PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_LIST_NAMEFILE_PROPERTY, True)
    self.path_output_dir = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_OUTPUT_DIR_PROPERTY, True)
    self.path_output_dir_diso = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_OUTPUT_DIR_DISO_PROPERTY, True)

    # This parameter represents the column of protein id in the classification files
    #
    # In Domain Class files the column of protein id is the 2 (that is 1 for python)
    # In RNA target files the column of protein id is the 1 (that is 0 for python)
    #
    #self.protein_column_rna = PropertyManager.get_instance().get_property( DataConstants.SPECIFIC_PROTEIN_LIST_COLUMN_RNA_PROPERTY, True)
    self.protein_column_class = PropertyManager.get_instance().get_property(
        DataConstants.SPECIFIC_PROTEIN_LIST_COLUMN_CLASS_PROPERTY, True)
    protein_column = int(self.protein_column_class)

    # region file
    anchor_table = FileParser.make_table(self.path_input_reg_anchor, skip=1)
    iupred_table_1 = FileParser.make_table(self.path_input_reg_iupred_1, skip=1)
    iupred_table_2 = FileParser.make_table(self.path_input_reg_iupred_2, skip=1)
    disordp_table = FileParser.make_table(self.path_input_reg_diso, skip=1)

    # table file (fraction)
    anchor_t = FileParser.make_table(self.path_input_anchor_file, skip=1)
    iupred_t = FileParser.make_table(self.path_input_iupred_file, skip=1)
    disordp_t = FileParser.make_table(self.path_input_disordp_file, skip=1)

    # The tool tables and their protein id column do not depend on the
    # classification file, so they are prepared once instead of once per
    # file. Each entry is (table, protein ids, output dir, file suffix);
    # the order is the order in which the files are written.
    outputs = [
        # Region files
        (anchor_table, TableWrapper.get_column(anchor_table, 0),
         self.path_output_dir, '_AnchorRegion.txt'),
        (iupred_table_1, TableWrapper.get_column(iupred_table_1, 0),
         self.path_output_dir, '_IUPredRegion_0.4.txt'),
        (iupred_table_2, TableWrapper.get_column(iupred_table_2, 0),
         self.path_output_dir, '_IUPredRegion_0.5.txt'),
        (disordp_table, TableWrapper.get_column(disordp_table, 0),
         self.path_output_dir_diso, '_DisoRDPRegion.txt'),
        # Table files (fraction)
        (anchor_t, TableWrapper.get_column(anchor_t, 0),
         self.path_output_dir, '_AnchorTable.txt'),
        (iupred_t, TableWrapper.get_column(iupred_t, 0),
         self.path_output_dir, '_IUPredTable_0.4_0.5.txt'),
        (disordp_t, TableWrapper.get_column(disordp_t, 0),
         self.path_output_dir_diso, '_DisoRDPTable.txt'),
    ]

    list_filenames = self.list_namefiles.split(',')

    for filename in list_filenames:
        feature = filename.split('.')[0]
        table_domain = FileParser.make_table(self.input_files + str(filename))
        # A set gives constant-time membership tests for every table row.
        # NOTE(review): assumes the protein ids returned by get_column are
        # hashable (strings) - confirm against TableWrapper.get_column.
        wanted_proteins = set(TableWrapper.get_column(table_domain, protein_column))

        # file writing
        for table, protein_ids, output_dir, suffix in outputs:
            selected_rows = [line for line, protein_id in zip(table, protein_ids)
                             if protein_id in wanted_proteins]
            FileWriter.write_table(output_dir + feature + suffix, selected_rows)

    Timer.get_instance().step(" End of tools analysis for specific protein ")
def merger_sequences(self):
    """Merge the sequence files into the final dataset and report lost genes.

    Three steps are performed:

    1. the longest-sequence file and the selected-isoform file are merged
       into the dataset 2 sequence file;
    2. the dataset 1 sequence file and the dataset 2 sequence file are
       merged into the combined dataset file;
    3. the original gene list (Constants.FILE_DIFF) is compared with the
       gene ids taken from the headers of the dataset 2 sequence file,
       and element 1 of the comparison result is logged as the genes lost
       during the request to Ensembl.

    Every resolved path is stored on the instance.
    """
    def required_property(key):
        # Same lookup as everywhere else in the pipeline, flag set to True
        return PropertyManager.get_instance().get_property(key, True)

    log = Logger.get_instance().info

    # --- Step 1: longest sequences + randomly selected isoforms ---------
    Logger.get_instance().info(
        " Union of the longest sequences and the random selected isoform ")

    self.path_input_longest = Constants.PATH_HOME + required_property(
        DataConstants.FUSION_PATH_INPUT_PROPERTY)
    self.path_file_longest = self.path_input_longest + required_property(
        DataConstants.LONGEST_FILE_PROPERTY)
    self.path_file_isoform = self.path_input_longest + required_property(
        DataConstants.SELECTED_ISOFORM_FILE_PROPERTY)

    self.path_output_seq = Constants.PATH_HOME + required_property(
        DataConstants.FUSION_PATH_OUTPUT_PROPERTY)
    self.path_file_seq_dataset_2 = self.path_output_seq + required_property(
        DataConstants.FUSION_FILE_SEQ_DATASET_2_PROPERTY)

    FileParser.merge_file(
        self.path_file_longest,
        self.path_file_isoform,
        self.path_file_seq_dataset_2)

    # --- Step 2: dataset 1 + novel dataset 2 ----------------------------
    self.path_input_seq_dataset1 = Constants.PATH_HOME + required_property(
        DataConstants.FUSION_PATH_INPUT_DATASET_1_PROPERTY)
    self.path_file_dataset1 = self.path_input_seq_dataset1 + required_property(
        DataConstants.FUSION_FILE_DATASET_1_PROPERTY)
    self.path_file_dataset12 = self.path_output_seq + required_property(
        DataConstants.FUSION_DATASET_12_PROPERTY)

    log(" Union of sequences respectively of dataset 1 and the novel dataset 2 proteins \n ")

    FileParser.merge_file(
        self.path_file_dataset1,
        self.path_file_seq_dataset_2,
        self.path_file_dataset12)

    log(" The New RBP Dataset has been created\n ")

    # --- Step 3: look for genes that did not come back from Ensembl -----
    # The genes given as input are compared with the genes obtained after
    # the connection to Ensembl; this finds genes that are no longer
    # available or that are pseudogenes.
    log(" Comparison between original genes and Ensembl output ")

    self.path_home = Constants.PATH_HOME
    dataset_output_folder = required_property(DataConstants.DATASET_OUTPUT_PROPERTY)
    self.path_input_original_file = dataset_output_folder
    self.dataset_output = dataset_output_folder
    self.original_file = (self.path_home
                          + self.path_input_original_file
                          + Constants.FILE_DIFF)
    genes_before = FileParser.read_file(self.original_file)

    self.path_output_seq = Constants.PATH_HOME + required_property(
        DataConstants.FUSION_PATH_OUTPUT_PROPERTY)
    self.path_file_seq_dataset_2 = self.path_output_seq + required_property(
        DataConstants.FUSION_FILE_SEQ_DATASET_2_PROPERTY)

    # The gene id is taken as characters 1..15 of each header
    gene_id_span = slice(1, 16)
    genes_after = []
    for header in InfoFasta.get_header(self.path_file_seq_dataset_2):
        genes_after.append(header[gene_id_span])

    comparison = InfoDataset.comparison_dataset(
        genes_before, genes_after, header=False)
    lost_genes = '\n'.join(comparison[1])

    log(" The genes lost during the request to Ensembl are : \n" + lost_genes)
def mrna_protein(self):
    """Assign a putative RNA target to every gene of the J Proteomics dataset.

    The info table (tab separated, one header line skipped) provides the
    gene id in column 1 and a flag for Castello, Baltz, ESC, RBPDB and
    RNAcompete in columns 2 to 6. Genes and proteins are taken from
    fields 0 and 2 of the '|'-separated sequence headers. For each gene:

    * a 'Y' among the Castello/Baltz/ESC flags gives the target 'mRNA';
    * otherwise, when one of those flags is 'N':
        - 'N' in the RBPDB flag and 'Y' in the RNAcompete flag gives the
          target 'unknown';
        - 'Y' in the RBPDB flag and 'N' in the RNAcompete flag puts the
          gene in the separate RBPDB table;
        - any other combination is only logged for manual checking.

    The 'mRNA'/'unknown' rows and the RBPDB rows are written to two
    tab-separated files whose paths come from the properties.

    Raises:
        ValueError: if a gene of the sequence file is missing from the
            info table (raised by ``list.index``).
    """
    self.path_home = Constants.PATH_HOME
    self.jproteomics_seq = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.PUTATIVERNA_JPROTEOMICS_SEQ_PROPERTY, True)
    self.jproteomics_info = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.PUTATIVERNA_JPROTEOMICS_INFO_PROPERTY, True)

    # Reading J proteomics file containing information about the protein
    # attendance in others datasets
    info_table = FileParser.make_table(self.jproteomics_info, '\t', skip=1)

    # column Extraction
    gene_info = TableWrapper.get_column(info_table, 1)
    castello = TableWrapper.get_column(info_table, 2)
    baltz = TableWrapper.get_column(info_table, 3)
    esc = TableWrapper.get_column(info_table, 4)
    rbpdb = TableWrapper.get_column(info_table, 5)
    rnacompete = TableWrapper.get_column(info_table, 6)

    # One (castello, baltz, esc) tuple per row of the info table
    mrna = list(zip(castello, baltz, esc))

    self.header = InfoFasta.get_header(self.jproteomics_seq, text=False)

    # Protein and gene of dataset 2 attend in final RBP dataset
    gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
    prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]

    # Construction of column containing the putative RNA target for each
    # gene in according to if condition
    putative_rna_target = []
    rbpdb_target = []

    for gene_id, prot_id in zip(gene_seq, prot_seq):
        Logger.get_instance().info(gene_id)
        ind_gene_id = gene_info.index(gene_id)

        # if the gene have at least one Y in this array can be considered
        # to have mRNA target
        if 'Y' in mrna[ind_gene_id]:
            rna_target = 'mRNA'
            Logger.get_instance().info(
                'The putative Rna target of this gene is ' + rna_target)
            putative_rna_target.append([gene_id, prot_id, rna_target])
        elif 'N' in mrna[ind_gene_id]:
            # if the gene attend in rnacompete means that the RNA target is unknown
            if 'N' in rbpdb[ind_gene_id] and 'Y' in rnacompete[ind_gene_id]:
                rna_target = 'unknown'
                Logger.get_instance().info(
                    'The putative Rna target of this gene is ' + rna_target)
                putative_rna_target.append([gene_id, prot_id, rna_target])
            # if the gene attend in rbpdb means just it is a RBP protein
            # (no information about RNA target)
            elif 'Y' in rbpdb[ind_gene_id] and 'N' in rnacompete[ind_gene_id]:
                rna_target = 'RBPDB'
                Logger.get_instance().info('The gene is in ' + rna_target)
                rbpdb_target.append([gene_id, prot_id, rna_target])
            else:
                # str() is required: a table row cannot be concatenated
                # to the message directly
                Logger.get_instance().info(
                    ' Check this line' + str(info_table[ind_gene_id]))

    self.file_rna_target_jeproteomics = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.PUTATIVE_MRNA_GENE_JPROTEOMICS_PROPERTY, True)
    # The write below used to read this second name, which was never
    # assigned; it is kept as an alias of the path resolved above.
    self.file_mrna_jeproteomics = self.file_rna_target_jeproteomics
    self.file_rbpdb_jproteomics = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.PUTATIVE_RBPDB_GENE_JPROTEOMICS_PROPERTY, True)

    # File Writing
    FileWriter.write_table(self.file_rna_target_jeproteomics,
                           putative_rna_target, symbol='\t')
    FileWriter.write_table(self.file_rbpdb_jproteomics,
                           rbpdb_target, symbol='\t')
@staticmethod
def whole_procedure():
    """Entry point of the disorder analysis pipeline.

    The run is bracketed by the Timer chrono. A DisorderAnalysis object is
    built, but every pipeline step is currently disabled; a step is
    re-enabled by uncommenting its call below.
    """
    chrono = Timer.get_instance()
    chrono.start_chrono()

    Logger.get_instance().info("Start of Disorder Analysis.....\n ")

    pipeline = DisorderAnalysis()

    # Available steps, in pipeline order:
    #pipeline.change_header()
    #pipeline.split_dataset()
    #pipeline.anchor_analysis()
    #pipeline.iupred_analysis()
    #pipeline.analysis_tools_output()
    #pipeline.disordpbind_analysis()
    #pipeline.particular_analysis()

    chrono.stop_chrono(' End of Disorder Analysis')


if __name__ == '__main__':
    option_manager = OptionManager.get_instance()
    option_manager.initialize()

    # Load the properties file of the disorder analysis before running it
    properties_path = option_manager.get_option(
        OptionConstants.OPTION_DISORDER_ANALYSIS_PATH, True)
    PropertyManager.get_instance().read_properties(properties_path)

    DisorderAnalysis.whole_procedure()