Ejemplo n.º 1
0
 def __init__(self):
     """Resolve the motif input locations and load the protein list.

     Every location is built by prefixing ``Constants.PATH_HOME`` to a value
     read from the ``PropertyManager``; the protein list file is read
     immediately with ``FileParser.read_file``.
     """
     self.path_home = Constants.PATH_HOME
     properties = PropertyManager.get_instance()

     def from_home(property_key):
         # The second argument (True) is forwarded unchanged to get_property;
         # presumably it marks the property as mandatory -- TODO confirm.
         return self.path_home + properties.get_property(property_key, True)

     self.protein_list_file = from_home(
         DataConstants.MOTIFS_PROTEIN_FILE_PROPERTY)
     self.protein_list = FileParser.read_file(self.protein_list_file)
     self.motif_folder = from_home(DataConstants.MOTIFS_FOLDER_PROPERTY)
     self.domain_region_file = from_home(
         DataConstants.MOTIFS_DOMAIN_REGION_FILE_PROPERTY)
Ejemplo n.º 2
0
    def get_list_seq(path_input_list, path_output):
        """Write the Uniprot sequence of every listed protein to a file.

        :param path_input_list: path of the file read with
            ``FileParser.read_file``; each item it yields is passed to
            ``Uniprot.get_sequence``.
        :param path_output: path handed to ``FileUtils.open_text_a``
            (presumably opened in append mode -- confirm against FileUtils).
        :return: None; the sequences are written to ``path_output``.
        """
        seq_file = FileUtils.open_text_a(path_output)
        # try/finally guarantees the handle is released even when a download
        # or a write fails half-way through the list.
        try:
            for protein in FileParser.read_file(path_input_list):
                seq_file.write(Uniprot.get_sequence(protein, format_out=True))
        finally:
            seq_file.close()
Ejemplo n.º 3
0
    def merger_sequences(self):
        """Merge the longest-sequence file with the random-isoform file.

        The two input paths and the output path are read from the property
        manager, prefixed with ``Constants.PATH_HOME`` and stored on the
        instance before ``FileParser.merge_file`` is called.
        """
        logger = Logger.get_instance()
        logger.info(
            " Union of the longest sequences and the random selected isoform ")

        home = Constants.PATH_HOME
        self.path_home = home
        properties = PropertyManager.get_instance()

        # Inputs: longest sequences and randomly selected isoforms.
        self.path_file_longest = home + properties.get_property(
            DataConstants.DOWNLOAD_FUSION_FILE_LONGEST_PROPERTY, True)
        self.path_file_random = home + properties.get_property(
            DataConstants.DOWNLOAD_FUSION_FILE_RANDOM_PROPERTY, True)
        # Output: the merged dataset.
        self.path_final_file = home + properties.get_property(
            DataConstants.DOWNLOAD_FUSION_FINAL_DATASET_PROPERTY, True)

        FileParser.merge_file(
            self.path_file_longest,
            self.path_file_random,
            self.path_final_file,
        )

        logger.info(" The Final Proteome Dataset has been created\n ")
Ejemplo n.º 4
0
    def creation_list(self):
        """Extract the gene and protein columns of dataset 1 into two files.

        The dataset table is loaded with ``FileParser.make_table``; the gene
        and protein columns (indices read from the properties, converted with
        ``int``) are extracted with ``start=1`` and written with
        ``FileWriter.write_table``. All resolved settings and paths are kept
        on the instance.
        """
        logger = Logger.get_instance()
        logger.info(" Creation of gene and protein list \n ")

        def read_property(key):
            return PropertyManager.get_instance().get_property(key, True)

        # Settings describing dataset 1 and the columns of interest.
        self.dataset_input_path = read_property(
            DataConstants.DATASET_INPUT_PATH_PROPERTY)
        self.file_dataset_1 = read_property(
            DataConstants.DATASET_1_FILE_PROPERTY)
        self.gene_index_col = read_property(
            DataConstants.LIST_GENE_INDEX_COL_PROPERTY)
        self.protein_index_col = read_property(
            DataConstants.LIST_PROTEIN_INDEX_COL_PROPERTY)

        # Names of the two files to create.
        self.list_gene_dataset_1 = read_property(
            DataConstants.LIST_FILE_GENE_DATASET_1_PROPERTY)
        self.list_protein_dataset_1 = read_property(
            DataConstants.LIST_FILE_PROTEIN_DATASET_1_PROPERTY)

        # Input and outputs all live in the same dataset directory.
        input_dir = Constants.PATH_HOME + self.dataset_input_path
        self.path_gene_dataset_1 = input_dir + self.list_gene_dataset_1
        self.path_protein_dataset_1 = input_dir + self.list_protein_dataset_1

        self.path_home = Constants.PATH_HOME
        self.path_dataset_1 = (
            self.path_home + self.dataset_input_path + self.file_dataset_1)

        dataset_1 = FileParser.make_table(self.path_dataset_1)

        gene_column = TableWrapper.get_column(
            dataset_1, int(self.gene_index_col), start=1)
        protein_column = TableWrapper.get_column(
            dataset_1, int(self.protein_index_col), start=1)

        FileWriter.write_table(self.path_gene_dataset_1, gene_column)
        FileWriter.write_table(self.path_protein_dataset_1, protein_column)

        message = (" The genes and proteins file of dataset 1 have been created "
                   "in the following path  \n\n ")
        logger.info(message + self.dataset_input_path)
Ejemplo n.º 5
0
    def elm_region(protname, directory):
        """Collect the SLiM regions listed in the ELM file of one protein.

        The file name is ``Constants.ELM_FILE + protname +
        Constants.EXTENSION_TXT_FILE`` appended directly to ``directory``
        (no path separator is inserted).

        :param protname: protein name used to build the file name.
        :param directory: directory prefix of the ELM files.
        :return: dict mapping the first column of each row (the motif name)
            to a list of ``[start, end]`` integer pairs taken from the second
            and third columns; an empty dict when the file does not exist.
        """
        filename = Constants.ELM_FILE + protname + Constants.EXTENSION_TXT_FILE
        filepath = directory + filename
        regions_by_motif = {}

        # No ELM file for this protein: report it and return the empty dict.
        if not os.path.isfile(filepath):
            Logger.get_instance().info(' This protein has not SLiMs ')
            return regions_by_motif

        # The first line of the file is skipped (skip=1).
        for row in FileParser.make_table(filepath, skip=1):
            motif_name = row[0]
            start, end = int(row[1]), int(row[2])
            regions_by_motif.setdefault(motif_name, []).append([start, end])

        summary = ''.join([
            " In the protein ",
            protname,
            ' ',
            str(len(regions_by_motif)),
            ' SLiMs have been found',
        ])
        Logger.get_instance().info(summary)
        return regions_by_motif
Ejemplo n.º 6
0
 def particular_analysis(self):
     """Split the disorder-tool outputs by protein classification file.

     For every classification file named in the comma-separated
     ``SPECIFIC_LIST_NAMEFILE`` property, the rows of the seven tool tables
     (ANCHOR / IUPred / DisoRDP region and fraction tables) whose first
     column appears in the protein column of that classification file are
     written to ``<output dir><feature><suffix>``, where ``feature`` is the
     part of the file name before its first dot.

     All resolved paths and settings are stored on the instance.
     """
     Timer.get_instance().step(" Start of tools analysis for specific protein ")

     properties = PropertyManager.get_instance()
     self.path_home = Constants.PATH_HOME

     def from_home(property_key):
         return self.path_home + properties.get_property(property_key, True)

     self.path_input_anchor_file = from_home(DataConstants.SPECIFIC_INPUT_ANCHOR_FILE_PROPERTY)
     self.path_input_iupred_file = from_home(DataConstants.SPECIFIC_INPUT_IUPRED_FILE_PROPERTY)
     self.path_input_disordp_file = from_home(DataConstants.SPECIFIC_INPUT_DISORDP_FILE_PROPERTY)
     self.path_input_reg_anchor = from_home(DataConstants.SPECIFIC_INPUT_REG_ANCHOR_FILE_PROPERTY)
     self.path_input_reg_iupred_1 = from_home(DataConstants.SPECIFIC_INPUT_REG_IUPRED_1_FILE_PROPERTY)
     self.path_input_reg_iupred_2 = from_home(DataConstants.SPECIFIC_INPUT_REG_IUPRED_2_FILE_PROPERTY)
     self.path_input_reg_diso = from_home(DataConstants.SPECIFIC_INPUT_REG_DISO_FILE_PROPERTY)
     self.input_files = from_home(DataConstants.SPECIFIC_INPUT_DIR_FILE_PROPERTY)
     self.list_namefiles = properties.get_property(DataConstants.SPECIFIC_LIST_NAMEFILE_PROPERTY, True)
     self.path_output_dir = from_home(DataConstants.SPECIFIC_OUTPUT_DIR_PROPERTY)
     self.path_output_dir_diso = from_home(DataConstants.SPECIFIC_OUTPUT_DIR_DISO_PROPERTY)

     # Column holding the protein id in the classification files.
     #
     # In Domain Class files the protein id is column 2 (index 1 in Python).
     # In RNA target files the protein id is column 1 (index 0 in Python).
     #
     #self.protein_column_rna =  PropertyManager.get_instance().get_property( DataConstants.SPECIFIC_PROTEIN_LIST_COLUMN_RNA_PROPERTY, True)
     self.protein_column_class = properties.get_property(DataConstants.SPECIFIC_PROTEIN_LIST_COLUMN_CLASS_PROPERTY, True)
     protein_column = int(self.protein_column_class)

     def load_with_ids(path):
         # Load a tool table (first line skipped) together with its
         # first column, which holds the protein id of each row.
         table = FileParser.make_table(path, skip=1)
         return table, TableWrapper.get_column(table, 0)

     # The tool tables do not depend on the classification file, so they are
     # loaded -- and their id columns extracted -- once, outside the loop.
     # Each entry is ((table, ids), output directory, file-name suffix);
     # the order is the order in which the files are written.
     outputs = [
         # region files
         (load_with_ids(self.path_input_reg_anchor), self.path_output_dir, '_AnchorRegion.txt'),
         (load_with_ids(self.path_input_reg_iupred_1), self.path_output_dir, '_IUPredRegion_0.4.txt'),
         (load_with_ids(self.path_input_reg_iupred_2), self.path_output_dir, '_IUPredRegion_0.5.txt'),
         (load_with_ids(self.path_input_reg_diso), self.path_output_dir_diso, '_DisoRDPRegion.txt'),
         # table files (fraction)
         (load_with_ids(self.path_input_anchor_file), self.path_output_dir, '_AnchorTable.txt'),
         (load_with_ids(self.path_input_iupred_file), self.path_output_dir, '_IUPredTable_0.4_0.5.txt'),
         (load_with_ids(self.path_input_disordp_file), self.path_output_dir_diso, '_DisoRDPTable.txt'),
     ]

     for filename in self.list_namefiles.split(','):
         feature = filename.split('.')[0]
         table_domain = FileParser.make_table(self.input_files + str(filename))
         # A set gives constant-time membership tests for the row filter
         # below (assumes the protein ids are hashable, e.g. strings).
         selected_proteins = set(TableWrapper.get_column(table_domain, protein_column))

         for (table, protein_ids), output_dir, suffix in outputs:
             selected_rows = [
                 row for n, row in enumerate(table)
                 if protein_ids[n] in selected_proteins
             ]
             FileWriter.write_table(output_dir + feature + suffix, selected_rows)

     Timer.get_instance().step(" End of tools analysis for specific protein ")
Ejemplo n.º 7
0
 def get_dataset_feature(dataset_path, col_feature, skip=1):
     """Return one column of a dataset file.

     :param dataset_path: path passed to ``FileParser.make_table``.
     :param col_feature: column selector passed to ``TableWrapper.get_column``.
     :param skip: forwarded to ``FileParser.make_table`` as ``skip``
         (default 1).
     :return: whatever ``TableWrapper.get_column`` returns for that column.
     """
     return TableWrapper.get_column(
         FileParser.make_table(dataset_path, skip=skip), col_feature)
Ejemplo n.º 8
0
    def merger_sequences(self):
        """Build the merged sequence datasets and report the genes lost.

        Three steps, all driven by paths read from the property manager:

        1. merge the longest-sequence file and the selected-isoform file into
           the dataset 2 sequence file;
        2. merge the dataset 1 sequence file with that result into the
           combined dataset file;
        3. compare the original gene list with the gene ids cut out of the
           headers of the dataset 2 sequence file and log the second element
           of the comparison result.
        """
        logger = Logger.get_instance()

        def prop(key):
            return PropertyManager.get_instance().get_property(key, True)

        logger.info(
            " Union of the longest sequences and the random selected isoform ")

        # Step 1: longest Novel sequences + randomly selected isoforms of
        # dataset 2.
        input_dir = Constants.PATH_HOME + prop(
            DataConstants.FUSION_PATH_INPUT_PROPERTY)
        self.path_input_longest = input_dir
        self.path_file_longest = input_dir + prop(
            DataConstants.LONGEST_FILE_PROPERTY)
        self.path_file_isoform = input_dir + prop(
            DataConstants.SELECTED_ISOFORM_FILE_PROPERTY)

        self.path_output_seq = Constants.PATH_HOME + prop(
            DataConstants.FUSION_PATH_OUTPUT_PROPERTY)
        self.path_file_seq_dataset_2 = self.path_output_seq + prop(
            DataConstants.FUSION_FILE_SEQ_DATASET_2_PROPERTY)

        FileParser.merge_file(
            self.path_file_longest,
            self.path_file_isoform,
            self.path_file_seq_dataset_2,
        )

        # Step 2: merge the two sequence datasets (Novel_JProteomics and
        # NatRevGenetics).
        self.path_input_seq_dataset1 = Constants.PATH_HOME + prop(
            DataConstants.FUSION_PATH_INPUT_DATASET_1_PROPERTY)
        self.path_file_dataset1 = self.path_input_seq_dataset1 + prop(
            DataConstants.FUSION_FILE_DATASET_1_PROPERTY)
        self.path_file_dataset12 = self.path_output_seq + prop(
            DataConstants.FUSION_DATASET_12_PROPERTY)

        logger.info(
            " Union of sequences respectively of dataset 1 and the novel dataset 2 proteins \n "
        )

        FileParser.merge_file(
            self.path_file_dataset1,
            self.path_file_seq_dataset_2,
            self.path_file_dataset12,
        )

        logger.info(" The New RBP Dataset has been created\n ")

        # Step 3: look for pseudogenes / withdrawn genes in dataset 2 by
        # comparing the genes given as input with the genes that came back
        # from Ensembl.
        logger.info(" Comparison between original genes and Ensembl output ")
        self.path_home = Constants.PATH_HOME
        dataset_output = prop(DataConstants.DATASET_OUTPUT_PROPERTY)
        self.path_input_original_file = dataset_output
        self.dataset_output = dataset_output
        self.original_file = (
            self.path_home + self.path_input_original_file
            + Constants.FILE_DIFF)

        original_genes = FileParser.read_file(self.original_file)

        # The output paths are resolved again from the properties, exactly as
        # in step 1.
        self.path_output_seq = Constants.PATH_HOME + prop(
            DataConstants.FUSION_PATH_OUTPUT_PROPERTY)
        self.path_file_seq_dataset_2 = self.path_output_seq + prop(
            DataConstants.FUSION_FILE_SEQ_DATASET_2_PROPERTY)

        # Characters 1..15 of each header are taken as the gene id
        # (presumably the '>' is skipped and a 15-character Ensembl gene id
        # follows -- TODO confirm against the FASTA header layout).
        final_genes = [
            header[1:16]
            for header in InfoFasta.get_header(self.path_file_seq_dataset_2)
        ]

        comparison = InfoDataset.comparison_dataset(
            original_genes, final_genes, header=False)

        lost_genes = '\n'.join(comparison[1])

        logger.info(
            " The genes lost during the request to Ensembl are : \n"
            + lost_genes)
Ejemplo n.º 9
0
    def anchor_info(input_file, prot_id, num_aa=10):
        """Summarise the ANCHOR output of one protein.

        The input file is parsed as a tab-separated table (first two lines
        skipped) whose columns are: position, amino acid, probability and a
        0/1 flag.

        :param input_file: path of the ANCHOR output of a single protein.
        :param prot_id: protein identifier; used in the log message and as
            the first field of every row of the region table.
        :param num_aa: minimum number of consecutive flagged residues a run
            needs to be reported as a region (default 10).
        :return: dict with the keys
            ``"position"``, ``"aminoacids"`` (raw columns),
            ``"probability"`` (floats),
            ``"values>threshold"`` (number of flags equal to 1),
            ``"fraction"`` (that number over the sequence length, rounded to
            3 decimals, as a string; ``"0.0"`` for an empty table),
            ``"binary_array"`` (flags as ints),
            ``"regions"`` (``[start, end)`` index pairs into the flag array),
            ``"num region"`` (number of regions, as a string) and
            ``"anchor table"`` (rows ``[prot_id, region number, first
            position, last position, length]``).
        """
        Logger.get_instance().debug(
            ' Creation of a dictionary containing the dictionary analysis '
            'informations of protein ' + prot_id)

        dictionary_anchor = {}

        # ANCHOR output of one protein.
        anchor_table = FileParser.make_table(input_file, '\t', skip=2)

        position = TableWrapper.get_column(anchor_table, 0)
        aminoacid = TableWrapper.get_column(anchor_table, 1)
        prob = [float(item)
                for item in TableWrapper.get_column(anchor_table, 2)]
        binary_array = [int(item)
                        for item in TableWrapper.get_column(anchor_table, 3)]

        dictionary_anchor["position"] = position
        dictionary_anchor["aminoacids"] = aminoacid
        dictionary_anchor["probability"] = prob

        # Residues flagged with 1, and their share of the whole sequence.
        num_ones = binary_array.count(1)
        length = len(binary_array)
        # An empty table would otherwise raise ZeroDivisionError.
        fraction = round(num_ones / float(length), 3) if length else 0.0

        dictionary_anchor["values>threshold"] = num_ones
        dictionary_anchor["fraction"] = str(fraction)

        # Collect every run of at least num_aa consecutive ones as a
        # half-open [start, end) index pair.
        up_region = []
        count_one = 0
        for ind, num in enumerate(binary_array):
            if num == 1:
                count_one += 1
            elif num == 0:
                # A zero closes the current run; keep it when long enough.
                if count_one >= num_aa:
                    up_region.append([ind - count_one, ind])
                count_one = 0
        # A run reaching the end of the sequence is never closed by a zero,
        # so it has to be flushed here.
        if length and binary_array[-1] == 1 and count_one >= num_aa:
            up_region.append([length - count_one, length])

        # Map the index pairs onto the position column to build the table.
        table = []
        for number, (start, end) in enumerate(up_region, 1):
            table.append([
                prot_id,
                str(number),
                position[start],
                position[end - 1],
                str(end - start),
            ])

        dictionary_anchor["binary_array"] = binary_array
        dictionary_anchor["regions"] = up_region
        dictionary_anchor["num region"] = str(len(up_region))
        dictionary_anchor["anchor table"] = table

        return dictionary_anchor
Ejemplo n.º 10
0
    def list_get_seq(path_input,
                     type_query,
                     path_protein_list=None,
                     path_output=None):
        """Download the Ensembl sequences of every gene listed in a file.

        :param path_input: path of the gene list, read with
            ``FileParser.read_file``.
        :param type_query: 1 to request ``'all'`` sequences of a gene, 2 to
            request ``'one'``; any other value raises ``KeyError``.
        :param path_protein_list: optional path of a protein list. When
            given, only the FASTA entries whose characters 32..46 (after the
            ``'>'``) appear in that list are kept.
        :param path_output: optional output path. When given, the sequences
            are appended to that file and nothing is returned; with a protein
            list only the first matching entry of each gene is written.
        :return: ``None`` when ``path_output`` is given; otherwise the
            concatenated FASTA text of all genes (no protein list) or the
            list of matching FASTA entries of all genes (protein list given).
        """
        list_item = FileParser.read_file(path_input)
        dict_query = {1: 'all', 2: 'one'}

        # Read the protein list once instead of once per gene.
        list_protein = None
        if path_protein_list is not None:
            list_protein = FileParser.read_file(path_protein_list)

        # Open the output once; the original reopened it for every gene and
        # only ever closed the last handle.
        file_out = None
        if path_output is not None:
            file_out = FileUtils.open_text_a(path_output)

        count_duplicate_genes = 0
        all_seqs = []
        prot_seq = []
        try:
            for i, item in enumerate(list_item):
                Logger.get_instance().info(
                    str(i + 1) + ' Extraction of gene sequence(s) : ' + item)
                fasta_seq = Ensembl.get_sequence(item, dict_query[type_query])

                # Genes without a usable sequence are skipped.
                if fasta_seq in (item + ' No available',
                                 item + ' pseudogene'):
                    continue

                seqs = fasta_seq + '\n'

                if list_protein is None:
                    if file_out is None:
                        all_seqs.append(seqs)
                    else:
                        file_out.write(seqs)
                    continue

                # Among the isoforms of the gene keep only those whose id is
                # present in the protein list.
                protein_seq = [
                    '>' + fasta for fasta in seqs.split('>')
                    if fasta[32:47] in list_protein
                ]

                if file_out is None:
                    prot_seq.extend(protein_seq)
                elif protein_seq:
                    file_out.write(protein_seq[0])
                else:
                    count_duplicate_genes += 1
                    Logger.get_instance().info(
                        " Number of duplicated genes: " +
                        str(count_duplicate_genes))
                    # NOTE(review): indexes the protein list with the gene
                    # index, i.e. assumes both lists are parallel -- confirm.
                    Logger.get_instance().info(
                        " The gene duplicated is: " + str(item) + '|' +
                        str(list_protein[i]))
        finally:
            if file_out is not None:
                file_out.close()

        if path_output is not None:
            return None
        if path_protein_list is None:
            return ''.join(all_seqs)
        return prot_seq
Ejemplo n.º 11
0
    def mrna_protein(self):
        """Assign a putative RNA target to every gene of the J Proteomics set.

        The info table (tab-separated, first line skipped) supplies, per
        gene (column 1), the flags of columns 2-4 (Castello, Baltz, ESC),
        column 5 (RBPDB) and column 6 (RNAcompete). For every FASTA header of
        the sequence file the gene (first ``|`` field) is looked up in the
        table and classified:

        * any ``'Y'`` among the three flags of columns 2-4 -> ``'mRNA'``;
        * otherwise, if one of them is ``'N'``:
          RBPDB contains ``'N'`` and RNAcompete contains ``'Y'`` ->
          ``'unknown'``; RBPDB contains ``'Y'`` and RNAcompete contains
          ``'N'`` -> ``'RBPDB'`` (stored in a separate table);
        * anything else is only logged.

        Rows are ``[gene id, protein id (third ``|`` field), target]`` and
        the two tables are written tab-separated to the paths read from the
        properties.

        :raises ValueError: when a gene of the sequence file is missing from
            the info table (raised by ``list.index``).
        """
        logger = Logger.get_instance()
        properties = PropertyManager.get_instance()

        self.path_home = Constants.PATH_HOME
        self.jproteomics_seq = self.path_home + properties.get_property(
            DataConstants.PUTATIVERNA_JPROTEOMICS_SEQ_PROPERTY, True)
        self.jproteomics_info = self.path_home + properties.get_property(
            DataConstants.PUTATIVERNA_JPROTEOMICS_INFO_PROPERTY, True)

        # J Proteomics file describing in which other datasets each protein
        # appears.
        info_table = FileParser.make_table(self.jproteomics_info, '\t', skip=1)

        gene_info = TableWrapper.get_column(info_table, 1)
        castello = TableWrapper.get_column(info_table, 2)
        baltz = TableWrapper.get_column(info_table, 3)
        esc = TableWrapper.get_column(info_table, 4)
        rbpdb = TableWrapper.get_column(info_table, 5)
        rnacompete = TableWrapper.get_column(info_table, 6)

        # One (castello, baltz, esc) tuple per gene.
        mrna = list(zip(castello, baltz, esc))

        self.header = InfoFasta.get_header(self.jproteomics_seq, text=False)

        # Gene and protein ids of the sequences present in the final dataset.
        gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
        prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]

        putative_rna_target = []
        rbpdb_target = []
        for n, gene_id in enumerate(gene_seq):
            logger.info(gene_id)
            ind_gene_id = gene_info.index(gene_id)
            flags = mrna[ind_gene_id]

            if 'Y' in flags:
                # At least one of the three datasets reports the gene.
                rna_target = 'mRNA'
                logger.info(
                    'The putative Rna target of this gene is ' + rna_target)
                putative_rna_target.append([gene_id, prot_seq[n], rna_target])
            elif 'N' in flags:
                in_rbpdb = rbpdb[ind_gene_id]
                in_rnacompete = rnacompete[ind_gene_id]
                if 'N' in in_rbpdb and 'Y' in in_rnacompete:
                    # Present only in RNAcompete: the RNA target is unknown.
                    rna_target = 'unknown'
                    logger.info(
                        'The putative Rna target of this gene is ' +
                        rna_target)
                    putative_rna_target.append(
                        [gene_id, prot_seq[n], rna_target])
                elif 'Y' in in_rbpdb and 'N' in in_rnacompete:
                    # Present only in RBPDB: known RBP, no target information.
                    rna_target = 'RBPDB'
                    logger.info('The gene is in ' + rna_target)
                    rbpdb_target.append([gene_id, prot_seq[n], rna_target])
                # NOTE(review): genes flagged in both or in neither of
                # RBPDB / RNAcompete end up in no table -- confirm intended.
            else:
                # The table row is not a string, so it must be converted
                # before being appended to the message.
                logger.info(' Check this line' + str(info_table[ind_gene_id]))

        self.file_rna_target_jeproteomics = self.path_home + properties.get_property(
            DataConstants.PUTATIVE_MRNA_GENE_JPROTEOMICS_PROPERTY, True)
        self.file_rbpdb_jproteomics = self.path_home + properties.get_property(
            DataConstants.PUTATIVE_RBPDB_GENE_JPROTEOMICS_PROPERTY, True)

        # The original wrote to self.file_mrna_jeproteomics, an attribute
        # that this method never assigns; the path resolved just above is
        # the one to use.
        FileWriter.write_table(self.file_rna_target_jeproteomics,
                               putative_rna_target,
                               symbol='\t')

        FileWriter.write_table(self.file_rbpdb_jproteomics,
                               rbpdb_target,
                               symbol='\t')
Ejemplo n.º 12
0
    def domain_classification(self):

        self.path_home = Constants.PATH_HOME
        self.file_domain = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_LIST_FILE_PROPERTY, True)
        self.file_jprot_information = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_LIST_JPROTEOMICS_PROPERTY, True)
        self.file_pfamid = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FILE_PFAM_PROPERTY, True)

        # reading of pfam table in particular of the pfam id and the domain name
        table_pfam = FileParser.make_table(self.file_pfamid)
        pfamid = TableWrapper.get_column(table_pfam, 0)
        domain_name = TableWrapper.get_column(table_pfam, 3)

        # dictionary pfamid and domain name
        dict_pfam = {row[3]: row[0] for row in table_pfam}
        dict_pfam['DUF1785'] = '-'
        dict_pfam['DUF1898'] = '-'

        # reading of domain classification provided by dataset 2
        # dictionary motifs--> class

        table_domain = FileParser.make_table(self.file_domain)
        dict_type_domain = TableWrapper.make_dictionary(table_domain)

        # make a inverse dictionary class--> motifs
        inverse_table_domain = TableWrapper.inv_column(table_domain, 0, 1)
        dict_class_domain = TableWrapper.make_dictionary(inverse_table_domain)

        # reading of domain information of dataset 2
        jprot_table = FileParser.make_table(self.file_jprot_information,
                                            skip=1)

        for i, item in enumerate(jprot_table):
            if len(item) == 1:
                jprot_table[i] = [item[0], '.', '.']

        # Jproteomics
        # ===================================

        # extraction of domain for each gene
        # extraction of gene id
        domain_column_jprot = TableWrapper.get_column(jprot_table, 1)
        genes_jprot = TableWrapper.get_column(jprot_table, 0)
        pfam_id_jprot = TableWrapper.get_column(jprot_table, 2)

        # This part reads the type of domain for each gene and creates a new column with the type of domain
        # at the end returns a new table with namegene, domain name, pfam id, class of domain
        #
        # One gene/protein can have more than one domain
        # For each gene the following part checks if the classification of protein domains by comparing the domain with dict_class_domains:
        #
        #

        count_classical = 0
        count_nonclassic = 0
        count_unclissified = 0
        count_other_class = 0
        count_no_domain = 0
        new_table_jprot = []
        for i, gene in enumerate(genes_jprot):
            row = []
            row.append(gene)
            row.append(domain_column_jprot[i])
            row.append(pfam_id_jprot[i])
            string_domains = ''
            # If the protein hasn't any domain, add static 'no-domains' information
            if domain_column_jprot[i] == '.':
                string_domains += Constants.DOMAIN_NONE + Constants.DOMAIN_COMMA
                count_no_domain += 1
            # If the protein contains some domains checks the class and
            # makes a string containing the class domains separated by a comma
            else:
                domains = domain_column_jprot[i].split(',')
                for type_domain in domains:
                    print type_domain
                    if type_domain in dict_type_domain:
                        class_domain = dict_type_domain[type_domain][0]
                        if class_domain == Constants.DOMAIN_CLASSICAL:
                            string_domains += Constants.DOMAIN_CLASSICAL + Constants.DOMAIN_COMMA
                            count_classical += 1
                        elif class_domain == Constants.DOMAIN_NONCLASSICAL:
                            string_domains += Constants.DOMAIN_NONCLASSICAL + Constants.DOMAIN_COMMA
                            count_nonclassic += 1
                        elif class_domain == Constants.DOMAIN_UNKNOWN:
                            string_domains += Constants.DOMAIN_UNKNOWN + Constants.DOMAIN_COMMA
                            count_unclissified += 1
                    elif type_domain not in dict_type_domain:
                        string_domains += Constants.DOMAIN_OTHER + Constants.DOMAIN_COMMA
                        count_other_class += 1
                    else:
                        Logger.get_instance().info('unexpected case',
                                                   type_domain)
            # -1 allows to delete the last comma in string
            row.append(string_domains[0:len(string_domains) - 1])
            new_table_jprot.append(row)

        # print of proteins number for each domain class
        Logger.get_instance().info(str(count_classical))
        Logger.get_instance().info(str(count_nonclassic))
        Logger.get_instance().info(str(count_unclissified))
        Logger.get_instance().info(str(count_other_class))
        Logger.get_instance().info(str(count_no_domain))

        self.path_ouput_file_jprot = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FINAL_TABLE_JPROT_PROPERTY, True)
        FileWriter.write_table(self.path_ouput_file_jprot, new_table_jprot)

        # NatRevGenetics
        # =========================================================
        # this part classifies each gene according to the type of its domains and creates a new table with
        # gene name, domain type, pfam id, class of domain

        self.input_file_nrgenetics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_LIST_NATREVGENETICS_PROPERTY, True)

        # reading of domain information of dataset 2
        nrgenetics_table = FileParser.make_table(self.input_file_nrgenetics,
                                                 skip=1)

        # extraction of the domain column (index 2) for each gene
        # extraction of the gene ids (column 0)
        domain_column_nrgenetics = TableWrapper.get_column(nrgenetics_table, 2)
        genes_nrgenetics = TableWrapper.get_column(nrgenetics_table, 0)

        # This part reads the type of domain for each gene and creates a new columns with the type of domain
        # at the end returns a new table with namegene, domain name, pfam id , class of domain
        #
        # One gene/protein can have more than one domain
        # For each gene the following part checks if the classification of protein domains by comparing the domain with dict_class_domains:
        #
        #
        count_classical = 0
        count_nonclassic = 0
        count_unclissified = 0
        count_other_class = 0
        count_no_domain = 0
        new_table_nrgenetics = []
        for i, gene in enumerate(genes_nrgenetics):
            row = []
            row.append(gene)
            row.append(domain_column_nrgenetics[i])
            string_domains = ''
            string_pfamid = ''
            # If the protein hasn't any domain, add static 'no-domains' information
            if domain_column_nrgenetics[i] == '.':
                string_domains += Constants.DOMAIN_NONE + Constants.DOMAIN_COMMA
                string_pfamid += '.,'
                count_no_domain += 1
            # If the protein contains some domains checks the class and
            # makes a string containing the class domains separated by a comma
            else:
                # the domains are separated by a comma
                domains = domain_column_nrgenetics[i].split(',')
                for type_domain in domains:
                    # the domain present also the number information
                    # X-domain[n] for this reason in order to take only the domain name
                    # the domain is split to '['
                    type_domain = type_domain.split('[')[0]
                    if type_domain in dict_pfam:
                        string_pfamid += dict_pfam[type_domain] + ','
                    else:
                        string_pfamid += '-,'
                    print type_domain
                    if type_domain in dict_type_domain:
                        class_domain = dict_type_domain[type_domain][0]
                        if class_domain == Constants.DOMAIN_CLASSICAL:
                            string_domains += Constants.DOMAIN_CLASSICAL + Constants.DOMAIN_COMMA
                            count_classical += 1
                        elif class_domain == Constants.DOMAIN_NONCLASSICAL:
                            string_domains += Constants.DOMAIN_NONCLASSICAL + Constants.DOMAIN_COMMA
                            count_nonclassic += 1
                        elif class_domain == Constants.DOMAIN_UNKNOWN:
                            string_domains += Constants.DOMAIN_UNKNOWN + Constants.DOMAIN_COMMA
                            count_unclissified += 1
                    elif type_domain not in dict_type_domain:
                        string_domains += Constants.DOMAIN_OTHER + Constants.DOMAIN_COMMA
                        count_other_class += 1
                    else:
                        print 'unexpected case', type_domain
            row.append(string_pfamid[0:len(string_pfamid) - 1])
            row.append(string_domains[0:len(string_domains) - 1])
            new_table_nrgenetics.append(row)

        # print of proteins number for each domain class
        Logger.get_instance().info(str(count_classical))
        Logger.get_instance().info(str(count_nonclassic))
        Logger.get_instance().info(str(count_unclissified))
        Logger.get_instance().info(str(count_other_class))
        Logger.get_instance().info(str(count_no_domain))

        self.path_ouput_file_nrgenetics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FINAL_TABLE_NATREVGENETICS, True)
        FileWriter.write_table(self.path_ouput_file_nrgenetics,
                               new_table_nrgenetics)

        # reading of sequences file in order to take the headers
        # reading of gene and protein id correspondences for different genes

        self.file_sequences = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FILE_SEQ_PROPERTY, True)
        self.header = InfoFasta.get_header(self.file_sequences, text=False)
        gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
        prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]

        # Construction of table containing the RBP protein and the corresponding class domain and pfam id
        count_j = 0
        count_n = 0
        final_table = []
        title = [
            'gene id', 'type_domain', 'pfam id', 'class domain', 'prot id'
        ]
        final_table.append(title)
        # table construction
        for n, gene in enumerate(gene_seq):
            Logger.get_instance().info(str(n + 1))
            Logger.get_instance().info(gene)
            # if gene is in jprot and in nrgenetics
            if gene in genes_jprot and gene in genes_nrgenetics:
                count_n += 1
                ind_gene = genes_nrgenetics.index(gene)
                row = new_table_nrgenetics[ind_gene]
                row.append(prot_seq[n])
            # if gene is in jprot and not in nrgenetics
            elif gene in genes_jprot and gene not in genes_nrgenetics:
                count_j += 1
                ind_gene = genes_jprot.index(gene)
                row = new_table_jprot[ind_gene]
                row.append(prot_seq[n])
            # if gene not in jprot and gene in nrgenetics
            elif gene not in genes_jprot and gene in genes_nrgenetics:
                count_n += 1
                ind_gene = genes_nrgenetics.index(gene)
                row = new_table_nrgenetics[ind_gene]
                row.append(prot_seq[n])
            # if the gene is not in both dataset: Error
            else:
                Logger.get_instance().info('Error' + gene + prot_seq[n])
            final_table.append(row)

        sort_table = TableWrapper.inv_column(final_table, 0, 4)

        sort_table = TableWrapper.inv_column(sort_table, 1, 4)
        sort_table = TableWrapper.inv_column(sort_table, 2, 4)
        sort_table = TableWrapper.inv_column(sort_table, 3, 4)

        self.final_file_table = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOMAIN_FINALE_TABLE_RBP_DATASET_PROPERTY,
                       True)
        FileWriter.write_table(self.file_final_table, sort_table)

        # Counting the number of protein with Classical domain, Non-classical, unclassified or combinations of this class
        prot_name = TableWrapper.get_column(sort_table, 1, 1)
        class_domain = TableWrapper.get_column(sort_table, 4, 1)
        classical = []
        nonclassical = []
        unclissified = []
        otherclass = []
        nodomain = []

        # there are several combinations of class domain in a protein
        # the list creation in according to class domain has been performed by the filters of spreadsheet (see documentation)
        for n, domain in enumerate(class_domain):
            diff_domain = domain.split(',')
            for item in diff_domain:
                if item == Constants.DOMAIN_CLASSICAL:
                    classical.append(prot_name[n])
                elif item == Constants.DOMAIN_NONCLASSICAL:
                    nonclassical.append(prot_name[n])
                elif item == Constants.DOMAIN_UNKNOWN:
                    unclissified.append(prot_name[n])
                elif item == Constants.DOMAIN_OTHER:
                    otherclass.append(prot_name[n])
                elif item == Constants.DOMAIN_NONE:
                    nodomain.append(prot_name[n])
                else:
                    # many others
                    pass
Ejemplo n.º 13
0
    def rna_target(self):
        """Write the gene/protein/RNA-target table and one protein file per target.

        All paths are built from Constants.PATH_HOME plus a property value
        obtained through PropertyManager. The method:

        1. Reads the info table (tab separated, ``skip=1``) and takes column 1
           as protein ids and column 3 as putative RNA targets.
        2. Reads the headers of the sequence file; for each header the text
           after the first '>' is split on '|' and fields 0 and 2 are used as
           gene id and protein id.
        3. Writes a tab separated table whose rows are
           ``[gene id, protein id, rna target]``.
        4. Re-reads that table and, for every distinct RNA target found in
           step 3, writes the protein ids with that target to a file named
           ``<output prefix> + <target> + <dataset name property>``.

        Raises:
            ValueError: if a protein id taken from the sequence headers is not
                present in the info table.
        """

        self.path_home = Constants.PATH_HOME
        self.file_seq_natrevgenetics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVERNA_NATREVGENETICS_SEQ_PROPERTY,
                       True)
        self.natrevgenetics_info = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_RNA_NATREVGENETICS_INFO_PROPERTY,
                       True)

        info_table = FileParser.make_table(self.natrevgenetics_info,
                                           '\t',
                                           skip=1)

        prot_info = TableWrapper.get_column(info_table, 1)
        putative_rna = TableWrapper.get_column(info_table, 3)

        self.header = InfoFasta.get_header(self.file_seq_natrevgenetics,
                                           text=False)

        gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
        prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]

        # Map every protein id to the index of its FIRST occurrence, which is
        # what list.index() would return, so each lookup below is O(1)
        # instead of a linear scan per protein.
        index_by_prot = {}
        for position, name in enumerate(prot_info):
            index_by_prot.setdefault(name, position)

        # Creation of Table containing gene id, prot id and rna target

        putative_rna_target = []
        type_rna_target = []
        for n, prot in enumerate(prot_seq):
            Logger.get_instance().info(prot)
            if prot not in index_by_prot:
                # Same exception type list.index() raised, but with context.
                raise ValueError(
                    "protein %r from %s is not listed in %s" %
                    (prot, self.file_seq_natrevgenetics,
                     self.natrevgenetics_info))
            index_prot = index_by_prot[prot]
            rna_target = putative_rna[index_prot]
            row = [gene_seq[n], prot, rna_target]
            type_rna_target.append(rna_target)
            putative_rna_target.append(row)
            Logger.get_instance().info(" The putative rna target is " +
                                       rna_target)

        self.file_all_rna_target = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_ALL_RNA_TARGET_PROPERTY, True)

        FileWriter.write_table(self.file_all_rna_target,
                               putative_rna_target,
                               symbol='\t')

        # set of RNA target type in order to create different list
        unique_rna_target = set(type_rna_target)

        # The table is read back from disk rather than reused from memory;
        # kept as in the original since the parsed values may differ from the
        # in-memory ones (NOTE(review): confirm whether this is intentional).
        info_new_table = FileParser.make_table(self.file_all_rna_target, '\t')

        # Columns extraction
        prot_name = TableWrapper.get_column(info_new_table, 1)
        type_rnatarget = TableWrapper.get_column(info_new_table, 2)

        file_output = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_RNA_OUTPUT_PROPERTY, True)

        # this for loop creates one protein file for each RNA target type
        for item in unique_rna_target:
            file_name = file_output + item + PropertyManager.get_instance(
            ).get_property(
                DataConstants.PUTATIVE_RNA_TARGET_DATASET_NAME_PROPERTY, True)
            file_rna = FileUtils.open_text_a(file_name)
            # try/finally guarantees the handle is released even when a
            # write fails part-way through.
            try:
                for n, type_rna in enumerate(type_rnatarget):
                    if type_rna == item:
                        file_rna.write(prot_name[n])
            finally:
                file_rna.close()