コード例 #1
0
    def change_header(path_input_file, path_ouptut_file, source=1, type_id=1):
        """Rewrite every FASTA header of a file so it holds a single identifier.

        The input is split on '>' and each record is appended to the output
        file with a header rebuilt from one '|'-separated field of the
        original header.

        Warning: the '>' character must appear only at the beginning of header
        lines, otherwise the records are split in the wrong place.

        :param path_input_file: path of the FASTA file to read
        :param path_ouptut_file: path of the file the records are appended to
        :param source: 1 = Ensembl-style header (third '|' field is kept),
            2 = Uniprot-style header (field chosen by type_id)
        :param type_id: only used when source == 2; 1 = AC (second '|' field),
            2 = ID (third '|' field)
        :raises ValueError: if source / type_id is not one of the values above
        """
        # Reject unsupported selectors before touching any file: previously the
        # header variable was left unbound (or kept the previous record's value).
        if source not in (1, 2):
            raise ValueError('Unsupported source: ' + repr(source))
        if source == 2 and type_id not in (1, 2):
            raise ValueError('Unsupported type_id: ' + repr(type_id))

        file_input = FileUtils.open_text_r(path_input_file)
        try:
            seq_file = file_input.read()
        finally:
            file_input.close()

        file_output = FileUtils.open_text_a(path_ouptut_file)
        try:
            # The text before the first '>' is not a record, hence the [1:]
            for seq in seq_file.split('>')[1:]:
                lines = seq.split('\n')
                header = lines[0]
                Logger.get_instance().info(header)
                if source == 1:
                    # Ensembl
                    identifier = header.split('|')[2]
                else:
                    # Uniprot: only the first space-separated token is parsed
                    fields = header.split(' ')[0].split('|')
                    if type_id == 1:
                        identifier = fields[1]  # AC
                    else:
                        identifier = fields[2]  # ID
                file_output.write('>' + identifier + '\n' + '\n'.join(lines[1:]))
        finally:
            file_output.close()
コード例 #2
0
 def merge_file(namefile_1, namefile_2, new_namefile):
     """Append the content of two text files, in order, to a third file.

     :param namefile_1: path of the first file to read
     :param namefile_2: path of the second file to read
     :param new_namefile: path of the file the two contents are appended to
     """
     # Both inputs are read completely before anything is written.
     file1 = FileUtils.open_text_r(namefile_1)
     try:
         text_1 = file1.read()
     finally:
         file1.close()

     file2 = FileUtils.open_text_r(namefile_2)
     try:
         text_2 = file2.read()
     finally:
         file2.close()

     new_file = FileUtils.open_text_a(new_namefile)
     try:
         new_file.write(text_1)
         new_file.write(text_2)
     finally:
         new_file.close()
コード例 #3
0
    def make_iupred_file(input_path, output_path, th_1, th_2, num_aa, dataset_type):
        """Summarise every IUPred prediction file of a directory into three tables.

        One table file and two region files (one per threshold) are opened in
        append mode under output_path; a tab-separated title row is written to
        each, followed by one entry per file listed in input_path.

        :param input_path: directory holding the prediction files (must end
            with the path separator, since file names are concatenated to it)
        :param output_path: prefix of the output file names
        :param th_1: first threshold, used in file names and column titles
        :param th_2: second threshold, used in file names and column titles
        :param num_aa: region size reported in the column titles
        :param dataset_type: string prefix of the three output file names
        """
        first_threshold = str(th_1)
        second_threshold = str(th_2)
        region_title = 'Region N.' + '(' + str(num_aa) + ' AA)' + 'th_'

        # Output file names
        table_name = (dataset_type + '_IupredTable' + '_t1_' + first_threshold
                      + '_t2_' + second_threshold + '.txt')
        region_name_1 = dataset_type + '_IupredRegion_' + first_threshold + '.txt'
        region_name_2 = dataset_type + '_IupredRegion_' + second_threshold + '.txt'

        # Files are opened in this order: table, region th_1, region th_2
        outputs = [
            FileUtils.open_text_a(output_path + name)
            for name in (table_name, region_name_1, region_name_2)
        ]

        table_title = '\t'.join([
            'Protein',
            'Fraction ' + first_threshold,
            'Fraction ' + second_threshold,
            region_title + first_threshold,
            region_title + second_threshold,
        ])
        region_title_row = '\t'.join(
            ['Protein', 'N', 'Start', 'End', 'Region length'])

        outputs[0].write(table_title + '\n')
        outputs[1].write(region_title_row + '\n')
        outputs[2].write(region_title_row + '\n')

        # The names of the prediction files are obtained through the ls command
        listing = subp.check_output(['ls', input_path]).split('\n')
        if '' in listing:
            listing.remove('')

        # Each prediction file is analysed by Iupred.iupred_string_info and the
        # three returned strings are appended to the corresponding output file
        for position, pred_file in enumerate(listing, 1):
            prot_id = pred_file.split('.')[0].split('_')[1]
            Logger.get_instance().info(str(position) + ' ' + prot_id)
            summary = Iupred.iupred_string_info(
                input_path + pred_file, prot_id, th_1, th_2, num_aa)

            # All three items are fetched before anything is written
            rows = (summary[0], summary[1], summary[2])
            for handle, row in zip(outputs, rows):
                handle.write(row + '\n')

        for handle in outputs:
            handle.close()
コード例 #4
0
 def iupred_analysis(self, fastafile, prot):
     """Run the iupred executable in "long" mode and save its prediction table.

     The part of the command output starting at 'Prediction output' is kept;
     its first two lines are copied unchanged and every following line has
     its space-separated fields re-joined with tabs. The result is written
     to <path_output>IUPred_<prot>.txt.

     :param fastafile: path of the fasta file passed to iupred
     :param prot: protein name used to build the output file name
     """
     self.fastafile = fastafile
     self.prot = prot

     # IUPred command call
     raw_output = subp.check_output(
         [self.iupred_path + "iupred", self.fastafile, "long"])

     # The output file is opened before the prediction section is located
     pred_file = FileUtils.open_text_w(
         self.path_output + 'IUPred_' + self.prot + '.txt')

     section_start = raw_output.index('Prediction output')
     rows = raw_output[section_start:].split('\n')

     formatted_rows = [rows[0], rows[1]]
     for row in rows[2:]:
         # runs of spaces produce empty fields, which are discarded
         fields = [field for field in row.split(' ') if field != '']
         formatted_rows.append('\t'.join(fields))

     pred_file.write('\n'.join(formatted_rows))
     pred_file.close()
コード例 #5
0
    def output_reading(filename):
        """Split a '>'-delimited output file into one string per record.

        A record starts at a line beginning with '>' (the '>' itself is
        dropped) and includes every following line up to the next '>' line.

        :param filename: path of the file to read
        :return: list of record strings; an empty file yields ['']
        """
        input_file = FileUtils.open_text_r(filename)
        try:
            lines = input_file.readlines()
        finally:
            # the handle was previously left open
            input_file.close()

        text_file = []
        string = ''
        for n, line in enumerate(lines):
            is_header = line[0:1] == '>'
            if n == 0:
                if is_header:
                    string += line[1:]
                else:
                    # the file is expected to start with a header line
                    Logger.get_instance().warning(' Check this line : ' + line)
            elif is_header:
                # store the output of the previous protein and start a new
                # record with the header just read
                text_file.append(string)
                string = line[1:]
            else:
                string += line

        text_file.append(string)

        return text_file
コード例 #6
0
    def make_dictionary(self):
        """Build the gene -> isoforms dictionary and pickle it to a file.

        The input and output paths are read from the property manager
        (relative to Constants.PATH_HOME) and stored on the instance; the
        dictionary itself is produced by InfoFasta.make_dictionary.
        """
        Logger.get_instance().info(
            " Creation of a dictionary for novel gene of dataset 2"
            "The dictionary structure is : \n "
            "{gene = [ isoform1, isoform2,...isoformN]}")

        self.path_home = Constants.PATH_HOME
        self.path_input_file = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_DICTIONARY_INPUT_FILE_PROPERTY,
                       True)

        self.dictionary_output_path = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_DICTIONARY_OUTPUT_PATH_PROPERTY,
                       True)
        self.output_file_path = self.dictionary_output_path + PropertyManager.get_instance(
        ).get_property(DataConstants.DOWNLOAD_DICTIONARY_FILE_OUTPUT_PROPERTY,
                       True)

        dict_identifier = InfoFasta.make_dictionary(self.path_input_file)

        self.dict_file = FileUtils.open_text_w(self.output_file_path)
        try:
            pickle.dump(dict_identifier, self.dict_file)
        finally:
            # Closing guarantees the pickled data is flushed to disk; the
            # handle was previously left open.
            self.dict_file.close()

        Logger.get_instance().info(
            " The creation of a dictionary is completed \n\n")
コード例 #7
0
 def read_file(namefile):
     """Read a text file and return its lines stripped of surrounding whitespace.

     :param namefile: path of the file to read
     :return: list with one stripped string per line of the file
     """
     f = FileUtils.open_text_r(namefile)
     try:
         return [line.strip() for line in f]
     finally:
         # the handle was previously never closed
         f.close()
コード例 #8
0
    def dictionary_identifier(self):
        """Build the gene -> isoforms dictionary of dataset 2 and pickle it.

        The sequence file and the output file paths are read from the property
        manager (relative to Constants.PATH_HOME) and stored on the instance;
        the dictionary itself is produced by InfoFasta.make_dictionary.
        """
        Logger.get_instance().info(
            " Creation of a dictionary for novel gene of dataset 2"
            "The dictionary structure is : \n "
            "{gene = [ isoform1, isoform2,...isoformN]}")

        self.ensembl_path_output = Constants.PATH_HOME + PropertyManager.get_instance(
        ).get_property(DataConstants.ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY,
                       True)
        self.ensembl_output_dataset2 = self.ensembl_path_output + PropertyManager.get_instance(
        ).get_property(DataConstants.ENSEMBL_FILE_SEQUENCES_2_PROPERTY, True)

        self.dictionary_output = Constants.PATH_HOME + PropertyManager.get_instance(
        ).get_property(DataConstants.DICTIONARY_PATH_OUTPUT_PROPERTY, True)
        self.dictionary_namefile = self.dictionary_output + PropertyManager.get_instance(
        ).get_property(DataConstants.DICTIONARY_NAME_FILE_PROPERTY, True)

        dict_identifier = InfoFasta.make_dictionary(
            self.ensembl_output_dataset2)

        file_dict = FileUtils.open_text_w(self.dictionary_namefile)
        try:
            pickle.dump(dict_identifier, file_dict)
        finally:
            # Closing guarantees the pickled data is flushed to disk; the
            # handle was previously left open.
            file_dict.close()

        Logger.get_instance().info(
            " The creation of a dictionary for novel gene in dataset 2 is completed \n\n"
        )
コード例 #9
0
    def isoform_sequences(self):
        """Pick one random isoform per gene and append its fasta record to a file.

        The headers of the isoform fasta file are grouped by gene
        (characters 1-15 of the header line); one header per gene is chosen at
        random and the matching sequence is appended to the selected-isoform
        file. Paths are read from the property manager and stored on the
        instance.
        """
        Logger.get_instance().info(
            " Starting the random selection of isoforms with same length \n")
        Logger.get_instance().info(
            " The following headers are the proteins randomly selected \n")

        self.path_output_longest = Constants.PATH_HOME + PropertyManager.get_instance(
        ).get_property(DataConstants.LONGEST_PATH_OUTPUT_PROPERTY, True)

        self.path_file_isoform = self.path_output_longest + PropertyManager.get_instance(
        ).get_property(DataConstants.ISOFORM_FILE_PROPERTY, True)
        self.path_file_selected_isoform = self.path_output_longest + PropertyManager.get_instance(
        ).get_property(DataConstants.RANDOM_ISOFORM_SEQ_PROPERTY, True)

        # The headers of the isoform fasta file are taken by InfoFasta class.
        # text must be False because the input object is a file path and not a list
        self.headers = InfoFasta.get_header(self.path_file_isoform, text=False)

        # Group the headers by gene, remembering the order in which the genes
        # first appear. The previous implementation iterated a set of genes
        # while slicing the header list positionally, so the slice examined for
        # a gene did not necessarily contain that gene's headers.
        headers_by_gene = {}
        gene_order = []
        for header in self.headers:
            gene = header[1:16]
            if gene not in headers_by_gene:
                headers_by_gene[gene] = []
                gene_order.append(gene)
            headers_by_gene[gene].append(header)

        # One isoform header is selected randomly for every gene
        random_header = [
            random.choice(headers_by_gene[gene]) for gene in gene_order
        ]

        self.file_random_seq = FileUtils.open_text_a(
            self.path_file_selected_isoform)
        try:
            # The sequences corresponding to the selected headers are extracted
            # from the isoform file
            for header in random_header:
                Logger.get_instance().info('Header selected : ' + header)
                identifier = header[33:48]
                # text=False: a path is passed, not a list of lines. Without it
                # the characters of the path string would be scanned instead of
                # the file content.
                sequence = InfoFasta.get_seq(self.path_file_isoform,
                                             identifier,
                                             text=False)
                fasta_seq = SeqToFasta.give_fasta(header, sequence)
                self.file_random_seq.write(fasta_seq)
        finally:
            # closing flushes the appended records; the handle was previously
            # left open
            self.file_random_seq.close()

        Logger.get_instance().info(" End of selection random sequences \n ")
コード例 #10
0
    def domain_all_protein(domain_region_file):
        """Load and return the pickled domain dictionary stored in a file.

        :param domain_region_file: path of the pickle file
        :return: the object un-pickled from the file
        """
        file_domain = FileUtils.open_text_r(domain_region_file)
        try:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only use it on files produced by this pipeline.
            return pickle.load(file_domain)
        finally:
            # the handle was previously never closed
            file_domain.close()
コード例 #11
0
    def make_disordp_file(input_path, output_path, binding_partner, num_aa,
                          dataset_type):
        """Tabulate a DisoRDPbind output file into a table file and a region file.

        Both output files are opened in append mode and receive a
        tab-separated title row. Every protein record returned by
        DisoRDPbind.output_reading is then summarised by
        DisoRDPbind.disordp_string_info and appended, except the records
        containing 'WARNING:', which are only logged.

        :param input_path: path of the DisoRDPbind output file
        :param output_path: prefix of the two output file names
        :param binding_partner: forwarded to DisoRDPbind.disordp_string_info
        :param num_aa: region size, reported in the title row and forwarded
        :param dataset_type: string prefix of the output file names
        """
        table_path = output_path + (dataset_type + '_DisoRDPbindTable.txt')
        region_path = output_path + (dataset_type + '_DisoRDPbindRegion.txt')

        # Files opening and title rows
        table_file = FileUtils.open_text_a(table_path)
        region_file = FileUtils.open_text_a(region_path)

        table_columns = [
            'Protein', 'Fraction ', 'Region N.' + '(' + str(num_aa) + ' AA)'
        ]
        region_columns = ['Protein', 'N', 'Start', 'End', 'Region length']

        table_file.write('\t'.join(table_columns) + '\n')
        region_file.write('\t'.join(region_columns) + '\n')

        # One item per protein found in the DisoRDPbind output file
        for number, record in enumerate(
                DisoRDPbind.output_reading(input_path), 1):
            if 'WARNING:' in record:
                # the first line of the record is the protein header
                protein = record.split('\n')[0]
                Logger.get_instance().warning(
                    str(number) + "\n This protein contains >=10000 residues"
                    " (DisoRBDbind cannot predict the proteins with size >=10000) "
                    + protein)
                continue

            Logger.get_instance().info(str(number))
            summary = DisoRDPbind.disordp_string_info(
                record, binding_partner, num_aa)

            table_row, region_row = summary[0], summary[1]
            table_file.write(table_row + '\n')
            region_file.write(region_row + '\n')

        table_file.close()
        region_file.close()
コード例 #12
0
    def get_list_seq(path_input_list, path_output):
        """Append the Uniprot sequence of every listed protein to a file.

        :param path_input_list: path of the file read by FileParser.read_file
            to obtain the protein identifiers
        :param path_output: path of the file the sequences are appended to
        """
        # the output file is opened before the protein list is read
        output_handle = FileUtils.open_text_a(path_output)

        for identifier in FileParser.read_file(path_input_list):
            record = Uniprot.get_sequence(identifier, format_out=True)
            output_handle.write(record)

        output_handle.close()
コード例 #13
0
    def split_seq(file_sequences_path, path_output, start_header, end_header):
        """Split a multi-fasta file into one fasta file per record.

        :param file_sequences_path: path of the multi-fasta file
        :param path_output: prefix of the output files; each file is named
            <path_output><prot>.fasta
        :param start_header: start index of the protein name inside a header line
        :param end_header: end index (exclusive) of the protein name
        """
        # Through the subprocess method the grep unix command gets the header
        # lines of the fasta file
        header_dataset = subp.check_output(['grep', '>', file_sequences_path])
        # grep output ends with a newline, so the split yields a trailing empty
        # entry; it used to produce a bogus '<path_output>.fasta' file.
        header = [term for term in header_dataset.split('\n') if term]

        file_seq = FileUtils.open_text_r(file_sequences_path)
        try:
            seq_obj = file_seq.readlines()
        finally:
            file_seq.close()

        for i, term in enumerate(header):
            prot = term[start_header:end_header]
            Logger.get_instance().info(str(i + 1) + ' ' + prot)

            # extraction of sequence from fasta file
            prot_seq = InfoFasta.get_seq(seq_obj, prot)

            # writing of sequence in a fasta file; every file is closed right
            # after writing (previously only the last one was closed)
            fasta_seq = SeqToFasta.give_fasta(term, prot_seq)
            file_out = FileUtils.open_text_w(path_output + prot + '.fasta')
            try:
                file_out.write(fasta_seq)
            finally:
                file_out.close()
コード例 #14
0
    def domain_one_protein(domain_region_file, protname):
        """Return the domain entry of one protein from a pickled dictionary.

        :param domain_region_file: path of the pickle file holding the dictionary
        :param protname: key looked up in the dictionary
        :return: the value stored for protname, or [] when the key is absent
        """
        file_domain = FileUtils.open_text_r(domain_region_file)
        try:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only use it on files produced by this pipeline.
            dict_domain = pickle.load(file_domain)
        finally:
            # the handle was previously never closed
            file_domain.close()

        if protname in dict_domain:
            return dict_domain[protname]

        Logger.get_instance().debug(" Protein without domains " + protname)
        return []
コード例 #15
0
    def get_header(seq_obj, header_identifier=None, text=True):
        """Return the header lines (those starting with '>') of a fasta object.

        :param seq_obj: an iterable of lines when text is true, otherwise the
            path of a fasta file
        :param header_identifier: when given, only the first header containing
            this substring is returned
        :param text: True if seq_obj is already an iterable of lines, False if
            it is a file path that must be opened
        :return: the list of stripped header lines when header_identifier is
            None; otherwise the first matching header, or None if no header
            contains the identifier
        """
        opened_here = not text
        if opened_here:
            fasta = FileUtils.open_text_r(seq_obj)
        else:
            fasta = seq_obj

        try:
            headers = [line.strip() for line in fasta if line[0:1] == '>']
        finally:
            # only close what this method opened; a caller-supplied object is
            # left untouched
            if opened_here:
                fasta.close()

        if header_identifier is None:
            return headers

        for item in headers:
            if header_identifier in item:
                return item
        return None
コード例 #16
0
 def get_seq(seq_obj, header_identifier, text=True):
     """Return the sequence of the first fasta record whose header matches.

     :param seq_obj: an iterable of lines when text is true, otherwise the
         path of a fasta file
     :param header_identifier: substring searched in the header lines
     :param text: True if seq_obj is already an iterable of lines, False if
         it is a file path that must be opened
     :return: the sequence lines of the first matching record joined in one
         string with every '*' removed; '' when no header matches or the
         input is empty
     """
     opened_here = not text
     if opened_here:
         fasta = FileUtils.open_text_r(seq_obj)
     else:
         fasta = seq_obj

     pieces = []
     found = False
     try:
         for line in fasta:
             line = line.strip()
             if line[0:1] == '>':
                 if found:
                     # next record reached: the wanted sequence is complete
                     break
                 found = header_identifier in line
             elif found:
                 pieces.append(line)
     finally:
         # only close what this method opened
         if opened_here:
             fasta.close()

     # The replace function removes the star that the Ensembl sequences show
     # at the end of the sequence; sequences without it are left unchanged
     return ''.join(pieces).replace('*', '')
コード例 #17
0
    def rna_target(self):
        """Associate every protein of the sequence file with its putative RNA target.

        Steps performed:
        1. read the info table (first row skipped) and take columns 1 and 3;
        2. read the headers of the sequence file and take the first and third
           '|' fields of each as gene and protein identifiers;
        3. write a tab-separated table of [gene, protein, rna target] rows;
        4. re-read that table and, for every distinct RNA target, append the
           matching protein names to a file named after the target.
        """
        self.path_home = Constants.PATH_HOME
        self.file_seq_natrevgenetics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVERNA_NATREVGENETICS_SEQ_PROPERTY,
                       True)
        self.natrevgenetics_info = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_RNA_NATREVGENETICS_INFO_PROPERTY,
                       True)

        source_table = FileParser.make_table(self.natrevgenetics_info,
                                             '\t',
                                             skip=1)
        known_proteins = TableWrapper.get_column(source_table, 1)
        known_targets = TableWrapper.get_column(source_table, 3)

        self.header = InfoFasta.get_header(self.file_seq_natrevgenetics,
                                           text=False)

        # Gene and protein identifiers are '|' fields 0 and 2 of each header
        gene_ids = [entry.split('>')[1].split('|')[0] for entry in self.header]
        protein_ids = [
            entry.split('>')[1].split('|')[2] for entry in self.header
        ]

        # Table rows: gene id, protein id, rna target
        table_rows = []
        found_targets = []
        for gene_id, protein_id in zip(gene_ids, protein_ids):
            Logger.get_instance().info(protein_id)
            target = known_targets[known_proteins.index(protein_id)]
            found_targets.append(target)
            table_rows.append([gene_id, protein_id, target])
            Logger.get_instance().info(" The putative rna target is " +
                                       target)

        self.file_all_rna_target = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_ALL_RNA_TARGET_PROPERTY, True)

        FileWriter.write_table(self.file_all_rna_target,
                               table_rows,
                               symbol='\t')

        # The distinct RNA target types drive the creation of the per-type lists
        distinct_targets = set(found_targets)

        written_table = FileParser.make_table(self.file_all_rna_target, '\t')
        written_proteins = TableWrapper.get_column(written_table, 1)
        written_targets = TableWrapper.get_column(written_table, 2)

        output_prefix = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_RNA_OUTPUT_PROPERTY, True)

        # One protein file is appended to for each RNA target type
        for target in distinct_targets:
            suffix = PropertyManager.get_instance().get_property(
                DataConstants.PUTATIVE_RNA_TARGET_DATASET_NAME_PROPERTY, True)
            target_file = FileUtils.open_text_a(output_prefix + target + suffix)
            for row_number, row_target in enumerate(written_targets):
                if row_target != target:
                    continue
                target_file.write(written_proteins[row_number])

            target_file.close()
コード例 #18
0
    def list_get_seq(path_input,
                     type_query,
                     path_protein_list=None,
                     path_output=None):
        """Download the Ensembl sequences of every gene listed in a file.

        :param path_input: path of the file listing the genes (one per line)
        :param type_query: 1 to query 'all' sequences of a gene, 2 for 'one'
        :param path_protein_list: optional path of a protein list; when given,
            only the records whose characters 32-46 (after the '>') appear in
            that list are kept
        :param path_output: optional path of a file the result is appended to;
            when omitted the result is returned instead
        :return: None when path_output is given. Otherwise the concatenated
            fasta text of all genes (no protein list), or a flat list with the
            kept '>'-prefixed records of all genes (protein list given).
            Previously only the data of the last gene was returned although
            the data of all genes was being accumulated.
        """
        # the input file is read
        list_item = FileParser.read_file(path_input)
        dict_query = {1: 'all', 2: 'one'}
        query = dict_query[type_query]

        # The protein list and the output file are loop invariant: they are
        # read / opened once instead of once per gene.
        list_protein = None
        if path_protein_list is not None:
            list_protein = FileParser.read_file(path_protein_list)
        file_out = None
        if path_output is not None:
            file_out = FileUtils.open_text_a(path_output)

        count_duplicate_genes = 0
        all_seqs = []
        prot_seq = []
        try:
            # For each gene in list the sequences are downloaded
            for i, item in enumerate(list_item):
                Logger.get_instance().info(
                    str(i + 1) + ' Extraction of gene sequence(s) : ' + item)
                fasta_seq = Ensembl.get_sequence(item, query)
                # Genes without sequence and pseudogenes are skipped
                if fasta_seq in (item + ' No available', item + ' pseudogene'):
                    continue

                seqs = fasta_seq + '\n'

                if list_protein is None:
                    if file_out is None:
                        all_seqs.append(seqs)
                    else:
                        file_out.write(seqs)
                    continue

                # Among the isoforms of the gene only those present in the
                # protein list are kept
                protein_seq = [
                    '>' + fasta for fasta in seqs.split('>')
                    if fasta[32:47] in list_protein
                ]
                if file_out is None:
                    prot_seq.extend(protein_seq)
                elif protein_seq:
                    # only the first kept isoform is written
                    file_out.write(protein_seq[0])
                else:
                    count_duplicate_genes += 1
                    Logger.get_instance().info(
                        " Number of duplicated genes: " +
                        str(count_duplicate_genes))
                    # NOTE(review): indexes the protein list with the gene
                    # position, i.e. assumes both lists are parallel - confirm
                    Logger.get_instance().info(
                        " The gene duplicated is: " + str(item) + '|' +
                        str(list_protein[i]))
        finally:
            if file_out is not None:
                file_out.close()

        # return information like string or list
        if path_output is None:
            if path_protein_list is None:
                return ''.join(all_seqs)
            return prot_seq
コード例 #19
0
    def longest_seq(seq_obj,
                    dict_identifier,
                    path_outfile_longest,
                    path_outfile_isoform,
                    type_obj='list'):
        """Write the longest protein of every gene to the "longest" file.

        For every gene of the pickled {gene: [isoforms]} dictionary the
        isoforms with the maximum length are considered:

        1) one longest protein: it is appended to the longest file;
        2) two proteins with the same maximum length:
           a) identical sequences: one of them is appended to the longest file;
           b) different sequences: both are appended to the isoform file;
        3) more than two proteins with the same maximum length:
           a) identical sequences: one of them is appended to the longest file;
           b) otherwise: each distinct sequence is appended once to the
              isoform file (in order of first appearance).

        :param seq_obj: fasta lines (type_obj='list') or fasta file path
            (type_obj='file'), forwarded to the InfoFasta helpers
        :param dict_identifier: path of the pickled gene dictionary
        :param path_outfile_longest: path of the longest-sequence output file
        :param path_outfile_isoform: path of the isoform output file
        :param type_obj: 'list' or 'file'
        :raises ValueError: if type_obj is neither 'list' nor 'file'
        """
        if type_obj == 'file':
            type_text = False
        elif type_obj == 'list':
            type_text = True
        else:
            # previously this surfaced later as an unbound-variable error
            raise ValueError("type_obj must be 'file' or 'list', got " +
                             repr(type_obj))

        fileout_longest = FileUtils.open_text_a(path_outfile_longest)
        fileout_isoform = FileUtils.open_text_a(path_outfile_isoform)
        try:
            file_dict = open(dict_identifier, 'r')
            try:
                # NOTE(review): pickle.load executes arbitrary code from the
                # file; only use it on dictionaries produced by this pipeline.
                dict_ids = pickle.load(file_dict)
            finally:
                file_dict.close()

            # This for loop flows on the keys of dictionary
            for y, gene in enumerate(dict_ids, 1):
                Logger.get_instance().info(str(y) + ' Gene analysed : ' + gene)
                seqs = []  # sequence of each isoform of the gene
                lenseq = []  # length of each isoform sequence
                headers = []  # header of each isoform sequence

                # this for loop flows on the isoforms of gene selected
                for prot in dict_ids[gene]:
                    # NOTE(review): get_length is called without text=; confirm
                    # it handles seq_obj correctly when type_obj == 'file'
                    lenseq.append(InfoFasta.get_length(seq_obj, prot))
                    seqs.append(
                        InfoFasta.get_seq(seq_obj, prot, text=type_text))
                    headers.append(
                        InfoFasta.get_header(seq_obj,
                                             header_identifier=prot,
                                             text=type_text))

                # index_max holds the positions of the sequences with max length
                len_max = max(lenseq)
                index_max = [
                    item for item in range(len(lenseq))
                    if lenseq[item] == len_max
                ]
                first = index_max[0]

                # Condition 1): just one longest protein
                if len(index_max) == 1:
                    Logger.get_instance().info(' If condition 1')
                    fileout_longest.write(
                        SeqToFasta.give_fasta(headers[first], seqs[first]))

                # Condition 2): two proteins with the same length
                elif len(index_max) == 2:
                    Logger.get_instance().info('If condition 2')
                    if seqs[first] == seqs[index_max[1]]:
                        # 2a: same sequences, one is written into longest file
                        Logger.get_instance().info('2a')
                        fileout_longest.write(
                            SeqToFasta.give_fasta(headers[first], seqs[first]))
                    else:
                        # 2b: different sequences, both go to the isoform file
                        Logger.get_instance().info('2b')
                        for index in index_max:
                            fileout_isoform.write(
                                SeqToFasta.give_fasta(headers[index],
                                                      seqs[index]))

                # Condition 3): more than two proteins with the same length
                else:
                    Logger.get_instance().info(' If condition 3')
                    if seqs.count(seqs[first]) == len(index_max):
                        # 3a: all the longest isoforms share the same sequence
                        Logger.get_instance().info('3a')
                        fileout_longest.write(
                            SeqToFasta.give_fasta(headers[first], seqs[first]))
                    else:
                        # 3b: at least two different sequences. Each distinct
                        # sequence is written once, in order of first
                        # appearance (iterating a set gave an arbitrary order).
                        Logger.get_instance().info('3b')
                        distinct_seqs = []
                        for index in index_max:
                            if seqs[index] not in distinct_seqs:
                                distinct_seqs.append(seqs[index])
                        for seq in distinct_seqs:
                            # find the sequence index in the list of sequences
                            index_seq = seqs.index(seq)
                            fileout_isoform.write(
                                SeqToFasta.give_fasta(headers[index_seq],
                                                      seqs[index_seq]))
        finally:
            fileout_isoform.close()
            fileout_longest.close()
0
    def anchor_analysis(self, fastafile, motifslist, prot):
        """Run ANCHOR on a fasta file and write its output sections to files.

        Up to three files are written under self.path_output:
        - PBR_<prot>.txt: predicted binding regions (skipped when the section
          contains 'None');
        - FMotifs_<prot>.txt: found motifs (skipped when the section contains
          'None');
        - Pred_<prot>.txt: prediction profile, always written.

        :param fastafile: path of the fasta file passed to anchor.pl
        :param motifslist: path of the motifs list passed with the -m option
        :param prot: protein name used in the output file names and log messages
        """
        # Calling of anchor command
        # anchor_out contains the anchor output in text format
        anchor_out = subp.check_output([
            "perl", self.anchor_path + 'anchor.pl', fastafile, "-m", motifslist
        ])

        # Start index of each section of the anchor output
        index_bind_reg = anchor_out.index('Predicted binding regions')
        index_motifs = anchor_out.index('Motifs')
        index_pred_profile = anchor_out.index('Prediction profile output')

        # The Anchor output can lack filtered regions section
        if 'Filtered regions' in anchor_out:
            index_filt_reg = anchor_out.index('Filtered regions')
        else:
            index_filt_reg = index_motifs

        # Prediction binding regions file ( PBR_protname.txt)
        # The section is split in lines, only the lines starting with '#\t'
        # are kept and that prefix is removed
        pbr_out = anchor_out[index_bind_reg:index_filt_reg]
        if 'None' in pbr_out:
            # "None" is written when the protein lacks predicted binding regions
            Logger.get_instance().info(
                "This protein doesn't contain predicted binding region  (" +
                prot + ')')
        else:
            pbr = [
                line[2:] for line in pbr_out.split('\n')
                if line[0:2] == '#\t'
            ]
            pbr_file = FileUtils.open_text_w(self.path_output + 'PBR_' + prot +
                                             '.txt')
            try:
                pbr_file.write('\n'.join(
                    [line.replace(' ', '') for line in pbr]))
            finally:
                pbr_file.close()

        # Found Motifs file (FMotifs_protname.txt)
        fmotifs_out = anchor_out[index_motifs:index_pred_profile]
        if 'None' in fmotifs_out:
            # "None" is written when the protein lacks motifs
            Logger.get_instance().info(
                "This protein doesn't contain any motifs (" + prot + ')')
        else:
            # The decision depends on the motifs section only. It used to test
            # the binding-region section, so motifs were silently dropped for
            # proteins without predicted binding regions.
            fmotifs = [
                line[2:] for line in fmotifs_out.split('\n')
                if line[0:2] == '#\t'
            ]
            fmotifs_file = FileUtils.open_text_w(self.path_output +
                                                 'FMotifs_' + prot + '.txt')
            try:
                fmotifs_file.write('\n'.join(
                    [line.replace(' ', '') for line in fmotifs]))
            finally:
                fmotifs_file.close()

        # Prediction profile output (Pred_protname.txt)
        # The column legend is removed and every space is stripped from the
        # remaining lines
        legend = ('#   Columns:\n#   1 - Amino acid number\n#   2 -'
                  ' One letter code\n#   3 - ANCHOR probability value\n'
                  '#   4 - ANCHOR output\n#')
        pred_out = anchor_out[index_pred_profile:].replace(legend, '')
        final_out = '\n'.join(
            [line.replace(' ', '') for line in pred_out.split('\n')])

        pred_file = FileUtils.open_text_w(self.path_output + 'Pred_' + prot +
                                          '.txt')
        try:
            pred_file.write(final_out)
        finally:
            pred_file.close()