Beispiel #1
0
 def merge_file(namefile_1, namefile_2, new_namefile):
     """Write the contents of two text files, in order, to a third file.

     Both inputs are read completely before anything is written. The
     destination is opened with ``FileUtils.open_text_a`` (presumably
     append mode, so existing content would be kept -- confirm against
     FileUtils).

     :param namefile_1: path of the file whose text is written first
     :param namefile_2: path of the file whose text is written second
     :param new_namefile: path of the destination file
     """
     file1 = FileUtils.open_text_r(namefile_1)
     try:
         file2 = FileUtils.open_text_r(namefile_2)
         try:
             new_file = FileUtils.open_text_a(new_namefile)
             try:
                 # Read both inputs before writing, as the original did, so
                 # the result does not change if a source is also the target.
                 text_1 = file1.read()
                 text_2 = file2.read()
                 new_file.write(text_1)
                 new_file.write(text_2)
             finally:
                 new_file.close()
         finally:
             file2.close()
     finally:
         # The input handles were previously leaked.
         file1.close()
Beispiel #2
0
 def read_file(namefile):
     """Return the lines of a text file with surrounding whitespace removed.

     :param namefile: path of the file to read
     :return: list with one stripped string per line, in file order
     """
     f = FileUtils.open_text_r(namefile)
     try:
         return [line.strip() for line in f]
     finally:
         # The handle was previously left open.
         f.close()
Beispiel #3
0
    def change_header(path_input_file, path_ouptut_file, source=1, type_id=1):
        """Copy a FASTA file, reducing every header to one identifier field.

        The input is split on '>' and each record is written to the output
        (opened with ``FileUtils.open_text_a``) as ``>identifier`` followed
        by the record's remaining lines unchanged.

        :param path_input_file: path of the FASTA file to read
        :param path_ouptut_file: path of the file the rewritten records are
            written to
        :param source: 1 (Ensembl) keeps the third '|'-separated field of the
            header; 2 (Uniprot) keeps a field of the header's first
            space-delimited token, chosen by ``type_id``
        :param type_id: only used when ``source`` is 2; 1 (AC) keeps the
            second '|'-separated field, 2 (ID) keeps the third
        :raises ValueError: if ``source`` or ``type_id`` is not supported
            (raised when the first record is processed)
        :raises IndexError: if a header has fewer '|' fields than required
        """
        file_input = FileUtils.open_text_r(path_input_file)
        try:
            seq_file = file_input.read()
        finally:
            file_input.close()

        file_output = FileUtils.open_text_a(path_ouptut_file)
        try:
            # Warning: the '>' character must appear only at the beginning of
            # header lines; anywhere else it makes the split below cut a
            # record in the wrong place.
            for seq in seq_file.split('>')[1:]:
                lines = seq.split('\n')
                header = lines[0]
                Logger.get_instance().info(header)

                if source == 1:
                    # Ensembl
                    identifier = header.split('|')[2]
                elif source == 2:
                    # Uniprot
                    fields = header.split(' ')[0].split('|')
                    if type_id == 1:
                        # AC
                        identifier = fields[1]
                    elif type_id == 2:
                        # ID
                        identifier = fields[2]
                    else:
                        # Previously fell through with the header variable
                        # unbound or left over from the previous record.
                        raise ValueError(
                            'type_id must be 1 (AC) or 2 (ID), got %r' % (type_id,))
                else:
                    raise ValueError(
                        'source must be 1 (Ensembl) or 2 (Uniprot), got %r' % (source,))

                file_output.write('>' + identifier + '\n' + '\n'.join(lines[1:]))
        finally:
            file_output.close()
Beispiel #4
0
    def output_reading(filename):
        """Split a '>'-delimited text file into one string per record.

        Every line starting with '>' opens a new record; the '>' itself is
        dropped and the following lines are appended verbatim (newlines
        included) until the next '>' line.

        :param filename: path of the file to read
        :return: list of record strings; an empty file yields ``['']``
        """
        input_file = FileUtils.open_text_r(filename)
        try:
            lines = input_file.readlines()
        finally:
            # The handle was previously left open.
            input_file.close()

        text_file = []
        current = []
        for n, line in enumerate(lines):
            if line[0:1] == '>':
                if n != 0:
                    # A new header closes the record collected so far.
                    text_file.append(''.join(current))
                    current = []
                current.append(line[1:])
            elif n != 0:
                current.append(line)
            else:
                # The first line is not a header: report it and skip it.
                Logger.get_instance().warning(' Check this line : ' + line)

        # Flush the last record (or the empty string for an empty file).
        text_file.append(''.join(current))

        return text_file
Beispiel #5
0
    def domain_all_protein(domain_region_file):
        """Load and return the pickled object stored in a file.

        :param domain_region_file: path of the pickle file
        :return: the unpickled object (the original comment calls it a
            dictionary; presumably protein name -> domains -- confirm)
        """
        file_domain = FileUtils.open_text_r(domain_region_file)
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only use
            # this on files produced by a trusted source.
            # NOTE(review): the handle comes from a helper named for text
            # mode; Python 3's pickle needs a binary handle -- confirm.
            return pickle.load(file_domain)
        finally:
            # The handle was previously left open.
            file_domain.close()
Beispiel #6
0
    def domain_one_protein(domain_region_file, protname):
        """Return the entry stored for one protein in a pickled dictionary.

        :param domain_region_file: path of the pickle file
        :param protname: key to look up in the unpickled dictionary
        :return: the value stored under ``protname``, or an empty list (after
            a debug log message) when the key is absent
        """
        file_domain = FileUtils.open_text_r(domain_region_file)
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only use
            # this on files produced by a trusted source.
            dict_domain = pickle.load(file_domain)
        finally:
            # The handle was previously left open.
            file_domain.close()

        if protname in dict_domain:
            return dict_domain[protname]

        Logger.get_instance().debug(" Protein without domains " + protname)
        return []
Beispiel #7
0
    def get_header(seq_obj, header_identifier=None, text=True):
        """Collect the header lines (those starting with '>') of a FASTA input.

        :param seq_obj: an iterable of lines when ``text`` is true, otherwise
            a path that is opened with ``FileUtils.open_text_r``
        :param header_identifier: optional substring to search for in the
            headers
        :param text: true if ``seq_obj`` already holds the lines, false if it
            is a path to open
        :return: the list of stripped header lines when ``header_identifier``
            is None; otherwise the first header containing the identifier, or
            None when no header contains it
        """
        opened = None
        if text:
            fasta = seq_obj
        else:
            opened = fasta = FileUtils.open_text_r(seq_obj)

        try:
            headers = [line.strip() for line in fasta if line[0:1] == '>']
        finally:
            # Close only a handle opened here; it was previously leaked.
            if opened is not None:
                opened.close()

        if header_identifier is None:
            return headers

        for item in headers:
            if header_identifier in item:
                return item
        return None
Beispiel #8
0
 def get_seq(seq_obj, header_identifier, text=True):
     """Return the sequence of the first FASTA record whose header matches.

     A header matches when ``header_identifier`` occurs anywhere in it. The
     record's lines are stripped and concatenated up to the next header.

     :param seq_obj: an iterable of lines when ``text`` is true, otherwise a
         path that is opened with ``FileUtils.open_text_r``
     :param header_identifier: substring identifying the wanted header
     :param text: true if ``seq_obj`` already holds the lines, false if it is
         a path to open
     :return: the sequence with every '*' removed; an empty string when no
         header matches or the input is empty
     """
     opened = None
     if text:
         fasta = seq_obj
     else:
         opened = fasta = FileUtils.open_text_r(seq_obj)

     parts = []
     found = False
     try:
         for line in fasta:
             line = line.strip()
             if line[0:1] == '>':
                 if found:
                     # The next header ends the wanted record; stop scanning.
                     break
                 found = header_identifier in line
             elif found:
                 parts.append(line)
     finally:
         # Close only a handle opened here; it was previously leaked.
         if opened is not None:
             opened.close()

     # Ensembl sequences end with a '*'; removing it is a no-op for
     # sequences that do not contain one.
     return ''.join(parts).replace('*', '')
Beispiel #9
0
    def split_seq(file_sequences_path, path_output, start_header, end_header):
        """Write each record of a multi-FASTA file to its own FASTA file.

        The header lines are collected with ``grep '>'``. For each header the
        slice ``[start_header:end_header]`` is used both to look up the
        sequence with ``InfoFasta.get_seq`` and to name the output file
        ``path_output + slice + '.fasta'``.

        :param file_sequences_path: path of the multi-FASTA input file
        :param path_output: prefix prepended to every output file name
        :param start_header: start index of the header slice
        :param end_header: end index of the header slice
        :raises subprocess.CalledProcessError: if grep exits non-zero, which
            includes the case where the file has no line containing '>'
        """
        # universal_newlines=True makes check_output return text rather than
        # bytes on Python 3, so the headers can be handled as strings.
        header_dataset = subp.check_output(['grep', '>', file_sequences_path],
                                           universal_newlines=True)
        # grep output ends with a newline. Splitting on '\n' used to leave a
        # trailing empty header, which matched the first record and produced
        # a bogus '<path_output>.fasta' file.
        header = [term for term in header_dataset.splitlines() if term]

        file_seq = FileUtils.open_text_r(file_sequences_path)
        try:
            seq_obj = file_seq.readlines()
        finally:
            # The handle was previously left open.
            file_seq.close()

        for i, term in enumerate(header):
            prot = term[start_header:end_header]
            Logger.get_instance().info(str(i + 1) + ' ' + prot)

            # Extract the sequence from the FASTA lines.
            prot_seq = InfoFasta.get_seq(seq_obj, prot)

            # Write the sequence to its own FASTA file.
            fasta_seq = SeqToFasta.give_fasta(term, prot_seq)
            file_out = FileUtils.open_text_w(path_output + prot + '.fasta')
            try:
                file_out.write(fasta_seq)
            finally:
                # Close every output file, not only the last one.
                file_out.close()