def merge_file(namefile_1, namefile_2, new_namefile):
    """Append the contents of two text files, in order, to a third file.

    Both inputs are read entirely into memory, then written to
    ``new_namefile`` through ``FileUtils.open_text_a``: first the content of
    ``namefile_1``, then the content of ``namefile_2``.

    Args:
        namefile_1: Path of the first file to read.
        namefile_2: Path of the second file to read.
        new_namefile: Path of the file that receives both contents.
    """
    # Read both inputs before touching the output, so a missing input does
    # not leave a freshly created output file behind.
    file1 = FileUtils.open_text_r(namefile_1)
    try:
        text_1 = file1.read()
    finally:
        file1.close()

    file2 = FileUtils.open_text_r(namefile_2)
    try:
        text_2 = file2.read()
    finally:
        file2.close()

    new_file = FileUtils.open_text_a(new_namefile)
    try:
        new_file.write(text_1)
        new_file.write(text_2)
    finally:
        # Always release the handle, even if one of the writes fails.
        new_file.close()
def read_file(namefile):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        namefile: Path of the file to read.

    Returns:
        A list with one entry per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings.
    """
    f = FileUtils.open_text_r(namefile)
    try:
        return [line.strip() for line in f]
    finally:
        # The list is fully built before the handle is released.
        f.close()
def change_header(path_input_file, path_ouptut_file, source=1, type_id=1):
    """Rewrite the header of every record of a fasta file to a bare identifier.

    The input is split on the '>' character, so each piece is one record
    whose first line is the header. The shortened header and the untouched
    sequence lines are appended to the output file.

    Warning: the file must contain the '>' character only at the beginning
    of header lines; a '>' anywhere else makes the split cut a record in an
    incorrect place.

    Args:
        path_input_file: Path of the fasta file to read.
        path_ouptut_file: Path of the file the rewritten records are
            appended to.
        source: Header layout. 1 (Ensembl) keeps the third '|'-separated
            field of the header. 2 (Uniprot) works on the header text that
            precedes the first space.
        type_id: Only used when ``source`` is 2. 1 (AC) keeps the second
            '|'-separated field, 2 (ID) keeps the third one.

    Raises:
        ValueError: If ``source`` is not 1 or 2, or if ``source`` is 2 and
            ``type_id`` is not 1 or 2.
        IndexError: If a header has fewer '|'-separated fields than the
            selected layout needs.
    """
    # Reject unsupported selectors before any file is touched. Without this
    # check the new header would be undefined for the first record, or would
    # silently repeat the header computed for the previous record.
    if source not in (1, 2):
        raise ValueError('Unsupported source %r: expected 1 (Ensembl) or 2 (Uniprot)' % (source,))
    if source == 2 and type_id not in (1, 2):
        raise ValueError('Unsupported type_id %r: expected 1 (AC) or 2 (ID)' % (type_id,))

    file_input = FileUtils.open_text_r(path_input_file)
    try:
        seq_file = file_input.read()
    finally:
        file_input.close()

    # The text before the first '>' is not a record, hence the [1:].
    seq_file_list = seq_file.split('>')[1:]

    file_output = FileUtils.open_text_a(path_ouptut_file)
    try:
        for seq in seq_file_list:
            lines = seq.split('\n')
            header = lines[0]
            Logger.get_instance().info(header)

            if source == 1:
                # Ensembl: third '|'-separated field.
                identifier = header.split('|')[2]
            else:
                # Uniprot: drop everything after the first space, then pick
                # the AC (second field) or the ID (third field).
                diff_header = header.split(' ')[0]
                field_index = 1 if type_id == 1 else 2
                identifier = diff_header.split('|')[field_index]

            new_header = '>' + identifier + '\n'
            file_output.write(new_header + '\n'.join(lines[1:]))
    finally:
        # Release the output handle even if a malformed header raises.
        file_output.close()
def output_reading(filename):
    """Group the lines of a '>'-delimited text file into one string per record.

    A record starts at a line beginning with '>' and runs until the next
    such line. The leading '>' is removed from the record; line endings are
    kept as they are in the file.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of strings, one per record, in file order. The record being
        built when the file ends is always appended, so an empty file yields
        ``['']``.
    """
    input_file = FileUtils.open_text_r(filename)
    try:
        lines = input_file.readlines()
    finally:
        input_file.close()

    text_file = []
    # Pieces of the record currently being assembled; joined once per record
    # instead of growing a string line by line.
    current = []
    for n, line in enumerate(lines):
        if line[0:1] == '>':
            if n != 0:
                # A new header closes the previous record.
                text_file.append(''.join(current))
                current = []
            current.append(line[1:])
        elif n != 0:
            current.append(line)
        else:
            # The very first line is not a header: it is reported and left
            # out of the output.
            Logger.get_instance().warning(' Check this line : ' + line)

    text_file.append(''.join(current))
    return text_file
def domain_all_protein(domain_region_file):
    """Load the pickled domain object stored in a file.

    Args:
        domain_region_file: Path of the pickle file.

    Returns:
        The unpickled object. The original comment describes it as a
        dictionary -- presumably keyed by protein name; verify against the
        code that writes the file.
    """
    file_domain = FileUtils.open_text_r(domain_region_file)
    try:
        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file -- only use this on files produced by a trusted source.
        # NOTE(review): the handle comes from a text-mode helper; Python 3's
        # pickle needs a binary handle -- confirm what open_text_r returns.
        return pickle.load(file_domain)
    finally:
        file_domain.close()
def domain_one_protein(domain_region_file, protname):
    """Return the entry stored for one protein in a pickled domain dictionary.

    Args:
        domain_region_file: Path of the pickle file holding the dictionary.
        protname: Key to look up in the dictionary.

    Returns:
        The value stored under ``protname``. When the key is absent a debug
        message is logged and an empty list is returned.
    """
    file_domain = FileUtils.open_text_r(domain_region_file)
    try:
        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file -- only use this on files produced by a trusted source.
        dict_domain = pickle.load(file_domain)
    finally:
        file_domain.close()

    if protname in dict_domain:
        return dict_domain[protname]

    Logger.get_instance().debug(" Protein without domains " + protname)
    return []
def get_header(seq_obj, header_identifier=None, text=True):
    """Collect the header lines (those starting with '>') of fasta content.

    Args:
        seq_obj: When ``text`` is true, an iterable yielding the lines of
            the fasta content (for example the result of ``readlines()``).
            When ``text`` is false, the path of a fasta file, opened with
            ``FileUtils.open_text_r``.
        header_identifier: Optional string to look for inside the headers.
        text: Selects how ``seq_obj`` is interpreted, see above.

    Returns:
        If ``header_identifier`` is None, the list of all headers with
        surrounding whitespace stripped. Otherwise the first header that
        contains ``header_identifier`` as a substring, or None when no
        header contains it.
    """
    opened = None
    if text:
        fasta = seq_obj
    else:
        opened = fasta = FileUtils.open_text_r(seq_obj)

    try:
        headers = [line.strip() for line in fasta if line[0:1] == '>']
    finally:
        # Only close what this function opened; a caller-supplied iterable
        # stays under the caller's control.
        if opened is not None:
            opened.close()

    if header_identifier is None:
        return headers

    for item in headers:
        if header_identifier in item:
            return item
    return None
def get_seq(seq_obj, header_identifier, text=True):
    """Extract the sequence of the first record whose header matches.

    A record matches when ``header_identifier`` occurs anywhere in its
    header line (substring test), so an identifier that is a prefix of
    another one can select the wrong record -- pass a string that is unique
    among the headers.

    Args:
        seq_obj: When ``text`` is true, an iterable yielding the lines of
            the fasta content. When ``text`` is false, the path of a fasta
            file, opened with ``FileUtils.open_text_r``.
        header_identifier: String searched for in the header lines.
        text: Selects how ``seq_obj`` is interpreted, see above.

    Returns:
        The sequence lines of the matching record, stripped and joined into
        one string, with every '*' removed. An empty string when no header
        matches.
    """
    opened = None
    if text:
        fasta = seq_obj
    else:
        opened = fasta = FileUtils.open_text_r(seq_obj)

    parts = []
    found = False
    try:
        for line in fasta:
            line = line.strip()
            is_header = line[0:1] == '>'
            if not found:
                if is_header and header_identifier in line:
                    found = True
            elif is_header:
                # The next header ends the wanted record; nothing after it
                # can contribute, so stop scanning.
                break
            else:
                parts.append(line)
    finally:
        # Only close what this function opened.
        if opened is not None:
            opened.close()

    # Ensembl sequences end with a star; removing it is harmless for
    # sequences that do not contain one.
    return ''.join(parts).replace('*', '')
def split_seq(file_sequences_path, path_output, start_header, end_header):
    """Write every record of a multi-fasta file to its own fasta file.

    For each header line, the protein name is the slice
    ``header[start_header:end_header]`` (the slice indexes the header
    including its leading '>'). The sequence is looked up with
    ``InfoFasta.get_seq`` and written to ``path_output + name + '.fasta'``.

    Args:
        file_sequences_path: Path of the multi-fasta file to split.
        path_output: Prefix prepended to every output file name; it is
            concatenated as-is, so a directory must end with a separator.
        start_header: Start index of the protein name inside the header.
        end_header: End index (exclusive) of the protein name inside the
            header.
    """
    file_seq = FileUtils.open_text_r(file_sequences_path)
    try:
        seq_obj = file_seq.readlines()
    finally:
        file_seq.close()

    # Headers are taken from the lines already in memory instead of calling
    # the external 'grep' command. Splitting the grep output on newlines
    # left a trailing empty entry, whose empty protein name matched every
    # header and produced a spurious '<path_output>.fasta' file.
    header = [line.rstrip('\n') for line in seq_obj if line[0:1] == '>']

    for i, term in enumerate(header):
        prot = term[start_header:end_header]
        Logger.get_instance().info(str(i + 1) + ' ' + prot)

        # extraction of sequence from fasta file
        prot_seq = InfoFasta.get_seq(seq_obj, prot)

        # writing of sequence in a fasta file
        fasta_seq = SeqToFasta.give_fasta(term, prot_seq)
        file_out = FileUtils.open_text_w(path_output + prot + '.fasta')
        try:
            file_out.write(fasta_seq)
        finally:
            file_out.close()