def change_header(path_input_file, path_ouptut_file, source=1, type_id=1):
    """Rewrite FASTA headers so that they contain a single identifier.

    The input file is read in full and split into records on the '>'
    character; each record is appended to the output file with a shortened
    header followed by its unchanged sequence lines.

    Warning: the '>' character must appear only at the beginning of header
    lines, otherwise the records are split in the wrong place.

    Args:
        path_input_file: path of the FASTA file to read.
        path_ouptut_file: path of the file the records are appended to
            (the misspelled parameter name is kept for compatibility).
        source: 1 for Ensembl headers (the third '|'-separated field is
            kept), 2 for Uniprot headers.
        type_id: only used when source is 2. With 1 the second '|' field of
            the first space-separated token is kept (AC), with 2 the third
            one (ID).

    Raises:
        ValueError: if source or type_id is not a supported value.
        IndexError: if a header has fewer '|' fields than required.
    """
    # Fail early instead of hitting an unbound (or stale) header variable
    # in the middle of the loop.
    if source not in (1, 2):
        raise ValueError('Unsupported source: %r (expected 1 or 2)' % (source,))
    if source == 2 and type_id not in (1, 2):
        raise ValueError('Unsupported type_id: %r (expected 1 or 2)' % (type_id,))

    file_input = FileUtils.open_text_r(path_input_file)
    try:
        seq_file = file_input.read()
    finally:
        file_input.close()

    file_output = FileUtils.open_text_a(path_ouptut_file)
    try:
        # The text before the first '>' is not a record and is skipped.
        for seq in seq_file.split('>')[1:]:
            lines = seq.split('\n')
            header = lines[0]
            Logger.get_instance().info(header)
            if source == 1:
                # Ensembl
                identifier = header.split('|')[2]
            else:
                # Uniprot: field 1 is the AC, field 2 is the ID
                identifier = header.split(' ')[0].split('|')[type_id]
            file_output.write('>' + identifier + '\n' + '\n'.join(lines[1:]))
    finally:
        file_output.close()
def merge_file(namefile_1, namefile_2, new_namefile):
    """Append the content of two text files, in order, to a third file.

    Args:
        namefile_1: path of the file whose content is written first.
        namefile_2: path of the file whose content is written second.
        new_namefile: path of the file the two contents are appended to.
    """
    contents = []
    for namefile in (namefile_1, namefile_2):
        handle = FileUtils.open_text_r(namefile)
        try:
            contents.append(handle.read())
        finally:
            # The input handles were previously never closed.
            handle.close()

    new_file = FileUtils.open_text_a(new_namefile)
    try:
        for text in contents:
            new_file.write(text)
    finally:
        new_file.close()
def make_iupred_file(input_path, output_path, th_1, th_2, num_aa, dataset_type):
    """Collect per-protein IUPred results into three tab-separated files.

    Three files are opened in append mode inside output_path: a summary
    table and one region file per threshold. A title row is written to each,
    then one entry per file listed in input_path.

    Args:
        input_path: directory holding the prediction files. It is
            concatenated directly with the file names, and the protein id is
            the second '_'-separated token of the name before the first '.'.
        output_path: prefix (directory) of the three output files.
        th_1: first threshold, used in file names and column titles.
        th_2: second threshold, used in file names and column titles.
        num_aa: number of amino acids, used in column titles and forwarded
            to Iupred.iupred_string_info.
        dataset_type: prefix of the output file names.
    """
    # Output file names
    file_name_1 = dataset_type + '_IupredTable' + '_t1_' + str(th_1) + '_t2_' + str(th_2) + '.txt'
    file_name_2 = dataset_type + '_IupredRegion_' + str(th_1) + '.txt'
    file_name_3 = dataset_type + '_IupredRegion_' + str(th_2) + '.txt'

    # Title rows
    num_aa_string = '(' + str(num_aa) + ' AA)'
    header_string_table = '\t'.join([
        'Protein',
        'Fraction ' + str(th_1),
        'Fraction ' + str(th_2),
        'Region N.' + num_aa_string + 'th_' + str(th_1),
        'Region N.' + num_aa_string + 'th_' + str(th_2),
    ])
    header_string_region = '\t'.join(['Protein', 'N', 'Start', 'End', 'Region length'])

    opened = []
    try:
        for file_name in (file_name_1, file_name_2, file_name_3):
            opened.append(FileUtils.open_text_a(output_path + file_name))
        file_1, file_2, file_3 = opened

        file_1.write(header_string_table + '\n')
        file_2.write(header_string_region + '\n')
        file_3.write(header_string_region + '\n')

        # Names of the prediction files to analyse; the listing ends with a
        # newline, so empty entries are dropped.
        listing = subp.check_output(['ls', input_path])
        list_file = [name for name in listing.split('\n') if name]

        # Each prediction file yields one entry in each output file.
        for i, pred_file in enumerate(list_file, 1):
            prot_id = pred_file.split('.')[0].split('_')[1]
            Logger.get_instance().info(str(i) + ' ' + prot_id)
            out_string = Iupred.iupred_string_info(
                input_path + pred_file, prot_id, th_1, th_2, num_aa)
            file_1.write(out_string[0] + '\n')
            file_2.write(out_string[1] + '\n')
            file_3.write(out_string[2] + '\n')
    finally:
        # Close whatever was opened, even when a prediction file fails.
        for handle in opened:
            handle.close()
def iupred_analysis(self, fastafile, prot):
    """Run IUPred in "long" mode on a FASTA file and save the prediction.

    The command output is cut at the 'Prediction output' marker. The first
    two lines of that section are kept as they are; in every following line
    the space-separated fields are re-joined with tabs. The result is
    written to <path_output>IUPred_<prot>.txt.

    Args:
        fastafile: path of the FASTA file given to the iupred executable.
        prot: protein name used in the output file name.

    Raises:
        ValueError: if the command output has no 'Prediction output' marker.
    """
    self.fastafile = fastafile
    self.prot = prot

    # Calling of IUPred command
    iupred_out = subp.check_output([self.iupred_path + "iupred", self.fastafile, "long"])

    # Parse first, so that a malformed output does not leave behind an
    # empty, unclosed prediction file.
    iupred_out = iupred_out[iupred_out.index('Prediction output'):]
    iupred_out_list = iupred_out.split('\n')
    new_iupred_out = iupred_out_list[:2]
    for line in iupred_out_list[2:]:
        fields = [item for item in line.split(' ') if item != '']
        new_iupred_out.append('\t'.join(fields))
    final_out = '\n'.join(new_iupred_out)

    # Prediction output file (IUPred_protname.txt)
    pred_file = FileUtils.open_text_w(self.path_output + 'IUPred_' + self.prot + '.txt')
    try:
        pred_file.write(final_out)
    finally:
        pred_file.close()
def output_reading(filename):
    """Split a '>'-headed text file into one string per record.

    Every line starting with '>' opens a new record. The record string is
    the header line without its leading '>' followed by all the lines up to
    the next header, with line endings preserved.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one string per record. The record being built is always
        appended at the end, so an empty file yields [''].
    """
    input_file = FileUtils.open_text_r(filename)
    try:
        lines = input_file.readlines()
    finally:
        # The handle was previously never closed.
        input_file.close()

    text_file = []
    current = []
    for n, line in enumerate(lines):
        if line[0:1] == '>':
            if n != 0:
                # Store the finished record before starting the next one.
                text_file.append(''.join(current))
            current = [line[1:]]
        elif n != 0:
            current.append(line)
        else:
            # The very first line is expected to be a header.
            Logger.get_instance().warning(' Check this line : ' + line)
    text_file.append(''.join(current))
    return text_file
def make_dictionary(self):
    """Build the gene -> isoforms dictionary and pickle it to a file.

    The input and output locations are read from the project properties and
    stored on the instance (path_home, path_input_file,
    dictionary_output_path, output_file_path). The dictionary returned by
    InfoFasta.make_dictionary is dumped with pickle into output_file_path.
    """
    Logger.get_instance().info(
        " Creation of a dictionary for novel gene of dataset 2"
        " The dictionary structure is : \n "
        " {gene = [ isoform1, isoform2,...isoformN]}")

    self.path_home = Constants.PATH_HOME
    self.path_input_file = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.DOWNLOAD_DICTIONARY_INPUT_FILE_PROPERTY, True)
    self.dictionary_output_path = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.DOWNLOAD_DICTIONARY_OUTPUT_PATH_PROPERTY, True)
    self.output_file_path = self.dictionary_output_path + PropertyManager.get_instance().get_property(
        DataConstants.DOWNLOAD_DICTIONARY_FILE_OUTPUT_PROPERTY, True)

    dict_identifier = InfoFasta.make_dictionary(self.path_input_file)

    self.dict_file = FileUtils.open_text_w(self.output_file_path)
    try:
        pickle.dump(dict_identifier, self.dict_file)
    finally:
        # Closing guarantees that the pickled data is flushed to disk;
        # the handle was previously left open.
        self.dict_file.close()

    Logger.get_instance().info(
        " The creation of a dictionary is completed \n\n")
def read_file(namefile):
    """Read a text file into a list of stripped lines.

    Args:
        namefile: path of the file to read.

    Returns:
        A list with one item per line, each with leading and trailing
        whitespace removed. Empty lines are kept as empty strings.
    """
    f = FileUtils.open_text_r(namefile)
    try:
        return [line.strip() for line in f]
    finally:
        # The handle was previously never closed.
        f.close()
def dictionary_identifier(self):
    """Build the gene -> isoforms dictionary of dataset 2 and pickle it.

    The sequence file and the dictionary file locations are read from the
    project properties and stored on the instance (ensembl_path_output,
    ensembl_output_dataset2, dictionary_output, dictionary_namefile). The
    dictionary returned by InfoFasta.make_dictionary is dumped with pickle
    into dictionary_namefile.
    """
    Logger.get_instance().info(
        " Creation of a dictionary for novel gene of dataset 2"
        " The dictionary structure is : \n "
        " {gene = [ isoform1, isoform2,...isoformN]}")

    self.ensembl_path_output = Constants.PATH_HOME + PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY, True)
    self.ensembl_output_dataset2 = self.ensembl_path_output + PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_FILE_SEQUENCES_2_PROPERTY, True)
    self.dictionary_output = Constants.PATH_HOME + PropertyManager.get_instance().get_property(
        DataConstants.DICTIONARY_PATH_OUTPUT_PROPERTY, True)
    self.dictionary_namefile = self.dictionary_output + PropertyManager.get_instance().get_property(
        DataConstants.DICTIONARY_NAME_FILE_PROPERTY, True)

    dict_identifier = InfoFasta.make_dictionary(self.ensembl_output_dataset2)

    file_dict = FileUtils.open_text_w(self.dictionary_namefile)
    try:
        pickle.dump(dict_identifier, file_dict)
    finally:
        # Closing guarantees that the pickled data is flushed to disk;
        # the handle was previously left open.
        file_dict.close()

    Logger.get_instance().info(
        " The creation of a dictionary for novel gene in dataset 2 is completed \n\n")
def isoform_sequences(self):
    """Pick one isoform at random for every gene of the isoform file.

    The headers of the isoform FASTA file are grouped by gene (characters
    1-15 of the header), one header per gene is chosen with random.choice,
    and the corresponding sequences are appended in FASTA format to the
    selected-isoform file. File locations are read from the project
    properties and stored on the instance.
    """
    Logger.get_instance().info(
        " Starting the random selection of isoforms with same length \n")
    Logger.get_instance().info(
        " The following headers are the proteins randomly selected \n")

    self.path_output_longest = Constants.PATH_HOME + PropertyManager.get_instance().get_property(
        DataConstants.LONGEST_PATH_OUTPUT_PROPERTY, True)
    self.path_file_isoform = self.path_output_longest + PropertyManager.get_instance().get_property(
        DataConstants.ISOFORM_FILE_PROPERTY, True)
    self.path_file_selected_isoform = self.path_output_longest + PropertyManager.get_instance().get_property(
        DataConstants.RANDOM_ISOFORM_SEQ_PROPERTY, True)

    # The input is a file path and not a list of lines, hence text=False.
    self.headers = InfoFasta.get_header(self.path_file_isoform, text=False)

    # Group the headers by gene, remembering the order in which the genes
    # first appear. The previous version sliced self.headers with cumulative
    # counts while iterating over a set of genes; set order is arbitrary, so
    # a slice could cover the isoforms of a different gene.
    gene_order = []
    headers_by_gene = {}
    for header in self.headers:
        gene = header[1:16]
        if gene not in headers_by_gene:
            headers_by_gene[gene] = []
            gene_order.append(gene)
        headers_by_gene[gene].append(header)

    # Select one isoform randomly for each gene.
    random_header = [random.choice(headers_by_gene[gene]) for gene in gene_order]

    self.file_random_seq = FileUtils.open_text_a(self.path_file_selected_isoform)
    try:
        # The sequences of the selected headers are extracted from the
        # isoform file.
        for header in random_header:
            Logger.get_instance().info('Header selected : ' + header)
            identifier = header[33:48]
            # text=False for the same reason as the get_header call above:
            # the first argument is a path. NOTE(review): this call used to
            # omit the flag - confirm against InfoFasta.get_seq's default.
            sequence = InfoFasta.get_seq(self.path_file_isoform, identifier, text=False)
            self.file_random_seq.write(SeqToFasta.give_fasta(header, sequence))
    finally:
        # The handle was previously never closed.
        self.file_random_seq.close()

    Logger.get_instance().info(" End of selection random sequences \n ")
def domain_all_protein(domain_region_file):
    """Load the pickled domain dictionary from a file.

    Args:
        domain_region_file: path of the pickle file.

    Returns:
        The object stored in the file (a dictionary, per the callers).
    """
    file_domain = FileUtils.open_text_r(domain_region_file)
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files produced by this project.
        return pickle.load(file_domain)
    finally:
        # The handle was previously never closed.
        file_domain.close()
def make_disordp_file(input_path, output_path, binding_partner, num_aa, dataset_type):
    """Convert a DisoRDPbind output file into a table file and a region file.

    Both output files are opened in append mode inside output_path and get a
    title row. Every record read from input_path is then either reported as
    a warning (when it contains 'WARNING:') or converted with
    DisoRDPbind.disordp_string_info and appended to the two files.

    Args:
        input_path: path of the DisoRDPbind output file.
        output_path: prefix (directory) of the two output files.
        binding_partner: forwarded to DisoRDPbind.disordp_string_info.
        num_aa: number of amino acids, used in the table title and forwarded
            to DisoRDPbind.disordp_string_info.
        dataset_type: prefix of the output file names.
    """
    # Output file names
    table_name = dataset_type + '_DisoRDPbindTable.txt'
    region_name = dataset_type + '_DisoRDPbindRegion.txt'

    table_file = FileUtils.open_text_a(output_path + table_name)
    region_file = FileUtils.open_text_a(output_path + region_name)

    # Title rows
    table_title = '\t'.join(
        ['Protein', 'Fraction ', 'Region N.' + ('(' + str(num_aa) + ' AA)')])
    region_title = '\t'.join(['Protein', 'N', 'Start', 'End', 'Region length'])
    table_file.write(table_title + '\n')
    region_file.write(region_title + '\n')

    # One record per protein
    for position, record in enumerate(DisoRDPbind.output_reading(input_path), 1):
        if 'WARNING:' in record:
            # The record only carries a warning: report it and move on.
            protein = record.split('\n')[0]
            Logger.get_instance().warning(
                str(position) + "\n This protein contains >=10000 residues"
                " (DisoRBDbind cannot predict the proteins with size >=10000) " + protein)
            continue

        Logger.get_instance().info(str(position))
        info = DisoRDPbind.disordp_string_info(record, binding_partner, num_aa)
        table_row, region_rows = info[0], info[1]
        table_file.write(table_row + '\n')
        region_file.write(region_rows + '\n')

    table_file.close()
    region_file.close()
def get_list_seq(path_input_list, path_output):
    """Append to a file the sequence of every protein listed in a file.

    Args:
        path_input_list: path of the file read with FileParser.read_file;
            each item is passed to Uniprot.get_sequence.
        path_output: path of the file the sequences are appended to.
    """
    out_handle = FileUtils.open_text_a(path_output)
    for accession in FileParser.read_file(path_input_list):
        out_handle.write(Uniprot.get_sequence(accession, format_out=True))
    out_handle.close()
def split_seq(file_sequences_path, path_output, start_header, end_header):
    """Split a multi-FASTA file into one FASTA file per sequence.

    Args:
        file_sequences_path: path of the FASTA file to split.
        path_output: prefix (directory) of the output files; each file is
            named <path_output><prot>.fasta.
        start_header: start index of the protein name inside a header line.
        end_header: end index (exclusive) of the protein name inside a
            header line.
    """
    file_seq = FileUtils.open_text_r(file_sequences_path)
    try:
        seq_obj = file_seq.readlines()
    finally:
        # The handle was previously never closed.
        file_seq.close()

    # Header lines are the lines containing '>', the same lines that
    # `grep '>'` selects. Taking them from the lines already read avoids the
    # trailing empty entry produced by splitting the grep output, which used
    # to generate a spurious '<path_output>.fasta' file.
    header = [line.rstrip('\n') for line in seq_obj if '>' in line]

    for i, term in enumerate(header, 1):
        prot = term[start_header:end_header]
        Logger.get_instance().info(str(i) + ' ' + prot)
        # extraction of sequence from fasta file
        prot_seq = InfoFasta.get_seq(seq_obj, prot)
        # writing of sequence in a fasta file
        fasta_seq = SeqToFasta.give_fasta(term, prot_seq)
        file_out = FileUtils.open_text_w(path_output + prot + '.fasta')
        try:
            file_out.write(fasta_seq)
        finally:
            file_out.close()
def domain_one_protein(domain_region_file, protname):
    """Return the domains stored for one protein in a pickled dictionary.

    Args:
        domain_region_file: path of the pickle file holding the dictionary.
        protname: key to look up.

    Returns:
        The value stored under protname, or an empty list (after a debug
        message) when the key is absent.
    """
    file_domain = FileUtils.open_text_r(domain_region_file)
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # files produced by this project.
        dict_domain = pickle.load(file_domain)
    finally:
        # The handle was previously never closed.
        file_domain.close()

    if protname in dict_domain:
        return dict_domain[protname]
    Logger.get_instance().debug(" Protein without domains " + protname)
    return []
def get_header(seq_obj, header_identifier=None, text=True):
    """Return the header lines of a FASTA source.

    Args:
        seq_obj: an iterable of lines when text is true, otherwise the path
            of a FASTA file.
        header_identifier: optional substring to look for in the headers.
        text: True if seq_obj already holds the lines, False if it is a
            file path to open.

    Returns:
        Without header_identifier, the list of all header lines (stripped,
        leading '>' included). With header_identifier, the first header
        containing it, or None when no header matches.
    """
    fasta = seq_obj if text else FileUtils.open_text_r(seq_obj)
    try:
        headers = [line.strip() for line in fasta if line[0:1] == '>']
    finally:
        # Only close what was opened here; the handle used to be leaked.
        if not text:
            fasta.close()

    if header_identifier is None:
        return headers
    for item in headers:
        if header_identifier in item:
            return item
    return None
def get_seq(seq_obj, header_identifier, text=True):
    """Return the sequence of the first record whose header matches.

    Args:
        seq_obj: an iterable of lines when text is true, otherwise the path
            of a FASTA file.
        header_identifier: substring identifying the wanted header line.
        text: True if seq_obj already holds the lines, False if it is a
            file path to open.

    Returns:
        The sequence lines of the first matching record joined into one
        string, with every '*' removed (Ensembl sequences end with a star).
        An empty string when no header matches.
    """
    fasta = seq_obj if text else FileUtils.open_text_r(seq_obj)
    pieces = []
    found = False
    try:
        for line in fasta:
            line = line.strip()
            if line[0:1] == '>':
                if found:
                    # Next record reached: the wanted sequence is complete.
                    break
                found = header_identifier in line
            elif found:
                pieces.append(line)
    finally:
        # Only close what was opened here; the handle used to be leaked.
        if not text:
            fasta.close()
    return ''.join(pieces).replace('*', '')
def rna_target(self):
    """Write the gene/protein/RNA-target table and one protein list per RNA target.

    Steps, as performed below:
      1. read the info table and take its columns 1 and 3;
      2. read the FASTA headers and split them into gene and protein ids;
      3. for each protein, look up its RNA target and write the
         (gene, protein, RNA target) rows to a tab-separated table;
      4. read that table back and, for each distinct RNA target, append the
         names of the matching proteins to a dedicated file.

    All paths are built from Constants.PATH_HOME and project properties.
    """
    self.path_home = Constants.PATH_HOME
    self.file_seq_natrevgenetics = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.PUTATIVERNA_NATREVGENETICS_SEQ_PROPERTY, True)
    self.natrevgenetics_info = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.PUTATIVE_RNA_NATREVGENETICS_INFO_PROPERTY, True)
    # presumably skip=1 drops the title row of the info table - confirm
    # against FileParser.make_table
    info_table = FileParser.make_table(self.natrevgenetics_info, '\t', skip=1)
    prot_info = TableWrapper.get_column(info_table, 1)
    putative_rna = TableWrapper.get_column(info_table, 3)
    # The input is a file path, hence text=False
    self.header = InfoFasta.get_header(self.file_seq_natrevgenetics, text=False)
    # Headers are read as '>' + '|'-separated fields: field 0 is used as the
    # gene id and field 2 as the protein id
    gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
    prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]
    # Creation of Table containing gene id, prot id and rna target
    putative_rna_target = []
    type_rna_target = []
    for n, prot in enumerate(prot_seq):
        Logger.get_instance().info(prot)
        # list.index raises ValueError when the protein is not in the info table
        index_prot = prot_info.index(prot)
        rna_target = putative_rna[index_prot]
        row = [gene_seq[n], prot_seq[n], rna_target]
        type_rna_target.append(rna_target)
        putative_rna_target.append(row)
        Logger.get_instance().info(" The putative rna target is " + rna_target)
    self.file_all_rna_target = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.PUTATIVE_ALL_RNA_TARGET_PROPERTY, True)
    FileWriter.write_table(self.file_all_rna_target, putative_rna_target, symbol='\t')
    # set of RNA target type in order to create different list
    #
    unique_rna_target = set(type_rna_target)
    # The table just written is read back from disk
    info_new_table = FileParser.make_table(self.file_all_rna_target, '\t')
    # Columns extraction
    prot_name = TableWrapper.get_column(info_new_table, 1)
    type_rnatarget = TableWrapper.get_column(info_new_table, 2)
    file_output = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.PUTATIVE_RNA_OUTPUT_PROPERTY, True)
    # this for loop creates one protein file for each RNA target type
    for item in unique_rna_target:
        file_name = file_output + item + PropertyManager.get_instance(
        ).get_property(
            DataConstants.PUTATIVE_RNA_TARGET_DATASET_NAME_PROPERTY, True)
        file_rna = FileUtils.open_text_a(file_name)
        for n, type_rna in enumerate(type_rnatarget):
            if type_rna == item:
                # NOTE(review): no separator is written after the name;
                # presumably the column values keep their own line ending -
                # confirm against FileParser.make_table
                file_rna.write(prot_name[n])
        file_rna.close()
def list_get_seq(path_input, type_query, path_protein_list=None, path_output=None):
    """Download the sequences of the genes listed in a file.

    Args:
        path_input: path of the file holding the gene list.
        type_query: 1 or 2, mapped to the 'all' / 'one' argument of
            Ensembl.get_sequence.
        path_protein_list: optional path of a protein list; when given, only
            the isoforms whose header contains one of these proteins (at
            characters 32-46 after the '>') are kept.
        path_output: optional path of the file the sequences are appended
            to; when None the result is returned instead.

    Returns:
        When path_output is None, a string (no protein list) or a list of
        FASTA strings (protein list given); otherwise None.
    """
    # the input file is read
    list_item = FileParser.read_file(path_input)
    dict_query = {1: 'all', 2: 'one'}
    count_duplicate_genes = 0
    all_seqs = ''
    prot_seq = []
    # For each gene in list the sequences are downloaded
    for i, item in enumerate(list_item):
        Logger.get_instance().info(
            str(i + 1) + ' Extraction of gene sequence(s) : ' + item)
        fasta_seq = Ensembl.get_sequence(item, dict_query[type_query])
        # Genes without a usable sequence are skipped
        if fasta_seq == item + ' No available':
            pass
        elif fasta_seq == item + ' pseudogene':
            pass
        # If the gene has a sequence the output is stored in seqs
        else:
            seqs = fasta_seq
            seqs = seqs + '\n'
            if path_protein_list == None:
                pass
            # if path_protein_list is not None, only the isoforms of the gene
            # that are present in list_protein are kept
            else:
                # NOTE(review): the protein list is re-read for every gene
                list_protein = FileParser.read_file(path_protein_list)
                prot_genes = seqs.split('>')
                protein_seq = [
                    '>' + fasta for fasta in prot_genes if fasta[32:47] in list_protein
                ]
            #
            # if path_output == None the information is stored in a list or string
            #
            if path_output == None:
                if path_protein_list == None:
                    all_seqs += seqs
                else:
                    prot_seq.append(protein_seq)
            #
            # if path_output != None
            # the information is appended to a file
            else:
                # NOTE(review): as read here, the file is re-opened for each
                # gene but closed only once after the loop - confirm nesting
                file_out = FileUtils.open_text_a(path_output)
                if path_protein_list == None:
                    file_out.write(seqs)
                else:
                    if protein_seq == []:
                        count_duplicate_genes += 1
                        Logger.get_instance().info(
                            " Number of duplicated genes: " + str(count_duplicate_genes))
                        Logger.get_instance().info(
                            " The gene duplicated is: " + str(item) + '|' + str(list_protein[i]))
                    else:
                        # only the first matching isoform is written
                        file_out.write(protein_seq[0])
    # return information like string or list
    # NOTE(review): this returns seqs / protein_seq, i.e. the values of the
    # last processed gene, not the accumulated all_seqs / prot_seq; these
    # names (and file_out) are unbound if no gene had a sequence - confirm
    # the intended contract with the callers
    if path_output == None:
        if path_protein_list == None:
            all_seqs += seqs
            return seqs
        else:
            return protein_seq
    else:
        file_out.close()
def longest_seq(seq_obj, dict_identifier, path_outfile_longest, path_outfile_isoform, type_obj='list'):
    """Write the longest isoform of each gene, or its tied isoforms, to FASTA files.

    Args:
        seq_obj: the FASTA source given to InfoFasta (a list of lines when
            type_obj is 'list', a file path when it is 'file').
        dict_identifier: path of a pickle file holding {gene: [isoforms]}.
        path_outfile_longest: file the unique longest sequences are appended to.
        path_outfile_isoform: file the equally long but different isoforms
            are appended to.
        type_obj: 'list' or 'file'.

    Returns:
        None; the results are only written to the two output files.
    """
    # NOTE(review): type_text stays unbound for any other type_obj value
    if type_obj == 'file':
        type_text = False
    elif type_obj == 'list':
        type_text = True
    fileout_longest = FileUtils.open_text_a(path_outfile_longest)
    fileout_isoform = FileUtils.open_text_a(path_outfile_isoform)
    # NOTE(review): file_dict is never closed in this function
    file_dict = open(dict_identifier, 'r')
    dict_ids = pickle.load(file_dict)
    # Possible conditions:
    #
    # 1) the gene has one longest protein
    #    - in this case this seq is added to longest file
    # 2) the gene has two proteins with the same length
    #    a) the sequences are identical
    #       - in this case one of these identical sequences is added to longest file
    #    b) the sequences are different
    #       - in this case the isoform sequences are added to isoform file
    # 3) the gene has more than two proteins with the same length
    #    a) the sequences are identical
    #       - in this case one of these is added to longest file
    #    b) the sequences are not identical
    #       - the different isoforms are added to isoform file
    #
    # count variables are initialized in order to check the output during the method elaboration
    # NOTE(review): these counters and lists are updated below but never read
    # or returned by this function
    #
    seq_count = 0
    double_seq_count = 0
    not_same_seq_count = 0
    same_seq_count = 0
    more_prot_count = 0
    prot_longest = []
    prot_double_lseq = []
    prot_double_prot = []
    more_two_prot = []
    more_two_lseq = []
    y = 0
    #
    # This for loop flows on the keys of dictionary
    for gene in dict_ids:
        y = y + 1
        Logger.get_instance().info(str(y) + ' Gene analysed : ' + gene)
        seqs = []  # will contain the isoform list of gene selected
        lenseq = []  # will contain the length of each isoform seq
        headers = []  # will contain the header of each isoform seq
        # this for loop flows on the isoforms of gene selected
        for prot in dict_ids[gene]:
            # These lines call the InfoFasta class in order to extract
            # the seq, the length and the header of the protein selected;
            # all items are stored in lists
            # NOTE(review): get_length is called without text=type_text,
            # unlike get_seq and get_header - confirm its default
            lenseq.append(InfoFasta.get_length(seq_obj, prot))
            seqs.append(InfoFasta.get_seq(seq_obj, prot, text=type_text))
            headers.append(
                InfoFasta.get_header(seq_obj, header_identifier=prot, text=type_text))
        # Find the max length among the sequences;
        # the index_max list contains the indexes of the sequences with max length
        # (max raises ValueError if the gene has no isoforms)
        len_max = max(lenseq)
        index_max = [
            item for item in range(len(lenseq)) if lenseq[item] == len_max
        ]
        #
        # The following if conditions check the length of index_max vector
        #
        # Condition 1)
        # -------------
        # if the length of index_max vector is equal to 1 it means that there is just one longest protein
        # the protein sequence is written into longest file
        #
        if len(index_max) == 1:
            Logger.get_instance().info(' If condition 1')
            seq_count += 1
            seq = SeqToFasta.give_fasta(
                headers[index_max[0]], seqs[index_max[0]])  # (See NOTE above)
            fileout_longest.write(seq)
            prot_longest.append(dict_ids[gene][index_max[0]])
        #
        # Condition 2)
        # -------------
        # if length of index_max is equal to 2 it means that there are two proteins with same length
        #
        elif len(index_max) == 2:
            Logger.get_instance().info('If condition 2')
            double_seq_count += 1
            # Condition 2a
            # -------------
            # The proteins have the same sequences
            # One protein sequence is written into longest file
            if seqs[index_max[0]] == seqs[index_max[1]]:
                Logger.get_instance().info('2a')
                same_seq_count += 1
                d_seq = SeqToFasta.give_fasta(
                    headers[index_max[0]], seqs[index_max[0]])  # (See NOTE above)
                fileout_longest.write(d_seq)
            # Condition 2b
            # -------------
            # The proteins have different sequences
            # The sequences are written into isoform file
            else:
                Logger.get_instance().info('2b')
                not_same_seq_count += 1
                for i in range(len(index_max)):
                    prot_double_lseq.append(seqs[index_max[i]])
                    prot_double_prot.append(dict_ids[gene][index_max[i]])
                    diff_seq = SeqToFasta.give_fasta(
                        headers[index_max[i]], seqs[index_max[i]])  # (See NOTE above)
                    fileout_isoform.write(diff_seq)
        # Condition 3)
        # -------------
        # if the length of index_max is greater than two it means that there are more than two proteins
        # with same length
        else:
            more_prot_count += 1
            Logger.get_instance().info(' If condition 3')
            # Condition 3a
            # -------------
            # The isoforms with same length have actually the same sequences
            # One of these proteins is written in longest file
            # (the count runs over all the isoforms of the gene, not only
            # over the longest ones)
            if seqs.count(seqs[index_max[0]]) == len(index_max):
                Logger.get_instance().info('3a')
                m_seq = SeqToFasta.give_fasta(
                    headers[index_max[0]], seqs[index_max[0]])  # (See NOTE above)
                fileout_longest.write(m_seq)
            # Condition 3b
            # -------------
            # Among the isoforms there are at least two isoforms with different sequences
            #
            else:
                Logger.get_instance().info('3b')
                more_two_prot.append(gene)
                more_two_seqs = [
                ]  # will contain only the sequences with max length
                for n in index_max:
                    more_two_seqs.append(seqs[n])
                more_two_lseq.append(list(set(more_two_seqs)))
                for seq in set(
                        more_two_seqs
                ):  # set(more_two_seqs) contains only the different sequences
                    # find the sequence index in the list of sequences
                    # (list.index returns the first isoform with that sequence)
                    index_seq = seqs.index(seq)
                    mdiff_seq = SeqToFasta.give_fasta(
                        headers[index_seq], seqs[index_seq])  # (See NOTE above)
                    fileout_isoform.write(mdiff_seq)
    fileout_isoform.close()
    fileout_longest.close()
def anchor_analysis(self, fastafile, motifslist, prot):
    """Run ANCHOR on a FASTA file and split its output into separate files.

    Up to three files are written inside self.path_output:
      - PBR_<prot>.txt: predicted binding regions (skipped when the section
        contains 'None');
      - FMotifs_<prot>.txt: found motifs (skipped when the section contains
        'None');
      - Pred_<prot>.txt: prediction profile, always written.

    Args:
        fastafile: path of the FASTA file given to anchor.pl.
        motifslist: path of the motif list given to anchor.pl with -m.
        prot: protein name used in the output file names and log messages.

    Raises:
        ValueError: if one of the expected section titles is missing from
            the ANCHOR output.
    """

    def _section_rows(section):
        # Data rows of a section start with '#' + tab; that prefix and all
        # the blanks are removed.
        return [line[2:].replace(' ', '')
                for line in section.split('\n') if line[0:2] == '#\t']

    def _write_text(file_name, text):
        # Write text to a file inside path_output, always closing it.
        out_file = FileUtils.open_text_w(self.path_output + file_name)
        try:
            out_file.write(text)
        finally:
            out_file.close()

    # Calling of anchor command
    # anchor_out contains the anchor output in text format
    anchor_out = subp.check_output([
        "perl", self.anchor_path + 'anchor.pl', fastafile, "-m", motifslist
    ])

    # Position of each section title inside the anchor output
    index_bind_reg = anchor_out.index('Predicted binding regions')
    index_motifs = anchor_out.index('Motifs')
    index_pred_profile = anchor_out.index('Prediction profile output')
    # The Anchor output can lack the filtered regions section
    if 'Filtered regions' in anchor_out:
        index_filt_reg = anchor_out.index('Filtered regions')
    else:
        index_filt_reg = index_motifs

    # Predicted binding regions file (PBR_protname.txt)
    # When a protein lacks predicted binding regions the section contains
    # "None" and the file writing is skipped
    pbr_out = anchor_out[index_bind_reg:index_filt_reg]
    if 'None' in pbr_out:
        Logger.get_instance().info(
            "This protein doesn't contain predicted binding region (" + prot + ')')
    else:
        _write_text('PBR_' + prot + '.txt', '\n'.join(_section_rows(pbr_out)))

    # Found Motifs file (FMotifs_protname.txt)
    # When a protein lacks motifs the section contains "None" and the file
    # writing is skipped
    fmotifs_out = anchor_out[index_motifs:index_pred_profile]
    if 'None' in fmotifs_out:
        Logger.get_instance().info(
            "This protein doesn't contain any motifs (" + prot + ')')
    else:
        # The decision depends on the motifs section only. It used to test
        # pbr_out again, so the motifs of a protein without binding regions
        # were never written.
        _write_text('FMotifs_' + prot + '.txt', '\n'.join(_section_rows(fmotifs_out)))

    # Prediction profile output (Pred_protname.txt)
    # This section is always present in anchor output
    pred_out = anchor_out[index_pred_profile:]
    string = ('# Columns:\n# 1 - Amino acid number\n# 2 -'
              ' One letter code\n# 3 - ANCHOR probability value\n# 4 - ANCHOR output\n#')
    # The column legend is dropped and all the blanks are removed
    final_out = pred_out.replace(string, '').replace(' ', '')
    _write_text('Pred_' + prot + '.txt', final_out)