def jellyfish(parameters):
    """Count k-mers with jellyfish for every prefix listed in the config.

    For each prefix the FASTA file is looked up in ``parameters['data_dir']``
    and the ``.jf`` / ``_dump.fasta`` files in
    ``parameters['jellyfish_out_dir']``. A prefix whose dump file already
    exists is skipped.

    Args:
        parameters: mapping with at least the keys 'prefixes', 'data_dir'
            and 'jellyfish_out_dir' (it is also forwarded to the helpers).

    Returns:
        False as soon as counting or dumping fails for a prefix,
        True otherwise.
    """
    print_logo("K-mer counting using jellyfish")
    print_info('Start k-mer counting using jellyfish.')

    for prefix in parameters['prefixes']:
        fasta_path = os.path.join(parameters['data_dir'], f'{prefix}.fasta')

        jf_name = f'{prefix}.jf'
        jf_path = os.path.join(parameters['jellyfish_out_dir'], jf_name)

        dump_name = f'{prefix}_dump.fasta'
        dump_path = os.path.join(parameters['jellyfish_out_dir'], dump_name)

        if os.path.exists(dump_path):
            print_info(f'The output {dump_name} file already exists. Skipping ...')
            continue

        # Short-circuit: the dump step only runs when counting succeeded.
        steps_succeeded = (
            kmer_counting(fasta_path, jf_path, parameters)
            and dump_jf_file(dump_path, jf_path, jf_name, dump_name)
        )
        if not steps_succeeded:
            return False

        remove_jf_file(jf_path, parameters)

    return True
def print_progress(self):
    """Advance the k-mer counter and report progress periodically.

    A message is emitted on every 100th processed k-mer and when the counter
    reaches ``self.kmers_number``. The percentage is only computed when a
    message is actually emitted, because this method is called once per
    k-mer.
    """
    self.counter += 1
    report_every = 100

    is_last = self.counter == self.kmers_number
    if self.counter % report_every and not is_last:
        return

    percent = (self.counter / self.kmers_number) * 100
    print_info(
        f'Processed {self.counter} / {self.kmers_number} ({percent:.2f}%) kmers ...',
        self.worker_name)
def remove_jf_file(jellyfish_file, parameters):
    """Delete an intermediate jellyfish file unless the config keeps it.

    Nothing happens unless ``parameters['keep_intermediate_jf_files']`` is
    exactly ``'no'``. A missing file is reported as a warning, not raised.

    Args:
        jellyfish_file: path of the file to delete.
        parameters: mapping holding the 'keep_intermediate_jf_files' key.
    """
    if parameters['keep_intermediate_jf_files'] != 'no':
        return

    print_info(f'Deleting the {jellyfish_file} file ... ')
    try:
        os.remove(jellyfish_file)
    except FileNotFoundError:
        print_warning(f'The {jellyfish_file} file was not found.')
    else:
        print_info(f"File '{jellyfish_file}' removed successfully")
def run(self):
    """Run the k-mer comparison step (MEME conversion followed by tomtom).

    The step is skipped with an informational message when the
    'run_tomtom' parameter equals 'no'.

    Returns:
        Always True.
    """
    print_logo("K-mer comparing")

    tomtom_enabled = self.parameters['run_tomtom'] != 'no'
    if tomtom_enabled:
        self.kmers_to_meme()
        self.tomtom()
    else:
        print_info("Analysis canceled. To run tomtom set 'run_tomtom' parameter to 'yes'")

    return True
def check_run(self):
    """Tell whether the per-prefix tables have to be (re)built.

    Returns:
        True when at least one ``<output_dir>/tables/table_<prefix>`` file is
        missing, or when all exist but 'keep_kmers_table' is not 'yes'.
        False when every table exists and 'keep_kmers_table' is 'yes'.
    """
    some_table_missing = any(
        not os.path.exists(
            os.path.join(self.parameters['output_dir'], 'tables',
                         f'table_{prefix}'))
        for prefix in self.parameters['prefixes'])
    if some_table_missing:
        return True

    if self.parameters['keep_kmers_table'] != 'yes':
        return True

    print_info("Keeping k-mer counting from the previous run")
    return False
def bulk_fasta_to_oneline(parameters):
    """Convert every configured FASTA file into a ``*_oneLine.txt`` file.

    Input and output both live in ``parameters['data_dir']``. Prefixes whose
    output file already exists are skipped.

    Args:
        parameters: mapping with at least the keys 'prefixes' and 'data_dir'.

    Returns:
        False when a conversion raises (the exception is printed),
        True once all prefixes were handled.
    """
    print('')
    print_info('Converting FASTA files to text files.')

    for prefix in parameters['prefixes']:
        source_name = f'{prefix}.fasta'
        target_name = f'{prefix}_oneLine.txt'
        source_path = os.path.join(parameters['data_dir'], source_name)
        target_path = os.path.join(parameters['data_dir'], target_name)

        if os.path.exists(target_path):
            print_info(
                f'The output {target_name} file already exists. Skipping ...')
            continue

        print_info(f'Converting {source_name} into {target_name} ... ')
        try:
            fasta_to_oneline(source_path, target_path)
        except Exception as error:
            print_warning(
                f'Something went wrong during saving to the {target_name} file.'
            )
            print_warning('Please, check the stderr output:\n')
            print(error)
            return False

    print_info("Conversion completed")
    return True
def dump_jf_file(output_file_full_path, jellyfish_file_full_path,
                 jellyfish_file, output_file_name):
    """Run ``jellyfish dump`` to write the counts of a ``.jf`` file to disk.

    Args:
        output_file_full_path: path passed to jellyfish's ``-o`` option.
        jellyfish_file_full_path: path of the ``.jf`` file to dump.
        jellyfish_file: name of the ``.jf`` file, used in the log message only.
        output_file_name: name of the output file, used in the log message only.

    Returns:
        True when the command exits with status 0; otherwise a warning and
        the captured stderr are printed and False is returned.
    """
    print_info(f'Outputting counts from the {jellyfish_file} file to the {output_file_name} file ... ')

    command = [
        'jellyfish', 'dump', jellyfish_file_full_path,
        '-o', output_file_full_path,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return True

    print_warning('Something went wrong during outputting counts')
    print_warning('Please, check the stderr output:')
    print(completed.stderr)
    return False
def kmers_to_meme(self):
    """Convert k-mer sequences to MEME format and store them in the MEME file.

    Any existing file at ``self.output_meme_file_path`` is removed first.
    The first tab-separated field of every line of ``self.stats_file_path``
    is passed to ``iupac2meme`` and the tool's stdout is appended to the
    MEME file; lines whose first field is empty are skipped.
    """
    meme_path = self.output_meme_file_path
    if os.path.exists(meme_path):
        os.remove(meme_path)

    print_info("Converting kmers into MEME format ... ")

    with open(meme_path, 'a+') as meme_out, \
            open(self.stats_file_path, 'r') as stats:
        for row in stats:
            first_field = row.rstrip().split("\t")[0]
            if len(first_field) == 0:
                continue
            converted = subprocess.run(['iupac2meme', first_field],
                                       capture_output=True,
                                       text=True)
            meme_out.write(converted.stdout)
def kmer_counting(fasta_file, jellyfish_file, parameters):
    """Run ``jellyfish count`` on one FASTA file.

    Args:
        fasta_file: path of the input FASTA file.
        jellyfish_file: path passed to jellyfish's ``-o`` option.
        parameters: mapping providing 'kmer_length', 'hash_size' and
            'threads_number', passed to ``-m``, ``-s`` and ``-t``.

    Returns:
        True when the command exits with status 0; otherwise warnings, the
        captured stderr and the used settings are printed and False is
        returned.
    """
    print('')
    print_info(f'Counting k-mers in the {fasta_file} file ... ')

    kmer_length = parameters['kmer_length']
    hash_size = parameters['hash_size']
    threads_number = parameters['threads_number']

    # The '-C' option was commented out in the original command and stays
    # disabled here.
    command = [
        'jellyfish', 'count',
        '-m', kmer_length,
        '-s', hash_size,
        '-t', threads_number,
        fasta_file,
        '-o', jellyfish_file,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return True

    print_warning('Something went wrong during k-mer counting.')
    print_warning('Please, check the stderr output:')
    print(completed.stderr)
    print(kmer_length, hash_size, threads_number, jellyfish_file)
    return False
def run(self):
    """Run the statistical analysis and the follow-up filtering steps.

    The statistics are computed and written to ``<output_dir>/stats/stats.txt``
    when that file is missing or when 'keep_stats_file' is 'no'; otherwise
    the saved file is loaded into ``self.data``. Afterwards three filters are
    applied in sequence, each followed by a save to its own file, and the
    coordinate files are merged and filtered.

    Returns:
        False when the merged table is missing or when any analysis or
        filtering step raises (the exception is printed), True otherwise.
    """
    print_logo("Statistic analysis")

    if not os.path.exists(self.merged_table_path):
        print_warning("the merged table does not exist")
        return False

    stats_dir = os.path.join(self.parameters['output_dir'], 'stats')
    stats_path = os.path.join(stats_dir, 'stats.txt')

    # 'keep_stats_file' is only consulted when the file exists (short-circuit).
    must_compute = (not os.path.exists(stats_path)
                    or self.parameters['keep_stats_file'] == 'no')
    if must_compute:
        try:
            print_info("Applying Fisher test ...")
            self.chrom_len_calc()
            self.mite_total_len_calc()
            self.analyse()
            self.save_stats_to_file(stats_path)
        except Exception as e:
            # Report the failure the same way the filtering stage below does
            # instead of returning False without any explanation.
            print(f"Exception: {e}")
            return False
    else:
        print_info("The output 'stats.txt' file exists. Loading saved data ... ")
        self.data = pd.read_csv(stats_path, sep='\t')

    try:
        print("")
        print_info("Filter statistics data:")

        self.filter_kmers_by_p_corrected_bon_thresh()
        self.save_stats_to_file(
            os.path.join(stats_dir, 'stats_filtered_1_corr_bonif_thresh.txt'))

        self.filter_kmers_by_freq_higher()
        self.save_stats_to_file(
            os.path.join(stats_dir, 'stats_filtered_2_by_freq_higher.txt'))

        self.filter_kmers_by_freq_lesser()
        self.save_stats_to_file(
            os.path.join(stats_dir, 'stats_filtered_3_by_freq_lesser.txt'))

        self.merge_coords_files()
        self.filter_coords_file()
        return True
    except Exception as e:
        print(f"Exception: {e}")
        return False
def tomtom(self):
    """Compare the k-mer motifs with a motif database using tomtom.

    The command line is assembled from ``self.parameters`` ('min_overlap',
    'internal', 'threshold_type', 'threshold_value', 'output_dir',
    'motif_database') and ``self.output_meme_file_path``. While tomtom runs,
    stderr lines starting with 'P' are echoed on a single, continuously
    overwritten terminal line. On failure the used command is printed.
    """
    print_info("Comparing kmer motifs with database using tomtom ... ")

    result_dir = os.path.join(self.parameters['output_dir'], 'tomtom',
                              'tomtom_out')

    command = ['tomtom', '-min-overlap', self.parameters['min_overlap']]
    if self.parameters['internal'] == 'yes':
        command.append('-internal')
    if self.parameters['threshold_type'] == 'e-value':
        command.append('-evalue')
    command += [
        '-thresh', self.parameters['threshold_value'],
        '-oc', result_dir,
        self.output_meme_file_path,
        self.parameters['motif_database'],
    ]
    # Popen and " ".join() both need strings; config values may not be.
    command = [str(argument) for argument in command]

    # The context manager closes the stderr pipe and waits for the process,
    # so returncode is set once the block is left.
    with subprocess.Popen(command, stderr=subprocess.PIPE) as process:
        for raw_line in process.stderr:
            message = raw_line.decode('utf-8', errors='replace').rstrip()
            if message.startswith('P'):
                # "\r\033[0K": return to line start and clear the line.
                print(f"\r\033[0K{message}", end='', flush=True)
    print("")

    if process.returncode == 0:
        print_info("Processing completed")
        # Point at the directory tomtom was actually told to write to.
        report_path = os.path.join(result_dir, 'tomtom.html')
        print_info(f"Report in HTML format is available at: {report_path}")
    else:
        print_warning("Something went wrong with tomtom run. Used command:")
        print(" ".join(command))
def worker(self, data_input):
    """Build the k-mer occurrence table for one chromosome.

    Steps:
      1. Read the k-mers from the dump file (lines starting with '>' are
         header lines, every other line is a k-mer).
      2. Load the chromosome text and the tab-separated BED file.
      3. Build an interval tree from the BED rows of this chromosome.
      4. For every k-mer, classify each occurrence returned by
         ``self.my_find`` as inside a mite, on a mite edge, or elsewhere
         in the genome, and write one table row per k-mer.
      5. Hand the collected coordinates to ``self.write_kmer_coords_to_file``.

    Args:
        data_input: mapping with the keys 'chr_name', 'dump_file',
            'chr_file' and 'output_file'.

    Returns:
        None. Returns early (after a warning) when the dump file holds no
        k-mers or when the k-mer length differs from the configured one.

    Raises:
        SystemExit: with status 1 when one occurrence overlaps more than
            two intervals.
    """
    chr_name = data_input["chr_name"]
    worker_name = f"{chr_name} worker"
    dump_name = os.path.basename(data_input['dump_file'])

    # ---- 1. k-mers from the dump file ------------------------------- #
    print_info(f"Start reading {dump_name} file", worker_name)
    data_kmer = {}
    name_tmp = ""
    with open(data_input["dump_file"], 'r') as file:
        for line in file:
            line = line.rstrip()
            if not line:
                # A blank line would otherwise fail on line[0].
                continue
            if line[0] == ">":
                name_tmp = line[1:]
            else:
                data_kmer[line] = name_tmp
    print_info(
        f"Reading {dump_name} file completed. Read {len(data_kmer)} kmers.",
        worker_name)

    if not data_kmer:
        print_info(
            f'{red("Warning")} - No kmers were read from the {dump_name} file',
            worker_name)
        return

    kmer_length = int(self.parameters['kmer_length'])
    first_kmer_length = len(next(iter(data_kmer)))
    if first_kmer_length != kmer_length:
        print_info(
            f'{red("Warning")} - The kmer length in {dump_name} ({first_kmer_length} bp) file is not equal to '
            f'kmer length in config file ({self.parameters["kmer_length"]} bp)',
            worker_name)
        return

    # ---- 2. chromosome text and BED rows ---------------------------- #
    print_info("Loading '{}' file ...".format(data_input["chr_file"]),
               worker_name)
    with open(data_input["chr_file"], 'r') as f:
        chromosome = f.read()

    print_info(
        f"Loading '{os.path.basename(self.parameters['bed_file'])}' file ...",
        worker_name)
    with open(self.parameters['bed_file'], 'r') as f:
        # Blank lines carry no columns and are skipped.
        data_mites = [row.rstrip().split("\t") for row in f if row.strip()]

    # ---- 3. counters template and interval tree --------------------- #
    output_data_template = {"edge": 0, "genome": 0}
    mite_names = set()
    t = IntervalTree()
    for mite in data_mites:
        if mite[3] not in output_data_template:
            output_data_template[mite[3]] = 0
            output_data_template[mite[3] + "_edge"] = 0
            mite_names.add(mite[3])
            mite_names.add(mite[3] + "_edge")
        if mite[0] == chr_name:
            t[int(mite[1]):int(mite[2])] = mite[3]
    # Sorted once; the same column order is used for the header and all rows.
    sorted_mite_names = sorted(mite_names)

    if os.path.exists(data_input["output_file"]):
        print_info(
            "The file '{}' exists. Removing ...".format(
                data_input["output_file"]), worker_name)
        os.remove(data_input["output_file"])

    log_file_path = os.path.join(
        self.parameters['output_dir'], 'tables',
        time.strftime('%y-%m-%d_%H-%M_') + chr_name + "_log.txt")

    # ---- 4. classify the occurrences of every k-mer ----------------- #
    kmer_coords = {}
    # Both files are closed on every exit path, including SystemExit.
    with open(data_input["output_file"], 'a+') as output, \
            open(log_file_path, 'a+') as log:
        header = ["k-mer", "total_occurences_in_{}".format(chr_name)]
        header.extend(sorted_mite_names)
        header.extend(["edge", "genome"])
        output.write("\t".join(header) + "\n")

        timer = Timer(len(data_kmer), worker_name)
        timer.startt()
        print_info("Started analysis ...", worker_name)

        log.write("Analysis started at " + time.ctime() + "\n")
        log.flush()

        for kmer in data_kmer:
            timer.print_progress()
            coords = kmer_coords[kmer] = []
            # The template only holds ints, so a shallow copy is sufficient.
            output_data = dict(output_data_template)
            kmer_occurences = self.my_find(chromosome, kmer)

            for kmer_occurence in kmer_occurences:
                kmer_occurence = int(kmer_occurence)
                # NOTE(review): the "+ 1" on the query end and the
                # "begin - 1" tolerance below are kept from the original;
                # confirm them against the indexing used by my_find and
                # the BED file.
                overlaps = list(
                    t[kmer_occurence:kmer_occurence + kmer_length + 1])

                if not overlaps:
                    output_data["genome"] += 1
                elif len(overlaps) > 2:
                    print_info(
                        f"\nThe interval tree length is higher than 2: {len(overlaps)} {chr_name}",
                        worker_name)
                    # Log the overlapping intervals themselves; the original
                    # referenced a name that was not bound at this point.
                    log.write("\t".join([
                        "intTree>2", kmer,
                        str(kmer_occurence),
                        str(overlaps)
                    ]) + "\n")
                    log.flush()
                    raise SystemExit(1)
                elif len(overlaps) == 2:
                    # The k-mer overlaps two mites.
                    print_info(
                        f"\nThe interval tree length is higher than 1: {chr_name}",
                        worker_name)
                    log.write("\t".join([
                        "1<intTree<2", kmer,
                        str(kmer_occurence),
                        str(overlaps)
                    ]) + "\n")
                    log.flush()
                    for interval in overlaps:
                        output_data["edge"] += 1
                        output_data[interval.data + "_edge"] += 1
                else:
                    interval = overlaps[0]
                    interval_fields = list(interval)
                    fully_inside = (
                        kmer_occurence >= interval.begin - 1
                        and kmer_occurence + kmer_length <= interval.end)
                    if fully_inside:
                        output_data[interval.data] += 1
                        coords.append("\t".join([
                            chr_name,
                            str(interval_fields[self.INTERVAL_FROM]),
                            str(interval_fields[self.INTERVAL_TO]),
                            f"{kmer};{interval_fields[self.INTERVAL_MITE_NAME]}"
                        ]))
                    else:
                        output_data["edge"] += 1
                        output_data[interval.data + "_edge"] += 1

            row = [kmer, str(len(kmer_occurences))]
            row.extend(str(output_data[name]) for name in sorted_mite_names)
            row.append(str(output_data["edge"]))
            row.append(str(output_data["genome"]))
            output.write("\t".join(row) + "\n")

    # ---- 5. wrap up -------------------------------------------------- #
    timer.stopp()
    self.write_kmer_coords_to_file(chr_name, kmer_coords)
def stopp(self):
    """Record the stop time and report the elapsed time since ``self.start``."""
    self.stop = time.time()
    elapsed = self.stop - self.start
    message = "Processing kmers finished in {:.2f} sek".format(elapsed)
    print_info(message, self.worker_name)
def save_stats_to_file(self, filename):
    """Write ``self.data`` to *filename* as a tab-separated file.

    Args:
        filename: destination path; only its base name appears in the log
            message.
    """
    base_name = os.path.basename(filename)
    print_info(f"Saving data to file '{base_name}'")
    self.data.to_csv(filename, sep='\t')