import os

import folder_management
import run_multiprocessing
# dump/load are called with compression= and set_default_extension= keywords,
# which matches the compress_pickle API.
from compress_pickle import dump, load


def with_mp(list_of_enzyme_commission_numbers, list_of_databases, cpus):
    unique_ids_dict = {}
    id_enzymes_dict = {}
    folder_parsed_information = "data/sequences"
    folder_management.create_folder(folder_parsed_information)
    work_list = []
    for enzyme in list_of_enzyme_commission_numbers:
        for database in list_of_databases:
            file_path = "data/raw/{}_{}_raw.txt".format(enzyme, database)
            # Note: plain dicts passed to worker processes are copied, not
            # shared; mutations made inside workers do not propagate back.
            work_list.append((enzyme, database, file_path, unique_ids_dict,
                              id_enzymes_dict, folder_parsed_information))
    run_multiprocessing.run_mp(work_list, cpus, process_file)
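
# --- Hedged sketch: run_multiprocessing.run_mp is not shown in this module. ---
# Assuming it fans a list of argument tuples out over a process pool and
# returns the collected results (parse_information_files below relies on a
# return value), a minimal implementation could look like this. The name
# run_mp_sketch and the Pool/starmap choice are assumptions, not the
# project's actual code.
def run_mp_sketch(work_list, cpus, worker):
    """Apply worker(*args) to every tuple in work_list across `cpus` processes."""
    import multiprocessing
    with multiprocessing.Pool(processes=cpus) as pool:
        # starmap unpacks each tuple into the worker's positional arguments
        return pool.starmap(worker, work_list)
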
def download_files(list_of_enzyme_commission_numbers, list_of_databases, cpus):
    """
    Downloads each URL into an HTML file.
    Creates the data/raw path if it does not exist.
    Builds a list of files to download and runs it through multiprocessing
    for speed: the multiprocessing helper takes a list of tuples, the number
    of CPUs, and a worker function.
    """
    data_path = "data/raw"
    folder_management.create_folder(data_path)
    list_of_downloads = []
    for enzyme in list_of_enzyme_commission_numbers:
        for database in list_of_databases:
            list_of_downloads.append((enzyme, database, data_path))
    run_multiprocessing.run_mp(list_of_downloads, cpus, download_manager)
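
# --- Hedged sketch: download_manager is defined elsewhere. ---
# Given the docstring above ("Downloads each URL into an HTML file"), the
# worker presumably builds a database URL for an EC number and writes the
# response under data/raw. The URL template below is a placeholder, not the
# real endpoint.
def download_manager_sketch(enzyme, database, data_path):
    import urllib.request
    url = "https://example.org/{}/{}".format(database, enzyme)  # placeholder URL
    destination = "{}/{}_{}_raw.txt".format(data_path, enzyme, database)
    with urllib.request.urlopen(url) as response:
        with open(destination, "wb") as handle:
            handle.write(response.read())
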
def parse_information_files(list_of_enzyme_commission_numbers, list_of_databases, cpus):
    output_path = "data/parsed_raw/parsed_dictionary_of_raw_data.gzip"
    if not os.path.exists(output_path):
        print("Creating information compressed file. Saved at {}".format(
            output_path))
        work_list = []
        for enzyme in list_of_enzyme_commission_numbers:
            for database in list_of_databases:
                file_path = "data/raw/{}_{}_raw.txt".format(enzyme, database)
                work_list.append((file_path, ))
        result_list = run_multiprocessing.run_mp(work_list, cpus, process_file)
        processed_results = split_results(result_list)
        dump(processed_results, output_path,
             compression="gzip", set_default_extension=False)
    else:
        print("Information file has been found at {}. Loading it.".format(
            output_path))
        processed_results = load(output_path,
                                 compression="gzip", set_default_extension=False)
        print("File loaded")
    return processed_results
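
# --- Hedged sketch: split_results is defined elsewhere. ---
# parse_information_files pickles whatever split_results returns, so it
# presumably reshapes the per-file results coming back from run_mp into one
# structure. Merging a list of per-file dicts into a single dict is one
# plausible shape; the real layout is unknown.
def split_results_sketch(result_list):
    merged = {}
    for partial in result_list:
        merged.update(partial)  # assumes each worker returned a dict
    return merged
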
def download_sequence_files(list_of_enzyme_commission_numbers, list_of_databases, cpus):
    list_of_downloads = []
    for enzyme in list_of_enzyme_commission_numbers:
        for database in list_of_databases:
            list_of_files_not_found = open_data_not_found_file(
                enzyme, database)
            # Raw information files live under data/raw (see download_files).
            file_path = "data/raw/{}_{}_raw.txt".format(enzyme, database)
            sequence_folder_path = "data/downloads/{}_{}_downloaded.txt".format(
                enzyme, database)
            folder_management.create_folder(sequence_folder_path)
            data_list = process_file(enzyme, database, file_path,
                                     sequence_folder_path, list_of_files_not_found)
            list_of_downloads.extend(data_list)
    run_multiprocessing.run_mp(list_of_downloads, cpus, download_manager)
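
# --- Hedged sketch: open_data_not_found_file is defined elsewhere. ---
# Its return value is handed to process_file as the entries that could not be
# downloaded, so a plain line-per-ID text file is assumed here; the file name
# pattern below is a guess, not the project's actual convention.
def open_data_not_found_file_sketch(enzyme, database):
    not_found_path = "data/{}_{}_not_found.txt".format(enzyme, database)  # assumed name
    if not os.path.exists(not_found_path):
        return []
    with open(not_found_path) as handle:
        return [line.strip() for line in handle if line.strip()]
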
def sequence_extractor_from_database(list_of_enzyme_commission_numbers,
                                     list_of_databases, cpus):
    """
    Right now this only works with UniProt. TAKE CARE.
    It parses the UniProt database files coming from Swiss-Prot and TrEMBL.
    Limitation: those files are very large (~70GB).
    Tip: compressed pickles of dictionaries (1M each) have been built
    beforehand by chunking those files.
    How this works:
    1) Reads the information files downloaded from KEGG, which are in HTML format.
    2) Gets the IDs of the enzymes, in this case UniProt IDs, using multiprocessing.
    """
    processed_results = parse_information_files(
        list_of_enzyme_commission_numbers, list_of_databases, cpus)
    single_uniprot_enzyme_dict = dict_of_uniprot_enzyme(processed_results)
    all_uniprot_sequences_set = get_set_of_uniprot_id(processed_results)
    folder_path = "../data_databases/uniprot_all"
    work_list = []
    for n, filename in enumerate(os.listdir(folder_path), start=1):
        output_path = "data/enzymes_and_sequences/enzymes_and_sequences_{}.gzip".format(
            n)
        work_list.append((filename, n, folder_path, output_path,
                          single_uniprot_enzyme_dict))
    run_multiprocessing.run_mp(work_list, cpus, extractor)
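
# --- Hedged sketch: extractor is defined elsewhere. ---
# Each work tuple pairs one chunk of the pre-built UniProt pickles with an
# output path, so the worker presumably loads the chunk, keeps the entries
# whose UniProt ID maps to an enzyme of interest, and dumps the matches.
# The chunk layout ({uniprot_id: sequence}) is an assumption.
def extractor_sketch(filename, n, folder_path, output_path,
                     single_uniprot_enzyme_dict):
    chunk = load("{}/{}".format(folder_path, filename),
                 compression="gzip", set_default_extension=False)
    matches = {uniprot_id: (single_uniprot_enzyme_dict[uniprot_id], sequence)
               for uniprot_id, sequence in chunk.items()
               if uniprot_id in single_uniprot_enzyme_dict}
    dump(matches, output_path, compression="gzip", set_default_extension=False)
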