Example #1
def with_mp(list_of_enzyme_commission_numbers, list_of_databases, cpus):
    """ Builds one work tuple per (enzyme, database) pair and processes
        the matching raw file in parallel via run_multiprocessing.run_mp.
    """
    unique_ids_dict = {}
    id_enzymes_dict = {}
    folder_parsed_information = "data/sequences"
    folder_management.create_folder(folder_parsed_information)
    work_list = []
    for enzyme in list_of_enzyme_commission_numbers:
        for database in list_of_databases:
            file_path = "data/raw/{}_{}_raw.txt".format(enzyme, database)
            # Note: the dicts are copied into each worker process, so
            # mutations made inside process_file do not propagate back here.
            work_list.append((enzyme, database, file_path, unique_ids_dict,
                              id_enzymes_dict, folder_parsed_information))
    run_multiprocessing.run_mp(work_list, cpus, process_file)
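
These snippets lean on two project-local helpers, folder_management and run_multiprocessing, whose definitions are not shown. A minimal sketch of folder_management.create_folder, assuming it is just a thin wrapper over os.makedirs (the helper's name comes from the snippets; the body is an assumption):

import os

def create_folder(path):
    # Assumed implementation of the project-local helper: create the
    # directory (and any parents) if it does not already exist.
    os.makedirs(path, exist_ok=True)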
Example #2
def download_files(list_of_enzyme_commission_numbers, list_of_databases, cpus):
    """ Downloads a url into an HTML file
        Creates data/raw path if does not exist
        Creates a list of files to download, running multiprocessing for speed purposes
        Multiprocessing function runs using a list of tupples, number of cpus and a function
    """
    data_path = "data/raw"
    folder_management.create_folder(data_path)
    list_of_downloads = []
    for enzyme in list_of_enzyme_commission_numbers:
        for database in list_of_databases:
            download_data = (enzyme, database, data_path)
            list_of_downloads.append(download_data)
    run_multiprocessing.run_mp(list_of_downloads, cpus, download_manager)
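
run_multiprocessing.run_mp is the other unshown helper. Going by the docstring above (a list of tuples, a number of cpus and a function) and by Example #3, which captures its return value, a plausible sketch built on multiprocessing.Pool.starmap follows; this is an assumption, not the project's actual implementation:

from multiprocessing import Pool

def run_mp(work_list, cpus, function):
    # Assumed implementation: unpack each argument tuple into the worker
    # function across a pool, returning one result per tuple in order.
    with Pool(processes=cpus) as pool:
        return pool.starmap(function, work_list)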
Example #3
import os
from compress_pickle import dump, load  # the dump/load calls below follow the compress_pickle API

def parse_information_files(list_of_enzyme_commission_numbers,
                            list_of_databases, cpus):
    """ Parses the raw files once and caches the result as a
        gzip-compressed pickle; later calls just reload the cache.
    """
    output_path = "data/parsed_raw/parsed_dictionary_of_raw_data.gzip"
    if not os.path.exists(output_path):
        print("Creating information compressed file. Saved at {}".format(
            output_path))
        work_list = []
        for enzyme in list_of_enzyme_commission_numbers:
            for database in list_of_databases:
                file_path = "data/raw/{}_{}_raw.txt".format(enzyme, database)
                work_list.append((file_path, ))
        result_list = run_multiprocessing.run_mp(work_list, cpus, process_file)
        processed_results = split_results(result_list)
        dump(processed_results,
             output_path,
             compression="gzip",
             set_default_extension=False)
    else:
        print("Information file have been found at {}. Loading it.".format(
            output_path))
        processed_results = load(output_path,
                                 compression="gzip",
                                 set_default_extension=False)
        print("File loaded")
    return processed_results
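
The dump and load calls above match the compress_pickle package's API; set_default_extension=False keeps the ".gzip" path exactly as written instead of swapping in the compressor's default extension. A quick round-trip with toy data:

from compress_pickle import dump, load

data = {"1.1.1.1": ["P00330", "P00331"]}  # toy mapping, not real output
dump(data, "demo.gzip", compression="gzip", set_default_extension=False)
assert load("demo.gzip", compression="gzip",
            set_default_extension=False) == data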
Example #4
def download_sequence_files(list_of_enzyme_commission_numbers,
                            list_of_databases, cpus):
    """ Builds the download work list for every (enzyme, database) pair
        and fetches the sequence files in parallel.
    """
    list_of_downloads = []
    for enzyme in list_of_enzyme_commission_numbers:
        for database in list_of_databases:
            list_of_files_not_found = open_data_not_found_file(
                enzyme, database)
            file_path = "data/raw/{}_{}_raw.txt".format(enzyme, database)
            sequence_folder_path = "data/downloads/{}_{}_downloaded.txt".format(
                enzyme, database)
            folder_management.create_folder(sequence_folder_path)
            data_list = process_file(enzyme, database, file_path,
                                     sequence_folder_path,
                                     list_of_files_not_found)
            list_of_downloads.extend(data_list)

    run_multiprocessing.run_mp(list_of_downloads, cpus, download_manager)
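
download_manager is another worker whose body is not shown. Example #2 feeds it (enzyme, database, data_path) tuples, and Example #5's docstring says the information files come from KEGG, so a rough sketch with urllib follows; the URL scheme is a placeholder assumption, not the project's real endpoint:

from urllib.request import urlretrieve

def download_manager(enzyme, database, data_path):
    # Hypothetical URL scheme; the real KEGG endpoint used by the
    # project is not shown in the snippets.
    url = "https://rest.kegg.jp/get/{}:{}".format(database, enzyme)
    target = "{}/{}_{}_raw.txt".format(data_path, enzyme, database)
    urlretrieve(url, target)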
Example #5
import os

def sequence_extractor_from_database(list_of_enzyme_commission_numbers,
                                     list_of_databases, cpus):
    """ Right now, it is only working with uniprot. TAKE CARE
        It parses UNIPROT database, files coming from sprot and trembl
        Limitation: those files are very large (~70GB)
        Tip: compressed pickles of dictionaries (1M each) have been built previously, by chomping the files
        How this works:
        1) It reads the information files, downloaded from KEGG, that are in HTML format
        2) Gets the ID of the enzymes, in this case UNIPROT ID, using multiprocessing
    """
    processed_results = parse_information_files(
        list_of_enzyme_commission_numbers, list_of_databases, cpus)
    single_uniprot_enzyme_dict = dict_of_uniprot_enzyme(processed_results)
    all_uniprot_sequences_set = get_set_of_uniprot_id(processed_results)  # not used below
    folder_path = "../data_databases/uniprot_all"
    work_list = []
    for n, filename in enumerate(os.listdir(folder_path), start=1):
        output_path = "data/enzymes_and_sequences/enzymes_and_sequences_{}.gzip".format(
            n)
        work_list.append((filename, n, folder_path, output_path,
                          single_uniprot_enzyme_dict))
    run_multiprocessing.run_mp(work_list, cpus, extractor)
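
extractor is the per-chunk worker and is also not shown. Assuming each file under uniprot_all is one of the pre-built compressed dictionary pickles mentioned in the docstring, mapping UniProt IDs to sequences, a minimal sketch:

import os
from compress_pickle import dump, load

def extractor(filename, n, folder_path, output_path,
              single_uniprot_enzyme_dict):
    # Assumed chunk layout: a {uniprot_id: sequence} dictionary; n is the
    # chunk index, unused in this sketch. Keep only the IDs tied to the
    # enzymes of interest and save the filtered chunk as its own
    # gzip-compressed pickle.
    chunk = load(os.path.join(folder_path, filename),
                 compression="gzip", set_default_extension=False)
    kept = {uid: seq for uid, seq in chunk.items()
            if uid in single_uniprot_enzyme_dict}
    dump(kept, output_path, compression="gzip", set_default_extension=False)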