def run(pathdict, s, logging):
    """Runner of the BLAST xml results parser.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and logfile.

    Saved Files and Figures
    -----------------------
    homol_df_orig_zip : zipfile
        Zipfile containing the following:
            homol_df_orig_pickle : pickled pd.DataFrame
                Dataframe containing all sequence extracted from the XML file.
                This can be large, as it contains the full query, markup
                and match sequences
    """
    # no accessions are pre-excluded here; an empty exclusion list is passed on
    acc_not_in_homol_db = []
    # when multiprocessing, worker processes log to the console only,
    # not to the shared logfile
    if s["use_multiprocessing"] == True:
        p_dict_logging = utils.Log_Only_To_Console()
    else:
        p_dict_logging = logging
    # build one protein dictionary per accession to be processed
    list_p = korbinian.utils.convert_summary_csv_to_input_list(s, pathdict, p_dict_logging, list_excluded_acc=acc_not_in_homol_db)
    logging.info("~~~~~~~~~~~~ starting parsing BLAST results ~~~~~~~~~~~~")
    if s["use_multiprocessing"]:
        # use the configured number of cores, but never more than there are proteins
        n_processes = min(s["multiprocessing_cores"], len(list_p))
        # scatter list_p across the worker pool and run in parallel
        with Pool(processes=n_processes) as pool:
            pool.map(parse_blast_result, list_p)
    else:
        # sequential execution
        for p in list_p:
            parse_blast_result(p)
    logging.info("\n" + "~~~~~~~~~~~~ finished parsing BLAST results ~~~~~~~~~~~~")
def run_slice_TMDs_from_homologues(pathdict, s, logging):
    """For a list of proteins, slice TMD sequences from homologues and count gaps.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and logfile.

    Saved Files and Figures
    -----------------------
    see slice_1_TMD_from_homol
    """
    logging.info('~~~~~~~~~~~~ starting run_slice_TMDs_from_homologues ~~~~~~~~~~~~')
    # worker processes must not write to the shared logfile, so during
    # multiprocessing they get a console-only logger
    if s["use_multiprocessing"] == True:
        p_dict_logging = utils.Log_Only_To_Console()
    else:
        p_dict_logging = logging
    # accessions that could not be downloaded are excluded immediately
    excluded_accs = utils.get_acc_list_from_txt(pathdict["acc_not_in_homol_db_txt"])
    # one input dictionary per protein to be processed
    list_p = korbinian.utils.convert_summary_csv_to_input_list(s, pathdict, p_dict_logging, list_excluded_acc=excluded_accs)
    if s["use_multiprocessing"]:
        # cap the worker count at the number of proteins
        n_processes = min(s["multiprocessing_cores"], len(list_p))
        with Pool(processes=n_processes) as pool:
            slice_list = pool.map(korbinian.cons_ratio.slice.slice_TMD_1_prot_from_homol, list_p)
        # log the list of protein results (e.g. acc, "simap", True) to the
        # actual logfile, not just the console
        logging.info("\nslice_list : {}".format(slice_list))
    else:
        for p in list_p:
            korbinian.cons_ratio.slice.slice_TMD_1_prot_from_homol(p)
    logging.info('\n~~~~~~~~~~~~ finished run_slice_TMDs_from_homologues ~~~~~~~~~~~~')
def run_create_fasta(pathdict, s, logging):
    """For each protein in the list, filter homologues and save them as fasta files.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and logfile.
    """
    logging.info('~~~~~~~~~~~~ starting filter_and_save_fasta ~~~~~~~~~~~~')
    # during multiprocessing the workers log to the console only
    if s["use_multiprocessing"] == True:
        p_dict_logging = utils.Log_Only_To_Console()
    else:
        p_dict_logging = logging
    # one input dictionary per protein to be processed
    list_p = korbinian.utils.convert_summary_csv_to_input_list(s, pathdict, p_dict_logging)
    if s["use_multiprocessing"]:
        # cap the worker count at the number of proteins
        n_processes = min(s["multiprocessing_cores"], len(list_p))
        with Pool(processes=n_processes) as pool:
            fasta_list = pool.map(korbinian.fasta.filter_and_save_fasta, list_p)
        # log the list of protein results (e.g. acc, "simap", True) to the
        # actual logfile, not just the console
        logging.info("fasta_list : {}".format(fasta_list))
    else:
        for p in list_p:
            korbinian.fasta.filter_and_save_fasta(p)
    logging.info('~~~~~~~~~~~~ filter_and_save_fasta is finished ~~~~~~~~~~~~')
def save_fastagap(pathdict, s, logging):
    """Runs fastagap_save for each protein, using multiprocessing Pool.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and/or logfile.
        If multiprocessing == True, logging.info etc will only print to console.
    """
    logging.info('~~~~~~~~~~~~ starting save_fastagap ~~~~~~~~~~~~')
    # console-only logger for worker processes when multiprocessing is used
    if s["use_multiprocessing"] == True:
        p_dict_logging = utils.Log_Only_To_Console()
    else:
        p_dict_logging = logging
    # one input dictionary per protein to be processed
    list_p = korbinian.utils.convert_summary_csv_to_input_list(s, pathdict, p_dict_logging)
    if s["use_multiprocessing"]:
        # cap the worker count at the number of proteins
        n_processes = min(s["multiprocessing_cores"], len(list_p))
        with Pool(processes=n_processes) as pool:
            fastagap_list = pool.map(korbinian.fastagap.fastagap_save, list_p)
        # log the per-protein results (e.g. acc, "simap", True) to the
        # actual logfile, not just the console
        for result in fastagap_list:
            logging.info(result)
    else:
        for p in list_p:
            korbinian.fastagap.fastagap_save(p)
    logging.info('~~~~~~~~~~~~ finished save_fastagap ~~~~~~~~~~~~')
def run_parse_simap_to_csv(pathdict, s, logging):
    """For a dataframe containing a list of proteins, for each protein parses the SIMAP XML file to a csv file.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and logfile.

    Saved Files and Figures
    -----------------------
    acc_not_in_homol_db_txt : txt
        List of uniprot accessions that are not in the homologue database (e.g. SIMAP)
        Any XML files with "Query failed: could not find the query sequence (check your query parameters)"
        will be added to this list

    For each protein (see parse_SIMAP_to_csv() below):
        homol_df_orig_zip : zipfile
            Zipfile containing the following:
                SIMAP_align_pretty_csv : csv
                    CSV file containing the hit_number protein description and the pretty alignment for each homologue
                homol_df_orig_pickle : pickled pd.DataFrame
                    Dataframe containing all sequence extracted from the XML file.
                    This can be large, as it contains the full query, markup and match sequences
    """
    logging.info('~~~~~~~~~~~~ starting parse_SIMAP_to_csv ~~~~~~~~~~~~')
    # accessions known to be missing from the homologue database are excluded up-front
    acc_not_in_homol_db = []
    if os.path.isfile(pathdict["acc_not_in_homol_db_txt"]):
        # extract accession numbers out of the file
        with open(pathdict["acc_not_in_homol_db_txt"], "r") as source:
            for line in source:
                acc = line.strip()
                # FIX: skip blank lines (e.g. the file's trailing newline), which
                # previously added empty strings to the exclusion list
                if acc:
                    acc_not_in_homol_db.append(acc)
    # if multiprocessing is used, log only to the console
    p_dict_logging = logging if s["use_multiprocessing"] != True else utils.Log_Only_To_Console()
    # create list of protein dictionaries to process
    list_p = korbinian.utils.convert_summary_csv_to_input_list(s, pathdict, p_dict_logging, list_excluded_acc=acc_not_in_homol_db)
    # number of processes is the number in the settings, or the number of proteins, whichever is smallest
    n_processes = s["multiprocessing_cores"] if s["multiprocessing_cores"] < len(list_p) else len(list_p)
    if s["use_multiprocessing"]:
        with Pool(processes=n_processes) as pool:
            parse_simap_list = pool.map(parse_SIMAP_to_csv, list_p)
            # log the list of protein results to the actual logfile, not just the console
            logging.info(parse_simap_list)
        try:
            # remove all the None values from the list
            # note that we don't know exactly how they get there, as all return statements should give a tuple
            parse_simap_list = list(filter(None.__ne__, parse_simap_list))
            # build a dataframe indexed by accession, with the per-protein
            # (finished, result) status returned by parse_SIMAP_to_csv
            df_parsed = pd.DataFrame(parse_simap_list)
            df_parsed.set_index(0, inplace=True)
            df_parsed.index.name = "acc"
            df_parsed.columns = ["finished", "result"]
            not_finished_df = df_parsed.loc[df_parsed.finished == False]
            finished_df = df_parsed.loc[df_parsed.finished == True]
            if not not_finished_df.empty:
                logging.info("\nparse_SIMAP_to_csv proteins not finished :\n\n{}\n".format(not_finished_df))
            if not finished_df.empty:
                logging.info("\nparse_SIMAP_to_csv proteins finished correctly:\n\n{}\n".format(finished_df))
            # proteins whose result string says they are missing from the database
            df_parsed["not_in_database"] = df_parsed.result.str.contains("not in simap database")
            new_acc_not_in_db_list = list(df_parsed.loc[df_parsed["not_in_database"]].index)
            # only record accessions not already present in the exclusion file
            new_acc_not_in_db_nr_set = set(new_acc_not_in_db_list) - set(acc_not_in_homol_db)
            # add accession numbers to the list of failed downloads
            with open(pathdict["acc_not_in_homol_db_txt"], "a") as source:
                for acc in new_acc_not_in_db_nr_set:
                    source.write("\n{}".format(acc))
        except (TypeError, IndexError, ValueError):
            # the pool results were not the expected list of (acc, finished, result)
            # tuples; log what was received and continue
            logging.info(parse_simap_list)
            sys.stdout.write("TypeError, IndexError, parse_simap_list is not a list of 3-item tuples for some reason.")
    else:
        for p in list_p:
            parse_SIMAP_to_csv(p)
    logging.info('\n~~~~~~~~~~~~ finished parse_SIMAP_to_csv ~~~~~~~~~~~~')