Example #1
def run(pathdict, s, logging):
    """ Runner of the BLAST xml results parser.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and logfile.

    Saved Files and Figures
    -----------------------
    homol_df_orig_zip : zipfile
        Zipfile containing the following:
            homol_df_orig_pickle : pickled pd.DataFrame
                Dataframe containing all sequences extracted from the XML file.
                This can be large, as it contains the full query, markup and match sequences.
    """
    # list of accessions to exclude (this runner reads no exclusion file, so the list stays empty)
    acc_not_in_homol_db = []
    # if multiprocessing is used, log only to the console
    p_dict_logging = logging if not s["use_multiprocessing"] else utils.Log_Only_To_Console()
    # create list of protein dictionaries to process
    list_p = korbinian.utils.convert_summary_csv_to_input_list(s, pathdict, p_dict_logging, list_excluded_acc=acc_not_in_homol_db)

    logging.info(
        "~~~~~~~~~~~~                 starting parsing BLAST results                 ~~~~~~~~~~~~"
    )

    # multiprocessing
    if s["use_multiprocessing"]:
        # number of processes is the number in the settings, or the number of proteins, whichever is smaller
        n_processes = min(s["multiprocessing_cores"], len(list_p))

        # scatter list_p to the worker processes and run in parallel
        with Pool(processes=n_processes) as pool:
            pool.map(parse_blast_result, list_p)
    # sequential execution
    else:
        for p in list_p:
            parse_blast_result(p)

    logging.info(
        "\n" +
        "~~~~~~~~~~~~                 finished parsing BLAST results                 ~~~~~~~~~~~~"
    )
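
Pool.map passes exactly one argument per task, so parse_blast_result must accept the whole protein dictionary; the comments in the later examples suggest each worker returns a small (acc, flag, message) tuple. The sketch below shows that interface, plus how the homol_df_orig_zip described in the docstring could later be read back. The dictionary keys ("acc", "logging") and the pickle member name are illustrative assumptions, not the actual layout produced by convert_summary_csv_to_input_list.

import pickle
import zipfile

def parse_blast_result_sketch(p):
    """Hypothetical worker skeleton: p is one protein dict from convert_summary_csv_to_input_list."""
    acc = p["acc"]         # assumed key: uniprot accession
    logger = p["logging"]  # assumed key: per-protein logger (console-only under multiprocessing)
    try:
        # ... parse the BLAST XML for this protein and save homol_df_orig_zip ...
        logger.info("{} parsed".format(acc))
        return acc, True, "parsed"
    except Exception as e:
        # never let one failed protein abort the whole pool.map
        return acc, False, repr(e)

def read_homol_df_orig_zip(zip_path):
    """Reads the pickled DataFrame back out of the saved zipfile (member name assumed)."""
    with zipfile.ZipFile(zip_path) as z:
        with z.open("homol_df_orig.pickle") as f:
            return pickle.load(f)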
Example #2
def run_slice_TMDs_from_homologues(pathdict, s, logging):
    """For a list of proteins, slice TMD sequences from homologues and count gaps.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and logfile.

    Saved Files and Figures
    -----------------------
    see slice_1_TMD_from_homol
    """

    logging.info(
        '~~~~~~~~~~~~                 starting run_slice_TMDs_from_homologues                ~~~~~~~~~~~~'
    )
    # if multiprocessing is used, log only to the console
    p_dict_logging = logging if not s["use_multiprocessing"] else utils.Log_Only_To_Console()
    # get list of accessions that could not be downloaded, and can immediately be excluded
    not_in_homol_db = utils.get_acc_list_from_txt(
        pathdict["acc_not_in_homol_db_txt"])
    # create list of protein dictionaries to process
    list_p = korbinian.utils.convert_summary_csv_to_input_list(
        s, pathdict, p_dict_logging, list_excluded_acc=not_in_homol_db)

    # number of processes is the number in the settings, or the number of proteins, whichever is smaller
    n_processes = min(s["multiprocessing_cores"], len(list_p))

    if s["use_multiprocessing"]:
        with Pool(processes=n_processes) as pool:
            slice_list = pool.map(korbinian.cons_ratio.slice.slice_TMD_1_prot_from_homol, list_p)
        # log the list of protein results (e.g. acc, "simap", True) to the actual logfile, not just the console
        logging.info("\nslice_list : {}".format(slice_list))
    else:
        for p in list_p:
            korbinian.cons_ratio.slice.slice_TMD_1_prot_from_homol(p)
    logging.info(
        '\n~~~~~~~~~~~~                 finished run_slice_TMDs_from_homologues                ~~~~~~~~~~~~'
    )
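
utils.get_acc_list_from_txt reads the same acc_not_in_homol_db_txt file that Example #5 builds by hand, so one accession per line is a safe assumption about its format. A minimal sketch of such a reader (an illustration, not the package's actual code):

import os

def get_acc_list_from_txt_sketch(txt_path):
    """Hypothetical reader: returns the accessions listed one per line, skipping blank lines."""
    if not os.path.isfile(txt_path):
        return []
    with open(txt_path) as f:
        return [line.strip() for line in f if line.strip()]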
Example #3
def run_create_fasta(pathdict, s, logging):
    """Runs filter_and_save_fasta for each protein in the list, using a multiprocessing Pool if enabled in the settings."""
    logging.info('~~~~~~~~~~~~         starting filter_and_save_fasta           ~~~~~~~~~~~~')
    # if multiprocessing is used, log only to the console
    p_dict_logging = logging if not s["use_multiprocessing"] else utils.Log_Only_To_Console()
    # create list of protein dictionaries to process
    list_p = korbinian.utils.convert_summary_csv_to_input_list(s, pathdict, p_dict_logging)
    # number of processes is the number in the settings, or the number of proteins, whichever is smaller
    n_processes = min(s["multiprocessing_cores"], len(list_p))

    if s["use_multiprocessing"]:
        with Pool(processes=n_processes) as pool:
            fasta_list = pool.map(korbinian.fasta.filter_and_save_fasta, list_p)
        # log the list of protein results (e.g. acc, "simap", True) to the actual logfile, not just the console
        logging.info("fasta_list : {}".format(fasta_list))
    else:
        for p in list_p:
            korbinian.fasta.filter_and_save_fasta(p)
    logging.info('~~~~~~~~~~~~       filter_and_save_fasta is finished          ~~~~~~~~~~~~')
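
Every runner in this section repeats the same dispatch skeleton: build list_p, then either pool.map or a plain loop. A hypothetical refactoring (not part of korbinian) that captures the pattern once:

from multiprocessing import Pool

def run_in_pool_or_serial(worker, list_p, s):
    """Hypothetical helper: maps worker over list_p, in parallel if the settings allow."""
    if s["use_multiprocessing"] and len(list_p) > 0:
        # never start more processes than there are proteins
        n_processes = min(s["multiprocessing_cores"], len(list_p))
        with Pool(processes=n_processes) as pool:
            return pool.map(worker, list_p)
    # sequential fallback; results are collected here too, so the caller can log them in either mode
    return [worker(p) for p in list_p]

Collecting results in the sequential branch (the runners above discard them there) is a deliberate choice: it lets the caller log the result list identically whether or not multiprocessing was used.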
Example #4
def save_fastagap(pathdict, s, logging):
    """Runs fastagap_save for each protein, using multiprocessing Pool.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and/or logfile.
        If use_multiprocessing is True, logging.info etc. will print only to the console.

    """

    logging.info(
        '~~~~~~~~~~~~           starting save_fastagap             ~~~~~~~~~~~~'
    )
    # if multiprocessing is used, log only to the console
    p_dict_logging = logging if not s["use_multiprocessing"] else utils.Log_Only_To_Console()
    # create list of protein dictionaries to process
    list_p = korbinian.utils.convert_summary_csv_to_input_list(
        s, pathdict, p_dict_logging)
    # number of processes is the number in the settings, or the number of proteins, whichever is smaller
    n_processes = min(s["multiprocessing_cores"], len(list_p))

    if s["use_multiprocessing"]:
        with Pool(processes=n_processes) as pool:
            fastagap_list = pool.map(korbinian.fastagap.fastagap_save, list_p)
            # log the list of protein results (e.g. acc, "simap", True) to the actual logfile, not just the console
            for item in fastagap_list:
                logging.info(item)
    else:
        for p in list_p:
            korbinian.fastagap.fastagap_save(p)
    logging.info(
        '~~~~~~~~~~~~           finished save_fastagap             ~~~~~~~~~~~~'
    )
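
utils.Log_Only_To_Console is swapped in because a logging.Logger carrying file handlers does not pickle cleanly into Pool workers, which is why the docstring warns that logging.info only reaches the console under multiprocessing. A guess at the minimal shape of such a stand-in (an assumption for illustration, not the package's actual class):

import sys

class LogOnlyToConsoleSketch(object):
    """Hypothetical console-only logger stand-in: picklable because it holds no handler state."""
    def info(self, message):
        sys.stdout.write("{}\n".format(message))
        sys.stdout.flush()
    # a console-only logger can route all levels through the same method
    warning = info
    error = info

Instances of a plain module-level class like this pickle without trouble, so each protein dict can carry one into a worker process.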
Example #5
def run_parse_simap_to_csv(pathdict, s, logging):
    """For a dataframe containing a list of proteins, for each protein parses the SIMAP XML file to a csv file.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and logfile.

    Saved Files and Figures
    -----------------------
    acc_not_in_homol_db_txt : txt
        List of uniprot accessions that are not in the homologue database (e.g. SIMAP).
        The accession of any XML file containing "Query failed: could not find the query sequence (check your query parameters)"
        will be added to this list.

    For each protein (see parse_SIMAP_to_csv() below):
        homol_df_orig_zip : zipfile
            Zipfile containing the following:
                SIMAP_align_pretty_csv : csv
                    CSV file containing the hit_number protein description and the pretty alignment for each homologue
                homol_df_orig_pickle : pickled pd.DataFrame
                    Dataframe containing all sequences extracted from the XML file.
                    This can be large, as it contains the full query, markup and match sequences.

    """
    logging.info('~~~~~~~~~~~~                       starting parse_SIMAP_to_csv                      ~~~~~~~~~~~~')
    acc_not_in_homol_db = []
    if os.path.isfile(pathdict["acc_not_in_homol_db_txt"]):
        # extract the previously failed accession numbers from the exclusion file
        with open(pathdict["acc_not_in_homol_db_txt"], "r") as source:
            acc_not_in_homol_db = [line.strip() for line in source]

    # if multiprocessing is used, log only to the console
    p_dict_logging = logging if not s["use_multiprocessing"] else utils.Log_Only_To_Console()
    # create list of protein dictionaries to process
    list_p = korbinian.utils.convert_summary_csv_to_input_list(s, pathdict, p_dict_logging, list_excluded_acc=acc_not_in_homol_db)

    # number of processes is the number in the settings, or the number of proteins, whichever is smaller
    n_processes = min(s["multiprocessing_cores"], len(list_p))

    if s["use_multiprocessing"]:
        with Pool(processes=n_processes) as pool:
            parse_simap_list = pool.map(parse_SIMAP_to_csv, list_p)
        # log the list of protein results to the actual logfile, not just the console
        logging.info(parse_simap_list)
        try:
            # remove all the None values from the list
            # note that we don't know exactly how they get there, as all return statements should give a tuple
            parse_simap_list = [tup for tup in parse_simap_list if tup is not None]
            df_parsed = pd.DataFrame(parse_simap_list)
            df_parsed.set_index(0, inplace=True)
            df_parsed.index.name = "acc"
            df_parsed.columns = ["finished", "result"]
            not_finished_df = df_parsed.loc[df_parsed.finished == False]
            finished_df = df_parsed.loc[df_parsed.finished == True]
            if not not_finished_df.empty:
                logging.info("\nparse_SIMAP_to_csv proteins not finished :\n\n{}\n".format(not_finished_df))
            if not finished_df.empty:
                logging.info("\nparse_SIMAP_to_csv proteins finished correctly:\n\n{}\n".format(finished_df))
            df_parsed["not_in_database"] = df_parsed.result.str.contains("not in simap database")
            new_acc_not_in_db_list = list(df_parsed.loc[df_parsed["not_in_database"]].index)
            new_acc_not_in_db_nr_set = set(new_acc_not_in_db_list) - set(acc_not_in_homol_db)
            # add accession number to the list of failed downloads
            with open(pathdict["acc_not_in_homol_db_txt"], "a") as target:
                for acc in new_acc_not_in_db_nr_set:
                    target.write("\n{}".format(acc))
        except (TypeError, IndexError, ValueError):
            logging.info(parse_simap_list)
            sys.stdout.write("TypeError, IndexError or ValueError: parse_simap_list is not a list of 3-item tuples for some reason.\n")
    else:
        for p in list_p:
            parse_SIMAP_to_csv(p)
    logging.info('\n~~~~~~~~~~~~                       finished parse_SIMAP_to_csv                      ~~~~~~~~~~~~')
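
The try-block above assumes each worker returned an (acc, finished, result_message) 3-tuple. A toy run of the same post-processing on made-up accessions shows the intermediate shapes:

import pandas as pd

# hypothetical worker results: (acc, finished, result_message)
parse_simap_list = [
    ("P12345", True, "parsed to csv"),
    ("Q67890", False, "acc not in simap database"),
    None,  # occasionally observed in practice; filtered out below, as in the runner
]
parse_simap_list = [tup for tup in parse_simap_list if tup is not None]
df_parsed = pd.DataFrame(parse_simap_list)
df_parsed.set_index(0, inplace=True)
df_parsed.index.name = "acc"
df_parsed.columns = ["finished", "result"]
# accessions flagged as missing from the database are the ones appended to acc_not_in_homol_db_txt
df_parsed["not_in_database"] = df_parsed.result.str.contains("not in simap database")
print(df_parsed)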