Example #1
def setup_error_logging(logfile, level_console="DEBUG", level_logfile="DEBUG"):
    """ Sets up error logging, and logs a number of system settings.

    Parameters
    ----------
    logfile : str
        Path to output logfile. If its size exceeds the maxBytes limit set below in the JSON settings, rotated backups (path.1, path.2, etc.) will be created.
    level_console : str
        Logging level for printing to console. DEBUG, WARNING or CRITICAL
    level_logfile : str
        Logging level for printing to logfile. DEBUG, WARNING or CRITICAL

    Returns
    -------
    logging : logging module
        The configured logging module, for printing to console and logfile
    """
    # load the log settings in json format
    logsettings = json.dumps({
        "handlers": {
            "console": {
                "formatter": "brief",
                "class": "logging.StreamHandler",
                "stream": "ext://sys.stdout",
                "level": "DEBUG"
            },
            "file": {
                "maxBytes": 10000000,
                "formatter": "precise",
                "backupCount": 3,
                "class": "logging.handlers.RotatingFileHandler",
                "level": "DEBUG",
                "filename": "logfile.txt"
            }
        },
        "version": 1,
        "root": {
            "handlers": [
                "console",
                "file"
            ],
            "propagate": "no",
            "level": "DEBUG"
        },
        "formatters": {
            "simple": {
                "format": "format=%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            },
            "precise": {
                "format": "%(asctime)s %(name)-15s %(levelname)-8s %(message)s"
            },
            "brief": {
                "format": "%(message)s"
            }
        }
    }, skipkeys=True, sort_keys=True, indent=4, separators=(',', ': '))

    config = json.loads(logsettings)
    # add user parameters to the logging settings (logfile, and logging levels)
    config['handlers']['file']['filename'] = logfile
    config['handlers']['console']['level'] = level_console
    config['handlers']['file']['level'] = level_logfile

    # create folder if necessary
    utils.make_sure_path_exists(logfile, isfile=True)
    #create a blank logging file
    with open(logfile, 'w') as f:
        pass

    #clear any previous logging handlers that might have been previously run in the console
    logging.getLogger('').handlers = []
    #load the logging settings from the modified json string
    logging.config.dictConfig(config)
    # collect a number of system settings that could be useful for troubleshooting
    system_settings_dict = {}
    system_settings_dict["system description"] = platform.uname()
    system_settings_dict["system"] = platform.system()
    system_settings_dict["architecture"] = platform.architecture()
    system_settings_dict["network_name"] = platform.node()
    system_settings_dict["release"] = platform.release()
    system_settings_dict["version"] = platform.version()
    system_settings_dict["machine"] = platform.machine()
    system_settings_dict["processor"] = platform.processor()
    system_settings_dict["python_version"] = platform.python_version()
    system_settings_dict["python_build"] = platform.python_build()
    system_settings_dict["python_compiler"] = platform.python_compiler()
    system_settings_dict["argv"] = sys.argv
    system_settings_dict["dirname(argv[0])"] = os.path.abspath(os.path.expanduser(os.path.dirname(sys.argv[0])))
    system_settings_dict["pwd"] = os.path.abspath(os.path.expanduser(os.path.curdir))
    system_settings_dict["total_ram"] = "{:0.2f} GB".format(psutil.virtual_memory()[0] / 1000000000)
    system_settings_dict["available_ram"] = "{:0.2f} GB ({}% used)".format(psutil.virtual_memory()[1] / 1000000000, psutil.virtual_memory()[2])
    # log the system settings
    logging.warning("system description : {}".format(system_settings_dict))
    #test error message reporting
    #logging.warning('LOGGING TEST:')
    #try:
    #    open('/path/to/does/not/exist', 'rb')
    #except (SystemExit, KeyboardInterrupt):
    #    raise
    #except Exception:
    #    logging.error('Failed to open file', exc_info=True)
    logging.warning('logging setup is successful (logging levels: console={}, logfile={}). \n'.format(level_console, level_logfile))
    return logging
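A minimal usage sketch (hedged; "my_logfile.txt" is a hypothetical path, and the json, logging, logging.config, platform, psutil, sys, os and utils imports used above are assumed to be available):

# hypothetical call: configure logging and get back the configured logging module
logging = setup_error_logging("my_logfile.txt", level_console="WARNING", level_logfile="DEBUG")
# with console level WARNING, DEBUG/INFO messages reach only the logfile
logging.info("written to the logfile only")
logging.warning("written to both console and logfile")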
Example #2
def slice_TMD_1_prot_from_homol(p):
    """ Slices TMDs from homologues, for a single protein in the list.

     - checks that the homol_df_orig_zip file exists with the full homologue sequences
     - if slice_juxtamembrane_regions is chosen, conducts JM slicing (currently not stable)
     - removes any old, existing fa_cr_sliced_TMDs_zip files
     - creates df_nonTMD_sliced Dataframe to hold the sliced nonTMD regions, based on the indices from all the regex searches
        - for each TMD:
             - identifies and slices out the TMD region from the query, markup and match from each SW alignment
             - the dataframe with sliced sequences for each TMD is added to the fa_cr_sliced_TMDs_zip as PROTEIN_NAME_TM01_sliced_df.pickle, PROTEIN_NAME_TM02_sliced_df.pickle, etc
             - adds the indices for each TMD to the df_nonTMD_sliced Dataframe
        - when all TMDs are finished:
            df_nonTMD_sliced uses the indices for each TMD to create the indices for the full nonTMD region (korbinian.cons_ratio.slice.slice_nonTMD_seqs)
            df_nonTMD_sliced contains nonTMD region, as one large slice of query, markup or match from alignment, pieces joined end-to-end
            df_nonTMD_sliced is saved in fa_cr_sliced_TMDs_zip as PROTEIN_NAME_nonTMD_sliced_df.pickle

    Parameters
    ----------
    p : dict
        Protein Dictionary. Contains all input settings, sequences and filepaths related to a single protein.
        Protein-specific data is extracted from one row of the list summary, e.g. List05_summary.csv, which is read as df.
        p also contains the GENERAL korbinian settings and filepaths for that list (pathdict, s, logging)

        Components
        ----------
        pathdict : dict
            Dictionary of the key paths and files associated with that List number.
        s : dict
            Settings dictionary extracted from excel settings file.
        logging : logging.Logger
            Logger for printing to console and/or logfile.
            If multiprocessing == True, logging.info etc will only print to console.
        p : protein-specific dictionary components
            acc, list_of_TMDs, description, TM01_seq, etc

    Dataframes
    ----------
    dfs
        Dataframe for Sequences
        index = hit_num
        columns = md5, FASTA_gapped_identity, hit_contains_SW_node, organism, X_in_match_seq, disallowed_words_not_in_descr, etc
    df_TMD
        Dataframe for 1 TMD, from 1 protein
        index = hit_num
        columns = 'organism', 'description', 'TM01_in_SW_alignment', 'TM01_start_in_SW_alignment', 'TM01_end_in_SW_alignment', 'TM01_SW_query_seq', 'TM01_SW_markup_seq', 'TM01_SW_match_seq', etc
    df_nonTMD_sliced
        Dataframe for all nonTMD region, from 1 protein
        index = hit_num
        columns = 'nested_tuple_indices_all_nonTMD_regions', 'nonTMD_markup', 'nonTMD_seq_match', 'len_query_align_seq', 'TM01_in_SW_alignment', 'TM01_start_in_SW_alignment', 'TM01_end_in_SW_alignment', 'TM02_in_SW_alignment',  etc

    Saved Files and Figures
    -----------------------
    fa_cr_sliced_TMDs_zip
        df_nonTMD_temp_pickle, e.g. A4ARX1_nonTMD_sliced_df.pickle
        TM_temp_pickle, e.g. A4ARX1_TM01_sliced_df.pickle

    Returns
    -------
    In all cases, a tuple (str, bool, str) is returned.

    if successful:
        return acc, True, "0"
    if not successful:
        return acc, False, "specific warning or reason why protein failed"
    """
    pathdict, s, logging = p["pathdict"], p["s"], p["logging"]
    acc = p["acc"]
    sys.stdout.write("{} ".format(acc))
    sys.stdout.flush()
    protein_name = p['protein_name']
    if not os.path.exists(p['homol_df_orig_zip']):
        warning = "{} Protein skipped. File does not exist".format(
            p['homol_df_orig_zip'])
        logging.info(warning)
        return acc, False, warning

    if utils.file_is_old(p['homol_df_orig_zip'],
                         s["oldest_acceptable_file_date"]):
        os.remove(p['homol_df_orig_zip'])
        message = "{} skipped, file is old and has been deleted".format(acc)
        logging.info(message)
        return acc, False, message

    dfs = utils.open_df_from_pickle_zip(p['homol_df_orig_zip'],
                                        delete_corrupt=True)
    if dfs.empty:
        warning = "{} Protein skipped, file deleted as it is possibly corrupt.".format(
            p['homol_df_orig_zip'])
        logging.info(warning)
        return acc, False, warning

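    # list_of_TMDs is stored in the summary csv as a stringified list (e.g. "['TM01', 'TM02']"); literal_eval converts it back to a Python list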
    list_of_TMDs = ast.literal_eval(p['list_of_TMDs'])

    # create a boolean "p_is_multipass" to show whether protein is multipass (>1 TMD) or singlepass (1 TMD)
    p_is_multipass = "TM02" in list_of_TMDs

    # create counter for number of TMDs with some homologue data
    n_TMDs_w_homol = 0
    fa_cr_sliced_TMDs_zip = p['fa_cr_sliced_TMDs_zip']
    if os.path.isfile(fa_cr_sliced_TMDs_zip):
        if s["overwrite_sliced_homologues"] == True:
            # delete any existing sliced zipfile
            os.remove(fa_cr_sliced_TMDs_zip)
        else:
            warning = "{} skipped, output from slice_TMD_1_prot_from_homol already exists".format(
                acc)
            logging.info(warning)
            # skip this protein
            return acc, False, warning

    utils.make_sure_path_exists(fa_cr_sliced_TMDs_zip, isfile=True)

    # open new zipfile (NOTE, it must be closed later!!)
    with zipfile.ZipFile(fa_cr_sliced_TMDs_zip,
                         mode="a",
                         compression=zipfile.ZIP_DEFLATED) as homol_sliced_zip:

        # get directory for zip (and other temp files to be transferred)
        homol_dir = os.path.dirname(fa_cr_sliced_TMDs_zip)
        # create a specific dataframe to hold the nonTMD region, including indices (True, start, end) of all the TMD segments
        if "len_query_align_seq" not in dfs.columns:
            warning = "{} len_query_align_seq not in columns, protein skipped for slice_TMD_1_prot_from_homol".format(
                acc)
            logging.warning(warning)
            #skip protein
            return acc, False, warning

        # add the FASTA_gapped_identity and length of the alignment sequence from dfs, to act as the "end" of all the nonTMD regions
        df_nonTMD_sliced = dfs[['len_query_align_seq',
                                'SW_query_coverage']].copy()

        # start with an empty dataframe, that will be replaced if there is any data to analyse
        df_TMD = pd.DataFrame()
        for TMD in list_of_TMDs:
            query_TMD_sequence = p['%s_seq' % TMD]
            if type(query_TMD_sequence) == float:
                warning = "{} {} query_TMD_sequence is np.nan! skipping protein.".format(
                    acc, TMD)
                logging.warning(warning)
                # skip protein
                return acc, False, warning
            ## SHOULD NOT BE NECESSARY. OMPdb DATABASE NOW FIXED TO AVOID NAN VALUES IN TM_SEQ
            # if isinstance(query_TMD_sequence, float):
            #     warning = "{} {} query_TMD_sequence is a float ({}), probably np.nan.".format(acc, TMD, query_TMD_sequence)
            #     logging.warning(warning)
            #     return acc, False, warning
            df_TMD = korbinian.cons_ratio.slice.slice_1_TMD_from_homol(
                acc, TMD, query_TMD_sequence, dfs, s, logging)
            if df_TMD.empty:
                warning = "{} {} df_TMD.empty, probably number_of_rows_containing_data == 0".format(
                    acc, TMD, query_TMD_sequence)
                logging.warning(warning)
                # skip TMD, as number_of_rows_containing_data == 0
                # here I really should skip the protein too. It's tempting to use goto :). "from goto import goto" (http://entrian.com/goto/)
                return acc, False, warning
            n_TMDs_w_homol += 1
            # transfer the columns with indices across to the df_nonTMD_sliced
            cols = [
                '%s_in_SW_alignment' % TMD,
                '%s_start_in_SW_alignment' % TMD,
                '%s_end_in_SW_alignment' % TMD
            ]
            for col in cols:
                df_nonTMD_sliced[col] = df_TMD[col]

            TM_temp_pickle = os.path.join(
                homol_dir, "{}_{}_sliced_df.pickle".format(protein_name, TMD))
            with open(TM_temp_pickle, "wb") as f:
                pickle.dump(df_TMD, f, protocol=pickle.HIGHEST_PROTOCOL)
            homol_sliced_zip.write(TM_temp_pickle,
                                   arcname=os.path.basename(TM_temp_pickle))
            os.remove(TM_temp_pickle)
            sys.stdout.write(".")
            sys.stdout.flush()

        if df_TMD.empty:
            # skip protein, as number_of_rows_containing_data == 0 for at least one TMD (or at least the last TMD)
            warning = "{} skipped, number_of_rows_containing_data == 0 for at least one TMD".format(
                acc)
            logging.info(warning)
            return acc, False, warning

        df_nonTMD_sliced = korbinian.cons_ratio.slice.slice_nonTMD_seqs(
            dfs, df_nonTMD_sliced, list_of_TMDs)
        if df_nonTMD_sliced.empty:
            warning = "{} df_nonTMD_sliced is empty, probably this means no homologues contain all TMDs".format(
                acc)
            logging.warning(warning)
            #skip protein
            return acc, False, warning

        if s["slice_juxtamembrane_regions"] == True:
            for TMD in list_of_TMDs:
                ########################################################################################
                #                                                                                      #
                #        Define juxtamembrane regions associated with each TMD  [AAIMON]               #
                #                                                                                      #
                ########################################################################################
                # convert the tuple of (True, 32, 53) into separate dataframes.
                # http://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe

                if p_is_multipass:
                    next_TMD = "TM{:02d}".format(int(TMD[2:]) + 1)
                    prev_TMD = "TM{:02d}".format(int(TMD[2:]) - 1)
                    #df_next_TMD = df_TMD = korbinian.cons_ratio.slice.slice_1_TMD_from_homol(acc, next_TMD, query_TMD_sequence, dfs, s, logging)
                    #if TMD != "TM01":
                    #    df_prev_TMD = df_TMD = korbinian.cons_ratio.slice.slice_1_TMD_from_homol(acc, prev_TMD, query_TMD_sequence, dfs, s, logging)

                last_TMD_of_acc = list_of_TMDs[-1]

                if TMD == "TM01":
                    # np.where syntax: np.where(boolean_query, value_if_query_true, value_if_query_false)
                    # @RJ, If TM01_start_in_SW_alignment is not an integer above 0, replaces with np.nan?
                    df_nonTMD_sliced['start_juxta_before_TM01'] = np.where(
                        df_nonTMD_sliced['TM01_start_in_SW_alignment'] > 0, 0,
                        np.nan)
                    # if the TM01_start_in_SW_alignment is 0, there is no JM region N-terminal to the TMD, therefore replace end_juxta_before_TM01 with np.nan, otherwise use TM01_start_in_SW_alignment
                    df_nonTMD_sliced['end_juxta_before_TM01'] = np.where(
                        df_nonTMD_sliced['TM01_start_in_SW_alignment'] == 0,
                        np.nan, df_nonTMD_sliced['TM01_start_in_SW_alignment'])
                    # set the start of the juxta as the end of the TMD
                    df_nonTMD_sliced[
                        'start_juxta_after_TM01'] = df_nonTMD_sliced[
                            'TM01_end_in_SW_alignment']
                    # if there is only one TMD (search for TM02 rather than measuring length of list, in case of signal peptides)
                    if p_is_multipass:
                        # open up the dataframes of the next and previous TMD
                        # define the end_juxta_after_TM01 as the TM01 end + half of the TM01_to_TM02 JM region
                        # NOTE, due to np.nan this is a float. will be converted to integers later
                        df_nonTMD_sliced[
                            'end_juxta_after_TM01'] = df_nonTMD_sliced[
                                "TM01_end_in_SW_alignment"] + (
                                    (df_nonTMD_sliced[
                                        "TM02_start_in_SW_alignment"] -
                                     df_nonTMD_sliced[
                                         "TM01_end_in_SW_alignment"]) / 2)

                        # RJ original
                        ## problem('df_nonTMD_sliced["TM02_start_in_SW_alignment"] cannot exist yet, because the script iterates through the TMDs one at a time')
                        # df_nonTMD_sliced['end_juxta_after_TM01'] = df_nonTMD_sliced["TM01_end_in_SW_alignment"] + ((df_nonTMD_sliced["TM02_start_in_SW_alignment"] - df_nonTMD_sliced["TM01_end_in_SW_alignment"]) / 2).apply(lambda x: int(x) if not np.isnan(x) else np.nan)

                        # RJ commented out
                        # df_nonTMD_sliced['seq_juxta_after_TM01_in_query'] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_after_TM01'].notnull()].apply(utils.slice_juxta_after_TMD_in_query, args = (TMD,), axis=1)
                        # df_nonTMD_sliced['seq_juxta_after_TM01_in_match'] = df_nonTMD_sliced[df_nonTMD_sliced['end_juxta_after_TM01'].notnull()].apply(utils.slice_juxta_after_TMD_in_match, args = (TMD,), axis=1)

                    else:
                        # if there is only one TMD, TM01 == last_TMD_of_acc
                        # @RJ replace with df_nonTMD_sliced['end_juxta_after_TM01'] = df_nonTMD_sliced['len_query_align_seq'] and use dropna to avoid nans later?
                        df_nonTMD_sliced['end_juxta_after_TM01'] = np.where(
                            utils.isNaN(
                                df_nonTMD_sliced['start_juxta_after_TM01']) ==
                            True, np.nan,
                            df_nonTMD_sliced['len_query_align_seq'])

                # the analysis is slow, so don't repeat TM01 if there is only one TM helix in the protein
                if p_is_multipass:
                    if not TMD == "TM01" and not TMD == last_TMD_of_acc:
                        df_nonTMD_sliced = juxta_function_1(
                            df_nonTMD_sliced, TMD)
                        # df_nonTMD_sliced['start_juxta_after_%s'%TMD] = np.where(utils.isNaN(df_nonTMD_sliced['TM%.2d_start_in_SW_alignment'%(int(TMD[2:])+1)])==True,np.nan,df_nonTMD_sliced['%s_end_in_SW_alignment'%TMD])
                        # df_nonTMD_sliced['end_juxta_before_%s'%TMD] = np.where(df_nonTMD_sliced["%s_start_in_SW_alignment"%TMD]!=0,df_nonTMD_sliced["%s_start_in_SW_alignment"%TMD],np.nan)
                        # df_nonTMD_sliced['end_juxta_after_%s'%TMD] = df_nonTMD_sliced["%s_end_in_SW_alignment"%TMD]+((df_nonTMD_sliced["TM%.2d_start_in_SW_alignment"%(int(TMD[2:])+1)]-df_nonTMD_sliced["%s_end_in_SW_alignment"%TMD])/2).apply(lambda x :int(x) if not np.isnan(x) else np.nan)
                        # df_nonTMD_sliced['start_juxta_before_%s'%TMD] = np.where(df_nonTMD_sliced["end_juxta_after_TM%.2d"%(int(TMD[2:])-1)] == df_nonTMD_sliced['end_juxta_before_%s'%TMD] ,df_nonTMD_sliced["end_juxta_after_TM%.2d"%(int(TMD[2:])-1)],df_nonTMD_sliced["end_juxta_after_TM%.2d"%(int(TMD[2:])-1)])
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_query'%TMD] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_after_%s'%TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_query, args = (TMD,), axis=1)
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_match'%TMD] = df_nonTMD_sliced[df_nonTMD_sliced['end_juxta_after_%s'%TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_match, args = (TMD,), axis=1)

                    if TMD == last_TMD_of_acc:
                        df_nonTMD_sliced['start_juxta_before_%s' %
                                         TMD] = df_nonTMD_sliced[
                                             'end_juxta_after_%s' % prev_TMD]
                        df_nonTMD_sliced['end_juxta_before_%s' %
                                         TMD] = df_nonTMD_sliced[
                                             '%s_start_in_SW_alignment' % TMD]
                        df_nonTMD_sliced[
                            'start_juxta_after_%s' % TMD] = np.where(
                                df_nonTMD_sliced['%s_end_in_SW_alignment' %
                                                 TMD] ==
                                df_nonTMD_sliced['len_query_align_seq'],
                                np.nan,
                                df_nonTMD_sliced['%s_end_in_SW_alignment' %
                                                 TMD])
                        df_nonTMD_sliced[
                            'end_juxta_after_%s' % TMD] = np.where(
                                utils.isNaN(
                                    df_nonTMD_sliced['start_juxta_after_%s' %
                                                     TMD]) == True, np.nan,
                                df_nonTMD_sliced['len_query_align_seq'])
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_query'%TMD] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_after_%s'%TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_query, args = (TMD,), axis=1)
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_query'%TMD] = df_nonTMD_sliced.query_align_seq[int(df_nonTMD_sliced['start_juxta_after_TM10']):int(df_nonTMD_sliced['end_juxta_after_TM10'])]
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_match'%TMD] =
                else:
                    # the end_juxta_after_TM01 is already defined, nothing else needs to be done for the single-pass proteins
                    pass

                last_TMD_of_acc = list_of_TMDs[-1]
                index_juxta = df_nonTMD_sliced['start_juxta_before_%s' %
                                               TMD].notnull().index
                q = np.array(dfs.loc[index_juxta, "query_align_seq"])
                st = np.array(df_nonTMD_sliced.loc[index_juxta,
                                                   'start_juxta_before_%s' %
                                                   TMD])
                st = st.astype(int)
                en = np.array(df_nonTMD_sliced.loc[index_juxta,
                                                   'end_juxta_before_%s' %
                                                   TMD])
                en = en.astype(int)
                q_sliced = [q[i][st[i]:en[i]] for i in range(len(q))]
                df_nonTMD_sliced['seq_juxta_before_%s_in_query' %
                                 TMD] = pd.Series(q_sliced, index=index_juxta)
                m = np.array(dfs.loc[index_juxta, "match_align_seq"])
                m_sliced = [m[i][st[i]:en[i]] for i in range(len(m))]
                df_nonTMD_sliced['seq_juxta_before_%s_in_match' %
                                 TMD] = pd.Series(m_sliced, index=index_juxta)

                #df_nonTMD_sliced['seq_juxta_before_%s_in_query' % TMD] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_before_%s' % TMD].notnull()].apply(utils.slice_juxta_before_TMD_in_query, args=(TMD,), axis=1)
                #df_nonTMD_sliced['seq_juxta_before_%s_in_match' % TMD] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_before_%s' % TMD].notnull()].apply(utils.slice_juxta_before_TMD_in_match, args=(TMD,), axis=1)
                if not TMD == last_TMD_of_acc:
                    index_juxta = df_nonTMD_sliced['end_juxta_after_%s' %
                                                   TMD].notnull().index
                    st = np.array(df_nonTMD_sliced.loc[index_juxta,
                                                       'start_juxta_after_%s' %
                                                       TMD])
                    st = st.astype(int)
                    en = np.array(df_nonTMD_sliced.loc[index_juxta,
                                                       'end_juxta_after_%s' %
                                                       TMD])
                    en = en.astype(int)
                    q_sliced = [q[i][st[i]:en[i]] for i in range(len(q))]
                    df_nonTMD_sliced['seq_juxta_after_%s_in_query' %
                                     TMD] = pd.Series(q_sliced,
                                                      index=index_juxta)
                    m_sliced = [m[i][st[i]:en[i]] for i in range(len(m))]
                    df_nonTMD_sliced['seq_juxta_after_%s_in_match' %
                                     TMD] = pd.Series(m_sliced,
                                                      index=index_juxta)
                    #df_nonTMD_sliced['seq_juxta_after_%s_in_query' % TMD] = df_nonTMD_sliced[df_nonTMD_sliced['end_juxta_after_%s' % TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_query, args=(TMD,), axis=1)
                    #df_nonTMD_sliced['seq_juxta_after_%s_in_match' % TMD] = df_nonTMD_sliced[df_nonTMD_sliced['end_juxta_after_%s' % TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_match, args=(TMD,), axis=1)
                else:
                    index_juxta = df_nonTMD_sliced['start_juxta_after_%s' %
                                                   TMD].notnull().index
                    st = np.array(df_nonTMD_sliced.loc[index_juxta,
                                                       'start_juxta_after_%s' %
                                                       TMD])
                    st = st.astype(int)
                    en = np.array(df_nonTMD_sliced.loc[index_juxta,
                                                       'end_juxta_after_%s' %
                                                       TMD])
                    en = en.astype(int)
                    q_sliced = [q[i][st[i]:en[i]] for i in range(len(q))]
                    df_nonTMD_sliced['seq_juxta_after_%s_in_query' %
                                     TMD] = pd.Series(q_sliced,
                                                      index=index_juxta)
                    m_sliced = [m[i][st[i]:en[i]] for i in range(len(m))]
                    df_nonTMD_sliced['seq_juxta_after_%s_in_match' %
                                     TMD] = pd.Series(m_sliced,
                                                      index=index_juxta)
                    # df_nonTMD_sliced['seq_juxta_after_%s_in_query' % TMD] = np.nan
                    # df_nonTMD_sliced['seq_juxta_after_%s_in_match' % TMD] = np.nan
                    # for hit in df_nonTMD_sliced.index:
                    #     if not utils.isNaN(df_nonTMD_sliced['start_juxta_after_%s' % TMD])[hit]:
                    #         # altered to .loc rather than ['seq_juxta_after_%s_in_match'%TMD][hit] after SettingWithCopyWarning
                    #         df_nonTMD_sliced.loc[hit, 'seq_juxta_after_%s_in_match' % TMD] = df_nonTMD_sliced.match_align_seq[hit][int(df_nonTMD_sliced.loc[hit, "start_juxta_after_%s" % TMD]):int(df_nonTMD_sliced.loc[hit, "end_juxta_after_%s" % TMD])]
                    #         df_nonTMD_sliced.loc[hit, 'seq_juxta_after_%s_in_query' % TMD] = df_nonTMD_sliced.query_align_seq[hit][int(df_nonTMD_sliced.loc[hit, "start_juxta_after_%s" % TMD]):int(df_nonTMD_sliced.loc[hit, "end_juxta_after_%s" % TMD])]

        df_nonTMD_temp_pickle = os.path.join(
            homol_dir, "{}_nonTMD_sliced_df.pickle".format(protein_name))
        with open(df_nonTMD_temp_pickle, "wb") as f:
            pickle.dump(df_nonTMD_sliced, f, protocol=pickle.HIGHEST_PROTOCOL)
        homol_sliced_zip.write(df_nonTMD_temp_pickle,
                               arcname=os.path.basename(df_nonTMD_temp_pickle))
        os.remove(df_nonTMD_temp_pickle)
        return acc, True, "0"
Example #3
def parse_SIMAP_to_csv(p):
    """ Parses the SIMAP XML file to csv for a single protein.

    Designed for use in multiprocessing, where logging.info will only print to the console, and the logfile will
    contain the messages in the return statements, telling if that protein was successful.

    Notes:
     - sdict is the dictionary with all the simap header info. It's not actually used anywhere further in the pipeline at the moment.


    Parameters
    ----------
    p : dict
        Protein Dictionary. Contains all input settings, sequences and filepaths related to a single protein.
        Protein-specific data is extracted from one row of the list summary, e.g. List05_summary.csv, which is read as df.
        p also contains the GENERAL korbinian settings and filepaths for that list (pathdict, s, logging)

        Components
        ----------
        pathdict : dict
            Dictionary of the key paths and files associated with that List number.
        s : dict
            Settings dictionary extracted from excel settings file.
        logging : logging.Logger
            Logger for printing to console and/or logfile.
            If multiprocessing == True, logging.info etc will only print to console.
        p : protein-specific dictionary components
            acc, list_of_TMDs, description, TM01_seq, etc

    Saved Files and Figures
    -----------------------
    homol_df_orig_zip : zipfile
        Zipfile containing the following:
            SIMAP_align_pretty_csv : csv
                CSV file containing the hit_number protein description and the pretty alignment for each homologue
            homol_df_orig_pickle : pickled pd.DataFrame
                Dataframe containing all sequence extracted from the XML file.
                This can be large, as it contains the full query, markup and match sequences

    Returns
    -------
    In all cases, a tuple (str, bool, str) is returned.
    if successful:
        return acc, True, "0"
    if not successful:
        return acc, False, "specific warning or reason why protein failed"
    """
    pathdict, s, logging = p["pathdict"], p["s"], p["logging"]
    acc = p["acc"]
    sys.stdout.write("{}, ".format(acc))
    sys.stdout.flush()
    protein_name = p['protein_name']

    # if overwrite_simap_parsed_to_csv is False, skip proteins where the homol_df_orig_zip file seems good
    if s["overwrite_simap_parsed_to_csv"] == False:
        if os.path.isfile(p['homol_df_orig_zip']):
            try:
                # open up the csv as a dataframe. Delete the zip file if a csv is not found.
                dfh_test = utils.open_df_from_pickle_zip(p['homol_df_orig_zip'], filename=os.path.basename(p['homol_df_orig_pickle']), delete_corrupt=True)
                description_of_first_hit = dfh_test.loc[1, 'description']
                logging.info('Protein %s: homologues already converted to csv. (%s)' % (p["acc"], description_of_first_hit))

                # The file seems fine. Skip to next protein.
                warning = "{} skipped, homologues already parsed to csv".format(p['protein_name'])
                logging.info(warning)
                return acc, False, warning

            except (EOFError, KeyError):
                # file may be corrupted, if script stopped unexpectedly before compression was finished
                logging.info('%s seems to be corrupted. File will be deleted and parsing from xml to csv repeated.' % p['homol_df_orig_zip'])
                os.remove(p['homol_df_orig_zip'])

    #set up counters
    number_of_hits_missing_protein_node = 0
    num_hits_with_SW_align_node = 0
    number_of_hits_missing_smithWatermanAlignment_node = 0
    ft_xml_path = p['SIMAP_feature_table_XML_path']
    homol_xml_path = p['SIMAP_homol_XML_path']
    SIMAP_tar = p['SIMAP_tar']
    homol_xml_filename = os.path.basename(homol_xml_path)

    #check which files exist
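    # utils.check_SIMAP_tarfile returns several flags; only the last element is used here, indicating whether the homologue XML is present in the tarball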
    homol_in_tar = utils.check_SIMAP_tarfile(SIMAP_tar, ft_xml_path, homol_xml_path, acc, logging, delete_corrupt=True)[-1]

    # NEW: the XML is parsed only if the homologue XML is in the tarball (feature tables are not necessary)
    if not homol_in_tar:
        warning = "{} skipped (no homologues)".format(p['protein_name'])
        logging.info(warning)
        return acc, False, warning

    # create subfolders, if they don't exist
    subfolder = os.path.dirname(p['homol_df_orig_zip'])
    utils.make_sure_path_exists(subfolder)

    #extract the tarfile so that it can be read as xml
    tar = tarfile.open(p['SIMAP_tar'], 'r:gz')

    SIMAP_homologues_XML_file_extracted = tar.extractfile(homol_xml_filename)
    try:
        # parse the XML file with ElementTree, and define the 'root' of the XML file
        simap_homologue_tree = ET.parse(SIMAP_homologues_XML_file_extracted)
        simap_homologue_root = simap_homologue_tree.getroot()
    except xml.etree.ElementTree.ParseError:
        # returns a tuple
        message = "{} contains xml file that gives a ParseError. " \
                  "In the future, file may be automatically deleted.".format(p['homol_df_orig_zip'])
        logging.info(message)
        return acc, False, message

    try:
        error = simap_homologue_root[0][0][1][0].text
        if "could not find the query sequence" in error:
            # returns a tuple
            message = "{} not in simap database".format(acc)
            logging.info(message)
            return acc, False, message
    except IndexError:
        # file is probably normal, as it doesn't contain the message saying that the protein is not found in the database
        pass

    # the sdict is the dictionary of info at top of SIMAP XML, before the matches start
    # it will be saved in a separate csv
    sdict = {}

    try:
        sdict['SIMAP_created'] = simap_homologue_root[0][0][0][0][2][1][0].attrib["created"]

        for parameters in simap_homologue_root[0][0][0][0].iter('parameters'):
            sdict['SIMAP_input_seq_details_dict'] = str(parameters[0][0].attrib)
            for SIMAP_filter in parameters.iter('filter'):
                SIMAP_filter_string = SIMAP_filter.text
            sdict['SIMAP_filter_string'] = str(SIMAP_filter_string)
            for resultSpecification in parameters.iter('resultSpecification'):
                SIMAP_resultSpecification_dict = resultSpecification.attrib
            sdict['SIMAP_resultSpecification_dict'] = '"%s"' % SIMAP_resultSpecification_dict
            for databases in parameters.iter('databases'):
                database_details_dict = databases[0].attrib
            sdict['database_details_dict'] = '"%s"' % database_details_dict
            sdict['simap_version'] = simap_homologue_root[0][0][0][0][0].attrib['version']
            sdict['SIMAP_total_hits'] = int(simap_homologue_root[0][0][0][1][0].attrib['total'])

        if sdict['simap_version'] != '4.0':
            logging.warning('WARNING! Your XML file is simap version %s, '
                            'however this SIMAP parser was developed for SIMAP version 4.0.' %
                            sdict['simap_version'])

        query_sequence_node = simap_homologue_root[0][0][0][0][2][0][0]
        ''' xxxx CURRENTLY THE df is filled with nan values,
            but that doesn't make sense as the script seems to work
        '''
        sdict['query_md5'] = query_sequence_node.attrib['md5']
        sdict['seqlen'] = int(query_sequence_node.attrib['length'])
        sdict['query_selfscore'] = query_sequence_node.attrib['selfscore']
        sdict['query_sequenceid'] = query_sequence_node.attrib['sequenceid']
        sdict['total_number_of_simap_hits'] = query_sequence_node[0].attrib['number_hits']
        sdict['query_sequence_from_homologue_XML_file'] = query_sequence_node[0][0].text
        sdict['number_of_hits_in_homologue_XML_file'] = int(simap_homologue_root[0][0][0][1][0].attrib['total'])
    except (IndexError, KeyError):
        warning = "{} skipped, homologue XML seems to be damaged. Error in reading general query details.".format(protein_name)
        logging.warning("{} skipped, homologue XML seems to be damaged. Error in reading general query details.".format(protein_name))
        # skip to the next protein
        return acc, False, warning

    if p['full_seq'].upper() != sdict['query_sequence_from_homologue_XML_file'].upper():

        logging.warning("...............................\n"
                        "{} WARNING: Mismatch between full_seq and SIMAP seq from XML file. Tarball with SIMAP XML is probably old and should be deleted.\n"
                        "full_seq : {}\n"
                        "XML_seq  : {}\n"
                        "Tarball  : {}\n"
                        "acc has been added to mismatch_full_seq_with_simap_txt\n"
                        "...............................\n".format(acc, p['full_seq'].upper(),sdict['query_sequence_from_homologue_XML_file'].upper(), p['SIMAP_tar']))
        # add accession number to the list of acc with a sequence mismatch
        mismatch_full_seq_with_simap_list = utils.get_acc_list_from_txt(pathdict["mismatch_full_seq_with_simap_txt"])
        if acc not in mismatch_full_seq_with_simap_list:
            with open(pathdict["mismatch_full_seq_with_simap_txt"], "a") as source:
                source.write("\n{}".format(acc))

    #for each hit, save all the relevant data in the form of a dictionary,
    # so it can be added to a csv file or used in other calculations
    simap_homologue_hits = simap_homologue_root[0][0][0][1][0]

    #see if there are any hits at all
    try:
        test2 = simap_homologue_root[0][0][0][1][0][0]
    except IndexError:
        warning = "{} skipped, homologue XML has no hits.".format(protein_name)
        logging.warning(warning)
        # skip to the next protein
        return acc, False, warning

    """OLD AMINO ACID SUBSTITUTION CODE. THIS IS SLOW, AND GIVES NO SIGNIFICANT DIFFERENCE TO
    AAIMON OR AASMON WITH THE SIMAP SMITH-WATERMAN MATRIX"""
    #load the amino acid substitution matrices from the settings file
    #list_of_aa_sub_matrices = s['aa_sub_matrices']
    #import the amino acid substitution matrices
    #utils.import_amino_acid_substitution_matrices()
    #add the similarity ratios to the csv_header_for_SIMAP_homologue_file.
    # These will depend on the individual settings
    #                    if s['["mp_calculate_TMD_conservation_with_aa_matrices']:
    #                        for j in range(s["gap_open_penalty_min"],
    #                                       s["gap_open_penalty_max"],
    #                                       s["gap_open_penalty_increment"]):
    #                            gap_open_penalty = j
    #                            gap_extension_penalty = j
    #                            for matrix_name in list_of_aa_sub_matrices:
    #                                column_name = 'sim_ratio_%s_gapo%i' % (matrix_name.replace("'", "")[0:-7], j)
    #                                csv_header_for_SIMAP_homologue_file.append(column_name)
    #import the necessary matrices
    #for matrix_name in list_of_aa_sub_matrices:
    #matrix = matrix_name[0:-7]
    #from Bio.SubsMat.MatrixInfo import matrix as matrix_name

    SIMAP_orig_csv = p['homol_df_orig_zip'][:-4] + ".csv"
    #fasta_file_path = p['fasta_file_path']

    #create an empty file
    open(SIMAP_orig_csv, 'w').close()

    #reopen to add match details iteratively from dictionary
    csvfile = open(SIMAP_orig_csv, 'a')

    #set up a bool to catch those files where not a single hit actually gives data
    at_least_one_hit_contains_SW_node = False

    for hit in simap_homologue_hits:
        match_details_dict = {}

        #add desired hit information to the dictionary for transfer to csv
        hit_num = int(hit.attrib['number'])
        match_details_dict['hit_num'] = hit_num
        match_details_dict['md5'] = hit[1].attrib['md5']

        #define the major nodes in the XML-file
        try:
            protein_node = hit[1][1]
            hit_contains_protein_node = True
        except IndexError:
            hit_contains_protein_node = False
            number_of_hits_missing_protein_node += 1
            logging.warning('%s hit %s contains no protein node' % (protein_name, match_details_dict['md5']))
        if not hit_contains_protein_node:
            #skip to next hit
            continue

        try:
            smithWatermanAlignment_node = hit[0][0][14]
            hit_contains_SW_node = True
            num_hits_with_SW_align_node += 1
        except IndexError:
            hit_contains_SW_node = False
        match_details_dict['hit_contains_SW_node'] = hit_contains_SW_node
        #add the description. Add a custom name if it is the first (query) hit
        if hit_num == 1:
            description = '%s_SIMAP_query_sequence' % protein_name
        else:
            description = protein_node.attrib['description']
        match_details_dict['description'] = description
        try:
            databaseId = int(protein_node[1].attrib['databaseId'])
            match_details_dict['databaseId'] = int(protein_node[1].attrib['databaseId'])
        except KeyError:
            databaseId = 0
            #match_details_dict['databaseId'] = int(0)
        #databaseId = int(protein_node[1].attrib['databaseId'])
        databasenode = protein_node[1]
        match_details_dict['database'] = databasenode.attrib['name']
        try:
            taxonomyNode = protein_node[2]
            match_details_dict['organism'] = taxonomyNode.attrib['name']
            match_details_dict['taxonomy_node_id'] = taxonomyNode.attrib['node_id']
            match_details_dict['taxonomy_rank'] = taxonomyNode.attrib['rank']
        except IndexError:
            #sequence is from an unknown organism, as it has no database node
            match_details_dict['description'] += ', no_database_node'
            match_details_dict['organism'] = 'no_database_node'
            match_details_dict['taxonomy_node_id'] = 'no_database_node'
            match_details_dict['taxonomy_rank'] = 'no_database_node'
        match_details_dict['len_full_match_seq'] = len(hit[1][0][0].text)
        #len_full_match_seq = len(full_match_seq)
        alignment_node = hit[0][0]
        #E-value for hit
        match_details_dict['FASTA_expectation'] = float(alignment_node[1].text)
        #convert identity from e.g. 80 (80%) to 0.8
        match_details_dict['FASTA_identity'] = float(alignment_node[3].text) / 100
        #strangely, I think gappedIdentity is the identity EXCLUDING gaps, which is a better value to base judgements on. convert identity from e.g. 80 (80%) to 0.8
        match_details_dict['FASTA_gapped_identity'] = float(alignment_node[4].text) / 100
        # creating the real observed changes from FASTA_gapped_identity - this is a percentage value now!!!
        match_details_dict['obs_changes'] = 100 - float(alignment_node[4].text)
        '''xxx notes on the gapped identity
        N.B The FASTA_gapped_identity data here is from the FASTA algorithm, that precedes the SW algorithm.
        Occasionally they don’t match!!!
        I calculate the TMD identity manually from the SW alignment, BUT
        currently for the calculation of membranous/nonmembranous I use the gappedIdentity from the FASTA output
        (the SW output only has identity including gaps)
        -    if I counted the gaps from the SW alignment, I COULD recalculate the gappedIdentity for the SW alignment
        -    OR: I could simply remove the data where the FASTA and SW don’t match.
        '''
        #FASTA overlap should be the length of the aligned region after running the FASTA algorithm (alignment is not shown by SIMAP)
        match_details_dict['FASTA_overlap'] = int(alignment_node[5].text)
        match_details_dict['FASTA_query_coverage'] = float(alignment_node[11].text)
        match_details_dict['FASTA_match_coverage'] = float(alignment_node[12].text)
        #find the start and the stop of the hsp
        querySeq = alignment_node[6]
        match_details_dict['FASTA_query_start'] = int(querySeq.attrib['start'])
        match_details_dict['FASTA_query_end'] = int(querySeq.attrib['end'])
        matchSeq = alignment_node[7]
        match_details_dict['FASTA_match_start'] = int(matchSeq.attrib['start'])
        match_details_dict['FASTA_match_end'] = int(matchSeq.attrib['end'])
        """OLD CALCULATIONS THAT ARE NOW CONVERTED TO PANDAS ARRAY-WISE FUNCTIONS"""
        #some parameters that are needed for identity calculations later
        #FASTA_num_ident_res = FASTA_identity / 100.0 * FASTA_overlap
        #is_start_of_TMD_in_FASTA = True if FASTA_query_start <= TMDstart else False
        #is_end_of_TMD_in_FASTA = True if TMDend <= FASTA_query_end else False
        #is_TMD_in_FASTA_alignment = True if all([is_start_of_TMD_in_FASTA, is_end_of_TMD_in_FASTA]) else False
        '''***********************if the TMD region is actually covered by the hsp, then conduct some further analyses of the match TMD region*************************'''
        if hit_contains_SW_node:
            query_align_seq = ''
            '''For the moment, there is no need to put the whole match hsp sequence into the csv file'''
            #for smithWatermanAlignment in alignment_node.iter('smithWatermanAlignment'):
            match_details_dict['SW_query_score_ratio'] = smithWatermanAlignment_node[0].text
            match_details_dict['SW_match_score_ratio'] = smithWatermanAlignment_node[1].text
            match_details_dict['SW_query_coverage'] = smithWatermanAlignment_node[2].text
            match_details_dict['SW_match_coverage'] = smithWatermanAlignment_node[3].text
            match_details_dict['SW_coverage_ratio'] = smithWatermanAlignment_node[4].text
            match_details_dict['align_pretty'] = smithWatermanAlignment_node[8].text
            match_details_dict['SW_alignment_seq1offset'] = int(smithWatermanAlignment_node.attrib['alignment-seq1offset'])
            match_details_dict['SW_alignment_seq2offset'] = int(smithWatermanAlignment_node.attrib['alignment-seq2offset'])
            match_details_dict['SW_identity'] = float(smithWatermanAlignment_node.attrib['identity'])
            match_details_dict['SW_similarity'] = float(smithWatermanAlignment_node.attrib['similarity'])
            #Get the full sequences. Note that they greatly increase the size of the csv file.
            match_details_dict['query_align_seq'] = smithWatermanAlignment_node[5].text
            match_details_dict['align_markup_seq'] = smithWatermanAlignment_node[6].text
            match_details_dict['match_align_seq'] = smithWatermanAlignment_node[7].text
        else:
            number_of_hits_missing_smithWatermanAlignment_node += 1
        if hit_num == 1:
            #sort
            csv_header_for_SIMAP_homologue_file = sorted(list(match_details_dict.keys()))
            #save the csv header to the csv file
            writer = csv.writer(csvfile, delimiter=',', quotechar='"', lineterminator='\n',quoting=csv.QUOTE_NONNUMERIC, doublequote=True)
            writer.writerow(csv_header_for_SIMAP_homologue_file)
        #save the match_details_dict as a line in the csv file
        writer = csv.DictWriter(csvfile, fieldnames=csv_header_for_SIMAP_homologue_file,
                                extrasaction='ignore', delimiter=',', quotechar='"',
                                lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC,
                                doublequote=True)
        writer.writerow(match_details_dict)
    # close tar and csv
    csvfile.close()
    tar.close()

    # open csv as a dataframe,
    df_homol = pd.read_csv(SIMAP_orig_csv, sep=",", quoting=csv.QUOTE_NONNUMERIC, index_col="hit_num")
    if "query_align_seq" not in df_homol.columns:
        # this is a serious error in the XML file: none of the hits contained a Smith-Waterman alignment node. The file should probably be re-downloaded.
        warning = 'The homologue XML file likely has a serious error, "query_align_seq" is not in dataframe. ' \
                  'XML should probably be re-downloaded.\n' \
                  'df_homol["hit_contains_SW_node"].value_counts()\n{}'.format(df_homol["hit_contains_SW_node"].value_counts())
        logging.warning(warning)
        # skip this protein
        return acc, False, warning
    # get length of seq. Previously this was a lambda function that needed more filtering
    df_homol['len_query_align_seq'] = df_homol['query_align_seq'].str.len()

    # conduct the text searching for disallowed words
    words_not_allowed_in_description = ast.literal_eval(s["words_not_allowed_in_description"])
    # collect disallowed words in hit protein description (patent, synthetic, etc)
    df_homol['list_disallowed_words_in_descr'] = df_homol['description'].dropna().apply(utils.find_disallowed_words, args=(words_not_allowed_in_description,))
    # create a boolean column to select hits that do not contain these words in the description
    df_homol['disallowed_words_not_in_descr'] = df_homol['list_disallowed_words_in_descr'] == '[]'
    # check if there are non-IUPAC amino acids in the sequence (frequently large gaps from NG sequencing data)
    df_homol['X_in_match_seq'] = df_homol['match_align_seq'].str.contains("X")

    # restrict to just a few columns including the align_pretty that might be useful to check manually
    df_pretty = df_homol[["FASTA_gapped_identity", "obs_changes", "organism", "description", "align_pretty"]]
    # save the align_pretty to csv
    df_pretty.to_csv(p['SIMAP_align_pretty_csv'], sep=',', quoting=csv.QUOTE_NONNUMERIC)
    # drop the align_pretty column from the orig dataframe
    df_homol.drop('align_pretty', axis=1, inplace=True)
    # save the whole dataframe as a pickle for faster opening later
    with open(p['homol_df_orig_pickle'], "wb") as pick:
        pickle.dump(df_homol, pick, protocol=pickle.HIGHEST_PROTOCOL)

    simap_header_info_ser = pd.Series(sdict)
    simap_header_info_ser.to_csv(p['simap_header_info_csv'])

    # create a new zip and add the files ("w" overwrites any existing zip; use "a" to append to an existing one)
    with zipfile.ZipFile(p['homol_df_orig_zip'], mode="w", compression=zipfile.ZIP_DEFLATED) as zipout:
        #zipout.write(SIMAP_orig_csv, arcname=os.path.basename(SIMAP_orig_csv))
        zipout.write(p['SIMAP_align_pretty_csv'], arcname=os.path.basename(p['SIMAP_align_pretty_csv']))
        zipout.write(p['homol_df_orig_pickle'], arcname=os.path.basename(p['homol_df_orig_pickle']))
        zipout.write(p['simap_header_info_csv'], arcname=os.path.basename(p['simap_header_info_csv']))

    # delete temporary uncompressed files
    os.remove(SIMAP_orig_csv)
    os.remove(p['SIMAP_align_pretty_csv'])
    os.remove(p['homol_df_orig_pickle'])
    os.remove(p['simap_header_info_csv'])
    return acc, True, "0"
Example #4
def get_omp_TM_indices_and_slice_from_summary_table(
        OMPdb_list_csv, list_parsed_csv, OMPdb_topology_reliability_cutoff,
        logging, s):
    """ Take a csv parsed from OMPdb, get the TM indices and slice the TMDs for each protein

    Parameters
    ----------
    OMPdb_list_csv : str
        Path to input csv with OMP sequences and membrane annotation
    list_parsed_csv : str
        Path to output csv with the sliced TM sequences
    OMPdb_topology_reliability_cutoff : float
        Minimum OMPdb topology reliability; proteins with Topology_Reli at or below this cutoff are dropped.
    logging : logging.Logger
        Logger for printing to console and logfile.
    s : dict
        Settings dictionary extracted from excel settings file.
    """
    logging.info(
        '~~~~~~~~~starting get_omp_TM_indices_and_slice_from_summary_table~~~~~~~~~'
    )
    df_KW = pd.read_csv(OMPdb_list_csv,
                        sep=",",
                        quoting=csv.QUOTE_NONNUMERIC,
                        index_col=0)

    # check whether signal peptides should be analysed
    analyse_SiPe = 'SiPe' in s['regions']

    # get sequence length
    df_KW["seqlen"] = df_KW["Sequence"].str.len()

    # Creating new column M_indices, which contains the indices of the 'M' (membrane) positions in the Topology string
    df_KW["M_indices"] = df_KW.Topology.apply(getting_membrane_indices)

    # Converting empty entries to NaN
    df_KW["M_indices"] = df_KW.M_indices.apply(lambda x: np.nan
                                               if x == [] else x)

    num_proteins_BEFORE_dropping_those_without_mem_indices = df_KW.shape[0]

    # Extracting entries to a new Dataframe
    df_KW = df_KW[df_KW.M_indices.notnull()]

    num_proteins_AFTER_dropping_those_without_mem_indices = df_KW.shape[0]

    # Filter: cut off entries with Coverage(%) under 85%
    df_KW = df_KW.loc[df_KW["Coverage(%)"] >= 85]

    # df_KW.index = range(1,len(df_KW["Uniprot"])+1)
    num_proteins_AFTER_dropping_those_with_coverage_below_85 = df_KW.shape[0]

    # Creating new list (nested list)
    nested_list_of_membrane_borders = []

    # Filling nest with lists of start and end-points
    for n in df_KW.M_indices:
        m_borders = []
        m_borders.append(n[0])
        m_borders = check_for_border(n, m_borders)
        m_borders.append(n[-1])
        nested_list_of_membrane_borders.append(m_borders)

    array_membrane_borders = np.array(nested_list_of_membrane_borders)
    array_membrane_borders_corrected = []
    for subarray in array_membrane_borders:
        # logging.info(subarray[::2] = subarray[::2]*10)
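        # add 1 to every second element (the end indices) so the borders follow Python slice convention (end index exclusive)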
        subarray = np.array(subarray)
        subarray[1::2] = subarray[1::2] + 1
        array_membrane_borders_corrected.append(list(subarray))

    nested_list_of_membrane_borders_python_indexstyle = array_membrane_borders_corrected

    # Creating new column, which contains start and end-points
    df_KW[
        "Membrane_Borders"] = nested_list_of_membrane_borders_python_indexstyle

    # Creating new column, which contains the number of TMDs
    df_KW["number_of_TMDs"] = df_KW.Membrane_Borders.apply(
        lambda x: len(x) / 2)

    # Filter, filters out, if less than 8 or more than 24 TMDs
    # REMOVED. FILTERING BY NUMBER OF TMDS IS NOW DONE LATER, in PROT_LIST
    #df_KW["number_of_TMDs"] = df_KW["number_of_TMDs"].apply(lambda x: int(x) if 5 <= x <= 36 else np.nan)

    # Creating new dataframe without nan
    df_KW = df_KW[df_KW["number_of_TMDs"].notnull()]

    num_proteins_AFTER_dropping_those_without_TMDs = df_KW.shape[0]

    df_KW = df_KW[df_KW["Topology_Reli"] > OMPdb_topology_reliability_cutoff]

    num_proteins_AFTER_dropping_those_with_topology_reliability_below_cutoff = df_KW.shape[
        0]

    df_KW["TM_indices"] = df_KW["Membrane_Borders"].apply(
        lambda x: tuple(zip(x[::2], x[1::2])))
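    # TM_indices now pairs the flat Membrane_Borders list into (start, end) tuples, e.g. [3, 9, 25, 33] -> ((3, 9), (25, 33))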

    # create a long list of TMD names [TM01, TM02, TM03, ... TM49]
    long_list_of_TMDs = []
    for i in range(1, 50):
        long_list_of_TMDs.append("TM{:02d}".format(i))

    # the list_of_TMDs column will hold Python lists, so give it dtype object
    df_KW["list_of_TMDs"] = ""
    df_KW["list_of_TMDs"] = df_KW["list_of_TMDs"].astype(object)

    sys.stdout.write('slicing TMD and nonTMD sequences:\n')

    for row_nr, row in enumerate(df_KW.index):
        # get nested tuple of TMDs
        nested_tup_TMs = df_KW.loc[row, "TM_indices"]
        # slice long list of TMD names to get an appropriate list for that protein [TM01, TM02, TM03, etc.
        len_nested_tup_TMs = len(nested_tup_TMs)
        list_of_TMDs = long_list_of_TMDs[:len_nested_tup_TMs]
        # add that list to the dataframe (could also be added as a stringlist, but that's irritating somehow)
        df_KW.at[row, "list_of_TMDs"] = list_of_TMDs
        # set seq for slicing
        full_seq = df_KW.loc[row, "Sequence"]
        # topology = df_KW.loc[row, "Topology"]
        # iterate through all the TMDs of that protein, slicing out the sequences
        for i in range(len(list_of_TMDs)):
            TMD = list_of_TMDs[i]
            tup = nested_tup_TMs[i]
            df_KW.loc[row, TMD + "_start"] = tup[0]
            df_KW.loc[row, TMD + "_end"] = tup[1]
            df_KW.loc[row,
                      TMD + "_seq"] = utils.slice_with_listlike(full_seq, tup)
            # df_KW.loc[row, TMD + "_top"] = utils.slice_with_listlike(topology, tup)
        if row_nr % 50 == 0:
            sys.stdout.write(". ")
            sys.stdout.flush()
            if row_nr % 500 == 0:
                sys.stdout.write("\n")
                sys.stdout.flush()
        ''' ~~   SLICE nonTMD sequence  ~~ '''
        #list_of_TMDs = df_KW.loc[row, 'list_of_TMDs'].copy()
        if 'SP01' in list_of_TMDs:
            list_of_TMDs.remove('SP01')
        # sequence from N-term. to first TMD
        nonTMD_first = df_KW.loc[row, 'Sequence'][0:(df_KW.loc[row, 'TM01_start'] - 1).astype('int64')]
        sequence = nonTMD_first
        # only for multipass proteins, generate sequences between TMDs
        if len(list_of_TMDs) == 0:
            # no TMDs are annotated, skip to next protein
            continue
        elif len(list_of_TMDs) > 1:
            for TM_Nr in range(len(list_of_TMDs) - 1):
                # the TMD is the equivalent item in the list
                TMD = list_of_TMDs[TM_Nr]
                # the next TMD, which contains the end index, is the next item in the list
                next_TMD = list_of_TMDs[TM_Nr + 1]
                # slice from the end of this TMD to one residue before the start of the next TMD
                TMD_end_index = df_KW.loc[row, '%s_end' % TMD].astype('int64')
                next_TMD_start_index = df_KW.loc[row, '%s_start' % next_TMD].astype('int64') - 1
                between_TM_and_TMplus1 = df_KW.loc[row, 'Sequence'][TMD_end_index:next_TMD_start_index]
                sequence += between_TM_and_TMplus1
        last_TMD = list_of_TMDs[-1]
        # sequence from last TMD to C-term.
        nonTMD_last = df_KW.loc[row, 'Sequence'][df_KW.loc[row, '%s_end' % last_TMD].astype('int64'):df_KW.loc[row, 'seqlen']]
        sequence += nonTMD_last
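        # sequence now holds all nonTMD segments concatenated: N-terminus to TM01, the loops
        # between consecutive TMDs, and the stretch after the last TMD to the C-terminus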
        df_KW.loc[row, 'nonTMD_seq'] = sequence
        df_KW.loc[row, 'len_nonTMD'] = len(sequence)

        if analyse_SiPe == True:
            if pd.notnull(df_KW.loc[row, 'SP01_start']):
                list_of_TMDs.append('SP01')
                df_KW.set_value(row, "list_of_TMDs", list_of_TMDs)

    ########################################################################################
    #                                                                                      #
    #                slicing out TMD_seq_plus_surr shifted to prot_list.py                 #
    #                                                                                      #
    ########################################################################################
    # max_num_TMDs = df_KW["number_of_TMDs"].max()
    #
    # # n_aa_before_tmd = s["n_aa_before_tmd"]
    # # n_aa_after_tmd = s["n_aa_after_tmd"]
    # n_aa_before_tmd = 10
    # n_aa_after_tmd = 10
    #
    # # currently the loop is run for each TMD, based on the sequence with the most TMDs
    # for i in range(1, int(max_num_TMDs) + 1):
    #     TMD = 'TM%02d' % i
    #     # get the indices for TMD plus surrounding sequence
    #     df_KW = korbinian.prot_list.prot_list.get_indices_TMD_plus_surr_for_summary_file(df_KW, TMD, n_aa_before_tmd, n_aa_after_tmd)
    #     # slice out the TMD_seq_plus_surr for each TMD
    #     df_KW['%s_seq_plus_surr' % TMD] = df_KW[df_KW['%s_start' % TMD].notnull()].apply(utils.slice_uniprot_TMD_plus_surr_seq, args=(TMD,), axis=1)

    # rename columns to match protein lists from uniprot (Note that Family is currently translated as prot_descr)
    dict_ = {
        "Sequence": "full_seq",
        "Organism": "organism",
        "Uniprot": "uniprot_acc",
        "Gene_Name": "gene_name",
        "Topology_Reli": "topology_reliability",
        "Family": "prot_descr"
    }
    df_KW["betabarrel"] = True
    df_KW["multipass"] = True
    df_KW["singlepass"] = False
    # since all beta-barrel proteins have the N-terminus in the periplasm, "N-term is Extracellular" is False
    # you could make 100% sure of this by checking that the first letter of "Topology" is "I", but it is not really necessary
    df_KW["n_term_ec"] = False
    df_KW.rename(columns=dict_, inplace=True)
    df_KW["acc"] = df_KW["uniprot_acc"]
    df_KW["protein_name"] = df_KW["uniprot_acc"]
    num_proteins_AFTER_get_omp_TM_indices_and_slice_from_summary_table = df_KW.shape[
        0]

    # save to csv (presumably in summaries folder as a list number, so it is accessible by the rest of the scripts)
    utils.make_sure_path_exists(list_parsed_csv, isfile=True)
    df_KW.to_csv(list_parsed_csv, sep=",", quoting=csv.QUOTE_NONNUMERIC)

    logging.info(
        "\nnum_proteins_BEFORE_dropping_those_without_mem_indices : {}".format(
            num_proteins_BEFORE_dropping_those_without_mem_indices))
    logging.info(
        "num_proteins_AFTER_dropping_those_without_mem_indices : {}".format(
            num_proteins_AFTER_dropping_those_without_mem_indices))
    logging.info(
        "num_proteins_AFTER_dropping_those_with_coverage_below_85 : {}".format(
            num_proteins_AFTER_dropping_those_with_coverage_below_85))
    logging.info("num_proteins_AFTER_dropping_those_without_TMDs : {}".format(
        num_proteins_AFTER_dropping_those_without_TMDs))
    logging.info(
        "num_proteins_AFTER_dropping_those_with_topology_reliability_below_cutoff : {}"
        .format(
            num_proteins_AFTER_dropping_those_with_topology_reliability_below_cutoff
        ))
    logging.info(
        "num_proteins_AFTER_get_omp_TM_indices_and_slice_from_summary_table : {}"
        .format(
            num_proteins_AFTER_get_omp_TM_indices_and_slice_from_summary_table
        ))
    logging.info(
        '~~~~~~~~~finished get_omp_TM_indices_and_slice_from_summary_table~~~~~~~~~'
    )
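
The heart of the slicing above is converting the flat Membrane_Borders list into (start, end) tuples and then cutting each TMD out of the full sequence. Below is a minimal, self-contained sketch of that idea; the toy sequence and border values are invented, and a plain slice stands in for utils.slice_with_listlike:

# minimal sketch: pair up borders and slice out TMD sequences (toy data, not from OMPdb)
full_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQ"
membrane_borders = [3, 10, 20, 28, 40, 48]   # start1, end1, start2, end2, ... (python slice style)

tm_indices = tuple(zip(membrane_borders[::2], membrane_borders[1::2]))
list_of_tmds = ["TM{:02d}".format(i + 1) for i in range(len(tm_indices))]

for tmd, (start, end) in zip(list_of_tmds, tm_indices):
    print(tmd, start, end, full_seq[start:end])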
Example #5
0
def parse_OMPdb_all_selected_to_csv(ListXX_OMPdb_nr_acc,
                                    ListXX_OMPdb_redundant_flatfile,
                                    OMPdb_list_csv, logging, s):
    """ Extracts ID, seq and topology data from the full OMPdb flatfile, saves to csv.

    Note that instead of parsing line-by-line and saving to a csv, this method stores every single value in a huge dictionary, which for 3
    proteins looks like this:

    BB_SiPe                                        [True, True, False]
    Coverage(%)                                  [99.82, 96.61, 92.19]
    Description      [Pilin outer membrane usher protein SafC, Oute...
    NCBI_TaxID                                     [59201, 470, 59201]
    Organism         [Salmonella enterica I, Acinetobacter baumanni...
    SP01_start                                          [1, 1, np.nan]
    Sequence         [MKFKQPALLLFIAGVVHCANAHTYTFDASMLGDAAKGVDMSLFNQ...
    Topology         [IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
    Topology_Reli                                [84.13, 94.00, 93.07]
    Uniprot                       [A0A0F7J6A4, A0A090B0L0, A0A0F7J6D5]
    seqlen                                             [836, 356, 721]

    The size of this dictionary with >7000 entries may cause memory problems on a regular PC.
    Currently this method is functional, though, and is not a priority to be fixed.

    Parameters
    ----------
    ListXX_OMPdb_nr_acc : str
        Path to OMPdb list of non-redundant IDs, textfile.
    ListXX_OMPdb_redundant_flatfile : str
        Path to full OMPdb flatfile, all proteins, unzipped.
    OMPdb_list_csv : str
        Path to output csv file.
    logging : logging.Logger
        Logger for printing to console and logfile.
    s : dict
        Settings dictionary extracted from excel settings file.

    Saved Files
    -----------
    OMPdb_list_csv : csv
        csv file derived from dfKW
        contains a row for each protein
        contains indices for TM regions

    Notes
    -----
    This script parses the original text file rather than XML. The version on the server was not working due to a quote-character (" vs ') error.
    The script works, but goes through an entire flatfile, so is unbelievably slow. Use at your own risk.
    """

    # Creating dictionary keywords
    keywords = {
        "Uniprot": [],
        "Family": [],
        "Gene_Name": [],
        "Organism": [],
        "NCBI_TaxID": [],
        "Coverage(%)": [],
        "Sequence": [],
        "Topology_Reli": [],
        "Topology": [],
        "Description": [],
        "Pfam_ID": [],
        "BB_SiPe": [],
        "seqlen": [],
        "fragment": []
    }

    # check if signal peptides should be added to the list_of_TMDs and analysed
    # signal peptides will still be detected, via "True" in BB_SiPe. This is useful for excluding potential TM01 mis-labelled as SP.
    analyse_SiPe = False
    if 'SiPe' in s['regions']:
        analyse_SiPe = True
        keywords.update({
            "SP01_start": [],
            "SP01_end": [],
            "SP01_seq": [],
            "SiPe_source": []
        })

    logging.info('analyse_SiPe: {}'.format(analyse_SiPe))

    # Start settings which are changed during the for loop
    take_next_seq = False
    take_next_topo = False
    take_ID = False

    # Empty lists which are filled during the for loop
    Raw_Sequences = []
    Raw_Topos = []
    ID_list = []

    # Extracts IDs out of file
    with open(ListXX_OMPdb_nr_acc) as source:
        for line in source:
            line = line.strip()
            ID_list.append(line)

    # Check ListXX_OMPdb_redundant_flatfile (the complete OMPdb, in very unfriendly formatting) for the IDs stored in ID_list, and extract the relevant information
    with open(ListXX_OMPdb_redundant_flatfile) as data_file:
        counter = 0
        db_cross_ref = {}
        save_db_cross_ref = False
        for line in data_file:
            line_list = line.strip().split(" ")
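            # e.g. a hypothetical flatfile line "UNIPROT            A0A0F7J6A4" splits into a line_list
            # whose keyword decides the field and whose last element holds the value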
            # Further settings which are changed every loop
            sequence_header = False
            topo_header = False
            # The if-conditions make sure that the ID is in the list of potential IDs, and check each line for keywords
            if "UNIPROT" in line_list and line_list[-1] in ID_list:
                keywords["Uniprot"].append(line_list[-1])
                take_ID = True
                counter += 1
                if counter % 100 == 0:
                    sys.stdout.write(". ")
                    sys.stdout.flush()
            if "FAMILY" in line_list and take_ID == True:
                keywords["Family"].append(" ".join(line_list[9:]))
            if "DESCRIPTION" in line_list and take_ID == True:
                keywords["Description"].append(" ".join(line_list[4:]))
            if "GENE_NAME" in line_list and take_ID == True:
                keywords["Gene_Name"].append(" ".join(line_list[6:]))
            if "ORGANISM" in line_list and take_ID == True:
                keywords["Organism"].append(" ".join(line_list[7:]))
            if "NCBI_TAXID" in line_list and take_ID == True:
                keywords["NCBI_TaxID"].append(line_list[-1])
            if "DB_REF" in line_list and take_ID == True:
                # add database cross references to special dict
                db_cross_ref.update(
                    {line_list[9][:-1]: line_list[10].split('|')})
            if "SIGNAL_PEPTIDE" in line_list and take_ID == True:
                if ' '.join(line_list[1:]) == 'No information available':
                    keywords["BB_SiPe"].append(False)
                    if analyse_SiPe == True:
                        keywords["SP01_start"].append(np.nan)
                        keywords["SP01_end"].append(np.nan)
                        keywords["SP01_seq"].append(np.nan)
                        keywords["SiPe_source"].append(
                            'No information available')
                else:
                    # assume there is a signal peptide that starts at 1 (not optimal code!)
                    keywords["BB_SiPe"].append(True)
                    if analyse_SiPe == True:
                        keywords["SP01_start"].append(line_list[1][0])
                        keywords["SP01_end"].append(line_list[1][2:-1])
                        keywords["SP01_seq"].append(line_list[2][:-1])
                        keywords["SiPe_source"].append(' '.join(
                            line_list[-2:]))

            if "COVERAGE(%)" in line_list and take_ID == True:
                keywords["Coverage(%)"].append(line_list[-1])
            if "SEQUENCE" in line_list and take_ID == True:
                # after the "SEQUENCE" statement in a line_list, all db cross references are collected and can be saved
                save_db_cross_ref = True
                keywords["seqlen"].append(line_list[7])
                take_next_seq = True
                sequence_header = True
                # some of the OMPdb entries are labeled as fragments. These should be removed.
                if "Fragment:" in line:
                    searchstring = r".*Fragment:([NC])"
                    match = re.match(searchstring, line)
                    if match:
                        N_or_C = match.group(1)
                    else:
                        N_or_C = "undefined"
                    keywords["fragment"].append("{}-term".format(N_or_C))
                else:
                    keywords["fragment"].append("undefined")
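            # (the "fragment" column therefore holds "N-term", "C-term", "undefined-term" or "undefined")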
            # add db cross references from previous protein to keywords dict
            if save_db_cross_ref == True:
                if "Pfam" in db_cross_ref.keys():
                    keywords["Pfam_ID"].append(db_cross_ref["Pfam"])
                else:
                    keywords["Pfam_ID"].append(np.nan)
                # reset db_cross_ref for next cycle
                save_db_cross_ref = False
                db_cross_ref = {}
            if "TOPOLOGY" in line_list and take_ID == True:
                Raw_Sequences.extend(";")
                keywords["Topology_Reli"].append(
                    line_list[-1].strip('"').strip("%"))
                take_next_seq = False
                take_next_topo = True
                topo_header = True
            if take_next_seq == True and sequence_header != True and take_ID == True:
                Raw_Sequences.extend(line_list)
            if "//" in line_list and take_ID == True:
                Raw_Topos.extend(";")
                topo_header = False
                take_next_topo = False
                take_ID = False
            if take_next_topo == True and topo_header != True:
                Raw_Topos.extend(line_list)
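        # the ";" markers added above separate one protein's chunks from the next, so joining and
        # splitting on ";" rebuilds one sequence/topology string per protein (plus a trailing
        # empty string, which is removed below)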
        Sequences = "".join(Raw_Sequences).split(";")
        Sequences.remove("")
        keywords["Sequence"] = Sequences
        Topos = "".join(Raw_Topos).split(";")
        Topos.remove("")
        keywords["Topology"] = Topos

    # Creating Dataframe and saving it as csv
    dfKW = pd.DataFrame(keywords)
    # set the uniprot_acc as the index
    dfKW.set_index("Uniprot", inplace=True, drop=False)
    dfKW.index.name = "acc"

    # DEPRECATED. OMPdb seems to label everything as a fragment?
    # n_prot_before_dropping_fragments = dfKW.shape[0]
    # dfKW = dfKW.loc[dfKW.fragment == "no fragment annotation"]
    # n_prot_after_dropping_fragments = dfKW.shape[0]
    # n_prot_fragments_dropped = n_prot_before_dropping_fragments - n_prot_after_dropping_fragments

    n_fragments = dfKW.loc[dfKW.fragment == "N-term"].shape[0]
    logging.info("{}/{} proteins labeled as 'Fragment:N' in flatfile.".format(
        n_fragments, dfKW.shape[0]))

    utils.make_sure_path_exists(OMPdb_list_csv, isfile=True)
    dfKW.to_csv(OMPdb_list_csv)
    logging.info("parse_OMPdb_all_selected_to_csv is completed.\n"
                 "Final number of proteins = {}".format(dfKW.shape[0]))
Example #6
0
def parse_flatfile_to_csv(selected_uniprot_records_flatfile, n_aa_before_tmd, n_aa_after_tmd, analyse_sp, logging, list_parsed_csv, slice=True):
    """ Parses a flatfile of UniProt records to csv.

    Parameters
    ----------
    selected_uniprot_records_flatfile : str
        Path to UniProt flatfile containing selected records for analysis.
    n_aa_before_tmd : int
        Number of amino acids before the TMD to be included when slicing the "TMD_plus_surr".
    n_aa_after_tmd : int
        Number of amino acids after the TMD to be included when slicing the "TMD_plus_surr".
    analyse_sp : bool
        Whether to analyse the signal peptides.
    logging : logging.Logger
        Logger for printing to console and logfile.
    list_parsed_csv : str
        Path to output csv file containing the list of proteins for analysis.

    Dataframes
    ----------
    dfu
        Dataframe for Uniprot
        index = acc for each protein
        columns = 'uniprot_acc', 'prot_descr', 'full_seq', etc

    Saved Files and Figures
    -----------------------
    list_summary_csv : csv
        CSV from dfu, with info for a protein on each row.

    """
    logging.info('~~~~~~~~~~~~                 starting parse_flatfile_to_csv              ~~~~~~~~~~~~')
    if not os.path.isfile(selected_uniprot_records_flatfile):
        return "parse_flatfile_to_csv could not be run. Uniprot flatfile not found. ({})".format(selected_uniprot_records_flatfile)
    uniprot_dict_all_proteins = {}
    with open(selected_uniprot_records_flatfile, "r") as f:
        records = SwissProt.parse(f)
        count_of_uniprot_records_processed = 0
        for m, record in enumerate(records):
            # create an empty output dictionary to hold the uniprot data for each record
            output_dict = {}
            # extract the subcellular location detail from the (poorly organized and unsorted) uniprot comments section
            comments_dict = {}

            try:
                for comment in record.comments:
                    # splits comments based on first ":" symbol, creates a list called split_comment
                    split_comment = comment.strip().split(': ', 1)
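                    # e.g. "SUBCELLULAR LOCATION: Membrane; Multi-pass membrane protein." splits into
                    # ["SUBCELLULAR LOCATION", "Membrane; Multi-pass membrane protein."]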
                    # several comments have the same name. need to check if it is already in the dictionary
                    if split_comment[0] in comments_dict:
                        # list the different comments, one after another
                        comments_dict[split_comment[0]] += ", %s" % split_comment[1]
                    else:
                        comments_dict[split_comment[0]] = split_comment[1]
                output_dict['comments_subcellular_location_uniprot'] = comments_dict['SUBCELLULAR LOCATION']
            except (AttributeError, KeyError):
                # there are no comments in this uniprot file!
                logging.info('no comments in Uniprot file')
                output_dict['comments_subcellular_location_uniprot'] = ''

            # use regex to search for text describing subcellular locations
            # [ -]? accepts either space, hyphen, or no dividing character
            regex_word_dict = {'multipass': ['multi[ -]?(pass|span)', 'poly[ -]?topic'],
                               'singlepass': ['single[ -]?(pass|span)', 'bi[ -]?topic'],
                               'membrane': ['membran', 'lipid[ -](anchor|bound)'],
                               'typeI': ['type[ -](one|1|I)[ -]membran'],
                               'typeII': ['type[ -](two|2|II)[ -]membran']}
            # comments_subcellular_location_uniprot = 'Membrane; Bitopictype I membrane protein.'
            regex_subcell_loc_dict = {}
            for search_word in regex_word_dict:
                regex_subcell_loc_dict[search_word] = False
                regex_search_list = regex_word_dict[search_word]
                for regex_search_string in regex_search_list:
                    # search for the regex string, ignoring any mismatches in upper or lower case
                    comment_match = re.search(regex_search_string,
                                              output_dict['comments_subcellular_location_uniprot'],
                                              re.IGNORECASE)
                    if comment_match:
                        regex_subcell_loc_dict[search_word] = True
            # add all of the fields to the dictionary
            output_dict.update(regex_subcell_loc_dict)

            # print accession number
            sys.stdout.write("{}, ".format(record.accessions[0]))
            if count_of_uniprot_records_processed % 20 == 0:
                sys.stdout.write("\n")
            sys.stdout.flush()

            # add data to dictionary
            output_dict['uniprot_acc'] = record.accessions[0]
            output_dict['organism'] = record.organism
            output_dict['uniprot_entry_name'] = record.entry_name
            output_dict['gene_name'] = record.gene_name
            output_dict['prot_descr'] = record.description
            output_dict['full_seq'] = record.sequence
            output_dict['uniprot_orgclass'] = record.organism_classification
            output_dict['uniprot_all_accessions'] = record.accessions
            output_dict['uniprot_KW'] = record.keywords
            output_dict['uniprot_features'] = record.features
            output_dict['seqlen'] = record.sequence_length

            # create a list of all the feature types (signal, transmem, etc)
            list_of_feature_types_in_uniprot_record = []
            for sublist in record.features:
                list_of_feature_types_in_uniprot_record.append(sublist[0])

            # list of the features that we want in the final csv
            desired_features_in_uniprot = ['TRANSMEM', 'VARIANT', 'CONFLICT', 'VAR_SEQ', 'VARSPLIC', 'TOPO_DOM']
            if analyse_sp == True:
                desired_features_in_uniprot.append('SIGNAL')
            desired_features_in_uniprot_dict = {}
            location_of_sp_in_feature_list = []
            location_of_tmds_in_feature_list = []
            location_of_non_tmds_in_feature_list = []

            # add bool if uniprot thinks that protein contains signal peptides
            if 'SIGNAL' in list_of_feature_types_in_uniprot_record:
                output_dict['uniprot_SiPe'] = True
            else:
                output_dict['uniprot_SiPe'] = False

            for feature in desired_features_in_uniprot:
                if feature in list_of_feature_types_in_uniprot_record:
                    # find the features in the feature list.
                    # For polytopic membrane proteins, there will be more than one TMD (labelled "TRANSMEM").
                    location_of_features_in_feature_list = [i for i, x in
                                                            enumerate(list_of_feature_types_in_uniprot_record) if
                                                            x == feature]
                    desired_features_in_uniprot_dict[feature] = location_of_features_in_feature_list
                    if feature == 'SIGNAL':
                        location_of_sp_in_feature_list = location_of_features_in_feature_list
                        # location_of_sp_in_feature_list.sort()
                    if feature == 'TRANSMEM':
                        location_of_tmds_in_feature_list = location_of_features_in_feature_list
                        # sort list to be sure that the "transmem" notation is definitely ordered correctly,
                        # as this order determines the TMD name
                        location_of_tmds_in_feature_list.sort()
                    if feature == 'TOPO_DOM':
                        location_of_non_tmds_in_feature_list = location_of_features_in_feature_list
                        # sort list to be sure that the "transmem" notation is definitely ordered correctly,
                        # as this order determines the TMD name
                        location_of_non_tmds_in_feature_list.sort()

            # count the number of SP
            output_dict['number_of_SP'] = len(location_of_sp_in_feature_list)
            # count the number of "TRANSMEM" TMDs listed in the feature-list
            output_dict['number_of_TMDs'] = len(location_of_tmds_in_feature_list)

            # information about location of first non-tmd (extracellular or periplasmic/cytoplasmic)
            if len(location_of_non_tmds_in_feature_list) > 0:
                output_dict['loc_start'] = record.features[location_of_non_tmds_in_feature_list[0]][3]
                output_dict['n_term_ec'] = "Extracellular" in output_dict["loc_start"]
            else:
                output_dict['loc_start'] = np.nan
                output_dict['n_term_ec'] = np.nan

            # number of TMDs excluding signal peptides (which might be added later)
            number_of_TMDs_excl_SP = len(location_of_tmds_in_feature_list)
            output_dict['number_of_TMDs_excl_SP'] = number_of_TMDs_excl_SP

            list_of_TMDs = ["TM{:02d}".format(n) for n in range(1, number_of_TMDs_excl_SP + 1)]
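            # e.g. a protein with 3 TRANSMEM features -> ['TM01', 'TM02', 'TM03']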
            output_dict['list_of_TMDs'] = list_of_TMDs
            output_dict['list_of_TMDs_excl_SP'] = list_of_TMDs

            if number_of_TMDs_excl_SP > 0:

                #list_of_TMDs = []
                TM_indices = []

                for n, TMD_location in enumerate(location_of_tmds_in_feature_list):
                    # consecutively number the TMDs based on the "TRANSMEM" location in the feature list
                    #TMD = 'TM{:02d}'.format(n+1)
                    #list_of_TMDs.append(TMD)

                    TM_start = record.features[TMD_location][1]
                    TM_end = record.features[TMD_location][2]

                    # replace any non-integer indices (strings or floats) with np.nan
                    TM_start = TM_start if isinstance(TM_start, int) else np.nan
                    TM_end = TM_end if isinstance(TM_end, int) else np.nan

                    # add to nested tuple
                    TM_indices.append((TM_start, TM_end))

                    # DEPRECATED
                    # # add the start and stop of each TMD, and the comments
                    # output_dict['%s_start'%TMD] = record.features[TMD_location][1]
                    # output_dict['%s_end'%TMD] = record.features[TMD_location][2]
                    # output_dict['%s_description'%TMD] = record.features[TMD_location][3]
                    # if isinstance(output_dict['%s_start'%TMD], str) or isinstance(output_dict['%s_end'%TMD], str):
                    #     logging.info("{} strings found in feature indices: {},{}".format(output_dict['uniprot_acc'], output_dict['%s_start'%TMD], output_dict['%s_end'%TMD]))
                    #     output_dict['%s_start' % TMD], output_dict['%s_end'%TMD] = np.nan, np.nan

                # information about SP location
                if output_dict['number_of_SP'] != 0:
                    for SP_location in location_of_sp_in_feature_list:
                        SP = 'SP01'
                        list_of_TMDs.append(SP)
                    for SP_location in location_of_sp_in_feature_list:
                        output_dict['SP01_start'] = record.features[SP_location][1]
                        output_dict['SP01_end'] = record.features[SP_location][2]
                        output_dict['SP01_description'] = record.features[SP_location][3]
                    if isinstance(output_dict['SP01_start'], str) or isinstance(output_dict['SP01_end'], str):
                        logging.info("{} strings found in feature indices: {},{}".format(output_dict['uniprot_acc'], output_dict['SP01_start'], output_dict['SP01_end']))
                        output_dict['SP01_start'], output_dict['SP01_end'] = np.nan, np.nan

                # add the list of TMD names to the dictionary and dataframe
                #output_dict['list_of_TMDs'] = list_of_TMDs
                output_dict["TM_indices"] = TM_indices

                # create a numpy array of any sequence variants that are in the TMD (and SP) region
                list_of_variant_types_in_uniprot = ['VARIANT', 'CONFLICT', 'VARSPLIC', 'VAR_SEQ']
                for n, TMD in enumerate(list_of_TMDs):
                    TM_start = TM_indices[n][0]
                    TM_end = TM_indices[n][1]

                    # array_of_all_variants_in_tmd = np.zeros(4)
                    array_of_all_variants_in_tmd = np.array([])
                    for variant_type in list_of_variant_types_in_uniprot:
                        if variant_type in desired_features_in_uniprot_dict.keys():
                            # if that variant is in the uniprot data for that protein, create a list of the indices showing where that variant is found
                            list_of_variant_locations = list(desired_features_in_uniprot_dict[variant_type])
                            # get the specific start, end and details of that variant
                            for v in range(len(list_of_variant_locations)):
                                # get start
                                start_of_variant_in_seq = record.features[list_of_variant_locations[v]][1]
                                # get end
                                end_of_variant_in_seq = record.features[list_of_variant_locations[v]][2]
                                # get description
                                variant_description = record.features[list_of_variant_locations[v]][3]
                                variant_feature_identifier = record.features[list_of_variant_locations[v]][4]
                                # check if the variant is in the tmd
                                start_of_variant_is_after_start_of_tmd = True if start_of_variant_in_seq > TM_start else False
                                end_of_variant_is_before_end_of_tmd = True if end_of_variant_in_seq < TM_end else False
                                variant_is_in_tmd = True if all([start_of_variant_is_after_start_of_tmd, end_of_variant_is_before_end_of_tmd]) else False
                                # if the variant is in the tmd region, add it to the numpy array
                                if variant_is_in_tmd:
                                    # create array of the variant data
                                    variant_array = np.array(
                                        [variant_type, start_of_variant_in_seq, end_of_variant_in_seq,
                                         variant_description, variant_feature_identifier])
                                    if array_of_all_variants_in_tmd.size != 0:
                                        # add array with the data for this variant to the array/list for all variants
                                        array_of_all_variants_in_tmd = np.row_stack(
                                            (array_of_all_variants_in_tmd, variant_array))
                                    else:
                                        # if the array is empty, replace the array for all variants with the array for the first variant
                                        array_of_all_variants_in_tmd = variant_array
                    # if there were variants added (array is not empty), convert to string and add them to the output dictionary
                    if array_of_all_variants_in_tmd.size:
                        output_dict['%s_seq_variants'%TMD] = str(array_of_all_variants_in_tmd)

            count_of_uniprot_records_processed += 1
            # nest each dictionary containing the data for each protein into a large dictionary that contains all data from all proteins
            uniprot_dict_all_proteins[output_dict['uniprot_acc']] = output_dict

        # convert that nested dict into a pandas dataframe, then transpose
        dfu = pd.DataFrame(uniprot_dict_all_proteins).sort_index().T
        # count records in dataframe
        count_of_initial_uniprot_records = dfu.shape[0]
        # make a unique list of all TMD combinations in list([TM01], [TM01, TM03], etc)
        unique_TMD_combinations_orig = list(dfu.list_of_TMDs.astype(str).unique())
        # convert to python list
        unique_TMD_combinations_lists = [ast.literal_eval(s) for s in unique_TMD_combinations_orig if "nan" not in s]
        # grab all unique values into a large list(e.g. TM01, TM02, TM03 until last TM of protein with most TMs)
        unique_TMD_combinations_single_list = [i for i in itertools.chain.from_iterable(unique_TMD_combinations_lists)]
        # sort
        list_all_TMDs_in_dataset = sorted(list(set(unique_TMD_combinations_single_list)))

        # extract the organism domain (e.g. Eukaryota)
        dfu['uniprot_orgclass'] = dfu['uniprot_orgclass'].astype(str)
        dfu['organism_domain'] = dfu.uniprot_orgclass.apply(lambda x: x.strip("'[]").split("', '")[0])
        # convert python datatypes to strings, as these currently give a TypeError when saving to excel
        dfu['uniprot_all_accessions'] = dfu['uniprot_all_accessions'].astype(str)
        dfu['uniprot_KW'] = dfu['uniprot_KW'].astype(str)
        dfu['uniprot_features'] = dfu['uniprot_features'].astype(str)
        dfu['list_of_TMDs'] = dfu['list_of_TMDs'].astype(str)
        # Hotfix: "UniProt" instead of "uniprot", or else all proteins will be filtered out later
        dfu['topol_source'] = "UniProt"

        # save to a csv
        utils.make_sure_path_exists(list_parsed_csv, isfile=True)
        # count records in dataframe
        count_of_uniprot_records_added_to_csv = dfu.shape[0]

        dfu.to_csv(list_parsed_csv, sep=",", quoting=csv.QUOTE_NONNUMERIC)

    return '\n%i valid UniProt records parsed to csv (from %i initial)\n~~~~~~~~~~~~                 finished parse_flatfile_to_csv              ~~~~~~~~~~~~' % (count_of_uniprot_records_added_to_csv, count_of_initial_uniprot_records)
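
The regex classification used above can be exercised on its own. Here is a minimal sketch with the same search terms applied to an invented comment string; in the function itself the string comes from the UniProt SUBCELLULAR LOCATION comment:

import re

# minimal sketch: classify a subcellular-location comment with the same regex terms (toy input)
regex_word_dict = {'multipass': ['multi[ -]?(pass|span)', 'poly[ -]?topic'],
                   'singlepass': ['single[ -]?(pass|span)', 'bi[ -]?topic'],
                   'membrane': ['membran', 'lipid[ -](anchor|bound)']}
comment = "Membrane; Multi-pass membrane protein."
hits = {label: any(re.search(p, comment, re.IGNORECASE) for p in patterns)
        for label, patterns in regex_word_dict.items()}
print(hits)   # {'multipass': True, 'singlepass': False, 'membrane': True}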
Example #7
0
def download_homologues_from_simap(pathdict, s, logging):
    """From the list of proteins in csv format, begins downloading homologues from the SIMAP database.

     - opens the csv file containing the list of proteins
     - opens or creates a text file with the list of failed downloads
     - checks if there is enough hard-drive space
     - checks what files currently exist (feature table, homologue, zip)
     - tries to download feature table (if download_feature_tables listed as TRUE in settings)
     - tries to download homologues
     - if both feature table and homologues exist, compresses both into a tarball and deletes original files
     - counts the number of failed downloads. Assumes most failed downloads are due to server errors on the SIMAP side.
     With more and more failed downloads, sleeps for longer and longer.

    Parameters
    ----------
    pathdict : dict
        Dictionary of the key paths and files associated with that List number.
    s : dict
        Settings dictionary extracted from excel settings file.
    logging : logging.Logger
        Logger for printing to console and logfile.

    Saved Files and Figures
    -----------------------
    PROTEIN_NAME_SIMAP.tar.gz : gzip file
        (e.g. A1A5B4_ANO9_HUMAN_SIMAP.tar.gz)

        Contains
        --------
        PROTEIN_NAME_feature_table.xml (e.g. A1A5B4_ANO9_HUMAN_feature_table.xml)
            XML feature table from SIMAP, with information regarding each protein.
        PROTEIN_NAME_homologues.xml (e.g. A1A5B4_ANO9_HUMAN_homologues.xml)
            homologues from SIMAP in SIMAP-XML (rather than BLAST-XML) format
        PROTEIN_NAME--DATE--RESEARCHERNAME.txt (e.g. A1DT13_A1DT13_HUMAN--20160811--Mark Teese.txt)
            [only in later versions] Text file showing the download date and researcher name.

    pathdict["failed_downloads_txt"] : txt
        File containing a list of accessions that could not be downloaded. At each run, the program checks
        if this file exists. If it doesn't exist, it will be created. If it exists, the settings file
        determines whether the previously failed downloads will be re-attempted.
    """
    logging.info("~~~~~~~~~~~~                 starting download_homologues_from_simap                ~~~~~~~~~~~~")
    df = pd.read_csv(pathdict["list_csv"], sep = ",", quoting = csv.QUOTE_NONNUMERIC, index_col = 0)

    if s["attempt_prev_failed_downloads"] == False:
        # get list of accessions that could not be downloaded, and can immediately be excluded
        acc_list_failed_downloads = utils.get_acc_list_from_txt(pathdict["failed_downloads_txt"])
        not_in_homol_db = utils.get_acc_list_from_txt(pathdict["acc_not_in_homol_db_txt"])
        acc_excluded_list = acc_list_failed_downloads + not_in_homol_db
        # the list of desired proteins = total_list - excluded
        acc_not_excluded = list(set(df.index) - set(acc_excluded_list))
        # filter dataframe to only contain the desired proteins, which have not been excluded
        df = df.loc[acc_not_excluded, :]

    max_hits = s["max_hits"]
    java_exec_str = s["java_exec_str"]
    max_memory_allocation = s["java_max_RAM_memory_allocated_to_simap_download"]
    taxid = s["taxid"]  # eg.'7227' for Drosophila melanogaster

    # if "Linux" in platform.system() or "Windows" in platform.system():
    #     # if Linux or Windows
    #     byteformat = "GB"
    #     data_harddrive = os.path.splitdrive(s["data_dir"])[0]
    #     # print initial hard-drive space
    #     size = utils.get_free_space(data_harddrive, byteformat)
    #     logging.info('Hard disk remaining space = {}'.format(size))
    # else:
    #     # assume the system is a mac
    #     # code works only on mac!!! reverted to previous version
    #     statvfs = os.statvfs(s["simap_dir"])
    #     available_space = statvfs.f_frsize * statvfs.f_bavail
    #     size = available_space / 1073741824
    #     # print initial hard-drive space
    #     logging.info('Hard disk remaining space = {:.2f} GB'.format(size))

    #iterate over each uniprot record contained in the dataframe. note that acc = uniprot accession number
    number_of_files_not_found = 0
    for acc in df.index:
        # check hard-drive space before each download
        # try:
        #     if "Linux" in platform.system() or "Windows" in platform.system():
        #         size = utils.get_free_space(data_harddrive, byteformat)
        #         if size[0] < 5:
        #             raise utils.HardDriveSpaceException("Hard drive space limit reached, there is only %s %s space left." % (size[0], size[1]))
        #     else:
        #         # MAC only stuff...
        #         statvfs = os.statvfs(s["simap_dir"])
        #         available_space = statvfs.f_frsize * statvfs.f_bavail
        #         size = available_space / 1073741824
        #         if size < 5:
        #             raise utils.HardDriveSpaceException("Hard drive space limit reached, there is only %s %s space left." % (size[0], size[1]))
        # except utils.HardDriveSpaceException as e:
        #     logging.warning(e)
        protein_name = df.loc[acc, 'protein_name']
        seqlen = df.loc[acc, 'seqlen']
        input_sequence = df.loc[acc, 'full_seq']
        SIMAP_tar = df.loc[acc, 'SIMAP_tar']
        ft_xml_path = df.loc[acc, 'SIMAP_feature_table_XML_path']
        homol_xml_path = df.loc[acc, 'SIMAP_homol_XML_path']
        date_file_path = df.loc[acc, 'SIMAP_download_date_file_path']

        # create directories to hold file, if necessary
        utils.make_sure_path_exists(homol_xml_path, isfile=True)

        #check which files exist and delete corrupt tarballs
        ft_XML_exists, homol_XML_exists, SIMAP_tar_exists, ff, hh = utils.check_SIMAP_tarfile(SIMAP_tar, ft_xml_path, homol_xml_path,
                                                                                                               acc, logging, delete_corrupt=True)
        ''' Windows command prompt accepts only 8191 characters.
            Limit protein length according to settings (typically max length = 3000)
        '''
        if 'Windows' in str(platform.system()):
            too_large_to_download_list = utils.get_acc_list_from_txt(pathdict["too_large_to_download_txt"])
            if seqlen > s["max_query_sequence_length"]:
                logging.warning('%s homologue download will be skipped. It cannot be processed into a java command in Windows OS, '
                                'as the sequence is longer than %i characters (%i). Moving to next sequence.' % (protein_name, s["max_query_sequence_length"], seqlen))
                # if the accession is not already in the text file, add it
                if acc not in too_large_to_download_list:
                    # add accession number to the list of failed downloads
                    with open(pathdict["too_large_to_download_txt"], "a") as source:
                        source.write("\n{}".format(acc))
                # skip this protein
                continue

        if SIMAP_tar_exists and s["overwrite_homologue_files"] == False:
            # skip this protein
            logging.info("{} SIMAP_tar_exists, download skipped.".format(acc))
            continue
        eaSimap_path = os.path.join(s["data_dir"], "programs", "eaSimap.jar")
        # NOTE: DOWNLOADING FEATURE TABLES IS NO LONGER CONSIDERED NECESSARY.
        if not ft_XML_exists and s["download_feature_tables"] == True:
            #download feature table from SIMAP
            korbinian.simap_download.retrieve_simap_feature_table(input_sequence,
                                                                  java_exec_str=java_exec_str,
                                                                  max_memory_allocation=500,
                                                                  output_file=ft_xml_path,
                                                                  eaSimap_path=eaSimap_path)
            utils.sleep_x_seconds(60)
        if not homol_XML_exists:
            #download homologue file from SIMAP
            korbinian.simap_download.retrieve_simap_homologues(input_sequence,
                                                               output_file=homol_xml_path,
                                                               max_hits=max_hits, java_exec_str=java_exec_str,
                                                               max_memory_allocation=max_memory_allocation, taxid=taxid,
                                                               eaSimap_path=eaSimap_path)
            # sometimes the SIMAP server seems to like a little rest in between downloads?
            utils.sleep_x_seconds(30)
        #now check again if the files exist
        ft_XML_exists, homol_XML_exists, SIMAP_tar_exists, ff, hh = utils.check_SIMAP_tarfile(SIMAP_tar, ft_xml_path, homol_xml_path,
                                                                                                               acc, logging)
        if not homol_XML_exists:
            # add accession number to the list of failed downloads
            with open(pathdict["failed_downloads_txt"], "a") as source:
                source.write("\n{}".format(acc))
            # add one to the count of consecutive failed downloads
            number_of_files_not_found += 1
            if s["sleep_if_downloads_unsuccessful"]:
                # if a large number of downloads failed, then the SIMAP server is probably not working.
                # Wait some time and try again later.
                if number_of_files_not_found > 30:
                    sys.stdout.write("\nnumber_of_files_not_found = {}, sleeping for 24 h".format(number_of_files_not_found))
                    utils.sleep_x_hours(24)
                if number_of_files_not_found == 20:
                    sys.stdout.write("\nnumber_of_files_not_found = {}, sleeping for 6 h".format(number_of_files_not_found))
                    utils.sleep_x_hours(6)
                if number_of_files_not_found == 15:
                    sys.stdout.write("\nnumber_of_files_not_found = {}, sleeping for 1 h".format(number_of_files_not_found))
                    utils.sleep_x_hours(1)
        else:
            # if download is successful or file exists, the SIMAP server must be working,
            # therefore reset the number_of_files_not_found
            number_of_files_not_found = 0
            # create an empty text file with the download date
            date = strftime("%Y%m%d")
            with open(date_file_path, "w") as f:
                f.write("{}\nEmpty file with the date.\nHave a nice day!".format(date))
            with tarfile.open(SIMAP_tar, mode='w:gz') as tar:
                #add the files to the compressed tarfile
                logging.info('%s XML files will be moved into the tarball, original XML files deleted' % protein_name)
                tar.add(homol_xml_path, arcname=os.path.basename(homol_xml_path))
                tar.add(date_file_path, arcname=os.path.basename(date_file_path))
                if ft_XML_exists:
                    tar.add(ft_xml_path, arcname=os.path.basename(ft_xml_path))
            #delete the original files
            try:
                os.remove(homol_xml_path)
                os.remove(date_file_path)
                if ft_XML_exists:
                    os.remove(ft_xml_path)
            except FileNotFoundError:
                pass
    logging.info("~~~~~~~~~~~~                 finished download_homologues_from_simap                ~~~~~~~~~~~~")
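
The failure handling above pauses for progressively longer periods as consecutive downloads fail. A compact sketch of that escalating-sleep idea follows, using the same thresholds; the helper itself and its direct call to time.sleep are illustrative, not part of korbinian (which calls utils.sleep_x_hours):

import time

def sleep_after_consecutive_failures(number_of_files_not_found):
    # escalating pause: the more consecutive failed downloads, the longer the wait
    if number_of_files_not_found > 30:
        hours = 24
    elif number_of_files_not_found == 20:
        hours = 6
    elif number_of_files_not_found == 15:
        hours = 1
    else:
        return
    time.sleep(hours * 3600)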
Example #8
0
def run_statements(s):
    list_number = s["list_number"]
    # setup error logging
    logging = korbinian.common.setup_keyboard_interrupt_and_error_logging(
        s, list_number)
    # print the list number describing the protein list
    logging.warning("list_number : {}".format(list_number))

    # open the tab containing the list-specific settings as a dataframe
    df_list_settings = pd.read_excel(s["excel_file_with_settings"],
                                     sheetname="lists",
                                     index_col=0)
    relevant_row = df_list_settings.loc[list_number, :].to_dict()
    if any(pd.isnull(value) for value in relevant_row.values()):
        raise ValueError(
            "The row for List{} in the lists tab of the settings file is missing some values."
            .format(list_number))
    # add the relevant row (e.g. for List01) to the existing settings dictionary
    # this adds max_lipo_homol, rand_TM, rand_nonTM, etc to the dictionary
    s.update(relevant_row)

    # set a base folder for the summaries, e.g. "D:\Databases\summaries\05\" for list 05
    base_filename_summaries = os.path.join(s["data_dir"], "summaries",
                                           '%02d' % list_number,
                                           'List%02d' % list_number)

    # create dictionary of paths for output files
    # for example the basic pathdict["list_csv"] for list 5 is "D:\Databases\summaries\05\List05_summary.csv"
    pathdict = korbinian.common.create_pathdict(base_filename_summaries, s)

    utils.make_sure_path_exists(pathdict["settings_copy_xlsx"], isfile=True)
    # copy the settings file used for the analysis
    copyfile(s["excel_file_with_settings"], pathdict["settings_copy_xlsx"])

    ########################################################################################
    #                                                                                      #
    #      prot_list, OMPdb (create a list of proteins from the OMPdb database)            #
    #                                                                                      #
    ########################################################################################

    if s["OMPdb_extract_omp_IDs_from_nr_fasta"]:
        ListXX_OMPdb_nr_fasta = os.path.join(
            s["data_dir"], "OMPdb",
            "List{:02d}_OMPdb_nr_fasta.txt".format(list_number))
        ListXX_OMPdb_nr_acc = os.path.join(
            s["data_dir"], "OMPdb",
            "List{:02d}_OMPdb_nr_acc.txt".format(list_number))
        korbinian.prot_list.parse_OMPdb.extract_omp_IDs_from_nr_fasta(
            ListXX_OMPdb_nr_fasta, ListXX_OMPdb_nr_acc, logging)

    if s["OMPdb_parse_OMPdb_all_selected_to_csv"]:
        ListXX_OMPdb_nr_acc = os.path.join(
            s["data_dir"], "OMPdb",
            "List{:02d}_OMPdb_nr_acc.txt".format(list_number))
        ListXX_OMPdb_redundant_flatfile = os.path.join(
            s["data_dir"], "OMPdb",
            "List{:02d}_OMPdb_redundant_flatfile.flat".format(list_number))
        OMPdb_list_csv = os.path.join(
            s["data_dir"], "OMPdb",
            "List{:02d}_OMPdb_summary.csv".format(list_number))
        korbinian.prot_list.parse_OMPdb.parse_OMPdb_all_selected_to_csv(
            ListXX_OMPdb_nr_acc, ListXX_OMPdb_redundant_flatfile,
            OMPdb_list_csv, logging, s)

    if s["OMPdb_get_TM_indices_and_slice"]:
        OMPdb_list_csv = os.path.join(
            s["data_dir"], "OMPdb",
            "List{:02d}_OMPdb_summary.csv".format(list_number))
        list_parsed_csv = pathdict["list_parsed_csv"]
        OMPdb_topology_reliability_cutoff = s[
            "OMPdb_topology_reliability_cutoff"]
        korbinian.prot_list.parse_OMPdb.get_omp_TM_indices_and_slice_from_summary_table(
            OMPdb_list_csv, list_parsed_csv, OMPdb_topology_reliability_cutoff,
            logging, s)

    ########################################################################################
    #                                                                                      #
    #      prot_list, UniProt (create a list of proteins from the UniProt database)        #
    #                                                                                      #
    ########################################################################################

    # define the uniprot directory with selected records
    uniprot_dir = os.path.join(s["data_dir"], 'uniprot')
    selected_uniprot_records_flatfile = os.path.join(
        uniprot_dir, 'selected',
        'List%02d_selected_uniprot_records_flatfile.txt' % list_number)

    if s["parse_large_flatfile_with_list_uniprot_accessions"]:
        input_accession_list_path = os.path.join(
            s["data_dir"], "uniprot", "selected",
            "List{:02d}_uniprot_accessions.txt".format(list_number))
        korbinian.prot_list.uniprot_retrieve.parse_large_flatfile_with_list_uniprot_accessions(
            s, input_accession_list_path, uniprot_dir, logging,
            selected_uniprot_records_flatfile)

    if s["retrieve_uniprot_data_for_acc_list_in_xlsx_file"]:
        input_uniprot_flatfile = "function not implemented!"
        excelfile_with_uniprot_accessions = os.path.join(
            base_filename_summaries, '.xlsx')
        korbinian.prot_list.uniprot_retrieve.retrieve_uniprot_data_for_acc_list_in_xlsx_file(
            excelfile_with_uniprot_accessions, input_uniprot_flatfile,
            selected_uniprot_records_flatfile, logging)

    if s["create_nonred_uniprot_flatfile_via_uniref"] == True:
        korbinian.prot_list.uniprot_nonredundant.create_nonred_uniprot_flatfile_via_uniref(
            s, uniprot_dir, selected_uniprot_records_flatfile, logging)

    if s["parse_flatfile_to_csv"]:
        n_aa_before_tmd = s["n_aa_before_tmd"]
        n_aa_after_tmd = s["n_aa_after_tmd"]
        list_parsed_csv = pathdict["list_parsed_csv"]
        # whether to analyse signal peptides for this dataset
        analyse_sp = True if "SiPe" in s["regions"] else False
        output = korbinian.prot_list.uniprot_parse.parse_flatfile_to_csv(
            selected_uniprot_records_flatfile, n_aa_before_tmd, n_aa_after_tmd,
            analyse_sp, logging, list_parsed_csv)
        logging.info(output)

    ########################################################################################
    #                                                                                      #
    #                             Membrane protein filtering                               #
    #                                                                                      #
    ########################################################################################

    if s["TM_filtering_SCAMPI"]:
        korbinian.filtering.scampi.run_filtering(pathdict, s, logging)

    if s["TM_filtering_SignalP"]:
        korbinian.filtering.signalP.run_filtering(pathdict, s, logging)

    if s["TM_filtering_TMSEG_PSI-BLAST_createDatabase"]:
        korbinian.blast.psiblast.create_BLAST_database(pathdict, s, logging)
    if s["TM_filtering_TMSEG_PSI-BLAST"]:
        korbinian.blast.psiblast.run_BLAST(pathdict, s, logging)
    if s["TM_filtering_TMSEG"]:
        korbinian.filtering.tmseg.run_filtering(pathdict, s, logging)

    ########################################################################################
    #                                                                                      #
    #                            prepare_protein_list                                      #
    #                                                                                      #
    ########################################################################################

    if s["prepare_protein_list"]:
        korbinian.prot_list.prot_list.get_topology_for_prot_list(
            s, pathdict, logging)
        korbinian.prot_list.prot_list.slice_TMDs_in_prot_list(
            s, pathdict, logging)
        korbinian.prot_list.prot_list.prepare_protein_list(
            s, pathdict, logging)

    if s['generate_scampi_input_files']:
        korbinian.prot_list.SCAMPI.generate_scampi_input_files(
            pathdict, s, logging)

    if s['generate_SignalP_input_files']:
        korbinian.prot_list.SCAMPI.generate_SignalP_input_files(
            pathdict, s, logging)

    ########################################################################################
    #                                                                                      #
    #                         run simap download, parse simap                              #
    #                                                                                      #
    ########################################################################################
    if s["download_homologues"]:
        korbinian.simap_download.download_homologues_from_simap(
            pathdict, s, logging)

    if s["parse_simap_to_csv"]:
        korbinian.simap_parse.run_parse_simap_to_csv(pathdict, s, logging)

    ########################################################################################
    #                                                                                      #
    #                         run BLAST and parse results                                  #
    #                                                                                      #
    ########################################################################################
    if s["BLASTp_search"]:
        if s["BLAST_modus"] == "online":
            korbinian.blast.blastp.run_BLAST_online(pathdict, s, logging)
        elif s["BLAST_modus"] == "local":
            korbinian.blast.blastp.run_BLAST_local(pathdict, s, logging)

    if s["BLAST_parser"]:
        korbinian.blast.blast_parser.run(pathdict, s, logging)

    ########################################################################################
    #                                                                                      #
    #                    run_create_fasta, run_calculate_AAIMON_ratios                     #
    #                                                                                      #
    ########################################################################################

    if s["slice_TMDs_from_homologues"]:
        korbinian.cons_ratio.slice.run_slice_TMDs_from_homologues(
            pathdict, s, logging)

    if s["create_fasta"]:
        korbinian.fasta.run_create_fasta(pathdict, s, logging)

    if s["calculate_AAIMON_ratios"]:
        korbinian.cons_ratio.cons_ratio.run_calculate_AAIMONs(
            pathdict, s, logging)

    if s['filter_truncated_alignments']:
        korbinian.cons_ratio.cons_ratio.throw_out_truncated_sequences(
            pathdict, s, logging)

    if s["gather_AAIMON_ratios"]:
        # reassign pathdict that could have been recreated during gather depending on settings
        pathdict = korbinian.cons_ratio.gather.gather_AAIMONs(
            pathdict, logging, s)

    ########################################################################################
    #                                                                                      #
    #                             gap density analysis                                     #
    #                                                                                      #
    ########################################################################################

    if s["calculate_gap_densities"]:
        korbinian.gap.run_calculate_gap_densities(pathdict, s, logging)

    if s["gather_gap_densities"]:
        korbinian.gap.gather_gap_densities(pathdict, s, logging)

    if s["create_graph_of_gap_density"]:
        korbinian.gap_figs.create_graph_of_gap_density(pathdict, s, logging)

    if s["save_fastagap"]:
        korbinian.fastagap.save_fastagap(pathdict, s, logging)

    if s["calc_fastagap_densities"]:
        korbinian.fastagap.run_calc_fastagap_densities(pathdict, s, logging)

    ########################################################################################
    #                                                                                      #
    #                 conservation ratio (AAIMON ratio) figures                            #
    #                                                                                      #
    ########################################################################################
    if s["run_keyword_analysis"]:
        output = korbinian.cons_ratio.keywords.keyword_analysis(
            pathdict, s, logging)
        logging.info(output)
    '''+++++++++++++++ Summary figures describing the conservation ratios of proteins in the list ++++++++++++++++++'''
    if s["save_figures_describing_proteins_in_list"]:
        return_statement = korbinian.cons_ratio.figs.save_figures_describing_proteins_in_list(
            pathdict, s, logging)
        logging.info(return_statement)
    '''+++++++++++++++ Summary figures describing the conservation ratios of proteins in the list ++++++++++++++++++'''
    # if s["compare_lists"]:
    #     korbinian.cons_ratio.compare_lists_old.compare_rel_con_lists(pathdict, s, logging)

    if s.get("gather_pretty_alignments", False):
        korbinian.cons_ratio.gather.gather_pretty_alignments(
            pathdict, logging, s)

    if s['send_email_when_finished']:
        korbinian.utils.send_email_when_finished(s, pathdict)

    sys.stdout.write(
        '\n~~~~~~~~~~~~         List {} finished           ~~~~~~~~~~~~\n'.
        format(list_number))
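
# A minimal sketch (not part of the original korbinian source) of the kind of
# settings dictionary that drives the pipeline above. The keys mirror the flags
# checked in the section above; the boolean values are purely illustrative, and
# pathdict / logging / list_number would normally be created by korbinian's own
# setup code before the pipeline function above is called.
example_settings = {
    "download_homologues": False,
    "parse_simap_to_csv": False,
    "BLASTp_search": True,
    "BLAST_modus": "local",           # or "online"
    "BLAST_parser": True,
    "slice_TMDs_from_homologues": True,
    "create_fasta": False,
    "calculate_AAIMON_ratios": True,
    "filter_truncated_alignments": True,
    "gather_AAIMON_ratios": True,
    "calculate_gap_densities": False,
    "gather_gap_densities": False,
    "create_graph_of_gap_density": False,
    "save_fastagap": False,
    "calc_fastagap_densities": False,
    "run_keyword_analysis": False,
    "save_figures_describing_proteins_in_list": True,
    "gather_pretty_alignments": False,
    "send_email_when_finished": False,
}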
Example #9
0
def run_psiblast_on_fasta_queries_in_folder(query_dir,
                                            databases_dir,
                                            psiblast_exec_str,
                                            db,
                                            timeout_h,
                                            retry_failed=False,
                                            retry_successful=False,
                                            retry_timeout=False,
                                            n_threads=5):
    """Runs standalone PSIBLAST on every query fasta file in a folder.

    What you need:
         - query folder with I3L0P3.fasta, P42532.fasta etc
         - standalone BLAST must be working
         - databases folder with databases\\uniref\\vertebra90\\vertebra90.fasta, or databases\\uniref\\uniref90\\uniref90.fasta etc
    Where are the output files saved?
        - databases\\BLAST\\PSI\\vertebra90\\P4\\P42532_PSIBLAST.tar.gz

    Parameters
    ----------
    query_dir : str
        Folder containing protein sequences in fasta format.
    databases_dir : str
        Databases directory, e.g. "D:\\Databases"
    psiblast_exec_str : str
        Path to the psiblast executable.
        On Linux, or if your Windows environment variables are set up correctly, this can simply be "psiblast".
    db : str
        Database for PSI-BLAST
        e.g. "metazoa90"
        Determines the filepath for the .fasta containing the search database.
    timeout_h : int
        Hours allotted before timeout in PSIBLAST command.
        Since some proteins are extremely slow, it is suggested to make a quick pass through the list first (e.g. 2 h),
        and, if time permits, a slower pass later for the last few proteins (e.g. 6 h).
    retry_failed : bool
        If True, proteins in the list of failed acc will be re-attempted.
    retry_successful : bool
        If True, previous output files will be overwritten.
    retry_timeout : bool
        If True, proteins in the list of timed-out acc will be re-attempted.
    n_threads : int
        Number of threads passed to psiblast via -num_threads.

    Usage
    -----
    %capture
    from korbinian.psiblast import run_psiblast_on_fasta_queries_in_folder

    # SET YOUR GENERAL DATABASES DIRECTORY
    databases_dir = r"D:\\Databases"
    # uniref files must be in databases\\uniref\\database

    # SET THE DIRECTORY CONTAINING THE FASTA SEQUENCES YOU WANT TO USE AS A QUERY
    query_dir = r"D:\\Databases\\xtestproteins\\BLAST_small"
    # SET YOUR BLAST EXECUTABLE (windows)
    psiblast_exec_str = r"C:\\Program Files\\NCBI\\blast-2.6.0+\\bin\\psiblast.exe"
    # SET THE DATABASE AND TIMEOUT, THEN RUN
    run_psiblast_on_fasta_queries_in_folder(query_dir, databases_dir, psiblast_exec_str, db="metazoa90", timeout_h=2)

    """
    # set location of logfile
    date_string = time.strftime("%Y%m%d")
    logfile = os.path.join(query_dir,
                           "{}_PSI-BLAST_logfile.txt".format(date_string))
    logging = korbinian.common.setup_error_logging(logfile)
    # set location of txt file containing the failed sequences
    failed_psiblast_list_txt = os.path.join(query_dir,
                                            "failed_PSIBLAST_list.txt")
    timeout_psiblast_list_txt = os.path.join(query_dir,
                                             "timeout_PSIBLAST_list.txt")
    ########################################################################################
    #                                                                                      #
    #       Create a list of all FASTA files in a particular folder for analysis           #
    #                                                                                      #
    ########################################################################################
    query_fasta_list = glob.glob(os.path.join(query_dir, "*.fasta"))
    logging.info("query_fasta_list[0:5] : {}".format(query_fasta_list[0:5]))

    ########################################################################################
    #                                                                                      #
    #                        Get list of previously failed sequences                       #
    #                                                                                      #
    ########################################################################################
    if os.path.isfile(failed_psiblast_list_txt):
        failed_psiblast_list = utils.get_acc_list_from_txt(
            failed_psiblast_list_txt)
    else:
        failed_psiblast_list = []
    if os.path.isfile(timeout_psiblast_list_txt):
        timeout_psiblast_list = utils.get_acc_list_from_txt(
            timeout_psiblast_list_txt)
    else:
        timeout_psiblast_list = []
    logging.info("failed_psiblast_list[0:5] : {}".format(
        failed_psiblast_list[0:5]))
    logging.info("timeout_psiblast_list[0:5] : {}".format(
        timeout_psiblast_list[0:5]))
    ########################################################################################
    #                                                                                      #
    #                create a dictionary, s, with various parameters                       #
    #               can be converted to a korbinian dictionary later                       #
    #                                                                                      #
    ########################################################################################
    s = {}
    s["psiblast_exec_str"] = psiblast_exec_str
    s["evalue"] = "1e-5"
    s["inclusion_ethresh"] = "1e-5"
    s["num_threads"] = n_threads
    # s["db"] = "metazoa90"
    s["num_descriptions"] = 3000
    s["num_alignments"] = 3000
    command_str = '"{psiblast_exec_str}" -query {query} -db {db} -out_pssm {out_pssm} -out_ascii_pssm {out_ascii_pssm} '\
    '-out {out_BLAST_xml} -evalue {evalue} -inclusion_ethresh {inclusion_ethresh} -num_iterations 3 '\
    '-use_sw_tback -seg no -num_threads {num_threads} -num_descriptions {num_descriptions} -num_alignments {num_alignments} -comp_based_stats 1'
    logging.info("Example of command str, before inserting variables: {}".format(
        command_str))
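    # For illustration only (hypothetical paths and the example database "metazoa90"),
    # the fully formatted command for one query would look roughly like:
    # "C:\Program Files\NCBI\blast-2.6.0+\bin\psiblast.exe" -query D:\Databases\xtestproteins\BLAST_small\P42532.fasta
    #     -db D:\Databases\uniref\metazoa90\metazoa90.fasta
    #     -out_pssm D:\Databases\BLAST\PSI\metazoa90\P4\P42532.pssm
    #     -out_ascii_pssm D:\Databases\BLAST\PSI\metazoa90\P4\P42532_ascii.pssm
    #     -out D:\Databases\BLAST\PSI\metazoa90\P4\P42532_BLAST.xml
    #     -evalue 1e-5 -inclusion_ethresh 1e-5 -num_iterations 3 -use_sw_tback -seg no
    #     -num_threads 5 -num_descriptions 3000 -num_alignments 3000 -comp_based_stats 1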

    ########################################################################################
    #                                                                                      #
    #         Run PSI-BLAST for each query sequence:                                       #
    #             input: query.fas, database.fasta (after makeblastdb)                     #
    #             output: query.pssm, query_ascii.pssm, query_BLAST.xml, query_date.txt    #
    #                     (compressed into a tarball, query_PSIBLAST.tar.gz)               #
    #                                                                                      #
    ########################################################################################
    # define the BLAST database. Note that you should specify the fasta file.
    db_path = os.path.join(databases_dir, "uniref", db, "{}.fasta".format(db))

    for query in query_fasta_list:
        acc = os.path.basename(query).split(".")[0]
        # get first two letters of acc, used as a subfolder
        first2 = acc[:2]
        # create a basename, e.g. "D:\Databases\BLAST\PSI\vertebra90\P4\P42532", from which the output filepaths are derived
        basename = os.path.join(databases_dir, "BLAST", "PSI", db, first2, acc)
        # create path for output files
        out_pssm = basename + ".pssm"
        out_ascii_pssm = basename + "_ascii.pssm"
        out_BLAST_xml = basename + "_BLAST.xml"
        date_file_path = basename + "_BLAST_date.txt"
        PSIBLAST_tar = basename + "_PSIBLAST.tar.gz"

        # if the tar exists or the accession has previously failed, skip to the next protein
        if os.path.exists(PSIBLAST_tar) and not retry_successful:
            message = "{} PSIBLAST_tar exists, file skipped".format(acc)
            logging.info(message)
            continue
        if acc in failed_psiblast_list and not retry_failed:
            message = "{} acc is in failed_psiblast_list, file skipped".format(
                acc)
            logging.info(message)
            continue
        if acc in timeout_psiblast_list and not retry_timeout:
            message = "{} acc is in timeout_psiblast_list, file skipped".format(
                acc)
            logging.info(message)
            continue
        # print accession
        logging.info("\n{}".format(acc))
        # create folders if necessary
        utils.make_sure_path_exists(out_ascii_pssm, isfile=True)
        # start a timer
        start = time.perf_counter()

        ########################################################################################
        #                                                                                      #
        #                        run the PSI-BLAST command-line argument                       #
        #                                                                                      #
        ########################################################################################
        # create full command string to be run, as if in the console
        c = command_str.format(psiblast_exec_str=s["psiblast_exec_str"],
                               query=query,
                               db=db_path,
                               out_pssm=out_pssm,
                               out_ascii_pssm=out_ascii_pssm,
                               out_BLAST_xml=out_BLAST_xml,
                               evalue=s["evalue"],
                               inclusion_ethresh=s["inclusion_ethresh"],
                               num_threads=s["num_threads"],
                               num_descriptions=s["num_descriptions"],
                               num_alignments=s["num_alignments"])
        logging.info("{}".format(c))

        command = utils.Command(c)
        # Run the command. Set the timeout in seconds
        command.run(timeout=int(timeout_h * 60 * 60))
        # wait 5 seconds. In some cases, the files are not immediately recognised as existing?
        utils.sleep_x_seconds(5, print_stuff=False)

        ########################################################################################
        #                                                                                      #
        #               if successful, move output files into a tarball                        #
        #                                                                                      #
        ########################################################################################
        # check which output files exist (e.g. [True, True, False])
        output_file_exists_list = [
            os.path.exists(out_pssm),
            os.path.exists(out_ascii_pssm),
            os.path.exists(out_BLAST_xml)
        ]
        logging.info("pssm, ascii_pssm, xml exists : {}".format(
            output_file_exists_list))

        # flag for file-deletion errors (typically a sign that PSI-BLAST timed out and the output files are incomplete)
        there_is_an_error_in_file_deletion = False

        # if all output files exist, create a date file and move all to a tarball
        if all(output_file_exists_list):
            duration = time.perf_counter() - start
            with open(date_file_path, "w") as f:
                f.write(
                    "Acc\t{}\nDate\t{}\nDatabase\t{}\nGreeting\tHave a nice day!"
                    .format(acc, date_string, db))
            # move all files into the tarball
            file_list = [
                out_pssm, out_ascii_pssm, out_BLAST_xml, date_file_path
            ]
            with tarfile.open(PSIBLAST_tar, mode='w:gz') as tar:
                # add the files to the compressed tarfile
                logging.info(
                    '{} files will be moved into the tarball, original files deleted.\nPSIBLAST duration = {:0.3f} min'
                    .format(acc, duration / 60))
                for file in file_list:
                    try:
                        tar.add(file, arcname=os.path.basename(file))
                    except FileNotFoundError:
                        # wait 10 seconds. In rare cases, the files are not immediately recognised as existing.
                        utils.sleep_x_seconds(10, print_stuff=False)
                        # retry adding the file once after the pause
                        try:
                            tar.add(file, arcname=os.path.basename(file))
                        except FileNotFoundError:
                            # For whatever reason the file's still not there. Give up.
                            logging.warning(
                                '{}, file could not be added to tarball. Filepath = {}. '
                                .format(acc, file))
            # wait 5 seconds. In some cases, the files are not immediately recognised as existing?
            utils.sleep_x_seconds(5, print_stuff=False)
            # delete the original files
            for file in file_list:
                try:
                    os.remove(file)
                except (FileNotFoundError, PermissionError):
                    logging.warning(
                        '{} ERROR. Could not be deleted'.format(file))
                    there_is_an_error_in_file_deletion = True

        else:
            if acc not in failed_psiblast_list:
                # add accession number to the list of failed blast sequences
                with open(failed_psiblast_list_txt, "a") as source:
                    source.write("\n{}".format(acc))

        if there_is_an_error_in_file_deletion:
            if acc not in timeout_psiblast_list:
                # PSIBLAST probably timed out, files are incomplete
                # add accession number to the list of timed-out sequences
                with open(timeout_psiblast_list_txt, "a") as source:
                    source.write("\n{}".format(acc))
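
# utils.Command (used above to run PSI-BLAST with a timeout) is not shown in this
# listing. A minimal sketch of a comparable subprocess-with-timeout wrapper is given
# below as a hypothetical stand-in; it is not the korbinian implementation.
import subprocess

class TimeoutCommand(object):
    """Runs a console command string and gives up if it exceeds the timeout (seconds)."""
    def __init__(self, command_str):
        self.command_str = command_str

    def run(self, timeout):
        try:
            # run the full command string through the shell, enforcing the timeout
            completed = subprocess.run(self.command_str, shell=True, timeout=timeout)
            return completed.returncode
        except subprocess.TimeoutExpired:
            # the command took too long; the caller detects the missing output files
            return None

# e.g. TimeoutCommand(c).run(timeout=int(timeout_h * 60 * 60))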
Example #10
0
def parse_TMSEG_results_DEPRECATED(pathdict, s, logging):
    """DEPRECATED METHOD BASED ON LARGE FILE OF ALL TMSEG RESULTS

    USE METHODS BASED ON INDIVIDUAL TMSEG DATAFILES INSTEAD.

    """
    logging.info("~~~~~~~~~~~~                        starting parse_TMSEG_results_DEPRECATED                    ~~~~~~~~~~~~")
    # create or open dataframe for protein list summary
    if os.path.isfile(pathdict["prot_list_summary_csv"]):
        df_PLS = pd.read_csv(pathdict["prot_list_summary_csv"], index_col=0)
    else:
        df_PLS = pd.DataFrame(columns=["v", "date"])
    # get the timestamp for current time
    t = time.ctime(time.time())

    list_number = s['list_number']

    # define the uniprot directory with selected records
    uniprot_dir = os.path.join(s["data_dir"], 'uniprot')
    selected_uniprot_records_flatfile = os.path.join(uniprot_dir, 'selected', 'List%02d_selected_uniprot_records_flatfile.txt' % list_number)
    n_aa_before_tmd = s["n_aa_before_tmd"]
    n_aa_after_tmd = s["n_aa_after_tmd"]
    list_parsed_csv = pathdict["list_parsed_csv"]
    # check if the lists tab says to analyse the signal peptides
    analyse_sp = "SiPe" in s["regions"]
    output = korbinian.prot_list.uniprot_parse.parse_flatfile_to_csv(selected_uniprot_records_flatfile, n_aa_before_tmd, n_aa_after_tmd, analyse_sp, logging, list_parsed_csv, slice=False)
    logging.info(output)

    TMSEG_fastalike_path = pathdict['TMSEG_fastalike']
    TMSEG_top_txtoutput_path = pathdict['TMSEG_top_txtoutput']
    TMSEG_nonTM_outpath = pathdict['TMSEG_nonTM']

    df_parsed = pd.read_csv(pathdict["list_parsed_csv"], sep=",", quoting=csv.QUOTE_NONNUMERIC, index_col=0, low_memory=False)

    columns_to_keep = ['organism_domain', 'uniprot_acc', 'uniprot_all_accessions', 'uniprot_entry_name', 'uniprot_features',
                       'uniprot_orgclass', 'uniprot_SiPe', 'singlepass', 'typeI', 'typeII', 'uniprot_KW', 'organism', 'prot_descr', 'membrane',
                       'multipass', 'gene_name', 'comments_subcellular_location_uniprot', 'uniprot_SiPe', 'full_seq']

    # # for datasets without SP found, turn off analyse_sp
    # if analyse_sp == True and 'SP01_start' in df_parsed.columns:
    #     columns_to_keep = ['SP01_start', 'SP01_end', 'SP01_seq']
    # else:
    #     analyse_sp == False

    acc_list_orig = list(df_parsed.index)

    if os.path.isfile(TMSEG_fastalike_path):
        df_PLS.loc["TMSEG_fastalike_path", :] = ("exists", t)
        sys.stdout.write("Extracting topology from TMSEG_fastalike file.")
        # DEPRECATED drop the full sequence, and get from TMSEG
        #df_parsed.drop('full_seq', axis=1, inplace=True)

        # read data from file
        # list will have acc, seq, topology, acc, seq, topology etc
        input_data = []
        with open(TMSEG_fastalike_path) as data_file:
            for line in data_file:
                line = line.strip()
                if line.startswith('>'):
                    # header line, e.g. ">P42532|..." : extract the uniprot accession
                    uniprot_acc = line[1:].split(' ')[0].split('|')[0]
                    input_data.append(uniprot_acc)
                elif line:
                    # sequence or topology line (blank lines are skipped)
                    input_data.append(line)

        # initialise pandas dataframe with uniprot accession as index
        df_TMSEG = pd.DataFrame(index=input_data[0::3])

        # add the signal peptide definitions from UniProt, to be used for slicing the nonTMD etc later
        if analyse_sp:
            for col in ['SP01_start', 'SP01_end', 'SP01_seq']:
                df_TMSEG[col] = df_parsed[col]

        # drop unnecessary columns from df_parsed, to be merged later
        df_parsed = df_parsed[columns_to_keep]

        # add selected columns from input_data list
        #df_TMSEG['uniprot_entry_name'] = input_data[1::5]
        #df_TMSEG['prot_descr'] = input_data[2::5]
        df_TMSEG['full_seq'] = input_data[1::3]
        df_TMSEG['topo'] = input_data[2::3]

        acc_list_TMSEG = df_TMSEG.index.tolist()

        # use lists (not sets) so they can be passed to .loc as indexers
        TMSEG_avail_list = list(set(acc_list_TMSEG).intersection(set(acc_list_orig)))
        TMSEG_unavail_list = list(set(acc_list_orig) - set(acc_list_TMSEG))

        df_PLS.loc["n_prot_TMSEG_file"] = (len(acc_list_TMSEG), t)

        # create a boolean whether the TMSEG topology is available
        df_parsed.loc[TMSEG_avail_list,"TMSEG_avail"] = True
        df_parsed.loc[TMSEG_unavail_list, "TMSEG_avail"] = False

        # drop proteins from df_TMSEG that are not in the listxx_parsed.csv
        df_TMSEG = df_TMSEG.loc[TMSEG_avail_list, :]

        fa_dir = pathdict['TMSEG_unavail_fa_dir']
        utils.make_sure_path_exists(fa_dir)
        for acc in TMSEG_unavail_list:
            out_fasta = os.path.join(fa_dir, "{}.fasta".format(acc))
            seq = df_parsed.loc[acc, "full_seq"]
            with open(out_fasta, "w") as f:
                f.write(">{}\n{}".format(acc, seq))

        n_prot_TMSEG_file_not_in_list = len(set(acc_list_TMSEG) - set(acc_list_orig))
        logging.info("n_prot_TMSEG_file_not_in_list (not in listxx_parsed.csv) = {} ({} remaining)".format(n_prot_TMSEG_file_not_in_list, len(TMSEG_avail_list)))
        df_PLS.loc["n_prot_TMSEG_file_not_in_list"] = (n_prot_TMSEG_file_not_in_list, t)

        if df_TMSEG.shape[0] == 0:
            return sys.stdout.write('no remaining proteins in list!')

        # get list of uniprot accessions of proteins where no transmembrane region was predicted
        list_nonTMD = []
        for acc in df_TMSEG.index:
            if 'N' in df_TMSEG.loc[acc, 'topo']:
                list_nonTMD.append(acc)

        # write list of nonTM proteins to file
        # outpath = '/Volumes/Musik/Databases/TMSEG/humanU90_nonTMD.txt'
        with open(TMSEG_nonTM_outpath, 'w') as f:
            for acc_nonTM in list_nonTMD:
                f.write('{}\n'.format(acc_nonTM))

        # drop proteins that do not contain TM regions
        df_TMSEG = df_TMSEG.drop(list_nonTMD)

        # create a boolean describing whether the protein is membranous according to TMSEG
        TMSEG_avail_and_TM = list(set(TMSEG_avail_list) - set(list_nonTMD))
        TMSEG_avail_but_SOL = list(set(acc_list_orig).intersection(set(list_nonTMD)))
        df_parsed["membrane"] = np.nan
        df_parsed.loc[TMSEG_avail_and_TM, "membrane"] = True
        df_parsed.loc[TMSEG_avail_but_SOL, "membrane"] = False

        # add seqlen and indices for all TMD and SiPe regions
        df_TMSEG["seqlen"] = df_TMSEG.full_seq.apply(lambda x: len(x))
        #df_TMSEG['M_indices'] = df_TMSEG.topo.apply(get_list_TM_residues_from_topo_string)
        #df_TMSEG['SiPe_indices'] = df_TMSEG.topo.apply(get_list_TM_residues_from_topo_string, args=("S"))

        df_TMSEG['TM_indices'] = df_TMSEG.topo.apply(get_TM_indices_from_TMSEG_topo_str)
        df_TMSEG['SiPe_indices'] = df_TMSEG.topo.apply(get_TM_indices_from_TMSEG_topo_str, args=("S",))

        # # Creating new list (nested list)
        # nested_list_of_membrane_borders = []
        #
        # ########################################################################################
        # #                                                                                      #
        # #              Extract the membrane indices in UniProt Indexing style                  #
        # #                                                                                      #
        # ########################################################################################
        # # Filling nest with lists of start and end-points
        # for m_index_list in df_TMSEG.M_indices:
        #     m_borders = []
        #     # add the first membrane index (e.g. 13)
        #     m_borders.append(m_index_list[0])
        #     m_borders = korbinian.prot_list.parse_OMPdb.check_for_border(m_index_list, m_borders)
        #     # add the last membrane index (e.g. 33)
        #     m_borders.append(m_index_list[-1])
        #     nested_list_of_membrane_borders.append(m_borders)
        #
        # # DEPRECATED
        # #FOR CONSISTENCY, LEAVE INDEXING STYLE AS UNIPROT
        # # ########################################################################################
        # # #                                                                                      #
        # # #            Convert to python indexing style (NECESSARY?? NOT COMPAT WITH UNIPROT!)   #
        # # #                                                                                      #
        # # ########################################################################################
        # # array_membrane_borders = np.array(nested_list_of_membrane_borders)
        # # nested_list_m_borders_python_indexstyle = []
        # # for subarray in array_membrane_borders:
        # #     # convert to array
        # #     subarray = np.array(subarray)
        # #     # add 1 to the second index number, to allow slicing
        # #     subarray[1::2] = subarray[1::2] + 1
        # #     # add to list with corrected values, python index style
        # #     nested_list_m_borders_python_indexstyle.append(list(subarray))
        #
        # # Creating new column, which contains start and end-points
        # #df_TMSEG["Membrane_Borders"] = nested_list_m_borders_python_indexstyle
        #
        # df_TMSEG["Membrane_Borders"] = nested_list_of_membrane_borders
        #
        # # Creating new column, which contains the number of TMDS
        # #df_TMSEG["number_of_TMDs"] = df_TMSEG.Membrane_Borders.apply(lambda x: len(x) / 2)
        #
        # df_TMSEG["TM_indices"] = df_TMSEG["Membrane_Borders"].apply(lambda x: tuple(zip(x[::2], x[1::2])))

        # create a long list of TMD names [TM01, TM02, TM03, ...]
        long_list_of_TMDs = []
        for i in range(1, 50):
            long_list_of_TMDs.append("TM{:02d}".format(i))

        # the list_of_TMDs column will hold python lists, so make sure its dtype is object
        df_TMSEG["list_of_TMDs"] = ""
        df_TMSEG["list_of_TMDs"] = df_TMSEG["list_of_TMDs"].astype(object)

        sys.stdout.write('slicing TMD and nonTMD sequences:\n')

        for n, acc in enumerate(df_TMSEG.index):
            # get nested tuple of TMDs
            nested_tup_TMs = df_TMSEG.loc[acc, "TM_indices"]
            # slice the long list of TMD names to get an appropriate list for that protein, e.g. [TM01, TM02, TM03]
            len_nested_tup_TMs = len(nested_tup_TMs)
            list_of_TMDs = long_list_of_TMDs[:len_nested_tup_TMs]
            # add that list to the dataframe (could also be added as a stringlist, but that's irritating somehow)
            #df_TMSEG.loc[acc, 'list_of_TMDs'] = list_of_TMDs
            df_TMSEG.at[acc, "list_of_TMDs"] = list_of_TMDs
            # set seq for slicing
            full_seq = df_TMSEG.loc[acc, "full_seq"]
            # topo = dft.loc[acc, "Topology"]
            # iterate through all the TMDs of that protein, slicing out the sequences
            for i, TMD in enumerate(list_of_TMDs):
                start, end = nested_tup_TMs[i]
                # TMSEG indices are kept in UniProt style (1-based, inclusive)
                df_TMSEG.loc[acc, "%s_start" % TMD] = start
                df_TMSEG.loc[acc, "%s_end" % TMD] = end
                # for python slicing of the TMD, the start must therefore be reduced by 1
                python_indexing_tuple = (start - 1, end)
                df_TMSEG.loc[acc, "%s_seq" % TMD] = utils.slice_with_listlike(full_seq, python_indexing_tuple)
                df_TMSEG.loc[acc, "%s_seqlen" % TMD] = len(df_TMSEG.loc[acc, "%s_seq" % TMD])
                # dft.loc[acc, TMD + "_top"] = utils.slice_with_listlike(topo, tup)

            #DEPRECATED, ONLY REINSTATE IF YOU REALLY WANT TMSEG SP DEFINITIONS TO STAY
            # # add signal peptides and their corresponding values to list_of_TMDs
            # if analyse_sp == True:
            #     if type(df_parsed.loc[acc, 'SP01_seq']) == str:
            #         list_of_TMDs.append('SP01')
            #         df_TMSEG.set_value(acc, "list_of_TMDs", list_of_TMDs)

                # # code necessary for TMSEG signal peptides - depreciated by MO 20.04.2017
                # SiPe_indices = df_TMSEG.loc[acc, 'SiPe_indices']
                # if SiPe_indices != []:
                #     df_TMSEG.loc[acc, 'SP01_start'] = SiPe_indices[0]
                #     df_TMSEG.loc[acc, 'SP01_end'] = SiPe_indices[-1]
                #     df_TMSEG.loc[acc, 'SP01_seq'] = full_seq[SiPe_indices[0]:SiPe_indices[-1]+1]
                #     list_of_TMDs.append('SP01')
                #     df_TMSEG.set_value(acc, "list_of_TMDs", list_of_TMDs)

            if n % 50 == 0 and n != 0:
                sys.stdout.write(". ")
                sys.stdout.flush()
                if n % 500 == 0:
                    sys.stdout.write("\n")
                    sys.stdout.flush()

        # slice out the nonTM segments with a function
        # note that for some reason, this is very slow after merging the dataframes
        df_TMSEG = slice_nonTMD_in_prot_list(df_TMSEG)

        #df_TOP = pd.merge(df_parsed, df_TMSEG, how="left", left_on=True, suffixes=('_list_parsed', ""))# left_index=True, right_index=False,
        df_TOP = df_parsed.merge(df_TMSEG, how="left", left_index=True, right_index=True, suffixes=('_list_parsed', ""))

        # actually, I'd prefer to keep these for troubleshooting purposes
        # cols_to_drop = ['M_indices', 'SiPe_indices', 'Membrane_Borders', 'TM_indices']
        # df_TMSEG.drop(cols_to_drop, axis=1, inplace=True)

    elif os.path.isfile(TMSEG_top_txtoutput_path):
        df_PLS.loc["TMSEG_top_txtoutput_path", :] = ("exists", t)
        """ PARSE DATA WITH THE FOLLOWING FORMAT, proteins listed one after each other

        IMPORTANT : this format is sub-optimal, because the sequences come from uniprot, and the predictions from TMPRED

        Can only be trusted when they are from the same date: best to use TMPRED output which also contains the orig sequence.

        ---
        ID: A4ZUB1
        # TRANSMEM	6	18	4
        # TRANSMEM	50	67	7
        SIG: SIGNAL 1 22 {ECO:0000255}.
        TMH: TRANSMEM 53 69 Helical. {ECO:0000255}.
        ---
        """
        # if the regions column in the lists tab is "TM01" instead of the usual "TM", take only the first TM
        take_only_the_first_TM = s["regions"] == "TM01"

        # create dataframe for text topology (dftt)
        dftt = pd.DataFrame()
        with open(TMSEG_top_txtoutput_path, "r") as f:
            acc_counter = 0
            for line in f:
                if line[0:4] == "ID: ":
                    acc = line.split(" ")[1].strip("\n")
                    dftt.loc[acc_counter, "acc"] = acc
                    acc_counter += 1
                    # reset the TM_counter
                    TM_counter = 1
                if line[0:10] == "# TRANSMEM":
                    if TM_counter > 1:
                        if take_only_the_first_TM:
                            # skip to next line, as the first TM is already taken
                            continue

                    # split by tab
                    split = line.split("\t")
                    # the start is split[1] (end is not really necessary here)
                    start = split[1]
                    # note that acc_counter += 1 is already + 1 for the next protein,
                    # therefore the dftt.loc is acc_counter-1
                    dftt.loc[acc_counter - 1, "TM{:02d}_start".format(TM_counter)] = start
                    end = split[2]
                    # note that acc_counter += 1 is already + 1 for the next protein,
                    # therefore the dftt.loc is acc_counter-1
                    dftt.loc[acc_counter - 1, "TM{:02d}_end".format(TM_counter)] = end
                    TM_counter += 1
        # add an extra number_of_TMDs column, so they can be counted consistently
        dftt["number_of_TMDs"] = 0
        for row in dftt.index:
            # drop TM02_start etc if they don't contain data
            subset = dftt.loc[row, :].dropna()
            # count columns
            n_cols = subset.shape[0]
            # calculate number of columns (TM01_start, TM01_end) /2, which is the number of TMDs
            number_of_TMDs = int((n_cols - 2) / 2)
            dftt.loc[row, "number_of_TMDs"] = number_of_TMDs
            dftt.loc[row, "list_of_TMDs"] = str(["TM{:02d}".format(n) for n in range(1, number_of_TMDs + 1)])
        # set the acc as the index, so it can be merged with df_parsed
        dftt.set_index("acc", drop=False, inplace=True)
        # save temp csv with TMSEG output
        TMSEG_txtoutput_parsed_csv = TMSEG_top_txtoutput_path[:-4] + "TMSEG_txtoutput_parsed.csv"
        dftt.to_csv(TMSEG_txtoutput_parsed_csv)

        df = pd.merge(dftt, df_parsed, left_index=True, right_index=True, suffixes=('', '_list_parsed'))

        # convert from string to python list
        if isinstance(df['list_of_TMDs'][0], str):
            df['list_of_TMDs'] = df['list_of_TMDs'].dropna().apply(lambda x: ast.literal_eval(x))

        # (re)define sequence length
        df["seqlen"] = df["full_seq"].str.len()

        # slice out all the TMD sequences
        for n, acc in enumerate(df.index):
            list_of_TMDs = df.loc[acc, "list_of_TMDs"]
            # add that list to the dataframe (could also be added as a stringlist, but that's irritating somehow)
            # set seq for slicing
            full_seq = df.loc[acc, "full_seq"]
            # iterate through all the TMDs of that protein, slicing out the sequences
            for TMD in list_of_TMDs:
                tuple_slice_indices = (df.loc[acc, "%s_start" % TMD], df.loc[acc, "%s_end" % TMD])
                df.loc[acc, "%s_seq" % TMD] = utils.slice_with_listlike(full_seq, tuple_slice_indices)
                df.loc[acc, "%s_seqlen" % TMD] = len(df.loc[acc, "%s_seq" % TMD])

            # add signal peptides and their corresponding values to list_of_TMDs
            if analyse_sp == True:
                if type(df_parsed.loc[acc, 'SP01_seq']) == str:
                    list_of_TMDs.append('SP01')
                    df.set_value(acc, "list_of_TMDs", list_of_TMDs)

        start = time.perf_counter()
        # slice out the nonTM segments with a function
        # note that for some reason, this is very slow after merging the dataframes
        df_TOP = slice_nonTMD_in_prot_list(df)
        sys.stdout.write("\ntime taken : {:0.03f} s".format(time.perf_counter() - start))

    else:
        raise FileNotFoundError("None of the TMSEG combined output files were found.")

    # define number of TMDs (includes Signal peptides!)
    df_TOP["number_of_TMDs"] = df_TOP["list_of_TMDs"].dropna().apply(lambda x : len(x))
    df_TOP['parse_TMSEG'] = True
    df_TOP.to_csv(pathdict["list_parsed_csv"], sep=",", quoting=csv.QUOTE_NONNUMERIC)
    logging.info("\n~~~~~~~~~~~~                       parse_TMSEG_results_DEPRECATED is finished                  ~~~~~~~~~~~~")
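
# The helpers get_TM_indices_from_TMSEG_topo_str and slice_nonTMD_in_prot_list are
# referenced above but not included in this listing. The sketch below is a hypothetical
# illustration (not the korbinian implementation) of what a topology-string parser of
# this kind might do: it collects contiguous runs of a chosen topology character as
# (start, end) tuples in UniProt-style 1-based, inclusive indexing, which matches the
# slicing convention used above (python slice = start - 1 to end). The default
# membrane character "H" is an assumption; TMSEG output may use a different symbol.
def example_get_TM_indices_from_topo_str(topo, region_char="H"):
    """Returns a list of (start, end) tuples (1-based, inclusive) for runs of region_char."""
    indices = []
    start = None
    for pos, topo_char in enumerate(topo, start=1):
        if topo_char == region_char and start is None:
            # a new membrane (or signal peptide) segment begins here
            start = pos
        elif topo_char != region_char and start is not None:
            # the segment ended on the previous residue
            indices.append((start, pos - 1))
            start = None
    if start is not None:
        # the segment runs to the end of the sequence
        indices.append((start, len(topo)))
    return indices

# e.g. example_get_TM_indices_from_topo_str("SSSS1111HHHHHHH2222", region_char="H") -> [(9, 15)]
# e.g. example_get_TM_indices_from_topo_str("SSSS1111HHHHHHH2222", region_char="S") -> [(1, 4)]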