Example #1
0
def get_column_method(column, method):
    """Resolve a (column, method) pair, expanding meta-methods.

    The column name is first categorised via ``col_name2cat``. If ``method``
    is a meta ML/MP method it is replaced by the corresponding default
    concrete method and the column name gets the method-specific suffix;
    otherwise the categorised column and the method are returned unchanged.
    """
    column = col_name2cat(column)
    is_ml, is_mp = is_meta_ml(method), is_meta_mp(method)
    if not (is_ml or is_mp):
        # A concrete method: nothing to resolve.
        return column, method
    method = get_default_ml_method() if is_ml else get_default_mp_method()
    return get_personalized_feature_name(column, method), method
Example #2
0
def _validate_input(columns, data, data_sep, date_column, html, html_compressed, id_index, name_column, tree_nwk,
                    copy_only):
    """Read the input tree and annotation table and cross-validate them.

    :param columns: annotation column name(s) to analyse (str, list(str), or None for all columns).
    :param data: path to the annotation table (tab/csv).
    :param data_sep: column separator of the annotation table.
    :param date_column: name of the tip-date column (only validated when a visualisation is requested).
    :param html: path for the full tree visualisation output, or None.
    :param html_compressed: path for the compressed visualisation output, or None.
    :param id_index: index of the annotation column holding the tree tip names.
    :param name_column: column to use for node names in the compressed map, or None.
    :param tree_nwk: path to the input tree in newick format.
    :param copy_only: True when every prediction method is COPY; relaxes the
        missing-annotation threshold from 90% to 100%.
    :return: tuple (root, df, years, tip2date, name_column).
    :raises ValueError: when the tree and annotations are inconsistent or insufficient.
    """
    logger = logging.getLogger('pastml')
    logger.debug('\n=============INPUT DATA VALIDATION=============')
    root = read_tree(tree_nwk)
    # Negative branch lengths are invalid: clamp them to zero and warn once.
    num_neg = 0
    for _ in root.traverse():
        if _.dist < 0:
            num_neg += 1
            _.dist = 0
    if num_neg:
        logger.warning('Input tree contained {} negative branches: we put them to zero.'.format(num_neg))
    logger.debug('Read the tree {}.'.format(tree_nwk))

    # Read everything as strings; the index (tip ids) is also forced to str
    # so it can be matched against tree node names.
    df = pd.read_csv(data, sep=data_sep, index_col=id_index, header=0, dtype=str)
    df.index = df.index.map(str)
    logger.debug('Read the annotation file {}.'.format(data))

    # As the date column is only used for visualisation if there is no visualisation we are not gonna validate it
    years, tip2date = [], {}
    if html_compressed or html:
        if date_column:
            if date_column not in df.columns:
                raise ValueError('The date column "{}" not found among the annotation columns: {}.'
                                 .format(date_column, _quote(df.columns)))
            try:
                # First try pandas' own date-format inference.
                df[date_column] = pd.to_datetime(df[date_column], infer_datetime_format=True)
            except ValueError:
                try:
                    # Fall back to decimal years serialised as e.g. "2001.0".
                    df[date_column] = pd.to_datetime(df[date_column], format='%Y.0')
                except ValueError:
                    raise ValueError('Could not infer the date format for column "{}", please check it.'
                                     .format(date_column))

            # Map each tip name to its date expressed in (fractional) years.
            tip2date = df.loc[[_.name for _ in root], date_column].apply(date2years).to_dict()
            if not tip2date:
                raise ValueError('Could not find any dates for the tree tips in column {}, please check it.'
                                 .format(date_column))
        annotate_depth(root)
        if not date_column:
            # No date column: use root-to-tip distances as a date surrogate.
            tip2date = {tip.name: round(getattr(tip, DEPTH), 6) for tip in root}
        else:
            tip2date = {t: round(d, 6) if d is not None else None for (t, d) in tip2date.items()}

        dates = [_ for _ in tip2date.values() if _ is not None]
        if not dates:
            # The date column yielded nothing usable: fall back to distances
            # and forget the date column altogether.
            tip2date = {tip.name: round(getattr(tip, DEPTH), 6) for tip in root}
            dates = [_ for _ in tip2date.values() if _ is not None]
            date_column = None
            logger.warning('The date column does not contains dates for any of the tree tips, '
                           'therefore we will ignore it')

        min_date = min(dates)
        max_date = max(dates)
        dates = sorted(dates)
        # Quartile landmarks (min, Q1, median, Q3, max) for the time slider.
        years = sorted({dates[0], dates[len(dates) // 2],
                        dates[1 * len(dates) // 4], dates[3 * len(dates) // 4], dates[-1]})
        logger.debug("Extracted tip {}: they vary between {} and {}."
                     .format('dates' if date_column else 'distances', min_date, max_date))

    if columns:
        if isinstance(columns, str):
            columns = [columns]
        unknown_columns = set(columns) - set(df.columns)
        if unknown_columns:
            raise ValueError('{} of the specified columns ({}) {} not found among the annotation columns: {}.'
                             .format('One' if len(unknown_columns) == 1 else 'Some',
                                     _quote(unknown_columns),
                                     'is' if len(unknown_columns) == 1 else 'are',
                                     _quote(df.columns)))
        df = df[columns]

    # Normalise column names to categorised feature names.
    df.columns = [col_name2cat(column) for column in df.columns]

    # Keep only the annotation rows whose index matches a tree node name.
    node_names = {n.name for n in root.traverse() if n.name}
    df_index_names = set(df.index)
    filtered_df = df.loc[node_names & df_index_names, :]
    if not filtered_df.shape[0]:
        # No overlap at all: show up to 3 names from each side to help debugging.
        tip_name_representatives = []
        for _ in root.iter_leaves():
            if len(tip_name_representatives) < 3:
                tip_name_representatives.append(_.name)
            else:
                break
        raise ValueError('Your tree tip names (e.g. {}) do not correspond to annotation id column values (e.g. {}). '
                         'Check your annotation file.'
                         .format(', '.join(tip_name_representatives),
                                 ', '.join(list(df_index_names)[: min(len(df_index_names), 3)])))
    logger.debug('Checked that tip names correspond to annotation file index.')

    if html_compressed and name_column:
        name_column = col_name2cat(name_column)
        if name_column not in df.columns:
            raise ValueError('The name column ("{}") should be one of those specified as columns ({}).'
                             .format(name_column, _quote(df.columns)))
    elif len(df.columns) == 1:
        # Single-column tables: use that column for node names by default.
        name_column = df.columns[0]

    # Reject columns that are too sparsely annotated to infer anything
    # (>= 90% unknown normally; >= 100% when only copying states).
    percentage_unknown = filtered_df.isnull().sum(axis=0) / filtered_df.shape[0]
    max_unknown_percentage = percentage_unknown.max()
    if max_unknown_percentage >= (.9 if not copy_only else 1):
        raise ValueError('{:.1f}% of tip annotations for column "{}" are unknown, '
                         'not enough data to infer ancestral states. '
                         'Check your annotation file and if its id column corresponds to the tree tip names.'
                         .format(max_unknown_percentage * 100, percentage_unknown.idxmax()))
    # Heuristic guard against non-categorical data: for larger tables,
    # more than half the values being unique suggests a continuous column.
    percentage_unique = filtered_df.nunique() / filtered_df.count()
    max_unique_percentage = percentage_unique.max()
    if filtered_df.count()[0] > 100 and max_unique_percentage > .5:
        raise ValueError('The column "{}" seem to contain non-categorical data: {:.1f}% of values are unique. '
                         'PASTML cannot infer ancestral states for a tree with too many tip states.'
                         .format(percentage_unique.idxmax(), 100 * max_unique_percentage))
    logger.debug('Finished input validation.')
    return root, df, years, tip2date, name_column
Example #3
0
def pastml_pipeline(tree, data, data_sep='\t', id_index=0,
                    columns=None, prediction_method=MPPA, model=F81, parameters=None,
                    name_column=None, date_column=None, tip_size_threshold=REASONABLE_NUMBER_OF_TIPS,
                    out_data=None, html_compressed=None, html=None, work_dir=None,
                    verbose=False, forced_joint=False, upload_to_itol=False, itol_id=None, itol_project=None,
                    itol_tree_name=None):
    """
    Applies PASTML to the given tree with the specified states and visualizes the result (as html maps).

    :param tree: path to the input tree in newick format (must be rooted).
    :type tree: str

    :param data: path to the annotation file in tab/csv format with the first row containing the column names.
    :type data: str
    :param data_sep: (optional, by default '\t') column separator for the annotation table.
        By default is set to tab, i.e. for tab-delimited file. Set it to ',' if your file is csv.
    :type data_sep: char
    :param id_index: (optional, by default is 0) index of the column in the annotation table
        that contains the tree tip names, indices start from zero.
    :type id_index: int

    :param columns: (optional) name(s) of the annotation table column(s) that contain character(s)
        to be analysed. If not specified all annotation table columns will be considered.
    :type columns: str or list(str)
    :param prediction_method: (optional, default is pastml.ml.MPPA) ancestral character reconstruction method(s),
        can be one of the max likelihood (ML) methods: pastml.ml.MPPA, pastml.ml.MAP, pastml.ml.JOINT,
        one of the max parsimony (MP) methods: pastml.parsimony.ACCTRAN, pastml.parsimony.DELTRAN,
        pastml.parsimony.DOWNPASS; or pastml.acr.COPY to keep the annotated character states as-is without inference.
        One can also specify one of the meta-methods: pastml.ml.ALL, pastml.ml.ML, pastml.parsimony.MP,
        that would perform ACR with multiple methods (all of them for pastml.ml.ALL,
        all the ML methods for pastml.ml.ML, or all the MP methods for pastml.parsimony.MP)
        and save/visualise the results as multiple characters suffixed with the corresponding method.
        When multiple ancestral characters are specified (with ``columns`` argument),
        the same method can be used for all of them (if only one method is specified),
        or different methods can be used (specified in the same order as ``columns``).
        If multiple methods are given, but not for all the characters,
        for the rest of them the default method (pastml.ml.MPPA) is chosen.'
    :type prediction_method: str or list(str)
    :param forced_joint: (optional, default is False) add JOINT state to the MPPA state selection
        even if it is not selected by Brier score.
    :type forced_joint: bool
    :param model: (optional, default is pastml.models.f81_like.F81) evolutionary model(s) for ML methods
        (ignored by MP methods).
        When multiple ancestral characters are specified (with ``columns`` argument),
        the same model can be used for all of them (if only one model is specified),
        or different models can be used (specified in the same order as ``columns``).
        If multiple models are given, but not for all the characters,
        for the rest of them the default model (pastml.models.f81_like.F81) is chosen.
    :type model: str or list(str)
    :param parameters: optional way to fix some of the ML-method parameters.
        Could be specified as
        (1a) a dict {column: {param: value}},
        where column corresponds to the character for which these parameters should be used,
        or (1b) in a form {column: path_to_param_file};
        or (2) as a list of paths to parameter files
        (in the same order as ``columns`` argument that specifies characters)
        possibly given only for the first few characters;
        or (3) as a path to parameter file (only for the first character).
        Each file should be tab-delimited, with two columns: the first one containing parameter names,
        and the second, named "value", containing parameter values.
        Parameters can include character state frequencies (parameter name should be the corresponding state,
        and parameter value - the float frequency value, between 0 and 1),
        and tree branch scaling factor (parameter name pastml.ml.SCALING_FACTOR).
    :type parameters: str or list(str) or dict

    :param name_column: (optional) name of the annotation table column to be used for node names
        in the compressed map visualisation
        (must be one of those specified in ``columns``, if ``columns`` are specified).
        If the annotation table contains only one column, it will be used by default.
    :type name_column: str
    :param date_column: (optional) name of the annotation table column that contains tip dates,
        if specified it is used to add a time slider to the visualisation.
    :type date_column: str
    :param tip_size_threshold: (optional, by default is 15) recursively remove the tips
        of size less than threshold-th largest tip from the compressed map (set to 1e10 to keep all).
        The larger it is the less tips will be trimmed.
    :type tip_size_threshold: int

    :param out_data: path to the output annotation file with the reconstructed ancestral character states.
    :type out_data: str
    :param html_compressed: path to the output compressed visualisation file (html).
    :type html_compressed: str
    :param html: (optional) path to the output tree visualisation file (html).
    :type html: str
    :param work_dir: (optional) path to the folder where pastml parameter, named tree
        and marginal probability (for marginal ML methods (pastml.ml.MPPA, pastml.ml.MAP) only) files are to be stored.
        Default is <path_to_input_file>/<input_file_name>_pastml. If the folder does not exist, it will be created.
    :type work_dir: str

    :param verbose: (optional, default is False) print information on the progress of the analysis.
    :type verbose: bool

    :param upload_to_itol: (optional, default is False) whether the annotated tree should be uploaded to iTOL
        (https://itol.embl.de/)
    :type upload_to_itol: bool
    :param itol_id: (optional) iTOL user batch upload ID that enables uploading to your iTOL account
        (see https://itol.embl.de/help.cgi#batch). If not specified, the tree will not be associated to any account.
    :type itol_id: str
    :param itol_project: (optional) iTOL project the annotated tree should be uploaded to
        (must exist, and itol_id must be specified). If not specified, the tree will not be associated to any project.
    :type itol_project: str
    :param itol_tree_name: (optional) name for the tree uploaded to iTOL.
    :type itol_tree_name: str

    :return: void
    """
    logger = _set_up_pastml_logger(verbose)

    # copy_only is True iff every requested prediction method is COPY,
    # which relaxes the missing-annotation check in _validate_input.
    root, df, years, tip2date, name_column = \
        _validate_input(columns, data, data_sep, date_column, html, html_compressed, id_index, name_column, tree,
                        copy_only=COPY == prediction_method or (isinstance(prediction_method, list)
                                                                and all(COPY == _ for _ in prediction_method)))

    if not date_column:
        date_column = 'Dist. to root'

    # Normalise the three accepted parameter forms (path, list of paths,
    # dict keyed by column) into a single {categorised column: params} dict.
    if parameters:
        if isinstance(parameters, str):
            parameters = [parameters]
        if isinstance(parameters, list):
            parameters = dict(zip(df.columns, parameters))
        elif isinstance(parameters, dict):
            parameters = {col_name2cat(col): params for (col, params) in parameters.items()}
        else:
            raise ValueError('Parameters should be either a list or a dict, got {}.'.format(type(parameters)))
    else:
        parameters = {}

    if not work_dir:
        work_dir = get_pastml_work_dir(tree)
    os.makedirs(work_dir, exist_ok=True)

    # Run the ancestral character reconstruction itself.
    acr_results = acr(root, df, prediction_method=prediction_method, model=model, column2parameters=parameters,
                      force_joint=forced_joint)
    column2states = {acr_result[CHARACTER]: acr_result[STATES] for acr_result in acr_results}

    if not out_data:
        out_data = os.path.join(work_dir, get_combined_ancestral_state_file())
    state_df = _serialize_predicted_states(sorted(column2states.keys()), out_data, root)

    # a meta-method would have added a suffix to the name feature
    if html_compressed and name_column and name_column not in column2states:
        ml_name_column = get_personalized_feature_name(name_column, get_default_ml_method())
        name_column = ml_name_column if ml_name_column in column2states \
            else get_personalized_feature_name(name_column, get_default_mp_method())

    # Serialisation and the optional iTOL upload run on a thread pool
    # while the (potentially slow) visualisation runs on the main thread.
    itol_result = None
    pool = ThreadPool()
    new_tree = os.path.join(work_dir, get_named_tree_file(tree))
    root.write(outfile=new_tree, format_root_node=True, format=3)
    async_result = pool.map_async(func=_serialize_acr, iterable=((acr_res, work_dir) for acr_res in acr_results))
    if upload_to_itol:
        itol_result = pool.apply_async(func=generate_itol_annotations,
                                       args=(column2states, work_dir, acr_results, state_df, date_column, tip2date,
                                             new_tree, itol_id, itol_project,
                                             itol_tree_name))

    if html or html_compressed:
        logger.debug('\n=============VISUALISATION=====================')
        visualize(root, column2states=column2states,
                  html=html, html_compressed=html_compressed, years=years, tip2date=tip2date,
                  name_column=name_column, tip_size_threshold=tip_size_threshold, date_column=date_column)

    # Wait for the background serialisation (and upload, if any) to finish.
    async_result.wait()
    if itol_result:
        itol_result.wait()
    pool.close()

    return root
Example #4
0
from pastml.tree import read_tree
from pastml import col_name2cat

if '__main__' == __name__:
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument('--tree', required=True, type=str)
    parser.add_argument('--states', required=True, type=str)
    parser.add_argument('--drm', required=True, type=str)
    parser.add_argument('--loc', required=True, type=str)
    parser.add_argument('--out_tree', required=True, type=str)
    params = parser.parse_args()

    drm = col_name2cat(params.drm)
    loc = col_name2cat(params.loc)

    df = pd.read_csv(params.states, header=0, index_col=0,
                     sep='\t')[[drm, loc]]
    df.index = df.index.map(str)
    tree = read_tree(params.tree)
    preannotate_tree(df, tree)

    max_tdr_size = 0
    tdr_root = None
    for _ in tree.traverse('postorder'):
        resistant = getattr(_, drm, set())
        tdr_size = 0
        if resistant == {'resistant'}:
            tdr_size = 1 if _.is_leaf() else sum(