def get_column_method(column, method):
    column = col_name2cat(column)
    if is_meta_ml(method):
        method = get_default_ml_method()
    elif is_meta_mp(method):
        method = get_default_mp_method()
    else:
        return column, method
    return get_personalized_feature_name(column, method), method
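# Illustrative sketch (not part of the original module; the column name and return values
# below are hypothetical): for a concrete method, the (column, method) pair is returned as-is;
# for a meta-method (e.g. ML or MP), the column name is suffixed with the default concrete
# method, so that results obtained with different methods do not overwrite each other:
#
#   get_column_method('Country', MPPA)  # -> ('Country', MPPA)
#   get_column_method('Country', ML)    # -> ('Country_' + default ML method, default ML method)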
def _validate_input(columns, data, data_sep, date_column, html, html_compressed, id_index, name_column, tree_nwk,
                    copy_only):
    logger = logging.getLogger('pastml')
    logger.debug('\n=============INPUT DATA VALIDATION=============')
    root = read_tree(tree_nwk)
    num_neg = 0
    for _ in root.traverse():
        if _.dist < 0:
            num_neg += 1
            _.dist = 0
    if num_neg:
        logger.warning('The input tree contained {} negative branches: we set their lengths to zero.'
                       .format(num_neg))
    logger.debug('Read the tree {}.'.format(tree_nwk))

    df = pd.read_csv(data, sep=data_sep, index_col=id_index, header=0, dtype=str)
    df.index = df.index.map(str)
    logger.debug('Read the annotation file {}.'.format(data))

    # The date column is only used for visualisation,
    # so if there is no visualisation we do not validate it.
    years, tip2date = [], {}
    if html_compressed or html:
        if date_column:
            if date_column not in df.columns:
                raise ValueError('The date column "{}" was not found among the annotation columns: {}.'
                                 .format(date_column, _quote(df.columns)))
            try:
                df[date_column] = pd.to_datetime(df[date_column], infer_datetime_format=True)
            except ValueError:
                try:
                    df[date_column] = pd.to_datetime(df[date_column], format='%Y.0')
                except ValueError:
                    raise ValueError('Could not infer the date format for column "{}", please check it.'
                                     .format(date_column))
            tip2date = df.loc[[_.name for _ in root], date_column].apply(date2years).to_dict()
            if not tip2date:
                raise ValueError('Could not find any dates for the tree tips in column {}, please check it.'
                                 .format(date_column))
        annotate_depth(root)
        if not date_column:
            tip2date = {tip.name: round(getattr(tip, DEPTH), 6) for tip in root}
        else:
            tip2date = {t: round(d, 6) if d is not None else None for (t, d) in tip2date.items()}
        dates = [_ for _ in tip2date.values() if _ is not None]
        if not dates:
            tip2date = {tip.name: round(getattr(tip, DEPTH), 6) for tip in root}
            dates = [_ for _ in tip2date.values() if _ is not None]
            date_column = None
            logger.warning('The date column does not contain dates for any of the tree tips, '
                           'therefore we will ignore it.')
        min_date = min(dates)
        max_date = max(dates)
        dates = sorted(dates)
        years = sorted({dates[0], dates[len(dates) // 2], dates[len(dates) // 4],
                        dates[3 * len(dates) // 4], dates[-1]})
        logger.debug('Extracted tip {}: they vary between {} and {}.'
                     .format('dates' if date_column else 'distances', min_date, max_date))

    if columns:
        if isinstance(columns, str):
            columns = [columns]
        unknown_columns = set(columns) - set(df.columns)
        if unknown_columns:
            raise ValueError('{} of the specified columns ({}) {} not found among the annotation columns: {}.'
                             .format('One' if len(unknown_columns) == 1 else 'Some',
                                     _quote(unknown_columns),
                                     'is' if len(unknown_columns) == 1 else 'are',
                                     _quote(df.columns)))
        df = df[columns]

    df.columns = [col_name2cat(column) for column in df.columns]

    node_names = {n.name for n in root.traverse() if n.name}
    df_index_names = set(df.index)
    filtered_df = df.loc[node_names & df_index_names, :]
    if not filtered_df.shape[0]:
        tip_name_representatives = []
        for _ in root.iter_leaves():
            if len(tip_name_representatives) < 3:
                tip_name_representatives.append(_.name)
            else:
                break
        raise ValueError('Your tree tip names (e.g. {}) do not correspond to the annotation id column values '
                         '(e.g. {}). Check your annotation file.'
                         .format(', '.join(tip_name_representatives),
                                 ', '.join(list(df_index_names)[: min(len(df_index_names), 3)])))
    logger.debug('Checked that tip names correspond to the annotation file index.')

    if html_compressed and name_column:
        name_column = col_name2cat(name_column)
        if name_column not in df.columns:
            raise ValueError('The name column ("{}") should be one of those specified as columns ({}).'
                             .format(name_column, _quote(df.columns)))
    elif len(df.columns) == 1:
        name_column = df.columns[0]

    percentage_unknown = filtered_df.isnull().sum(axis=0) / filtered_df.shape[0]
    max_unknown_percentage = percentage_unknown.max()
    if max_unknown_percentage >= (.9 if not copy_only else 1):
        raise ValueError('{:.1f}% of tip annotations for column "{}" are unknown, '
                         'not enough data to infer ancestral states. '
                         'Check your annotation file and whether its id column corresponds to the tree tip names.'
                         .format(max_unknown_percentage * 100, percentage_unknown.idxmax()))
    percentage_unique = filtered_df.nunique() / filtered_df.count()
    max_unique_percentage = percentage_unique.max()
    if filtered_df.count()[0] > 100 and max_unique_percentage > .5:
        raise ValueError('The column "{}" seems to contain non-categorical data: {:.1f}% of its values are unique. '
                         'PASTML cannot infer ancestral states for a tree with too many tip states.'
                         .format(percentage_unique.idxmax(), 100 * max_unique_percentage))
    logger.debug('Finished input validation.')
    return root, df, years, tip2date, name_column
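# For reference, a minimal annotation table accepted by _validate_input could look like this
# (a hypothetical tab-separated example; the id column values must match the tree tip names,
# and unknown states may be left empty):
#
#   id      Country   Date
#   tip_1   France    1997
#   tip_2   UK        2003
#   tip_3             2001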
def pastml_pipeline(tree, data, data_sep='\t', id_index=0, columns=None,
                    prediction_method=MPPA, model=F81, parameters=None,
                    name_column=None, date_column=None, tip_size_threshold=REASONABLE_NUMBER_OF_TIPS,
                    out_data=None, html_compressed=None, html=None, work_dir=None,
                    verbose=False, forced_joint=False,
                    upload_to_itol=False, itol_id=None, itol_project=None, itol_tree_name=None):
    """
    Applies PASTML to the given tree with the specified states and visualizes the result (as html maps).

    :param tree: path to the input tree in newick format (must be rooted).
    :type tree: str

    :param data: path to the annotation file in tab/csv format with the first row containing the column names.
    :type data: str
    :param data_sep: (optional, by default '\t') column separator for the annotation table.
        By default it is set to tab, i.e. for a tab-delimited file. Set it to ',' if your file is csv.
    :type data_sep: char
    :param id_index: (optional, by default is 0) index of the column in the annotation table
        that contains the tree tip names; indices start from zero.
    :type id_index: int

    :param columns: (optional) name(s) of the annotation table column(s) that contain character(s)
        to be analysed. If not specified, all the annotation table columns will be considered.
    :type columns: str or list(str)
    :param prediction_method: (optional, default is pastml.ml.MPPA) ancestral character reconstruction method(s),
        can be one of the max likelihood (ML) methods: pastml.ml.MPPA, pastml.ml.MAP, pastml.ml.JOINT;
        one of the max parsimony (MP) methods: pastml.parsimony.ACCTRAN, pastml.parsimony.DELTRAN,
        pastml.parsimony.DOWNPASS; or pastml.acr.COPY to keep the annotated character states as-is,
        without inference. One can also specify one of the meta-methods: pastml.ml.ALL, pastml.ml.ML,
        pastml.parsimony.MP, that would perform ACR with multiple methods (all of them for pastml.ml.ALL,
        all the ML methods for pastml.ml.ML, or all the MP methods for pastml.parsimony.MP)
        and save/visualise the results as multiple characters suffixed with the corresponding method.
        When multiple ancestral characters are specified (with the ``columns`` argument),
        the same method can be used for all of them (if only one method is specified),
        or different methods can be used (specified in the same order as ``columns``).
        If multiple methods are given, but not for all the characters,
        the default method (pastml.ml.MPPA) is chosen for the rest of them.
    :type prediction_method: str or list(str)
    :param forced_joint: (optional, default is False) add the JOINT state to the MPPA state selection
        even if it is not selected by the Brier score.
    :type forced_joint: bool
    :param model: (optional, default is pastml.models.f81_like.F81) evolutionary model(s) for ML methods
        (ignored by MP methods).
        When multiple ancestral characters are specified (with the ``columns`` argument),
        the same model can be used for all of them (if only one model is specified),
        or different models can be used (specified in the same order as ``columns``).
        If multiple models are given, but not for all the characters,
        the default model (pastml.models.f81_like.F81) is chosen for the rest of them.
    :type model: str or list(str)
    :param parameters: optional way to fix some of the ML-method parameters.
        Could be specified as (1a) a dict {column: {param: value}},
        where column corresponds to the character for which these parameters should be used,
        or (1b) in a form {column: path_to_param_file};
        or (2) as a list of paths to parameter files
        (in the same order as the ``columns`` argument that specifies characters),
        possibly given only for the first few characters;
        or (3) as a path to a parameter file (only for the first character).
        Each file should be tab-delimited, with two columns: the first one containing parameter names,
        and the second, named "value", containing parameter values.
        Parameters can include character state frequencies
        (the parameter name should be the corresponding state,
        and the parameter value the float frequency value, between 0 and 1),
        and the tree branch scaling factor (parameter name pastml.ml.SCALING_FACTOR).
    :type parameters: str or list(str) or dict

    :param name_column: (optional) name of the annotation table column to be used for node names
        in the compressed map visualisation
        (must be one of those specified in ``columns``, if ``columns`` are specified).
        If the annotation table contains only one column, it will be used by default.
    :type name_column: str
    :param date_column: (optional) name of the annotation table column that contains tip dates;
        if specified, it is used to add a time slider to the visualisation.
    :type date_column: str
    :param tip_size_threshold: (optional, by default is 15) recursively remove the tips of size smaller than
        the threshold-th largest tip from the compressed map (set to 1e10 to keep all the tips).
        The larger it is, the fewer tips will be trimmed.
    :type tip_size_threshold: int

    :param out_data: path to the output annotation file with the reconstructed ancestral character states.
    :type out_data: str
    :param html_compressed: path to the output compressed visualisation file (html).
    :type html_compressed: str
    :param html: (optional) path to the output tree visualisation file (html).
    :type html: str
    :param work_dir: (optional) path to the folder where pastml parameter, named tree and marginal probability
        (for marginal ML methods (pastml.ml.MPPA, pastml.ml.MAP) only) files are to be stored.
        Default is <path_to_input_file>/<input_file_name>_pastml. If the folder does not exist, it will be created.
    :type work_dir: str

    :param verbose: (optional, default is False) print information on the progress of the analysis.
    :type verbose: bool

    :param upload_to_itol: (optional, default is False) whether the annotated tree should be uploaded to iTOL
        (https://itol.embl.de/).
    :type upload_to_itol: bool
    :param itol_id: (optional) iTOL user batch upload ID that enables uploading to your iTOL account
        (see https://itol.embl.de/help.cgi#batch). If not specified, the tree will not be associated to any account.
    :type itol_id: str
    :param itol_project: (optional) iTOL project the annotated tree should be uploaded to
        (must exist, and itol_id must be specified). If not specified, the tree will not be associated to any project.
    :type itol_project: str
    :param itol_tree_name: (optional) name for the tree uploaded to iTOL.
    :type itol_tree_name: str

    :return: void
    """
    logger = _set_up_pastml_logger(verbose)

    copy_only = COPY == prediction_method \
        or (isinstance(prediction_method, list) and all(COPY == _ for _ in prediction_method))
    root, df, years, tip2date, name_column = \
        _validate_input(columns, data, data_sep, date_column, html, html_compressed, id_index, name_column, tree,
                        copy_only=copy_only)
    if not date_column:
        date_column = 'Dist. to root'

    if parameters:
        if isinstance(parameters, str):
            parameters = [parameters]
        if isinstance(parameters, list):
            parameters = dict(zip(df.columns, parameters))
        elif isinstance(parameters, dict):
            parameters = {col_name2cat(col): params for (col, params) in parameters.items()}
        else:
            raise ValueError('Parameters should be either a list or a dict, got {}.'.format(type(parameters)))
    else:
        parameters = {}

    if not work_dir:
        work_dir = get_pastml_work_dir(tree)
    os.makedirs(work_dir, exist_ok=True)

    acr_results = acr(root, df, prediction_method=prediction_method, model=model,
                      column2parameters=parameters, force_joint=forced_joint)
    column2states = {acr_result[CHARACTER]: acr_result[STATES] for acr_result in acr_results}

    if not out_data:
        out_data = os.path.join(work_dir, get_combined_ancestral_state_file())
    state_df = _serialize_predicted_states(sorted(column2states.keys()), out_data, root)

    # a meta-method would have added a suffix to the name feature
    if html_compressed and name_column and name_column not in column2states:
        ml_name_column = get_personalized_feature_name(name_column, get_default_ml_method())
        name_column = ml_name_column if ml_name_column in column2states \
            else get_personalized_feature_name(name_column, get_default_mp_method())

    itol_result = None
    pool = ThreadPool()
    new_tree = os.path.join(work_dir, get_named_tree_file(tree))
    root.write(outfile=new_tree, format_root_node=True, format=3)

    async_result = pool.map_async(func=_serialize_acr, iterable=((acr_res, work_dir) for acr_res in acr_results))
    if upload_to_itol:
        itol_result = pool.apply_async(func=generate_itol_annotations,
                                       args=(column2states, work_dir, acr_results, state_df,
                                             date_column, tip2date, new_tree,
                                             itol_id, itol_project, itol_tree_name))

    if html or html_compressed:
        logger.debug('\n=============VISUALISATION=====================')
        visualize(root, column2states=column2states, html=html, html_compressed=html_compressed, years=years,
                  tip2date=tip2date, name_column=name_column, tip_size_threshold=tip_size_threshold,
                  date_column=date_column)

    async_result.wait()
    if itol_result:
        itol_result.wait()
    pool.close()
    return root
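# Example invocation (a sketch, with hypothetical file paths and column names):
#
#   pastml_pipeline(tree='tree.nwk', data='annotations.tab',
#                   columns=['Country'], name_column='Country',
#                   html_compressed='map.html', html='tree.html', verbose=True)
#
# This reconstructs ancestral 'Country' states with the default method and model (MPPA, F81)
# and produces both the full and the compressed html visualisations.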
import pandas as pd

from pastml.tree import read_tree
from pastml import col_name2cat
# preannotate_tree is assumed to come from pastml.annotation (it was not imported in the original snippet)
from pastml.annotation import preannotate_tree

if '__main__' == __name__:
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--tree', required=True, type=str)
    parser.add_argument('--states', required=True, type=str)
    parser.add_argument('--drm', required=True, type=str)
    parser.add_argument('--loc', required=True, type=str)
    parser.add_argument('--out_tree', required=True, type=str)
    params = parser.parse_args()

    drm = col_name2cat(params.drm)
    loc = col_name2cat(params.loc)
    df = pd.read_csv(params.states, header=0, index_col=0, sep='\t')[[drm, loc]]
    df.index = df.index.map(str)
    tree = read_tree(params.tree)
    preannotate_tree(df, tree)

    # Find the largest clade in which every tip is annotated as drug-resistant (a TDR cluster).
    max_tdr_size = 0
    tdr_root = None
    for _ in tree.traverse('postorder'):
        resistant = getattr(_, drm, set())
        tdr_size = 0
        if resistant == {'resistant'}:
            # The original snippet is truncated at this point. A plausible continuation
            # (an assumption, not the original code) caches each node's fully-resistant
            # clade size in a 'tdr_size' feature computed bottom-up:
            tdr_size = 1 if _.is_leaf() else sum(getattr(child, 'tdr_size', 0) for child in _.children)
        _.add_feature('tdr_size', tdr_size)
        if tdr_size > max_tdr_size:
            max_tdr_size = tdr_size
            tdr_root = _
    # (presumably the selected clade, tdr_root, would then be written to params.out_tree)
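# Example command line for this script (a sketch; the script name, file names
# and column labels are hypothetical):
#
#   python find_tdr_clade.py --tree tree.nwk --states states.tab \
#       --drm RT_M184V --loc Country --out_tree tdr_clade.nwk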