def _train_test_model(task):
    '''
    Top-level function that is run (possibly remotely) to create and evaluate
    models with different parameters.
    '''
    train_set = _SFrame(task.params['train_set'])
    del task.params['train_set']

    test_set = None
    if task.params['test_set'] is not None:
        test_set = _SFrame(task.params['test_set'])
    del task.params['test_set']

    model_factory = task.params['model_factory']
    del task.params['model_factory']

    # Create the model
    model = model_factory(train_set, **task.params)

    results = _SFrame({'model_name': [model.name()]})

    # Save 'model_details'
    model_info = _get_all_model_fields(model)
    results['model_details'] = [_flatten_to_single_sframe_saveable_dict(model_info)]

    # Save test info
    if test_set:
        test_evaluation_info = model.evaluate(test_set)
        results['test_metrics'] = [_flatten_to_single_sframe_saveable_dict(test_evaluation_info)]

    task.outputs['results'] = results
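# Illustrative sketch only: a minimal stand-in for the task object this
# function expects. The real job framework supplies its own task type; the
# attribute and key names below simply mirror what _train_test_model reads.
#
# class _ExampleTask(object):
#     def __init__(self, params):
#         self.params = params    # consumed (and mutated) by _train_test_model
#         self.outputs = {}       # the 'results' SFrame is written here
#
# task = _ExampleTask({
#     'train_set': 'path/to/train_sframe',    # hypothetical path
#     'test_set': 'path/to/test_sframe',      # or None to skip test metrics
#     'model_factory': graphlab.linear_regression.create,
#     'target': 'y',              # any remaining keys are passed to the factory
# })
# _train_test_model(task)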
def _combiner(**tasks):
    """
    Take the return values from each task, and return the combined result.

    The combined result is an ordered dictionary with two entries: 'models',
    a list of models, and 'summary', an SFrame containing the searched
    parameters and the evaluation results, one row per model.
    """
    # Concatenate output from all the tasks.
    models = []
    evaluations = []
    parameters = []
    metadatas = []
    for t in tasks.values():
        if t is not None:  # If an exception occurred, t is None
            models.append(t['model'])
            evaluations.append(t['evaluation'])
            parameters.append(t['parameters'])
            metadatas.append(t['metadata'])

    if all(m is None for m in models):
        models = None

    # SFrame containing all the evaluation results, one row per model
    if all(type(x) in (int, float, str, list, type(None)) for x in evaluations):
        evaluation_sframe = _SFrame({'metric': evaluations})
    else:
        evaluation_sframe = _SArray(evaluations).unpack(column_name_prefix=None)

    # SFrame containing all metadata, one row per model
    if all(type(x) in (int, float, str, list, type(None)) for x in metadatas):
        metadata_sframe = _SFrame({'metadata': metadatas})
    else:
        metadata_sframe = _SArray(metadatas).unpack(column_name_prefix=None)

    # SFrame containing all the tuning parameters, one row per model
    if all(x is None or len(x) == 0 for x in parameters):
        parameter_sframe = _SFrame({'parameters': [None] * len(parameters)})
    else:
        parameter_sframe = _SArray(parameters).unpack(column_name_prefix=None)

    # Make a summary SFrame by horizontally concatenating the parameter,
    # evaluation, and metadata SFrames.
    summary_sframe = _SFrame()
    param_columns = sorted(parameter_sframe.column_names())
    metric_columns = sorted(evaluation_sframe.column_names())
    metadata_columns = sorted(metadata_sframe.column_names())
    summary_sframe[param_columns] = parameter_sframe[param_columns]
    summary_sframe[metric_columns] = evaluation_sframe[metric_columns]
    summary_sframe[metadata_columns] = metadata_sframe[metadata_columns]

    return _OrderedDict([('models', models), ('summary', summary_sframe)])
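# Illustrative example only: each keyword argument passed to _combiner is the
# dict returned by one _train_test_model run, or None if that run failed.
# The literal values below are made up.
#
# example_tasks = {
#     'task_0': {'model': None,
#                'evaluation': {'rmse': 1.2},
#                'parameters': {'max_depth': 3},
#                'metadata': {'fold_id': 0}},
#     'task_1': None,        # a failed run contributes nothing to the summary
# }
# combined = _combiner(**example_tasks)
# combined['summary']        # one row per successful run
# combined['models']         # None here, since no model objects were returned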
def _combine_mps_tasks(**tasks):
    # Concatenate output from all the completed tasks.
    models = []
    evaluations = []
    parameters = []
    metadatas = []
    status = {'Failed': 0, 'Completed': 0}
    for t in tasks.values():
        if t is not None:  # If an exception occurred, t is None
            models.append(t['model'])
            evaluations.append(t['evaluation'])
            parameters.append(t['parameters'])
            metadatas.append(t['metadata'])
            status['Completed'] += 1
        else:
            status['Failed'] += 1

    if all(m is None for m in models):
        models = None

    if all(x is None or len(x) == 0 for x in parameters):
        parameters = _SArray([None] * len(parameters), dtype=dict)

    evaluations = _SArray(evaluations, dtype=dict)
    parameters = _SArray(parameters, dtype=dict)
    metadatas = _SArray(metadatas, dtype=dict)

    summary = _SFrame({'metric': evaluations,
                       'metadata': metadatas,
                       'parameters': parameters})

    return _OrderedDict([('models', models),
                         ('summary', summary),
                         ('status', status)])
def _combine_sframes(summaries):
    summary = _SFrame()
    summary['metadata'] = _SArray(dtype=dict)
    summary['metric'] = _SArray(dtype=dict)
    summary['parameters'] = _SArray(dtype=dict)
    for s in summaries:
        summary = summary.append(s)
    return summary
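# Usage sketch only: merge the 'summary' SFrames produced by several
# _combine_mps_tasks calls into a single table. The 'batches' variable is
# hypothetical.
#
# all_results = [_combine_mps_tasks(**batch) for batch in batches]
# combined_summary = _combine_sframes([r['summary'] for r in all_results])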
def __getitem__(self, fold_id):
    if self._cached_list is not None:
        return self._cached_list[fold_id]
    else:
        test = self.sframe_splits[fold_id]
        train = _SFrame()
        for i in range(self.num_folds):
            if i != fold_id:
                train = train.append(self.sframe_splits[i])
        return train, test
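# Sketch only: this __getitem__ belongs to a KFold-style container (class
# definition not shown here). Typical consumption, assuming a
# KFold(data, num_folds) constructor that populates sframe_splits:
#
# folds = KFold(data, num_folds=5)
# for fold_id in range(5):
#     train, test = folds[fold_id]   # test = one split, train = all the others
#     model = graphlab.linear_regression.create(train, target='y')
#     print(model.evaluate(test))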
def _flatten_to_single_sframe_saveable_dict(dic):
    '''
    Takes a dict or an SFrame. Does its best to convert it to a dictionary
    that can be saved as an entry in an SFrame.
    '''
    if not dic:
        return {}

    test_sframe = _SFrame()

    def test_save_to_sframe(v):
        # An exception will be thrown if assignment fails.
        test_sframe['test'] = [v]

    data = {}
    for key, value in dic.items():
        try:
            if isinstance(value, _SFrame):
                data[key] = {}
                for inner_key in value.column_names():
                    data[key][str(inner_key)] = list(value[inner_key])
                test_save_to_sframe(data[key])
            elif isinstance(value, _SArray):
                value = list(value)
                test_save_to_sframe(value)
                data[key] = value
            elif isinstance(value, dict):
                value = _flatten_to_single_sframe_saveable_dict(value)
                test_save_to_sframe(value)
                data[key] = value
            else:
                test_save_to_sframe(value)
                data[key] = value
        except:
            data[key] = "Unable to store field"
    return data
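# Illustrative example only: nested SFrame/SArray values are flattened into
# plain dicts and lists so the result fits in a single SFrame cell. The field
# names below are hypothetical.
#
# info = {'coefficients': coefficients_sframe,
#         'metrics': {'rmse': 1.2, 'max_error': 4.5}}
# flat = _flatten_to_single_sframe_saveable_dict(info)
# results = _SFrame({'model_details': [flat]})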
def _train_test_model(model_factory, folds, model_parameters,
                      evaluator, return_model, metadata):
    """
    Top-level function that is run (possibly remotely) to create and evaluate
    models with different parameters.

    Parameters
    ----------
    model_factory : function
        Same as model_factory from model_parameter_search.

    folds : KFold

    model_parameters : dict
        Dictionary of model parameters.

    evaluator : function
        Function that takes a model, training_set, and test_set and returns a
        dictionary of simple typed values as the evaluation result.

    return_model : bool
        If True, include the model object in the return value.

    metadata : dict
        Dictionary of metadata describing this task.

    Returns
    -------
    out : dict
        The return dictionary contains the following fields:

        - parameters : a dictionary of the parameters being searched
        - model : the model object if `return_model` is True, None otherwise
        - evaluation : the output of the evaluator function
        - metadata : the user-provided metadata for this run
    """
    if 'fold_id' in metadata:
        fold_id = metadata['fold_id']
    else:
        fold_id = 0

    training_set, validation_set = folds[fold_id]
    if isinstance(training_set, str):
        training_set = _SFrame(training_set)
    if isinstance(validation_set, str):
        validation_set = _SFrame(validation_set)

    # Create the model
    model = model_factory(training_set, **model_parameters)

    # Pack results.
    result = {}
    result['parameters'] = model_parameters
    result['model'] = model if return_model else None

    # Evaluate the model.
    evaluate_result = evaluator(model, training_set, validation_set)

    # Return the results as dictionaries.
    if evaluate_result is not None:
        _raise_if_evaluator_return_is_not_packable(evaluate_result)
    result['evaluation'] = evaluate_result
    result['metadata'] = metadata
    return result
def _get_summary_struct(self):
    """
    Returns a structured description of the model, including (where relevant)
    the schema of the training data, description of the training data,
    training statistics, and model hyperparameters.

    Returns
    -------
    sections : list (of list of tuples)
        A list of summary sections. Each section is a list. Each item in a
        section list is a tuple of the form:
          ('<feature>', '<field>')

    section_titles : list
        A list of section titles. The order matches that of the 'sections'
        object.
    """
    sections = []
    fields = []

    _features = _precomputed_field(
        _internal_utils.pretty_print_list(self.features))
    _exclude = _precomputed_field(
        _internal_utils.pretty_print_list(self.excluded_features))

    header_fields = [("Features", "features"),
                     ("Excluded Features", "excluded_features")]

    sections.append("Model Fields")
    fields.append(header_fields)

    if self.user_column_interpretations:
        sections.append("User Specified Interpretations")
        fields.append(
            list(sorted(self.get("user_column_interpretations").items())))

    column_interpretations = self.get("column_interpretations")
    features = self.get("features")

    if self.get("fitted") and features is not None:
        n_rows = len(features)
        transform_info = [None] * n_rows

        for i, f in enumerate(features):
            interpretation = column_interpretations[f]
            input_type = self.input_types[f]
            description, output_type = _get_interpretation_description_and_output_type(
                interpretation, input_type)
            transform_info[i] = (f, input_type.__name__, interpretation,
                                 description, output_type.__name__)

        transform_table = _SFrame()
        transform_table["Column"] = [t[0] for t in transform_info]
        transform_table["Type"] = [t[1] for t in transform_info]
        transform_table["Interpretation"] = [t[2] for t in transform_info]
        transform_table["Transforms"] = [t[3] for t in transform_info]
        transform_table["Output Type"] = [t[4] for t in transform_info]

        fields[-1].append(transform_table)

    return fields, sections
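# Sketch only: how the returned (sections, section_titles) pair is typically
# rendered in a model's __repr__. The _toolkit_repr_print helper named below
# is an assumption about the surrounding toolkit code, not defined here.
#
# def __repr__(self):
#     (sections, section_titles) = self._get_summary_struct()
#     return _toolkit_repr_print(self, sections, section_titles, width=30)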
def bm25(dataset, query, k1=1.5, b=.75):
    """
    For a given query and set of documents, compute the BM25 score for each
    document. If we have a query with words q_1, ..., q_n the BM25 score for
    a document is:

    .. math:: \sum_{i=1}^N IDF(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_avg)}

    where

    * :math:`\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))`
    * :math:`f(q_i)` is the number of times q_i occurs in the document
    * :math:`n(q_i)` is the number of documents containing q_i
    * :math:`|D|` is the number of words in the document
    * :math:`d_avg` is the average number of words per document in the corpus
    * :math:`k_1` and :math:`b` are free parameters.

    Parameters
    ----------
    dataset : SArray of type dict, list, or str
        An SArray where each element either represents a document in:

        * **dict** : a bag-of-words format, where each key is a word and each
          value is the number of times that word occurs in the document.

        * **list** : The list is converted to a bag-of-words format, where the
          keys are the unique elements in the list and the values are the
          counts of those unique elements. After this step, the behaviour is
          identical to dict.

        * **string** : Behaves identically to a **dict**, where the dictionary
          is generated by converting the string into a bag-of-words format.
          For example, "I really like really fluffy dogs" would get converted
          to {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs': 1}.

    query : A list, set, or SArray of type str
        A list, set or SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are [1.2, 2.0].

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75.

    Returns
    -------
    out : SFrame
        An SFrame containing the BM25 score for each document containing one
        of the query words. The doc_id column is the row number of the
        document.

    Examples
    --------
    >>> dataset = graphlab.SArray([
        {'a':5, 'b':7, 'c':10},
        {'a':3, 'c':1, 'd':2},
        {'a':10, 'b':3, 'e':5},
        {'a':1},
        {'f':5}])

    >>> query = ['a', 'b', 'c']
    >>> graphlab.text_analytics.bm25(dataset, query)

    References
    ----------
    .. [BM25] `"Okapi BM-25" <http://en.wikipedia.org/wiki/Okapi_BM25>`_
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.bm25')

    if type(dataset) != _graphlab.SArray:
        raise TypeError('bm25 requires an SArray of dict, list, or str type, '
                        'where each dictionary\'s keys are words and values '
                        'are word frequencies.')
    sf = _SFrame({'docs': dataset})

    if type(query) is dict:  # For backwards compatibility
        query = list(query.keys())
    if type(query) is _graphlab.SArray:
        query = list(query)
    if type(query) is set:
        query = list(query)
    if type(query) is not list:
        raise TypeError('The query must either be an SArray of str type, '
                        'a list of strings, or a set of strings.')

    # Calculate BM25
    sf = sf.add_row_number('doc_id')
    sf = sf.dropna('docs')  # Drop missing documents
    scores = _graphlab.feature_engineering.BM25('docs', query, k1, b,
        output_column_name='bm25').fit_transform(sf)

    # Find documents with query words
    if scores['docs'].dtype() is dict:
        scores['doc_terms'] = scores['docs'].dict_keys()
    elif scores['docs'].dtype() is list:
        scores['doc_terms'] = scores['docs'].apply(lambda x: list(set(x)))
    elif scores['docs'].dtype() is str:
        scores['doc_terms'] = count_words(scores['docs']).dict_keys()
    else:
        # This should never occur (handled by BM25)
        raise TypeError('bm25 requires an SArray of dict, list, or str type')
    scores['doc_counts'] = scores['doc_terms'].apply(
        lambda x: len([word for word in query if word in x]))
    scores = scores[scores['doc_counts'] > 0]  # Drop documents without query words
    scores = scores.select_columns(['doc_id', 'bm25'])

    return scores
def _train_test_model(model_factory, folds, model_parameters,
                      evaluator, return_model, metadata):
    """
    Top-level function that is run (possibly remotely) to create and evaluate
    models with different parameters.

    Parameters
    ----------
    model_factory : function
        Same as model_factory from model_parameter_search.

    folds : KFold

    model_parameters : dict
        Dictionary of model parameters.

    evaluator : function
        Function that takes a model, training_set, and test_set and returns a
        dictionary of simple typed values as the evaluation result.

    return_model : bool
        If True, include the model object in the return value.

    metadata : dict
        Dictionary of metadata describing this task.

    Returns
    -------
    out : dict
        The return dictionary contains the following fields:

        - parameters : a dictionary of the parameters being searched
        - model : the model object if `return_model` is True, None otherwise
        - evaluation : the output of the evaluator function
        - metadata : the user-provided metadata for this run
    """
    if 'fold_id' in metadata:
        fold_id = metadata['fold_id']
    else:
        fold_id = 0

    training_set, validation_set = folds[fold_id]
    if isinstance(training_set, str):
        training_set = _SFrame(training_set)
    if isinstance(validation_set, str):
        validation_set = _SFrame(validation_set)

    # Some parameters require that a validation set is provided, e.g. early
    # stopping.
    if _sys.version_info.major == 3:
        argspec = _inspect.getargspec(model_factory)
        args = argspec.args
    else:
        args, varargs, varkw, defaults = _inspect.getargspec(model_factory)
    if 'validation_set' in args and 'validation_set' not in model_parameters:
        model_parameters['validation_set'] = validation_set

    # Create the model
    model = model_factory(training_set, **model_parameters)

    # Remove validation_set from model_parameters before summarizing
    if 'validation_set' in model_parameters:
        del model_parameters['validation_set']

    # Pack results.
    result = {}
    result['parameters'] = model_parameters
    result['model'] = model if return_model else None

    # Evaluate the model.
    evaluate_result = evaluator(model, training_set, validation_set)

    # Return the results as dictionaries.
    if evaluate_result is not None:
        _raise_if_evaluator_return_is_not_packable(evaluate_result)
    result['evaluation'] = evaluate_result
    result['metadata'] = metadata
    return result
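# Sketch only: an evaluator compatible with _train_test_model. It must accept
# (model, training_set, validation_set) and return a dict of simple values
# (or None). The metric keys below are illustrative assumptions.
def _example_rmse_evaluator(model, training_set, validation_set):
    if validation_set is None:
        return None
    return {'training_rmse': model.evaluate(training_set).get('rmse'),
            'validation_rmse': model.evaluate(validation_set).get('rmse')}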
def bm25(dataset, query, k1=1.5, b=.75):
    """
    For a given query and set of documents, compute the BM25 score for each
    document. If we have a query with words q_1, ..., q_n the BM25 score for
    a document is:

    .. math:: \sum_{i=1}^N IDF(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_avg)}

    where

    * :math:`\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))`
    * :math:`f(q_i)` is the number of times q_i occurs in the document
    * :math:`n(q_i)` is the number of documents containing q_i
    * :math:`|D|` is the number of words in the document
    * :math:`d_avg` is the average number of words per document in the corpus
    * :math:`k_1` and :math:`b` are free parameters.

    Parameters
    ----------
    dataset : SArray of type dict, list, or str
        An SArray where each element either represents a document in:

        * **dict** : a bag-of-words format, where each key is a word and each
          value is the number of times that word occurs in the document.

        * **list** : The list is converted to a bag-of-words format, where the
          keys are the unique elements in the list and the values are the
          counts of those unique elements. After this step, the behaviour is
          identical to dict.

        * **string** : Behaves identically to a **dict**, where the dictionary
          is generated by converting the string into a bag-of-words format.
          For example, "I really like really fluffy dogs" would get converted
          to {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs': 1}.

    query : A list, set, or SArray of type str
        A list, set or SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are [1.2, 2.0].

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75.

    Returns
    -------
    out : SFrame
        An SFrame containing the BM25 score for each document containing one
        of the query words. The doc_id column is the row number of the
        document.

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab
        >>> dataset = graphlab.SArray([
            {'a':5, 'b':7, 'c':10},
            {'a':3, 'c':1, 'd':2},
            {'a':10, 'b':3, 'e':5},
            {'a':1},
            {'f':5}])

        >>> query = ['a', 'b', 'c']
        >>> graphlab.text_analytics.bm25(dataset, query)

    References
    ----------
    .. [BM25] `"Okapi BM-25" <http://en.wikipedia.org/wiki/Okapi_BM25>`_
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.bm25')

    if type(dataset) != _graphlab.SArray:
        raise TypeError('bm25 requires an SArray of dict, list, or str type, '
                        'where each dictionary\'s keys are words and values '
                        'are word frequencies.')
    sf = _SFrame({'docs': dataset})

    if type(query) is dict:  # For backwards compatibility
        query = list(query.keys())
    if type(query) is _graphlab.SArray:
        query = list(query)
    if type(query) is set:
        query = list(query)
    if type(query) is not list:
        raise TypeError('The query must either be an SArray of str type, '
                        'a list of strings, or a set of strings.')

    # Calculate BM25
    sf = sf.add_row_number('doc_id')
    sf = sf.dropna('docs')  # Drop missing documents
    scores = _graphlab.feature_engineering.BM25('docs', query, k1, b,
        output_column_name='bm25').fit_transform(sf)

    # Find documents with query words
    if scores['docs'].dtype() is dict:
        scores['doc_terms'] = scores['docs'].dict_keys()
    elif scores['docs'].dtype() is list:
        scores['doc_terms'] = scores['docs'].apply(lambda x: list(set(x)))
    elif scores['docs'].dtype() is str:
        scores['doc_terms'] = count_words(scores['docs']).dict_keys()
    else:
        # This should never occur (handled by BM25)
        raise TypeError('bm25 requires an SArray of dict, list, or str type')
    scores['doc_counts'] = scores['doc_terms'].apply(
        lambda x: len([word for word in query if word in x]))
    scores = scores[scores['doc_counts'] > 0]  # Drop documents without query words
    scores = scores.select_columns(['doc_id', 'bm25'])

    return scores
def bm25(dataset, query, k1=1.5, b=.75):
    """
    For a given query and set of documents, compute the BM25 score for each
    document. If we have a query with words q_1, ..., q_n the BM25 score for
    a document is:

    .. math:: \sum_{i=1}^N IDF(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_avg)}

    where

    * :math:`\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))`
    * :math:`f(q_i)` is the number of times q_i occurs in the document
    * :math:`n(q_i)` is the number of documents containing q_i
    * :math:`|D|` is the number of words in the document
    * :math:`d_avg` is the average number of words per document in the corpus
    * :math:`k_1` and :math:`b` are free parameters.

    Parameters
    ----------
    dataset : SArray of type dict
        An SArray where each element represents a document in bag-of-words
        format, where each key is a word and each value is the number of
        times that word occurs in the document.

    query : SArray of type str
        An SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are [1.2, 2.0].

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75.

    Returns
    -------
    out : SFrame
        An SFrame containing the BM25 score for each document containing one
        of the query words. The doc_id column is the row number of the
        document.

    Examples
    --------
    >>> dataset = graphlab.SArray([
        {'a':5, 'b':7, 'c':10},
        {'a':3, 'c':1, 'd':2},
        {'a':10, 'b':3, 'e':5},
        {'a':1},
        {'f':5}])

    >>> query = ['a', 'b', 'c']
    >>> graphlab.text_analytics.bm25(dataset, query)

    References
    ----------
    .. [BM25] `"Okapi BM-25" <http://en.wikipedia.org/wiki/Okapi_BM25>`_
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.bm25')

    if type(dataset) != _graphlab.SArray:
        raise TypeError('bm25 requires an SArray of dict type, where each '
                        'dictionary\'s keys are words and values are word '
                        'frequencies.')

    if type(query) not in [_graphlab.SArray, dict, list]:
        raise TypeError('The query must either be an SArray of str type, '
                        'a list of strings, or a dictionary.')

    if type(query) == dict:
        query = list(query.keys())
    if type(query) == list:
        query = _graphlab.SArray(query)
    query = _SFrame({'word': query})

    # Add row ids
    N = len(dataset)
    d = _SFrame({'bow': dataset})
    d = d.add_row_number('doc_id')

    # Compute document length and average doc length
    d['doc_length'] = d['bow'].dict_values().apply(lambda x: sum(x))
    avg_doc_length = d['doc_length'].mean()

    # Convert into 'stacked' format
    d_stacked = d.stack('bow', new_column_name=['word', 'tf'], drop_na=False)
    query = query.join(d_stacked)

    # Compute number of docs in which each word appears
    doc_freq = query.groupby('word', {'doc_freq': _graphlab.aggregate.COUNT})
    query = query.join(doc_freq, on='word')

    # Compute IDF for each word in the query
    query['idf'] = query['doc_freq'].apply(
        lambda n: math.log((N - n + .5) / (n + .5)))

    # Compute the score of each word for each document
    query['word_score'] = query['idf'] * \
        (query['tf'] * (k1 + 1.0)) / \
        (query['tf'] + k1 * 1.0 * (1.0 - b + b * query['doc_length'] / avg_doc_length))

    # Compute BM25
    result = query.groupby('doc_id',
                           {'bm25': _graphlab.aggregate.SUM('word_score')})

    return result.sort('doc_id')
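# Illustration of the IDF term computed above: with N = 5 documents and a word
# appearing in n = 4 of them (the word 'a' in the docstring example),
#   IDF = log((5 - 4 + .5) / (4 + .5)) = log(1.5 / 4.5) ≈ -1.0986
# so very common query terms can contribute negatively under this BM25 variant.
#
# >>> import math
# >>> math.log((5 - 4 + .5) / (4 + .5))
# -1.0986...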