def _train_test_model(task):
    '''
    Top-level function that is run (possibly remotely) to do the work of
    creating and evaluating models with different parameters.
    '''

    train_set = _SFrame(task.params['train_set'])
    del task.params['train_set']
    test_set = None
    if task.params['test_set'] is not None:
        test_set = _SFrame(task.params['test_set'])
    del task.params['test_set']

    model_factory = task.params['model_factory']
    del task.params['model_factory']

    # Create the model
    model = model_factory(train_set, **task.params)
    results = _SFrame({'model_name': [model.name()]})

    # Save 'model_details'
    model_info = _get_all_model_fields(model)
    results['model_details'] = [_flatten_to_single_sframe_saveable_dict(model_info)]

    # Save test info
    if test_set is not None:
        test_evaluation_info = model.evaluate(test_set)
        results['test_metrics'] = [_flatten_to_single_sframe_saveable_dict(test_evaluation_info)]

    task.outputs['results'] = results
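
# A self-contained sketch of the task contract assumed above (hedged: it
# presumes GraphLab Create / Turi Create is importable and that the module's
# other helpers, e.g. _get_all_model_fields, are available). The _Task class is
# a hypothetical stand-in for the scheduler's task object.
import graphlab

_train = graphlab.SFrame({'x': [0., 1., 2., 3.], 'y': [0., 2., 4., 6.]})
_train.save('/tmp/train_sf')          # _train_test_model reloads it from a path

class _Task(object):
    def __init__(self, params):
        self.params = params
        self.outputs = {}

_task = _Task({
    'train_set': '/tmp/train_sf',                   # path loadable by _SFrame()
    'test_set': None,                               # optional held-out data
    'model_factory': graphlab.linear_regression.create,
    'target': 'y',                                  # remaining keys -> model kwargs
})
_train_test_model(_task)
# _task.outputs['results'] is a one-row SFrame with model_name / model_details.
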
def _combiner(**tasks):
    """
    Take the return values from each task, and return
    the combined result.

    The combined result is a tuple, where the first
    element is a list of models, and the second
    sframe is a summary sframe containing
    the searched parameters and the evaluation result.
    """
    # Concatenate output from all the tasks.
    models = []
    evaluations = []
    parameters = []
    metadatas = []
    for t in tasks.values():
        if t is not None:  # If an exception occurred, t is None
            models.append(t['model'])
            evaluations.append(t['evaluation'])
            parameters.append(t['parameters'])
            metadatas.append(t['metadata'])

    if all(m is None for m in models):
        models = None

    # SFrame contains all the evaluation results, one row per model
    if all(type(x) in (int, float, str, list, type(None))
           for x in evaluations):
        evaluation_sframe = _SFrame({'metric': evaluations})
    else:
        evaluation_sframe = _SArray(evaluations).unpack(
            column_name_prefix=None)

    # SFrame contains all metadata, one row per model
    if all(type(x) in (int, float, str, list, type(None))
           for x in metadatas):
        metadata_sframe = _SFrame({'metadata': metadatas})
    else:
        metadata_sframe = _SArray(metadatas).unpack(
            column_name_prefix=None)

    # SFrame contains all the tuning parameters, one row per model
    if all(x is None or len(x) == 0 for x in parameters):
        parameter_sframe = _SFrame(
            {'parameters': [None] * len(parameters)})
    else:
        parameter_sframe = _SArray(parameters).unpack(
            column_name_prefix=None)

    # Make a summary sframe by horizontally concatenating the
    # evaluation_sframe, parameter_sframe, and metadata_sframe.
    summary_sframe = _SFrame()
    param_columns = sorted(parameter_sframe.column_names())
    metric_columns = sorted(evaluation_sframe.column_names())
    metadata_columns = sorted(metadata_sframe.column_names())
    summary_sframe[param_columns] = parameter_sframe[param_columns]
    summary_sframe[metric_columns] = evaluation_sframe[metric_columns]
    summary_sframe[metadata_columns] = metadata_sframe[metadata_columns]
    return _OrderedDict([('models', models), ('summary', summary_sframe)])
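
# A minimal sketch of the per-task payload _combiner consumes (plain dicts;
# models stay None here because return_model was False upstream). Assumes the
# module-level _SFrame/_SArray aliases point at GraphLab Create's types.
_task_a = {'model': None,
           'evaluation': {'rmse': 0.12, 'max_error': 0.4},
           'parameters': {'max_depth': 4, 'step_size': 0.1},
           'metadata': {'model_id': 0, 'fold_id': 0}}
_task_b = {'model': None,
           'evaluation': {'rmse': 0.10, 'max_error': 0.3},
           'parameters': {'max_depth': 6, 'step_size': 0.1},
           'metadata': {'model_id': 1, 'fold_id': 1}}

_combined = _combiner(task_0=_task_a, task_1=_task_b)
# _combined['models'] is None (every model entry was None); _combined['summary']
# has one row per task with the parameter, metric, and metadata columns.
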
def _combine_mps_tasks(**tasks):
    # Concatenate output from all the completed tasks.
    models = []
    evaluations = []
    parameters = []
    metadatas = []
    status = {'Failed': 0, 'Completed': 0}
    for t in tasks.values():
        if t is not None:  # If an exception occurred, t is None
            models.append(t['model'])
            evaluations.append(t['evaluation'])
            parameters.append(t['parameters'])
            metadatas.append(t['metadata'])
            status['Completed'] += 1
        else:
            status['Failed'] += 1

    if all(m is None for m in models):
        models = None
    if all(x is None or len(x) == 0 for x in parameters):
        parameters = _SArray([None] * len(parameters), dtype=dict)
    evaluations = _SArray(evaluations, dtype=dict)
    parameters = _SArray(parameters, dtype=dict)
    metadatas = _SArray(metadatas, dtype=dict)

    summary = _SFrame({'metric': evaluations,
                       'metadata': metadatas,
                       'parameters': parameters})

    return _OrderedDict([('models', models),
                         ('summary', summary),
                         ('status', status)])
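
# Sketch: a task that raised an exception upstream arrives here as None and is
# only counted, not combined (plain dicts, same aliases assumed as above).
_ok = {'model': None,
       'evaluation': {'accuracy': 0.91},
       'parameters': {'max_iterations': 10},
       'metadata': {'model_id': 0}}
_out = _combine_mps_tasks(task_0=_ok, task_1=None)
# _out['status']  -> {'Failed': 1, 'Completed': 1}
# _out['summary'] -> one row holding the 'metric', 'metadata', and 'parameters'
#                    dicts for the completed task.
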
def _combine_sframes(summaries):
    summary = _SFrame()
    summary['metadata'] = _SArray(dtype=dict)
    summary['metric'] = _SArray(dtype=dict)
    summary['parameters'] = _SArray(dtype=dict)
    for s in summaries:
        summary = summary.append(s)
    return summary
def __getitem__(self, fold_id):
    if self._cached_list is not None:
        return self._cached_list[fold_id]
    else:
        test = self.sframe_splits[fold_id]
        train = _SFrame()
        for i in range(self.num_folds):
            if i != fold_id:
                train = train.append(self.sframe_splits[i])
        return train, test
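
# A plain-Python sketch of the fold layout __getitem__ assumes: sframe_splits
# holds num_folds disjoint chunks; fold i serves chunk i as the test set and
# the concatenation of the remaining chunks as the training set. (The
# surrounding KFold class is not shown in this snippet.)
_data = list(range(10))
_num_folds = 5
_splits = [_data[i::_num_folds] for i in range(_num_folds)]   # disjoint chunks

for _fold_id in range(_num_folds):
    _test = _splits[_fold_id]
    _train = [x for i, chunk in enumerate(_splits) if i != _fold_id
              for x in chunk]
    # Each fold: 8 training rows, 2 test rows.
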
def _flatten_to_single_sframe_saveable_dict(dic):
    '''
    Takes a dict or an SFrame and does its best to convert it to a dictionary
    that can be saved as an entry in an SFrame.
    '''
    if not dic:
        return {}

    test_sframe = _SFrame()
    def test_save_to_sframe(v):
        # An exception will be thrown if assignment fails.
        test_sframe['test'] = [v]

    data = {}
    for key, value in dic.items():
        try:
            if isinstance(value, _SFrame):
                data[key] = {}
                for inner_key in value.column_names():
                    data[key][str(inner_key)] = list(value[inner_key])
                test_save_to_sframe(data[key])
            elif isinstance(value, _SArray):
                value = list(value)
                test_save_to_sframe(value)
                data[key] = value
            elif isinstance(value, dict):
                value = _flatten_to_single_sframe_saveable_dict(value)
                test_save_to_sframe(value)
                data[key] = value
            else:
                test_save_to_sframe(value)
                data[key] = value
        except Exception:
            data[key] = "Unable to store field"

    return data
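
# Sketch of what the flattening produces (assumes the _SFrame/_SArray aliases
# point at GraphLab Create's types): nested SFrames, SArrays, and dicts collapse
# into plain dicts and lists that fit inside a single SFrame cell; anything that
# cannot be stored is replaced with the "Unable to store field" marker.
_info = {
    'coefficients': _SFrame({'name': ['x1', 'x2'], 'value': [0.3, -1.2]}),
    'training': {'iterations': 12, 'rmse': 0.08},
    'classes': _SArray(['a', 'b']),
}
_flat = _flatten_to_single_sframe_saveable_dict(_info)
# _flat['coefficients'] -> {'name': ['x1', 'x2'], 'value': [0.3, -1.2]}
# _flat['training']     -> {'iterations': 12, 'rmse': 0.08}
# _flat['classes']      -> ['a', 'b']
_SFrame({'model_details': [_flat]})   # storable as one SFrame row
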
def _train_test_model(model_factory,
                      folds,
                      model_parameters,
                      evaluator,
                      return_model,
                      metadata):
    """
    This is the top-level function that is run (possibly remotely) to do the
    work of creating and evaluating models with different parameters.

    Parameters
    ----------
    model_factory : function
      same as model_factory from model_parameter_search

    folds : KFold

    model_parameters : dict
      dictionary of model parameters

    evaluator : function
      function that takes the model, training_set, and test_set and returns a
      dictionary of simple-typed values as the evaluation result

    return_model : bool
      If true, include the model object in the return value.

    metadata : dict
      Dictionary of metadata describing this task.

    Returns
    -------
    out : dict
      The return dictionary contains the following fields:

      - parameters : a dictionary of parameters being searched
      - model : the model object if `return_model` is True. None otherwise.
      - evaluation : the output of the evaluator function
      - metadata : the user-provided metadata for this run.
    """
    if 'fold_id' in metadata:
        fold_id = metadata['fold_id']
    else:
        fold_id = 0

    training_set, validation_set = folds[fold_id]

    if isinstance(training_set, str):
        training_set = _SFrame(training_set)

    if isinstance(validation_set, str):
        validation_set = _SFrame(validation_set)

    # Create the model
    model = model_factory(training_set, **model_parameters)

    # Pack results.
    result = {}
    result['parameters'] = model_parameters
    result['model'] = model if return_model else None

    # Evaluate the model.
    evaluate_result = evaluator(model, training_set, validation_set)

    # Validate the evaluation result, then return everything as a dictionary.
    if evaluate_result is not None:
        _raise_if_evaluator_return_is_not_packable(evaluate_result)
    result['evaluation'] = evaluate_result
    result['metadata'] = metadata
    return result
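
# Sketch of an evaluator and call matching the documented contract (hedged: it
# assumes GraphLab Create / Turi Create is importable and that the module's
# validation helper _raise_if_evaluator_return_is_not_packable is available).
import graphlab

_train = graphlab.SFrame({'x': [0., 1., 2., 3.], 'y': [0., 2., 4., 6.]})
_valid = graphlab.SFrame({'x': [4., 5.], 'y': [8., 10.]})
_folds = [(_train, _valid)]           # anything indexable by fold_id works

def _rmse_evaluator(model, training_set, validation_set):
    # Return a flat dict of simple-typed metrics, as the docstring requires.
    return {'train_rmse': model.evaluate(training_set)['rmse'],
            'valid_rmse': model.evaluate(validation_set)['rmse']}

_result = _train_test_model(
    model_factory=graphlab.linear_regression.create,
    folds=_folds,
    model_parameters={'target': 'y', 'l2_penalty': 0.01},
    evaluator=_rmse_evaluator,
    return_model=False,
    metadata={'fold_id': 0, 'model_id': 3})
# _result['evaluation'] -> {'train_rmse': ..., 'valid_rmse': ...}
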
    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<feature>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """

        sections = []
        fields = []

        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.features))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.excluded_features))

        header_fields = [("Features", "features"),
                         ("Excluded Features", "excluded_features")]

        sections.append("Model Fields")
        fields.append(header_fields)

        if self.user_column_interpretations:
            sections.append("User Specified Interpretations")
            fields.append(
                list(sorted(self.get("user_column_interpretations").items())))

        column_interpretations = self.get("column_interpretations")
        features = self.get("features")

        if self.get("fitted") and features is not None:

            n_rows = len(features)
            transform_info = [None] * n_rows

            for i, f in enumerate(features):
                interpretation = column_interpretations[f]
                input_type = self.input_types[f]
                description, output_type = _get_interpretation_description_and_output_type(
                    interpretation, input_type)

                transform_info[i] = (f, input_type.__name__, interpretation,
                                     description, output_type.__name__)

            transform_table = _SFrame()
            transform_table["Column"] = [t[0] for t in transform_info]
            transform_table["Type"] = [t[1] for t in transform_info]
            transform_table["Interpretation"] = [t[2] for t in transform_info]
            transform_table["Transforms"] = [t[3] for t in transform_info]
            transform_table["Output Type"] = [t[4] for t in transform_info]

            fields[-1].append(transform_table)

        return fields, sections
def bm25(dataset, query, k1=1.5, b=.75):
    """
    For a given query and set of documents, compute the BM25 score for each
    document. If we have a query with words q_1, ..., q_n the BM25 score for
    a document is:

        .. math:: \sum_{i=1}^N IDF(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_avg)}

    where

    * :math:`\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))`
    * :math:`f(q_i)` is the number of times q_i occurs in the document
    * :math:`n(q_i)` is the number of documents containing q_i
    * :math:`|D|` is the number of words in the document
    * :math:`d_avg` is the average number of words per document in the corpus
    * :math:`k_1` and :math:`b` are free parameters.

    Parameters
    ----------
    dataset : SArray of type dict, list, or str
        An SArray where each element represents a document in one of the
        following formats:

        * **dict** : a bag-of-words format, where each key is a word and each
          value is the number of times that word occurs in the document.

        * **list** : The list is converted to a bag-of-words format, where the
          keys are the unique elements in the list and the values are the counts
          of those unique elements. After this step, the behaviour is identical 
          to dict.

        * **string** : Behaves identically to a **dict**, where the dictionary 
          is generated by converting the string into a bag-of-words format. 
          For example, 'I really like really fluffy dogs' would get converted
          to {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs': 1}.

    query : A list, set, or SArray of type str
        A list, set or SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are [1.2, 2.0].

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75.

    Returns
    -------
    out : SFrame
        An SFrame containing the BM25 score for each document containing one of
        the query words. The doc_id column is the row number of the document.

    Examples
    --------

        >>> dataset = graphlab.SArray([
          {'a':5, 'b':7, 'c':10},
          {'a':3, 'c':1, 'd':2},
          {'a':10, 'b':3, 'e':5},
          {'a':1},
          {'f':5}])

        >>> query = ['a', 'b', 'c']
        >>> graphlab.text_analytics.bm25(dataset, query)


    References
    ----------
    .. [BM25] `"Okapi BM-25" <http://en.wikipedia.org/wiki/Okapi_BM25>`_
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.bm25')

    if type(dataset) != _graphlab.SArray:
        raise TypeError('bm25 requires an SArray of dict, list, or str type, '
                        'where each element is a dictionary whose keys are '
                        'words and whose values are word frequencies.')
    sf = _SFrame({'docs' : dataset})

    if type(query) is dict:  # For backwards compatibility
        query = list(query.keys())
    if type(query) is _graphlab.SArray:
        query = list(query)
    if type(query) is set:
        query = list(query)
    if type(query) is not list:
        raise TypeError('The query must either be an SArray of str type, '
                        'a list of strings, or a set of strings.')

    # Calculate BM25
    sf = sf.add_row_number('doc_id')
    sf = sf.dropna('docs') # Drop missing documents
    scores = _graphlab.feature_engineering.BM25('docs',query, k1, b, output_column_name = 'bm25').fit_transform(sf)

    # Find documents with query words

    if scores['docs'].dtype() is dict:
        scores['doc_terms'] = scores['docs'].dict_keys()
    elif scores['docs'].dtype() is list:
        scores['doc_terms'] = scores['docs'].apply(lambda x: list(set(x)))
    elif scores['docs'].dtype() is str:
        scores['doc_terms'] = count_words(scores['docs']).dict_keys()
    else:
        # This should never occur (handled by BM25)
        raise TypeError('bm25 requires an SArray of dict, list, or str type')
    scores['doc_counts'] = scores['doc_terms'].apply(lambda x: len([word for word in query if word in x]))
    scores = scores[scores['doc_counts'] > 0] # Drop documents without query word
    scores = scores.select_columns(['doc_id','bm25'])

    return scores
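
# A plain-Python cross-check of the scoring formula in the docstring (no
# graphlab calls): score the first document of the docstring example against
# query ['a', 'b', 'c'] with the default k1 = 1.5 and b = 0.75.
import math

_corpus = [{'a': 5, 'b': 7, 'c': 10}, {'a': 3, 'c': 1, 'd': 2},
           {'a': 10, 'b': 3, 'e': 5}, {'a': 1}, {'f': 5}]
_query, _k1, _b = ['a', 'b', 'c'], 1.5, 0.75

_N = len(_corpus)
_d_avg = sum(sum(doc.values()) for doc in _corpus) / float(_N)
_n = dict((q, sum(1 for doc in _corpus if q in doc)) for q in _query)

def _bm25_by_hand(doc):
    D = float(sum(doc.values()))
    score = 0.0
    for q in _query:
        f = doc.get(q, 0)
        idf = math.log((_N - _n[q] + 0.5) / (_n[q] + 0.5))
        score += idf * f * (_k1 + 1.0) / (f + _k1 * (1.0 - _b + _b * D / _d_avg))
    return score

# _bm25_by_hand(_corpus[0]) evaluates the docstring formula for doc_id 0.
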
def _train_test_model(model_factory, folds, model_parameters, evaluator,
                      return_model, metadata):
    """
    This is the top-level function that is run (possibly remotely) to do the
    work of creating and evaluating models with different parameters.

    Parameters
    ----------
    model_factory : function
      same as model_factory from model_parameter_search

    folds : KFold

    model_parameters : dict
      dictionary of model parameters

    evaluator : function
      function that takes the model, training_set, and test_set and returns a
      dictionary of simple-typed values as the evaluation result

    return_model : bool
      If true, include the model object in the return value.

    metadata : dict
      Dictionary of metadata describing this task.

    Returns
    -------
    out: dict
      The return dictionary contains the following fields:

      - parameters : a dictionary of parameters being searched
      - model : the model object if `return_model` is True. None otherwise.
      - evaluation : the output of the evaluator function
      - metadata : the user-provided metadata for this run.
    """
    if 'fold_id' in metadata:
        fold_id = metadata['fold_id']
    else:
        fold_id = 0

    training_set, validation_set = folds[fold_id]

    if isinstance(training_set, str):
        training_set = _SFrame(training_set)

    if isinstance(validation_set, str):
        validation_set = _SFrame(validation_set)

    # Some model parameters (e.g. early stopping) require that a validation
    # set be provided.
    if _sys.version_info.major == 3:
        argspec = _inspect.getfullargspec(model_factory)
        args = argspec.args
    else:
        args, varargs, varkw, defaults = _inspect.getargspec(model_factory)

    if 'validation_set' in args and 'validation_set' not in model_parameters:
        model_parameters['validation_set'] = validation_set

    # Create the model
    model = model_factory(training_set, **model_parameters)

    # Remove validation_set from model_parameters before summarizing
    if 'validation_set' in model_parameters:
        del model_parameters['validation_set']

    # Pack results.
    result = {}
    result['parameters'] = model_parameters
    result['model'] = model if return_model else None

    # Evaluate the model.
    evaluate_result = evaluator(model, training_set, validation_set)

    # Validate the evaluation result, then return everything as a dictionary.
    if evaluate_result is not None:
        _raise_if_evaluator_return_is_not_packable(evaluate_result)
    result['evaluation'] = evaluate_result
    result['metadata'] = metadata
    return result
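
# Sketch: a factory whose signature names validation_set is picked up by the
# inspect check above, so the held-out fold is forwarded for early stopping.
# (Assumes GraphLab Create / Turi Create, whose boosted_trees_regression.create
# accepts a validation_set argument.)
import graphlab

def _early_stopping_factory(training_set, target, validation_set=None, **kwargs):
    # 'validation_set' appears in the argspec, so _train_test_model injects it.
    return graphlab.boosted_trees_regression.create(
        training_set, target=target,
        validation_set=validation_set, **kwargs)
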
def bm25(dataset, query, k1=1.5, b=.75):
    """
    For a given query and set of documents, compute the BM25 score for each
    document. If we have a query with words q_1, ..., q_n the BM25 score for
    a document is:

        .. math:: \sum_{i=1}^N IDF(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_avg)}

    where

    * :math:`\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))`
    * :math:`f(q_i)` is the number of times q_i occurs in the document
    * :math:`n(q_i)` is the number of documents containing q_i
    * :math:`|D|` is the number of words in the document
    * :math:`d_avg` is the average number of words per document in the corpus
    * :math:`k_1` and :math:`b` are free parameters.

    Parameters
    ----------
    dataset : SArray of type dict, list, or str
        An SArray where each element represents a document in one of the
        following formats:

        * **dict** : a bag-of-words format, where each key is a word and each
          value is the number of times that word occurs in the document.

        * **list** : The list is converted to a bag-of-words format, where the
          keys are the unique elements in the list and the values are the counts
          of those unique elements. After this step, the behaviour is identical
          to dict.

        * **string** : Behaves identically to a **dict**, where the dictionary
          is generated by converting the string into a bag-of-words format.
          For example, 'I really like really fluffy dogs' would get converted
          to {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs': 1}.

    query : A list, set, or SArray of type str
        A list, set or SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are [1.2, 2.0].

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75.

    Returns
    -------
    out : SFrame
        An SFrame containing the BM25 score for each document containing one of
        the query words. The doc_id column is the row number of the document.

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        >>> dataset = graphlab.SArray([
          {'a':5, 'b':7, 'c':10},
          {'a':3, 'c':1, 'd':2},
          {'a':10, 'b':3, 'e':5},
          {'a':1},
          {'f':5}])

        >>> query = ['a', 'b', 'c']
        >>> graphlab.text_analytics.bm25(dataset, query)


    References
    ----------
    .. [BM25] `"Okapi BM-25" <http://en.wikipedia.org/wiki/Okapi_BM25>`_
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.bm25')

    if type(dataset) != _graphlab.SArray:
        raise TypeError('bm25 requires an SArray of dict, list, or str type, '
                        'where each element is a dictionary whose keys are '
                        'words and whose values are word frequencies.')
    sf = _SFrame({'docs' : dataset})

    if type(query) is dict:  # For backwards compatibility
        query = list(query.keys())
    if type(query) is _graphlab.SArray:
        query = list(query)
    if type(query) is set:
        query = list(query)
    if type(query) is not list:
        raise TypeError('The query must either be an SArray of str type, '+\
           ' a list of strings, or a set of strings.')

    # Calculate BM25
    sf = sf.add_row_number('doc_id')
    sf = sf.dropna('docs') # Drop missing documents
    scores = _graphlab.feature_engineering.BM25('docs',query, k1, b, output_column_name = 'bm25').fit_transform(sf)

    # Find documents with query words

    if scores['docs'].dtype() is dict:
        scores['doc_terms'] = scores['docs'].dict_keys()
    elif scores['docs'].dtype() is list:
        scores['doc_terms'] = scores['docs'].apply(lambda x: list(set(x)))
    elif scores['docs'].dtype() is str:
        scores['doc_terms'] = count_words(scores['docs']).dict_keys()
    else:
        # This should never occur (handled by BM25)
        raise TypeError('bm25 requires an SArray of dict, list, or str type')
    scores['doc_counts'] = scores['doc_terms'].apply(lambda x: len([word for word in query if word in x]))
    scores = scores[scores['doc_counts'] > 0] # Drop documents without query word
    scores = scores.select_columns(['doc_id','bm25'])

    return scores
def bm25(dataset, query, k1=1.5, b=.75):
    """
    For a given query and set of documents, compute the BM25 score for each
    document. If we have a query with words q_1, ..., q_n the BM25 score for
    a document is:

        .. math:: \sum_{i=1}^N IDF(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_avg)}

    where

    * :math:`\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))`
    * :math:`f(q_i)` is the number of times q_i occurs in the document
    * :math:`n(q_i)` is the number of documents containing q_i
    * :math:`|D|` is the number of words in the document
    * :math:`d_avg` is the average number of words per document in the corpus
    * :math:`k_1` and :math:`b` are free parameters.

    Parameters
    ----------
    dataset : SArray of type dict
        An SArray where each element represents a document in bag-of-words
        format, where each key is a word and each value is the number of times
        that word occurs in the document.

    query : SArray of type str
        An SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term
        frequencies. Recommended values are [1.2, 2.0].

    b : float, optional
        Free parameter which controls how much to downweight scores for long
        documents. Recommended value is 0.75.

    Returns
    -------
    out : SFrame
        An SFrame containing the BM25 score for each document containing one of
        the query words. The doc_id column is the row number of the document.

    Examples
    --------

        >>> dataset = graphlab.SArray([
          {'a':5, 'b':7, 'c':10},
          {'a':3, 'c':1, 'd':2},
          {'a':10, 'b':3, 'e':5},
          {'a':1},
          {'f':5}])

        >>> query = ['a', 'b', 'c']
        >>> graphlab.text_analytics.bm25(dataset, query)


    References
    ----------
    .. [BM25] `"Okapi BM-25" <http://en.wikipedia.org/wiki/Okapi_BM25>`_
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.bm25')

    if type(dataset) != _graphlab.SArray:
        raise TypeError('bm25 requires an SArray of dict type, where each '
                        'element is a dictionary whose keys are words and '
                        'whose values are word frequencies.')

    if type(query) not in [_graphlab.SArray, dict, list]:
        raise TypeError('The query must either be an SArray of str type, '
                        'a list of strings, or a dictionary.')

    if type(query) == dict:
        query = list(query.keys())
    if type(query) == list:
        query = _graphlab.SArray(query)
    query = _SFrame({'word': query})

    # Add row ids
    N = len(dataset)
    d = _SFrame({'bow':dataset})
    d = d.add_row_number('doc_id')

    # Compute document length and average doc length
    d['doc_length'] = d['bow'].dict_values().apply(lambda x: sum(x))
    avg_doc_length = d['doc_length'].mean()

    # Convert into 'stacked' format
    d_stacked = d.stack('bow', new_column_name=['word', 'tf'], drop_na=False)
    query = query.join(d_stacked)

    # Compute number of docs in which each word appears
    doc_freq = query.groupby('word', {'doc_freq': _graphlab.aggregate.COUNT})
    query = query.join(doc_freq, on='word')

    # Compute IDF for each word in the query
    query['idf'] = query['doc_freq'].apply(lambda n: math.log((N - n + .5) / (n + .5)))

    # Compute the score of each word for each document
    query['word_score'] = query['idf'] * \
        (query['tf'] * (k1 + 1.0)) / \
        (query['tf'] + k1 * 1.0 * (1.0 - b + b * query['doc_length'] / avg_doc_length))

    # Compute BM25
    result = query.groupby('doc_id', {'bm25': _graphlab.aggregate.SUM('word_score')})
    return result.sort('doc_id')
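
# A plain-Python sketch of the same stack / join / groupby pipeline above
# (no graphlab calls), kept as a reference for what each SFrame step computes.
import math
from collections import Counter, defaultdict

_corpus = [{'a': 5, 'b': 7, 'c': 10}, {'a': 3, 'c': 1, 'd': 2},
           {'a': 10, 'b': 3, 'e': 5}, {'a': 1}, {'f': 5}]
_query, _k1, _b = {'a', 'b', 'c'}, 1.5, 0.75

_N = len(_corpus)
_doc_length = [sum(doc.values()) for doc in _corpus]
_avg_doc_length = sum(_doc_length) / float(_N)

# 'Stacked' (doc_id, word, tf) rows restricted to query words, as the join does.
_stacked = [(doc_id, w, tf) for doc_id, doc in enumerate(_corpus)
            for w, tf in doc.items() if w in _query]
_doc_freq = Counter(w for _, w, _ in _stacked)            # n(q_i) per word

_scores = defaultdict(float)
for doc_id, w, tf in _stacked:
    idf = math.log((_N - _doc_freq[w] + 0.5) / (_doc_freq[w] + 0.5))
    _scores[doc_id] += idf * tf * (_k1 + 1.0) / (
        tf + _k1 * (1.0 - _b + _b * _doc_length[doc_id] / _avg_doc_length))

# sorted(_scores.items()) mirrors the (doc_id, bm25) rows returned above.
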