def mapper1(key, value):
    """Emit one ``((row_id, issue_id, word), 1)`` pair per document feature.

    Starting with input like:
    K=>row_id, V=>1<TAB><html>...Earthquake strikes in <b>Chile!<b>...</html>
    where 1 is the issue ID (everything before the first tab).

    Output (the words are whatever ``search_features`` produces for the
    story text, e.g.):
    K=>(row_id, '1', 'earthquake'), V=>1
    K=>(row_id, '1', 'strike'), V=>1
    K=>(row_id, '1', 'chile'), V=>1
    ...

    Args:
        key: Row identifier, passed through unchanged as the first key field.
        value: String of the form ``"<issue_id>\\t<html document>"``.

    Yields:
        ``((key, issue_id, word), 1)`` for each item returned by
        ``search_features``; ``issue_id`` is kept as the string read from
        ``value``.

    Raises:
        ValueError: If ``value`` contains no tab separator.
    """
    # Split on the first tab only: the HTML payload may itself contain tabs,
    # and an unbounded split would then fail to unpack into two names.
    issue_id, doc = value.split('\t', 1)

    doc = html_to_story(doc)

    for word in search_features(doc):
        yield (key, issue_id, word), 1
Exemple #2
0
def mapper1(key, value):
    """Map a tab-separated ``issue_id`` / HTML record to per-word count pairs.

    Starting with input like:
    K=>row_id, V=>1<TAB><html>...Earthquake strikes in <b>Chile!<b>...</html>
    where 1 is the issue ID (the text preceding the first tab).

    Output (one pair per item returned by ``search_features``, e.g.):
    K=>(row_id, '1', 'earthquake'), V=>1
    K=>(row_id, '1', 'strike'), V=>1
    K=>(row_id, '1', 'chile'), V=>1
    ...

    Args:
        key: Row identifier; emitted unchanged as the first element of
            every output key.
        value: ``"<issue_id>\\t<html document>"`` string.

    Yields:
        ``((key, issue_id, word), 1)`` tuples, with ``issue_id`` left as the
        string taken from ``value``.

    Raises:
        ValueError: If ``value`` has no tab in it.
    """
    # maxsplit=1 keeps any tabs inside the HTML body as part of the document
    # instead of breaking the two-value unpacking.
    issue_id, doc = value.split('\t', 1)

    doc = html_to_story(doc)

    for word in search_features(doc):
        yield (key, issue_id, word), 1
Exemple #3
0
    def classify(self, words, top_n=5):
        """Rank the model's classes for a document by log-probability.

        Features extracted from ``words`` by ``search_features`` are turned
        into cache keys with ``hash_key`` and fetched in a single
        ``get_multi`` call. The cached values are used as row indices of a
        one-column sparse vector with ``self.num_features`` rows, which is
        transposed into a single sample row and passed to
        ``self.model.predict_log_proba``.

        Args:
            words: Document content, passed straight to ``search_features``.
            top_n: Maximum number of results to return. Defaults to 5 (the
                previously hard-coded limit); ``None`` returns every class.

        Returns:
            A list of ``(label, log_prob)`` tuples sorted by descending
            ``log_prob``, where ``label`` is
            ``self.classification_dict.get(class_index)`` (``None`` when the
            index has no entry). An empty list is returned when the cache
            lookup yields no values, or when the model output is empty or
            all zeros.
        """
        memcache = get_cache()
        cache_keys = [hash_key(feature) for feature in search_features(words)]
        # Presumably get_multi returns only the keys that were found, so
        # features missing from the cache contribute nothing -- TODO confirm.
        row_vals = list(memcache.get_multi(cache_keys).values())

        # Words are unknown
        if not row_vals:
            return []

        count = len(row_vals)
        rows = numpy.array(row_vals)
        # Every entry sits in column 0 with value 1; entries that repeat the
        # same row index are summed when the CSC matrix is built.
        cols = numpy.zeros(count, dtype=int)
        data = numpy.ones(count, dtype=int)
        matrix = csc_matrix((data, (rows, cols)), shape=(self.num_features, 1))

        # The model is given one sample per row: 1 x num_features.
        probs = self.model.predict_log_proba(matrix.transpose())
        # NOTE(review): an all-zero output is treated as "no result"; for
        # log-probabilities that means every class has probability 1 --
        # confirm this is the intended sentinel.
        if len(probs) == 0 or not probs.any():
            return []

        ranked = sorted(
            ((self.classification_dict.get(idx), prob)
             for idx, prob in enumerate(probs[0])),
            key=lambda item: item[1],
            reverse=True
        )
        return ranked[:top_n]