def mapper1(key, value):
    """Emit one ``((row_id, issue_id, word), 1)`` pair per search feature.

    Starting with input like:
        K=>row_id, V=>1<TAB><html>...Earthquake strikes in <b>Chile!<b>...</html>
    where 1 is the issue ID and <TAB> is a literal tab character.

    Output (the exact words depend on what ``search_features`` extracts):
        K=>(row_id, '1', 'earthquake'), V=>1
        K=>(row_id, '1', 'strike'), V=>1
        K=>(row_id, '1', 'chile'), V=>1
        ...

    The issue ID is passed through as the string that precedes the first
    tab; it is not converted to an integer.

    Args:
        key: Row identifier, copied unchanged into every emitted key.
        value: String of the form ``"<issue_id>\\t<html document>"``.

    Yields:
        ``((key, issue_id, word), 1)`` for every word produced by
        ``search_features`` on the story text extracted from the HTML.

    Raises:
        ValueError: If ``value`` contains no tab separator.
    """
    # Split on the first tab only: the HTML payload may itself contain
    # tab characters, which would otherwise break the two-way unpacking.
    issue_id, doc = value.split('\t', 1)
    story = html_to_story(doc)
    for word in search_features(story):
        yield (key, issue_id, word), 1
def classify(self, words):
    """Rank classifications for ``words`` and return the five best.

    The features extracted from ``words`` by ``search_features`` are
    hashed with ``hash_key`` and looked up in the cache in one
    ``get_multi`` call. The values found are used as row indices of a
    sparse ``(self.num_features, 1)`` column of ones, which is transposed
    into a single row and passed to ``self.model.predict_log_proba``.

    Args:
        words: Input handed to ``search_features``.

    Returns:
        A list of at most five ``(label, score)`` tuples sorted by
        descending score, where ``label`` is
        ``self.classification_dict.get(index)`` (``None`` when the index
        is not in the dict) and ``score`` is the model output at that
        index. An empty list is returned when no feature is found in the
        cache, or when the model output is empty or entirely zero.
    """
    cache = get_cache()
    lookup_keys = [hash_key(token) for token in search_features(words)]
    feature_rows = list(cache.get_multi(lookup_keys).values())

    # None of the features are known to the cache.
    if not feature_rows:
        return []

    count = len(feature_rows)
    row_idx = numpy.array(feature_rows)
    col_idx = numpy.array([0] * count)
    ones = numpy.array([1] * count)

    # Build the features as a column, then flip it into a single row.
    column = csc_matrix(
        (ones, (row_idx, col_idx)),
        shape=(self.num_features, 1),
    )
    log_probs = self.model.predict_log_proba(column.transpose())

    if not (log_probs.any() and len(log_probs) != 0):
        return []

    ranked = [
        (self.classification_dict.get(index), score)
        for index, score in enumerate(log_probs[0])
    ]
    # Stable in-place sort, highest score first; keep the top five.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:5]