import operator

import geohash  # python-geohash; provides geohash.neighbors()

# NOTE: the functions below are classmethods extracted from their Spark
# wrapper classes (each doc_scores variant lives on a different class in
# the original source, which is why the name repeats). Helpers such as
# cls.filter_min_doc_frequency and TFIDF.tfidf_score are defined elsewhere
# in the source tree.


def geo_aliases(cls, total_docs_by_geo, min_doc_count=1000):
    # total_docs_by_geo: RDD of (geohash, doc_count) pairs. Geohashes with
    # fewer than min_doc_count docs are aliased to whichever kept neighbor
    # has the most docs, so sparse cells can borrow a neighbor's statistics.
    keep_geos = total_docs_by_geo.filter(
        lambda (geo, count): count >= min_doc_count)
    alias_geos = total_docs_by_geo.subtract(keep_geos)
    return alias_geos.keys() \
        .flatMap(lambda key: [(neighbor, key)
                              for neighbor in geohash.neighbors(key)]) \
        .join(keep_geos) \
        .map(lambda (neighbor, (key, count)): (key, (neighbor, count))) \
        .groupByKey() \
        .map(lambda (key, values): (key, sorted(values, key=operator.itemgetter(1), reverse=True)[0][0]))
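# A minimal usage sketch for geo_aliases, assuming an active SparkContext
# is passed in; the sample doc counts are illustrative and not from the
# original source. Since geo_aliases never touches cls, the sketch passes
# cls=None.
def _geo_aliases_example(sc):
    dense = '9q8yy'
    sparse = geohash.neighbors(dense)[0]  # guaranteed adjacent to dense
    total_docs_by_geo = sc.parallelize([(dense, 5000), (sparse, 40)])
    aliases = geo_aliases(None, total_docs_by_geo, min_doc_count=1000)
    # Expected shape: [(sparse, dense)] -- sparse cell -> busiest kept neighbor
    print(aliases.collect())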
def doc_scores(cls, doc_words, doc_frequency, total_docs, min_count=1):
    # doc_words: RDD of (word, (doc_id, position)) pairs.
    # doc_frequency: RDD of (word, num_docs_containing_word) pairs.
    # Returns an RDD of (doc_id, [(word, tfidf_score), ...]) with each
    # document's words kept in positional order.
    if min_count > 1:
        doc_frequency = cls.filter_min_doc_frequency(doc_frequency,
                                                     min_count=min_count)

    num_partitions = doc_words.getNumPartitions()
    doc_ids_word_stats = doc_words.join(doc_frequency) \
        .map(lambda (word, ((doc_id, pos), doc_frequency)): (doc_id, (word, pos, doc_frequency)))

    docs_tfidf = doc_ids_word_stats.groupByKey() \
        .mapValues(lambda vals: [(word, TFIDF.tfidf_score(1.0, doc_frequency, total_docs))
                                 for word, pos, doc_frequency in sorted(vals, key=operator.itemgetter(1))])

    return docs_tfidf.coalesce(num_partitions)
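# Keep a handle on this variant before the later doc_scores definitions
# shadow the name in this flat extraction.
_tfidf_doc_scores = doc_scores

# The methods here assume a TFIDF.tfidf_score(term_frequency,
# doc_frequency, total_docs) helper defined elsewhere in the source tree.
# The stand-in below is a minimal sketch of a standard smoothed TF-IDF
# score with that signature; the real implementation may smooth or weight
# differently.
import math


class TFIDF(object):
    @classmethod
    def tfidf_score(cls, term_frequency, doc_frequency, total_docs):
        # +1 smoothing keeps the idf term finite for ubiquitous words
        return term_frequency * math.log(
            (1.0 + total_docs) / (1.0 + doc_frequency))


# Usage sketch with illustrative data (doc ids and words are made up):
def _doc_scores_example(sc):
    doc_words = sc.parallelize([('main', ('doc1', 0)), ('st', ('doc1', 1))])
    doc_frequency = sc.parallelize([('main', 87), ('st', 3200)])
    scores = _tfidf_doc_scores(None, doc_words, doc_frequency, total_docs=5000)
    # Expected shape: [('doc1', [('main', <score>), ('st', <score>)])]
    print(scores.collect())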
def doc_scores(cls, doc_words, geo_doc_frequency, total_docs_by_geo, min_count=1):
    # Per-geo variant: document frequencies and totals are keyed by geohash,
    # so a word's idf reflects how common it is within its own geo cell.
    # doc_words: RDD of ((geo, word), (doc_id, position)) pairs.
    # geo_doc_frequency: RDD of ((geo, word), count) pairs.
    # total_docs_by_geo: RDD of (geo, num_docs) pairs.
    if min_count > 1:
        geo_doc_frequency = cls.filter_min_doc_frequency(
            geo_doc_frequency, min_count=min_count)

    num_partitions = doc_words.getNumPartitions()
    # Re-key the per-geo frequencies by geo alone to pick up each geo's
    # total doc count, then key back to (geo, word) for the join below.
    geo_doc_frequency_totals = geo_doc_frequency.map(lambda ((geo, word), count): (geo, (word, count))) \
        .join(total_docs_by_geo) \
        .map(lambda (geo, ((word, count), num_docs)): ((geo, word), (count, num_docs)))

    doc_ids_word_stats = doc_words.join(geo_doc_frequency_totals) \
        .map(lambda ((geo, word), ((doc_id, pos), (doc_frequency, num_docs))): (doc_id, (word, pos, doc_frequency, num_docs)))

    docs_tfidf = doc_ids_word_stats.groupByKey() \
        .mapValues(lambda vals: [(word, TFIDF.tfidf_score(1.0, doc_frequency, num_docs))
                                 for word, pos, doc_frequency, num_docs in sorted(vals, key=operator.itemgetter(1))])

    return docs_tfidf.coalesce(num_partitions)
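# Handle for the per-geo variant before the next definition shadows it.
_geo_doc_scores = doc_scores


# Usage sketch with illustrative data; note that every input RDD here is
# keyed by the (geohash, word) pair rather than the word alone:
def _geo_doc_scores_example(sc):
    doc_words = sc.parallelize([
        (('9q8yy', 'main'), ('doc1', 0)),
        (('9q8yy', 'st'), ('doc1', 1)),
    ])
    geo_doc_frequency = sc.parallelize([
        (('9q8yy', 'main'), 87),
        (('9q8yy', 'st'), 3200),
    ])
    total_docs_by_geo = sc.parallelize([('9q8yy', 5000)])
    scores = _geo_doc_scores(None, doc_words, geo_doc_frequency,
                             total_docs_by_geo)
    # Expected shape: [('doc1', [('main', <score>), ('st', <score>)])]
    print(scores.collect())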
def doc_scores(cls, doc_words, word_info_gain):
    # Information-gain variant: instead of computing TF-IDF, each word
    # occurrence is joined against a precomputed (word, info_gain) RDD.
    # Returns an RDD of (doc_id, [(word, info_gain), ...]) in positional order.
    num_partitions = doc_words.getNumPartitions()
    doc_word_stats = doc_words.join(word_info_gain) \
        .map(lambda (word, ((doc_id, pos), info_gain)): (doc_id, (word, pos, info_gain)))

    docs_info_gain = doc_word_stats.groupByKey() \
        .mapValues(lambda vals: [(word, val) for word, pos, val in sorted(vals, key=operator.itemgetter(1))])

    return docs_info_gain.coalesce(num_partitions)
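# Usage sketch for the information-gain variant; the gain values are made
# up, and cls is unused by this variant so the sketch passes None:
def _info_gain_doc_scores_example(sc):
    doc_words = sc.parallelize([('main', ('doc1', 0)), ('st', ('doc1', 1))])
    word_info_gain = sc.parallelize([('main', 2.7), ('st', 0.4)])
    scores = doc_scores(None, doc_words, word_info_gain)
    # Expected shape: [('doc1', [('main', 2.7), ('st', 0.4)])]
    print(scores.collect())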
def geo_aliases(cls, total_docs_by_geo, min_doc_count=1000):
    # Python 3 port of geo_aliases above: tuple parameter unpacking in
    # lambdas was removed in Python 3, so pair elements are accessed by
    # index instead. Behavior is otherwise identical.
    keep_geos = total_docs_by_geo.filter(
        lambda geo_count: geo_count[1] >= min_doc_count)
    alias_geos = total_docs_by_geo.subtract(keep_geos)
    return alias_geos.keys() \
        .flatMap(lambda key: [(neighbor, key)
                              for neighbor in geohash.neighbors(key)]) \
        .join(keep_geos) \
        .map(lambda neighbor_key_count: (neighbor_key_count[1][0],
                                         (neighbor_key_count[0],
                                          neighbor_key_count[1][1]))) \
        .groupByKey() \
        .map(lambda key_values: (key_values[0],
                                 sorted(key_values[1],
                                        key=operator.itemgetter(1),
                                        reverse=True)[0][0]))