def run(name, anchors):
    """Recover topics from *anchors* and print clustering/coherence metrics.

    Uses the module-level ``dataset`` plus the ``ankura``/``numpy`` imports.
    All results are printed with *name* as a prefix; nothing is returned.

    NOTE(review): this definition is shadowed by an identical redefinition of
    ``run`` immediately below it in this file — one of the two should be
    removed.
    """
    topics = ankura.recover_topics(dataset, anchors)
    features = ankura.topic_combine(topics, dataset)
    # 90/10 train/test split of the topic features.
    train, test = ankura.pipeline.train_test_split(features, .9)
    vw_contingency = ankura.measure.vowpal_contingency(train, test, 'dirname')
    print(name, 'accuracy:', ankura.measure.vowpal_accuracy(train, test, 'dirname'))
    print(name, 'f-Measure:', vw_contingency.fmeasure())
    print(name, 'ari:', vw_contingency.ari())
    print(name, 'rand:', vw_contingency.rand())
    print(name, 'vi:', vw_contingency.vi())
    # Mean topic coherence at several summary sizes. The original repeated
    # this block verbatim for 10, 15, and 20; a single loop replaces it.
    for size in (10, 15, 20):
        coherence = [
            ankura.measure.topic_coherence(topic, dataset)
            for topic in ankura.topic.topic_summary_indices(topics, dataset, size)
        ]
        print(name, 'coherence-{}:'.format(size), numpy.mean(coherence))
def run(name, anchors):
    """Recover topics from *anchors* and print clustering/coherence metrics.

    Uses the module-level ``dataset`` plus the ``ankura``/``numpy`` imports.
    All results are printed with *name* as a prefix; nothing is returned.

    NOTE(review): this is a duplicate of the ``run`` defined just above and
    shadows it — one of the two definitions should be removed.
    """
    topics = ankura.recover_topics(dataset, anchors)
    features = ankura.topic_combine(topics, dataset)
    # 90/10 train/test split of the topic features.
    train, test = ankura.pipeline.train_test_split(features, .9)
    vw_contingency = ankura.measure.vowpal_contingency(train, test, 'dirname')
    print(name, 'accuracy:', ankura.measure.vowpal_accuracy(train, test, 'dirname'))
    print(name, 'f-Measure:', vw_contingency.fmeasure())
    print(name, 'ari:', vw_contingency.ari())
    print(name, 'rand:', vw_contingency.rand())
    print(name, 'vi:', vw_contingency.vi())
    # Mean topic coherence at several summary sizes. The original repeated
    # this block verbatim for 10, 15, and 20; a single loop replaces it.
    for size in (10, 15, 20):
        coherence = [
            ankura.measure.topic_coherence(topic, dataset)
            for topic in ankura.topic.topic_summary_indices(topics, dataset, size)
        ]
        print(name, 'coherence-{}:'.format(size), numpy.mean(coherence))
def demo():
    """Runs the newsgroups demo"""
    # Build the corpus, pick 20 anchors (projected to 500 dims), recover
    # topics, then print a 20-token summary line per topic.
    newsgroups = get_newsgroups()
    anchors = ankura.gramschmidt_anchors(newsgroups, 20, 500)
    topics = ankura.recover_topics(newsgroups, anchors)
    summaries = ankura.topic.topic_summary_tokens(topics, newsgroups, 20)
    for tokens in summaries:
        print(' '.join(tokens))
def topic_inference(raw_anchors):
    """Return inferred topics, a 15-token-per-topic summary, and the anchor tokens.

    ``raw_anchors`` is either ``None`` (use the configured default anchors)
    or a JSON-encoded list of user-supplied anchor token groups.
    """
    dataset = args.get_dataset()
    if raw_anchors is not None:
        # User-provided anchors arrive as JSON; normalize them to tuples.
        anchor_tokens = ankura.util.tuplize(json.loads(raw_anchors))
        anchors = user_anchors(anchor_tokens)
    else:
        anchor_tokens, anchors = args.default_anchors()
    topics = ankura.recover_topics(dataset, anchors, epsilon=1e-6)
    topic_summary = ankura.topic.topic_summary_tokens(topics, dataset, n=15)
    return topics, topic_summary, anchor_tokens