def amazon_anchors():
    """Retrieves default anchors for amazon using Gram-Schmidt"""
    corpus = get_amazon()
    anchors, anchor_indices = ankura.gramschmidt_anchors(
        corpus, 30, 500, return_indices=True)
    # Wrap each anchor's vocab token in its own singleton list so the
    # result matches the multiword-anchor token format.
    tokens = [[corpus.vocab[i]] for i in anchor_indices]
    return tokens, anchors
def newsgroup_anchors():
    """Retrieves default anchors for newsgroups using Gram-Schmidt"""
    corpus = get_newsgroups()
    anchors, anchor_indices = ankura.gramschmidt_anchors(
        corpus, 20, 500, return_indices=True)
    # Singleton token lists per anchor, mirroring the multiword-anchor format.
    tokens = [[corpus.vocab[i]] for i in anchor_indices]
    return tokens, anchors
def demo():
    """Runs the newsgroups demo"""
    # NOTE(review): `demo` is defined again later in this file; only the
    # last definition is bound at import time — confirm which is intended.
    corpus = get_newsgroups()
    anchors = ankura.gramschmidt_anchors(corpus, 20, 500)
    topics = ankura.recover_topics(corpus, anchors)
    summaries = ankura.topic.topic_summary_tokens(topics, corpus, 20)
    for summary in summaries:
        print(' '.join(summary))
def demo():
    """Runs the demo

    Compares clustering/classification metrics and topic coherence for
    anchors chosen by Gram-Schmidt versus several title-based anchor
    construction strategies, printing one labeled metric per line.
    """
    # NOTE(review): `demo` is defined more than once in this file; only the
    # last definition survives import — confirm which is intended.
    dataset = get_newsgroups()

    def run(name, anchors):
        # Recover topics from the anchors, featurize the corpus, and print
        # vowpal-based evaluation metrics plus coherence at three lengths.
        topics = ankura.recover_topics(dataset, anchors)
        features = ankura.topic_combine(topics, dataset)
        train, test = ankura.pipeline.train_test_split(features, .9)
        vw_contingency = ankura.measure.vowpal_contingency(
            train, test, 'dirname')
        print(name, 'accuracy:',
              ankura.measure.vowpal_accuracy(train, test, 'dirname'))
        print(name, 'f-Measure:', vw_contingency.fmeasure())
        print(name, 'ari:', vw_contingency.ari())
        print(name, 'rand:', vw_contingency.rand())
        print(name, 'vi:', vw_contingency.vi())
        # DRY: one loop over the summary lengths replaces three copy-pasted
        # coherence blocks; output is identical.
        for length in (10, 15, 20):
            coherence = [
                ankura.measure.topic_coherence(topic, dataset)
                for topic in ankura.topic.topic_summary_indices(
                    topics, dataset, length)
            ]
            print(name, 'coherence-%d:' % length, numpy.mean(coherence))

    run('default', ankura.gramschmidt_anchors(get_newsgroups(), 20, 500))
    run('title-avg', get_title_anchors(dataset, ankura.anchor.vector_average))
    run('title-min', get_title_anchors(dataset, ankura.anchor.vector_min))
    run('title-max', get_title_anchors(dataset, ankura.anchor.vector_max))
    run('title-or', get_title_anchors(dataset, ankura.anchor.vector_or))
def demo():
    """Runs the demo

    Evaluates Gram-Schmidt anchors against several title-derived anchor
    strategies on the newsgroups dataset, printing accuracy, clustering
    metrics, and topic coherence for each strategy.
    """
    # NOTE(review): this duplicates an earlier `demo` definition in the
    # file; only the last binding survives import — confirm intent.
    dataset = get_newsgroups()

    def run(name, anchors):
        # Recover topics, build topic features, and report metrics under
        # the given strategy label.
        topics = ankura.recover_topics(dataset, anchors)
        features = ankura.topic_combine(topics, dataset)
        train, test = ankura.pipeline.train_test_split(features, .9)
        vw_contingency = ankura.measure.vowpal_contingency(
            train, test, 'dirname')
        print(name, 'accuracy:',
              ankura.measure.vowpal_accuracy(train, test, 'dirname'))
        print(name, 'f-Measure:', vw_contingency.fmeasure())
        print(name, 'ari:', vw_contingency.ari())
        print(name, 'rand:', vw_contingency.rand())
        print(name, 'vi:', vw_contingency.vi())
        # DRY: iterate over summary lengths instead of three duplicated
        # coherence blocks; printed output is unchanged.
        for length in (10, 15, 20):
            coherence = [
                ankura.measure.topic_coherence(topic, dataset)
                for topic in ankura.topic.topic_summary_indices(
                    topics, dataset, length)
            ]
            print(name, 'coherence-%d:' % length, numpy.mean(coherence))

    run('default', ankura.gramschmidt_anchors(get_newsgroups(), 20, 500))
    run('title-avg', get_title_anchors(dataset, ankura.anchor.vector_average))
    run('title-min', get_title_anchors(dataset, ankura.anchor.vector_min))
    run('title-max', get_title_anchors(dataset, ankura.anchor.vector_max))
    run('title-or', get_title_anchors(dataset, ankura.anchor.vector_or))