Example #1
import ankura


def amazon_anchors():
    """Retrieves default anchors for amazon using Gram-Schmidt"""
    dataset = get_amazon()  # corpus loader defined elsewhere in this module
    # 30 anchors, one per topic; 500 presumably thresholds candidate
    # anchor words by document frequency
    anchors, indices = ankura.gramschmidt_anchors(dataset, 30, 500,
                                                  return_indices=True)
    # each anchor token is wrapped in its own list (ankura's anchor format)
    anchor_tokens = [[dataset.vocab[index]] for index in indices]
    return anchor_tokens, anchors
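A quick sketch of how the two return values line up (the shapes below are assumptions about `gramschmidt_anchors`, not something this example asserts):

anchor_tokens, anchors = amazon_anchors()
print(len(anchor_tokens))  # 30: one singleton token list per requested topic
print(anchor_tokens[0])    # e.g. ['great'] -- the vocab entry behind anchor 0
print(anchors.shape)       # presumably 30 x vocabulary size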
Example #2
import ankura


def newsgroup_anchors():
    """Retrieves default anchors for newsgroups using Gram-Schmidt"""
    dataset = get_newsgroups()
    anchors, indices = ankura.gramschmidt_anchors(dataset, 20, 500,
                                                  return_indices=True)
    anchor_tokens = [[dataset.vocab[index]] for index in indices]
    return anchor_tokens, anchors
Example #3
import ankura


def demo():
    """Runs the newsgroups demo"""
    dataset = get_newsgroups()
    # without return_indices, only the anchor vectors come back
    anchors = ankura.gramschmidt_anchors(dataset, 20, 500)
    topics = ankura.recover_topics(dataset, anchors)

    # print the top 20 tokens of each recovered topic
    for topic in ankura.topic.topic_summary_tokens(topics, dataset, 20):
        print(' '.join(topic))
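One way to sanity-check the recovered model, reusing the names from the demo above (a sketch assuming `topics` is the usual vocabulary-by-topics matrix of word distributions that anchor-word recovery produces):

import numpy

topics = ankura.recover_topics(dataset, anchors)
print(topics.shape)                           # presumably len(dataset.vocab) x 20
print(numpy.allclose(topics.sum(axis=0), 1))  # each column should be a distribution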
Example #4
import ankura
import numpy


def demo():
    """Runs the demo"""
    dataset = get_newsgroups()

    def run(name, anchors):
        topics = ankura.recover_topics(dataset, anchors)
        features = ankura.topic_combine(topics, dataset)
        train, test = ankura.pipeline.train_test_split(features, .9)

        # 'dirname' is presumably the metadata attribute holding each
        # document's newsgroup label
        vw_contingency = ankura.measure.vowpal_contingency(
            train, test, 'dirname')
        print(name, 'accuracy:',
              ankura.measure.vowpal_accuracy(train, test, 'dirname'))
        print(name, 'f-Measure:', vw_contingency.fmeasure())
        print(name, 'ari:', vw_contingency.ari())
        print(name, 'rand:', vw_contingency.rand())
        print(name, 'vi:', vw_contingency.vi())

        # average topic coherence over the top 10, 15, and 20 words per topic
        for n in (10, 15, 20):
            coherence = [ankura.measure.topic_coherence(topic, dataset)
                         for topic in ankura.topic.topic_summary_indices(
                             topics, dataset, n)]
            print(name, 'coherence-{}:'.format(n), numpy.mean(coherence))

    # reuse the dataset loaded above instead of fetching the corpus again
    run('default', ankura.gramschmidt_anchors(dataset, 20, 500))
    # anchors built from document titles, combining title word vectors four ways
    run('title-avg', get_title_anchors(dataset, ankura.anchor.vector_average))
    run('title-min', get_title_anchors(dataset, ankura.anchor.vector_min))
    run('title-max', get_title_anchors(dataset, ankura.anchor.vector_max))
    run('title-or', get_title_anchors(dataset, ankura.anchor.vector_or))
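The coherence numbers printed above come from `ankura.measure.topic_coherence`; the standard metric in this setting is UMass coherence (Mimno et al., 2011), which scores a topic's top words by how often they co-occur in documents. A self-contained sketch of that textbook formula, for reference only (not necessarily ankura's exact implementation):

import itertools
import math

def umass_coherence(top_words, docs, eps=1e-2):
    """UMass coherence: sum over ranked word pairs (wi before wj) of
    log((D(wi, wj) + eps) / D(wi)), where D counts containing documents."""
    def doc_freq(*words):
        return sum(1 for doc in docs if all(w in doc for w in words))
    return sum(math.log((doc_freq(wi, wj) + eps) / doc_freq(wi))
               for wi, wj in itertools.combinations(top_words, 2))

# toy usage: three documents represented as token sets
print(umass_coherence(['space', 'nasa', 'orbit'],
                      [{'space', 'nasa'}, {'nasa', 'orbit', 'space'}, {'hockey'}]))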