Example #1
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    for attr in analysis.dataset.attribute_set.all():
        for val in attr.value_set.all():
            metric_name = 'Token Count for %s: %s' % (attr.name, val.value)
            print metric_name
            try:
                metric = TopicMetric.objects.get(name=metric_name,
                        analysis=analysis)
                if not force_import:
                    raise RuntimeError('%s is already in the database for this '
                            'analysis!' % metric_name)
            except TopicMetric.DoesNotExist:
                metric = TopicMetric(name=metric_name, analysis=analysis)
                metric.save()
            topics = analysis.topics.all()
            docs = [d.id for d in analysis.dataset.documents.filter(
                    attributevaluedocument__attribute=attr,
                    attributevaluedocument__value=val)]
            for topic in topics:
                count = 0
                for dt in topic.documenttopic_set.filter(document__id__in=docs):
                    count += dt.count
                tmv = TopicMetricValue(topic=topic, metric=metric, value=count)
                tmv.save()
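
All of these importers share the same entry point: they take a dataset name and an analysis name, look up the Analysis, and write one TopicMetricValue per topic. A hedged usage sketch (the names and the counts path are placeholders, not from the original project):

add_metric('my_dataset', 'my_analysis', force_import=True)
# some importers read extra keyword arguments, e.g. the PMI examples below:
add_metric('my_dataset', 'my_analysis', counts='/path/to/counts.sqlite3')
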
Example #2
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    for attr in analysis.dataset.attribute_set.all():
        for val in attr.value_set.all():
            metric_name = 'Document Entropy for %s: %s' % (attr.name,
                                                           val.value)
            print metric_name
            try:
                metric = TopicMetric.objects.get(name=metric_name,
                                                 analysis=analysis)
                if not force_import:
                    raise RuntimeError(
                        '%s is already in the database for this '
                        'analysis!' % metric_name)
            except TopicMetric.DoesNotExist:
                metric = TopicMetric(name=metric_name, analysis=analysis)
                metric.save()
            topics = analysis.topics.all()
            docs = [
                d.id for d in analysis.dataset.documents.filter(
                    attributevaluedocument__attribute=attr,
                    attributevaluedocument__value=val)
            ]
            for topic in topics:
                ent = 0
                for dt in topic.documenttopic_set.filter(
                        document__id__in=docs):
                    # float() avoids integer division under Python 2
                    prob = float(dt.count) / topic.total_count
                    ent -= prob * (log(prob) / log(2))
                tmv = TopicMetricValue(topic=topic, metric=metric, value=ent)
                tmv.save()
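
The same base-2 entropy loop recurs in several of the later examples (word entropy, value entropy, document entropy). A small helper could factor it out; this is only a sketch, not part of the original module, and it assumes the caller passes already-normalized probabilities:

from math import log

def entropy_bits(probabilities):
    # Shannon entropy in bits; zero probabilities are skipped to avoid log(0).
    return -sum(p * (log(p) / log(2)) for p in probabilities if p > 0)
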
Example #3
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name='Percent Tokens Positive '
                'Sentiment', analysis=analysis)
        if not force_import:
            raise RuntimeError('Sentiment is already in the database '
                    'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name='Percent Tokens Positive Sentiment',
                analysis=analysis)
        metric.save()

    # call stuff to classify documents and get sentiment information, as in
    # parse_dependencies.py

    data_root = analysis.dataset.dataset_dir
    topics = analysis.topics.all()
    for topic in topics:
        positive = 0
        negative = 0  # never updated below; only the positive count is used
        for docTopic in topic.documenttopic_set.all():
            filename = data_root + '/' + docTopic.document.filename
            print topic, filename
            sentiment = float(sentiment_document(filename))
            print 'sentiment returned:', sentiment
            if sentiment == 1:
                positive += docTopic.count
            print '%d/%d' % (positive, topic.total_count)
        # compute aggregate information for topic
        topicSentiment = float(positive)/float(topic.total_count)
        tmv = TopicMetricValue(topic=topic, metric=metric, value=topicSentiment)
        tmv.save()
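
sentiment_document is called above but never defined; the comment points at parse_dependencies.py for the real classifier. A placeholder stub showing only the interface the loop relies on (purely illustrative):

def sentiment_document(filename):
    # Hypothetical stand-in for the real classifier referenced in
    # parse_dependencies.py. It must return something float() accepts,
    # with 1 meaning the document was classified as positive.
    raise NotImplementedError('plug in the real sentiment classifier')
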
Example #4
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    # metric_name is referenced below but never set in the original snippet;
    # the name here is only a guess, any descriptive label would do.
    metric_name = 'Average Pairwise PMI'
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('%s is already in the database for this '
                               'analysis!' % metric_name)
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()

    conn = sqlite3.connect(kwargs['counts'])
    c = conn.cursor()
    c.execute("select words from total_counts")
    for row in c:
        total_words = float(row[0])
    c.execute("select cooccurrences from total_counts")
    for row in c:
        total_cooccurrences = float(row[0])
    topics = analysis.topics.all()
    for topic in topics:
        topicwords = topic.topicword_set.filter(
            word__ngram=False).order_by('-count')
        # We just grab the first ten words - there's probably a better way to
        # do this
        words = [tw.word.type for tw in topicwords[:10]]
        total_pmi = 0
        for w1 in words:
            for w2 in words:
                if w1 == w2: continue
                total_pmi += compute_pmi(w1, w2, c, total_words,
                                         total_cooccurrences)
        average_pmi = total_pmi / (len(words)**2)
        tmv = TopicMetricValue(topic=topic, metric=metric, value=average_pmi)
        tmv.save()
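
compute_pmi is called above but not shown. A minimal sketch of pointwise mutual information, under the assumption that the counts database has a word_counts table (word, count) and a cooccurrence_counts table (word1, word2, count); those table and column names are guesses, not taken from the original schema:

from math import log

def compute_pmi(w1, w2, cursor, total_words, total_cooccurrences):
    # PMI(w1, w2) = log( p(w1, w2) / (p(w1) * p(w2)) )
    # The table and column names below are assumptions about the counts db.
    cursor.execute('select count from word_counts where word = ?', (w1,))
    row1 = cursor.fetchone()
    cursor.execute('select count from word_counts where word = ?', (w2,))
    row2 = cursor.fetchone()
    cursor.execute('select count from cooccurrence_counts '
                   'where word1 = ? and word2 = ?', (w1, w2))
    row12 = cursor.fetchone()
    if not row1 or not row2 or not row12 or row12[0] == 0:
        return 0.0
    p1 = row1[0] / total_words
    p2 = row2[0] / total_words
    p12 = row12[0] / total_cooccurrences
    return log(p12 / (p1 * p2))
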
Example #6
def add_metric(dataset, analysis, force_import=False):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name='Number of tokens',
                analysis=analysis)
        if not force_import:
            raise RuntimeError('Number of tokens is already in the database '
                    'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name='Number of tokens', analysis=analysis)
        metric.save()
    topics = analysis.topics.all()
    for topic in topics:
        tmv = TopicMetricValue(topic=topic, metric=metric,
                value=topic.tokens.count())
        tmv.save()
Example #7
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name='Word Entropy', analysis=analysis)
        if not force_import:
            raise RuntimeError('Word Entropy is already in the database '
                    'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name='Word Entropy', analysis=analysis)
        metric.save()
    topics = analysis.topic_set.all()
    for topic in topics:
        entropy = 0
        for tw in topic.topicword_set.all():
            # float() avoids integer division under Python 2
            prob = float(tw.count) / topic.total_count
            entropy -= prob * (log(prob) / log(2))
        tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
        tmv.save()
    transaction.commit()
Example #8
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name='Number of types',
                analysis=analysis)
        if not force_import:
            raise RuntimeError('Number of types is already in the database '
                    'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name='Number of types', analysis=analysis)
        metric.save()
    topics = analysis.topics.all()
    for topic in topics:
        # Alternative left commented out in the original:
        # WordType.objects.filter(tokens__topics__contains=topic).all()
        types = set(x[0] for x in topic.tokens.values_list('type__type'))
        tmv = TopicMetricValue(topic=topic, metric=metric,
                value=len(types))
        tmv.save()
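
For counting distinct word types, Django querysets also support a flat values_list combined with distinct(), which pushes the deduplication into SQL instead of building a Python set; a possible one-line alternative:

        num_types = (topic.tokens.values_list('type__type', flat=True)
                     .distinct().count())
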
Example #9
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    for attr in analysis.dataset.attribute_set.all():
        name = 'Value Entropy for Attribute %s' % attr.name
        try:
            metric = TopicMetric.objects.get(name=name, analysis=analysis)
            if not force_import:
                raise RuntimeError('%s is already in the database for this '
                        'analysis!' % name)
        except TopicMetric.DoesNotExist:
            metric = TopicMetric(name=name, analysis=analysis)
            metric.save()
        topics = analysis.topics.all()
        for topic in topics:
            entropy = 0
            for avt in topic.attributevaluetopic_set.filter(attribute=attr):
                # float() avoids integer division under Python 2
                prob = float(avt.count) / topic.total_count
                entropy -= prob * (log(prob) / log(2))
            tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
            tmv.save()
Example #10
def add_metric(dataset, analysis, force_import=False):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        ## TODO: add more intelligent checking; check for any TopicMetricValues
        metric = TopicMetric.objects.get(name='Word Entropy', analysis=analysis)
        if not force_import:
            raise RuntimeError('Word Entropy is already in the database '
                    'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name='Word Entropy', analysis=analysis)
        metric.save()
    topics = analysis.topics.all()
    for topic in topics:
        total_count = float(topic.tokens.count())
        topictokencounts = topic.tokens.values('type__type').annotate(
            count=Count('type__type'))
        entropy = 0
        for tw in topictokencounts:
            prob = float(tw['count']) / total_count
            entropy -= prob * (log(prob) / log(2))
        tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
        tmv.save()
Example #11
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    # metric_name is never set in the original snippet; the name below is a
    # guess based on what the loop computes.
    metric_name = 'Document Entropy'
    try:
        metric = TopicMetric.objects.get(name=metric_name,
                analysis=analysis)
        if not force_import:
            raise RuntimeError('%s is already in the database for this '
                    'analysis!' % metric_name)
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    topics = analysis.topics.all()
    for topic in topics:
        total_count = float(topic.tokens.count())
        entropy = 0
        doctopic_counts = topic.tokens.values('document__id').annotate(
            count=Count('document__id'))
        for dt in doctopic_counts:
            prob = float(dt['count']) / total_count
            entropy -= prob * (log(prob) / log(2))
        tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
        tmv.save()
Example #12
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    # metric_name is never set in the original snippet; the name below is a
    # guess, since the metric stores the per-topic alpha from the state file.
    metric_name = 'Topic Alpha'

    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('%s is already in the database for this '
                    'analysis!' % metric_name)
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    if 'state_file' not in kwargs:
        raise RuntimeError('I need a state file for this metric!')
    state_file = open(kwargs['state_file'])
    # this is specific to mallet state files!
    _ = state_file.readline()
    alpha_vector = state_file.readline()
    alphas = alpha_vector.split(': ')[1].split()
    for number, alpha in enumerate(alphas):
        topic = analysis.topic_set.get(number=number)
        tmv = TopicMetricValue(topic=topic, metric=metric,
                value=float(alpha))
        tmv.save()
    transaction.commit()
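
The parsing above assumes the MALLET topic-state layout, where the first line is a column header and the second line carries one alpha value per topic after an '#alpha : ' prefix. A small standalone illustration of that split (the values are made up):

header_line = '#doc source pos typeindex type topic'
alpha_line = '#alpha : 0.06 0.06 0.06'
alphas = alpha_line.split(': ')[1].split()   # -> ['0.06', '0.06', '0.06']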