def add_metric(dataset, analysis, force_import=False, *args, **kwargs): analysis = Analysis.objects.get(dataset__name=dataset, name=analysis) for attr in analysis.dataset.attribute_set.all(): for val in attr.value_set.all(): metric_name = 'Token Count for %s: %s' % (attr.name, val.value) print metric_name try: metric = TopicMetric.objects.get(name=metric_name, analysis=analysis) if not force_import: raise RuntimeError('%s is already in the database for this ' 'analysis!' % metric_name) except TopicMetric.DoesNotExist: metric = TopicMetric(name=metric_name, analysis=analysis) metric.save() topics = analysis.topics.all() docs = [d.id for d in analysis.dataset.documents.filter( attributevaluedocument__attribute=attr, attributevaluedocument__value=val)] for topic in topics: count = 0 for dt in topic.documenttopic_set.filter(document__id__in=docs): count += dt.count tmv = TopicMetricValue(topic=topic, metric=metric, value=count) tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs): analysis = Analysis.objects.get(dataset__name=dataset, name=analysis) for attr in analysis.dataset.attribute_set.all(): for val in attr.value_set.all(): metric_name = 'Document Entropy for %s: %s' % (attr.name, val.value) print metric_name try: metric = TopicMetric.objects.get(name=metric_name, analysis=analysis) if not force_import: raise RuntimeError( '%s is already in the database for this ' 'analysis!' % metric_name) except TopicMetric.DoesNotExist: metric = TopicMetric(name=metric_name, analysis=analysis) metric.save() topics = analysis.topics.all() docs = [ d.id for d in analysis.dataset.documents.filter( attributevaluedocument__attribute=attr, attributevaluedocument__value=val) ] for topic in topics: ent = 0 for dt in topic.documenttopic_set.filter( document__id__in=docs): prob = dt.count / topic.total_count ent -= prob * (log(prob) / log(2)) tmv = TopicMetricValue(topic=topic, metric=metric, value=ent) tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs): analysis = Analysis.objects.get(dataset__name=dataset, name=analysis) try: metric = TopicMetric.objects.get(name='Percent Tokens Positive ' 'Sentiment', analysis=analysis) if not force_import: raise RuntimeError('Sentiment is already in the database ' 'for this analysis!') except TopicMetric.DoesNotExist: metric = TopicMetric(name='Percent Tokens Positive Sentiment', analysis=analysis) metric.save() # call stuff to classify documents and get sentiment information, as in # parse_dependencies.py data_root = analysis.dataset.dataset_dir topics = analysis.topics.all() for topic in topics: positive = 0; negative = 0; for docTopic in topic.documenttopic_set.all(): filename = data_root + '/' + docTopic.document.filename print topic, filename sentiment = float(sentiment_document(filename)) print 'sentiment returned:', sentiment if sentiment == 1 : positive += docTopic.count print '%d/%d' % (positive, topic.total_count) # compute aggregate information for topic topicSentiment = float(positive)/float(topic.total_count) tmv = TopicMetricValue(topic=topic, metric=metric, value=topicSentiment) tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    """Add an average pairwise PMI (coherence) metric to each topic.

    Requires kwargs['counts']: path to a sqlite3 database with a
    total_counts table (words, cooccurrences columns) used by
    compute_pmi().  PMI is averaged over all ordered pairs of the topic's
    top ten non-ngram words.

    Raises RuntimeError if the metric already exists and force_import is
    False.
    """
    # BUG FIX: metric_name was never defined, so every call raised
    # NameError.  TODO(review): confirm this is the display name the rest
    # of the project expects for this metric.
    metric_name = 'Coherence'
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('%s is already in the database for this '
                               'analysis!' % metric_name)
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    conn = sqlite3.connect(kwargs['counts'])
    c = conn.cursor()
    c.execute("select words from total_counts")
    for row in c:
        total_words = float(row[0])
    c.execute("select cooccurrences from total_counts")
    for row in c:
        total_cooccurrences = float(row[0])
    topics = analysis.topics.all()
    for topic in topics:
        topicwords = topic.topicword_set.filter(
                word__ngram=False).order_by('-count')
        # We just grab the first ten words - there's probably a better way to
        # do this
        words = [tw.word.type for tw in topicwords[:10]]
        total_pmi = 0
        for w1 in words:
            for w2 in words:
                if w1 == w2:
                    continue
                total_pmi += compute_pmi(w1, w2, c, total_words,
                                         total_cooccurrences)
        # NOTE(review): divides by n^2 although only n*(n-1) ordered pairs
        # contribute (w1 == w2 is skipped) — preserved as-is.
        average_pmi = total_pmi / (len(words) ** 2)
        tmv = TopicMetricValue(topic=topic, metric=metric, value=average_pmi)
        tmv.save()
    # BUG FIX: close the sqlite connection instead of leaking it.
    conn.close()
def add_metric(dataset, analysis, force_import=False):
    """Add a 'Number of tokens' metric (token count) to each topic.

    Raises RuntimeError if the metric already exists and force_import is
    False.
    """
    # BUG FIX: the existence check used an undefined 'metric_name'
    # (NameError on every call); bind it to the literal name the metric
    # is created with so lookup and creation agree.
    metric_name = 'Number of tokens'
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('Number of tokens is already in the database '
                               'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    topics = analysis.topics.all()
    for topic in topics:
        tmv = TopicMetricValue(topic=topic, metric=metric,
                               value=topic.tokens.count())
        tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    """Add a 'Word Entropy' metric: entropy (in bits) of each topic's
    word distribution, computed from TopicWord counts.

    Raises RuntimeError if the metric already exists and force_import is
    False.
    """
    # BUG FIX: the existence check used an undefined 'metric_name'
    # (NameError on every call); bind it to the literal name the metric
    # is created with.
    metric_name = 'Word Entropy'
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('Word Entropy is already in the database '
                               'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    topics = analysis.topic_set.all()
    for topic in topics:
        entropy = 0
        for tw in topic.topicword_set.all():
            # BUG FIX: force float division.  Under Python 2, int/int
            # truncates, so prob was 0 and log(0) raised ValueError.
            prob = float(tw.count) / topic.total_count
            entropy -= prob * (log(prob) / log(2))
        tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
        tmv.save()
    transaction.commit()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    """Add a 'Number of types' metric: count of distinct word types among
    each topic's tokens.

    Raises RuntimeError if the metric already exists and force_import is
    False.
    """
    # BUG FIX: the existence check used an undefined 'metric_name'
    # (NameError on every call); bind it to the literal name the metric
    # is created with.
    metric_name = 'Number of types'
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('Number of types is already in the database '
                               'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    topics = analysis.topics.all()
    for topic in topics:
        # Distinct word-type strings among this topic's tokens.
        types = set(x[0] for x in topic.tokens.values_list('type__type'))
        tmv = TopicMetricValue(topic=topic, metric=metric, value=len(types))
        tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    """Add a 'Value Entropy for Attribute <name>' metric per topic.

    For each dataset attribute, computes the entropy (in bits) of the
    topic's token distribution over that attribute's values.

    Raises RuntimeError if the metric already exists and force_import is
    False.
    """
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    for attr in analysis.dataset.attribute_set.all():
        name = 'Value Entropy for Attribute %s' % attr.name
        try:
            metric = TopicMetric.objects.get(name=name, analysis=analysis)
            if not force_import:
                raise RuntimeError('%s is already in the database for this '
                                   'analysis!' % name)
        except TopicMetric.DoesNotExist:
            metric = TopicMetric(name=name, analysis=analysis)
            metric.save()
        topics = analysis.topics.all()
        for topic in topics:
            entropy = 0
            for avt in topic.attributevaluetopic_set.filter(attribute=attr):
                # BUG FIX: force float division.  Under Python 2, int/int
                # truncates, so prob was 0 and log(0) raised ValueError.
                prob = float(avt.count) / topic.total_count
                entropy -= prob * (log(prob) / log(2))
            tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
            tmv.save()
def add_metric(dataset, analysis, force_import=False):
    """Add a 'Word Entropy' metric computed from token-level type counts.

    Raises RuntimeError if the metric already exists and force_import is
    False.
    """
    ## TODO: add more intelligent checking; check for any TopicMetricValues
    # BUG FIX: the existence check used an undefined 'metric_name'
    # (NameError on every call); bind it to the literal name the metric
    # is created with.
    metric_name = 'Word Entropy'
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('Word Entropy is already in the database '
                               'for this analysis!')
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    topics = analysis.topics.all()
    for topic in topics:
        total_count = float(topic.tokens.count())
        # Per-type token counts within this topic.
        topictokencounts = topic.tokens.values('type__type').annotate(
                count=Count('type__type'))
        entropy = 0
        for tw in topictokencounts:
            prob = float(tw['count']) / total_count
            entropy -= prob * (log(prob) / log(2))
        tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
        tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    """Add a document-entropy metric: entropy (in bits) of each topic's
    token distribution over documents.

    Raises RuntimeError if the metric already exists and force_import is
    False.
    """
    # BUG FIX: metric_name was never defined, so every call raised
    # NameError.  TODO(review): confirm this is the display name the rest
    # of the project expects for this metric.
    metric_name = 'Document Entropy'
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('%s is already in the database for this '
                               'analysis!' % metric_name)
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    topics = analysis.topics.all()
    for topic in topics:
        total_count = float(topic.tokens.count())
        entropy = 0
        # Per-document token counts within this topic.
        doctopic_counts = topic.tokens.values('document__id').annotate(
                count=Count('document__id'))
        for dt in doctopic_counts:
            prob = float(dt['count']) / total_count
            entropy -= prob * (log(prob) / log(2))
        tmv = TopicMetricValue(topic=topic, metric=metric, value=entropy)
        tmv.save()
def add_metric(dataset, analysis, force_import=False, *args, **kwargs):
    """Store each topic's Dirichlet alpha, read from a MALLET state file,
    as a TopicMetricValue.

    Requires kwargs['state_file']: path to a MALLET state file whose
    second line contains the alpha vector after ': '.

    Raises RuntimeError if the metric already exists and force_import is
    False, or if no state file was supplied.
    """
    # BUG FIX: metric_name was never defined, so every call raised
    # NameError.  TODO(review): confirm this is the display name the rest
    # of the project expects for this metric.
    metric_name = 'Alpha'
    analysis = Analysis.objects.get(dataset__name=dataset, name=analysis)
    try:
        metric = TopicMetric.objects.get(name=metric_name, analysis=analysis)
        if not force_import:
            raise RuntimeError('%s is already in the database for this '
                               'analysis!' % metric_name)
    except TopicMetric.DoesNotExist:
        metric = TopicMetric(name=metric_name, analysis=analysis)
        metric.save()
    if 'state_file' not in kwargs:
        raise RuntimeError('I need a state file for this metric!')
    # this is specific to mallet state files!
    state_file = open(kwargs['state_file'])
    try:
        _ = state_file.readline()
        alpha_vector = state_file.readline()
    finally:
        # BUG FIX: close the state file instead of leaking the handle.
        state_file.close()
    alphas = alpha_vector.split(': ')[1].split()
    for number, alpha in enumerate(alphas):
        topic = analysis.topic_set.get(number=number)
        tmv = TopicMetricValue(topic=topic, metric=metric,
                               value=float(alpha))
        tmv.save()
    transaction.commit()