def abbrivation_list():
    # Build an {abbreviation: expansion} lookup from the 'abbrivations.list'
    # collection; _id is dropped and keys/values are coerced to plain str.
    projection = {'_id': False}
    abbrivations_dict = {}
    res = mongo.load_from_mongo('abbrivations', 'list', projection=projection)
    for doc in res:
        for key, value in doc.items():
            abbrivations_dict[str(key)] = str(value)
    return abbrivations_dict
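
# The mongo helper module itself is not included in these snippets.  The
# sketch below is an assumption of what load_from_mongo/save_to_mongo could
# look like on top of pymongo -- not the project's actual code.
from pymongo import MongoClient

_client = MongoClient('localhost', 27017)

def load_from_mongo(db, collection, criteria=None, projection=None):
    # Materialise the cursor so callers can iterate the result repeatedly.
    return list(_client[db][collection].find(criteria, projection))

def save_to_mongo(doc, db, collection):
    # Upsert on _id so re-running a script overwrites instead of duplicating.
    _client[db][collection].replace_one({'_id': doc['_id']}, doc, upsert=True)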
    # (Tail of the tweet-scoring routine; its opening lines are cut off.)
    # Average the scores accumulated for each term and store the rounded
    # result as that term's sentiment value.
    for key in accum_term.keys():
        term_value = int(sum(accum_term[key]) / len(accum_term[key]))
        add_to_senti_file[key] = term_value

    # Write the newly scored words to a flat file for later reuse.
    with open('new_words.txt', 'w') as f:
        for key, value in add_to_senti_file.items():
            f.write(key + '\t' + str(value) + '\n')

    return score, actual_tweet, tweet_id
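
# Reading new_words.txt back is the mirror image of the write above; a small
# sketch for the tab-separated "word<TAB>score" format just written:
def load_new_words(path='new_words.txt'):
    scores = {}
    with open(path, 'r') as f:
        for line in f:
            word, value = line.rstrip('\n').split('\t')
            scores[word] = int(value)
    return scores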
   
    
res = mongo.load_from_mongo('input', 'data')
for i in res:
    # sentiment_of_tweet returns (score, cleaned tweet text, tweet id).
    score, text, tweet_id = sentiment_of_tweet(i, abb_list, sentiment)
    g = {'_id': tweet_id, 'text': text, 'sentiment': score}
    mongo.save_to_mongo(g, 'output_final', 'with_ourscores')
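
# Saving one document at a time round-trips to Mongo per tweet.  If
# save_to_mongo wraps a plain insert/replace (an assumption), pymongo's bulk
# API can batch the writes -- a sketch:
from pymongo import MongoClient, ReplaceOne

coll = MongoClient('localhost', 27017)['output_final']['with_ourscores']
ops = []
for i in mongo.load_from_mongo('input', 'data'):
    score, text, tweet_id = sentiment_of_tweet(i, abb_list, sentiment)
    ops.append(ReplaceOne({'_id': tweet_id},
                          {'_id': tweet_id, 'text': text, 'sentiment': score},
                          upsert=True))
if ops:
    coll.bulk_write(ops)    # one round-trip instead of one per tweet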
from mongo import load_from_mongo
import numpy as np
import re
import lda

feature_array = []
result = load_from_mongo("hindu_modified", "docs1")

# The vocabulary file holds one feature word per line; splitlines() avoids a
# spurious empty feature from a trailing newline.
with open('vocab.txt', 'r') as f:
    feature_vector = f.read().splitlines()
len_of_feature_vector = len(feature_vector)
# Map each word to its column index once, instead of calling .index() per
# token (which is O(n) and raises ValueError on out-of-vocabulary words).
feature_index = dict((word, idx) for idx, word in enumerate(feature_vector))

for each in result:
    text = each["text"]
    # Keep only alphanumerics; everything else becomes a space.
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)
    text_tokens = text.split()

    # Lower-cased token frequencies for this document.
    text_dist = {}
    for token in text_tokens:
        token = token.lower()
        text_dist[token] = text_dist.get(token, 0) + 1

    # Project the counts onto the fixed vocabulary order.
    vector = [0] * len_of_feature_vector
    for token, count in text_dist.items():
        idx = feature_index.get(token)
        if idx is not None:          # skip out-of-vocabulary tokens
            vector[idx] = count
    feature_array.append(vector)
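
# The snippet imports lda but is cut off before fitting a model.  A hedged
# sketch of the usual next step with the lda package; topic count and
# iteration numbers are assumptions:
X = np.array(feature_array, dtype=np.int64)      # (n_docs, n_vocab) counts
model = lda.LDA(n_topics=10, n_iter=500, random_state=1)
model.fit(X)

# Print the top 8 words of each learned topic.
topic_word = model.topic_word_
for topic_idx, dist in enumerate(topic_word):
    top_words = np.array(feature_vector)[np.argsort(dist)][:-9:-1]
    print('Topic %d: %s' % (topic_idx, ' '.join(top_words)))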
Example #5
from __future__ import division
## True class A (TA)  - correctly classified into class A
## False class A (FA) - incorrectly classified into class A
## True class B (TB)  - correctly classified into class B
## False class B (FB) - incorrectly classified into class B
from mongo import load_from_mongo
from pylab import *

observed_ourscores = load_from_mongo('tes', 'coll_copy')

TA = 0
FA = 0
TB = 0
FB = 0

for i in observed_ourscores:
    tweet_id = i['_id']
    senti_ob = i['sentiment']    # our computed score
    # Look up the hand-labelled sentiment stored for the same tweet.
    senti = load_from_mongo('train', 'coll_copy',
                            criteria={'_id': tweet_id},
                            projection={'_id': 0, 'sentiment': 1})
    print tweet_id
    # The projection keeps only 'sentiment', so index it directly instead of
    # the fragile senti[0].values()[0].
    senti_es = str(senti[0]['sentiment'])
    if senti_ob > 0 and senti_es == '1':
        TA = TA + 1
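    # The remaining branches are cut off in the original.  A hedged sketch
    # mirroring the TA/FA/TB/FB comments above (treating gold label '1' as
    # positive, as the comparison above does):
    elif senti_ob > 0 and senti_es != '1':
        FA = FA + 1          # predicted positive, actually negative
    elif senti_ob <= 0 and senti_es != '1':
        TB = TB + 1          # predicted negative, actually negative
    else:
        FB = FB + 1          # predicted negative, actually positive

# Overall accuracy across both classes (true division via the import above).
accuracy = (TA + TB) / (TA + FA + TB + FB)
print 'accuracy: %.3f' % accuracy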
Example #6
from __future__ import division
## True class A (TA)  - correctly classified into class A
## False class A (FA) - incorrectly classified into class A
## True class B (TB)  - correctly classified into class B
## False class B (FB) - incorrectly classified into class B
from mongo import load_from_mongo
from pylab import *

observed_outscores = load_from_mongo('output_final', 'without_scores')

TA = 0
FA = 0
TB = 0
FB = 0
labels = 'positive', 'negative'

for i in observed_outscores:
    tweet_id = i['_id']
    senti_ob = i['sentiment']    # our computed score
    # Look up the gold-standard sentiment stored with the input tweet.
    senti = load_from_mongo('input', 'data',
                            criteria={'_id': tweet_id},
                            projection={'_id': 0, 'sentiment': 1})
    senti_es = str(senti[0]['sentiment'])
    if senti_ob > 0 and senti_es == 'positive':
        TA = TA + 1
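    # As in the previous example, the remaining branches are cut off; the
    # same hedged completion applies:
    elif senti_ob > 0 and senti_es != 'positive':
        FA = FA + 1
    elif senti_ob <= 0 and senti_es != 'positive':
        TB = TB + 1
    else:
        FB = FB + 1

# Given the pylab import and the labels tuple above, the example plausibly
# ended with a pie chart of the two predicted classes -- a sketch, not the
# original code:
sizes = [TA + FA, TB + FB]      # predicted positive vs predicted negative
pie(sizes, labels=labels, autopct='%1.1f%%')
title('Predicted sentiment split')
show()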
    # (Tail of a separate readability helper; its opening lines, including
    # the try that computes avg_word_length, are cut off in the original.)
    except ZeroDivisionError:
        avg_word_length = 0.0
    try:
        avg_para_length = num_sentences / float(num_para)
    except ZeroDivisionError:
        avg_para_length = 0.0
    try:
        avg_sent_length = sum(length) / float(num_sentences)
    except ZeroDivisionError:
        avg_sent_length = 0.0
    # Return the three readability averages formatted to two decimals.
    return ("%.2f" % avg_para_length,
            "%.2f" % avg_sent_length,
            "%.2f" % avg_word_length)
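
# A self-contained sketch of what the full helper above likely computes.
# Paragraph/sentence splitting rules and all local names here are
# assumptions, not recovered code:
import re

def text_averages(text):
    paragraphs = [p for p in text.split('\n\n') if p.strip()]
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    words = text.split()
    num_para = len(paragraphs)
    num_sentences = len(sentences)
    length = [len(s.split()) for s in sentences]    # words per sentence
    try:
        avg_word_length = sum(len(w) for w in words) / float(len(words))
    except ZeroDivisionError:
        avg_word_length = 0.0
    try:
        avg_para_length = num_sentences / float(num_para)
    except ZeroDivisionError:
        avg_para_length = 0.0
    try:
        avg_sent_length = sum(length) / float(num_sentences)
    except ZeroDivisionError:
        avg_sent_length = 0.0
    return ("%.2f" % avg_para_length, "%.2f" % avg_sent_length,
            "%.2f" % avg_word_length)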


email_content = load_from_mongo('email_content', 'coll_ten')
for i in email_content:
    name = i['name']
    text = i['email']
    token = i['email_number']
    _id = i['_id']
    ##    fdist=nltk.FreqDist(ngrams(text)[0])
    ##    fdist1=nltk.FreqDist(ngrams(text)[1])
    ##    with open('test.txt','a') as f:
    ##        for k,v in fdist1.items():
    ##            for iter in k:
    ##                f.write(iter)
    ##                if iter!=k[-1]:
    ##                    f.write(",")
    ##            f.write('\n')
    # Alphabetically sorted (letter, count) pairs for this email's text.
    let_f = sorted(letter_freq(text).iteritems())
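
# letter_freq itself is not shown in these snippets.  A plausible sketch
# using collections.Counter -- an assumption, not the project's helper:
from collections import Counter
from string import ascii_lowercase

def letter_freq(text):
    # Case-insensitive counts of the letters a-z only.
    return dict(Counter(c for c in text.lower() if c in ascii_lowercase))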
    regex = re.compile(r"not\b")
    if regex.search(tweet):
        negated = True
        tweet = re.sub(r"not\b", "", tweet)
    #preprocessing may increase accuracy

    tweet_id = text['_id']
    tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet)

    tweet = re.sub(r"\b\d+\b", "", tweet)
    tweet = tweet.strip().lower()
    word_list = tweet.split()
    for k in word_list:
        if any(k == s for s in pos_words):
            score = score + 1
        elif any(k == s for s in neg_words):
            score = score - 1
    if negated:
        score = -score
    return score, actual_text, tweet_id
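
# pos_words/neg_words are loaded elsewhere; a minimal sketch assuming one
# word per line in positive.txt / negative.txt (file names are assumptions).
# Sets make the membership tests in the scoring loop O(1).
def load_lexicon(path):
    with open(path, 'r') as f:
        return set(line.strip() for line in f if line.strip())

pos_words = load_lexicon('positive.txt')
neg_words = load_lexicon('negative.txt')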


res = mongo.load_from_mongo('input', 'data')
for i in res:
    # sentiment returns (score, cleaned tweet text, tweet id).
    score, text, tweet_id = sentiment(i, abb_list)
    g = {'_id': tweet_id, 'text': text, 'sentiment': score}
    mongo.save_to_mongo(g, 'output_final', 'without_scores')
from mongo import load_from_mongo
from mongo import save_to_mongo

stopwords_file = 'stopwords.txt'


def stopwords_list(filename):
    # Read one stopword per line, returning a list instead of mutating a
    # module-level global.
    with open(filename, 'r') as f:
        return [line.rstrip('\n') for line in f]

stop_lis = stopwords_list(stopwords_file)


def remove_stopwords(text):
    # Drop every token that appears in the stopword list.
    return ' '.join(word for word in text.split() if word not in stop_lis)


docs_before = load_from_mongo("hindu", "docs1")
for each in docs_before:
    each["text"] = remove_stopwords(each["text"])
    save_to_mongo(each, "hindu_modified", "docs1")
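
# For a large corpus, a set lookup beats scanning the list on every token; a
# small variant with identical output:
stop_set = set(stop_lis)

def remove_stopwords_fast(text):
    return ' '.join(word for word in text.split() if word not in stop_set)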
    
from mongo import load_from_mongo
results = load_from_mongo("hindu", "docs1")
##f=open("titles",'w')
##count=0
##print len(results)
##for each,i in results,range(len(results)):
##    text=''
##    text=text+str(count)+" "+each['HD']
##    
##    #print text
##    f.write(text)
##    #f.write('\n')
##    #count=count+1
##    #print count
##    
##g=open("titles",'r')
##lis=g.read()
##print lis
##print len(lis)
##titles=tuple(lis)
##print len(titles)
##print titles

# titles is only assigned inside the commented-out block above, so printing
# it here would raise a NameError; keep this commented until that block is
# restored.
##print titles
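
# A working version of the commented-out title dump above, using enumerate
# (a sketch; the 'HD' headline field is taken from the commented code):
with open('titles', 'w') as f:
    for count, each in enumerate(results):
        f.write('%d %s\n' % (count, each['HD']))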