import math
import pickle

from textblob import TextBlob

# dm (the project's data module), hashFunction and HashQuery are helpers
# defined elsewhere in this repository.

def createQueriesDictionary(Data):
    
    InvertedIndex, Queries = dm.readInvertedIndex(), {}

    N = 537933 # Total number of queries in the collection (corpus size for the IDF below)
    
    for data in Data:
        
        # Each record holds two (qid, query) pairs: the ids sit at positions
        # 1-2 and the matching query strings at positions 3-4.
        for i in range(1, 3):
            qid, query = data[i], [data[i+2]]
            if qid not in Queries:
                Queries[qid] = query
        
    for qid in Queries:

        Words = TextBlob(Queries[qid][0]).lower().words # Lower-cased tokens of the query
        
        Hashes, Weights = [], []
        
        try:
            maxf = max(Words.count(w) for w in Words) # Max frequency of any term in the query
        except ValueError: # Empty query (corrupted record): skip it
            continue
            
        for w in Words:
            
            if w not in InvertedIndex: # Skip terms missing from the index (no document frequency)
                continue
            
            Hashes.append(hashFunction(w, 64))
            
            f, n = Words.count(w)/maxf, len(InvertedIndex[w]) # f(t), n(t)
            
            idf = math.log(N/n)/math.log(N) # IDF(t), normalised by log(N) so weights lie in [0, 1]
            
            Weights.append(f*idf)
        
        
        queryHash = HashQuery(Hashes, Weights)
        Queries[qid].append(queryHash)
        
    
    with open('Queries.txt', 'wb') as file:
        pickle.dump(Queries, file) # Persist qid -> [query text, query hash] (binary pickle despite the .txt name)
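
The example above assumes hashFunction and HashQuery are defined elsewhere in the repository. Purely as an illustration of what they could look like, here is a minimal SimHash-style sketch: hashFunction maps a token to a 64-bit integer, and HashQuery combines the per-term hashes into a weighted fingerprint. The bodies below are hypothetical, not the repository's actual implementations.

import hashlib

def hashFunction(token, bits=64):
    # Hypothetical: derive a stable `bits`-bit integer from a token.
    digest = hashlib.md5(token.encode('utf-8')).hexdigest()
    return int(digest, 16) & ((1 << bits) - 1)

def HashQuery(Hashes, Weights, bits=64):
    # Hypothetical SimHash combination: each bit position accumulates +w when
    # the term's hash bit is 1 and -w when it is 0; the sign of the total
    # decides the corresponding bit of the query fingerprint.
    totals = [0.0] * bits
    for h, w in zip(Hashes, Weights):
        for i in range(bits):
            totals[i] += w if (h >> i) & 1 else -w
    fingerprint = 0
    for i in range(bits):
        if totals[i] > 0:
            fingerprint |= (1 << i)
    return fingerprint
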
Example #2
import numpy as np
import pandas as pd
import preprocessor as p # tweet-preprocessor, assumed from the p.clean() calls below
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# clean_tweets and pfreq_dist are project helpers defined elsewhere.

def tweet_processor(path, part, freq=1):
    myFile = pd.read_csv(path, sep=',')
    tweets = myFile["text"]
    if "May" in path: 
        part = 1
    # if "May" not in path:
    tweets = tweets[int(len(tweets)*(part-1)*0.5):int(len(tweets)*part*0.5)] # Keep the half selected by `part` (1 = first half, 2 = second half)
    corpus_words = " ".join(myFile["text"]).split(" ") # All words in the raw file, used by the freq == 0 branch below
    processed_tweets = []
    compound_sent = [] 
    print("n tweets: ",len(tweets))
    sid = SentimentIntensityAnalyzer()
    for tweet in tweets:
        cleaned_tweet = p.clean(tweet.lower()) # Strip URLs, mentions, emojis, etc.
        filtered_tweet = clean_tweets(cleaned_tweet) # Project-specific extra cleaning
        ss = sid.polarity_scores(filtered_tweet) # VADER sentiment scores
        cur_sent = [ss['neg'], ss['pos'], ss['neu'], ss['compound']]
        blob = TextBlob(filtered_tweet)
        Sentiment = blob.sentiment # TextBlob polarity/subjectivity (computed but not returned)
        polarity = Sentiment.polarity
        subjectivity = Sentiment.subjectivity
        if filtered_tweet != "" and len(filtered_tweet) >2: 
            processed_tweets.append(filtered_tweet)  
            compound_sent.append(cur_sent)
    # np.savetxt("processed_tweets.csv", processed_tweets, delimiter=",", fmt='%s') 
    compound_sent = np.asarray(compound_sent)
    freqs = []
    
    print("number of words: ",len((" ".join(processed_tweets).split(" ")))) 
    print("unique words: ",len(set(" ".join(processed_tweets).split(" ")))) 

    if freq == 0: # Count word frequencies directly over the raw corpus words
        words = set(corpus_words)
        for word in words:
            if word != "" and len(word) > 2:
                freqs.append([word, corpus_words.count(word)])
        freqs = np.asarray(freqs)
        freqs = freqs[np.argsort(freqs[:, 1].astype(int))][::-1] # Sort by count, descending

    if freq == 1: # Use the NLTK FreqDist helper (pfreq_dist)
        freqs = pfreq_dist(" ".join(processed_tweets).split(" "))
        freqs = np.asarray(freqs)  
    return processed_tweets, freqs, compound_sent
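
pfreq_dist and clean_tweets are project helpers; the comment on the freq == 1 branch suggests pfreq_dist wraps NLTK's FreqDist. Below is a hypothetical sketch of that helper, followed by an illustrative call; the CSV path and printed fields are made up for the example.

from nltk import FreqDist

def pfreq_dist(tokens):
    # Hypothetical helper matching the "NLTK freqdist" branch above:
    # returns (word, count) pairs sorted by descending frequency.
    fd = FreqDist(t for t in tokens if t != "" and len(t) > 2)
    return fd.most_common()

# Illustrative usage; "tweets_May.csv" is a made-up path to a CSV with a "text" column.
processed, freqs, sentiments = tweet_processor("tweets_May.csv", part=1, freq=1)
print(processed[:3])     # first few cleaned tweets
print(freqs[:10])        # most frequent terms
print(sentiments.shape)  # (n_kept_tweets, 4): neg, pos, neu, compound per tweet
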