def getRelevantPositiveTweets(collectionName, listTweet, countTweets):
    # Greedy SumBasic-style selection: repeatedly pick the currently
    # highest-probability word, take the best-scoring tweet that contains it,
    # and down-weight word probabilities, until countTweets tweets have been
    # appended to the module-level listRelevantTweet accumulator (or the word
    # distribution is exhausted).
    global listRelevantTweet
    global probWords
    collection = getCollection(collectionName)
    # NOTE(review): this is a binding, not a copy — in-place mutations below
    # also mutate the module-level probWords until fixWordProbability rebinds
    # sorted_probability; confirm this aliasing is intentional.
    sorted_probability = probWords
    wordPresent = True  # bool to check if the word is present in the list of tweets or not
    word = ""
    while(countTweets > 0):
        # sort the list of Tweets based on sentence score (third tuple field)
        sortedList = sorted(listTweet, key=lambda x: x[2], reverse=True)
        if not wordPresent:
            # Square the probability of a word no tweet contained, so it sinks
            # in the ranking (for p < 1 squaring shrinks it toward 0).
            sorted_probability[word] *= sorted_probability[word]  # fix the probability of word if none of the tweets contains it
            if sorted_probability[word] <= 0:  # delete the word if the probability of it becomes zero (float underflow)
                del sorted_probability[word]
            if not sorted_probability:  # return if all the words in the dictionary are deleted because of 0 probability
                return
        # sort the probabilities and take the highest-probability word
        word = (sorted(sorted_probability.items(), key=operator.itemgetter(1), reverse=True))[0][0]
        wordPresent = False
        for id, tweet, probability in sortedList:
            if word in tweet.split():
                docs = collection.find({"_id": id})
                for doc in docs:
                    listRelevantTweet.append((doc['Tweet'], doc['Sentiment']))  # Append the relevant tweet to the list
                    break  # only the first matching document is needed
                sorted_probability = fixWordProbability(sorted_probability, tweet)  # Fix the word probabilities
                listTweet.remove((id, tweet, probability))  # Remove that tweet from the list of Tweets
                listTweet = fixSentenceProbability(listTweet, sorted_probability)  # Fix the sentence probability of all the other tweets
                countTweets -= 1
                wordPresent = True
                break  # restart the outer loop with updated scores
def NV2_CollectNews(driver, info, db_mongo):
    """Level-2 news collector.

    For each time-section entry in *info* (dicts with 'time' and 'url_time'
    keys), navigate the Selenium *driver* to the section page, scrape every
    headline link, fetch its level-3 content via NV3_ColetaConteudo and insert
    the resulting document into the Mongo collection named after the section.
    Headlines already present in the collection (per getNew) are skipped.

    Returns an empty list in all cases.  (The original returned None on
    success and [] only on error, which made the result untrustworthy; a
    single consistent return is backward compatible with falsy checks.)
    """
    try:
        # Iterate entries directly instead of range(len(info)).
        for url in info:
            section = str(Normalize(url['time']).upper())  # computed once, reused below
            LOG.info(f"{section} - {datetime.now()}")
            collection = getCollection(db_mongo, section)
            change_page(driver, url['url_time'], False)
            links = StartBeautifulSoup(driver).findAll(
                "a", {"class": "feed-post-link gui-color-primary gui-color-hover"})
            for link_ in links:
                tittle_ = Normalize(str(link_.text).strip()).upper().replace(' ', '_')
                if getNew(collection, tittle_):  # already collected — skip
                    continue
                href = str(link_.get('href')).strip()
                new_ = {'titulo': tittle_, 'url_news': href}
                # Merge the level-3 fields in one call instead of a key loop.
                new_.update(NV3_ColetaConteudo(driver, href, tittle_))
                insert_documents(collection, new_)
    except Exception as e:
        # Best-effort collector: log and fall through to the common return.
        LOG.error(f'ERRO AO REALIZAR COLETA NIVEL 2 NAVEGADOR : {e}')
    return []
def calculateSumBasic(collectionName, personName):
    """Compute the initial SumBasic score for every tweet about *personName*.

    Reads every document from the collection, normalizes the tweet text
    (process / removePerson / stemming / removeStopWords), builds the corpus
    word-probability distribution into the module-level probWords, then writes
    each tweet's score back to the DB under the 'sumBasic' field.

    Returns:
        (listSentiment0, listSentiment1): lists of (id, tweet, score) triples
        for relevant (Relevance == 1) positive (Sentiment 0) and negative
        (Sentiment 1) tweets respectively.
    """
    collection = getCollection(collectionName)
    dictWords = {}       # word -> corpus frequency
    listSentiment0 = []  # relevant tweets with positive sentiment
    listSentiment1 = []  # relevant tweets with negative sentiment
    global probWords
    tweets = []
    for document in collection.find({}, no_cursor_timeout=False).batch_size(100):
        tweet = document['Tweet']
        tweet = process(tweet)                   # basic processing of tweet
        tweet = removePerson(tweet, personName)  # remove the person name from tweet
        tweet = stemming(tweet)                  # stem the tweet
        tweet = removeStopWords(tweet)           # remove stop words
        # eliminate tweets with length less than 2
        if len(tweet) < 2:
            continue
        tweets.append((document['_id'], tweet, document['Sentiment'], document['Relevance']))
        # Count word frequencies.  BUGFIX: dict.has_key() was removed in
        # Python 3 and raised AttributeError here; dict.get() works on 2 and 3.
        for word in tweet.split():
            dictWords[word] = dictWords.get(word, 0) + 1
    totalWordsCount = len(dictWords)  # number of *distinct* words in the input set
    # Probability distribution of every word in the input.  (Loop body never
    # runs when dictWords is empty, so no division-by-zero is possible.)
    for key in dictWords:
        probWords[key] = dictWords[key] / float(totalWordsCount)
    # Calculate the SumBasic score and update it in the DB.
    for id, tweet, sentiment, relevance in tweets:
        tweetWords = tweet.split()
        probability = 0
        for word in tweetWords:
            # Dividing by the in-tweet count makes repeated words contribute
            # their probability once in total.
            count = tweetWords.count(word)
            probability += probWords[word] / float(count)
        prob = probability
        collection.update_one({"_id": id}, {'$set': {'sumBasic': prob}})
        if sentiment == 0 and relevance == 1:
            listSentiment0.append((id, tweet, prob))
        elif sentiment == 1 and relevance == 1:
            listSentiment1.append((id, tweet, prob))
    del tweets[:]  # free the list of tweets
    return listSentiment0, listSentiment1
def pefromVotingAndLabelOnTest(classifiers, collectionName):
    """Label every tweet in the module-level `test` set by majority vote.

    Each classifier votes on the tweet's features; a strict majority of
    non-zero (negative) votes writes Sentiment 1 to the document, otherwise
    Sentiment 0 (ties go to positive, matching the original behaviour).
    """
    collection = getCollection(collectionName)
    for tweet, documentId in test:
        # One vote per classifier; features are extracted per call, exactly
        # as the original loop did.
        negative_votes = sum(
            1 for clf in classifiers
            if clf.classify(extract_features(tweet)) != 0
        )
        positive_votes = len(classifiers) - negative_votes
        label = 1 if negative_votes > positive_votes else 0
        collection.update_one({"_id": documentId}, {'$set': {'Sentiment': label}})
def getTheRelevantTweets(collectionName, percentage, personName):
    """Select roughly *percentage* of all tweets for summarization.

    The quota is split between positive and negative tweets in proportion to
    their counts among relevant (Relevance == 1) tweets.  Falls back to
    returning every relevant tweet when fewer relevant tweets exist than the
    quota.  Results accumulate in the module-level listRelevantTweet, which
    is also returned.
    """
    global listRelevantTweet
    collection = getCollection(collectionName)
    totalTweets = collection.find({}).count()  # all tweets collected for personName
    totalRelevant1 = collection.find({'Relevance': 1}).count()  # relevant tweets
    totalPositive = collection.find({'Relevance': 1, 'Sentiment': 0}).count()  # relevant positives
    totalNegative = collection.find({'Relevance': 1, 'Sentiment': 1}).count()  # relevant negatives
    print("Relevance 1 Sentiment 0", str(totalPositive))
    print("Relevance 1 Sentiment 1", str(totalNegative))
    totalTwetsToFind = int(percentage * totalTweets)  # size of the summarization quota
    # If there are fewer relevant tweets than the quota, return all of them.
    if totalRelevant1 <= totalTwetsToFind:
        print("Relevant Tweets are less than the percentage of Tweets to find")
        return fetchAllTheRelevant(collectionName)
    print("Total tweets to find:", str(totalTwetsToFind))
    if totalNegative == 0:
        # BUGFIX: the original divided by totalNegative unconditionally and
        # raised ZeroDivisionError when no relevant negative tweets exist;
        # with no negatives the whole quota goes to positives.
        postiveToFind = totalTwetsToFind
    else:
        # float() keeps the ratio exact even under Python 2 integer division.
        ratioPositiveNegative = totalPositive / float(totalNegative)
        postiveToFind = int((ratioPositiveNegative * totalTwetsToFind) / float(ratioPositiveNegative + 1))
    negativeToFind = int(totalTwetsToFind - postiveToFind)
    print("Positive to find:", str(postiveToFind))
    print("Negative to find:", str(negativeToFind))
    listTweets = calculateSumBasic(collectionName, personName)  # (positiveList, negativeList) with initial scores
    # First iteration selects positives; the second reuses the same routine
    # with the negative quota.
    for listTweet in listTweets:
        getRelevantPositiveTweets(collectionName, listTweet, postiveToFind)
        postiveToFind = negativeToFind
    print("Number of tweets Found:" + str(len(listRelevantTweet)))
    return listRelevantTweet
def fetchAllTheRelevant(collectionName):
    """Return (Tweet, Sentiment) pairs for every document marked Relevance 1."""
    collection = getCollection(collectionName)
    cursor = collection.find({'Relevance': 1}, no_cursor_timeout=False)
    return [(doc['Tweet'], doc['Sentiment']) for doc in cursor]