import codecs
import json


def wordNetWithTweets(datas, pathDestination, pathDestinationWordNet):
    """Clean the tweet text in each per-date file and write one tweet per line."""
    preProcessor = PreProcessor()
    for data in datas:
        data = data.replace("\n", "")
        dataNome = data + ".csv"
        arq = open(pathDestination + dataNome)
        destino = codecs.open(pathDestinationWordNet + dataNome, "a", "utf-8")
        for linha in arq:
            tweet = json.loads(linha)
            # created_at = tweet['created_at']
            # created_at = parser.parse(created_at)
            # created_at = str(created_at.day) + "-" + str(created_at.month)
            text = tweet["tweet_text"]
            text = text.lower()
            text = preProcessor.textFilter(text)
            text = preProcessor.removeNonAlphaNumericValues(text)
            text = preProcessor.remove_stopWords(text)
            text = text.strip()
            # text = text.replace(" ", ",")
            destino.write(text + "\n")
        arq.close()
        destino.close()
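# Hedged usage sketch for wordNetWithTweets. The folder layout and the
# "datas.txt" index file are assumptions inferred from how the function
# builds its input paths; all paths below are hypothetical.
if __name__ == "__main__":
    with open("tweets/datas.txt") as f:  # hypothetical list of dates, one per line
        datas = f.readlines()
    wordNetWithTweets(datas, "tweets/raw/", "tweets/wordnet/")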
import csv
import re
from random import shuffle


class DataLoader:
    def __init__(self):
        # Tokenize words and keep emoticons
        self.posFeatures = []
        self.negFeatures = []
        self.pp = PreProcessor()
        with open('Sentiment Analysis Dataset.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                if row['Sentiment'] == '1':
                    # Split off URLs, @mentions, and #hashtags; keep words and emoticons
                    posWord = re.findall(
                        r"https?://[^\s]*|@[^\s]*|#[^\s]*|[\w']+|[:)-;=)(>o3Dx^\/*w8~_T|]+",
                        row["SentimentText"].rstrip())
                    self.posFeatures.append(self.pp.process(posWord))
                elif row['Sentiment'] == '0':
                    negWord = re.findall(
                        r"https?://[^\s]*|@[^\s]*|#[^\s]*|[\w']+|[:)-;=)(>o3Dx^\/*w8~_T|]+",
                        row["SentimentText"].rstrip())
                    self.negFeatures.append(self.pp.process(negWord))
        shuffle(self.posFeatures)
        shuffle(self.negFeatures)

    def get_data(self):
        return self.posFeatures, self.negFeatures

    # Apply a feature extractor and attach sentiment labels
    def select(self, feature_select):
        posFeatures_selected = []
        negFeatures_selected = []
        for words in self.posFeatures:
            posFeatures_selected.append([feature_select(words), 'pos'])
        for words in self.negFeatures:
            negFeatures_selected.append([feature_select(words), 'neg'])
        return posFeatures_selected, negFeatures_selected

    # Split labeled data into train and test sets, half positive and half negative
    def split(self, posdata, negdata, train_num, test_num):
        train_num //= 2  # integer division so the slice indices stay ints
        test_num //= 2
        trainFeatures = posdata[:train_num] + negdata[:train_num]
        testFeatures = posdata[-test_num:] + negdata[-test_num:]
        return trainFeatures, testFeatures
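# Hedged usage sketch for this DataLoader. "word_feats" is a hypothetical
# bag-of-words extractor (not part of the original code) in the dict format
# NLTK classifiers expect; the split sizes are arbitrary examples.
def word_feats(words):
    return {word: True for word in words}


loader = DataLoader()
posdata, negdata = loader.select(word_feats)  # labeled feature sets
trainFeatures, testFeatures = loader.split(posdata, negdata, 10000, 2000)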
import json


def evaluateAllTweetMessages(train_set, clf, folderData, folderResultsAnalysis):
    '''
    Params:
        "clf": the classifier used to label the tweets
        "folderData": folder with one file of tweets per date, plus a file
            datas.txt listing the file names, one per line
        "folderResultsAnalysis": folder where the per-day analyses are saved
    '''
    arq_results = open(folderResultsAnalysis + "sentimentsEveryDay.csv", "w")
    arq_results.write("date,VHVL,HVL,MVL,LVL,NVI\n")
    arq_date = open(folderData + "datas.txt")
    preProcessor = PreProcessor()  # build once instead of once per tweet
    cont = 0
    for date in arq_date:
        cont += 1
        print(cont)
        date = date.replace("\n", "")
        arq_tweets = open(folderData + date + ".json")
        tweets_list = []
        for linha in arq_tweets:
            tweet = json.loads(linha)
            # Tweet preprocessing; json.loads already yields str in Python 3,
            # so no .encode("utf-8") is needed before text processing
            message = tweet['text']
            message = preProcessingMessages(preProcessor, message)
            tweets_list.append(message)
        results = classMessages(train_set, clf, tweets_list, date)
        result = (str(date) + "," + str(results['VHVL']) + "," +
                  str(results['HVL']) + "," + str(results['MVL']) + "," +
                  str(results['LVL']) + "," + str(results['NVI']) + "\n")
        arq_results.write(result)
        arq_tweets.close()
    arq_date.close()
    arq_results.close()
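# Hedged usage sketch for evaluateAllTweetMessages. The shape of "train_set"
# and "clf" depends on classMessages, which is not shown here, so the toy
# NaiveBayesClassifier below is an assumption, as are the folder paths.
import nltk

train_set = [({'good': True}, 'pos'), ({'bad': True}, 'neg')]  # toy data
clf = nltk.NaiveBayesClassifier.train(train_set)
evaluateAllTweetMessages(train_set, clf, "data/tweets/", "results/")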
import csv
import math
import re
from random import shuffle

import nltk
from tqdm import tqdm


class DataLoader:
    def __init__(self):
        # Tokenize words and keep emoticons
        self.posFeatures = []
        self.negFeatures = []
        self.posWords = []
        self.negWords = []
        self.pp = PreProcessor()
        # Import tweet data
        with open('Sentiment Analysis Dataset.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in tqdm(reader):
                if row['Sentiment'] == '1':
                    # Split off URLs, @mentions, and #hashtags; keep words and emoticons
                    posWord = re.findall(
                        r"https?://[^\s]*|@[^\s]*|#[^\s]*|[\w']+|[:)-;=)(>o3Dx^\/*w8~_T|]+",
                        row["SentimentText"].rstrip())
                    # Preprocess
                    posWord = self.pp.process(posWord)
                    self.posFeatures.append(posWord)
                    self.posWords.append(' '.join(posWord))
                elif row['Sentiment'] == '0':
                    negWord = re.findall(
                        r"https?://[^\s]*|@[^\s]*|#[^\s]*|[\w']+|[:)-;=)(>o3Dx^\/*w8~_T|]+",
                        row["SentimentText"].rstrip())
                    negWord = self.pp.process(negWord)
                    self.negFeatures.append(negWord)
                    self.negWords.append(' '.join(negWord))
        # Note: the four lists are shuffled independently, so posFeatures[i]
        # no longer lines up with posWords[i] after this point
        shuffle(self.posFeatures)
        shuffle(self.negFeatures)
        shuffle(self.posWords)
        shuffle(self.negWords)

    # Return whole sentences
    def get_data(self):
        return self.posWords, self.negWords

    # Return unigram features (unlabeled)
    def get_uni_data(self):
        return list(self.posFeatures), list(self.negFeatures)

    # Return unigram+bigram features (unlabeled)
    def get_unibi_data(self):
        posFeatures = []
        negFeatures = []
        for word in tqdm(self.posFeatures):
            posFeatures.append(list(word) + self.bigram(word))
        for word in tqdm(self.negFeatures):
            negFeatures.append(list(word) + self.bigram(word))
        return posFeatures, negFeatures

    # POS tagging
    def get_posdata(self):
        posFeatures_pos = []
        negFeatures_pos = []
        for words in tqdm(self.posFeatures):
            posFeatures_pos.append([nltk.pos_tag(words), 'pos'])
        for words in tqdm(self.negFeatures):
            # nltk has no neg_tag; both classes are tagged with pos_tag
            negFeatures_pos.append([nltk.pos_tag(words), 'neg'])
        return posFeatures_pos, negFeatures_pos

    # Split labeled data into train and validation sets by ratio r
    def get_ratio_data(self, posdata, negdata, r):
        posCutoff = int(math.floor(len(posdata) * r))
        negCutoff = int(math.floor(len(negdata) * r))
        trainFeatures = posdata[:posCutoff] + negdata[:negCutoff]
        valFeatures = posdata[posCutoff:] + negdata[negCutoff:]
        return trainFeatures, valFeatures

    def bigram(self, words):
        return list(nltk.bigrams(words))
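# Hedged usage sketch: label the unigram+bigram features by hand (the class
# returns them unlabeled) and split 75/25 with get_ratio_data. The manual
# labeling step is an assumption about how the unlabeled output is consumed.
loader = DataLoader()
pos, neg = loader.get_unibi_data()
posdata = [[feats, 'pos'] for feats in pos]
negdata = [[feats, 'neg'] for feats in neg]
trainFeatures, valFeatures = loader.get_ratio_data(posdata, negdata, 0.75)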