def _index_reviews(df, outer_col, inner_col, vocab_dic, tokenizer):
    """Build ``{outer_key: {inner_key: set(vocab ids)}}`` for one grouping.

    Returns the index together with the largest number of inner entries
    seen under one outer key and the largest vocabulary-id set seen for a
    single review.
    """
    index = {}
    longest_group = 0
    longest_vocab = 0
    # Group by the bare column name: a one-element list makes recent pandas
    # versions yield 1-tuples as group keys instead of scalars.
    for outer_key, group in df.groupby(outer_col):
        reviews = index[outer_key] = {}
        for inner_key, review in zip(group[inner_col], group['reviewText']):
            ids = set()
            # Missing reviews (NaN / None) keep an empty id set.
            if isinstance(review, str):
                ids.update(vocab_dic[token]
                           for token in tokenizer.tokenize(review)
                           if token in vocab_dic)
            reviews[inner_key] = ids
            longest_vocab = max(longest_vocab, len(ids))
        longest_group = max(longest_group, len(reviews))
    return index, longest_group, longest_vocab


def fit_data(df, vocab_dic, tokenizer=None):
    """Index the vocabulary ids of every review by user and by item.

    Args:
        df: DataFrame with ``reviewerID``, ``asin`` and ``reviewText``
            columns.
        vocab_dic: Mapping from token to vocabulary id; tokens that are not
            in the mapping are ignored.
        tokenizer: Optional object exposing ``tokenize(text) -> list``.
            Defaults to ``tk.WordPunctTokenizer()``.

    Returns:
        Tuple ``(user_review_dic, item_review_dic, max_review_len,
        max_voc_len)`` where ``user_review_dic[user][asin]`` and
        ``item_review_dic[asin][user]`` are sets of vocabulary ids,
        ``max_review_len`` is the largest number of reviews attached to one
        user or item, and ``max_voc_len`` is the largest id set of a single
        review.
    """
    if tokenizer is None:
        tokenizer = tk.WordPunctTokenizer()

    user_review_dic, user_reviews, user_vocab = _index_reviews(
        df, 'reviewerID', 'asin', vocab_dic, tokenizer)
    item_review_dic, item_reviews, item_vocab = _index_reviews(
        df, 'asin', 'reviewerID', vocab_dic, tokenizer)

    max_review_len = max(user_reviews, item_reviews)
    max_voc_len = max(user_vocab, item_vocab)
    return user_review_dic, item_review_dic, max_review_len, max_voc_len
def fit_data(df, vocab_dic, tokenizer=None):
    """Collect the set of vocabulary ids used by each user and each item.

    Args:
        df: DataFrame with ``reviewerID``, ``asin`` and ``reviewText``
            columns.
        vocab_dic: Mapping from token to vocabulary id; tokens that are not
            in the mapping are ignored.
        tokenizer: Optional object exposing ``tokenize(text) -> list``.
            Defaults to ``tk.WordPunctTokenizer()``.

    Returns:
        Tuple ``(user_review_dic, item_review_dic, max_count)`` where both
        dictionaries map a key to the set of vocabulary ids found in all of
        its reviews, and ``max_count`` is the size of the largest such set
        (0 when ``df`` is empty).
    """
    if tokenizer is None:
        tokenizer = tk.WordPunctTokenizer()

    def _vocab_by(column):
        """Map every distinct value of ``column`` to its vocabulary-id set."""
        index = {}
        # Group by the bare column name: a one-element list makes recent
        # pandas versions yield 1-tuples as group keys instead of scalars.
        for key, group in df.groupby(column):
            # Missing reviews (NaN / None) are skipped; joining them would
            # raise TypeError.
            doc = ' '.join(review for review in group['reviewText']
                           if isinstance(review, str))
            index[key] = {vocab_dic[token]
                          for token in tokenizer.tokenize(doc)
                          if token in vocab_dic}
        return index

    user_review_dic = _vocab_by('reviewerID')
    item_review_dic = _vocab_by('asin')

    max_count = 0
    for index in (user_review_dic, item_review_dic):
        for ids in index.values():
            max_count = max(max_count, len(ids))
    return user_review_dic, item_review_dic, max_count
def load_word(file, tokenizer=None):
    """Load aspect-level sentiment records from a three-lines-per-record file.

    Each record is made of a sentence containing the literal placeholder
    token ``aspectTerm``, the aspect text, and a polarity line whose first
    word is ``negative``, ``neutral``, ``positive`` or ``conflict``.

    Records with polarity ``conflict`` are dropped silently. Records whose
    sentence lacks the placeholder, or whose polarity is not recognised, are
    printed and dropped. A trailing partial record is ignored. All returned
    lists stay index-aligned.

    Args:
        file: Path of the UTF-8 text file to read.
        tokenizer: Optional object exposing ``tokenize(text) -> list``.
            Defaults to ``tk.WordPunctTokenizer()``.

    Returns:
        Tuple ``(sents_up, sents_down, aspects, labels, up_max_len,
        down_max_len)``: the left context followed by the aspect tokens, the
        aspect tokens followed by the right context, the aspect tokens, the
        labels (-1 / 0 / 1) and the longest left / right sequence lengths
        (0 when nothing was loaded).
    """
    if tokenizer is None:
        tokenizer = tk.WordPunctTokenizer()
    polarity_to_label = {'negative': -1, 'neutral': 0, 'positive': 1}

    # Read everything up front so the handle is closed deterministically.
    with open(file, 'r', encoding="utf8") as handle:
        lines = handle.readlines()

    sents_up = []
    sents_down = []
    labels = []
    aspects = []
    # Stop before a trailing partial record instead of raising IndexError.
    for i in range(0, len(lines) - 2, 3):
        polarity_fields = lines[i + 2].split()
        polarity = polarity_fields[0] if polarity_fields else ''
        if polarity == 'conflict':
            continue
        sentences = tokenizer.tokenize(lines[i].strip())
        if "aspectTerm" not in sentences or polarity not in polarity_to_label:
            print(sentences)
            continue
        aspect = tokenizer.tokenize(lines[i + 1].strip())
        split_at = sentences.index("aspectTerm")
        # Append to all four lists together so they never drift apart.
        aspects.append(aspect)
        sents_up.append(sentences[:split_at] + aspect)
        sents_down.append(aspect + sentences[split_at + 1:])
        labels.append(polarity_to_label[polarity])

    up_max_len = max((len(elem) for elem in sents_up), default=0)
    down_max_len = max((len(elem) for elem in sents_down), default=0)
    return sents_up, sents_down, aspects, labels, up_max_len, down_max_len
def apply_preprocessors(self, column):
    """Remove punctuation tokens from every sentence of ``column``.

    Each sentence is split with NLTK's ``WordPunctTokenizer``; tokens equal
    to one of the elements of ``punctuation`` are dropped and the remaining
    tokens are re-joined with single spaces. Multi-character punctuation
    runs such as ``...`` are not elements of ``punctuation`` and are kept.

    Args:
        column: Iterable of sentences (strings).

    Returns:
        List with one cleaned string per input sentence, in input order.
    """
    # Build the lookup once instead of re-creating a list for every token.
    punctuation_marks = frozenset(punctuation)
    tokenizer = tokenize.WordPunctTokenizer()
    processed_sentences = []
    for sentence in tqdm(column, desc="RemovePonctuationPreprocessor"):
        kept = [token for token in tokenizer.tokenize(sentence)
                if token not in punctuation_marks]
        processed_sentences.append(' '.join(kept))
    return processed_sentences
def frequency_word(self):
    """Store the most frequent tokens of the comments in ``self._data``.

    Reads the ``comments`` column of the first frame returned by
    ``list_dataframe()``, joins every comment into a single text, tokenizes
    it with ``WordPunctTokenizer`` and keeps the ``self._total_words`` most
    frequent tokens, most frequent first.
    """
    dataframe_frequency = list_dataframe()[0]
    # Turn the dataframe column into an array and merge it into one text.
    text_join = ' '.join(dataframe_frequency["comments"].array)
    # Tokenize the whole text.
    tokens = tokenize.WordPunctTokenizer().tokenize(text_join)
    # Count how often each token occurs.
    freq_dist = FreqDist(tokens)
    # Iterating a FreqDist yields tokens in first-seen order on NLTK 3, so
    # rank explicitly by count before truncating.
    ranked = freq_dist.most_common(self._total_words)
    self._data = [word for word, _count in ranked]
def remove_stop_words(text):
    """Normalize ``text`` and return its stemmed, stop-word-free tokens.

    The text is transliterated with ``unidecode`` and lower-cased, split
    with ``WordPunctTokenizer``, filtered against the Portuguese NLTK
    stop words (with and without accents) plus the elements of
    ``punctuation``, and every remaining token is stemmed with
    ``RSLPStemmer``.

    Args:
        text: Raw input string.

    Returns:
        The surviving stems joined by single spaces.
    """
    normalized = unidecode.unidecode(text).lower()
    tokens = tokenize.WordPunctTokenizer().tokenize(normalized)

    portuguese = nltk.corpus.stopwords.words('portuguese')
    # A set gives constant-time membership tests instead of scanning a long
    # list for every token.
    ignored = set(portuguese)
    ignored.update(unidecode.unidecode(word) for word in portuguese)
    ignored.update(punctuation)

    stemmer = nltk.RSLPStemmer()
    return " ".join(stemmer.stem(token) for token in tokens
                    if token not in ignored)
def caracteres_specials():
    """Strip punctuation tokens from the ``comments`` column and persist it.

    Works on ``comments_dataframe[0]``: every comment is split with
    ``WordPunctTokenizer``, tokens made up only of punctuation characters
    (``punctuation`` plus the ellipsis character) are dropped, the rest is
    re-joined with single spaces, the column is overwritten and the frame
    is handed to ``update_dataframe``.
    """
    comments_db_for_carac_specials = comments_dataframe[0]
    punct_chars = set(punctuation + "…")
    tokenizer = tokenize.WordPunctTokenizer()

    cleaned_comments = []
    for comment in comments_db_for_carac_specials['comments']:
        # Check character by character: a substring test against the
        # punctuation string drops "()" but keeps "..." or "!!", because
        # only the former happens to be contiguous in that string.
        kept = [token for token in tokenizer.tokenize(comment)
                if not all(char in punct_chars for char in token)]
        cleaned_comments.append(' '.join(kept))

    comments_db_for_carac_specials['comments'] = cleaned_comments
    update_dataframe(comments_db_for_carac_specials)
def tokenize(self, sentences, task_ids):
    """Tokenize each sentence with the tokenizer matching its task.

    Args:
        sentences: Iterable of raw sentences.
        task_ids: Iterable of task identifiers paired with ``sentences``;
            0 selects the tweet tokenizer (stance), 1 selects the
            word/punctuation tokenizer (NLI).

    Returns:
        List of token lists, one per (sentence, task id) pair.

    Raises:
        ValueError: If a task id is neither 0 nor 1.
    """
    # nltk TweetTokenizer for stance
    tweet_tokenizer = tokenize.TweetTokenizer()
    # nltk WordPunctTokenizer for NLI
    punct_tokenizer = tokenize.WordPunctTokenizer()
    all_sentence = []
    for sentence, task_id in zip(sentences, task_ids):
        if task_id == 0:  # stance
            tokenize_sent = tweet_tokenizer.tokenize(sentence)
        elif task_id == 1:  # NLI
            tokenize_sent = punct_tokenizer.tokenize(sentence)
        else:
            # Without this branch the previous sentence's tokens would be
            # appended again (or an unbound local hit on the first item).
            raise ValueError(
                "Unsupported task_id {!r}; expected 0 (stance) or 1 (NLI)".format(task_id))
        all_sentence.append(tokenize_sent)
    return all_sentence
[(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ]) templates = [ SymmetricProximateTokensTemplate(ProximateTagsRule, (1,1)), SymmetricProximateTokensTemplate(ProximateTagsRule, (2,2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1,2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1,3)), SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)), SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)), SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)), SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)), ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1,1)), ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)), ] default_tagger = nltk.DefaultTagger('NN') # now test the inpickled tagger on an imortal sentence. flu = 'And now for something different' tokens = tokenize.WordPunctTokenizer().tokenize(flu) for term, POS in tagger.tag(tokens): print "\t", term, POS
def main():
    """Run the real/fake news exploration script end to end.

    Loads the CSV corpus, maps the ``Tag`` column to a numeric
    ``classification`` column, prints the score returned by
    ``text_classifier`` and then, for the raw text and for four successive
    cleaning passes stored in the ``traitement_1`` .. ``traitement_4``
    columns, draws word clouds and Pareto charts. Output goes to stdout and
    to matplotlib figures; nothing is returned.
    """
    # Import CSV dataset file to Pandas DataFrame:
    print(
        "====================================================================")
    print(
        "========================== SCRIPT START ============================\n"
    )
    path = 'data'
    dataset_csv = 'real_and_fake_news_corpus_pt_br.csv'
    print("Importing CSV dataset '{}' from folder '{}':\n".format(
        dataset_csv, path))
    news = pd.read_csv(os.path.join(path, dataset_csv))
    print("Imported {} instances with {} attributes.".format(
        news.shape[0], news.shape[1]))
    # Encode the label column: FAKE -> 0, REAL -> 1.
    classification = news["Tag"].replace(["FAKE", "REAL"], [0, 1])
    news["classification"] = classification
    print("The logistic regression resulted in {}% of accuracy.\n".format(
        100 * text_classifier(news, "news_text_normalized", "classification")))
    print(
        "====================================================================\n"
    )
    print("Word cloud without any data preprocessing.\n")
    print("Word cloud of all news:\n")
    # Only the middle element of the returned triple is used below.
    _, all_words, _ = word_cloud_complete(news, "news_text_normalized",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "news_text_normalized", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "news_text_normalized", "classification")
    print(
        "\n\n\n====================================================================\n"
    )
    #plt.show(block=False)
    #input('press <ENTER> to continue')
    #plt.savefig('word_cloud.png')

    # Tokenization of dataset
    print(
        "====================================================================")
    print("TOKENIZATION STARTED")
    token_space = tokenize.WhitespaceTokenizer()
    token_phrase = token_space.tokenize(all_words)
    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({
        "Word": list(frequency.keys()),
        "Frequency": list(frequency.values())
    })
    print("The 10 most frequent words in this dataset are:\n")
    print(df_frequency.nlargest(columns="Frequency", n=10))
    # Bar chart of the 30 most frequent whitespace-separated tokens.
    plt.figure(figsize=(12, 8))
    ax = sns.barplot(data=df_frequency.nlargest(columns="Frequency", n=30),
                     x="Word",
                     y="Frequency",
                     color='gray')
    ax.set(ylabel="Number of appearances")
    plt.show()
    pareto(news, "news_text_normalized", 10)

    # Treatment 1: drop Portuguese stopwords (compared as-is, so only
    # tokens that exactly match the stopword list are removed).
    stopwords = nltk.corpus.stopwords.words("portuguese")
    clean_phrase = list()
    for text in news.news_text_normalized:
        new_phrase = list()
        text_words = token_space.tokenize(text)
        for word in text_words:
            if word not in stopwords:
                new_phrase.append(word)
        clean_phrase.append(' '.join(new_phrase))
    news["traitement_1"] = clean_phrase

    # Treatment 2: starting from treatment 1, also drop punctuation tokens
    # and a hand-picked list of extra tokens.
    punctuation_list = list()
    for punct in punctuation:
        punctuation_list.append(punct)
    personalized_stopwords = [
        "“", "”", ",", "”,", '""."', '"),"', '–', 'R', '..', '""","', '[...]',
        ').', '...', '"."""', '),', '".'
    ]
    stopwords_with_punctuation = punctuation_list + stopwords + personalized_stopwords
    token_punct = tokenize.WordPunctTokenizer()
    clean_phrase = list()
    for text in news["traitement_1"]:
        new_phrase = list()
        text_words = token_punct.tokenize(text)
        for word in text_words:
            if word not in stopwords_with_punctuation:
                new_phrase.append(word)
        clean_phrase.append(' '.join(new_phrase))
    news["traitement_2"] = clean_phrase

    # Treatment 3: treatment 2 with accents transliterated by unidecode.
    no_accent = [unidecode.unidecode(text) for text in news["traitement_2"]]
    stopwords_with_punctuation_no_accent = [
        unidecode.unidecode(text) for text in stopwords_with_punctuation
    ]
    news["traitement_3"] = no_accent

    # Treatment 4: restart from the original text, transliterate accents,
    # lower-case, then drop the transliterated stopword/punctuation list.
    no_accent_news_text_normalized = [
        unidecode.unidecode(text) for text in news["news_text_normalized"]
    ]
    news["traitement_4"] = no_accent_news_text_normalized
    processed_phrase = list()
    for text in news["traitement_4"]:
        new_phrase = list()
        text = text.lower()
        text_words = token_punct.tokenize(text)
        for word in text_words:
            if word not in stopwords_with_punctuation_no_accent:
                new_phrase.append(word)
        processed_phrase.append(' '.join(new_phrase))
    news["traitement_4"] = processed_phrase

    # Treatment 5 (disabled) - stopwords selected by deception analysis in
    # Portuguese:
    # news["traitement_5"] = no_accent_news_text_normalized
    # selected_stopwords = punctuation_list + stopwords + personalized_stopwords
    # for text in news["traitement_5"]:
    #     new_phrase = list()
    #     text = text.lower()
    #     text_words = token_punct.tokenize(text)
    #     for word in text_words:
    #         if word not in selected_stopwords:
    #             new_phrase.append(word)
    #     processed_phrase.append(' '.join(new_phrase))
    # news["traitement_5"] = processed_phrase

    # Report section for treatment 1.
    pareto(news, "traitement_1", 10)
    print(
        "==========================================================================\n"
    )
    print("Word cloud with preprocessing 1 - simple cleaning of stopwords.\n")
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "traitement_1",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "traitement_1", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "traitement_1", "classification")
    print(
        "\n\n\n====================================================================\n"
    )
    print(
        "=========================================================================="
    )

    # Report section for treatment 2.
    pareto(news, "traitement_2", 10)
    print(
        "==========================================================================\n"
    )
    print(
        "Word cloud with preprocessing 2 - cleaning of stopwords and punctuation.\n"
    )
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "traitement_2",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "traitement_2", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "traitement_2", "classification")
    print(
        "\n\n\n====================================================================\n"
    )
    print(
        "=========================================================================="
    )

    # Report section for treatment 3.
    pareto(news, "traitement_3", 10)
    print(
        "=================================================================================\n"
    )
    print(
        "Word cloud with preprocessing 3 - cleaning of accents, stopwords and punctuation.\n"
    )
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "traitement_3",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "traitement_3", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "traitement_3", "classification")
    print(
        "\n\n\n==========================================================================\n"
    )
    print(
        "================================================================================"
    )

    # Report section for treatment 4.
    pareto(news, "traitement_4", 10)
    print(
        "============================================================================================\n"
    )
    print(
        "Word cloud with preprocessing 4 - cleaning of accents, stopwords, punctuation in lower case.\n"
    )
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "traitement_4",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "traitement_4", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "traitement_4", "classification")
    print(
        "\n\n\n======================================================================================\n"
    )
    print(
        "============================================================================================"
    )
    print(news.head())
def tokenize_data(corpus, col):
    """Replace every value of ``corpus[col]`` with its list of tokens.

    Each value is passed through ``clean`` and then split with NLTK's
    ``WordPunctTokenizer``.

    Args:
        corpus: DataFrame holding the text column; modified in place.
        col: Name of the column to tokenize.

    Returns:
        The same ``corpus`` object, with ``col`` holding token lists.
    """
    # One tokenizer for the whole column instead of one per row, and a
    # single column assignment instead of writing cells while iterating.
    tokenizer = nltk_token.WordPunctTokenizer()
    corpus[col] = corpus[col].map(
        lambda text: tokenizer.tokenize(clean(text)))
    return corpus
def tokenize_data(corpus, col):
    """Replace every value of ``corpus[col]`` with its list of tokens.

    Each value is passed through ``clean`` and then split with NLTK's
    ``WordPunctTokenizer``.

    Args:
        corpus: DataFrame holding the text column; modified in place.
        col: Name of the column to tokenize.

    Returns:
        The same ``corpus`` object, with ``col`` holding token lists.
    """
    # NOTE: a configurable tokenizer lookup
    # (toks[self.params["tokenize"]]) was disabled here; the
    # word/punctuation tokenizer is always used.
    # One tokenizer for the whole column instead of one per row, and a
    # single column assignment instead of writing cells while iterating.
    tokenizer = nltk_token.WordPunctTokenizer()
    corpus[col] = corpus[col].map(
        lambda text: tokenizer.tokenize(clean(text)))
    return corpus
class WordPunctTokenizer(BaseTokenizer):
    """Tokenizer that splits text into words and keeps punctuation tokens."""

    # Label identifying this tokenizer.
    name = 'Word & Punctuation'
    # NLTK tokenizer instance shared by the class.
    tokenizer = tokenize.WordPunctTokenizer()
k += 1 self.svd_transformer = TruncatedSVD(n_components=k) except Exception as ex: print(ex) return self.svd_transformer.fit(X) def transform(self, X, Y=None): return self.svd_transformer.transform(X) def get_params(self, deep=True): return {} punctuation_token = tokenize.WordPunctTokenizer() space_token = tokenize.WhitespaceTokenizer() list_punctuation = [point for point in punctuation] punctuation_stopwords = list_punctuation + stop_words without_accents = [] without_accents_stop_words = [] @app.route('/') def index(): return flask.render_template('index.html') def tokenize(df): processed_sentence = list()
# -*- coding: utf-8 -*-
import nltk.tokenize as tk  # tokenization package

doc = ("Are you curious about tokenization? "
       "Let's see how it works! "
       "We need to analyze a couple of sentences "
       "with punctuations to see it in action.")
print(doc)

# Sentence-level tokenization
tokens = tk.sent_tokenize(doc)
for position, sentence in enumerate(tokens, start=1):
    print(position, sentence)
print(1, '+' * 19)

# Word-level tokenization
tokens = tk.word_tokenize(doc)
for position, word in enumerate(tokens, start=1):
    print(position, word)
print(2, '+' * 19)

# Word/punctuation tokenizer object: splits on every punctuation mark
tokenizer = tk.WordPunctTokenizer()
tokens = tokenizer.tokenize(doc)
for position, piece in enumerate(tokens, start=1):
    print(position, piece)
def tokenize_with_lower(self, text):
    """Split ``text`` with ``WordPunctTokenizer`` and lower-case each token.

    Returns the lower-cased tokens as a list, in their original order.
    """
    splitter = tokenize.WordPunctTokenizer()
    lowered = []
    for token in splitter.tokenize(text):
        lowered.append(token.lower())
    return lowered
class WordPunctTokenizer(BaseTokenizer):
    """Split by words, keeping punctuation.

    This example. → (This), (example), (.)
    """

    # Label identifying this tokenizer (kept in its original language).
    name = '单词 & 标点'
    # NLTK tokenizer instance shared by the class.
    tokenizer = tokenize.WordPunctTokenizer()
def AO_lAssessOpinion(AO_sDocument):
    '''Score the sentiment of a document word by word.

    Input  - AO_sDocument: the document text.
    Output - a five-element list:
        [0] rounded sum of the positive word scores,
        [1] rounded sum of the negative word scores,
        [2] rounded sum of the two values above,
        [3] summary line listing every scored word with its score and tag,
        [4] full sentence describing the overall sentiment.
    A blank document returns the default [0, 0, 0, "-", "-"].

    NOTE: this function uses Python 2 syntax (the `<>` operator).
    '''
    AO_lOpinion = [0, 0, 0, "-", "-"
                   ]  # default result; filled below as [positive, negative, total, AO_sLine, AO_sFullOpinion]
    AO_fPosWords = 0
    AO_fNegWords = 0
    AO_sLine = " "
    if AO_sDocument.strip() == '':
        return AO_lOpinion
    # Break the document into sentences
    AO_lSentences = sent_tokenize(AO_sDocument)
    # For all the sentences in the document
    for i in range(0, len(AO_lSentences)):
        # Tokenize, pass every token through AO_sCorrect, re-tokenize the
        # corrected sentence and identify the POS of each token
        tokens = tokenize.WordPunctTokenizer().tokenize(AO_lSentences[i])
        AO_CorrectedSentence = ''
        for t in tokens:
            AO_CorrectedSentence = AO_CorrectedSentence + AO_sCorrect(t) + ' '
        AO_CorrectedSentence = AO_CorrectedSentence.strip().capitalize()
        tokens = tokenize.WordPunctTokenizer().tokenize(AO_CorrectedSentence)
        AO_lTokens = list(tagger.tag(tokens))
        # Negation state is reset at the start of every sentence
        AO_bNetNegation = False
        AO_sNegationPhrase = ''
        # for all the individual words in the sentence
        for j in range(0, len(AO_lTokens)):
            AO_fWordSentiment = 0
            # Make the (word, tag) pair mutable so the word can be rewritten
            AO_lTokens[j] = list(AO_lTokens[j])
            # handle don't, won't, can't. This is a three-token parsing.
            # NOTE(review): the literals compared below are 'a' and "t";
            # presumably 'a' is how the apostrophe token arrives here after
            # AO_sCorrect - confirm.
            AO_bTriGram = False
            if j + 2 < len(AO_lTokens):
                if str(AO_lTokens[j + 1][0]) == 'a':
                    if str(AO_lTokens[j + 2][0]) == "t":
                        AO_bTriGram = True
            if j + 1 < len(AO_lTokens):
                if str(AO_lTokens[j][0]) == 'a':
                    if str(AO_lTokens[j + 1][0]) == "t":
                        AO_bTriGram = True
            # NOTE(review): `j > 2` skips a contraction ending at index 2
            # (tokens 0..2) - confirm whether `j >= 2` was intended.
            if j > 2:
                if str(AO_lTokens[j - 1][0]) == 'a':
                    if str(AO_lTokens[j][0]) == "t":
                        # Last token of the contraction: rebuild it as
                        # <word>'<t> and let it be scored below
                        AO_sTriGram = str(AO_lTokens[j - 2][0] + "'" +
                                          AO_lTokens[j][0])
                        AO_lTokens[j][0] = AO_sTriGram
                        AO_bTriGram = False
            if AO_bTriGram == False:
                # see if this is a negator; each negator toggles the state
                if str(AO_lTokens[j][0]).lower() in AO_setNegationWords:
                    AO_bNetNegation = not AO_bNetNegation
                # While negation is active, accumulate the negated phrase
                if AO_bNetNegation:
                    AO_sNegationPhrase = AO_sNegationPhrase + ' ' + AO_lTokens[
                        j][0]
                else:
                    AO_sNegationPhrase = ''
                # if this is a negation word, we will deal with it with the next words
                if str(AO_lTokens[j][0]).lower() not in AO_setNegationWords:
                    # if this is an intensifier, we will deal with it with the next word
                    if AO_fAssessWord(AO_lTokens[j], ['int']) == 0:
                        AO_fWordSentiment = AO_fAssessWord(
                            AO_lTokens[j],
                            ['adj', 'adv', 'noun', 'verb', 'minqinghu'])
                        # if the word carries any sentiment (positive or negative)
                        if abs(AO_fWordSentiment) > 0:
                            if (j == 0):
                                if (AO_fWordSentiment > 0):
                                    # First word, positive sentiment
                                    AO_fPosWords = AO_fPosWords + AO_fWordSentiment
                                    AO_sLine = AO_sLine +\
                                        'P (' +\
                                        str(AO_fWordSentiment) +\
                                        '): ' +\
                                        str(AO_lTokens[j][0]) +\
                                        ' ' +\
                                        str(str(AO_lTokens[j][1])) +\
                                        ' ~ '
                                elif (AO_fWordSentiment < 0):
                                    # First word, negative sentiment
                                    AO_fNegWords = AO_fNegWords + AO_fWordSentiment
                                    AO_sLine = AO_sLine +\
                                        'N(' +\
                                        str(AO_fWordSentiment) +\
                                        '): ' +\
                                        str(AO_lTokens[j][0]) +\
                                        ',' +\
                                        str(AO_lTokens[j][1]) +\
                                        ' '
                            else:
                                # if this is not the first word in the sentence,
                                # then it may have been intensified by the word before it
                                AO_fIntensifier = AO_fAssessWord(
                                    AO_lTokens[j - 1], ['int'])
                                if (AO_fWordSentiment > 0) \
                                        and (AO_fIntensifier == float(0)) \
                                        and (AO_bNetNegation == True):
                                    # Negated positive sentiment (but not intensified)
                                    AO_fPosWords = AO_fPosWords + AO_fWordSentiment - AO_iNEGATIONconstatnt
                                    AO_sLine = AO_sLine +\
                                        'NegatedP (' +str(AO_fWordSentiment) +\
                                        ' - ' +\
                                        str(AO_iNEGATIONconstatnt) +\
                                        '): ' +\
                                        AO_sNegationPhrase +\
                                        ' ~ '
                                elif (AO_fWordSentiment > 0) \
                                        and (AO_fIntensifier <> float(0)) \
                                        and (AO_bNetNegation == False):
                                    # Intensified positive sentiment that is not negated
                                    AO_fSentiment = AO_fWordSentiment * (
                                        1 + AO_fIntensifier)
                                    AO_fPosWords = AO_fPosWords + AO_fSentiment  # scaled by (1 + intensifier)
                                    AO_sLine = AO_sLine +\
                                        'emphP((1+' +\
                                        str(AO_fIntensifier) +\
                                        ')*(' +\
                                        str(AO_fWordSentiment ) +\
                                        ')=('+str(AO_fSentiment) +\
                                        ')): ' +\
                                        str(AO_lTokens[j-1][0]) +\
                                        ' ' +\
                                        str(AO_lTokens[j][0]) +\
                                        ' , ' +\
                                        str(AO_lTokens[j-1][1]) +\
                                        ' ' +\
                                        str(AO_lTokens[j][1]) +\
                                        ' ~ '
                                elif (AO_fWordSentiment > 0) \
                                        and (AO_fIntensifier <> float(0)) \
                                        and (AO_bNetNegation == True):
                                    # Negated positive word that is intensified
                                    AO_fSentiment = AO_fWordSentiment * (
                                        1 + AO_fIntensifier
                                    ) - AO_iNEGATIONconstatnt
                                    AO_fPosWords = AO_fPosWords + AO_fSentiment  # scaled by (1 + intensifier)
                                    AO_sLine = AO_sLine +\
                                        'NegatedEmphP((1+' +\
                                        str(AO_fIntensifier) +\
                                        ')*(' +\
                                        str(AO_fWordSentiment ) +\
                                        '-'+str(AO_iNEGATIONconstatnt) +\
                                        ')=(' +\
                                        str(AO_fSentiment) +\
                                        ')): ' +\
                                        AO_sNegationPhrase +\
                                        ' ~ '
                                elif (AO_fWordSentiment > 0) \
                                        and (AO_fIntensifier == float(0)) \
                                        and (AO_bNetNegation == False):
                                    # Nth positive sentiment, neither negated nor intensified
                                    AO_fPosWords = AO_fPosWords + AO_fWordSentiment
                                    AO_sLine = AO_sLine +\
                                        'P (' +\
                                        str(AO_fWordSentiment) +\
                                        '): ' +\
                                        str(AO_lTokens[j][0]) +\
                                        ' ' +\
                                        str(str(AO_lTokens[j][1])) +\
                                        ' ~ '
                                elif (AO_fWordSentiment < 0) \
                                        and (AO_fIntensifier <> float(0)) \
                                        and (AO_bNetNegation == True):
                                    # Negated, emphasised negative sentiment
                                    # NOTE(review): unlike the sibling branches,
                                    # the summary text below never appends
                                    # str(AO_fSentiment) or the closing ')): '.
                                    AO_fSentiment = AO_fWordSentiment * (
                                        1 + AO_fIntensifier
                                    ) + AO_iNEGATIONconstatnt
                                    AO_fNegWords = AO_fNegWords + AO_fSentiment  # scaled by (1 + intensifier)
                                    AO_sLine = AO_sLine +\
                                        'NegatedEmphN((1+' +\
                                        str(AO_fIntensifier) +\
                                        ')*(' +\
                                        str(AO_fWordSentiment ) +\
                                        '-' +\
                                        str(AO_iNEGATIONconstatnt) +\
                                        ')=(' +\
                                        AO_sNegationPhrase +\
                                        ' ~ '
                                elif (AO_fWordSentiment < 0) \
                                        and (AO_fIntensifier <> float(0)) \
                                        and (AO_bNetNegation == False):
                                    # Emphasised negative sentiment
                                    AO_fSentiment = AO_fWordSentiment * (
                                        1 + AO_fIntensifier)
                                    AO_fNegWords = AO_fNegWords + AO_fSentiment  # scaled by (1 + intensifier)
                                    AO_sLine = AO_sLine +\
                                        'emphN((1+' +\
                                        str(AO_fIntensifier) +\
                                        ')*(' +\
                                        str(AO_fWordSentiment ) +\
                                        ')=('+str(AO_fSentiment) +\
                                        ')): ' +\
                                        str(AO_lTokens[j-1][0]) +\
                                        ' ' +\
                                        str(AO_lTokens[j][0]) +\
                                        ' , ' +\
                                        str(AO_lTokens[j-1][1]) +\
                                        ',' +\
                                        str(AO_lTokens[j][1]) +\
                                        ' ~ '
                                elif (AO_fWordSentiment < 0) \
                                        and (AO_fIntensifier == float(0)) \
                                        and (AO_bNetNegation == True):
                                    # Negated negative sentiment
                                    AO_fNegWords = AO_fNegWords + AO_fWordSentiment + AO_iNEGATIONconstatnt
                                    AO_sLine = AO_sLine +\
                                        'N(' +\
                                        str(AO_fWordSentiment) +\
                                        '+' +\
                                        str(AO_iNEGATIONconstatnt) +\
                                        '): ' +\
                                        AO_sNegationPhrase +\
                                        ' ~ '
                                elif (AO_fWordSentiment < 0) \
                                        and (AO_fIntensifier == float(0)) \
                                        and (AO_bNetNegation == False):
                                    # Nth negative sentiment
                                    AO_fNegWords = AO_fNegWords + AO_fWordSentiment
                                    AO_sLine = AO_sLine +\
                                        'N(' +\
                                        str(AO_fWordSentiment) +\
                                        '): ' +\
                                        str(AO_lTokens[j][0]) +\
                                        ',' +\
                                        str(AO_lTokens[j][1]) +\
                                        ' ~ '
                                # end elif
                            # end if - first word
                        # endif word carries sentiment
                    # endif this is an intensifier
                # endif the word is a negator
        # for all the words in the sentence
    # for all the sentences in the document
    # Only the tokens of the last processed sentence are inspected here.
    if len(AO_lTokens) > 0:
        a = round(AO_fPosWords, 2)
        b = round(AO_fNegWords, 2)
        c = round(a + b, 2)
        if c > 0:
            AO_stentiment = "positive"
        elif c < 0:
            AO_stentiment = "negative"
        elif c == 0:
            AO_stentiment = "neutral"
        else:
            # Only reachable when c compares as neither >, < nor == 0
            AO_stentiment = "ambiguas"
        if AO_sLine.strip() <> '':
            AO_sFullOpinion = 'The sentiment of "%s" is %s(%s) as %s' % (
                AO_sDocument, AO_stentiment, c, AO_sLine)
        else:
            AO_sFullOpinion = 'The sentiment of "%s" is %s.' % (AO_sDocument,
                                                                 AO_stentiment)
        AO_lOpinion = [a, b, c, AO_sLine, AO_sFullOpinion]
    return AO_lOpinion
def __init__(self, path):
    """Load the embedding data found at ``path`` and create a tokenizer.

    ``self.embedding_in(path)`` supplies, in order, the vocabulary size,
    the embedding size and the model, which are stored on the instance
    together with a ``WordPunctTokenizer``.
    """
    print("Now load in glove model.")
    # A gensim loader was used here previously:
    # gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)
    loaded = self.embedding_in(path)
    self.vocab_size, self.embedding_size, self.model = loaded
    print("Model loaded!")
    self.tokenizer = tk.WordPunctTokenizer()
from nltk import tokenize from string import punctuation nltk.download("all") data = pd.read_csv("data.csv") foo_words = nltk.corpus.stopwords.words("english") list_new_phrase = [] list_punc_phrase = [] list_strem_phrase = [] list_adjective_phrase = [] list_other_words = [] token_space = tokenize.WhitespaceTokenizer() token_punctuation = tokenize.WordPunctTokenizer() stemmer = nltk.RSLPStemmer() for punc in punctuation: foo_words.append(punc) c = 0 list_places = [] for c in data['City']: if ' ' in str(c): words_text = token_space.tokenize(c) for w in words_text: foo_words.append(str(w).lower()) else: foo_words.append(str(c).lower()) for c in [
palavras_texto = token_espaco.tokenize(opniao) for palavra in palavras_texto: if palavra not in palavras_irrelevantes: nova_frase.append(palavra) frase_processada.append(' '.join(nova_frase)) resenha["tratamento1"] = frase_processada resenha.head() classificar_texto(resenha, 'tratamento1', 'classificacao') pareto(resenha, 'tratamento1', 10) #PARTE 2 #separar a pontuação do texto frase = "Olá mundo!" token_pontuacao = tokenize.WordPunctTokenizer() token_frase = token_pontuacao.tokenize(frase) print(token_frase) #retirada de pontuação print(punctuation) pontuacao = list() for ponto in punctuation: pontuacao.append(ponto) print(pontuacao) pontuacao_stop_words = pontuacao + palavras_irrelevantes frase_processada = list() for opiniao in resenha["tratamento1"]:
def treat_punctuation(text):
    """Return ``text`` re-joined with a single space between its tokens.

    The text is split with NLTK's ``WordPunctTokenizer`` and the resulting
    tokens are joined back with single spaces.
    """
    pieces = tokenize.WordPunctTokenizer().tokenize(text)
    return ' '.join(pieces)
# remove punctuation from each word # print(palavras_ementa) table = str.maketrans('', '', string.punctuation) stripped = [w.translate(table) for w in palavras_ementa] # remove remaining tokens that are not alphabetic words = [word for word in stripped if word.isalpha()] # print(palavras_ementa) for palavra in words: if palavra.lower() not in stop_words: nova_ementa.append(wordnet_lemmatizer.lemmatize(palavra)) #nova_ementa.append(stemmer.stem(palavra)) #nova_ementa.append(palavra) ementas_processadas.append(' '.join(nova_ementa)) return ementas_processadas """aqui utiliza os metodos anteriores para aplicar tratamento nas frases removendo as stop words e exibindo o retorno no formato de grafico posteriormente""" ementas_trat_1 = [unidecode.unidecode(texto.lower()) for texto in dados_civil_e_crime["EMENTA_SEM_VERBATIZACAO"]] #ementas_trat_1 = [unidecode.unidecode(texto.lower()) for texto in dados_civil_e_crime["EMENTA"]] dados_civil_e_crime["EMENTA_SEM_CAPUT_TRAT_1"] = ementas_trat_1 punct_tokenize = tokenize.WordPunctTokenizer() dados_civil_e_crime['EMENTAS_PROCESSADAS'] = remove_stopwords(tokenizador, dados_civil_e_crime.EMENTA_SEM_CAPUT_TRAT_1) df_frequencia = gera_frequencia(tokenizador, dados_civil_e_crime.EMENTAS_PROCESSADAS) escreve_pareto(df_frequencia) nuvem_palavras_processadas = gera_nuvem_palavras(dados_civil_e_crime.EMENTAS_PROCESSADAS) desenha_palavras(nuvem_palavras_processadas)
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import copy, inspect
from scipy.spatial.distance import cosine

# English Snowball stemming function.
stem = SnowballStemmer("english").stem

# Patterns for links, hashtags and mentions.
# NOTE(review): `pic\.[s]*` only matches "pic." followed by literal "s"
# characters; `[^\s]*` may have been intended - confirm.
link_re = re.compile(r"(http(s)?[^\s]*)|(pic\.[s]*)")
hashtag_re = re.compile(r"#[a-zA-Z0-9_]+")
mention_re = re.compile(r"@[a-zA-Z0-9_]+")
pat_type = {'links': link_re, 'hashtags': hashtag_re, 'mentions': mention_re}

# Tokenizer functions selectable by name.
tokenizers = {'treebank': nltk_token.TreebankWordTokenizer().tokenize,
              'wordpunct': nltk_token.WordPunctTokenizer().tokenize,
              'tweettokenize': nltk_token.TweetTokenizer().tokenize}


def read_file(path):
    """Load the file at ``path`` into pandas according to its extension.

    The text after the last ``.`` selects the reader: ``csv``, ``tsv``
    (tab-delimited), ``pkl`` (pickle) or ``json``. The comparison is
    case-sensitive.

    Args:
        path: Path of the file to load.

    Returns:
        The object produced by the matching pandas reader.

    Raises:
        ValueError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise ValueError("Path does not point to existing file: {}".format(path))
    ending = path.split('.')[-1]
    if ending == 'csv':
        return pd.read_csv(path)
    elif ending == 'tsv':
        return pd.read_csv(path, delimiter='\t')
    elif ending == 'pkl':
        # NOTE(review): unpickling runs arbitrary code; only load trusted files.
        return pd.read_pickle(path)
    elif ending == 'json':
        return pd.read_json(path)