Example #1
def fit_data(df, vocab_dic):
    tokenizer = tk.WordPunctTokenizer()
    max_review_len = 0
    max_voc_len = 0
    user_review_dic = {}
    item_review_dic = {}
    for user, group in df.groupby(['reviewerID']):
        user_review_dic[user] = {}
        for asin, review in zip(group.asin, group.reviewText):
            user_review_dic[user][asin] = set([])
            if type(review) is not str:
                continue
            tokens = tokenizer.tokenize(review)

            for token in tokens:
                if token in vocab_dic:
                    user_review_dic[user][asin].add(vocab_dic[token])
            max_voc_len = max(max_voc_len, len(user_review_dic[user][asin]))
        max_review_len = max(max_review_len, len(user_review_dic[user]))

    for asin, group in df.groupby(['asin']):
        item_review_dic[asin] = {}
        for reviewerID, review in zip(group.reviewerID, group.reviewText):
            item_review_dic[asin][reviewerID] = set([])
            if type(review) is not str:
                continue
            tokens = tokenizer.tokenize(review)
            for token in tokens:
                if token in vocab_dic:
                    item_review_dic[asin][reviewerID].add(vocab_dic[token])
            max_voc_len = max(max_voc_len,
                              len(item_review_dic[asin][reviewerID]))
        max_review_len = max(max_review_len, len(item_review_dic[asin]))

    return user_review_dic, item_review_dic, max_review_len, max_voc_len
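
A minimal usage sketch for the function above; it assumes `tk` is `nltk.tokenize` and that `vocab_dic` maps tokens to integer ids. The toy DataFrame below is made up for illustration:

import pandas as pd
import nltk.tokenize as tk

df = pd.DataFrame({
    "reviewerID": ["u1", "u1", "u2"],
    "asin": ["a1", "a2", "a1"],
    "reviewText": ["Great phone, love it!", "Battery died fast.", "Great value."],
})
vocab_dic = {"Great": 0, "phone": 1, "Battery": 2, "value": 3}
user_dic, item_dic, max_reviews, max_voc = fit_data(df, vocab_dic)
print(max_reviews, max_voc)  # with this toy data both maxima are 2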
Example #2
def fit_data(df, vocab_dic):
    tokenizer = tk.WordPunctTokenizer()
    max_count = 0
    user_review_dic = {}
    item_review_dic = {}
    for user, group in df.groupby(['reviewerID']):
        user_review_dic[user] = set([])
        cur_count = 0
        doc = ' '.join(group['reviewText'].values)
        tokens = tokenizer.tokenize(doc)
        for token in tokens:
            if token in vocab_dic:
                if vocab_dic[token] not in user_review_dic[user]:
                    user_review_dic[user].add(vocab_dic[token])
                    cur_count += 1
        max_count = max(cur_count, max_count)

    for item, group in df.groupby(['asin']):
        item_review_dic[item] = set([])
        doc = ' '.join(group['reviewText'].values)
        cur_count = 0
        tokens = tokenizer.tokenize(doc)
        for token in tokens:
            if token in vocab_dic:
                if vocab_dic[token] not in item_review_dic[item]:
                    item_review_dic[item].add(vocab_dic[token])
                    cur_count += 1
        max_count = max(cur_count, max_count)

    return user_review_dic, item_review_dic, max_count
Example #3
def load_word(file):
    sents_up = []
    sents_down = []
    labels = []
    aspects = []
    lines = open(file, 'r', encoding="utf8").readlines()
    for i in range(0, len(lines), 3):
        polarity = lines[i + 2].split()[0]
        if polarity == 'conflict':
            continue
        tokenizer = tk.WordPunctTokenizer()
        sentences = tokenizer.tokenize(lines[i].strip())
        aspect = tokenizer.tokenize(lines[i + 1].strip())
        aspects.append(aspect)
        if "aspectTerm" not in sentences:
            print(sentences)
            continue
        sents_up.append(sentences[:sentences.index("aspectTerm")] + aspect)
        sents_down.append(aspect + sentences[sentences.index("aspectTerm") + 1:])

        if polarity == 'negative':
            labels.append(-1)
        elif polarity == 'neutral':
            labels.append(0)
        elif polarity == 'positive':
            labels.append(1)

    up_max_len = max([len(elem) for elem in sents_up])
    down_max_len = max([len(elem) for elem in sents_down])
    return sents_up, sents_down, aspects, labels, up_max_len, down_max_len
Example #4
    def apply_preprocessors(self, column):
        punctuationTokenizer = tokenize.WordPunctTokenizer()

        processed_sentence = list()
        for each in tqdm(column, desc="RemovePonctuationPreprocessor"):
            filtered_sentence = list()
            wordish = punctuationTokenizer.tokenize(each)
            for item in wordish:
                if item not in [p for p in punctuation]:
                    filtered_sentence.append(item)
            processed_sentence.append(' '.join(filtered_sentence))

        return processed_sentence
Example #5
    def frequency_word(self):
        list_df = list_dataframe()
        dataframe_frequency = list_df[0]
        word_punct_tokenize = tokenize.WordPunctTokenizer()
        #transforming a dataframe column into an array
        text_array = dataframe_frequency["comments"].array
        #joining all the array items into a single text
        text_join = ' '.join(text_array)
        #tokenizing the whole text
        tokens = word_punct_tokenize.tokenize(text_join)
        #computing the frequency of each word
        freq_dist = FreqDist(tokens)

        count_frequency = list(freq_dist)
        total_word_frequency = count_frequency[:self._total_words]
        self._data = total_word_frequency
Example #6
def remove_stop_words(text):

    text = unidecode.unidecode(text).lower()
    token_punct = tokenize.WordPunctTokenizer()
    token = token_punct.tokenize(text)
    stemmer = nltk.RSLPStemmer()

    words = nltk.corpus.stopwords.words('portuguese')
    words_without_accent = [unidecode.unidecode(item) for item in words]
    stopwords = words + words_without_accent + list(punctuation)

    without_stop_words = [
        stemmer.stem(item) for item in token if item not in stopwords
    ]

    return " ".join(without_stop_words)
Example #7
def caracteres_specials():
    comments_db_for_carac_specials = comments_dataframe[0]
    punct = punctuation + "…"
    token = tokenize.WordPunctTokenizer()

    comment_without_punctuation = []
    for comment in comments_db_for_carac_specials['comments']:
        comments = []
        separate_comment = token.tokenize(comment)
        for word in separate_comment:
            if (word not in punct):
                comments.append(word)

        comment_without_punctuation.append(' '.join(comments))

    comments_db_for_carac_specials['comments'] = comment_without_punctuation
    update_dataframe(comments_db_for_carac_specials)
Example #8
    def tokenize(self, sentences, task_ids):
        # nltk TweetTokenizer for stance
        tweet_tokenizer = tokenize.TweetTokenizer()

        # nltk WordPunctTokenizer for NLI
        punct_tokenizer = tokenize.WordPunctTokenizer()

        all_sentence = []
        for sentence, task_id in zip(sentences, task_ids):
            if task_id == 0:  # stance
                tokenize_sent = tweet_tokenizer.tokenize(sentence)
            elif task_id == 1:  # NLI
                tokenize_sent = punct_tokenizer.tokenize(sentence)

            all_sentence.append(tokenize_sent)

        return all_sentence
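
To see why the two tokenizers are kept per task, here is a standalone comparison (independent of the class above):

from nltk import tokenize

tweet_tok = tokenize.TweetTokenizer()
punct_tok = tokenize.WordPunctTokenizer()
sample = "Can't wait!!! #excited"
print(tweet_tok.tokenize(sample))  # keeps the contraction and the hashtag intact
print(punct_tok.tokenize(sample))  # splits at every word/punctuation boundary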
Example #9
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),   # articles
     (r'.*able$', 'JJ'),                # adjectives
     (r'.*ness$', 'NN'),                # nouns formed from adjectives
     (r'.*ly$', 'RB'),                  # adverbs
     (r'.*s$', 'NNS'),                  # plural nouns
     (r'.*ing$', 'VBG'),                # gerunds
     (r'.*ed$', 'VBD'),                 # past tense verbs
     (r'.*', 'NN')                      # nouns (default)
])

templates = [
    SymmetricProximateTokensTemplate(ProximateTagsRule, (1,1)),
    SymmetricProximateTokensTemplate(ProximateTagsRule, (2,2)),
    SymmetricProximateTokensTemplate(ProximateTagsRule, (1,2)),
    SymmetricProximateTokensTemplate(ProximateTagsRule, (1,3)),
    SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)),
    SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)),
    SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)),
    SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)),
    ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1,1)),
    ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),
    ]

default_tagger = nltk.DefaultTagger('NN')

# now test the unpickled tagger on an immortal sentence.
flu = 'And now for something different'
tokens = tokenize.WordPunctTokenizer().tokenize(flu)
for term, POS in tagger.tag(tokens):
  print "\t", term, POS
Example #10
def main():
    # Import CSV dataset file to Pandas DataFrame:
    print(
        "====================================================================")
    print(
        "========================== SCRIPT START ============================\n"
    )
    path = 'data'
    dataset_csv = 'real_and_fake_news_corpus_pt_br.csv'
    print("Importing CSV dataset '{}' from folder '{}':\n".format(
        dataset_csv, path))
    news = pd.read_csv(os.path.join(path, dataset_csv))
    print("Imported {} instances with {} attributes.".format(
        news.shape[0], news.shape[1]))

    classification = news["Tag"].replace(["FAKE", "REAL"], [0, 1])
    news["classification"] = classification

    print("The logistic regression resulted in {}% of accuracy.\n".format(
        100 * text_classifier(news, "news_text_normalized", "classification")))

    print(
        "====================================================================\n"
    )
    print("Word cloud without any data preprocessing.\n")
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "news_text_normalized",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "news_text_normalized", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "news_text_normalized", "classification")
    print(
        "\n\n\n====================================================================\n"
    )
    #plt.show(block=False)
    #input('press <ENTER> to continue')
    #plt.savefig('word_cloud.png')

    #Tokenization of dataset
    print(
        "====================================================================")
    print("TOKENIZATION STARTED")
    token_space = tokenize.WhitespaceTokenizer()
    token_phrase = token_space.tokenize(all_words)

    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({
        "Word": list(frequency.keys()),
        "Frequency": list(frequency.values())
    })
    print("The 10 most frequent words in this dataset are:\n")
    print(df_frequency.nlargest(columns="Frequency", n=10))

    plt.figure(figsize=(12, 8))
    ax = sns.barplot(data=df_frequency.nlargest(columns="Frequency", n=30),
                     x="Word",
                     y="Frequency",
                     color='gray')
    ax.set(ylabel="Number of appearances")
    plt.show()

    pareto(news, "news_text_normalized", 10)

    #Cleaning only stopwords lowercase
    stopwords = nltk.corpus.stopwords.words("portuguese")

    clean_phrase = list()
    for text in news.news_text_normalized:
        new_phrase = list()
        text_words = token_space.tokenize(text)
        for word in text_words:
            if word not in stopwords:
                new_phrase.append(word)
        clean_phrase.append(' '.join(new_phrase))

    news["traitement_1"] = clean_phrase

    # Cleaning stopwords lowercase and punctuation
    punctuation_list = list()
    for punct in punctuation:
        punctuation_list.append(punct)

    personalized_stopwords = [
        "“", "”", ",", "”,", '""."', '"),"', '–', 'R', '..', '""","', '[...]',
        ').', '...', '"."""', '),', '".'
    ]
    stopwords_with_punctuation = punctuation_list + stopwords + personalized_stopwords
    token_punct = tokenize.WordPunctTokenizer()
    clean_phrase = list()
    for text in news["traitement_1"]:
        new_phrase = list()
        text_words = token_punct.tokenize(text)
        for word in text_words:
            if word not in stopwords_with_punctuation:
                new_phrase.append(word)
        clean_phrase.append(' '.join(new_phrase))

    news["traitement_2"] = clean_phrase

    no_accent = [unidecode.unidecode(text) for text in news["traitement_2"]]
    stopwords_with_punctuation_no_accent = [
        unidecode.unidecode(text) for text in stopwords_with_punctuation
    ]

    news["traitement_3"] = no_accent

    no_accent_news_text_normalized = [
        unidecode.unidecode(text) for text in news["news_text_normalized"]
    ]

    news["traitement_4"] = no_accent_news_text_normalized

    processed_phrase = list()

    for text in news["traitement_4"]:
        new_phrase = list()
        text = text.lower()
        text_words = token_punct.tokenize(text)
        for word in text_words:
            if word not in stopwords_with_punctuation_no_accent:
                new_phrase.append(word)
        processed_phrase.append(' '.join(new_phrase))

    news["traitement_4"] = processed_phrase

    # Traitement 5 - Selected stopwords by deception analysis in Portuguese:
    # news["traitement_5"] = no_accent_news_text_normalized
    # selected_stopwords = punctuation_list + stopwords + personalized_stopwords
    #     for text in news["traitement_5"]:
    #     new_phrase = list()
    #     text = text.lower()
    #     text_words = token_punct.tokenize(text)
    #     for word in text_words:
    #         if word not in selected_stopwords:
    #             new_phrase.append(word)
    #     processed_phrase.append(' '.join(new_phrase))

    # news["traitement_5"] = processed_phrase

    pareto(news, "traitement_1", 10)
    print(
        "==========================================================================\n"
    )
    print("Word cloud with preprocessing 1 - simple cleaning of stopwords.\n")
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "traitement_1",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "traitement_1", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "traitement_1", "classification")
    print(
        "\n\n\n====================================================================\n"
    )
    print(
        "=========================================================================="
    )

    pareto(news, "traitement_2", 10)
    print(
        "==========================================================================\n"
    )
    print(
        "Word cloud with preprocessing 2 - cleaning of stopwords and punctuation.\n"
    )
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "traitement_2",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "traitement_2", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "traitement_2", "classification")
    print(
        "\n\n\n====================================================================\n"
    )
    print(
        "=========================================================================="
    )

    pareto(news, "traitement_3", 10)
    print(
        "=================================================================================\n"
    )
    print(
        "Word cloud with preprocessing 3 - cleaning of accents, stopwords and punctuation.\n"
    )
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "traitement_3",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "traitement_3", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "traitement_3", "classification")
    print(
        "\n\n\n==========================================================================\n"
    )
    print(
        "================================================================================"
    )

    pareto(news, "traitement_4", 10)
    print(
        "============================================================================================\n"
    )
    print(
        "Word cloud with preprocessing 4 - cleaning of accents, stopwords, punctuation in lower case.\n"
    )
    print("Word cloud of all news:\n")
    _, all_words, _ = word_cloud_complete(news, "traitement_4",
                                          "classification")
    print("\n\n\nWord cloud of real news:\n")
    word_cloud_real(news, "traitement_4", "classification")
    print("\n\n\nWord cloud of fake news:\n")
    word_cloud_fake(news, "traitement_4", "classification")
    print(
        "\n\n\n======================================================================================\n"
    )
    print(
        "============================================================================================"
    )

    print(news.head())
Example #11
def tokenize_data(corpus, col):
    for idx, row in corpus.iterrows():
        corpus.at[idx, col] = nltk_token.WordPunctTokenizer().tokenize(
            clean(row[col]))
    return corpus
Example #12
def tokenize_data(corpus, col):
    #sent_tokenizer = toks[self.params["tokenize"]]
    for idx, row in corpus.iterrows():
        corpus.at[idx, col] = nltk_token.WordPunctTokenizer().tokenize(clean(row[col]))
    return corpus
Example #13
class WordPunctTokenizer(BaseTokenizer):
    """ Split by words and (keep) punctuation. """
    tokenizer = tokenize.WordPunctTokenizer()
    name = 'Word & Punctuation'
Example #14
                    k += 1

            self.svd_transformer = TruncatedSVD(n_components=k)
        except Exception as ex:
            print(ex)

        return self.svd_transformer.fit(X)

    def transform(self, X, Y=None):
        return self.svd_transformer.transform(X)

    def get_params(self, deep=True):
        return {}


punctuation_token = tokenize.WordPunctTokenizer()
space_token = tokenize.WhitespaceTokenizer()
list_punctuation = [point for point in punctuation]
punctuation_stopwords = list_punctuation + stop_words
without_accents = []
without_accents_stop_words = []


@app.route('/')
def index():
    return flask.render_template('index.html')


def tokenize(df):
    processed_sentence = list()
Example #15
# -*- coding: utf-8 -*-
import nltk.tokenize as tk  # tokenization package


doc = "Are you curious about tokenization? " \
      "Let's see how it works! " \
      "We need to analyze a couple of sentences " \
      "with punctuations to see it in action."
print(doc)
# sentence-based tokenization
tokens = tk.sent_tokenize(doc)
for i, token in enumerate(tokens):
    print(i + 1, token)
print(1, '+' * 19)
# word-based tokenization
tokens = tk.word_tokenize(doc)
for i, token in enumerate(tokens):
    print(i + 1, token)
print(2, '+' * 19)

# word-and-punctuation tokenizer object; splits on all punctuation
tokenizer = tk.WordPunctTokenizer()
tokens = tokenizer.tokenize(doc)
for i, token in enumerate(tokens):
    print(i + 1, token)
Example #16
 def tokenize_with_lower(self, text):
     return [
         word.lower()
         for word in tokenize.WordPunctTokenizer().tokenize(text)
     ]
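
The method only uses `self` as the receiver, so a standalone equivalent is a one-liner; a quick check of the output shape:

from nltk import tokenize

def tokenize_with_lower(text):
    # same logic as the method above, without the class
    return [word.lower() for word in tokenize.WordPunctTokenizer().tokenize(text)]

print(tokenize_with_lower("Hello, World!"))  # ['hello', ',', 'world', '!']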
Example #17
class WordPunctTokenizer(BaseTokenizer):
    """ 根据单词分词, 保留标点. This example. → (This), (example), (.)"""
    tokenizer = tokenize.WordPunctTokenizer()
    name = '单词 & 标点'
Example #18
def AO_lAssessOpinion(AO_sDocument):
    '''
    This function assesses the opinion expressed in a document.
    Input   - A document

    Output  - An array with Percent Positive Words
                            Percent Negative Words
                            Sum of the above
                            Summary line with the above percents and a tagged list of all the opinion words in the document
    '''

    AO_lOpinion = [0, 0, 0, "-", "-"
                   ]  # [Negative , Positive, Total, AO_sLine,AO_sFullOpinion]
    AO_fPosWords = 0
    AO_fNegWords = 0
    AO_sLine = " "

    if AO_sDocument.strip() == '':
        return AO_lOpinion

    # Break the document into sentences
    AO_lSentences = sent_tokenize(AO_sDocument)

    # For all the sentences in the document
    for i in range(0, len(AO_lSentences)):

        # identify the POS
        tokens = tokenize.WordPunctTokenizer().tokenize(AO_lSentences[i])

        AO_CorrectedSentence = ''
        for t in tokens:
            AO_CorrectedSentence = AO_CorrectedSentence + AO_sCorrect(t) + ' '
        AO_CorrectedSentence = AO_CorrectedSentence.strip().capitalize()
        tokens = tokenize.WordPunctTokenizer().tokenize(AO_CorrectedSentence)

        AO_lTokens = list(tagger.tag(tokens))

        AO_bNetNegation = False
        AO_sNegationPhrase = ''

        # for all the individual words in the sentence
        for j in range(0, len(AO_lTokens)):

            AO_fWordSentiment = 0

            AO_lTokens[j] = list(AO_lTokens[j])

            # handle don't, won't, can't: each parses into three tokens.
            AO_bTriGram = False

            if j + 2 < len(AO_lTokens):
                if str(AO_lTokens[j + 1][0]) == 'a':
                    if str(AO_lTokens[j + 2][0]) == "t":
                        AO_bTriGram = True

            if j + 1 < len(AO_lTokens):
                if str(AO_lTokens[j][0]) == 'a':
                    if str(AO_lTokens[j + 1][0]) == "t":
                        AO_bTriGram = True

            if j > 2:
                if str(AO_lTokens[j - 1][0]) == 'a':
                    if str(AO_lTokens[j][0]) == "t":
                        AO_sTriGram = str(AO_lTokens[j - 2][0] + "'" +
                                          AO_lTokens[j][0])
                        AO_lTokens[j][0] = AO_sTriGram
                        AO_bTriGram = False

            if AO_bTriGram == False:
                # see if this is a negator
                if str(AO_lTokens[j][0]).lower() in AO_setNegationWords:
                    AO_bNetNegation = not AO_bNetNegation

                if AO_bNetNegation:

                    AO_sNegationPhrase = AO_sNegationPhrase + ' ' + AO_lTokens[
                        j][0]

                else:
                    AO_sNegationPhrase = ''

                # is this a negation word , we will deal with it with next words
                if str(AO_lTokens[j][0]).lower() not in AO_setNegationWords:

                    # if this is an intensifier, we will deal with it with the next word
                    if AO_fAssessWord(AO_lTokens[j], ['int']) == 0:

                        AO_fWordSentiment = AO_fAssessWord(
                            AO_lTokens[j],
                            ['adj', 'adv', 'noun', 'verb', 'minqinghu'])

                        # if the word is a positive word
                        if abs(AO_fWordSentiment) > 0:

                            if (j == 0):
                                if (AO_fWordSentiment > 0):
                                    # First word, positive sentiment
                                    AO_fPosWords = AO_fPosWords + AO_fWordSentiment
                                    AO_sLine = AO_sLine                     +\
                                    'P ('                                   +\
                                    str(AO_fWordSentiment)                  +\
                                    '): '                                   +\
                                    str(AO_lTokens[j][0])                   +\
                                    ' '                                     +\
                                    str(str(AO_lTokens[j][1]))              +\
                                    ' ~ '

                                elif (AO_fWordSentiment < 0):
                                    # First word Negative sentiment
                                    AO_fNegWords = AO_fNegWords + AO_fWordSentiment
                                    AO_sLine = AO_sLine                     +\
                                    'N('                                    +\
                                    str(AO_fWordSentiment)                  +\
                                    '): '                                   +\
                                    str(AO_lTokens[j][0])                   +\
                                    ','                                     +\
                                    str(AO_lTokens[j][1])                   +\
                                    '  '

                            else:  # if this is not the first word in the sentence,
                                # then it may have been intensified by the word before it
                                AO_fIntensifier = AO_fAssessWord(
                                    AO_lTokens[j - 1], ['int'])

                                if   (AO_fWordSentiment > 0)      \
                                and (AO_fIntensifier == float(0)) \
                                and (AO_bNetNegation == True):
                                    # Negated positive sentiment (but not intensified)

                                    AO_fPosWords = AO_fPosWords + AO_fWordSentiment - AO_iNEGATIONconstatnt
                                    AO_sLine = AO_sLine                     +\
                                    'NegatedP (' +str(AO_fWordSentiment)    +\
                                    ' - '                                   +\
                                    str(AO_iNEGATIONconstatnt)              +\
                                    '): '                                   +\
                                    AO_sNegationPhrase                      +\
                                    ' ~ '

                                elif (AO_fWordSentiment > 0)      \
                                and (AO_fIntensifier != float(0)) \
                                and (AO_bNetNegation == False):
                                    # Intensified positive sentiment that is not negated

                                    AO_fSentiment = AO_fWordSentiment * (
                                        1 + AO_fIntensifier)
                                    AO_fPosWords = AO_fPosWords + AO_fSentiment  # double the scoring
                                    AO_sLine = AO_sLine                     +\
                                    'emphP((1+'                             +\
                                    str(AO_fIntensifier)                    +\
                                    ')*('                                   +\
                                    str(AO_fWordSentiment )                 +\
                                    ')=('+str(AO_fSentiment)                +\
                                    ')): '                                  +\
                                    str(AO_lTokens[j-1][0])                 +\
                                    ' '                                     +\
                                    str(AO_lTokens[j][0])                   +\
                                    ' , '                                   +\
                                    str(AO_lTokens[j-1][1])                 +\
                                    ' '                                     +\
                                    str(AO_lTokens[j][1])                   +\
                                    ' ~ '


                                elif (AO_fWordSentiment > 0)       \
                                and (AO_fIntensifier !=  float(0)) \
                                and (AO_bNetNegation == True):
                                    # Negated positive word that is intensified

                                    AO_fSentiment = AO_fWordSentiment * (
                                        1 + AO_fIntensifier
                                    ) - AO_iNEGATIONconstatnt
                                    AO_fPosWords = AO_fPosWords + AO_fSentiment  # double the scoring
                                    AO_sLine = AO_sLine                     +\
                                    'NegatedEmphP((1+'                      +\
                                    str(AO_fIntensifier)                    +\
                                    ')*('                                   +\
                                    str(AO_fWordSentiment )                 +\
                                    '-'+str(AO_iNEGATIONconstatnt)          +\
                                    ')=('                                   +\
                                    str(AO_fSentiment)                      +\
                                    ')): '                                  +\
                                    AO_sNegationPhrase                      +\
                                    ' ~ '


                                elif (AO_fWordSentiment > 0)       \
                                and (AO_fIntensifier ==  float(0)) \
                                and (AO_bNetNegation == False):
                                    # Nth positive sentiment, neither negated nor intensified

                                    AO_fPosWords = AO_fPosWords + AO_fWordSentiment
                                    AO_sLine = AO_sLine                     +\
                                    'P ('                                   +\
                                    str(AO_fWordSentiment)                  +\
                                    '): '                                   +\
                                    str(AO_lTokens[j][0])                   +\
                                    ' '                                     +\
                                    str(str(AO_lTokens[j][1]))              +\
                                    ' ~ '

                                elif (AO_fWordSentiment < 0)      \
                                and (AO_fIntensifier != float(0)) \
                                and (AO_bNetNegation == True):
                                    # negated, emphasised negative sentiment

                                    AO_fSentiment = AO_fWordSentiment * (
                                        1 + AO_fIntensifier
                                    ) + AO_iNEGATIONconstatnt
                                    AO_fNegWords = AO_fNegWords + AO_fSentiment  # double the scoring
                                    AO_sLine = AO_sLine                     +\
                                    'NegatedEmphN((1+'                      +\
                                    str(AO_fIntensifier)                    +\
                                    ')*('                                   +\
                                    str(AO_fWordSentiment )                 +\
                                    '-'                                     +\
                                    str(AO_iNEGATIONconstatnt)              +\
                                    ')=('                                   +\
                                    str(AO_fSentiment)                      +\
                                    ')): '                                  +\
                                    AO_sNegationPhrase                      +\
                                    ' ~ '


                                elif (AO_fWordSentiment < 0)      \
                                and (AO_fIntensifier != float(0)) \
                                and (AO_bNetNegation == False):
                                    # emphasised negative sentiment

                                    AO_fSentiment = AO_fWordSentiment * (
                                        1 + AO_fIntensifier)
                                    AO_fNegWords = AO_fNegWords + AO_fSentiment  # double the scoring
                                    AO_sLine = AO_sLine                     +\
                                    'emphN((1+'                             +\
                                    str(AO_fIntensifier)                    +\
                                    ')*('                                   +\
                                    str(AO_fWordSentiment )                 +\
                                    ')=('+str(AO_fSentiment)                +\
                                    ')): '                                  +\
                                    str(AO_lTokens[j-1][0])                 +\
                                    ' '                                     +\
                                    str(AO_lTokens[j][0])                   +\
                                    ' , '                                   +\
                                    str(AO_lTokens[j-1][1])                 +\
                                    ','                                     +\
                                    str(AO_lTokens[j][1])                   +\
                                    ' ~ '

                                elif (AO_fWordSentiment < 0)       \
                                and (AO_fIntensifier ==  float(0)) \
                                and (AO_bNetNegation == True):
                                    # Negated Negative Sentiment

                                    AO_fNegWords = AO_fNegWords + AO_fWordSentiment + AO_iNEGATIONconstatnt
                                    AO_sLine = AO_sLine                     +\
                                    'N('                                    +\
                                    str(AO_fWordSentiment)                  +\
                                    '+'                                     +\
                                    str(AO_iNEGATIONconstatnt)              +\
                                    '): '                                   +\
                                    AO_sNegationPhrase                      +\
                                    ' ~ '


                                elif (AO_fWordSentiment < 0)       \
                                and (AO_fIntensifier ==  float(0)) \
                                and (AO_bNetNegation == False):
                                    # Nth Negative sentiment

                                    AO_fNegWords = AO_fNegWords + AO_fWordSentiment
                                    AO_sLine = AO_sLine                     +\
                                    'N('                                    +\
                                    str(AO_fWordSentiment)                  +\
                                    '): '                                   +\
                                    str(AO_lTokens[j][0])                   +\
                                    ','                                     +\
                                    str(AO_lTokens[j][1])                   +\
                                    ' ~ '

                            # end elif
                        # end if - first word
                    # endif negative word
                # endif this is an intensifier
            # endif the word is a negator
        # for all the words in the sentence
    #for all the sentences in the document

    if len(AO_lTokens) > 0:

        a = round(AO_fPosWords, 2)
        b = round(AO_fNegWords, 2)
        c = round(a + b, 2)

        if c > 0:
            AO_stentiment = "positive"
        elif c < 0:
            AO_stentiment = "negative"
        elif c == 0:
            AO_stentiment = "neutral"
        else:
            AO_stentiment = "ambiguas"

        if AO_sLine.strip() != '':
            AO_sFullOpinion = 'The sentiment of "%s" is %s(%s) as %s' % (
                AO_sDocument, AO_stentiment, c, AO_sLine)
        else:
            AO_sFullOpinion = 'The sentiment of "%s" is %s.' % (AO_sDocument,
                                                                AO_stentiment)

        AO_lOpinion = [a, b, c, AO_sLine, AO_sFullOpinion]

    return AO_lOpinion
Example #19
	def __init__(self, path):
		print("Now load in glove model.")
		#self.model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)
		self.vocab_size, self.embedding_size, self.model = self.embedding_in(path)
		print("Model loaded!")
		self.tokenizer = tk.WordPunctTokenizer()
Example #20
import nltk
import pandas as pd
from nltk import tokenize
from string import punctuation

nltk.download("all")

data = pd.read_csv("data.csv")
foo_words = nltk.corpus.stopwords.words("english")

list_new_phrase = []
list_punc_phrase = []
list_strem_phrase = []
list_adjective_phrase = []
list_other_words = []

token_space = tokenize.WhitespaceTokenizer()
token_punctuation = tokenize.WordPunctTokenizer()
stemmer = nltk.RSLPStemmer()

for punc in punctuation:
    foo_words.append(punc)
c = 0

list_places = []
for c in data['City']:
    if ' ' in str(c):
        words_text = token_space.tokenize(c)
        for w in words_text:
            foo_words.append(str(w).lower())
    else:
        foo_words.append(str(c).lower())
for c in [
Example #21
    palavras_texto = token_espaco.tokenize(opniao)
    for palavra in palavras_texto:
        if palavra not in palavras_irrelevantes:
            nova_frase.append(palavra)
    frase_processada.append(' '.join(nova_frase))

resenha["tratamento1"] = frase_processada

resenha.head()
classificar_texto(resenha, 'tratamento1', 'classificacao')
pareto(resenha, 'tratamento1', 10)

#PART 2
#separate the punctuation from the text
frase = "Olá mundo!"
token_pontuacao = tokenize.WordPunctTokenizer()
token_frase = token_pontuacao.tokenize(frase)

print(token_frase)

#removing punctuation
print(punctuation)
pontuacao = list()
for ponto in punctuation:
    pontuacao.append(ponto)
print(pontuacao)

pontuacao_stop_words = pontuacao + palavras_irrelevantes

frase_processada = list()
for opiniao in resenha["tratamento1"]:
Example #22
def treat_punctuation(text):
    tokenizer = tokenize.WordPunctTokenizer()
    return ' '.join(tokenizer.tokenize(text))
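
A quick check of what the tokenize-and-rejoin round trip does, assuming the `tokenize` import from the original module is in scope: it pads punctuation with spaces.

print(treat_punctuation("Hello,world!How are you?"))
# Hello , world ! How are you ?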
Example #23
    # remove punctuation from each word
  #  print(palavras_ementa)
    
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in palavras_ementa]
     # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    
  #  print(palavras_ementa)
    for palavra in words:
      if palavra.lower() not in stop_words:
        nova_ementa.append(wordnet_lemmatizer.lemmatize(palavra))
        #nova_ementa.append(stemmer.stem(palavra))
        #nova_ementa.append(palavra)
    ementas_processadas.append(' '.join(nova_ementa))
  return ementas_processadas

"""aqui utiliza os metodos anteriores para aplicar tratamento nas frases removendo as stop words e exibindo o retorno no formato de grafico posteriormente"""

ementas_trat_1 = [unidecode.unidecode(texto.lower()) for texto in dados_civil_e_crime["EMENTA_SEM_VERBATIZACAO"]]
#ementas_trat_1 = [unidecode.unidecode(texto.lower()) for texto in dados_civil_e_crime["EMENTA"]]
dados_civil_e_crime["EMENTA_SEM_CAPUT_TRAT_1"] = ementas_trat_1

punct_tokenize = tokenize.WordPunctTokenizer()
dados_civil_e_crime['EMENTAS_PROCESSADAS'] = remove_stopwords(tokenizador, dados_civil_e_crime.EMENTA_SEM_CAPUT_TRAT_1)

df_frequencia = gera_frequencia(tokenizador, dados_civil_e_crime.EMENTAS_PROCESSADAS)
escreve_pareto(df_frequencia)

nuvem_palavras_processadas = gera_nuvem_palavras(dados_civil_e_crime.EMENTAS_PROCESSADAS)
desenha_palavras(nuvem_palavras_processadas)
Example #24
import os
import re
import pandas as pd
import nltk.tokenize as nltk_token
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import copy, inspect
from scipy.spatial.distance import cosine

stem = SnowballStemmer("english").stem

link_re = re.compile(r"(http(s)?[^\s]*)|(pic\.[s]*)")
hashtag_re = re.compile(r"#[a-zA-Z0-9_]+")
mention_re = re.compile(r"@[a-zA-Z0-9_]+")

pat_type = {'links': link_re,
            'hashtags': hashtag_re,
            'mentions': mention_re}

tokenizers = {'treebank': nltk_token.TreebankWordTokenizer().tokenize,
              'wordpunct': nltk_token.WordPunctTokenizer().tokenize,
              'tweettokenize': nltk_token.TweetTokenizer().tokenize}

def read_file(path):
    if not os.path.exists(path):
        raise ValueError("Path does not point to existing file: {}".format(path))
        return
    ending = path.split('.')[-1]
    if ending == 'csv':
        return pd.read_csv(path)
    elif ending == 'tsv':
        return pd.read_csv(path, delimiter='\t')
    elif ending == 'pkl':
        return pd.read_pickle(path)
    elif ending == 'json':
        return pd.read_json(path)