Example #1
def modelTrainingLexicon(trainingData, testData):
    print("--Lexicon Model--")
    tab = []
    dataLexiconFeature = []
    dataLexiconFeatureT = []
    for data in trainingData:
        booleanNeg = False
        pos_score = neg_score = obj_score = 0
        tagData = pos_tag(data[0])
        negationData = mark_negation(data[0])
        pos_score, neg_score, obj_score = tagCount(data, tagData, negationData, pos_score, neg_score, obj_score, booleanNeg)
        total = int(pos_score) - int(neg_score)
        if total < 0:
            overall = 'neg'
        elif total > 0:
            overall = 'pos'
        else:
            overall = 'neutral'
        tab.append(pos_score)
        tab.append(neg_score)
        tab.append(obj_score)
        feats = ({'positive': pos_score, 'negative': neg_score}, data[1])
        dataLexiconFeature.append(feats)

    for dataT in testData:
        booleanNegT = False
        pos_scoreT = neg_scoreT = obj_scoreT = 0
        tagDataT = pos_tag(dataT[0])
        negationDataT = mark_negation(dataT[0])
        pos_scoreT, neg_scoreT, obj_scoreT = tagCount(dataT, tagDataT, negationDataT, pos_scoreT, neg_scoreT,
                                                      obj_scoreT, booleanNegT)
        total = int(pos_scoreT) - int(neg_scoreT)

        tab.append(pos_scoreT)
        tab.append(neg_scoreT)
        tab.append(obj_scoreT)
        featsT = ({'positive': pos_scoreT, 'negative': neg_scoreT}, dataT[1])
        dataLexiconFeatureT.append(featsT)


    classifier = NaiveBayesClassifier.train(dataLexiconFeature)
    realSet = collections.defaultdict(set)
    testSet = collections.defaultdict(set)

    tabPr = []
    tabOut = []

    for i, (feat, ovAll) in enumerate(dataLexiconFeatureT):
        realSet[ovAll].add(i)
        predicted = classifier.classify(feat)
        tabOut.append(predicted)
        tabPr.append(predicted)
        testSet[predicted].add(i)


    print("Accuracy Naive Bayes for Lexicon Model : ", nltk.classify.util.accuracy(classifier, dataLexiconFeatureT))

    return realSet, testSet, tabPr, tabOut
Example #2
def bow_freq(data, token):
    positive = readwords('positive-words.txt')
    negative = readwords('negative-words.txt')
    negative = negative[36:]
    positive = positive[36:]
    stemmer = PorterStemmer()
    negative = [stemmer.stem(w) for w in negative]
    positive = [stemmer.stem(w) for w in positive]
    negative = list(dict.fromkeys(negative))
    positive = list(dict.fromkeys(positive))

    ps = {k: v for v, k in enumerate(positive)}
    ns = {k: v for v, k in enumerate(negative)}

    sample = []
    for i in range(len(data)):
        if ('\n' in data.iloc[i]['Summary and Review']):
            temp = ''.join(data.iloc[i]['Summary and Review'].split('\n'))
        else:
            temp = data.iloc[i]['Summary and Review']
        temp = " . ".join(temp.split('.')).split()
        sample.append(temp)
    #negation
    for i in range(len(sample)):
        mark_negation(sample[i], shallow=True)

    for i in range(len(sample)):
        temp = []
        for w in sample[i]:
            s = stemmer.stem(w)
            if ('NEG' in w):
                temp.append('NEG')
            elif (s in ps or s in ns):
                temp.append(w)
        sample[i] = ' '.join(temp)
    ns['NEG'] = 0

    simple = []
    for r in range(len(sample)):
        tN = 0
        tP = 0
        l = len(data['Summary and Review'][r])
        row = sample[r].split()
        for c in range(len(row)):
            if (stemmer.stem(row[c]) in ps):
                tP += 1
            else:
                tN += 1
        simple.append([tP, tN, l])

    return simple
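Both this snippet and Example #28 rely on a readwords helper that is not shown; it presumably loads the positive-words.txt / negative-words.txt opinion-lexicon files line by line (the [36:] slice then skips the file headers). A minimal sketch under that assumption, with the body invented for illustration:

def readwords(path):
    # Hypothetical helper: return the file's lines as a list of stripped words.
    # latin-1 is assumed because the common opinion-lexicon files are not UTF-8.
    with open(path, encoding='latin-1') as fh:
        return [line.strip() for line in fh]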
Example #3
def filterNegation(linein):
    assert type(linein) == unicode
    parts = linein.split(u' ')
    markednegation = mark_negation(parts)
    lineout = u' '.join(markednegation)
    return lineout
Example #4
def make_unigram_feature_set(documents, min_freq=1, mark_negation=False):
    """
    This function goes through a corpus and retains all candidate unigram features
     making a feature set. Optionally, it can also preprocess the corpus annotating
     with _NEG words that are in the scope of a negation (using NLTK helper functions).
    
    :param documents: all documents, each a list of words
    :param min_freq: minimum frequency of a token for it to be part of the feature set
    :param mark_negation: whether to preprocess the document using NLTK's nltk.sentiment.util.mark_negation
        see the documentation `nltk.sentiment.util.mark_negation?`
    :returns: unigram feature set
    """

    counter = Counter()
    for doc in documents:
        if mark_negation:
            doc = util.mark_negation(doc)
        counter.update(doc)
    features = []
    for f, n in counter.most_common():
        if n >= min_freq:
            features.append(f)
        else:
            break
    return frozenset(features)
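A minimal usage sketch for the function above; the toy documents and the extra imports are assumptions made for illustration:

from collections import Counter
from nltk.sentiment import util

docs = [
    ["i", "did", "not", "like", "the", "movie", "."],
    ["i", "really", "like", "the", "movie", "."],
]
# With mark_negation=True the first document contributes "like_NEG", which is
# kept as a feature distinct from the plain "like" of the second document.
features = make_unigram_feature_set(docs, min_freq=1, mark_negation=True)
print(sorted(features))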
Example #5
    def count_emotions(self, text):
        # Clean up the string of characters
        temp0 = break_contractions(text)  # Break up contractions
        temp1 = lemmatize_words(
            temp0.split())  # Split string to words, then lemmatize
        temp2 = mark_negation(temp1,
                              double_neg_flip=True)  # Account for negations
        temp3 = remove_stopwords(temp2)  # Remove any stopwords

        # check_spelling(temp2)  # Function is no longer useful

        # Count number of emotional associations for each valid word
        bank = []
        wordcount = 0
        for word in temp3:
            if word in self.associations:
                bank.extend(self.associations[word])
                wordcount += 1

        # Returns a tuple of integers for negative, positive, anger, fear, anticipation,
        # surprise, trust, sadness, joy, disgust, and total word count, respectively.
        return ((bank.count('negative'), bank.count('positive'),
                 bank.count('anger'), bank.count('fear'),
                 bank.count('anticipation'), bank.count('surprise'),
                 bank.count('trust'), bank.count('sadness'), bank.count('joy'),
                 bank.count('disgust'), wordcount))
Example #6
def preprocess(s, lowercase=False, tokenizer=tokenize, word_transformation='', handle_negation=True):
    """
    Improve tokenization with different options
    :param s: sentence to tokenize
    :param lowercase: whether to lowercase tokens (emoticons are left untouched)
    :param tokenizer: which tokenizer to use
    :param word_transformation: 'stem', 'lemmatize' or '' for no transformation
    :param handle_negation: whether to append _NEG to tokens in the scope of a negation
    :return: tokens
    """
    tokens = tokenizer(s)
    if word_transformation == 'stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    elif word_transformation == 'lemmatize':
        lemmatizer = WordNetLemmatizer()
        tagged = pos_tag(tokens)
        tokens = []
        for word, tag in tagged:
            wntag = get_wordnet_pos(tag)
            if wntag is None:
                lemma = lemmatizer.lemmatize(word)
            else:
                lemma = lemmatizer.lemmatize(word, pos=wntag)
            tokens.append(lemma)
    if lowercase:
        tokens = [token if emoticon_RE.search(token) else token.lower() for token in tokens]
    if handle_negation:
        tokens = mark_negation(tokens)
    return tokens
Example #7
def tokenize_with_negation(text):
    """
    Split a text into lower-case tokens, removing all punctuation tokens and stopwords
    :param text: input text
    :return: lowercase word tokens, without punctuation or stopwords
    """
    # List of stop words in English
    english_stopwords = set(stopwords.words('english'))
    # Set of negated stopwords
    negated_stopwords = set(word + "_NEG" for word in english_stopwords)
    # List of all stopwords, including negated words
    all_stopwords = english_stopwords.union(negated_stopwords)

    tokens = []
    for sent in sent_tokenize(text):
        pretokens = word_tokenize(sent.lower())
        # exclude punctuation
        pretokens = [
            token for token in pretokens
            if any(char.isalpha() for char in token)
        ]
        # exclude negated stop words (tagged as negated)
        pretokens = mark_negation(pretokens)
        tokens.extend(token for token in pretokens
                      if token not in all_stopwords)
    return tokens
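A brief usage sketch (the sentence is invented; the NLTK punkt and stopwords resources must be available):

print(tokenize_with_negation("This movie was not good at all."))
# Tokens inside the scope of "not" come back with a _NEG suffix, while plain
# and negated stopwords and punctuation-only tokens are dropped, so the call
# typically yields something like ['movie', 'good_NEG'].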
Example #8
def clean(comment):
    """Return processed tokens for a given comment."""
    # Split into "words"
    tokens = comment.split()
    # Remove punctuation
    re_punc = re.compile(f"[{re.escape(string.punctuation)}]")
    tokens = [re_punc.sub('', word) for word in tokens]
    # Remove non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    # Remove short tokens (making sure not to remove four letter words)
    tokens = [word for word in tokens if len(word) > 2]
    # Remove long tokens (possibly URLs)
    tokens = [word for word in tokens if len(word) < 20]
    # Make tokens lowercase
    tokens = [word.lower() for word in tokens]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization with POS to account for things like plurals
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(token, lemm_pos(token)) for token in tokens]
    # Add negations
    tokens = mark_negation(tokens)
    # Fill with blank if a comment is totally removed by processing
    if len(tokens) == 0:
        tokens = [" "]
    # Remove "talk" typical at the end of certain wiki comments
    if tokens[-1] == 'talk':
        clean_comment = " ".join(tokens[:-1])
    else:
        clean_comment = " ".join(tokens)
    return clean_comment
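This snippet and Example #12 both call a lemm_pos helper that is not shown; it presumably maps a token to the WordNet POS tag expected by WordNetLemmatizer. A minimal sketch of such a helper (the mapping is an assumption):

from nltk import pos_tag
from nltk.corpus import wordnet

def lemm_pos(token):
    # Hypothetical helper: POS-tag the single token and map the Penn Treebank
    # tag to a WordNet constant, defaulting to noun.
    tag = pos_tag([token])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN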
Example #10
def superTokenize(text, mark_neg, remSW, lem, stem):
    tokens = TweetTokenizer().tokenize(text)
    if mark_neg: tokens = mark_negation(tokens)
    if remSW: tokens = remStopWords(tokens)
    if lem: tokens = lemmatize(tokens)
    if stem: tokens = stemmize(tokens)
    return tokens
Example #11
def make_feature_map(document, feature_set, binary=True, mark_negation=False):
    """
    This function takes a document, possibly pre-processes it by marking words in the scope of negation, 
     and constructs a dict indicating which features in `feature_set` fire. Features may be binary, 
     flagging occurrence, or integer, indicating the number of occurrences.
     If no feature can be extracted, a special feature is fired, namely 'EMPTY()'.
     
    :param document: a list of words
    :param feature_set: set of features we are looking for
    :param binary: whether we are indicating presence or counting features in feature_set
    :param mark_negation: whether we should apply NLTK's mark_negation to document before applying the feature function
    :returns: dict with entries 'contains(f)=1/0' for binary features or 'count(f)=n' for count features
    """
    if mark_negation:
        document = util.mark_negation(document)
    dic = defaultdict(float)
    for i in feature_set:
        if i in document:
            if binary:
                x = f"contains({i})"
                dic[x] = 1.0
            else:
                # record how many times the feature occurs in the document
                x = f"count({i})"
                dic[x] = float(document.count(i))
    if len(dic) == 0:
        dic["EMPTY()"] = 1.0
    return dic
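An illustrative combination with make_unigram_feature_set from Example #4; the documents are made up, and the imports shown are the ones the two snippets assume:

from collections import Counter, defaultdict
from nltk.sentiment import util

docs = [["not", "a", "good", "film", "."], ["a", "good", "film", "."]]
feats = make_unigram_feature_set(docs, min_freq=1, mark_negation=True)
fmap = make_feature_map(["not", "a", "good", "film", "."], feats,
                        binary=True, mark_negation=True)
# The negated tokens fire features such as 'contains(a_NEG)' and
# 'contains(good_NEG)' instead of their un-negated counterparts.
print(fmap)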
Example #12
def clean_doc(doc):
    """Return processed tokens for a given document."""
    # Split into "words"
    tokens = doc.split()
    # Remove punctuation
    re_punc = re.compile(f"[{re.escape(string.punctuation)}]")
    tokens = [re_punc.sub('', word) for word in tokens]
    # Remove non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    # Remove short tokens (making sure not to remove four letter words)
    tokens = [word for word in tokens if len(word) > 2]
    # Remove long tokens (possibly URLs)
    tokens = [word for word in tokens if len(word) < 20]
    # Make tokens lowercase
    tokens = [word.lower() for word in tokens]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization to account for things like plurals
    # Takes in part of speech (POS)
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(token, lemm_pos(token)) for token in tokens]
    # Add negations
    tokens = mark_negation(tokens)
    # Return tokens
    clean_comment = " ".join(tokens)
    return clean_comment
Example #13
def runSentanal(train, test):
    sentanal = SentimentAnalyzer()

    all_words_neg = sentanal.all_words([mark_negation(doc) for doc in train])

    unigramFeats = sentanal.unigram_word_feats(all_words_neg, min_freq=4)
    sentanal.add_feat_extractor(extract_unigram_feats,
                                unigrams=unigramFeats,
                                handle_negation=True)

    # bigramFeats = sentanal.
    # sentanal.add_feat_extractor(extract_bigram_feats, bigrams=bigramFeats)

    trainList = sentanal.apply_features(train)
    testList = sentanal.apply_features(test)
    trainer = NaiveBayesClassifier.train
    classifier = sentanal.train(trainer, trainList)
    classifier.show_most_informative_features()

    # creates array for storing values
    values = []

    # display results
    for key, value in sorted(sentanal.evaluate(testList).items()):
        print('{0}: {1}'.format(key, value))
        values.append(value)

    # write results to csv
    with open(OUTPUT_CSV, mode='a') as csvFile:
        writer = csv.writer(csvFile, delimiter=',')
        writer.writerow(values)
Example #14
def swn_score(text):
    """ Calculate score with sentiwordnet library.
        Return score for sentence.
    """
    score = 0.0

    if text is not None:
        # mark negation
        words = mark_negation(text.split())

        # remove stopwords
        words = [t for t in words if t not in stopwords.words('english')]

        # select sense for each word
        words_sense = {}
        for word in words:
            clean_word = word.replace('_NEG', '')
            if wn.synsets(clean_word):
                words_sense[word] = wn.synsets(clean_word)[0]

        # calculate score
        for word, sense in words_sense.items():
            pos_score = swn.senti_synset(sense.name()).pos_score()
            neg_score = swn.senti_synset(sense.name()).neg_score()
            if '_NEG' in word:
                pos_score, neg_score = neg_score, pos_score
            score += (pos_score - neg_score)
        if len(words_sense) != 0:
            score /= len(words_sense)
    return score
Example #15
    def negation_tagger(sentence):
        """Tags negation for list of tokens that comprises of a sentence

        :param list sentences: the premise or hypothesis
        :rtype: list
        :return: "_NEG" appended for tokens within negation's scope
        """
        return mark_negation(sentence)
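For reference, the wrapped NLTK helper behaves as follows on a tokenised sentence (illustrative input):

from nltk.sentiment.util import mark_negation

print(mark_negation(["i", "did", "not", "enjoy", "this", "film", "."]))
# -> ['i', 'did', 'not', 'enjoy_NEG', 'this_NEG', 'film_NEG', '.']
# The negation word itself is left untouched; the _NEG suffix is added to the
# tokens between it and the next clause-level punctuation mark.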
Example #16
def addfeatures(cleaned_tokens_list):
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(token_list) for token_list in cleaned_tokens_list])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg,
                                                       min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)
    # return the analyzer so the registered feature extractor can be used by the caller
    return sentim_analyzer
Example #17
    def tweet_preprocessing(self, doc):
        """
      Tweet preprocessing method
      """
        # Handle emojis
        doc = emoji_parser(doc)
        # Handle negation
        doc = re.sub(r' isnt ', r' is not ', doc).strip()
        doc = re.sub(r' arent ', r' are not ', doc).strip()
        doc = re.sub(r' aint ', r' is not ', doc).strip()
        doc = re.sub(r' ain ', r' is not ', doc).strip()
        doc = re.sub(r' wasnt ', r' was not ', doc).strip()
        doc = re.sub(r' wasn ', r' was not ', doc).strip()
        doc = re.sub(r' werent ', r' were not ', doc).strip()
        doc = re.sub(r' dont ', r' do not ', doc).strip()
        doc = re.sub(r' doesnt ', r' does not ', doc).strip()
        doc = re.sub(r' didnt ', r' did not ', doc).strip()
        doc = re.sub(r' wont ', r' will not ', doc).strip()
        doc = re.sub(r' won\'t ', r' will not ', doc).strip()
        doc = re.sub(r' havent ', r' have not ', doc).strip()
        doc = re.sub(r' hasnt ', r' has not ', doc).strip()
        doc = re.sub(r' hadnt ', r' had not ', doc).strip()
        doc = re.sub(r' wouldnt ', r' would not ', doc).strip()
        doc = re.sub(r' shouldnt ', r' should not ', doc).strip()
        doc = re.sub(r' shallnt ', r' shall not ', doc).strip()
        doc = re.sub(r' cannot ', r' can not ', doc).strip()
        doc = re.sub(r' cant ', r' can not ', doc).strip()
        doc = re.sub(r'can\'t', r' can not ', doc).strip()
        doc = re.sub(r' couldnt ', r' could not ', doc).strip()
        doc = re.sub(r'([a-zA-Z].+)n\?t', r' \1 not ', doc).strip()
        doc = re.sub(r'([a-zA-Z].+)n\'t', r' \1 not ', doc).strip()
        # capture apostrophe suffixes
        doc = re.sub(r'([a-zA-Z].+)\'ve', r' \1 have ', doc).strip()
        doc = re.sub(r'([a-zA-Z].+)\'re', r' \1 are ', doc).strip()
        doc = re.sub(r'([a-zA-Z].+)\'s', r' \1 \'s ', doc).strip()
        # capture exclamation mark (!)
        # doc = re.sub(r'(!{2,})', '<EXCLAMATION>.', doc).strip()
        # capture question mark (?)
        # doc = re.sub(r'(\?{2,})', '<QUESTION>.', doc).strip()
        # remove words starting with &
        doc = re.sub(r' &[\w;]+ ', ' ', doc).strip()
        # remove numbers
        doc = re.sub(r'[0-9]+', '', doc).strip()
        # remove links
        doc = re.sub(r'http[s]?.+\b', '', doc).strip()
        # remove underscores
        doc = re.sub(r'_+', '', doc).strip()
        # remove single letters
        doc = re.sub(r' [a-zA-Z] ', ' ', doc).strip()
        # remove periods (.)
        doc = re.sub(r'(\.)', '', doc).strip()

        # Tweet tokenization with TweetTokenizer module
        tk = TweetTokenizer(strip_handles=True, reduce_len=True)
        tokens = tk.tokenize(doc)
        if self.handle_negation:
            tokens = mark_negation(tokens)
        return tokens
Example #18
def clean_and_tokenize_review(review_text: str) -> List[str]:
    """
    Uses wordpunct_tokenize to keep punctuation groups together
    # https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.regexp.WordPunctTokenizer

    Removes nltk stopwords from token list
    """
    tokens: List[str] = wordpunct_tokenize(review_text)
    mark_negation(tokens, shallow=True)
    # transform to normalize words
    ret = []
    for token in tokens:
        split_token = token.split("_")
        split_token[0] = split_token[0].lower()
        if split_token[0] in stopwords:
            continue
        ret.append("_".join(split_token))
    return ret
Example #19
def score_review(review):
    sentiment_scores = []
    sents = sent_tokenize(review)
    for sent in sents:
        wds = word_tokenize(sent)
        wds = mark_negation(wds)
        sent_scores = score_sent(wds)
        sentiment_scores.append(sent_scores)
    return sum(sentiment_scores) / len(sentiment_scores)
Example #20
def tokenize_with_negation(text):
    # split text into lower-case tokens, removing all-punctuation tokens and stopwords
    tokens = []
    for sentence in sent_tokenize(text):
        pretokens = word_tokenize(sentence.lower())
        pretokens = [x for x in pretokens if any(i.isalpha() for i in x)]
        pretokens = mark_negation(pretokens)
        tokens.extend(x for x in pretokens if x not in all_stopwords)
    return tokens
Example #21
def mark_neg(list_of_lists_of_tokens, double_neg_flip=False):
    """
    Apply mark_negation to each word-tokenised sentence.

    Parameters
    ----------
    - list_of_lists_of_tokens : dataframe column whose cells contain lists of word-tokenised sentences
    - OUTPUT : list of token lists with _NEG appended inside negation scopes
    """

    return [mark_negation(sent, double_neg_flip=double_neg_flip) for sent in list_of_lists_of_tokens]
Example #22
def stemming_tokenizer(sentence):
    sentence = word_tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    newsentence = []
    for word in sentence:
        if word not in stop_words:
            if len(word) >= 3:
                newsentence.append(word)

    newsentence = mark_negation(newsentence)
    return newsentence
Example #23
def mark_neg(list_of_lists_of_tokens, double_neg_flip=False):
    """
    Mark negations, i.e., append _NEG suffix to words that appear in the scope between a negation
    and a punctuation mark.
    
    Parameters
    ----------
    - list_of_lists_of_tokens : dataframe column whose cells contain lists of word-tokenised sentences
    - OUTPUT : list of token lists with _NEG appended inside negation scopes
    """

    return [mark_negation(sent, double_neg_flip=double_neg_flip) for sent in list_of_lists_of_tokens]
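A short usage sketch with made-up tokenised sentences:

sents = [["the", "food", "was", "not", "fresh", "."],
         ["the", "service", "was", "great", "."]]
print(mark_neg(sents))
# -> [['the', 'food', 'was', 'not', 'fresh_NEG', '.'],
#     ['the', 'service', 'was', 'great', '.']]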
Example #24
def tokenize_with_negations(text: str) -> list:
    # Mark negated words, then drop the remaining punctuation and negation words.
    # Important: this method should be applied last; it returns the tokenized text.
    # The full stop is surrounded with spaces so that a stop mark glued to the
    # following word is still split off.
    text = mark_negation([
        x
        for x in word_tokenize(re.sub(pattern=r"\.", repl=" . ", string=text))
    ])
    text = [x for x in text if x not in ['.', ':', ';', '!', '?', 'no', 'not']]
    return text
Example #25
def negation_check(sentence, set_terms):
    words = word_tokenize(sentence)
    negations = mark_negation(words)
    only_neg = list(set(negations).difference(words))
    if (len(only_neg) == 0):
        return set_terms
    only_neg = [x[:-4] for x in only_neg]
    terms = list(set_terms)
    for i, term in enumerate(terms):
        for t in term.split():
            if t in only_neg:
                terms[i] = term + '_NEG'
                break
    return set(terms)
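A hedged usage sketch (the review text and the candidate terms are invented):

print(negation_check("I did not like the service", {"like the service", "food"}))
# A term containing a word that mark_negation flagged as negated gets a '_NEG'
# suffix, so this typically returns {'like the service_NEG', 'food'}.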
Example #26
def feature_set(post_list):
    """
    Expects a list of cleaned posts in sentence format and returns a featureset
    calculated by marking negation then doing a count vectorization and tf-idf
    transform
    """
    # mark negation
    # count vectorizer
    # tf-idf
    # isn't f****d up -> isn't fucked_NEG up_NEG
    marked = [mark_negation(p) for p in post_list]
    tv = TfidfVectorizer(min_df=1)
    marked_words = flatten(as_words(marked))
    return tv.fit_transform(marked_words)
Example #27
    def one_vector_senti(self, sentence):
        sentence = ' '.join(mark_negation(sentence.split()))
        sentence = str(sentence).lower()
        global_vec = []
        global_vec.extend(self.Vader_API(sentence))
        global_vec.extend([self.senti_Strength.score(sentence)])
        global_vec.extend([self.afn.score(sentence)])
        global_vec.extend(self.huliu.score(sentence))
        global_vec.extend([self.senti_wordNet.score(sentence)])
        global_vec.extend(self.effect_WN.score(sentence))
        global_vec.extend(self.sentic_net.score(sentence))
        global_vec.extend(self.subj_cue_senti.score(sentence))
        global_vec.extend(self.emo_lex_senti.score(sentence))
        return global_vec
Example #28
def filterData(data):
    positive = readwords('positive-words.txt')
    negative = readwords('negative-words.txt')
    negative = negative[36:]
    positive = positive[36:]
    stemmer = PorterStemmer()
    negative = [stemmer.stem(w) for w in negative]
    positive = [stemmer.stem(w) for w in positive]
    negative = list(dict.fromkeys(negative))
    positive = list(dict.fromkeys(positive))

    ps = {k: v for v, k in enumerate(positive)}
    ns = {k: v for v, k in enumerate(negative)}

    sample = []
    for i in range(len(data)):
        if ('\n' in data.iloc[i]['Summary and Review']):
            temp = ''.join(data.iloc[i]['Summary and Review'].split('\n'))
        else:
            temp = data.iloc[i]['Summary and Review']
        temp = " . ".join(temp.split('.')).split()
        sample.append(temp)
    #negation
    for i in range(len(sample)):
        mark_negation(sample[i], shallow=True)

    for i in range(len(sample)):
        temp = []
        for w in sample[i]:
            s = stemmer.stem(w)
            if ('NEG' in w):
                temp.append('NEG')
            elif (s in ps or s in ns):
                temp.append(w)
        sample[i] = ' '.join(temp)
    data['Summary and Review'] = sample
    return data
Example #29
    def process_document(cls, document):

        document = cls.html_processing(document)

        #tokenize
        words = cls.t_word_tokenizer.tokenize(document)

        #print(" \n Tokenizing: {} \n".format(words))
        #expand contractions
        words = cls.expand_contractions(words)
        #print("Expanding contractions: {} \n".format(words))

        # to lowercase
        words = list(map(str.lower, words))

        tagged_sentence = pos_tag(words)
        proper_nouns_tags = ['IN', 'NNP', 'PRP', 'PRP$', 'WP$']
        tagged_sentence = [(word, tag) for word, tag in tagged_sentence
                           if tag not in proper_nouns_tags]

        #print("Filtering tags: {} \n".format(tagged_sentence))

        words = []
        for word, tag in tagged_sentence:
            wordnet_tag = cls.find_wordnet_tag(tag)
            if wordnet_tag != '':
                word = cls.remove_apos(word)
                words.append(
                    cls.lemmatizer.lemmatize(word.lower(), wordnet_tag))
            elif word in string.punctuation:
                words.append(word)

        #print("Lemmatize: {} \n".format(words))
        # must be reviewed
        words = [
            word for word in words if word not in string.punctuation
            and len(word) > 1 and cls.is_english_word(word.lower())
        ]
        #print("Punctuation and english: {} \n".format(words))

        words = mark_negation(words)
        #print("Negation: {} \n".format(words))

        stop_words = set(cls.english_stopwords + cls.my_stopwords)
        words = [word for word in words if word.lower() not in stop_words]

        #print("Stop words: {} \n".format(words))

        return words
Example #30
def process(tweet):
    global i
    print(i)
    i += 1

    text = tweet['text'].lower()
    lang = tweet['lang']
    try:
        mentions = tweet['mentions']
        urls = tweet['urls']
    except KeyError:
        mentions = []
        urls = []

    # remove mentions and urls from tweets
    text = filter_tweet(text, mentions, urls)

    # separate emojis and text
    [text, emoji_text] = separate_emojis(text)

    # remove English contractions
    if lang == 'en':
        text = expand_contractions(text)

    if lang == 'ar':
        text = remove_arabic_variants(text)

    #remove stop words
    text = remove_stopwords(text, lang)

    # mark negations and remove punctuation from text
    if lang == 'en':
        text = mark_negation(text)
    text = remove_punct(text)

    # normalize text by removing repetitions and stemming
    # old_text = text
    text = normalize_repititions(text, lang)
    if lang == 'ar':
        text = farasa.lemmatize(text)
    if lang == 'en':
        text = stem_words(text)
    # if(old_text != text):
    #     print('old text: %s, new text: %s' %(tweet['text'], ' '.join(text)))

    text += emoji_text

    print(text)
    return text
Example #31
    def __call__(self, t):
        t = self.reduce_lengthening(t)
        tokens = t.split(' ')

        cleaned_tokens = []
        for token in tokens:
            token = self.replace_username(token)
            token = self.replace_link(token)
            cleaned_tokens.append(token)

        rebuild_str = ' '.join(cleaned_tokens)

        negated_tokens = mark_negation(list(self.tknzr.tokenize(rebuild_str)))
        list_of_trigrams = list([' '.join(s) for s in trigrams(negated_tokens)])
        return list_of_trigrams
Example #32
def sent_tokenize(x):   # have trouble with double negation, input a df
    
    stopword = set(stopwords.words('english')) - {'he', 'him', 'his', 'himself',
                                                  'she', 'her', "she's", 'her', 'hers', 'herself',
                                                'they', 'them', 'their', 'theirs', 'themselves'}
    
    lmtzer = WordNetLemmatizer()
    x = x.lower()
    temp = re.sub(r",", ".", x)
    word = re.findall(r'[a-zA-Z]+|:\)|\.\.\.+|[!]+|!\?|\.', temp)
    word = mark_negation(word)
    word = [i for i in word if i not in stopword]
    word_tag = nltk.pos_tag(word)
    lmt_word = [lmtzer.lemmatize(i_pair[0], pos=wordnet_pos(i_pair[1])) for i_pair in word_tag]
    return lmt_word
Example #33
    def f(s):
        return mark_negation(tokenizer.tokenize(s))
Example #34
    def __call__(self, t):
        text = self.reduce_lengthening(t)
        tokens = list(self.tknzr.tokenize(text))
        negated_tokens = mark_negation(tokens)
        list_of_skipgrams = list(skipgrams(negated_tokens, self.n, self.k))
        return list([' '.join(s) for s in list_of_skipgrams])
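For context, the skip-gram expansion in __call__ above works roughly like this standalone sketch (n=2 and k=1 are illustrative values):

from nltk.util import skipgrams
from nltk.sentiment.util import mark_negation

toks = mark_negation(["this", "is", "not", "worth", "the", "price", "."])
# skipgrams(sequence, n, k) yields n-grams that may skip up to k tokens;
# they are joined into strings the same way as in __call__.
print([' '.join(s) for s in skipgrams(toks, 2, 1)])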