Example #1
def custom_extract_features(data):
    """ 
        Implement your own feature extraction function here.

        The function should modify data by adding a binary np.array
        ex["FEATURES"] to each ex in data.
    """
    # Replace this with your own code.
    tokenizer = nltk.TweetTokenizer(preserve_case=False, reduce_len=True)
    stemmer = PorterStemmer()
    sw = {w: True for w in set(stopwords.words('english'))}

    get_worst = lambda x: max([e.neg_score() for e in x])
    get_best = lambda x: max([e.pos_score() for e in x])

    for data_set in data.values():
        for ex in data_set:
            # Words that are clearly loaded one way
            # ex['BODY'] = [(word) for word in tokenizer.tokenize(ex['BODY']) if len(list(swn.senti_synsets(word))) == 0 or np.abs(get_best(list(swn.senti_synsets(word))) - get_worst(list(swn.senti_synsets(word)))) > 0.25]

            # Preserve words not in SentiWordNet (likely smileys, abbreviations, etc.)
            # and otherwise keep only words with at least some sentiment score
            ex['BODY'] = [
                (word) for word in tokenizer.tokenize(ex['BODY'])
                if len(list(swn.senti_synsets(word))) == 0 or (
                    get_best(list(swn.senti_synsets(word))) > 0.1
                    or get_worst(list(swn.senti_synsets(word))) > 0.1)
            ]

            # Filter stopwords
            # ex['BODY'] = [(word) for word in (tokenizer.tokenize(ex['BODY'])) if word not in sw]
            # ex['BODY'] = [(word) for word in (tokenizer.tokenize(ex['BODY']))]

    extract_features(data)
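For reference, a minimal self-contained sketch of the SentiWordNet lookup the filter above relies on (assumes the NLTK 'wordnet' and 'sentiwordnet' corpora are downloaded; the helper name max_scores is made up for illustration):

import nltk
from nltk.corpus import sentiwordnet as swn

# One-time corpus downloads (safe to re-run).
nltk.download('wordnet', quiet=True)
nltk.download('sentiwordnet', quiet=True)

def max_scores(word):
    """Return (best positive, best negative) SentiWordNet scores for a word, or None if unknown."""
    synsets = list(swn.senti_synsets(word))
    if not synsets:
        return None  # unknown to SentiWordNet (smiley, abbreviation, ...) -> kept by the filter
    return max(s.pos_score() for s in synsets), max(s.neg_score() for s in synsets)

print(max_scores('love'))  # clearly loaded word: scores well above the 0.1 threshold
print(max_scores(':)'))    # None -> no synsets, so the filter above keeps it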
def review_to_wordlist(review_text):

    review_text = emoji_to_text(review_text)

    review_text = abbreviation_to_text(review_text)

    review_text = re.sub("(@[\w]*\ )+", "@USER", review_text)

    duplicateSpacePattern = re.compile(r'\ +')
    review_text = re.sub(duplicateSpacePattern, ' ', review_text).strip()

    # review_text = re.sub("@[\w]*\ ", " ", review_text)

    # review_text = re.sub("(@[\w]*\ )+", "@USER ", review_text).strip()     #将重复出现的@USER替换成只有一个的@USER
    # print(review_text)

    # review_text = re.sub("[!?,.]", " ", review_text).strip()

    review_text = ekphrasis_config(review_text)

    review_text = re.sub("[^a-zA-Z0-9\@\&\:]", " ", str(review_text))

    # review_text = review_text.lower()
    # print(review_text)
    # words = stanford_tokenizer(review_text)
    # words = nltk.word_tokenize(review_text)
    # words = nltk.TweetTokenizer().tokenize(review_text)

    words = nltk.TweetTokenizer(strip_handles=True,
                                reduce_len=True).tokenize(review_text)
    # return (review_text)
    return (words)
def load_4_aligning(sent_path, max_len=64, max_sent=200000):
    '''
    This function emulates load_align_corpus from utils.FTalignment.py
    To use when wanting to encode english sentences with different models, 
    instead of encoding different language sentences with the same model
    '''
    import nltk
    import numpy as np

    tokenizer = nltk.TweetTokenizer()
    sentences_1 = []
    bad_idx = []
    with open(sent_path) as sent_file:
        for i, line in enumerate(sent_file):
            if i >= max_sent:
                break

            sent_1 = tokenizer.tokenize(line)
            if len(sent_1) > max_len:
                bad_idx.append(i)
            else:
                sentences_1.append(sent_1)

        # trivial alignment: every token is aligned with itself
        alignments = [np.array([[j, j] for j, _ in enumerate(sent)])
                      for sent in sentences_1]
        return sentences_1, sentences_1, alignments
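A small usage sketch, assuming a plain-text file with one English sentence per line (the file name below is made up for illustration):

# 'sentences.txt' is a hypothetical one-sentence-per-line file.
sents_a, sents_b, alignments = load_4_aligning('sentences.txt', max_len=64)

# Both returned sentence lists are identical, and each alignment is the identity
# mapping [[0, 0], [1, 1], ...] pairing every token with itself.
print(sents_a[0])
print(alignments[0])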
Example #4
def read_data(data):
    text = open(data).read()

    # clean up data
    tokenizer = nltk.TweetTokenizer()
    # separate sentences
    tokens = [tokenizer.tokenize(t) for t in nltk.sent_tokenize(text)]

    # remove the text preceding the class name of each sentence
    for i in range(len(tokens)):
        if 'pos' in tokens[i]:
            while tokens[i][0] != 'pos':
                tokens[i].remove(tokens[i][0])

        elif 'neg' in tokens[i]:
            while tokens[i][0] != 'neg':
                tokens[i].remove(tokens[i][0])

    # get words and vocabulary
    vocab = nltk.word_tokenize(text)
    vocab = list(set(w.lower() for w in vocab))

    # remove the class names from vocabulary
    if 'pos' in vocab:
        vocab.remove('pos')
    if 'neg' in vocab:
        vocab.remove('neg')

    return tokens, vocab
def strip_handles(positive_sentences, negative_sentences, all_words,
                  positive_words, negative_words):
    # Remove the handles, reduce the length and put to lowercase
    tknzr = nltk.TweetTokenizer(strip_handles=True,
                                reduce_len=True,
                                preserve_case=False)

    # get the number of positive sentences
    pos_num_sentences = len(positive_sentences)
    num_neg = 0

    # use the tokenizer on each word
    for sentences in positive_sentences:
        tknzr_list = tknzr.tokenize(sentences)

        # add word to its dictionary
        split_into_words(tknzr_list, all_words, positive_words)

    for sentences in negative_sentences:
        # stop once as many negative sentences have been processed as positive ones
        if num_neg == pos_num_sentences:
            break
        tknzr_list = tknzr.tokenize(sentences)
        split_into_words(tknzr_list, all_words, negative_words)
        num_neg += 1

    print("stripped handles and reduced length")
    print("Removed Stopwords")
    print("Used lemmatizer")
    print("Converted emoticons to text")
def tokenizer1(tweet):
    """
    splits tweet into initial list of tokens
    """
    tknzr = nltk.TweetTokenizer(preserve_case=True, strip_handles=False)
    token_list = tknzr.tokenize(tweet)
    return token_list
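A quick check of what the tokenizer above produces (illustrative tweet only):

import nltk

tknzr = nltk.TweetTokenizer(preserve_case=True, strip_handles=False)
print(tknzr.tokenize("@NASA Loved the launch!!! #space :-)"))
# roughly: ['@NASA', 'Loved', 'the', 'launch', '!', '!', '!', '#space', ':-)']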
Example #7
def tokenize(tweets):
    tokenizer = nltk.TweetTokenizer()
    tweets_tokenized = []
    for text in tweets:
        cleaned = clean_text(text)
        tweets_tokenized.append(tokenizer.tokenize(cleaned))
    return tweets_tokenized
Example #8
def tokenize(tweet):
    """
    tokenize the tweet
    splits the tweet in lowercase words, i.e. tokens
    returns a list of tokens
    """
    tokenizer = nltk.TweetTokenizer()
    return [word.lower() for word in tokenizer.tokenize(tweet)]
Example #9
def tokSentence(filename):
    with open(filename) as file:
        text = file.readlines()
    tok = nk.TweetTokenizer(reduce_len=True, strip_handles=False)
    print('tokenize tweets---->', end='')
    tokSentences = [tok.tokenize(i) for i in text]
    print('finish')
    return tokSentences
    def __init__(self):
        self.tweet_tokenizer = nltk.TweetTokenizer(preserve_case=False)
        self.stop_words = set(stopwords.words('english'))
        #self.stemmer = nltk.SnowballStemmer('english')
        self.stemmer = nltk.PorterStemmer()

        self.custom_stopwords = [
            'that', 'for', 'in', 'this', 'is', 'of', 'to', 'it', 'and', 'the'
        ]
def load_tweets_to_df(valid_labels=[0, 1], tweets_dir=config.TRAINING_TWEETS_PATH, labels_dir=config.TRAINING_LABELS_PATH, valid_users=None):
    tknzr_pos_tagging = nltk.TweetTokenizer(preserve_case=True, reduce_len=True)

    tweet_files = glob.glob(os.path.join(tweets_dir, '*.*'))

    with open(labels_dir, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)
        users = {row[0]: {'age': row[1], 'num_tweets': row[2], 'gender': row[3], 'condition': row[4]} for row in reader}

        X = []
        D = []
        Y = []
        Users = []
        NLTK_PTAGS = []
        CMU_PTAGS = []
        tweets_to_tag = []
        i = 0
        for file in tweet_files:
            username = os.path.splitext(os.path.basename(file))[0]
            if valid_users is not None and username not in valid_users:
                continue
            print(username)

            label = config.LABEL_IDS[users[username]['condition']]
            if label not in valid_labels:
                continue
            tweet_file = open(file, 'r')
            tweets = []
            dates = []
            nltk_pos_tags = []
            cmu_pos_tags = []
            for line in tweet_file:
                tweet = json.loads(line)
                if not is_retweeted_tweet(tweet):
                    t = preprocess(tweet['text'], preserve_case=True)
                    tweets.append(t)
                    dates.append(datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                    nltk_pos_tags.append(correct_emoji_pos_tags(nltk.pos_tag(tknzr_pos_tagging.tokenize(tweet['text']))))
                    cmu_pos_tags.append(i)
                    tweets_to_tag.append(clean_for_cmu_tagging(tweet['text']))
                    i += 1
            tweet_file.close()
            D.append(np.array(dates))
            X.append(np.array(tweets))
            Y.append(label)
            Users.append(username)
            NLTK_PTAGS.append(nltk_pos_tags)
            CMU_PTAGS.append(cmu_pos_tags)

        PTAGS_NEW = []
        tagged = CMUTweetTagger.runtagger_parse(tweets_to_tag)
        for record in CMU_PTAGS:
            PTAGS_NEW.append([correct_emoji_pos_tags(tagged[r]) for r in record])
        CMU_PTAGS = PTAGS_NEW
        df = pd.DataFrame(data=np.vstack([X, CMU_PTAGS, NLTK_PTAGS, D, Y]).transpose(), index=Users, columns=['tweets', 'cmu_pos_tags', 'nltk_pos_tags', 'created_at', 'labels'])
        return df
Example #12
def get_corpus(data):
    # [input]: dataframe [['text','reply']]
    # [output]: corpus: sentence list
    data_text = [sent.lower() for sent in data['text'].values.tolist()]
    data_reply = [sent.lower() for sent in data['reply'].values.tolist()]
    tw_tokenizer = nltk.TweetTokenizer()
    corpus_text = [tw_tokenizer.tokenize(sentence) for sentence in data_text]
    corpus_reply = [tw_tokenizer.tokenize(sentence) for sentence in data_reply]
    return (corpus_text, corpus_reply)
 def is_user_mention(self):
     """
     checks if token is a user mention
     """
     temp = nltk.TweetTokenizer(strip_handles=True)
     result = temp.tokenize(self.token)
     return result == []
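The check above works because strip_handles=True removes Twitter handles entirely, so a token that is nothing but a handle tokenizes to an empty list. A minimal illustration:

import nltk

tknzr = nltk.TweetTokenizer(strip_handles=True)
print(tknzr.tokenize("@someone"))  # [] -> treated as a user mention
print(tknzr.tokenize("hello"))     # ['hello'] -> not a mention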
Example #14
def preprocess(text):
    text = clean_tweet(text)
    tknzr = nltk.TweetTokenizer()
    text = tknzr.tokenize(text)
    ignore_words = set(stopwords.words('english'))
    # lowercase; drop tokens shorter than 3 characters, pure digits, and stopwords
    return [
        word.lower() for word in text
        if len(word) > 2 and not word.isdigit() and word not in ignore_words
    ]
 def __init__(self, textLabel, predictor, predictionThermo,
              confidenceThermo, dayLabel, hourLabel):
     QtWidgets.QTextEdit.__init__(self)
     self.textLabel = textLabel
     self.predictor = predictor
     self.predictionThermo = predictionThermo
     self.confidenceThermo = confidenceThermo
     self.dayLabel = dayLabel
     self.hourLabel = hourLabel
     self.tweet_tokenizer = nltk.TweetTokenizer()
     self.stemmer = nltk.wordnet.WordNetLemmatizer()
Example #16
    def _regenerate_dictionaries(self, statistics=False) -> None:
        """Regenerates used n-grams, tfidf everytime data change.

        This can occur either when the training size is changed or a new
        training set is obtained."""
        # TF-IDF
        tknz = nltk.TweetTokenizer()
        self.tfidf \
            = TfidfVectorizer(tokenizer=tknz.tokenize,
                              max_features=self.max_tfidf)
        # get_raw_data returns tuple of asked attributes (that is (text,))
        self.tfidf.fit(
            list(
                map(lambda a: a[0],
                    self.get_raw_data(SampleTypeEnum.TRAIN, 'text'))))
        if statistics:
            self.print(
                f'Number of unique TF-IDF words: {len(self.tfidf.get_feature_names())}'
            )

        # n-grams - mutual information
        vectorizer: CountVectorizer = CountVectorizer(tokenizer=tknz.tokenize)
        # get_raw_data returns tuple of asked attributes (that is (text,))
        word_matrix \
            = vectorizer.fit_transform(list(map(lambda i: i[0],
                                                self.get_raw_data(SampleTypeEnum.TRAIN,
                                                                  'text'))))
        labels: List[str] \
            = list(map(lambda a: a[0],
                       self.get_raw_data(SampleTypeEnum.TRAIN, 'classification')))

        mi = mutual_info_classif(word_matrix, labels)
        top_mi = top_n_indexes(mi, self.max_ngrams)
        ngrams = vectorizer.get_feature_names()
        self.used_ngrams = set(map(lambda i: ngrams[i], top_mi))

        if statistics:
            self.print(f'Number of unique unigrams: {len(self.used_ngrams)}')

        # geneea entities
        # convert lists of entities into set and then join them into one set
        self.used_entities \
            = reduce(lambda a, b: a.union(b),
                     map(lambda i: set(i[0]),
                         self.get_raw_data(SampleTypeEnum.TRAIN,
                                           'entities')))
        if statistics:
            self.print(f'Number of unique entities: {len(self.used_entities)}')

        if statistics:
            train = self.get_raw_data(SampleTypeEnum.TRAIN, 'classification')
            test = self.get_raw_data(SampleTypeEnum.TEST, 'classification')
            counts = Counter(train) + Counter(test)
            self.print(counts)
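A stripped-down, self-contained sketch of the TF-IDF step above, using a toy corpus and a made-up max_features value in place of self.max_tfidf:

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

tknz = nltk.TweetTokenizer()
texts = ["great food , great staff :)", "terrible service !!!", "the food was ok"]

tfidf = TfidfVectorizer(tokenizer=tknz.tokenize, max_features=1000)
matrix = tfidf.fit_transform(texts)

print(matrix.shape)                   # (3, number of distinct tokens kept)
print(sorted(tfidf.vocabulary_)[:5])  # a few of the learned tokens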
Example #17
def lemmatize_text(sentence):
    # tokenize tweet
    tokenized_text = nltk.TweetTokenizer().tokenize(sentence)
    # verb lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    filtered_tokens = []
    for word in tokenized_text:
        token = lemmatizer.lemmatize(word)
        filtered_tokens.append(token)
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text
Example #18
 def extract_features(self):
     vector = CountVectorizer(
         min_df=2,
         tokenizer=nltk.TweetTokenizer(preserve_case=False).tokenize,
         encoding='ISO-8859-1',
         stop_words=nltk.corpus.stopwords.words('english'))
     train_counts = vector.fit_transform(self.train.data)
     test_counts = vector.transform(self.test.data)
     tfidf_transformer = TfidfTransformer()
     train_tfidf = tfidf_transformer.fit_transform(train_counts)
     test_tfidf = tfidf_transformer.transform(test_counts)
     return train_tfidf, test_tfidf  # return the TF-IDF-weighted features
    def __init__(self, redis_client: redis.client.Redis, local: bool = False):
        self.redis_client = redis_client
        self.redis_table = "cryptos"
        self.local = local
        if self.local:
            self.crypto_db = {}
        self.loaded = False

        self.check_and_load_cryptos()

        self.ttokenizer = nltk.TweetTokenizer()  # tt = tweet tokenizer
        self.stemmer = nltk.PorterStemmer()  # porter stemmer
def tweet_tokenize(tweet):
    tweet = tweet["text"]
    tokenizer = nltk.TweetTokenizer(strip_handles=False, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    ret = []
    for tok in tokens:
        if tok[0] == "@":
            ret.append("@user")
        elif tok[0] == "#":
            ret.append(tok[1:])
        else:
            ret.append(tok)
    return ret
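A usage sketch with a made-up tweet object (only the "text" field is used):

tweet = {"text": "@alice I loooove #Python soooo much!!!"}
print(tweet_tokenize(tweet))
# roughly: ['@user', 'I', 'looove', 'Python', 'sooo', 'much', '!', '!', '!']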
 def get_tokens(self):
     tokenizer = nltk.TweetTokenizer()
     tokens = []
     for p in self.content:
         raw_tokens = self.clean_tokens(tokenizer.tokenize(p))
         while raw_tokens.count('.') != 0:
             bound = raw_tokens.index('.')
             sentence = raw_tokens[:bound]
             raw_tokens = raw_tokens[bound + 1:]
             tokens.append(sentence)
         if len(raw_tokens) != 0:
             tokens.append(raw_tokens)
     return tokens[:]
Example #22
def process_tokens(tweet):
    """
    Create the tokens and remove the stop words
    
    """
    stop_words = set([
        'the', 'to', 'in', 'on', 'and', 'of', 'a', 'for', 'at', 'with', 'be',
        'it', 'that', '-', 'this'
    ])
    tknzr = nltk.TweetTokenizer(strip_handles=True)
    tokens = tknzr.tokenize(tweet)

    return [token for token in tokens if token not in stop_words]
Example #23
 def clean_tweets(self, doc):
     """ remove punctuation and stopwords """
     tknzr = nltk.TweetTokenizer()
     if self.kwargs.get('vader'):
         words = ' '.join([word for word in tknzr.tokenize(doc['text']) if word.isalpha()])
         return words
     else:
         words = ' '.join([word for word in tknzr.tokenize(doc)
                           if word.isalpha() and word not in self.stopwords and len(word) > 2])
         self.words.append(words)
         if self.kwargs.get('debug'):
             print(words)
         return words
def initialize():
    """ Performs preprocessing on the wine_data table"""
    global tokens  # this is jank, I know. I'll fix this later but it works for now
    tokens = []
    for d in reviews:
        #Makes each word in a review lowercase
        d = d.lower()
        # Breaks each review down into its individual terms and stores the
        # words in a list where each word is an individual entry
        # EXAMPLE: "hello world" ==> ["hello", "world"]
        tokens.append(nltk.TweetTokenizer().tokenize(d))
    tokens = del_stop_word(tokens)
    tokens = del_punc(tokens)
    tokens = porter_stem(tokens)
def tokenize(text):
    text = text.lower()  # lowercase all letters
    text = re.sub('[^A-Za-z]', ' ', text)  # strip non-alphabetic characters
    """stops = set(stopwords.words("italian"))
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    filtered_words = [word for word in text.split() if word not in stops]
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    text = " ".join(filtered_words)
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    text = gensim.parsing.preprocessing.strip_numeric(text)
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
"""
    return nltk.TweetTokenizer(reduce_len=True,
                               strip_handles=True).tokenize(text)
Example #26
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
def tokenize(text):
    '''Generic wrapper around different tokenization methods.
    '''
    text = text.lower()  # lowercase all text
    text = re.sub(r'@[A-Z0-9a-z_:!@#$%^&()=+,.></?;|@#]+', 'user',
                  text)  # replace users with "user"
    text = text.replace("#", "")  # delete hashtags
    text = re.sub('https?://[A-Za-z0-9./#]+', 'link',
                  text)  # replace links with "link"
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.strip()  # remove leading and ending spaces

    res = ""
    stop_words = set(stopwords.words('italian'))
    text = text.split()
    for word in text:
        cuv = word
        # drop the most common Italian words and words shorter than 4 characters
        if word in stop_words or len(word) < 4:
            cuv = ""
        res += cuv + " "
    res = res[:-1]
    result = res

    if len(result) > 2:
        if result[0] == " ":
            result = result[1:]

    result = ''.join(result)
    '''
    for cuvant in result:
        if len(cuvant) < 4:
            result = result.replace(cuvant, "")
    stemming = ''.join(result)
    

    stemming = [stemmer.stem(k) for k in result]
    stemming = ' '.join(stemming)
    stemming = sp(stem)
    lemma = []
    for cuvant in stemming:
        lemma.append(cuvant.lemma_)
    result = ' '.join(lemma)
    '''
    # return nltk.WordPunctTokenizer().tokenize(result)
    return nltk.TweetTokenizer(strip_handles=True,
                               reduce_len=True).tokenize(result)
Example #28
 def analyze(self, text):
     """Analyze text for sentiment, returning its score."""
     self.score = 0
     tokenizer = nltk.TweetTokenizer(strip_handles=True, reduce_len=True)
     tweettokens = [
         i.lower() for i in tokenizer.tokenize(text) if len(i) > 2
     ]
     for token in tweettokens:
         if token in self.positives:
             self.score = self.score + 1
         elif token in self.negatives:
             self.score = self.score - 1
     #print(tweettokens, "\t score = ",self.score)
     # TODO
     return self.score
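A self-contained sketch of the same lexicon-counting idea, with tiny made-up word sets standing in for the positives/negatives lexicons loaded elsewhere in the class:

import nltk

positives = {"good", "great", "love"}   # hypothetical stand-in lexicons
negatives = {"bad", "awful", "hate"}

tokenizer = nltk.TweetTokenizer(strip_handles=True, reduce_len=True)
tokens = [t.lower() for t in tokenizer.tokenize("I love this, not bad at all!")
          if len(t) > 2]

# +1 per positive hit, -1 per negative hit
score = sum((t in positives) - (t in negatives) for t in tokens)
print(tokens, score)  # one positive and one negative hit -> score 0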
def tokenize(term_vector):

    term_tokens = []
    tokenizer = nltk.TweetTokenizer()

    for d in term_vector:

        # Makes each word in a review lowercase
        d = d[0].lower()

        # Breaks each review down into its individual terms and stores the
        # words in a list where each word is an individual entry
        # EXAMPLE: "hello world" ==> ["hello", "world"]
        term_tokens.append(tokenizer.tokenize(d))

    return term_tokens
Example #30
    def __init__(self, dist_file_path=None):
        """ Initialize module with default data/english.dist file """
        if dist_file_path is None:
            dist_file_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "data/english.dist")

        with open(dist_file_path, "rb") as distributions_file:
            pickle_dict = pickle.load(distributions_file)
            self.uni_dist = pickle_dict["uni_dist"]
            self.backward_bi_dist = pickle_dict["backward_bi_dist"]
            self.forward_bi_dist = pickle_dict["forward_bi_dist"]
            self.trigram_dist = pickle_dict["trigram_dist"]
            self.word_casing_lookup = pickle_dict["word_casing_lookup"]
        self.tknzr = nltk.TweetTokenizer()