Code Example #1
File: test_api.py  Project: s/preprocessor
 def test_tokenize(self):
     tweet = "Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org"
     p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
     tokenized_tweet = p.tokenize(tweet)
     self.assertEqual(
         tokenized_tweet, "Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$"
     )
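The test above shows how p.tokenize swaps each matched entity for a $TOKEN$ placeholder. For contrast, here is a minimal sketch of the related p.clean call used in several of the later examples (assuming the tweet-preprocessor package imported as p, as in these snippets; the printed outputs are illustrative):

import preprocessor as p

tweet = "Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org"
# Restrict processing to URLs and mentions only.
p.set_options(p.OPT.URL, p.OPT.MENTION)

# clean() drops the matched entities, tokenize() replaces them with placeholders.
print(p.clean(tweet))     # e.g. "Packathon was a really #nice :) challenging 👌."
print(p.tokenize(tweet))  # e.g. "Packathon was a really #nice :) challenging 👌. $MENTION$ $URL$"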
Code Example #2
def processTweet(tweet):
    # Process a single tweet: strip non-ASCII characters, clean it,
    # tokenize it, and append the result to a text file.
    ascii_tweet = tweet.encode('ascii', 'ignore').decode('ascii')
    new_tweet = p.clean(ascii_tweet)
    tokenized_tweet = p.tokenize(new_tweet)
    with open('CNG_test_main.txt', 'a') as createFile:
        createFile.write(tokenized_tweet + '\n')
Code Example #3
def preprocessData(fn, save_fn, corpus_file):
    print("Preprocessing {}...".format(fn))
    with open(os.path.join(param.dump_folder, fn), "rb") as handle:
        sent_list, label_list = pickle.load(handle)
    print("Tokenization...")
    sent_list = [p.tokenize(sent) for sent in sent_list]
    sent_list = [clean_str(sent).split()[:] for sent in sent_list]
    with open(os.path.join(param.dump_folder, save_fn), "wb") as handle:
        pickle.dump((sent_list, label_list), handle)
    # write to corpus with comments
    if fn.endswith("comm.data"):
        corpus_file.write("\n".join([" ".join(sent) for sent in sent_list]))
        corpus_file.write("\n")
    print("Done preprocessing, save to {}".format(save_fn))
Code Example #4
File: utilities.py  Project: orestxherija/smm4h2018
def text_preprocessor(doc):
    # separate hyperlinks from adjacent text, e.g. goodbyepic.twitter.com -> goodbye pic.twitter.com
    doc = re.sub(r'(\w*)(https?|pic\.)', r'\1 \2', doc)
    # uniformize twitter-specific tokens
    doc = preprocessor.tokenize(doc)
    # extract text from *, e.g. *nope* -> nope
    doc = re.sub(r'\*(.*?)\*', r'\1', doc)
    # replace & symbol
    doc = re.sub(r'&', r' and ', doc)
    # lower-casing
    doc = doc.lower()
    # uniformize some corpus-specific misspellings
    doc = re.sub(r'\bxan\b', r'xanax', doc)
    doc = re.sub(r'\brogain\b', r'rogaine', doc)
    doc = re.sub(r'\b(adderal|aderall)\b', r'adderall', doc)
    # normalize characters repeated three or more times in a row
    doc = re.sub(r'(\w)\1\1+', r'\1', doc)
    # remove reddit symbol /r/
    doc = re.sub(r'/r/', r'', doc)
    # remove text between {}
    doc = re.sub(r'\{(.*?)\}', r'', doc)
    # uniformize emojis and numbers
    preprocessor.set_options(preprocessor.OPT.EMOJI, preprocessor.OPT.NUMBER)
    # split NUMBER and EMOJI when adjacent to text
    doc = preprocessor.tokenize(doc)
    doc = re.sub(r'(\w*)(EMOJI|NUMBER)', r'\1 \2', doc)
    doc = re.sub(r'(EMOJI|NUMBER)(\w*)', r'\1 \2', doc)
    # keep only letters and spaces
    doc = re.sub('[^A-Za-z ]+', '', doc)
    # remove very short (<2 chars) and very long (>=18 chars) words
    doc = ' '.join([item for item in doc.split() if 1 < len(item) < 18])
    # lower-casing
    doc = doc.lower()
    # remove multiple sequential occurences of the same token
    doc = re.sub(r'(\w+) \1 \1+', r'\1', doc)
    return doc
Code Example #5
def raw_tweet_prep_test(raw_tweet, stopwords, html_re, space_replace_re,
                        repeating_re, single_char_re):
    tweet_tokenized = html_re.sub(' ', raw_tweet)
    tweet_tokenized = p.tokenize(tweet_tokenized.lower().replace('\n', ' '))
    tweet_tokenized = space_replace_re.sub(' ', tweet_tokenized)
    tweet_tokenized = repeating_re.sub(r"\1", tweet_tokenized)
    #raw_tweet = ' '.join(raw_tweet)

    #tweet_tokenized = single_char_re.sub(' ', tweet_tokenized)
    tweet_tokenized = tweet_tokenized.strip().split()
    words = [w for w in tweet_tokenized if w not in stopwords]
    if len(words) > 1:
        return words
    else:
        raise Exception("Input tweet too short")
Code Example #6
def main(event: func.EventHubEvent) -> str:
    text = ""
    try:
        tweet = json.loads(event.get_body().decode('utf-8'))
        text = tweet[0]["text"]
        logging.info('Python EventHub trigger processed a tweet: %s', text)

    except KeyError:
        logging.error('Error parsing tweet.')
        pass

    else:
        # Tokenize the tweet and output it.
        tokenized = p.tokenize(text)
        logging.info('Tweet tokenized into: %s', tokenized)
        return tokenized
Code Example #7
def get_pred(xyz):
    tweet = xyz
    tokens = p.tokenize(tweet)
    l = []
    l.append(tokens)
    arr = np.array(list(vocab_processor.transform(l)))
    tmp = loaded_model.predict_proba(arr)
    acc = max(tmp[0]) * 100
    print(arr)
    res = np.argmax(loaded_model.predict(arr), 1)
    print(res)
    if res[0] == 1:
        val = "Sexist or Racist Post"
    else:
        val = "Neutral Post"
    return {'result': val, 'accuracy': acc}
Code Example #8
    def segment_posts_2(self, posts, personality):
        post_split = posts.split("|||")
        final_post_list = []
        for post in post_split:
            # Preprocess the tweets
            post = p.tokenize(post)
            post = self.post_process(post)

            # append the processed post to the final list
            final_post_list.append(post)

        final_post_str = " ".join(final_post_list)
        # Append to the row list
        one_personality_one_big_post = {
            'type': personality,
            'post': final_post_str
        }
        self.row_list.append(one_personality_one_big_post)
Code Example #9
def preprocess_text(message):
    # Tokenize the message, strip digits and stray punctuation, and keep the
    # $PIC$/$NUMBER$/$URL$ placeholders separated from adjacent text.
    message = p.tokenize(message)
    pattern = re.compile(r"[^\w$ ]")
    message = pattern.sub('', message)
    message = re.sub('[0-9]+', '', message)
    message = message.replace('$PIC$', '$PIC$ ')
    message = message.replace('$NUMBER$', '$NUMBER$ ')
    message = message.replace('$URL$', '$URL$ ')
    if '$PIC$' in message or '$URL$' in message:
        message = message.rsplit('$', 1)[0] + "$"
    # split camelCase runs left over from hashtags
    message = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', message)
    print(message)
    return message
Code Example #10
def parseFile(filename, outfile):
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\n')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                #print(row)
                print(line_count)
                if len(row) != 0:
                    print(row)
                    # Removing mentions, URLs, emojis and tokenizing
                    clean_row = p.clean(row[0])
                    clean_row = deEmojify(clean_row)
                    tokens = p.tokenize(clean_row)

                    temp = " {}"
                    final = temp.format(tokens)

                    letters_only_text = re.sub(r"[^a-zA-Z\'\#]", " ",
                                               final).lower()
                    word_array = letters_only_text.split()
                    word_array = [
                        term for term in word_array if term not in stop_words
                    ]

                    # Recreating string
                    cleaned = " ".join(word_array)
                    print(cleaned)
                    if len(cleaned) > 0:
                        preprocessed.append(cleaned)
                    line_count += 1

        print(f'Processed {line_count} lines.')
    print(len(preprocessed))

    # Writing preprocessed, emoji-free tweets to csv file.
    with open(outfile, mode='w') as csv_file:
        for i in range(len(preprocessed)):
            csv_file.write(preprocessed[i])
            csv_file.write('\n')
            csv_file.write('\n')
Code Example #11
def preprocess(s, lowercase=False):
    import string
    table = str.maketrans('', '', string.punctuation)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    s = p.tokenize(s)
    tokens_raw = s.replace("\n", "").strip("'[]\' '").split()

    # Added lemmatization (1/1/18). Potentially should remove to compare results
    # tokens = gensim.utils.lemmatize(s, stopwords=STOPWORDS, min_length=1)

    # Without lemmatization, remove punctuation
    if lowercase:
        tokens = [
            token.strip().lower().translate(table) for token in tokens_raw
        ]
    else:
        tokens = [token.strip().translate(table) for token in tokens_raw]

    return tokens
Code Example #12
def getText(text, pattern, subs, stopwords):
    if text is not None and len(text) > 0:
        text = text.replace(u'\n', u' ').replace('(', ' (').strip().lower()
        tlines = preprocessor.parse_field2(text)
        lines = []
        for t in tlines:
            t = preprocessor.join_tokens(
                [tok for tok in preprocessor.tokenize(t)])
            t = pattern.sub(lambda m: subs[re.escape(m.group(0))], t)
            t = preprocessor.join_tokens([
                preprocessor.strip_leading_and_trailing_nums(tok)
                for tok in preprocessor.tokenize2(t) if tok not in stopwords
                and tok not in string.punctuation and len(tok) > 1
                and not preprocessor.containsThreeConsecTokens(tok)
            ])
            lines.append(t)
        t = preprocessor.join_tokens(lines)
        return t if len(t) > 1 else None
    return None
Code Example #13
def raw_tweet_prep_stem_test(raw_tweet, stopwords, stemmer, html_re,
                             space_replace_re, repeating_re, single_char_re):
    # remove some more things ('s, 'm, 't, html symbol, other non english char, and repeating expression)
    tweet_tokenized = html_re.sub(' ', raw_tweet)
    print(tweet_tokenized)
    tweet_tokenized = p.tokenize(tweet_tokenized.lower().replace('\n', ' '))
    print(tweet_tokenized)
    tweet_tokenized = space_replace_re.sub(' ', tweet_tokenized)
    print(tweet_tokenized)
    tweet_tokenized = repeating_re.sub(r"\1", tweet_tokenized)
    print(tweet_tokenized)
    # tokenize and replace url with 'URL', numbers with 'NUMBER' and 'EMOJi'
    #tweet_tokenized = single_char_re.sub(' ', tweet_tokenized)
    tweet_tokenized = tweet_tokenized.strip().split()
    print(tweet_tokenized)
    words = [stemmer.stem(w) for w in tweet_tokenized if w not in stopwords]
    if len(words) > 1:
        return words
    else:
        raise Exception("Input tweet too short")
Code Example #14
def _preprocess_tweet(tweet, stop_words, stemmer):
    tweet_ = _preprocess_tags(tweet)
    tweet_ = p.tokenize(tweet_)  # emoji, smiley, number
    tweet_ = tokenizer_.tokenize(tweet_)

    def https2url(token):
        if "https" in token:
            return "url"
        else:
            return token

    tweet_ = list(map(https2url, tweet_))
    # removes all not word token:
    tweet_ = list(
        filter(lambda token: token.isalpha() and len(token) > 1, tweet_))
    # removes all the stop words:
    tweet_ = list(filter(lambda token: token not in stop_words, tweet_))
    # stemming:
    tweet_ = [stemmer.stem(token) for token in tweet_]
    tweet_ = " ".join(tweet_)
    return tweet_
Code Example #15
def clean_data(line):

    ## Tokenize with TweetTokenizer: keep @handles (strip_handles=False), reduce elongated characters
    tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True)
    line = ' '.join(tokenizer.tokenize(line))

    ## Remove URLs and reserved words (RT, FAV); the commented-out option set also removed emojis and mentions
    #pre.set_options(pre.OPT.URL, pre.OPT.EMOJI, pre.OPT.MENTION, pre.OPT.RESERVED)
    pre.set_options(pre.OPT.URL, pre.OPT.RESERVED)
    line = pre.tokenize(line)

    ## Remove non-ASCII characters
    line = ''.join([i if ord(i) < 128 else '' for i in line])
    #if not line:
    #    line = 'RT'
    line = line + ' <end>'
    """
    line = line.replace(r'https?://\S+', r'') # remove url
    if line.startswith('RT @'): 
        line = line.replace(r'RT ', r'') # remove RT (retweet)"""
    return line
Code Example #16
	def tokenize_short_text(self, raw_tweet_text):

		tweet_text = raw_tweet_text
		#tweet_text = tweet_text.strip()
		#tweet_text = unidecode.unidecode(tweet_text)
		
		if self.args.use_lowercase:
			tweet_text = tweet_text.lower()
		
		if self.tokenizer > 0:
			if self.tokenizer == 1:
				uttterance_tokens = word_tokenize(tweet_text)
			if self.tokenizer == 2:
				uttterance_tokens = wordpunct_tokenize(tweet_text)
			if self.tokenizer == 3:
				uttterance_tokens = self.tweet_tokenizer.tokenize(tweet_text)
			if self.tokenizer == 4:
				tweet_text = clean(tweet_text)
				tweet_text = self.remove_accented_chars(tweet_text)
				uttterance_tokens = self.tweetokenizer.tokenize(tweet_text)
				uttterance_tokens = self.remove_duplicated_sequential_words(uttterance_tokens)
				uttterance_tokens = self.remove_stopwords(uttterance_tokens)

			if self.tokenizer == 5:
				tweet_text = tokenize(' '.join(self.tweet_tokenizer.tokenize(tweet_text)))
				return tweet_text
			
			if self.tokenizer == 6:
				tweet_text = clean(' '.join(self.tweet_tokenizer.tokenize(tweet_text)))
				return tweet_text

			if self.stem:
				uttterance_tokens = [self.stemmer.stem(tok) for tok in uttterance_tokens]
			if self.lemmatize:
				uttterance_tokens = [self.lemmatizer.lemmatize(tok, pos='v') for tok in uttterance_tokens]
			
			tweet_text = " ".join(uttterance_tokens)
		
		return tweet_text
Code Example #17
def preprocess_word_based(tweets, vocab_model):
    p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.MENTION)  # , p.OPT.HASHTAG)
    batch = []
    pattern = re.compile('[^0-9a-z\s]+', re.UNICODE)
    for tweet in tweets:
        # Should I remove hashtags completely, or just remove the symbol?
        clean = p.tokenize(tweet)
        clean = split_hashtags(clean)
        clean = clean.lower()
        clean = pattern.sub(' ', clean)
        words = clean.split()
        res = []
        for word in words:
            if word != '':
                try:
                    vec = vocab_model.word_vec(word)
                except KeyError:
                    vec = unknown_vector
                finally:
                    res.append(vec)
        batch.append(res)
    return batch
Code Example #18
def predict_personality_from_post(model, post, wb):
	# Preprocess the tweets
	post = p.tokenize(post)
	#print(post)

	# Grab the embeddings and averages
	embeddings_from_post = np.array(wb.compute_embeddings([post], wb.embedding_index))
	embedding_avgs = wb.compute_average(embeddings_from_post)

	print('[+] Embedding length from test text: ' + str(len(embeddings_from_post)))
	print(wb.compute_average(embeddings_from_post).shape)
	print('[+] Prediction: ')

	# Compute the prediction
	prediction = model.predict(embedding_avgs)
	print(prediction)

	# Get personality type
	index_personality = prediction[0] - 1
	personality_type = personality_types[index_personality]

	return personality_type
Code Example #19
def preprocess(text):
    # remove the hashtags
    text = text.replace("#", '')
    text = text.replace("-", ' ')
    text = text.lower()

    # tokenize the URL, then clean out the emojis
    tp.set_options(tp.OPT.URL)
    text = tp.tokenize(text)
    tp.set_options(tp.OPT.EMOJI)
    text = tp.clean(text)

    # strip out all punctuation and split the text into word tokens
    text = "".join([char for char in text if char not in string.punctuation])
    text = nltk.word_tokenize(text)

    # initialize a WordNet lemmatizer (from nltk) and apply it to all words in the list
    lemmatizer = WordNetLemmatizer()
    lem_text = [lemmatizer.lemmatize(word) for word in text]

    final_text = " ".join(lem_text)

    return final_text
Code Example #20
def plot_attention_graph(model,x,Tx,Ty,human_vocab,layer=7):
    # Process input
    tokens = np.array([tokenize(x,human_vocab,Tx)])
    tokens_oh = oh_2d(tokens,len(human_vocab))

    # Monitor model layer
    layer = model.layers[layer]

    layer_over_time = K.function(model.inputs,[layer.get_output_at(t) for t in range(Ty)])
    layer_output = layer_over_time([tokens_oh])
    layer_output = [row.flatten().tolist() for row in layer_output]

    # Get model output
    prediction = get_prediction(model,tokens_oh)[1]

    # Graph the data
    fig = plt.figure()
    fig.set_figwidth(20)
    fig.set_figheight(8)
    ax = fig.add_subplot(111)

    plt.title('Attention Values per Timestep')

    plt.rc('figure')
    cax = plt.imshow(layer_output,vmin=0,vmax=1)
    fig.colorbar(cax)

    plt.xlabel('Input')
    ax.set_xticks(range(Tx))
    ax.set_xticklabels(x)

    plt.ylabel('Output')
    ax.set_yticks(range(Ty))
    ax.set_yticklabels(prediction)

    plt.show()
Code Example #21
def preprocessing2(text):
        text = text.decode('ascii','ignore')
        proc.set_options(proc.OPT.URL, proc.OPT.MENTION, proc.OPT.HASHTAG)
        clean_ver = proc.tokenize(text).lower()
        return str(clean_ver)
Code Example #22
PATH = './Dataset'
# Fetch your test tweets and labels:
"""
Both files are stored in .pkl format.
1) x_test : list containing all tweets of users
2) y_test : contains binary class values as 1: Hate | 0:Counter

"""
x_test = pickle.load(open(os.path.join(PATH,'x_test.pkl'),'rb'))
y_test = pickle.load(open(os.path.join(PATH,'y_test.pkl'),'rb'))


#>>>>Preprocessing:
prep_tweets = []
for tweet in tqdm(x_test):
    prep_tweets.append(prep.tokenize(tweet))
    
##****************************************************************************
# # TF-IDF Vectorizers :
word_vectorizer = TfidfVectorizer(vocabulary=pickle.load(open("word_vocab.pkl", "rb")))  # pretrained vocabulary from 6 million tweets, word level
char_vectorizer = TfidfVectorizer(vocabulary=pickle.load(open("char_vocab.pkl", "rb")))  # pretrained vocabulary from 6 million tweets, char level

char_features = char_vectorizer.transform(prep_tweets)
word_features = word_vectorizer.transform(prep_tweets)
#*******************************************************************************

#>>>>Lexical Features : 
'''
!pip install empath
from empath import Empath
Run it on x_test and store it in ./Models/ in .pkl format
'''
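The comment block above describes the lexical-feature step only in prose. A minimal sketch of that step, assuming the empath package and the ./Models/ output directory it mentions (the filename lexical_features.pkl is a placeholder, not from the original project):

from empath import Empath
import os
import pickle

lexicon = Empath()
# One dict of normalized Empath category scores per test tweet.
lexical_features = [lexicon.analyze(tweet, normalize=True) for tweet in x_test]

# Placeholder filename; the snippet only says to store the result in ./Models/ as .pkl.
with open(os.path.join('./Models', 'lexical_features.pkl'), 'wb') as handle:
    pickle.dump(lexical_features, handle)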
Code Example #23
import preprocessor as p
import re

str_to_clean = 'Preprocessor is #awesome 👍 https://github.com/s/preprocessor'
clean_str = p.tokenize(str_to_clean)

#print([word for word in clean_str.split() if word.startswith('$') and word.endswith('$')])

# Rewrite each $TOKEN$ placeholder as <TOKEN>
new_str = []
for word in clean_str.split():
	if word.startswith('$') and word.endswith('$'):
		word = '<' + word[1:len(word)-1] + '>'
	new_str.append(word)
new_str = " ".join(new_str)

#m = re.sub(r'\$(URL|EMOJI)\$', r'<\1>', clean_str)
m = re.findall(r'\$\w+\$', clean_str.rstrip())
print(m)

#print(clean_str)
#print(new_str)
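A generalization of the commented-out re.sub above: a minimal sketch that rewrites every $TOKEN$ placeholder emitted by p.tokenize, not only URL and EMOJI (clean_str as defined in the snippet; the printed output is illustrative):

# Convert every $TOKEN$ placeholder to <TOKEN> in a single substitution.
angle_str = re.sub(r'\$(\w+)\$', r'<\1>', clean_str)
print(angle_str)  # e.g. "Preprocessor is <HASHTAG> <EMOJI> <URL>"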
Code Example #24
def cleanQuery(query):

    tokens = tokenize(query)
    query = []
    for i in range(len(tokens)):
        token = tokens[i]
        print('token : ', token)

        if not bool(re.search(r'^\d+$', token)):
            alpha_num = bool(re.search(r'(^\d+|\d+$)', token))
            if token in misspelled_synonym:
                print('i am here 1')
                token = misspelled_synonym[token]
                query = query + tokenize(token)
                continue
            elif i == 0 and len(
                    tokens) == 1 and dictionary[token] < 50 and alpha_num:
                print('i am here 2')
                token = correct_alpha_num(token)
            elif i == 0 and len(tokens) == 1 and fw_dictionary[token] < 50:
                print('i am here 3')
                token = correct(token)
            elif i == 0 and len(tokens) > 1 and dictionary[
                    tokens[1]] > 50 and alpha_num:
                print('i am here 4')
                token = correct_alpha_num(token, nex=tokens[1])
            elif i == 0 and len(tokens) > 1 and dictionary[
                    token] < 450 and dictionary[tokens[1]] < 20:
                print('i am here 5')
                token = correct(token)
            elif i == 0 and len(tokens) > 1 and dictionary[
                    tokens[1]] > 50 and cPBigram(word=token,
                                                 nex=tokens[1]) == 0:
                print('i am here 6')
                token = correct(token, nex=tokens[1])
            elif i > 0 and dictionary[query[-1]] >= 1 and cPBigram(
                    word=token, prev=query[-1]) == 0 and alpha_num:
                print('i am here 7')
                token = correct_alpha_num(token, prev=query[-1])
            elif i > 0 and dictionary[query[-1]] >= 20 and cPBigram(
                    token, query[-1]) == 0:
                print('i am here 8' + query[-1])
                token = correct(token, prev=query[-1])
            elif i > 0 and dictionary[token] <= 5 and cPBigram(
                    token, query[-1]) == 0:
                print('i am here 9')
                token = correct(token)
            else:
                print('i am here 10')
                query.append(token)
                continue

            for tkn in token:
                if dictionary[tkn]:
                    query.append(tkn)
                elif len(tkn) <= 15:
                    sep_tokens = ws.segment(tkn)
                    if Pwords2(sep_tokens) >= 1.5e-06:
                        query = query + sep_tokens
                    else:
                        query.append(tkn)
                else:
                    query.append(tkn)

        else:
            query.append(token)

    clean_query = []
    for token in merge_tokens(query):
        clean_query = clean_query + tokenize(
            misspelled_synonym.get(token, token))

    return (' '.join(clean_query), Pwords2(clean_query))
Code Example #25
File: data_cleaning.py  Project: Perenz/UHInternship
def text_cleaner(text):
    #Set the options to tokenize only URLs: URL -> $URL$
    pp.set_options(pp.OPT.URL)
    toRtn = pp.tokenize(text)
    return toRtn.replace("$URL$", "")
Code Example #26
def preprocess_text(text: str, opts, nlpengine=None, lang='en', special_tags=["<pad>", "<eos>"],
                    use_tw_preprocessor=True):
    if use_tw_preprocessor:
        ## ! There is a bug in the original twitter preprocessing package:
        # sometimes its regexp for link preprocessing freezes,
        # so we preprocess links separately.
        text = re.sub(r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", "$URL$",
                      text.strip())
        twitter_preprocessor.set_options('mentions')
        text = twitter_preprocessor.tokenize(text)
        # processed_chunk = twitter_preprocessor.clean(text)
    if nlpengine is None:
        global nlp
        if nlp is None:
            nlp = spacy.load(lang)
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
            for x in ['URL', 'MENTION', 'HASHTAG', 'RESERVED', 'EMOJI', 'SMILEY', 'NUMBER', ]:
                nlp.tokenizer.add_special_case(f'${x}$', [{ORTH: f'${x}$'}])
        nlpengine = nlp

    BLvec = []
    POSvec = []
    DEPvec = []
    NERvec = []

    processed_chunk = ""
    doc = nlpengine(text)
    doclen = 0
    for sentence in doc.sents:
        for w in sentence:

            # Some phrases are automatically tokenized by Spacy
            # i.e. New York, in that case we want New_York in our dictionary
            word = "_".join(w.text.split())
            if word.isspace() or word == "":
                continue
            if opts.remove_stop_words and word.lower() in stopWords:
                continue

            if opts.remove_puncuation and word in punctuation:
                continue

            # spaCy lemmatizes I/He/She/It into the artificial
            # -PRON- lemma, which is unwanted
            if opts.lemmatize_words:
                output = w.lemma_ if w.lemma_ != '-PRON-' else w.lower_
            else:
                output = word

            if opts.to_lowercase:
                output = output.lower()

            if opts.replace_nums and output.replace('.', '', 1).isdigit():
                output = opts.num_replacement

            output = output.replace("n't", "not")
            doclen += 1
            processed_chunk += "%s " % (output)

            # Sometimes, when the word contains punctuation and we split it manually
            # the output can contain multiple tokens
            # In such case, just copy the features..., it happens rarely

            if opts.returnbiglettervector:
                BLvec.append(int(w.text[0].isupper()))
            if opts.returnposvector:
                POSvec.append(POS_dict.get(w.pos, POS_dict['UNK']))
            if opts.returnDEPvector:
                try:
                    DEPvec.append(validDEPS.index(w.dep_.lower()))
                except ValueError:
                    DEPvec.append(validDEPS.index('UNK'))
            if opts.returnNERvector:
                try:
                    NERvec.append(validNER.index(w.ent_type_))
                except ValueError:
                    NERvec.append(validNER.index('UNK'))

        if opts.add_eos:
            doclen += 1
            processed_chunk += opts.eos + "\n"
            if opts.returnbiglettervector:
                BLvec.append(0)
            if opts.returnposvector:
                POSvec.append(POS_dict['EOS'])
            if opts.returnDEPvector:
                DEPvec.append(0)
            if opts.returnNERvector:
                NERvec.append(0)
        else:
            processed_chunk += "\n"

    processed_chunk = processed_chunk.strip()
    assert len(processed_chunk.split()) == len(BLvec) == len(POSvec) == len(DEPvec) == len(NERvec)
    return processed_chunk, BLvec, POSvec, DEPvec, NERvec
Code Example #27
 def tokenize(text, lower=True):
     cleaned_text = tweet_preprocessor.tokenize(text)
     return cleaned_text.lower() if lower else cleaned_text
Code Example #28
File: util.py  Project: lenkaB/botornot
def extract_stats(clean_tweet, tweet_id, human_or_bot):
    found = human_relationships_identifier(clean_tweet)
    # print(clean_tweet,found)

    pos_list = []
    pos_counter = collections.Counter()
    spacy_stats = nlp(clean_tweet)
    for token in spacy_stats:
        pos_counter[token.pos_] += 1
        pos_list.append(token.pos_)

    entity_list = []
    entity_label_list = []
    for ent in spacy_stats.ents:
        entity_label_list.append(ent.label_)
        entity_list.append(ent.text)
    char_count = len(spacy_stats.text)

    unique_words = []
    processed_tweet = p.tokenize(clean_tweet)

    curve = ttr_curve(pos_list)
    processed_word_count = 0
    spell_error_count = 0
    for word in processed_tweet.split():
        # clean_word = re.sub(r'[^A-Za-z]', "", word)
        # if dictionary.check(clean_word) is False and not word.startswith('$'):
        #    print(word)
        #    spell_error_count += 1
        processed_word_count += 1
        if word not in unique_words:
            unique_words.append(word)

    if processed_word_count > 0:
        ttr = float(len(unique_words)) / float(processed_word_count)
    else:
        ttr = 0

    i = 1
    arr = []
    for pos in pos_list:
        arr.append(i)
        i += 1

    if len(arr):
        ttr_slope = linregress(arr, curve)[0]
    else:
        ttr_slope = 0

    tweet_dict = {
        'index': tweet_id,
        'raw': clean_tweet,
        'preprocessed tweet': processed_tweet,
        'char count': char_count,
        'pos': pos_list,
        'pronouns': pos_counter['PRON'],
        'nouns': pos_counter['NOUN'],
        'verbs': pos_counter['VERB'],
        'adverbs': pos_counter['ADV'],
        'adjectives': pos_counter['ADJ'],
        'symbols': pos_counter['SYM'],
        'punctuation': pos_counter['PUNCT'],
        'proper nouns': pos_counter['PROPN'],
        'entity label': entity_label_list,
        'word count': processed_word_count,
        'unique word count': len(unique_words),
        'TTR': ttr,
        'entity raw text': entity_list,
        'mentions': clean_tweet.count('@'),
        'hashtags': clean_tweet.count('#'),
        'urls': clean_tweet.count('$URL$'),
        'class': human_or_bot,
        'relationship words': found,
        'count(rel words)': len(found),
        'ttr curve': curve,
        'ttr slope': ttr_slope
    }

    return tweet_dict
Code Example #29
def tokenize(sent):
    result = p.tokenize(p.clean(sent))
    print(result)
    return result
Code Example #30
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("src/model.h5")
print("Loaded model from disk")

#################################### Pre-process Tweets ################################
index_vocabolario = {idx: w for w, idx in vocabolario_index_twitter.items()}

id_text_pad_list = []
for i in range(len(data)):
    try:
        id_text_pad_list += [
            (data[i]['id'],
             replace_word_index_twitter((substitute_label(
                 (normalize(p.tokenize(
                     data[i]['retweeted_status']['text']))).split())).split()))
        ]
    except KeyError:
        id_text_pad_list += [
            (data[i]['id'],
             replace_word_index_twitter((substitute_label(
                 (normalize(p.tokenize(data[i]['text']))).split())).split()))
        ]

array_pad = np.array([j for i, j in id_text_pad_list])
padding = sequence.pad_sequences(array_pad, maxlen=40, padding='post')
prediction = loaded_model.predict_classes(padding)
pred = ['negative' if i == 0 else 'positive' for i in prediction]
dict_id_sentiment = {}
for i, j in enumerate(array_pad):
    dict_id_sentiment[id_text_pad_list[i][0]] = pred[i]
Code Example #31
import preprocessor
import mlcs

if __name__ == '__main__':
    with open('testcases/colors.json') as f:
        text = f.read()
    tokens = [token for i, token in preprocessor.tokenize(text)]
    mlcs.printResults(tokens)
    # time complexity: 110595198
    # space complexity: 487550
Code Example #32
 def test_tokenize(self):
     tweet = 'Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org'
     tokenized_tweet = p.tokenize(tweet)
     self.assertEqual(tokenized_tweet, 'Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$')