def tokenize_tweet_content_to_types(
        dataset: pd.DataFrame, tokenize_type_list: List[str]) -> pd.DataFrame:
    """Tokenize all tweets with with defined contents i.e 'something #DataScience' with 'something $HASHTAG$' """
    tuple_to_unpack = get_filter_objects_as_tuple(tokenize_type_list)
    p.set_options(*tuple_to_unpack)
    dataset["text"] = dataset["text"].apply(lambda txt: p.tokenize(txt))
    return dataset
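# NOTE: get_filter_objects_as_tuple() is not shown in this snippet. A minimal,
# hypothetical sketch of what it might look like: it is assumed to map type names
# such as "hashtag" or "mention" to the tweet-preprocessor OPT constants that
# p.set_options accepts.
def get_filter_objects_as_tuple(tokenize_type_list: List[str]) -> tuple:
    option_map = {
        "url": p.OPT.URL,
        "mention": p.OPT.MENTION,
        "hashtag": p.OPT.HASHTAG,
        "emoji": p.OPT.EMOJI,
        "smiley": p.OPT.SMILEY,
        "number": p.OPT.NUMBER,
        "reserved": p.OPT.RESERVED,
    }
    return tuple(option_map[name.lower()] for name in tokenize_type_list)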
def cleaning(string):
    #Emoji patterns
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    string = emoji_pattern.sub(r"", string)
    string = re.sub("RT", "", string)
    string = re.sub("#", "", string)
    string = re.sub("\+", "", string)
    string = re.sub("\/", " or ", string)
    string = re.sub("&", " and ", string)
    string = re.sub("-", " ", string)
    string = re.sub("'", "", string)
    string = re.sub("🤔", "", string)
    string = re.sub("[¦]", "", string)
    string = re.sub("[()@:<>{}`=~|.,%_]", " ", string)

    #Replace short-forms & Acronyms with proper English

    string = translator(string)

    p.set_options(p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.MENTION)
    result = p.clean(string)
    return result
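# Quick illustrative check of the preprocessor step at the end of cleaning(); assumes
# `import preprocessor as p`. (translator() above is an external helper that expands
# short forms and acronyms; it is not defined in this snippet.)
p.set_options(p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.MENTION)
print(p.clean("Great stream by @Reckful today :) 😀"))
# Mentions, smileys and emojis are stripped, leaving roughly "Great stream by today"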
def gettweets():
    ## This search uses tweepy.Cursor to page through a larger tweet count, filters out retweets, restricts results to English, and uses extended tweet mode to get the full text.
    for tweet in tweepy.Cursor(api.search,
                               q='Reckful -filter:retweets',
                               lang="en",
                               tweet_mode='extended').items(300):
        ## the following is necessary to get the full tweet (.full_text) instead of the 140-character truncated text returned by default
        try:
            stringtweet = tweet.retweeted_status.full_text
            numlikes = tweet.retweeted_status.favorite_count
            retweetedstatus = "True"
        except AttributeError:  # Not a Retweet
            stringtweet = tweet.full_text
            numlikes = tweet.favorite_count
            retweetedstatus = "False"

        # tweet-preprocessor options: removes unnecessary tokens. I chose to leave in hashtags and mentions; otherwise add p.OPT.HASHTAG and p.OPT.MENTION
        p.set_options(p.OPT.URL, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY,
                      p.OPT.NUMBER)
        clean_text = p.clean(stringtweet)

        #Creates a TextBlob object from the cleaned text; its .sentiment attribute is a (polarity, subjectivity) tuple used in the CSV row below.
        sentimenttweet = TextBlob(clean_text)

        csvWriter.writerow((clean_text, sentimenttweet.sentiment.polarity,
                            sentimenttweet.sentiment.subjectivity,
                            tweet.user.screen_name, tweet.user.location,
                            numlikes, tweet.retweet_count, tweet.created_at))
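# Hedged setup sketch for the globals gettweets() relies on (`api` and `csvWriter`);
# the credentials and output file name below are placeholders, not from the original code.
import csv
import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True)
csvWriter = csv.writer(open("reckful_tweets.csv", "w", newline="", encoding="utf-8"))
gettweets()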
Example #4
def preprocess(tweet):
    tweet = tweet.lower()
    
    tweet = re.sub(r'\\n', ' ', tweet)
    tweet = re.sub(r'(\S)(https?):', r'\1 \2:', tweet)
    p.set_options(p.OPT.MENTION, p.OPT.URL, p.OPT.EMOJI, p.OPT.HASHTAG)
    tweet = p.tokenize(tweet)
    
    tokenizer = nltk.tokenize.TweetTokenizer()
    tweet = tokenizer.tokenize(tweet)
    tweet = ' '.join(tweet)
    tweet = re.sub(r'\$ ([A-Z]+?) \$', r'$\1$', tweet)
    
    tweet = tweet.split(' ')

    ### Stopwords removal ###
    language = os.getenv('LANGUAGE')
    stop_words = set(stopwords.words(language))
    new_sentence = []
    for w in tweet:
        if w not in stop_words:
            new_sentence.append(w)
    tweet = ' '.join(new_sentence)

    tweet = unidecode.unidecode(tweet)

    p.set_options(p.OPT.NUMBER)
    tweet = p.tokenize(tweet)
    tweet = re.sub(r'([!¡]\s?){3,}', r' $EXCLAMATION$ ', tweet)
    tweet = re.sub(r'([¿?]\s?){3,}', r' $QUESTION$ ', tweet)
    tweet = re.sub(r'(\.\s?){3,}', r' $ELLIPSIS$ ', tweet)
    tweet = re.sub(r'\b(?:a*(?:(h+|j+)a+|s+)+(h+|j+)?|(?:l+o+)+l+)\b', r' $LOL$ ', tweet, flags=re.I)

    return tweet
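# Hedged usage sketch for preprocess(). It assumes the imports used in the function
# (preprocessor as p, nltk, unidecode, os, re, nltk.corpus.stopwords) are in scope and
# that the NLTK stopword corpus has been downloaded; the output comment is illustrative,
# not an exact assertion.
os.environ['LANGUAGE'] = 'english'  # preprocess() reads the stopword language from this variable
print(preprocess("@friend check https://example.com jajaja !!! #NLP"))
# Mentions/URLs/hashtags become $MENTION$/$URL$/$HASHTAG$ tokens, the laughter collapses
# to $LOL$, and the run of exclamation marks becomes $EXCLAMATION$.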
def cleanup_text(docs, logging=False):
    texts = []
    full_tokens = []
    counter = 1
    for doc in docs:
        if counter % 1000 == 0 and logging:
            print("Processed %d out of %d documents." % (counter, len(docs)))
        counter += 1
        pr.set_options(pr.OPT.URL, pr.OPT.EMOJI, pr.OPT.HASHTAG,
                       pr.OPT.MENTION, pr.OPT.RESERVED, pr.OPT.SMILEY,
                       pr.OPT.NUMBER)
        doc = pr.clean(doc)
        doc = con.fix(doc)
        doc = ''.join(ch for ch in doc if ord(ch) < 128)
        doc = nlp(doc, disable=['parser', 'ner'])
        tokens = [
            tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-'
        ]
        tokens = [
            tok for tok in tokens if tok not in stopwords
            and tok not in punctuations and re.sub("[0-9]*", '', tok) != ''
        ]
        full_tokens += tokens
        tokens = ' '.join(tokens)
        texts.append(tokens)
    rare_tuple = nltk.FreqDist(full_tokens).most_common()[-50:]
    rare_words = [tup[0] for tup in rare_tuple]
    final_texts = []
    for text in texts:
        tokens = [tok for tok in text.split(' ') if tok not in rare_words]
        tokens = ' '.join(tokens)
        final_texts.append(tokens)
    return pd.Series(final_texts)
def preprocessor_tweet(s):

    tweet_p.set_options(tweet_p.OPT.EMOJI,
                        tweet_p.OPT.URL,
                        tweet_p.OPT.RESERVED,
                        tweet_p.OPT.SMILEY,
                        tweet_p.OPT.MENTION)
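    # The '@' is stripped from these candidate handles so the MENTION filter in tweet_p.clean does not delete them from the text.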
    s = re.sub(r'@petrogustavo', 'petrogustavo', s)
    s = re.sub(r'@sergio_fajardo', 'sergio_fajardo', s)
    s = re.sub(r'@IvanDuque','IvanDuque',s)
    s = re.sub(r'@AlvaroUribeVel','AlvaroUribeVel',s)
    s = re.sub(r'@JuanManSantos','JuanManSantos',s)
    s = re.sub(r'@German_Vargas','German_Vargas',s)
    s = re.sub(r'@ClaudiaLopez','ClaudiaLopez',s)
    s = re.sub(r'@DeLaCalleHum','DeLaCalleHum',s)
    s = tweet_p.clean(s)
    s = re.sub(r'\b(?:a*(?:ja)+h?|(?:l+o+)+l+)\b', ' ', s)
    s = re.sub(r'[^\w]', ' ', s)
    # s = re.sub(r'^https?:\/\/.*[\r\n]*', '', s)
    # s = re.sub(r'#', '', s)
    # s = re.sub(r'¡+', '', s)
    # s = re.sub(r':', '', s)
    # s = re.sub(r'!+', '', s)
    # s = re.sub(r'"', '', s)


    # s = re.sub(r'/[-?]/', '', s)
    # s = re.sub(r'¿+', '', s)
    # s = re.sub(r'@\w+', '', s)
    s = strip_accents_unicode(s.lower())
    s = tweet_p.clean(s)

    return s
Example #7
def clean(lang):
    '''
    Create new folder for cleaned data.
    Remove duplicate Tweets and do some tweet cleaning.
    '''
    merged_file = 'data/TweetText_Label/{}_tweet_label.csv'.format(lang)
    clean_file = 'data/clean/{}.csv'.format(lang)
    merged = pd.read_csv(merged_file, engine='python')
    #merged.drop_duplicates(['Tweet text'], inplace=True)

    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)

    text = []
    labels = []
    for index, row in merged.iterrows():
        try:
            label = row['SentLabel']
            clean_row = p.clean(row['Tweet text'])
            text.append(clean_row)
            labels.append(label)
            if index % 1000 == 0: print(index)
        except Exception:
            # skip rows that fail to clean (e.g. missing or malformed text)
            continue
    cleaned = pd.DataFrame({'Text': text, 'HandLabels': labels})
    cleaned.to_csv(clean_file)
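# Illustrative call, assuming `import pandas as pd`, `import preprocessor as p`, and that
# data/TweetText_Label/en_tweet_label.csv exists with 'Tweet text' and 'SentLabel' columns.
clean('en')  # writes the cleaned tweets and labels to data/clean/en.csv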
Example #8
    def test_set_options(self):
        tweet = "Preprocessor now has custom #options support! https://github.com/s/preprocessor"
        p.set_options(p.OPT.URL)
        parsed_tweet = p.parse(tweet)

        self.assertIsNone(parsed_tweet.hashtags)
        self.assertIsNotNone(parsed_tweet.urls)
Example #9
    def __init__(self, setname, embeddings, example_length=30, range=None):
        assert setname in [
            'train1', 'train2', 'train3', 'train4', 'train', 'test', 'val'
        ]
        self.example_length = example_length
        self.setname = setname
        self.path = os.path.join(dir_path, setname)
        index = os.path.join(self.path, 'index.csv')

        # maps index of points in the dataset to tweet_ids
        self.index = pd.read_csv(index, index_col=0)
        if range is not None and len(range) == 2:
            start = range[0]
            end = range[1]
            self.index = self.index[start:end].reset_index()
        self.len = len(self.index)

        # define method for deleting urls
        p.set_options(p.OPT.URL)  # remove only URLs
        self.clean = p.clean

        # pattern that matches everything except alphanumeric characters and spaces
        self.pattern = re.compile(r'([^\s\w]|_)+')

        # get dict that maps word to embeddings
        self.embeddings = embeddings
def clean_tweet(tweet):
    # removes @ mentions, hashtags, emojis, twitter reserved words and numbers
    p.set_options(p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY,
                  p.OPT.NUMBER)
    clean = p.clean(tweet)

    # transforms every url to "<url>" token and every hashtag to "<hashtag>" token
    p.set_options(p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.SMILEY,
                  p.OPT.NUMBER, p.OPT.HASHTAG, p.OPT.URL)
    clean = p.tokenize(clean)
    clean = re.sub(r'\$HASHTAG\$', '<hashtag>', clean)
    clean = re.sub(r'\$URL\$', '<url>', clean)

    # preprocessor doesn't seem to clean all emojis, so we run the text through an emoji regex to catch leftovers
    clean = re.sub(emoji.get_emoji_regexp(), '', clean)

    # removing zero-width character which is often bundled with emojis
    clean = re.sub(u'\ufe0f', '', clean)

    # remove multiple empty spaces with one
    clean = re.sub(r' +', ' ', clean)

    # replace &gt; and &lt;
    clean = re.sub(r'&gt;', '>', clean)
    clean = re.sub(r'&lt;', '<', clean)

    # strip any leftover spaces at the beginning and end
    clean = clean.strip()

    return clean
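# Illustrative call, assuming `import preprocessor as p`, `import re`, and `import emoji`
# (a version that still provides emoji.get_emoji_regexp, i.e. emoji < 2.0).
print(clean_tweet("Check https://example.com #news 😀 &gt; @someone"))
# Roughly "Check <url> <hashtag> >": the mention and emoji are removed, the URL and
# hashtag become the <url>/<hashtag> tokens, and &gt; is unescaped to ">".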
Example #11
def first_step(directory, fname, ext, language):
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
    os.chdir(directory)
    f_path = fname
    f_out_p = fname.split(".")[0].replace('Stream', '') + "Preprocess1" + language.upper() + "." + ext
    fo = open(f_out_p, 'wb')
    if not ext == 'json':
        fo.write("id;created_at;text" + "\n")
    with open(f_path, 'r') as FILE:
        next(FILE)
        i = 0
        for line in FILE:
            tweet = json.loads(line)
            if 'extended_tweet' in tweet:
                extended_tweet = tweet['extended_tweet']
                text = extended_tweet['full_text'].encode('unicode_escape')
            else:
                text = tweet['text'].encode('unicode_escape')
            try:
                if detect(text) == language.lower():
                    text = p.clean(text)
                    text = C.hexadecimal_conversion(text)
                    text = C.expression_clean(text)
                    if not ext == 'json':
                        fo.write("%s;%s;%s\n" % (i, tweet['created_at'], text))
                    elif ext == 'json':
                        twt = {"id": i,
                               "created_at": tweet['created_at'],
                               "text": text.decode('unicode_escape')}
                        fo.write(json.dumps(twt) + '\n')
                    i += 1
            except lang_detect_exception.LangDetectException:
                print("Lang Detect exception for: ", text)
    fo.close()
    return
def csv_read_and_write(read_path, write_path1, write_path2):
    with open(write_path1, 'w') as outFile1, open(write_path2,
                                                  'w') as outFile2:
        file_writer1 = csv.writer(outFile1)
        file_writer2 = csv.writer(outFile2)

        i = 1
        with open(read_path, 'r') as inFile:
            fileReader = csv.reader(inFile)
            for row in fileReader:
                tweet = row[4]
                p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)
                cleaned_tweet = p.clean(tweet)
                cleaned_tweet = unicode(cleaned_tweet, 'utf-8')
                is_english = detect_language(
                    cleaned_tweet
                )  # where we call the function that detects if it is english or not
                print(is_english)

                data = [
                    row[0], row[1], row[2], row[3], row[4], row[5], row[6],
                    row[7], row[8], row[9]
                ]

                if is_english is True:
                    file_writer1.writerow(data)
                else:
                    file_writer2.writerow(data)
                i = i + 1
Example #13
def clean_documents(documents):
    documents_clean = []

    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
    for d in documents:
        # Lowercase the document
        d = d.lower()
        # removing url,emoji,smiley,number
        document_test = p.clean(d)

        #remove stop_words
        document_test = remove_stopwords(document_test)

        # Remove non-ASCII characters
        document_test = re.sub(r'[^\x00-\x7F]+', ' ', document_test)
        # Remove mentions
        document_test = re.sub(r'@\w+', '', document_test)

        # Remove punctuation
        document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ',
                               document_test)
        # Remove digits
        document_test = re.sub(r'[0-9]', '', document_test)
        # Collapse repeated spaces
        document_test = re.sub(r' +', ' ', document_test)

        documents_clean.append(document_test)
    return documents_clean
Example #14
def preprocess_tweet(tweet):
    cleaned_tweet = tweet.lower()  # lowercase the tweet
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.HASHTAG)  # set options for the preprocessor
    cleaned_tweet = p.clean(cleaned_tweet.encode("ascii", "ignore").decode("ascii"))  # drop non-ASCII characters before cleaning
    #cleaned_tweet = remove_stopwords(cleaned_tweet)  # remove stopwords
    #print cleaned_tweet
    return cleaned_tweet
 def clean_tweet(self, tweet):
     """
     Uses the tweet-preprocessor library to strip URLs from the tweet.
     """
     p.set_options(p.OPT.URL)
     cleaned_tweet = p.clean(tweet)
     return cleaned_tweet
Example #17
def csv_read_and_write(read_path, write_path):
    with open (write_path, 'wb') as outFile1:
        file_writer1 = csv.writer(outFile1)

        i = 1
        with open(read_path,'r') as inFile:
            fileReader = csv.reader(inFile)
            for row in fileReader:
                tweet = row[4]
                p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)
                cleaned_tweet = p.clean(tweet)
                print(i)
                print(cleaned_tweet)

                data = [row[0],
                        row[1],
                        row[2],
                        row[3],
                        cleaned_tweet,
                        row[5],
                        row[6],
                        row[7],
                        row[8],
                        row[9]]

                file_writer1.writerow(data)

                i = i + 1
Example #18
    def on_data(self, data):

        p.set_options(p.OPT.EMOJI, p.OPT.SMILEY)
        #print('\ncurrent_process = ',mp.current_process())

        if 'retweeted_status' not in data:
            decoded = json.loads(data)
            write_txt = p.clean(decoded['text'])
            if 'extended_tweet' in data:
                try:
                    write_txt = p.clean(decoded['extended_tweet']['full_text'])
                except:
                    pass

            with open('data/streaming_tweets_save.csv',
                      'a',
                      encoding='utf-8',
                      newline='') as file:
                csvwriter = csv.writer(file)
                csvwriter.writerow([
                    decoded['id'], decoded['created_at'], write_txt,
                    decoded['retweet_count'], decoded['favorite_count'],
                    decoded['user']['screen_name'], decoded['user']['name'],
                    decoded['user']['verified'],
                    decoded['user']['followers_count'],
                    decoded['user']['friends_count'], decoded['source'],
                    decoded['user']['url']
                ])

        return True

    def on_error(self, status):
        print('\nERROR status = ', status)
    def test_clean_urls(self):
        tweet = 'canbe foundathttp://www.osp.gatech.edu/rates/(http://www.osp.gatech.edu/rates/).'
        p.set_options(p.OPT.URL)
        cleaned_tweet = p.clean(tweet)
        self.assertEqual("canbe foundat.", cleaned_tweet)

        tweet = 'Nature:先日フランスで起きた臨床試験事故https://t.co/aHk5ok9CDg 原因究明まだなので早急な印象がするけど、低用量投与を1回' \
                'やった後で、(別のボランティアに)高用量の投与とかしてる試験方式にも問題があるだろうみたいなことを書いてる'
        cleaned_tweet = p.clean(tweet)
        self.assertEqual(
            'Nature:先日フランスで起きた臨床試験事故 原因究明まだなので早急な印象がするけど、'
            '低用量投与を1回やった後で、(別のボランティアに)高用量の投与とかしてる試験方式にも問題があるだろうみたいなことを書いてる',
            cleaned_tweet)

        tweet = '[https://link.springer.com/article/10.1007/s10940\\-016\\-9314\\-9]'
        cleaned_tweet = p.clean(tweet)
        self.assertEqual('[]', cleaned_tweet)

        tweet = '(https://link.springer.com/article/10.1007/s10940-016-9314-9)'
        cleaned_tweet = p.clean(tweet)
        self.assertEqual('()', cleaned_tweet)

        tweet = 'check this link: https://fa.wikipedia.org/wiki/%D8%AD%D9%85%D9%84%D9%87_%D8%A8%D9%87_%DA%A9%D9%88%DB%8C' \
                '_%D8%AF%D8%A7%D9%86%D8%B4%DA%AF%D8%A7%D9%87_%D8%AA%D9%87%D8%B1%D8%A7%D9%86_(%DB%B1%DB%B8%E2%80%93%DB%B2%' \
                'DB%B3_%D8%AA%DB%8C%D8%B1_%DB%B1%DB%B3%DB%B7%DB%B8) …'
        cleaned_tweet = p.clean(tweet)
        self.assertEqual('check this link: …', cleaned_tweet)
Example #20
def clean_tweet(tweet):
    contents = tweet["text"].lower()
    # May want to change these
    prepro.set_options(prepro.OPT.URL, prepro.OPT.EMOJI, prepro.OPT.SMILEY,
                       prepro.OPT.NUMBER)
    clean_contents = prepro.clean(contents)
    tweet["text"] = clean_contents
Example #21
def is_one_canditate_mentioned(tweet):
    # TODO: Check that the candidate is mentioned
    trumps_names = ["donald", "trump"]
    bidens_names = ["joe", "biden"]
    opponent_names = {
        "Donald Trump lang:en": bidens_names,
        "Trump lang:en": bidens_names,
        "Joe Biden lang:en": trumps_names,
        "Biden lang:en": trumps_names,
    }
    candidate_names = {
        "Donald Trump lang:en": trumps_names,
        "Trump lang:en": trumps_names,
        "Joe Biden lang:en": bidens_names,
        "Biden lang:en": bidens_names,
    }
    contents = tweet["text"].lower()
    prepro.set_options(prepro.OPT.URL, prepro.OPT.HASHTAG)
    clean_contents = prepro.clean(contents)

    for opponent_name in opponent_names[tweet["Candidate"]]:
        if clean_contents.find(opponent_name) != -1:
            return False

    for name in candidate_names[tweet["Candidate"]]:
        if clean_contents.find(name) != -1:
            return True

    return False
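# Illustrative call; the function expects a dict-like row with "text" and "Candidate"
# keys and assumes `import preprocessor as prepro`.
tweet = {"text": "I think Trump gave a strong speech", "Candidate": "Trump lang:en"}
print(is_one_canditate_mentioned(tweet))  # True: only the queried candidate is named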
Example #22
 def test_clean_reserved_words(self):
     tweet = "Awesome!!! RT @RT: This is a tweet about art ART. FAV #RT #FAV #hashtag"
     p.set_options(p.OPT.RESERVED)
     cleaned_tweet = p.clean(tweet)
     self.assertEqual(
         'Awesome!!! @RT: This is a tweet about art ART. #RT #FAV #hashtag',
         cleaned_tweet)
    def _cleanTweet(self, tweet):
        # set preprocessor to remove links, mentions, and reserved words (FAV, RT, etc.)
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
        # clean tweet with preprocessor and remove unwanted symbols (hashtags, quotes, question marks)
        tweet = p.clean(tweet.translate(None, '#?"'))

        return tweet
Example #24
 def test_tokenize(self):
     tweet = "Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org"
     p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
     tokenized_tweet = p.tokenize(tweet)
     self.assertEqual(
         tokenized_tweet, "Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$"
     )
Example #25
        def clean_tweet(text):
            p.set_options(p.OPT.URL, p.OPT.MENTION)
            ps = PorterStemmer()

            text = p.clean(text)
            text = text.lower()
            text = text.translate(str.maketrans('', '', string.punctuation))
            text = text.strip()

            text = word_tokenize(text)

            # Insert negative_word label for negative words
            text = [
                negative_transformer(laugh_transformer(ps.stem(w)))
                for w in text if w not in words_to_remove
            ]

            # removing vowels (coooool -> cool) -> COMMENTED SINCE REDUCING PERFORMANCE
            # text = [remove_vowel(w) for w in text if not w in stop_words]
            # OR: another approach before tokenization tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))

            # Slang correction
            # TODO getting input file and create dictionary

            # Misspelled word correction -> using PyEnchant (not working on kernels)

            return ' '.join(text)
Example #26
def compareWithOriginal(tweet_database):
    orig_tweet = []
    # stores all the original tweets used to train in orig_tweet list
    with open(
            './twitterScrubber/cleanData/new_FakeKenty_tweets_clean_train.txt',
            'r',
            encoding='utf-8') as fp:
        tweet = fp.readline()
        while tweet:
            # Looks for actual tweets
            if tweet.strip() != "==========":
                orig_tweet.append(tweet)

            tweet = fp.readline()

    # compare with tweets generated from gpt-2 and take out duplicates
    num = 0
    dup = 0
    newlist = []
    for tweet in tweet_database:
        dup_found = False
        for o in orig_tweet:
            if tweet == o:
                dup += 1
                dup_found = True
        if not dup_found:
            p.set_options(p.OPT.URL)
            tweet = p.clean(tweet)
            newlist.append(tweet)
        num += 1
        print('...Looped through {}/{} tweets and found {} duplicates'.format(
            num, len(tweet_database), dup))

    return newlist
Example #27
    def __init__(self,
                 data_filepath=os.path.join('..', 'data', 'tweet_data.txt'),
                 seed=3):
        self.seed = seed
        # self.tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
        self.label_encoder = OneHotEncoder(sparse=False)

        # Read in dataset
        df = pd.read_csv(data_filepath,
                         header=None,
                         names=['Tweet', 'Label'],
                         delimiter='\t').dropna()
        data = df['Tweet']

        # preprocess tweets to remove mentions, URL's
        p.set_options(p.OPT.MENTION, p.OPT.URL)  # P.OPT.HASHTAG
        data = data.apply(p.clean)

        # Tokenize special Tweet characters
        # p.set_options(p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.RESERVED, p.OPT.NUMBER)
        # data = data.apply(p.tokenize)

        data = data.tolist()

        # One Hot encode labels
        labels = self.label_encoder.fit_transform(df['Label'].values.reshape(
            -1, 1))

        # Split data
        self.train_X, self.test_X, self.train_Y, self.test_Y = train_test_split(
            data, labels, test_size=0.2, random_state=self.seed)
def preprocess_tweet(tweet):
    cleaned_tweet = tweet.lower()  # lowercase the tweet
    p.set_options(p.OPT.URL, p.OPT.MENTION,
                  p.OPT.HASHTAG)  # set options for the preprocessor
    cleaned_tweet = p.clean(cleaned_tweet)
    #cleaned_tweet = remove_stopwords(cleaned_tweet)  # remove stopwords
    return cleaned_tweet
def clean_data(connection, df_tweets):
    """ A function that cleans tweets from URLs; Reserved keywords like RT,FAV; """

    cursor = connection.cursor()
    for i in range(len(df_tweets)):
        print("ORIGINAL: ", df_tweets.loc[i, 'CONTENT'])
        id = df_tweets.loc[i, 'ID']

        p.set_options(p.OPT.URL, p.OPT.RESERVED)

        cleaned_content = p.clean(df_tweets.loc[i, 'CONTENT'])
        #change the character " to ' to prevent quote error when writing to database
        cleaned_content = cleaned_content.replace('"', "'")
        df_tweets.at[i, 'CONTENT'] = cleaned_content

        print("CLEANED: ", df_tweets.at[i, 'CONTENT'])

        #write changes to database
        query = 'UPDATE GoldenSet SET CONTENT = "' + str(
            df_tweets.at[i, 'CONTENT']) + '" WHERE ID = ' + str(id) + ";"
        cursor.execute(query)
        print('-' * 40 + '\n')
    cursor.close()
    print(
        "\n\n\n----------------------- CLEANING DATA FINISHED -------------------------\n\n\n"
    )
Example #30
    def prerocess_tweets_texts(texts: pd.Series) -> pd.Series:
        """
        Perform basic preprocessing before more elaborate preprocessing upon
        EDA.

        Parameters
        ----------
        texts : pd.Series
            The texts of tweets to preprocess.

        Returns
        -------
        pd.Series
            The texts of tweets processed.

        """

        ret = []
        # Remove URLs, emojis, mentions and smilies from tweets
        pptweet.set_options(pptweet.OPT.URL, pptweet.OPT.EMOJI,
                            pptweet.OPT.MENTION, pptweet.OPT.SMILEY)
        for text in texts:
            ret.append(pptweet.clean(text))

        return pd.Series(data=ret, index=texts.index)
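# Hedged usage sketch, assuming the method above is exposed (e.g. as a @staticmethod) and
# that `import preprocessor as pptweet` and `import pandas as pd` are in scope.
raw = pd.Series(["Check this out https://example.com :) @user", "Plain text tweet"])
print(prerocess_tweets_texts(raw).tolist())
# URLs, emojis, mentions and smilies are stripped; the original Series index is preserved.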
def csv_read_and_write(read_path, write_path):
    with open(write_path, 'wb') as outFile:
        file_writer = csv.writer(outFile)

        i = 1
        with open(read_path, 'r') as inFile:
            fileReader = csv.reader(inFile)
            for row in fileReader:
                tweet = row[4]
                print("raw tweet : " + tweet)

                decode = decode_tweet(tweet)
                if decode is not None:
                    tweet = decode

                    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI)
                    tweet = p.clean(tweet)
                    print("semi-cleaned tweet : " + tweet)

                    verdict = is_prayformarawi_tweet(tweet)
                    print(verdict)

                    data = [
                        row[0], row[1], row[2], row[3], row[4], row[5], row[6],
                        row[7], row[8], row[9]
                    ]

                    if verdict is True:
                        file_writer.writerow(data)
                        i = i + 1

                print("#prayformarawi tweets count: ", i)
def function_udf(input_str):
    input_str = re.sub(r'RT', '', input_str)
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
    input_str = p.clean(input_str)
    return ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               input_str).split())
def set_sentence_(sentence):
	p.set_options(p.OPT.URL, p.OPT.EMOJI)
	sentence=p.clean(sentence)
	sentence=hashtag_power(sentence)
	p.set_options(p.OPT.HASHTAG)
	sentence=p.clean(sentence)
	sentence=punc(sentence)
	sentence=Enleve_Accents(sentence)
	return sentence
	def __init__(self):
		self.load_nltk()
		self.model = None
		self.abbreviations = None
		self.spell_check = None
		self.session = None

		preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.MENTION, preprocessor.OPT.RESERVED,
								 preprocessor.OPT.EMOJI, preprocessor.OPT.SMILEY)

		self.load_db_tweets()
Example #35
    def test_parse(self):
        tweet = "A tweet with #hashtag :) @mention 😀 and http://github.com/s."
        p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
        parsed_tweet = p.parse(tweet)

        self.assertIsNotNone(parsed_tweet.urls)
        self.assertEqual(1, len(parsed_tweet.urls))

        self.assertIsNotNone(parsed_tweet.hashtags)
        self.assertEqual(1, len(parsed_tweet.hashtags))

        self.assertIsNotNone(parsed_tweet.mentions)
        self.assertEqual(1, len(parsed_tweet.mentions))

        self.assertIsNone(parsed_tweet.reserved_words)

        self.assertIsNotNone(parsed_tweet.emojis)
        self.assertEqual(1, len(parsed_tweet.emojis))
        self.assertEqual("😀", parsed_tweet.emojis[0].match)

        self.assertIsNotNone(parsed_tweet.smileys)
        self.assertEqual(1, len(parsed_tweet.smileys))
        self.assertEqual(":)", parsed_tweet.smileys[0].match)
Example #36
 def test_clean(self):
     tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org"
     p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
     cleaned_tweet = p.clean(tweet)
     self.assertEqual(cleaned_tweet, "Hello there! was awesome .")