Example #1
def tokenize_data(df_comments,
                  ignore_stopwords=True,
                  keepcaps=False,
                  decontract=True,
                  remove_punct=True):
    if ignore_stopwords:
        # Replace the boolean flag with the 'english' stopword list name.
        ignore_stopwords = 'english'

    tokenizer = CrazyTokenizer(ignore_stopwords=ignore_stopwords,
                               keepcaps=keepcaps,
                               subreddits='',
                               reddit_usernames='',
                               emails='',
                               urls='',
                               decontract=decontract,
                               remove_punct=remove_punct)

    tokens = []
    for i in tqdm(range(df_comments.shape[0])):
        current_tokens = tokenizer.tokenize(
            df_comments.iloc[i, df_comments.columns.get_loc('body')])
        tokens.append(current_tokens)
    df_comments['tokens'] = tokens

    return df_comments
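A hedged usage sketch for tokenize_data follows. The imports and the toy DataFrame are assumptions (the snippet only shows the function body); in particular, the import path redditscore.tokenizer for CrazyTokenizer is assumed.

import pandas as pd
from tqdm import tqdm
from redditscore.tokenizer import CrazyTokenizer  # assumed import path

# Toy comments standing in for real Reddit data; 'body' is the column
# tokenize_data reads from.
df = pd.DataFrame({'body': ["I can't believe this!",
                            "Check https://example.com r/python"]})
df = tokenize_data(df, ignore_stopwords=True, decontract=True)
print(df['tokens'].tolist())  # one token list per comment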
Example #2
def test_url_tokenizing():
    tokenizer = CrazyTokenizer(urls='domain')
    tokens = tokenizer.tokenize(url_text)
    assert tokens == [
        'i', 'always', 'go', 'to', 'rt', 'to', 'chat', 'about', 'politics',
        'cnn', 'sucks', 'man'
    ]
Example #3
    def tokenize(self):
        word_tokenize = CrazyTokenizer(twitter_handles='split',
                                       hashtags='split',
                                       decontract=True)

        return self.__dataframe['preprocessed_text'] \
                .apply(lambda text: word_tokenize.tokenize(text))
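The tokenize method above relies on a private DataFrame attribute of its class; an equivalent standalone sketch over a plain pandas column is shown below (the redditscore.tokenizer import path and the sample data are assumptions).

import pandas as pd
from redditscore.tokenizer import CrazyTokenizer  # assumed import path

df = pd.DataFrame({'preprocessed_text':
                   ["@RealDonaldTrump can't stop tweeting #MakeAmericaGreatAgain"]})
word_tokenize = CrazyTokenizer(twitter_handles='split',
                               hashtags='split',
                               decontract=True)
tokens = df['preprocessed_text'].apply(word_tokenize.tokenize)
print(tokens.iloc[0])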
Example #4
def test_url_fast_unwrapping():
    tokenizer = CrazyTokenizer(urls='domain_unwrap_fast')
    tokens = tokenizer.tokenize(short_url_text)
    assert tokens == [
        'jobs', 'jobs', 'jobs', 'unemployment', 'claims', 'have', 'fallen',
        'to', 'a', '45-year', 'low', 'bloomberg'
    ]
Example #5
def test_splithashtags():
    tokenizer = CrazyTokenizer(splithashtags=True, hashtags=False)
    tokens = tokenizer.tokenize(hashtag_text)
    assert tokens == [
        'make', 'america', 'great', 'again', 'make', 'russia', 'drunk',
        'again', 'maga'
    ]
Example #6
def test_tokenizing():
    tokenizer = CrazyTokenizer(
        lowercase=True,
        keepcaps=True,
        normalize=3,
        ignorequotes=True,
        ignorestopwords=['is', 'are', 'am', 'not', 'a', 'the'],
        stem=False,
        removepunct=True,
        removebreaks=True,
        remove_nonunicode=False,
        decontract=False,
        splithashtags=True,
        twitter_handles='TOKENTWITTERHANDLE',
        urls='',
        hashtags=False,
        numbers=False,
        subreddits='TOKENSUBREDDIT',
        reddit_usernames='TOKENREDDITOR',
        emails='TOKENEMAIL',
        extra_patterns=None,
        pos_emojis=True,
        neg_emojis=None,
        neutral_emojis=None)

    tokens = tokenizer.tokenize(story_of_my_life)
    correct_answer = [
        'hi', 'my', 'name', 'TOKENTWITTERHANDLE', 'I', 'looove', 'beer',
        'plato', 'once', 'said', 'bad', 'way', 'to', 'phrase', 'it', 'another',
        'pint', 'please', 'by', 'way', 'do', "n't", 'forget', 'to', 'visit',
        'I', "'m", 'also', 'on', 'reddit', 'as', 'TOKENREDDITOR', 'I',
        'especially', 'love', 'TOKENSUBREDDIT', 'sending', 'my', 'love', 'to',
        'you', 'as', 'they', 'say', 'POS_EMOJI', '24'
    ]
    assert tokens == correct_answer
Example #7
def test_decontract():
    tokenizer = CrazyTokenizer(decontract=True)
    tokens = tokenizer.tokenize(decontract_text)
    assert tokens == [
        'i', 'have', 'been', 'waiting', 'to', 'drink', 'this', 'beer', 'i',
        'will', 'not', 'give', 'it', 'to', 'you'
    ]
Example #8
def process_data(tweet):
    # Remove digits, strip named entities (and their @/# prefixes), then tokenize.
    num_tweet = re.sub(r"\d+", "", tweet)
    name_tweet = ''
    for sent in nltk.sent_tokenize(num_tweet):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            try:
                if chunk.label() in ('PERSON','ORGANIZATION'):
                    if name_tweet[-1:] in ('#', '@'):
                        name_tweet = name_tweet[:-1]
                    else:
                        pass
                else :
                    for c in chunk.leaves():
                        name_tweet = name_tweet + ' ' + str(c[0])
            except AttributeError:
                if ((name_tweet[-1:] in punctuation
                     and name_tweet[-1:] not in ('!', '?', '.', '&'))
                        or (str(chunk[0]) in punctuation
                            and str(chunk[0]) != '&')):
                    name_tweet = name_tweet + str(chunk[0])
                else :
                    name_tweet = name_tweet + ' ' + str(chunk[0])
    stopWords = stopwords.words('english')
    stopWords.extend(['$', 'trump', 'warren', 'sen.', 'senator', 'mayor',
                      'president', 'kamala', 'harris', 'silent', 'deleted',
                      'sanders', 'berniesanders', 'ami', 'klobuchar', 'pete',
                      'beto', "o'rourke"])
    tokenizer = CrazyTokenizer(normalize=2,
                               hashtags=False,
                               remove_punct=True,
                               decontract=True,
                               latin_chars_fix=True,
                               ignore_stopwords=stopWords,
                               ignore_quotes=True,
                               remove_nonunicode=True,
                               twitter_handles='ANOTHER_TWITTER_USER',
                               urls='URL',
                               pos_emojis=True,
                               neg_emojis=True,
                               neutral_emojis=True)
    token_tweet = tokenizer.tokenize(name_tweet)
    clean_tweet = [word.strip() for word in token_tweet if len(word)>1]
    return clean_tweet
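A hedged call sketch for process_data: it assumes the surrounding imports (re, nltk with its tokenizer/tagger/chunker and stopwords data, string.punctuation bound to punctuation, and CrazyTokenizer from redditscore.tokenizer), none of which appear in the snippet.

import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from redditscore.tokenizer import CrazyTokenizer  # assumed import path

# One-time NLTK data downloads used by sent_tokenize / pos_tag / ne_chunk.
for pkg in ('punkt', 'averaged_perceptron_tagger',
            'maxent_ne_chunker', 'words', 'stopwords'):
    nltk.download(pkg, quiet=True)

print(process_data("@SomeUser Senator Warren rallied in Iowa today! 😀"))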
Example #9
def test_removepunct():
    tokenizer = CrazyTokenizer(remove_punct=True)
    tokens = tokenizer.tokenize(punct_text)
    print(tokens)
    assert tokens == ['this', 'is', 'the', 'text', 'which', 'contains', 'a',
                      'lot', 'of', 'punctuation', 'amazing', "is", "n't",
                      'it', 'who', 'knows']
Example #10
def test_keep_untokenized():
    tokenizer = CrazyTokenizer(
        keep_untokenized=['New York City', 'Los Angeles'])
    tokens = tokenizer.tokenize(untokenized_text)
    assert tokens == [
        'rats', 'are', 'actually', 'more', 'polite', 'in', 'new_york_city',
        'than', 'in', 'los_angeles'
    ]
Example #11
def test_extra_patterns():
    tokenizer = CrazyTokenizer(extra_patterns=[
        ('zagovor',
         re.compile('([S,s]partak|[S,s]paratka|[S,s]partalke)'),
         'GAZPROM')])
    tokens = tokenizer.tokenize(spartak_text)
    assert tokens == [
        'GAZPROM', 'is', 'a', 'champion', 'GAZPROM', 'is', 'the', 'best'
    ]
Example #12
def test_replacement():
    tokenizer = CrazyTokenizer(twitter_handles='handle', urls='url', hashtags='hashtag',
                               numbers='number', subreddits='subreddit', reddit_usernames='redditor',
                               emails='email')
    tokens = tokenizer.tokenize(replacement_text)
    assert tokens == ['url', 'is', 'number', 'number',
                      'site', 'according', 'to', 'handle', 'url']
    tokens = tokenizer.tokenize(replacement_text2)
    assert tokens == ['email', 'was', 'hacked', 'by',
                      'redditor', 'from', 'subreddit', 'hashtag']
Example #13
def process_data(tweet):

    letterText = re.sub(r"\d+", "", tweet)
    stopWords = list(STOPWORDS)
    stopWords.extend([
        'demdebate', 're', 'campaign', 'senator', 'sen', 'mayor', 'president',
        'trump', 'RT', 'bernie', 'warren', 'kamala', 'buttigieg', 'castro',
        'beto', 'klobuchar', 'joe', 'rogan', 'elizabeth', 'sander', 'sanders',
        'candidate', 'utm', 'source', 'harris', 'biden', 'debate', 'people'
    ])

    name_tweet = ""
    for sent in nltk.sent_tokenize(letterText):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            try:
                if chunk.label() in ('PERSON', 'ORGANIZATION'):
                    if name_tweet[-1:] in ('@', '#') or str(
                            chunk[0]) == 'GOPDebates':
                        name_tweet = name_tweet[:-1]
                    else:
                        pass
                else:
                    for c in chunk.leaves():
                        name_tweet = name_tweet + ' ' + str(c[0])
            except AttributeError:
                if (name_tweet[-1:] in punctuation
                        and name_tweet[-1:] not in ('!', '?', '.', '&')) or (
                            str(chunk[0]) in punctuation
                            and str(chunk[0]) not in ('&', '#', '@')):
                    name_tweet = name_tweet + str(chunk[0])
                else:
                    name_tweet = name_tweet + ' ' + str(chunk[0])
    url_tweet = replaceURL(name_tweet)
    per_str = re.sub(r"[^a-zA-Z0-9 @]", ' ', url_tweet)
    tokenizer = CrazyTokenizer(normalize=2,
                               hashtags='',
                               remove_punct=True,
                               decontract=True,
                               latin_chars_fix=True,
                               ignore_stopwords=stopWords,
                               ignore_quotes=True,
                               remove_nonunicode=True,
                               twitter_handles='',
                               urls='URL',
                               pos_emojis=True,
                               neg_emojis=True,
                               neutral_emojis=True)
    token_tweet = tokenizer.tokenize(per_str)

    clean_tweet = [word.strip() for word in token_tweet if len(word) > 1]
    return " ".join(clean_tweet)
Example #14
def process_data(tweet):

    letterText = re.sub(r"\d+", "", tweet)
    stopWords = list(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    stopWords.extend(
        ['senator', 'sen.', 'mayor', 'president', 'trump', 'RT', 'bernie'])

    name_tweet = ""
    for sent in nltk.sent_tokenize(letterText):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            try:
                if chunk.label() in ('PERSON', 'ORGANIZATION'):
                    if name_tweet[-1:] in ('@', '#') or str(
                            chunk[0]) == 'GOPDebates':
                        name_tweet = name_tweet[:-1]
                    else:
                        pass
                else:
                    for c in chunk.leaves():
                        name_tweet = name_tweet + ' ' + str(c[0])
            except AttributeError:
                if (name_tweet[-1:] in punctuation
                        and name_tweet[-1:] not in ('!', '?', '.', '&')) or (
                            str(chunk[0]) in punctuation
                            and str(chunk[0]) not in ('&', '#', '@')):
                    name_tweet = name_tweet + str(chunk[0])
                else:
                    name_tweet = name_tweet + ' ' + str(chunk[0])

    tokenizer = CrazyTokenizer(normalize=2,
                               hashtags='',
                               remove_punct=True,
                               decontract=True,
                               latin_chars_fix=True,
                               ignore_stopwords=stopWords,
                               ignore_quotes=True,
                               remove_nonunicode=True,
                               twitter_handles='',
                               urls='URL',
                               pos_emojis=True,
                               neg_emojis=True,
                               neutral_emojis=True)
    token_tweet = tokenizer.tokenize(name_tweet)
    clean_tweet = [
        stemmer.stem(word.strip()) for word in token_tweet if len(word) > 1
    ]
    return " ".join(clean_tweet)
Example #15
def tokenize(partition):
    partition_name = "{}-{}-{}".format(partition["tw_year"].iloc[0],
                                       partition["tw_month"].iloc[0],
                                       partition["tw_day"].iloc[0])
    start = timer()
    print("Begining Tokenization: {}".format(partition_name))
    tokenizer = CrazyTokenizer(extra_patterns=PATTERNS,
                               lowercase=True,
                               normalize=3,
                               ignore_quotes=False,
                               ignore_stopwords=True,
                               stem="lemm",
                               remove_punct=True,
                               remove_numbers=True,
                               remove_breaks=True,
                               decontract=True,
                               hashtags="split",
                               twitter_handles='',
                               urls=False)
    partition["tokens"] = partition["full_text"].apply(tokenizer.tokenize)
    table = pa.Table.from_pandas(partition)
    pq.write_to_dataset(table,
                        root_path=OUTPUT_DIR,
                        partition_cols=['tw_year', 'tw_month', 'tw_day'])
    end = timer()
    print("Tokenization Finished for {}. Took {} seconds.".format(
        partition_name, end - start))
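A sketch of how tokenize might be driven, assuming pandas, pyarrow, timeit.default_timer as timer, the redditscore.tokenizer import, and module-level PATTERNS / OUTPUT_DIR constants (the values below are placeholders, not the original project's).

import re
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from timeit import default_timer as timer
from redditscore.tokenizer import CrazyTokenizer  # assumed import path

# Placeholder constants referenced inside tokenize(); the real project
# defines its own extra patterns and output directory.
PATTERNS = [('laugh', re.compile(r'\b(lol|lmao)\b'), 'LAUGHTER')]
OUTPUT_DIR = 'tokenized_tweets/'

partition = pd.DataFrame({'tw_year': [2019], 'tw_month': [7], 'tw_day': [4],
                          'full_text': ["lol can't stop watching #WorldCup"]})
tokenize(partition)  # writes a partitioned Parquet dataset under OUTPUT_DIR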
Example #16
def test_lowercase_keepcaps():
    tokenizer = CrazyTokenizer(lowercase=True, keepcaps=True)
    tokens = tokenizer.tokenize(vova_text)
    assert tokens == ['vladimir', 'putin',
                      'is', 'the', 'BEST', 'AND', 'AMAZING']
    tokenizer = CrazyTokenizer(lowercase=True, keepcaps=False)
    tokens = tokenizer.tokenize(vova_text)
    assert tokens == ['vladimir', 'putin',
                      'is', 'the', 'best', 'and', 'amazing']
    tokenizer = CrazyTokenizer(lowercase=False, keepcaps=False)
    tokens = tokenizer.tokenize(vova_text)
    assert tokens == ['Vladimir', 'Putin',
                      'is', 'the', 'BEST', 'AND', 'AMAZING']
Example #17
def tokenize_reddit(comments_directory, output_directory,
                    subsample=100000, val_size=0.1, test_size=0.1, random_state=24):
    csv_files = glob.glob(osp.join(comments_directory, '*.csv'))
    df_comments = pd.concat((pd.read_csv(csv_file, lineterminator='\n', usecols=[
        'id', 'body', 'subreddit', 'created_utc']) for csv_file in csv_files))

    df_comments.drop_duplicates('id', inplace=True)
    df_comments['created_utc'] = pd.to_datetime(
        df_comments['created_utc'], unit='s')

    df_comments = df_comments.sample(frac=1.0, random_state=random_state)
    df_comments = df_comments.groupby('subreddit').head(subsample)

    tokenizer = CrazyTokenizer(
        keepcaps=False,
        decontract=True,
        ignore_stopwords='english',
        subreddits='',
        reddit_usernames='',
        numbers='',
        emails='',
        urls='')

    tokens = []
    for i in tqdm(range(df_comments.shape[0])):
        current_tokens = tokenizer.tokenize(
            df_comments.iloc[i, df_comments.columns.get_loc('body')])
        tokens.append(current_tokens)
    df_comments['tokens'] = tokens
    del tokens

    df_train_val, df_test = train_test_split(
        df_comments, test_size=test_size, random_state=random_state, shuffle=True)
    df_train, df_val = train_test_split(
        df_train_val, test_size=val_size, random_state=random_state, shuffle=True)

    df_train = df_train.loc[df_train.tokens.str.len() > 0]
    df_val = df_val.loc[df_val.tokens.str.len() > 0]
    df_test = df_test.loc[df_test.tokens.str.len() > 0]

    df_train.to_csv(osp.join(output_directory, 'reddit_train.csv'), index=False)
    df_val.to_csv(osp.join(output_directory, 'reddit_val.csv'), index=False)
    df_test.to_csv(osp.join(output_directory, 'reddit_test.csv'), index=False)
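A hedged invocation sketch for tokenize_reddit; the directory names are hypothetical and the imports below (glob, os.path, pandas, tqdm, scikit-learn, redditscore) are assumptions about the snippet's context.

import glob
import os.path as osp
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from redditscore.tokenizer import CrazyTokenizer  # assumed import path

# Hypothetical paths: the input folder holds CSV dumps with 'id', 'body',
# 'subreddit', and 'created_utc' columns; splits go to the output folder.
tokenize_reddit('data/reddit_comments/', 'data/tokenized/', subsample=50000)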
Example #18
def tokenize_image_titles(
    data_path: str,
    offset: int = 0,
    limit: int = None,
    invalidate_cache: bool = False,
    debug_info: bool = False,
) -> None:
    article_paths = [
        join(data_path, f) 
        for f in listdir(data_path) if isdir(join(data_path, f))
    ]
    
    valid_limit = _validated_limit(limit, offset, len(article_paths))
    tokenizer = CrazyTokenizer(hashtags='split')
    mapper = str.maketrans({x: '' for x in string.punctuation})
    regex = re.compile(r'(\d+)')

    for i in range(offset, offset + valid_limit):
        path = article_paths[i]
        if debug_info:
            print(i, path)
    
        meta_path = join(path, 'img/', 'meta.json')
        meta_arr = _getJSON(meta_path)['img_meta']
        for meta in meta_arr:
            if 'parsed_title' in meta and not invalidate_cache:
                continue
                
            filename = os.path.splitext(meta['title'])[0]
            sentence = filename.translate(mapper)
            sentence = regex.sub(r' \g<1> ', sentence)

            tokens = []
            for word in sentence.split():
                tokens += (
                    tokenizer.tokenize("#" + word) 
                    if not word.isdigit() 
                    else [word]
                )
            
            meta['parsed_title'] = " ".join(tokens)
                
        _dump(meta_path, {"img_meta": meta_arr})
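A minimal call sketch for tokenize_image_titles, assuming the module's private helpers (_validated_limit, _getJSON, _dump) and its imports exist as elsewhere in that project. The data root is hypothetical: it should contain one folder per article with an img/meta.json file whose 'img_meta' list holds entries with a 'title' key.

tokenize_image_titles('data/articles/', offset=0, limit=10, debug_info=True)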
Example #19
def test_normalize():
    tokenizer = CrazyTokenizer(normalize=3)
    tokens = tokenizer.tokenize(norm_text)
    assert tokens == ['eeeboy', 'this', 'shiiit', 'is', 'good']
    tokenizer = CrazyTokenizer(normalize=2)
    tokens = tokenizer.tokenize(norm_text)
    assert tokens == ['eeboy', 'this', 'shiit', 'is', 'good']
Example #20
def test_stop():
    tokenizer = CrazyTokenizer(ignore_stopwords=[
                               'vladimir', 'putin', 'and'], lowercase=False)
    tokens = tokenizer.tokenize(vova_text)
    assert tokens == ['is', 'the', 'BEST', 'AMAZING']
    tokenizer = CrazyTokenizer(ignore_stopwords=True)
    tokens = tokenizer.tokenize(english_stop)
    assert tokens == []
Example #21
def tokenize_tweets(tweets_file, output_file):
    tweets = pd.read_csv(tweets_file, parse_dates=[
                         'created_at'], lineterminator='\n')
    tweets['id'] = pd.to_numeric(tweets['id'])
    tweets.drop_duplicates('id', inplace=True)

    tokenizer = CrazyTokenizer(
        keepcaps=False,
        decontract=True,
        ignore_stopwords='english',
        twitter_handles='realname',
        hashtags='split',
        numbers='',
        emails='',
        urls='')

    tokens = []
    for i in tqdm(range(tweets.shape[0])):
        current_tokens = tokenizer.tokenize(
            tweets.iloc[i, tweets.columns.get_loc('text')])
        tokens.append(current_tokens)
    tweets['tokens'] = tokens

    tweets.to_csv(output_file, index=False)
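A hedged usage sketch for tokenize_tweets; the file names are placeholders and the imports (pandas, tqdm, redditscore.tokenizer) are assumptions.

import pandas as pd
from tqdm import tqdm
from redditscore.tokenizer import CrazyTokenizer  # assumed import path

# The input CSV must provide 'id', 'text', and 'created_at' columns,
# as read inside tokenize_tweets.
tokenize_tweets('tweets_raw.csv', 'tweets_tokenized.csv')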
Example #22
def test_annoying_case():
    tokenizer = CrazyTokenizer()
    tokens = tokenizer.tokenize(annoying_case)
    assert tokens == ['b', '@realdonaldtrump', '@crazyfrogspb',
                      '*****@*****.**', '#maga',
                      '#russiago', 'http://fscorelab.ru/overview#scoring']
    tokenizer = CrazyTokenizer(emails='EMAIL', twitter_handles='HANDLE',
                               urls='domain', hashtags='split')
    tokens = tokenizer.tokenize(annoying_case)
    assert tokens == ['b', 'HANDLE', 'HANDLE', 'EMAIL', 'maga', 'russia', 'go',
                      'fscorelab']
Example #23
def test_ngrams():
    tokenizer = CrazyTokenizer(ngrams=2)
    tokens = tokenizer.tokenize(ngrams_text)
    assert tokens == ['we', 'need', 'more', 'tokens',
                      'we_need', 'need_more', 'more_tokens']
Example #24
def splitHashtags(sentence):
    tokenizer = CrazyTokenizer(hashtags='split')
    sentence = tokenizer.tokenize(sentence)
    return ' '.join(sentence)
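A quick hedged demo of splitHashtags (the import path is assumed, and the exact segmentation depends on the tokenizer's word-segmentation data):

from redditscore.tokenizer import CrazyTokenizer  # assumed import path

print(splitHashtags('#MakeAmericaGreatAgain #FridayFeeling'))
# roughly: 'make america great again friday feeling' (segmentation may vary)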
Example #25
def test_ignorequotes():
    tokenizer = CrazyTokenizer(ignore_quotes=True, remove_punct=True)
    tokens = tokenizer.tokenize(quotes_text)
    assert tokens == ['said', 'no', 'one', 'ever']
Example #26
def test_repeated():
    tokenizer = CrazyTokenizer(
        pos_emojis=True, neg_emojis=True, neutral_emojis=True)
    for i in range(100):
        tokenizer.tokenize(trump_rant)
Example #27
def test_emoji():
    tokenizer = CrazyTokenizer(
        pos_emojis=True, neg_emojis=True, neutral_emojis=True)
    tokens = tokenizer.tokenize(doc_emoji)
    assert tokens == ['POS_EMOJI', 'NEG_EMOJI', 'NEG_EMOJI']
Example #28
def test_handles_split():
    tokenizer = CrazyTokenizer(twitter_handles='split')
    tokens = tokenizer.tokenize(splithandle_text)
    assert tokens == ['real', 'donald', 'trump', 'loves', 'breitbart', 'news']
Example #29
def test_realname():
    tokenizer = CrazyTokenizer(hashtags='split', twitter_handles='realname')
    tokens = tokenizer.tokenize(realname_text)
    assert tokens == ['donald', 'j.', 'trump', 'please', 'make', 'america',
                      'great', 'again']
Example #30
def test_hex():
    tokenizer = CrazyTokenizer(latin_chars_fix=True)
    tokens = tokenizer.tokenize(hex_text)
    assert tokens == ['i', "m", 'so', 'annoyed', 'by', 'these', 'characters',
                      '😢']