Ejemplo n.º 1
0
def test_main():
    """End-to-end check of preprocessing.main() on a mix of raw tweets.

    Covers RT markers, duplicates, unicode emoji, @mentions, contractions,
    URLs, upper-casing, newlines, hashtags, digits, hyphenated words, ascii
    emoticons, punctuation stripping, and repeated-letter compression.
    """
    after_labelled_text_list = [
        u'RT bitch',  # RT marker
        u'RT bitch',  # duplicate of the previous entry
        u'I will kill you \u203c\ufe0f \ud83d\ude14',  # unicode emoji
        u'@wojespn: Bitches think cause you Avoid drama you pussy',  # @mention
        u'SpaceJam 2 ain\u2019t going to record itself',  # "ain't" contraction
        u"I'm not an easy man",  # "I'm" contraction
        u'keeps creating tsunamis https://t.\u2026',  # truncated url & \u2026
        u'PLEASE, HELP ME! https://t.co/Obor3uOyJU',  # url
        u'JUST WANT TO BE LOUD!!!',  # upper-case letters
        u'like you all arent tired of me',  # "arent" without apostrophe
        u'test \n\n',  # newlines
        u'I SAW #GOT7',  # hashtag
        u'numbers here: 123',  # digits
        u"I am a-word-with-puc two-link",  # hyphenated words
        u'string emoji :)',  # ascii emoticon
        # NOTE: '\\/' was originally written '\/', an invalid escape sequence
        # (SyntaxWarning on modern Python); the runtime value is unchanged.
        u'drop punctuations :!@#$%^&*()_+[]|\\/:><.,;',  # punctuation
        u'repeatitive words appppppple goood',  # repeated letters
    ]
    test_instance = preprocessing(after_labelled_text_list)
    test_text_list = test_instance.main()

    assert test_text_list == [
        ['bitch'],
        ['kill'],
        ['avoid', 'bitch', 'caus', 'drama', 'pussi', 'think'],
        ['go', 'record', 'spacejam'],
        ['easi', 'im', 'man'],
        ['creat', 'keep', 'tsunami'],
        ['help', 'pleas'],
        ['loud', 'want'],
        ['like', 'tire'],
        ['test'],
        ['got', 'saw'],
        ['number'],
        ['awordwithpuc', 'twolink'],
        ['emoji', 'string'],
        ['drop', 'punctuat'],
        ['appl', 'good', 'repeatit', 'word'],
    ]
Ejemplo n.º 2
0
def test_lemmatizing():
    """Lemmatizing tokens already in base/surface form leaves them unchanged."""
    tokens = [[u'dream', u'nasty', u'hopefully', u'better'],
              [u'rounded', u'ambiguous']]
    instance = preprocessing(tokens)
    lemmatized = instance.lemmatizing(instance.text_list)
    expected = [[u'dream', u'nasty', u'hopefully', u'better'],
                [u'rounded', u'ambiguous']]
    assert lemmatized == expected
Ejemplo n.º 3
0
def test_stemming():
    """Stemming reduces tokens to their stems (e.g. 'nasty' -> 'nasti')."""
    tokens = [[u'dream', u'nasty', u'hopefully', u'better'],
              [u'rounded', u'ambiguous']]
    instance = preprocessing(tokens)
    stemmed = instance.stemming(instance.text_list)
    expected = [[u'dream', u'nasti', u'hope', u'better'],
                [u'round', u'ambigu']]
    assert stemmed == expected
Ejemplo n.º 4
0
def test_drop_duplicates():
    """Rows containing the same tokens (order-insensitive) collapse to one."""
    rows = [
        [u'dream', u'land'],
        [u'land', u'dream'],
        [u'land', u'dream'],
    ]
    instance = preprocessing(rows)
    deduped = instance.drop_duplicates(instance.text_list)
    assert deduped == [[u'dream', u'land']]
Ejemplo n.º 5
0
def test_tokenize_and_stop_word_filter():
    """Tokenizing drops empty strings, stop words, and a bare emoji entry."""
    raw_texts = [
        u'', u' so what', u'test html', u'numbers here ', u'string emoji ',
        u'test  ', u'drop punctuations', u'\U0001f612'
    ]
    instance = preprocessing(raw_texts)
    tokenized = instance.tokenize_and_stop_word_filter(instance.text_list)

    expected = [
        [u'test', u'html'],
        [u'numbers'],
        [u'string', u'emoji'],
        [u'test'],
        [u'drop', u'punctuations'],
    ]
    assert tokenized == expected
Ejemplo n.º 6
0
def test_remove_punc_and_symbols():
    """remove_punc_and_symbols strips urls, @mentions, html tags, digits,
    ascii emoticons, newlines, and punctuation/symbol characters."""
    after_labelled_text_list = [
        u'https://t.co/Obor3uOyJU',  # url
        u'@aabb_c so what',  # @mention
        u'<a>test html</a>',  # html tags
        u'numbers here 123',  # digits
        u'string emoji:):-( :)',  # ascii emoticons
        u'test\n\n',  # newlines
        # NOTE: '\\/' was originally written '\/', an invalid escape sequence
        # (SyntaxWarning on modern Python); the runtime value is unchanged.
        u'drop punctuations:!@#$%^&*()_-+[]|\\/:><.,;',  # punctuation
    ]
    test_instance = preprocessing(after_labelled_text_list)
    test_text_list = test_instance.remove_punc_and_symbols(
        test_instance.text_list)
    assert test_text_list == [
        u'', u' so what', u'test html', u'numbers here ', u'string emoji ',
        u'test  ', u'drop punctuations'
    ]
# # --- start with using SemEval2017-task4-test-dataset --- #
# url = '/Users/yibingyang/Documents/final_thesis_project/Data/Twitter/test_dataset/'
# filename = 'SemEval2017-task4-test-dataset.txt'
# tweets, labels = read_and_cleaning(url, file)
#
# # --- get only posi and nega tweets and labels --- #
# posi_tweets, nega_tweets = get_only_posi_nega_tweets_and_lables(tweets, labels)

# # --- preprocessing --- #
# posi_tweets = preprocessing(posi_tweets).main()
# nega_tweets = preprocessing(nega_tweets).main()
#
# # --- save the result to file --- #
# save_to_file(posi_tweets, '/Users/yibingyang/Documents/thesis_project_new/Data/Twitter/test_dataset/positive_test_tweets_after_preprocessing.txt')
# save_to_file(nega_tweets, '/Users/yibingyang/Documents/thesis_project_new/Data/Twitter/test_dataset/negative_test_tweets_after_preprocessing.txt')


# --- start with yelp reviews --- #
# Load the raw yelp review CSV and split it into positive / negative sets.
# NOTE(review): absolute, machine-specific paths are hard-coded below —
# presumably the author's local layout; confirm before running elsewhere.
url = '/Users/yibingyang/Documents/thesis_project_new/Data/E-Commerce/raw_data/'
filename = 'yelp-reviews.csv'
posi_tweets, nega_tweets = read_and_cleaning_via_df(url, filename)

# --- preprocessing --- #
# Run the full preprocessing pipeline (main()) on each sentiment class.
posi_tweets = preprocessing(posi_tweets).main()
nega_tweets = preprocessing(nega_tweets).main()

# --- save the result to file --- #
# Persist the cleaned token lists for later model training.
save_to_file(posi_tweets, '/Users/yibingyang/Documents/thesis_project_new/Data/E-Commerce/after_preprocessing/yelp_posi_after_preprocessing.txt')
save_to_file(nega_tweets, '/Users/yibingyang/Documents/thesis_project_new/Data/E-Commerce/after_preprocessing/yelp_nega_after_preprocessing.txt')

Ejemplo n.º 8
0
def test_expand_contractions():
    """expand_contractions rewrites "I'm" as "I am"."""
    raw_texts = [u"I'm not an easy man"]
    instance = preprocessing(raw_texts)
    expanded = instance.expand_contractions(instance.text_list)
    assert expanded == [u"I am not an easy man"]
Ejemplo n.º 9
0
def test_convert_punc():
    """Unicode punctuation (right quote, hyphen, en-space) maps to ascii."""
    raw_texts = [u'I ain\u2019t going to record\u2010\u2002']
    instance = preprocessing(raw_texts)
    converted = instance.convert_punc(instance.text_list)
    assert converted == [u"I ain't going to record- "]
Ejemplo n.º 10
0
def test_lowercase():
    """lowercase() down-cases letters and leaves punctuation untouched."""
    raw_texts = [u'JUST WANT TO BE LOUD!!!']
    instance = preprocessing(raw_texts)
    lowered = instance.lowercase(instance.text_list)
    assert lowered == [u'just want to be loud!!!']
Ejemplo n.º 11
0
def test_compressed_repetitive_words():
    """Runs of 3+ repeated letters are compressed to two; shorter runs stay."""
    tokens = [[u'soooo', u'gooood'], [u'ook', u'ooook']]
    instance = preprocessing(tokens)
    compressed = instance.compressed_repetitive_words(instance.text_list)
    assert compressed == [[u'soo', u'good'], [u'ook', u'ook']]