def __init__(self, N, file_name):

        self.dic_ngram = {}
        self.total_count = 0
        self.N = N

        self.cf = TextCleaner(file_name)
Example #2
def test_words():
    """test spliting a sentence into words"""
    text = TextCleaner()
    sentence = "a necro philiac \"tim burton's corpse bride\""
    word_list = text.words(sentence)
    assert word_list == ['a', 'necro', 'philiac',
                         'tim', "burton's", 'corpse', 'bride']
def test_clean_df_multilingual():
    input_df = pd.DataFrame(
        {
            "input_text": [
                "I did a 10k run this morning at 6h34 follow me @superRunnerdu95 didn't I?",
                "Nous cherchâmes des informations sur https://www.google.com/ le 03/11/2046 l'aventures",
                "#Barcelona Fútbol es la vida [email protected] ℌ ①",
            ],
            "language": ["en", "fr", "es"],
        }
    )
    token_filters = {"is_stop", "is_measure", "is_datetime", "like_url", "like_email", "is_username", "is_hashtag"}
    text_cleaner = TextCleaner(
        tokenizer=MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path),
        token_filters=token_filters,
        lemmatization=True,
        lowercase=False,
        unicode_normalization=UnicodeNormalization.NFKD,
    )
    output_df = text_cleaner.clean_df(df=input_df, text_column="input_text", language_column="language")
    cleaned_text_column = list(text_cleaner.output_column_descriptions.keys())[0]
    cleaned_texts = output_df[cleaned_text_column].values.tolist()
    expected_cleaned_texts = [
        "run morning follow not ?",
        "chercher information aventurer",
        "Fútbol vida H 1",
    ]
    assert cleaned_texts == expected_cleaned_texts
Example #4
def main(filename):
    tc = TextCleaner("corpse_bride.txt")
    list_of_sentences = tc.read_file()

    RANK = 10

    unigram = NgramFrequencies(RANK)
    bigram = NgramFrequencies(RANK)
    trigram = NgramFrequencies(RANK)

    for sentence in list_of_sentences:
        words = sentence.split()
        for i in range(len(words)):
            unigram.add_item(words[i])
            if i < len(words) - 1:
                bigram.add_item(words[i] + "_" + words[i + 1])
            if i < len(words) - 2:
                trigram.add_item(words[i] + "_" + words[i + 1] + "_" +
                                 words[i + 2])

    print("Top 10 unigrams:")
    print(unigram.top_n_freqs())
    print("Top 10 bigrams:")
    print(bigram.top_n_freqs())
    print("Top 10 trigrams:")
    print(trigram.top_n_freqs())
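
The main() above assumes an NgramFrequencies variant that takes only a rank and reports its top entries through a no-argument top_n_freqs(), unlike the fuller class shown later. A minimal sketch consistent with that usage (an assumption, not the original class) might be:

class NgramFrequencies:
    """Minimal sketch: count ngrams and report the top `rank` frequencies."""

    def __init__(self, rank):
        self.rank = rank   # how many top entries top_n_freqs() reports
        self.counts = {}   # underscore-joined ngram -> occurrence count
        self.total = 0     # total ngrams seen so far

    def add_item(self, ngram):
        # Count one occurrence of the given ngram string.
        self.counts[ngram] = self.counts.get(ngram, 0) + 1
        self.total += 1

    def top_n_freqs(self):
        # Relative frequencies of the `rank` most common ngrams,
        # most frequent first.
        ranked = sorted(self.counts.items(), key=lambda kv: kv[1],
                        reverse=True)
        return [(ngram, round(count / self.total, 3))
                for ngram, count in ranked[:self.rank]]
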
Example #5
def test_clean_text():
    """test text_clean"""
    text_cleaner = TextCleaner("corpse_bride.txt")
    text_cleaner.text = "Hi, Mr.Lee -went to the park. Let's go"
    text_cleaner.clean_text()
    assert text_cleaner.sentence == [
        "hi COMMA mr lee went to the park", " let's go"
    ]
def test_clean_df_english():
    input_df = pd.DataFrame({"input_text": ["Hi, I have two apples costing 3$ 😂    \n and unicode has #snowpersons ☃"]})
    token_filters = {"is_punct", "is_stop", "like_num", "is_symbol", "is_currency", "is_emoji"}
    text_cleaner = TextCleaner(tokenizer=MultilingualTokenizer(), token_filters=token_filters, lemmatization=True)
    output_df = text_cleaner.clean_df(df=input_df, text_column="input_text", language="en")
    cleaned_text_column = list(text_cleaner.output_column_descriptions.keys())[0]
    cleaned_text = output_df[cleaned_text_column][0]
    expected_cleaned_text = "apple cost unicode #snowpersons"
    assert cleaned_text == expected_cleaned_text
Example #7
def test_open_file():
    """test open file"""
    text_cleaner = TextCleaner("corpse_bride.txt")
    with open("corpse_bride.txt") as text_f:
        text_f.readline()
        text_f.readline()
        text = text_f.read()
    assert text == text_cleaner.text
    text_cleaner = TextCleaner("corpse.txt")
Example #8
def test_pre_process():
    """test all preprocess"""
    text = TextCleaner()
    line = "\"Tim Burton's Corpse Bride\". Marks the Dr. Liu latest, venture."
    text.pre_process(line)
    assert text.word_list == [
        ["tim", "burton's", "corpse", "bride"],
        ["marks", "the", "drdot", "liu", "latest", "COMMA", "venture"],
        []
    ]
Example #9
def test_clean_file():
    """test the clean file method"""
    text = TextCleaner()
    f = open("test_file.txt")
    text.clean_file(f)
    assert text.text == [
        ['a', 'bunch', 'of', 'cute', 'and', 'spooky', 'animals', 'are',
         'dropping', 'by.'],
        ['pick', 'trick', 'or', 'treat.'],
        ['trick', 'COMMA', '', 'treat.'],
        ['by', 'mr', 'zeng', 'COMMA', 'mrs', 'liao', 'and', 'dr', 'zhang'],
    ]
Example #10
def main(filename):
    tc = TextCleaner(filename)
    word_list = tc.do_the_cleaning()
    ng_uni = NgramFrequencies(word_list, UNI_COUNT)
    ng_bi = NgramFrequencies(word_list, BI_COUNT)
    ng_tri = NgramFrequencies(word_list, TRI_COUNT)
    ng_uni.add_item()
    ng_bi.add_item()
    ng_tri.add_item()

    print('Top 10 unigrams:')
    print_output_ngram(ng_uni.top_n_freqs(10))
    print('Top 10 bigrams:')
    print_output_ngram(ng_bi.top_n_freqs(10))
    print('Top 10 trigrams:')
    print_output_ngram(ng_tri.top_n_freqs(10))
Example #11
def main():
    with open("corpse_bride.txt", "r") as f:
        content = f.read()

    word_lists = TextCleaner().clean(content)
    print_ngram(1, "unigrams", word_lists)
    print_ngram(2, "bigrams", word_lists)
    print_ngram(3, "trigrams", word_lists)
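
print_ngram is not defined in this snippet; a plausible sketch, assuming clean() returns a list of sentences where each sentence is a list of words, might be:

from collections import Counter


def print_ngram(n, label, word_lists):
    # Hypothetical helper: count underscore-joined n-grams sentence by
    # sentence and print the ten most frequent with relative frequencies.
    counts = Counter()
    for words in word_lists:
        for i in range(len(words) - n + 1):
            counts["_".join(words[i:i + n])] += 1
    total = sum(counts.values()) or 1
    print(f"Top 10 {label}:")
    for ngram, count in counts.most_common(10):
        print(f"  {ngram}: {round(count / total, 3)}")
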
Example #12
def main():
    clean_text = TextCleaner()

    try:
        f = open(sys.argv[1])
        clean_text.clean_file(f)
    except FileNotFoundError:
        print("Can't find", sys.argv[1])
        return

    text = clean_text.text

    # Report top ten unigrams by frequency
    unigram = NgramFrequencies()
    print("Top 10 unigram:")
    for line in text:
        for word in line:
            unigram.add_item(word)
    print_output(unigram.frequency(10))

    # Report top ten bigrams by frequency
    # if a word ends with ".", it cannot connect with the next word
    bigram = NgramFrequencies()
    print("Top 10 bigram:")
    for line in text:
        for i in range(len(line) - 1):
            if "." in line[i]:
                continue
            else:
                bi_pattern = line[i] + "_" + line[i + 1]
                bigram.add_item(bi_pattern)
    print_output(bigram.frequency(10))

    # Report top ten trigrams by frequency
    # if the word itself or the next word ends with ".",
    # the three words cannot form a trigram
    trigram = NgramFrequencies()
    print("Top 10 trigram:")
    for line in text:
        for j in range(len(line) - 2):
            if "." in line[j] or "." in line[j + 1]:
                continue
            else:
                tri_pattern = line[j] + "_" + line[j + 1] + "_" + line[j + 2]
                trigram.add_item(tri_pattern)
    print_output(trigram.frequency(10))
class NgramFrequencies:

    def __init__(self, N, file_name):

        self.dic_ngram = {}
        self.total_count = 0
        self.N = N

        self.cf = TextCleaner(file_name)

    def make_ngram(self):
        # Build each n-gram, join its words with underscores,
        # and add the resulting string to the dictionary.
        list_ngram = []

        for sentence in self.cf.open_file():
            for i in range((self.N - 1), len(sentence)):
                for j in range(self.N - 1, -1, -1):
                    list_ngram.append(sentence[i - j])
                list_to_str = '_'.join(map(str, list_ngram))
                self.add_item(list_to_str)
                list_ngram = []

    def add_item(self, ngram):
        # Take an ngram and increment its count in the dictionary.

        self.total_count += 1

        if ngram in self.dic_ngram:
            self.dic_ngram[ngram] += 1
        else:
            self.dic_ngram[ngram] = 1

    def top_n_counts(self, n):
        # Return the n most frequent ngrams as (ngram, count) pairs,
        # most frequent first.

        sorted_word = sorted(self.dic_ngram.items(),
                             key=lambda x: x[1],
                             reverse=True)
        return sorted_word[:n]

    def frequency(self):
        # Return a dictionary mapping each ngram to its relative frequency.

        freq_dic = {key: round(self.dic_ngram[key]/self.total_count, 3)
                    for key in self.dic_ngram}
        return freq_dic

    def top_n_freqs(self, n):
        # Return the n largest relative frequencies as (ngram, frequency) pairs.

        temp_dic = self.frequency()
        sorted_freq = sorted(temp_dic.items(),
                             key=lambda x: x[1], reverse=True)
        return sorted_freq[:n]
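
A short usage example for the class above, assuming "corpse_bride.txt" exists and TextCleaner.open_file() yields each sentence as a list of words:

# Count bigrams in the cleaned text, then report the ten most frequent
# ones, first as raw counts and then as relative frequencies.
bigrams = NgramFrequencies(2, "corpse_bride.txt")
bigrams.make_ngram()
print(bigrams.top_n_counts(10))   # e.g. [('of_the', 42), ...]
print(bigrams.top_n_freqs(10))    # e.g. [('of_the', 0.012), ...]
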
Example #14
def get_email_dict_array(self, clean=False):
    extension = self.file_name.split('.')[-1].strip()
    if extension == 'csv':
        data = self._read_in_csv(self.file_name)
    elif extension == 'xlsx':
        data = self._read_in_xlxs(self.file_name)
    else:
        print("Unsupported data format!")
        return []
    email_dict_array = []
    for row in data:
        subject = ' '.join(TextCleaner(row[3]).tokenize_str())
        body = ' '.join(TextCleaner(row[4]).tokenize_str())
        email_dict_array.append({
            'direction': row[1],
            'date': row[2],
            'subject': subject,
            'body': body
        })
    return email_dict_array
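
_read_in_csv and _read_in_xlxs are defined elsewhere in that codebase; a minimal standalone sketch of the CSV path, assuming each row comes back as a list of column strings, might be:

import csv


def read_in_csv(file_name):
    # Hypothetical reader matching the usage above: each row becomes a
    # list of column strings, so row[1] is the direction, row[2] the
    # date, row[3] the subject, and row[4] the body.
    with open(file_name, newline='') as f:
        return list(csv.reader(f))
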
Example #15
	def __init__(self, filename):
		# Member variables
		self.email_data = []
		self.lda = None
		self.feature_names = None
		self.num_topics = NUM_TOPICS
		self.num_words_per_topic = NUM_WORDS_PER_TOPIC
		self.num_features = NUM_FEATURES

		# Load emails from full path to file
		emails = EmailLoader(filename).get_email_dict_array()

		# Process emails into a list of email body contents
		for email_rec in emails:
			if email_rec['body']:
				# Clean the text and add to list
				cleaner = TextCleaner(email_rec['body'])

				self.email_data.append(" ".join(cleaner.tokenize_str()))
Example #16
def main():
    '''read the file and process it with TextCleaner and NgramFrequencies'''
    file_name = "corpse_bride.txt"
    global N  # so that N can be used in print_output()
    N = 10
    # open the file
    try:
        f = open(file_name, encoding="utf8")
    except Exception:
        print("Can't open corpse_bride.txt")
        return
    # handle TextCleaner class
    clean = TextCleaner(f)
    # handle NgramFrequencies class
    ngram = NgramFrequencies(clean.format())
    ngram.add_item()
    ngram.top_n_counts(N)
    ngram.frequency()
    # call print_output
    print_output(ngram.top_n_freqs(N))
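
print_output is left undefined in several of these examples; a simple sketch, assuming it receives an already-sorted list of (ngram, frequency) pairs, could be:

def print_output(freq_pairs):
    # Hypothetical helper: print each (ngram, frequency) pair on its own
    # line, in the order given (most frequent first).
    for ngram, freq in freq_pairs:
        print(f"  {ngram}: {freq}")
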
Example #17
def main():
    """collect n-gram frequencies and print the top 10 of eacy type out"""
    text_cleaner = TextCleaner("corpse_bride.txt")
    clean_text(text_cleaner)
    print("Top 10 unigrams: ")
    n_gram_1 = NgramFrequencies(text_cleaner.sentence, 1)
    top_n_count(n_gram_1)
    print("Top 10 bigrams: ")
    n_gram_2 = NgramFrequencies(text_cleaner.sentence, 2)
    top_n_count(n_gram_2)
    print("Top 10 trigrams: ")
    n_gram_3 = NgramFrequencies(text_cleaner.sentence, 3)
    top_n_count(n_gram_3)
    print("Check frequency for N-gram word:")
    gram_input = input("Enter unigram/bigram/trigram: ")
    if gram_input == "unigram":
        check_freq(n_gram_1)
    elif gram_input == "bigram":
        check_freq(n_gram_2)
    elif gram_input == "trigram":
        check_freq(n_gram_3)
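
clean_text, top_n_count, and check_freq are helpers defined elsewhere; hedged sketches of the last two, assuming this NgramFrequencies variant exposes top_n_counts(n) and frequency() like the class shown earlier, might be:

def top_n_count(n_gram):
    # Hypothetical helper: print the ten most frequent n-grams with
    # their raw counts.
    for ngram, count in n_gram.top_n_counts(10):
        print(f"  {ngram}: {count}")


def check_freq(n_gram):
    # Hypothetical helper: look up the relative frequency of a
    # user-supplied n-gram (words joined with "_"), defaulting to 0.
    key = input("Enter the n-gram, words joined by '_': ")
    print(n_gram.frequency().get(key, 0))
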
Example #18
def main(file_name):
    """Given the file name, print n-grams frequencies
    String -> None"""
    text = TextCleaner()
    ngrams = NgramFrequencies()
    text.read_file(file_name)
    for line in text.lines:
        text.pre_process(line)

    for word_per_list in text.word_list:
        ngrams.fill_in_dic(word_per_list)

    ngrams_list = [
        ngrams.unigrams_dic, ngrams.bigrams_dic, ngrams.trigrams_dic
    ]
    ngrams_name_list = ["unigrams", "bigrams", "trigrams"]
    for i in range(3):
        grams_top = ngrams.top_n_grams(ngrams_list[i], 10)
        print_output(grams_top, ngrams_name_list[i])
def test_constructor():
    '''Test the constructor'''
    clean_file = TextCleaner('test')
    assert clean_file.file_name == 'test'
def test_change_comma():
    '''Test the change comma method'''
    clean_file = TextCleaner("")
    actual_string = clean_file.change_comma("ab, cd,")
    assert actual_string == "ab COMMA cd COMMA"
def test_split_file():
    '''Test the split file method'''
    clean_file = TextCleaner("")
    actual_list = clean_file.split_file("mr. dr. ms. ab cd. ab ab.")
    assert actual_list == [['mr', 'dr', 'ms', 'ab', 'cd'], ['ab', 'ab']]
Example #22
def test_text_cleaner():
    """test the constructor"""
    text_cleaner = TextCleaner("corpse_bride.txt")
    assert text_cleaner.sentence == []
Example #23
def test_sentence():
    """test splitting a paragraph into sentence"""
    text = TextCleaner()
    line = "we are align students. we are fall students"
    sentence_list = text.sentence(line)
    assert sentence_list == ["we are align students", " we are fall students"]
Example #24
def test_constructor():
    """Test the constructor"""
    text = TextCleaner()
    assert text.new_line is None
    assert text.text == []
Example #25
def test_constructor():
    """test the constructor"""
    text = TextCleaner()
    assert text.lines == []
    assert text.word_list == []
Example #26
def cleaned_comments(subreddit):
    tc = TextCleaner()
    return [tc.clean_text(line).text for line in comments(subreddit)]
def test_delete_punctuation():
    '''Test the delete punctuation method'''
    clean_file = TextCleaner("")
    actual_string = clean_file.delet_punctuation("ab)($><")
    assert actual_string == "ab"
Example #28
from text_cleaner import TextCleaner
import sys


filename = sys.argv[2]
tc = TextCleaner(filename)


def test_deal_special_dot():
    assert tc.deal_special_dot("abcdmr.") == "abcdmr"
    assert tc.deal_special_dot("abcddr.abcd") == "abcddrabcd"


def test_deal_comma():
    assert tc.deal_comma("abc,de") == "abc COMMAde"


def test_deal_apostro():
    assert tc.deal_apostro("burton's") == "burtonAPOSs"


def test_split_sentence():
    assert tc.split_sentence("I am a girl. You are a boy.") == [
        "I am a girl", " You are a boy", ""]


def test_deal_punc():
    assert tc.deal_punc("A necro- philiac entertainment-for") == [
        "A", "necro-", "philiac", "entertainment-for"]

def test_open_file():
    '''Test the open file method'''
    clean_file = TextCleaner('test_file.txt')
    actual_list = clean_file.open_file()
    assert actual_list == [["ab", "dr", "d"], ["gh", "COMMA", "ab's"]]
Example #30
def test_init():
    '''test if text cleaner opens file properly'''
    f = open("corpse_bride.txt", encoding="utf8")
    tc = TextCleaner(f)
    assert tc.f == f
    return tc