class Preprocessing:
    """Preprocessing steps that operate on data obtained from a ``DataCollector``."""

    def __init__(self):
        # Collector that supplies the data each preprocessing step works on.
        self.di = DataCollector()

    def to_upper(self):
        """Return the collected ``'data'`` entry converted to upper case.

        ``collect_data()`` is invoked on the collector on every call; its
        result is indexed with the key ``'data'`` and ``.upper()`` is called
        on that value (presumably a ``str`` -- confirm against
        ``DataCollector``).
        """
        return self.di.collect_data()['data'].upper()
Exemple #2
0
text = PUNCTUATION_PATTERN.sub('', text)
print(text)

print('INFO: aaaaand lowercase...')
text = text.lower()
print(text)

print('INFO: removing whitespaces...')
text = text.lstrip()
text = ' '.join(text.split())
print(text)'''

# Collect the raw data, then clean it. Progress is reported on stdout.
print('INFO: starting data collection...')
collector = DataCollector()
collector.collect_data()
print('INFO: data collection complete...')
print('INFO: starting data cleaning...')
collector.clean_corpus()
print('INFO: data cleaning complete...')

# Debug aid: print(collector.corpus)

# Build the bigram model from the collector once cleaning has run.
bigram_model = BigramModel(collector)

# Load word_length.txt, one list entry per line of the file.
# The mode must be the string 'r': the bare name `r` that was passed before
# is undefined and raised NameError. The `with` block guarantees the handle
# is closed even if reading fails.
word_lengths = []
with open('word_length.txt', 'r') as f:
    raw_lines = f.readlines()

# Each entry keeps its trailing newline, exactly as iterating the file would.
for line in raw_lines:
    word_lengths.append(line)