import autocorrect
import numpy as np

# log, sample, filter, agglomerate and SAMPLE_SIZE are helpers/constants defined
# elsewhere in the original module.


def find_cluster_reps(target, mock):
    log(f'Clustering {target}...')

    # Checking spelling can help normalize text.
    speller = autocorrect.Speller(lang='en')

    # Open the source data file and use it as a corpus for clustering. While
    # the corpus is filtered for spelling and stopwords, the original tweets
    # are presented.
    log('\tReading in data...')
    samp = sample(target)
    corpus = [filter(row['text'], speller) for row in samp]

    # If we're mocking the data, it's very easy. Just return random tweets from
    # the corpus.
    if mock:
        subsamp = samp[np.random.choice(samp.shape[0], 3)]
        reps = [[0, np.random.normal(0.75, 0.25), item] for item in subsamp]

        # Mock cluster sizes so they appear reasonable.
        reps[0][0] = np.random.randint(SAMPLE_SIZE / 5, SAMPLE_SIZE / 1.5)
        reps[1][0] = np.random.randint(SAMPLE_SIZE / 10, SAMPLE_SIZE / 5)
        reps[2][0] = np.random.randint(SAMPLE_SIZE / 10, SAMPLE_SIZE / 5)
        reps = sorted(reps, key=lambda x: x[0], reverse=True)

    # Otherwise, find reps "the hard way" using clustering.
    else:
        reps = agglomerate(samp, corpus)

    log('...done.')
    return reps
import autocorrect
import nltk


def preprocess(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # Spell-correct each token with the Russian dictionary, then lemmatize
    # with WordNet.
    speller = autocorrect.Speller(lang="ru")
    tokens = list(map(speller, tokens))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = list(map(lemmatizer.lemmatize, tokens))
    return tokens
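A minimal usage sketch for preprocess(), assuming the standard NLTK data (the punkt tokenizer and the WordNet corpus) has already been downloaded; the sample sentence and the exact corrections shown are illustrative only.

import nltk

nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# A deliberately misspelled Russian sentence; the speller should fix the typo.
tokens = preprocess("Приввет, как дела")
print(tokens)  # lowercased, spell-corrected tokens, e.g. ['привет', ',', 'как', 'дела']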
import autocorrect


def autocorrect_words(text):
    speller = autocorrect.Speller(lang="en")
    words = text.split(" ")
    for i in range(len(words)):
        try:
            corrected_word = speller(words[i])
            # print(words[i], "=", corrected_word)
            words[i] = corrected_word
        except Exception:
            pass
    text = " ".join(words)
    return text
import csv

import autocorrect
import chardet

# stop_words (a set of stopwords) and table (a str.translate table) are globals
# defined elsewhere in the original module.


def train_naive_bayes_model(links_file_path, words_dir_path,
                            words_option=0, use_correct=False):
    global stop_words

    total_lines = 0
    with open(links_file_path, 'r', newline='', encoding='utf-8') as links_file:
        total_lines = len(links_file.readlines()) - 1

    words_dict = {}
    words_dir_path += '/'
    speller = autocorrect.Speller(lang='en')

    with open(links_file_path, 'r') as links_file:
        cnt = 0
        spamreader = csv.reader(links_file, delimiter=',', quotechar='"')
        for record in spamreader:
            if cnt > 0:
                mood = record[3]
                input_file_path = words_dir_path + record[0] + '.txt'

                # Detect the file's encoding before reading it as text.
                with open(input_file_path, 'rb') as input_file:
                    data = input_file.read()
                    encoding = chardet.detect(data)

                with open(input_file_path, 'r', encoding=encoding['encoding']) as input_file:
                    existing_words = set()
                    for line in input_file:
                        tokens = line.split()
                        for token in tokens:
                            token = token.translate(table).lower().strip()
                            if len(token) > 0 and token not in stop_words:
                                if use_correct:
                                    token = speller(token)
                                if token not in existing_words:
                                    if token not in words_dict:
                                        words_dict[token] = {
                                            'relaxed': 0,
                                            'angry': 0,
                                            'happy': 0,
                                            'sad': 0
                                        }
                                    words_dict[token][mood] += 1
                                    if words_option != 0:
                                        existing_words.add(token)
                print(cnt, '/', total_lines)
            cnt += 1
    return words_dict
import string

import autocorrect


def spell_correction(text):
    # Delegate whole-sentence correction to autocorrect and return immediately;
    # everything below this return is unreachable.
    return autocorrect.Speller().autocorrect_sentence(text)

    # Unreachable legacy implementation: word-by-word correction that skips
    # words already present in dictionary.words (dictionary and spell are
    # defined elsewhere in the original module).
    buffer = ""
    new_text = ""
    for char in text:
        if char != ' ' and char not in string.punctuation:
            buffer += char
        else:
            if buffer.lower() not in dictionary.words:
                new_text += spell(buffer) + " "
            else:
                new_text += buffer + " "
            if char != " ":
                new_text += char
            buffer = ""
    if len(buffer) > 0 and buffer.lower() not in dictionary.words:
        new_text += spell(buffer)
    else:
        new_text += buffer
    return new_text
import unicodedata

import autocorrect
import pandas as pd
import regex as re
import spacy
from nltk.stem.snowball import SnowballStemmer
from pandarallel import pandarallel
from tqdm import tqdm

# !python -m spacy download en
nlp = spacy.load('en')

path = r'data/content_dataset.csv'
data = pd.read_csv(path, encoding='utf-8')
data_full = pd.read_csv(r'data/content_dataset_full.csv', encoding='utf-8')

spl = autocorrect.Speller(lang='en')


def remove_xa0(text):
    # Remove the \xa0 symbol from the text; this appears due to some encoding error.
    return unicodedata.normalize("NFKD", text)


def remove_symbols(text):
    # Remove anything that's not alphanumeric or whitespace.
    pattern = r'[^\w\s]|-|_'
    return re.sub(pattern, "", text)


def stemmer(text):
    stem = SnowballStemmer(language='english')
    # Assumed completion: the original snippet is truncated here; stem each
    # whitespace-separated token and rejoin.
    return " ".join(stem.stem(word) for word in text.split())
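The snippet above loads the data and imports pandarallel without showing how the cleaning helpers are applied. The following is a minimal sketch of that step; the column name 'text' is an assumption, not from the original dataset.

# Sketch only: apply the cleaning helpers to a hypothetical 'text' column.
pandarallel.initialize(progress_bar=True)

data['text'] = data['text'].parallel_apply(remove_xa0)
data['text'] = data['text'].parallel_apply(remove_symbols)
data['text'] = data['text'].parallel_apply(spl)  # spell correction with the shared Speller
data['text'] = data['text'].parallel_apply(stemmer)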
import autocorrect


def pre_process_get_spell_corrector(app_tokens):
    # Update the auto corrector with the App's vocab so app-specific tokens
    # count as known words.
    speller = autocorrect.Speller()
    for word in app_tokens:
        speller.nlp_data[word] = 1
    return speller
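A short, hypothetical usage example: seeding the speller with app-specific tokens keeps domain words intact (a word present in nlp_data is returned unchanged), while ordinary misspellings are still corrected. The vocabulary and example words below are assumptions.

# Hypothetical app vocabulary.
app_vocab = ['whatsapp', 'sideloading']
speller = pre_process_get_spell_corrector(app_vocab)

print(speller('whatsapp'))  # returned unchanged, since it is now in the vocab
print(speller('downlaod'))  # ordinary typos are still corrected, e.g. to 'download'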
import csv
import math

import autocorrect
import chardet

# table (a str.translate table) is a global defined elsewhere in the original module.


def test_naive_bayes_model(links_file_path, words_dir_path, nbmodel_file_path,
                           nboutput_file_path, words_option=0, use_correct=False):
    words_dict = {}
    speller = autocorrect.Speller(lang='en')

    total_lines = 0
    with open(links_file_path, 'r', newline='', encoding='utf-8') as links_file:
        total_lines = len(links_file.readlines()) - 1

    # Load the trained model: a header row of moods, then one count per mood per word.
    with open(nbmodel_file_path, 'r') as nbmodel_file:
        cnt = 0
        moods = []
        for line in nbmodel_file:
            if cnt > 0:
                tokens = line.split(',')
                words_dict[tokens[0]] = {}
                for i in range(len(moods)):
                    words_dict[tokens[0]][moods[i].strip()] = int(tokens[i + 1].strip())
            else:
                moods = line.split(',')[1:]
            cnt += 1

    with open(links_file_path, 'r') as links_file:
        with open(nboutput_file_path, 'w') as nboutput_file:
            cnt = 0
            words_dir_path += '/'
            spamreader = csv.reader(links_file, delimiter=',', quotechar='"')
            for record in spamreader:
                if cnt > 0:
                    input_file_path = words_dir_path + record[0] + '.txt'
                    possibility_dict = {
                        'relaxed': 0,
                        'angry': 0,
                        'happy': 0,
                        'sad': 0
                    }

                    # Detect the file's encoding before reading it as text.
                    with open(input_file_path, 'rb') as input_file:
                        data = input_file.read()
                        encoding = chardet.detect(data)

                    with open(input_file_path, 'r', encoding=encoding['encoding']) as input_file:
                        existing_words = set()
                        for line in input_file:
                            tokens = line.split()
                            for token in tokens:
                                token = token.translate(table).lower().strip()
                                if use_correct:
                                    token = speller(token)
                                if len(token) > 0 and token in words_dict:
                                    if token not in existing_words:
                                        total = sum(words_dict[token].values())
                                        for mood in words_dict[token]:
                                            possibility_dict[mood] += math.log2(
                                                words_dict[token][mood] / total)
                                        if words_option != 0:
                                            existing_words.add(token)

                    nboutput_file.write(record[0] + ',' + record[3] + ',' +
                                        max(possibility_dict, key=possibility_dict.get) + '\n')
                    print(cnt, '/', total_lines)
                else:
                    nboutput_file.write('Index,actual mood,predicted mood\n')
                cnt += 1
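A sketch of how the two Naive Bayes helpers could be wired together. All file paths are hypothetical, the model CSV layout is inferred from the parser in test_naive_bayes_model (a header row of moods, then word plus one count per mood), and the module-level stop_words and table globals used by both functions are assumed to be defined.

# Sketch only: train, save the model in the format the test function expects, then evaluate.
words_dict = train_naive_bayes_model('data/links.csv', 'data/lyrics',
                                     words_option=1, use_correct=True)

moods = ['relaxed', 'angry', 'happy', 'sad']
with open('data/nbmodel.csv', 'w', encoding='utf-8') as nbmodel_file:
    nbmodel_file.write('word,' + ','.join(moods) + '\n')
    for word, counts in words_dict.items():
        nbmodel_file.write(word + ',' + ','.join(str(counts[m]) for m in moods) + '\n')

test_naive_bayes_model('data/links.csv', 'data/lyrics', 'data/nbmodel.csv',
                       'data/nboutput.csv', words_option=1, use_correct=True)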