Example #1
def find_cluster_reps(target, mock):
    log(f'Clustering {target}...')

    # Checking spelling can help normalize text.
    speller = autocorrect.Speller(lang='en')

    # Read in the source data and use it as the corpus for clustering. The
    # corpus is normalized for spelling and stopwords, but the original tweets
    # are what get presented as cluster representatives.
    log('\tReading in data...')
    samp = sample(target)
    corpus = [filter(row['text'], speller) for row in samp]

    # If we're mocking the data, it's easy: just return random tweets from the
    # sample.
    if mock:
        subsamp = samp[np.random.choice(samp.shape[0], 3)]
        reps = [[0, np.random.normal(0.75, 0.25), item] for item in subsamp]
        # Mock cluster sizes so they appear reasonable.
        reps[0][0] = np.random.randint(SAMPLE_SIZE // 5, int(SAMPLE_SIZE / 1.5))
        reps[1][0] = np.random.randint(SAMPLE_SIZE // 10, SAMPLE_SIZE // 5)
        reps[2][0] = np.random.randint(SAMPLE_SIZE // 10, SAMPLE_SIZE // 5)
        reps = sorted(reps, key=lambda x: x[0], reverse=True)
    # Otherwise, find reps "the hard way" using clustering.
    else:
        reps = agglomerate(samp, corpus)

    log('...done.')
    return reps
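Example #1 leans on project-specific helpers that are not shown here (log, sample, filter, agglomerate, SAMPLE_SIZE). As a rough, hypothetical sketch of what a filter(text, speller) step like this could look like, assuming the NLTK stopwords corpus is available and using simple regex tokenization:

import re

import autocorrect
import nltk

def filter_text(text, speller, stopwords=None):
    # Hypothetical stand-in for the filter() helper used above: lowercase,
    # tokenize, spell-correct, then drop English stopwords.
    if stopwords is None:
        stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [speller(token) for token in re.findall(r"[a-z']+", text.lower())]
    return ' '.join(token for token in tokens if token not in stopwords)

speller = autocorrect.Speller(lang='en')
print(filter_text("Ths is an exmple tweet!!", speller))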
Example #2
def preprocess(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    speller = autocorrect.Speller(lang="ru")
    tokens = list(map(speller, tokens))

    # Note: WordNetLemmatizer only covers English, so the Russian tokens pass
    # through this step unchanged.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens
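A minimal usage sketch, assuming autocorrect and nltk are imported at module level and the required NLTK data has been downloaded:

import nltk
import autocorrect

nltk.download('punkt')    # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')  # data for WordNetLemmatizer

tokens = preprocess("Превед, медвед")
print(tokens)  # lowercased, spell-corrected tokens; punctuation comes out as separate tokens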
Example #3
def autocorrect_words(text):
    speller = autocorrect.Speller(lang="en")
    words = text.split(" ")
    for i in range(len(words)):
        try:
            corrected_word = speller(words[i])
            # print(words[i], "=", corrected_word)
            words[i] = corrected_word
        except Exception:
            # Keep the original word if correction fails.
            pass
    text = " ".join(words)
    return text
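A quick usage example, plus a more compact equivalent sketch that avoids index bookkeeping (both assume autocorrect is imported; the v2 name is only for illustration):

import autocorrect

print(autocorrect_words("thsi sentense has speling mistakes"))

def autocorrect_words_v2(text):
    # Same idea without manual indexing; Speller returns a word unchanged when
    # it finds no better candidate, which usually makes the try/except unnecessary.
    speller = autocorrect.Speller(lang="en")
    return " ".join(speller(word) for word in text.split(" "))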
Example #4
def train_naive_bayes_model(links_file_path,
                            words_dir_path,
                            words_option=0,
                            use_correct=False):
    global stop_words
    total_lines = 0
    with open(links_file_path, 'r', newline='',
              encoding='utf-8') as links_file:
        total_lines = len(links_file.readlines()) - 1
    words_dict = {}
    words_dir_path += '/'
    speller = autocorrect.Speller(lang='en')
    with open(links_file_path, 'r') as links_file:
        cnt = 0
        spamreader = csv.reader(links_file, delimiter=',', quotechar='\"')
        for record in spamreader:
            if cnt > 0:
                mood = record[3]
                input_file_path = words_dir_path + record[0] + '.txt'
                encoding = ''
                with open(input_file_path, 'rb') as input_file:
                    data = input_file.read()
                    encoding = chardet.detect(data)
                with open(input_file_path, 'r',
                          encoding=encoding['encoding']) as input_file:
                    existing_words = set()
                    for line in input_file:
                        tokens = line.split()
                        for token in tokens:
                            token = token.translate(table).lower().strip()
                            if len(token) > 0 and token not in stop_words:
                                if use_correct:
                                    token = speller(token)
                                if token not in existing_words:
                                    if token not in words_dict:
                                        words_dict[token] = {
                                            'relaxed': 0,
                                            'angry': 0,
                                            'happy': 0,
                                            'sad': 0
                                        }
                                    words_dict[token][mood] += 1
                                # With words_option != 0, each distinct token
                                # is counted at most once per document.
                                if words_option != 0:
                                    existing_words.add(token)
                print(cnt, '/', total_lines, end='\n')
            cnt += 1
    return words_dict
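The returned words_dict maps each token to per-mood document counts, e.g. {'great': {'relaxed': 2, 'angry': 0, 'happy': 7, 'sad': 1}}. A small, hypothetical helper (the name and layout are assumptions) for writing it in the comma-separated format that the companion test function in Example #8 reads back:

def save_naive_bayes_model(words_dict, nbmodel_file_path,
                           moods=('relaxed', 'angry', 'happy', 'sad')):
    # One header row ("word,relaxed,angry,happy,sad"), then one row per token.
    with open(nbmodel_file_path, 'w', encoding='utf-8') as nbmodel_file:
        nbmodel_file.write('word,' + ','.join(moods) + '\n')
        for word, counts in words_dict.items():
            nbmodel_file.write(word + ',' +
                               ','.join(str(counts[mood]) for mood in moods) +
                               '\n')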
Example #5
def spell_correction(text):
    # Speller.autocorrect_sentence() applies word-level correction across the whole sentence.
    return autocorrect.Speller().autocorrect_sentence(text)

    # NOTE: everything below the return above is unreachable; it appears to be
    # an earlier manual implementation that relied on a module-level dictionary,
    # a spell() helper, and the string module, none of which appear here.
    buffer = ""
    new_text = ""

    for char in text:
        if char != ' ' and char not in string.punctuation:
            buffer += char
        else:
            if buffer.lower() not in dictionary.words:
                new_text += spell(buffer) + " "
            else:
                new_text += buffer + " "
            if char != " ":
                new_text += char
            buffer = ""

    if len(buffer) > 0 and buffer.lower() not in dictionary.words:
        new_text += spell(buffer)
    else:
        new_text += buffer

    return new_text
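A quick usage sketch, assuming autocorrect is imported at module level:

import autocorrect

print(spell_correction("I havve goood speling"))
# The exact output depends on autocorrect's word-frequency data; typically the
# misspelled words are replaced with their most likely corrections.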
Example #6
import unicodedata

import autocorrect
import pandas as pd
import regex as re
import spacy
from nltk.stem.snowball import SnowballStemmer
from pandarallel import pandarallel
from tqdm import tqdm

# !python -m spacy download en

nlp = spacy.load('en')  # older spaCy shortcut; on spaCy 3+ use 'en_core_web_sm'

path = r'data/content_dataset.csv'
data = pd.read_csv(path, encoding='utf-8')
data_full = pd.read_csv(r'data/content_dataset_full.csv', encoding='utf-8')

spl = autocorrect.Speller(lang='en')


def remove_xa0(text):
    # NFKD normalization turns \xa0 (non-breaking space) characters, which show
    # up because of encoding issues, into ordinary spaces.
    return unicodedata.normalize("NFKD", text)


def remove_symbols(text):
    # Remove anything that's not alphanumeric or whitespace, plus hyphens and underscores.
    pattern = r'[^\w\s]|-|_'
    return re.sub(pattern, "", text)


def stemmer(text):
    stem = SnowballStemmer(language='english')
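The stemmer function is cut off at this point in the listing. Based on the imports above (SnowballStemmer, pandarallel) and the spl speller, a plausible completion and pipeline might look like the following; the 'content' column name is an assumption:

def stemmer(text):
    # Stem each whitespace-separated token with the English Snowball stemmer.
    stem = SnowballStemmer(language='english')
    return ' '.join(stem.stem(token) for token in text.split())

pandarallel.initialize()
data['content'] = (data['content']
                   .map(remove_xa0)
                   .map(remove_symbols)
                   .map(spl)                # spell-correct with the Speller defined above
                   .parallel_map(stemmer))  # stem in parallel across cores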
Example #7
def pre_process_get_spell_corrector(app_tokens):
    # update the auto corrector with the App's vocab
    speller = autocorrect.Speller()
    for word in app_tokens:
        speller.nlp_data[word] = 1
    return speller
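nlp_data is the Speller's internal word-frequency dictionary, so registering app tokens with a count of 1 marks them as known words and keeps the corrector from rewriting app-specific vocabulary. A small usage sketch with made-up tokens:

import autocorrect

app_tokens = ["kubectl", "grafana", "webhook"]   # hypothetical app vocabulary
speller = pre_process_get_spell_corrector(app_tokens)

print(speller("kubectl"))  # stays "kubectl" because it is now a known word
print(speller("webhok"))   # likely corrected to "webhook" (closest known word)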
Example #8
def test_naive_bayes_model(links_file_path,
                           words_dir_path,
                           nbmodel_file_path,
                           nboutput_file_path,
                           words_option=0,
                           use_correct=False):
    words_dict = {}
    speller = autocorrect.Speller(lang='en')
    total_lines = 0
    with open(links_file_path, 'r', newline='',
              encoding='utf-8') as links_file:
        total_lines = len(links_file.readlines()) - 1

    with open(nbmodel_file_path, 'r') as nbmodel_file:
        cnt = 0
        moods = []
        for line in nbmodel_file:
            if cnt > 0:
                tokens = line.split(',')
                words_dict[tokens[0]] = {}
                for i in range(len(moods)):
                    words_dict[tokens[0]][moods[i].strip()] = int(
                        tokens[i + 1].strip())
            else:
                moods = line.split(',')[1:]
                cnt += 1
    with open(links_file_path, 'r') as links_file:
        with open(nboutput_file_path, 'w') as nboutput_file:
            cnt = 0
            words_dir_path += '/'
            spamreader = csv.reader(links_file, delimiter=',', quotechar='\"')
            for record in spamreader:
                if cnt > 0:
                    input_file_path = words_dir_path + record[0] + '.txt'
                    encoding = ''
                    # Per-mood accumulator of log2 relative-frequency scores
                    # (a Naive Bayes style log-likelihood without class priors).
                    possibility_dict = {
                        'relaxed': 0,
                        'angry': 0,
                        'happy': 0,
                        'sad': 0
                    }
                    with open(input_file_path, 'rb') as input_file:
                        data = input_file.read()
                        encoding = chardet.detect(data)
                    with open(input_file_path,
                              'r',
                              encoding=encoding['encoding']) as input_file:
                        existing_words = set()
                        for line in input_file:
                            tokens = line.split()
                            for token in tokens:
                                token = token.translate(table).lower().strip()
                                if use_correct:
                                    token = speller(token)
                                if len(token) > 0 and token in words_dict:
                                    if token not in existing_words:
                                        total = sum(words_dict[token].values())
                                        # Note: math.log2 raises a domain error
                                        # when a mood count is zero; add-one
                                        # smoothing of the counts avoids that.
                                        for mood in words_dict[token]:
                                            possibility_dict[mood] += math.log2(
                                                words_dict[token][mood] / total)
                                    if words_option != 0:
                                        existing_words.add(token)
                        nboutput_file.write(
                            record[0] + ',' + record[3] + ',' +
                            max(possibility_dict, key=possibility_dict.get) +
                            '\n')
                    print(cnt, '/', total_lines)
                else:
                    nboutput_file.write('Index,actual mood,predicted mood\n')
                cnt += 1
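Putting Example #4 and Example #8 together: the paths below are hypothetical, save_naive_bayes_model is the helper sketched after Example #4, and the module-level imports and globals from the original file (csv, chardet, math, table, stop_words) are assumed to be in place.

import csv

words_dict = train_naive_bayes_model('data/links.csv', 'data/words',
                                     use_correct=True)
save_naive_bayes_model(words_dict, 'data/nbmodel.csv')
test_naive_bayes_model('data/links.csv', 'data/words',
                       'data/nbmodel.csv', 'data/nboutput.csv',
                       use_correct=True)

# Rough accuracy check over the output file (Index,actual mood,predicted mood).
with open('data/nboutput.csv', newline='') as nboutput_file:
    rows = list(csv.reader(nboutput_file))[1:]
correct = sum(1 for _, actual, predicted in rows if actual == predicted)
print('accuracy:', correct / len(rows))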