# Imports and shared objects the functions below rely on (not shown in the
# original snippet): `tokenize_words` and `get_vector` are assumed to be
# project-level helpers defined elsewhere. `morph.parse(w)[0].normal_form`
# matches the pymorphy2 API, so `morph` is assumed to be a pymorphy2
# analyzer (possibly MorphAnalyzer(lang='uk'), since the texts are Ukrainian).
import random
import string

import pymorphy2
from langdetect import detect

morph = pymorphy2.MorphAnalyzer()


def add_vocab_word(text, label, vocab, stopwords):
    """Add the normalized words of a text to the vocabulary of a class."""
    tokens = tokenize_words(text)
    previous = []      # words already seen in this text
    not_flag = False   # True right after 'не': the next word gets a negation prefix
    for t in tokens:
        w = t.strip().lower()
        w = morph.parse(w)[0].normal_form  # lemmatize
        if w.isdigit():
            continue
        elif w in stopwords:
            continue
        elif w in string.punctuation:
            not_flag = False  # punctuation closes the scope of a negation
        elif w in previous:
            continue
        elif w.isalnum():
            if not_flag:
                w = 'НЕ_' + w  # mark the word as negated
            vocab[label].append(w)
            previous.append(w)
            if w == 'не':
                not_flag = True
        else:
            continue
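
# Illustrative usage sketch (not part of the original code): driving
# add_vocab_word over a labelled training set. `train_data` is a hypothetical
# iterable of (text, label) pairs with labels 'pos' / 'neg'.
def build_vocab(train_data, stopwords):
    vocab = {'pos': [], 'neg': []}
    for text, label in train_data:
        add_vocab_word(text, label, vocab, stopwords)
    return vocab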

def tonal_classifier(text, tonal_dict, stopwords):
    """Classify a text as 'pos' or 'neg' by summing per-word sentiment scores."""
    tokens = tokenize_words(text)
    previous = []
    total_sent = 0
    for t in tokens:
        w = t.strip().lower()
        w = morph.parse(w)[0].normal_form
        if w.isdigit():
            continue
        elif w in stopwords:
            continue
        elif w in string.punctuation:
            continue
        elif w in previous:
            continue
        elif w.isalnum():
            previous.append(w)
            if w not in tonal_dict:
                # if word not in dictionary, ignore it
                continue
            total_sent += tonal_dict[w]
        else:
            continue
    return 'pos' if total_sent >= 0 else 'neg'
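
# Illustrative sketch (not part of the original code): evaluating the tonal
# classifier on a hypothetical labelled test set of (text, label) pairs.
def tonal_accuracy(test_data, tonal_dict, stopwords):
    hits = sum(tonal_classifier(text, tonal_dict, stopwords) == label
               for text, label in test_data)
    return hits / len(test_data)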

def parse_appeal(i, doc):
    """Parse one appeal: the first line is the document id, the rest is the message.

    Returns (doc_id, i, tokens, vector) for Ukrainian messages,
    or (None, None, None, None) otherwise.
    """
    message = doc.strip().split('\n')
    doc_id = message.pop(0)
    if doc_id.isdigit():
        doc_id = int(doc_id)
    message = '\n'.join(message)
    if message and detect(message) == 'uk':
        tokens = [token.lower() for token in tokenize_words(message)]
        vector = get_vector(tokens)
        return doc_id, i, tokens, vector
    return None, None, None, None
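
# Illustrative sketch (hypothetical corpus, not from the original code):
# parse_appeal is naturally mapped over enumerated raw documents, keeping
# only the ones detected as Ukrainian.
def parse_corpus(raw_docs):
    parsed = []
    for i, doc in enumerate(raw_docs):
        doc_id, idx, tokens, vector = parse_appeal(i, doc)
        if doc_id is not None:
            parsed.append((doc_id, idx, tokens, vector))
    return parsed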

def perceptron_classifier(text, vocab, stopwords):
    """Predict the label of a text given per-word perceptron weights in vocab."""
    guess_dict = {'pos': 0, 'neg': 0}
    for w in tokenize_text(text, vocab, stopwords):
        guess_dict['pos'] += vocab[w]['pos']
        guess_dict['neg'] += vocab[w]['neg']
    if guess_dict['pos'] == guess_dict['neg']:
        # break ties at random
        return random.choice(['pos', 'neg'])
    return max(guess_dict, key=guess_dict.get)
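
# Illustrative sketch (assumed training rule, not from the original code):
# `vocab` maps a word to per-class weights, e.g. {'pos': 0, 'neg': 0}. A
# standard perceptron update moves the weights toward the true class whenever
# the current prediction is wrong.
def perceptron_train_step(text, true_label, vocab, stopwords):
    guess = perceptron_classifier(text, vocab, stopwords)
    if guess != true_label:
        for w in tokenize_text(text, vocab, stopwords):
            vocab[w][true_label] += 1
            vocab[w][guess] -= 1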

def NB_classifier(text, prob_dict, priors, stopwords):
    """Classify a text as positive or negative with a Naive Bayes algorithm,
    given a dictionary of log conditional probabilities ('p_pos', 'p_neg')
    from the train dataset and log priors per label.
    """
    labels = list(priors.keys())
    tokens = tokenize_words(text)
    # start each class score from its log prior; Naive Bayes adds the prior
    # once per document, not once per word
    probs = {lab: [priors[lab]] for lab in labels}
    previous = []
    not_flag = False
    for t in tokens:
        w = t.strip().lower()
        w = morph.parse(w)[0].normal_form
        if w.isdigit():
            continue
        elif w in stopwords:
            continue
        elif w in string.punctuation:
            not_flag = False
        elif w in previous:
            continue
        elif w.isalnum():
            if not_flag:
                w = 'НЕ_' + w
            previous.append(w)
            if w == 'не':
                not_flag = True
            if w not in prob_dict:
                # if word not in dictionary, ignore it
                continue
            # accumulate the log conditional probability for each class
            for lab in labels:
                probs[lab].append(prob_dict[w]['p_' + lab])
        else:
            continue
    # the label with the highest sum of log probabilities wins
    sums_of_log_probs = {lab: sum(probs[lab]) for lab in labels}
    return max(sums_of_log_probs, key=sums_of_log_probs.get)
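
# Illustrative sketch (assumed construction, not from the original code): one
# way to build `prob_dict` and `priors` from the class vocabularies produced
# by add_vocab_word, using Laplace (add-one) smoothing and storing everything
# as log probabilities. `doc_counts` is a hypothetical mapping from label to
# the number of training documents of that class.
import math
from collections import Counter

def build_nb_model(vocab, doc_counts):
    counts = {lab: Counter(words) for lab, words in vocab.items()}
    all_words = set().union(*(set(c) for c in counts.values()))
    totals = {lab: sum(c.values()) for lab, c in counts.items()}
    v = len(all_words)  # vocabulary size for smoothing
    prob_dict = {
        w: {'p_' + lab: math.log((counts[lab][w] + 1) / (totals[lab] + v))
            for lab in counts}
        for w in all_words
    }
    n_docs = sum(doc_counts.values())
    priors = {lab: math.log(doc_counts[lab] / n_docs) for lab in doc_counts}
    return prob_dict, priors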

def tokenize_text(text, vocab, stopwords):
    """Tokenize and lemmatize a text, keeping only words present in vocab."""
    tokens = tokenize_words(text)
    res_tokens = []
    for t in tokens:
        w = t.strip().lower()
        w = morph.parse(w)[0].normal_form
        if w.isdigit():
            continue
        elif w in stopwords:
            continue
        elif w in string.punctuation:
            continue
        elif w.isalnum():
            if w not in vocab:
                # if word not in dictionary, ignore it
                continue
            res_tokens.append(w)
        else:
            continue
    return res_tokens
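
# Usage note (illustrative): tokenize_text only needs key lookup on `vocab`,
# so it works with the perceptron weight dict above as well as with any plain
# set of known lemmas, e.g.:
#
#     known = set(prob_dict)                      # NB vocabulary
#     tokenize_text(some_text, known, stopwords)  # -> in-vocabulary lemmas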