コード例 #1
0
ファイル: naive_bayes.py プロジェクト: sudodoki/prj-nlp
def add_vocab_word(text, label, vocab, stopwords):
    """
    Add the normalized words of a text to the vocabulary of a class.

    Every token is stripped, lower-cased and replaced by the normal form
    of the first parse returned by ``morph``.  Digit-only tokens,
    stopwords and non-alphanumeric tokens are ignored, and each distinct
    feature is added at most once per text.  Words that follow the
    particle 'не' are prefixed with 'НЕ_' until a punctuation token is
    met.

    :param text: raw text to process
    :param label: key of ``vocab`` whose list receives the new words
    :param vocab: mapping from label to a list of words; ``vocab[label]``
        is extended in place
    :param stopwords: container of normalized words to skip
    :return: None
    """
    # A set keeps the "already seen in this text" check O(1) per token.
    seen = set()
    negated = False
    for token in tokenize_words(text):
        w = morph.parse(token.strip().lower())[0].normal_form
        if w.isdigit() or w in stopwords:
            continue
        if w in string.punctuation:
            # punctuation closes the scope of a preceding negation
            negated = False
            continue
        # NOTE(review): the duplicate check looks at the word before the
        # 'НЕ_' prefix is applied, and a repeated 'не' is skipped here
        # without re-arming the flag.  Kept as-is so that the extracted
        # features do not change.
        if w in seen or not w.isalnum():
            continue
        if negated:
            w = 'НЕ_' + w
        vocab[label].append(w)
        seen.add(w)
        if w == 'не':
            negated = True
コード例 #2
0
ファイル: tonal_classifier.py プロジェクト: sudodoki/prj-nlp
def tonal_classifier(text, tonal_dict, stopwords):
    """
    Classify a text as 'pos' or 'neg' by summing dictionary scores.

    Every token is stripped, lower-cased and replaced by the normal form
    of the first parse returned by ``morph``.  Digit-only tokens,
    stopwords and non-alphanumeric tokens are ignored, and each distinct
    word contributes at most once.  Words missing from ``tonal_dict``
    contribute nothing.

    :param text: raw text to classify
    :param tonal_dict: mapping from normalized word to a numeric score
    :param stopwords: container of normalized words to skip
    :return: 'pos' when the total score is >= 0, otherwise 'neg'
    """
    # A set keeps the "already counted" check O(1) per token.
    seen = set()
    total_sent = 0
    for token in tokenize_words(text):
        w = morph.parse(token.strip().lower())[0].normal_form
        # Punctuation (and empty tokens) are non-alphanumeric, so the
        # isalnum() test also covers them.
        if w.isdigit() or w in stopwords or not w.isalnum() or w in seen:
            continue
        seen.add(w)
        if w in tonal_dict:
            total_sent += tonal_dict[w]
    return 'pos' if total_sent >= 0 else 'neg'
コード例 #3
0
def parse_appeal(i, doc):
    """
    Split a raw appeal into its id line and body, then vectorize the body.

    The first line of the stripped document must consist of digits and is
    used as the appeal id; the remaining lines form the message.  The
    message is processed only when it is non-empty and ``detect`` reports
    it as 'uk'.

    :param i: caller-supplied value returned unchanged in the result
    :param doc: raw document text, id on the first line
    :return: ``(doc_id, i, tokens, vector)`` on success, otherwise
        ``(None, None, None, None)``
    """
    rejected = (None, None, None, None)
    header, _, body = doc.strip().partition('\n')
    if not header.isdigit():
        return rejected
    appeal_id = int(header)
    # an empty body short-circuits before language detection is attempted
    if not body or detect(body) != 'uk':
        return rejected
    lowered = [tok.lower() for tok in tokenize_words(body)]
    return appeal_id, i, lowered, get_vector(lowered)
コード例 #4
0
def perceptron_classifier(text, vocab, stopwords):
    """
    Predict the label of a text from per-word class weights.

    The text is reduced to in-vocabulary tokens by ``tokenize_text`` and
    the 'pos' and 'neg' weights of those tokens are summed.

    :param text: raw text to classify
    :param vocab: mapping from word to a dict with numeric 'pos' and
        'neg' entries
    :param stopwords: container of words to skip, passed on to
        ``tokenize_text``
    :return: the label with the larger total; on a tie, 'pos' or 'neg'
        chosen at random
    """
    # tokenize_text() tokenizes the text itself, so no separate
    # tokenization pass is needed here.
    guess_dict = {'pos': 0, 'neg': 0}
    for w in tokenize_text(text, vocab, stopwords):
        weights = vocab[w]
        guess_dict['pos'] += weights['pos']
        guess_dict['neg'] += weights['neg']
    if guess_dict['pos'] == guess_dict['neg']:
        return random.choice(['pos', 'neg'])
    return max(guess_dict, key=guess_dict.get)
コード例 #5
0
ファイル: naive_bayes.py プロジェクト: sudodoki/prj-nlp
def NB_classifier(text, prob_dict, priors, stopwords):
    """
    Classify a text with a Naive Bayes model and return the best label.

    Every token is stripped, lower-cased and replaced by the normal form
    of the first parse returned by ``morph``.  Digit-only tokens,
    stopwords and non-alphanumeric tokens are ignored, and each distinct
    feature is used at most once.  Words that follow the particle 'не'
    are prefixed with 'НЕ_' until a punctuation token is met.  Features
    missing from ``prob_dict`` are ignored.

    The score of a label is its prior plus the sum of the per-word values
    for that label.  The prior is added exactly once per label, so its
    weight does not grow with the number of matched words and a text
    with no known words falls back to the label with the largest prior.

    :param text: raw text to classify
    :param prob_dict: mapping from feature to a dict keyed by
        ``'p_' + label``; values are assumed to be log-probabilities
        (TODO confirm against the training code)
    :param priors: mapping from label to its prior, assumed to be on the
        same log scale as ``prob_dict``
    :param stopwords: container of normalized words to skip
    :return: the label with the highest score
    """
    labels = list(priors)
    # Seed every score with the class prior so it is counted once.
    scores = {lab: priors[lab] for lab in labels}
    # A set keeps the "already seen in this text" check O(1) per token.
    seen = set()
    negated = False
    for token in tokenize_words(text):
        w = morph.parse(token.strip().lower())[0].normal_form
        if w.isdigit() or w in stopwords:
            continue
        if w in string.punctuation:
            # punctuation closes the scope of a preceding negation
            negated = False
            continue
        # NOTE(review): the duplicate check looks at the word before the
        # 'НЕ_' prefix is applied, and a repeated 'не' is skipped here
        # without re-arming the flag.  Kept as-is so that the extracted
        # features do not change.
        if w in seen or not w.isalnum():
            continue
        if negated:
            w = 'НЕ_' + w
        seen.add(w)
        if w == 'не':
            negated = True
        if w not in prob_dict:
            # unknown feature: contributes nothing to any label
            continue
        word_probs = prob_dict[w]
        for lab in labels:
            scores[lab] += word_probs['p_' + lab]
    return max(scores, key=scores.get)
コード例 #6
0
def tokenize_text(text, vocab, stopwords):
    """
    Return the normalized in-vocabulary tokens of a text, in order.

    Every token is stripped, lower-cased and replaced by the normal form
    of the first parse returned by ``morph``.  Digit-only tokens,
    stopwords, non-alphanumeric tokens and words missing from ``vocab``
    are dropped.  Repeated words are kept.

    :param text: raw text to tokenize
    :param vocab: container of known words (only membership is tested)
    :param stopwords: container of normalized words to skip
    :return: list of normalized tokens
    """
    res_tokens = []
    for token in tokenize_words(text):
        w = morph.parse(token.strip().lower())[0].normal_form
        if w.isdigit() or w in stopwords:
            continue
        # Punctuation is non-alphanumeric, so it is dropped here as well.
        if w.isalnum() and w in vocab:
            res_tokens.append(w)
    return res_tokens