コード例 #1
0
def predict(documents: List[str]):
    """Classify each document as 'POS' or 'NEG' with a trained Naive Bayes model.

    Args:
        documents: raw review texts to classify.

    Returns:
        list[str]: one 'POS'/'NEG' label per input document, in order.
    """
    # The tokenizer expects a mapping of class name -> documents; 'REVIEWS'
    # is just a placeholder label since these documents are unlabeled.
    word_classes = tokenize_classes({'REVIEWS': documents})

    # NOTE(review): pickle.load executes arbitrary code from the file --
    # only load model artifacts produced by the companion train() step.
    with open('models/model.pkl', 'rb') as input_file:
        model = pickle.load(input_file)

    cond_pos = model['COND_POS_PROBS']
    cond_neg = model['COND_NEG_PROBS']
    # The entry keyed by -1 holds the smoothed log-probability used for
    # words never seen in that class during training; hoist it out of the loop.
    unseen_pos = cond_pos[-1]
    unseen_neg = cond_neg[-1]

    predictions = []
    for document in word_classes['REVIEWS']:
        # Start from the class priors (log-probabilities) and accumulate
        # per-word conditional log-likelihoods.
        positive_prob = model['POS_PROB']
        negative_prob = model['NEG_PROB']

        for word in document:
            # dict.get avoids the original's double lookup (`if word in d`
            # followed by `d[word]`) while preserving the same fallback.
            positive_prob += cond_pos.get(word, unseen_pos)['logprob']
            negative_prob += cond_neg.get(word, unseen_neg)['logprob']

        # Ties break toward the positive class, as in the original.
        predictions.append('POS' if positive_prob >= negative_prob else 'NEG')

    return predictions
コード例 #2
0
ファイル: predict.py プロジェクト: cesaregm7/tmnlp-tarea5
def predict(documents: List[str]):
    """Classify each document as 'POS' or 'NEG' with a trained Naive Bayes model.

    Args:
        documents: raw review texts to classify.

    Returns:
        list[str]: one 'POS'/'NEG' label per input document, in order.
    """
    # Tokenizer expects class -> documents; 'UNK' marks these as unlabeled.
    word_classes = tokenize_classes({'UNK': documents})

    # Resolve the model path relative to this file so the script works
    # regardless of the current working directory.
    base_path = os.path.dirname(os.path.abspath(__file__))

    # NOTE(review): pickle.load executes arbitrary code from the file --
    # only load model artifacts produced by the companion train() step.
    with open(base_path + '/../../models/model.pkl', 'rb') as input_file:
        model = pickle.load(input_file)

    cond_pos = model['COND_POS_PROBS']
    cond_neg = model['COND_NEG_PROBS']
    # Entry keyed by -1 is the smoothed log-probability for unseen words.
    unseen_pos = cond_pos[-1]
    unseen_neg = cond_neg[-1]

    predictions = []
    for document in word_classes['UNK']:
        # Computing the probability of belonging to each class: start from
        # the class priors and accumulate per-word log-likelihoods.
        positive_prob = model['POS_PROB']
        negative_prob = model['NEG_PROB']

        for word in document:
            # dict.get avoids the original's double lookup while keeping
            # the identical unseen-word fallback.
            positive_prob += cond_pos.get(word, unseen_pos)['logprob']
            negative_prob += cond_neg.get(word, unseen_neg)['logprob']

        # Determine which class the sentence belongs to; ties go to 'POS'.
        predictions.append('POS' if positive_prob >= negative_prob else 'NEG')

    return predictions
コード例 #3
0
ファイル: train.py プロジェクト: DrCesar/text-mining-project
def train():
    """Train a Naive Bayes sentiment model and pickle it to models/model.pkl.

    Reads the sample corpus, tokenizes it per class, computes class priors
    and Laplace-smoothed per-word conditional log-probabilities, and saves
    the resulting model dict.
    """
    df = read_sample()
    document_classes = create_classes(df)

    word_classes = tokenize_classes(document_classes, False)

    # Flatten each class's list of tokenized documents into one word stream.
    negative_words = [
        word for document in word_classes['NEG'] for word in document
    ]
    positive_words = [
        word for document in word_classes['POS'] for word in document
    ]

    dictionary = create_dictionary([negative_words, positive_words])
    # Shared vocabulary size, used by the Laplace smoothing denominator.
    vocab_size = len(dictionary)

    # (1, 0.0, 0.0): send everything to the training split.
    negative_split = split_data(negative_words, (1, 0.0, 0.0))
    positive_split = split_data(positive_words, (1, 0.0, 0.0))

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])

    # Laplace smoothing denominator: class token count + vocabulary size.
    # BUG FIX: the original added len(<class bow>) (number of distinct words
    # seen in that class) instead of the shared vocabulary size, so the
    # smoothed probabilities did not sum to 1 over the vocabulary.
    total_negative = len(negative_split['train']) + vocab_size
    total_positive = len(positive_split['train']) + vocab_size

    # Class priors: log of each class's share of the training tokens.
    total_train = len(negative_split['train']) + len(positive_split['train'])
    negative_prob = np.log(len(negative_split['train']) / total_train)
    positive_prob = np.log(len(positive_split['train']) / total_train)

    negative_word_probs = {}
    for word_id, count in negative_bow:  # renamed: `id` shadows the builtin
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }

    # Sentinel entry keyed by -1: smoothed log-prob for unseen words.
    negative_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_negative)}

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }

    positive_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_positive)}

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }

    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')
コード例 #4
0
from src.features.utils import sent_to_words
from src.features.tokenize import tokenize_classes
from src.data.prepare_data import read_sample
#######finalizan referencias

# NLTK English stop words, extended with common mailing-list noise tokens.
# NOTE(review): stop_words is not used in this visible chunk -- presumably
# consumed further down the file; verify before removing.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Load the sample corpus.
data = read_sample()

# Split the raw documents into word lists.
data_words = list(sent_to_words(data))

# Tokenize/lemmatize the documents.
data_lemmatized = tokenize_classes(data_words)
print(data_lemmatized)
# Build the gensim dictionary (token -> integer id mapping).
id2word = corpora.Dictionary(data_lemmatized)
print(id2word)
#for key, value in id2word.items():
#   print(key, value)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# Shows the id and the frequency for each token:
#[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]
コード例 #5
0
def train():
    """Train a Naive Bayes sentiment model and pickle it next to the package.

    Reads the sample corpus, tokenizes it per class, computes class priors
    and Laplace-smoothed per-word conditional log-probabilities, and saves
    the resulting model dict to ../../models/model.pkl (relative to this
    file), logging progress along the way.
    """
    df = read_sample()
    logging.info('Source data file read succesfully')

    document_classes = create_classes(df)
    logging.info('Documents split between different classes.')

    logging.info('Tokenization started. This will for sure take some time.')
    word_classes = tokenize_classes(document_classes, False)
    logging.info('Tokenization completed for all documents.')

    # Flatten each class's list of tokenized documents into one word stream.
    negative_words = [
        word for document in word_classes['NEG'] for word in document
    ]
    positive_words = [
        word for document in word_classes['POS'] for word in document
    ]

    dictionary = create_dictionary([negative_words, positive_words])
    logging.info('Dictionary generated from all document words.')
    # Shared vocabulary size, used by the Laplace smoothing denominator.
    vocab_size = len(dictionary)

    negative_split = split_data(negative_words)
    positive_split = split_data(positive_words)

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])

    logging.info('Counts for bag of words for documents in all classes done')

    # Laplace smoothing denominator: class token count + vocabulary size.
    # BUG FIX: the original added len(<class bow>) (number of distinct words
    # seen in that class) instead of the shared vocabulary size, so the
    # smoothed probabilities did not sum to 1 over the vocabulary.
    total_negative = len(negative_split['train']) + vocab_size
    total_positive = len(positive_split['train']) + vocab_size

    # Class priors: log of each class's share of the training tokens.
    total_train = len(negative_split['train']) + len(positive_split['train'])
    negative_prob = np.log(len(negative_split['train']) / total_train)
    positive_prob = np.log(len(positive_split['train']) / total_train)

    negative_word_probs = {}
    for word_id, count in negative_bow:  # renamed: `id` shadows the builtin
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }

    # Sentinel entry keyed by -1: smoothed log-prob for unseen words.
    negative_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_negative)}

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }

    positive_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_positive)}

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }
    logging.info('Log probabilities for tokens in all classed computed')

    # Resolve the output path relative to this file so training works
    # regardless of the current working directory.
    basePath = os.path.dirname(os.path.abspath(__file__))
    with open(basePath + "/../../models/model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')