def predict(documents: List[str]):
    """Label each document as 'POS' or 'NEG' using the pickled model.

    The documents are tokenized through ``tokenize_classes`` under a single
    placeholder class; every resulting token list is then scored against
    both classes by summing the log-probabilities stored in the model.

    Args:
        documents: Raw text documents to classify.

    Returns:
        A list with one label ('POS' or 'NEG') per tokenized document.
        Ties are resolved in favour of 'POS'.
    """
    tokenized = tokenize_classes({'REVIEWS': documents})

    # The path is relative to the current working directory.
    # NOTE: pickle can execute code while loading; only load trusted files.
    with open('models/model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    labels = []
    for tokens in tokenized['REVIEWS']:
        # Start from the per-class base scores, then add one term per token.
        pos_score = model['POS_PROB']
        neg_score = model['NEG_PROB']
        pos_table = model['COND_POS_PROBS']
        neg_table = model['COND_NEG_PROBS']
        for token in tokens:
            # Key -1 holds the fallback entry for tokens missing in a table.
            pos_entry = (pos_table[token] if token in pos_table
                         else pos_table[-1])
            neg_entry = (neg_table[token] if token in neg_table
                         else neg_table[-1])
            pos_score += pos_entry['logprob']
            neg_score += neg_entry['logprob']
        labels.append('POS' if pos_score >= neg_score else 'NEG')
    return labels
def predict(documents: List[str]):
    """Label each document as 'POS' or 'NEG' using the pickled model.

    The documents are tokenized through ``tokenize_classes`` under a single
    placeholder class; every resulting token list is then scored against
    both classes by summing the log-probabilities stored in the model. The
    model file is located relative to this module's own directory.

    Args:
        documents: Raw text documents to classify.

    Returns:
        A list with one label ('POS' or 'NEG') per tokenized document.
        Ties are resolved in favour of 'POS'.
    """
    tokenized = tokenize_classes({'UNK': documents})

    module_dir = os.path.dirname(os.path.abspath(__file__))
    # NOTE: pickle can execute code while loading; only load trusted files.
    with open(module_dir + '/../../models/model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    labels = []
    for tokens in tokenized['UNK']:
        # Accumulate the score of belonging to each class.
        pos_score = model['POS_PROB']
        neg_score = model['NEG_PROB']
        pos_table = model['COND_POS_PROBS']
        neg_table = model['COND_NEG_PROBS']
        for token in tokens:
            # Key -1 holds the fallback entry for tokens missing in a table.
            pos_entry = (pos_table[token] if token in pos_table
                         else pos_table[-1])
            neg_entry = (neg_table[token] if token in neg_table
                         else neg_table[-1])
            pos_score += pos_entry['logprob']
            neg_score += neg_entry['logprob']
        # Decide which class the sentence belongs to.
        labels.append('POS' if pos_score >= neg_score else 'NEG')
    return labels
def train():
    """Train the POS/NEG log-probability model and pickle it.

    Reads the sample data, splits it into classes, tokenizes it, and builds
    add-one smoothed log-probability tables for the tokens of each class.
    The resulting dictionary (keys 'POS_PROB', 'NEG_PROB', 'COND_POS_PROBS',
    'COND_NEG_PROBS') is written to ``models/model.pkl`` relative to the
    current working directory.
    """
    df = read_sample()
    word_classes = tokenize_classes(create_classes(df), False)

    # Flatten the per-document token lists into one token list per class.
    negative_words = []
    for doc_tokens in word_classes['NEG']:
        negative_words.extend(doc_tokens)
    positive_words = []
    for doc_tokens in word_classes['POS']:
        positive_words.extend(doc_tokens)

    dictionary = create_dictionary([negative_words, positive_words])

    # The (1, 0.0, 0.0) argument is passed through to split_data unchanged.
    neg_train = split_data(negative_words, (1, 0.0, 0.0))['train']
    pos_train = split_data(positive_words, (1, 0.0, 0.0))['train']

    negative_bow = dictionary.doc2bow(neg_train)
    positive_bow = dictionary.doc2bow(pos_train)

    # NOTE(review): the smoothing denominator adds the number of entries in
    # the class's own bag of words, not the size of the shared dictionary --
    # confirm this is the intended smoothing.
    neg_total = len(neg_train) + len(negative_bow)
    pos_total = len(pos_train) + len(positive_bow)

    # NOTE(review): the class base scores are ratios of training-token
    # counts (not document counts) -- confirm this is intended.
    train_size = len(neg_train) + len(pos_train)
    negative_prob = np.log(len(neg_train) / train_size)
    positive_prob = np.log(len(pos_train) / train_size)

    def build_table(bow, total):
        # Map each token to its id and add-one smoothed log-probability.
        table = {
            dictionary[token_id]: {
                'id': token_id,
                'logprob': np.log((count + 1) / total),
            }
            for token_id, count in bow
        }
        # Key -1 is the fallback entry used for tokens absent from the table.
        table[-1] = {'id': -1, 'logprob': np.log(1 / total)}
        return table

    negative_word_probs = build_table(negative_bow, neg_total)
    positive_word_probs = build_table(positive_bow, pos_total)

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }

    with open("models/model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
    logging.info('Model saved to artifact model.pkl')
from src.features.utils import sent_to_words
from src.features.tokenize import tokenize_classes
from src.data.prepare_data import read_sample
# ------- end of imports -------

# Script section: builds a `corpora.Dictionary` and a bag-of-words corpus
# from the sample data.
# NOTE(review): `stopwords` and `corpora` are not imported in the visible
# part of the file -- presumably they are imported above; confirm.

# Stop-word list extended with a few extra tokens.
# NOTE(review): `stop_words` is not used in the visible code -- confirm it
# is consumed elsewhere.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Load the data
data = read_sample()
data_words = list(sent_to_words(data))

# Apply tokenization
# NOTE(review): other callers in this file pass `tokenize_classes` a dict of
# class name -> documents; here it receives a list -- confirm this is valid.
data_lemmatized = tokenize_classes(data_words)
print(data_lemmatized)

# Build the dictionary
id2word = corpora.Dictionary(data_lemmatized)
print(id2word)
#for key, value in id2word.items():
#    print(key, value)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Shows the id and the frequency
#[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]
def train():
    """Train the POS/NEG log-probability model and pickle it.

    Reads the sample data, splits it into classes, tokenizes it, and builds
    add-one smoothed log-probability tables for the tokens of each class,
    logging progress after each stage. The resulting dictionary (keys
    'POS_PROB', 'NEG_PROB', 'COND_POS_PROBS', 'COND_NEG_PROBS') is written
    to ``models/model.pkl`` two directories above this module.
    """
    df = read_sample()
    logging.info('Source data file read succesfully')

    document_classes = create_classes(df)
    logging.info('Documents split between different classes.')

    logging.info('Tokenization started. This will for sure take some time.')
    word_classes = tokenize_classes(document_classes, False)
    logging.info('Tokenization completed for all documents.')

    # Flatten the per-document token lists into one token list per class.
    negative_words = []
    for doc_tokens in word_classes['NEG']:
        negative_words.extend(doc_tokens)
    positive_words = []
    for doc_tokens in word_classes['POS']:
        positive_words.extend(doc_tokens)

    dictionary = create_dictionary([negative_words, positive_words])
    logging.info('Dictionary generated from all document words.')

    # split_data is called with its default split configuration.
    neg_train = split_data(negative_words)['train']
    pos_train = split_data(positive_words)['train']

    negative_bow = dictionary.doc2bow(neg_train)
    positive_bow = dictionary.doc2bow(pos_train)
    logging.info('Counts for bag of words for documents in all classes done')

    # NOTE(review): the smoothing denominator adds the number of entries in
    # the class's own bag of words, not the size of the shared dictionary --
    # confirm this is the intended smoothing.
    neg_total = len(neg_train) + len(negative_bow)
    pos_total = len(pos_train) + len(positive_bow)

    # NOTE(review): the class base scores are ratios of training-token
    # counts (not document counts) -- confirm this is intended.
    train_size = len(neg_train) + len(pos_train)
    negative_prob = np.log(len(neg_train) / train_size)
    positive_prob = np.log(len(pos_train) / train_size)

    def build_table(bow, total):
        # Map each token to its id and add-one smoothed log-probability.
        table = {
            dictionary[token_id]: {
                'id': token_id,
                'logprob': np.log((count + 1) / total),
            }
            for token_id, count in bow
        }
        # Key -1 is the fallback entry used for tokens absent from the table.
        table[-1] = {'id': -1, 'logprob': np.log(1 / total)}
        return table

    negative_word_probs = build_table(negative_bow, neg_total)
    positive_word_probs = build_table(positive_bow, pos_total)

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }
    logging.info('Log probabilities for tokens in all classed computed')

    module_dir = os.path.dirname(os.path.abspath(__file__))
    with open(module_dir + "/../../models/model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
    logging.info('Model saved to artifact model.pkl')