for i in range(limit - len(vectors)):
    # pad with zero vectors so that every message yields the same dimensionality
    vectors.append(np.zeros(w2v_model.vector_size))
return np.concatenate(vectors)


def index2term(model_index, item_index):
    return '$W2V_ITEM_{model}_{item}'.format(model=str(model_index),
                                             item=str(item_index))


utils.init_logger()

with io.open(configs.W2V_CONFIG, 'r') as f:
    config = json.load(f, encoding='utf-8')

CONFIG_WORD2VEC_MODELS = "w2v_models"
CONFIG_WORD2VEC_UNIONTYPE = config["union_type"]

# Load every enabled Word2Vec model listed in the configuration file.
W2V_MODELS = []
for model in config[CONFIG_WORD2VEC_MODELS]:
    model_params = config[CONFIG_WORD2VEC_MODELS][model]
    if model_params['enabled'] == 'true':
        model_path = os.path.join(os.path.dirname(configs.DATA_ROOT),
                                  model_params['path'])
        logging.info("Loading Word2Vec model: {} ...".format(model_path))
        W2V_MODELS.append(Word2Vec.load_word2vec_format(model_path))


if __name__ == "__main__":
    utils.vectorization_core(vectorizer, init_term_vocabulary=False)
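# --- Illustrative sketch (not part of the original sources) --------------------
# The loader above parses configs.W2V_CONFIG as JSON and reads the keys
# "union_type", "w2v_models", 'enabled' and 'path'; a file with that shape could
# look roughly like the dictionary below.  The model names, paths and the
# "concat" value are hypothetical placeholders.
import json

example_w2v_config = {
    "union_type": "concat",
    "w2v_models": {
        "news_model": {"enabled": "true",  "path": "w2v/news.w2v.bin"},
        "web_model":  {"enabled": "false", "path": "w2v/web.w2v.bin"}
    }
}

print(json.dumps(example_w2v_config, indent=4))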
    vector {index1: value1, ... , indexN: valueN}
    """
    features = labeled_message['features']
    vector = model_features_only.feature_vectorizer(features, term_voc)
    terms = labeled_message['terms']
    for term in terms:
        index = term_voc.get_term_index(term)
        vector[index] = tf(term, terms) * idf(term, term_voc, doc_voc)
    return vector


def tf(term, doc_terms):
    """ Calculate tf measure for a document
    """
    return doc_terms.count(term) * 1.0 / len(doc_terms)


def idf(term, term_voc, doc_voc):
    """ Calculate idf measure for vocabulary
    """
    return math.log(doc_voc.get_docs_count() * 1.0 /
                    doc_voc.get_term_in_docs_count(term))


if __name__ == "__main__":
    utils.vectorization_core(vectorizer)
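# --- Illustrative sketch (not part of the original sources) --------------------
# A toy check of the tf and idf formulas above: tf is the relative frequency of
# the term inside one message, idf is log(total documents / documents containing
# the term).  The message and the document counts below are made up solely to
# show the arithmetic.
import math

doc_terms = ['good', 'day', 'very', 'good']
docs_count = 100        # documents in a hypothetical collection
docs_with_term = 25     # of them containing 'good'

tf_value = doc_terms.count('good') * 1.0 / len(doc_terms)    # 2 / 4 = 0.5
idf_value = math.log(docs_count * 1.0 / docs_with_term)      # log(4) ~= 1.386

print(tf_value * idf_value)                                  # ~= 0.693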
    features = labeled_message['features']
    vector = model_features_only.feature_vectorizer(features, term_voc)
    terms = labeled_message['terms']
    for term in terms:
        index = term_voc.get_term_index(term)
        vector[index] = tf(term, terms) * idf(term, doc_voc, '1') - \
            tf(term, terms) * idf(term, doc_voc, '-1')
    return vector


def tf(term, terms):
    """ Boolean tf
    """
    return 1 if terms.count(term) > 0 else 0


def idf(term, doc_voc, sentiment):
    """ Sentiment-specific idf measure
    """
    N = doc_voc.get_docs_count(sentiment)
    df = doc_voc.get_term_in_docs_count(term, sentiment)
    return math.log((N + 0.5) / (df + 0.5))


if __name__ == "__main__":
    utils.vectorization_core(vectorizer, merge_doc_vocabularies=True)
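# --- Illustrative sketch (not part of the original sources) --------------------
# The weight assigned above is a signed score: boolean tf times the difference
# between the smoothed idf computed over positive ('1') documents and the one
# computed over negative ('-1') documents, so a term that is rarer among positive
# documents than among negative ones receives a larger weight.  The class sizes
# and document frequencies below are made up solely to show the arithmetic.
import math


def smoothed_idf(docs_count, docs_with_term):
    return math.log((docs_count + 0.5) / (docs_with_term + 0.5))


terms = ['service', 'was', 'terrible']
tf_value = 1 if terms.count('terrible') > 0 else 0         # boolean tf -> 1

idf_pos = smoothed_idf(docs_count=50, docs_with_term=2)    # '1'  (positive) class
idf_neg = smoothed_idf(docs_count=50, docs_with_term=20)   # '-1' (negative) class

print(tf_value * idf_pos - tf_value * idf_neg)             # ~= 2.10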