Ejemplo n.º 1
0
        for i in range(limit-len(vectors)):
            vectors.append(np.zeros(w2v_model.vector_size))

    return np.concatenate(vectors)


def index2term(model_index, item_index):
    """
    Build the vocabulary term name for one Word2Vec vector component.

    :param model_index: index of the model; converted with ``str``
    :param item_index: index of the item; converted with ``str``
    :return: string of the form ``$W2V_ITEM_<model_index>_<item_index>``
    """
    template = '$W2V_ITEM_{model}_{item}'
    model_part = str(model_index)
    item_part = str(item_index)
    return template.format(model=model_part, item=item_part)


# Module-level setup: initialise logging, read the Word2Vec configuration
# file and load every enabled model into W2V_MODELS.
utils.init_logger()

# Decode the config as UTF-8 when opening it. json.load() no longer accepts
# an `encoding` keyword (removed in Python 3.9), so passing it there raises
# TypeError on current interpreters.
with io.open(configs.W2V_CONFIG, 'r', encoding='utf-8') as f:
    config = json.load(f)

CONFIG_WORD2VEC_MODELS = "w2v_models"
CONFIG_WORD2VEC_UNIONTYPE = config["union_type"]

W2V_MODELS = []
for model in config[CONFIG_WORD2VEC_MODELS]:
    model_params = config[CONFIG_WORD2VEC_MODELS][model]
    # A model is loaded only when its 'enabled' setting is the string 'true'.
    if model_params['enabled'] == 'true':
        # The model path is resolved against the directory part of
        # configs.DATA_ROOT.
        model_path = os.path.join(
            os.path.dirname(configs.DATA_ROOT), model_params['path'])
        logging.info("Loading Word2Vec model: {} ...".format(model_path))
        # NOTE(review): Word2Vec.load_word2vec_format looks like an older
        # gensim API -- confirm the installed gensim version provides it.
        W2V_MODELS.append(Word2Vec.load_word2vec_format(model_path))

# Script entry point: pass `vectorizer` (defined outside this excerpt) to
# utils.vectorization_core with init_term_vocabulary=False.
if __name__ == "__main__":
    utils.vectorization_core(vectorizer, init_term_vocabulary=False)
Ejemplo n.º 2
0
        vector {index1: value1, ... , indexN: valueN}
    """
    features = labeled_message['features']
    vector = model_features_only.feature_vectorizer(features, term_voc)

    terms = labeled_message['terms']
    for term in terms:
        index = term_voc.get_term_index(term)
        vector[index] = tf(term, terms) * idf(term, term_voc, doc_voc)

    return vector


def tf(term, doc_terms):
    """
    Calculate tf measure for a document

    :param term: term whose frequency is measured
    :param doc_terms: sequence of the document's terms; must support
        ``count`` and ``len``
    :return: number of occurrences of ``term`` divided by the number of
        terms in the document, as a float; 0.0 for an empty document
    """
    total = len(doc_terms)
    if total == 0:
        # An empty document cannot contain the term; returning 0.0 avoids
        # the ZeroDivisionError the plain ratio would raise.
        return 0.0
    return doc_terms.count(term) * 1.0 / total


def idf(term, term_voc, doc_voc):
    """
    Calculate idf measure for vocabulary

    The result is the natural logarithm of ``doc_voc.get_docs_count()``
    divided by ``doc_voc.get_term_in_docs_count(term)``.

    :param term: term passed to ``doc_voc.get_term_in_docs_count``
    :param term_voc: accepted but not used by this function
    :param doc_voc: object providing ``get_docs_count`` and
        ``get_term_in_docs_count``
    :return: the logarithm described above, as a float
    """
    # Multiply by 1.0 so the division below is a float division.
    docs_total = doc_voc.get_docs_count() * 1.0
    docs_with_term = doc_voc.get_term_in_docs_count(term)
    ratio = docs_total / docs_with_term
    return math.log(ratio)


# Script entry point: pass `vectorizer` (defined outside this excerpt) to
# utils.vectorization_core using that function's default options.
if __name__ == "__main__":
    utils.vectorization_core(vectorizer)
    features = labeled_message['features']
    vector = model_features_only.feature_vectorizer(features, term_voc)

    terms = labeled_message['terms']
    for term in terms:
        index = term_voc.get_term_index(term)
        vector[index] = tf(term, terms) * idf(term, doc_voc, '1') - \
            tf(term, terms) * idf(term, doc_voc,  '-1')

    return vector


def tf(term, terms):
    """
    Boolean tf

    :param term: term to look for
    :param terms: collection of the document's terms
    :return: 1 if ``term`` occurs in ``terms`` at least once, otherwise 0
    """
    # A membership test stops at the first match, whereas count() always
    # walks the whole sequence just to compare the result with zero.
    return 1 if term in terms else 0


def idf(term, doc_voc, sentiment):
    """
    sentiment idf measure

    Both counts are requested from ``doc_voc`` for the given ``sentiment``;
    0.5 is added to each before taking the natural logarithm of their ratio.

    :param term: term passed to ``doc_voc.get_term_in_docs_count``
    :param doc_voc: object providing ``get_docs_count`` and
        ``get_term_in_docs_count``
    :param sentiment: value forwarded to both ``doc_voc`` calls
    :return: ``log((N + 0.5) / (df + 0.5))`` as a float
    """
    docs_total = doc_voc.get_docs_count(sentiment)
    docs_with_term = doc_voc.get_term_in_docs_count(term, sentiment)
    numerator = docs_total + 0.5
    denominator = docs_with_term + 0.5
    return math.log(numerator / denominator)


# Script entry point: pass `vectorizer` (defined outside this excerpt) to
# utils.vectorization_core with merge_doc_vocabularies=True.
if __name__ == "__main__":
    utils.vectorization_core(vectorizer, merge_doc_vocabularies=True)
Ejemplo n.º 4
0
        vector {index1: value1, ... , indexN: valueN}
    """
    features = labeled_message['features']
    vector = model_features_only.feature_vectorizer(features, term_voc)

    terms = labeled_message['terms']
    for term in terms:
        index = term_voc.get_term_index(term)
        vector[index] = tf(term, terms) * idf(term, term_voc, doc_voc)

    return vector


def tf(term, doc_terms):
    """
    Calculate tf measure for a document

    :param term: term whose frequency is measured
    :param doc_terms: sequence of the document's terms; must support
        ``count`` and ``len``
    :return: number of occurrences of ``term`` divided by the number of
        terms in the document, as a float; 0.0 for an empty document
    """
    doc_length = len(doc_terms)
    if doc_length == 0:
        # Guard against ZeroDivisionError: a document without terms has a
        # frequency of zero for every term.
        return 0.0
    return doc_terms.count(term) * 1.0 / doc_length


def idf(term, term_voc, doc_voc):
    """
    Calculate idf measure for vocabulary

    Returns the natural logarithm of ``doc_voc.get_docs_count()`` divided
    by ``doc_voc.get_term_in_docs_count(term)``.

    :param term: term passed to ``doc_voc.get_term_in_docs_count``
    :param term_voc: accepted but not used by this function
    :param doc_voc: object providing ``get_docs_count`` and
        ``get_term_in_docs_count``
    :return: the logarithm described above, as a float
    """
    # The 1.0 factor makes the following division a float division.
    total_count = doc_voc.get_docs_count() * 1.0
    term_doc_count = doc_voc.get_term_in_docs_count(term)
    return math.log(total_count / term_doc_count)


# Script entry point: pass `vectorizer` (defined outside this excerpt) to
# utils.vectorization_core using that function's default options.
if __name__ == "__main__":
    utils.vectorization_core(vectorizer)
Ejemplo n.º 5
0
        for i in range(limit - len(vectors)):
            vectors.append(np.zeros(w2v_model.vector_size))

    return np.concatenate(vectors)


def index2term(model_index, item_index):
    """
    Compose the term name used for a single Word2Vec vector item.

    :param model_index: index of the model; converted with ``str``
    :param item_index: index of the item; converted with ``str``
    :return: string of the form ``$W2V_ITEM_<model_index>_<item_index>``
    """
    name_parts = {'model': str(model_index), 'item': str(item_index)}
    return '$W2V_ITEM_{model}_{item}'.format(**name_parts)


# Module-level setup: initialise logging, read the Word2Vec configuration
# file and load every enabled model into W2V_MODELS.
utils.init_logger()

# The encoding is given to io.open() so the file is decoded as UTF-8.
# json.load() lost its `encoding` keyword in Python 3.9 and raises TypeError
# when it is passed.
with io.open(configs.W2V_CONFIG, 'r', encoding='utf-8') as f:
    config = json.load(f)

CONFIG_WORD2VEC_MODELS = "w2v_models"
CONFIG_WORD2VEC_UNIONTYPE = config["union_type"]

W2V_MODELS = []
for model in config[CONFIG_WORD2VEC_MODELS]:
    model_params = config[CONFIG_WORD2VEC_MODELS][model]
    # A model is loaded only when its 'enabled' setting is the string 'true'.
    if model_params['enabled'] == 'true':
        # The model path is resolved against the directory part of
        # configs.DATA_ROOT.
        model_path = os.path.join(os.path.dirname(configs.DATA_ROOT),
                                  model_params['path'])
        logging.info("Loading Word2Vec model: {} ...".format(model_path))
        # NOTE(review): Word2Vec.load_word2vec_format looks like an older
        # gensim API -- confirm the installed gensim version provides it.
        W2V_MODELS.append(Word2Vec.load_word2vec_format(model_path))

# Script entry point: pass `vectorizer` (defined outside this excerpt) to
# utils.vectorization_core with init_term_vocabulary=False.
if __name__ == "__main__":
    utils.vectorization_core(vectorizer, init_term_vocabulary=False)