Esempio n. 1
0
def load_yelp_data():
    """Run every Yelp loader against the business and user databases.

    Two ``DataHandler`` instances are opened ('yelp_business_database' and
    'yelp_user_database') and handed to the loaders in a fixed order:
    businesses, users, reviews, tips, stars.
    """
    biz_db = data_handler.DataHandler('yelp_business_database')
    usr_db = data_handler.DataHandler('yelp_user_database')

    # Loaders that take a single handler positionally.
    load_businesses(biz_db)
    load_users(usr_db)

    # Reviews and tips are given both handlers by keyword.
    both_handlers = {'business_handler': biz_db, 'user_handler': usr_db}
    load_reviews(**both_handlers)
    load_tips(**both_handlers)

    load_stars(usr_db)
Esempio n. 2
0
def _write_to_db(new_entity):
    """Store ``new_entity``, first removing any stored entity with its name.

    ``new_entity`` must support ``new_entity["name"]``; that value is used
    as the ``{"name": ...}`` query for both the lookup and the removal.
    """
    db = data_handler.DataHandler()
    entity_name = new_entity["name"]

    # Only issue the removal when the lookup actually found something.
    already_stored = db.get_entities({"name": entity_name})
    if already_stored:
        db.remove_entities({"name": entity_name})

    db.create_entity(new_entity)
Esempio n. 3
0
 def __init__(self, word_vec_size, num_compare_entities, db_name=SETTINGS['default_db']):
     """Set up word vectors, the database handler and entity bookkeeping.

     word_vec_size and num_compare_entities are stored unchanged on the
     instance. db_name is passed to ``DataHandler`` and defaults to
     ``SETTINGS['default_db']``.
     """
     # Plain configuration values, stored as given.
     self.word_vec_size = word_vec_size
     self.num_compare_entities = num_compare_entities

     # Word vectors are read in binary word2vec format from the configured source.
     vectors_path = SETTINGS['word_vec_source']
     self.word_vectors = KeyedVectors.load_word2vec_format(vectors_path, binary=True)

     # One handler serves both the entity count and the entity dictionary.
     db = data_handler.DataHandler(db_name)
     self.handler = db
     self.total_entity_count = db.entity_count()
     self.entity_dict = self._create_entity_dict(db)
Esempio n. 4
0
def test_embeddings_with_ids(embeds, tasks=TASKS, data_gen=None, truncate=True, embed_size=300, db='person2vec_database', callbacks=None):
    """Run the evaluation tasks on ``embeds`` and return the task results.

    Args:
        embeds: Embeddings to evaluate; passed through to ``_run_tasks``.
        tasks: Collection of task names to run. Defaults to ``TASKS``.
        data_gen: Optional pre-built training data generator. When ``None``
            a new ``EmbeddingDataGenerator(300, 4)`` is created.
        truncate: Passed through to ``_run_tasks``.
        embed_size: Passed through to ``_run_tasks``.
        db: Database name given to ``DataHandler``.
        callbacks: Optional list of callbacks passed through to
            ``_run_tasks``. ``None`` (the default) means an empty list.

    Returns:
        Whatever ``_run_tasks`` returns.
    """
    # Build a fresh list per call: a literal ``[]`` default is created once at
    # definition time and would be shared (and possibly mutated) across calls.
    if callbacks is None:
        callbacks = []

    handler = data_handler.DataHandler(db)

    # A generator can be passed in to save time; only build one when none was
    # given. Compare against None so a falsy-but-valid generator is not
    # silently replaced.
    if data_gen is None:
        data_gen = training_data_generator.EmbeddingDataGenerator(300, 4)

    # The 'biz_type' task fetches entities with the extra '_id' argument.
    if 'biz_type' in tasks:
        entities = _get_entities_from_db(handler, '_id')
    else:
        entities = _get_entities_from_db(handler)

    return _run_tasks(tasks, entities, embeds, truncate, data_gen, embed_size, callbacks)
Esempio n. 5
0
def test_word2vec(word2vec_object, tasks=TASKS, data_gen=None, embed_size=300):
    """Run the evaluation tasks using word vectors as the embeddings.

    Args:
        word2vec_object: Not referenced by this function; the word vectors
            are obtained through ``data_gen`` by the helper functions. Kept
            so existing callers keep working.
        tasks: Collection of task names to run. Defaults to ``TASKS``.
        data_gen: Optional pre-built training data generator. When ``None``
            a new ``EmbeddingDataGenerator(300, 4)`` is created.
        embed_size: Passed through to ``_run_tasks``.

    Returns:
        Whatever ``_run_tasks`` returns, matching ``test_embeddings_with_ids``
        (previously the result was discarded).
    """
    handler = data_handler.DataHandler()

    # A generator can be passed in to save time; only build one when none was
    # given. Compare against None so a falsy-but-valid generator is not
    # silently replaced.
    if data_gen is None:
        data_gen = training_data_generator.EmbeddingDataGenerator(300, 4)

    entities = _get_entities_from_db(handler)
    # Drop every entity whose name is flagged by _name_not_has_vec.
    entities = entities.drop([name for name in entities.index.values if _name_not_has_vec(name, data_gen)])
    word_vecs = _associate_names_with_word_vecs(entities, data_gen)

    # Re-key the vectors by database id instead of by name.
    word_vecs.reset_index(inplace=True)
    word_vecs['_id'] = pandas.Series([_get_id_for_name(name, handler) for name in word_vecs['index']])
    # Moving 'index' into the index first means the second set_index discards
    # it, leaving '_id' as the only index and no leftover name column.
    word_vecs.set_index('index', inplace=True)
    word_vecs.set_index('_id', inplace=True)

    return _run_tasks(tasks=tasks, entities=entities, embeds=word_vecs, data_gen=data_gen, truncate=False, embed_size=embed_size)
Esempio n. 6
0
def _build_default_model(
        num_compare_entities=DEFAULT_SETTINGS['num_compare_entities'],
        word_vec_size=DEFAULT_SETTINGS['word_vec_size']):
    """Build and compile the default two-input model.

    The model takes a float32 'word_input' of shape
    (snippet_size, word_vec_size) and an int32 'entity_input' of shape
    (num_compare_entities,). The flattened words go through a dense layer,
    are concatenated with the flattened entity embeddings, and end in a
    softmax over ``num_compare_entities`` outputs.

    Returns:
        The compiled ``Model``, using the optimizer and loss from
        ``DEFAULT_SETTINGS`` and tracking accuracy.
    """
    # Sizes that determine the input and embedding shapes.
    entity_total = data_handler.DataHandler().entity_count()
    snippet_len = DEFAULT_SETTINGS['snippet_size']
    embed_dim = DEFAULT_SETTINGS['embedding_size']

    words_in = Input(shape=(snippet_len, word_vec_size),
                     dtype='float32',
                     name='word_input')
    entities_in = Input(shape=(num_compare_entities, ),
                        dtype='int32',
                        name='entity_input')

    # Word branch, part 1: flatten the snippet.
    flat_words = Flatten()(words_in)

    # Entity branch: embed each compared entity, then flatten.
    entity_lookup = Embedding(entity_total,
                              embed_dim,
                              input_length=num_compare_entities,
                              name='entity_embedding')
    embedded_entities = entity_lookup(entities_in)
    flat_entities = Flatten()(embedded_entities)

    # Word branch, part 2: dense projection of the flattened snippet.
    sentence_dense = Dense(1000, activation="relu", name='dense_sentence_layer')
    word_features = sentence_dense(flat_words)

    # Join both branches and consolidate before the softmax output.
    merged = Concatenate(name='joint_embeds')([word_features, flat_entities])
    consolidated = Dense(1000, activation="relu", name='dense_consolidator')(merged)
    predictions = Dense(num_compare_entities,
                        activation='softmax',
                        name='final_output')(consolidated)

    network = Model([words_in, entities_in], predictions)
    network.compile(optimizer=DEFAULT_SETTINGS['optimizer'],
                    loss=DEFAULT_SETTINGS['loss'],
                    metrics=['accuracy'])
    return network
Esempio n. 7
0
def load_yelp_data():
    """Run the business, review and tip loaders on the small business database.

    A single ``DataHandler`` for 'yelp_business_database_small' is shared by
    all three loaders.
    """
    small_db = data_handler.DataHandler('yelp_business_database_small')

    load_businesses(small_db)

    # Reviews and tips take the handler by keyword.
    handler_kwarg = {'business_handler': small_db}
    load_reviews(**handler_kwarg)
    load_tips(**handler_kwarg)
Esempio n. 8
0
def main(db_name):
    """Snippetize the database, train a model, and save its embeddings.

    ``db_name`` is used both for the ``DataHandler`` and for the
    ``EmbeddingDataGenerator``.
    """
    db = data_handler.DataHandler(db_name)
    snippet_creator.snippetize_db(db)

    generator = training_data_generator.EmbeddingDataGenerator(db_name=db_name)

    # train_model returns the model together with a data generator; both are
    # handed on to the handler for saving.
    trained_model, generator = train.train_model(data_gen=generator)
    db.save_embeddings_to_db(trained_model, generator)
Esempio n. 9
0
from person2vec.generators import training_data_generator
from person2vec import data_handler
import numpy as np

# Module-level objects created at import time: a data generator built with the
# positional arguments (300, 4) and a DataHandler using its default arguments.
# NOTE(review): 300 and 4 are presumably the word-vector size and the number of
# compared entities -- confirm against EmbeddingDataGenerator's signature.
data_gen = training_data_generator.EmbeddingDataGenerator(300, 4)
handler = data_handler.DataHandler()

hello_vec = [
    -0.05419922, 0.01708984, -0.00527954, 0.33203125, -0.25, -0.01397705,
    -0.15039062, -0.265625, 0.01647949, 0.3828125, -0.03295898, -0.09716797,
    -0.16308594, -0.04443359, 0.00946045, 0.18457031, 0.03637695, 0.16601562,
    0.36328125, -0.25585938, 0.375, 0.171875, 0.21386719, -0.19921875,
    0.13085938, -0.07275391, -0.02819824, 0.11621094, 0.15332031, 0.09082031,
    0.06787109, -0.0300293, -0.16894531, -0.20800781, -0.03710938, -0.22753906,
    0.26367188, 0.012146, 0.18359375, 0.31054688, -0.10791016, -0.19140625,
    0.21582031, 0.13183594, -0.03515625, 0.18554688, -0.30859375, 0.04785156,
    -0.10986328, 0.14355469, -0.43554688, -0.0378418, 0.10839844, 0.140625,
    -0.10595703, 0.26171875, -0.17089844, 0.39453125, 0.12597656, -0.27734375,
    -0.28125, 0.14746094, -0.20996094, 0.02355957, 0.18457031, 0.00445557,
    -0.27929688, -0.03637695, -0.29296875, 0.19628906, 0.20703125, 0.2890625,
    -0.20507812, 0.06787109, -0.43164062, -0.10986328, -0.2578125, -0.02331543,
    0.11328125, 0.23144531, -0.04418945, 0.10839844, -0.2890625, -0.09521484,
    -0.10351562, -0.0324707, 0.07763672, -0.13378906, 0.22949219, 0.06298828,
    0.08349609, 0.02929688, -0.11474609, 0.00534058, -0.12988281, 0.02514648,
    0.08789062, 0.24511719, -0.11474609, -0.296875, -0.59375, -0.29492188,
    -0.13378906, 0.27734375, -0.04174805, 0.11621094, 0.28320312, 0.00241089,
    0.13867188, -0.00683594, -0.30078125, 0.16210938, 0.01171875, -0.13867188,
    0.48828125, 0.02880859, 0.02416992, 0.04736328, 0.05859375, -0.23828125,
    0.02758789, 0.05981445, -0.03857422, 0.06933594, 0.14941406, -0.10888672,
    -0.07324219, 0.08789062, 0.27148438, 0.06591797, -0.37890625, -0.26171875,
    -0.13183594, 0.09570312, -0.3125, 0.10205078, 0.03063965, 0.23632812,