Example 1
def train_ann():
    """
    Function to run ANN process:
    - Get posts from mongo
    - Vectorize posts
    - Create and train ANN model from vectorized posts
    - Save similar posts to mongo database
  """
    database_url = config['database_url']
    database_name = config['database_name']

    utils.log("ANN train", "Get posts...")
    posts = get_posts(database_url, database_name)
    posts.to_csv(config['model_path'] + "ann_posts.csv")
    utils.log("ANN train", "Prepare posts...")
    vectors, popular_tags, popular_categorical = prepare_posts(posts)
    utils.log("ANN train", "Prepare model...")
    model = create_model(vectors)
    utils.log("ANN train", "Train model...")
    train_model(model)
    os.system("ann_predict stop")
    utils.log("ANN train", "Save model...")
    joblib.dump(popular_tags, config['model_path'] + "popular_tags.pkl")
    joblib.dump(popular_categorical,
                config['model_path'] + "popular_categorical.pkl")
    vectors.to_csv(config['model_path'] + "vectors.csv")
    model.save(config['model_path'] + "similar.ann")
    unset_similar_posts(database_url, database_name)
    os.system("ann_predict start")
def predict_ffm():
    """
    Function to run prediction process:
    - Get all posts in a model
    - Get only new posts
    - Generate pairs with similar posts for each user
    - Load a model from disk
    - Get FFM predictions
    - Save recommendations to a mongo database
  """
    database_url = config['database_url']
    database = config['database_name']
    utils.log("FFM predict", "Prepare model...")
    model = ffm.read_model(config['model_path'] + "model.bin")
    mappings = joblib.load(config['model_path'] + "mappings.pkl")

    while True:
        utils.log("FFM predict", "Get posts...")
        posts = get_posts(database_url, database)
        utils.log("FFM predict", "Create dataset...")
        events = utils.get_events(database_url, database)
        dataset = create_dataset(posts, events)
        utils.log("FFM predict", "Extend events...")
        dataset = extend_events(dataset, posts)
        mappings, ffm_dataset_X, ffm_dataset_y = create_ffm_dataset(
            dataset, mappings)
        ffm_dataset = ffm.FFMData(ffm_dataset_X, ffm_dataset_y)
        dataset["prediction"] = model.predict(ffm_dataset)
        utils.log("FFM predict", "Save recommendations...")
        save_recommendations(
            dataset[["user_id", "post_permlink", "prediction"]], database_url,
            database)
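
# NOTE: the ffm module used here looks like the libffm Python wrapper (it
# exposes ffm.read_model and ffm.FFMData). In that wrapper each sample is a
# list of (field, feature_index, value) triples. A hypothetical illustration
# of the shape create_ffm_dataset is assumed to return (ids and fields are
# made up):
import ffm

X = [
    [(0, 10, 1.0), (1, 57, 1.0)],  # field 0 = user 10, field 1 = post 57
    [(0, 10, 1.0), (1, 93, 1.0)],  # same user, another post
]
y = [1.0, 0.0]  # labels / sympathy coefficients

data = ffm.FFMData(X, y)
model = ffm.read_model("model.bin")  # produced by train_ffm below
print(model.predict(data))  # one score per sample
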
def sync_accounts():
    """
    Function to sync accounts:
    - Read accounts from a CSV file
    - Replace the account collection in the mongo database
    """
    url = config['database_url']
    database = config['database_name']
    accounts_path = config['accounts_path']
    utils.log("Sync accounts", "Get accounts from a file...")
    accounts = pd.read_csv(accounts_path, names=["user_id", "name"])
    client = MongoClient(url)
    db = client[database]
    db.account.drop()
    db.account.insert_many(accounts.to_dict('records'))
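
# NOTE: DataFrame.to_dict('records') produces one dict per row, which is
# exactly the shape pymongo's insert_many expects. For example:
import pandas as pd

accounts = pd.DataFrame({"user_id": [1, 2], "name": ["alice", "bob"]})
print(accounts.to_dict('records'))
# [{'user_id': 1, 'name': 'alice'}, {'user_id': 2, 'name': 'bob'}]
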
def remove_short_texts(texts):
    """
    Function to remove short texts from a corpus
  """
    utils.log("Doc2Vec", "Find length of texts...")
    text_lengths = [len(text) for text in tqdm(texts)]
    text_length_quantile = np.percentile(np.array(text_lengths),
                                         TEXT_LENGTH_QUANTILE)
    utils.log("Doc2Vec", "Remove short texts...")
    return [text for text in texts if len(text) >= text_length_quantile]
def remove_short_words(texts):
    """
    Function to remove short words from texts
  """
    utils.log("Doc2Vec", "Find length of words...")
    word_lengths = [len(item) for sublist in tqdm(texts) for item in sublist]
    word_length_quantile = np.percentile(np.array(word_lengths),
                                         WORD_LENGTH_QUANTILE)
    utils.log("Doc2Vec", "Remove short words...")
    return [[word for word in text if len(word) >= word_length_quantile]
            for text in tqdm(texts)]
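
# NOTE: both filters above work the same way: compute a length percentile over
# the corpus and keep only the items at or above it. A toy run of the
# text-length variant, with an assumed TEXT_LENGTH_QUANTILE of 25 (the real
# constant is defined elsewhere in the project):
import numpy as np

toy_texts = ["a", "bb", "ccc", "dddd"]
cutoff = np.percentile(np.array([len(t) for t in toy_texts]), 25)  # 1.75
print([t for t in toy_texts if len(t) >= cutoff])  # ['bb', 'ccc', 'dddd']
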
Example 6
def sync_events():
    """
    Function to sync events:
    - Get events from the external events database
    - Insert them into the mongo database as raw events
    """
    url = config['database_url']
    database = config['database_name']
    events_database = config['events_database']
    utils.log("Sync events", "Get events from a database...")
    events = get_events(events_database['host'], events_database['database'],
                        events_database['user'], events_database['password'],
                        url, database)
    client = MongoClient(url)
    db = client[database]
    if len(events):
        db.raw_event.insert_many(events)
def remove_low_frequent_words(texts):
    """
    Function to remove low frequent words from texts
  """
    utils.log("Doc2Vec", "Remove low frequent words...")
    dictionary = FreqDist([item for sublist in texts for item in sublist])
    word_frequencies = list(dictionary.values())
    low_word_frequency_quantile = np.percentile(np.array(word_frequencies),
                                                LOW_WORD_FREQUENCY_QUANTILE)
    return [[
        word for word in text
        if dictionary[word] >= low_word_frequency_quantile
    ] for text in tqdm(texts)]
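
# NOTE: FreqDist comes from NLTK and behaves like collections.Counter, so
# dictionary[word] is the corpus-wide count of that word. A toy run with an
# assumed LOW_WORD_FREQUENCY_QUANTILE of 75 (the real constant is defined
# elsewhere in the project):
import numpy as np
from nltk import FreqDist

toy_texts = [["hot", "dog", "hot"], ["hot", "cat"]]
counts = FreqDist(w for text in toy_texts for w in text)  # hot: 3, dog/cat: 1
cutoff = np.percentile(np.array(list(counts.values())), 75)  # 2.0
print([[w for w in t if counts[w] >= cutoff] for t in toy_texts])
# [['hot', 'hot'], ['hot']]
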
def predict_doc2vec():
    """
    Function to run the Doc2Vec prediction process:
    - Restore the Doc2Vec model from disk
    - Fetch posts, then infer and save their document vectors in a loop
    """
    database_url = config['database_url']
    database_name = config['database_name']
    utils.log("Doc2Vec predict", "Restore model...")
    model = models.doc2vec.Doc2Vec.load(config['model_path'] +
                                        'golos.doc2vec_model')

    while True:
        utils.wait_between_iterations()
        utils.log("Doc2Vec predict", "Get posts...")
        posts = get_posts(database_url, database_name)
        if posts.shape[0] > 0:
            utils.log("Doc2Vec predict", "Prepare posts...")
            texts, usable_texts = prepare_posts(posts)
            utils.log("Doc2Vec predict", "Save inferred vectors...")
            save_document_vectors(database_url, database_name, posts, texts,
                                  model)
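
# NOTE: save_document_vectors is project code and not shown. With gensim, the
# vector for an unseen (tokenized) document comes from Doc2Vec.infer_vector;
# a minimal sketch of the inference step (file name matches the code above):
from gensim import models

d2v = models.doc2vec.Doc2Vec.load('golos.doc2vec_model')
vector = d2v.infer_vector(["sample", "post", "body"])  # len == vector_size
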
def convert_events():
    """
    Function to convert events:
    - Get raw events from the mongo database
    - Convert them to the event format used by the models
    - Replace the previously saved events
    """
    database_url = config['database_url']
    database_name = config['database_name']
    utils.log("Sync events", "Get raw events from database...")
    raw_events = get_raw_events(database_url, database_name)
    utils.log("Sync events", "Convert events...")
    events = convert_dataframe(raw_events)
    utils.log("Sync events", "Save events...")
    remove_last_events(database_url, database_name)
    save_events(database_url, database_name, events)
Example 10
def predict_ann():
    """
    Function to run the ANN prediction process:
    - Restore the model artifacts from disk
    - Fetch posts, vectorize them and save their similar posts in a loop
    """
    database_url = config['database_url']
    database_name = config['database_name']
    utils.log("ANN predict", "Restore model...")
    popular_tags = joblib.load(config['model_path'] + "popular_tags.pkl")
    popular_categorical = joblib.load(config['model_path'] +
                                      "popular_categorical.pkl")
    ann_posts = pd.read_csv(config['model_path'] + "ann_posts.csv")
    model = AnnoyIndex(estimate_number_of_features())
    model.load(config['model_path'] + 'similar.ann')

    while True:
        utils.wait_between_iterations()
        utils.log("ANN predict", "Get posts...")
        posts = get_posts(database_url, database_name)
        if posts.shape[0] > 0:
            utils.log("ANN predict", "Prepare posts...")
            vectors, popular_tags, popular_categorical = prepare_posts(
                posts, popular_tags, popular_categorical)
            utils.log("ANN predict", "Save similar posts...")
            save_similar_posts(database_url, database_name, posts, vectors,
                               ann_posts, model)
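
# NOTE: save_similar_posts is project code and not shown. With Annoy, nearest
# neighbours for a new vector come from get_nns_by_vector; a minimal sketch
# (the feature count of 64 is illustrative):
from annoy import AnnoyIndex

index = AnnoyIndex(64, 'angular')
index.load('similar.ann')  # the index saved by train_ann above
ids, dists = index.get_nns_by_vector([0.0] * 64, 10, include_distances=True)
# ids point back into ann_posts, the snapshot saved at training time
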
def train_doc2vec():
    """
    Function to run Doc2Vec process:
    - Get all posts from mongo
    - Prepare post bodies
    - Create Doc2Vec model
    - Find and save Doc2Vec vectors for each model
  """
    database_url = config['database_url']
    database_name = config['database_name']

    utils.log("Doc2Vec train", "Get posts...")
    posts = get_posts(database_url, database_name)
    utils.log("Doc2Vec train", "Prepare posts...")
    texts, usable_texts = prepare_posts(posts)
    utils.log("Doc2Vec train", "Prepare model...")
    model = create_model(usable_texts)
    os.system("doc2vec_predict stop")
    utils.log("Doc2Vec train", "Save model...")
    model.save(config['model_path'] + 'golos.doc2vec_model')
    unset_inferred_vectors(database_url, database_name)
    os.system("doc2vec_predict start")
Example 12
def train_ffm():
    """
    Function to train the FFM model:
    - Get all events from the mongo database
    - Convert them to a set of unique user-post pairs with a coefficient
      reflecting the user's sympathy
    - Extend events with post info
    - Convert events to the FFM input format
    - Build the model on a train/test split
    - Save the trained model
    """
    database_url = config['database_url']
    database = config['database_name']

    utils.log("FFM train", "Prepare events...")
    events = utils.get_events(database_url, database)

    events.to_csv(config['model_path'] + "prepared_events.csv")
    # events = pd.read_csv("prepared_events.csv").drop(["Unnamed: 0"], axis=1)

    utils.log("FFM train", "Prepare posts...")
    posts = get_posts(database_url, database, events)

    posts.to_csv(config['model_path'] + "prepared_posts.csv")
    # posts = pd.read_csv("prepared_posts.csv").drop(["Unnamed: 0"], axis=1)

    utils.log("FFM train", "Extend events...")
    events = extend_events(events, posts)

    utils.log("FFM train", "Save events...")
    events.to_csv(config['model_path'] + "extended_events.csv")
    # events = pd.read_csv("extended_events.csv").drop(["Unnamed: 0"], axis=1)

    utils.log("FFM train", "Create ffm dataset...")
    mappings, X, y = create_ffm_dataset(events)
    joblib.dump(X, config['model_path'] + "X.pkl")
    joblib.dump(y, config['model_path'] + "y.pkl")
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3)
    utils.log("FFM train", "Build model...")
    model, train_auc_roc, test_auc_roc = build_model(train_X, train_y,
                                                     test_X, test_y)
    utils.log("FFM train", train_auc_roc)
    utils.log("FFM train", test_auc_roc)
    os.system("ffm_predict stop")
    model.save_model(config['model_path'] + "model.bin")
    joblib.dump(mappings, config['model_path'] + "mappings.pkl")
    os.system("ffm_predict start")