def test_get_answerers_strategy(self):
    question_list = [101274]
    only_accepted = data.GetAnswerersStrategy(votes_threshold=None, verbose=3)
    all_strategy = data.GetAnswerersStrategy(votes_threshold=5, verbose=3)

    # a single accepted answer is expected for the question
    accepted_answer = only_accepted.get_answers_list(question_list)
    self.assertEqual(len(accepted_answer), 1)

    # the threshold-based strategy should return all three answers
    all_answers = all_strategy.get_answers_list(question_list)
    self.assertEqual(len(all_answers), 3)

    # restricting to answers posted before the timepoint drops one answer
    all_answers_before = all_strategy.get_answers_list(
        question_list, before_timepoint=make_datetime("01.01.2018 12:00"))
    self.assertEqual(len(all_answers_before), 2)

    # a second question contributes two more answers before the same timepoint
    all_answers_two_questions = all_strategy.get_answers_list(
        question_list + [173621], before_timepoint=make_datetime("01.01.2018 12:00"))
    self.assertEqual(len(all_answers_two_questions), 4)
print_ttm_topics = False
raw_question_features_path = os.path.join(cache_dir, "raw_question_features.pickle")

load_feat_pairs = True
train_qu_pairs_dataframe_path = os.path.join(
    cache_dir, "train_qu_pairs_dataframe.pickle")
train_qu_targets_path = os.path.join(cache_dir, "train_qu_targets.pickle")
train_qu_qids_path = os.path.join(cache_dir, "train_qu_qids.pickle")
test_qu_pairs_dataframe_path = os.path.join(cache_dir, "test_qu_pairs_dataframe.pickle")
test_qu_targets_path = os.path.join(cache_dir, "test_qu_targets.pickle")
test_qu_qids_path = os.path.join(cache_dir, "test_qu_qids.pickle")

use_all_users_in_train = True

# time windows for training and testing questions
training_questions_start_time = make_datetime("01.01.2015 00:00")
training_questions_end_time = make_datetime("01.06.2016 00:01")
testing_questions_start_time = make_datetime("01.06.2016 00:02")
testing_questions_end_time = make_datetime("31.12.2016 23:59")

db_access = Data(verbose=3)

# TTM FEATURES
if load_question_features:
    # reuse previously computed question features from the cache
    all_questions_features = pd.read_pickle(raw_question_features_path)
else:
    # SET UP TTM: fit only on posts created before the end of the training window
    db_access.set_time_range(start=None, end=training_questions_end_time)
    posts_for_fitting_ttm = db_access.query(
        "SELECT Id as Question_Id, Body, Tags, CreationDate "
        "FROM Posts WHERE PostTypeId = {questionPostType}",
        use_macros=True)
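# Hypothetical helper (not part of the original script): once the TTM-based
# question features have been computed in the else-branch above, persisting
# them to ``raw_question_features_path`` lets later runs set
# load_question_features = True and reuse the cache instead of re-querying the
# database and re-fitting the TTM. A minimal sketch, assuming the features end
# up in a pandas DataFrame like the one read back by pd.read_pickle:
def _cache_question_features(features_df, path=raw_question_features_path):
    # pandas' to_pickle/read_pickle pair preserves the index and dtypes, so the
    # reloaded DataFrame can be used exactly like the freshly computed one
    features_df.to_pickle(path)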
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import os
import sys

import utils
import data_utils

# define training and testing times
success_n = 5
SPLIT_OPTION = "time"  # "time" or "fold"
TRAIN_END_DATE = data_utils.make_datetime("01.01.2015 00:01")
TEST_START_DATE = data_utils.make_datetime("01.01.2015 00:01")
TEST_END_DATE = data_utils.make_datetime("01.01.2017 00:01")
SPLIT_FOLD = 0.9

# Load data
data_dir = "burel_data"
files = sorted(os.listdir(data_dir))
print("available files:", files)

dfs = []
for f in files:
    if not f.startswith("."):
        load_csv = pd.read_csv(os.path.join(data_dir, f))
        print(load_csv.shape)
        dfs.append(load_csv)
df_read = pd.concat(dfs)
print("is sorted?", all(np.diff(df_read["decision_time"]) >= 0))

# split in train and test
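# A minimal sketch (assumption, not the original split code) of how the two
# SPLIT_OPTION modes configured above could be realised. It assumes that the
# "decision_time" column is comparable with the make_datetime constants (i.e.
# already parsed to datetimes) and that the rows are time-sorted, which the
# "is sorted?" check above verifies.
if SPLIT_OPTION == "time":
    # time-based split: train on everything before TRAIN_END_DATE, test on the
    # window [TEST_START_DATE, TEST_END_DATE)
    train_df = df_read[df_read["decision_time"] < TRAIN_END_DATE]
    test_df = df_read[(df_read["decision_time"] >= TEST_START_DATE)
                      & (df_read["decision_time"] < TEST_END_DATE)]
else:
    # fold-based split: the first SPLIT_FOLD fraction of the time-sorted rows
    # becomes the training set, the remainder the test set
    cutoff = int(len(df_read) * SPLIT_FOLD)
    train_df = df_read.iloc[:cutoff]
    test_df = df_read.iloc[cutoff:]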
def test_questions_at_time(self):
    # smoke test: the cached handle should return the set of questions that
    # existed at the given point in time without raising
    dh_cached = data.DataHandleCached()
    a1 = dh_cached.existing_questions_at_time(
        make_datetime("23.05.2015 12:12"))
data_handle = data.Data()

# define which features to include --> with TTM
feature_collection = gp_features.GP_Feature_Collection(
    gp_features.GP_Features_affinity(),
    gp_features.GP_Features_TTM(),
    gp_features.GP_Features_Question(),
    gp_features.GP_Features_user())

# parameters for suggested questions
hour_threshold_suggested_answer = 24
only_open_questions_suggestable = False
filter_nan_asker_id = True
start_time = None  # data_utils.make_datetime("01.01.2012 00:01")
end_time = data_utils.make_datetime("01.01.2016 00:01")  # data_utils.make_datetime("01.03.2012 00:01")

all_feates_collector = list()
n_candidates_collector = list()
save_every = 10000
q_a_pair_counter = 1

for i, event in enumerate(data_utils.all_answer_events_iterator(
        timedelta(days=2), start_time=start_time, end_time=end_time)):
    # skip events where either the asker or the answerer id is missing
    if np.isnan(event.answerer_user_id) or np.isnan(event.asker_user_id):
        continue
    if i % 100 == 0:
        avg_candidates = np.mean(n_candidates_collector)
        print("Pretraining at {} | on average {} candidates in the last {} suggested_question_events".format(
            event.answer_date, avg_candidates, len(n_candidates_collector)))
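    # Sketch (assumption, not the original loop body): ``save_every`` and
    # ``q_a_pair_counter`` above suggest that the collected feature rows are
    # flushed to disk periodically so a long pre-training run can be resumed.
    # A minimal version of that checkpointing step, assuming pandas is
    # available as ``pd`` and using a hypothetical output filename:
    if q_a_pair_counter % save_every == 0:
        checkpoint_path = "pretraining_features_{}.pickle".format(q_a_pair_counter)
        pd.DataFrame(all_feates_collector).to_pickle(checkpoint_path)
        print("saved {} collected feature rows to {}".format(
            len(all_feates_collector), checkpoint_path))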
parser.add_argument("--only_pretraining", action="store_true")
args = parser.parse_args()

# Choose either "sklearn-GP" or "osgpr"
model_choice = args.model_choice
# For osgpr, M is the number of pseudo-points (for the sparse approximation)
M_points = args.m

if model_choice == "osgpr":
    import tensorflow as tf
    import gpflow as GPflow
    import streaming_sparse_gp.osgpr as osgpr
    import streaming_sparse_gp.osgpr_utils as osgpr_utils

start_time_online_learning = data_utils.make_datetime("01.01.2015 00:01")
hour_threshold_suggested_answer = 24
sigma = 1
beta = args.beta
n_preds = 5

## The following flags only take effect if you redo the all_events.pickle file, i.e. redo_database_dumps = True
time_delta_scores_after_posts = timedelta(days=2)  # only takes effect with a new database
# skip events where the asker or answerer field is empty; candidate questions
# with an empty asker id are ignored as well
filter_nan_asker = True
filter_nan_answerer = True
#####

# if True, candidate questions only contain questions which have no accepted
# answer at event time -> some people answer questions that already have an
# accepted answer
only_open_questions_suggestable = False
save_n_negative_suggestons = args.save_n_neg
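# Hypothetical reference (not in the original file): the ``args.*`` accesses in
# this script imply an argument parser roughly like the one below, presumably
# defined near the top of the script. The flag names are taken from this file;
# types, defaults, and help strings are assumptions.
def _sketch_arg_parser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_choice", type=str, default="sklearn-GP",
                        help='either "sklearn-GP" or "osgpr"')
    parser.add_argument("--m", type=int, default=100,
                        help="number of pseudo-points for the sparse GP approximation")
    parser.add_argument("--beta", type=float, default=1.0)
    parser.add_argument("--save_n_neg", type=int, default=5,
                        help="how many negative suggestions to keep per event")
    parser.add_argument("--only_pretraining", action="store_true")
    return parser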