Example 1
    def test_get_answerers_strategy(self):
        question_list = [101274]
        # Strategy restricted to accepted answers (no vote threshold).
        only_accepted = data.GetAnswerersStrategy(votes_threshold=None,
                                                  verbose=3)
        # Strategy returning answers with at least 5 votes.
        all_answers_strategy = data.GetAnswerersStrategy(votes_threshold=5,
                                                         verbose=3)

        accepted_answer = only_accepted.get_answers_list(question_list)
        self.assertEqual(len(accepted_answer), 1)

        all_answers = all_answers_strategy.get_answers_list(question_list)
        self.assertEqual(len(all_answers), 3)

        # Restrict to answers posted before a cut-off timepoint.
        all_answers_before = all_answers_strategy.get_answers_list(
            question_list, before_timepoint=make_datetime("01.01.2018 12:00"))
        self.assertEqual(len(all_answers_before), 2)

        # Same cut-off, but querying two questions at once.
        all_answers_two_questions = all_answers_strategy.get_answers_list(
            question_list + [173621], make_datetime("01.01.2018 12:00"))
        self.assertEqual(len(all_answers_two_questions), 4)
Example 2
print_ttm_topics = False
raw_question_features_path = os.path.join(cache_dir,
                                          "raw_question_features.pickle")
load_feat_pairs = True
train_qu_pairs_dataframe_path = os.path.join(
    cache_dir, "train_qu_pairs_dataframe.pickle")
train_qu_targets_path = os.path.join(cache_dir, "train_qu_targets.pickle")
train_qu_qids_path = os.path.join(cache_dir, "train_qu_qids.pickle")
test_qu_pairs_dataframe_path = os.path.join(cache_dir,
                                            "test_qu_pairs_dataframe.pickle")
test_qu_targets_path = os.path.join(cache_dir, "test_qu_targets.pickle")
test_qu_qids_path = os.path.join(cache_dir, "test_qu_qids.pickle")

use_all_users_in_train = True

training_questions_start_time = make_datetime("01.01.2015 00:00")
training_questions_end_time = make_datetime("01.06.2016 00:01")
testing_questions_start_time = make_datetime("01.06.2016 00:02")
testing_questions_end_time = make_datetime("31.12.2016 23:59")

db_access = Data(verbose=3)

# TTM features
if load_question_features:
    # Reuse the cached per-question features.
    all_questions_features = pd.read_pickle(raw_question_features_path)
else:
    # Set up the TTM: fetch all questions created up to the end of the
    # training period.
    db_access.set_time_range(start=None, end=training_questions_end_time)
    posts_for_fitting_ttm = db_access.query(
        "SELECT Id as Question_Id, Body, Tags, CreationDate FROM Posts "
        "WHERE PostTypeId = {questionPostType}",
        use_macros=True)
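    # Sketch (not from the original file) of how this branch could continue:
    # fit the TTM on the fetched posts, build the per-question feature table,
    # and cache it at the same path the branch above reads from.
    # `fit_ttm_and_extract_features` is a hypothetical placeholder for the
    # project's actual TTM fitting / feature-extraction step.
    all_questions_features = fit_ttm_and_extract_features(posts_for_fitting_ttm)
    all_questions_features.to_pickle(raw_question_features_path)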
Example 3
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import os
import sys
import utils
import data_utils

# define training and testing times
success_n = 5
SPLIT_OPTION = "time"  # time or fold
TRAIN_END_DATE = data_utils.make_datetime("01.01.2015 00:01")
TEST_START_DATE = data_utils.make_datetime("01.01.2015 00:01")
TEST_END_DATE = data_utils.make_datetime("01.01.2017 00:01")
SPLIT_FOLD = 0.9

# Load data
data_dir = "burel_data"
files = sorted(os.listdir(data_dir))
print("available files:", files)
dfs = []
for f in files:
    if not f.startswith("."):  # skip hidden files such as .DS_Store
        load_csv = pd.read_csv(os.path.join(data_dir, f))
        print(load_csv.shape)
        dfs.append(load_csv)
df_read = pd.concat(dfs)
print("is sorted?", all(np.diff(df_read["decision_time"]) >= 0))

# split in train and test
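# A minimal sketch of the split step (not from the original source). For the
# "time" option, rows are assigned by comparing "decision_time" to the dates
# defined above; "decision_time" is assumed here to hold epoch seconds, which
# is an assumption about the CSV format. For the "fold" option, the first
# SPLIT_FOLD fraction of the already time-sorted rows goes to training.
if SPLIT_OPTION == "time":
    decision_dt = pd.to_datetime(df_read["decision_time"], unit="s")
    train_df = df_read[decision_dt < TRAIN_END_DATE]
    test_df = df_read[(decision_dt >= TEST_START_DATE)
                      & (decision_dt < TEST_END_DATE)]
else:
    train_df, test_df = train_test_split(df_read, train_size=SPLIT_FOLD,
                                         shuffle=False)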
Example 4
    def test_questions_at_time(self):
        dh_cached = data.DataHandleCached()
        # Smoke test: only checks that the lookup runs without raising.
        a1 = dh_cached.existing_questions_at_time(
            make_datetime("23.05.2015 12:12"))
Example 5
data_handle = data.Data()

# define which features to include --> with TTM
feature_collection = gp_features.GP_Feature_Collection(
    gp_features.GP_Features_affinity(),
    gp_features.GP_Features_TTM(),
    gp_features.GP_Features_Question(),
    gp_features.GP_Features_user())

# parameters for suggested questions
hour_threshold_suggested_answer = 24
only_open_questions_suggestable = False 
filter_nan_asker_id = True

start_time = None  # e.g. data_utils.make_datetime("01.01.2012 00:01")
end_time = data_utils.make_datetime("01.01.2016 00:01")  # e.g. data_utils.make_datetime("01.03.2012 00:01")

all_feates_collector = list()

n_candidates_collector = list()

save_every = 10000
q_a_pair_counter = 1

for i, event in enumerate(data_utils.all_answer_events_iterator(
        timedelta(days=2), start_time=start_time, end_time=end_time)):
    # Skip events where the asker or answerer id is missing.
    if np.isnan(event.answerer_user_id) or np.isnan(event.asker_user_id):
        continue

    # Report progress every 100 events (skip while the collector is still empty).
    if i % 100 == 0 and n_candidates_collector:
        avg_candidates = np.mean(n_candidates_collector)
        print(("Preptraining at {} | on average {} candidates in the last "
               "{} suggested_question_events").format(
                   event.answer_date, avg_candidates,
                   len(n_candidates_collector)))
Example 6
parser.add_argument("--only_pretraining", action="store_true")

args = parser.parse_args()

# Choose either "sklearn-GP" or "osgpr"
model_choice = args.model_choice
# For osgpr, M is the number of pseudo-points (for the sparse approximation)
M_points = args.m

if model_choice == "osgpr":
    import tensorflow as tf
    import gpflow as GPflow
    import streaming_sparse_gp.osgpr as osgpr
    import streaming_sparse_gp.osgpr_utils as osgpr_utils

start_time_online_learning = data_utils.make_datetime("01.01.2015 00:01")
hour_threshold_suggested_answer = 24
sigma = 1
beta = args.beta
n_preds = 5

## The following flags only take effect if you regenerate the all_events.pickle file, i.e. redo_database_dumps = True
time_delta_scores_after_posts = timedelta(days=2)  # only takes effect when the database dump is regenerated
filter_nan_asker = True
filter_nan_answerer = True  # i.e. skip events where the asker or answerer field is empty; candidate questions with an empty asker id are also ignored
#####


only_open_questions_suggestable = False  # if True, candidate questions are restricted to questions with no accepted answer at event time; left False because some people answer questions that already have an accepted answer

save_n_negative_suggestons = args.save_n_neg