def main():
    usage_str = 'Usage:\n' \
                '1. Dir for pairs rdd.\n' \
                '2. Dir for matrix output - should also contain the info_dict json with num of rows and cols.\n' \
                '3. Name of output matrix.'
    if (len(sys.argv) != 4):
        print(usage_str)
        return
    input_rdd_dir = sys.argv[1]
    output_dir = sys.argv[2]
    out_name = sys.argv[3]
    conf = SparkConf().set("spark.driver.maxResultSize", "30G"). \
        set("spark.hadoop.validateOutputSpecs", "false"). \
        set('spark.default.parallelism', '100')
    spark = SparkContext.getOrCreate(conf=conf)
    rdd = load_three_tuple_rdd(spark, input_rdd_dir)
    info_dict = load_dict(add_slash_to_dir(output_dir) + 'info_dict.json')
    n_rows = int(info_dict['rows'])
    n_cols = int(info_dict['cols'])
    result_mat = tuples_rdd_to_csr(rdd, (n_rows, n_cols))
    f1 = open(add_slash_to_dir(output_dir) + out_name + '_sparse_scipy.pickle', mode='wb')
    pickle.dump(result_mat, f1)
    f1.close()
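# Sketch (assumption, not the repo's implementation): tuples_rdd_to_csr is used above as if it turns an RDD of
# (row_index, col_index, value) tuples into a scipy CSR matrix of the given shape. A minimal driver-side version
# under that assumption:
def tuples_rdd_to_csr_sketch(rdd, shape):
    import numpy as np
    from scipy.sparse import csr_matrix
    triples = rdd.collect()  # assumes the matrix fits in driver memory
    rows = np.array([t[0] for t in triples], dtype=np.int64)
    cols = np.array([t[1] for t in triples], dtype=np.int64)
    vals = np.array([t[2] for t in triples], dtype=np.float64)
    return csr_matrix((vals, (rows, cols)), shape=shape)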
def main():
    usage_str = 'Receives a matrix in rdd form and outputs a list of holdout pairs.\n' \
                'Args:\n' \
                '1. Dir of matrix in HDFS\n' \
                '2. Output dir (non-HDFS)\n' \
                '3. -b for both nonzeros and zeros, -nz for nonzeros only. (any other input will default to -nz)'
    if (len(sys.argv) != 4):
        print(usage_str)
        return
    hdfs_dir = sys.argv[1]
    output_dir = sys.argv[2]
    nonzeros_option = sys.argv[3]
    conf = SparkConf().set("spark.driver.maxResultSize", "4G").\
        set('spark.default.parallelism', '200')
    spark = SparkContext(conf=conf)
    test_rdd_matrix = load_count_vector_matrix(
        spark, hdfs_dir).map(lambda x: x[1]).zipWithIndex()
    if (nonzeros_option == '-b'):
        holdout_rdd = test_rdd_matrix.map(
            lambda x: holdout_row_both(x[0], x[1]))
    else:
        holdout_rdd = test_rdd_matrix.map(
            lambda x: holdout_row_nonzero(x[0], x[1]))
    holdout_pairs = holdout_rdd.reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
    # The pickle output must be written in binary mode so it can be read back with mode='rb'.
    pickle.dump(
        holdout_pairs,
        open(add_slash_to_dir(output_dir) + 'holdout_pairs.pkl', mode='wb'))
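# Sketch (assumption, not the repo's implementation): holdout_row_nonzero(row, row_index) is used above as if it
# samples some entries of the row to hold out and returns ([row_index, ...], [col_index, ...]), so that the reduce
# step concatenates the per-row lists into one (rows, cols) pair. holdout_size is a hypothetical parameter here;
# the repo presumably uses its own default (e.g. DEFAULT_HOLDOUT_SIZE). holdout_row_both would additionally sample
# zero entries.
def holdout_row_nonzero_sketch(sparse_row, row_index, holdout_size=5):
    import random
    nonzero_cols = [int(c) for c in sparse_row.indices]
    chosen = random.sample(nonzero_cols, min(holdout_size, len(nonzero_cols)))
    return [row_index] * len(chosen), chosen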
def main():
    usage_str = 'Gets the name of the .csv file containing names and ids, and creates and saves the doc id to ' \
                'doc name dictionary.\n' \
                'Args:\n' \
                '1. Input file name.\n' \
                '2. Output dir.'
    if len(sys.argv) != 3:
        print(usage_str)
        return
    input_filename = sys.argv[1]
    output_dir = sys.argv[2]
    dict_result = get_id_name_dict(input_filename)
    print('Dict generated!')
    make_sure_path_exists(add_slash_to_dir(output_dir))
    with open(add_slash_to_dir(output_dir) + 'docid_docname.json', 'w') as f:
        json.dump(dict_result, f)
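# Sketch (assumption, not the repo's implementation): get_id_name_dict is treated here as reading the names/ids
# .csv and returning a {doc_id: doc_name} dict. The column order below (name first, id second) is a guess.
def get_id_name_dict_sketch(filename):
    import csv
    id_to_name = {}
    with open(filename, mode='r') as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                name, doc_id = row[0], row[1]
                id_to_name[doc_id] = name
    return id_to_name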
def main():
    if (len(sys.argv) != 2):
        print(
            'Please give the input directory containing the histogram data (the output dir of create_doc_edit_histograms.py).'
        )
        return
    input_dir = sys.argv[1]
    doc_lengths = pickle.load(
        open(add_slash_to_dir(input_dir) + 'doc_lengths.pkl', mode='rb'))
    doc_edit_total = pickle.load(
        open(add_slash_to_dir(input_dir) + 'doc_edit_total.pkl', mode='rb'))
    doc_edit_distinct = pickle.load(
        open(add_slash_to_dir(input_dir) + 'doc_edit_distinct.pkl', mode='rb'))
    user_edit_total = pickle.load(
        open(add_slash_to_dir(input_dir) + 'user_edit_total.pkl', mode='rb'))
    doc_keep_list = [
        x for x in doc_lengths if doc_lengths[x] >= DOC_STUB_THRESH
    ]
    user_keep_list = [x for x in user_edit_total]
    print('Doc total edit average after stub removal')
    print(sum([doc_edit_total[x] for x in doc_keep_list]) / len(doc_keep_list))
    print('Doc distinct edit average after stub removal')
    print(
        sum([doc_edit_distinct[x] for x in doc_keep_list]) / len(doc_keep_list))
    print('Length of doc keep list:')
    print(len(doc_keep_list))
    print('Length of user keep list:')
    print(len(user_keep_list))
    pickle.dump(
        user_keep_list,
        open(add_slash_to_dir(input_dir) + 'user_keep_list.pkl', mode='wb'))
    pickle.dump(
        doc_keep_list,
        open(add_slash_to_dir(input_dir) + 'doc_keep_list.pkl', mode='wb'))
def main():
    usage_str = 'Creates a user-doc count-vectorised matrix out of revision history data. Can do a train-test split. ' \
                'Has two filtering schemes, which are determined by the first argument. If the first arg is -old, the ' \
                'old approach is used, which is deprecated and will be removed. If it is -new, the new approach ' \
                'is used. For -old, the args are:\n' \
                '\t1. The input file.\n' \
                '\t2. The directory for the RDD output.\n' \
                '\t3. The directory for the json dictionary and test id list (if to be saved).\n' \
                '\t4. The name of the non-admin pages file.\n' \
                '\t5. Name of bot names file to filter them out, -none for no such filtering\n' \
                '\t6. Name of list of docs to discard, -none for no such thing\n' \
                '\t7. Train/test split fraction. Must be a float in the range [0,1). If it\'s 0, no split is performed.\n' \
                '\t8. If it is -f then inactive users are filtered, settings in constants.py. -nf for no filtering.\n' \
                'For -new, the args are:\n' \
                '\t1. The input file.\n' \
                '\t2. The directory for the RDD output.\n' \
                '\t3. The directory for the json dictionary and test id list (if to be saved).\n' \
                '\t4. Name of doc keep-list. -none to keep all.\n' \
                '\t5. Name of user keep-list. -none to keep all.\n' \
                '\t6. Train/test split fraction. Must be a float in the range [0,1). If 0, no split is performed.\n' \
                'In this case, user and doc lower thresholds and the doc upper threshold are set in constants.py.'
    if (len(sys.argv) < 2):
        print(usage_str)
        return
    if (sys.argv[1] == '-old'):
        old_mode = True
        if len(sys.argv) != 10:
            print(usage_str)
            return
        filter_inactives = False
        input_filename = sys.argv[2]
        output_dir_rdd = sys.argv[3]
        output_dir_dict = sys.argv[4]
        nonadmin_pages_filename = sys.argv[5]
        bots_filename = sys.argv[6]
        discard_list_filename = sys.argv[7]
        if (bots_filename == '-none'):
            bots_filename = None
        if (discard_list_filename == '-none'):
            discard_list_filename = None
        if (nonadmin_pages_filename == '-none'):
            nonadmin_pages_filename = None
        split_frac = 0
        try:
            split_frac = float(sys.argv[8])
            if (split_frac > 1 or split_frac < 0):
                print(usage_str)
                return
        except:
            print(usage_str)
            return
        if (sys.argv[9] == '-f'):
            filter_inactives = True
        elif (sys.argv[9] == '-nf'):
            filter_inactives = False
        else:
            print(usage_str)
            return
    else:
        # TODO Add an option for getting an edit count matrix vs getting an edit size matrix.
        # Edit count option just uses the existing code, while edit size matrix uses the new code.
        old_mode = False
        if len(sys.argv) != 8:
            print(usage_str)
            return
        input_filename = sys.argv[2]
        output_dir_rdd = sys.argv[3]
        output_dir_dict = sys.argv[4]
        doc_keep_list = sys.argv[5]
        user_keep_list = sys.argv[6]
        if (doc_keep_list == '-none'):
            doc_keep_list = None
        if (user_keep_list == '-none'):
            user_keep_list = None
        split_frac = 0
        try:
            split_frac = float(sys.argv[7])
            if (split_frac > 1 or split_frac < 0):
                print(usage_str)
                return
        except:
            print(usage_str)
            return
    spark = SparkContext.getOrCreate()
    input_rdd = tsv_to_rdd(spark, input_filename)
    if (old_mode):
        filtered_rdd = filter_user_doc_data(
            spark,
            input_rdd,
            pages_filename=nonadmin_pages_filename,
            user_freq_filtration=filter_inactives,
            bots_filename=bots_filename,
            doc_discard_list_filename=discard_list_filename)
    else:
        filtered_rdd = iterative_keeplist_filtering(
            spark,
            input_rdd,
            user_keep_list_filename=user_keep_list,
            doc_keep_list_filename=doc_keep_list)
    print('**********************Filtering complete************************')
    col_dict, count_vec_matrix = create_user_doc_count(spark, filtered_rdd, removeAnonymous=True)
    print(
        '******************Data matrix creation completed!*************************'
    )
    train_ids_list = None
    test_ids_list = None
    if (split_frac != 0):
        train_ids_list, test_ids_list = train_test_split_ids(
            spark, count_vec_matrix, test_frac=split_frac, id_col=0)
    save_dict(output_dir_dict, col_dict)
    if (split_frac == 0):
        save_rdd_mat(output_dir_rdd, count_vec_matrix)
    else:
        train_count_vec = get_sub_rdd_by_id_list(spark, count_vec_matrix, train_ids_list)
        test_count_vec = get_sub_rdd_by_id_list(spark, count_vec_matrix, test_ids_list)
        save_rdd_mat(
            add_slash_to_dir(output_dir_rdd) + 'train', train_count_vec)
        save_rdd_mat(add_slash_to_dir(output_dir_rdd) + 'test', test_count_vec)
    if (test_ids_list is not None):
        save_id_list([int(test_id) for test_id in test_ids_list], output_dir_dict)
    n_rows = count_vec_matrix.count()
    n_cols = count_vec_matrix.first()[1].size
    info_dict = {
        'rows': n_rows,
        'cols': n_cols,
        'split fraction': split_frac,
        'doc_filter_thresh': DOC_INACTIVITY_THRESH,
        'user_inactivity_thresh': USER_INACTIVITY_THRESH,
        'user_bot_thresh': USER_BOT_THRESH,
        'doc_stub_thresh': DOC_STUB_THRESH
    }
    save_dict(output_dir_dict, info_dict, 'info_dict.json')
    print('Total number of rows:\n' + str(n_rows) + '\nTotal number of cols:\n' +
          str(n_cols))
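# Sketch (assumption, not the repo's implementation): train_test_split_ids is called above as if it splits the
# row ids of the (id, SparseVector) matrix into train and test id lists. A minimal version under that assumption:
def train_test_split_ids_sketch(spark, rdd_matrix, test_frac=0.2, id_col=0):
    # spark is kept only to mirror the call signature used above.
    ids = rdd_matrix.map(lambda x: x[id_col])
    train_ids, test_ids = ids.randomSplit([1.0 - test_frac, test_frac])
    return train_ids.collect(), test_ids.collect()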
def main():
    usage_str = 'Gets a scipy sparse matrix in pickle form, calculates its k top singular vectors ' \
                '(w/o mean-centering).\n' \
                'Args:\n' \
                '1. Input file dir\n' \
                '2. Number of singular vectors desired (max 1000, default 20)\n' \
                '3. -w for inverse row freq weighting, -n for normal\n' \
                '4. List of rows to keep (-none for none)\n' \
                '5. -b to binarise the input, -nb to leave it be.\n' \
                '6. Name of the input file.\n' \
                '7. Whether or not to transpose the original matrix. -t to transpose, -n otherwise.'
    if (len(sys.argv) != 8):
        print(usage_str)
        return
    input_dir = sys.argv[1]
    k = 20
    try:
        k = int(sys.argv[2])
    except:
        k = 20
    if (k > 1000 or k < 0):
        k = 20
    do_weighting = False
    if (sys.argv[3] == '-w'):
        do_weighting = True
    elif (sys.argv[3] == '-n'):
        do_weighting = False
    else:
        print(usage_str)
        return
    keep_users_filename = sys.argv[4]
    binarise = False
    if (sys.argv[5] == '-b'):
        binarise = True
    elif (sys.argv[5] == '-nb'):
        binarise = False
    else:
        print(usage_str)
        return
    if (keep_users_filename != '-none'):
        keep_rows = np.array(pickle.load(open(keep_users_filename, mode='rb')))
    else:
        keep_rows = None
    input_filename = sys.argv[6]
    transpose_input = False
    if sys.argv[7] == '-t':
        transpose_input = True
    elif sys.argv[7] == '-n':
        transpose_input = False
    else:
        print(usage_str)
        return
    in_file = open(add_slash_to_dir(input_dir) + input_filename, mode='rb')
    X = pickle.load(in_file)
    if (keep_rows is not None):
        X = X[keep_rows, :]
    if transpose_input:
        X = X.transpose()
    if (binarise):
        X = csr_matrix((np.array([1] * len(X.data)), X.indices, X.indptr),
                       shape=X.shape)
    if (do_weighting):
        X = inv_row_freq_weighting(X)
    singular_vecs = get_singular_vecs(X, k)
    f1 = open(add_slash_to_dir(input_dir) + 'v_sing_vecs.pkl', mode='wb')
    pickle.dump(singular_vecs, f1)
    f1.close()
    print('V singular vecs saved')
    f2 = open(add_slash_to_dir(input_dir) + 'utimessigma_sing_vecs.pkl', mode='wb')
    utimessigma = X.dot(singular_vecs[:, 1:])
    pickle.dump(utimessigma, f2)
    f2.close()
    print('U*Sigma saved')
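# Sketch (assumption, not the repo's implementation): inv_row_freq_weighting is read here as an IDF-style
# reweighting that down-weights columns which are nonzero in many rows. A minimal version under that assumption:
def inv_row_freq_weighting_sketch(X):
    """Scale each column of a CSR matrix by log(n_rows / number of rows in which that column is nonzero)."""
    import numpy as np
    from scipy.sparse import csr_matrix, diags
    X = csr_matrix(X)
    n_rows = X.shape[0]
    # Per-column count of stored (nonzero) entries.
    row_freq = np.bincount(X.indices, minlength=X.shape[1])
    weights = np.log(n_rows / np.maximum(row_freq, 1).astype(float))
    return X.dot(diags(weights)).tocsr()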
def main():
    usage_str = '1. Concept-doc file name, -none if there isn\'t any.\n' \
                '2. Concept-word-count file name\n' \
                '3. Output rdd dir\n' \
                '4. Dir for doc filtering dict, optional\n' \
                '5. Dir for output dict'
    if (len(sys.argv) != 6):
        print(usage_str)
        return
    concept_doc_filename = sys.argv[1]
    concept_term_filename = sys.argv[2]
    output_dir_rdd = sys.argv[3]
    filter_dict_dir = sys.argv[4]
    output_dir_dict = sys.argv[5]
    conf = SparkConf().set("spark.driver.maxResultSize", "2G").\
        set("spark.hadoop.validateOutputSpecs", "false").\
        set('spark.default.parallelism', '100')
    spark = SparkContext.getOrCreate(conf=conf)
    # Making the concept-to-doc map, used to convert concept ids in the concept-term rdd to doc ids... or not
    # (if no concept-doc file is given).
    c2dmap_bc = None
    if concept_doc_filename != '-none':
        rdd1 = tsv_to_rdd(spark, concept_doc_filename)
        concept_to_doc_map = generate_concept_doc_dict(rdd1)
        c2dmap_bc = spark.broadcast(concept_to_doc_map)
    # Loading the concept-term rdd and mapping the concepts to docs.
    rdd2 = tsv_to_rdd(spark, concept_term_filename)
    rdd2 = rdd2.map(lambda x: (int(x[0]), (int(x[1]), int(x[2]))))
    if c2dmap_bc is not None:
        rdd2 = rdd2.map(lambda x: (c2dmap_bc.value[x[0]], x[1]))
    doc_row_dict = None
    # Here we filter the docs if a dict of ids and indices of docs we want to keep is given as input
    # (e.g. when docs are already columns of a user-doc matrix).
    if (filter_dict_dir != '-none'):
        filtering_dict = intify_dict(load_dict(add_slash_to_dir(filter_dict_dir) + 'col_dict.json'))
        filter_dict_bc = spark.broadcast(filtering_dict)
        rdd2 = rdd2.filter(lambda x: x[0] in filter_dict_bc.value)
        rdd2 = rdd2.map(lambda x: (filter_dict_bc.value[x[0]], x[1]))
        # Very important: filtering with a dict means that the number of docs here must be
        # equal to the number of docs in that dict (for matrix multiplication).
        docs_num = len(filtering_dict)
    else:
        doc_row_dict = rdd2.map(lambda x: x[0]).distinct().zipWithIndex().collectAsMap()
        doc_row_dict_bc = spark.broadcast(doc_row_dict)
        rdd2 = rdd2.map(lambda x: (doc_row_dict_bc.value[x[0]], x[1]))
        docs_num = len(doc_row_dict)
    # Now we calculate word frequencies. This will be used both for filtering and for calculating IDF.
    word_doc_freq = rdd2.map(lambda x: (x[1][0], 1)).reduceByKey(add).collectAsMap()
    # Filtering now.
    doc_freq_bc = spark.broadcast(word_doc_freq)
    rdd2 = rdd2.filter(lambda x: doc_freq_bc.value[x[1][0]] < DOC_FREQ_UPPER_REL * docs_num
                       and doc_freq_bc.value[x[1][0]] > DOC_FREQ_LOWER_ABS)
    # Now we compute TF-IDF, so instead of doc-word-count tuples we have doc-word-tf_idf tuples.
    # Entries with a tf_idf of 0 are also filtered out.
    tf_idf_pairs = rdd2.map(lambda x: (x[0], x[1][0],
                                       np.log(1 + x[1][1]) * np.log(docs_num / doc_freq_bc.value[x[1][0]])))
    tf_idf_pairs = tf_idf_pairs.filter(lambda x: x[2] > 0)
    # Now we map the word ids of the rdd to word indices, which are actually column indices for a
    # doc-term matrix.
    word_index_dict = tf_idf_pairs.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()
    word_index_d_bc = spark.broadcast(word_index_dict)
    tf_idf_pairs = tf_idf_pairs.map(lambda x: (x[0], word_index_d_bc.value[x[1]], x[2]))
    words_num = len(word_index_dict)
    total_count = tf_idf_pairs.count()
    #count_vec_matrix = tf_idf_pairs.reduceByKey(add).map(lambda x: (x[0], SparseVector(words_num, {a[0]:a[1] for a in x[1]})))
    # Now saving.
    tf_idf_pairs.saveAsTextFile(add_slash_to_dir(output_dir_rdd))
    print('************************Saved RDD, now saving dicts*****************************')
    info_dict = {'rows': docs_num, 'cols': words_num, 'vals': total_count,
                 'upper_df_thresh': DOC_FREQ_UPPER_REL, 'lower_df_thresh': DOC_FREQ_LOWER_ABS}
    save_dict(output_dir_dict, info_dict, 'info_dict.json')
    save_dict(output_dir_dict, word_index_dict)
    if (doc_row_dict is not None):
        save_dict(output_dir_dict, doc_row_dict, 'row_dict.json')
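# For reference, the weighting computed above is (notation mine):
#   tf_idf(d, w) = log(1 + count(d, w)) * log(N_docs / df(w))
# where N_docs is docs_num, df(w) is the number of docs containing w (word_doc_freq, assuming one input row per
# doc-word pair), and words with df(w) >= DOC_FREQ_UPPER_REL * N_docs or df(w) <= DOC_FREQ_LOWER_ABS are dropped
# before the weighting step.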
def main():
    usage_str = 'Takes an rdd matrix, converts it into a csr matrix. The rdd matrix can either be (id, SparseVector), ' \
                'in which case we use indices instead of the ids, or already indexed as (index, SparseVector), and ' \
                'in the latter case there is also the option of filtering the rows using a dictionary ' \
                '(this is for the doc-term matrix), which maps ' \
                'the ids to indices. We also save the row dict.\n' \
                '1. rdd dir name\n' \
                '2. output dir for matrix\n' \
                '3. -b for binarise, -n otherwise\n' \
                '4. name for the output file\n' \
                '5. dir for doc filtering dict, optional, -invert for special case of already indexed input rdd'
    if (len(sys.argv) < 5 or len(sys.argv) > 6):
        print(usage_str)
        return
    input_rdd_dir = sys.argv[1]
    output_dir = sys.argv[2]
    to_bin = True
    if (sys.argv[3] == '-n'):
        to_bin = False
    elif (sys.argv[3] == '-b'):
        to_bin = True
    else:
        print(usage_str)
        return
    out_name = sys.argv[4]
    input_dict_dir = None
    already_indexed = False
    invert_rdd = False
    if (len(sys.argv) == 6):
        input_dict_dir = sys.argv[5]
        if input_dict_dir == '-invert':
            already_indexed = True
            invert_rdd = True
            input_dict_dir = None
    conf = SparkConf().set("spark.driver.maxResultSize", "30G").\
        set("spark.hadoop.validateOutputSpecs", "false").\
        set('spark.default.parallelism', '100')
    spark = SparkContext.getOrCreate(conf=conf)
    count_vec_matrix = load_count_vector_matrix(spark, add_slash_to_dir(input_rdd_dir) + 'count_vector_matrix')
    if (to_bin):
        count_vec_matrix = binarise_rdd(count_vec_matrix)
    if (input_dict_dir is not None):
        in_dict = intify_dict(load_dict(add_slash_to_dir(input_dict_dir) + 'col_dict.json'))
        dict_broadcast = spark.broadcast(in_dict)
        count_vec_matrix = count_vec_matrix.filter(lambda x: x[0] in dict_broadcast.value)
        count_vec_matrix = count_vec_matrix.map(lambda x: (x[1], dict_broadcast.value[x[0]]))
        already_indexed = True
        cols_num = count_vec_matrix.first()[0].size
    else:
        cols_num = count_vec_matrix.first()[1].size
    if (invert_rdd):
        count_vec_matrix = count_vec_matrix.map(lambda x: (x[1], x[0]))
    rows_num = count_vec_matrix.count()
    print('shape calculated!')
    result = mat_rdd_to_csr(count_vec_matrix, (rows_num, cols_num), already_indexed=already_indexed)
    row_dict = count_vec_matrix.map(lambda x: x[0]).zipWithIndex().collectAsMap()
    json.dump(row_dict, open(add_slash_to_dir(output_dir) + out_name + '_row_dict.json', mode='w'))
    f1 = open(add_slash_to_dir(output_dir) + out_name + '_sparse_scipy.pickle', mode='wb')
    pickle.dump(result, f1)
    f1.close()
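# Sketch (assumption, not the repo's implementation): mat_rdd_to_csr is called above either with (id, SparseVector)
# pairs (already_indexed=False) or with (SparseVector, row_index) pairs (already_indexed=True). A minimal
# driver-side version under those assumptions:
def mat_rdd_to_csr_sketch(rdd_matrix, shape, already_indexed=False):
    from scipy.sparse import lil_matrix
    mat = lil_matrix(shape)
    rows = rdd_matrix.collect()  # assumes the matrix fits in driver memory
    for i, entry in enumerate(rows):
        if already_indexed:
            vec, row_index = entry  # (SparseVector, row_index)
        else:
            _, vec = entry          # (id, SparseVector); rows are placed in collection order
            row_index = i
        for col, val in zip(vec.indices, vec.values):
            mat[row_index, int(col)] = val
    return mat.tocsr()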
def main():
    usage_str = 'This script produces doc and user edit counts (total and distinct) and doc sizes, using the ' \
                'following steps:\n' \
                '\t* Removing bots.\n' \
                '\t* Removing administrative pages.\n' \
                '\t* Calculating the doc lengths by counting the number of their words.\n' \
                '\t* Calculating total and distinct edit counts for users and docs.\n' \
                'Args:\n' \
                '1. The input revision history file.\n' \
                '2. The directory for the output files (all are dicts).\n' \
                '3. The name of the non-admin pages file.\n' \
                '4. Name of bot names file to filter them out.\n' \
                '5. Concept-doc file name, -none if nonexistent.\n' \
                '6. Concept-word-count file name\n' \
                '7. Whether to filter out by name or not. -f filters, -n does not.'
    if (len(sys.argv) != 8):
        print(usage_str)
        return
    input_filename = sys.argv[1]
    output_dir = sys.argv[2]
    nonadmin_pages_filename = sys.argv[3]
    bots_filename = sys.argv[4]
    concept_doc_filename = sys.argv[5]
    concept_term_filename = sys.argv[6]
    filter_by_name = False
    if (sys.argv[7] == '-f'):
        filter_by_name = True
    elif (sys.argv[7] == '-n'):
        filter_by_name = False
    else:
        print(usage_str)
        return
    conf = SparkConf().set("spark.driver.maxResultSize", "10G").set('spark.default.parallelism', '100')
    spark = SparkContext.getOrCreate(conf=conf)
    input_rdd = tsv_to_rdd(spark, input_filename)
    filtered_rdd = filter_user_doc_data(spark,
                                        input_rdd,
                                        pages_filename=nonadmin_pages_filename,
                                        admin_filtration=True,
                                        bots_filename=bots_filename,
                                        doc_discard_list_filename=None,
                                        user_discard_list_filename=None,
                                        user_freq_filtration=False,
                                        doc_freq_filtration=False,
                                        discard_by_name=filter_by_name)
    remaining_docs = filtered_rdd.map(lambda x: (x[1], 1)).distinct().collectAsMap()
    c2dmap_bc = None
    if concept_doc_filename != '-none':
        rdd1 = tsv_to_rdd(spark, concept_doc_filename)
        concept_to_doc_map = generate_concept_doc_dict(rdd1)
        c2dmap_bc = spark.broadcast(concept_to_doc_map)
    remaining_docs_bc = spark.broadcast(remaining_docs)
    rdd2 = tsv_to_rdd(spark, concept_term_filename)
    rdd2 = rdd2.map(lambda x: (int(x[0]), int(x[2])))
    if c2dmap_bc is not None:
        rdd2 = rdd2.map(lambda x: (c2dmap_bc.value[x[0]], x[1]))
    rdd2 = rdd2.filter(lambda x: x[0] in remaining_docs_bc.value)
    doc_lengths = rdd2.reduceByKey(add).collectAsMap()
    doc_edit_total = filtered_rdd.map(lambda x: (x[1], 1)).reduceByKey(
        add).collectAsMap()
    doc_edit_distinct = filtered_rdd.distinct().map(
        lambda x: (x[1], 1)).reduceByKey(add).collectAsMap()
    user_edit_total = filtered_rdd.map(lambda x: (x[0], 1)).reduceByKey(
        add).collectAsMap()
    user_edit_distinct = filtered_rdd.distinct().map(
        lambda x: (x[0], 1)).reduceByKey(add).collectAsMap()
    print('Doc total edit average')
    print(
        sum([doc_edit_total[x] for x in doc_edit_total]) / len(doc_edit_total))
    print('Doc distinct edit average')
    print(
        sum([doc_edit_distinct[x] for x in doc_edit_distinct]) / len(doc_edit_distinct))
    print('User total edit average')
    print(
        sum([user_edit_total[x] for x in user_edit_total]) / len(user_edit_total))
    print('User distinct edit average')
    print(
        sum([user_edit_distinct[x] for x in user_edit_distinct]) / len(user_edit_distinct))
    print('Doc length average')
    print(sum([doc_lengths[x] for x in doc_lengths]) / len(doc_lengths))
    pickle.dump(
        doc_lengths,
        open(add_slash_to_dir(output_dir) + 'doc_lengths.pkl', mode='wb'))
    pickle.dump(
        doc_edit_total,
        open(add_slash_to_dir(output_dir) + 'doc_edit_total.pkl', mode='wb'))
    pickle.dump(
        doc_edit_distinct,
        open(add_slash_to_dir(output_dir) + 'doc_edit_distinct.pkl', mode='wb'))
    pickle.dump(
        user_edit_total,
        open(add_slash_to_dir(output_dir) + 'user_edit_total.pkl', mode='wb'))
    pickle.dump(
        user_edit_distinct,
        open(add_slash_to_dir(output_dir) + 'user_edit_distinct.pkl', mode='wb'))
def main():
    usage_str = 'Performs offline testing of CF recommendations using user edit histories.\n' \
                'ATTENTION: The library "implicit" used in this script requires Python 3. Do not attempt running ' \
                'with Python 2.\n' \
                'Args:\n' \
                '1. Training user-doc matrix.\n' \
                '2. Test user-doc matrix.\n' \
                '3. Holdout pairs.\n' \
                '4. Output dir'
    if len(sys.argv) != 5:
        print(usage_str)
        return
    train_ud_filename = sys.argv[1]
    test_ud_filename = sys.argv[2]
    holdout_filename = sys.argv[3]
    output_dir = sys.argv[4]
    at_ks = [20, 50, 100, 200, 300]
    E_train = pickle.load(open(train_ud_filename, 'rb'), encoding='latin1')
    E_test = pickle.load(open(test_ud_filename, mode='rb'), encoding='latin1')
    heldout_pairs = pickle.load(open(holdout_filename, mode='rb'), encoding='latin1')
    test_users, test_docs = heldout_pairs
    E_test_modified = erase_heldout(E_test, heldout_pairs)
    print('Data loaded, starting creation of training matrix...')
    n_train_users = E_train.shape[0]
    training_mat = vstack([E_train, E_test_modified]).transpose().tocsr()
    print('Starting training...')
    model = implicit.als.AlternatingLeastSquares(factors=50)
    model.fit(training_mat)
    max_at_k = max(at_ks)
    recommended_pairs = []
    user_counter = 0
    recommendation_test_mat = training_mat.transpose().tocsr()
    print('Calculating recommendations')
    for user_index in np.unique(test_users):
        user_counter += 1
        if user_counter % 100 == 0:
            print(user_counter)
        nonzero_indices = set(E_test_modified[user_index, :].nonzero()[1])
        user_index_in_training_mat = n_train_users + user_index
        article_index_ranking = model.recommend(
            user_index_in_training_mat,
            recommendation_test_mat,
            N=max_at_k,
            filter_already_liked_items=True)
        article_index_ranking = [x[0] for x in article_index_ranking]
        new_recommended_pairs = rankings_to_recommendation_tuples(
            article_index_ranking,
            max_at_k,
            user_index,
            E_test,
            nonzero_indices,
            ascending=False)
        recommended_pairs.extend(new_recommended_pairs)
    result_dict = py3_recoms_to_prec_recall(recommended_pairs, at_ks)
    make_sure_path_exists(add_slash_to_dir(output_dir))
    output_text = open(add_slash_to_dir(output_dir) + 'prec_and_recall_cf.txt', mode='w')
    save_textual_desc_prec_and_recall(at_ks, output_text, result_dict)
    pickle.dump(
        result_dict,
        open(add_slash_to_dir(output_dir) + 'out_dict_cf.pkl', mode='wb'))
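# Note on matrix orientation (based on the older implicit API that this script appears to target, where
# recommend() returns (item_index, score) pairs): model.fit() is given an item-by-user matrix, hence the
# vstack(...).transpose(), while model.recommend() is given the user-by-item matrix, hence the second transpose,
# and the ranking is read off with x[0].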
def main():
    usage_str = 'Takes the test user-doc matrix, heldout pairs (a tuple of two lists) and the doc latent matrix, ' \
                'calculates answers to the questionnaire based on non-heldout data for test users, and evaluates on ' \
                'the heldout pairs, with precision@k and recall@k.\n' \
                'Args:\n' \
                '1. Test user-doc matrix\n' \
                '2. Heldout pairs\n' \
                '3. Doc latent matrix\n' \
                '4. Output dir\n' \
                '5. Number of questions to consider.\n' \
                '6. Whether to stratify. -s to stratify, -n otherwise.'
    if (len(sys.argv) != 7):
        print(usage_str)
        return
    input_matrix_name = sys.argv[1]
    heldout_pairs_name = sys.argv[2]
    doc_latent_name = sys.argv[3]
    #question_filename = sys.argv[4]
    output_dir = sys.argv[4]
    holdout_option = '-nz'
    n_q = int(sys.argv[5])
    stratify = True
    if sys.argv[6] == '-s':
        stratify = True
    elif sys.argv[6] == '-n':
        stratify = False
    else:
        print(usage_str)
        return
    E_test = pickle.load(open(input_matrix_name, mode='rb'))
    print(E_test.shape)
    # The heldout pairs are pickled, so they must be read in binary mode.
    heldout_pairs = pickle.load(open(heldout_pairs_name, mode='rb'))
    print('Number of test users:')
    print(np.unique(heldout_pairs[0]).size)
    doc_latent = pickle.load(open(doc_latent_name, mode='rb'))
    print(doc_latent.shape)
    if (n_q > doc_latent.shape[1]):
        n_q = doc_latent.shape[1]
    doc_latent_d = n_q
    doc_latent = doc_latent[:, :doc_latent_d]
    at_ks = [20, 50, 100, 200, 300]
    levels = 7
    result_dict = dotproduct_test_article_space(
        E_test,
        heldout_pairs,
        doc_latent,
        n_q=n_q,
        at_ks=at_ks,
        user_holdout_size=DEFAULT_HOLDOUT_SIZE,
        levels=levels,
        col_normalise=False,
        stratify=stratify)
    make_sure_path_exists(add_slash_to_dir(output_dir))
    output_filename = 'q_based_out_dict' + holdout_option
    if stratify:
        output_filename = output_filename + '_stratified.pkl'
    else:
        output_filename = output_filename + '_unstratified.pkl'
    pickle.dump(
        result_dict,
        open(add_slash_to_dir(output_dir) + output_filename, mode='wb'))
    output_text = open(add_slash_to_dir(output_dir) + 'prec_and_recall' + holdout_option +
                       '_' + str(levels) + '_' + str(n_q) + '_.txt',
                       mode='w')
    save_textual_desc_prec_and_recall(at_ks, output_text, result_dict)
def main():
    usage_str = 'This script receives a user-doc edit matrix (non-binary) and a doc-term latent representation matrix. ' \
                'It computes two matrices, a latent user matrix and a latent doc matrix, both in the same latent ' \
                'space as the input doc-term matrix.\n' \
                'Running modes are -b for batch and -i for individual. ' \
                'If the first arg is -i, then the rest of the args are:\n' \
                '1. Dir and name of the user-doc matrix.\n' \
                '2. Dir and name of the doc-term latent matrix.\n' \
                '3. Output directory.\n' \
                '4. Number of latent dimensions.\n' \
                '5. Name of file containing list of user indices to keep. Use -none if you don\'t want any such filtering.\n' \
                '6. User matrix init mode: -p for (0,1), -s for (-1,1) (both will then have their rows normalised)\n' \
                '7. -f for full training, -v for validation set separation (saves a file for the errors too)\n' \
                '8. alpha and lambda and theta in the format alpha,lambda,theta, e.g. 1,1e-3,1e-4\n' \
                '\nIf the first arg is -b, you should give the address of a file that contains the arguments ' \
                'listed above (each on one line).'
    if (len(sys.argv) < 2):
        print(usage_str)
        return
    mode_arg = sys.argv[1]
    if (mode_arg == '-i'):
        if (len(sys.argv) != 10):
            print(usage_str)
            return
        input_user_doc = sys.argv[2]
        input_doc_term = sys.argv[3]
        output_dir = sys.argv[4]
        n_latent = sys.argv[5]
        user_filter_filename = sys.argv[6]
        symmetry_arg = sys.argv[7]
        validation_arg = sys.argv[8]
        alphalambda = sys.argv[9]
    elif (mode_arg == '-b'):
        if (len(sys.argv) != 3):
            print(usage_str)
            return
        args_filename = sys.argv[2]
        args_file = open(args_filename, mode='r')
        arg_contents = args_file.readlines()
        arg_contents = [x.strip() for x in arg_contents]
        arg_contents = [x for x in arg_contents if len(x) > 0]
        if (len(arg_contents) != 8):
            print(usage_str)
            return
        input_user_doc = arg_contents[0]
        input_doc_term = arg_contents[1]
        output_dir = arg_contents[2]
        n_latent = arg_contents[3]
        user_filter_filename = arg_contents[4]
        symmetry_arg = arg_contents[5]
        validation_arg = arg_contents[6]
        alphalambda = arg_contents[7]
    else:
        print(usage_str)
        return
    try:
        n_latent = int(n_latent)
    except:
        print(usage_str)
        return
    user_init_symmetric = False
    if (symmetry_arg == '-p'):
        user_init_symmetric = False
    elif (symmetry_arg == '-s'):
        user_init_symmetric = True
    else:
        print(usage_str)
        return
    do_validation = False
    if (validation_arg == '-f'):
        do_validation = False
    elif (validation_arg == '-v'):
        do_validation = True
    else:
        print(usage_str)
        return
    split_alphalambda = alphalambda.strip('(').strip(')').split(',')
    gamma_ = -1
    if (len(split_alphalambda) < 3 or len(split_alphalambda) > 4):
        print(usage_str)
        return
    try:
        alpha_ = float(split_alphalambda[0])
        lambda_ = float(split_alphalambda[1])
        theta_ = float(split_alphalambda[2])
        if (len(split_alphalambda) == 4):
            gamma_ = float(split_alphalambda[3])
    except:
        print(usage_str)
        return
    # Loading the input matrices
    print('Loading...')
    ud_in_file = open(input_user_doc, mode='rb')
    user_doc_sparse_mat_original = csr_matrix(pickle.load(ud_in_file))
    ud_in_file.close()
    if (user_filter_filename != '-none'):
        user_filter_list = pickle.load(open(user_filter_filename, mode='rb'))
        user_doc_sparse_mat_original = user_doc_sparse_mat_original[
            user_filter_list, :]
    validation_pairs = None
    if (do_validation):
        user_doc_sparse_mat, validation_pairs = do_validation_split(
            user_doc_sparse_mat_original)
    else:
        user_doc_sparse_mat = user_doc_sparse_mat_original
    print('User-doc matrix loaded')
    dt_in_file = open(input_doc_term, mode='rb')
    doc_original_latent = np.array(pickle.load(dt_in_file))
    doc_original_latent -= np.mean(doc_original_latent, axis=0)
    #doc_original_latent = doc_original_latent / np.linalg.norm(doc_original_latent, axis=0)
    dt_in_file.close()
    print('Loading completed')
    user_doc_nonzero_indices = user_doc_sparse_mat.nonzero()
    n_nonzeros = len(user_doc_nonzero_indices[0])
    if n_latent < doc_original_latent.shape[1] and n_latent > 0:
        doc_original_latent = doc_original_latent[:, 0:n_latent]
    else:
        n_latent = doc_original_latent.shape[1]
    n_users = user_doc_sparse_mat.shape[0]
    n_docs = user_doc_sparse_mat.shape[1]
    doc_original_latent /= (
        np.linalg.norm(doc_original_latent, axis=1).reshape(
            (doc_original_latent.shape[0], 1)) + 1e-60)
    print('Number of users: ' + str(n_users))
    print('Number of docs: ' + str(n_docs))
    print('Number of latent dimensions: ' + str(n_latent))
    # Initialising
    print('Initialising')
    if (user_init_symmetric):
        user_latent = np.random.rand(n_users, n_latent) * 2 - 1
    else:
        user_latent = np.random.rand(n_users, n_latent)
    #user_latent /= np.linalg.norm(user_latent)
    #user_latent *= np.sqrt(np.sum(user_doc_sparse_mat.data*user_doc_sparse_mat.data))
    user_latent -= np.mean(user_latent, axis=0)
    user_latent /= (np.linalg.norm(user_latent, axis=1).reshape(
        (user_latent.shape[0], 1)) + 1e-60)
    #user_latent = user_doc_sparse_mat.dot(doc_original_latent) / (1+np.array(user_doc_sparse_mat.sum(axis=1)).reshape(user_doc_sparse_mat.shape[0],1))
    #user_latent /= ((np.linalg.norm(user_latent, axis = 1)*np.linalg.norm(user_latent, axis = 1)).reshape((user_latent.shape[0], 1))+1e-60)
    #user_latent *= 50
    #user_latent /= np.linalg.norm(user_latent,axis=0)
    doc_latent = doc_original_latent.copy()
    #doc_latent = doc_latent / np.linalg.norm(doc_latent, axis=0)
    # kappa_, epsilon_, gamma_ and n_iter are read from the json settings file; alpha_ and lambda_ are given as inputs.
    params_dict = json.load(open('minibatch_settings.json', mode='r'))
    kappa_ = params_dict['kappa_']
    epsilon_ = params_dict['epsilon_']
    if gamma_ == -1:
        gamma_ = params_dict['gamma_']
    n_iter = params_dict['n_iter']
    zeta_ = params_dict['zeta_']
    #theta_ = params_dict['theta_']
    # These are the old values we used to use.
    # kappa_ = 10
    # epsilon_ = 20
    # gamma_ = 5e-2
    # n_iter = 200000
    # alpha_ = 1
    # lambda_ = 1e-3
    # theta_ = 1e-4
    errors_list = []
    print('Initialisation complete.')
    i = 0
    while i < n_iter:
        user_old = user_latent.copy()
        doc_old = doc_latent.copy()
        step_size = gamma_ / (1 + int(i / GAMMA_DECREASE_STEP))
        # 90% of the minibatch is sampled from the nonzero entries, the rest uniformly at random.
        rand_choices = random.sample(range(0, n_nonzeros), int(0.9 * BATCH_SIZE))
        user_rand_indices = [
            user_doc_nonzero_indices[0][rand_index]
            for rand_index in rand_choices
        ]
        doc_rand_indices = [
            user_doc_nonzero_indices[1][rand_index]
            for rand_index in rand_choices
        ]
        user_rand_indices.extend([
            random.randint(0, n_users - 1)
            for j in range(0, BATCH_SIZE - len(rand_choices))
        ])
        doc_rand_indices.extend([
            random.randint(0, n_docs - 1)
            for j in range(0, BATCH_SIZE - len(rand_choices))
        ])
        for rand_index in range(0, len(user_rand_indices)):
            user_index = user_rand_indices[rand_index]
            doc_index = doc_rand_indices[rand_index]
            e_ui = user_doc_sparse_mat[user_index, doc_index]
            r_ui = int(e_ui > 0)
            c_ui = 1 + kappa_ * np.log(1 + e_ui / epsilon_)
            coef1 = -2 * c_ui * (
                r_ui - np.dot(user_old[user_index, :], doc_old[doc_index, :]))
            user_latent[user_index, :] -= step_size * (coef1 * doc_old[doc_index, :])
            doc_latent[doc_index, :] -= step_size * (coef1 * user_old[user_index, :])
        uri_set = set(user_rand_indices)
        dri_set = set(doc_rand_indices)
        for user_index in uri_set:
            user_latent[user_index, :] -= 2 * step_size * alpha_ * user_old[
                user_index, :]
        for doc_index in dri_set:
            doc_latent[doc_index, :] -= step_size * (
                2 * lambda_ *
                (doc_old[doc_index, :] - doc_original_latent[doc_index, :]) +
                zeta_ * (1.0 * (doc_old[doc_index, :] > 0).astype(int) -
                         1.0 * (doc_old[doc_index, :] < 0).astype(int)))
        qtq_minus_diag = doc_old.transpose().dot(doc_old)
        qtq_minus_diag -= np.diag(np.diag(qtq_minus_diag))
        doc_latent -= 4 * theta_ * step_size * doc_old.dot(qtq_minus_diag)
        i += BATCH_SIZE
        print(i, alpha_, lambda_, theta_, gamma_, zeta_)
        #do_gd_step(user_doc_sparse_mat, user_latent, doc_latent, doc_original_latent, user_index, doc_index, i,
        #           kappa_, epsilon_, alpha_, lambda_, gamma_, theta_)
        if (i % (20 * BATCH_SIZE) == 0):
            print('Errors:')
            current_error = calc_error(user_doc_sparse_mat, user_latent,
                                       doc_latent, doc_original_latent, kappa_,
                                       epsilon_, alpha_, lambda_)
            print(current_error)
            #errors_list.append(current_error)
            print('----------------')
    print('Saving')
    make_sure_path_exists(add_slash_to_dir(output_dir))
    f1 = open(add_slash_to_dir(output_dir) + 'user_latent.pickle', mode='wb')
    pickle.dump(user_latent, f1)
    f1.close()
    f2 = open(add_slash_to_dir(output_dir) + 'doc_latent.pickle', mode='wb')
    pickle.dump(doc_latent, f2)
    f2.close()
    f3 = open(add_slash_to_dir(output_dir) + 'params.json', mode='w')
    json.dump(
        {
            'alpha_': alpha_,
            'lambda_': lambda_,
            'gamma_': gamma_,
            'n_iter': n_iter,
            'kappa_': kappa_,
            'epsilon_': epsilon_,
            'theta_': theta_,
            'zeta_': zeta_,
            'BATCH_SIZE': BATCH_SIZE
        }, f3)
    f3.close()
    if (do_validation):
        k_topics = 50
        print('Calculating validation errors:')
        pred_error, n_distinct_val_users = prediction_error_pair_sqerr(
            user_doc_sparse_mat_original, user_latent, doc_latent,
            validation_pairs, kappa_, epsilon_)
        # The maximum cohesion score ever possible is 2*k_topics (possible if all top and bottom scores come out as 1).
        cohesion_score, _, _ = calc_cohesion_score(doc_latent,
                                                   doc_original_latent,
                                                   k=k_topics)
        error_dict = {
            'prediction_error': pred_error,
            'cohesion_score': cohesion_score,
            'n_validation_users': n_distinct_val_users,
            'k_topics': k_topics,
            'n_validation_nonzeros': len(validation_pairs[0])
        }
        json.dump(error_dict,
                  open(add_slash_to_dir(output_dir) + 'errors.json', mode='w'))
        print('Prediction error:')
        print(pred_error)
        print('Cohesion score:')
        print(cohesion_score)
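# For reference (my reconstruction, not taken from the repo): reading the minibatch updates above off as SGD
# steps, the objective they appear to minimise is
#   L = sum_{u,i} c_ui * (r_ui - p_u^T q_i)^2          (confidence-weighted implicit-feedback loss)
#       + alpha_ * sum_u ||p_u||^2                     (user L2 regulariser)
#       + lambda_ * sum_i ||q_i - q_i^0||^2            (anchor docs to their original latent vectors)
#       + zeta_ * sum_i ||q_i||_1                      (sparsity on doc vectors)
#       + theta_ * ||Q^T Q - diag(Q^T Q)||_F^2         (decorrelate the latent dimensions)
# where p_u are rows of user_latent, q_i are rows of doc_latent (stacked as Q), q_i^0 are rows of
# doc_original_latent, c_ui = 1 + kappa_ * log(1 + e_ui / epsilon_), and r_ui = 1 if e_ui > 0, else 0.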
def main():
    usage_str = 'Shows interpretations of singular vectors or principal components. The vectors are assumed to be in ' \
                'column form, i.e. a 2d array of shape (n_dims, n_components). The output consists of the questions ' \
                'both in txt and json formats, and the column-normalised latent representations ' \
                'used for recommendation plus the list of ' \
                'article ids that should be avoided since they appear in the questions.\n' \
                '1. Name of doc names file (mapping of doc name to doc id)\n' \
                '2. Dir of id to column index mapping dict\n' \
                '3. Name of pickle file containing the Q matrix. (the outputs will be saved in the same dir)\n' \
                '4. Optional, number of questions. If not provided, generates all the questions.'
    if (len(sys.argv) < 4 or len(sys.argv) > 5):
        print(usage_str)
        return
    name_filename = sys.argv[1]
    dict_dir = sys.argv[2]
    vectors_file_name = sys.argv[3]
    n_questions = -1
    if (len(sys.argv) == 5):
        try:
            n_questions = int(sys.argv[4])
            if (n_questions < 1):
                print(usage_str)
                return
        except:
            print(usage_str)
            return
    col_dict = json.load(
        open(add_slash_to_dir(dict_dir) + 'col_dict.json', mode='r'))
    col_dict = intify_dict(invert_dict(col_dict))
    names_dict = get_id_name_dict(name_filename)
    Q = np.array(pickle.load(open(vectors_file_name, mode='rb')))
    if (n_questions == -1 or n_questions > Q.shape[1]):
        n_questions = Q.shape[1]
    Q = Q[:, :n_questions]
    out_file_txt = open(vectors_file_name + '_interpreted_' + str(n_questions) + '.txt', mode='w')
    out_file_json = open(vectors_file_name + '_questions_dict_' + str(n_questions) + '.json', mode='w')
    latent_reps, id_avoid_list = create_questions(col_dict, Q, names_dict,
                                                  out_file_txt, out_file_json)
    out_file_txt.close()
    out_file_json.close()
    name_avoid_list = [names_dict[x] for x in id_avoid_list]
    pickle.dump(
        latent_reps,
        open(vectors_file_name + '_latent_rep_' + str(n_questions) + '.pkl', mode='wb'))
    pickle.dump(
        id_avoid_list,
        open(vectors_file_name + '_id_avoid_list_' + str(n_questions) + '.pkl', mode='wb'))
    pickle.dump(
        name_avoid_list,
        open(vectors_file_name + '_name_avoid_list_' + str(n_questions) + '.pkl', mode='wb'))
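# Sketch (assumption, not the repo's create_questions): a common way to interpret each component is to list the
# articles with the largest positive and largest negative loadings. A minimal version under that assumption,
# where col_dict maps row index -> article id and names_dict maps article id -> article name:
def interpret_components_sketch(Q, col_dict, names_dict, top_n=10):
    import numpy as np
    for comp in range(Q.shape[1]):
        order = np.argsort(Q[:, comp])
        top = [names_dict[col_dict[idx]] for idx in order[-top_n:][::-1]]
        bottom = [names_dict[col_dict[idx]] for idx in order[:top_n]]
        print('Component %d' % comp)
        print('  top:    ' + ', '.join(top))
        print('  bottom: ' + ', '.join(bottom))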