def main():
    """Evaluate a saved POS tagger on a test corpus.

    Reads two command-line arguments: the test-data prefix (files
    ``<prefix>.words`` and ``<prefix>.pos``) and the model prefix (files
    ``<prefix>.wordid`` and ``<prefix>.posid``, plus whatever
    ``POSTagger.load`` reads). With any other argument count a usage
    message is written to stderr and the function returns.
    """
    if len(sys.argv) != 3:
        usage_lines = (
            'usage: python postagger-test.py',
            ' <str: test prefix>',
            ' <str: model prefix>',
        )
        for usage_line in usage_lines:
            print(usage_line, file=sys.stderr)
        return

    _, test_prefix, model_prefix = sys.argv

    print('loading data ...', file=sys.stderr)

    # Test corpus: words are lower-cased, POS tags are used as read.
    raw_words = utils.read_data(test_prefix + '.words')
    test_words = [word.lower() for word in raw_words]
    test_pos = utils.read_data(test_prefix + '.pos')

    # Dictionaries saved next to the model.
    word_ids = Dictionary.load(model_prefix + '.wordid')
    pos_ids = Dictionary.load(model_prefix + '.posid')

    # Map words and tags to their integer IDs.
    test_wids = [word_ids[word] for word in test_words]
    test_pids = [pos_ids[tag] for tag in test_pos]

    # Load the tagger and run the evaluation.
    tagger = POSTagger.load(model_prefix)
    tagger.test(test_wids, test_pids)
def test_choose_bins(self):
    """Check the three values choose_bins returns for the 'height' column."""
    dataset = fct.read_data(file)
    first_value, second_value, third_value = fct.choose_bins(
        dataset, 'height', 0.1)
    # All comparisons are made at one-decimal precision.
    assert round(first_value, 1) == 1.6
    assert round(second_value, 1) == 1.9
    assert round(third_value, 1) == 15.
def test_plot_histogram(self):
    """Check the spread of the first array plot_histogram returns for the 'F' subset."""
    dataset = fct.read_data(file)
    subset = fct.sort_data(dataset, 'F')
    first_value, second_value, third_value = fct.choose_bins(subset, 'age', 1.)
    ages = subset['age'].values
    spots, _width = fct.plot_histogram(
        ages, first_value, second_value, third_value, 'left', 'women')
    # Distance between the last and the first returned spot.
    span = spots[-1] - spots[0]
    assert round(span, 1) == 1.0
def test_doublehistogram(self):
    """Check that doublehistogram returns equal widths and consistently offset spots."""
    dataset = fct.read_data(file)
    column = list(dataset)[1]
    first_spots, second_spots, first_width, second_width = fct.doublehistogram(
        dataset, column, 1., 300, './figures/tests/')
    # Both series must use the same width (to two decimals).
    assert round(first_width, 2) == round(second_width, 2)
    # The first spots of the two series are two widths apart.
    shifted_first = first_spots[0] + first_width
    shifted_second = second_spots[0] - first_width
    assert round(shifted_first, 2) == round(shifted_second, 2)
def main():
    """Train a POS tagger from command-line arguments.

    Expects eight arguments: train prefix, dev prefix, model prefix, word
    n-gram size, POS n-gram size, word window size, POS history size and
    maximum iteration count. With any other argument count a usage message
    is written to stderr and the function returns.

    The word/POS dictionaries built from the training data are saved as
    ``<model prefix>.wordid`` and ``<model prefix>.posid`` before training.
    """
    if len(sys.argv) != 9:
        usage_lines = (
            'usage: python postagger-train.py',
            ' <str: train prefix>',
            ' <str: dev prefix>',
            ' <str: model prefix>',
            ' <int: word n-gram size>',
            ' <int: POS n-gram size>',
            ' <int: word window size>',
            ' <int: POS history size>',
            ' <int: max iteration>',
        )
        for usage_line in usage_lines:
            print(usage_line, file=sys.stderr)
        return

    train_prefix, dev_prefix, model_prefix = sys.argv[1:4]
    (word_ngram_size, pos_ngram_size, word_window_size,
     pos_history_size, max_iteration) = map(int, sys.argv[4:9])

    print('loading data ...', file=sys.stderr)

    # Train/dev corpora: words are lower-cased, POS tags are used as read.
    train_words = [
        word.lower() for word in utils.read_data(train_prefix + '.words')
    ]
    train_pos = utils.read_data(train_prefix + '.pos')
    dev_words = [
        word.lower() for word in utils.read_data(dev_prefix + '.words')
    ]
    dev_pos = utils.read_data(dev_prefix + '.pos')

    # Build the dictionaries from the training data and persist them.
    word_ids = Dictionary(train_words, frozen=True)
    pos_ids = Dictionary(train_pos, frozen=True)
    word_ids.save(model_prefix + '.wordid')
    pos_ids.save(model_prefix + '.posid')

    # Map words and tags to their integer IDs.
    train_wids = [word_ids[word] for word in train_words]
    train_pids = [pos_ids[tag] for tag in train_pos]
    dev_wids = [word_ids[word] for word in dev_words]
    dev_pids = [pos_ids[tag] for tag in dev_pos]

    # Train.
    tagger = POSTagger(word_ngram_size, pos_ngram_size,
                       word_window_size, pos_history_size)
    tagger.train(len(pos_ids), train_wids, train_pids,
                 dev_wids, dev_pids, max_iteration, model_prefix)
def main():
    """Sweep the ``n_emb`` argument of ``MCFS.mcfs`` and log clustering NMI.

    For every data set and every ``n_emb`` in 1..30, feature selection is
    run three times, the 100 top-ranked features of the last run are kept,
    and the NMI of ``MCFS.eval_cluster_prediction`` is averaged over three
    clusterings. Results are appended to
    ``../Result/<data set>_nemb_test.txt``.
    """
    data_set_list = ['MNIST', 'lung_small', 'warpPIE10P', 'Yale', 'digits']
    n_clusters_list = [10, 7, 10, 15, 10]
    meth = ['MCFS-I']

    max_n_emb = 30     # largest n_emb tested; also the progress denominator
    n_runs = 3         # feature-selection repetitions per configuration
    n_kmeans = 3       # clusterings averaged into the reported NMI
    n_features = 100   # number of top-ranked features kept

    for data_set, n_clusters in zip(data_set_list, n_clusters_list):
        result_path = "../Result/" + data_set + "_nemb_test.txt"
        print('dataset: {}'.format(data_set))
        data, label = utils.read_data(data_set)

        # test n_emb
        for n_emb in range(1, max_n_emb + 1):
            # The denominator now matches the real upper bound of the sweep.
            print('n_emb: {}/{}'.format(n_emb, max_n_emb))
            with open(result_path, 'a') as f:
                f.write(str(n_emb) + '\n')

            for j, method in enumerate(meth):
                print('method: {}\ttime: {}'.format(
                    method,
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

                for o in range(n_runs):
                    print('step: {}/{}\ttime: {}'.format(
                        str(o + 1), n_runs,
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                    star_time = datetime.now()
                    # NOTE(review): `i=j` is always 0 here although the only
                    # label is 'MCFS-I' — confirm that variant 0 of
                    # MCFS.mcfs is the one meant to be measured.
                    weight = MCFS.mcfs(X=data,
                                       n_selected_features=n_features,
                                       i=j,
                                       n_emb=n_emb,
                                       n_neighbors=0)
                    idx = MCFS.feature_ranking(weight)
                    selected_data = data[:, idx[0:n_features]]
                    end_time = datetime.now()

                # Cluster on the selected features (of the last run) and
                # average the NMI over n_kmeans repetitions.
                nmi_total = 0.0
                for _ in range(n_kmeans):
                    nmi_total += MCFS.eval_cluster_prediction(
                        selected_data, label, n_clusters)

                # Output the average NMI and the duration of the last run.
                with open(result_path, 'a') as f:
                    line = method + ': ' + str(float(nmi_total) / n_kmeans) + \
                        '\tcost_time: ' + \
                        str((end_time - star_time).seconds) + 's\n'
                    f.write(line)

            # Blank lines separate consecutive n_emb sections in the log.
            with open(result_path, 'a') as f:
                f.write('\n\n')
def main():
    """Run masked-LM pretraining of a NeZha model and save the results.

    Command-line options: ``--model_path`` (sub-directory name for the
    saved model, default ``best_model_ckpt_0``) and ``--seed`` (passed to
    ``seed_random``, default 202105). Input and output locations are fixed
    under ``./user_data``.
    """
    logging.set_verbosity_info()

    cli = argparse.ArgumentParser()
    cli.add_argument('--model_path', default='best_model_ckpt_0', type=str)
    cli.add_argument('--seed', default=202105, type=int)
    args = cli.parse_args()

    seed_random(args.seed)

    # Fixed input/output locations.
    data_path = './user_data/duality_pair_pretrain_no_nsp.txt'
    vocab_path = './user_data/vocab.txt'
    model_path = './user_data/nezha-cn-base'
    output_path = './user_data/pretrained-nezha-base'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    train_dataset = TcDataset(read_data(data_path, tokenizer))

    model = NeZhaForMaskedLM.from_pretrained(model_path)
    # Align the embedding matrix with the tokenizer's vocabulary size.
    model.resize_token_embeddings(tokenizer.vocab_size)

    data_collator = TcCollator(max_seq_len=30,
                               tokenizer=tokenizer,
                               mlm_probability=0.15)

    logging_path = os.path.join(output_path, 'log')
    model_save_path = os.path.join(output_path, args.model_path)
    tokenizer_and_config = os.path.join(output_path, 'tokenizer_and_config')
    for directory in (model_save_path, logging_path, tokenizer_and_config):
        build_path(directory)

    # NOTE(review): the trainer seed is the literal 2021, independent of
    # --seed — confirm this is intended.
    training_args = TrainingArguments(
        output_dir=output_path,
        overwrite_output_dir=True,
        learning_rate=6e-5,
        num_train_epochs=130,
        per_device_train_batch_size=128,
        logging_steps=5000,
        fp16=True,
        fp16_backend='amp',
        load_best_model_at_end=True,
        prediction_loss_only=True,
        logging_dir=logging_path,
        logging_first_step=True,
        dataloader_num_workers=4,
        seed=2021,
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset)
    trainer.train()

    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(tokenizer_and_config)
def main(fixed_params_path, params_path, visualization, check_embedding,
         remove, edge_batch_size):
    """Load hyperparameters from ``params_path`` and call ``train_full_model``.

    ``remove`` and ``edge_batch_size`` are always taken from this
    function's arguments: any entries with those names in the loaded
    parameters are discarded first, so the same keyword is never passed
    twice.
    """
    hyperparams = read_data(params_path)
    for explicit_key in ('remove', 'edge_batch_size'):
        hyperparams.pop(explicit_key, None)

    train_full_model(
        fixed_params_path=fixed_params_path,
        visualization=visualization,
        check_embedding=check_embedding,
        remove=remove,
        edge_batch_size=edge_batch_size,
        **hyperparams,
    )
def features_from(i):
    """Fit a GMM on one survey's feature file and dump ``X`` to ``X_<i>.csv``.

    Parameters
    ----------
    i : int
        Survey index: 0 fiscalite, 1 democratie, 2 ecologie,
        3 organisation. It selects the feature file
        ``responses <survey>_all_questions.tsv`` and the output file name.

    The feature matrix is loaded, a 10-component ``GaussianMixture`` is
    fitted on it, ``fill_X`` is mapped over every index of the module-level
    ``four_surveys_taken_auth_ids`` in a 20-process pool, and the
    module-level ``X`` is written to CSV.
    """
    # Only the label is needed to locate the feature file, so the survey
    # JSON files themselves are not loaded here.
    survey_labels = (
        "responses fiscalite",
        "responses democratie",
        "responses ecologie",
        "responses organisation",
    )

    # read features
    features = np.loadtxt(survey_labels[i] + '_all_questions.tsv',
                          delimiter='\t')

    # Fit GMM
    # NOTE(review): `gmm` and `features` are locals of this function; if
    # fill_X reads module-level globals with the same names it will not see
    # these objects — confirm how the workers obtain them (initializer?).
    gmm = GaussianMixture(n_components=10)
    gmm.fit(features)

    # range() needs a count, not the id collection itself. An integer is
    # still accepted in case the module defines the name as a plain count.
    try:
        n_authors = len(four_surveys_taken_auth_ids)
    except TypeError:
        n_authors = four_surveys_taken_auth_ids

    local_pool = multiprocessing.Pool(20, initializer)
    # NOTE(review): the mapped results are discarded; X is presumably
    # filled by fill_X through shared state — verify.
    local_pool.map(fill_X, range(n_authors))
    local_pool.close()
    local_pool.join()

    np.savetxt("X_" + str(i) + ".csv", X, delimiter=",")
def main():
    """Interpolate between two encoded sequences and write the decoded result.

    Reads the options from ``get_cmd()``: ``coords`` (an ``.npz`` file whose
    ``arr_0`` holds one encoded vector per sequence), ``inputfile``,
    ``query`` and ``target`` (sequence indices), ``decoder`` (Keras model
    path), ``steps`` and ``outputfile``.

    ``steps + 2`` points are produced with ``slerp`` between the query and
    target encodings (both end points included), decoded, and one residue
    per position is sampled from the distribution returned by
    ``warm_prediction``. The sequences are written as FASTA-style records
    named ``query``, ``interpolated_<n>`` and ``target``; the process then
    exits with status 0.

    Raises
    ------
    ValueError
        If ``query`` or ``target`` is outside the range of loaded
        sequences, or if they are the same index.
    """
    params = get_cmd()

    idx2aa = dict(enumerate(ORDER_LIST))

    # Close the archive once the array has been extracted.
    with np.load(params.coords) as archive:
        X = archive["arr_0"]
    sequences, labels, _ = read_data(params.inputfile, get_labels=True)

    # Explicit checks instead of assert so they survive `python -O`.
    n_sequences = len(sequences)
    for role, index in (("query", params.query), ("target", params.target)):
        if not 0 <= index < n_sequences:
            raise ValueError(
                "{} index {} is out of range for {} sequences".format(
                    role, index, n_sequences))
    if params.query == params.target:
        raise ValueError("query and target must be different sequences")

    if labels[params.query] == labels[params.target]:
        print("Warning, query and target are from the same family")

    encoded_query = X[params.query]
    encoded_target = X[params.target]

    decoder = keras.models.load_model(params.decoder)

    # Interpolation fractions from 0 (query) to 1 (target), inclusive.
    points = np.asarray([
        slerp(fraction, encoded_query, encoded_target)
        for fraction in np.linspace(0, 1, params.steps + 2)
    ])
    decoded_points = decoder.predict(points)

    residue_indices = np.arange(len(ORDER_LIST))
    decoded_seq = []
    for pred in decoded_points:
        wp = warm_prediction(pred.T, 0.5).T
        # Sample one residue per row of the reweighted prediction.
        sampled = [np.random.choice(residue_indices, p=row) for row in wp]
        decoded_seq.append("".join(idx2aa[residue] for residue in sampled))

    with open(params.outputfile, "w") as outf:
        outf.write(">query\n{}\n".format(decoded_seq[0]))
        for position in range(1, len(decoded_seq) - 1):
            outf.write(">interpolated_{}\n{}\n".format(
                position, decoded_seq[position]))
        outf.write(">target\n{}\n".format(decoded_seq[-1]))

    sys.exit(0)
def main(params_path, user_ids, use_saved_graph, trained_model_path,
         use_saved_already_bought, graph_path, ctm_id_path, pdt_id_path,
         already_bought_path, k, remove):
    """Load saved parameters from ``params_path`` and call ``inference_ondemand``.

    ``k`` and ``remove`` are always taken from this function's arguments:
    entries with those names in the loaded parameters are discarded first,
    so the same keyword is never passed twice.
    """
    saved_params = read_data(params_path)
    for explicit_key in ('k', 'remove'):
        saved_params.pop(explicit_key, None)

    explicit_args = dict(
        user_ids=user_ids,  # List or 'all'
        use_saved_graph=use_saved_graph,
        trained_model_path=trained_model_path,
        use_saved_already_bought=use_saved_already_bought,
        graph_path=graph_path,
        ctm_id_path=ctm_id_path,
        pdt_id_path=pdt_id_path,
        already_bought_path=already_bought_path,
        k=k,
        remove=remove,
    )
    inference_ondemand(**explicit_args, **saved_params)
def main():
    """Benchmark four feature-selection methods by clustering NMI.

    For every data set and every feature count in ``n_select_feature`` (up
    to the per-data-set maximum), each method is run five times; the
    features selected by the last run are clustered five times with
    ``MCFS.eval_cluster_prediction`` and the average NMI plus the duration
    of the last run are appended to ``../Result/<data set>.txt``. A
    completion message is sent through ``utils.send_message`` at the end.
    """
    data_set_list = ['MNIST', 'lung_small', 'warpPIE10P', 'Yale', 'digits']
    n_clusters_list = [10, 7, 10, 15, 10]
    n_select_feature = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 64, 70,
                        80, 90, 100, 120, 140, 160, 180, 200, 220, 240, 260,
                        280, 300]
    num_select_feature_max = [200, 200, 200, 200, 64]
    meth = ['MCFS', 'MCFS-I', 'lap_score', 'NDFS']

    n_runs = 5     # feature-selection repetitions per configuration
    n_kmeans = 5   # clusterings averaged into the reported NMI

    for data_set, n_clusters, max_features in zip(
            data_set_list, n_clusters_list, num_select_feature_max):
        result_path = "../Result/" + data_set + ".txt"
        print('dataset: {}'.format(data_set))
        data, label = utils.read_data(data_set)

        for num_sel_fea in n_select_feature:
            # The candidate list is ascending, so the first value above the
            # cap ends the sweep for this data set.
            if num_sel_fea > max_features:
                break
            print('select feature: {}/{}'.format(num_sel_fea, max_features))
            with open(result_path, 'a') as f:
                f.write(str(num_sel_fea) + '\n')

            for j, method in enumerate(meth):
                # j = 0: MCFS, j = 1: MCFS-I, j = 2: lap_score, j = 3: NDFS
                print('method: {}\ttime: {}'.format(
                    method,
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

                for o in range(n_runs):
                    print('step: {}/{}\ttime: {}'.format(
                        str(o + 1), n_runs,
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                    star_time = datetime.now()
                    if j < 2:
                        weight = MCFS.mcfs(X=data,
                                           n_selected_features=num_sel_fea,
                                           i=j,
                                           n_emb=n_clusters,
                                           n_neighbors=5)
                        idx = MCFS.feature_ranking(weight)
                    elif j == 2:
                        kwargs_W = {"metric": "euclidean",
                                    "neighbor_mode": "knn",
                                    "weight_mode": "heat_kernel",
                                    "k": 5, 't': 1}
                        W = construct_W(data, **kwargs_W)
                        score = lap_score.lap_score(data, W=W)
                        idx = lap_score.feature_ranking(score)
                    elif j == 3:
                        # NOTE(review): these keys are camelCase while the
                        # lap_score branch uses snake_case; if construct_W
                        # only reads the snake_case names, these settings
                        # are silently ignored — confirm.
                        kwargs = {"metric": "euclidean",
                                  "neighborMode": "knn",
                                  "weightMode": "heatKernel",
                                  "k": 5, 't': 1}
                        W = construct_W(data, **kwargs)
                        Weight = NDFS.ndfs(data, W=W, n_clusters=20)
                        idx = feature_ranking(Weight)
                    selected_data = data[:, idx[0:num_sel_fea]]
                    end_time = datetime.now()

                    # timedelta.microseconds is only the sub-second part;
                    # rebuild the full duration in microseconds.
                    elapsed = end_time - star_time
                    elapsed_us = ((elapsed.days * 86400 + elapsed.seconds)
                                  * 1000000 + elapsed.microseconds)
                    print(elapsed_us)

                # Cluster on the selected features (of the last run) and
                # average the NMI over n_kmeans repetitions.
                nmi_total = 0.0
                for _ in range(n_kmeans):
                    nmi_total += MCFS.eval_cluster_prediction(
                        selected_data, label, n_clusters)

                # Output the average NMI and the duration of the last run.
                with open(result_path, 'a') as f:
                    line = method + ': ' + str(float(nmi_total) / n_kmeans) + \
                        '\tcost_time: ' + str(elapsed_us) + 'us\n'
                    f.write(line)

            # NOTE(review): the separator goes to "MCFS-I.txt", not to the
            # per-data-set result file — confirm this is intended.
            with open("../Result/" + 'MCFS-I' + ".txt", 'a') as f:
                f.write('\n\n')

    print('Test is complete!')
    utils.send_message('Complete', 'Test is complete!')
def _read(self, file_path: str):
    """Lazily yield one instance per record read from ``file_path``.

    Records come from ``utils.read_data(file_path, self.percent_data)``;
    each record is unpacked as keyword arguments to
    ``self.text_to_instance``.
    """
    records = utils.read_data(file_path, self.percent_data)
    # Create instances
    yield from (self.text_to_instance(**record) for record in records)
def test_sort_data(self):
    """Check that sort_data(..., 'M') leaves no row whose gender is 'F'."""
    dataset = fct.read_data(file)
    subset = fct.sort_data(dataset, 'M')
    female_positions = np.where(subset['gender'].values == 'F')
    assert np.size(female_positions) == 0
def test_read_data(self):
    """Check the size, mean age and total height of the data read_data loads."""
    dataset = fct.read_data(file)
    assert np.size(dataset) == 12
    mean_age = np.mean(dataset['age'].values)
    assert round(mean_age, 2) == 33.25
    total_height = np.sum(dataset['height'].values)
    assert round(total_height, 2) == 6.79
import argparse
from src.utils import exam, exam_handler, read_data

if __name__ == '__main__':
    # Command-line entry point: parse the options, load the data file and
    # hand everything to exam_handler.
    cli = argparse.ArgumentParser(description='Supervised training')

    # (flags, settings) for every supported option, in declaration order.
    option_specs = (
        (('--data_path', '-d'),
         dict(type=str, default='./data/vocabulary.json',
              help='Path to vocabulary JSON')),
        (('--source_lang', '-s'),
         dict(type=str, default='es', help='Source Language')),
        (('--target_lang', '-t'),
         dict(type=str, default='fr', help='Target Language')),
        (('--mode', '-m'),
         dict(type=int, default=1,
              help='Test Mode: 1->multilingual 2->ES to FR 3->FR to ES')),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    options = cli.parse_args()
    vocabulary = read_data(options.data_path)
    exam_handler(vocabulary, options.source_lang, options.target_lang,
                 options.mode)
from src.utils import read_data, get_open_reponses, get_ids_open_reponses
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    """Return the GMM component probabilities for one author.

    ``auth_index`` indexes the module-level ``four_surveys_taken_auth_ids``;
    the author id found there is looked up in ``ids_auth`` and the row of
    ``features`` at that position is scored with ``gmm.predict_proba``.
    Raises ``ValueError`` (from ``list.index``) when the id is not present
    in ``ids_auth``.
    """
    global gmm
    global ids_auth
    global features
    global four_surveys_taken_auth_ids
    auth = four_surveys_taken_auth_ids[auth_index]
    # Position of the author in the sorted id array.
    # NOTE(review): assumes row k of `features` belongs to ids_auth[k] —
    # confirm against the code that wrote the feature file.
    k = list(ids_auth).index(auth)
    # predict_proba expects a 2-D input, hence the reshape to a single row.
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


# Number of mixture components (and columns of X).
n_compo = 10
# NOTE(review): `np` is used below but its import is not visible in this
# part of the file.
df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')
df_resp_org = get_open_reponses(df_organisation)
df_ids_org = get_ids_open_reponses(df_organisation)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",", dtype=str)
# Sorted unique author ids of the responses; also saved for later reuse.
ids_auth = np.sort(list(set(df_resp_org['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# One row per author in four_surveys_taken_auth_ids, one column per component.
X = np.zeros((len(four_surveys_taken_auth_ids), n_compo))
# read features
features = np.loadtxt('responses organisation_all_questions.tsv',
                      delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
import pandas as pd
from src.utils import read_data

# Build a tab-separated question/answer data set: the two sources are
# placed side by side and every row with a missing value is dropped.
enc_data = read_data('data/test.question', 'ENCSENTS')
dec_data = read_data('data/test.answer', 'DECSENTS')
# enc_data['ENCSLEN'] = enc_data.ENCSENTS.apply(lambda l: len(l.split()))
# dec_data['DECSLEN'] = dec_data.DECSENTS.apply(lambda l: len(l.split()))
side_by_side = pd.concat([enc_data, dec_data], axis=1)
dataset = side_by_side.dropna(axis=0)
dataset.to_csv('data/dataset.csv', index=False, encoding='utf-8', sep='\t')
def inference_ondemand(
        user_ids,  # List or 'all'
        use_saved_graph: bool,
        trained_model_path: str,
        use_saved_already_bought: bool,
        graph_path=None,
        ctm_id_path=None,
        pdt_id_path=None,
        already_bought_path=None,
        k=10,
        remove=.99,
        **params,
):
    """
    Given a fully trained model, return recommendations specific to each user.

    Files needed to run
    -------------------
    Params used when training the model:
        Those params will indicate how to run inference on the model. Usually, they are outputted during training
        (and hyperparametrization).
    If using a saved already bought dict:
        The already bought dict: the dict includes all previous purchases of all user ids for which recommendations
        were requested. If not using a saved dict, it will be created using the graph.
        Using a saved already bought dict is not necessary, but might make the inference process faster.
    A) If using a saved graph:
        The saved graph: the graph that must include all user ids for which recommendations were requested. Usually,
        it is outputted during training. It could also be created by another independent function.
        ID mapping: ctm_id and pdt_id mapping that allows to associate real-world information, e.g. item and customer
        identifier, to actual nodes in the graph. They are usually saved when generating a graph.
    B) If not using a saved graph:
        The graph will be generated on demand, using all the files in DataPaths of src.utils_data. All those files will
        be needed.

    Parameters
    ----------
    user_ids:
        Either the string 'all', or a sequence of user identifiers. A sequence whose first element is 'all' is
        also treated as a request for every user in the graph.
    See click options below for details on the other parameters.

    Returns
    -------
    Recommendations for all user ids.

    """
    # Batch size shared by the node loader and the batch-count computation.
    batch_size = 128

    # Load & preprocess data
    ## Graph
    if use_saved_graph:
        graph = read_graph(graph_path)
        ctm_id_df = read_data(ctm_id_path)
        pdt_id_df = read_data(pdt_id_path)
    else:
        # Create graph
        data_paths = DataPaths()
        fixed_params = FixedParameters(
            num_epochs=0, start_epoch=0,  # Not used (only used in training)
            patience=0, edge_batch_size=0,  # Not used (only used in training)
            remove=remove, item_id_type=params['item_id_type'],
            duplicates=params['duplicates'])
        data = DataLoader(data_paths, fixed_params)
        ctm_id_df = data.ctm_id
        pdt_id_df = data.pdt_id

        graph = create_graph(
            data.graph_schema,
        )
        graph = assign_graph_features(
            graph,
            fixed_params,
            data,
            **params,
        )

    ## Preprocess: fetch right user ids
    # Accept the bare string 'all' as well as a sequence starting with
    # 'all'; indexing the string would only compare its first character.
    if isinstance(user_ids, str):
        wants_all_users = user_ids == 'all'
    else:
        wants_all_users = user_ids[0] == 'all'

    if wants_all_users:
        test_uids = np.arange(graph.num_nodes('user'))
    else:
        test_uids = fetch_uids(user_ids, ctm_id_df)

    ## Remove already bought
    if use_saved_already_bought:
        already_bought_dict = read_data(already_bought_path)
    else:
        bought_eids = graph.out_edges(u=test_uids, form='eid', etype='buys')
        already_bought_dict = create_already_bought(graph, bought_eids)

    # Load model
    dim_dict = {
        'user': graph.nodes['user'].data['features'].shape[1],
        'item': graph.nodes['item'].data['features'].shape[1],
        'out': params['out_dim'],
        'hidden': params['hidden_dim'],
    }
    if 'sport' in graph.ntypes:
        dim_dict['sport'] = graph.nodes['sport'].data['features'].shape[1]

    trained_model = ConvModel(
        graph,
        params['n_layers'],
        dim_dict,
        params['norm'],
        params['dropout'],
        params['aggregator_type'],
        params['pred'],
        params['aggregator_hetero'],
        params['embedding_layer'],
    )
    trained_model.load_state_dict(
        torch.load(trained_model_path, map_location=device))
    if cuda:
        trained_model = trained_model.to(device)

    # Create dataloader
    all_iids = np.arange(graph.num_nodes('item'))
    test_node_ids = {'user': test_uids, 'item': all_iids}
    n_layers = params['n_layers']
    # One fewer sampling layer is used when an embedding layer is enabled.
    if params['embedding_layer']:
        n_layers = n_layers - 1
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(n_layers)
    nodeloader_test = dgl.dataloading.NodeDataLoader(
        graph,
        test_node_ids,
        sampler,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=num_workers,
    )
    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / batch_size)

    # Fetch recs
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(
            graph,
            params['out_dim'],
            trained_model,
            nodeloader_test,
            num_batches_test,
            cuda,
            device,
            params['embedding_layer'],
        )
        recs = get_recs(graph,
                        embeddings,
                        trained_model,
                        params['out_dim'],
                        k,
                        test_uids,
                        already_bought_dict,
                        remove_already_bought=True,
                        cuda=cuda,
                        device=device,
                        pred=params['pred'],
                        use_popularity=params['use_popularity'],
                        weight_popularity=params['weight_popularity'])

        # Postprocess: user & item ids
        processed_recs = postprocess_recs(recs,
                                          pdt_id_df,
                                          ctm_id_df,
                                          params['item_id_type'],
                                          params['ctm_id_type'])
        print(processed_recs)
        return processed_recs
""" Created on Sun Apr 14 17:27:13 2019 @author: gabriel """ #%% import src.utils as ut import numpy as np import pandas as pd import string from src.kmeans_embeddings import FeaturesExtractor from src.utils import (read_data, get_open_reponses) from sklearn.mixture import GaussianMixture #%% extract data from json df_fiscalite = ut.read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json') df_democratie = ut.read_data('data/DEMOCRATIE_ET_CITOYENNETE.json') df_ecologie = ut.read_data('data/LA_TRANSITION_ECOLOGIQUE.json') df_organisation = ut.read_data('data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json') dfs = np.array([["fiscalite", df_fiscalite], ["democratie", df_democratie], ["ecologie", df_ecologie], ["organisation", df_organisation]]) #%% questionId = '162' df_responses = get_open_reponses(df_fiscalite) responses = (df_responses[df_responses.questionId == questionId].formattedValue.values.tolist()) # Extract embeddings for sentences s = FeaturesExtractor()
#%% import numpy as np from src.kmeans_embeddings import FeaturesExtractor from src.utils import read_data, get_open_reponses, get_ids_open_reponses #%% extract data from json df_fiscalite = read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json') df_democratie = 0 df_ecologie = 0 df_organisation = 0 dfs = np.array([["fiscalite", df_fiscalite], ["democratie", df_democratie], ["ecologie", df_ecologie], ["organisation", df_organisation]]) #%% #%% responses of each themes df_resp_fis = get_open_reponses(df_fiscalite) df_resp_dem = 0 df_resp_eco = 0 df_resp_org = 0 dfs_responses = np.array([["responses fiscalite", df_resp_fis], ["responses democratie", df_resp_dem], ["responses ecologie", df_resp_eco], ["responses organisation", df_resp_org]]) #%% #%% extract features s = FeaturesExtractor()
# -*- coding: utf-8 -*- from src.kmeans_embeddings import FeaturesExtractor from src.utils import (read_data, get_open_reponses) from sklearn.cluster import KMeans import numpy as np import pandas as pd if __name__ == '__main__': df = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json') df_responses = get_open_reponses(df) responses = (df_responses[df_responses.questionId == '107']. formattedValue.values.tolist()) # Extract embeddings for sentences s = FeaturesExtractor() features = [s.get_features(x) for x in responses] features_np = np.array(features) print(features_np) samples_id = np.random.choice(range(len(features)), 5000) features_np_samples = features_np[samples_id, :] np.savetxt('features_s.tsv', features_np_samples, delimiter='\t') responses_samples = [responses[i] for i in samples_id] with open('labels_s.tsv', 'w') as f: for resp in responses_samples:
##################################################################
# Description: Main code to plot statistics (in histogram) of age and height of a database of men and women. Men and women are separated here to give two distinguishable distributions. Histograms are stored in ./figures.
# Code name: ./src/main_improved.py
# Date of creation: 2019/03/12
# Date of last modification: 2019/03/15
# Contact information: Yann Chavaillaz, [email protected]
##################################################################

# Local packages required
import src.utils as fct

# Definition of paths
path_data = './data/'  # where to find the dataset
path_fig = './figures/'  # where to store the figures

# Definition and reading of the dataset
file = path_data + 'data.csv'  # where data is stored
data = fct.read_data(file)

# Definition of parameters
resolution = 300  # resolution of the figure in dpi
step1 = 1.  # increment of histogram for the 1st variable
step2 = 0.1  # increment of histogram for the 2nd variable

# Plotting and saving histograms of age and size distribution among both
# genders: column 1 is age, column 2 is size. The column list is re-read
# before each call.
for column_position, step in ((1, step1), (2, step2)):
    column_name = list(data)[column_position]
    fct.doublehistogram(data, column_name, step, resolution, path_fig)
from collections import Counter
from src.settings import *
from src.utils import read_data, parse_config
from src.functions import create_distance_matrix, create_vertices, teitz_bart_algorithm

if __name__ == '__main__':
    # Run teitz_bart_algorithm `count` times and report how often each
    # resulting tuple of medians occurred, most frequent first.
    config = parse_config(CONFIG_FILE)
    defaults = config['DEFAULT']
    count = int(defaults['count'])
    p = int(defaults['p'])

    data = read_data(DATA_FILE)
    vertices = create_vertices(data)
    distance_matrix = create_distance_matrix(data)

    tally = Counter()
    for _ in range(count):
        solution = tuple(teitz_bart_algorithm(distance_matrix, vertices, p))
        tally[solution] += 1

    for key, val in tally.most_common():
        print(f"median {key}, amount {val}")
n_samples = 100 # print(ttest_1samp(np.repeat(5, n_samples), 5)) # exit(1) # for i in range(50): x = np.random.randn(n_samples, 1) y = x # Z = np.random.randn(n_samples, 2) # print(fcit.test(x, y, prop_test=0.2)) print(kernel_based_indepence(x, y, approximate=False)) y = np.random.randn(n_samples, 1) print(kernel_based_indepence(x, y, approximate=False)) # print(kernel_based_conditional_independence(x, y, Z)) exit(1) short_metrics_p, long_metrics_p = utils.read_data(shift=True) short_metrics = short_metrics_p[:, :, 0] long_metrics = long_metrics_p[:, :, 0] metrics = np.hstack((short_metrics, long_metrics)) from itertools import combinations from time import time start = time() values = [] for mx, my, mz in combinations( metrics.reshape((metrics.shape[1], metrics.shape[0], 1)), 3): values.append( kernel_based_conditional_independence(mx, my, mz, approximate=True)) print(time() - start) plt.hist(values, bins='auto') plt.show()
def train_full_model(fixed_params_path,
                     visualization,
                     check_embedding,
                     remove,
                     edge_batch_size,
                     **params,):
    """
    Given the best hyperparameter combination, function to train the model on all available data.

    Files needed to run
    -------------------
    All the files in the TrainDataPaths:
        It includes all the interactions between user, sport and items, as well as features for user, sport and items.
    Fixed_params and params found in hyperparametrization:
        Those params will indicate how to train the model. Usually, they are found when running the hyperparametrization
        loop.

    Parameters
    ----------
    See click options below for details.

    Saves to files
    --------------
    trained_model with its fixed parameters and hyperparameters:
        The trained model with all parameters are saved to the folder 'models'.
    graph and ID mapping:
        When doing inference, it might be useful to import an already built graph (and the mapping that allows to
        associate node ID with personal information such as CUSTOMER IDENTIFIER or ITEM IDENTIFIER). Thus, the graph and ID mapping are saved to
        folder 'models'.
    """
    # Load parameters
    fixed_params = read_data(fixed_params_path)

    class objectview(object):
        """Expose the keys of a dict as attributes."""

        def __init__(self, d):
            self.__dict__ = d

    fixed_params = objectview(fixed_params)
    # Values given to this function override the loaded fixed parameters.
    fixed_params.remove = remove
    fixed_params.subtrain_size = 0.01
    fixed_params.valid_size = 0.01
    fixed_params.edge_batch_size = edge_batch_size

    # Create full train set
    train_data_paths = TrainDataPaths()
    presplit_item_feat = read_data(train_data_paths.item_feat_path)
    full_interaction_data = read_data(train_data_paths.full_interaction_path)
    train_df, test_df = presplit_data(presplit_item_feat,
                                      full_interaction_data,
                                      num_min=3,
                                      remove_unk=True,
                                      sort=True,
                                      test_size_days=1,
                                      item_id_type='ITEM IDENTIFIER',
                                      ctm_id_type='CUSTOMER IDENTIFIER',
                                      )
    # The path attributes are replaced by the split results themselves.
    train_data_paths.train_path = train_df
    train_data_paths.test_path = test_df
    data = DataLoader(train_data_paths, fixed_params)

    # Initialize graph & features
    valid_graph = create_graph(
        data.graph_schema,
    )
    valid_graph = assign_graph_features(valid_graph,
                                        fixed_params,
                                        data,
                                        **params,
                                        )

    dim_dict = {'user': valid_graph.nodes['user'].data['features'].shape[1],
                'item': valid_graph.nodes['item'].data['features'].shape[1],
                'out': params['out_dim'],
                'hidden': params['hidden_dim']}

    all_sids = None
    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # Initialize model
    model = ConvModel(valid_graph,
                      params['n_layers'],
                      dim_dict,
                      params['norm'],
                      params['dropout'],
                      params['aggregator_type'],
                      params['pred'],
                      params['aggregator_hetero'],
                      params['embedding_layer'],
                      )
    if cuda:
        model = model.to(device)

    # Initialize dataloaders
    # get training and test ids
    (
        train_graph,
        train_eids_dict,
        valid_eids_dict,
        subtrain_uids,
        valid_uids,
        test_uids,
        all_iids,
        ground_truth_subtrain,
        ground_truth_valid,
        all_eids_dict
    ) = train_valid_split(
        valid_graph,
        data.ground_truth_test,
        fixed_params.etype,
        fixed_params.subtrain_size,
        fixed_params.valid_size,
        fixed_params.reverse_etype,
        fixed_params.train_on_clicks,
        fixed_params.remove_train_eids,
        params['clicks_sample'],
        params['purchases_sample'],
    )
    (
        edgeloader_train,
        edgeloader_valid,
        nodeloader_subtrain,
        nodeloader_valid,
        nodeloader_test
    ) = generate_dataloaders(valid_graph,
                             train_graph,
                             train_eids_dict,
                             valid_eids_dict,
                             subtrain_uids,
                             valid_uids,
                             test_uids,
                             all_iids,
                             fixed_params,
                             num_workers,
                             all_sids,
                             embedding_layer=params['embedding_layer'],
                             n_layers=params['n_layers'],
                             neg_sample_size=params['neg_sample_size'],
                             )

    train_eids_len = 0
    valid_eids_len = 0
    for etype in train_eids_dict.keys():
        train_eids_len += len(train_eids_dict[etype])
        valid_eids_len += len(valid_eids_dict[etype])
    num_batches_train = math.ceil(train_eids_len / fixed_params.edge_batch_size)
    num_batches_subtrain = math.ceil(
        (len(subtrain_uids) + len(all_iids)) / fixed_params.node_batch_size
    )
    num_batches_val_loss = math.ceil(valid_eids_len / fixed_params.edge_batch_size)
    num_batches_val_metrics = math.ceil(
        (len(valid_uids) + len(all_iids)) / fixed_params.node_batch_size
    )
    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / fixed_params.node_batch_size
    )

    # Run model
    # NOTE: `hp_sentence` aliases `params`, so the update below also merges
    # the fixed parameters into `params`; the params saved at the end of
    # this function therefore contain both sets.
    hp_sentence = params
    hp_sentence.update(vars(fixed_params))
    hp_sentence = f'{str(hp_sentence)[1: -1]} \n'
    save_txt(f'\n \n START - Hyperparameters \n{hp_sentence}', train_data_paths.result_filepath, "a")
    trained_model, viz, best_metrics = train_model(
        model,
        fixed_params.num_epochs,
        num_batches_train,
        num_batches_val_loss,
        edgeloader_train,
        edgeloader_valid,
        max_margin_loss,
        params['delta'],
        params['neg_sample_size'],
        params['use_recency'],
        cuda,
        device,
        fixed_params.optimizer,
        params['lr'],
        get_metrics=True,
        train_graph=train_graph,
        valid_graph=valid_graph,
        nodeloader_valid=nodeloader_valid,
        nodeloader_subtrain=nodeloader_subtrain,
        k=fixed_params.k,
        out_dim=params['out_dim'],
        num_batches_val_metrics=num_batches_val_metrics,
        num_batches_subtrain=num_batches_subtrain,
        bought_eids=train_eids_dict[('user', 'buys', 'item')],
        ground_truth_subtrain=ground_truth_subtrain,
        ground_truth_valid=ground_truth_valid,
        remove_already_bought=True,
        result_filepath=train_data_paths.result_filepath,
        start_epoch=fixed_params.start_epoch,
        patience=fixed_params.patience,
        pred=params['pred'],
        use_popularity=params['use_popularity'],
        weight_popularity=params['weight_popularity'],
        remove_false_negative=fixed_params.remove_false_negative,
        embedding_layer=params['embedding_layer'],
    )

    # Get viz & metrics
    if visualization:
        plot_train_loss(hp_sentence, viz)

    # Report performance on validation set
    sentence = ("BEST VALIDATION Precision "
                "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%"
                .format(best_metrics['precision'] * 100,
                        best_metrics['recall'] * 100,
                        best_metrics['coverage'] * 100))

    log.info(sentence)
    save_txt(sentence, train_data_paths.result_filepath, mode='a')

    # Report performance on test set
    log.debug('Test metrics start ...')
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(valid_graph,
                                    params['out_dim'],
                                    trained_model,
                                    nodeloader_test,
                                    num_batches_test,
                                    cuda,
                                    device,
                                    params['embedding_layer'],
                                    )

        # After this loop `recall` holds the value for data.ground_truth_test;
        # it is used in the saved model's file name below.
        for ground_truth in [data.ground_truth_purchase_test, data.ground_truth_test]:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                params['pred'],
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%"
                        .format(precision * 100,
                                recall * 100,
                                coverage * 100))
            log.info(sentence)
            save_txt(sentence, train_data_paths.result_filepath, mode='a')

    if check_embedding:
        trained_model.eval()
        with torch.no_grad():
            log.debug('ANALYSIS OF RECOMMENDATIONS')
            if 'sport' in train_graph.ntypes:
                result_sport = explore_sports(embeddings,
                                              data.sport_feat_df,
                                              data.spt_id,
                                              fixed_params.num_choices)

                save_txt(result_sport, train_data_paths.result_filepath, mode='a')

            already_bought_dict = create_already_bought(valid_graph,
                                                        all_eids_dict[('user', 'buys', 'item')],
                                                        )
            already_clicked_dict = None
            if fixed_params.discern_clicks:
                already_clicked_dict = create_already_bought(valid_graph,
                                                             all_eids_dict[('user', 'clicks', 'item')],
                                                             etype='clicks',
                                                             )

            users, items = data.ground_truth_test
            ground_truth_dict = create_ground_truth(users, items)
            user_ids = np.unique(users).tolist()
            recs = get_recs(valid_graph,
                            embeddings,
                            trained_model,
                            params['out_dim'],
                            fixed_params.k,
                            user_ids,
                            already_bought_dict,
                            remove_already_bought=True,
                            pred=params['pred'],
                            use_popularity=params['use_popularity'],
                            weight_popularity=params['weight_popularity'])

            users, items = data.ground_truth_purchase_test
            ground_truth_purchase_dict = create_ground_truth(users, items)
            explore_recs(recs,
                         already_bought_dict,
                         already_clicked_dict,
                         ground_truth_dict,
                         ground_truth_purchase_dict,
                         data.item_feat_df,
                         fixed_params.num_choices,
                         data.pdt_id,
                         fixed_params.item_id_type,
                         train_data_paths.result_filepath)

            if fixed_params.item_id_type == 'SPECIFIC ITEM IDENTIFIER':
                coverage_metrics = check_coverage(data.user_item_train,
                                                  data.item_feat_df,
                                                  data.pdt_id,
                                                  recs)

                # Every figure carries exactly one '%' (str.format does not
                # collapse '%%', so a single '%' is the correct spelling).
                sentence = (
                    "COVERAGE \n|| All transactions : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}% "
                    "\n|| Recommendations : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}%"
                    .format(
                        coverage_metrics['generic_mean_whole'] * 100,
                        coverage_metrics['junior_mean_whole'] * 100,
                        coverage_metrics['male_mean_whole'] * 100,
                        coverage_metrics['female_mean_whole'] * 100,
                        coverage_metrics['eco_mean_whole'] * 100,
                        coverage_metrics['generic_mean_recs'] * 100,
                        coverage_metrics['junior_mean_recs'] * 100,
                        coverage_metrics['male_mean_recs'] * 100,
                        coverage_metrics['female_mean_recs'] * 100,
                        coverage_metrics['eco_mean_recs'] * 100,
                    )
                )
                log.info(sentence)
                save_txt(sentence, train_data_paths.result_filepath, mode='a')

        save_outputs(
            {
                'embeddings': embeddings,
                'already_bought': already_bought_dict,
                # Clicks get their own dict (None when clicks are not
                # discerned) instead of a second copy of the purchases.
                'already_clicked': already_clicked_dict,
                'ground_truth': ground_truth_dict,
                'recs': recs,
            },
            'outputs/'
        )

    # Save model
    # Timestamp truncated to the minute, with the space removed.
    date = str(datetime.datetime.now())[:-10].replace(' ', '')
    torch.save(trained_model.state_dict(), f'models/FULL_Recall_{recall * 100:.2f}_{date}.pth')
    # Save all necessary params
    save_outputs(
        {
            f'{date}_params': params,
            f'{date}_fixed_params': vars(fixed_params),
        },
        'models/'
    )
    print("Saved model & parameters to disk.")

    # Save graph & ID mapping
    save_graphs(f'models/{date}_graph.bin', [valid_graph])
    save_outputs(
        {
            f'{date}_ctm_id': data.ctm_id,
            f'{date}_pdt_id': data.pdt_id,
        },
        'models/'
    )
    print("Saved graph & ID mapping to disk.")
from src.utils import read_data, get_open_reponses, get_ids_open_reponses
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    """
    Return the mixture-component probabilities for one author.

    `auth_index` is a position in four_surveys_taken_auth_ids. The author id
    stored there is looked up in ids_auth, and the row of `features` at that
    same position is scored with the fitted mixture `gmm`.

    Only reads module-level state (gmm, ids_auth, features,
    four_surveys_taken_auth_ids). A ValueError is raised by the lookup when
    the author id is absent from ids_auth.
    """
    author_id = four_surveys_taken_auth_ids[auth_index]
    row = list(ids_auth).index(author_id)
    sample = features[row].reshape(1, -1)  # single sample as a 2-D array
    return gmm.predict_proba(sample)[0]


# Number of mixture components; also the number of columns of X.
n_compo = 10

# Survey data and the open responses / ids extracted from it.
df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_resp_eco = get_open_reponses(df_ecologie)
df_ids_eco = get_ids_open_reponses(df_ecologie)

four_surveys_taken_auth_ids = np.loadtxt(
    "four_surveys_taken_auth_ids.csv",
    delimiter=",",
    dtype=str,
)

# Distinct author ids in sorted order, also written to disk.
ids_auth = np.sort(list(set(df_resp_eco['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")

# One row per entry of four_surveys_taken_auth_ids, one column per component.
X = np.zeros((len(four_surveys_taken_auth_ids), n_compo))

# Feature matrix; fill_X assumes its rows line up with ids_auth
# (NOTE(review): not checked here - confirm against the file's producer).
features = np.loadtxt('responses ecologie_all_questions.tsv', delimiter='\t')

# Fit the Gaussian mixture on all feature rows.
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# Worker pool (10 processes).
local_pool = multiprocessing.Pool(10)
from src.utils import read_data, get_open_reponses, get_ids_open_reponses
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    """
    Return the mixture-component probabilities for one author.

    `auth_index` is a position in four_surveys_taken_auth_ids. The author id
    stored there is looked up in ids_auth, and the row of `features` at that
    same position is scored with the fitted mixture `gmm`.

    Only reads module-level state (gmm, ids_auth, features,
    four_surveys_taken_auth_ids). A ValueError is raised by the lookup when
    the author id is absent from ids_auth.
    """
    author_id = four_surveys_taken_auth_ids[auth_index]
    row = list(ids_auth).index(author_id)
    sample = features[row].reshape(1, -1)  # single sample as a 2-D array
    return gmm.predict_proba(sample)[0]


# Number of mixture components; also the number of columns of X.
n_compo = 10

# Survey data and the open responses / ids extracted from it.
df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_resp_dem = get_open_reponses(df_democratie)
df_ids_dem = get_ids_open_reponses(df_democratie)

four_surveys_taken_auth_ids = np.loadtxt(
    "four_surveys_taken_auth_ids.csv",
    delimiter=",",
    dtype=str,
)

# Distinct author ids in sorted order, also written to disk.
ids_auth = np.sort(list(set(df_resp_dem['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")

# One row per entry of four_surveys_taken_auth_ids, one column per component.
X = np.zeros((len(four_surveys_taken_auth_ids), n_compo))

# Feature matrix; fill_X assumes its rows line up with ids_auth
# (NOTE(review): not checked here - confirm against the file's producer).
features = np.loadtxt('responses democratie_all_questions.tsv', delimiter='\t')

# Fit the Gaussian mixture on all feature rows.
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)

# Worker pool (10 processes).
local_pool = multiprocessing.Pool(10)
def inference_fn(trained_model,
                 remove,
                 fixed_params,
                 overwrite_fixed_params=False,
                 days_of_purchases=710,
                 days_of_clicks=710,
                 lifespan_of_items=710,
                 **params):
    """
    Score a trained model on the test data and append the metrics to the
    result file.

    Parameters
    ----------
    trained_model:
        Either a model instance, or a string path to a saved state dict. For
        a path, a ConvModel is built from `params` / `fixed_params` and the
        stored weights are loaded into it.
    remove:
        Stored on `fixed_params.remove` before the data is loaded.
    fixed_params:
        Object exposing the fixed training parameters as attributes, or a
        string path. A path is read with `read_data` and the result is
        wrapped so its entries are reachable as attributes. The object is
        modified in place (`remove` and, optionally, the overrides below).
    overwrite_fixed_params:
        When true, the three values below replace the attributes of the same
        name on `fixed_params`.
    days_of_purchases, days_of_clicks, lifespan_of_items:
        Replacement values; ignored unless overwrite_fixed_params is true.
    params:
        Remaining training parameters. When a 'params' key is present, its
        value is handed to `read_data` and the result replaces `params`
        entirely.

    Returns
    -------
    recall:
        Recall of the last evaluation, i.e. the one run against
        data.ground_truth_test.

    Saves to file
    -------------
    One line with precision / recall / coverage per evaluated ground truth,
    printed and appended to data_paths.result_filepath.

    Notes
    -----
    Uses the module-level names num_workers, cuda and device.
    """
    # --- Resolve parameters that were given as paths ---------------------
    if isinstance(fixed_params, str):

        class objectview(object):
            """Expose the entries of a mapping as attributes."""

            def __init__(self, d):
                self.__dict__ = d

        fixed_params = objectview(read_data(fixed_params))

    if 'params' in params:
        params = read_data(params['params'])

    # --- Load the data ---------------------------------------------------
    data_paths = DataPaths()
    fixed_params.remove = remove
    if overwrite_fixed_params:
        overrides = (
            ('days_of_purchases', days_of_purchases),
            ('days_of_clicks', days_of_clicks),
            ('lifespan_of_items', lifespan_of_items),
        )
        for attr_name, attr_value in overrides:
            setattr(fixed_params, attr_name, attr_value)
    data = DataLoader(data_paths, fixed_params)

    # --- Build the graph and collect the feature dimensions ---------------
    valid_graph = assign_graph_features(
        create_graph(data.graph_schema),
        fixed_params,
        data,
        **params,
    )

    def feature_dim(ntype):
        return valid_graph.nodes[ntype].data['features'].shape[1]

    dim_dict = {
        'user': feature_dim('user'),
        'item': feature_dim('item'),
        'out': params['out_dim'],
        'hidden': params['hidden_dim'],
    }

    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = feature_dim('sport')
        all_sids = np.arange(valid_graph.num_nodes('sport'))
    else:
        all_sids = None

    # --- Split ids and build the loaders -----------------------------------
    # Only a few of the returned values are needed for inference; the
    # remaining ones are bound to underscore-prefixed names.
    (train_graph, train_eids_dict, valid_eids_dict, subtrain_uids,
     valid_uids, test_uids, all_iids, _gt_subtrain, _gt_valid,
     all_eids_dict) = train_valid_split(
        valid_graph,
        data.ground_truth_test,
        fixed_params.etype,
        fixed_params.subtrain_size,
        fixed_params.valid_size,
        fixed_params.reverse_etype,
        fixed_params.train_on_clicks,
        fixed_params.remove_train_eids,
        params['clicks_sample'],
        params['purchases_sample'],
    )

    (_edge_train, _edge_valid, _node_subtrain, _node_valid,
     nodeloader_test) = generate_dataloaders(
        valid_graph,
        train_graph,
        train_eids_dict,
        valid_eids_dict,
        subtrain_uids,
        valid_uids,
        test_uids,
        all_iids,
        fixed_params,
        num_workers,
        all_sids,
        embedding_layer=params['embedding_layer'],
        n_layers=params['n_layers'],
        neg_sample_size=params['neg_sample_size'],
    )

    node_count = len(test_uids) + len(all_iids)
    num_batches_test = math.ceil(node_count / fixed_params.node_batch_size)

    # --- Build the model from a checkpoint path if needed -------------------
    if isinstance(trained_model, str):
        checkpoint_path = trained_model
        trained_model = ConvModel(
            valid_graph,
            params['n_layers'],
            dim_dict,
            params['norm'],
            params['dropout'],
            params['aggregator_type'],
            fixed_params.pred,
            params['aggregator_hetero'],
            params['embedding_layer'],
        )
        trained_model.load_state_dict(
            torch.load(checkpoint_path, map_location=device))

    if cuda:
        trained_model = trained_model.to(device)

    # --- Compute embeddings and metrics -------------------------------------
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(
            valid_graph,
            params['out_dim'],
            trained_model,
            nodeloader_test,
            num_batches_test,
            cuda,
            device,
            params['embedding_layer'],
        )

        report_template = (
            "TEST Precision {:.3f}% | Recall {:.3f}% | Coverage {:.2f}%")
        evaluation_sets = (
            data.ground_truth_purchase_test,
            data.ground_truth_test,
        )
        for ground_truth in evaluation_sets:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                fixed_params.pred,
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = report_template.format(
                precision * 100, recall * 100, coverage * 100)
            print(sentence)
            save_txt(sentence, data_paths.result_filepath, mode='a')

    return recall
def _read_interactions(source):
    """Return `source` as a dataframe: a str is read with read_data, a dataframe is returned as is."""
    if isinstance(source, str):
        return read_data(source)
    if isinstance(source, pd.DataFrame):
        return source
    raise TypeError(
        f'Type of {source} not recognized. Should be str or pd.DataFrame'
    )


def _cutoff_date(interactions, days):
    """Return, as a 'YYYY-MM-DD' string, the date `days` days before the latest hit_date."""
    most_recent_date = datetime.strptime(max(interactions.hit_date), '%Y-%m-%d')
    return (most_recent_date - timedelta(days=int(days))).strftime('%Y-%m-%d')


def format_dfs(
        train_path,  # str (path) or pd.Dataframe directly (df)
        test_path,  # str (path) or pd.Dataframe directly (df)
        item_sport_path: str,
        user_sport_path: str,
        sport_sportg_path: str,
        item_feat_path: str,
        user_feat_path: str,
        sport_feat_path: str,
        sport_onehot_path: str,
        remove: float = 0.,
        ctm_id_type: str = 'CUSTOMER IDENTIFIER',
        item_id_type: str = 'SPECIFIC ITEM IDENTIFIER',
        days_of_purchases: int = 710,
        days_of_clicks: int = 710,
        lifespan_of_items: int = 710,
        report_model_coverage: bool = False,
):
    """
    Import all dfs from csv paths and preprocess interactions to sample
    interactions and remove old users and items.

    Parameters
    ----------
    train_path, test_path:
        Paths of interaction files, between user and items (in the train set
        and the test set). To accommodate a wider range of utilisation,
        train_path and test_path can be directly dataframes instead of
        strings. All files with user and items must include a column named
        with the specified ctm_id_type or item_id_type. The train
        interactions are also expected to carry a `hit_date` column of
        'YYYY-MM-DD' strings and a `buy` column (1 for purchases, 0 for
        clicks) whenever a date filter below is active.
    item_sport_path, user_sport_path, sport_sportg_path:
        Paths of interaction files, between item and sport, user and sport,
        sport and sport group.
    item_feat_path, user_feat_path, sport_feat_path:
        Paths of feature files, for item, user and sports. Item features
        include textual descriptions and junior, male, female and eco
        indicators. User features include male and female indicator. Sport
        features include only name of sport.
    sport_onehot_path:
        Path for a csv matrix containing the sport_id and a one-hot vector,
        unique per sport.
    remove:
        Removes a proportion of users from the dataset randomly.
    ctm_id_type:
        Identifier for the customers.
    item_id_type:
        Identifier for the items. Can be SPECIFIC ITEM IDENTIFIER (e.g. item
        SKU) or GENERAL ITEM IDENTIFIER (e.g. item family identifier).
    days_of_purchases, days_of_clicks:
        Number of days of purchases (clicks) that should be kept in the
        dataset. Intuition is that interactions of 12+ months ago might not
        be relevant. Max is 710 days: the filter is only applied for values
        below 710. Users without any remaining interactions will be fed
        recommendations from another model.
    lifespan_of_items:
        Number of days since most recent transactions for an item to be
        considered by the model. Max is 710 days. Only applied when it is
        smaller than days_of_purchases.
    report_model_coverage:
        Prints how many test users are covered by these parameters (and
        would thus receive a recommendation by this GNN model).

    Returns
    -------
    user_item_train, user_item_test, item_sport_interaction,
    user_sport_interaction, sport_sportg_interaction:
        Dataframes of interactions (in this order).
    item_feat_df, user_feat_df, sport_feat_df, sport_onehot_df:
        Dataframes of features.

    Raises
    ------
    TypeError
        If train_path or test_path is neither a str nor a pd.DataFrame.
    AssertionError
        If item_id_type is 'GENERAL ITEM IDENTIFIER' and some interaction has
        no general identifier in the item features.
    """
    specific_id = 'SPECIFIC ITEM IDENTIFIER'
    general_id = 'GENERAL ITEM IDENTIFIER'
    use_general_id = item_id_type == general_id
    filter_old_items = lifespan_of_items < days_of_purchases

    # Seeds numpy's global generator so the user sampling below is
    # reproducible (this also affects later users of np.random).
    np.random.seed(11)

    # User, item and sport features
    item_feat_df = read_data(item_feat_path)
    user_feat_df = read_data(user_feat_path)
    sport_feat_df = read_data(sport_feat_path)
    sport_onehot_df = read_data(sport_onehot_path)

    # User-item interaction. We allow direct df instead of path.
    user_item_train = _read_interactions(train_path)
    user_item_test = _read_interactions(test_path)

    # Drop purchases older than days_of_purchases; clicks (buy == 0) are kept.
    if days_of_purchases < 710:
        limit_date = _cutoff_date(user_item_train, days_of_purchases)
        user_item_train = user_item_train[
            (user_item_train.hit_date >= limit_date)
            | (user_item_train.buy == 0)]

    # Drop clicks older than days_of_clicks; purchases (buy == 1) are kept.
    if days_of_clicks < 710:
        limit_date = _cutoff_date(user_item_train, days_of_clicks)
        user_item_train = user_item_train[
            (user_item_train.hit_date >= limit_date)
            | (user_item_train.buy == 1)]

    # Keep only items that still had an interaction within their lifespan.
    if filter_old_items:
        limit_date = _cutoff_date(user_item_train, lifespan_of_items)
        item_list = user_item_train[
            user_item_train.hit_date >= limit_date][specific_id].unique()
        user_item_train = user_item_train[
            user_item_train[specific_id].isin(item_list)]

    if remove > 0:
        # Keep a random (1 - remove) share of the users.
        ctm_list = user_item_train[ctm_id_type].unique()
        np.random.shuffle(ctm_list)
        ctm_list = ctm_list[:int(len(ctm_list) * (1 - remove))]
        user_item_train = user_item_train[
            user_item_train[ctm_id_type].isin(ctm_list)]
        user_item_test = user_item_test[
            user_item_test[ctm_id_type].isin(ctm_list)]
    else:
        # Make sure that if observations were removed by days of clicks /
        # purchases, no user is only in test set
        user_item_test = user_item_test[user_item_test[ctm_id_type].isin(
            user_item_train[ctm_id_type].unique())]

    if use_general_id:
        id_mapping = item_feat_df[[specific_id, general_id]].drop_duplicates()
        user_item_train = user_item_train.merge(
            id_mapping, how='left', on=specific_id)
        user_item_test = user_item_test.merge(
            id_mapping, how='left', on=specific_id)
        # The column name contains spaces, so it must be accessed by key:
        # attribute access cannot resolve it.
        assert user_item_train[general_id].isna().sum() == 0, \
            f'Some train interactions have no {general_id}'
        assert user_item_test[general_id].isna().sum() == 0, \
            f'Some test interactions have no {general_id}'

    # Item-sport interaction
    item_sport_interaction = read_data(item_sport_path)
    if filter_old_items:
        item_sport_interaction = item_sport_interaction[
            item_sport_interaction[specific_id].isin(item_list)]
    if use_general_id:
        item_sport_interaction = item_sport_interaction.merge(
            item_feat_df[[specific_id, general_id]],
            how='left',
            on=specific_id)
    # Duplicates can appear once the general identifier has been merged in;
    # reassign instead of dropping in place on a possibly sliced frame.
    item_sport_interaction = item_sport_interaction.drop_duplicates()

    # User-sport interaction
    user_sport_interaction = read_data(user_sport_path)
    if remove > 0:
        user_sport_interaction = user_sport_interaction[
            user_sport_interaction[ctm_id_type].isin(ctm_list)]

    # Sport-sportgroups interaction
    sport_sportg_interaction = read_data(sport_sportg_path)

    if report_model_coverage:
        # Sets give constant-time membership tests for the counts below.
        known_users = set(user_item_train[ctm_id_type].unique().tolist())
        test_users = user_item_test[ctm_id_type].unique().tolist()
        sport_users = user_sport_interaction[ctm_id_type].unique().tolist()

        unseen_users = [uid for uid in test_users if uid not in known_users]
        print(f'There are {len(unseen_users)} users with no interactions')

        known_users.update(sport_users)
        unseen_users = [uid for uid in test_users if uid not in known_users]
        print(f'and {len(unseen_users)} with also no sports associated')
        print(f'out of {len(test_users)}')

    return user_item_train, user_item_test, item_sport_interaction, user_sport_interaction, \
        sport_sportg_interaction, item_feat_df, user_feat_df, sport_feat_df, sport_onehot_df