def train(features, targets, num_folds, classifiers, output_folder,
          seed=None, filename_tag="", classifier_obj=None):
    """Cross-validate one or more classifiers and chart their per-fold accuracies.

    Arguments:
        features -- per-instance features (anything np.asarray accepts)
        targets -- per-instance targets, aligned with features
        num_folds {int} -- number of cross-validation folds
        classifiers -- a classifier name or a list of names; supported names
            are "knn", "logreg" and "randfor"
        output_folder {str} -- folder where the per-classifier charts are saved
        seed -- forwarded to the random forest helper; the folds themselves
            are not shuffled
        filename_tag {str} -- tag embedded in the chart file names
        classifier_obj -- forwarded as-is to the classification helpers

    Returns:
        dict -- classifier name -> [mean train accuracy, mean val accuracy]

    Raises:
        ValueError -- if an unsupported classifier name is requested
    """
    supported = ("knn", "logreg", "randfor")

    # The folds are not shuffled, so no random_state is handed to KFold: it has
    # no effect without shuffling and recent scikit-learn versions reject the
    # combination outright.
    kf = KFold(n_splits=num_folds)
    folds_idxs = list(kf.split(features))
    trips_array = np.asarray(features)
    targets = np.asarray(targets)
    if not isinstance(classifiers, list):
        classifiers = [classifiers]

    # fail early, instead of hitting an unbound accuracy inside the fold loop
    for classifier in classifiers:
        if classifier not in supported:
            raise ValueError("Unsupported classifier [%s], expected one of %s"
                             % (classifier, ", ".join(supported)))

    # train/val accuracies
    accuracies = {}
    mean_accuracies = {}
    for classifier in classifiers:
        classif_start = utils.tic()
        accuracies[classifier] = []
        print("\nTesting classifier [%s]" % classifier)
        # train & test the classifier on each fold
        for i, (train_idx, val_idx) in enumerate(folds_idxs):
            print("\tClassifying fold %d/%d" % (i + 1, len(folds_idxs)), end=" ")
            train_data = (trips_array[train_idx], targets[train_idx])
            val_data = (trips_array[val_idx], targets[val_idx])
            if classifier == "knn":
                k = 5
                acc_train, acc_val = knn_classification(
                    train_data, val_data, k, classifier_obj=classifier_obj)
            elif classifier == "logreg":
                acc_train, acc_val = logreg_classification(
                    train_data, val_data, classifier_obj=classifier_obj)
            else:  # "randfor", guaranteed by the validation above
                acc_train, acc_val = randfor_classification(
                    train_data, val_data, seed, classifier_obj=classifier_obj)
            accuracies[classifier].append((acc_train, acc_val))
            print("- accuracies train/val:", accuracies[classifier][-1])
        elapsed = utils.tictoc(classif_start)
        print("Done in:", elapsed)

        # accuracy across all folds
        mean_accuracies[classifier] = [
            np.mean([acc[0] for acc in accuracies[classifier]]),
            np.mean([acc[1] for acc in accuracies[classifier]]),
        ]
        titlestr = "%s, overall accuracy train/val: %s" % (
            classifier, str(mean_accuracies[classifier]))
        chart_filename = os.path.join(
            output_folder, classifier + "_" + filename_tag + "_chart")
        utils.barchart(list(range(1, num_folds + 1)),
                       accuracies[classifier],
                       title=titlestr,
                       ylabel="accuracy",
                       legend=["train", "val"],
                       save=chart_filename)
    return mean_accuracies
def main(config_file, ignore_undefined=False, load_models_first=False):
    """Execute a complete run, as described by a configuration file.

    Arguments:
        config_file {str} -- Path for the run's configuration file
        ignore_undefined {bool} -- Forwarded to the configuration reader
        load_models_first {bool} -- Load the pipeline models even if no
            trigger requires it
    """
    # the entire run is timed
    with tictoc("Total run"):
        # read the configuration
        global_config, pipeline, triggers = ConfigReader.read_configuration(
            config_file, ignore_undefined)
        pipeline.configure_names()

        # models are loaded on request, or whenever some trigger needs them
        if load_models_first or any(t.requires_model_loading() for t in triggers):
            # error() receives the offending condition as its second argument
            deserialization_disabled = \
                not global_config.misc.allow_model_deserialization
            error("Should load models but model deserialization is not enabled!",
                  deserialization_disabled)
            pipeline.load_models()

        # triggers are set up ordered by their is_blocking attribute ...
        for trigger in sorted(triggers, key=lambda t: t.is_blocking):
            trigger.link_pipeline(pipeline)
            trigger.setup()
        # ... but armed in their original order
        for trigger in triggers:
            trigger.arm()

    if num_warnings > 0:
        warning("{} warnings occured.".format(num_warnings - 1))
    logfile = global_config.logfile
    info("Logfile is at: {}".format(logfile))
    tictoc.log(logfile + ".timings")
def produce_outputs(self):
    """Fit the doc2vec model and map every input text to an embedding vector.

    Nothing is done if preprocessed or aggregated data has already been loaded.
    Otherwise self.embeddings ends up with one row per text, in the order the
    text bundles appear in self.text.
    """
    if self.loaded_preprocessed or self.loaded_aggregated:
        return
    info("Mapping to {} embeddings.".format(self.name))
    # keep the first element of every entry of every training text
    train_words = [wp[0] for doc in self.text_train for wp in doc]
    self.fit_doc2vec(train_words, self.labels_train)
    d2v = self.get_model()

    # Start from an empty (0, dimension) float32 array so that shape and dtype
    # are well defined even when there is nothing to embed.
    self.embeddings = np.ndarray((0, self.dimension), np.float32)
    # Rows are gathered in a list and concatenated once at the end; appending
    # to the array per text would copy it on every iteration.
    rows = [self.embeddings]
    num_bundles = len(self.text)
    # loop over input text bundles (e.g. train & test)
    for dset_idx in range(num_bundles):
        dset_word_list = self.text[dset_idx]
        with tictoc("Embedding mapping for text bundle {}/{}".format(
                dset_idx + 1, num_bundles)):
            info("Mapping text bundle {}/{}: {} texts".format(
                dset_idx + 1, num_bundles, len(dset_word_list)))
            for doc_word_pos in tqdm.tqdm(dset_word_list):
                doc_words = [wp[0] for wp in doc_word_pos]
                rows.append(
                    np.expand_dims(d2v.infer_vector(doc_words), axis=0))
    self.embeddings = np.concatenate(rows, axis=0)
    self.set_constant_elements_per_instance()
def question_c(features_file, grid_file, test_file, output_folder, seed,
               classif_file, num_folds):
    """Evaluate the classifiers, improve the random forest and run the test step.

    The features are read from features_file, every classifier is evaluated
    with jcp.train, the outcome of jcp.improve_randfor is handed to jcp.test
    and the total elapsed time is printed.
    """
    run_start = utils.tic()

    features, jid_mapping, targets = jcp.preprocess_train_data(
        pd.read_csv(features_file), seed)

    classifiers = ["knn", "logreg", "randfor"]
    mean_accuracies = jcp.train(features, targets, num_folds, classifiers,
                                output_folder, seed=seed)

    # report the mean accuracy of each classifier
    print()
    for name, accuracy in mean_accuracies.items():
        print(name, "accuracy train/val:", accuracy)

    # the random forest is the one picked to beat the benchmark; its last
    # mean accuracy entry serves as the baseline
    impr_classifier_name = "randfor"
    baseline_accuracy = mean_accuracies[impr_classifier_name][-1]
    print()
    print("Improving classification for classifier", impr_classifier_name)
    best_classifier, best_technique, best_accuracy = jcp.improve_randfor(
        baseline_accuracy, features_file, num_folds, output_folder,
        impr_classifier_name, seed)

    jcp.test(best_classifier, best_technique, test_file, grid_file,
             jid_mapping, classif_file)
    print("Done in:", utils.tictoc(run_start))
def find_similar_subroutes_per_test_trip(test_points, train_df, k,
                                         paropts=None, verbosity=False):
    """Find the subroutes of the training trips most similar to a test trip.

    Arguments:
        test_points -- points of the test trip, converted with
            utils.idx_to_lonlat before the search
        train_df -- dataframe with the training trips
        k {int} -- number of subsequences requested
        paropts -- optional (type, count) pair; type is "processes" or
            "threads". A falsy value (or a falsy type) runs serially
        verbosity {bool} -- forwarded to the serial execution only

    Returns:
        The subsequence collection produced by the selected execution mode.

    Raises:
        ValueError -- if the parallelization type is not recognized
    """
    if paropts:
        print("Parallelizing with", paropts)
        partype, numpar = paropts
    else:
        partype, numpar = None, None

    timestart = utils.tic()
    test_lonlat = utils.idx_to_lonlat(test_points, format="tuples")

    if not partype:
        max_subseqs = serial_execution(train_df, test_lonlat, k,
                                       verbosity=verbosity)
    elif partype == "processes":
        max_subseqs = exec_with_processes(train_df, numpar, test_lonlat, k)
    elif partype == "threads":
        max_subseqs = exec_with_threads(train_df, numpar, test_lonlat, k)
    else:
        # an unknown type used to yield an empty result without any hint
        raise ValueError("Unknown parallelization type: %s" % str(partype))

    if len(max_subseqs) != k:
        print("WARNING: Specified %d subseqs!" % k)
    # the count comes first and the trip length second, as the message reads
    print("Extracted %d nearest subsequences of a %d-long test tring in: %s" %
          (k, len(test_points), utils.tictoc(timestart)))
    return max_subseqs
def serial_execution(df, test_lonlat, k, verbosity=False):
    """Scan the training trips one by one, tracking the best common subroutes.

    For every row of df, the common subsequences between the test trip and the
    row's trip are computed with calc_lcss; their indexes are merged and fed to
    update_current_maxsubseq together with the elapsed time and the row.
    The collection maintained by update_current_maxsubseq is returned.
    """
    best = []
    for _, row in df.iterrows():
        # NOTE(review): eval() runs whatever the "points" column contains --
        # only safe if the training file is trusted.
        candidate_lonlat = utils.idx_to_lonlat(eval(row["points"]),
                                               format="tuples")

        started = utils.tic()
        # common subsequences between the test trip and this candidate
        _, idx_sequences = calc_lcss(test_lonlat, candidate_lonlat)
        # non-consecutive subroutes are considered too: merge all the indexes
        merged = {idx for seq in idx_sequences for idx in seq}
        elapsed = utils.tictoc(started)

        # decreasing order
        merged_idx = sorted(merged, reverse=True)
        # candidates without any common index do not update anything
        if merged_idx:
            best = update_current_maxsubseq(best, merged_idx, k, elapsed, row)

    if verbosity:
        summary = [(x, y, z["tripId"]) for (x, y, z) in best]
        print("Got %d subseqs:" % len(best), summary)
        # no reverse check takes place, so the same summary is printed again
        print("Got %d reversed: subseqs:" % len(best), summary)
    return best
def exec_with_threads(df, numpar, test_lonlat, k):
    """Threaded variant of the common-subsequence search.

    Splits df into numpar sub-dataframes (utils.get_sub_dataframes), starts
    one thread per sub-dataframe with calc_lcss as its target, joins the
    threads, merges the per-thread result slots and calls
    update_current_maxsubseq once on the merged result.

    NOTE(review): several things look off and should be confirmed before this
    is relied upon:
      * train_lonlat is overwritten on every row, so each thread only gets
        the last trip of its sub-dataframe;
      * calc_lcss receives four arguments here (including the whole res1/res2
        lists) -- confirm it supports that and fills res1[i]/res2[i];
      * row, as used in the final update, is whatever row was iterated last,
        and is unbound if every sub-dataframe is empty.
    """
    max_subseqs = []
    # one result slot per thread
    res1 = [[] for _ in range(numpar)]
    res2 = [[] for _ in range(numpar)]
    subframes = utils.get_sub_dataframes(df, numpar)
    # assign data and start the threads
    threads = []
    timestart = utils.tic()
    for i in range(numpar):
        train_lonlat = []
        for index, row in subframes[i].iterrows():
            train_points = row["points"]
            # NOTE(review): eval() on file contents -- only safe for trusted data
            train_points = eval(train_points)
            train_lonlat = utils.idx_to_lonlat(train_points, format="tuples")
        threads.append(
            threading.Thread(target=calc_lcss,
                             args=(test_lonlat, train_lonlat, res1, res2)))
        threads[i].start()
    # gather and merge results
    subseqs = []
    subseqs_idx = []
    for i in range(numpar):
        threads[i].join()
        subseqs += res1[i]
        subseqs_idx += res2[i]
    # longest index sequences first
    subseqs_idx = sorted(subseqs_idx, key=lambda x: len(x), reverse=True)
    elapsed = utils.tictoc(timestart)
    max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx, k,
                                           elapsed, row)
    return max_subseqs
def produce_outputs(self):
    """Preprocess the train and test text (and the text targets, if present).

    The preprocessed collections replace self.data and self.targets (train
    part first, test part after it). The vocabulary is tracked on the training
    portion only and indexed into self.word_to_index / self.vocabulary_index.
    """
    self.setup_nltk_resources()

    # wrap the indices into an Indices object -- this filters down
    # non-existent (with no instances) roles
    if type(self.indices) is not Indices:
        self.indices = Indices(self.indices, tags=self.roles)
        self.roles = self.indices.tags

    train_idx, _ = self.indices.get_train_test()
    # the test indices are fetched by tag, without requiring that they exist
    test_idx = self.indices.get_tag_instances(defs.roles.test,
                                              must_exist=False)
    nothing_to_process = not (train_idx.size > 0 or test_idx.size > 0)
    error("Neither train or test indices found to process dataset",
          nothing_to_process)

    processed_data, processed_targets = [], []
    with tictoc("Preprocessing {}".format(self.name)):
        # training portion: this is where the vocabulary comes from
        info("Mapping text training data to word collections.")
        train_texts, train_vocab, _ = self.preprocess_text_collection(
            self.data, train_idx, track_vocabulary=True)
        self.vocabulary = set(train_vocab)
        processed_data += train_texts
        if self.has_text_targets():
            info("Mapping text training targets to word collections.")
            target_texts, target_vocab, _ = self.preprocess_text_collection(
                self.targets, train_idx, track_vocabulary=True)
            self.vocabulary.update(target_vocab)
            processed_targets += target_texts

        # test portion: no vocabulary tracking
        info("Mapping text test data to word collections.")
        test_texts, _, _ = self.preprocess_text_collection(self.data, test_idx)
        processed_data += test_texts
        if self.has_text_targets():
            info("Mapping text test targets to word collections.")
            test_target_texts, _, _ = self.preprocess_text_collection(
                self.targets, test_idx)
            processed_targets += test_target_texts

        # turn the vocabulary into a list and index its words
        self.vocabulary = list(self.vocabulary)
        for position, token in enumerate(self.vocabulary):
            self.word_to_index[token] = position
            self.vocabulary_index.append(position)
        # one extra index, for the missing word
        self.undefined_word_index = len(self.vocabulary)

    self.data = processed_data
    self.targets = processed_targets
def calculate_nns(test_points, train_df, paropts=None, k=5, unique_jids=False):
    """Compute the k nearest training neighbours of a test trip.

    Arguments:
        test_points -- points of the test trip, converted with
            utils.idx_to_lonlat before the search
        train_df -- dataframe with the training trips
        paropts -- optional (type, count) pair; type is "processes" or
            "threads". A falsy value (or a falsy type) runs serially
        k {int} -- number of neighbours to keep
        unique_jids {bool} -- keep only the closest neighbour of each jid

    Returns:
        list -- up to k neighbour entries by increasing distance; element 1 of
            an entry is the sorting key and element 2 its jid

    Raises:
        ValueError -- if the parallelization type is not recognized
    """
    # parallelization type
    if paropts:
        print("Parallelizing with", paropts)
        partype, numpar = paropts
    else:
        partype, numpar = None, None

    timestart = utils.tic()
    test_lonlat = utils.idx_to_lonlat(test_points, format="tuples")

    if not partype:
        # serial execution
        nearest_neighbours = calculate_dists(test_lonlat, train_df)
    elif partype == "processes":
        nearest_neighbours = run_with_processes(numpar, test_lonlat, train_df)
    elif partype == "threads":
        nearest_neighbours = run_with_threads(numpar, test_lonlat, train_df)
    else:
        # previously this surfaced as an obscure failure while sorting
        raise ValueError("Unknown parallelization type: %s" % str(partype))

    # sort the list to increasing distance
    nearest_neighbours = sorted(nearest_neighbours,
                                key=lambda neighbour: neighbour[1])

    # keep unique jids, if needed
    if unique_jids:
        print("Restricting to single neighbour per jid")
        seen_jids = set()
        closest_per_jid = []
        # the list is sorted, so the first hit of a jid is its closest one
        for neighbour in nearest_neighbours:
            jid = neighbour[2]
            if jid not in seen_jids:
                seen_jids.add(jid)
                closest_per_jid.append(neighbour)
        nearest_neighbours = closest_per_jid

    # return the top k
    nearest_neighbours = nearest_neighbours[:k]
    print("Neighbours:", [n[0] for n in nearest_neighbours])
    # the count comes first and the trip length second, as the message reads
    print("Extracted %d nearest neighbours of a %d-long test trip in: %s" %
          (k, len(test_points), utils.tictoc(timestart)))
    return nearest_neighbours
def execute_training(self):
    """Train and store one model per train/validation split."""
    with tictoc("Training run", do_print=self.do_folds, announce=False):
        # prepare the training - validation instance indexes
        self.configure_trainval_indexes()

        # one run per split supplied by the validation setting
        # (e.g. portion split or folds)
        splits = self.validation.get_trainval_indexes()
        for run_index, (train_index, val_index) in enumerate(splits):
            # expose the current split and run number to the training code
            self.train_index = train_index
            self.val_index = val_index
            self.model_index = run_index
            # train and keep track of the model
            self.append_model_instance(self.acquire_trained_model())
def acquire_trained_model(self):
    """Train the learning model for the current run and return it.

    The run is timed, with a label reporting the configuration info string,
    the model index and the number of training / validation instances.
    The models folder is created after training, if it does not exist.
    """
    with tictoc("Training run [{}] - {} on {} training and {} val data.".format(
            get_info_string(self.config), self.model_index,
            len(self.train_index),
            len(self.val_index) if self.val_index is not None else "[none]")):
        # No model is ever loaded at this point, so training always takes
        # place; the former "skip training" branch could never be reached.
        model = self.train_model()
        # create directories
        makedirs(self.models_folder, exist_ok=True)
    return model
def exec_with_processes(df, process_num, test_lonlat, k):
    """Pool-based variant of the common-subsequence search.

    For every row of df, calc_lcss is submitted to the pool with the test trip
    and the row's trip; the resulting index sequences are sorted by decreasing
    length and fed to update_current_maxsubseq. The pool is always closed and
    joined, even if processing a row fails.

    NOTE(review): each result is awaited right after its submission, so rows
    are still handled one at a time; also, the pool is a ThreadPool despite
    the function's name. Confirm whether real concurrency is intended.
    """
    max_subseqs = []
    pool = ThreadPool(processes=process_num)
    try:
        for _, row in df.iterrows():
            # NOTE(review): eval() on file contents -- only safe for trusted data
            train_points = eval(row["points"])
            train_lonlat = utils.idx_to_lonlat(train_points, format="tuples")

            timestart = utils.tic()
            # compute common subsequences between the test trip and the
            # current candidate
            async_result = pool.apply_async(calc_lcss,
                                            (test_lonlat, train_lonlat))
            _, subseqs_idx = async_result.get()
            elapsed = utils.tictoc(timestart)

            # sort by decr. length
            subseqs_idx = sorted(subseqs_idx, key=len, reverse=True)
            # update the list of the longest subsequences
            max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx,
                                                   k, elapsed, row)
        print("Got %d common subsequences" % len(max_subseqs))
    finally:
        # release the workers on failure as well
        pool.close()
        pool.join()
    return max_subseqs
def question_b(train_file, number_of_cells, output_folder):
    """Build the grid from the training data and map the trips to features.

    The grid is pickled to "grid.pickle" and the features are written to
    "tripFeatures.csv", both inside output_folder.

    Returns:
        tuple -- (feature file path, grid file path)
    """
    # read the data and build the grid out of its coordinate extents
    train_df = pd.read_csv(train_file)
    max_lonlat, min_lonlat, all_lats, all_lons = gvp.find_min_max_latlon(
        train_df, output_folder)
    grid = gvp.create_grid(number_of_cells, max_lonlat, min_lonlat,
                           all_lats, all_lons, output_folder=output_folder)

    # persist the grid
    grid_file = os.path.join(output_folder, "grid.pickle")
    with open(grid_file, "wb") as grid_handle:
        pickle.dump(grid, grid_handle)

    # transform the data, timing the transformation
    feature_file = os.path.join(output_folder, "tripFeatures.csv")
    started = utils.tic()
    gvp.map_to_features_bow(train_df, grid, feature_file)
    print("Generated features in", utils.tictoc(started))

    return feature_file, grid_file
def question_a1(output_folder, clean_file, test_file, paropts, k):
    """Find and visualize the k nearest training neighbours of each test trip.

    Arguments:
        output_folder {str} -- folder for the visualization outputs
        clean_file {str} -- csv file with the cleaned training data
        test_file {str} -- file with one test trip per line, under a
            "Trajectory" header
        paropts -- parallelization options, forwarded to nn.calculate_nns
        k {int} -- number of nearest neighbours to extract per test trip
    """
    # NOTE(review): recent pandas versions may refuse "\n" as a delimiter --
    # verify against the pandas version in use.
    test_df = pd.read_csv(test_file, delimiter="\n")
    train_df = pd.read_csv(clean_file)
    print(
        "Extracting %d nearest neighbours out of %d cleaned train data, for each test trip"
        % (k, len(train_df)))
    print("Using parallelization options:", paropts)

    num_test = len(test_df)
    for index, row in test_df.iterrows():
        print("Examining test element %d / %d" % (index + 1, num_test))
        outfile_name = os.path.join(output_folder, "nn_%d_" % (index + 1))

        # prepare to count time
        millis_start = utils.tic()
        # NOTE(review): eval() on file contents -- only safe for trusted data
        test_points = eval(row["Trajectory"])
        # k is forwarded explicitly; otherwise the callee's default would be
        # used no matter what was requested (and announced above)
        nns_ids_distances = nn.calculate_nns(test_points, train_df,
                                             paropts=paropts, k=k)
        # get time elapsed
        elapsed = utils.tictoc(millis_start)

        # visualize
        nn.visualize_nns(test_points, nns_ids_distances, outfile_name,
                         elapsed, index)
def produce_outputs(self):
    """Build a per-instance, per-word classification report into self.result.

    Outline, as far as the code shows:
      1. the input parameters are read from the first Dictionary datapack;
      2. one index -> label dict is built per entry of
         self.params.label_mappings (".json" paths are loaded from disk);
      3. for each prediction chain its datapack is collected, along with the
         tagged indices of the chain (when an index tag is configured);
      4. for every "ngram_inst*" tag and every word index under it, the
         prediction steps are walked in order, recording the top predictions
         of each step for as long as the word survives.

    The report is stored as
    {"results": [...], "input_params": ..., "messages": ...}.

    NOTE(review): data is only assigned when the data chain holds Text, any
    other type fails with a NameError further down -- confirm that is intended.
    """
    # get input configuration data
    self.topk = None
    self.messages = []
    self.input_parameters_dict = [
        dp for dp in self.data_pool.data if type(dp.data) == Dictionary
    ][0].data.instances
    self.input_parameters = to_namedtuple(self.input_parameters_dict,
                                          "input_parameters")
    # get reference data by chain name output
    self.label_mapping = []
    for mapping in self.params.label_mappings:
        # read json; anything else is used as-is
        if type(mapping) is str and mapping.endswith(".json"):
            with open(mapping) as f:
                mapping = json.load(f)
        # position in the mapping -> label
        mapping_dict = {ix: val for (ix, val) in enumerate(mapping)}
        self.label_mapping.append(mapping_dict)
    # first datapack produced by the configured data chain
    datapack = [
        x for x in self.data_pool.data if x.chain == self.params.data_chain
    ][0]
    predictions, tagged_idx = [], []
    for i, chain_name in enumerate(self.params.pred_chains):
        # predictions
        chain_preds = [
            x for x in self.data_pool.data if x.chain == chain_name
        ][0]
        predictions.append(chain_preds)
        # get tagged index
        idx_tag_name = self.params.idx_tags[i]
        if idx_tag_name is None:
            # no tagged index is recorded for this chain
            continue
        # get data with indices
        idx_data = [
            x for x in self.data_pool.data
            if type(x.data) == DummyData
            and x.has_usage(Indices, allow_superclasses=False)
        ]
        # get data with indices with the desired tag
        idx_data = [
            x for x in idx_data if idx_tag_name in x.get_usage(
                Indices, allow_superclasses=False).tags
        ][0]
        idx_data = idx_data.get_usage(Indices, allow_superclasses=False)
        idx = idx_data.get_tag_instances(idx_tag_name)
        tagged_idx.append(idx)
    # for text data, keep just the words
    if type(datapack.data) == Text:
        data = [x["words"] for x in datapack.data.instances]
    res = []
    # get final scores
    # final_preds = predictions[len(predictions)-1].data.instances
    # final_surv_idx = tagged_idx[len(predictions)-1]
    # curr_surv_idx = final_surv_idx
    # # find which words the survivors belong to
    # for idx in reversed(tagged_idx[:-1]):
    #     curr_surv_idx = idx[curr_surv_idx]
    res = []
    # contextualize wrt. each instance (specified by the ngram tag)
    num_all_ngrams = len(predictions[0].data.instances)
    num_steps = len(predictions)
    index_mapper = IndexMapper(num_all_ngrams, tagged_idx)
    ngram_tags = sorted(
        [x for x in datapack.usages[0].tags if x.startswith("ngram_inst")])
    with tictoc("Classification report building", announce=False):
        for n, ngram_tag in enumerate(ngram_tags):
            # indexes of the tokens for the current instance
            # to the entire data container
            original_instance_ix_data = datapack.usages[
                0].get_tag_instances(ngram_tag)
            inst_obj = {
                "instance": n,
                "data": [data[i] for i in original_instance_ix_data],
                "predictions": []
            }
            for local_word_idx, ix in enumerate(original_instance_ix_data):
                # NOTE(review): looks like a debugging leftover (prints an
                # empty line for one specific word) -- confirm and remove
                if data[ix] == "δυο":
                    print()
                word_obj = {
                    "word": data[ix],
                    "word_idx": int(local_word_idx),
                    "detailed_preds": [],
                    "overall_preds": {}
                }
                # final stages
                # NOTE(review): never read afterwards in this method
                final_stages_for_word = []
                # for each step
                for step_idx in range(num_steps):
                    preds = predictions[step_idx].data.instances
                    step_name = self.params.pred_chains[step_idx]
                    step_obj = {"name": step_name, "step_index": step_idx}
                    if step_idx == 0 or index_mapper.index_survives(
                            ix, target_level=step_idx):
                        # we want the position of in the pred. container
                        # previous to the step
                        surv_idx = index_mapper.convert_index(
                            ix, target_level=step_idx - 1)
                        step_preds = preds[surv_idx, :]
                        scores, classes = self.get_topK_preds(
                            step_preds, self.label_mapping[step_idx],
                            self.params.only_report_labels[step_idx])
                        # label -> score, from the first row of the top-K output
                        step_obj["step_preds"] = {
                            c: s
                            for (c, s) in zip(classes[0], scores[0])
                        }
                        # the last step provides the overall prediction
                        if step_idx == num_steps - 1:
                            word_obj["overall_preds"] = step_obj[
                                "step_preds"]
                        # add to detailed predictions, if not omitted
                        if not self.omit_detailed_results():
                            word_obj["detailed_preds"].append(step_obj)
                    else:
                        # the word did not survive up to this step
                        # NOTE(review): the nesting of the else / break below
                        # should be double-checked against the intended flow
                        if self.params.report_if_fail is not None:
                            if step_name in self.params.report_if_fail:
                                surv_idx = index_mapper.convert_index(
                                    ix, target_level=step_idx - 1)
                                if surv_idx is None:
                                    break
                                step_preds = preds[surv_idx, :]
                                scores, classes = self.get_topK_preds(
                                    step_preds, self.label_mapping[step_idx],
                                    self.params.only_report_labels[step_idx])
                                step_obj["step_preds"] = {
                                    c: s
                                    for (c, s) in zip(classes[0], scores[0])
                                }
                                # since it fails, it's def. a final step for this word
                                word_obj["overall_preds"] = step_obj[
                                    "step_preds"]
                                # add to detailed predictions, if not omitted
                                if not self.omit_detailed_results():
                                    word_obj["detailed_preds"].append(
                                        step_obj)
                        else:
                            # add the score of the last classification
                            scores, classes = [], []
                            step_obj["step_preds"] = {}
                        # later steps are not examined for a failed word
                        break
                # add if there's info in it
                if not word_obj["detailed_preds"]:
                    del word_obj["detailed_preds"]
                inst_obj["predictions"].append(word_obj)
            res.append(inst_obj)
    self.result = {
        "results": res,
        "input_params": self.input_parameters_dict,
        "messages": self.messages
    }
def produce_outputs(self):
    """Build a threshold-based, per-word classification report into self.result.

    Outline, as far as the code shows:
      1. the input parameters are read from the first Dictionary datapack;
      2. one index -> label dict is built per entry of
         self.params.label_mappings (string entries are loaded as json files);
      3. if a configured threshold name is missing from the input parameters,
         an empty report carrying an explanatory message is stored and the
         method returns;
      4. predictions of three steps are thresholded into the boolean flags
         "modified", "deleted" and "replaced" per word;
      5. for every "ngram_inst*" tag and every word index under it, the
         modify / delete / replace predictions are reported.

    NOTE(review): exactly three thresholds / prediction chains are assumed
    (columns 0-2 are filled by hand and the flags are unpacked into three
    names) -- confirm against the configuration.
    NOTE(review): data is only assigned when the data chain holds Text.
    """
    # get input configuration data
    self.topk = None
    self.messages = []
    self.input_parameters_dict = [dp for dp in self.data_pool.data if type(dp.data) == Dictionary][0].data.instances
    self.input_parameters = to_namedtuple(self.input_parameters_dict, "input_parameters")
    # get reference data by chain name output
    self.label_mapping = []
    for mapping in self.params.label_mappings:
        # read json
        if type(mapping) is str:
            # NOTE(review): the bare except also catches KeyboardInterrupt /
            # SystemExit -- consider narrowing it
            try:
                with open(mapping) as f:
                    mapping = json.load(f)
            except:
                error("Requires json labelmapping or literal list")
        # position in the mapping -> label
        mapping_dict = {ix: val for (ix, val) in enumerate(mapping)}
        self.label_mapping.append(mapping_dict)
    # thresholding: every configured threshold has to be among the input parameters
    for th in self.params.thresholds:
        if th not in self.input_parameters_dict:
            self.result = {"results": [], "input_params": self.input_parameters_dict, "messages": [f"Threshold {th} missing from input parameters"]}
            return
    # first datapack produced by the configured data chain
    datapack = [x for x in self.data_pool.data if x.chain == self.params.data_chain][0]
    # NOTE(review): tagged_idx is never used in this method
    predictions, tagged_idx = [], []
    for i, chain_name in enumerate(self.params.pred_chains):
        # predictions
        chain_preds = [x for x in self.data_pool.data if x.chain == chain_name][0]
        predictions.append(chain_preds)
    # for text data, keep just the words
    if type(datapack.data) == Text:
        data = [x["words"] for x in datapack.data.instances]
    res = []
    # from here on, predictions holds the raw instances of each chain
    predictions = [x.data.instances for x in predictions]
    num_all_ngrams = len(predictions[0])
    num_steps = len(predictions)
    # compute thresholding values
    thresholding = np.zeros((num_all_ngrams, len(self.params.thresholds)), bool)
    # for i, th in enumerate(self.params.thresholds):
    #     th_val = float(self.input_parameters_dict[th])
    # steps 0 and 1: column 1 of the predictions against the threshold
    thresholding[:, 0] = predictions[0][:, 1] > float(self.input_parameters_dict[self.params.thresholds[0]])
    thresholding[:, 1] = predictions[1][:, 1] > float(self.input_parameters_dict[self.params.thresholds[1]])
    # step 2: any column above the threshold
    thresholding[:, 2] = np.any(predictions[2] > float(self.input_parameters_dict[self.params.thresholds[2]]), axis=1)
    ngram_tags = sorted([x for x in datapack.usages[0].tags if x.startswith("ngram_inst")])
    with tictoc("Classification report building", announce=False):
        for n, ngram_tag in enumerate(ngram_tags):
            # indexes of the tokens for the current instance
            # to the entire data container
            original_instance_ix_data = datapack.usages[0].get_tag_instances(ngram_tag)
            inst_obj = {"instance": n, "data": [data[i] for i in original_instance_ix_data], "predictions": []}
            for local_word_idx, ix in enumerate(original_instance_ix_data):
                word_obj = {"word": data[ix], "word_idx": int(local_word_idx), "overall_predictions": {}}
                # label -> rounded score dict of each step, in step order
                detailed = []
                # for each step
                for step_idx in range(num_steps):
                    preds = predictions[step_idx]
                    step_name = self.params.pred_chains[step_idx]
                    # NOTE(review): step_obj and survives are not read afterwards
                    step_obj = {"name": step_name, "step_index": step_idx}
                    survives = thresholding[ix, step_idx]
                    # the word's prediction row, as a single-row 2D array
                    step_preds = np.expand_dims(preds[ix, :], axis=0)
                    scores, classes = self.get_topK_preds(step_preds, self.label_mapping[step_idx], self.params.only_report_labels[step_idx])
                    step_preds = {c: round(s, 4) for (c, s) in zip(classes[0], scores[0])}
                    step_obj["step_preds"] = step_preds
                    detailed.append(step_preds)
                # the three threshold flags of the word
                modified, deleted, replaced = thresholding[ix, :]
                # the first step has to report a "modify" label
                modify_obj = {"modified": int(modified), "prob": detailed[0]["modify"]}
                word_obj["overall_predictions"]["modify_prediction"] = modify_obj
                delete_obj = detailed[1]
                # replaced
                objs = []
                for word, prob in detailed[2].items():
                    objs.append({"word": word, "prob": prob})
                replace_obj = objs
                # deletion / replacement are only reported for modified words,
                # with deletion taking precedence
                if modified:
                    if deleted:
                        # deleted
                        word_obj["overall_predictions"]["delete_prediction"] = delete_obj
                    elif replaced:
                        word_obj["overall_predictions"]["replace_prediction"] = replace_obj
                if not self.omit_detailed_results():
                    word_obj["detailed_predictions"] = {"modify_prediction": modify_obj, "delete_prediction": delete_obj, "replace_prediction": replace_obj}
                inst_obj["predictions"].append(word_obj)
            res.append(inst_obj)
    self.result = {"results": res, "input_params": self.input_parameters_dict, "messages": self.messages}