def load_entities_data(self, X_PATH, Y_PATH, ENTITIES_PATH):
    '''
    load the fundamental data: X (input), Y (labels), entities (entity names)

    params:
        X_PATH: the path to the input vectors
        Y_PATH: the path to the labels of the input vectors
        ENTITIES_PATH: the path to the entity names of the input vectors
    '''
    self.setup_print_times(keywords='entities data')
    self.X = load_data_with_pickle(X_PATH)
    self.Y = load_data_with_pickle(Y_PATH)
    self.entities = load_data_with_pickle(ENTITIES_PATH)
    self.print_loaded()
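# Minimal usage sketch for the method above (the DatasetManager class name and the pickle
# paths are hypothetical, only the method itself comes from this file):
#
#     manager = DatasetManager()
#     manager.load_entities_data('data/X.pkl', 'data/Y.pkl', 'data/entities.pkl')
#     assert len(manager.X) == len(manager.Y) == len(manager.entities)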
def load_concept_embeddings(self, CONCEPT_EMBEDDING_PATHS, data, nickel=True):
    '''
    load the concept embeddings and transform them into dicts, e.g.
    self.concept_embeddings is a dict in the format
        {concept_embedding_name_0: {concept_name_0: vector, ..., concept_name_N: vector},
         concept_embedding_name_1: {concept_name_0: vector, ..., concept_name_M: vector}}

    params:
        CONCEPT_EMBEDDING_PATHS: a list of paths, each path points to a concept embedding
                                 (index 0: distributional embedding, index 1: hyperbolic embedding)
    '''
    # note: the data parameter is not used in this variant
    self.setup_print_times(keywords='concept embeddings')
    self.concept_embeddings = {}
    self.concept_embeddings['hyperbolic'] = self.load_nickel(CONCEPT_EMBEDDING_PATHS[1])
    # self.concept_embeddings['hyperbolic'] = self.load_nickel2(CONCEPT_EMBEDDING_PATHS[1])
    # strip whitespace from concept names so that later lookups are consistent
    self.concept_embeddings['hyperbolic'] = {k.strip(): v
                                             for k, v in self.concept_embeddings['hyperbolic'].items()}
    if nickel:
        self.concept_embeddings['distributional'] = self.load_nickel(CONCEPT_EMBEDDING_PATHS[0])
    else:
        self.concept_embeddings['distributional'] = load_data_with_pickle(CONCEPT_EMBEDDING_PATHS[0])
    self.print_loaded()
def load_concept_embeddings(self, CONCEPT_EMBEDDING_PATHS, nickel=False, cleaned=False):
    '''
    load the concept embeddings and transform them into dicts, e.g.
    self.concept_embeddings is a dict in the format
        {concept_embedding_name_0: {concept_name_0: vector, ..., concept_name_N: vector},
         concept_embedding_name_1: {concept_name_0: vector, ..., concept_name_M: vector}}

    params:
        CONCEPT_EMBEDDING_PATHS: a list of paths, each path points to a concept embedding
                                 (index 0: distributional embedding, index 1: hyperbolic embedding)
    '''
    self.setup_print_times(keywords='concept embeddings')
    if not nickel:
        concept_embeddings = [load_data_with_pickle(x) for x in CONCEPT_EMBEDDING_PATHS]
        self.concept_embeddings = {'hyperbolic': concept_embeddings[1]}
    else:
        self.concept_embeddings = {}
        self.concept_embeddings['hyperbolic'] = self.load_nickel(CONCEPT_EMBEDDING_PATHS[1])
        # self.concept_embeddings['hyperbolic'] = self.load_nickel2(CONCEPT_EMBEDDING_PATHS[1])
        # strip whitespace from concept names so that later lookups are consistent
        self.concept_embeddings['hyperbolic'] = {k.strip(): v
                                                 for k, v in self.concept_embeddings['hyperbolic'].items()}
    self.concept_embeddings['distributional'] = load_data_with_pickle(CONCEPT_EMBEDDING_PATHS[0])

    if not cleaned:
        # transform the gensim embedding into a dict {key: vector}
        self.concept_embeddings['distributional'] = {k: self.concept_embeddings['distributional'][k]
                                                     for k in self.concept_embeddings['distributional'].wv.vocab}
    self.print_loaded()
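# Hedged usage sketch for the variant above (the paths and the example concept key are
# hypothetical; index 0 of the list is read as the distributional embedding and index 1
# as the hyperbolic one, as the code above does):
#
#     manager.load_concept_embeddings(['embeddings/distributional.pkl',
#                                      'embeddings/hyperbolic_nickel.pkl'],
#                                     nickel=False, cleaned=False)
#     hyperbolic_vector = manager.concept_embeddings['hyperbolic']['some_concept']
#     distributional_vector = manager.concept_embeddings['distributional']['some_concept']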
def load_raw_dataset(self, load_path):
    # load_path is a path prefix: the file names 'X', 'Y' and 'entities' are appended
    # directly to it, so it has to end with the directory separator
    self.X = load_data_with_pickle(load_path + 'X')
    self.Y = load_data_with_pickle(load_path + 'Y')
    self.entities = load_data_with_pickle(load_path + 'entities')
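# Usage sketch (the directory name is hypothetical; note the trailing '/' required by the
# string concatenation above):
#
#     manager.load_raw_dataset('datasets/raw/')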
    fraction = 1
    datasetManager.shuffle_dataset_and_sample(fraction=fraction, in_place=True)
    datasetManager.split_data_by_unique_entities(exclude_min_threshold=exclude_min_threshold)
    print('Train: {} vectors, Val: {} vectors, Test: {} vectors'.format(len(datasetManager.Y_train),
                                                                        len(datasetManager.Y_val),
                                                                        len(datasetManager.Y_test)))
elif load_dataset:
    print('... loading datasets ...')
    t = time.time()
    datasetManager.X_train = load_data_with_pickle(X_TRAIN_PATH)
    datasetManager.X_test = load_data_with_pickle(X_TEST_PATH)
    datasetManager.X_val = load_data_with_pickle(X_VAL_PATH)
    datasetManager.Y_train = load_data_with_pickle(Y_TRAIN_PATH)
    datasetManager.Y_test = load_data_with_pickle(Y_TEST_PATH)
    datasetManager.Y_val = load_data_with_pickle(Y_VAL_PATH)
    datasetManager.E_train = load_data_with_pickle(E_TRAIN_PATH)
    datasetManager.E_test = load_data_with_pickle(E_TEST_PATH)
    datasetManager.E_val = load_data_with_pickle(E_VAL_PATH)
    print('--- dataset loaded in {:.2f} seconds ---'.format(time.time() - t))

# datasetManager.plot_datasets()
# datasetManager.print_statistic_on_dataset()

print('... creating numeric dataset ...')
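# Hedged sketch of the complementary save step (not shown in this excerpt): the splits
# produced by split_data_by_unique_entities have to be written to the *_PATH files above
# before the load_dataset branch can be used. Plain pickle is assumed here, not a
# repository helper:
#
#     import pickle
#     for path, data in [(X_TRAIN_PATH, datasetManager.X_train),
#                        (Y_TRAIN_PATH, datasetManager.Y_train),
#                        (E_TRAIN_PATH, datasetManager.E_train)]:
#         with open(path, 'wb') as f:
#             pickle.dump(data, f)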