########### Inputs and labels ############
##########################################

print("Loading inputs and labels")

# Vector space model of GloVe embeddings trained on article full texts
vsm = pd.read_csv("../data/text/glove_gen_n100_win15_min5_iter500_190428.txt",
                  sep=" ", index_col=0, header=0)
n_vocab, n_emb = vsm.shape

# Document-term matrix generated from article full texts or titles
dtm = {}
for inp in inputs:
    dtm_inp = utilities.load_doc_term_matrix(path="../", inputs=inp)
    dtm[inp] = dtm_inp[dtm_inp.columns.intersection(vsm.index)]
X = dtm

# Output labels are brain activation coordinates
Y = utilities.load_coordinates(path="../")
m, n_structs = Y.shape

# Splits of the article PMIDs
splits = {}
for split in ["train", "dev", "test"]:
    splits[split] = [int(pmid.strip()) for pmid in
                     open("../data/splits/{}.txt".format(split), "r").readlines()]
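# Illustrative sanity check (a sketch, not part of the original pipeline): it
# assumes that `inputs` includes "texts" and that X[inp] and Y are both indexed
# by article PMID, so a split can slice matched input/label rows.
X_train = X["texts"].loc[splits["train"]]
Y_train = Y.loc[splits["train"]]
assert X_train.index.equals(Y_train.index)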
def train_classifier(framework, direction, suffix="", clf="", dtm_version=190325): fit_file = "fits/{}_{}.p".format(framework, direction) if not os.path.isfile(fit_file): # Load the data splits splits = {} for split in ["train", "validation"]: splits[split] = [ int(pmid.strip()) for pmid in open( "../../data/splits/{}.txt".format(split), "r").readlines() ] # Load the activation coordinate and text data act_bin = utilities.load_coordinates(path="../../data") dtm_bin = utilities.load_doc_term_matrix(version=dtm_version, binarize=True, path="../../data") # Score the texts using the framework lists, circuits = utilities.load_framework(framework, suffix=suffix, clf=clf, path="../../ontology") scores = utilities.score_lists(lists, dtm_bin) # Specify the hyperparameters for the randomized grid search param_grid = { "penalty": ["l1", "l2"], "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000], "fit_intercept": [True, False] } param_list = list( ParameterSampler(param_grid, n_iter=28, random_state=42)) max_iter = 1000 # Split the train set into batches and load the validation set as a batch if direction == "forward": train_set = [ scores.loc[splits["train"]], act_bin.loc[splits["train"]] ] val_set = [ scores.loc[splits["validation"]], act_bin.loc[splits["validation"]] ] elif direction == "reverse": train_set = [ act_bin.loc[splits["train"]], scores.loc[splits["train"]] ] val_set = [ act_bin.loc[splits["validation"]], scores.loc[splits["validation"]] ] # Search for the optimal hyperparameter combination op_fit = optimize_hyperparameters(param_list, train_set, val_set, max_iter=max_iter) # Export the optimized results pickle.dump(op_fit, open(fit_file, "wb"), protocol=2)
args = parser.parse_args()
data = args.data
if data not in ["titles", "texts"]:
    raise ValueError("An invalid option for `--data` was supplied; "
                     "options are ['titles', 'texts']")

# Load the GloVe vector space model
vsm = pd.read_csv("../data/text/glove_gen_n100_win15_min5_iter500_190428.txt",
                  sep=" ", index_col=0, header=0)
n_vocab, n_emb = vsm.shape

# Load the document-term matrix, keeping only terms with GloVe embeddings
X = utilities.load_doc_term_matrix(path="../", inputs=data)
X = X[X.columns.intersection(vsm.index)]
m, n_terms = X.shape
lexicon = list(X.columns)
vsm = vsm.loc[lexicon]

# Load the data splits
splits = utilities.load_splits(splits=["train", "dev"], path="../", limit=5000)

# Zero out embeddings for terms that did not occur in articles
def load_emb(split):
    emb = np.zeros((n_terms, n_emb, len(splits[split])))
    occ = X.loc[splits[split]]
    for i, pmid in enumerate(splits[split]):
        terms = occ.columns[occ.values[i, :] == 0]
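# ----------------------------------------------------------------------------
# Toy illustration of the masking pattern used in `load_emb` above (synthetic
# data only; the names `toy_X`, `toy_vsm`, and `toy_emb` are hypothetical and
# not part of the pipeline): embeddings for terms that do not occur in a given
# document remain zero, while occurring terms receive their GloVe vectors.
# ----------------------------------------------------------------------------
import numpy as np
import pandas as pd

toy_vsm = pd.DataFrame(np.random.rand(3, 2),
                       index=["amygdala", "fear", "memory"])
toy_X = pd.DataFrame([[1, 0, 2], [0, 3, 0]],
                     index=[111, 222], columns=toy_vsm.index)

toy_emb = np.zeros((3, 2, 2))  # terms x embedding dimensions x documents
for i, pmid in enumerate(toy_X.index):
    occurring = toy_X.values[i, :] > 0
    toy_emb[occurring, :, i] = toy_vsm.values[occurring, :]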
def train_classifier(framework, direction, suffix="", clf="", dtm_version=190325, opt_epochs=500, train_epochs=1000, use_hyperparams=False): # Load the data splits splits = {} for split in ["train", "validation"]: splits[split] = [ int(pmid.strip()) for pmid in open( "../../data/splits/{}.txt".format(split), "r").readlines() ] # Load the activation coordinate and text data act_bin = utilities.load_coordinates(path="../../data") dtm_bin = utilities.load_doc_term_matrix(version=dtm_version, binarize=True, path="../../data") # Score the texts using the framework lists, circuits = utilities.load_framework(framework, suffix=suffix, clf=clf, path="../../ontology") scores = utilities.score_lists(lists, dtm_bin) # If hyperparameters have already been optimizd, use them param_file = "../data/params_{}_{}_{}epochs.csv".format( framework, direction, opt_epochs) if use_hyperparams: params = pd.read_csv(param_file, header=None, index_col=0) param_grid = { "lr": [float(params.loc["lr"])], "weight_decay": [float(params.loc["weight_decay"])], "n_hid": [int(params.loc["n_hid"])], "p_dropout": [float(params.loc["p_dropout"])] } param_list = list( ParameterSampler(param_grid, n_iter=1, random_state=42)) n_epochs = train_epochs # Otherwise, specify hyperparameters for a randomized grid search elif not use_hyperparams: param_grid = { "lr": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0], "weight_decay": [0.00001, 0.0001, 0.001, 0.01, 0.1], "n_hid": [25, 50, 75, 100, 125, 150], "p_dropout": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] } param_list = list( ParameterSampler(param_grid, n_iter=100, random_state=42)) n_epochs = opt_epochs batch_size = 1024 # Split the train set into batches and load the validation set as a batch if direction == "forward": train_set = load_mini_batches(scores, act_bin, splits["train"], mini_batch_size=batch_size, seed=42) val_set = load_mini_batches(scores, act_bin, splits["validation"], mini_batch_size=len(splits["validation"]), seed=42) elif direction == "reverse": train_set = load_mini_batches(act_bin, scores, splits["train"], mini_batch_size=batch_size, seed=42) val_set = load_mini_batches(act_bin, scores, splits["validation"], mini_batch_size=len(splits["validation"]), seed=42) # Search for the optimal hyperparameter combination op_state_dict, op_params, op_loss = optimize_hyperparameters( param_list, train_set, val_set, n_epochs=n_epochs) # Export the trained neural network fit_file = "../fits/{}_{}_{}epochs.pt".format(framework, direction, n_epochs) torch.save(op_state_dict, fit_file) # Export the hyperparameters with open(param_file, "w+") as file: file.write("\n".join( ["{},{}".format(param, val) for param, val in op_params.items()])) # Export the loss over epochs loss_file = "../data/loss_{}_{}_{}epochs.csv".format( framework, direction, n_epochs) pd.DataFrame(op_loss, index=None, columns=["LOSS"]).to_csv(loss_file)