def create_y(data):
    """Return the class label of every candidate pair in *data* as a flat list.

    Documents are walked in order, then each document's ``sentences``, then
    each sentence's ``pairs``, so the labels come out in that traversal order.

    A pair whose ``ddi`` attribute equals the string ``"true"`` is labelled
    ``class_index[pair.type]``; every other pair is labelled
    ``class_index["null"]``.  ``class_index`` is a module-level mapping
    defined elsewhere in this file.
    """
    y = []
    for doc in data:
        for sentence in doc.sentences:
            for pair in sentence.pairs:
                # Only pairs flagged as interactions carry a meaningful type.
                label_key = pair.type if pair.ddi == "true" else "null"
                y.append(class_index[label_key])
    return y


### SPLIT DATASET ###
data = read_dataset()
n_docs = len(data)

# Seed before sampling so the train/test split is the same on every run.
np.random.seed(42)
train_amount = 0.7
train_ids = choice(n_docs, int(train_amount * n_docs), replace=False)

# Build the lookup set once: testing `i not in train_ids` directly would scan
# the whole sampled index sequence for every document index (quadratic).
_train_id_set = {int(i) for i in train_ids}
test_ids = [i for i in range(n_docs) if i not in _train_id_set]

training = [data[i] for i in train_ids]
test = [data[i] for i in test_ids]

print("%i training documents" % len(training))
print("%i test documents" % len(test))

### TRAINING ###
#features.append(("drug1_type", drug1_type)) #features.append(("drug2_type", drug2_type)) #drug1_name = pair.e1.text #drug2_name = pair.e2.text #features.append(("drug_name", drug1_name)) #features.append(("drug_name", drug2_name)) same_drug = pair.e1.text.lower() == pair.e2.text.lower() features.append(("same_drug", same_drug)) return features if __name__ == "__main__": np.random.seed(42) data = read_dataset() n_docs = len(data) n_folds = 10 folds = k_folds(n_docs, n_folds) if TEST: n_folds = 1 classes = ["int", "effect", "none", "mechanism", "advise"] cv_results = {} cv_precisions = [] cv_recalls = [] cv_fs = [] cv_2class_precisions = [] cv_2class_recalls = [] cv_2class_fs = []