def test_predicting_thrice():
    trainCorpus1, trainCorpus2 = generateTestData(positiveCount=100, negativeCount=100, relTypes=1)
    trainCorpus3, testCorpus = generateTestData(positiveCount=100, negativeCount=100, relTypes=1)

    for doc in trainCorpus1.documents:
        for r in doc.relations:
            r.relationType = 'type1'
    for doc in trainCorpus2.documents:
        for r in doc.relations:
            r.relationType = 'type2'
    for doc in trainCorpus3.documents:
        for r in doc.relations:
            r.relationType = 'type3'

    testCorpus.removeRelations()

    classifier1 = kindred.RelationClassifier()
    classifier1.train(trainCorpus1)
    classifier2 = kindred.RelationClassifier()
    classifier2.train(trainCorpus2)
    classifier3 = kindred.RelationClassifier()
    classifier3.train(trainCorpus3)

    classifier1.predict(testCorpus)
    classifier2.predict(testCorpus)
    classifier3.predict(testCorpus)

    relations = [ r for doc in testCorpus.documents for r in doc.relations ]
    assert len(relations) == len(set(relations)), "Duplicate relations found in predictions"

def test_simpleRelationClassifier_triple():
    trainCorpus, testCorpusGold = generateTestData(entityCount=3, positiveCount=100, negativeCount=100)

    trainRelations = trainCorpus.getRelations()
    assert len(trainRelations) == 50
    for r in trainRelations:
        assert len(r.entities) == 3

    predictionCorpus = testCorpusGold.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier(entityCount=3)
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    predictedRelations = predictionCorpus.getRelations()
    assert len(predictedRelations) == 50
    for r in predictedRelations:
        assert len(r.entities) == 3

    f1score = kindred.evaluate(testCorpusGold, predictionCorpus, metric='f1score')
    assert f1score == 1.0

def test_singleClassifier_triple():
    trainCorpus, devCorpus = generateTestData(entityCount=3, positiveCount=100, negativeCount=100, relTypes=1)

    trainRelations = trainCorpus.getRelations()
    assert len(trainRelations) == 50
    for r in trainRelations:
        assert len(r.entities) == 3

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier(entityCount=3)
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    predictedRelations = predictionCorpus.getRelations()
    assert len(predictedRelations) == 50
    for r in predictedRelations:
        assert len(r.entities) == 3

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert round(f1score, 3) == 1.0

def _featureBuilding(useBB3):
    #trainData, testData = generateTestData(positiveCount=100,negativeCount=100)
    if useBB3:
        trainData = kindred.bionlpst.load('2016-BB3-event-train')
        testData = kindred.bionlpst.load('2016-BB3-event-dev')
        chosenFeatures = ["entityTypes", "unigramsBetweenEntities"]
    else:
        trainData = kindred.bionlpst.load('2016-SeeDev-binary-train')
        testData = kindred.bionlpst.load('2016-SeeDev-binary-dev')
        chosenFeatures = ["entityTypes", "unigramsBetweenEntities", "bigrams", "dependencyPathEdges", "dependencyPathEdgesNearEntities"]

    testData_TextAndEntities = [d.getTextAndEntities() for d in testData]
    testData_Relations = [d.getRelations() for d in testData]

    finalChosenFeatures = []
    for threshold in range(1, 100, 2):
        fthreshold = threshold / float(100)

        classifier = kindred.RelationClassifier(useBuilder=False, features=chosenFeatures, threshold=fthreshold)
        classifier.train(trainData)
        predictedRelations = classifier.predict(testData_TextAndEntities)

        evaluator = Evaluator()
        precision, recall, f1score = evaluator.evaluate(testData_Relations, predictedRelations, metric='all', display=False)
        print(fthreshold, precision, recall, f1score)

def _nary():
    trainData = kindred.BioNLPSTData('2016-SeeDev-binary-training_and_development')
    testData = kindred.BioNLPSTData('2016-SeeDev-binary-test')

    classifier_merge = kindred.RelationClassifier(naryMethod='merge')
    classifier_merge.train(trainData)
    predictedRelations_merge = classifier_merge.predict(testData)
    kindred.saveST('BB3-predictions-merge/', testData, predictedRelations_merge)

    classifier_combined = kindred.RelationClassifier(naryMethod='combine')
    classifier_combined.train(trainData)
    predictedRelations_combined = classifier_combined.predict(testData)
    kindred.saveST('BB3-predictions-combined/', testData, predictedRelations_combined)

def test_doublelabels():
    trainCorpus, devCorpus = generateTestData(positiveCount=100, negativeCount=100, relTypes=1)

    for doc in trainCorpus.documents:
        newRelations = [ kindred.Relation("anotherLabel", r.entities, r.argNames) for r in doc.relations ]
        doc.relations += newRelations
    for doc in devCorpus.documents:
        newRelations = [ kindred.Relation("anotherLabel", r.entities, r.argNames) for r in doc.relations ]
        doc.relations += newRelations

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert round(f1score, 3) == 1.0

def test_simpleRelationClassifier_emptyTrainCorpus():
    trainCorpus, testCorpus = generateTestData(positiveCount=100, negativeCount=100)

    for doc in trainCorpus.documents:
        doc.entities = []
        doc.relations = []

    classifier = kindred.RelationClassifier()
    with pytest.raises(RuntimeError) as excinfo:
        classifier.train(trainCorpus)
    assert excinfo.value.args == ('No candidate relations found in corpus for training. Does the corpus contain text and entity annotations with at least one sentence containing 2 entities.',)

def test_logisticregression_threshold():
    trainCorpus, devCorpus = generateTestData(positiveCount=100, negativeCount=100, relTypes=1)

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier(classifierType='LogisticRegression', threshold=0.3)
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert round(f1score, 3) == 1.0

def test_filterByEntityTypes_validTypes():
    trainCorpus, devCorpus = generateTestData(positiveCount=100, negativeCount=100, relTypes=1)

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier(features=["unigramsBetweenEntities", "bigrams", "dependencyPathEdges", "dependencyPathEdgesNearEntities"])
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert round(f1score, 3) == 1.0

def test_simpleRelationClassifier_emptyTestCorpus():
    trainCorpus, testCorpus = generateTestData(positiveCount=100, negativeCount=100)

    for doc in testCorpus.documents:
        doc.entities = []
        doc.relations = []

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(testCorpus)

    assert len(testCorpus.getRelations()) == 0

def test_simpleRelationClassifier_binary():
    trainCorpus, testCorpusGold = generateTestData(positiveCount=100, negativeCount=100)

    predictionCorpus = testCorpusGold.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(testCorpusGold, predictionCorpus, metric='f1score')
    assert f1score == 1.0

def test_singleClassifier():
    trainCorpus, devCorpus = generateTestData(positiveCount=100, negativeCount=100, relTypes=1)

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert round(f1score, 3) == 1.0

def test_bionlp_BB3_classifier():
    trainCorpus = kindred.bionlpst.load('2016-BB3-event-train')
    devCorpus = kindred.bionlpst.load('2016-BB3-event-dev')

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert f1score > 0.4

def _bionlpst_seedev():
    trainCorpus = kindred.bionlpst.load('2016-SeeDev-binary-train')
    devCorpus = kindred.bionlpst.load('2016-SeeDev-binary-dev')

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    print("f1score:", f1score)

def test_bionlp_SeeDev_classifier():
    trainCorpus = kindred.bionlpst.load('2016-SeeDev-binary-train')
    devCorpus = kindred.bionlpst.load('2016-SeeDev-binary-dev')

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert round(f1score, 3) == 0.349

def test_predicting_duplicates():
    trainCorpus, testCorpus = generateTestData(positiveCount=100, negativeCount=100, relTypes=1)

    testCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)

    classifier.predict(testCorpus)
    classifier.predict(testCorpus)
    classifier.predict(testCorpus)

    relations = [ r for doc in testCorpus.documents for r in doc.relations ]
    assert len(relations) == len(set(relations)), "Duplicate relations found in predictions"

def _bionlpst_seedev(swap):
    trainCorpus = kindred.bionlpst.load('2016-SeeDev-binary-train')
    devCorpus = kindred.bionlpst.load('2016-SeeDev-binary-dev')

    if swap:
        trainCorpus, devCorpus = devCorpus, trainCorpus

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    scores = kindred.evaluate(devCorpus, predictionCorpus, metric='all')
    print("seedev scores:", scores, swap)

def _SeeDevmini():
    trainCorpus = kindred.bionlpst.load('2016-SeeDev-binary-train')
    devCorpus = kindred.bionlpst.load('2016-SeeDev-binary-dev')

    trainCorpus.documents = trainCorpus.documents[1:2]
    devCorpus.documents = devCorpus.documents[:1]

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert round(f1score, 3) == 0.235

def _bionlpst_bb3(swap):
    trainCorpus = kindred.bionlpst.load('2016-BB3-event-train')
    devCorpus = kindred.bionlpst.load('2016-BB3-event-dev')

    if swap:
        trainCorpus, devCorpus = devCorpus, trainCorpus

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier(useBuilder=True)
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)

    scores = kindred.evaluate(devCorpus, predictionCorpus, metric='all')
    print("bb3 scores:", scores, swap)

def _featureBuilding(useBB3):
    #trainData, testData = generateTestData(positiveCount=100,negativeCount=100)
    if useBB3:
        trainData = kindred.bionlpst.load('2016-BB3-event-train')
        testData = kindred.bionlpst.load('2016-BB3-event-dev')
        useBuilder = True
    else:
        trainData = kindred.bionlpst.load('2016-SeeDev-binary-train')
        testData = kindred.bionlpst.load('2016-SeeDev-binary-dev')
        useBuilder = False

    testData_TextAndEntities = [d.getTextAndEntities() for d in testData]
    testData_Relations = [d.getRelations() for d in testData]

    featureChoice = ["entityTypes", "unigramsBetweenEntities", "bigrams", "dependencyPathEdges", "dependencyPathEdgesNearEntities"]

    # Greedy forward selection: at each stage, add the single feature that most improves F1
    finalChosenFeatures = []
    for stage in range(3):
        bestF1, bestFeature = -1, -1
        for feature in featureChoice:
            chosenFeatures = finalChosenFeatures + [feature]

            classifier = kindred.RelationClassifier(useBuilder=useBuilder, features=chosenFeatures)
            classifier.train(trainData)
            predictedRelations = classifier.predict(testData_TextAndEntities)

            evaluator = Evaluator()
            f1score = evaluator.evaluate(testData_Relations, predictedRelations, metric='f1score', display=False)
            print(stage, feature, f1score)

            if f1score > bestF1:
                bestF1 = f1score
                bestFeature = feature

        featureChoice.remove(bestFeature)
        finalChosenFeatures.append(bestFeature)

def _bionlpst_seedev_testSet():
    trainData = kindred.bionlpst.load('2016-SeeDev-binary-train')
    devData = kindred.bionlpst.load('2016-SeeDev-binary-dev')
    testData = kindred.bionlpst.load('2016-SeeDev-binary-test')

    trainAndDevData = trainData + devData

    print("Starting training...")
    classifier = kindred.RelationClassifier()
    classifier.train(trainAndDevData)

    print("Predicting training...")
    predictedRelations = classifier.predict(testData)  #devData_TextAndEntities)

    print("Saving...")
    outDir = 'out.SeeDev'
    kindred.save(testData, 'standoff', outDir, predictedRelations=predictedRelations)

def test_pickle():
    trainCorpus, testCorpusGold = generateTestData(positiveCount=100, negativeCount=100)

    predictionCorpus = testCorpusGold.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)

    with tempfile.NamedTemporaryFile() as tempF:
        with open(tempF.name, 'wb') as f:
            pickle.dump(classifier, f)
        with open(tempF.name, 'rb') as f:
            classifier = pickle.load(f)

    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(testCorpusGold, predictionCorpus, metric='f1score')
    assert f1score == 1.0

def _bionlpst_bb3_testSet():
    trainData = kindred.bionlpst.load('2016-BB3-event-train')
    devData = kindred.bionlpst.load('2016-BB3-event-dev')
    testData = kindred.bionlpst.load('2016-BB3-event-test')

    trainAndDevData = trainData + devData

    print("Starting training...")
    classifier = kindred.RelationClassifier(useBuilder=True)
    #classifier = RelationClassifier(useBuilder=False)
    classifier.train(trainAndDevData)

    print("Predicting training...")
    predictedRelations = classifier.predict(testData)  #devData_TextAndEntities)

    print("Saving...")
    outDir = 'out.BB3'
    kindred.save(testData, 'standoff', outDir, predictedRelations=predictedRelations)

for relationType, outModel in zip(['Driver', 'Oncogene', 'Tumor_Suppressor'],
                                  [args.outModel_Driver, args.outModel_Oncogene, args.outModel_TumorSuppressor]):
    print("Building %s model" % relationType)

    print(" Loading training")
    goldDir = 'gold'
    trainCorpus = kindred.load('standoff', args.inTrain)
    for doc in trainCorpus.documents:
        doc.relations = [ r for r in doc.relations if r.relationType == relationType ]

    print(" Doing training")
    features = "entityTypes,unigramsBetweenEntities,bigrams,dependencyPathEdges,dependencyPathEdgesNearEntities".split(',')
    threshold = thresholds[relationType]
    classifier = kindred.RelationClassifier(classifierType='LogisticRegression', threshold=threshold, features=features, acceptedEntityTypes=[('cancer', 'gene')])
    classifier.train(trainCorpus)

    print(" Saving classifier")
    with open(outModel, 'wb') as f:
        pickle.dump(classifier, f)

    print(" Output done!")

# avg_nn = 0
count = 0
iter_num = 200
print("-------------5 CLASSES------------")
while count < iter_num:
    print("------", count, "------")
    trainCorpus, devCorpus = Corpus.split(trainFraction=0.9)

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)
    svmf1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score', display=False)
    print("svm:\t", svmf1score)
    # avg_svm = avg_svm+svmf1score

    classifier = kindred.RelationClassifier(classifierType='DCT')
    classifier.train(trainCorpus)
    classifier.predict(predictionCorpus)
    dctf1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score',

# Trim back to relation type of choice
for doc in train.documents:
    doc.relations = [ r for r in doc.relations if r.relationType == args.reltype ]
for doc in gold.documents:
    doc.relations = [ r for r in doc.relations if r.relationType == args.reltype ]

entityType = entityTypes[args.reltype]
entityCount = len(entityType)

classifier = kindred.RelationClassifier(classifierType='LogisticRegression', threshold=threshold, entityCount=entityCount, acceptedEntityTypes=[entityType])
classifier.train(train)

predictions = gold.clone()
predictions.removeRelations()
classifier.predict(predictions)

TP, FN, FP = 0, 0, 0
for goldDoc, testDoc in zip(gold.documents, predictions.documents):
    goldTuples = [(r.relationType, tuple(r.entityIDs)) for r in goldDoc.relations]
    testTuples = [(r.relationType, tuple(r.entityIDs)) for r in testDoc.relations]

                    help='Directory containing stand-off test data')
args = parser.parse_args()

print("threshold\tprecision\trecall")
for threshold in np.arange(0, 1.01, 0.01):
    trainCorpus = kindred.load('standoff', args.train)
    testCorpus = kindred.load('standoff', args.test)

    predCorpus = testCorpus.clone()
    predCorpus.removeRelations()

    parser = kindred.Parser(model='en_core_sci_sm')
    parser.parse(trainCorpus)
    parser.parse(testCorpus)
    parser.parse(predCorpus)

    classifier = kindred.RelationClassifier(classifierType='LogisticRegression', threshold=threshold, acceptedEntityTypes=[('Chemical', 'Mutation')])
    classifier.train(trainCorpus)
    classifier.predict(predCorpus)

    precision, recall, f1score = kindred.evaluate(testCorpus, predCorpus, metric='all', display=False)
    print("%f\t%f\t%f" % (threshold, precision, recall))

def get_rels_clean():
    c.execute("SELECT gene, disease, relation FROM relationships")
    return c.fetchall()


# Kindred
# 5 classes
trainCorpus = kindred.load(dataFormat='json', path='relation/db/1')
devCorpus = kindred.load(dataFormat='json', path='ner-dump')

predictionCorpus = devCorpus.clone()

classifier = kindred.RelationClassifier()
classifier.train(trainCorpus)
classifier.predict(predictionCorpus)
f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')

print("5 CLASSES ---------------------")
for i in predictionCorpus.documents:
    for j in i.relations:
        rel = Relationships(j.entities[0].text, j.entities[1].text, j.relationType, i.text)
        insert_rels(rel)

result = get_rels()
for i in result:
    print(i)

for relationType, threshold, entityTypes in relationInfo:
    print("Building %s model" % relationType)

    print(" Loading training")
    trainCorpus = kindred.load('standoff', args.inTrain)
    for doc in trainCorpus.documents:
        doc.relations = [ r for r in doc.relations if r.relationType == relationType ]
        doc.relations = [ r for r in doc.relations if len(r.entities) == len(entityTypes) ]

    print(" Doing training")
    threshold = 0.5  # NOTE: overrides the threshold value unpacked from relationInfo
    classifier = kindred.RelationClassifier(classifierType='LogisticRegression', threshold=threshold, entityCount=len(entityTypes), acceptedEntityTypes=[entityTypes], model='en_core_sci_sm')
    classifier.train(trainCorpus)

    print(" Saving classifier")
    outModel = os.path.join(args.outDir, "%s.model" % relationType)
    with open(outModel, 'wb') as f:
        pickle.dump(classifier, f)

    print(" Output done!")