def test_multilabelclassifier():
    """
    Checks kindred.MultiLabelClassifier on a synthetic two-label problem.

    1000 random 2D points are generated with a fixed seed. Label k of a
    point is 1 when coordinate k is below 0.5 and 0 otherwise. The first
    half of the points is used to fit a logistic regression based
    classifier and the second half is used to check the shape and the
    root-mean-square error of both hard predictions and probabilities.
    """
    np.random.seed(1)

    sampleCount = 1000
    half = sampleCount // 2

    features = np.random.rand(sampleCount, 2)
    # Column k of the label matrix depends only on column k of the features
    labels = np.where(features < 0.5, 1, 0)

    trainFeatures, testFeatures = features[:half], features[half:]
    trainLabels, testLabels = labels[:half], labels[half:]

    classifier = kindred.MultiLabelClassifier(
        sklearn.linear_model.LogisticRegression,
        random_state=1,
        solver='lbfgs')
    classifier.fit(trainFeatures, trainLabels)

    # Hard 0/1 predictions
    predicted = classifier.predict(testFeatures)
    assert predicted.shape == testLabels.shape
    predictionError = np.sqrt(np.mean(np.square(predicted - testLabels)))
    assert round(predictionError, 3) == 0.071

    # Per-label probabilities
    probs = classifier.predict_proba(testFeatures)
    assert probs.shape == testLabels.shape
    probabilityError = np.sqrt(np.mean(np.square(probs - testLabels)))
    assert round(probabilityError, 3) == 0.202
def train(self,corpus):
    """
    Trains the classifier using this corpus. All relations in the corpus will be used for training.

    The corpus is parsed first if it is not already marked as parsed.
    Candidate relations are then built and vectorized, a binary label
    matrix with one column per (relation type, argument names) key is
    created and a multi-label classifier is fitted to it.

    :param corpus: Corpus to use for training
    :type corpus: kindred.Corpus
    :raises RuntimeError: If no candidate relations can be built from the corpus, or if self.classifierType is not a supported classifier
    """
    assert isinstance(corpus,kindred.Corpus)

    if not corpus.parsed:
        parser = kindred.Parser(model=self.model)
        parser.parse(corpus)

    self.candidateBuilder = CandidateBuilder(entityCount=self.entityCount,acceptedEntityTypes=self.acceptedEntityTypes)
    candidateRelations = self.candidateBuilder.build(corpus)

    if len(candidateRelations) == 0:
        raise RuntimeError("No candidate relations found in corpus for training. Does the corpus contain text and entity annotations with at least one sentence containing %d entities." % (self.entityCount))

    # Compute the relation keys of every candidate once. They are needed
    # both to discover the set of classes and to fill in the label matrix.
    relKeysPerCandidate = []
    for cr in candidateRelations:
        assert isinstance(cr,kindred.CandidateRelation)
        relKeysPerCandidate.append([ tuple([knownType] + knownArgNames) for knownType,knownArgNames in cr.knownTypesAndArgNames ])

    candidateRelationKeys = set()
    for relKeys in relKeysPerCandidate:
        candidateRelationKeys.update(relKeys)

    # Create mappings from the class index to a relation type and back again
    self.colToRelType = sorted(candidateRelationKeys)
    self.relTypeToCol = { relationType:i for i,relationType in enumerate(self.colToRelType) }

    # Y[i,col] is 1 when candidate i is a known relation of the class in column col
    Y = np.zeros((len(candidateRelations),len(self.colToRelType)),np.int32)
    for i,relKeys in enumerate(relKeysPerCandidate):
        for relKey in relKeys:
            Y[i,self.relTypeToCol[relKey]] = 1

    entityCountsInRelations = sorted({ len(r.entities) for r in corpus.getRelations() })
    assert self.entityCount in entityCountsInRelations, (
        "Relation classifier is expecting to train on relations with %d entities (entityCount=%d). But the known relations in the corpus contain relations with the following entity counts: %s. Perhaps the entityCount parameter should be changed or there is a problem with the training corpus."
        % (self.entityCount,self.entityCount,str(entityCountsInRelations)))

    # Record, for each relation key, the entity type combinations seen in the annotated relations
    self.relTypeToValidEntityTypes = defaultdict(set)
    for d in corpus.documents:
        for r in d.relations:
            validEntityTypes = tuple([ e.entityType for e in r.entities ])
            relKey = tuple([r.relationType] + r.argNames)
            self.relTypeToValidEntityTypes[relKey].add(validEntityTypes)

    self.vectorizer = Vectorizer(entityCount=self.entityCount,featureChoice=self.chosenFeatures,tfidf=self.tfidf)
    trainVectors = self.vectorizer.fit_transform(candidateRelations)

    assert trainVectors.shape[0] == Y.shape[0]

    # Counts are over all cells of the label matrix, not over rows
    posCount = Y.sum()
    negCount = Y.shape[0]*Y.shape[1] - posCount

    assert negCount > 0, "Must have at least one negative candidate relation in set for training"
    assert posCount > 0, "Must have at least one positive candidate relation in set for training"

    self.clf = None
    if self.classifierType == 'SVM':
        self.clf = kindred.MultiLabelClassifier(svm.LinearSVC,class_weight='balanced',random_state=1,max_iter=10000)
    elif self.classifierType == 'LogisticRegression' and self.threshold is None:
        self.clf = kindred.MultiLabelClassifier(LogisticRegression,class_weight='balanced',random_state=1,solver='liblinear',multi_class='ovr')
    elif self.classifierType == 'LogisticRegression' and self.threshold is not None:
        self.clf = kindred.MultiLabelClassifier(kindred.LogisticRegressionWithThreshold,threshold=self.threshold)
    else:
        # Fail with a clear message instead of an AttributeError on None below
        raise RuntimeError("Unsupported classifierType for training: %s" % str(self.classifierType))

    self.clf.fit(trainVectors,Y)

    self.isTrained = True