def first_train(self, data, Y, do_grid=False):
    """
    Model::first_train()

    Purpose: Train the first pass classifiers (for IOB chunking)

    @param data      A list of split sentences    (1 sent = 1 line from file)
    @param Y         A list of list of IOB labels (1:1 mapping with data)
    @param do_grid   A boolean indicating whether to perform a grid search

    @return          None

    Side effects: fits self.first_prose_vec / self.first_nonprose_vec in
    place and stores the trained classifiers in self.first_prose_clf /
    self.first_nonprose_clf.

    @raise Exception if either the prose or the nonprose partition of the
                     training data is empty. This is checked before any
                     vectorizer is refit, so a failed call leaves the
                     previously trained model untouched.
    """
    print('\textracting features (pass one)')

    # Create object that is a wrapper for the features
    feat_obj = FeatureWrapper(data)

    # Partition into prose v. nonprose; labels are flattened per partition
    prose, nonprose = [], []
    pchunks, nchunks = [], []
    for line, labels in zip(data, Y):
        isProse, feats = feat_obj.extract_IOB_features(line)
        if isProse:
            prose.append(feats)
            pchunks += labels
        else:
            nonprose.append(feats)
            nchunks += labels

    partitions = [
        ('prose', prose, pchunks, self.first_prose_vec),
        ('nonprose', nonprose, nchunks, self.first_nonprose_vec),
    ]

    # Fail fast: validate BOTH partitions before touching any vectorizer,
    # otherwise an empty nonprose set would be detected only after the
    # prose vectorizer had already been refit in place.
    for flabel, fset, _, _ in partitions:
        if not fset:
            raise Exception(
                'Training data must have %s training examples' % flabel)

    # Training backend is the same for both partitions
    lib = crf if self.crf_enabled else sci

    vectorizers = []
    classifiers = []
    for flabel, fset, chunks, dvect in partitions:
        print('\tvectorizing features (pass one) ' + flabel)

        # Vectorize IOB labels (kept in a local; the Y parameter is not
        # rebound)
        targets = [IOB_labels[y] for y in chunks]

        # Cumulative end offset of every sentence, used to rebuild the
        # per-sentence structure after flattening
        ends = []
        total = 0
        for sublist in fset:
            total += len(sublist)
            ends.append(total)

        # Vectorize features
        flattened = [item for sublist in fset for item in sublist]
        X = dvect.fit_transform(flattened)
        vectorizers.append(dvect)

        print('\ttraining classifiers (pass one) ' + flabel)

        # CRF needs reconstructed (per-sentence) lists
        if self.crf_enabled:
            rows = list(X)
            # list(...) so the spans can be walked twice on any Python
            spans = list(zip([0] + ends, ends))
            X = [rows[i:j] for i, j in spans]
            targets = [targets[i:j] for i, j in spans]

        # Train classifiers
        classifiers.append(lib.train(X, targets, do_grid))

    # Save vectorizers
    self.first_prose_vec, self.first_nonprose_vec = vectorizers

    # Save classifiers
    self.first_prose_clf, self.first_nonprose_clf = classifiers
def first_train(self, data, Y, do_grid=False):
    """
    Model::first_train()

    Purpose: Train the first pass classifiers (for IOB chunking)

    @param data      A list of split sentences    (1 sent = 1 line from file)
    @param Y         A list of list of IOB labels (1:1 mapping with data)
    @param do_grid   A boolean indicating whether to perform a grid search

    @return          None

    Side effects: fits self.first_prose_vec / self.first_nonprose_vec in
    place and stores the trained classifiers in self.first_prose_clf /
    self.first_nonprose_clf.

    @raise Exception if either the prose or the nonprose partition of the
                     training data is empty. This is checked before any
                     vectorizer is refit, so a failed call leaves the
                     previously trained model untouched.
    """
    print('\textracting features (pass one)')

    # Create object that is a wrapper for the features
    feat_obj = FeatureWrapper(data)

    # Partition into prose v. nonprose; labels are flattened per partition
    prose, nonprose = [], []
    pchunks, nchunks = [], []
    for line, labels in zip(data, Y):
        isProse, feats = feat_obj.extract_IOB_features(line)
        if isProse:
            prose.append(feats)
            pchunks += labels
        else:
            nonprose.append(feats)
            nchunks += labels

    partitions = [
        ('prose', prose, pchunks, self.first_prose_vec),
        ('nonprose', nonprose, nchunks, self.first_nonprose_vec),
    ]

    # Fail fast: validate BOTH partitions before touching any vectorizer,
    # otherwise an empty nonprose set would be detected only after the
    # prose vectorizer had already been refit in place.
    for flabel, fset, _, _ in partitions:
        if not fset:
            raise Exception(
                'Training data must have %s training examples' % flabel)

    # Training backend is the same for both partitions
    lib = crf if self.crf_enabled else sci

    vectorizers = []
    classifiers = []
    for flabel, fset, chunks, dvect in partitions:
        print('\tvectorizing features (pass one) ' + flabel)

        # Vectorize IOB labels (kept in a local; the Y parameter is not
        # rebound)
        targets = [IOB_labels[y] for y in chunks]

        # Cumulative end offset of every sentence, used to rebuild the
        # per-sentence structure after flattening
        ends = []
        total = 0
        for sublist in fset:
            total += len(sublist)
            ends.append(total)

        # Vectorize features
        flattened = [item for sublist in fset for item in sublist]
        X = dvect.fit_transform(flattened)
        vectorizers.append(dvect)

        print('\ttraining classifiers (pass one) ' + flabel)

        # CRF needs reconstructed (per-sentence) lists
        if self.crf_enabled:
            rows = list(X)
            # list(...) so the spans can be walked twice on any Python
            spans = list(zip([0] + ends, ends))
            X = [rows[i:j] for i, j in spans]
            targets = [targets[i:j] for i, j in spans]

        # Train classifiers
        classifiers.append(lib.train(X, targets, do_grid))

    # Save vectorizers
    self.first_prose_vec, self.first_nonprose_vec = vectorizers

    # Save classifiers
    self.first_prose_clf, self.first_nonprose_clf = classifiers
def first_predict(self, data):
    """
    Model::first_predict()

    Purpose: Predict IOB chunks on data

    @param data.  A list of split sentences (1 sent = 1 line from file)

    @return       A 3-tuple (iobs, prose_iobs, nonprose_iobs):
                    iobs          - list of list of IOB labels
                                    (1:1 mapping with data)
                    prose_iobs    - the label lists of the prose
                                    sentences only, in input order
                    nonprose_iobs - the label lists of the nonprose
                                    sentences only, in input order
                  Labels are translated through reverse_IOB_labels.
    """
    print('\textracting features (pass one)')

    # Create object that is a wrapper for the features
    feat_obj = FeatureWrapper(data)

    # Separate prose and nonprose data, remembering which bucket every
    # sentence went to so the predictions can be stitched back in order
    prose = []
    nonprose = []
    went_to_prose = []
    for line in data:
        isProse, feats = feat_obj.extract_IOB_features(line)
        went_to_prose.append(bool(isProse))
        if isProse:
            prose.append(feats)
        else:
            nonprose.append(feats)

    # Prediction backend is the same for both partitions
    lib = crf if self.crf_enabled else sci

    # Classify both prose & nonprose
    partitions = [
        ('prose', prose, self.first_prose_vec, self.first_prose_clf),
        ('nonprose', nonprose,
         self.first_nonprose_vec, self.first_nonprose_clf),
    ]

    preds = []
    for flabel, fset, dvect, clf in partitions:

        # If nothing to predict, skip actual prediction
        if not fset:
            preds.append([])
            continue

        print('\tvectorizing features (pass one) ' + flabel)

        # Cumulative end offset of every sentence, used to rebuild the
        # per-sentence structure after flattening
        ends = []
        total = 0
        for sublist in fset:
            total += len(sublist)
            ends.append(total)
        # list(...) so the spans can be walked twice on any Python
        spans = list(zip([0] + ends, ends))

        # Vectorize features
        flattened = [item for sublist in fset for item in sublist]
        X = dvect.transform(flattened)

        print('\tpredicting labels (pass one) ' + flabel)

        # CRF requires reconstructed (per-sentence) lists
        if self.crf_enabled:
            rows = list(X)
            X = [rows[i:j] for i, j in spans]

        # Predict IOB labels
        out = lib.predict(clf, X)

        # Format labels from output: one slice per sentence
        preds.append([out[i:j] for i, j in spans])

    # Stitch prose and nonprose data back together, translating IOB
    # labels into a readable format. The partition recorded during
    # feature extraction drives the stitching (instead of re-classifying
    # each sentence), so predictions can never be pulled from the wrong
    # bucket. Iterators replace list.pop(0), which is O(n) per call.
    prose_preds = iter(preds[0])
    nonprose_preds = iter(preds[1])

    prose_iobs = []
    nonprose_iobs = []
    iobs = []
    for is_prose in went_to_prose:
        if is_prose:
            source, bucket = prose_preds, prose_iobs
        else:
            source, bucket = nonprose_preds, nonprose_iobs
        labels = [reverse_IOB_labels[int(l)] for l in next(source)]
        bucket.append(labels)
        iobs.append(labels)

    # list of list of IOB labels (overall, prose only, nonprose only)
    return iobs, prose_iobs, nonprose_iobs
def first_predict(self, data):
    """
    Model::first_predict()

    Purpose: Predict IOB chunks on data

    @param data.  A list of split sentences (1 sent = 1 line from file)

    @return       A 3-tuple (iobs, prose_iobs, nonprose_iobs):
                    iobs          - list of list of IOB labels
                                    (1:1 mapping with data)
                    prose_iobs    - the label lists of the prose
                                    sentences only, in input order
                    nonprose_iobs - the label lists of the nonprose
                                    sentences only, in input order
                  Labels are translated through reverse_IOB_labels.
    """
    print('\textracting features (pass one)')

    # Create object that is a wrapper for the features
    feat_obj = FeatureWrapper(data)

    # Separate prose and nonprose data, remembering which bucket every
    # sentence went to so the predictions can be stitched back in order
    prose = []
    nonprose = []
    went_to_prose = []
    for line in data:
        isProse, feats = feat_obj.extract_IOB_features(line)
        went_to_prose.append(bool(isProse))
        if isProse:
            prose.append(feats)
        else:
            nonprose.append(feats)

    # Prediction backend is the same for both partitions
    lib = crf if self.crf_enabled else sci

    # Classify both prose & nonprose
    partitions = [
        ('prose', prose, self.first_prose_vec, self.first_prose_clf),
        ('nonprose', nonprose,
         self.first_nonprose_vec, self.first_nonprose_clf),
    ]

    preds = []
    for flabel, fset, dvect, clf in partitions:

        # If nothing to predict, skip actual prediction
        if not fset:
            preds.append([])
            continue

        print('\tvectorizing features (pass one) ' + flabel)

        # Cumulative end offset of every sentence, used to rebuild the
        # per-sentence structure after flattening
        ends = []
        total = 0
        for sublist in fset:
            total += len(sublist)
            ends.append(total)
        # list(...) so the spans can be walked twice on any Python
        spans = list(zip([0] + ends, ends))

        # Vectorize features
        flattened = [item for sublist in fset for item in sublist]
        X = dvect.transform(flattened)

        print('\tpredicting labels (pass one) ' + flabel)

        # CRF requires reconstructed (per-sentence) lists
        if self.crf_enabled:
            rows = list(X)
            X = [rows[i:j] for i, j in spans]

        # Predict IOB labels
        out = lib.predict(clf, X)

        # Format labels from output: one slice per sentence
        preds.append([out[i:j] for i, j in spans])

    # Stitch prose and nonprose data back together, translating IOB
    # labels into a readable format. The partition recorded during
    # feature extraction drives the stitching (instead of re-classifying
    # each sentence), so predictions can never be pulled from the wrong
    # bucket. Iterators replace list.pop(0), which is O(n) per call.
    prose_preds = iter(preds[0])
    nonprose_preds = iter(preds[1])

    prose_iobs = []
    nonprose_iobs = []
    iobs = []
    for is_prose in went_to_prose:
        if is_prose:
            source, bucket = prose_preds, prose_iobs
        else:
            source, bucket = nonprose_preds, nonprose_iobs
        labels = [reverse_IOB_labels[int(l)] for l in next(source)]
        bucket.append(labels)
        iobs.append(labels)

    # list of list of IOB labels (overall, prose only, nonprose only)
    return iobs, prose_iobs, nonprose_iobs