def second_predict(self, data, inds_list):
    """
    Model::second_predict()

    Purpose: Predict concept labels (second pass) for the chunks that the
             first pass flagged as candidate concepts.

    @param data       A list of list of strings.
                       - A string is a chunked phrase
                       - An inner list corresponds to one line from the file
    @param inds_list  A list of list of integer indices
                       - assertion: len(data) == len(inds_list)
                       - one line of 'inds_list' contains a list of indices
                         into the corresponding line for 'data'
    @return           A list of (concept, lineno, start, end) tuples where
                      lineno is 1-based and start/end are 0-based word
                      offsets within the line.
    """
    # If first pass predicted no concepts, then skip
    # NOTE: Special case because SVM cannot have empty input
    if not any(inds_list):
        print("first pass predicted no concepts, skipping second pass")
        return []

    # Create object that is a wrapper for the features
    feat_o = FeatureWrapper()

    print('\textracting features (pass two)')

    # Extract features (one list of feature dicts per line, then flatten)
    X = [feat_o.concept_features(s, inds) for s, inds in zip(data, inds_list)]
    X = reduce(concat, X)

    print('\tvectorizing features (pass two)')

    # Vectorize features with the already-fitted vectorizer
    X = self.second_vec.transform(X)

    print('\tpredicting labels (pass two)')

    # Predict concept labels
    out = sci.predict(self.second_clf, X)

    # Line-by-line processing: consume predictions in the same order the
    # features were extracted (line order, then index order within a line)
    o = list(out)
    classifications = []
    for lineno, inds in enumerate(inds_list):

        # Skip empty line
        if not inds:
            continue

        # Word offset of each chunk start, computed once per line
        # (avoids re-summing the prefix for every concept on the line)
        offsets = [0]
        for chunk in data[lineno]:
            offsets.append(offsets[-1] + len(chunk.split()))

        # For each concept
        for ind in inds:
            # Get next concept
            concept = reverse_concept_labels[o.pop(0)]

            # Start position (ex. 7th word of line) and chunk length in words
            start = offsets[ind]
            length = len(data[lineno][ind].split())

            # Classification token (lineno is reported 1-based)
            classifications.append((concept, lineno + 1, start, start + length - 1))

    # Return classifications
    return classifications
def second_train(self, data, inds_list, Y, do_grid=False):
    """
    Model::second_train()

    Purpose: Train the second pass classifier (concept classification).
             (Docstring fixed: it previously said "first pass ... IOB
             chunking", which describes the other pass.)

    @param data       A list of list of strings.
                       - A string is a chunked phrase
                       - An inner list corresponds to one line from the file
    @param inds_list  A list of list of integer indices
                       - assertion: len(data) == len(inds_list)
                       - one line of 'inds_list' contains a list of indices
                         into the corresponding line for 'data'
    @param Y          A list of concept labels
                       - assertion: there are sum(len(inds_list)) labels
                         AKA each index from inds_list maps to a label
    @param do_grid    A boolean indicating whether to perform a grid search

    @return           None (stores the trained model on self.second_clf)
    """
    print('\textracting features (pass two)')

    # Create object that is a wrapper for the features
    feat_o = FeatureWrapper()

    # Extract features (one list of feature dicts per line, then flatten)
    X = [feat_o.concept_features(s, inds) for s, inds in zip(data, inds_list)]
    X = reduce(concat, X)

    print('\tvectorizing features (pass two)')

    # Vectorize labels (map concept names onto integer class ids)
    Y = [concept_labels[y] for y in Y]

    # Vectorize features (fit the vectorizer's vocabulary here)
    X = self.second_vec.fit_transform(X)

    print('\ttraining classifier (pass two)')

    # Train the model
    self.second_clf = sci.train(X, Y, do_grid)
def second_train(self, data, inds_list, Y, do_grid=False):
    """
    Model::second_train()

    Purpose: Fit the second-pass (concept classification) model from
    chunked lines and the indices of the chunks to label.

    @param data       list of lines; each line is a list of chunked phrases
    @param inds_list  list (parallel to data) of index lists into each line
    @param Y          flat list of concept labels, one per index in inds_list
    @param do_grid    whether to run a grid search while training
    @return           None (stores the trained classifier on self.second_clf)
    """
    print('\textracting features (pass two)')

    # Feature-extraction helper
    extractor = FeatureWrapper()

    # One feature list per line, flattened into a single flat list
    per_line = [extractor.concept_features(line, idxs)
                for line, idxs in zip(data, inds_list)]
    X = reduce(concat, per_line)

    print('\tvectorizing features (pass two)')

    # Map concept names onto integer class ids
    labels = []
    for y in Y:
        labels.append(concept_labels[y])

    # Learn the feature vocabulary and vectorize
    X = self.second_vec.fit_transform(X)

    print('\ttraining classifier (pass two)')

    # Fit and store the classifier
    self.second_clf = sci.train(X, labels, do_grid)
def second_predict(self, data, inds_list):
    """
    Model::second_predict()

    Purpose: Run the second pass — assign a concept label to every chunk
    flagged by the first pass — and emit classification tuples.

    @param data       list of lines; each line is a list of chunked phrases
    @param inds_list  list (parallel to data) of index lists into each line
    @return           list of (concept, lineno, start, end) tuples;
                      lineno is 1-based, start/end are 0-based word offsets
    """
    # The SVM cannot handle an empty feature matrix, so bail out early
    # when the first pass produced no candidate chunks at all.
    total = 0
    for inds in inds_list:
        total += len(inds)
    if total == 0:
        print("first pass predicted no concepts, skipping second pass")
        return []

    # Feature-extraction helper
    extractor = FeatureWrapper()

    print('\textracting features (pass two)')

    # Build one flat feature list across all lines
    X = reduce(concat,
               [extractor.concept_features(line, idxs)
                for line, idxs in zip(data, inds_list)])

    print('\tvectorizing features (pass two)')

    # Vectorize with the already-fitted vectorizer
    X = self.second_vec.transform(X)

    print('\tpredicting labels (pass two)')

    # Predicted class ids, one per chunk, in extraction order
    predictions = list(sci.predict(self.second_clf, X))

    classifications = []
    cursor = 0
    for lineno, inds in enumerate(inds_list):
        # Nothing flagged on this line
        if not inds:
            continue

        line = data[lineno]
        for ind in inds:
            # Decode the next predicted class id back to a concept name
            concept = reverse_concept_labels[predictions[cursor]]
            cursor += 1

            # 0-based word offset of the chunk within its line,
            # and the chunk's length in words
            start = sum(len(line[i].split()) for i in range(ind))
            length = len(line[ind].split())

            classifications.append((concept, lineno + 1, start, start + length - 1))

    return classifications