def main_tag(featureSet, options):
    labelCounter, featCounter = BookKeeper(), BookKeeper()
    labelCounter.readFromFile('{0}.labelNumbers'.format(options.modelName))
    featCounter.readFromFile('{0}.featureNumbers'.format(options.modelName))
    optionsDict = vars(options)
    optionsDict['labelCounter'] = labelCounter
    optionsDict['featCounter'] = featCounter
    optionsDict['modelFile'] = '{0}.model'.format(options.modelName)
    tagger = Tagger(featureSet, optionsDict)
    if options.inFeatFile:
        tagger_func = lambda: tagger.tag_features(options.inFeatFile)
        writer_func = lambda s, c: writeSentence(s, comment=c)
    elif options.input_dir:
        assert isdir(options.input_dir), "--input-dir must be a directory"
        out_dir = "{}_out".format(options.input_dir)
        os.mkdir(out_dir)
        tagger_func = lambda: tagger.tag_dir(options.input_dir)
        writer_func = lambda s, c: writeSentence(
            s, out=open(join(out_dir, '{}.tagged'.format(c)), 'a'))
    else:
        tagger_func = lambda: tagger.tag_corp(sys.stdin)
        writer_func = lambda s, c: writeSentence(s, comment=c)

    for sen, other in tagger_func():
        writer_func(sen, other)
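# A minimal sketch of how main_tag() might be wired to a command-line entry point.
# The option names (--model, --input-feature-file, --input-dir) and the featureSet
# placeholder are assumptions for illustration; the real option parsing and
# feature-set loading live elsewhere in the project.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Tag stdin, a featurized file or a directory of files (sketch).')
    parser.add_argument('-m', '--model', dest='modelName', required=True,
                        help='Name prefix of the model files to load')
    parser.add_argument('-i', '--input-feature-file', dest='inFeatFile', default=None,
                        help='Already featurized input file (optional)')
    parser.add_argument('-d', '--input-dir', dest='input_dir', default=None,
                        help='Directory of input files to tag (optional)')
    args = parser.parse_args()

    featureSet = {}  # Placeholder: the real feature set is built from the feature config
    main_tag(featureSet, args)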
class Trainer:
    def __init__(self, features, options):
        self.modelName = options['modelName']
        self.parameters = options['trainParams']
        self.cutoff = options['cutoff']
        self.features = features
        self.labels = []
        self.contexts = []
        self.labelCounter = BookKeeper()
        self.featCounter = BookKeeper()
        self.usedFeats = None
        if options['usedFeats']:
            self.usedFeats = set(line.strip() for line in options['usedFeats'])

    def save(self):
        sys.stderr.write('saving model...')
        save_model(self.modelName + '.model', self.model)
        sys.stderr.write('done\nsaving label and feature lists...')
        self.labelCounter.saveToFile(self.modelName + '.labelNumbers')
        self.featCounter.saveToFile(self.modelName + '.featureNumbers')
        sys.stderr.write('done\n')

    def writeFeats(self, fileName):
        """Obsolete"""
        featFile = open(fileName, 'w')
        for i, context in enumerate(self.contexts):
            label = self.labelCounter.noToFeat[self.labels[i]]
            feats = [self.featCounter.noToFeat[c]
                     for c in [feat[0] for feat in context]]
            featFile.write('{0}\t{1}\n'.format(label, ' '.join(feats)))

    def reduceContexts(self):
        sys.stderr.write('reducing training events...')
        self.contexts = [{number: value for number, value in context.items()
                          if number in self.featCounter.noToFeat}
                         for context in self.contexts]
        sys.stderr.write('done!\n')

    def cutoffFeats(self):
        if self.cutoff < 2:
            return
        sys.stderr.write('discarding features with less than '
                         '{0} occurrences...'.format(self.cutoff))
        self.featCounter.cutoff(self.cutoff)
        sys.stderr.write('done!\n')
        self.reduceContexts()

    def getEvents(self, data, out_file_name):
        sys.stderr.write('featurizing sentences...')
        senCount = 0
        out_file = None
        if out_file_name:
            out_file = open(out_file_name, 'w')
        for sen, _ in sentenceIterator(data):
            senCount += 1
            sentenceFeats = featurizeSentence(sen, self.features)
            for c, tok in enumerate(sen):
                tokFeats = sentenceFeats[c]
                if self.usedFeats:
                    tokFeats = [feat for feat in tokFeats if feat in self.usedFeats]
                if out_file:
                    out_file.write(tok[-1] + '\t' + ' '.join(tokFeats) + '\n')
                self.addContext(tokFeats, tok[-1])
            if out_file:
                out_file.write('\n')
            if senCount % 1000 == 0:
                sys.stderr.write(str(senCount) + '...')
        sys.stderr.write(str(senCount) + '...done!\n')

    def getEventsFromFile(self, fileName):
        for line in open(fileName):
            if line == '\n':
                continue
            l = line.strip().split()
            label, feats = l[0], l[1:]
            self.addContext(feats, label)

    def addContext(self, tokFeats, label):
        # Features are sorted to ensure identical output
        # no matter where the features are coming from
        tokFeats.sort()
        featNumbers = set(self.featCounter.getNo(feat) for feat in tokFeats)
        context = ((c_int * 2) * len(featNumbers))()
        for i, no in enumerate(featNumbers):
            context[i][1] = 1
            context[i][0] = no
        labelNumber = self.labelCounter.getNo(label)
        self.contexts.append(context)
        self.labels.append(labelNumber)

    def train(self):
        sys.stderr.write('creating training problem...')
        prob = problem(self.labels, self.contexts)
        sys.stderr.write('done\ntraining with option(s) "' + self.parameters + '"...')
        self.model = train(prob, parameter(self.parameters))
        sys.stderr.write('done\n')
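# A minimal usage sketch for the legacy liblinear-based Trainer above, assuming the
# liblinear Python bindings (problem, parameter, train, save_model) are importable.
# The option values, file name and model name below are illustrative placeholders.
def _train_legacy_example(featureSet, trainingFile):
    options = {
        'modelName': 'legacy_model',   # produces legacy_model.model etc. on save()
        'trainParams': '-s 0 -c 1',    # liblinear options: L2-regularized logistic regression
        'cutoff': 1,                   # < 2 disables the feature cutoff
        'usedFeats': None,             # or an open file with one feature name per line
    }
    trainer = Trainer(featureSet, options)
    trainer.getEvents(open(trainingFile), None)  # featurize and collect training events
    trainer.cutoffFeats()                        # no-op here, since cutoff < 2
    trainer.train()                              # build the liblinear problem and train
    trainer.save()                               # write model, label and feature lists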
class Tagger:
    def __init__(self, features, transModel, options):
        self._features = features
        self._dataSizes = options['dataSizes']
        self._transProbs = transModel
        print('loading observation model...', end='', file=sys.stderr, flush=True)
        self._model = joblib.load('{0}'.format(options['modelFileName']))
        self._featCounter = BookKeeper(options['featCounterFileName'])
        self._labelCounter = BookKeeper(options['labelCounterFileName'])
        print('done', file=sys.stderr, flush=True)

    def printWeights(self, n=100, outputStream=sys.stdout):
        coefs = self._model.coef_
        labelNoToName = self._labelCounter.noToName
        featNoToName = self._featCounter.noToName
        sortedFeats = sorted(featNoToName.items())
        for i, label in sorted(labelNoToName.items()):
            columns = ['{0}:{1}'.format(w, feat)
                       for w, (no, feat) in sorted(zip(coefs[i, :], sortedFeats),
                                                   reverse=True)]
            print('{0}\t{1}'.format(label, '\t'.join(columns[:n])),
                  file=outputStream)  # Best
            # Worst -> Negative correlation
            print('{0}\t{1}'.format(label, '\t'.join(sorted(columns[-n:], reverse=True))),
                  file=outputStream)

    def tagFeatures(self, data):
        senFeats = []
        senCount = 0
        for line in data:
            line = line.strip()
            if len(line) == 0:
                senCount += 1
                tagging = self._tagSenFeats(senFeats)
                yield [[tag] for tag in tagging]
                senFeats = []
                if senCount % 1000 == 0:
                    print('{0}...'.format(senCount), end='', file=sys.stderr, flush=True)
            senFeats.append(line.split())
        print('{0}...done'.format(senCount), file=sys.stderr, flush=True)

    def tagDir(self, dirName):
        for fn in os.listdir(dirName):
            print('processing file {0}...'.format(fn), end='', file=sys.stderr, flush=True)
            for sen, _ in self.tagCorp(open(os.path.join(dirName, fn), encoding='UTF-8')):
                yield sen, fn

    def tagCorp(self, inputStream=sys.stdin):
        senCount = 0
        for sen, comment in sentenceIterator(inputStream):
            senCount += 1
            senFeats = featurizeSentence(sen, self._features)
            bestTagging = self._tagSenFeats(senFeats)
            # Add tagging to sentence
            taggedSen = [tok + [bestTagging[c]] for c, tok in enumerate(sen)]
            yield taggedSen, comment
            if senCount % 1000 == 0:
                print('{0}...'.format(senCount), end='', file=sys.stderr, flush=True)
        print('{0}...done'.format(senCount), file=sys.stderr, flush=True)

    def _getTagProbsByPos(self, senFeats):
        # Get sentence features translated to numbers and contexts in two steps
        getNoTag = self._featCounter.getNoTag
        featNumbers = [{getNoTag(feat) for feat in feats if getNoTag(feat) is not None}
                       for feats in senFeats]
        rows = []
        cols = []
        data = []
        for rownum, featNumberSet in enumerate(featNumbers):
            for featNum in featNumberSet:
                rows.append(rownum)
                cols.append(featNum)
                data.append(1)
        contexts = csr_matrix((data, (rows, cols)),
                              shape=(len(featNumbers), self._featCounter.numOfNames()),
                              dtype=self._dataSizes['dataNP'])
        tagProbsByPos = [{self._labelCounter.noToName[i]: prob
                          for i, prob in enumerate(probDist)}
                         for probDist in self._model.predict_proba(contexts)]
        return tagProbsByPos

    def toCRFsuite(self, inputStream, outputStream=sys.stdout):
        senCount = 0
        getNoTag = self._featCounter.getNoTag
        featnoToName = self._featCounter.noToName
        for sen, comment in sentenceIterator(inputStream):
            senCount += 1
            senFeats = featurizeSentence(sen, self._features)
            # Get sentence features translated to numbers and contexts in two steps
            for featNumberSet in ({getNoTag(feat) for feat in feats if getNoTag(feat) is not None}
                                  for feats in senFeats):
                print('\t'.join(featnoToName[featNum].replace(':', 'colon')
                                for featNum in featNumberSet),
                      file=outputStream)
            print(file=outputStream)  # Sentence separator blank line
            if senCount % 1000 == 0:
                print('{0}...'.format(str(senCount)), end='', file=sys.stderr, flush=True)
        print('{0}...done'.format(str(senCount)), file=sys.stderr, flush=True)

    def _tagSenFeats(self, senFeats):
        return self._transProbs.tagSent(self._getTagProbsByPos(senFeats))
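# A minimal usage sketch for the Tagger above, assuming a trained model, the
# corresponding BookKeeper files and a transition model object with a tagSent()
# method are already available. The file names, the 'dataSizes' typecode and the
# featureSet value are illustrative placeholders, not fixed by the code above.
def _tag_stdin_example(featureSet, transModel):
    options = {
        'dataSizes': {'dataNP': 'B'},            # dtype used for the sparse context matrix
        'modelFileName': 'model.joblib',         # placeholder file names
        'featCounterFileName': 'model.features',
        'labelCounterFileName': 'model.labels',
    }
    tagger = Tagger(featureSet, transModel, options)
    # tagCorp() reads sentences from stdin and yields (taggedSentence, comment) pairs
    for taggedSen, comment in tagger.tagCorp(sys.stdin):
        writeSentence(taggedSen, comment=comment)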
class Trainer:
    def __init__(self, features, options):
        # Set classifier algorithm here
        parameters = dict()  # dict(solver='lbfgs')
        solver = LogisticRegression
        # Possible alternative solvers:
        # parameters = {'loss': 'modified_huber', 'n_jobs': -1}
        # solver = SGDClassifier
        # ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
        # parameters = {'kernel': 'rbf', 'probability': True}
        # solver = SVC
        # ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
        # parameters = {'kernel': 'linear', 'probability': True}
        # solver = OneVsRestClassifier(SVC(**parameters))  # XXX Won't work because of ** in parameters...
        self._model = solver(**parameters)
        self._dataSizes = options['dataSizes']
        self._tagField = options['tagField']
        self._modelFileName = options['modelFileName']
        self._parameters = options['trainParams']
        self._cutoff = options['cutoff']
        self._featCounterFileName = options['featCounterFileName']
        self._labelCounterFileName = options['labelCounterFileName']
        self._features = features
        self._tokCount = -1  # Index starts from 0
        self._rows = array(self._dataSizes['rows'])
        self._cols = array(self._dataSizes['cols'])
        self._data = array(self._dataSizes['data'])
        self._labels = array(self._dataSizes['labels'])
        self._sentEnd = array(self._dataSizes['sentEnd'])  # Keep track of sentence boundaries
        self._matrix = None
        self._featCounter = BookKeeper()
        self._labelCounter = BookKeeper()
        self._usedFeats = None
        if 'usedFeats' in options and options['usedFeats']:
            self._usedFeats = {line.strip()
                               for line in open(options['usedFeats'], encoding='UTF-8')}

    def save(self):
        print('saving model...', end='', file=sys.stderr, flush=True)
        joblib.dump(self._model, '{0}'.format(self._modelFileName), compress=3)
        print('done\nsaving feature and label lists...', end='', file=sys.stderr, flush=True)
        self._featCounter.save(self._featCounterFileName)
        self._labelCounter.save(self._labelCounterFileName)
        print('done', file=sys.stderr, flush=True)

    def _updateSentEnd(self, sentEnds, rowNums):
        newEnds = array(self._dataSizes['sentEnd'])
        vbeg = 0
        for end in sentEnds:
            vend = -1
            for i, e in enumerate(rowNums[vbeg:]):
                if e <= end:
                    vend = vbeg + i
                else:
                    break
            if vend > 0:
                newEnds.append(vend)
                vbeg = vend + 1
        return newEnds

    def _convertToNPArray(self):
        rowsNP = np.array(self._rows, dtype=self._dataSizes['rowsNP'])
        colsNP = np.array(self._cols, dtype=self._dataSizes['cols'])
        dataNP = np.array(self._data, dtype=self._dataSizes['data'])
        labelsNP = np.array(self._labels, dtype=self._dataSizes['labels'])
        del self._rows
        del self._cols
        del self._data
        del self._labels
        self._rows = rowsNP
        self._cols = colsNP
        self._data = dataNP
        self._labels = labelsNP

    def _makeSparseArray(self, rowNum, colNum):
        print('creating training problem...', end='', file=sys.stderr, flush=True)
        matrix = csr_matrix((self._data, (self._rows, self._cols)),
                            shape=(rowNum, colNum),
                            dtype=self._dataSizes['data'])
        del self._rows
        del self._cols
        del self._data
        print('done!', file=sys.stderr, flush=True)
        return matrix

    def cutoffFeats(self):
        self._convertToNPArray()
        colNum = self._featCounter.numOfNames()
        if self._cutoff < 2:
            self._matrix = self._makeSparseArray(self._tokCount, colNum)
        else:
            print('discarding features with less than {0} occurrences...'.format(self._cutoff),
                  end='', file=sys.stderr, flush=True)
            toDelete = self._featCounter.cutoff(self._cutoff)
            print('done!\nreducing training events by {0}...'.format(len(toDelete)),
                  end='', file=sys.stderr, flush=True)
            # ...i.e. drop the columns of features that are not in featCounter anymore
            indicesToKeepNP = np.fromiter((ind for ind, featNo in enumerate(self._cols)
                                           if featNo not in toDelete),
                                          dtype=self._dataSizes['cols'])
            del toDelete

            # Reduce cols
            colsNPNew = self._cols[indicesToKeepNP]
            del self._cols
            self._cols = colsNPNew

            # Reduce data
            dataNPNew = self._data[indicesToKeepNP]
            del self._data
            self._data = dataNPNew

            # Reduce rows
            rowsNPNew = self._rows[indicesToKeepNP]
            rowNumKeep = np.unique(rowsNPNew)
            rowNum = rowNumKeep.shape[0]
            colNum = indicesToKeepNP.max() + 1
            del self._rows
            self._rows = rowsNPNew
            del indicesToKeepNP

            # Reduce labels
            labelsNPNew = self._labels[rowNumKeep]
            del self._labels
            self._labels = labelsNPNew

            # Update sentence end markers
            newEnd = self._updateSentEnd(self._sentEnd, rowNumKeep)
            del self._sentEnd
            self._sentEnd = newEnd
            del rowNumKeep
            print('done!', file=sys.stderr, flush=True)

            matrix = self._makeSparseArray(rowNum, colNum)
            print('updating indices...', end='', file=sys.stderr, flush=True)

            # Update rowNos
            rows, _ = matrix.nonzero()
            matrixNew = matrix[np.unique(rows), :]
            del matrix
            del rows

            # Update featNos
            _, cols = matrixNew.nonzero()
            self._matrix = matrixNew[:, np.unique(cols)]
            del matrixNew
            del cols
            print('done!', file=sys.stderr, flush=True)

    # Input needs featurizing
    def getEvents(self, data):
        print('featurizing sentences...', end='', file=sys.stderr, flush=True)
        senCount = 0
        tokIndex = -1  # Index starts from 0
        for sen, _ in sentenceIterator(data):
            senCount += 1
            sentenceFeats = featurizeSentence(sen, self._features)
            for c, tok in enumerate(sen):
                tokIndex += 1
                tokFeats = sentenceFeats[c]
                if self._usedFeats:
                    tokFeats = [feat for feat in tokFeats if feat in self._usedFeats]
                self._addContext(tokFeats, tok[self._tagField], tokIndex)
            self._sentEnd.append(tokIndex)
            if senCount % 1000 == 0:
                print('{0}...'.format(str(senCount)), end='', file=sys.stderr, flush=True)
        self._tokCount = tokIndex + 1
        print('{0}...done!'.format(str(senCount)), file=sys.stderr, flush=True)

    # Already featurized input
    def getEventsFromFile(self, data):
        tokIndex = -1  # Index starts from 0
        for line in data:
            line = line.strip()
            if len(line) > 0:
                tokIndex += 1
                l = line.split()
                label, feats = l[0], l[1:]
                self._addContext(feats, label, tokIndex)
            self._sentEnd.append(tokIndex)
        self._tokCount = tokIndex + 1

    def _addContext(self, tokFeats, label, curTok):
        rowsAppend = self._rows.append
        colsAppend = self._cols.append
        dataAppend = self._data.append
        # Features are sorted to ensure identical output no matter where the features are coming from
        for featNumber in {self._featCounter.getNoTrain(feat) for feat in sorted(tokFeats)}:
            rowsAppend(curTok)
            colsAppend(featNumber)
            dataAppend(1)
        self._labels.append(self._labelCounter.getNoTrain(label))

    # Counting zero elements can be really slow...
    def mostInformativeFeatures(self, outputStream=sys.stdout, n=-1, countZero=False):
        # Compute min(P(feature=value|label1), for any label1)/max(P(feature=value|label2), for any label2)
        # (using conditional probs computed from joint probabilities) as in NLTK (Bird et al. 2009):
        # P(feature=value|label) = P(feature=value, label)/P(label)
        # P(feature=value, label) = C(feature=value, label)/C(feature=value)
        # P(label) = C(label)/sum_i(C(label_i))
        #
        # P(feature=value|label) = (C(feature=value, label)/C(feature=value))/(C(label)/sum_i(C(label_i))) =
        # (C(feature=value, label)*sum_i(C(label_i)))/(C(feature=value)*C(label))
        #
        # min(P(feature=value|label1), for any label1)/max(P(feature=value|label2), for any label2) =
        #
        # min((C(feature=value, label1)*sum_i(C(label_i)))/(C(feature=value)*C(label1)), for any label1)/
        # max((C(feature=value, label2)*sum_i(C(label_i)))/(C(feature=value)*C(label2)), for any label2) =
        #
        # (sum_i(C(label_i))/C(feature=value))*min(C(feature=value, label1)/C(label1)), for any label1)/
        # (sum_i(C(label_i))/C(feature=value))*max(C(feature=value, label2)/C(label2)), for any label2) =
        #
        # min(C(feature=value, label1)/C(label1), for any label1)/
        # max(C(feature=value, label2)/C(label2), for any label2)
        matrix = self._matrix  # For easier handling
        self._featCounter.makenoToName()
        self._labelCounter.makenoToName()
        featnoToName = self._featCounter.noToName
        labelnoToName = self._labelCounter.noToName
        labels = self._labels  # Indexed by token rows (row = token number, column = feature number)
        featValCounts = defaultdict(Counter)  # (feat, val) -> label: count

        if countZero:
            # Every index (including zeros to consider negative correlation)
            for feat in range(matrix.shape[1]):
                for tok in range(matrix.shape[0]):
                    featValCounts[feat, matrix[tok, feat]][labels[tok]] += 1
        else:
            matrix = matrix.tocoo()
            # Every nonzero index
            for tok, feat, val in zip(matrix.row, matrix.col, matrix.data):
                featValCounts[feat, val][labels[tok]] += 1
        del matrix

        # (C(label2), for any label2)
        labelCounts = Counter()
        for k, v in zip(*np.unique(self._labels, return_counts=True)):
            labelCounts[k] = v
        numOfLabels = len(labelCounts)
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)
        features = set()

        # For every (feature, val) tuple (that has a nonzero count)...
        for feature, counts in featValCounts.items():
            features.add(feature)
            # ...and for every label...
            for label, count in counts.items():
                # prob can only be 0 if the numerator is 0, but this case is already filtered by the Counter...
                prob = count / labelCounts[label]
                maxprob[feature] = max(prob, maxprob[feature])
                minprob[feature] = min(prob, minprob[feature])

        # Convert features to a list and sort it by how informative the features are.
        """
        From the NLTK docs:
        For the purpose of this function, the informativeness of a feature ``(fname, fval)``
        is equal to the highest value of P(fname=fval|label), for any label, divided by the
        lowest value of P(fname=fval|label), for any label:
        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        print('"Feature name"=Value (True/False)', 'Sum of occurrences', 'Counts per label',
              'Probability per label', 'Max prob.:Min prob.=Ratio:1.0',
              sep='\t', file=outputStream)  # Print header (legend)
        # To avoid division by zero...
        for feature in sorted(features,
                              key=lambda feature_: minprob[feature_] / maxprob[feature_])[:n]:
            sumOccurences = sum(featValCounts[feature].values())
            if len(featValCounts[feature]) < numOfLabels:
                ratio = 'INF'
            else:
                ratio = maxprob[feature] / minprob[feature]
            # NLTK notation:
            # print('{0:50} = {1:} {2:6} : {3:-6} = {4} : 1.0'.format(featnoToName(feature[0]), feature[1],
            #                                                         maxprob[feature],
            #                                                         minprob[feature], ratio))
            # More detailed notation:
            print('"{0:50s}"={1}\t{2}\t{3}\t{4}\t{5:6}:{6:-6}={7}:1.0'.format(
                featnoToName[feature[0]],
                bool(feature[1]),
                sumOccurences,
                '/'.join(('{0}:{1}'.format(labelnoToName[l], c)
                          for l, c in featValCounts[feature].items())),
                '/'.join(('{0}:{1:.8f}'.format(labelnoToName[l], c / labelCounts[l])
                          for l, c in featValCounts[feature].items())),
                maxprob[feature], minprob[feature], ratio),
                file=outputStream)

    def toCRFsuite(self, outputStream=sys.stdout):
        self._featCounter.makenoToName()
        self._labelCounter.makenoToName()
        featnoToName = self._featCounter.noToName
        labelnoToName = self._labelCounter.noToName
        sentEnd = self._sentEnd
        matrix = self._matrix.tocsr()
        labels = self._labels
        beg = 0
        for end in sentEnd:
            for row in range(beg, end + 1):
                print('{0}\t{1}'.format(labelnoToName[labels[row]],
                                        '\t'.join(featnoToName[col].replace(':', 'colon')
                                                  for col in matrix[row, :].nonzero()[1])),
                      file=outputStream)
            print(file=outputStream)  # Sentence separator blank line
            beg = end + 1

    def train(self):
        print('training with option(s) "{0}"...'.format(self._parameters),
              end='', file=sys.stderr, flush=True)
        _ = self._model.fit(self._matrix, self._labels)
        print('done', file=sys.stderr, flush=True)
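# A minimal end-to-end training sketch for the scikit-learn based Trainer above.
# The options dictionary (typecodes in 'dataSizes', file names, tagField) and the
# training-file handling are illustrative assumptions; the real values come from
# the project's configuration and argument parsing.
def _train_example(featureSet, trainingFile):
    options = {
        'dataSizes': {'rows': 'Q', 'rowsNP': 'uint64', 'cols': 'Q',
                      'data': 'B', 'labels': 'H', 'sentEnd': 'Q'},
        'tagField': -1,                      # gold label assumed to be the last column of each token
        'modelFileName': 'model.joblib',     # placeholder output names
        'trainParams': '',
        'cutoff': 1,                         # < 2 keeps every feature
        'featCounterFileName': 'model.features',
        'labelCounterFileName': 'model.labels',
    }
    trainer = Trainer(featureSet, options)
    with open(trainingFile, encoding='UTF-8') as inp:
        trainer.getEvents(inp)       # featurize and collect the training events
    trainer.cutoffFeats()            # build the sparse matrix (and apply the cutoff, if any)
    trainer.train()                  # fit the underlying LogisticRegression model
    trainer.save()                   # dump the model and the feature/label mappings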