def main_bigramTrain(options, input):
    # Train a tag-transition (bigram) model from the tag column of each sentence and write it to file
    bigramModel = Bigram(0.000000000000001)
    for sen, _ in sentenceIterator(input):
        tags = [tok[options.tagField] for tok in sen]
        bigramModel.obsSequence(tags)
    bigramModel.count()
    bigramModel.writeToFile(options.bigramModelFile)
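A minimal sketch of driving main_bigramTrain, assuming the surrounding module's Bigram and sentenceIterator are importable and that sentenceIterator accepts any iterable of lines in a HunTag-style layout (tab-separated token fields, blank line between sentences). The sample data, the SimpleNamespace options object, and the output file name are hypothetical.

import io
from types import SimpleNamespace

# Hypothetical two-sentence sample: word<TAB>tag, sentences separated by a blank line
sample = io.StringIO(
    'The\tDT\ncat\tNN\nsleeps\tVBZ\n'
    '\n'
    'Dogs\tNNS\nbark\tVBP\n'
)
opts = SimpleNamespace(tagField=1, bigramModelFile='transmodel.txt')  # illustrative values
main_bigramTrain(opts, sample)  # counts tag-to-tag transitions and writes the model file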
def tagCorp(self, inputStream=sys.stdin):
    senCount = 0
    for sen, comment in sentenceIterator(inputStream):
        senCount += 1
        senFeats = featurizeSentence(sen, self._features)
        bestTagging = self._tagSenFeats(senFeats)
        taggedSen = [tok + [bestTagging[c]] for c, tok in enumerate(sen)]  # Add tagging to sentence
        yield taggedSen, comment
        if senCount % 1000 == 0:
            print('{0}...'.format(senCount), end='', file=sys.stderr, flush=True)
    print('{0}...done'.format(senCount), file=sys.stderr, flush=True)
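A sketch of consuming the tagCorp generator. The tagger instance, input file name, and output layout are assumptions for illustration; the generator itself only yields (taggedSentence, comment) pairs with the predicted tag appended as the last field of each token.

# Hypothetical driver: stream a TSV corpus through the tagger and print the result
with open('input.tsv', encoding='UTF-8') as inputFile:
    for taggedSen, comment in tagger.tagCorp(inputFile):
        if comment:
            print(comment)
        for tok in taggedSen:
            print('\t'.join(tok))  # original fields plus the predicted tag as the last column
        print()                    # blank line as sentence separator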
def tag_corp(self, input):
    senCount = 0
    for sen, comment in sentenceIterator(input):
        senCount += 1
        # sys.stderr.write(str(sen)+'\n')
        # sys.stderr.flush()
        senFeats = featurizeSentence(sen, self.featureSet)
        bestTagging = self.tag_sen_feats(senFeats)
        taggedSen = addTagging(sen, bestTagging)
        yield taggedSen, comment
        if senCount % 1000 == 0:
            sys.stderr.write(str(senCount) + '...')
    sys.stderr.write(str(senCount) + '...done\n')
def toCRFsuite(self, inputStream, outputStream=sys.stdout):
    senCount = 0
    getNoTag = self._featCounter.getNoTag
    featnoToName = self._featCounter.noToName
    for sen, comment in sentenceIterator(inputStream):
        senCount += 1
        senFeats = featurizeSentence(sen, self._features)
        # Get Sentence Features translated to numbers and contexts in two steps
        for featNumberSet in ({getNoTag(feat) for feat in feats if getNoTag(feat) is not None}
                              for feats in senFeats):
            print('\t'.join(featnoToName[featNum].replace(':', 'colon') for featNum in featNumberSet),
                  file=outputStream)
        print(file=outputStream)  # Sentence separator blank line
        if senCount % 1000 == 0:
            print('{0}...'.format(str(senCount)), end='', file=sys.stderr, flush=True)
    print('{0}...done'.format(str(senCount)), file=sys.stderr, flush=True)
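The ':' to 'colon' replacement is presumably needed because CRFsuite's tab-separated data format treats a colon as the separator between an attribute name and its numeric weight, so literal colons in feature names would be misparsed. A minimal, hypothetical way to drive the conversion (the tagger object and file names are illustrative):

with open('train.tsv', encoding='UTF-8') as inp, \
        open('train.crfsuite', 'w', encoding='UTF-8') as out:
    tagger.toCRFsuite(inp, out)  # one tab-separated attribute line per token, blank line per sentence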
def getEvents(self, data, out_file_name):
    sys.stderr.write('featurizing sentences...')
    senCount = 0
    out_file = None
    if out_file_name:
        out_file = open(out_file_name, 'w')
    for sen, _ in sentenceIterator(data):
        senCount += 1
        sentenceFeats = featurizeSentence(sen, self.features)
        for c, tok in enumerate(sen):
            tokFeats = sentenceFeats[c]
            if self.usedFeats:
                tokFeats = [feat for feat in tokFeats if feat in self.usedFeats]
            if out_file:
                out_file.write(tok[-1] + '\t' + ' '.join(tokFeats) + '\n')
            self.addContext(tokFeats, tok[-1])
        if out_file:
            out_file.write('\n')
        if senCount % 1000 == 0:
            sys.stderr.write(str(senCount) + '...')
    sys.stderr.write(str(senCount) + '...done!\n')
def getEvents(self, data):
    print('featurizing sentences...', end='', file=sys.stderr, flush=True)
    senCount = 0
    tokIndex = -1  # Index starts from 0
    for sen, _ in sentenceIterator(data):
        senCount += 1
        sentenceFeats = featurizeSentence(sen, self._features)
        for c, tok in enumerate(sen):
            tokIndex += 1
            tokFeats = sentenceFeats[c]
            if self._usedFeats:
                tokFeats = [feat for feat in tokFeats if feat in self._usedFeats]
            self._addContext(tokFeats, tok[self._tagField], tokIndex)
        self._sentEnd.append(tokIndex)
        if senCount % 1000 == 0:
            print('{0}...'.format(str(senCount)), end='', file=sys.stderr, flush=True)
    self._tokCount = tokIndex + 1
    print('{0}...done!'.format(str(senCount)), file=sys.stderr, flush=True)
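self._sentEnd appears to collect the flat index of each sentence's final token. A standalone sketch (not code from the tagger) of how such boundary indices can be turned back into per-sentence slices over the flat token stream:

def sentenceSlices(sentEnd):
    # Turn last-token indices, e.g. [2, 5, 9], into (start, end) ranges over the flat token list
    start = 0
    for last in sentEnd:
        yield start, last + 1
        start = last + 1

# list(sentenceSlices([2, 5, 9])) == [(0, 3), (3, 6), (6, 10)]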
def train(self, inputStream):
    for sen, _ in sentenceIterator(inputStream):
        self.obsSequence(tok[self._tagField] for tok in sen)
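For orientation only: a tiny stand-in showing the kind of transition counting that an obsSequence()/count() pair implies. This is not the project's Bigram or transition-model implementation; the boundary symbol, the data structures, and the unsmoothed prob() are assumptions.

from collections import defaultdict

class TinyBigramCounter:
    # Illustrative stand-in: counts tag-to-tag transitions the way obsSequence() is fed above
    def __init__(self, boundary='S'):
        self._boundary = boundary
        self._bigramCount = defaultdict(int)
        self._leftCount = defaultdict(int)

    def obsSequence(self, tagSequence):
        prev = self._boundary                             # sentence-start symbol
        for tag in tagSequence:
            self._leftCount[prev] += 1
            self._bigramCount[(prev, tag)] += 1
            prev = tag
        self._leftCount[prev] += 1
        self._bigramCount[(prev, self._boundary)] += 1    # sentence-end transition

    def prob(self, prev, curr):
        # Simple relative frequency; a real model would presumably add smoothing and log-probabilities
        return self._bigramCount[(prev, curr)] / max(self._leftCount[prev], 1)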