def train(self, doc):
    """Train the naive Bayes model from the labeled frags in doc.

    Accumulates per-label feature counts over the frag linked list,
    applies add-k smoothing (k = 0.1), normalizes to probabilities, and
    stores every parameter in self.feats keyed by (label, feature_string),
    including a '<prior>' entry per label.
    """
    sys.stderr.write('training nb... ')
    counts = collections.defaultdict(sbd_util.Counter)
    totals = sbd_util.Counter()
    curr = doc.frag
    while curr:
        for name, value in curr.features.items():
            counts[curr.label][name + '_' + value] += 1
        totals[curr.label] += len(curr.features)
        curr = curr.next

    ## add-1 smoothing and normalization
    sys.stderr.write('smoothing... ')
    smooth_inc = 0.1
    ## True/False hash identically to 1/0, so these are the same dict
    ## buckets that the [0, 1] loop below reads and writes
    all_feat_names = set(counts[True].keys()) | set(counts[False].keys())
    for label in [0, 1]:
        totals[label] += len(all_feat_names) * smooth_inc
        for name in all_feat_names:
            counts[label][name] += smooth_inc
            counts[label][name] /= totals[label]
            self.feats[(label, name)] = counts[label][name]
        counts[label]['<prior>'] = totals[label] / totals.totalCount()
        self.feats[(label, '<prior>')] = counts[label]['<prior>']
    sys.stderr.write('done!\n')
def get_stats(self, verbose):
    """Collect corpus statistics from this doc's frag linked list.

    Returns a pair of Counters:
      lower_words -- counts of lowercase alphabetic tokens (periods removed)
      non_abbrs   -- counts of alphabetic tokens that do not end in '.'
    Tokens containing non-alphabetic characters (other than '.') are skipped.
    """
    if verbose:
        sys.stderr.write('getting statistics... ')
    lower_words = sbd_util.Counter()
    non_abbrs = sbd_util.Counter()
    curr = self.frag
    while curr:
        for token in curr.tokenized.split():
            ## only consider purely alphabetic tokens (ignoring periods)
            if not token.replace('.', '').isalpha():
                continue
            if token.islower():
                lower_words[token.replace('.', '')] += 1
            if not token.endswith('.'):
                non_abbrs[token] += 1
        curr = curr.next
    if verbose:
        sys.stderr.write('lowercased [%d] non-abbrs [%d]\n' % (len(lower_words), len(non_abbrs)))
    return lower_words, non_abbrs
def classify_nb_one(self, frag):
    """Score one frag with the trained NB model; return P(label=1).

    Features missing from the model are skipped rather than penalized.
    """
    ## the prior is weird, but it works better this way, consistently
    probs = sbd_util.Counter()
    for label in [0, 1]:
        probs[label] = self.feats[label, '<prior>'] ** 4
    for label in probs:
        for feat, val in frag.features.items():
            key = (label, feat + '_' + val)
            if key in self.feats:
                probs[label] *= self.feats[key]
    probs = sbd_util.normalize(probs)
    return probs[1]
def get_text_data(text, expect_labels=True, tokenize=False, verbose=False):
    """
    get text, returning an instance of the Doc class
    doc.frag is the first frag, and each points to the next

    text          -- raw input string, possibly with <A>/<E>/<S> markup
    expect_labels -- read gold <S> labels from the markup
    tokenize      -- run word_tokenize on each frag's text
    verbose       -- report word/hypothesis counts on stderr
    """
    frag_list = None
    word_index = 0
    frag_index = 0
    curr_words = []
    ## fix: initialize so the final-frag labeling below cannot raise
    ## NameError when the input contains no words at all
    word = ''

    for line in text.splitlines():

        ## deal with blank lines: a blank line forces a segment break
        if (not line.strip()) and frag_list:
            if not curr_words:
                frag.ends_seg = True
            else:
                frag = Frag(' '.join(curr_words))
                frag.ends_seg = True
                if expect_labels:
                    frag.label = True
                prev.next = frag
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else:
                    ## fix: original left `tokens` unbound (or stale from a
                    ## previous frag) on this path when tokenize was False
                    tokens = frag.orig
                ## strip label markup, consistent with the other branches
                frag.tokenized = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag_index += 1
                prev = frag
                curr_words = []

        for word in line.split():
            curr_words.append(word)

            ## each word that might end a sentence closes out a frag
            if is_sbd_hyp(word):
                frag = Frag(' '.join(curr_words))
                if not frag_list:
                    frag_list = frag
                else:
                    prev.next = frag

                ## get label; tokenize
                if expect_labels:
                    frag.label = int('<S>' in word)
                if tokenize:
                    tokens = word_tokenize.tokenize(frag.orig)
                else:
                    tokens = frag.orig
                frag.tokenized = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                frag_index += 1
                prev = frag
                curr_words = []

            word_index += 1

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list:
        frag_list = frag
    else:
        prev.next = frag
    if expect_labels:
        frag.label = int('<S>' in word)
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else:
        tokens = frag.orig
    frag.tokenized = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.ends_seg = True
    frag_index += 1

    if verbose:
        sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc
def get_data(files, expect_labels=True, tokenize=False, verbose=False, files_already_opened=False):
    """
    load text from files, returning an instance of the Doc class
    doc.frag is the first frag, and each points to the next

    files                -- a path, a list of paths, or (when
                            files_already_opened is True) open file objects
    expect_labels        -- read gold <S> labels from the markup
    tokenize             -- run word_tokenize on each frag's text
    verbose              -- report word/hypothesis counts on stderr
    files_already_opened -- treat entries of files as open handles;
                            closing them is then left to the caller
    """
    if isinstance(files, str):
        files = [files]

    frag_list = None
    word_index = 0
    frag_index = 0
    curr_words = []
    ## fix: initialize so the final-frag labeling below cannot raise
    ## NameError when the input contains no words at all
    word = ''

    for item in files:
        sys.stderr.write('reading [%s]\n' % item)
        fh = item if files_already_opened else open(item)
        try:
            for line in fh:

                ## deal with blank lines: a blank line forces a segment break
                if (not line.strip()) and frag_list:
                    if not curr_words:
                        frag.ends_seg = True
                    else:
                        frag = Frag(' '.join(curr_words))
                        frag.ends_seg = True
                        if expect_labels:
                            frag.label = True
                        prev.next = frag
                        if tokenize:
                            tokens = word_tokenize.tokenize(frag.orig)
                        else:
                            ## fix: original left `tokens` unbound (or stale
                            ## from a previous frag) when tokenize was False
                            tokens = frag.orig
                        ## strip label markup, consistent with the other branches
                        frag.tokenized = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                        frag_index += 1
                        prev = frag
                        curr_words = []

                for word in line.split():
                    curr_words.append(word)

                    ## each word that might end a sentence closes out a frag
                    if is_sbd_hyp(word):
                        frag = Frag(' '.join(curr_words))
                        if not frag_list:
                            frag_list = frag
                        else:
                            prev.next = frag

                        ## get label; tokenize
                        if expect_labels:
                            frag.label = int('<S>' in word)
                        if tokenize:
                            tokens = word_tokenize.tokenize(frag.orig)
                        else:
                            tokens = frag.orig
                        frag.tokenized = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
                        frag_index += 1
                        prev = frag
                        curr_words = []

                    word_index += 1
        finally:
            ## fix: only close handles we opened ourselves, and close them
            ## even if reading raises
            if not files_already_opened:
                fh.close()

    ## last frag
    frag = Frag(' '.join(curr_words))
    if not frag_list:
        frag_list = frag
    else:
        prev.next = frag
    if expect_labels:
        frag.label = int('<S>' in word)
    if tokenize:
        tokens = word_tokenize.tokenize(frag.orig)
    else:
        tokens = frag.orig
    frag.tokenized = re.sub('(<A>)|(<E>)|(<S>)', '', tokens)
    frag.ends_seg = True
    frag_index += 1

    if verbose:
        sys.stderr.write(' words [%d] sbd hyps [%d]\n' % (word_index, frag_index))

    ## create a Doc object to hold all this information
    doc = Doc(frag_list)
    return doc