def test_sentencetokenizer():
    """Check SentenceTokenizer.tokenize tags on a regular and an all-caps sentence."""
    # A normally capitalized two-sentence string: only sentence-initial
    # words should be tagged FIRST_CAPITALIZED_*.
    expected_regular = [
        ('FIRST_CAPITALIZED_STOPWORD', 'This'),
        ('LOWER_STOPWORD', 'is'),
        ('LOWER_STOPWORD', 'a'),
        ('LOWER', 'sentence'),
        ('TERMINATOR', '.'),
        ('CAPITALIZED_STOPWORD', 'And'),
        ('LOWER_STOPWORD', 'this'),
        ('LOWER_STOPWORD', 'is'),
        ('LOWER', 'another'),
        ('TERMINATOR', '.'),
    ]
    assert SentenceTokenizer.tokenize("This is a sentence. And this is another.") == expected_regular

    # Every word capitalized ("allcaps" style): non-initial words get
    # CAPITALIZED/CAPITALIZED_STOPWORD tags, single-letter 'A' is MIXED_STOPWORD.
    expected_allcaps = [
        ('FIRST_CAPITALIZED_STOPWORD', 'This'),
        ('CAPITALIZED_STOPWORD', 'Is'),
        ('MIXED_STOPWORD', 'A'),
        ('CAPITALIZED', 'Sentence'),
        ('CAPITALIZED_STOPWORD', 'Of'),
        ('CAPITALIZED', 'Type'),
        ('CAPITALIZED', 'Allcaps'),
        ('TERMINATOR', '.'),
    ]
    assert SentenceTokenizer.tokenize("This Is A Sentence Of Type Allcaps.") == expected_allcaps
def capitalization_type(text):
    """Classify the capitalization style of *text* and return a CapType member.

    Types are:
    - REGULAR: first letter of the first word in sentences is capitalized,
      as well as the first letter of proper nouns.
    - GERMAN: first letter of the first word in sentences, as well as the
      first letter of any noun.
    - ALLCAPS: first letter of every word is capitalized.
    - SHOUT: every letter is uppercase.
    - LOWER: every letter is lowercase.
    - OTHER: none of the above definitions apply (this may also mean mixed type).
    """
    # Build the feature document the classifier expects: raw text plus the
    # sequence of token tags produced by the tokenizer.
    doc = dict(text=text)
    doc['tokens'] = [token[0] for token in SentenceTokenizer.tokenize(doc['text'])]

    # Multinomial naive Bayes over the pre-trained weights; the predicted
    # category label is the first element of the result.
    label = apply_multinomial_NB(C, V, prior, condprob, doc)[0]

    # Map the classifier's string label onto the CapType enumeration.
    # A KeyError here means the classifier produced an unknown category.
    return {
        'REGULAR': CapType.REGULAR,
        'GERMAN': CapType.GERMAN,
        'ALLCAPS': CapType.ALLCAPS,
        'SHOUT': CapType.SHOUT,
        'LOWER': CapType.LOWER,
        'OTHER': CapType.OTHER,
    }[label]
'ALLCAPS' 'LOWER' 'SHOUT' and possibly 'GERMAN' ''' # C holds our categories C = ['REGULAR', 'ALLCAPS', 'LOWER', 'SHOUT'] # Now we place all of our training examples into D D = get_training_examples(options.corpus_filename) # Now extract the features for the trainer... for d in D: d['tokens'] = [x[0] for x in SentenceTokenizer.tokenize(d['text'])] print 'Training...' V, prior, condprob = train_multinomial_NB(C, D) # Now pickle these for use by capnormalizer stuff_to_pickle = (C, V, prior, condprob) print 'Pickling...' pickle.dump(stuff_to_pickle, open('data/weights/capnorm_weights.pickle', 'wb')) print 'Done.' if options.test: # Now test the training examples as well, # most should give correct category if training went well... for d in D: result = apply_multinomial_NB(C, V, prior, condprob, d)