def test_cap_type():
    """Check capitalization_type() against every entry of ``data``.

    Each entry of the module-level ``data`` sequence is read as
    ``entry[0]`` (the input text) and ``entry[1]`` (the expected result).

    Raises:
        AssertionError: on the first entry whose computed result differs
            from the expected one. The offending text and both values are
            printed before raising.
    """
    for entry in data:
        text, expected = entry[0], entry[1]
        try:
            # Classify once; the value is reused for the check and the report.
            actual = capitalization_type(text)
        except Exception:
            # Name the offending input, then let the real error propagate.
            print('Error: %s' % text)
            raise
        if actual != expected:
            print('Error: %s' % text)
            print('Result should be %s but got %s' % (expected, actual))
            # Explicit raise (not ``assert``) so the check survives ``-O``.
            raise AssertionError(
                'capitalization_type(%r) returned %r, expected %r'
                % (text, actual, expected))
def tag(cls, text):
    """Extract an ordered, de-duplicated list of tags from ``text``.

    Apostrophes are stripped, the text is tokenized with BasicTokenizer
    and POS-tagged with ``nltk.pos_tag``. Tokens are kept when their POS
    tag is in ``pos_include`` and the token is not in ``stop_words``; the
    survivors are passed through ``singularize``.

    Returns:
        An empty list when ``text`` is falsy, otherwise a ``CIList`` of
        tags in first-seen order, without tags shorter than two characters.
    """
    if not text:
        return []

    text = text.replace("'", "")
    cap_type = capitalization_type(text)
    tokenizer = BasicTokenizer()
    tagged = nltk.pos_tag(tokenizer.tokenize(text))
    log.info('POS before lower casing:%s', str(tagged))

    if cap_type == CapType.ALLCAPS:
        # An all-caps headline makes the POS tagger report far too many
        # proper nouns, so the text is lower-cased and tagged again.
        tagged = nltk.pos_tag(tokenizer.tokenize(text.lower()))
        log.info('POS after lower casing:%s', str(tagged))

    # Keep tokens with an included POS tag that are not stopwords.
    kept = [word for word, pos_tag in tagged
            if pos_tag in pos_include and word not in stop_words]
    singular = [singularize(word) for word in kept]

    # Tag order is preserved purely for aesthetic value, hence no set().
    # The first occurrence of a tag wins, so its original casing is kept
    # (presumably CIList membership is case-insensitive -- confirm).
    unique_tags = CIList()
    for candidate in singular:
        if candidate not in unique_tags and len(candidate) >= 2:
            unique_tags.append(candidate)
    return unique_tags
import csv

from silcc.lib.capnormalizer import capitalization_type, CapType

if __name__ == '__main__':
    # For every three-column row of the training CSV, print the row and
    # then the name of each CapType constant matching the capitalization
    # type of its second column.
    # The ``with`` block closes the file even if processing a row fails.
    with open('data/training/muti_submissions.csv', 'rU') as csv_file:
        for line in csv.reader(csv_file):
            if len(line) != 3:
                # Skip rows that do not have exactly three fields.
                continue
            print(line)
            type_ = capitalization_type(line[1])
            # Reverse lookup: find the CapType attribute names whose
            # integer value equals the detected type.
            for k, v in CapType.__dict__.items():
                if isinstance(v, int) and type_ == v:
                    print(k)
            print('----')