def test_singularizer_abraxas_tags():
    """Print the singularized form of every Abraxas tag.

    This is not a true test: it asserts nothing. It reads
    ``../../data/tests/abraxas_tags.csv`` (path relative to the current
    working directory) and prints ``<tag> ==> <singularized tag>`` for the
    third column of each row, so the output can be eyeballed while
    bootstrapping the singularizer rules.
    """
    # Use a context manager so the file handle is closed even if a row is
    # malformed (e.g. fewer than three columns) and indexing raises.
    with open('../../data/tests/abraxas_tags.csv') as csv_file:
        for line in csv.reader(csv_file):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('%s ==> %s' % (line[2], singularize(line[2])))
def tag(cls, text):
    """Return an ordered, de-duplicated collection of tags for ``text``.

    The text is tokenized and POS-tagged; tokens whose POS label is in
    ``pos_include`` and which are not in ``stop_words`` are singularized
    and collected in first-seen order.

    :param text: headline / free text to extract tags from.
    :returns: a plain empty list when ``text`` is falsy, otherwise a
        ``CIList`` of tags.
    """
    if not text:
        return []

    text = text.replace("'", "")
    cap_type = capitalization_type(text)
    tokenizer = BasicTokenizer()

    tagged = nltk.pos_tag(tokenizer.tokenize(text))
    log.info('POS before lower casing:%s', str(tagged))

    if cap_type == CapType.ALLCAPS:
        # An all-caps headline makes the POS tagger report too many proper
        # nouns, so the text is lower-cased and tagged a second time.
        tagged = nltk.pos_tag(tokenizer.tokenize(text.lower()))
        log.info('POS after lower casing:%s', str(tagged))

    # Order is preserved purely for aesthetics, hence no set(). The first
    # occurrence of a tag wins, so an upper-cased first occurrence is kept
    # (presumably CIList membership is case-insensitive -- confirm).
    unique_tags = CIList()
    for word, pos_label in tagged:
        # Keep only tokens whose POS label is on the include list...
        if pos_label not in pos_include:
            continue
        # ...and which are not stopwords.
        if word in stop_words:
            continue
        singular = singularize(word)
        # Skip repeats and single-character tags.
        if singular in unique_tags or len(singular) < 2:
            continue
        unique_tags.append(singular)
    return unique_tags
def tag(cls, text):
    """Return an ordered, de-duplicated collection of tags for ``text``.

    Each token of the text is featurized and classified with
    ``apply_multinomial_NB``; tokens labelled ``'ham'`` become candidate
    tags. Candidates are then stripped of stopwords, singularized and
    de-duplicated in first-seen order.

    :param text: headline / free text to extract tags from.
    :returns: a plain empty list when ``text`` is falsy (``None`` or
        ``""``), otherwise a ``CIList`` of tags.
    """
    # Guard against None / empty input, which would otherwise fail on
    # ``text.replace`` (None) or do pointless work (empty string).
    if not text:
        return []

    text = text.replace("'", "")
    cap_type = capitalization_type(text)
    bt = BasicTokenizer()
    # NOTE(review): this compares against the string 'ALLCAPS'; confirm that
    # capitalization_type() returns strings rather than enum-like members.
    if cap_type == 'ALLCAPS':
        context = bt.tokenize(text.lower())
    else:
        context = bt.tokenize(text)

    tags = []
    for i, word in enumerate(context):
        features = featurize(i, context)
        # ``tags`` is passed by reference, so the classifier input sees the
        # tags accepted so far for the preceding tokens.
        d = dict(
            word=word,
            context=text,
            features=features,
            tokens=features,
            tags=tags)
        label, _ = apply_multinomial_NB(C, V, prior, condprob, d)
        if label == 'ham':
            tags.append(word)

    # Strip out stopwords...
    tags = [t for t in tags if t not in stop_words]
    # ...and singularize what is left.
    tags = [singularize(t) for t in tags]

    # Order is preserved purely for aesthetics, hence no set(). The first
    # occurrence of a tag wins, so an upper-cased first occurrence is kept.
    tags_ = CIList()
    for t in tags:
        if t in tags_:
            continue
        if len(t) < 2:
            continue
        tags_.append(t)
    return tags_
def test_singularizer():
    """Check ``singularize`` against a table of plural -> singular pairs.

    Pairs are checked in order and the test stops at the first mismatch.
    """
    expectations = (
        ('movies', 'movie'),
        ('business', 'business'),
        ('series', 'series'),
        ('women', 'woman'),
        ('radii', 'radius'),
        ('octopii', 'octopus'),
        ('virii', 'virus'),
        ('fish', 'fish'),
        ('properties', 'property'),
        ('drapes', 'drape'),
        ('types', 'type'),
        ('pass', 'pass'),
        ('balls', 'ball'),
        ('scissors', 'scissors'),
        ('clothes', 'cloth'),
        ('theses', 'thesis'),
        ('indices', 'index'),
        ('knives', 'knife'),
        ('lives', 'life'),
        ('thieves', 'thief'),
        ('fungi', 'fungus'),
    )
    for plural, singular in expectations:
        assert singularize(plural) == singular