Ejemplo n.º 1
0
    def tag(cls, text):
        """Extract an ordered, de-duplicated collection of tags from ``text``.

        The text is stripped of apostrophes, tokenized and POS-tagged with
        ``nltk.pos_tag``.  Tokens are kept when their POS label is listed in
        ``pos_include`` and the token itself is not in ``stop_words``; each
        survivor is passed through ``singularize``.

        Returns:
            A plain empty list when ``text`` is falsy; otherwise a ``CIList``
            holding the tags in order of first appearance, without
            duplicates and without entries shorter than two characters.
        """
        if not text:
            return []

        cleaned = text.replace("'", "")
        casing = capitalization_type(cleaned)

        tokenizer = BasicTokenizer()
        tagged = nltk.pos_tag(tokenizer.tokenize(cleaned))
        log.info('POS before lower casing:%s', str(tagged))

        if casing == 'ALLCAPS':
            # An all-caps headline makes the POS tagger report far too many
            # proper nouns, so tag the lower-cased text instead.
            tagged = nltk.pos_tag(tokenizer.tokenize(cleaned.lower()))
            log.info('POS after lower casing:%s', str(tagged))

        # Keep tokens with an accepted POS label that are not stopwords,
        # singularizing each one that survives both filters.
        candidates = []
        for word, pos_label in tagged:
            if pos_label not in pos_include:
                continue
            if word in stop_words:
                continue
            candidates.append(singularize(word))

        # A set() would lose the original ordering, which is kept purely for
        # presentation.  The first occurrence of a tag wins, so its casing is
        # the one retained (membership is delegated to CIList -- presumably
        # case-insensitive; confirm against its definition).
        unique_tags = CIList()
        for candidate in candidates:
            if len(candidate) < 2 or candidate in unique_tags:
                continue
            unique_tags.append(candidate)

        return unique_tags
Ejemplo n.º 2
0
    def tag(cls, text):
        """Class method that returns tags given some text.

        Each token of ``text`` is featurized and classified with
        ``apply_multinomial_NB`` (using the ``C``, ``V``, ``prior`` and
        ``condprob`` values defined elsewhere); tokens labelled ``'ham'``
        become tag candidates.  Candidates found in ``stop_words`` are
        dropped, the rest are passed through ``singularize``, and the result
        is de-duplicated while preserving order.

        Args:
            text: The text to extract tags from.  May be empty or ``None``.

        Returns:
            An empty list when ``text`` is falsy; otherwise a ``CIList`` of
            tags in order of first appearance, without duplicates and
            without entries shorter than two characters.
        """
        # Nothing to tag: bail out before calling str methods on None or
        # running the classifier on an empty token list.
        if not text:
            return []

        text = text.replace("'", "")
        cap_type = capitalization_type(text)
        bt = BasicTokenizer()

        # All-caps text is tokenized in lower case; everything else is
        # tokenized as written.
        if cap_type == 'ALLCAPS':
            context = bt.tokenize(text.lower())
        else:
            context = bt.tokenize(text)

        # NOTE: this stays an explicit loop because the live ``tags`` list is
        # handed to the classifier on every iteration (whether the classifier
        # actually reads it cannot be told from here -- verify before
        # refactoring into a comprehension).
        tags = []
        for i, word in enumerate(context):
            features = featurize(i, context)
            d = dict(
                word=word,
                context=text,
                features=features, tokens=features, tags=tags)
            # The second element of the result is not used here.
            label, _ = apply_multinomial_NB(C, V, prior, condprob, d)
            if label == 'ham':
                tags.append(word)

        # Strip out stopwords...
        tags = [t for t in tags if t not in stop_words]

        # Call Singularize
        tags = [singularize(t) for t in tags]

        # We want to preserve the order of tags purely for aesthetic value,
        # hence we do not use set().  The first occurrence of a tag is the
        # one kept, so an uppercased first occurrence is preserved.
        tags_ = CIList()
        for t in tags:
            if t in tags_:
                continue
            if len(t) < 2:
                continue
            tags_.append(t)

        return tags_