Example #1
def invertConditionalFreqDist(CFDist):
    # Invert a sense -> context distribution into a stem -> sense
    # distribution, so that each stemmed context word votes for the
    # senses it co-occurred with.  Uses the pre-1.0 NLTK API
    # (FreqDist.inc / count, PorterStemmer.raw_stem).
    from nltk.probability import ConditionalFreqDist
    from nltk.stemmer.porter import PorterStemmer
    iCFDist = ConditionalFreqDist()
    stemmer = PorterStemmer()
    for cond in CFDist.conditions():
        for val in CFDist[cond].samples():
            sense = cond.split("_")[0]  # cut off any POS suffix
            for tok in val:
                if isinstance(tok, str):
                    iCFDist[stemmer.raw_stem(tok)].inc(sense, CFDist[cond].count(val))
    return iCFDist
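A minimal usage sketch with hypothetical data, assuming the pre-1.0 NLTK probability API used above (conditions are 'sense_POS' strings and each sample is a tuple of context tokens):

cfd = ConditionalFreqDist()
cfd['bank1_N'].inc(('money', 'deposited'))   # context seen with sense bank1
cfd['bank2_N'].inc(('river', 'flowed'))      # context seen with sense bank2
icfd = invertConditionalFreqDist(cfd)
icfd['deposit'].count('bank1')               # -> 1 ('deposited' stems to 'deposit')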
Example #3
def _split_tagged_tokens(tagged_tokens):
    # Split a stream of tagged tokens into sentences (at '.' tags),
    # collecting the word and tag vocabularies along the way.
    from nltk.set import MutableSet
    from nltk.stemmer.porter import PorterStemmer
    words = []
    ws = []
    word_set = MutableSet()
    tags = []
    ts = []
    tag_set = MutableSet()
    stemmer = PorterStemmer()  # unused unless the stemming line below is enabled
    for token in tagged_tokens:
        for sub_token in token['SUBTOKENS']:
            w = sub_token['TEXT'].lower()  # normalize to lower case
            #w = stemmer.stem_word(w)  # optionally stem as well
            t = sub_token['TAG']
            word_set.insert(w)
            tag_set.insert(t)
            ws.append(w)
            ts.append(t)
            if t == '.':  # end of sentence
                words.append(ws)
                ws = []
                tags.append(ts)
                ts = []

    return words, word_set.elements(), tags, tag_set.elements()
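A hedged usage sketch, reusing the old corpus-reader calls shown in Example #6 (brown.items / brown.read):

tagged_tokens = [brown.read(item) for item in brown.items('humor')[:1]]
words, vocab, tags, tagset = _split_tagged_tokens(tagged_tokens)
# words[0] is the first sentence as a list of lower-cased strings;
# vocab and tagset are the distinct words and tags seen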
Example #4
def remove_stem(rows):
    # Stem every word in a nested list of rows, returning a new nested list.
    # Assumes the current NLTK API (nltk.stem); older releases exposed the
    # stemmer as nltk.stemmer.porter instead.
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()  # create the stemmer once, not per word
    new_list = []
    for row in rows:
        new_list.append([stemmer.stem(word) for word in row])
    return new_list
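For example (stems as produced by the Porter algorithm):

remove_stem([['running', 'jumps'], ['easily']])
# -> [['run', 'jump'], ['easili']]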
Example #5
def demo():
    # Create a simple regular-expression-based stemmer that strips the
    # suffixes 'ing', 's' and 'e'
    stemmer = RegexpStemmer('ing$|s$|e$')
    _demo_stemmer(stemmer)

    # ... and compare with the Porter stemmer
    from nltk.stemmer.porter import PorterStemmer
    stemmer = PorterStemmer()
    _demo_stemmer(stemmer)
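For reference, the same pattern also works with the RegexpStemmer in current NLTK (a hedged sketch; the demo above targets the older nltk.stemmer module):

from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing$|s$|e$')
stemmer.stem('running')  # -> 'runn'
stemmer.stem('cats')     # -> 'cat'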
Example #6
def demo():
    from pprint import pprint

    # load the stoplist
    stoplist = _unwrap_tokens(stopwords.read('english')['WORDS'])

    # load a bit of the brown corpus
    items = brown.items('humor')
    tagged_tokens = brown.read(items[0])
    from nltk.tokenreader import TaggedTokenReader
    time_flies = TaggedTokenReader().read_token(
        'Time/NN fly/VB like/IN an/DT arrow/NN')

    # create the tagger, using WordNet
    dictionary = WordNetDictionary(stoplist, None, brown_nouns, brown_verbs,
                                   brown_adjs, brown_advs)

    # window of +/- 5 words around the target
    tagger = LeskWordSenseTagger(5, dictionary, WordNetStemmer(), True, 'bag')

    print 'Running with 5 word window, bag of words, WordNet'
    pretty_print(tagger.tag(time_flies), dictionary)
    pretty_print(tagger.tag(tagged_tokens[:200]), dictionary)

    # now change to set of words
    tagger = LeskWordSenseTagger(5, dictionary, WordNetStemmer(), True, 'set')
    print 'Running with 5 word window, set of words, WordNet'
    pretty_print(tagger.tag(time_flies), dictionary)
    pretty_print(tagger.tag(tagged_tokens[:200]), dictionary)

    # create the tagger, using roget
    print 'Creating Roget dictionary (may take a while)...'
    stemmer = PorterStemmer()
    dictionary = RogetDictionary(stoplist, stemmer)
    tagger = LeskWordSenseTagger(5, dictionary, stemmer, True, 'set')
    print 'Running with 5 word window, set of words, Roget'
    pretty_print(tagger.tag(time_flies), dictionary)
    pretty_print(tagger.tag(tagged_tokens[:200]), dictionary)

    # use the simulated annealing tagger, with WordNet
    dictionary = WordNetDictionary(stoplist, None, brown_nouns, brown_verbs,
                                   brown_adjs, brown_advs)

    # the temperatures form a geometric cooling schedule: start at 20
    # and halve at every step
    tagger = SimulatedAnnealingWordSenseTagger(
        dictionary, [20 * (0.5**n) for n in range(100)], WordNetStemmer(),
        True, 'bag')

    print 'Running with bag of words, WordNet, simulated annealing tagger'
    pretty_print(tagger.tag(time_flies), dictionary)
    pretty_print(tagger.tag(tagged_tokens[:200]), dictionary)
Example #7
def unigramTag(self, dirStats, tagged):
    # Stem each subtoken in place, sense-tag the result, and render a
    # bracketed '[sense text]' string for the parse list.
    from nltk.stemmer.porter import PorterStemmer
    stemmer = PorterStemmer()  # create once, not per token
    for t in tagged['SUBTOKENS']:
        stemmer.stem(t)
    self.SenseTagger.tag(tagged)
    print 'unigramTag: tagged --', tagged
    TagString = '[???'
    for t in tagged['SUBTOKENS']:
        try:
            TagString += ' [' + t['SENSE'] + ' ' + t['TEXT'] + ']'
        except KeyError:
            print "Couldn't find Sense tag for", t, 'in', tagged
            TagString += '[??? ' + t['TEXT'] + ']'
    TagString += ']'
    dirStats.parse_list[-1] = TagString
    print 'Tagged:', TagString
    return None
Example #8
def extractSurfaceSemantics(token, parent):
    global Senses
    POS = getPartOfSpeech(token, parent)
    tokenSenses = {}
    text = token['TEXT'].lower()
    default = token['TEXT'].upper()
    if POS in ['N', 'V', 'ADV', 'ADJ']:
        # TODO: redo this try/except ladder as a loop that attempts each
        # fallback lookup in turn until tokenSenses is populated
        try:
            tokenSenses = Senses[text]
        except KeyError:
            logger.warning('extractSurfaceSemantics : Text not in tagged senses: %s', text)
            try: 
                #logger.warning('extractSurfaceSemantics : Previously unseen word but in WordNet?: %s', text)
                # stringified range of possible senses without spaces
                tokenSenses = {POS : range(1,len(pywordnet.getWord(text,POS).getSenses())+1)}
            except KeyError:
                try:
                    logger.warning('extractSurfaceSemantics : Inflected version of WordNet word? %s', text)
                    if text.endswith('s'):
                        text = text[:-1]
                        tokenSenses = Senses[text]
                    else:
                        stemmer = PorterStemmer() # Update WordNetStemmer to NLTK 1.4 API
                        stemmer.stem(token)
                        text = token['STEM']
                        tokenSenses = Senses[text]
                except KeyError:
                    text = token['TEXT'].lower()
                    try:
                        logger.warning('extractSurfaceSemantics : Misspelling / typo of WordNet word? %s', text)
                        spellchecker = enchant.DictWithPWL('en_US', Lexicon)
                        s = ''
                        for s in spellchecker.suggest(text):
                            if s in Senses:
                                tokenSenses = Senses[s]
                                break
                        if not tokenSenses and spellchecker.suggest(text):
                            s = spellchecker.suggest(text)[0]
                            tokenSenses = {POS : range(1,len(pywordnet.getWord(s,POS).getSenses())+1)}
                        if s and Options.Spellcheck:
                            logger.warning('extractSurfaceSemantics : Found spelling correction %s for %s', s,text)
                            text = s
                        #logger.debug('*** extractSurfaceSemantics : Implement spelling correction. *** ')
                        #raise KeyError
                    except KeyError:
                        logger.error('extractSurfaceSemantics : Unknown token: %s', text)
                        return default
        # Handle typos we have seen before: Senses maps them to the
        # intended word via a 'see' entry.
        if 'see' in tokenSenses:
            ### FIXME adding to dict for typos that are other words
            text = tokenSenses['see']
            try:
                tokenSenses = Senses[text]
            except KeyError:
                return default
        # Handle morphology variants that WordNet understands.  Assumes
        # Senses stores such entries as a (canonical_text, senses) pair.
        elif isinstance(tokenSenses, tuple):
            text, senses = tokenSenses
            tokenSenses = {POS: senses}
        try:
            return '_'.join([text,POS,','.join([str(i) for i in tokenSenses[POS]])])
        except KeyError:
            #logger.warning('extractSurfaceSemantics : Expected POS %s for token %s, Got %s, Using %s',
            #            POS, token, tokenSenses.keys(), tokenSenses.keys()[0])
            if tokenSenses.keys():
                POS = token['POS'] = tokenSenses.keys()[0]
                return '_'.join([text,POS,','.join([str(i) for i in tokenSenses.values()[0]])])
        except Exception as e:
            logger.error('extractSurfaceSemantics: %s: Could not find sense %s for token %s',
                      e, POS, token) #tokenSenses, text
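On success the function returns a string of the form text_POS_senses, for example (illustrative value):

extractSurfaceSemantics(token, parent)  # -> 'arrow_N_1,2'

i.e. the surface text, its part of speech, and the comma-joined list of candidate WordNet sense numbers; on failure it falls back to the upper-cased surface text.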