Example No. 1
def notes_to_bow_features(notes,
                          text=lambda x: eliminate_urls(x["contents"]),
                          word_proc=striplow,
                          word_filter=all_pass,
                          lexicon=None,
                          lexicon_size_limit=float('Inf'),
                          min_word_freq=0):

    tokenizer = WordTokenizer()
    # tokenize one note's text, then normalize and filter the tokens
    notewords = lambda note: [
        word_proc(w) for w in tokenizer.tokenize(text(note)) if word_filter(w)
    ]
    tokenized_notes = dict([(n["id"], notewords(n)) for n in notes])
    dictionary = {}
    if lexicon is None:
        ## build lexicon, otherwise use dictionary passed in
        [
            update_dictionary(tn, dictionary)
            for nid, tn in tokenized_notes.iteritems()
        ]
        lexicon = [
            k for k in dictionary.keys() if dictionary[k] > min_word_freq
        ]
        lexicon.sort(lambda x, y: dictionary[y] - dictionary[x])
        if lexicon_size_limit < float('Inf'):
            lexicon = lexicon[:lexicon_size_limit]
        pass
    ## print tokenized_notes
    return (dict([(nid, to_feature_vec(words, lexicon))
                  for nid, words in tokenized_notes.iteritems()]), lexicon,
            dictionary)
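A minimal, self-contained sketch of the same bag-of-words pipeline (standard library only), assuming the project helpers striplow, all_pass, eliminate_urls, update_dictionary and to_feature_vec roughly correspond to token lowercasing, pass-through filtering, URL stripping, word counting and count-vector construction; bow_features_sketch and its whitespace tokenization are illustrative stand-ins, not the original API:

from collections import Counter

def bow_features_sketch(notes, min_word_freq=0, lexicon=None):
    # Tokenize each note's contents by whitespace and lowercase the tokens.
    tokenized = {n["id"]: n["contents"].lower().split() for n in notes}
    if lexicon is None:
        # Build the lexicon from word frequencies, most frequent first.
        counts = Counter(w for words in tokenized.values() for w in words)
        lexicon = [w for w, c in counts.most_common() if c > min_word_freq]
    # One count vector per note, aligned with the lexicon.
    vectors = {nid: [words.count(w) for w in lexicon]
               for nid, words in tokenized.items()}
    return vectors, lexicon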
Example No. 2
def notes_to_bow_features_nltk(notes,
                               docs=None,
                               word_proc=lambda x: x.lower().strip(),
                               text=lambda x: x["contents"],
                               limit=None):
    tokenizer = WordTokenizer()
    notewords = lambda note: [word_proc(w) for w in tokenizer.tokenize(text(note))]

    id2nw = dict([(x['jid'], notewords(x)) for x in notes])
    wfq = nltk.FreqDist(reduce(lambda x, y: x + y, id2nw.values()))

    # skip the top ~3% most frequent words (stop-word-like), then take the next slice
    start = int(len(wfq) * 0.03)
    if limit is None: limit = int(0.25 * len(wfq))
    print "len", len(wfq), "taking", limit

    freqkeys = wfq.keys()[start:start + limit]
    print "frequent keys"
    print '\n'.join([repr(x) for x in wfq.iteritems()][start:start + limit])
    print len(freqkeys)

    if docs is None: docs = {}
    for n in notes:
        wfq = nltk.FreqDist(id2nw[n['jid']])
        fv = docs.get(n["jid"], [])
        fv = fv + [("freq_%s" % x, wfq[x]) for x in freqkeys]
        docs[n["jid"]] = fv

    return docs
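The example slices wfq.keys(), which in the old NLTK returned samples sorted by decreasing frequency. In current NLTK, FreqDist subclasses collections.Counter and keys() follows insertion order, so a hedged modern equivalent goes through most_common() (the values below are illustrative, not from the original):

import nltk

tokens = "to be or not to be that is the question".split()
wfq = nltk.FreqDist(tokens)
# most_common() gives the frequency-ordered view the slice above relied on.
start, limit = 1, 3   # illustrative values
freqkeys = [w for w, _ in wfq.most_common()][start:start + limit]
print(freqkeys)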
Example No. 3
def vectify(notes, text=lambda x: x.contents, word_proc=striplow, word_filter=all_pass, lexicon=None, min_word_freq=2):
    tokenizer = WordTokenizer()
    notewords = lambda note: [word_proc(w) for w in tokenizer.tokenize(text(note)) if word_filter(w)]
    tokenized = [notewords(n) for n in notes]
    dictionary = {}
    if lexicon is None:
        ## build lexicon, otherwise use dictionary passed in
        [update_dictionary(tn, dictionary) for tn in tokenized]
        lexicon = [k for k in dictionary.keys() if dictionary[k] > min_word_freq]
        lexicon.sort(lambda x, y: dictionary[y] - dictionary[x])

    return lexicon, [to_feature_vec(tn, lexicon) for tn in tokenized], dictionary
Example No. 4
    def _generate_BK(self, text, hyp, verbose=False):
        from nltk.tokenize import WordTokenizer
        from nltk.stem.porter import PorterStemmer
        tokenizer = WordTokenizer()
        stemmer = PorterStemmer()
        
        text = tokenizer.tokenize(text)
        hyp = tokenizer.tokenize(hyp)
        
        if self.stemming:
            textbow = set(stemmer.stem(word) for word in text)
            hypbow = set(stemmer.stem(word) for word in hyp)
        else:
            textbow = set(word.lower() for word in text)
            hypbow = set(word.lower() for word in hyp)
        
        if verbose:
            print 'textbow: %s' % textbow
            print 'hypbow: %s' % hypbow
        
        if self.stop:
            textbow = textbow - self.stopwords
            hypbow = hypbow - self.stopwords

        bk = []
        fullbow = textbow|hypbow
        for word_text in fullbow:
            pos = None
            if word_text in wordnet.N:
                bk.extend(self._generate_BK_word(word_text, wordnet.N, fullbow))
            if word_text in wordnet.V:
                bk.extend(self._generate_BK_word(word_text, wordnet.V, fullbow))
            if word_text in wordnet.ADJ:
                bk.extend(self._generate_BK_word(word_text, wordnet.ADJ, fullbow))
            if word_text in wordnet.ADV:
                bk.extend(self._generate_BK_word(word_text, wordnet.ADV, fullbow))
                
        return bk
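The wordnet.N / wordnet.V / wordnet.ADJ / wordnet.ADV containers come from the older (nltk_lite-era) WordNet interface. A rough sketch of the same membership test with the current nltk.corpus.wordnet API, using an illustrative word:

from nltk.corpus import wordnet as wn

word = "run"   # illustrative
for pos in (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV):
    if wn.synsets(word, pos=pos):
        # The word has at least one synset with this part of speech,
        # which is what `word_text in wordnet.N` tested in the old API.
        print(word, pos)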
Example No. 5
    def tag(self, rtepair, verbose=False):
        """
        Tag an RTEPair as to whether the hypothesis can be inferred from the text.
        """
        
        from nltk.stem.porter import PorterStemmer
        from nltk.tokenize import WordTokenizer
        stemmer = PorterStemmer()
        tokenizer = WordTokenizer()
        
        text = tokenizer.tokenize(rtepair.text)
        hyp = tokenizer.tokenize(rtepair.hyp)
        
        if self.stemming:
            textbow = set(stemmer.stem(word.lower()) for word in text)
            hypbow = set(stemmer.stem(word.lower()) for word in hyp)
        else:
            textbow = set(word.lower() for word in text)
            hypbow = set(word.lower() for word in hyp)
        
        if self.stop:
            textbow = textbow - self.stopwords
            hypbow = hypbow - self.stopwords

        overlap = float(len(hypbow & textbow))/len(hypbow | textbow) * 100
        
        if verbose:
            print "Text:", textbow
            print "Hypothesis:", hypbow
            print "Overlap:", hypbow & textbow
            print 'overlap=%0.2f, value=%s' % (overlap, rtepair.value)
            
        if overlap >= self.threshold:
            return 1
        else:
            return 0
Example No. 6
    def tag(self, rtepair, verbose=False):
        """
        Tag an RTEPair as to whether the hypothesis can be inferred from the text.
        """

        from nltk.stem.porter import PorterStemmer
        from nltk.tokenize import WordTokenizer
        stemmer = PorterStemmer()
        tokenizer = WordTokenizer()

        text = tokenizer.tokenize(rtepair.text)
        hyp = tokenizer.tokenize(rtepair.hyp)

        if self.stemming:
            textbow = set(stemmer.stem(word.lower()) for word in text)
            hypbow = set(stemmer.stem(word.lower()) for word in hyp)
        else:
            textbow = set(word.lower() for word in text)
            hypbow = set(word.lower() for word in hyp)

        if self.stop:
            textbow = textbow - self.stopwords
            hypbow = hypbow - self.stopwords

        overlap = float(len(hypbow & textbow)) / len(hypbow | textbow) * 100

        if verbose:
            print "Text:", textbow
            print "Hypothesis:", hypbow
            print "Overlap:", hypbow & textbow
            print 'overlap=%0.2f, value=%s' % (overlap, rtepair.value)

        if overlap >= self.threshold:
            return 1
        else:
            return 0
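Both tag() examples above implement the same word-overlap heuristic. A minimal self-contained sketch of that computation, with whitespace tokenization and a made-up threshold standing in for the class's tokenizer, stemming and stopword handling:

def word_overlap(text, hyp, threshold=25):
    # Bag-of-words sets for text and hypothesis.
    textbow = set(text.lower().split())
    hypbow = set(hyp.lower().split())
    # Percentage of the combined vocabulary shared by both sides.
    overlap = 100.0 * len(hypbow & textbow) / len(hypbow | textbow)
    return 1 if overlap >= threshold else 0

print(word_overlap("John bought a new car", "John owns a car"))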
Example No. 7
def vectify(notes,
            text=lambda x: x.contents,
            word_proc=striplow,
            word_filter=all_pass,
            lexicon=None,
            min_word_freq=2):
    tokenizer = WordTokenizer()
    notewords = lambda note: [
        word_proc(w) for w in tokenizer.tokenize(text(note)) if word_filter(w)
    ]
    tokenized = [notewords(n) for n in notes]
    dictionary = {}
    if lexicon is None:
        ## build lexicon, otherwise use dictionary passed in
        [update_dictionary(tn, dictionary) for tn in tokenized]
        lexicon = [
            k for k in dictionary.keys() if dictionary[k] > min_word_freq
        ]
        lexicon.sort(lambda x, y: dictionary[y] - dictionary[x])

    return lexicon, [to_feature_vec(tn, lexicon)
                     for tn in tokenized], dictionary
Example No. 8
def notes_to_bow_features(notes,
                          text=lambda x: eliminate_urls(x["contents"]),
                          word_proc=striplow, 
                          word_filter=all_pass,
                          lexicon=None,
                          lexicon_size_limit=float('Inf'),
                          min_word_freq=0):
    
    tokenizer = WordTokenizer()
    notewords = lambda note : [ word_proc(w) for w in tokenizer.tokenize(text(note)) if word_filter(w) ]
    tokenized_notes = dict( [ (n["id"],notewords(n)) for n in notes ] )
    dictionary = {}    
    if lexicon is None:
        ## build lexicon, otherwise use dictionary passed in
        [ update_dictionary(tn,dictionary) for nid,tn in tokenized_notes.iteritems() ]
        lexicon = [k for k in dictionary.keys() if dictionary[k] > min_word_freq]
        lexicon.sort(lambda x,y : dictionary[y] - dictionary[x])
        if lexicon_size_limit < float('Inf'):
            lexicon = lexicon[:lexicon_size_limit]
        pass
    ## print tokenized_notes 
    return (dict( [(nid, to_feature_vec(words,lexicon)) for nid,words in tokenized_notes.iteritems()] ),lexicon,dictionary)
Example No. 9
def notes_to_bow_features_nltk(notes, docs=None, word_proc=lambda x: x.lower().strip(), text=lambda x: x["contents"],limit=None):
    tokenizer = WordTokenizer()
    notewords = lambda note : [ word_proc(w) for w in tokenizer.tokenize(text(note)) ]

    id2nw = dict([(x['jid'],notewords(x)) for x in notes])
    wfq = nltk.FreqDist(reduce(lambda x,y: x + y, id2nw.values()))

    start = int(len(wfq)*0.03)
    if limit is None: limit=int(0.25*len(wfq))
    print "len", len(wfq), "taking",limit
    
    freqkeys = wfq.keys()[start:start+limit]
    print "frequent keys"
    print '\n'.join([repr(x) for x in wfq.iteritems()][start:start+limit])
    print len(freqkeys)
    
    if docs is None: docs = {}
    for n in notes:
        wfq = nltk.FreqDist(id2nw[n['jid']])
        fv = docs.get(n["jid"],[])
        fv = fv + [ ("freq_%s" % x, wfq[x]) for x in freqkeys ]
        docs[n["jid"]] = fv
    
    return docs
Example No. 10
import codecs, unicodedata
import nltk
# `line` is assumed to hold a unicode string read earlier
# (e.g. one line of a codecs-opened Latin-2 sample file).
print line.encode('unicode_escape')
for c in line:
    if ord(c) > 127:
        print '%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c))

print line.find(u'zosta\u0142y')
line = line.lower()

import re
print line.encode('unicode_escape')
m = re.search(u'\u015b\w*', line)
print m.group()

from nltk.tokenize import WordTokenizer
tokenizer = WordTokenizer()
print tokenizer.tokenize(line)

path = nltk.data.find('samples/sinorama-gb.xml')
f = codecs.open(path, encoding='gb2312')
lines = f.readlines()
for l in lines:
    l = l[:-1]
    utf_enc = l.encode('utf8')
    print repr(utf_enc)

path = nltk.data.find('samples/sinorama-utf8.xml')
from nltk.etree import ElementTree as ET
tree = ET.parse(path)
text = tree.findtext('sent')
uni_text = text.encode('utf8')
print repr(uni_text.splitlines()[1])
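WordTokenizer and nltk.etree no longer ship with current NLTK releases; word_tokenize (or WhitespaceTokenizer) is the usual replacement, and the standard library's xml.etree.ElementTree takes over the XML parsing. A hedged Python 3 sketch of those two steps, with illustrative strings in place of the sample files above:

from nltk.tokenize import word_tokenize   # requires NLTK's punkt tokenizer data
from xml.etree import ElementTree as ET

line = u'zosta\u0142y one'                # illustrative Unicode text
print(word_tokenize(line))

# nltk.etree was dropped; the standard library ElementTree does the same job.
tree = ET.ElementTree(ET.fromstring(u'<doc><sent>\u60a8\u597d</sent></doc>'))
print(tree.findtext('sent'))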