Code example #1
def notes_to_bow_features_nltk(notes,
                               docs=None,
                               word_proc=lambda x: x.lower().strip(),
                               text=lambda x: x["contents"],
                               limit=None):
    tokenizer = WordTokenizer()
    notewords = lambda x: [word_proc(x) for x in tokenizer.tokenize(text(x))]

    id2nw = dict([(x['jid'], notewords(x)) for x in notes])
    wfq = nltk.FreqDist(reduce(lambda x, y: x + y, id2nw.values()))

    start = int(len(wfq) * 0.03)
    if limit is None: limit = int(0.25 * len(wfq))
    print "len", len(wfq), "taking", limit

    freqkeys = wfq.keys()[start:start + limit]
    print "frequent keys"
    print '\n'.join([repr(x) for x in wfq.iteritems()][start:start + limit])
    print len(freqkeys)

    if docs is None: docs = {}
    for n in notes:
        wfq = nltk.FreqDist(id2nw[n['jid']])
        fv = docs.get(n["jid"], [])
        fv = fv + [("freq_%s" % x, wfq[x]) for x in freqkeys]
        docs[n["jid"]] = fv

    return docs
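Example #1 is Python 2 code written against the pre-3.0 NLTK API: `WordTokenizer` has since been removed, `iteritems` and the `print` statement are gone in Python 3, and in older NLTK releases `FreqDist.keys()` returned samples in decreasing frequency order, which the `start:start + limit` slicing relies on. A minimal sketch of the same frequency-based bag-of-words features with current NLTK, assuming (as in the original) that each note is a dict with 'jid' and 'contents' keys; the function name here is just illustrative:

from collections import Counter
from nltk import FreqDist
from nltk.tokenize import word_tokenize  # needs the Punkt tokenizer data


def notes_to_bow_features_modern(notes, limit=None):
    # Tokenize and lowercase each note, keyed by note id.
    id2words = {n["jid"]: [w.lower().strip() for w in word_tokenize(n["contents"])]
                for n in notes}

    # Corpus-wide word frequencies.
    wfq = FreqDist(w for words in id2words.values() for w in words)

    # Skip the top 3% most frequent word types, then keep the next `limit`
    # types, mirroring the start/limit slicing in the original.
    start = int(len(wfq) * 0.03)
    if limit is None:
        limit = int(0.25 * len(wfq))
    freqkeys = [w for w, _ in wfq.most_common()][start:start + limit]

    # One (feature_name, count) list per note.
    docs = {}
    for n in notes:
        counts = Counter(id2words[n["jid"]])
        docs[n["jid"]] = [("freq_%s" % w, counts[w]) for w in freqkeys]
    return docs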
Code example #2
def notes_to_bow_features(notes,
                          text=lambda x: eliminate_urls(x["contents"]),
                          word_proc=striplow,
                          word_filter=all_pass,
                          lexicon=None,
                          lexicon_size_limit=float('Inf'),
                          min_word_freq=0):

    tokenizer = WordTokenizer()
    notewords = lambda x: [
        word_proc(x) for x in tokenizer.tokenize(text(n)) if word_filter(x)
    ]
    tokenized_notes = dict([(n["id"], notewords(n)) for n in notes])
    dictionary = {}
    if lexicon is None:
        ## build lexicon, otherwise use dictionary passed in
        [
            update_dictionary(tn, dictionary)
            for nid, tn in tokenized_notes.iteritems()
        ]
        lexicon = [
            k for k in dictionary.keys() if dictionary[k] > min_word_freq
        ]
        lexicon.sort(lambda x, y: dictionary[y] - dictionary[x])
        if lexicon_size_limit < float('Inf'):
            lexicon = lexicon[:lexicon_size_limit]
        pass
    ## print tokenized_notes
    return (dict([(nid, to_feature_vec(notewords, lexicon))
                  for nid, notewords in tokenized_notes.iteritems()]), lexicon,
            dictionary)
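Examples #2 and #3 lean on helpers that are not shown in this listing (`eliminate_urls`, `striplow`, `all_pass`, `update_dictionary`, `to_feature_vec`); their definitions live in the source projects. As a rough guide to what the two non-trivial ones must do, here is one plausible reconstruction, assuming the dictionary maps words to corpus counts and a feature vector is one count per lexicon entry:

def update_dictionary(tokens, dictionary):
    # Accumulate corpus-wide word counts (plausible reconstruction, not the
    # project's actual helper).
    for w in tokens:
        dictionary[w] = dictionary.get(w, 0) + 1


def to_feature_vec(tokens, lexicon):
    # One count per lexicon word, in lexicon order (plausible reconstruction).
    counts = {}
    for w in tokens:
        counts[w] = counts.get(w, 0) + 1
    return [counts.get(w, 0) for w in lexicon]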
Code example #3
File: loaders.py  Project: kinow/listit-server
def vectify(notes,
            text=lambda x: x.contents,
            word_proc=striplow,
            word_filter=all_pass,
            lexicon=None,
            min_word_freq=2):
    tokenizer = WordTokenizer()
    notewords = lambda x: [
        word_proc(x) for x in tokenizer.tokenize(n.contents) if word_filter(x)
    ]
    tokenized = [notewords(n) for n in notes]
    dictionary = {}
    if lexicon is None:
        ## build lexicon, otherwise use dictionary passed in
        [update_dictionary(tn, dictionary) for tn in tokenized]
        lexicon = [
            k for k in dictionary.keys() if dictionary[k] > min_word_freq
        ]
        lexicon.sort(lambda x, y: dictionary[y] - dictionary[x])

    return lexicon, [to_feature_vec(tn, lexicon)
                     for tn in tokenized], dictionary
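The `lexicon=None` parameter in examples #2 and #3 is the key design choice: the lexicon is built from one batch of notes and can then be passed back in so a second batch is vectorized with the same feature columns. A small self-contained sketch of that pattern (using a plain regex tokenizer instead of the removed `WordTokenizer`, so it runs on current Python without the old NLTK API):

import re
from collections import Counter


def simple_vectify(texts, lexicon=None, min_word_freq=2):
    # Stand-in for vectify(): tokenize, lowercase, build (or reuse) a
    # frequency-ordered lexicon, then emit one count vector per text.
    tokenized = [re.findall(r"\w+", t.lower()) for t in texts]
    counts = Counter(w for toks in tokenized for w in toks)
    if lexicon is None:
        lexicon = sorted((w for w, c in counts.items() if c > min_word_freq),
                         key=lambda w: -counts[w])
    vectors = [[Counter(toks)[w] for w in lexicon] for toks in tokenized]
    return lexicon, vectors, counts


# Fit the lexicon on one batch, then reuse it so a second batch shares the
# same feature columns.
lexicon, train_vecs, _ = simple_vectify(["buy milk and eggs", "buy more milk"],
                                        min_word_freq=0)
_, test_vecs, _ = simple_vectify(["milk run tomorrow"], lexicon=lexicon)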
Code example #4
    def _generate_BK(self, text, hyp, verbose=False):
        from nltk.tokenize import WordTokenizer
        from nltk.stem.porter import PorterStemmer
        tokenizer = WordTokenizer()
        stemmer = PorterStemmer()
        
        text = tokenizer.tokenize(text)
        hyp = tokenizer.tokenize(hyp)
        
        if self.stemming:
            textbow = set(stemmer.stem(word) for word in text)
            hypbow = set(stemmer.stem(word) for word in hyp)
        else:
            textbow = set(word.lower() for word in text)
            hypbow = set(word.lower() for word in hyp)
        
        if verbose:
            print 'textbow: %s' % textbow
            print 'hypbow: %s' % hypbow
        
        if self.stop:
            textbow = textbow - self.stopwords
            hypbow = hypbow - self.stopwords

        bk = []
        fullbow = textbow|hypbow
        for word_text in fullbow:
            pos = None
            if word_text in wordnet.N:
                bk.extend(self._generate_BK_word(word_text, wordnet.N, fullbow))
            if word_text in wordnet.V:
                bk.extend(self._generate_BK_word(word_text, wordnet.V, fullbow))
            if word_text in wordnet.ADJ:
                bk.extend(self._generate_BK_word(word_text, wordnet.ADJ, fullbow))
            if word_text in wordnet.ADV:
                bk.extend(self._generate_BK_word(word_text, wordnet.ADV, fullbow))
                
        return bk
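Example #4 targets the old (pre-3.0) NLTK wordnet interface, where `wordnet.N`, `wordnet.V`, `wordnet.ADJ` and `wordnet.ADV` behave like word-keyed dictionaries; `_generate_BK_word`, `self.stemming`, `self.stop` and `self.stopwords` belong to the surrounding class, which is not shown here. With the current `nltk.corpus.wordnet` API the part-of-speech membership tests would look roughly like this (a sketch, not the original implementation):

from nltk.corpus import wordnet as wn  # needs the 'wordnet' corpus data


def pos_memberships(word):
    # Which parts of speech have at least one synset for this word?
    # Modern counterpart of the `word_text in wordnet.N` style checks above.
    return {pos: bool(wn.synsets(word, pos=pos))
            for pos in (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV)}


print(pos_memberships("run"))  # reports which POS tags ('n', 'v', 'a', 'r') match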
Code example #5
File: bow.py  Project: steven-cutting/icsisumm
    def tag(self, rtepair, verbose=False):
        """
        Tag a RTEPair as to whether the hypothesis can be inferred from the text.
        """

        from nltk.stem.porter import PorterStemmer
        from nltk.tokenize import WordTokenizer
        stemmer = PorterStemmer()
        tokenizer = WordTokenizer()

        text = tokenizer.tokenize(rtepair.text)
        hyp = tokenizer.tokenize(rtepair.hyp)

        if self.stemming:
            textbow = set(stemmer.stem(word.lower()) for word in text)
            hypbow = set(stemmer.stem(word.lower()) for word in hyp)
        else:
            textbow = set(word.lower() for word in text)
            hypbow = set(word.lower() for word in hyp)

        if self.stop:
            textbow = textbow - self.stopwords
            hypbow = hypbow - self.stopwords

        overlap = float(len(hypbow & textbow)) / len(hypbow | textbow) * 100

        if verbose:
            print "Text:", textbow
            print "Hypothesis:", hypbow
            print "Overlap:", hypbow & textbow
            print 'overlap=%0.2f, value=%s' % (overlap, rtepair.value)

        if overlap >= self.threshold:
            return 1
        else:
            return 0
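Example #5 tags an RTE (recognizing textual entailment) pair by simple bag-of-words overlap: both sides are tokenized, optionally stemmed and stopword-filtered, and the hypothesis is accepted when the overlap percentage clears a threshold. A self-contained sketch of the same scoring with current NLTK; the threshold value here is arbitrary, and plain strings replace the `rtepair` wrapper:

from nltk.corpus import stopwords            # needs the 'stopwords' corpus data
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize      # needs the Punkt tokenizer data


def overlap_tag(text, hyp, threshold=50.0, stemming=True, stop=True):
    # Percentage word overlap between text and hypothesis (Jaccard * 100).
    stemmer = PorterStemmer()
    norm = (lambda w: stemmer.stem(w.lower())) if stemming else str.lower
    textbow = {norm(w) for w in word_tokenize(text)}
    hypbow = {norm(w) for w in word_tokenize(hyp)}
    if stop:
        sw = set(stopwords.words("english"))
        textbow -= sw
        hypbow -= sw
    union = textbow | hypbow
    overlap = 100.0 * len(textbow & hypbow) / len(union) if union else 0.0
    return 1 if overlap >= threshold else 0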
Code example #6
from __future__ import unicode_literals
from nltk.tokenize import PunktWordTokenizer as WordTokenizer
import random
import pprint
import scipy.sparse
import time
import itertools
import sys
import pickle
import helper
import constants


tokenizer = WordTokenizer()
int2tags = constants.int2slots
tags2int = constants.tags2int
int2citationFeilds = ['Authors', 'Date', 'Title', 'Source']
generic = ["city", "centre", "county", "street", "road", "and", "in", "town", "village"]


def filterArticles(articles):
    relevant_articles = {}
    correct = [0] * (len(int2tags) -1 )
    gold_num = [0] * (len(int2tags)-1)
    filtered_correct = [0] * (len(int2tags) -1 )
    filtered_gold_num = [0] * (len(int2tags)-1)
    helper.load_constants()
    print "Num incidents", len(incidents)
    print "Num unfilitered articles", len(articles)
    for incident_id in incidents.keys():
        incident = incidents[incident_id]