# These snippets are Python 2 and target an old NLTK release in which
# nltk.tokenize still provided WordTokenizer (see the imports in the later
# excerpts below).
import nltk
from nltk.tokenize import WordTokenizer


def notes_to_bow_features_nltk(notes, docs=None,
                               word_proc=lambda x: x.lower().strip(),
                               text=lambda x: x["contents"],
                               limit=None):
    tokenizer = WordTokenizer()
    # Tokenize a note's text and normalize each token with word_proc.
    notewords = lambda note: [word_proc(w) for w in tokenizer.tokenize(text(note))]
    id2nw = dict([(x['jid'], notewords(x)) for x in notes])
    # Global frequency distribution over the tokens of all notes.
    wfq = nltk.FreqDist(reduce(lambda x, y: x + y, id2nw.values()))
    # In the old NLTK, FreqDist.keys() is sorted by decreasing frequency,
    # so this skips the top 3% most frequent words and keeps the next `limit`.
    start = int(len(wfq) * 0.03)
    if limit is None:
        limit = int(0.25 * len(wfq))
    print "len", len(wfq), "taking", limit
    freqkeys = wfq.keys()[start:start + limit]
    print "frequent keys"
    print '\n'.join([repr(x) for x in wfq.iteritems()][start:start + limit])
    print len(freqkeys)
    if docs is None:
        docs = {}
    for n in notes:
        # Per-note frequency distribution, projected onto the shared key set.
        wfq = nltk.FreqDist(id2nw[n['jid']])
        fv = docs.get(n["jid"], [])
        fv = fv + [("freq_%s" % x, wfq[x]) for x in freqkeys]
        docs[n["jid"]] = fv
    return docs
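# The helper functions referenced by the two bag-of-words builders below
# (striplow, all_pass, eliminate_urls, update_dictionary, to_feature_vec) are
# not part of this excerpt. The definitions here are a minimal sketch of what
# they are assumed to do, added only so the snippets read end to end; the
# original implementations may differ.
import re


def striplow(w):
    # assumed: lowercase a token and strip surrounding whitespace
    return w.lower().strip()


def all_pass(w):
    # assumed: default word filter that keeps every token
    return True


def eliminate_urls(s):
    # assumed: crude removal of http(s) URLs from a note's contents
    return re.sub(r'https?://\S+', '', s)


def update_dictionary(tokens, dictionary):
    # assumed: accumulate raw token counts into `dictionary`
    for t in tokens:
        dictionary[t] = dictionary.get(t, 0) + 1


def to_feature_vec(tokens, lexicon):
    # assumed: count vector over the lexicon, one entry per lexicon word
    counts = dict((w, 0) for w in lexicon)
    for t in tokens:
        if t in counts:
            counts[t] += 1
    return [counts[w] for w in lexicon]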
def notes_to_bow_features(notes, text=lambda x: eliminate_urls(x["contents"]),
                          word_proc=striplow, word_filter=all_pass,
                          lexicon=None, lexicon_size_limit=float('Inf'),
                          min_word_freq=0):
    tokenizer = WordTokenizer()
    # Tokenize one note's text and keep the tokens that pass the word filter.
    notewords = lambda note: [word_proc(w)
                              for w in tokenizer.tokenize(text(note))
                              if word_filter(w)]
    tokenized_notes = dict([(n["id"], notewords(n)) for n in notes])
    dictionary = {}
    if lexicon is None:
        ## build lexicon, otherwise use the lexicon passed in
        for nid, tn in tokenized_notes.iteritems():
            update_dictionary(tn, dictionary)
        lexicon = [k for k in dictionary.keys() if dictionary[k] > min_word_freq]
        lexicon.sort(lambda x, y: dictionary[y] - dictionary[x])
        if lexicon_size_limit < float('Inf'):
            lexicon = lexicon[:lexicon_size_limit]
    ## print tokenized_notes
    return (dict([(nid, to_feature_vec(words, lexicon))
                  for nid, words in tokenized_notes.iteritems()]),
            lexicon,
            dictionary)
def vectify(notes, text=lambda x: x.contents, word_proc=striplow,
            word_filter=all_pass, lexicon=None, min_word_freq=2):
    tokenizer = WordTokenizer()
    # Tokenize one note via the `text` accessor and apply the word filter.
    notewords = lambda note: [word_proc(w)
                              for w in tokenizer.tokenize(text(note))
                              if word_filter(w)]
    tokenized = [notewords(n) for n in notes]
    dictionary = {}
    if lexicon is None:
        ## build lexicon, otherwise use the lexicon passed in
        for tn in tokenized:
            update_dictionary(tn, dictionary)
        lexicon = [k for k in dictionary.keys() if dictionary[k] > min_word_freq]
        lexicon.sort(lambda x, y: dictionary[y] - dictionary[x])
    return lexicon, [to_feature_vec(tn, lexicon) for tn in tokenized], dictionary
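# A minimal, hypothetical usage sketch for vectify(). The _Note class and the
# sample contents are invented for illustration, and it assumes WordTokenizer
# is importable from the installed (old) NLTK release, as in the imports above.
class _Note(object):
    def __init__(self, contents):
        self.contents = contents


_sample_notes = [_Note("buy milk and eggs"),
                 _Note("milk delivery is late"),
                 _Note("eggs and milk for breakfast")]
_lexicon, _vectors, _counts = vectify(_sample_notes, min_word_freq=1)
# _lexicon holds the words seen more than once ("milk", "and", "eggs"),
# sorted by descending count; _vectors holds one count vector per note.
print "lexicon:", _lexicon
print "vectors:", _vectors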
def _generate_BK(self, text, hyp, verbose=False):
    from nltk.tokenize import WordTokenizer
    from nltk.stem.porter import PorterStemmer
    tokenizer = WordTokenizer()
    stemmer = PorterStemmer()

    text = tokenizer.tokenize(text)
    hyp = tokenizer.tokenize(hyp)

    if self.stemming:
        textbow = set(stemmer.stem(word) for word in text)
        hypbow = set(stemmer.stem(word) for word in hyp)
    else:
        textbow = set(word.lower() for word in text)
        hypbow = set(word.lower() for word in hyp)

    if verbose:
        print 'textbow: %s' % textbow
        print 'hypbow: %s' % hypbow

    if self.stop:
        textbow = textbow - self.stopwords
        hypbow = hypbow - self.stopwords

    # Collect background knowledge for every word in either bag, checking each
    # part of speech in the old-style NLTK wordnet interface (wordnet.N/.V/
    # .ADJ/.ADV), which is assumed to be imported at module level.
    bk = []
    fullbow = textbow | hypbow
    for word_text in fullbow:
        if word_text in wordnet.N:
            bk.extend(self._generate_BK_word(word_text, wordnet.N, fullbow))
        if word_text in wordnet.V:
            bk.extend(self._generate_BK_word(word_text, wordnet.V, fullbow))
        if word_text in wordnet.ADJ:
            bk.extend(self._generate_BK_word(word_text, wordnet.ADJ, fullbow))
        if word_text in wordnet.ADV:
            bk.extend(self._generate_BK_word(word_text, wordnet.ADV, fullbow))
    return bk
def tag(self, rtepair, verbose=False):
    """
    Tag a RTEPair as to whether the hypothesis can be inferred from the text.
    """
    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize import WordTokenizer
    stemmer = PorterStemmer()
    tokenizer = WordTokenizer()

    text = tokenizer.tokenize(rtepair.text)
    hyp = tokenizer.tokenize(rtepair.hyp)

    if self.stemming:
        textbow = set(stemmer.stem(word.lower()) for word in text)
        hypbow = set(stemmer.stem(word.lower()) for word in hyp)
    else:
        textbow = set(word.lower() for word in text)
        hypbow = set(word.lower() for word in hyp)

    if self.stop:
        textbow = textbow - self.stopwords
        hypbow = hypbow - self.stopwords

    overlap = float(len(hypbow & textbow)) / len(hypbow | textbow) * 100
    if verbose:
        print "Text:", textbow
        print "Hypothesis:", hypbow
        print "Overlap:", hypbow & textbow
        print 'overlap=%0.2f, value=%s' % (overlap, rtepair.value)
    if overlap >= self.threshold:
        return 1
    else:
        return 0
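# Hypothetical worked example of the overlap score computed in tag() above;
# the sentences are invented and no tagger instance is constructed here.
_textbow = set("john sat on the mat".split())
_hypbow = set("john sat".split())
_overlap = float(len(_hypbow & _textbow)) / len(_hypbow | _textbow) * 100
# _overlap == 40.0, so with a threshold of, say, 33 tag() would return 1.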
from __future__ import unicode_literals
from nltk.tokenize import PunktWordTokenizer as WordTokenizer
import random
import pprint
import scipy.sparse
import time
import itertools
import sys
import pickle
import helper
import constants

tokenizer = WordTokenizer()
int2tags = constants.int2slots
tags2int = constants.tags2int
int2citationFeilds = ['Authors', 'Date', 'Title', 'Source']
generic = ["city", "centre", "county", "street", "road", "and", "in", "town", "village"]


def filterArticles(articles):
    relevant_articles = {}
    correct = [0] * (len(int2tags) - 1)
    gold_num = [0] * (len(int2tags) - 1)
    filtered_correct = [0] * (len(int2tags) - 1)
    filtered_gold_num = [0] * (len(int2tags) - 1)
    helper.load_constants()
    # `incidents` is expected to be a module-level mapping defined elsewhere
    # in this file (not shown in this excerpt).
    print "Num incidents", len(incidents)
    print "Num unfiltered articles", len(articles)
    for incident_id in incidents.keys():
        incident = incidents[incident_id]