def extract_ngram_features(message, n=N_GRAM):
    """Return a boolean feature dict of every 1..n-gram in *message*.

    Tokens are lowercased and any non-alphanumeric token is discarded.
    Keys are n-gram tuples; every value is True (NLTK-classifier style).
    """
    tokens = [tok.lower() for tok in nltk.word_tokenize(message) if tok.isalnum()]
    grams = set()
    # Collect all gram sizes from unigrams up to n-grams.
    for size in xrange(1, n + 1):
        grams.update(nltk.ingrams(tokens, size))
    return {gram: True for gram in grams}
def __init__(self, order, string):
    """Build a character Markov model of the given *order* over *string*,
    recursively constructing lower-order backoff models down to order 1."""
    self.order = order
    if self.order > 1:
        # Recursive backoff chain: the (order-1) model is built first and
        # supplies the character set for this level.
        self.backoff = MarkovModel(order - 1, string)
        self.cfd = ConditionalFreqDist()
        self.charset = self.backoff.charset
        # Count each character conditioned on its preceding (order-1) context.
        for ngram in ingrams(string, order):
            context, char = ngram[:-1], ngram[-1]
            self.cfd[context][char] += 1
    else:
        # Base case (order == 1): a plain frequency distribution, no backoff.
        self.backoff = None
        self.n = 0  # NOTE(review): role of self.n not visible in this chunk — confirm against callers
        self.fd = FreqDist(string)
        self.charset = set(self.fd.keys())
def ngrams(l, n):
    """Return the n-grams of token sequence *l* as space-joined phrase strings."""
    tokens = _remove_stopwords_lowercase(l)
    return [' '.join(gram) for gram in ingrams(tokens, n)]
import re, os, glob from sys import argv from nltk import FreqDist, PorterStemmer, ingrams assert len(argv) == 4, "usage: %s inputdir outputdir dev|test" % argv[0] assert os.path.isdir(argv[1]) indir = argv[1] wordposngram = "%s/" % argv[2] assert argv[3] in ("dev", "test") devortest=argv[3] leaves = re.compile(r" ([^ )]+)\)") pos = re.compile(r"\(([^ ]+) [^ )]+\)") porter = PorterStemmer() print "extracting ngrams" for train in glob.glob("%s/*.*.train" % indir): fold = int(train.split(".")[-2]) if fold > 3: continue wordpostrigrams = FreqDist(ingrams((porter.stem(word)+"/"+tag for t in open(train) for word, tag in zip(leaves.findall(t), pos.findall(t))), 3)) for test in glob.glob("%s/*/*.%d.%s*" % (indir, fold, devortest)): output = "%s_%s" % (train.split("/")[-1], test.split("/")[-1]) testtrigrams = FreqDist(ingrams((porter.stem(word)+"/"+tag for t in open(test).readlines() for word,tag in zip(leaves.findall(t), pos.findall(t))), 3)) open(wordposngram+output, "w").writelines("%s\t%d\n" % (" ".join(a), b) for a, b in testtrigrams.iteritems() if wordpostrigrams[a]) print output print "done"
def ngram_phrases(t, n):
    """Return the set of distinct n-gram phrases (space-joined) in text *t*."""
    words = _remove_stopwords_lowercase(wordpunct_tokenize(t))
    return {' '.join(gram) for gram in ingrams(words, n)}
def ngram_phrases(t, n):
    """Return all n-gram phrases of text *t* as a list of space-joined strings."""
    cleaned = clean_words(wordpunct_tokenize(t))
    return [' '.join(gram) for gram in ingrams(cleaned, n)]
from sys import argv from nltk import FreqDist, PorterStemmer, ingrams assert len(argv) == 4, "usage: %s inputdir outputdir dev|test" % argv[0] assert os.path.isdir(argv[1]) indir = argv[1] wordposngram = "%s/" % argv[2] assert argv[3] in ("dev", "test") devortest = argv[3] leaves = re.compile(r" ([^ )]+)\)") pos = re.compile(r"\(([^ ]+) [^ )]+\)") porter = PorterStemmer() print "extracting ngrams" for train in glob.glob("%s/*.*.train" % indir): fold = int(train.split(".")[-2]) if fold > 3: continue wordpostrigrams = FreqDist( ingrams((porter.stem(word) + "/" + tag for t in open(train) for word, tag in zip(leaves.findall(t), pos.findall(t))), 3)) for test in glob.glob("%s/*/*.%d.%s*" % (indir, fold, devortest)): output = "%s_%s" % (train.split("/")[-1], test.split("/")[-1]) testtrigrams = FreqDist( ingrams( (porter.stem(word) + "/" + tag for t in open(test).readlines() for word, tag in zip(leaves.findall(t), pos.findall(t))), 3)) open(wordposngram + output, "w").writelines("%s\t%d\n" % (" ".join(a), b) for a, b in testtrigrams.iteritems() if wordpostrigrams[a]) print output print "done"