Example #1
import nltk

def extract_ngram_features(message, n=N_GRAM):
    # Tokenize, keep only alphanumeric tokens, and lowercase them.
    words = nltk.word_tokenize(message)
    words = [word.lower() for word in words if word.isalnum()]
    # Collect every 1-gram through n-gram as a tuple of tokens.
    ws = set()
    for i in xrange(n):
        ws = ws.union(set(nltk.ingrams(words, i + 1)))
    # Boolean feature dict in the form NLTK's classifiers expect.
    return {word: True for word in ws}
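A minimal usage sketch (not part of the original project): it assumes nltk is installed with the punkt tokenizer data and, like the function above, a Python 2 interpreter (xrange); n is passed explicitly because the default N_GRAM is a constant defined elsewhere in the source project.

features = extract_ngram_features("the quick brown fox jumps", n=2)
# every unigram and bigram tuple now maps to True,
# e.g. ('quick',): True and ('quick', 'brown'): True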
Example #2
    def __init__(self, order, string):
        # Needs nltk's ConditionalFreqDist, FreqDist and ingrams in scope.
        self.order = order
        if self.order > 1:
            # Recursively build lower-order models to back off to.
            self.backoff = MarkovModel(order - 1, string)
            self.cfd = ConditionalFreqDist()
            self.charset = self.backoff.charset
            # Count each character given its (order - 1)-character context.
            for ngram in ingrams(string, order):
                context, char = ngram[:-1], ngram[-1]
                self.cfd[context][char] += 1
        else:
            # Base case: a plain character frequency distribution.
            self.backoff = None
            self.n = 0
            self.fd = FreqDist(string)
            self.charset = set(self.fd.keys())
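A constructor-only sketch: MarkovModel is the class this __init__ belongs to, and the rest of the class (probability or generation methods) is not shown, so only the attributes built above are referenced.

model = MarkovModel(3, "the quick brown fox jumps over the lazy dog")
# model.cfd counts each character given its two-character context,
# model.backoff is the order-2 model, which itself backs off to order 1,
# and model.charset is the set of characters seen in the training string.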
Example #3
def ngrams(l, n):
    # _remove_stopwords_lowercase is a helper from the same project;
    # ingrams comes from nltk.
    l = _remove_stopwords_lowercase(l)
    # Join each n-gram tuple back into a space-separated phrase.
    phrase_list = []
    for n_grams in ingrams(l, n):
        phrase_list.append(' '.join(n_grams))
    return phrase_list
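A usage sketch under the assumption that l is an already-tokenized list of words (that is how the _remove_stopwords_lowercase helper is applied in example #5 below); the helper itself is project code and not shown.

tokens = ['The', 'quick', 'brown', 'fox', 'jumps']
bigram_phrases = ngrams(tokens, 2)
# e.g. ['quick brown', 'brown fox', 'fox jumps'] once 'The' is dropped as a stopword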
Example #4
import re, os, glob
from sys import argv
from nltk import FreqDist, PorterStemmer, ingrams

assert len(argv) == 4, "usage: %s inputdir outputdir dev|test" % argv[0]
assert os.path.isdir(argv[1])
indir = argv[1]
# Output directory, with a trailing slash.
wordposngram = "%s/" % argv[2]
assert argv[3] in ("dev", "test")
devortest = argv[3]
# Pull the word and the POS tag out of each leaf of a bracketed parse tree, e.g. "(NN dog)".
leaves = re.compile(r" ([^ )]+)\)")
pos = re.compile(r"\(([^ ]+) [^ )]+\)")
porter = PorterStemmer()
print "extracting ngrams"
for train in glob.glob("%s/*.*.train" % indir):
    fold = int(train.split(".")[-2])
    if fold > 3: continue
    # Frequencies of stemmed "word/tag" trigrams in this training fold.
    wordpostrigrams = FreqDist(ingrams((porter.stem(word) + "/" + tag
        for t in open(train)
        for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
    for test in glob.glob("%s/*/*.%d.%s*" % (indir, fold, devortest)):
        output = "%s_%s" % (train.split("/")[-1], test.split("/")[-1])
        testtrigrams = FreqDist(ingrams((porter.stem(word) + "/" + tag
            for t in open(test).readlines()
            for word, tag in zip(leaves.findall(t), pos.findall(t))), 3))
        # Keep only test trigrams that also occur in training; write "trigram<TAB>count" lines.
        open(wordposngram + output, "w").writelines("%s\t%d\n" % (" ".join(a), b)
            for a, b in testtrigrams.iteritems() if wordpostrigrams[a])
        print output
print "done"
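Each output file gets one line per surviving trigram: the space-joined word/tag trigram, a tab, then its count in the test fold. A small reader sketch (the file name below is purely hypothetical):

counts = {}
with open("outputdir/corpus.1.train_corpus.1.dev") as f:  # hypothetical path
    for line in f:
        trigram, count = line.rstrip("\n").rsplit("\t", 1)
        counts[tuple(trigram.split(" "))] = int(count)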
Example #5
def ngram_phrases(t, n):
    # wordpunct_tokenize comes from nltk.tokenize; the stopword helper is project code.
    tokens = wordpunct_tokenize(t)
    tokens = _remove_stopwords_lowercase(tokens)
    # Deduplicated set of space-joined n-gram phrases.
    return set(' '.join(n_grams) for n_grams in ingrams(tokens, n))
Example #6
def ngram_phrases(t, n):
    # Same as the previous example, but filters with clean_words and keeps
    # every phrase, in order, as a list rather than a set.
    tokens = wordpunct_tokenize(t)
    tokens = clean_words(tokens)
    return [' '.join(n_grams) for n_grams in ingrams(tokens, n)]
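A usage sketch assuming wordpunct_tokenize is imported from nltk.tokenize and clean_words is the project's own token filter; unlike the set returned in example #5, this variant preserves order and repeats.

bigrams = ngram_phrases("the fox jumps over the lazy dog", 2)
# a list of space-joined bigram phrases; the exact contents depend on what clean_words removes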