Example #1
def _make_qtype_data(verbose=False):
    # Builds nested n-gram counts per coarse question type:
    # data[type_][0] is the total token count for that type, while
    # counts[t][0], counts[t][u][0], and counts[t][u][v] hold the
    # unigram, bigram, and trigram counts respectively.
    data = {}
    for fileid in qc.fileids():
        if verbose:
            print(fileid)
        for type_, sent in qc.tuples(fileid):
            # Keep only the coarse category, e.g. 'DESC:manner' -> 'DESC'.
            type_ = type_.split(':')[0]
            if type_ not in data:
                data[type_] = {0: 0}
            counts = data[type_]
            tokens = _qtype_tokens(sent.split(' '))
            for i, t in enumerate(tokens):
                counts[0] += 1
                if t not in counts:
                    counts[t] = {0: 0}
                counts[t][0] += 1
                if i + 1 < len(tokens):
                    if tokens[i + 1] not in counts[t]:
                        counts[t][tokens[i + 1]] = {0: 0}
                    counts[t][tokens[i + 1]][0] += 1
                if i + 2 < len(tokens):
                    if tokens[i + 2] not in counts[t][tokens[i + 1]]:
                        counts[t][tokens[i + 1]][tokens[i + 2]] = 0
                    counts[t][tokens[i + 1]][tokens[i + 2]] += 1
    return data
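A minimal usage sketch for the function above, assuming the NLTK qc corpus has been downloaded and that _qtype_tokens (a helper not shown in the snippet) simply lowercases its input:

from nltk.corpus import qc

def _qtype_tokens(words):
    # Hypothetical stand-in for the missing helper: lowercase each token.
    return [w.lower() for w in words]

data = _make_qtype_data()
counts = data['DESC']             # counts for one coarse type, e.g. DESC
print(counts[0])                  # total tokens seen for this type
print(counts['what'][0])          # unigram count for 'what'
print(counts['what']['is'][0])    # bigram count for 'what is'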
Example #2
File: qc.py Project: pandaops/hackU
import nltk
from nltk.corpus import qc
import random
import string

s = qc.tuples()

temp = []
bad = ['the']

# Treat punctuation marks as stop tokens too.
for x in string.punctuation:
    bad.append(x)

for x, y in s:
    temp += nltk.word_tokenize(y)

all_words = nltk.FreqDist(w.lower() for w in temp if w.isalpha() and w not in bad)

# dict views are not sliceable in Python 3; take the 800 most frequent words.
word_features = [w for w, _ in all_words.most_common(800)]

def qc_features(question):
    words = [w for w in question.split() if w not in bad]
    features = {}
    # The first two words (reversed) are a strong cue for question type.
    features['(Words are)'] = words[1] + ' ' + words[0]
    words = set(words)
    # Bag-of-words features over the 800 most frequent corpus words.
    for word in word_features:
        features['contains(%s)' % word] = (word in words)
    return features
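A quick illustrative check of the feature extractor above (assuming the snippet has been run; whether a given contains(...) key exists depends on the 800 words selected from the corpus):

feats = qc_features("What is the capital of France ?")
print(feats['(Words are)'])            # 'is What'
print(feats.get('contains(capital)')）  # True, or None if 'capital' was not selected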
Example #3
import nltk
from nltk import ngrams
from nltk.corpus import qc
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

# Category labels: Int -> 1 (interrogative), Dec -> 0 (declarative)
categoryInterrogative = "Int"
categoryDeclarative = "Dec"

# print(qc.raw("test.txt"))

trainData = open("newTrainData30k.txt").read()
trainData = nltk.sent_tokenize(trainData)

qc_train = qc.tuples("train.txt")
traindocuments = [x[1] for x in qc_train]
trainData = trainData[:10000]

qc_testInt = qc.tuples("test.txt")
testdocuments = [x[1] for x in qc_testInt]

testDec = open("RawTestingDataDeclarative.txt").read()
testDec = nltk.sent_tokenize(testDec)


def findFeatures(documents, isInterrogative):
    features = {}
    for sentence in documents:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        # The original snippet is truncated here; a minimal assumed
        # completion that keeps each sentence's POS tags with its label.
        features[sentence] = ([tag for _, tag in tagged], isInterrogative)
    return features
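A hedged usage sketch; the return shape is an assumption, since the original function is cut off mid-body:

intFeatures = findFeatures(testdocuments, True)   # qc test questions, labeled interrogative
decFeatures = findFeatures(testDec, False)        # raw sentences, labeled declarative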
Example #4
##    try:
##        for i in tokenized:
##            words = nltk.word_tokenize(i)
##            tagged = nltk.pos_tag(words)
##            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
##            chunkParser = nltk.RegexpParser(chunkGram)
##            chunked = chunkParser.parse(tagged)
##            chunked.draw()
##
##    except Exception as e:
##        print(str(e))
##
##process_content()

##### Tutorial 11 (text classification) ######
import random
import nltk
from nltk.corpus import movie_reviews, qc

print(qc.tuples()[0][0])
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]
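In the tutorial this snippet comes from, word_features typically feeds a per-document feature extractor and a Naive Bayes classifier; a sketch of that assumed continuation:

def find_features(document):
    words = set(document)
    return {w: (w in words) for w in word_features}

featuresets = [(find_features(doc), category) for (doc, category) in documents]
train_set, test_set = featuresets[:1900], featuresets[1900:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))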
Example #5
from io import StringIO

import nltk
from nltk.corpus import reuters, qc
from nltk.corpus.reader.util import read_sexpr_block

# 'reader' and 'pos' carry over from earlier in the original script
# (a seekable stream reader and a position saved with reader.tell()).
print(pos)
print(reader.readline())
print(reader.seek(pos))  # rewind to the position from tell.
print(reader.readline())
# squashed bugs
f = StringIO(b"""
(a b c)
# This line is a comment.
(d e f\ng h)""".decode('ascii'))
print(read_sexpr_block(f, block_size=38, comment_char='#'))
print(read_sexpr_block(f, block_size=38, comment_char='#'))
f = StringIO(b"""
This file ends mid-sexpr
(hello (world""".decode('ascii'))
for i in range(3):
    print(read_sexpr_block(f))
f = StringIO(b"This file has no trailing whitespace.".decode('ascii'))
for i in range(3):
    print(read_sexpr_block(f))
# Bug fixed in 5279:
f = StringIO(b"a b c)".decode('ascii'))
for i in range(3):
    print(read_sexpr_block(f))
sents = nltk.corpus.brown.sents()
print(sents[6000])
print(sents[6000])  # repeated access; the corpus view should serve it from cache
print(reuters.words('training/13085'))
print(reuters.words('training/5082'))
nltk.download('qc')
print(qc.tuples('test.txt'))
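For reference, qc.tuples() yields (label, question) pairs, with the coarse category before the colon in the label; a small illustrative sketch:

label, question = qc.tuples('train.txt')[0]
print(label)                # e.g. 'DESC:manner'
print(label.split(':')[0])  # coarse category, e.g. 'DESC'
print(question)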