def process_content():
    """POS-tag each sentence in the module-level `tokenized` list and draw
    a chunk tree matching optional adverbs/verbs followed by proper nouns.

    Relies on module-level `tokenized` (iterable of sentence strings) and
    `nltk`. Opens one GUI tree window per sentence via ``draw()``.
    """
    for sentence in tokenized:
        # Correct NLTK API is word_tokenize (was: words_tokenize).
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        # Chunk: zero+ adverbs, zero+ verbs, one+ proper nouns, optional noun.
        chunkgram = r"""chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkgram)
        # Must parse with the RegexpParser, not the grammar string itself.
        chunked = chunkParser.parse(tagged)
        chunked.draw()
def process_content():
    """POS-tag each sentence in the module-level `tokenized` list and draw
    a chink tree: chunk everything, then remove (chink) verbs, prepositions,
    determiners and "to".

    Relies on module-level `tokenized` (iterable of sentence strings) and
    `nltk`. Opens one GUI tree window per sentence via ``draw()``.

    NOTE(review): redefines `process_content` from the earlier block —
    presumably a tutorial progression; the later definition wins at runtime.
    """
    for sentence in tokenized:
        # Correct NLTK API is word_tokenize (was: words_tokenize).
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        # {<.*>+} chunks everything; }<...>{ chinks (removes) VB*/IN/DT/TO.
        chunkgram = r"""chunk: {<.*>+} }<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkgram)
        # Must parse with the RegexpParser, not the grammar string itself.
        chunked = chunkParser.parse(tagged)
        chunked.draw()
def bag_words(s, words):
    """Return a one-hot bag-of-words vector for sentence `s`.

    Tokenizes and stems `s` (using the module-level `stemmer`), then marks
    a 1 at every position in `words` whose vocabulary entry appears in the
    sentence.

    Parameters
    ----------
    s : str
        Input sentence.
    words : sequence of str
        Stemmed, lower-cased vocabulary.

    Returns
    -------
    numpy.ndarray
        Vector of 0/1 ints with ``len(words)`` entries.
    """
    bag = [0] * len(words)
    # Correct NLTK API is word_tokenize (was: words_tokenize).
    s_words = nltk.word_tokenize(s)
    s_words = [stemmer.stem(word.lower()) for word in s_words]
    for se in s_words:
        for i, w in enumerate(words):
            if w == se:
                # bag[i] is an int; the original bag[i].append(1) raised
                # AttributeError. Setting the flag is the intended behavior.
                bag[i] = 1
    return numpy.array(bag)
import pickle

# Load the intents file (question/tag pairs). `json` is assumed to be
# imported earlier in the file.
with open("cauhoi.json") as file:
    data = json.load(file)

# Try the cached, preprocessed dataset first; rebuild it on a cache miss.
try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
# Narrowed from a bare `except:` (which would also swallow
# KeyboardInterrupt/SystemExit) to the realistic cache-miss failures.
except (FileNotFoundError, EOFError, pickle.UnpicklingError, ValueError):
    words = []
    labels = []
    docs_x = []  # tokenized patterns
    docs_y = []  # tag for each pattern in docs_x

    for cauhoi in data["cauhoi"]:
        for question in cauhoi["question"]:
            # Correct NLTK API is word_tokenize (was: words_tokenize).
            wrds = nltk.word_tokenize(question)
            words.extend(wrds)
            docs_x.append(wrds)
            docs_y.append(cauhoi["tag"])
        if cauhoi["tag"] not in labels:
            labels.append(cauhoi["tag"])

    # Stem and lower-case the vocabulary, dropping bare "?" tokens,
    # then deduplicate and sort for a stable index order.
    words = [stemmer.stem(w.lower()) for w in words if w != "?"]
    words = sorted(set(words))
    labels = sorted(labels)

    training = []
    output = []
    out_empty = [0 for _ in range(len(labels))]