import os
import re

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from keras import layers
from keras.models import Model
from keras import backend as K
import tensorflow as tf
import tensorflow_hub as hub

from nlpia_bot.constants import DATA_DIR
from nlpia_bot import spacy_language_model

nlp = spacy_language_model.load("en_core_web_lg")

# Parse the TREC question-classification training set: each line looks like
# "COARSE:fine question text", e.g. "DESC:manner How did serfdom develop ?"
file_name = os.path.join(DATA_DIR, 'trec', 'train_5500.label')
with open(file_name, 'rb') as f:
    txt = f.read()
lines = txt.decode('latin').splitlines()

df = []
for idx, line in enumerate(lines):
    match = re.match(r'([A-Z]+):([a-z]+)[ ]+(.+)', line)
    # print(match.groups())
    df.append(match.groups())
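# The sklearn imports above are unused within this excerpt. Below is a minimal
# sketch (not from the source) of how the parsed rows could feed a baseline
# question classifier on spaCy document vectors; the column names and split
# parameters are illustrative assumptions.
import numpy as np

df = pd.DataFrame(df, columns=['label', 'sublabel', 'question'])  # assumed names
X = np.array([nlp(q).vector for q in df['question']])  # spaCy doc vectors
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, model.predict(X_test)))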
import time
import csv
import gzip
import logging

from tqdm import tqdm
import pandas as pd
from wikipediaapi import Wikipedia

from nlpia_bot import constants
from nlpia_bot.spacy_language_model import load
from nlpia_bot.etl.vectors import phrase_to_vec

log = logging.getLogger(__name__)

nlp = load('en_core_web_md')

TITLES = ['Chatbot', 'ELIZA', 'Turing_test', 'AIML', 'Chatterbot', 'Loebner_prize', 'Chinese_room']
EXCLUDE_HEADINGS = ['See also', 'References', 'Bibliography', 'External links']


class WikiIndex:
    """ Index of Wikipedia article titles downloaded from the official dumps """

    _url = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz'

    def __init__(self, url=None, refresh=False, **pd_kwargs):
        self._url = url or self._url
        self.df_titles = self.load(url=self._url, refresh=refresh, **pd_kwargs)
        # self.title_slug = self.df_titles.to_dict()
        # self.df_vectors = pd.DataFrame(nlp(s).vector for s in self.df_titles.index.values)
        # self.vectors = dict(zip(range(len(self.df_titles)), ))
        # Map each title to its integer row number for fast lookup
        self.title_row = dict(zip(self.df_titles.index.values, range(len(self.df_titles))))
        # FIXME: AttributeError: 'tuple' object has no attribute 'lower'
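# The load() method called in WikiIndex.__init__() is not part of this excerpt.
# A minimal sketch, assuming it streams the gzipped titles dump (one page title
# per line) straight into pandas; the helper name, column name, and read_csv
# options are assumptions, not the project's API.
def load_titles(url=WikiIndex._url, **pd_kwargs):
    pd_kwargs.setdefault('sep', '\t')  # titles contain no tabs, so one field per line
    pd_kwargs.setdefault('quoting', csv.QUOTE_NONE)  # titles may contain quote chars
    # pandas infers gzip compression from the .gz suffix and can stream from the
    # URL directly; skiprows=1 skips the dump's "page_title" header line
    df = pd.read_csv(url, names=['title'], skiprows=1, dtype=str, **pd_kwargs)
    return df.set_index('title')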
""" Pattern and template based chatbot dialog engines """ import re import pandas as pd from nlpia_bot.etl import glossaries from nlpia_bot import spacy_language_model nlp = spacy_language_model.load('en_core_web_md') class Bot: """ Bot that can reply with definitions from glossary yml files in data/faq/glossary-*.yml >>> bot = Bot() >>> bot.reply('allele') [(1.0, "I don't understand")] >>> bot.reply('What is a nucleotide?') [(1, 'The basic building blocks of DNA and RNA... """ def __init__(self, domains=('dsdh', )): global nlp self.nlp = nlp self.glossary = glossaries.load(domains=domains) self.glossary.fillna('', inplace=True) self.glossary.index = self.glossary['term'].str.lower().str.strip() self.vector = dict() self.vector['term'] = pd.DataFrame( {s: nlp(s or '').vector for s in self.glossary['term']})