import os
import time
import unicodedata

# db_path, valid_chars and ignore_lemmas, as well as FastNeuralNet, Crawler,
# FaqFile, Lemmatizer and SpellChecker, are assumed to be defined elsewhere
# in this package.


class FaqQuery:
    """Interface to train and later query a FAQ file using a neural network"""

    def __init__(self, faq_file):
        self.verbose_level = 1
        self.make_dirs()
        base_name = os.path.basename(faq_file)
        self.nn = FastNeuralNet(db_path + base_name + "_nn.db")
        # Not crawling data; from Crawler we only use a couple of methods
        # to get word IDs and URL IDs
        self.crawler = Crawler(db_path + base_name + "_index.db")
        self.set_faq_file(faq_file)
        self.sp = None

    def make_dirs(self):
        """Make directories needed to run the application"""
        if not os.path.exists(db_path):
            os.makedirs(db_path)

    def set_faq_file(self, faqfile):
        """Set the current FAQ file"""
        # Read FAQ file and get a dictionary with pairs:
        # (AFIP question ID, question string)
        self.faq = FaqFile(faqfile)
        # Map: URL ID -> [AFIP question ID, question string]
        self.faqdict = {}
        # When reading this code, think of "URL" as "neural net output"
        # TODO refactor & replace all occurrences of "url" with "output"
        self.urlids = []
        # Translate AFIP question IDs to URL IDs
        for k, v in self.faq.data.iteritems():
            urlid = self.crawler.geturlid(k)
            self.urlids.append(urlid)
            self.faqdict[urlid] = [k, v]

    def normalize(self, s):
        wsz = ''.join(x for x in unicodedata.normalize('NFKD', unicode(s))
                      if x in valid_chars).lower()
        #print "sanitized %s -> %s" % (word, wsz)
        return wsz

    def parse_sentence(self, s):
        keywords = []
        # Lemmatize sentence and only keep verbs, nouns, dates and PTs
        l = Lemmatizer()
        lemmas = l.lemmatize(s)
        lemmas = l.filter(lemmas, ['V', 'N', 'W', 'PT'])
        # Normalize lemmas
        for l in lemmas:
            if l['tag'] == 'W':
                norm_lemma = l['lemma']
            else:
                norm_lemma = self.normalize(l['lemma'])
            if len(norm_lemma) > 0 and norm_lemma not in ignore_lemmas:
                keywords.append(norm_lemma)
        self.vprint("Keywords: ", keywords)
        return [self.crawler.getwordid(word) for word in keywords]

    def train(self, iters=100, print_tests=False):
        """Train the neural network with the given FAQ file.
        Must be a valid JSON file"""
        self.__make_train_cache()
        try:
            for i in range(1, iters + 1):
                self.vprint("\n\n******** ITERATION %d ********\n\n" % i)
                self.__train()
                if print_tests:
                    self.__print_partial_test_results(i, iters)
                    if i % 10 == 0 or i == iters:
                        self.__print_test_results()
        except KeyboardInterrupt:
            print "Aborted!"

    def __make_train_cache(self):
        """For each question in the FAQ, build a cache with the parsed data
        and the URL ID"""
        self.parsed_data = {}
        self.url_id = {}
        for k, v in self.faq.data.iteritems():
            self.parsed_data[k] = self.parse_sentence(v)
            self.url_id[k] = self.crawler.geturlid(k)

    def __train(self):
        c = 1
        total = len(self.faq.data)
        for k, v in self.faq.data.iteritems():
            self.vprint("%d/%d Training question %s: %s" % (c, total, k, v))
            starttime = time.time()
            exp_url_id = self.url_id[k]
            wordids = self.parsed_data[k]
            ############################## CHECK ##############################
            # Train with bigrams and 3-grams? e.g. min = 2, max = 3
            # Or the whole query? e.g. min = max = 1000
            min_ngram_len = 1000
            max_ngram_len = 1000
            # Train passing *all* urlids? e.g.:
            urlsubset = self.urlids
            # Or just a random subset? e.g.:
            #urlsubset = [exp_url_id]
            #for i in range(10):
            #    urlsubset.append(choice(self.urlids))
            ####################################################################
            for ngram_len in range(min_ngram_len, max_ngram_len + 1):
                for i in range(max(1, len(wordids) - ngram_len)):
                    self.nn.trainquery(wordids[i:i + ngram_len], urlsubset,
                                       exp_url_id)
            self.vprint("Done in %f secs\n" % (time.time() - starttime))
            c += 1

    def __print_partial_test_results(self, iteration, total):
        """Print some partial test results"""
        #qids = ["467563"]
        qids = ["138165"]
        queries = ['en que bancos puedo realizar el pago',
                   'bancos pago',
                   'pago',
                   'recategorizarme?',
                   'tomar para recategorizarme?',
                   'que debo tomar para recategorizarme?']
        for qid in qids:
            scores = [0.0] * len(queries)
            for q in queries:
                test_result = self.query(q, 300)
                for r in test_result:
                    if r[1] == qid:
                        scores[queries.index(q)] = r[0]
                        break
            print iteration, qid, "Scores: ", scores

    def __print_test_results(self):
        fails = 0
        passes = 0
        padding = 10
        print "\n*Expected*".rjust(padding), "*Got*".rjust(padding)
        for k, v in self.faq.data.iteritems():
            result = self.query(v, 300)
            best_result = result[0][1]
            print k.rjust(padding), best_result.rjust(padding),
            if k == best_result:
                passes += 1
                print "Ok"
            else:
                fails += 1
                print "Failed!",
                for i in range(len(result)):
                    if result[i][1] == k:
                        print "Exp response found at position %d" % (i + 1)
                        break
                print " | \---> ", result[0][2]
                print " \---------------> ", v
        total = float(passes + fails)
        print "\nSummary:"
        print " * Passed: %d (%%%.2f)" % (passes, passes / total * 100)
        print " * Failed: %d (%%%.2f)" % (fails, fails / total * 100)

    def query(self, q, N=10):
        """Get results for query q using the currently trained database and
        return the N best answers"""
        urlids = self.urlids
        wordids = self.parse_sentence(q)
        result = self.nn.getresult(wordids, urlids)
        # result is hard to read, so we build a user-friendly result with
        # the form [QuestionRelevance, QuestionID, QuestionString]
        uf_result = []
        for i in range(len(urlids)):
            q = self.faqdict[urlids[i]]
            uf_result.append([result[i], q[0], q[1]])
        # Sort by relevance
        uf_result.sort(reverse=True)
        return uf_result[0:N]

    def spell_check(self, q):
        self.__init_spell_checker()
        return self.sp.correct_sentence(q)

    def __init_spell_checker(self):
        if self.sp:
            return
        faq_corpus = ""
        for v in self.faq.data.itervalues():
            faq_corpus += v + " "
        # TODO provide a bigger corpus!
        self.sp = SpellChecker(faq_corpus)

    def test(self):
        self.__make_train_cache()
        self.__print_test_results()

    def vprint(self, *args, **keys):
        level = 1
        if 'level' in keys:
            level = keys['level']
        if self.verbose_level >= level:
            for arg in args:
                print arg
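# Example usage (a minimal sketch, not part of the original module): train the
# network on a FAQ file and run a query against it. The file name "faq.json"
# and the query string are illustrative only; any FAQ file in the JSON format
# expected by FaqFile should work.
if __name__ == '__main__':
    fq = FaqQuery("faq.json")
    # train() defaults to 100 iterations; a handful is enough for a quick try
    fq.train(iters=10)
    # query() returns up to N entries of the form
    # [relevance, question ID, question string], best match first
    for relevance, qid, question in fq.query("donde puedo pagar?", N=5):
        print relevance, qid, question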
from searchengine import Crawler

pagelist = ['http://kiwitobes.com/wiki/Perl.html']
crawler = Crawler()
# crawl() is an instance method, so call it on the crawler object
crawler.crawl(pagelist)
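# A small follow-up sketch (illustrative, not from the original script): the
# FaqQuery class above only uses the crawler index for its word-ID and URL-ID
# lookups, so once pages are indexed the same calls are available here. The
# word and URL below are arbitrary examples.
print crawler.getwordid('perl')
print crawler.geturlid('http://kiwitobes.com/wiki/Perl.html')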