class Engine(object):

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, termInfo):
        if termInfo[-1] is not None:
            return self.postings.list_at_offset(termInfo[-1])
        return None

    def execute_query(self, reverse_polish):
        args = []
        while reverse_polish:
            token = reverse_polish.popleft()
            if not isinstance(token, Operator):
                dterm = self.dictionary.term(token)
                postings_list = self._get_postings(dterm)
                args.append(postings_list)
            else:
                if isinstance(token, NOTOperator):
                    args.append(self.postings.not_list())
                # print '\nExecuting ', token, ' for args: ', str(args), '\n'
                for i in range(len(args)):
                    if args[i] is not None and args[i]._entries_len == 0:
                        args[i] = None
                splitpoint = -1 * token.nargs
                o_args = args[splitpoint:]
                args = args[:splitpoint] + [token.execute(o_args)]
        return args[-1]
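# A minimal, self-contained sketch of the same postfix (RPN) evaluation idea used by
# Engine.execute_query above, but over plain Python sets instead of on-disk postings.
# Everything here (AndOp, NotOp, evaluate_rpn, the sample index) is hypothetical and
# only illustrates how operands and operators are consumed from the deque left to right.
from collections import deque


class AndOp:
    nargs = 2

    def execute(self, args):
        a, b = args
        return a & b


class NotOp:
    nargs = 2

    def execute(self, args):
        # mirrors the original: the "all documents" list is pushed just before NOT runs
        a, universe = args
        return universe - a


def evaluate_rpn(tokens, index, universe):
    """tokens: deque of terms and operator instances in reverse Polish order."""
    args = []
    while tokens:
        token = tokens.popleft()
        if not isinstance(token, (AndOp, NotOp)):
            args.append(index.get(token, set()))
        else:
            if isinstance(token, NotOp):
                args.append(universe)  # plays the role of postings.not_list()
            operands = args[-token.nargs:]
            args = args[:-token.nargs] + [token.execute(operands)]
    return args[-1]


if __name__ == "__main__":
    index = {"bill": {1, 2, 5}, "gates": {2, 3}}
    universe = {1, 2, 3, 4, 5}
    # "bill AND (NOT gates)" in postfix order: bill gates NOT AND
    query = deque(["bill", "gates", NotOp(), AndOp()])
    print(evaluate_rpn(query, index, universe))  # {1, 5}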
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir), key=lambda x: x)
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = training_file
        doc_path = osp.join(training_data_dir, training_file)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # turn line nos to byte offsets
    f = open(postings_file)
    current_line = 0
    while True:
        term = dictionary.term_for_offset(current_line)
        dictionary.add_term(term, f.tell(), update_freq=False)
        line = f.readline()
        if not line:
            break
        current_line += 1

    dictionary.generate_idf(len(training_files))
    dictionary.save()
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')
    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1

    for docID in os.listdir(in_dir):
        f = open(f'{in_dir}/{docID}', 'r')
        content_tokens = word_tokenize(f.read())
        for word in content_tokens:
            term = stemmer.stem(word=word).lower()
            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_to_offset(old_offset, docID)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_to_offset(offset, docID)
                offset += 1
            dictionaries.increment_frequency(term)

    postings.save_to_file(dictionaries)
    dictionaries.save_to_file()
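# A tiny, runnable illustration of the tokenise -> stem -> case-fold pipeline that
# build_index applies to every document. The NLTK calls below are the real APIs used
# above (word_tokenize needs the 'punkt' tokenizer data downloaded); the sample
# sentence and the expected output are made up for illustration.
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
sample = "Retrieval systems index stemmed tokens."
terms = [stemmer.stem(word).lower() for word in word_tokenize(sample)]
print(terms)  # e.g. ['retriev', 'system', 'index', 'stem', 'token', '.']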
def test4(self):
    po = Postings(TEST_DIR)
    p = po.get('bericht 1')
    self.assertEqual(p.content(), 'bericht 1 content')
    self.assertEqual(p.content(), 'bericht 1 content')
    self.assertEqual(p.content(), 'bericht 1 content')
def test6(self):
    po = Postings(TEST_DIR)
    l = po.latest(1)
    self.assertEqual(l[0].title(), 'bericht 2')
    l = po.latest(1, reverse=True)
    self.assertEqual(l[0].title(), 'bericht 1')
def test2(self):
    po = Postings(TEST_DIR)
    p = po.get('bericht 2')
    d1 = p.date()
    d2 = datetime.datetime.now()
    self.assertEqual((d2 - d1).seconds, 0)
    time.sleep(2)
    d3 = datetime.datetime.now()
    self.assertEqual((d3 - d1).seconds, 2)
def __init__(self, company_id="demo"):
    self.graph_id = company_id
    self.orphans = set()
    self.orphan_list = list()
    self.node_map = dict()
    self.node_id_map = dict()
    self.db_info = dict()
    self.search_postings = Postings()

    # get utils for intents.
    with open(os.path.realpath("chatbot/intentUtils.json")) as data_file:
        self.graph_utils = json.load(data_file)

    # replace escaped slashes by a single slash.
    for class_string in self.graph_utils["class"]:
        if type(self.graph_utils["class"][class_string]) is str:
            self.graph_utils["class"][class_string] = self.graph_utils[
                "class"][class_string].replace('\\\\', '\\')
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')
    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1
    count = len(os.listdir(in_dir))

    for docID in os.listdir(in_dir):
        f = open(f'{in_dir}/{docID}', 'r')
        content = f.read()
        sentences = sent_tokenize(content)
        doc_terms = []
        for sentence in sentences:
            for word in word_tokenize(sentence):
                term = stemmer.stem(word=word.lower())
                doc_terms.append(term)

        # Calculate weighted term frequencies for each term
        weighted_term_freqs = [(x[0], get_term_frequency_weight(x[1]))
                               for x in Counter(doc_terms).most_common()]

        # Calculate document vector length
        doc_length = math.sqrt(
            sum(map(lambda x: x[1] * x[1], weighted_term_freqs)))

        for term, normalised_tf in weighted_term_freqs:
            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_tf_to_offset(old_offset, docID,
                                                normalised_tf / doc_length)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_tf_to_offset(offset, docID,
                                                normalised_tf / doc_length)
                offset += 1

    postings.save_to_file(dictionaries, count)
    dictionaries.save_to_file()
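# The helper get_term_frequency_weight is not shown in this snippet; under the usual
# lnc document weighting it would be the sub-linear 1 + log10(tf) weight. Below is a
# minimal sketch of that assumption together with the cosine-normalisation step used
# above; the sample terms are made up.
import math
from collections import Counter


def get_term_frequency_weight(tf):
    # assumed: logarithmic term-frequency weighting
    return 1 + math.log10(tf) if tf > 0 else 0.0


doc_terms = ["retriev", "index", "index", "index", "queri"]
weighted = [(t, get_term_frequency_weight(c))
            for t, c in Counter(doc_terms).most_common()]
doc_length = math.sqrt(sum(w * w for _, w in weighted))
normalised = {t: w / doc_length for t, w in weighted}
print(normalised)  # unit-length document vector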
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir), key=lambda x: int(x))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # turn line nos to byte offsets
    f = open(postings_file)
    current_line = 1
    f.readline()  # skip postings list containing all doc ids
    while True:
        term = dictionary.term_for_offset(current_line)
        dictionary.add_term(term, f.tell())
        line = f.readline()
        if not line:
            break
        current_line += 1

    dictionary.save()
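# A self-contained illustration of the "line numbers -> byte offsets" pass above: once
# the postings file has been written, the offset at which each line starts is recorded
# so that a later search can seek() straight to a term's postings list. The file layout
# here is hypothetical (one space-separated postings list per line) and an in-memory
# StringIO stands in for the real file.
import io

postings_text = "1 2 3 4 5\n1 3\n2 4 5\n"
offsets = []
with io.StringIO(postings_text) as f:
    while True:
        offsets.append(f.tell())   # position where the next line begins
        if not f.readline():
            offsets.pop()          # the last tell() pointed past the final line
            break
print(offsets)  # offsets of each postings line, e.g. [0, 10, 14]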
class Searcher(object):

    def __init__(self, dictionary_file, postings_file):
        self.dictionary = Dictionary(dictionary_file)
        self.postings = Postings(postings_file)
        self.dictionary.load()
        self.all_docs = self.postings.load_list(0)

    # evaluates a query assuming it is in RPN
    def evaluate_query(self, parsed_query):
        stack = []
        while len(parsed_query) != 0:
            element = parsed_query.pop(0)
            if element == 'NOT':
                operand = stack.pop()
                stack.append(self.evaluate_NOT(operand))
            elif element == 'AND':
                first_operand = stack.pop()
                second_operand = stack.pop()
                stack.append(self.evaluate_AND(first_operand, second_operand))
            elif element == 'OR':
                first_operand = stack.pop()
                second_operand = stack.pop()
                stack.append(self.evaluate_OR(first_operand, second_operand))
            else:
                stack.append(element)
        value = stack.pop()
        if not isinstance(value, list):
            offset = self.dictionary.get_offset(value)
            value = self.postings.load_list(offset)
        return value

    def evaluate_AND(self, first, second):
        if not isinstance(first, list):
            offset = self.dictionary.get_offset(first)
            first = self.postings.load_list(offset)
        if not isinstance(second, list):
            offset = self.dictionary.get_offset(second)
            second = self.postings.load_list(offset)
        return skip_intersection(first, second)

    def evaluate_OR(self, first, second):
        if not isinstance(first, list):
            offset = self.dictionary.get_offset(first)
            first = self.postings.load_list(offset)
        if not isinstance(second, list):
            offset = self.dictionary.get_offset(second)
            second = self.postings.load_list(offset)
        return union(first, second)

    def evaluate_NOT(self, operand):
        if not isinstance(operand, list):
            offset = self.dictionary.get_offset(operand)
            operand = self.postings.load_list(offset)
        return difference(self.all_docs, operand)
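# The merge helpers referenced by Searcher (skip_intersection, union, difference) are
# defined elsewhere. Below is a minimal sketch of plain linear merges over sorted
# doc-id lists, without the skip pointers the real skip_intersection presumably uses;
# the sample lists are made up.
def union(first, second):
    i = j = 0
    out = []
    while i < len(first) and j < len(second):
        if first[i] == second[j]:
            out.append(first[i])
            i += 1
            j += 1
        elif first[i] < second[j]:
            out.append(first[i])
            i += 1
        else:
            out.append(second[j])
            j += 1
    out.extend(first[i:])
    out.extend(second[j:])
    return out


def difference(first, second):
    # documents in `first` that are not in `second` (the NOT case above)
    i = j = 0
    out = []
    while i < len(first) and j < len(second):
        if first[i] == second[j]:
            i += 1
            j += 1
        elif first[i] < second[j]:
            out.append(first[i])
            i += 1
        else:
            j += 1
    out.extend(first[i:])
    return out


print(union([1, 3, 5], [2, 3, 6]))       # [1, 2, 3, 5, 6]
print(difference([1, 2, 3, 4], [2, 4]))  # [1, 3]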
def build_index(directory, dictionary_file, postings_file):
    files = os.listdir(directory)
    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    stemmer = nltk.stem.porter.PorterStemmer()
    last = ''

    for doc_id in files:
        postings.add_doc(doc_id)
        line_number = 1
        # Use linecache to get line
        line = linecache.getline(os.path.join(directory, doc_id), line_number)
        while line != '':
            # tokenize lines into sentences
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                # tokenize sentence
                tokens = nltk.word_tokenize(sentence)
                for token in tokens:
                    # apply stemming and case folding
                    stemmed_token = stemmer.stem(token).lower()
                    # if term already exists in dictionary, we find row number
                    if dictionary.has_term(stemmed_token):
                        offset = dictionary.get_offset(stemmed_token)
                        result = postings.add_doc_id(doc_id, offset)
                        # Result indicates if the doc id is new
                        if result:
                            dictionary.increment_frequency(stemmed_token)
                    # else, we add it to dictionary and postings
                    else:
                        offset = postings.add_new_term()
                        postings.add_doc_id(doc_id, offset)
                        dictionary.add_new_term(stemmed_token, offset)
            line_number += 1
            line = linecache.getline(os.path.join(directory, doc_id), line_number)

    # save data
    postings.save(dictionary)
    dictionary.save()
class Engine(object):

    NUM_RESULTS = 10

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, offset):
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [
            heapq.heappop(scores_heap)[1] for i in xrange(n)
            if len(scores_heap) > 0
        ]

    def execute_query(self, query_map):
        scores = {}
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # find top n
        top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS)
        return " ".join(str(x) for x in top_n_docs)
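# execute_query above expects query_map to map each query term to its raw frequency in
# the query; the engine's tf() helper (not shown here) then applies the log weighting.
# A minimal sketch of how such a map might be built from free text, using the same
# NLTK stemming/tokenising seen in the indexers above (the query string and the
# build_query_map name are made up; 'punkt' tokenizer data is required).
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize


def build_query_map(query):
    stemmer = PorterStemmer()
    return dict(Counter(stemmer.stem(w).lower() for w in word_tokenize(query)))


print(build_query_map("cheap cheap flights"))  # e.g. {'cheap': 2, 'flight': 1}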
def run_search(dict_file, postings_file, queries_file, results_file):
    """
    Using the given dictionary file and postings file,
    perform searching on the given queries file and output the results to a file
    """
    print('Running search on the queries...')
    dictionaries = Dictionaries(dict_file)
    dictionaries.load()
    postings = Postings(postings_file)
    searcher = Searcher(dictionaries, postings)

    result_string = ''
    with open(queries_file, 'r') as f, open(results_file, 'w') as o:
        for i, query in enumerate(f):
            searcher.set_query(query.strip())
            output = searcher.evaluate_query()
            result_string += output.strip() + '\n'
            searcher.clear_postings()
        o.write(result_string.strip())
def __init__(self, dictionary_file, postings_file):
    self.dictionary = Dictionary(dictionary_file)
    self.postings = Postings(postings_file)
    self.dictionary.load()
    self.all_docs = self.postings.load_list(0)
def test5(self):
    po = Postings(TEST_DIR)
    p = po.get('bericht 2')
    self.assertIsNone(p.content())
class feedbackEngine(object):
    """
    Search engine that uses relevance feedback with a
    vector space model to retrieve patents
    """

    global NUM_RESULTS
    global QUERY_WEIGHT
    global P_FEEDBACK_WEIGHT
    NUM_RESULTS = 10
    QUERY_WEIGHT = 0.5
    P_FEEDBACK_WEIGHT = 0.5

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')
        self.feedback = False

    def _get_postings(self, offset):
        """ This method gets the postings list at an offset """
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        """ This method accumulates scores for a term """
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        """ This method normalises scores for every document """
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        """ This method creates a heap of the docs and picks out the top few """
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1]
                for i in xrange(n) if len(scores_heap) > 0]

    def relevance_feedback(self, query_map, top_n_docs):
        """ This method expands the query based on pseudo relevance feedback """
        self.feedback = True
        vector_sum = {}
        term_dict = self.dictionary._terms

        # constructing the document vector for the top n docs
        for term in term_dict:
            term_offset = term_dict[term][1]

            # unknown term, skip everything, score 0
            if term_offset is None or term is None:
                continue

            # adding the term frequencies of all the documents in top_n_docs
            postings_list = self._get_postings(term_offset)
            for doc_id, d_tf in postings_list:
                if doc_id in top_n_docs:
                    temp_term_freq = d_tf * P_FEEDBACK_WEIGHT
                    if term in vector_sum:
                        vector_sum[term] += temp_term_freq
                    else:
                        vector_sum[term] = temp_term_freq

        # averaging the vector for the top docs to get the centroid
        for term in vector_sum:
            vector_sum[term] /= NUM_RESULTS
            vector_sum[term] *= P_FEEDBACK_WEIGHT

        # adding the initial query vector terms to the centroid
        for term in vector_sum:
            if term in query_map:
                vector_sum[term] += query_map[term] * QUERY_WEIGHT

        # adding the remaining terms left in the query vector
        for term in query_map:
            if term not in vector_sum:
                vector_sum[term] = query_map[term] * QUERY_WEIGHT

        # execute query with the new query vector
        return self.execute_query(vector_sum)

    def execute_query(self, query_map):
        """ This method is called to execute a query """
        scores = {}
        query_map_copy = copy.deepcopy(query_map)
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # if we haven't done relevance feedback yet, do relevance feedback
        if not self.feedback:
            top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS)
            stringout = self.relevance_feedback(query_map_copy, top_n_docs)
        # if here, calling from within relevance feedback
        else:
            # return the output of all the scores after relevance feedback
            stringout = " ".join(str(x) for x in scores.keys())
        return stringout
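# feedbackEngine's expansion is essentially a Rocchio-style pseudo-relevance-feedback
# update: the new query vector is a weighted blend of the original query and the
# centroid of the top-N retrieved documents. A self-contained sketch of that blend with
# toy vectors; the function name and sample data are hypothetical, and the 0.5/0.5
# weights mirror QUERY_WEIGHT and P_FEEDBACK_WEIGHT above.
def rocchio(query_vec, top_doc_vecs, query_weight=0.5, feedback_weight=0.5):
    # centroid of the top documents' term weights
    centroid = {}
    for doc_vec in top_doc_vecs:
        for term, wt in doc_vec.items():
            centroid[term] = centroid.get(term, 0.0) + wt / len(top_doc_vecs)

    # blend: feedback_weight * centroid + query_weight * original query
    expanded = {t: feedback_weight * w for t, w in centroid.items()}
    for term, wt in query_vec.items():
        expanded[term] = expanded.get(term, 0.0) + query_weight * wt
    return expanded


query = {"solar": 1.0, "panel": 1.0}
top_docs = [{"solar": 0.8, "cell": 0.6}, {"panel": 0.5, "photovolta": 0.9}]
print(rocchio(query, top_docs))  # expanded query vector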
def test3(self):
    po = Postings(TEST_DIR)
    p = po.get('bericht 2')
    self.assertEqual(p.title(), 'bericht 2')
class Engine(object):
    """
    Search engine that uses a simple vector space model to retrieve patents
    """

    NUM_RESULTS = 500

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, offset):
        """ This method gets the postings list at an offset """
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        """ This method accumulates scores for a term """
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        """ This method normalises scores for every document """
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        """ This method creates a heap of the docs and picks out the top few """
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1]
                for i in xrange(n) if len(scores_heap) > 0]

    def execute_query(self, query_map):
        """ This method is called to execute a query """
        scores = {}
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # find top n
        # top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS)
        # return " ".join(str(x) for x in top_n_docs)
        return " ".join(str(x) for x in scores.keys())
def test1(self):
    po = Postings(TEST_DIR)
    p = po.get('bericht 1')
    self.assertEqual(p.date(), datetime.datetime(2013, 7, 10))
def __init__(self, fd, fp):
    self.dictionary = Dictionary(fd, load=True)
    self.postings = Postings(fp, mode='r')
def build_index(directory, dictionary_file, postings_file):
    files = os.listdir(directory)
    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    stemmer = nltk.stem.porter.PorterStemmer()
    last = ''

    for doc_id in files:
        tf_list = {}
        line_number = 1
        offset = 0
        # Use linecache to get line
        line = linecache.getline(os.path.join(directory, doc_id), line_number)
        while line != '':
            # tokenize lines into sentences
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                # tokenize sentence
                tokens = nltk.word_tokenize(sentence)
                for token in tokens:
                    # apply stemming and case folding
                    stemmed_token = stemmer.stem(token).lower()
                    # if term already exists in dictionary, we find row number
                    if dictionary.has_term(stemmed_token):
                        offset = dictionary.get_offset(stemmed_token)
                        # If postings for that term already has the doc id,
                        # then increment tf,
                        # else increment df and add the doc id
                        if postings.has_doc_id(doc_id, offset):
                            postings.increment_tf(doc_id, offset)
                        else:
                            dictionary.increment_df(stemmed_token)
                            postings.add_doc_id(doc_id, offset)
                    # else, we add it to dictionary and postings
                    else:
                        offset = postings.add_new_term()
                        postings.add_doc_id(doc_id, offset)
                        dictionary.add_new_term(stemmed_token, offset)
                    # Keep track of tf values of all terms in doc
                    if stemmed_token in tf_list:
                        tf_list[stemmed_token] += 1
                    else:
                        tf_list[stemmed_token] = 1
            line_number += 1
            line = linecache.getline(os.path.join(directory, doc_id), line_number)

        # Store doc length
        dictionary.add_doc_length(doc_id, tf_list.values())

    # save data
    postings.save(dictionary)
    dictionary.save()
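# dictionary.add_doc_length above receives the raw term frequencies for one document;
# that helper is not shown here, but under the usual lnc document weighting it would
# reduce them to a single cosine-normalisation length. A minimal sketch of that
# assumption (the function name and sample frequencies are made up):
import math


def document_length(raw_tfs):
    # assumed: length = sqrt(sum((1 + log10(tf))^2)) over the document's terms
    return math.sqrt(sum((1 + math.log10(tf)) ** 2 for tf in raw_tfs if tf > 0))


print(document_length([3, 1, 1, 7]))  # vector length for one toy document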
class Graph:
    # A class for the graph.
    # Keeps track of orphans.
    # Has a 1:1 mapping for each node.
    # Allows population from data stored in a JSON.

    def __init__(self, company_id="demo"):
        self.graph_id = company_id
        self.orphans = set()
        self.orphan_list = list()
        self.node_map = dict()
        self.node_id_map = dict()
        self.db_info = dict()
        self.search_postings = Postings()

        # get utils for intents.
        with open(os.path.realpath("chatbot/intentUtils.json")) as data_file:
            self.graph_utils = json.load(data_file)

        # replace escaped slashes by a single slash.
        for class_string in self.graph_utils["class"]:
            if type(self.graph_utils["class"][class_string]) is str:
                self.graph_utils["class"][class_string] = self.graph_utils[
                    "class"][class_string].replace('\\\\', '\\')

    def get_node(self, node_name):
        return self.node_map.get(node_name)

    def get_node_by_id(self, node_id):
        return self.node_id_map.get(node_id)

    def populate_graph(self, dir_path, db_object, redis_object,
                       extraction_indices):
        '''
        Read a stored graph from JSON files under the directory.
        Read aux graph info from DB.
        Identify the orphan nodes.
        Set up the 1:1 mapping of a node name/id with its object.
        :param dir_path:
        :param db_object:
        :param redis_object:
        :param extraction_indices:
        :return:
        '''
        # initializations
        graph_json = {}
        id_counter = 0
        nodes_have_ids = False

        # read from JSON
        for file_path in os.listdir(dir_path):
            with open(dir_path + "/" + file_path) as data_file:
                graph_json.update(json.load(data_file))

        # read from DB
        self.db_info = db_object["graphdetails"].find_one(
            {"graph_id": self.graph_id})

        # enumerate on keys
        # there are 2 cases: nodes have ids and nodes don't; the two cases are exhaustive
        for node_key, node_json in graph_json.items():
            # get id
            node_id = node_json.get("id")
            if not nodes_have_ids and not node_id:
                node_id = id_counter
                id_counter += 1
            else:
                nodes_have_ids = True

            # get other data
            node_name = node_key
            node_no_match_before = node_json.get("no_match_before")
            node_matches = node_json.get("matches")
            node_connections = node_json.get("connections")
            node_action = node_json.get("action")
            node_context = node_json.get("context")
            node_alias = node_json.get("alias")
            node_searchable = node_json.get("searchable")
            node_suggested = node_json.get("suggested")

            current_node = Node(node_name, node_connections, node_action,
                                node_matches, node_context, node_searchable,
                                node_suggested, node_alias, node_id,
                                node_no_match_before)

            # put into mappings
            self.node_map[node_name] = current_node
            self.node_id_map[node_id] = current_node

        # Enumerate over nodes and build connections
        # Also set the orphan flag for appropriate nodes
        # TODO generate auto placeholders based on mapping. Along the same lines as for actions
        for node_name, node in self.node_map.items():
            for connected_node in node.connections:
                con_node_name = connected_node["name"]
                connected_node["node"] = self.node_map.get(con_node_name)
                connected_node["node"].orphan = False

        # Add orphan nodes to the list
        # and build postings list
        for node_name, node in self.node_map.items():
            if node.orphan:
                self.orphans.add(node_name)
                self.orphan_list.append({
                    "node": node,
                    "name": node_name,
                    "matches": node.matches
                })
            self.build_postings(node)

        # compute tf-idf scores
        self.search_postings.compute_tf_idf()

        # build/reuse postings for extraction mappings
        self.build_extraction_postings(db_object, redis_object,
                                       extraction_indices)

    def build_postings(self, node):
        if node.searchable:
            # extra weight to the question text
            # TODO make this generic. Weights should be incorporated in the graph
            # TODO a default weight system should be used in case weights are not put in the graph
            searchable_text = " ".join(
                node.searchable) + node.searchable[0] * 2

            # get lemmatized tokens
            lemmatized_tokens = utils.lemmatize_text(searchable_text.lower())
            # get stemmed tokens
            stemmed_tokens = utils.stem_text(searchable_text.lower())

            # merge the lemmatized and stemmed tokens into lemmatized_tokens
            # every stemmed token that gets put, is put as many times as its versions occur in the text
            lemmatized_tokens_set = set(lemmatized_tokens)
            for token in stemmed_tokens:
                if token not in lemmatized_tokens_set:
                    lemmatized_tokens = lemmatized_tokens + [token]

            # remove stop words
            # lemmatized_tokens = utils.remove_stop_words(lemmatized_tokens, input_type="list")

            token_frequencies = dict()
            # count frequency for every lemmatized token
            for token in lemmatized_tokens:
                token_frequency = token_frequencies.get(token, 0)
                token_frequencies[token] = token_frequency + 1

            # put token and frequency info in postings
            for token in token_frequencies:
                self.search_postings.add_document_for_token(
                    token, node.id, {"tf": token_frequencies[token]})

    def build_extraction_postings(self, db_object, redis_object,
                                  extraction_indices):
        if self.db_info and self.db_info.get("mappings"):
            map_names = self.db_info.get("mappings", []) or []
            for map_name in map_names:
                # initializations
                mapping = dict()
                postings_object = Postings()

                # skip if entry for the map exists in redis
                map_value = redis_object.get(map_name)

                # in case the entry has not been populated before
                if not map_value:
                    # get mapping from DB
                    mapping = db_object["mappings"].find_one(
                        {"name": map_name})
                    mapping.pop("_id")
                    # store mapping in Redis
                    redis_object.set(map_name, json.dumps(mapping))
                else:
                    mapping = json.loads(map_value)

                entries = mapping.get("map")
                tokenized_entries = []
                fields_to_index = mapping.get("toIndex")

                # build postings
                for i, entry in enumerate(entries):
                    # use active entries
                    if entry.get("active"):
                        # merge all texts
                        stripped_text = utils.remove_non_alpha_num_chars(
                            " ".join(
                                filter(
                                    lambda x: bool(x),
                                    reduce(lambda x, y: x + y, [
                                        entry.get(field, []) or [] if type(
                                            entry.get(field, []) or []) == list
                                        else [str(entry[field])]
                                        for field in fields_to_index
                                    ], []))))[0]

                        # generate tokens
                        if stripped_text:
                            map(
                                lambda x: postings_object.
                                add_document_for_token(x, i),
                                set(utils.lemmatize_text(
                                    stripped_text.lower())))

                        if not map_value:
                            # construct tokens for all constituents of the entry and store in redis if not already there
                            tokenized_elements = map(
                                lambda x: sorted(
                                    utils.lemmatize_text(
                                        utils.remove_non_alpha_num_chars(x)[0])
                                ),
                                filter(
                                    lambda x: bool(x),
                                    reduce(lambda x, y: x + y, [
                                        entry.get(field, []) or [] if type(
                                            entry.get(field, []) or []) == list
                                        else [str(entry[field])]
                                        for field in fields_to_index
                                    ], [])))
                            tokenized_entries.append(tokenized_elements)
                    else:
                        if not map_value:
                            tokenized_entries.append(None)

                extraction_indices[map_name] = postings_object
                if not map_value:
                    # set tokenized mappings in redis if not already there
                    redis_object.set("tokenized" + map_name,
                                     json.dumps(tokenized_entries))

    def dump_graph(self):
        '''
        Dump the graph as a json file.
        TODO : Extend it to make it dump in a DB
        :return:
        '''
        pass

    def get_unknown_intent_node(self):
        '''
        Returns the unknown_intent_node.
        :return:
        '''
        return self.get_node(self.graph_utils["unknown_intent_node_name"])

    def get_next_node(self, node_name=None, node=None, data=None):
        '''
        Get the best node from a list of possible ones.
        :param node_name:
        :param node:
        :param data:
        :return:
        '''
        resulting_nodes_with_confidence_values = self.get_child_confidence(
            node_name, node, data)
        resulting_node = self.get_node(
            intents.get_highest_probability_intent(
                resulting_nodes_with_confidence_values, self.graph_utils))

        # try to see if we can recommend other nodes
        if resulting_node.name == self.graph_utils['unknown_intent_node_name']:
            suggestions = intents.get_top_k_suggestions(
                data, self.graph_utils, self.search_postings, k=3)
            if suggestions:
                # get the temp node
                resulting_node = self.get_node("suggestion")
                connections_to_be_stored = json.loads(
                    json.dumps(resulting_node.connections_json))

                # put each suggestion as a quick reply and a connection
                resulting_node.action[1]["suggestions"]["replies"] = []
                quick_replies = resulting_node.action[1]["suggestions"][
                    "replies"]
                for index, suggestion in enumerate(suggestions):
                    suggested_node = self.get_node_by_id(suggestion)
                    suggestion_json = {
                        "name":
                        suggested_node.suggested_response[0].get(
                            "payload", ""),
                        "matches": {
                            "or": [{
                                "fuzzyMessage":
                                suggested_node.suggested_response[0].get(
                                    "text", "")
                            }, {
                                "==": [{
                                    "var": "payload"
                                }, suggested_node.name]
                            }]
                        }
                    }

                    # If index is 0, the connection info needs to have a matching for the yes class
                    if index == 0:
                        suggestion_json["matches"]["or"].append(
                            {"class": "yes"})

                    # put a copy so that it can be stored in the context
                    connections_to_be_stored.append(
                        json.loads(json.dumps(suggestion_json)))
                    suggestion_json["node"] = suggested_node
                    quick_replies.append({
                        "content_type": "text",
                        "title":
                        suggested_node.suggested_response[0].get("text", ""),
                        "payload":
                        suggested_node.suggested_response[0].get(
                            "payload", "")
                    })
                    resulting_node.connections.append(suggestion_json)

                # Store values in context for later retrieval.
                resulting_node.set_context_vars[
                    "suggestion_node_connections"] = connections_to_be_stored

        return resulting_node

    def get_child_confidence(self, node_name=None, node=None, data=None):
        '''
        Given the current node and relevant data, computes the confidence
        values of child nodes and orphans.
        :param node_name:
        :param node:
        :param data:
        :return:
        '''
        if not node and not node_name:
            return None

        # Initializations
        unknown_intent_node = self.get_unknown_intent_node()
        if not node:
            node = self.node_map[node_name]
        resulting_nodes_with_consequences = dict()

        # iterate through the current node's connections and orphan nodes and
        # evaluate their possibility of being the next node
        found = False
        for connection in node.connections + self.orphan_list:
            if not connection["node"].no_match_before or (
                    not found and connection["node"].no_match_before):
                matching_conditions = connection["matches"]
                result = self.json_logic(matching_conditions, data)
                if result is True:
                    result = 100
                    found = True
                elif result in [False, None]:
                    result = 0
                resulting_nodes_with_consequences[connection["name"]] = result

        return resulting_nodes_with_consequences

    def json_logic(self, tests, data=None):
        '''
        Evaluate JSON Logic. The matching conditions for a node and its
        connections are stored in JSON format. This evaluator takes in relevant
        data and evaluates a test against the data. The boolean output is returned.
        :param tests:
        :param data:
        :return:
        '''
        # You've recursed to a primitive, stop!
        if tests is None or type(tests) != dict:
            return tests

        data = data or {}
        op = tests.keys()[0]
        values = tests[op]
        operations = {
            "==": (lambda a, b: a == b),
            "===": (lambda a, b: a is b),
            "!=": (lambda a, b: a != b),
            "!==": (lambda a, b: a is not b),
            ">": (lambda a, b: a > b),
            ">=": (lambda a, b: a >= b),
            "<": (lambda a, b, c=None: a < b
                  if (c is None) else (a < b) and (b < c)),
            "<=": (lambda a, b, c=None: a <= b
                   if (c is None) else (a <= b) and (b <= c)),
            "!": (lambda a: not a),
            "%": (lambda a, b: a % b),
            "and": (lambda *args:
                    reduce(lambda total, arg: total and arg, args, True)),
            "or": (lambda *args:
                   reduce(lambda total, arg: total or arg, args, False)),
            "?:": (lambda a, b, c: b if a else c),
            "log": (lambda a: a if sys.stdout.write(str(a)) else a),
            "in": (lambda a, b: a in b if "__contains__" in dir(b) else False),
            "var": (lambda a, not_found=None: reduce(
                lambda data, key: (data.get(key, not_found)
                                   if type(data) == dict
                                   else data[int(key)]
                                   if type(data) in [list, tuple]
                                   else not_found),
                str(a).split("."), data)),
            "cat": (lambda *args: "".join(args)),
            "+": (lambda *args:
                  reduce(lambda total, arg: total + float(arg), args, 0.0)),
            "*": (lambda *args:
                  reduce(lambda total, arg: total * float(arg), args, 1.0)),
            "-": (lambda a, b=None: -a if b is None else a - b),
            "/": (lambda a, b=None: a if b is None else float(a) / float(b)),
            "min": (lambda *args: min(args)),
            "max": (lambda *args: max(args)),
            "function": (lambda *args: intents.execute_function(
                args[0], args[1], data.get('context', None))),
            "regex": (lambda x, y: re.match(x, y)),
            "fuzzy": (lambda x, y: utils.fuzzy_text_matcher(x, y)),
            "fuzzyMessage": (lambda x: utils.fuzzy_text_matcher(
                data.get('message', ""), x) > self.graph_utils["min_fuzzy_prob"]),
            "class": (lambda x: utils.class_check(
                data.get("message", ""), x, self.graph_utils)),
            "bool": (lambda a: bool(a)),
            "extract": (lambda *x: utils.perform_extraction(x, data)),
            "count": (lambda *x: utils.length(x))
        }
        if op not in operations:
            raise RuntimeError("Unrecognized operation %s" % op)

        # Easy syntax for unary operators, like {"var": "x"} instead of strict
        # {"var": ["x"]}
        if type(values) not in [list, tuple]:
            values = [values]

        # Recursion!
        try:
            values = map(lambda val: self.json_logic(val, data), values)
        except RuntimeError:
            pass

        return operations[op](*values)
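# A small illustration of the rule format json_logic consumes: operators are dict keys,
# operands are (possibly nested) values, and {"var": ...} pulls dotted paths out of
# `data`. The rule, data, and `graph` instance below are made up; the evaluator recurses
# bottom-up exactly as in the method above.
rule = {
    "and": [
        {"==": [{"var": "payload"}, "order_status"]},
        {">": [{"var": "context.retries"}, 1]},
    ]
}
data = {"payload": "order_status", "context": {"retries": 2}}
# graph.json_logic(rule, data)  ->  True (both branches evaluate truthy)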
def build_extraction_postings(self, db_object, redis_object,
                              extraction_indices):
    if self.db_info and self.db_info.get("mappings"):
        map_names = self.db_info.get("mappings", []) or []
        for map_name in map_names:
            # initializations
            mapping = dict()
            postings_object = Postings()

            # skip if entry for the map exists in redis
            map_value = redis_object.get(map_name)

            # in case the entry has not been populated before
            if not map_value:
                # get mapping from DB
                mapping = db_object["mappings"].find_one({"name": map_name})
                mapping.pop("_id")
                # store mapping in Redis
                redis_object.set(map_name, json.dumps(mapping))
            else:
                mapping = json.loads(map_value)

            entries = mapping.get("map")
            tokenized_entries = []
            fields_to_index = mapping.get("toIndex")

            # build postings
            for i, entry in enumerate(entries):
                # use active entries
                if entry.get("active"):
                    # merge all texts
                    stripped_text = utils.remove_non_alpha_num_chars(
                        " ".join(
                            filter(
                                lambda x: bool(x),
                                reduce(lambda x, y: x + y, [
                                    entry.get(field, []) or [] if type(
                                        entry.get(field, []) or []) == list
                                    else [str(entry[field])]
                                    for field in fields_to_index
                                ], []))))[0]

                    # generate tokens
                    if stripped_text:
                        map(
                            lambda x: postings_object.
                            add_document_for_token(x, i),
                            set(utils.lemmatize_text(stripped_text.lower())))

                    if not map_value:
                        # construct tokens for all constituents of the entry and store in redis if not already there
                        tokenized_elements = map(
                            lambda x: sorted(
                                utils.lemmatize_text(
                                    utils.remove_non_alpha_num_chars(x)[0])),
                            filter(
                                lambda x: bool(x),
                                reduce(lambda x, y: x + y, [
                                    entry.get(field, []) or [] if type(
                                        entry.get(field, []) or []) == list
                                    else [str(entry[field])]
                                    for field in fields_to_index
                                ], [])))
                        tokenized_entries.append(tokenized_elements)
                else:
                    if not map_value:
                        tokenized_entries.append(None)

            extraction_indices[map_name] = postings_object
            if not map_value:
                # set tokenized mappings in redis if not already there
                redis_object.set("tokenized" + map_name,
                                 json.dumps(tokenized_entries))