Example #1
class Engine(object):
    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, termInfo):
        if termInfo[-1] is not None:
            return self.postings.list_at_offset(termInfo[-1])
        return None

    def execute_query(self, reverse_polish):
        args = []

        while reverse_polish:
            token = reverse_polish.popleft()

            if not isinstance(token, Operator):
                dterm = self.dictionary.term(token)
                postings_list = self._get_postings(dterm)
                args.append(postings_list)
            else:
                if isinstance(token, NOTOperator):
                    args.append(self.postings.not_list())
                # print '\nExecuting ', token, ' for args: ', str(args), '\n'
                for i in range(len(args)):
                    if args[i] is not None and args[i]._entries_len == 0:
                        args[i] = None
                splitpoint = -1 * token.nargs
                o_args = args[splitpoint:]
                args = args[:splitpoint] + [token.execute(o_args)]

        return args[-1]
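
The evaluator above walks a reverse-Polish token deque, pushing a postings list for each term and applying each Operator to the operands it needs. A minimal, self-contained sketch of the same strategy, using plain Python sets instead of the project's Postings lists and an illustrative operator table rather than the Operator classes:

# Illustrative stand-in for the project's postings data: plain sets of doc ids.
ALL_DOCS = {1, 2, 3, 4, 5}
POSTINGS = {'bill': {1, 2, 3}, 'gates': {2, 3, 5}, 'vista': {4}}

# Each operator records its arity and a set-based implementation.
OPS = {
    'AND': (2, lambda a, b: a & b),
    'OR':  (2, lambda a, b: a | b),
    'NOT': (1, lambda a: ALL_DOCS - a),
}

def evaluate_rpn(tokens):
    args = []
    for token in tokens:
        if token in OPS:
            nargs, fn = OPS[token]
            operands, args = args[-nargs:], args[:-nargs]
            args.append(fn(*operands))
        else:
            args.append(POSTINGS.get(token, set()))
    return args[-1]

# 'bill AND NOT vista' in reverse Polish notation:
print(evaluate_rpn(['bill', 'vista', 'NOT', 'AND']))   # {1, 2, 3}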
Example #2
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir), key=lambda x: x)
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = training_file
        doc_path = osp.join(training_data_dir, training_file)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # convert line numbers to byte offsets
    f = open(postings_file)
    current_line = 0
    while True:
        term = dictionary.term_for_offset(current_line)
        dictionary.add_term(term, f.tell(), update_freq=False)
        line = f.readline()
        if not line:
            break
        current_line += 1
    dictionary.generate_idf(len(training_files))
    dictionary.save()
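
The second loop above re-reads the postings file that was just written and replaces each term's line number with the byte offset returned by f.tell(), so a postings line can later be fetched with a single seek(). A minimal, self-contained sketch of that idea (the demo file name is illustrative):

# Write a tiny postings-style file, then record the byte offset of every line.
lines = ['cat 1 4 7\n', 'dog 2 3\n', 'fish 5\n']
with open('postings_demo.txt', 'w') as out:
    out.writelines(lines)

offsets = []
with open('postings_demo.txt') as f:
    while True:
        offsets.append(f.tell())    # position of the line about to be read
        if not f.readline():
            offsets.pop()           # the final tell() points past the last line
            break

# Any line can now be retrieved directly without rescanning the file.
with open('postings_demo.txt') as f:
    f.seek(offsets[1])
    print(f.readline())             # 'dog 2 3\n'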
Example #3
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')

    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1

    for docID in os.listdir(in_dir):
        # read and tokenize the document, closing the file handle when done
        with open(f'{in_dir}/{docID}', 'r') as f:
            content_tokens = word_tokenize(f.read())
        for word in content_tokens:
            term = stemmer.stem(word=word).lower()

            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_to_offset(old_offset, docID)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_to_offset(offset, docID)
                offset += 1

            dictionaries.increment_frequency(term)

    postings.save_to_file(dictionaries)
    dictionaries.save_to_file()
Example #4
 def test4(self):
     po = Postings(TEST_DIR)
     
     p = po.get('bericht 1')
     
     self.assertEqual(p.content(), 'bericht 1 content')
     self.assertEqual(p.content(), 'bericht 1 content')
     self.assertEqual(p.content(), 'bericht 1 content')
Example #5
 def test6(self):
     po = Postings(TEST_DIR)
     
     l = po.latest(1)
     self.assertEqual(l[0].title(), 'bericht 2')
     
     l = po.latest(1, reverse = True)
     self.assertEqual(l[0].title(), 'bericht 1')
Example #6
 def test2(self):
     po = Postings(TEST_DIR)
     
     p = po.get('bericht 2')
     d1 = p.date()
     d2 = datetime.datetime.now()
     
     self.assertEqual((d2-d1).seconds, 0)
     
     time.sleep(2)
     
     d3 = datetime.datetime.now()
     self.assertEqual((d3-d1).seconds, 2)
Example #7
 def __init__(self, company_id="demo"):
     self.graph_id = company_id
     self.orphans = set()
     self.orphan_list = list()
     self.node_map = dict()
     self.node_id_map = dict()
     self.db_info = dict()
     self.search_postings = Postings()
     # get utils for intents.
     with open(os.path.realpath("chatbot/intentUtils.json")) as data_file:
         self.graph_utils = json.load(data_file)
         # replace escaped slashes by a single slash.
         for class_string in self.graph_utils["class"]:
             if type(self.graph_utils["class"][class_string]) is str:
                 self.graph_utils["class"][class_string] = self.graph_utils[
                     "class"][class_string].replace('\\\\', '\\')
Example #8
def build_index(in_dir, out_dict, out_postings):
    """
    Build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('Indexing...')

    stemmer = PorterStemmer()
    dictionaries = Dictionaries(out_dict)
    postings = Postings(out_postings)
    offset = 1
    count = len(os.listdir(in_dir))

    for docID in os.listdir(in_dir):
        # read the document, closing the file handle when done
        with open(f'{in_dir}/{docID}', 'r') as f:
            content = f.read()
        sentences = sent_tokenize(content)
        doc_terms = []
        for sentence in sentences:
            for word in word_tokenize(sentence):
                term = stemmer.stem(word=word.lower())
                doc_terms.append(term)

        # Calculate weighted term frequencies for each term
        weighted_term_freqs = [(x[0], get_term_frequency_weight(x[1]))
                               for x in Counter(doc_terms).most_common()]
        # Calculate document vector length
        doc_length = math.sqrt(
            sum(map(lambda x: x[1] * x[1], weighted_term_freqs)))

        for term, normalised_tf in weighted_term_freqs:
            if dictionaries.has_term(term):
                old_offset = dictionaries.get_offset(term)
                postings.add_docId_tf_to_offset(old_offset, docID,
                                                normalised_tf / doc_length)
            else:
                dictionaries.add_term(term, offset)
                postings.add_doc_id(offset)
                postings.add_docId_tf_to_offset(offset, docID,
                                                normalised_tf / doc_length)
                offset += 1

    postings.save_to_file(dictionaries, count)
    dictionaries.save_to_file()
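
get_term_frequency_weight is not shown in the snippet; a common choice (assumed here) is the log-frequency weight 1 + log10(tf), with the document length taken as the Euclidean norm of those weights so that the stored tf values are cosine-normalised. A small worked sketch under that assumption:

import math
from collections import Counter

def get_term_frequency_weight(tf):
    # Assumed log-frequency weighting; the snippet does not show the real body.
    return 1 + math.log10(tf) if tf > 0 else 0.0

doc_terms = ['cat', 'cat', 'dog', 'fish', 'cat']
weighted_term_freqs = [(term, get_term_frequency_weight(count))
                       for term, count in Counter(doc_terms).most_common()]
doc_length = math.sqrt(sum(w * w for _, w in weighted_term_freqs))

# Each posting would store the weight divided by the document vector length.
print({term: w / doc_length for term, w in weighted_term_freqs})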
Example #9
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    training_files = sorted(os.listdir(training_data_dir),
                            key=lambda x: int(x))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # convert line numbers to byte offsets
    f = open(postings_file)
    current_line = 1
    f.readline()  # skip postings list containing all doc ids
    while True:
        term = dictionary.term_for_offset(current_line)
        dictionary.add_term(term, f.tell())
        line = f.readline()
        if not line:
            break
        current_line += 1
    dictionary.save()
Example #10
class Searcher(object):
    def __init__(self, dictionary_file, postings_file):
        self.dictionary = Dictionary(dictionary_file)
        self.postings = Postings(postings_file)
        self.dictionary.load()
        self.all_docs = self.postings.load_list(0)

    # evaluates a query, assuming it is in RPN (reverse Polish notation)
    def evaluate_query(self, parsed_query):
        stack = []
        while (len(parsed_query) != 0):
            element = parsed_query.pop(0)
            if element == 'NOT':
                operand = stack.pop()
                stack.append(self.evaluate_NOT(operand))
            elif element == 'AND':
                first_operand = stack.pop()
                second_operand = stack.pop()
                stack.append(self.evaluate_AND(first_operand, second_operand))
            elif element == 'OR':
                first_operand = stack.pop()
                second_operand = stack.pop()
                stack.append(self.evaluate_OR(first_operand, second_operand))
            else:
                stack.append(element)
        value = stack.pop()
        if not isinstance(value, list):
            offset = self.dictionary.get_offset(value)
            value = self.postings.load_list(offset)
        return value

    def evaluate_AND(self, first, second):
        if not isinstance(first, list):
            offset = self.dictionary.get_offset(first)
            first = self.postings.load_list(offset)
        if not isinstance(second, list):
            offset = self.dictionary.get_offset(second)
            second = self.postings.load_list(offset)
        return skip_intersection(first, second)

    def evaluate_OR(self, first, second):
        if not isinstance(first, list):
            offset = self.dictionary.get_offset(first)
            first = self.postings.load_list(offset)
        if not isinstance(second, list):
            offset = self.dictionary.get_offset(second)
            second = self.postings.load_list(offset)
        return union(first, second)

    def evaluate_NOT(self, operand):
        if not isinstance(operand, list):
            offset = self.dictionary.get_offset(operand)
            operand = self.postings.load_list(offset)
        return difference(self.all_docs, operand)
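
skip_intersection, union and difference are project helpers that are not shown here. A minimal merge-based sketch of the intersection and NOT (difference) steps over sorted doc-id lists, with skip pointers omitted:

def intersect(first, second):
    # Linear merge of two sorted postings lists (skip pointers omitted).
    result, i, j = [], 0, 0
    while i < len(first) and j < len(second):
        if first[i] == second[j]:
            result.append(first[i])
            i += 1
            j += 1
        elif first[i] < second[j]:
            i += 1
        else:
            j += 1
    return result

def difference(universe, operand):
    # NOT: every doc id in the collection that does not appear in operand.
    excluded = set(operand)
    return [doc_id for doc_id in universe if doc_id not in excluded]

print(intersect([1, 2, 4, 8], [2, 3, 4, 9]))   # [2, 4]
print(difference([1, 2, 3, 4, 5], [2, 4]))     # [1, 3, 5]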
Example #11
def build_index(directory, dictionary_file, postings_file):
	files = os.listdir(directory)
	dictionary = Dictionary(dictionary_file)
	postings = Postings(postings_file)
	stemmer = nltk.stem.porter.PorterStemmer()
	last = ''
	for doc_id in files:
		postings.add_doc(doc_id)
		line_number = 1
		#Use linecache to get line
		line = linecache.getline(os.path.join(directory, doc_id), line_number)
		while line != '':
			#tokenize lines into sentences
			sentences = nltk.sent_tokenize(line)
			for sentence in sentences:
				#tokenize sentence
				tokens = nltk.word_tokenize(sentence)
				for token in tokens:
					#apply stemming and case folding
					stemmed_token = stemmer.stem(token).lower()
					# if term already exists in dictionary, we find row number
					if dictionary.has_term(stemmed_token):
						offset = dictionary.get_offset(stemmed_token)
						result = postings.add_doc_id(doc_id, offset)
						# Result indicates if the doc id is new
						if result:
							dictionary.increment_frequency(stemmed_token)
					#else, we add it to dictionary and postings
					else:
						offset = postings.add_new_term()
						postings.add_doc_id(doc_id, offset)
						dictionary.add_new_term(stemmed_token, offset)
						
			line_number += 1
			line = linecache.getline(os.path.join(directory, doc_id), line_number)
	#save data
	postings.save(dictionary)
	dictionary.save()
Example #12
class Engine(object):

    NUM_RESULTS = 10

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, offset):
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [
            heapq.heappop(scores_heap)[1] for i in xrange(n)
            if len(scores_heap) > 0
        ]

    def execute_query(self, query_map):
        scores = {}
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # find top n
        top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS)
        return " ".join(str(x) for x in top_n_docs)
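
tf() and the exact structure of query_map are not shown; a common reading (assumed here) is that query_map maps each term to its raw count in the query and tf() applies the same log-frequency weight, so each term contributes tf(count) * idf before length normalisation. A small sketch of building such a map:

import math
from collections import Counter

def tf(raw_count):
    # Assumed log-frequency weight; the snippet does not show tf()'s body.
    return 1 + math.log10(raw_count) if raw_count > 0 else 0.0

query = "cheap cheap flights"
query_map = dict(Counter(query.lower().split()))   # {'cheap': 2, 'flights': 1}

# execute_query would then multiply each tf(count) by the term's idf and
# accumulate q_wt * d_tf per document before normalising by vector lengths.
print({term: tf(count) for term, count in query_map.items()})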
Example #13
def run_search(dict_file, postings_file, queries_file, results_file):
    """
    Using the given dictionary file and postings file,
    perform searching on the given queries file and output the results to a file
    """
    print('Running search on the queries...')

    dictionaries = Dictionaries(dict_file)
    dictionaries.load()
    postings = Postings(postings_file)
    searcher = Searcher(dictionaries, postings)

    result_string = ''
    with open(queries_file, 'r') as f, open(results_file, 'w') as o:
        for i, query in enumerate(f):
            searcher.set_query(query.strip())
            output = searcher.evaluate_query()
            result_string += output.strip() + '\n'
            searcher.clear_postings()
        o.write(result_string.strip())
Example #14
 def __init__(self, dictionary_file, postings_file):
     self.dictionary = Dictionary(dictionary_file)
     self.postings = Postings(postings_file)
     self.dictionary.load()
     self.all_docs = self.postings.load_list(0)
Example #15
 def test5(self):
     po = Postings(TEST_DIR)
     
     p = po.get('bericht 2')
     
     self.assertIsNone(p.content())
Example #16
class feedbackEngine(object):
    """
    Search engine that uses relevance feedback
    with a vector space model to retrieve patents
    """

    global NUM_RESULTS
    global QUERY_WEIGHT
    global P_FEEDBACK_WEIGHT
    NUM_RESULTS = 10
    QUERY_WEIGHT = 0.5
    P_FEEDBACK_WEIGHT = 0.5

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')
        self.feedback = False

    def _get_postings(self, offset):
        """
        This method gets the postings list at an offset
        """
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        """
        This method accumulates scores for a term
        """
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        """
        This method normalises scores for every document
        """
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        """
        This method creates a heap of the docs and picks out the top few
        """
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1] for i in xrange(n)
                if len(scores_heap) > 0]

    def relevance_feedback(self, query_map, top_n_docs):
        """
        This method expands the query based on pseudo relevance feedback
        """
        self.feedback = True
        vector_sum = {}
        term_dict = self.dictionary._terms

        # constructing the document vector for the top n docs
        for term in term_dict:
            term_offset = term_dict[term][1]

            # unknown term, skip everything, score 0
            if term_offset is None or term is None:
                continue

            # adding the term frequencies of all the documents in top_n_docs
            postings_list = self._get_postings(term_offset)
            for doc_id, d_tf in postings_list:
                if doc_id in top_n_docs:
                    temp_term_freq = d_tf*P_FEEDBACK_WEIGHT
                    if term in vector_sum:
                        vector_sum[term] += temp_term_freq
                    else:
                        vector_sum[term] = temp_term_freq

        # averaging the vector for the top docs to get the centroid
        for term in vector_sum:
            vector_sum[term] /= NUM_RESULTS
            vector_sum[term] *= P_FEEDBACK_WEIGHT

        # adding the initial query vector terms to the centroid
        for term in vector_sum:
            if term in query_map:
                vector_sum[term] += query_map[term] * QUERY_WEIGHT

        # adding the remaining terms left in the query vector
        for term in query_map:
            if term not in vector_sum:
                vector_sum[term] = query_map[term] * QUERY_WEIGHT

        # execute query with the new query vector
        return self.execute_query(vector_sum)

    def execute_query(self, query_map):
        """
        This method is called to execute a query
        """
        scores = {}
        query_map_copy = copy.deepcopy(query_map)
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # if we haven't done relevance feedback yet, do it now
        if not self.feedback:
            top_n_docs = self._get_top_n_docs(scores, NUM_RESULTS)
            stringout = self.relevance_feedback(query_map_copy, top_n_docs)

        # if here, calling from within relevance feedback
        else:
            # return the output of all the scores after relevance feedback
            stringout = " ".join(str(x) for x in scores.keys())

        return stringout
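
relevance_feedback implements a Rocchio-style pseudo relevance feedback step: the centroid of the top-ranked documents is scaled by P_FEEDBACK_WEIGHT and combined with the original query scaled by QUERY_WEIGHT, and the expanded query is re-executed. A simplified sketch of that combination with toy numbers (the snippet additionally scales the per-document frequencies by P_FEEDBACK_WEIGHT while summing; that detail is left out here):

NUM_RESULTS = 10
QUERY_WEIGHT = 0.5
P_FEEDBACK_WEIGHT = 0.5

# Term frequencies summed over the top-ranked documents (toy numbers).
top_docs_sum = {'patent': 12.0, 'engine': 4.0}
query_vector = {'engine': 1.3, 'search': 0.8}

# Centroid of the top documents, scaled by the feedback weight ...
expanded = {term: (total / NUM_RESULTS) * P_FEEDBACK_WEIGHT
            for term, total in top_docs_sum.items()}

# ... combined with the original query vector, scaled by the query weight.
for term, weight in query_vector.items():
    expanded[term] = expanded.get(term, 0.0) + weight * QUERY_WEIGHT

print(expanded)   # the expanded query that would be re-executed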
Example #17
 def test3(self):
     po = Postings(TEST_DIR)
     
     p = po.get('bericht 2')
     
     self.assertEqual(p.title(), 'bericht 2')
Example #18
class Engine(object):
    """
    Search engine that uses a simple vector space model to retrieve patents
    """

    NUM_RESULTS = 500

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, offset):
        """
        This method gets the postings list at an offset
        """
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        """
        This method accumulates scores for a term
        """
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        """
        This method normalises scores for every document
        """
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        """
        This method creates a heap of the docs and picks out the top few
        """
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1] for i in xrange(n)
                if len(scores_heap) > 0]

    def execute_query(self, query_map):
        """
        This method is called to execute a query
        """
        scores = {}
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # find top n
        # top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS)
        # return " ".join(str(x) for x in top_n_docs)

        return " ".join(str(x) for x in scores.keys())
Example #19
    def test1(self):
        po = Postings(TEST_DIR)

        p = po.get('bericht 1')
        self.assertEqual(p.date(), datetime.datetime(2013,7,10))
Example #20
 def __init__(self, fd, fp):
     self.dictionary = Dictionary(fd, load=True)
     self.postings = Postings(fp, mode='r')
Example #21
def build_index(directory, dictionary_file, postings_file):
	files = os.listdir(directory)
	dictionary = Dictionary(dictionary_file)
	postings = Postings(postings_file)
	stemmer = nltk.stem.porter.PorterStemmer()
	last = ''
	for doc_id in files:
		tf_list = {}
		line_number = 1
		offset = 0
		# Use linecache to get line
		line = linecache.getline(os.path.join(directory, doc_id), line_number)
		while line != '':
			# tokenize lines into sentences
			sentences = nltk.sent_tokenize(line)
			for sentence in sentences:
				# tokenize sentence
				tokens = nltk.word_tokenize(sentence)
				for token in tokens:
					# apply stemming and case folding
					stemmed_token = stemmer.stem(token).lower()
					# if term already exists in dictionary, we find row number
					if dictionary.has_term(stemmed_token):
						offset = dictionary.get_offset(stemmed_token) 
						# If postings for that term already has doc id, 
						# then increment tf,
						# Else increment df and add the doc id
						if postings.has_doc_id(doc_id, offset):
							postings.increment_tf(doc_id, offset)	
						else:
							dictionary.increment_df(stemmed_token)
							postings.add_doc_id(doc_id, offset)
					# else, we add it to dictionary and postings
					else:
						offset = postings.add_new_term()
						postings.add_doc_id(doc_id, offset)
						dictionary.add_new_term(stemmed_token, offset)

					#Keep track of tf values of all terms in doc
					if stemmed_token in tf_list:
						tf_list[stemmed_token] += 1
					else:
						tf_list[stemmed_token] = 1
						
			line_number += 1
			line = linecache.getline(os.path.join(directory, doc_id), line_number)
		# Store doc length
		dictionary.add_doc_length(doc_id, tf_list.values())
	# save data
	postings.save(dictionary)
	dictionary.save()
Example #22
class Graph:
    # A class for the graph.
    # Keeps track of orphans.
    # Has a 1:1 mapping for each node
    # Allows population from data stored in a JSON

    def __init__(self, company_id="demo"):
        self.graph_id = company_id
        self.orphans = set()
        self.orphan_list = list()
        self.node_map = dict()
        self.node_id_map = dict()
        self.db_info = dict()
        self.search_postings = Postings()
        # get utils for intents.
        with open(os.path.realpath("chatbot/intentUtils.json")) as data_file:
            self.graph_utils = json.load(data_file)
            # replace escaped slashes by a single slash.
            for class_string in self.graph_utils["class"]:
                if type(self.graph_utils["class"][class_string]) is str:
                    self.graph_utils["class"][class_string] = self.graph_utils[
                        "class"][class_string].replace('\\\\', '\\')

    def get_node(self, node_name):
        return self.node_map.get(node_name)

    def get_node_by_id(self, node_id):
        return self.node_id_map.get(node_id)

    def populate_graph(self, dir_path, db_object, redis_object,
                       extraction_indices):
        '''
        Read a stored graph from JSON files under the directory
        Read aux graph info from DB.
        Identify the orphan nodes.
        Set up the 1:1 mapping of a node name/id to its object
        :param dir_path:
        :param db_object:
        :param redis_object:
        :param extraction_indices:
        :return:
        '''

        # initializations
        graph_json = {}
        id_counter = 0
        nodes_have_ids = False

        # read from JSON
        for file_path in os.listdir(dir_path):
            with open(dir_path + "/" + file_path) as data_file:
                graph_json.update(json.load(data_file))

        # read from DB
        self.db_info = db_object["graphdetails"].find_one(
            {"graph_id": self.graph_id})

        # enumerate over the node keys
        # there are 2 cases: nodes have ids and nodes don't
        # the two cases are exhaustive

        for node_key, node_json in graph_json.items():
            # get id
            node_id = node_json.get("id")
            if not nodes_have_ids and not node_id:
                node_id = id_counter
                id_counter += 1
            else:
                nodes_have_ids = True

            # get other data
            node_name = node_key
            node_no_match_before = node_json.get("no_match_before")
            node_matches = node_json.get("matches")
            node_connections = node_json.get("connections")
            node_action = node_json.get("action")
            node_context = node_json.get("context")
            node_alias = node_json.get("alias")
            node_searchable = node_json.get("searchable")
            node_suggested = node_json.get("suggested")

            current_node = Node(node_name, node_connections, node_action,
                                node_matches, node_context, node_searchable,
                                node_suggested, node_alias, node_id,
                                node_no_match_before)

            # put into mappings
            self.node_map[node_name] = current_node
            self.node_id_map[node_id] = current_node

        # Enumerate over nodes and build connections
        # Also set the orphan flag for appropriate nodes

        # TODO generate auto placeholders based on mapping. Along the same lines as for actions
        for node_name, node in self.node_map.items():
            for connected_node in node.connections:
                con_node_name = connected_node["name"]
                connected_node["node"] = self.node_map.get(con_node_name)
                connected_node["node"].orphan = False

        # Add orphan nodes to the list
        # and build postings list
        for node_name, node in self.node_map.items():
            if node.orphan:
                self.orphans.add(node_name)
                self.orphan_list.append({
                    "node": node,
                    "name": node_name,
                    "matches": node.matches
                })
            self.build_postings(node)
        # compute tf-idf scores
        self.search_postings.compute_tf_idf()

        # build/ reuse postings for extraction mappings
        self.build_extraction_postings(db_object, redis_object,
                                       extraction_indices)

    def build_postings(self, node):
        if node.searchable:
            # extra weight to the question text
            # TODO make this generic. weights should be incorporated in the graph
            # TODO a default weight system should be used in case weights are not put in the graph
            searchable_text = " ".join(
                node.searchable) + node.searchable[0] * 2
            # get lemmatized tokens
            lemmatized_tokens = utils.lemmatize_text(searchable_text.lower())
            # get stemmed tokens
            stemmed_tokens = utils.stem_text(searchable_text.lower())

            # merge the lemmatized and stemmed tokens into lemmatized_tokens
            # every stemmed token that gets added appears as many times as its variants occur in the text
            lemmatized_tokens_set = set(lemmatized_tokens)
            for token in stemmed_tokens:
                if token not in lemmatized_tokens_set:
                    lemmatized_tokens = lemmatized_tokens + [token]

            # remove stop words
            # lemmatized_tokens = utils.remove_stop_words(lemmatized_tokens, input_type="list")
            token_frequencies = dict()
            # count frequency for every lemmatized token
            for token in lemmatized_tokens:
                token_frequency = token_frequencies.get(token, 0)
                token_frequencies[token] = token_frequency + 1
            # put token and frequency info in postings
            for token in token_frequencies:
                self.search_postings.add_document_for_token(
                    token, node.id, {"tf": token_frequencies[token]})

    def build_extraction_postings(self, db_object, redis_object,
                                  extraction_indices):
        if self.db_info and self.db_info.get("mappings"):
            map_names = self.db_info.get("mappings", []) or []
            for map_name in map_names:
                # initializations
                mapping = dict()
                postings_object = Postings()
                # skip if entry for the map exists in redis
                map_value = redis_object.get(map_name)
                # in case the entry has not been populated before
                if not map_value:
                    # get mapping from DB
                    mapping = db_object["mappings"].find_one(
                        {"name": map_name})
                    mapping.pop("_id")
                    # store mapping in Redis
                    redis_object.set(map_name, json.dumps(mapping))
                else:
                    mapping = json.loads(map_value)
                entries = mapping.get("map")
                tokenized_entries = []
                fields_to_index = mapping.get("toIndex")
                # build postings
                for i, entry in enumerate(entries):
                    # use active entries
                    if entry.get("active"):
                        # merge all texts
                        stripped_text = utils.remove_non_alpha_num_chars(
                            " ".join(
                                filter(
                                    lambda x: bool(x),
                                    reduce(lambda x, y: x + y, [
                                        entry.get(field, []) or [] if type(
                                            entry.get(field, []) or []) == list
                                        else [str(entry[field])]
                                        for field in fields_to_index
                                    ], []))))[0]
                        # generate tokens
                        if stripped_text:
                            map(
                                lambda x: postings_object.
                                add_document_for_token(x, i),
                                set(utils.lemmatize_text(
                                    stripped_text.lower())))
                        if not map_value:
                            # construct tokens for all constituents of the entry and store in redis if not already there
                            tokenized_elements = map(
                                lambda x: sorted(
                                    utils.lemmatize_text(
                                        utils.remove_non_alpha_num_chars(x)[0])
                                ),
                                filter(
                                    lambda x: bool(x),
                                    reduce(lambda x, y: x + y, [
                                        entry.get(field, []) or [] if type(
                                            entry.get(field, []) or []) == list
                                        else [str(entry[field])]
                                        for field in fields_to_index
                                    ], [])))
                            tokenized_entries.append(tokenized_elements)
                    else:
                        if not map_value:
                            tokenized_entries.append(None)
                extraction_indices[map_name] = postings_object
                if not map_value:
                    # set tokenized mappings in redis if not already there
                    redis_object.set("tokenized" + map_name,
                                     json.dumps(tokenized_entries))

    def dump_graph(self):
        '''
        Dump the graph as a json file.
        TODO : Extend it to make it dump in a DB
        :return:
        '''
        pass

    def get_unknown_intent_node(self):
        '''
        returns the unknown_intent_node
        :return:
        '''
        return self.get_node(self.graph_utils["unknown_intent_node_name"])

    def get_next_node(self, node_name=None, node=None, data=None):
        '''
        Get the best node from a list of possible ones
        :param node_name:
        :param node:
        :param data:
        :return:
        '''
        resulting_nodes_with_confidence_values = self.get_child_confidence(
            node_name, node, data)
        resulting_node = self.get_node(
            intents.get_highest_probability_intent(
                resulting_nodes_with_confidence_values, self.graph_utils))
        # try to see if we can recommend other nodes
        if resulting_node.name == self.graph_utils['unknown_intent_node_name']:
            suggestions = intents.get_top_k_suggestions(data,
                                                        self.graph_utils,
                                                        self.search_postings,
                                                        k=3)
            if suggestions:
                # get the temp node
                resulting_node = self.get_node("suggestion")
                connections_to_be_stored = json.loads(
                    json.dumps(resulting_node.connections_json))
                # put each suggestion as a quick reply and a connection
                resulting_node.action[1]["suggestions"]["replies"] = []
                quick_replies = resulting_node.action[1]["suggestions"][
                    "replies"]
                for index, suggestion in enumerate(suggestions):
                    suggested_node = self.get_node_by_id(suggestion)
                    suggestion_json = {
                        "name":
                        suggested_node.suggested_response[0].get(
                            "payload", ""),
                        "matches": {
                            "or": [{
                                "fuzzyMessage":
                                suggested_node.suggested_response[0].get(
                                    "text", "")
                            }, {
                                "==": [{
                                    "var": "payload"
                                }, suggested_node.name]
                            }]
                        }
                    }
                    # If index is 0, the connection info needs to have a matching for the yes class
                    if index == 0:
                        suggestion_json["matches"]["or"].append(
                            {"class": "yes"})
                    # put a copy so that it can be stored in the context
                    connections_to_be_stored.append(
                        json.loads(json.dumps(suggestion_json)))
                    suggestion_json["node"] = suggested_node
                    quick_replies.append({
                        "content_type":
                        "text",
                        "title":
                        suggested_node.suggested_response[0].get("text", ""),
                        "payload":
                        suggested_node.suggested_response[0].get(
                            "payload", "")
                    })
                    resulting_node.connections.append(suggestion_json)
                # Store values in context for later retrieval.
                resulting_node.set_context_vars[
                    "suggestion_node_connections"] = connections_to_be_stored

        return resulting_node

    def get_child_confidence(self, node_name=None, node=None, data=None):
        '''
        Given the current node, and relevant data, computes the confidence values of child nodes and orphans
        :param node_name:
        :param node:
        :param data:
        :return:
        '''
        if not node and not node_name:
            return None
        # Initializations
        unknown_intent_node = self.get_unknown_intent_node()
        if not node:
            node = self.node_map[node_name]
        resulting_nodes_with_consequences = dict()

        # iterate through the current node's connections and orphan nodes and evaluate their possibility of being the
        # next node
        found = False
        for connection in node.connections + self.orphan_list:
            if not connection["node"].no_match_before or (
                    not found and connection["node"].no_match_before):
                matching_conditions = connection["matches"]
                result = self.json_logic(matching_conditions, data)
                if result is True:
                    result = 100
                    found = True
                elif result in [False, None]:
                    result = 0
                resulting_nodes_with_consequences[connection["name"]] = result

        return resulting_nodes_with_consequences

    def json_logic(self, tests, data=None):
        '''
        Evaluate JSON Logic. The matching conditions for a node and its connections are stored in JSON
        format. This evaluator takes in relevant data and evaluates a test against the data. The boolean
        output is returned.
        :param tests:
        :param data:
        :return:
        '''
        # You've recursed to a primitive, stop!
        if tests is None or type(tests) != dict:
            return tests

        data = data or {}

        op = tests.keys()[0]
        values = tests[op]
        operations = {
            "==": (lambda a, b: a == b),
            "===": (lambda a, b: a is b),
            "!=": (lambda a, b: a != b),
            "!==": (lambda a, b: a is not b),
            ">": (lambda a, b: a > b),
            ">=": (lambda a, b: a >= b),
            "<": (lambda a, b, c=None: a < b
                  if (c is None) else (a < b) and (b < c)),
            "<=": (lambda a, b, c=None: a <= b
                   if (c is None) else (a <= b) and (b <= c)),
            "!": (lambda a: not a),
            "%": (lambda a, b: a % b),
            "and":
            (lambda *args: reduce(lambda total, arg: total and arg, args, True)
             ),
            "or":
            (lambda *args: reduce(lambda total, arg: total or arg, args, False)
             ),
            "?:": (lambda a, b, c: b if a else c),
            "log": (lambda a: a if sys.stdout.write(str(a)) else a),
            "in": (lambda a, b: a in b if "__contains__" in dir(b) else False),
            "var": (lambda a, not_found=None: reduce(
                lambda data, key:
                (data.get(key, not_found)
                 if type(data) == dict else data[int(key)]
                 if type(data) in [list, tuple] else not_found),
                str(a).split("."), data)),
            "cat": (lambda *args: "".join(args)),
            "+": (lambda *args: reduce(lambda total, arg: total + float(arg),
                                       args, 0.0)),
            "*": (lambda *args: reduce(lambda total, arg: total * float(arg),
                                       args, 1.0)),
            "-": (lambda a, b=None: -a if b is None else a - b),
            "/": (lambda a, b=None: a if b is None else float(a) / float(b)),
            "min": (lambda *args: min(args)),
            "max": (lambda *args: max(args)),
            "function": (lambda *args: intents.execute_function(
                args[0], args[1], data.get('context', None))),
            "regex": (lambda x, y: re.match(x, y)),
            "fuzzy": (lambda x, y: utils.fuzzy_text_matcher(x, y)),
            "fuzzyMessage":
            (lambda x: utils.fuzzy_text_matcher(data.get('message', ""), x) >
             self.graph_utils["min_fuzzy_prob"]),
            "class": (lambda x: utils.class_check(data.get("message", ""), x,
                                                  self.graph_utils)),
            "bool": (lambda a: bool(a)),
            "extract": (lambda *x: utils.perform_extraction(x, data)),
            "count": (lambda *x: utils.length(x))
        }

        if op not in operations:
            raise RuntimeError("Unrecognized operation %s" % op)

        # Easy syntax for unary operators, like {"var": "x"} instead of strict
        # {"var": ["x"]}
        if type(values) not in [list, tuple]:
            values = [values]

        # Recursion!
        try:
            values = map(lambda val: self.json_logic(val, data), values)
        except RuntimeError:
            pass

        return operations[op](*values)
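
json_logic above dispatches on a single operator key and recurses into the operand values. A tiny self-contained evaluator with the same shape, supporting only a few of the operators, shows how a stored rule is checked against incoming data:

from functools import reduce

def tiny_json_logic(tests, data=None):
    # Same recursive shape as json_logic above, but with only a few operators.
    if not isinstance(tests, dict):
        return tests
    data = data or {}
    op, values = next(iter(tests.items()))
    if not isinstance(values, (list, tuple)):
        values = [values]
    values = [tiny_json_logic(value, data) for value in values]
    operations = {
        '==': lambda a, b: a == b,
        '>': lambda a, b: a > b,
        'and': lambda *args: all(args),
        'var': lambda a, default=None: reduce(
            lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
            str(a).split('.'), data),
    }
    return operations[op](*values)

rule = {'and': [{'>': [{'var': 'temp'}, 30]}, {'==': [{'var': 'city'}, 'SG']}]}
print(tiny_json_logic(rule, {'temp': 35, 'city': 'SG'}))   # True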
Example #23
 def build_extraction_postings(self, db_object, redis_object,
                               extraction_indices):
     if self.db_info and self.db_info.get("mappings"):
         map_names = self.db_info.get("mappings", []) or []
         for map_name in map_names:
             # initializations
             mapping = dict()
             postings_object = Postings()
             # skip if entry for the map exists in redis
             map_value = redis_object.get(map_name)
             # in case the entry has not been populated before
             if not map_value:
                 # get mapping from DB
                 mapping = db_object["mappings"].find_one(
                     {"name": map_name})
                 mapping.pop("_id")
                 # store mapping in Redis
                 redis_object.set(map_name, json.dumps(mapping))
             else:
                 mapping = json.loads(map_value)
             entries = mapping.get("map")
             tokenized_entries = []
             fields_to_index = mapping.get("toIndex")
             # build postings
             for i, entry in enumerate(entries):
                 # use active entries
                 if entry.get("active"):
                     # merge all texts
                     stripped_text = utils.remove_non_alpha_num_chars(
                         " ".join(
                             filter(
                                 lambda x: bool(x),
                                 reduce(lambda x, y: x + y, [
                                     entry.get(field, []) or [] if type(
                                         entry.get(field, []) or []) == list
                                     else [str(entry[field])]
                                     for field in fields_to_index
                                 ], []))))[0]
                     # generate tokens
                     if stripped_text:
                         map(
                             lambda x: postings_object.
                             add_document_for_token(x, i),
                             set(utils.lemmatize_text(
                                 stripped_text.lower())))
                     if not map_value:
                         # construct tokens for all constituents of the entry and store in redis if not already there
                         tokenized_elements = map(
                             lambda x: sorted(
                                 utils.lemmatize_text(
                                     utils.remove_non_alpha_num_chars(x)[0])
                             ),
                             filter(
                                 lambda x: bool(x),
                                 reduce(lambda x, y: x + y, [
                                     entry.get(field, []) or [] if type(
                                         entry.get(field, []) or []) == list
                                     else [str(entry[field])]
                                     for field in fields_to_index
                                 ], [])))
                         tokenized_entries.append(tokenized_elements)
                 else:
                     if not map_value:
                         tokenized_entries.append(None)
             extraction_indices[map_name] = postings_object
             if not map_value:
                 # set tokenized mappings in redis if not already there
                 redis_object.set("tokenized" + map_name,
                                  json.dumps(tokenized_entries))