Example #1
class TSearchEngine(object):
    SEGMENT_SIZE = 64
    # maximum amount to store in the index; the rest is in the Bloom filter
    # optional remote search server, used only by the commented-out urllib2 path in search()
    #SERVER_LOCATION = "http://5.9.104.49:8081/?q="
    #SERVER_LOCATION = "http://127.0.0.1:8080/?q="
    
    def __init__(self, index_location="./"):
        self.parsers = TParsersBundle()
        #self.word_index = TWordIndexReader(index_location)
        self.segment_index = TSegmentIndexReader(index_location)
        self.search_index = TSearchIndex(index_location)
    
    """ return list of TSearchEngineResult sorted by weights (high weight first) """
    def search(self, query, filter_objects=None, query_tokens=[], first_object2return=0, objects2return=10):
        if not query_tokens:
            # Tokenize the raw query with the parser bundle.
            if isinstance(query, unicode):
                query = query.encode("utf8")
            query_matches = self.parsers.parse_buffer(query)
            query_tokens = [match.token for match in query_matches]
        # Optional restriction to a set of object ids, encoded as a "<>"-separated string.
        objects_str = ""
        if filter_objects:
            objects_str = "<>".join(str(object_id) for object_id in filter_objects)
        as_text = self.search_index.ExecuteQuery("<>".join(query_tokens), first_object2return, objects2return, objects_str)
        """
            import urllib2
            objects_suffix =""
            if  (filter_objects != None) and (len(filter_objects) > 0):
                objects_suffix = "&o=" + ",".join(str(object_id) for object_id in filter_objects)
            start_len_suffix = "&start=" + str(first_object2return) + "&len=" + str(objects2return)
            as_text = urllib2.urlopen(TSearchEngine.SERVER_LOCATION + ",".join(query_tokens) + objects_suffix + start_len_suffix).read()
        """
        # The reply is a "<:::>"-separated list of chunks: the first chunk is the
        # total hit count, each remaining chunk describes one matched object as
        # "<object_id>:<relevance>:<matches_count>||<segment match>}<segment match>}...".
        results = []
        chunks = [chunk.strip() for chunk in as_text.split("<:::>") if chunk.strip()]
        total_results_count = int(chunks[0])
        for object_chunk in chunks[1:]:
            object_data, matches = object_chunk.split("||")
            object_id, object_relevance, matches_count = object_data.split(":")
            object_id, object_relevance, matches_count = int(object_id), float(object_relevance), int(matches_count)
            segment_matches = []
            for match_str in matches.split("}"):
                if not match_str.strip():
                    continue
                # Each segment match looks like
                # "<segment_id>:<span_len>:<relevance>|<pos_in_segment>,<occ_weight>,<word_id>;...".
                match_weight_and_segment_id, occurrences = match_str.split("|")
                segment_id, span_len, relevance = match_weight_and_segment_id.split(":")
                segment_id, span_len, relevance = int(segment_id), int(span_len), float(relevance)
                words2select = []
                for occurrence_str in occurrences.split(";"):
                    if "," not in occurrence_str:
                        continue
                    pos_in_segment, occ_weight, word_id = occurrence_str.split(",")
                    word_id, pos_in_segment, occ_weight = int(word_id), int(pos_in_segment), int(occ_weight)
                    # Map the word id back to the query token it matched.
                    token = query_tokens[word_id]
                    words2select += [(token, pos_in_segment)]
                segment_matches.append(TSegmentMatch(segment_id, (span_len, relevance), words2select))
            result = TSearchEngineResult(object_id, object_relevance, matches_count)
            result.segment_matches = segment_matches
            results.append(result)
        
        return results, total_results_count
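
A minimal usage sketch, not taken from the original source: it assumes TSearchEngine's dependencies are importable and that an index has already been built; the "./index/" path and the query string are illustrative only.

# Illustrative usage of TSearchEngine.search() under the assumptions above.
engine = TSearchEngine(index_location="./index/")

# Hypothetical query; filter_objects is left as None, so no object-id filter is applied.
results, total_results_count = engine.search("weather in moscow",
                                             first_object2return=0,
                                             objects2return=10)

print("total hits: %d" % total_results_count)
for result in results:
    # Each result is a TSearchEngineResult; search() attaches the decoded
    # per-segment matches to it as result.segment_matches.
    print("object with %d segment matches" % len(result.segment_matches))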
Example #2
    def __init__(self, index_location="./"):
        self.parsers = TParsersBundle()
        #self.word_index = TWordIndexReader(index_location)
        self.segment_index = TSegmentIndexReader(index_location)
        self.search_index = TSearchIndex(index_location)