Esempio n. 1
0
class ClastersSearcher(object):
    """Searches signs inside a rectangular map region and groups them into clusters.

    NOTE(review): the name looks like a misspelling of "ClustersSearcher";
    kept as-is because external callers reference this class.
    """

    def __init__(self, aerospike_connector):
        # SearchIndex provides the spatial queries over Aerospike.
        self._search_index = SearchIndex(aerospike_connector)

    def search(self, params):
        """Return clusters of signs inside the span rectangle around the center.

        `params` supplies: user_id, center/span latitude and longitude,
        screen_width/screen_height, grid_size and signs_sample_size.
        """
        # Bounding box of the visible region around the center point.
        lat1 = params.center_latitude - params.span_latitude
        lon1 = params.center_longitude - params.span_longitude
        lat2 = params.center_latitude + params.span_latitude
        lon2 = params.center_longitude + params.span_longitude

        # Grid-cell size: full visible span (2 * half-span) scaled by
        # grid_size pixels over the screen dimension.
        gridsize_lat = params.span_latitude * 2 / params.screen_height * params.grid_size
        gridsize_lon = params.span_longitude * 2 / params.screen_width * params.grid_size

        # Default sample size when the caller did not specify one.
        n_signs = params.signs_sample_size
        if n_signs is None:
            n_signs = 100

        # Builder accumulates signs and groups them into grid cells.
        builder = ClustersBuilder(gridsize_lat, gridsize_lon, 100, n_signs)

        # Stream matching records through the processor callback.
        self._search_index.search_region(params.user_id,
                                         lat1, lon1,
                                         lat2, lon2,
                                         self._records_processor(params, builder))
        return builder.clusters

    def _records_processor(self, params, builder):
        """Return a callback that feeds each raw index record into *builder*."""
        def _impl(record):
            sign_id = int(record['sign_id'])
            # Coordinates appear GeoJSON-ordered: [longitude, latitude] —
            # hence location[1] is latitude, location[0] is longitude.
            location = record['location'].unwrap().get('coordinates')
            builder.put_sign(sign_id, location[1], location[0])

        return _impl
Esempio n. 2
0
def main():
    """Run every query from *query_file* through the search index and write
    one result line per query to *output_file*.

    Relies on module-level names: SearchIndex, dictionary_file,
    postings_file, query_file, output_file.
    """
    search = SearchIndex(dictionary_file, postings_file)
    # Single `with` scopes both files together; iterate the query file
    # lazily instead of materializing all lines with readlines().
    with open(query_file, 'r') as fquery, open(output_file, 'w') as foutput:
        for query in fquery:
            result = search.search(query)
            foutput.write(result + '\n')
Esempio n. 3
0
def main():
    """Entry point: index every file under *dir_to_index*, then make a second
    pass adding per-document log-tf weights to the postings, and finally
    write the dictionary and postings files.

    Relies on module-level names: dir_to_index, dictionary, postings,
    get_each_file_term_frequency, index_content, SearchIndex, create_files.
    """
    # Directory entries are numeric document IDs; sort numerically, not lexically.
    doc_names = sorted(os.listdir(dir_to_index), key=int)
    for name in doc_names:
        filepath = os.path.join(dir_to_index, name)
        with open(filepath, 'r') as f:
            # Collapse the file into one whitespace-joined string of stripped lines.
            content = " ".join(line.strip() for line in f)
            term_freq = get_each_file_term_frequency(content, name)
            index_content(term_freq, name)

    # Second pass: append the log-tf weight (doc[1] is the raw tf) to each
    # posting entry. .values()/.items() keeps this working on Py2 and Py3,
    # unlike the original .iteritems(); the dict key itself is unused.
    for pointer in dictionary.values():
        for doc in postings[pointer]:
            doc.append(SearchIndex.cal_log_tfs(doc[1]))

    # pprint(postings)
    create_files(len(doc_names))
Esempio n. 4
0
 def __init__(self, aerospike_connector):
     """Wrap the given Aerospike connector in a SearchIndex for later queries."""
     self._search_index = SearchIndex(aerospike_connector)
Esempio n. 5
0
        if len(tuples) > MAX_CANDIDATE_TUPLES:
          tuples = set(random.sample(tuples, MAX_CANDIDATE_TUPLES))
        sources = extract_dimension_from_tuples_as_list(tuples, 0)
        relations = extract_dimension_from_tuples_as_list(tuples, 1)
        targets = extract_dimension_from_tuples_as_list(tuples, 2)
        output_row = {
          'question': question,
          'qn_entities': get_str_of_seq(qn_entities),
          'ans_entities': get_str_of_seq(ans_entities),
          'sources': get_str_of_seq(sources),
          'relations': get_str_of_seq(relations),
          'targets': get_str_of_seq(targets)
        }
        writer.writerow(output_row)


if __name__ == "__main__":
  # Command-line entry point: parse arguments for the QA-example
  # preprocessing pipeline, then build the shared lookup structures.
  parser = argparse.ArgumentParser(description='Specify arguments')
  parser.add_argument('--input_examples', help='the raw qa pairs', required=True)
  parser.add_argument('--input_graph', help='the graph file', required=True)
  parser.add_argument('--input_doc', help='the doc file', required=False)
  parser.add_argument('--stopwords', help='stopwords file', required=False)
  parser.add_argument('--output_examples', help='the processed output file', required=True)
  args = parser.parse_args()

  # Global variables consumed by main() and the helper functions above.
  knowledge_base = KnowledgeGraph(args.input_graph, unidirectional=False)
  search_index = SearchIndex(args.input_doc, args.stopwords)
  stop_vocab = read_file_as_dict(args.stopwords)
  question_parser = QuestionParser(knowledge_base.get_entities(), stop_vocab)
  main(args)
Esempio n. 6
0
class Searcher(object):
    """Searches signs near a location, by plain proximity or by feature vector."""

    def __init__(self, aerospike_connector):
        # SearchIndex provides the spatial/feature queries over Aerospike.
        self._search_index = SearchIndex(aerospike_connector)

    def search(self, params):
        """Dispatch to nearest-neighbour or feature search, then build the result."""
        # An absent or empty feature vector means a plain proximity search.
        if not params.features:
            signs = self._search_nearest(params)
        else:
            signs = self._search_by_fea(params)

        return self._make_result(params, signs)

    def _records_processor(self, params, heap):
        """Return a callback converting raw index records into SearchEntry items."""
        def _impl(record):
            entry = SearchEntry()
            entry.sign_id = int(record['sign_id'])
            entry.rank = record.get('rank')
            # Drop records below the minimum rank threshold.
            if entry.rank is not None and entry.rank < params.min_rank:
                return
            # Legacy records carry no rank; treat them as rank 0.
            if entry.rank is None:
                entry.rank = 0

            # Coordinates appear GeoJSON-ordered: [longitude, latitude].
            location = record['location'].unwrap().get('coordinates')
            entry.distance = geo.distance(params.latitude, params.longitude,
                                          location[1], location[0])
            heap.push(entry)
        return _impl

    def _search_nearest(self, params):
        """Return up to max_n signs ordered by increasing distance."""
        # Heap keeps the max_n closest entries (key negated so the farthest
        # entry is evicted first).
        heap = SearchResultsHeap(params.max_n, lambda entry: -entry.distance)
        self._search_index.search_nearest(params.user_id,
                                          params.latitude,
                                          params.longitude,
                                          params.radius,
                                          self._records_processor(params, heap))
        # sorted() accepts both a list (Py2) and a dict view (Py3), unlike
        # calling .sort() on the result of .values().
        return sorted(heap.results.values(), key=lambda entry: entry.distance)

    def _search_by_fea(self, params):
        """Return up to max_n signs matched by feature vector.

        NOTE: normalizes params.features in place, matching the original behavior.
        """
        heap = SearchResultsHeap(params.max_n, lambda entry: entry.rank)
        # L2-normalize the query feature vector.
        norm = math.sqrt(sum(fea * fea for fea in params.features))
        if norm > 0:  # guard: an all-zero vector would otherwise divide by zero
            for i in range(len(params.features)):
                params.features[i] = params.features[i] / norm
        self._search_index.search_by_fea(params.user_id,
                                         params.latitude,
                                         params.longitude,
                                         params.radius,
                                         params.features,
                                         self._records_processor(params, heap))
        # Sort by rank (descending) unless an explicit distance sort was requested.
        if params.sort_by in (SearchParams.SORT_BY_RANK,
                              SearchParams.SORT_BY_UNKNOWN):
            return sorted(heap.results.values(),
                          key=lambda entry: entry.rank, reverse=True)
        return sorted(heap.results.values(), key=lambda entry: entry.distance)

    def _make_result(self, params, signs):
        """Return (signs_list, debug); debug is None unless params.debug is set."""
        signs_list = [{'sign_id': sign.sign_id, 'distance': sign.distance}
                      for sign in signs]

        debug = None
        if params.debug:
            # Expose per-sign rank weights for diagnostics.
            weights = [{'sign_id': sign.sign_id, 'weight': sign.rank}
                       for sign in signs]
            debug = {'weights': weights}
        return signs_list, debug