class ClustersSearcher(object):
    def __init__(self, aerospike_connector):
        self._search_index = SearchIndex(aerospike_connector)

    def search(self, params):
        # Prepare the bounding box and the cluster grid cell size
        lat1 = params.center_latitude - params.span_latitude
        lon1 = params.center_longitude - params.span_longitude
        lat2 = params.center_latitude + params.span_latitude
        lon2 = params.center_longitude + params.span_longitude
        gridsize_lat = params.span_latitude * 2 / params.screen_height * params.grid_size
        gridsize_lon = params.span_longitude * 2 / params.screen_width * params.grid_size

        n_signs = params.signs_sample_size
        if n_signs is None:
            n_signs = 100

        # Builder to accumulate and group signs
        builder = ClustersBuilder(gridsize_lat, gridsize_lon, 100, n_signs)

        # Retrieve and process signs
        self._search_index.search_region(params.user_id,
                                         lat1, lon1,
                                         lat2, lon2,
                                         self._records_processor(params, builder))
        return builder.clusters

    def _records_processor(self, params, builder):
        # Callback invoked for each record found in the region
        def _impl(record):
            sign_id = int(record['sign_id'])
            location = record['location'].unwrap().get('coordinates')
            # GeoJSON stores coordinates as [longitude, latitude]
            builder.put_sign(sign_id, location[1], location[0])
        return _impl
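# A minimal usage sketch for ClustersSearcher.search. The params object, its
# field values, and make_aerospike_connector() are hypothetical stand-ins; the
# real request object and connector come from the surrounding service.
from types import SimpleNamespace

params = SimpleNamespace(
    user_id=42,                 # hypothetical user
    center_latitude=55.75,      # viewport centre (example coordinates)
    center_longitude=37.62,
    span_latitude=0.05,         # half of the visible span, in degrees
    span_longitude=0.08,
    screen_width=768,           # viewport size in pixels
    screen_height=1024,
    grid_size=64,               # cluster cell size in pixels
    signs_sample_size=None,     # None falls back to the default of 100
)

searcher = ClustersSearcher(make_aerospike_connector())  # hypothetical factory
clusters = searcher.search(params)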
def main():
    # Run every query from query_file against the index and write one
    # result line per query to output_file.
    search = SearchIndex(dictionary_file, postings_file)
    with open(query_file, 'r') as fquery, open(output_file, 'w') as foutput:
        for query in fquery:
            result = search.search(query)
            foutput.write(result + '\n')
def main(): """ This is the point of entry. Does initialization, retrieve files' content, do indexing, generation diction and postings_files. Make another pass to calculate doc weights (tf * 1) """ data = sorted(os.listdir(dir_to_index), key=int) for d in data: filepath = os.path.join(dir_to_index, d) with open(filepath, 'r') as f: content = " ".join(map(lambda x: x.strip(), f.readlines())) term_freq = get_each_file_term_frequency(content, d) index_content(term_freq, d) # make another pass to calculate weights for word, pointer in dictionary.iteritems(): for doc in postings[pointer]: doc.append(SearchIndex.cal_log_tfs(doc[1])) # pprint(postings) create_files(len(data))
import argparse
import random

if len(tuples) > MAX_CANDIDATE_TUPLES:
    # random.sample needs a sequence in Python 3.11+, so convert the set first
    tuples = set(random.sample(list(tuples), MAX_CANDIDATE_TUPLES))
sources = extract_dimension_from_tuples_as_list(tuples, 0)
relations = extract_dimension_from_tuples_as_list(tuples, 1)
targets = extract_dimension_from_tuples_as_list(tuples, 2)
output_row = {
    'question': question,
    'qn_entities': get_str_of_seq(qn_entities),
    'ans_entities': get_str_of_seq(ans_entities),
    'sources': get_str_of_seq(sources),
    'relations': get_str_of_seq(relations),
    'targets': get_str_of_seq(targets)
}
writer.writerow(output_row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Specify arguments')
    parser.add_argument('--input_examples', help='the raw qa pairs', required=True)
    parser.add_argument('--input_graph', help='the graph file', required=True)
    parser.add_argument('--input_doc', help='the doc file', required=False)
    parser.add_argument('--stopwords', help='stopwords file', required=False)
    parser.add_argument('--output_examples', help='the processed output file', required=True)
    args = parser.parse_args()

    # Global variables shared with main()
    knowledge_base = KnowledgeGraph(args.input_graph, unidirectional=False)
    search_index = SearchIndex(args.input_doc, args.stopwords)
    stop_vocab = read_file_as_dict(args.stopwords)
    question_parser = QuestionParser(knowledge_base.get_entities(), stop_vocab)
    main(args)
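# extract_dimension_from_tuples_as_list and get_str_of_seq are used above but
# defined elsewhere. Plausible minimal versions, for illustration only (the
# join delimiter is a guess):
def extract_dimension_from_tuples_as_list(tuples, dim):
    # Pick the dim-th element of every (source, relation, target) tuple
    return [t[dim] for t in tuples]

def get_str_of_seq(seq):
    # Flatten a sequence into one delimiter-separated string for the CSV row
    return '|'.join(str(x) for x in seq)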
import math


class Searcher(object):
    def __init__(self, aerospike_connector):
        self._search_index = SearchIndex(aerospike_connector)

    def search(self, params):
        if params.features is None or len(params.features) == 0:
            signs = self._search_nearest(params)
        else:
            signs = self._search_by_fea(params)
        return self._make_result(params, signs)

    def _records_processor(self, params, heap):
        # Callback invoked for each record; filters by rank and pushes onto the heap
        def _impl(record):
            entry = SearchEntry()
            entry.sign_id = int(record['sign_id'])
            entry.rank = record.get('rank')

            # Min rank threshold
            if entry.rank is not None and entry.rank < params.min_rank:
                return

            # Legacy records carry no rank
            if entry.rank is None:
                entry.rank = 0

            location = record['location'].unwrap().get('coordinates')
            entry.distance = geo.distance(params.latitude, params.longitude,
                                          location[1], location[0])
            heap.push(entry)
        return _impl

    def _search_nearest(self, params):
        heap = SearchResultsHeap(params.max_n, lambda entry: -entry.distance)

        # Retrieve records
        self._search_index.search_nearest(params.user_id,
                                          params.latitude,
                                          params.longitude,
                                          params.radius,
                                          self._records_processor(params, heap))

        # Resort nearest-first
        return sorted(heap.results.values(), key=lambda entry: entry.distance)

    def _search_by_fea(self, params):
        heap = SearchResultsHeap(params.max_n, lambda entry: entry.rank)

        # Normalize the features vector to unit length (L2 norm)
        qsum = math.sqrt(sum(fea * fea for fea in params.features))
        for i in range(len(params.features)):
            params.features[i] = params.features[i] / qsum

        # Retrieve records
        self._search_index.search_by_fea(params.user_id,
                                         params.latitude,
                                         params.longitude,
                                         params.radius,
                                         params.features,
                                         self._records_processor(params, heap))

        # Resort by rank or by distance, depending on the requested order
        if params.sort_by in (SearchParams.SORT_BY_RANK, SearchParams.SORT_BY_UNKNOWN):
            return sorted(heap.results.values(), key=lambda entry: entry.rank, reverse=True)
        return sorted(heap.results.values(), key=lambda entry: entry.distance)

    def _make_result(self, params, signs):
        signs_list = [{'sign_id': sign.sign_id, 'distance': sign.distance}
                      for sign in signs]

        debug = None
        if params.debug:
            weights = [{'sign_id': sign.sign_id, 'weight': sign.rank}
                       for sign in signs]
            debug = {'weights': weights}

        return signs_list, debug
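# A minimal usage sketch for Searcher.search. SearchParams fields and values
# are hypothetical; an empty features list takes the nearest-neighbour path,
# a non-empty one takes the feature-vector path with rank ordering.
from types import SimpleNamespace

params = SimpleNamespace(
    user_id=42,
    latitude=55.75,            # example query point
    longitude=37.62,
    radius=1000,               # search radius; units depend on geo.distance
    max_n=20,                  # heap capacity, i.e. the result limit
    min_rank=0.1,              # records ranked below this are dropped
    features=None,             # None -> _search_nearest branch
    sort_by=None,
    debug=False,
)

searcher = Searcher(make_aerospike_connector())  # hypothetical factory
signs, debug = searcher.search(params)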