def calculate_distance_documents(
        self, all_documents: List[ProcessedDocument],
        new_documents: List[ProcessedDocument]) -> List[DocumentDistance]:
    """Calculate distances from every new document to its nearest documents.

    Converts the full corpus to an nBOW representation once, then finds the
    configured number of closest documents for each new document, logging
    running progress and an ETA after every document.

    :param all_documents: the whole corpus to search within.
    :param new_documents: documents whose nearest neighbours are computed.
    :return: the concatenated distance results for all new documents.
    """
    nbow = self._convert_documents_to_nbow(all_documents)
    amount_of_closest_documents = get_int_property(
        "distance", "amount_of_closest_documents_to_calculate")
    vocabulary_min = get_int_property("distance", "vocabulary_min")
    vocabulary_max = get_int_property("distance", "vocabulary_max")

    calculated_distances: List[DocumentDistance] = []
    documents_length = len(new_documents)
    finished_operations = 0
    start_time = time()
    for document in new_documents:
        # add newly calculated distances to 'document' to the end of
        # 'calculated_distances'
        calculated_distances.extend(
            self._calculate_nearest_documents_to(
                nbow, document.document_id, amount_of_closest_documents,
                vocabulary_min, vocabulary_max))
        finished_operations += 1
        # running average over all documents processed so far drives the ETA
        avg_calculation_time = (time() - start_time) / finished_operations
        time_left = (documents_length - finished_operations) * avg_calculation_time
        self.LOG.debug(
            "calculated distance of {} out of {}, {}%. Average time per calculation: {}s, time left: {}"
            .format(
                finished_operations, documents_length,
                format(finished_operations / documents_length * 100, ".2f"),
                avg_calculation_time,
                str(datetime.timedelta(seconds=time_left))))
    # NOTE: removed leftover debug print of the entire result list; progress
    # is already reported through self.LOG.debug above.
    return calculated_distances
class Aggregator:
    """Combines the BM25 relevance score and the user-history distance score
    of each search result into a single weighted score and sorts by it."""

    # Relative weights of the two score components, read once from the
    # configuration at class-definition time.
    _bm25_weight = get_int_property("weight", "bm25_weight")
    _distance_weight = get_int_property("weight", "distance_weight")

    def normalize_and_sort_search_results(self, search_results: List[Raw_Document]):
        """Normalize both score components, apply the configured weights and
        return the results sorted by combined score, best first.

        NOTE(review): the annotation says List[Raw_Document] but the elements
        are passed to _normalize_search_results, which expects
        BM25_Distance_Document — confirm which type callers actually pass.
        """
        normalized_search_results: List[Normalized_Document] = \
            self._normalize_search_results(search_results)
        weighted_results: List[Tuple[float, Raw_Document]] = \
            self._apply_weights_on_results(normalized_search_results)
        # sorted() already returns a new list; no extra copy needed
        return sorted(weighted_results, key=lambda item: item[0], reverse=True)

    def _sigmoid(self, value: float, midpoint: float, stretch: float) -> float:
        """Arctan-based squashing of *value* into roughly (0, 1), centred on
        *midpoint*; *stretch* controls the steepness around the midpoint.

        NOTE(review): the scale factor math.pi * 0.1 (~0.3142) is close to but
        not exactly 1/pi (~0.3183), so the output range is approximately, not
        exactly, (0, 1) — confirm whether 1/pi was intended.
        """
        return math.atan(((value - midpoint) * stretch)) * (math.pi * 0.1) + 0.5

    def _normalize_search_results(
            self,
            search_results: List[BM25_Distance_Document]) -> List[Normalized_Document]:
        """Squash the BM25 score and the user-history distances of every
        result into normalized values."""
        return [
            Normalized_Document(
                document=result.document,
                normalized_BM_25_score=self._normalize_BM25_result(result.BM_25_score),
                normalized_distances_with_user_history=self._normalize_distance_results(
                    result.distances_with_user_history),
            )
            for result in search_results
        ]

    def _normalize_BM25_result(self, BM25_score: float) -> float:
        """Squash a raw BM25 score using the configured midpoint/stretch."""
        midpoint = get_float_property("normalization", "bm25_midpoint")
        stretch = get_float_property("normalization", "bm25_stretch")
        return self._sigmoid(BM25_score, midpoint, stretch)

    def _normalize_distance_results(self, distances: List[float]) -> float:
        """Return the mean of the squashed distances; 0.0 when the user has
        no history (empty list), so the distance component is neutral."""
        if not distances:
            return 0.0
        midpoint = get_float_property("normalization", "distance_midpoint")
        stretch = get_float_property("normalization", "distance_stretch")
        return sum(self._sigmoid(distance, midpoint, stretch)
                   for distance in distances) / len(distances)

    def _apply_weights_on_results(
            self, normalized_search_results: List[Normalized_Document]
    ) -> List[Tuple[float, Raw_Document]]:
        """Combine the two normalized scores of each result into one weighted
        total, paired with the underlying document."""
        return [
            (search_result.normalized_BM_25_score * self._bm25_weight
             + search_result.normalized_distances_with_user_history * self._distance_weight,
             search_result.document)
            for search_result in normalized_search_results
        ]
def test_get_int_property_success(self, mocker):
    """get_int_property reads the raw string property and converts it to int."""
    # Setup — 'raw_value' instead of 'input', which shadows the builtin
    raw_value = "3"
    expected = 3
    read_property_mock = mocker.patch(
        "helper.ConfigReader._read_property",
        return_value=raw_value)

    # Run
    actual = get_int_property("test", "parameters")

    # Check
    assert actual == expected
    read_property_mock.assert_called_once_with("test", "parameters")
def start_data_loading():
    """Load and process data once, or repeatedly on a fixed schedule.

    When the 'automatic_check' flag is set, runs forever: after each load it
    sleeps just long enough so that iterations start roughly
    'automatic_check_interval' seconds apart. Otherwise it loads exactly once.
    """
    repeat_interval: int = get_int_property("datasource", "automatic_check_interval")
    if not get_bool_property("datasource", "automatic_check"):
        # one-shot mode
        _load_and_process_data()
        return
    start_time = time()
    while True:
        _load_and_process_data()
        # sleep only for the remainder of the interval; skip the sleep
        # entirely if processing already took longer than the interval
        remaining = (start_time + repeat_interval) - time()
        if remaining > 0:
            sleep(remaining)
        start_time = time()
def search_query(self, query: str) -> List[BM25_graded_document]:
    """Score the indexed documents against *query* with BM25 and return the
    top results above the configured threshold.

    :param query: raw user query; it is preprocessed, lowercased and split
        into tokens before scoring.
    :return: at most 'number_of_results' graded documents whose score is at
        least 'threshold', sorted by score descending.
    """
    amount_to_return = get_int_property("search", "number_of_results")
    inclusion_threshold: float = get_float_property("search", "threshold")
    preprocessed_query = preprocess_string(query)
    # NOTE: removed leftover debug print of the preprocessed query.
    documents_above_threshold: List[BM25_graded_document] = []
    scores = self._BM25_model.get_scores(
        preprocessed_query.lower().split())
    for i, score in enumerate(scores):
        if score < inclusion_threshold:
            continue
        # scores are positional: index i maps to the i-th indexed document
        corresponding_document = self._indexed_documents[i]
        corresponding_document_raw = Raw_Document(
            document_id=corresponding_document.document_id,
            body_raw=corresponding_document.body_raw,
            title_raw=corresponding_document.title_raw,
            space=corresponding_document.space,
        )
        documents_above_threshold.append(
            BM25_graded_document(document=corresponding_document_raw,
                                 BM_25_score=score))
    # in-place sort avoids copying the list before slicing the top results
    documents_above_threshold.sort(key=lambda doc: doc.BM_25_score, reverse=True)
    return documents_above_threshold[:amount_to_return]
def _setup_fast_api():
    """Start the FastAPI application with uvicorn on the configured endpoint."""
    host_address = get_string_property("api", "ip")
    host_port = get_int_property("api", "port")
    uvicorn.run(app, host=host_address, port=host_port)