Example #1
    def calculate_distance_documents(
            self, all_documents: List[ProcessedDocument],
            new_documents: List[ProcessedDocument]) -> List[DocumentDistance]:
        nbow = self._convert_documents_to_nbow(all_documents)
        amount_of_closest_documents = get_int_property(
            "distance", "amount_of_closest_documents_to_calculate")
        vocabulary_min = get_int_property("distance", "vocabulary_min")
        vocabulary_max = get_int_property("distance", "vocabulary_max")

        calculated_distances: List[DocumentDistance] = []
        documents_length = len(new_documents)
        finished_operations = 0

        start_time = time()
        for document in new_documents:
            # append the distances calculated for 'document' to 'calculated_distances'
            calculated_distances.extend(
                self._calculate_nearest_documents_to(
                    nbow, document.document_id, amount_of_closest_documents,
                    vocabulary_min, vocabulary_max))
            finished_operations += 1
            avg_calculation_time = (time() - start_time) / finished_operations
            time_left = (documents_length -
                         finished_operations) * avg_calculation_time
            self.LOG.debug(
                "calculated distance of {} out of {}, {}%. Average time per calculation: {}s, time left: {}"
                .format(
                    finished_operations, documents_length,
                    format(finished_operations / documents_length * 100,
                           ".2f"), avg_calculation_time,
                    str(datetime.timedelta(seconds=time_left))))
        self.LOG.debug("finished calculating {} document distances".format(
            len(calculated_distances)))
        return calculated_distances
Example #2
class Aggregator:
    _bm25_weight = get_int_property("weight", "bm25_weight")
    _distance_weight = get_int_property("weight", "distance_weight")

    def normalize_and_sort_search_results(
            self, search_results: List[BM25_Distance_Document]) -> List[Tuple[float, Raw_Document]]:
        normalized_search_results: List[Normalized_Document] = self._normalize_search_results(search_results)
        weighted_results: List[Tuple[float, Raw_Document]] = self._apply_weights_on_results(normalized_search_results)
        sorted_results = sorted(weighted_results, key=lambda item: item[0], reverse=True)
        return sorted_results

    def _sigmoid(self, value: float, midpoint: float, stretch: float) -> float:
        # atan-based squashing: maps any real value into (0, 1), centred on 'midpoint';
        # 'stretch' controls how steeply the curve rises around that midpoint
        return math.atan((value - midpoint) * stretch) * (math.pi * 0.1) + 0.5

    def _normalize_search_results(self, search_results: List[BM25_Distance_Document]) -> List[Normalized_Document]:
        normalized_search_results: List[Normalized_Document] = []
        for result in search_results:
            normalized_BM25_score = self._normalize_BM25_result(result.BM_25_score)
            normalized_distance: float = self._normalize_distance_results(result.distances_with_user_history)
            normalized_search_results.append(Normalized_Document(
                document=result.document,
                normalized_BM_25_score=normalized_BM25_score,
                normalized_distances_with_user_history=normalized_distance
            ))
        return normalized_search_results

    def _normalize_BM25_result(self, BM25_score: float) -> float:
        midpoint = get_float_property("normalization", "bm25_midpoint")
        stretch = get_float_property("normalization", "bm25_stretch")
        return self._sigmoid(BM25_score, midpoint, stretch)

    def _normalize_distance_results(self, distances: List[float]) -> float:
        if len(distances) == 0:
            return 0.0
        midpoint = get_float_property("normalization", "distance_midpoint")
        stretch = get_float_property("normalization", "distance_stretch")
        total_distance = 0.0
        for distance in distances:
            total_distance += self._sigmoid(distance, midpoint, stretch)
        return total_distance / len(distances)

    def _apply_weights_on_results(self, normalized_search_results: List[Normalized_Document]) -> List[
            Tuple[float, Raw_Document]]:
        weighted_results: List[Tuple[float, Raw_Document]] = []
        for search_result in normalized_search_results:
            total = 0.0
            total += search_result.normalized_BM_25_score * self._bm25_weight
            total += search_result.normalized_distances_with_user_history * self._distance_weight
            weighted_results.append((total, search_result.document))
        return weighted_results
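
For reference, the final ranking score produced by Aggregator is a weighted sum of the sigmoid-normalized BM25 score and the mean sigmoid-normalized distance to the user's reading history. The minimal sketch below reproduces that computation outside the class; the midpoint, stretch and weight values are made-up placeholders, not the project's real configuration.

import math

def sigmoid(value: float, midpoint: float, stretch: float) -> float:
    # same atan-based squashing as Aggregator._sigmoid
    return math.atan((value - midpoint) * stretch) * (math.pi * 0.1) + 0.5

# placeholder values, not the real configuration
bm25_midpoint, bm25_stretch = 10.0, 0.5
distance_midpoint, distance_stretch = 1.0, 2.0
bm25_weight, distance_weight = 2, 1

bm25_score = 10.0        # raw BM25 score of one document
distances = [0.8, 1.2]   # distances between this document and the user's history

normalized_bm25 = sigmoid(bm25_score, bm25_midpoint, bm25_stretch)  # 0.5, since score == midpoint
normalized_distance = sum(
    sigmoid(d, distance_midpoint, distance_stretch) for d in distances) / len(distances)

total = normalized_bm25 * bm25_weight + normalized_distance * distance_weight  # higher ranks earlier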
Example #3
    def test_get_int_property_success(self, mocker):
        # Setup
        raw_value = "3"
        expected = 3
        read_property_mock = mocker.patch(
            "helper.ConfigReader._read_property",
            return_value=raw_value
        )
        # Run
        actual = get_int_property("test", "parameters")
        # Check
        assert actual == expected
        read_property_mock.assert_called_once_with("test", "parameters")
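
The test above only works if get_int_property delegates the raw read to helper.ConfigReader._read_property and casts the returned string to int. A minimal sketch of such a helper, assuming a configparser-backed reader and a made-up config file name (the real implementation may differ), could look like this:

import configparser

class ConfigReader:
    _config = configparser.ConfigParser()
    _config.read("config.ini")  # assumed file name

    @classmethod
    def _read_property(cls, section: str, key: str) -> str:
        # return the raw string stored under [section] key
        return cls._config[section][key]

def get_int_property(section: str, key: str) -> int:
    # cast the raw string to int, matching the "3" -> 3 expectation in the test
    return int(ConfigReader._read_property(section, key))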
Example #4
def start_data_loading():
    repeat_interval: int = get_int_property("datasource",
                                            "automatic_check_interval")

    if get_bool_property("datasource", "automatic_check"):
        start_time = time()
        while True:
            _load_and_process_data()
            time_to_next_iteration = (start_time + repeat_interval) - time()
            if time_to_next_iteration > 0:
                sleep(time_to_next_iteration)
            start_time = time()
    else:
        _load_and_process_data()
Example #5
    def search_query(self, query: str) -> List[BM25_graded_document]:
        amount_to_return = get_int_property("search", "number_of_results")
        inclusion_threshold: float = get_float_property("search", "threshold")

        preprocessed_query = preprocess_string(query)
        documents_above_threshold: List[BM25_graded_document] = []
        scores = self._BM25_model.get_scores(
            preprocessed_query.lower().split())
        for i, score in enumerate(scores):
            if score >= inclusion_threshold:
                corresponding_document = self._indexed_documents[i]
                corresponding_document_raw = Raw_Document(
                    document_id=corresponding_document.document_id,
                    body_raw=corresponding_document.body_raw,
                    title_raw=corresponding_document.title_raw,
                    space=corresponding_document.space,
                )
                documents_above_threshold.append(
                    BM25_graded_document(document=corresponding_document_raw,
                                         BM_25_score=score))
        return sorted(documents_above_threshold,
                      key=lambda doc: doc.BM_25_score,
                      reverse=True)[:amount_to_return]
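
search_query relies on self._BM25_model exposing a get_scores method that returns one score per indexed document. If the model comes from the rank_bm25 package (an assumption; the snippet does not show how the model is built), indexing and scoring would look roughly like this:

from rank_bm25 import BM25Okapi

# toy corpus standing in for self._indexed_documents: one token list per document
tokenized_corpus = [
    "how to configure the search index".split(),
    "loading documents from the datasource".split(),
    "normalizing bm25 and distance scores".split(),
]
bm25_model = BM25Okapi(tokenized_corpus)

# get_scores returns one score per indexed document, in corpus order
scores = bm25_model.get_scores("search index configuration".lower().split())
print(scores)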
Example #6
def _setup_fast_api():
    ip = get_string_property("api", "ip")
    port = get_int_property("api", "port")
    uvicorn.run(app, host=ip, port=port)