コード例 #1
0
def test_item_distance_computer(context):
    """Check ``ItemDistanceComputer.compute_nearest`` on the fixture data.

    The user profiles supplied by ``context`` are renamed to the query column
    names (``query_id`` / ``items``) and matched against the fixture
    contracts. The test pins the size of the result and the single match
    stored under key 13 / ``'automobily'``.
    """
    contracts = context.get_contracts_data()
    profiles = context.get_user_profiles_data()
    column_map = {'user_id': 'query_id', 'interest_items': 'items'}
    queries = profiles.rename(columns=column_map)

    nearest = ItemDistanceComputer(contracts).compute_nearest(queries)

    # Top level: a dict with four entries, one of them under key 13.
    assert isinstance(nearest, dict)
    assert 4 == len(nearest)
    assert 13 in nearest

    # Second level: a dict holding exactly the 'automobily' entry.
    per_query = nearest[13]
    assert isinstance(per_query, dict)
    assert 1 == len(per_query)
    assert 'automobily' in per_query

    # Third level: a list with a single match.
    matches = per_query['automobily']
    assert isinstance(matches, list)
    assert 1 == len(matches)

    # The match itself is a three-field dict: contract id, item, distance.
    match = matches[0]
    assert isinstance(match, dict)
    assert 3 == len(match)
    assert 'contract_id' in match
    assert match['contract_id'] == 2
    assert 'item' in match
    assert match['item'] == 'dopravník'
    assert 'distance' in match
    assert numpy.isclose(0.5445505853395554, match['distance'])
コード例 #2
0
def test_item_distance_computer_efficiency(context):
    """Run ``compute_nearest`` on seeded random data and pin one distance.

    1000 random contracts and 100 random queries are generated after seeding
    numpy's global RNG with 42, and every contract is requested as a result
    (``num_results`` equals the number of contracts). Only the first distance
    stored under ``result[0]['']`` is checked.
    """
    numpy.random.seed(42)
    contract_count, query_count = 1000, 100

    # Contracts are generated before queries; the order matters because both
    # draw from the same seeded global RNG.
    random_contracts = create_random_dataframe(contract_count)
    contracts = random_contracts.rename(columns={'id': 'contract_id'})
    random_queries = create_random_dataframe(query_count)
    queries = random_queries.rename(columns={'id': 'query_id'})

    computer = ItemDistanceComputer(contracts)
    nearest = computer.compute_nearest(queries, num_results=contract_count)

    first_distance = nearest[0][''][0]['distance']
    assert numpy.isclose(0.19712971175935512, first_distance)
コード例 #3
0
def test_complex_similarity_computer2(context):
    """Check ``ComplexSimilarityComputer`` with three weighted sub-computers.

    The combined computer is built from an item computer (weight 1), an item
    computer working on the ``entity_embeddings`` / ``entity_items`` columns
    (weight 0.2) and a local computer (weight 0.2). The two most similar
    contracts are requested per query, and the best hit stored under key 16
    is pinned.
    """
    contracts = context.get_contracts_data()
    profiles = context.get_user_profiles_data()
    column_map = {'user_id': 'query_id', 'interest_items': 'items'}
    queries = profiles.rename(columns=column_map)

    # Each pair couples a similarity computer with its weighted standardizer.
    item_pair = (
        AggregatedItemSimilarityComputer(contracts),
        WeightedStandardizer(1),
    )
    entity_distance = ItemDistanceComputer(
        contracts, cols=('entity_embeddings', 'entity_items'))
    entity_pair = (
        AggregatedItemSimilarityComputer(
            contracts, distance_computer=entity_distance),
        WeightedStandardizer(0.2),
    )
    local_pair = (
        AggregatedLocalSimilarityComputer(contracts),
        WeightedStandardizer(0.2),
    )

    combined = ComplexSimilarityComputer(
        contracts,
        similarity_computers=[item_pair, entity_pair, local_pair])
    most_similar = combined.compute_most_similar(queries, 2)

    # Top level: a dict with four entries, one of them under key 16.
    assert isinstance(most_similar, dict)
    assert 4 == len(most_similar)
    assert 16 in most_similar

    # Per query: a list with the two requested hits.
    hits = most_similar[16]
    assert isinstance(hits, list)
    assert 2 == len(hits)

    # First hit: a two-field dict with the contract id and its similarity.
    top_hit = hits[0]
    assert isinstance(top_hit, dict)
    assert 2 == len(top_hit)
    assert 'contract_id' in top_hit
    assert top_hit['contract_id'] == 1
    assert 'similarity' in top_hit
    assert numpy.isclose(0.6261697794780968, top_hit['similarity'])
コード例 #4
0
 def __init__(self,
              df_contracts,
              embedder=None,
              geocoder=None,
              num_results=1,
              random_bias_rate=0.0,
              **kwargs):
     """
     Store the configuration and build the default similarity computers.

     Args:
         df_contracts (DataFrame): reference dataframe for the dataset
         embedder (Component): embedding component for transformation of text input to vector representation;
             a RandomEmbedder is created when None is given
         geocoder (Component): geocoding component for transformation of input address to GPS representation;
             an APIGeocoder is created when None is given
         num_results (int): number of results to be found for query
         random_bias_rate (float): the rate of random bias, 0 means no bias, 1 means total random bias
         **kwargs: forwarded unchanged to the parent initializer
     """
     super().__init__(**kwargs)

     # self.logger is read right after the parent initializer runs;
     # presumably the parent class provides it - confirm against the base class.
     if embedder is None:
         embedder = RandomEmbedder(logger=self.logger)
     self.embedder = embedder

     if geocoder is None:
         geocoder = APIGeocoder(logger=self.logger)
     self.geocoder = geocoder

     self.num_results = num_results
     self.df_contracts = df_contracts
     self.random_bias_rate = random_bias_rate

     # Sub-computers, created in the order in which they are registered below.
     subject_sc = AggregatedItemSimilarityComputer(
         self.df_contracts, logger=self.logger)
     locality_sc = AggregatedLocalSimilarityComputer(
         self.df_contracts, logger=self.logger)
     entity_distance = ItemDistanceComputer(
         df_contracts,
         cols=('entity_embeddings', 'entity_items'),
         logger=self.logger)
     entity_subject_sc = AggregatedItemSimilarityComputer(
         self.df_contracts,
         distance_computer=entity_distance,
         logger=self.logger)

     # Registry: name -> computer, its weight and the 'cols' tuple tied to it.
     self._similarity_computers = {
         'subject': {
             'sc': subject_sc,
             'weight': 1,
             'cols': ('items', 'embeddings'),
         },
         'locality': {
             'sc': locality_sc,
             'weight': 0.1,
             'cols': ('address', 'gps'),
         },
         'entity_subject': {
             'sc': entity_subject_sc,
             'weight': 0.1,
             'cols': ('items', 'embeddings'),
         },
     }

     # The combined computer gets every registered computer paired with a
     # standardizer carrying its weight, in registry insertion order.
     weighted_pairs = [
         (entry['sc'], WeightedStandardizer(entry['weight']))
         for entry in self._similarity_computers.values()
     ]
     self._full_similarity_computer = ComplexSimilarityComputer(
         self.df_contracts,
         similarity_computers=weighted_pairs,
         random_bias_rate=self.random_bias_rate)
コード例 #5
0
def test_aggregated_item_similarity_computer2(context):
    """Check ``AggregatedItemSimilarityComputer`` with a custom distance computer.

    The distance computer is an ``ItemDistanceComputer`` configured with the
    ``entity_embeddings`` / ``entity_items`` columns. The two most similar
    contracts are requested per query, and the best hit stored under key 13
    is pinned.
    """
    contracts = context.get_contracts_data()
    profiles = context.get_user_profiles_data()
    column_map = {'user_id': 'query_id', 'interest_items': 'items'}
    queries = profiles.rename(columns=column_map)

    entity_distance = ItemDistanceComputer(
        contracts, cols=('entity_embeddings', 'entity_items'))
    computer = AggregatedItemSimilarityComputer(
        contracts, distance_computer=entity_distance)
    most_similar = computer.compute_most_similar(queries, 2)

    # Top level: a dict with four entries, one of them under key 13.
    assert isinstance(most_similar, dict)
    assert 4 == len(most_similar)
    assert 13 in most_similar

    # Per query: a list with the two requested hits.
    hits = most_similar[13]
    assert isinstance(hits, list)
    assert 2 == len(hits)

    # First hit: a two-field dict with the contract id and its similarity.
    top_hit = hits[0]
    assert isinstance(top_hit, dict)
    assert 2 == len(top_hit)
    assert 'contract_id' in top_hit
    assert top_hit['contract_id'] == 0
    assert 'similarity' in top_hit
    assert numpy.isclose(0.7017270036793921, top_hit['similarity'])