"""Live smoke tests for candidate retrieval against the candidates service."""
import time
from datetime import timedelta

import pytest

from bt_candidates.wiring import default_schema_factory, default_filter_factory as f
from bt_candidates.client import Client
from bt_candidates.sorting import SortStrategy
from bt_candidates.common import FieldType, AmountType, MatchType

# Shared client; both tests issue real requests through it.
CLIENT = Client(host='candidates.aws.boomtrain.com', port=7070)

# Site id used by every test in this module.
_SITE_ID = '70ec7833a06f2b6989747149874bb74d'


def test_global_filter():
    """Request candidates under the GLOBAL named filter, sorted by POP_1D."""
    global_only = f.named_filter('GLOBAL')
    candidates = CLIENT.get_candidates(
        _SITE_ID,
        filter=global_only,
        limit=1000,
        sort_by=SortStrategy.POP_1D,
    )
    # NOTE(review): the request passes limit=1000 and the sibling test expects
    # exactly 1000 results for the same limit, so a length above 1000000 looks
    # unreachable -- confirm the intended bound before relying on this test.
    assert len(candidates) > 1000000


def test_metafilter_resource_type_article():
    """Combine GLOBAL with a resource-type == 'article' overlap filter."""
    article_only = f.overlap_filter(
        field='resource-type',
        values=['article'],
        min=1,
    )
    global_and_article = f.and_filter(f.named_filter('GLOBAL'), article_only)
    candidates = CLIENT.get_candidates(
        _SITE_ID,
        filter=global_and_article,
        limit=1000,
        sort_by=SortStrategy.POP_1D,
    )
    # The result is expected to fill the requested limit exactly.
    assert len(candidates) == 1000
"""Compare a two-day recency filter with a two-or-five-day fallback filter."""
from datetime import timedelta

from bt_candidates.wiring import default_schema_factory, default_filter_factory as f
from bt_candidates.client import Client
from bt_candidates.sorting import SortStrategy
from bt_candidates.common import FieldType, AmountType, MatchType


def _pub_date_window(days_back):
    """Build a recency filter on 'pubDate' from `days_back` days ago to +1 day."""
    return f.recency_filter(
        field='pubDate',
        min=timedelta(days=-days_back),
        max=timedelta(days=1),
    )


client = Client(host='candidates.aws.boomtrain.com', port=7070)
vogue = "9b69d8fc8b441b43d493d713e5703ada"

filter_two_days = _pub_date_window(2)
filter_five_days = _pub_date_window(5)

# Either window is accepted: the five-day filter acts as the fallback branch.
recency_fallback_filter = f.or_filter(filter_two_days, filter_five_days)

# First query: strict two-day window only.
candidates_two_days = client.get_candidates(
    site_id=vogue,
    filter=filter_two_days,
    limit=100,
)
count1 = len(candidates_two_days)
print("Candidates applying two days filter : {}".format(count1))

# Second query: the OR of both windows.
fallback_candidates = client.get_candidates(
    site_id=vogue,
    filter=recency_fallback_filter,
    limit=100,
)
count2 = len(fallback_candidates)
print("Candidates applying the recency fallback to 5 days : {}".format(count2))
"""Live test: GLOBAL plus a resource-type filter returns only matching resources."""
from datetime import timedelta

from bt_candidates.client import Client
from bt_candidates.wiring import default_schema_factory as sf
from bt_candidates.wiring import default_filter_factory as ff
from bt_candidates.common import FieldType, AmountType, MatchType
from bt_candidates.resource_schema import SchemaField, ResourceSchema, DataFormat, DataLoader
from bt_candidates.sorting import SortStrategy

client = Client(host='candidates.aws.boomtrain.com', port=7070)
site_id = "e9cd7a8ae2406275f6afb01b679ebf69"
# Fetched once at import time; the test reads the GLOBAL named filter from it.
schema = client.get_schema(site_id)


def test_gazette():
    """Check that every returned candidate's resource-type contains 'thegazette_sports'."""
    sports_only = ff.overlap_filter(
        field='resource-type',
        values={'thegazette_sports'},
        min=1,
        match_type=MatchType.CONTAINS,
    )
    sports_and_global = ff.and_filter(sports_only, schema.named_filters['GLOBAL'])

    candidates = client.get_candidates(
        site_id=site_id,
        filter=sports_and_global,
        limit=100,
        sort_by=SortStrategy.POP_1D,
    )
    # The result is expected to fill the requested limit exactly.
    assert len(candidates) == 100

    # Look the candidates up again as full resources to inspect their fields.
    wanted_ids = [entry.resource_id for entry in candidates]
    response = client.get_resources(site_id=site_id, ids=wanted_ids)
    type_values = [
        item.to_jsonobj()['fields']['resource-type']
        for item in response.resources
    ]
    for type_value in type_values:
        assert 'thegazette_sports' in type_value
# Whitespace-collapsed script, kept verbatim on the next line. It builds an OR of
# three EXACT-match overlap filters on 'title' (each requiring all of its listed
# words via min == max == number of values), fetches up to 25 candidates for
# 'atlanta-black-star', and prints the count, each candidate, and a separator.
# It then imports itertools and begins defining split_candidates(candidates,
# needed=10), which groups candidates by sort_weight with itertools.groupby.
# NOTE(review): the split_candidates definition is cut off at a dangling `else:`
# and the original line breaks/indentation are lost -- restore from the original
# file rather than guessing the missing branch.
from bt_candidates.client import Client from bt_candidates.wiring import default_filter_factory as ff from bt_candidates.filters import MatchType from bt_candidates.sorting import SortStrategy client = Client('candidates.magic.boomtrain.com') schema = client.get_schema('atlanta-black-star') filt = ff.or_filter( ff.overlap_filter('title', ['Yohannes', 'IV'], match_type=MatchType.EXACT, min=2, max=2), ff.overlap_filter('title', ['Search', 'chicago'], match_type=MatchType.EXACT, min=2, max=2), ff.overlap_filter('title', ['african', 'history', 'month'], match_type=MatchType.EXACT, min=3, max=3) ) candidates = client.get_candidates('atlanta-black-star', filt, limit=25) print(len(candidates)) for c in candidates: print(c) print('======================================================') import itertools as it def split_candidates(candidates, needed=10): to_score = [] for _, grp in it.groupby(candidates, lambda c: c.sort_weight): grp = list(grp) if len(grp) > needed: to_score.extend(grp) return to_score else:
"""Fetch candidates for one site with and without the GLOBAL named filter."""
from datetime import timedelta

from bt_candidates.client import Client
from bt_candidates.wiring import default_schema_factory as sf
from bt_candidates.wiring import default_filter_factory as ff
from bt_candidates.common import FieldType, AmountType, MatchType
from bt_candidates.resource_schema import SchemaField, ResourceSchema, DataFormat, DataLoader


def _contains_filter(field, value):
    """Build a CONTAINS overlap filter requiring at least one match of `value`."""
    return ff.overlap_filter(
        field=field,
        values={value},
        min=1,
        match_type=MatchType.CONTAINS,
    )


client = Client(host='candidates.aws.boomtrain.com', port=7070)

# Site id labelled "yellow pages" by the original author.
site_id = "593964c3c0f76bc59c65b324f9dbf869"
schema = client.get_schema(site_id)

filter_item_type = _contains_filter('itemType', 'lists_en')
filter_city_region = _contains_filter('cityRegion', 'toronto')

# Metadata-only conjunction, and the same conjunction with GLOBAL added.
filter_meta = ff.and_filter(filter_item_type, filter_city_region)
filter_meta_global = ff.and_filter(
    filter_item_type,
    filter_city_region,
    schema.named_filters['GLOBAL'],
)

candidates_meta = client.get_candidates(
    site_id=site_id,
    filter=filter_meta,
    limit=100,
)
candidates_meta_global = client.get_candidates(
    site_id=site_id,
    filter=filter_meta_global,
    limit=100,
)