def _apply_match_heuristic(page, link_contexts, to_match, entity):
    '''helper for defining heuristics for finding mentions of an entity'''
    matches = u.match_all(to_match, page['plaintext'])
    mentions = sum(link_contexts.values(), [])
    link_context = {
        entity: [{
            'text': to_match,
            'offset': match_index,
            'page_title': page['title'],
            'preredirect': _.upper_first(entity)
        } for match_index in matches]
    }
    filtered_link_context = {
        entity: [
            mention for mention in link_context[entity]
            if not _mention_overlaps(mentions, mention)
        ]
    }
    concat = lambda dest, src: _.uniq_by(dest + src, 'offset') if dest else src
    if not _.is_empty(filtered_link_context[entity]):
        return _.merge_with(link_contexts, filtered_link_context, iteratee=concat)
    else:
        return link_contexts
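
# A minimal, hedged sketch (illustrative data, not from the original module):
# it shows how the `concat` iteratee above deduplicates mentions by 'offset'
# when two link-context dicts are combined with pydash.merge_with.
import pydash as _

existing = {'Paris': [{'text': 'Paris', 'offset': 10}]}
incoming = {'Paris': [{'text': 'Paris', 'offset': 10}, {'text': 'Paris', 'offset': 42}]}
concat = lambda dest, src: _.uniq_by(dest + src, 'offset') if dest else src
merged = _.merge_with(existing, incoming, iteratee=concat)
# merged['Paris'] keeps one mention per offset (10 and 42); the duplicate is dropped.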
def enum_envs():
    '''Enumerate all the env names of the latest version'''
    envs = [es.id for es in gym.envs.registration.registry.all()]

    def get_name(s):
        return s.split('-')[0]

    # filter out the old stuff
    envs = ps.reverse(ps.uniq_by(ps.reverse(envs), get_name))
    # filter out the excluded envs
    envs = ps.difference_by(envs, EXCLUDE_ENVS, get_name)
    envs += INCLUDE_ENVS
    return envs
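
# A hedged sketch with made-up ids (not the real gym registry): pydash's
# uniq_by keeps the *first* item per key, so enum_envs reverses the list,
# dedupes by base name, and reverses back to keep the latest version of each
# environment.
import pydash as ps

env_ids = ['CartPole-v0', 'CartPole-v1', 'MountainCar-v0']
base_name = lambda s: s.split('-')[0]
latest = ps.reverse(ps.uniq_by(ps.reverse(env_ids), base_name))
assert latest == ['CartPole-v1', 'MountainCar-v0']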
def get_continents(self):
    # 1. Map the GWOD to extract only the continents data.
    # 2. Make the results unique by continent code.
    # 3. Sort the resulting collection by continent name.
    continents = pydash.sort_by(
        pydash.uniq_by(
            pydash.map_(self._GWOD,
                        lambda i: pydash.pick(i, ['Continent Code', 'Continent Name'])),
            'Continent Code'),
        'Continent Name')

    # Return continent objects.
    return continents
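
# A hedged sketch with fabricated rows (not the real GWOD dataset), showing
# the same pick -> uniq_by -> sort_by pipeline on a tiny input.
import pydash

rows = [
    {'Continent Code': 'EU', 'Continent Name': 'Europe', 'Country': 'France'},
    {'Continent Code': 'EU', 'Continent Name': 'Europe', 'Country': 'Spain'},
    {'Continent Code': 'AF', 'Continent Name': 'Africa', 'Country': 'Kenya'},
]
continents = pydash.sort_by(
    pydash.uniq_by(
        pydash.map_(rows, lambda i: pydash.pick(i, ['Continent Code', 'Continent Name'])),
        'Continent Code'),
    'Continent Name')
assert continents == [
    {'Continent Code': 'AF', 'Continent Name': 'Africa'},
    {'Continent Code': 'EU', 'Continent Name': 'Europe'},
]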
from pydash import uniq_by, mapcat, intersection
import sys
import requests

wsurl = "https://s3.amazonaws.com/challenge.getcrossbeam.com/public/"

rs1 = requests.get(wsurl + sys.argv[1] + ".json")
data1 = rs1.json()
names1 = uniq_by(data1['companies'], lambda r: r['domain'])
n1Ar = mapcat(names1, lambda n: n['domain'])

rs2 = requests.get(wsurl + sys.argv[2] + ".json")
data2 = rs2.json()
names2 = uniq_by(data2['companies'], lambda r: r['domain'])
n2Ar = mapcat(names2, lambda n: n['domain'])

names3 = intersection(n1Ar, n2Ar)
print(repr(len(names1)) + " " + repr(len(names2)) + " " + repr(len(names3)))
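
# Hedged usage note: the script above appears to take two list names on the
# command line, fetch "<name>.json" for each from the public bucket, and
# print the two deduplicated company counts plus the size of their domain
# intersection. The filename and list names below are hypothetical:
#
#   python compare_lists.py list_a list_b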
def add_identical_offers_to_batch(offers: list, model: dict, collection_name: str, n_highest: int = 32):
    index_to_uri_map = model["index_to_uri_map"]
    result = get_most_similar_offers(
        offers,
        model["fitted_pipeline"],
        model["tf_idf_matrix"],
        n_highest + 1,
    )
    print("Mapping similar offers.")
    start_time = time()
    for i, ranking in enumerate(result):
        offer_uri = index_to_uri_map[i]
        ranking_with_uris = list({
            **x,
            "uri": index_to_uri_map[x["idx"]]
        } for x in ranking)
        result[i] = ranking_with_uris
    updates = []
    for i, ranking in enumerate(result):
        offer = offers[i]
        offer_namespace = offer["uri"].split(":")[0]
        most_similar_offers_hits = list({
            "uri": index_to_uri_map[similar["idx"]],
            "score": similar["score"]
        } for similar in ranking
            if index_to_uri_map[similar["idx"]].split(":")[0] != offer_namespace
            and similar["score"] > 0.7)
        print(f"{offer['title']} {offer['uri']}")
        pprint(most_similar_offers_hits)
        if len(most_similar_offers_hits) == 0:
            continue
        most_similar_offers = get_offers_by_uris(
            list(x["uri"] for x in most_similar_offers_hits))
        most_similar_offers = list({
            **x,
            "score": next(y for y in most_similar_offers_hits
                          if y["uri"] == x["uri"])["score"]
        } for x in most_similar_offers)
        offer_quantity = get_real_quantity(offer)
        if not offer_quantity:
            continue
        most_similar_offers_result = []
        for similar_offer in most_similar_offers:
            similar_quantity = get_real_quantity(similar_offer)
            print(
                f"offer_quantity: {offer_quantity}, similar_quantity: {similar_quantity}"
            )
            # Filter out offers with too large a price difference, as it's probably a quantity parsing error.
            try:
                offer_price = offer["pricing"]["price"]
                similar_offer_price = similar_offer["pricing"]["price"]
                price_difference = abs(offer_price - similar_offer_price)
                price_ratio = price_difference / max(offer_price, similar_offer_price)
                print(f"price_ratio: {price_ratio}")
                if price_ratio > 0.6:
                    continue
            except Exception:
                print("Could not calculate price difference")
                continue
            if offer_quantity and offer_quantity == similar_quantity:
                most_similar_offers_result.append(similar_offer)
        if len(most_similar_offers_result) == 0:
            print("No offers with same quantity")
            continue
        print(f"{len(most_similar_offers_result)} offers with same quantity")
        print(
            pydash.pick(
                offer,
                ["title", "subtitle", "brand", "uri", "shortDescription"]))
        pprint(
            list(
                pydash.pick(x, [
                    "title", "subtitle", "brand", "uri", "shortDescription",
                    "score"
                ]) for x in most_similar_offers_result))
        most_similar_offers_result = sorted(most_similar_offers_result,
                                            key=lambda x: x["score"],
                                            reverse=True)
        most_similar_offers_result = pydash.uniq_by(most_similar_offers_result,
                                                    lambda x: x["uri"])
        # Only add one offer, as false positives are common
        most_similar_offer = most_similar_offers_result[0]
        print("Using:")
        pprint(
            pydash.pick(most_similar_offer, [
                "title", "subtitle", "brand", "uri", "shortDescription",
                "score"
            ]))
        # updates.append(dict(uri=offer["uri"], similarOffers=list(pydash.pick(x, ["uri", "score"]) for x in most_similar_offers_result)))
        if len(most_similar_offers) > 0:
            updates.append([offer["uri"], most_similar_offer["uri"]])
    print(f"Time spent: {time() - start_time} s.")
    if len(updates) > 0:
        print("Updating offers.")
        start_time = time()
        result = add_identical_offer_relations(updates)
        print(f"Time spent: {time() - start_time} s.")
        return json_util.dumps(result.bulk_api_result)
    else:
        return {"message": "No identical offers found"}
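
# A hedged sketch with fabricated offers: sorting by score (descending)
# before pydash.uniq_by means the first, i.e. highest-scored, entry per URI
# survives the deduplication step used above.
import pydash

hits = [
    {'uri': 'shopA:1', 'score': 0.81},
    {'uri': 'shopA:1', 'score': 0.92},
    {'uri': 'shopB:7', 'score': 0.75},
]
best_per_uri = pydash.uniq_by(
    sorted(hits, key=lambda x: x['score'], reverse=True),
    lambda x: x['uri'])
assert best_per_uri == [
    {'uri': 'shopA:1', 'score': 0.92},
    {'uri': 'shopB:7', 'score': 0.75},
]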
def test_uniq_by(case, iteratee, expected):
    assert _.uniq_by(case, iteratee) == expected
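
# A hedged sketch of how the parametrized test above might be driven; the
# cases here are illustrative, not the project's actual fixtures.
import pydash as _
import pytest


@pytest.mark.parametrize('case,iteratee,expected', [
    ([1, 2, 1, 3, 1], None, [1, 2, 3]),
    ([{'a': 1}, {'a': 2}, {'a': 1}], 'a', [{'a': 1}, {'a': 2}]),
    ([1, 2, 3, 1, 2, 3], lambda val: val % 3, [1, 2, 3]),
])
def test_uniq_by_cases(case, iteratee, expected):
    assert _.uniq_by(case, iteratee) == expected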