Example #1
def _apply_match_heuristic(page, link_contexts, to_match, entity):
    '''Helper for defining heuristics that find mentions of an entity.'''
    # `u` is the project's text-matching utils module; `_` is pydash.
    matches = u.match_all(to_match, page['plaintext'])
    # Flatten all currently known mentions so overlaps can be checked.
    mentions = sum(link_contexts.values(), [])
    link_context = {
        entity: [{
            'text': to_match,
            'offset': match_index,
            'page_title': page['title'],
            'preredirect': _.upper_first(entity)
        } for match_index in matches]
    }
    # Drop candidate mentions that overlap one we already have.
    filtered_link_context = {
        entity: [
            mention for mention in link_context[entity]
            if not _mention_overlaps(mentions, mention)
        ]
    }
    # On merge, concatenate mention lists and dedupe them by offset.
    concat = lambda dest, src: _.uniq_by(dest + src, 'offset') if dest else src
    if not _.is_empty(filtered_link_context[entity]):
        return _.merge_with(link_contexts,
                            filtered_link_context,
                            iteratee=concat)
    else:
        return link_contexts
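
The core of this helper is the final merge: pydash's merge_with folds the new mention map into the existing one, and the concat iteratee dedupes each merged list by offset. A minimal sketch of that step, with made-up data:

import pydash as _

existing = {'einstein': [{'text': 'Einstein', 'offset': 10}]}
incoming = {'einstein': [{'text': 'Einstein', 'offset': 10},
                         {'text': 'Einstein', 'offset': 42}]}

# uniq_by keeps the first occurrence per offset, so the duplicate is dropped
concat = lambda dest, src: _.uniq_by(dest + src, 'offset') if dest else src
merged = _.merge_with(existing, incoming, iteratee=concat)
print(merged)
# {'einstein': [{'text': 'Einstein', 'offset': 10},
#               {'text': 'Einstein', 'offset': 42}]}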
Example #2
def enum_envs():
    '''Enumerate all the env names of the latest version'''
    envs = [es.id for es in gym.envs.registration.registry.all()]

    def get_name(s):
        return s.split('-')[0]

    # keep only the latest version of each env: uniq_by keeps the first
    # occurrence per name, so reverse before and after to keep the last
    envs = ps.reverse(ps.uniq_by(ps.reverse(envs), get_name))
    # filter out the excluded envs
    envs = ps.difference_by(envs, EXCLUDE_ENVS, get_name)
    envs += INCLUDE_ENVS
    return envs
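
The double reverse around uniq_by is what keeps the latest version of each env: uniq_by retains the first occurrence per key, so reversing before and after retains the last registration instead. A minimal sketch with made-up env ids:

import pydash as ps

envs = ['CartPole-v0', 'CartPole-v1', 'MountainCar-v0']
get_name = lambda s: s.split('-')[0]

# without the two reverses, uniq_by would keep CartPole-v0
latest = ps.reverse(ps.uniq_by(ps.reverse(envs), get_name))
print(latest)  # ['CartPole-v1', 'MountainCar-v0']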
Example #3
    def get_continents(self):

        # 1. Map the GWOD to extract only the continent data.
        # 2. Make the results unique by continent code.
        # 3. Sort the resulting collection by continent name.

        continents = pydash.sort_by(
            pydash.uniq_by(
                pydash.map_(self._GWOD, lambda i: pydash.pick(
                    i, ['Continent Code', 'Continent Name'])),
                'Continent Code'),
            'Continent Name')

        # Return continent objects.
        return continents
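
The pipeline above can be run standalone: map_ projects each row with pick, uniq_by accepts the property name 'Continent Code' as its iteratee and keeps the first row per code, and sort_by orders the survivors. A minimal sketch with made-up rows:

import pydash

rows = [
    {'Continent Code': 'EU', 'Continent Name': 'Europe', 'Country': 'France'},
    {'Continent Code': 'EU', 'Continent Name': 'Europe', 'Country': 'Spain'},
    {'Continent Code': 'AF', 'Continent Name': 'Africa', 'Country': 'Kenya'},
]
continents = pydash.sort_by(
    pydash.uniq_by(
        pydash.map_(rows, lambda i: pydash.pick(
            i, ['Continent Code', 'Continent Name'])),
        'Continent Code'),
    'Continent Name')
print(continents)
# [{'Continent Code': 'AF', 'Continent Name': 'Africa'},
#  {'Continent Code': 'EU', 'Continent Name': 'Europe'}]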
Example #4
from pydash import uniq_by, mapcat, intersection
import sys
import requests

wsurl = "https://s3.amazonaws.com/challenge.getcrossbeam.com/public/"

# Fetch the first dataset and keep one company per domain.
rs1 = requests.get(wsurl + sys.argv[1] + ".json")
data1 = rs1.json()
names1 = uniq_by(data1['companies'], lambda r: r['domain'])
n1Ar = mapcat(names1, lambda n: n['domain'])  # extract the domain strings

# Same for the second dataset.
rs2 = requests.get(wsurl + sys.argv[2] + ".json")
data2 = rs2.json()
names2 = uniq_by(data2['companies'], lambda r: r['domain'])
n2Ar = mapcat(names2, lambda n: n['domain'])

# Count unique companies in each dataset and the domains common to both.
names3 = intersection(n1Ar, n2Ar)
print(f"{len(names1)} {len(names2)} {len(names3)}")
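
A side note on this example: mapcat only flattens list results, so with string-valued domains it effectively acts as a plain map here; pydash's property shorthand expresses the same extraction more directly. A minimal sketch with made-up companies:

from pydash import uniq_by, map_

companies = [{'domain': 'a.com'}, {'domain': 'a.com'}, {'domain': 'b.com'}]
unique = uniq_by(companies, lambda r: r['domain'])  # keeps first per domain
domains = map_(unique, 'domain')                    # property-shorthand pluck
print(domains)  # ['a.com', 'b.com']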
Example #5
# Imports assumed by this snippet; get_most_similar_offers, get_offers_by_uris,
# get_real_quantity and add_identical_offer_relations are helpers defined
# elsewhere in the same project.
from time import time
from pprint import pprint

import pydash
from bson import json_util


def add_identical_offers_to_batch(offers: list,
                                  model: dict,
                                  collection_name: str,
                                  n_highest: int = 32):
    index_to_uri_map = model["index_to_uri_map"]

    result = get_most_similar_offers(
        offers,
        model["fitted_pipeline"],
        model["tf_idf_matrix"],
        n_highest + 1,
    )
    print("Mapping similar offers.")
    start_time = time()
    for i, ranking in enumerate(result):
        offer_uri = index_to_uri_map[i]
        ranking_with_uris = list({
            **x, "uri": index_to_uri_map[x["idx"]]
        } for x in ranking)
        result[i] = ranking_with_uris

    updates = []
    for i, ranking in enumerate(result):
        offer = offers[i]
        offer_namespace = offer["uri"].split(":")[0]
        most_similar_offers_hits = list(
            {
                "uri": index_to_uri_map[similar["idx"]],
                "score": similar["score"]
            } for similar in ranking
            if index_to_uri_map[similar["idx"]].split(":")[0] != offer_namespace
            and similar["score"] > 0.7)
        print(f"{offer['title']} {offer['uri']}")
        pprint(most_similar_offers_hits)

        if len(most_similar_offers_hits) == 0:
            continue

        most_similar_offers = get_offers_by_uris(
            list(x["uri"] for x in most_similar_offers_hits))
        most_similar_offers = list({
            **x, "score":
            next(y for y in most_similar_offers_hits
                 if y["uri"] == x["uri"])["score"]
        } for x in most_similar_offers)
        offer_quantity = get_real_quantity(offer)
        if not offer_quantity:
            continue
        most_similar_offers_result = []
        for similar_offer in most_similar_offers:
            similar_quantity = get_real_quantity(similar_offer)
            print(
                f"offer_quantity: {offer_quantity}, similar_quantity: {similar_quantity}"
            )
            # Filter out offers with too large a price difference, as it's probably a quantity parsing error.
            try:
                offer_price = offer["pricing"]["price"]
                similar_offer_price = similar_offer["pricing"]["price"]
                price_difference = abs(offer_price - similar_offer_price)
                price_ratio = price_difference / max(offer_price,
                                                     similar_offer_price)
                print(f"price_ratio: {price_ratio}")
                if price_ratio > 0.6:
                    continue
            except Exception:
                print("Could not calculate price difference")
                continue
            if offer_quantity == similar_quantity:  # offer_quantity is already known truthy
                most_similar_offers_result.append(similar_offer)

        if len(most_similar_offers_result) == 0:
            print("No offers with same quantity")
            continue
        print(f"{len(most_similar_offers_result)} offers with same quantity")

        print(
            pydash.pick(
                offer,
                ["title", "subtitle", "brand", "uri", "shortDescription"]))
        pprint(
            list(
                pydash.pick(x, [
                    "title", "subtitle", "brand", "uri", "shortDescription",
                    "score"
                ]) for x in most_similar_offers_result))

        most_similar_offers_result = sorted(most_similar_offers_result,
                                            key=lambda x: x["score"],
                                            reverse=True)
        most_similar_offers_result = pydash.uniq_by(most_similar_offers_result,
                                                    lambda x: x["uri"])
        # Only add one offer, as false positives are common
        most_similar_offer = most_similar_offers_result[0]

        print("Using:")
        pprint(
            pydash.pick(most_similar_offer, [
                "title", "subtitle", "brand", "uri", "shortDescription",
                "score"
            ]))

        #updates.append(dict(uri=offer["uri"], similarOffers=list(pydash.pick(x, ["uri", "score"]) for x in most_similar_offers_result)))
        # most_similar_offers_result is guaranteed non-empty here (checked above).
        updates.append([offer["uri"], most_similar_offer["uri"]])
    print(f"Time spent: {time() - start_time} s.")

    if len(updates) > 0:
        print("Updating offers.")
        start_time = time()
        result = add_identical_offer_relations(updates)
        print(f"Time spent: {time() - start_time} s.")

        return json_util.dumps(result.bulk_api_result)
    else:
        return {"message": "No identical offers found"}
Example #6
def test_uniq_by(case, iteratee, expected):
    # Driven by pytest parametrization defined elsewhere in the test suite;
    # see the sketch below for representative cases.
    assert _.uniq_by(case, iteratee) == expected
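
A sketch of how such a test is typically parametrized with pytest; the cases below are illustrative, not the project's actual fixtures:

import pytest
import pydash as _

@pytest.mark.parametrize('case,iteratee,expected', [
    ([1, 2, 1, 3, 1], None, [1, 2, 3]),                # identity dedupe
    ([1, 2, 3, 1, 2, 3], lambda v: v % 3, [1, 2, 3]),  # callable iteratee
    ([{'a': 1}, {'a': 1}, {'a': 2}], 'a',
     [{'a': 1}, {'a': 2}]),                            # property shorthand
])
def test_uniq_by(case, iteratee, expected):
    assert _.uniq_by(case, iteratee) == expected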