Exemple #1
0
def _get_candidates(dataset):
    """Generate ``(normalized_text, value_id)`` pairs for ``dataset``.

    Each item produced by ``Value.all`` contributes one pair built from its
    ``value`` attribute. When ``dataset.match_links`` is truthy, every entry
    of the item's ``links_static`` contributes an additional pair built from
    the link's ``key``, still tagged with the owning value's ``id``.
    """
    for record in Value.all(dataset, eager_links=dataset.match_links):
        yield normalize(record.value, dataset), record.id
        if not dataset.match_links:
            continue
        for alias in record.links_static:
            yield normalize(alias.key, dataset), record.id
Exemple #2
0
def _get_candidates(dataset):
    """Generate ``(normalized_name, entity_id)`` pairs for ``dataset``.

    Each item produced by ``Entity.all`` contributes one pair built from its
    ``name``. When ``dataset.match_aliases`` is truthy, every entry of the
    item's ``aliases_static`` contributes one more pair built from the
    alias ``name``, tagged with the owning entity's ``id``.
    """
    for ent in Entity.all(dataset, eager_aliases=dataset.match_aliases):
        yield normalize(ent.name, dataset), ent.id
        if not dataset.match_aliases:
            continue
        for alias in ent.aliases_static:
            yield normalize(alias.name, dataset), ent.id
Exemple #3
0
def get_candidates(dataset):
    """Generate ``(normalized_text, value)`` pairs for ``dataset``.

    The normalized form of each value's own ``value`` attribute is always
    yielded. When ``dataset.match_links`` is truthy, the normalized ``key``
    of each entry in ``links_static`` is yielded as well, but only if that
    exact normalized text has not been produced earlier in this iteration.
    """
    seen = set()
    for record in Value.all(dataset, eager_links=dataset.match_links):
        primary = normalize(record.value, dataset)
        seen.add(primary)
        # The primary form is emitted unconditionally, even if already seen.
        yield primary, record
        if not dataset.match_links:
            continue
        for link in record.links_static:
            alternate = normalize(link.key, dataset)
            if alternate not in seen:
                seen.add(alternate)
                yield alternate, record
Exemple #4
0
def prefix_search(prefix, dataset):
    """Return candidates of ``dataset`` whose text starts with ``prefix``.

    ``prefix`` is passed through ``normalize`` before comparison. The result
    is a list of ``(candidate, entity_id)`` tuples in iteration order,
    holding only the first matching candidate for each entity id.
    """
    needle = normalize(prefix, dataset)
    seen_ids = set()
    hits = []
    for text, ident in get_candidates(dataset):
        if not text.startswith(needle) or ident in seen_ids:
            continue
        seen_ids.add(ident)
        hits.append((text, ident))
    return hits
Exemple #5
0
def prefix_search(prefix, dataset):
    """Find the first prefix-matching candidate for each entity.

    The normalized ``prefix`` is compared against every candidate from
    ``get_candidates(dataset)``. Returns a list of ``(candidate, entity_id)``
    tuples, one per entity id, in the order they were encountered.
    """
    wanted = normalize(prefix, dataset)
    stream = get_candidates(dataset)
    found = []
    known = set()
    for name, eid in stream:
        is_new_hit = name.startswith(wanted) and eid not in known
        if is_new_hit:
            known.add(eid)
            found.append((name, eid))
    return found
Exemple #6
0
def _match(text, dataset, query=None):
    """Score ``text`` against the candidates of ``dataset``.

    Args:
        text: Text to match; it is passed through ``normalize`` first.
        dataset: Dataset supplying the candidates and the ``algorithm`` key
            used to look up the scoring function in ``ALGORITHMS``
            (``levenshtein`` is the fallback).
        query: Optional filter. After stripping and lower-casing, only
            candidates whose lower-cased text contains it are scored.

    Returns:
        A list of ``(candidate, value, score)`` tuples ordered by score,
        highest first, keeping only the first tuple seen for each value.
    """
    query = '' if query is None else query.strip().lower()
    text_normalized = normalize(text, dataset)
    func = ALGORITHMS.get(dataset.algorithm, levenshtein)
    matches = []
    for candidate, value in get_candidates(dataset):
        if query and query not in candidate.lower():
            continue
        matches.append((candidate, value, func(text_normalized, candidate)))
    # Index into the tuple instead of unpacking it in the lambda signature:
    # tuple parameters are a SyntaxError on Python 3 (PEP 3113).
    matches = sorted(matches, key=lambda item: item[2], reverse=True)
    # A list is kept for the seen-check so values only need to support
    # equality, exactly as before.
    values = []
    matches_uniq = []
    for candidate, value, score in matches:
        if value in values:
            continue
        values.append(value)
        matches_uniq.append((candidate, value, score))
    return matches_uniq
Exemple #7
0
def match(text, dataset, query=None):
    """Score ``text`` against the candidates of ``dataset`` and log timing.

    Args:
        text: Text to match; it is passed through ``normalize`` first.
        dataset: Dataset supplying the candidates and the ``algorithm`` key
            used to look up the scoring function in ``ALGORITHMS``
            (``levenshtein`` is the fallback).
        query: Optional filter. After stripping and lower-casing, only
            candidates whose lower-cased text contains it are scored.

    Returns:
        A list of ``(candidate, value, score)`` tuples ordered by score,
        highest first, keeping only the first tuple seen for each value.
    """
    query = '' if query is None else query.strip().lower()
    text_normalized = normalize(text, dataset)
    candidates = get_candidates(dataset)
    matches = []
    begin = time.time()
    func = ALGORITHMS.get(dataset.algorithm, levenshtein)
    for candidate, value in candidates:
        if query and query not in candidate.lower():
            continue
        matches.append((candidate, value, func(text_normalized, candidate)))
    # Index into the tuple instead of unpacking it in the lambda signature:
    # tuple parameters are a SyntaxError on Python 3 (PEP 3113).
    matches.sort(key=lambda item: item[2], reverse=True)
    values = set()
    matches_uniq = []
    for candidate, value, score in matches:
        if value in values:
            continue
        values.add(value)
        matches_uniq.append((candidate, value, score))
    duration = time.time() - begin
    log.info("Matching %s candidates took: %sms",
             len(matches_uniq), duration * 1000)
    return matches_uniq
Exemple #8
0
def match(text, dataset, query=None):
    """Score ``text`` against the candidates of ``dataset`` and log timing.

    Args:
        text: Text to match; it is passed through ``normalize`` first.
        dataset: Dataset supplying the candidates and the ``algorithm`` key
            used to look up the scoring function in ``ALGORITHMS``
            (``levenshtein`` is the fallback).
        query: Optional filter. After stripping and lower-casing, only
            candidates whose lower-cased text contains it are scored.

    Returns:
        A list of ``(candidate, entity_id, score)`` tuples ordered by score,
        highest first, keeping only the first tuple seen for each entity id.
    """
    # Lower-case the query: it is compared against ``candidate.lower()``, so
    # any upper-case character in it would otherwise rule out every match.
    query = '' if query is None else query.strip().lower()
    text_normalized = normalize(text, dataset)
    candidates = get_candidates(dataset)
    matches = []
    begin = time.time()
    func = ALGORITHMS.get(dataset.algorithm, levenshtein)
    for candidate, entity_id in candidates:
        if query and query not in candidate.lower():
            continue
        score = func(text_normalized, candidate)
        matches.append((candidate, entity_id, score))
    # Index into the tuple instead of unpacking it in the lambda signature:
    # tuple parameters are a SyntaxError on Python 3 (PEP 3113).
    matches.sort(key=lambda item: item[2], reverse=True)
    entities = set()
    matches_uniq = []
    for candidate, entity_id, score in matches:
        if entity_id in entities:
            continue
        entities.add(entity_id)
        matches_uniq.append((candidate, entity_id, score))
    duration = time.time() - begin
    log.info("Matching %s candidates took: %sms", len(matches_uniq),
             duration * 1000)
    return matches_uniq