Esempio n. 1
0
def element_slices(text, elements, rules):
    """Get slices of all elements in text.

    Each match of the regex returned by ``tokens_as_re`` is paired with the
    first entry of ``elements`` whose first key equals the matched token.

    :param text: string that is searched for tokens.
    :param elements: re-iterable collection of mappings; only the first key
        (the token) and the first value of each mapping are used.
    :param rules: mapping providing ``'case_sensitive'``. Tokens are compared
        lower-cased only when that value is exactly ``False``.
    :return: list of ``(start, end, token, value)`` tuples in match order.
    """
    case_sensitive = rules['case_sensitive']
    tar = tokens_as_re(elements, case_sensitive)
    log(tar)
    token_regex = re.compile(tar)

    # Only an explicit ``False`` switches to case-insensitive comparison;
    # other falsy values (``None``, ``0``) keep the comparison exact.
    fold_case = case_sensitive is False

    # Build the token -> value lookup once instead of rescanning every
    # element for every regex match. ``setdefault`` keeps the first element
    # for a given token, matching a first-hit linear search.
    values_by_token = {}
    for element in elements:
        if not element:
            # An empty mapping has no token to compare against.
            continue
        key, value = next(iter(element.items()))
        if fold_case:
            key = key.lower()
        values_by_token.setdefault(key, value)

    slices = []
    for match in token_regex.finditer(text):
        token = match.group()
        lookup = token.lower() if fold_case else token
        if lookup not in values_by_token:
            # Without a matching element there is no value to attach;
            # skip the hit instead of failing on a missing element.
            log("no element found for token: {}".format(token))
            continue
        slices.append(match.span() + (token, values_by_token[lookup]))

    return slices
Esempio n. 2
0
def applicables(elements_per_units, old_links, config, own_validator):
    """Loop the units and validate each item in a unit.

    :param elements_per_units: iterable of
        ``((u_from, u_to, u_string), elements)`` pairs, where ``elements``
        holds ``(_from, _to, token, element)`` tuples.
    :param old_links: passed through to ``saturated_unit`` and
        ``candidate.valid``.
    :param config: mapping with a ``'rules'`` mapping; the optional rule keys
        ``replaces_at_all``, ``items_per_unit`` and ``sort_by_item_value``
        are read here.
    :param own_validator: passed through to ``candidate.valid``.
    :return: whatever ``the_applicables(candidates, config)`` returns.
    """
    rules = config['rules']
    replaces_at_all = rules.get('replaces_at_all')
    items_per_unit = rules.get('items_per_unit')
    sort_by_item_value = rules.get('sort_by_item_value')

    candidates = []

    units = enumerate(elements_per_units, 1)
    for i, ((u_from, u_to, u_string), elements) in units:
        log("UNIT %s %s, %s, %s" % (i, u_from, u_to, u_string))

        # Emptiness is tested by truthiness; comparing a length to the
        # literal 0 with ``is`` is an identity check, not a value check.
        if not elements:
            continue

        # # check the rules - 1. replaces_at_all
        if replaces_at_all and len(candidates) >= replaces_at_all:
            return the_applicables(candidates, config)

        unit_candidates = []
        # ``args`` holds the list object itself, so appends made below are
        # visible to every later ``saturated_unit`` call.
        args = items_per_unit, old_links, u_from, u_to, unit_candidates
        elements = sort_em(sort_by_item_value, elements, 3)

        log('\n'.join([str(x) for x in elements]))

        for _from, _to, token, element in elements:
            # # check the rules - 1. replaces_at_all
            if replaces_at_all and len(candidates) >= replaces_at_all:
                return the_applicables(candidates, config)

            # we need to check this already before first candidate
            if saturated_unit(*args):
                break

            if candidate.valid(((token, element), candidates, unit_candidates,
                                rules, old_links), own_validator):
                valid_candidate = (_from, _to, token, element)
                candidates.append(valid_candidate)
                unit_candidates.append(valid_candidate)
                # # 2. text_unit > number_of_items
                if saturated_unit(*args):
                    break

    return the_applicables(candidates, config)
Esempio n. 3
0
def parse_yaml(filepath, loaded_from=__file__):
    """Load and parse a yaml file located relative to ``loaded_from``.

    :param filepath: path of the yaml file, joined with ``/`` onto the
        absolute directory of ``loaded_from``.
    :param loaded_from: file whose directory is the base for ``filepath``;
        defaults to this module's file.
    :return: the data returned by ``yaml.load``.
    :raises IOError: when the file cannot be opened or read; the original
        exception is logged and re-raised.
    :raises yaml.YAMLError: when the content is not valid yaml; the original
        exception is logged and re-raised.
    """
    abspath = os.path.abspath(os.path.dirname(loaded_from))
    path = "/".join([abspath, filepath])
    try:
        # The encoding is passed by keyword: the third positional argument
        # of the builtin ``open`` is ``buffering``, not the encoding.
        # ``with`` closes the file even when parsing fails.
        with open(path, 'r', encoding='utf-8') as stream:
            # NOTE(review): ``yaml.load`` without an explicit Loader can
            # build arbitrary Python objects from untrusted input, and newer
            # PyYAML releases require a Loader argument. Consider
            # ``yaml.safe_load`` if the config needs no custom tags.
            yaml_data = yaml.load(stream)
    except IOError as e:
        log("No yaml file found: {}".format(e))
        # Bare ``raise`` keeps the original message and traceback.
        raise
    except yaml.YAMLError as e:
        log("Error in configuration file: {}".format(e))
        raise
    return yaml_data
Esempio n. 4
0
def units_gen(the_soup, settings):
    """Yield position and markup of every text-unit tag in the soup.

    :param the_soup: pair ``(soup, soup_str)``; ``soup`` must offer
        ``findAll`` and ``soup_str`` is the string the positions refer to.
    :param settings: mapping; ``settings['text_unit']['key']`` is handed to
        ``soup.findAll`` to select the tags.
    :return: generator of ``(start, end, markup)`` tuples, where ``start``
        is the first index of ``markup`` inside ``soup_str``.
    """
    unit_name = settings['text_unit']['key']
    parsed, raw = the_soup
    for node in parsed.findAll(unit_name):
        try:
            markup = node.__unicode__()
            # The serialised tag may not occur verbatim in the raw string;
            # ``index`` raises ValueError then and the tag is only logged.
            start = raw.index(markup)
            yield (start, start + len(markup), markup)
        except ValueError as err:
            log("substring not found: {}, {}".format(markup, err))
Esempio n. 5
0
def replaces_per_element(element, candidates, _x, rules, old_links):
    """Allow only n replacements that share the same base element.

    A. Merkel, Mum Merkel, Mrs. Merkula - baseform *Angela Merkel*
    is in most cases marked just once, but the limit is configurable.

    :param element: pair ``(token, attributes)`` under validation.
    :param candidates: already accepted ``(_, _, token, attributes)`` tuples.
    :param _x: unused.
    :param rules: mapping; the optional ``'replaces_per_element'`` entry
        supplies ``'key'`` (attribute to compare) and ``'number'`` (limit).
    :param old_links: container tested for membership of the token.
    :return: ``False`` once the number of hits reaches the limit, else
        ``True``; always ``True`` when the rule is not configured.
    """
    limit_rule = rules.get('replaces_per_element')
    if not limit_rule:
        return True

    token, attributes = element
    key = limit_rule['key']
    n = limit_rule['number']

    hits = 0
    # An existing link with the same token counts as one hit; the limit is
    # checked right away only in that case.
    if token in old_links:
        hits += 1
        if hits >= n:
            return False

    for _, _, other_token, other_attributes in candidates:
        # Same token and same key attribute are counted independently,
        # so a single candidate may contribute two hits.
        if other_token == token:
            hits += 1
        try:
            wanted = attributes.get(key)
            if wanted and wanted == other_attributes.get(key):
                hits += 1
        except Exception:
            log("No such attributes: {}".format(attributes))

        if hits >= n:
            return False

    # Still needed when there are no candidates at all (e.g. limit <= 0).
    return not hits >= n