def element_slices(text, elements, rules):
    """Get slices of all elements in text.

    :param text: string that is searched with the compiled token pattern
    :param elements: re-iterable of mappings; the first key of a mapping is
        the token to look for, the first value is attached to each slice
    :param rules: mapping that provides the ``case_sensitive`` flag
    :returns: list of ``(start, end, token, value)`` tuples, one per match,
        in the order ``finditer`` reports them
    """
    case_sensitive = rules['case_sensitive']
    tar = tokens_as_re(elements, case_sensitive)
    log(tar)
    token_regex = re.compile(tar)

    # Only an explicit ``False`` switches to case-insensitive comparison;
    # ``None`` or other falsy values keep the exact comparison.
    fold_case = case_sensitive is False

    # Index the elements once by their token instead of scanning the whole
    # list for every match. ``setdefault`` keeps the first element that
    # carries a given token, which is what the linear scan would find.
    by_token = {}
    for element in elements:
        if not element:
            # an empty mapping carries no token to compare against
            continue
        key = next(iter(element))
        if fold_case:
            key = key.lower()
        by_token.setdefault(key, element)

    slices = []
    for match in token_regex.finditer(text):
        token = match.group()
        base = by_token.get(token.lower() if fold_case else token)
        if base is None:
            # The pattern matched text that equals none of the element
            # tokens; skip it instead of failing on ``None.values()``.
            log("No element found for token: %r" % (token,))
            continue
        slices.append(match.span() + (token, next(iter(base.values()))))
    return slices
def applicables(elements_per_units, old_links, config, own_validator):
    """Loop the units and validate each item in a unit.

    :param elements_per_units: iterable of
        ``((u_from, u_to, u_string), elements)`` pairs, where ``elements``
        holds ``(_from, _to, token, element)`` tuples found in that unit
    :param old_links: passed through to ``saturated_unit`` and to the
        candidate validation
    :param config: mapping with a ``rules`` entry; the rules
        ``replaces_at_all``, ``items_per_unit`` and ``sort_by_item_value``
        are read here
    :param own_validator: passed through to ``candidate.valid``
    :returns: the result of ``the_applicables(candidates, config)`` for all
        candidates accepted before a limit was hit or the units ran out
    """
    rules = config['rules']
    replaces_at_all = rules.get('replaces_at_all')
    items_per_unit = rules.get('items_per_unit')
    sort_by_item_value = rules.get('sort_by_item_value')

    candidates = []

    numbered_units = enumerate(elements_per_units, 1)
    for i, ((u_from, u_to, u_string), elements) in numbered_units:
        log("UNIT %s %s, %s, %s" % (i, u_from, u_to, u_string))

        # Compare by value: ``is 0`` tests object identity, which is not a
        # reliable way to compare integers.
        if len(elements) == 0:
            continue

        # check the rules - 1. replaces_at_all
        if replaces_at_all and len(candidates) >= replaces_at_all:
            return the_applicables(candidates, config)

        unit_candidates = []
        # ``args`` holds a reference to ``unit_candidates``, so every later
        # ``saturated_unit(*args)`` call sees the candidates added so far.
        args = items_per_unit, old_links, u_from, u_to, unit_candidates

        # NOTE(review): 3 is presumably the tuple index of the element
        # value that sort_em sorts by - confirm against sort_em.
        elements = sort_em(sort_by_item_value, elements, 3)
        log('\n'.join([str(x) for x in elements]))

        for _from, _to, token, element in elements:
            # check the rules - 1. replaces_at_all
            if replaces_at_all and len(candidates) >= replaces_at_all:
                return the_applicables(candidates, config)

            # we need to check this already before the first candidate
            if saturated_unit(*args):
                break

            validation_input = (
                (token, element), candidates, unit_candidates, rules,
                old_links)
            if candidate.valid(validation_input, own_validator):
                valid_candidate = (_from, _to, token, element)
                candidates.append(valid_candidate)
                unit_candidates.append(valid_candidate)

            # 2. text_unit > number_of_items
            if saturated_unit(*args):
                break

    return the_applicables(candidates, config)
def parse_yaml(filepath, loaded_from=__file__):
    """Get and parse yaml from a path relative to ``loaded_from``.

    :param filepath: path of the yaml file, appended with ``/`` to the
        absolute directory of ``loaded_from``
    :param loaded_from: file whose directory is the base for ``filepath``;
        defaults to this module's file
    :returns: the data structure produced by ``yaml.load``
    :raises IOError: if the file cannot be opened or read
    :raises yaml.YAMLError: if the content is not valid yaml
    """
    # Imported locally so the block does not rely on a module-level import.
    import io

    abspath = os.path.abspath(os.path.dirname(loaded_from))
    path = "/".join([abspath, filepath])
    try:
        # ``io.open`` takes the encoding as a keyword on Python 2 and 3; a
        # positional third argument would be read as ``buffering`` by the
        # built-in ``open``. The ``with`` block closes the handle even when
        # parsing fails.
        with io.open(path, 'r', encoding='utf-8') as yaml_file:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary Python objects and is rejected by PyYAML >= 6.
            # Switch to yaml.safe_load if the configuration never uses
            # python-specific tags - confirm before changing.
            yaml_data = yaml.load(yaml_file)
    except IOError as e:
        log("No yaml file found: {}".format(e))
        # re-raise the original error so errno and filename are kept
        raise
    except yaml.YAMLError as e:
        log("Error in configuration file: {}".format(e))
        # re-raise the original error so the parser's position info is kept
        raise
    return yaml_data
def units_gen(the_soup, settings):
    """Yield position and markup of every text unit found in the soup.

    :param the_soup: pair ``(soup, soup_str)``; ``soup`` must offer
        ``findAll`` and ``soup_str`` is the string the offsets refer to
    :param settings: mapping; ``settings['text_unit']['key']`` is handed to
        ``soup.findAll`` to select the unit tags
    :returns: generator of ``(start, end, markup)`` tuples where
        ``soup_str[start:end] == markup``; tags whose markup cannot be
        located in ``soup_str`` are logged and skipped
    """
    text_unit_key = settings['text_unit']['key']
    soup, soup_str = the_soup

    # Where the search for the next tag starts. Without it two units with
    # identical markup would both resolve to the first occurrence.
    # NOTE(review): assumes findAll returns tags in document order, so a
    # later tag never starts before an earlier one - confirm for the
    # parser in use.
    search_from = 0

    for a_tag in soup.findAll(text_unit_key):
        # Serialized outside the ``try`` so the handler below can always
        # refer to ``u_tag``.
        u_tag = a_tag.__unicode__()
        try:
            _from = soup_str.index(u_tag, search_from)
        except ValueError as e:
            # the serialized tag may differ from the text in soup_str
            log("substring not found: {}, {}".format(u_tag, e))
            continue
        # Nested units start inside their parent, so only step past the
        # start of this hit, not past its end.
        search_from = _from + 1
        yield (_from, _from + len(u_tag), u_tag)
def replaces_per_element(element, candidates, _x, rules, old_links):
    """Replace only n elements of the same base element.

    A. Merkel, Mum Merkel, Mrs. Merkula - baseform *Angela Merkel*
    in most cases just marked once, but set value to as many times u want.

    :param element: pair ``(token, attributes)`` that is being validated
    :param candidates: already accepted ``(_, _, token, attributes)`` tuples
    :param _x: unused
    :param rules: mapping; the optional ``replaces_per_element`` rule
        provides the attribute ``key`` and the allowed ``number``
    :param old_links: container checked for the token itself
    :returns: ``False`` once the counted occurrences reach the allowed
        number, ``True`` otherwise or when the rule is not configured
    """
    limit_rule = rules.get('replaces_per_element')
    if not limit_rule:
        # rule not configured: nothing restricts this element
        return True

    token, attributes = element
    attr_key = limit_rule['key']
    allowed = limit_rule['number']

    # do we really want this or better check items in links
    # based on attributes only
    hits = 0
    if token in old_links:
        hits = 1
        if hits >= allowed:
            return False

    for _start, _end, seen_token, seen_attributes in candidates:
        # same surface form already accepted
        if seen_token == token:
            hits += 1

        # same base form already accepted; this is counted independently
        # of the token comparison above
        try:
            own_value = attributes.get(attr_key)
            if own_value and own_value == seen_attributes.get(attr_key):
                hits += 1
        except Exception:
            log("No such attributes: {}".format(attributes))

        if hits >= allowed:
            return False

    # still relevant when no candidate was looked at
    return False if hits >= allowed else True