def _create_affiliated_company(aff_name, aff_rel_span, aff_url_span, company_owner, context, emitter):
    aff_href = aff_url_span[0].find(".//a") if len(aff_url_span) else None
    aff_href = aff_href.get("href") if aff_href is not None else None
    aff_rel = aff_rel_span[0].text if len(aff_rel_span) else None

    company = emitter.make("Company")
    company.add("name", aff_name)
    company.add("website", aff_href)
    company.make_id("meineabgeordneten.at", aff_name)

    company_ownership = emitter.make("Ownership")
    if aff_rel:
        # Info is given that way: "GESELLSCHAFTER 50.00% (100.00...)"
        match_percentage = re.search(r"(\d\d?\d?\.\d\d)", aff_rel)
        aff_pct = match_percentage.group(1) if match_percentage else None
        if aff_pct:
            aff_type = collapse_spaces(aff_rel[:match_percentage.start()])
        else:
            aff_type = collapse_spaces(aff_rel)
        context.log.info("Affiliate pct '{}' ownerType '{}'".format(aff_pct, aff_type))
        company_ownership.add("percentage", aff_pct)
        company_ownership.add("ownershipType", aff_type)

    company_ownership.add("owner", company_owner.id)
    company_ownership.add("asset", company.id)
    company_ownership.make_id(company_owner.id, company.id)

    emitter.emit(company)
    emitter.emit(company_ownership)
    context.log.info("Created company '{}' and ownership with id '{}'".format(aff_name, company_ownership.id))
def _extract_table_description(context, row, isWork):
    description_sub_el = row.xpath(".//span[@class='bold']")
    affiliated = None
    if len(description_sub_el):
        # Description has a main part and a sub part.
        # The main part usually states the name of an organisation and
        # the sub part the function of the person in that organization.
        desc_main = description_sub_el[0]
        # sometimes text is wrapped inside <a ... /> that links to organization website
        desc_parent = desc_main.getparent()
        href = None
        if desc_parent.tag == "a":
            href = desc_parent.get("href")
            desc_parent.remove(desc_main)
            desc_parent = desc_parent.getparent()
        else:
            desc_parent.remove(desc_main)
        if isWork:
            aff_div = desc_parent.xpath('.//div[contains(@class,"tochterfirmen")]')
            if len(aff_div):
                affiliated = aff_div[0]
                desc_parent.remove(affiliated)
        desc_sub = collapse_spaces(desc_parent.text_content())
        description = collapse_spaces(desc_main.text_content())
        context.log.info("PARSED MANDATE DESCRIPTION: {}, {}".format(description, desc_sub))
        return description, desc_sub, href, affiliated
def pdf_extract_page(self, file_path, temp_dir, page):
    """Extract the contents of a single PDF page, using OCR if need be."""
    pagenum = page.get('number')
    page_size = self._element_size(page)
    is_ocr = False
    texts = []
    for text in page.findall('.//text'):
        content = text.xpath('string()').strip()
        content = collapse_spaces(content)
        if len(content):
            texts.append(content)

    for image in page.findall('.//image'):
        ratio = self._element_size(image) / page_size
        if len(texts) < 2 or ratio > self.IMAGE_RATIO_FOR_OCR:
            is_ocr = True

    if is_ocr and self.manager.config.get('PDF_OCR_PAGES', True):
        image_file = self.pdf_page_to_image(file_path, pagenum, temp_dir)
        with open(image_file, 'rb') as fh:
            text = self.extract_text_from_image(fh.read())
            text = collapse_spaces(text)
            if text is not None:
                texts.append(text)

    text = ' \n'.join(texts).strip()
    self.result.emit_page(int(pagenum), text)
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)
    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    seen = set()
    for tag_query, attr_name in URL_TAGS:
        for element in result.html.findall(tag_query):
            attr = element.get(attr_name)
            if attr is None:
                continue
            url = normalize_url(urljoin(result.url, attr))
            if url is None or url in seen:
                continue
            seen.add(url)
            tag = make_key((context.run_id, url))
            if context.check_tag(tag):
                continue
            context.set_tag(tag, None)
            data = {'url': url}
            # Option to set the document title from the link text.
            if context.get('link_title', False):
                data['title'] = collapse_spaces(element.text_content())
            elif element.get('title'):
                data['title'] = collapse_spaces(element.get('title'))
            context.emit(rule='fetch', data=data)
def crawl(context: Context):
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)
    xls = xlrd.open_workbook(path)
    for sheet in xls.sheets():
        headers = None
        row0 = [h.convert_excel_cell(xls, c) for c in sheet.row(0)]
        sections = [c for c in row0 if c is not None]
        section = collapse_spaces(" / ".join(sections))
        for r in range(1, sheet.nrows):
            row = [h.convert_excel_cell(xls, c) for c in sheet.row(r)]

            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    if header is None:
                        continue
                    values = []
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    for value in multi_split(stringify(cell), SPLITS):
                        if value is None:
                            continue
                        if value == "不明":  # "unknown"
                            continue
                        values.append(value)
                    data[header] = values
                emit_row(context, sheet.name, section, data)

            if not len(row) or row[0] is None:
                continue
            teaser = row[0].strip()
            # the first column of the common headers ("告示日付" = notification date):
            if "告示日付" in teaser:
                if headers is not None:
                    context.log.error("Found double header?", row=row)
                headers = []
                for cell in row:
                    cell = collapse_spaces(cell)
                    header = context.lookup_value("columns", cell)
                    if header is None:
                        context.log.warning("Unknown column title", column=cell, sheet=sheet.name)
                    headers.append(header)
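
# multi_split() above comes from the project's helper module; conceptually it
# splits a value on several delimiters in turn and drops empty pieces. The
# stdlib-only sketch below illustrates that idea -- the name
# multi_split_sketch and its exact semantics are assumptions, not the
# library's implementation:
def multi_split_sketch(value, splitters):
    values = [value] if value is not None else []
    for splitter in splitters:
        values = [part.strip() for v in values for part in v.split(splitter)]
    return [v for v in values if len(v)]

# multi_split_sketch("A; B / C", ["; ", " / "]) -> ["A", "B", "C"]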
def parse_html(context, data, result): context.log.info("Parse: %r", result.url) for title in result.html.xpath(".//title/text()"): if title is not None and "title" not in data: data["title"] = title include = context.params.get("include_paths") if include is None: roots = [result.html] else: roots = [] for path in include: roots = roots + result.html.xpath(path) seen = set() for root in roots: for tag_query, attr_name in URL_TAGS: for element in root.xpath(tag_query): attr = element.get(attr_name) if attr is None: continue try: url = urljoin(result.url, attr) key = url except Exception: log.warning("Invalid URL: %r", attr) continue if url is None or key is None or key in seen: continue seen.add(key) tag = make_key(context.run_id, key) if context.check_tag(tag): continue context.set_tag(tag, None) data["url"] = url if data.get("title") is None: # Option to set the document title from the link text. if context.get("link_title", False): data["title"] = collapse_spaces(element.text_content()) elif element.get("title"): data["title"] = collapse_spaces(element.get("title")) context.http.session.headers["Referer"] = url context.emit(rule="fetch", data=data)
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)
    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    include = context.params.get('include_paths')
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.findall(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.findall(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue
                try:
                    url = normalize_url(urljoin(result.url, attr))
                except Exception:
                    log.warning('Invalid URL: %r', attr)
                    continue
                if url is None or url in seen:
                    continue
                seen.add(url)
                tag = make_key(context.run_id, url)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)
                data = {'url': url}
                # Option to set the document title from the link text.
                if context.get('link_title', False):
                    data['title'] = collapse_spaces(element.text_content())
                elif element.get('title'):
                    data['title'] = collapse_spaces(element.get('title'))
                context.http.session.headers['Referer'] = url
                if re.search('publicId|firstResult', url):
                    context.log.info('URL matches publicId/firstResult: %s', url)
                context.emit(rule='fetch', data=data)
def cleanup_text(text):
    if text is None:
        return
    match = CLEANUP.match(text)
    if match is not None:
        term = match.group('term')
        return collapse_spaces(term)
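
# CLEANUP is a module-level regex defined elsewhere; cleanup_text() only
# requires that it expose a named "term" group. A hypothetical, illustrative
# pattern that would satisfy that contract (not the original definition):
import re
CLEANUP_SKETCH = re.compile(r"^\W*(?P<term>.*?)\W*$", re.UNICODE)
# CLEANUP_SKETCH.match("  Foo  Bar. ").group("term") -> "Foo  Bar"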
def generate(text, keep_order=False):
    text = stringify(text)
    if text is None:
        return

    # this needs to happen before the replacements
    text = text.lower()
    text = remove_person_prefix(text)

    # remove any text in brackets
    text = BRACKETED.sub(WS, text)

    # super hard-core string scrubbing
    text = clean_strict(text)
    text = replace_types(text)

    if keep_order:
        text = collapse_spaces(text)
    else:
        # final manicure, based on openrefine algo
        parts = [p for p in text.split(WS) if len(p)]
        text = WS.join(sorted(set(parts)))

    if not len(text):
        return None
    return text
def parse_entry(context, node):
    entity_name = node.findtext("./Entity")
    if entity_name is not None:
        entity = context.make("LegalEntity")
        entity.add("name", entity_name.split("/"))
    else:
        entity = context.make("Person")
        given_name = node.findtext("./GivenName")
        entity.add("firstName", given_name)
        last_name = node.findtext("./LastName")
        entity.add("lastName", last_name)
        entity.add("name", jointext(given_name, last_name))
        entity.add("birthDate", node.findtext("./DateOfBirth"))

    # ids are per country and entry type (individual/entity)
    item = node.findtext("./Item")
    schedule = node.findtext("./Schedule")
    country = node.findtext("./Country")
    if "/" in country:
        country, _ = country.split("/")
    entity.id = context.make_slug(country, schedule, item, strict=False)
    entity.add("country", country)

    sanction = h.make_sanction(context, entity)
    sanction.add("program", schedule)

    names = node.findtext("./Aliases")
    if names is not None:
        for name in names.split(", "):
            name = collapse_spaces(name)
            entity.add("alias", name)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def generate(text, keep_order=False):
    text = ensure_text(text)
    if text is None:
        return

    # this needs to happen before the replacements
    text = text.lower()

    # try to remove personal prefix, such as Mr., Mrs.
    text = PERSON_PREFIX.sub(WS, text)

    # remove any text in brackets
    text = BRACKETED.sub(WS, text)

    # super hard-core string scrubbing
    text = clean_strict(text)
    text = COMPANY_TYPES.sub(company_type_replacer, text)

    if keep_order:
        text = collapse_spaces(text)
    else:
        # final manicure, based on openrefine algo
        parts = [p for p in text.split(WS) if len(p)]
        text = WS.join(sorted(set(parts)))

    if not len(text):
        return None
    return text
def analyze(self, document):
    if document.schema in self.IGNORED:
        return

    collector = DocumentTagCollector(document, self.ORIGIN)
    text = document.text
    if text is None or len(text) <= self.MIN_LENGTH:
        return

    try:
        hint_language_code = None
        if len(document.languages) == 1:
            hint_language_code = document.languages[0]

        text = Text(text, hint_language_code=hint_language_code)
        for entity in text.entities:
            if entity.tag == 'I-LOC':
                continue
            label = ' '.join(entity)
            label = self.CLEAN.sub(' ', label)
            label = collapse_spaces(label)
            if ' ' not in label or len(label) < 4 or len(label) > 200:
                continue
            # log.info("Entity [Doc %s]: %s [%s]",
            #          document.id, label, entity.tag)
            collector.emit(label, self.TYPES[entity.tag])
    except ValueError as ve:
        log.warning('NER value error: %r', ve)
    except Exception as ex:
        log.warning('NER failed: %r', ex)
    finally:
        collector.save()
        log.info('Polyglot extracted %s entities.', len(collector))
def generate(text: Optional[str], keep_order: bool = False, keep_brackets: bool = False) -> Optional[str]:
    text = stringify(text)
    if text is None:
        return None

    # this needs to happen before the replacements
    text = text.lower()
    text = clean_entity_name(text)

    if not keep_brackets:
        # Remove any text in brackets
        # This is meant to handle names of companies which include
        # the jurisdiction, like: Turtle Management (Seychelles) Ltd.
        text = BRACKETED.sub(WS, text)

    # Super hard-core string scrubbing
    text = clean_strict(text)
    text = replace_types(text)

    if keep_order:
        text = collapse_spaces(text)
    elif text is not None:
        # final manicure, based on openrefine algo
        parts = [p for p in text.split(WS) if len(p)]
        text = WS.join(sorted(set(parts)))

    if text is None or not len(text):
        return None
    return text
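
# The generate() variants above all follow the same shape: normalise, strip
# brackets, scrub, replace company types, then (optionally) sort the unique
# tokens. A self-contained, stdlib-only sketch of that token-sorting
# fingerprint, with the library helpers (stringify, clean_strict,
# replace_types) deliberately left out -- illustrative, not the real thing:
import re

def fingerprint_sketch(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\(.*?\)", " ", text)      # drop bracketed jurisdictions
    text = re.sub(r"[^a-z0-9]+", " ", text)   # crude stand-in for clean_strict
    parts = sorted(set(p for p in text.split() if p))
    return " ".join(parts)

# fingerprint_sketch("Turtle Management (Seychelles) Ltd.")
# -> "ltd management turtle"   (company-type normalisation is skipped here)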
def index_form(texts):
    """Turn a set of strings into the appropriate form for indexing."""
    results = []
    total_len = 0
    for text in texts:
        # We don't want to store more than INDEX_MAX_LEN of text per doc
        if total_len > INDEX_MAX_LEN:
            # TODO: there might be nicer techniques for dealing with overly
            # long text buffers?
            results = list(set(results))
            total_len = sum((len(t) for t in results))
            if total_len > INDEX_MAX_LEN:
                break

        text = stringify(text)
        if text is None:
            continue
        text = collapse_spaces(text)
        total_len += len(text)
        results.append(text)

        # Make latinized text version
        latin = latinize_text(text)
        latin = stringify(latin)
        if latin is None or latin == text:
            continue
        total_len += len(latin)
        results.append(latin)
    return results
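
# Hypothetical usage of index_form(), assuming latinize_text() from the
# normality package produces the transliterated copy; the output shown is
# illustrative only:
#
#   index_form(["Café  24", None])
#   # -> ["Café 24", "Cafe 24"]
#   #    (a latinised variant is appended only when it differs)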
def apply(self, record):
    value = six.text_type(self.template)
    for repl, ref in self.replacements.items():
        ref_value = record.get(ref) or ''
        ref_value = six.text_type(ref_value)
        value = value.replace(repl, ref_value)
    return collapse_spaces(value).strip()
def Extract(self, request, context):
    text = request.text
    if text is None or not len(text.strip()):
        return

    entity_count = 0
    for language in request.languages:
        if language not in LANGUAGES:
            continue
        try:
            parsed = Text(text, hint_language_code=language)
            for entity in parsed.entities:
                label = ' '.join(entity)
                label = CLEAN.sub(' ', label)
                label = collapse_spaces(label)
                if len(label) < 4 or len(label) > 200:
                    continue
                if ' ' not in label:
                    continue
                length = entity.end - entity.start
                entity_count += 1
                yield ExtractedEntity(label=label,
                                      offset=entity.start,
                                      length=length,
                                      type=TYPES[entity.tag])
        except Exception:
            log.exception("Cannot extract. Language: %s", language)
    log.info("Extract: extracted %s entities.", entity_count)
def _make_work_and_affiliates(person, context, description, description_sub,
                              startDate, endDate, emitter, org_website,
                              affiliates):
    company_owner = emitter.make("Company")
    company_owner.add("website", org_website)

    membership = emitter.make("Membership")
    membership.add("startDate", startDate)
    membership.add("endDate", endDate)

    _create_org_and_attach(emitter, context, company_owner, person,
                           description, membership, description_sub, startDate)

    if affiliates is None:
        return
    for aff in affiliates.xpath('.//div[contains(@class,"tochterfirma")]'):
        aff_name_span = aff.xpath(".//span[@class='tochterFirmaName']")
        aff_url_span = aff.xpath('.//span[@class="tochterFirmaLink"]')
        aff_rel_span = aff.xpath('.//span[@class="tochterFirmaBeziehung"]')
        aff_name = collapse_spaces(aff_name_span[0].text) if len(aff_name_span) else None
        if not aff_name:
            # An affiliated company without a name indicates a parsing error.
            continue
        _create_affiliated_company(aff_name, aff_rel_span, aff_url_span,
                                   company_owner, context, emitter)
def emit_document(context, row, date):
    context.http.reset()
    url = context.params.get("url")
    cells = row.findall("./td")
    if not len(cells):
        return
    text = [c.text_content().strip() for c in cells]
    _, num, category, name, _ = text
    title = "%s (%s, %s)" % (name, category, date)
    title = collapse_spaces(title)
    link = row.find('.//a[@class="pdfLnk"]')
    if link is None:
        return
    url = urljoin(url, link.get("href"))
    context.emit(
        data={
            "url": url,
            "title": title,
            "foreign_id": url,
            "countries": ["ch"],
            "dates": [date],
            "extension": "pdf",
            "mime_type": "application/pdf",
        }
    )
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r") as fh: doc = html.parse(fh) table = doc.find('//div[@class="sanctioned-table"]/table') headers = None for row in table.findall(".//tr"): if headers is None: headers = [slugify(el.text) for el in row.findall("./th")] continue cells = [collapse_spaces(el.text) for el in row.findall("./td")] data = {hdr: c for hdr, c in zip(headers, cells)} entity = context.make("Person") entity.id = context.make_id(data["id"], data["ad-soyad-ata-adi"]) entity.add("name", data["ad-soyad-ata-adi"]) entity.add("idNumber", data["id"]) entity.add("birthDate", parse_date(data["dogum-tarixi"])) entity.add("country", "az") entity.add("topics", "sanction") addr = h.make_address(context, full=data["malumat"]) h.apply_address(context, entity, addr) sanction = h.make_sanction(context, entity) context.emit(sanction) context.emit(entity, target=True)
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r", encoding="utf-8") as fh: doc = html.fromstring(fh.read()) for table in doc.findall('.//div[@class="editor-content"]//table'): headers = None schema = None for row in table.findall(".//tr"): cells = [ collapse_spaces(c.text_content()) for c in row.findall("./td") ] if headers is None: headers = [slugify(c, sep="_") for c in cells] continue if len(cells) == 1: schema = TYPES[cells[0]] continue row = dict(zip(headers, cells)) entity = context.make(schema) name = row.pop("imie_i_nazwisko_nazwa_podmiotu") entity.id = context.make_slug(name) names = name.split("(") entity.add("name", names[0]) for alias in names[1:]: entity.add("alias", alias.split(")")[0]) notes = row.pop("uzasadnienie_wpisu_na_liste") entity.add("notes", notes) details = row.pop("dane_identyfikacyjne_osoby_podmiotu") for (chop, prop) in CHOPSKA: parts = details.rsplit(chop, 1) details = parts[0] if len(parts) > 1: if prop == "address": addr = h.make_address(context, full=parts[1]) h.apply_address(context, entity, addr) else: entity.add(prop, parts[1]) if len(details.strip()): result = context.lookup("details", details) if result is None: context.log.warning("Unhandled details", details=details) else: for prop, value in result.props.items(): entity.add(prop, value) sanction = h.make_sanction(context, entity) provisions = row.pop("zastosowane_srodki_sankcyjne") sanction.add("provisions", provisions) start_date = row.pop("data_umieszczenia_na_liscie") start_date = start_date.replace(" r.", "") sanction.add("startDate", h.parse_date(start_date, ["%d.%m.%Y"])) h.audit_data(row) context.emit(entity, target=True) context.emit(sanction)
def clean_name(self, text):
    if text is None or len(text) > self.MAX_LENGTH:
        return
    text = clean_entity_name(text)
    text = collapse_spaces(text)
    if len(text) < self.MIN_LENGTH:
        return
    return text
def _convert_time_span(raw_time_span):
    raw_time_span = collapse_spaces(raw_time_span)
    if "seit" in raw_time_span.lower():
        # "seit" (since) indicates that there is no end time
        return _parse_single_date(raw_time_span), None
    elif "-" in raw_time_span:
        arr = raw_time_span.split("-")
        return _parse_single_date(arr[0]), _parse_single_date(arr[1])
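
# Hypothetical inputs for _convert_time_span(), as they appear on the scraped
# pages ("seit" = "since"); _parse_single_date() is defined elsewhere, so the
# return values below are only sketched:
#
#   _convert_time_span("seit 01.01.2015")         -> (<2015-01-01>, None)
#   _convert_time_span("01.01.2010 - 31.12.2014") -> (<2010-01-01>, <2014-12-31>)
#
# A span matching neither pattern falls through and implicitly returns None.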
def clean_name(text):
    if text is None or len(text) > NAME_MAX_LENGTH:
        return
    text = clean_entity_name(text)
    text = collapse_spaces(text)
    if text is None or len(text) <= NAME_MIN_LENGTH or ' ' not in text:
        return
    return text
def name(self):
    if self._name is not None:
        return self._name
    names = (self.first_name, self.second_name, self.third_name,
             self.father_name, self.last_name)
    names = [n for n in names if n is not None]
    if len(names):
        names = ' '.join(names)
        return collapse_spaces(names)
def clean_strict(text, boundary=WS):
    """Super-hardcore string scrubbing."""
    # transliterate to ascii
    text = ascii_text(text)
    # replace punctuation and symbols
    text = CHARACTERS_REMOVE_RE.sub('', text)
    text = category_replace(text)
    # pad out for company type replacements
    text = ''.join((boundary, collapse_spaces(text), boundary))
    return text
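
# The padding with `boundary` exists so that downstream company-type
# replacements can match whole tokens (e.g. " ltd ") even at the very start
# or end of a name. A self-contained illustration of why that matters -- the
# pattern below is a stand-in, not the library's actual type table:
import re
name = "acme holdings ltd"
padded = " " + name + " "
print(re.sub(r" ltd ", " limited ", padded))   # " acme holdings limited "
print(re.sub(r" ltd ", " limited ", name))     # "acme holdings ltd" (no match)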
def clean_name(cls, text):
    if text is None or len(text) > MAX_LENGTH:
        return
    match = CLEANUP.match(text)
    if match is not None:
        text = match.group('term')
    text = collapse_spaces(text)
    if not len(text) or len(text) < MIN_LENGTH:
        return
    return text
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r", encoding="ISO-8859-1") as fh: doc = html.parse(fh) table = doc.find("//div[@id='viewcontainer']/table") headers = None for row in table.findall(".//tr"): if headers is None: headers = [ slugify(c.text_content(), "_") for c in row.findall("./th") ] continue cells = [ collapse_spaces(c.text_content()) for c in row.findall("./td") ] cells = dict(zip(headers, cells)) cells.pop(None, None) full_name = name = cells.pop("name") registration_number = None for splitter in REG_NRS: if splitter in name: name, registration_number = name.split(splitter, 1) registration_number = registration_number.replace(")", "") country = cells.pop("nationality") country = country.replace("Non ADB Member Country", "") country = country.replace("Rep. of", "") entity = context.make("LegalEntity") entity.id = context.make_id(full_name, country) entity.add("name", name) entity.add("alias", cells.pop("othername_logo")) entity.add("topics", "debarment") entity.add("country", country) entity.add("registrationNumber", registration_number) sanction = h.make_sanction(context, entity) sanction.add("reason", cells.pop("grounds")) sanction.add("program", cells.pop("sanction_type")) date_range = cells.pop("effect_date_lapse_date", "") if "|" in date_range: start_date, end_date = date_range.split("|") sanction.add("startDate", h.parse_date(start_date.strip(), FORMATS)) sanction.add("endDate", h.parse_date(end_date.strip(), FORMATS)) address = h.make_address(context, full=cells.pop("address"), country=country) h.apply_address(context, entity, address) context.emit(entity, target=True) context.emit(sanction)
def get_countries(context):
    doc = context.fetch_html(COUNTRIES_URL)
    path = ".//select[@id='arrestWarrantCountryId']//option"
    options = []
    for option in doc.findall(path):
        code = stringify(option.get("value"))
        if code is None:
            continue
        label = collapse_spaces(option.text_content())
        options.append((code, label))
    return list(sorted(options))
def parse_alias(entity, node):
    names = node.findtext('./ALIAS_NAME')
    if names is None:
        return
    for name in names.split('; '):
        name = collapse_spaces(name)
        if not len(name):
            continue
        alias = entity.create_alias(name=name)
        alias.quality = QUALITY[node.findtext('./QUALITY')]
def text_chunks(texts, sep=" ", max_chunk=25000):
    """Pre-chew text snippets for NLP and pattern matching."""
    for text in texts:
        text = collapse_spaces(text)
        if text is None or len(text) < 5:
            continue
        # Crudest text splitting code in documented human history.
        # Most of the time, a single page of text is going to be
        # 3000-4000 characters, so this really only kicks in if
        # something weird is happening in the first place.
        for idx in range(0, len(text), max_chunk):
            yield text[idx : idx + max_chunk]
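
# Hypothetical usage: chunk a long, collapsed text before NLP / regex
# matching. Assumes text_chunks() above and collapse_spaces from the
# normality package are importable; the sizes shown are illustrative:
#
#   long_text = "word " * 20000          # ~100k characters
#   sizes = [len(c) for c in text_chunks([long_text])]
#   # -> [25000, 25000, 25000, 24999]    (the trailing space is stripped)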