Example #1
def _create_affiliated_company(aff_name, aff_rel_span, aff_url_span,
                               company_owner, context, emitter):
    aff_href = aff_url_span[0].find(".//a") if len(aff_url_span) else None
    aff_href = aff_href.get("href") if aff_href is not None else None
    aff_rel = aff_rel_span[0].text if len(aff_rel_span) else None

    company = emitter.make("Company")
    company.add("name", aff_name)
    company.add("website", aff_href)
    company.make_id("meineabgeordneten.at", aff_name)
    company_ownership = emitter.make("Ownership")
    if aff_rel:
        # The info is given in this format: "GESELLSCHAFTER 50.00% (100.00...)"
        match_percentage = re.search(r"(\d\d?\d?\.\d\d)", aff_rel)
        aff_pct = match_percentage.group(1) if match_percentage else None
        if aff_pct:
            aff_type = collapse_spaces(aff_rel[:match_percentage.start()])
        else:
            aff_type = collapse_spaces(aff_rel)

        print("AFFILIATE pct '{}' ownerType '{}'".format(aff_pct, aff_type))
        company_ownership.add("percentage", aff_pct)
        company_ownership.add("ownershipType", aff_type)

    company_ownership.add("owner", company_owner.id)
    company_ownership.add("asset", company.id)
    company_ownership.make_id(company_owner.id, company.id)
    emitter.emit(company)
    emitter.emit(company_ownership)
    context.log.info("CREATED COMPANY '" + aff_name +
                     "' and membership with id '" + company_ownership.id + "'")
Example #2
def _extract_table_description(context, row, isWork):
    description_sub_el = row.xpath(".//span[@class='bold']")
    affiliated = None

    if len(description_sub_el):
        # The description has a main part and a sub part.
        # The main part usually states the name of an organisation and
        # the sub part the person's function in that organisation.
        desc_main = description_sub_el[0]

        # Sometimes the text is wrapped inside an <a .../> that links to the organisation's website.
        desc_parent = desc_main.getparent()
        href = None

        if desc_parent.tag == "a":
            href = desc_parent.get("href")
            desc_parent.remove(desc_main)
            desc_parent = desc_parent.getparent()
        else:
            desc_parent.remove(desc_main)

        if isWork:
            aff_div = desc_parent.xpath(
                './/div[contains(@class,"tochterfirmen")]')
            if len(aff_div):
                affiliated = aff_div[0]
                desc_parent.remove(affiliated)

        desc_sub = collapse_spaces(desc_parent.text_content())
        description = collapse_spaces(desc_main.text_content())
        context.log.info("PARSED MANDATE DESCRIPTION: {}, {}".format(
            description, desc_sub))
        return description, desc_sub, href, affiliated

    # No bold description element found: return an empty result of the same shape.
    return None, None, None, None
Example #3
    def pdf_extract_page(self, file_path, temp_dir, page):
        """Extract the contents of a single PDF page, using OCR if need be."""
        pagenum = page.get('number')
        page_size = self._element_size(page)
        is_ocr = False

        texts = []
        for text in page.findall('.//text'):
            content = text.xpath('string()').strip()
            content = collapse_spaces(content)
            if len(content):
                texts.append(content)

        for image in page.findall('.//image'):
            ratio = self._element_size(image) / page_size
            if len(texts) < 2 or ratio > self.IMAGE_RATIO_FOR_OCR:
                is_ocr = True

        if is_ocr and self.manager.config.get('PDF_OCR_PAGES', True):
            image_file = self.pdf_page_to_image(file_path, pagenum, temp_dir)
            with open(image_file, 'rb') as fh:
                text = self.extract_text_from_image(fh.read())
                text = collapse_spaces(text)
                if text is not None:
                    texts.append(text)

        text = ' \n'.join(texts).strip()
        self.result.emit_page(int(pagenum), text)
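The OCR trigger above fires when the text layer is nearly empty (fewer than two snippets) or when any single image dominates the page. A sketch of the ratio check with made-up numbers; the real IMAGE_RATIO_FOR_OCR threshold is a class constant not shown here:

IMAGE_RATIO_FOR_OCR = 0.6  # assumed value for illustration only
page_size = 595 * 842      # hypothetical A4 page area in points
image_size = 500 * 700     # hypothetical near-full-page scan
assert (image_size / page_size) > IMAGE_RATIO_FOR_OCR  # -> page gets OCR'd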
Example #4
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    seen = set()
    for tag_query, attr_name in URL_TAGS:
        for element in result.html.findall(tag_query):
            attr = element.get(attr_name)
            if attr is None:
                continue

            url = normalize_url(urljoin(result.url, attr))
            if url is None or url in seen:
                continue
            seen.add(url)

            tag = make_key((context.run_id, url))
            if context.check_tag(tag):
                continue
            context.set_tag(tag, None)

            data = {'url': url}
            # Option to set the document title from the link text.
            if context.get('link_title', False):
                data['title'] = collapse_spaces(element.text_content())
            elif element.get('title'):
                data['title'] = collapse_spaces(element.get('title'))
            context.emit(rule='fetch', data=data)
Example #5
def crawl(context: Context):
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)

    xls = xlrd.open_workbook(path)
    for sheet in xls.sheets():
        headers = None
        row0 = [h.convert_excel_cell(xls, c) for c in sheet.row(0)]
        sections = [c for c in row0 if c is not None]
        section = collapse_spaces(" / ".join(sections))
        for r in range(1, sheet.nrows):
            row = [h.convert_excel_cell(xls, c) for c in sheet.row(r)]

            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    if header is None:
                        continue
                    values = []
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    for value in multi_split(stringify(cell), SPLITS):
                        if value is None:
                            continue
                        if value == "不明":  # "unknown" in Japanese
                            continue
                        values.append(value)
                    data[header] = values
                emit_row(context, sheet.name, section, data)

            if not len(row) or row[0] is None:
                continue
            teaser = row[0].strip()
            # the first column of the common headers:
            if "告示日付" in teaser:
                if headers is not None:
                    context.log.error("Found double header?", row=row)
                # print("SHEET", sheet, row)
                headers = []
                for cell in row:
                    cell = collapse_spaces(cell)
                    header = context.lookup_value("columns", cell)
                    if header is None:
                        context.log.warning("Unknown column title",
                                            column=cell,
                                            sheet=sheet.name)
                    headers.append(header)
Example #6
def parse_html(context, data, result):
    context.log.info("Parse: %r", result.url)

    for title in result.html.xpath(".//title/text()"):
        if title is not None and "title" not in data:
            data["title"] = title

    include = context.params.get("include_paths")
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.xpath(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.xpath(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = urljoin(result.url, attr)
                except Exception:
                    log.warning("Invalid URL: %r", attr)
                    continue

                if url is None or url in seen:
                    continue
                seen.add(url)

                tag = make_key(context.run_id, url)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)
                data["url"] = url

                if data.get("title") is None:
                    # Option to set the document title from the link text.
                    if context.get("link_title", False):
                        data["title"] = collapse_spaces(element.text_content())
                    elif element.get("title"):
                        data["title"] = collapse_spaces(element.get("title"))

                context.http.session.headers["Referer"] = url
                context.emit(rule="fetch", data=data)
Example #7
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    include = context.params.get('include_paths')
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.findall(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.findall(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = normalize_url(urljoin(result.url, attr))
                except Exception:
                    log.warning('Invalid URL: %r', attr)
                    continue

                if url is None or url in seen:
                    continue
                seen.add(url)

                tag = make_key(context.run_id, url)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)
                data = {'url': url}
                # Option to set the document title from the link text.
                if context.get('link_title', False):
                    data['title'] = collapse_spaces(element.text_content())
                elif element.get('title'):
                    data['title'] = collapse_spaces(element.get('title'))

                context.http.session.headers['Referer'] = url
                if re.search('publicId|firstResult', url):
                    context.log.info('Emitting URL: %s', url)
                    context.emit(rule='fetch', data=data)
Example #8
def cleanup_text(text):
    if text is None:
        return
    match = CLEANUP.match(text)
    if match is not None:
        term = match.group('term')
        return collapse_spaces(term)
Example #9
def generate(text, keep_order=False):
    text = stringify(text)
    if text is None:
        return

    # this needs to happen before the replacements
    text = text.lower()
    text = remove_person_prefix(text)

    # remove any text in brackets
    text = BRACKETED.sub(WS, text)

    # super hard-core string scrubbing
    text = clean_strict(text)
    text = replace_types(text)

    if keep_order:
        text = collapse_spaces(text)
    else:
        # final manicure, based on openrefine algo
        parts = [p for p in text.split(WS) if len(p)]
        text = WS.join(sorted(set(parts)))

    if not len(text):
        return None

    return text
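To illustrate the order-insensitive branch above (a sketch; WS is assumed to be a single space, as is common in fingerprint-style normalizers):

WS = " "  # assumption: the module-level whitespace token
text = "siemens ag ag siemens"
parts = [p for p in text.split(WS) if len(p)]
assert WS.join(sorted(set(parts))) == "ag siemens"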
Example #10
def parse_entry(context, node):
    entity_name = node.findtext("./Entity")
    if entity_name is not None:
        entity = context.make("LegalEntity")
        entity.add("name", entity_name.split("/"))
    else:
        entity = context.make("Person")
        given_name = node.findtext("./GivenName")
        entity.add("firstName", given_name)
        last_name = node.findtext("./LastName")
        entity.add("lastName", last_name)
        entity.add("name", jointext(given_name, last_name))
        entity.add("birthDate", node.findtext("./DateOfBirth"))

    # ids are per country and entry type (individual/entity)
    item = node.findtext("./Item")
    schedule = node.findtext("./Schedule")
    country = node.findtext("./Country")
    if "/" in country:
        country, _ = country.split("/")
    entity.id = context.make_slug(country, schedule, item, strict=False)
    entity.add("country", country)
    sanction = h.make_sanction(context, entity)
    sanction.add("program", schedule)

    names = node.findtext("./Aliases")
    if names is not None:
        for name in names.split(", "):
            name = collapse_spaces(name)
            entity.add("alias", name)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Example #11
def generate(text, keep_order=False):
    text = ensure_text(text)
    if text is None:
        return

    # this needs to happen before the replacements
    text = text.lower()

    # try to remove personal prefix, such as Mr., Mrs.
    text = PERSON_PREFIX.sub(WS, text)

    # remove any text in brackets
    text = BRACKETED.sub(WS, text)

    # super hard-core string scrubbing
    text = clean_strict(text)
    text = COMPANY_TYPES.sub(company_type_replacer, text)

    if keep_order:
        text = collapse_spaces(text)
    else:
        # final manicure, based on openrefine algo
        parts = [p for p in text.split(WS) if len(p)]
        text = WS.join(sorted(set(parts)))

    if not len(text):
        return None

    return text
Example #12
    def analyze(self, document):
        if document.schema in self.IGNORED:
            return

        collector = DocumentTagCollector(document, self.ORIGIN)
        text = document.text
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        try:
            hint_language_code = None
            if len(document.languages) == 1:
                hint_language_code = document.languages[0]
            text = Text(text, hint_language_code=hint_language_code)
            for entity in text.entities:
                if entity.tag == 'I-LOC':
                    continue

                label = ' '.join(entity)
                label = self.CLEAN.sub(' ', label)
                label = collapse_spaces(label)
                if ' ' not in label or len(label) < 4 or len(label) > 200:
                    continue
                # log.info("Entity [Doc %s]: %s [%s]",
                #          document.id, label, entity.tag)
                collector.emit(label, self.TYPES[entity.tag])

        except ValueError as ve:
            log.warning('NER value error: %r', ve)
        except Exception as ex:
            log.warning('NER failed: %r', ex)
        finally:
            collector.save()
            log.info('Polyglot extracted %s entities.', len(collector))
Example #13
def generate(text: Optional[str],
             keep_order: bool = False,
             keep_brackets: bool = False) -> Optional[str]:
    text = stringify(text)
    if text is None:
        return None

    # this needs to happen before the replacements
    text = text.lower()
    text = clean_entity_name(text)

    if not keep_brackets:
        # Remove any text in brackets
        # This is meant to handle names of companies which include
        # the jurisdiction, like: Turtle Management (Seychelles) Ltd.
        text = BRACKETED.sub(WS, text)

    # Super hard-core string scrubbing
    text = clean_strict(text)
    text = replace_types(text)

    if keep_order:
        text = collapse_spaces(text)
    elif text is not None:
        # final manicure, based on openrefine algo
        parts = [p for p in text.split(WS) if len(p)]
        text = WS.join(sorted(set(parts)))

    if text is None or not len(text):
        return None

    return text
Example #14
def index_form(texts):
    """Turn a set of strings into the appropriate form for indexing."""
    results = []
    total_len = 0

    for text in texts:
        # We don't want to store more than INDEX_MAX_LEN of text per doc
        if total_len > INDEX_MAX_LEN:
            # TODO: there might be nicer techniques for dealing with overly
            # long text buffers?
            results = list(set(results))
            total_len = sum((len(t) for t in results))
            if total_len > INDEX_MAX_LEN:
                break

        text = stringify(text)
        if text is None:
            continue
        text = collapse_spaces(text)
        total_len += len(text)
        results.append(text)

        # Make latinized text version
        latin = latinize_text(text)
        latin = stringify(latin)
        if latin is None or latin == text:
            continue
        total_len += len(latin)
        results.append(latin)
    return results
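The overflow branch above deduplicates before giving up, so repeated values only count once against the budget. A sketch of that order of operations with a toy limit standing in for INDEX_MAX_LEN:

results = ["abcdef", "abcdef"]   # 12 chars total, over a toy limit of 10
results = list(set(results))     # -> ["abcdef"], 6 chars
assert sum(len(t) for t in results) <= 10  # indexing can continue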
Example #15
 def apply(self, record):
     value = six.text_type(self.template)
     for repl, ref in self.replacements.items():
         ref_value = record.get(ref) or ''
         ref_value = six.text_type(ref_value)
         value = value.replace(repl, ref_value)
     return collapse_spaces(value).strip()
Example #16
    def Extract(self, request, context):
        text = request.text
        if text is None or not len(text.strip()):
            return

        entity_count = 0
        for language in request.languages:
            if language not in LANGUAGES:
                continue
            try:
                parsed = Text(text, hint_language_code=language)
                for entity in parsed.entities:
                    label = ' '.join(entity)
                    label = CLEAN.sub(' ', label)
                    label = collapse_spaces(label)
                    if len(label) < 4 or len(label) > 200:
                        continue
                    if ' ' not in label:
                        continue
                    length = entity.end - entity.start
                    entity_count += 1
                    yield ExtractedEntity(label=label,
                                          offset=entity.start,
                                          length=length,
                                          type=TYPES[entity.tag])
            except Exception:
                log.exception("Cannot extract. Language: %s", language)
        log.info("Extract: extracted %s entities.", entity_count)
Example #17
def _make_work_and_affiliates(person, context, description, description_sub,
                              startDate, endDate, emitter, org_website,
                              affiliates):
    company_owner = emitter.make("Company")
    company_owner.add("website", org_website)
    membership = emitter.make("Membership")
    membership.add("startDate", startDate)
    membership.add("endDate", endDate)

    _create_org_and_attach(emitter, context, company_owner, person,
                           description, membership, description_sub, startDate)

    if affiliates is None:
        return

    for aff in affiliates.xpath('.//div[contains(@class,"tochterfirma")]'):
        aff_name_span = aff.xpath(".//span[@class='tochterFirmaName']")
        aff_url_span = aff.xpath('.//span[@class="tochterFirmaLink"]')
        aff_rel_span = aff.xpath('.//span[@class="tochterFirmaBeziehung"]')

        aff_name = collapse_spaces(
            aff_name_span[0].text) if len(aff_name_span) else None

        if not aff_name:
            # An affiliated company without a name indicates a parsing error.
            continue

        _create_affiliated_company(aff_name, aff_rel_span, aff_url_span,
                                   company_owner, context, emitter)
Example #18
def emit_document(context, row, date):
    context.http.reset()
    url = context.params.get("url")
    cells = row.findall("./td")
    if not len(cells):
        return

    text = [c.text_content().strip() for c in cells]
    _, num, category, name, _ = text
    title = "%s (%s, %s)" % (name, category, date)
    title = collapse_spaces(title)

    link = row.find('.//a[@class="pdfLnk"]')
    if link is None:
        return
    url = urljoin(url, link.get("href"))

    context.emit(
        data={
            "url": url,
            "title": title,
            "foreign_id": url,
            "countries": ["ch"],
            "dates": [date],
            "extension": "pdf",
            "mime_type": "application/pdf",
        })
Example #19
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    table = doc.find('//div[@class="sanctioned-table"]/table')
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            headers = [slugify(el.text) for el in row.findall("./th")]
            continue
        cells = [collapse_spaces(el.text) for el in row.findall("./td")]
        data = {hdr: c for hdr, c in zip(headers, cells)}

        entity = context.make("Person")
        entity.id = context.make_id(data["id"], data["ad-soyad-ata-adi"])
        entity.add("name", data["ad-soyad-ata-adi"])
        entity.add("idNumber", data["id"])
        entity.add("birthDate", parse_date(data["dogum-tarixi"]))
        entity.add("country", "az")
        entity.add("topics", "sanction")

        addr = h.make_address(context, full=data["malumat"])
        h.apply_address(context, entity, addr)

        sanction = h.make_sanction(context, entity)
        context.emit(sanction)
        context.emit(entity, target=True)
Example #20
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())
    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for row in table.findall(".//tr"):
            cells = [
                collapse_spaces(c.text_content()) for c in row.findall("./td")
            ]
            if headers is None:
                headers = [slugify(c, sep="_") for c in cells]
                continue
            if len(cells) == 1:
                schema = TYPES[cells[0]]
                continue
            row = dict(zip(headers, cells))

            entity = context.make(schema)
            name = row.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            names = name.split("(")
            entity.add("name", names[0])
            for alias in names[1:]:
                entity.add("alias", alias.split(")")[0])
            notes = row.pop("uzasadnienie_wpisu_na_liste")
            entity.add("notes", notes)

            details = row.pop("dane_identyfikacyjne_osoby_podmiotu")
            for (chop, prop) in CHOPSKA:
                parts = details.rsplit(chop, 1)
                details = parts[0]
                if len(parts) > 1:
                    if prop == "address":
                        addr = h.make_address(context, full=parts[1])
                        h.apply_address(context, entity, addr)
                    else:
                        entity.add(prop, parts[1])
            if len(details.strip()):
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)

            sanction = h.make_sanction(context, entity)
            provisions = row.pop("zastosowane_srodki_sankcyjne")
            sanction.add("provisions", provisions)

            start_date = row.pop("data_umieszczenia_na_liscie")
            start_date = start_date.replace(" r.", "")
            sanction.add("startDate", h.parse_date(start_date, ["%d.%m.%Y"]))

            h.audit_data(row)
            context.emit(entity, target=True)
            context.emit(sanction)
Example #21
 def clean_name(self, text):
     if text is None or len(text) > self.MAX_LENGTH:
         return
     text = clean_entity_name(text)
     text = collapse_spaces(text)
     if len(text) < self.MIN_LENGTH:
         return
     return text
Example #22
def _convert_time_span(raw_time_span):
    raw_time_span = collapse_spaces(raw_time_span)
    if "seit" in raw_time_span.lower():
        # "seit" (since) indicates that there is no end time
        return _parse_single_date(raw_time_span), None
    elif "-" in raw_time_span:
        arr = raw_time_span.split("-")
        return _parse_single_date(arr[0]), _parse_single_date(arr[1])
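A rough usage sketch; the exact return values depend on _parse_single_date, which is assumed to accept German-formatted date strings:

# _convert_time_span("seit 03.2015")    # -> (<parsed "seit 03.2015">, None)
# _convert_time_span("03.2010 - 2014")  # -> (<parsed "03.2010 ">, <parsed " 2014">)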
Example #23
def clean_name(text):
    if text is None or len(text) > NAME_MAX_LENGTH:
        return
    text = clean_entity_name(text)
    text = collapse_spaces(text)
    if text is None or len(text) <= NAME_MIN_LENGTH or ' ' not in text:
        return
    return text
Example #24
 def name(self):
     if self._name is not None:
         return self._name
     names = (self.first_name, self.second_name, self.third_name,
              self.father_name, self.last_name)
     names = [n for n in names if n is not None]
     if len(names):
         names = ' '.join(names)
         return collapse_spaces(names)
Example #25
def clean_strict(text, boundary=WS):
    """Super-hardcore string scrubbing."""
    # transliterate to ascii
    text = ascii_text(text)
    # replace punctuation and symbols
    text = CHARACTERS_REMOVE_RE.sub('', text)
    text = category_replace(text)
    # pad out for company type replacements
    text = ''.join((boundary, collapse_spaces(text), boundary))
    return text
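Why the boundary padding matters: downstream company-type replacement can then match whole tokens only. A sketch with a hypothetical pattern, not the library's actual COMPANY_TYPES regex:

import re

padded = " muster bau gmbh "                # shape of clean_strict output
replaced = re.sub(r" gmbh ", " ", padded)   # whole-token match relies on the padding
assert replaced.strip() == "muster bau"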
Example #26
 def clean_name(cls, text):
     if text is None or len(text) > MAX_LENGTH:
         return
     match = CLEANUP.match(text)
     if match is not None:
         text = match.group('term')
     text = collapse_spaces(text)
     if not len(text) or len(text) < MIN_LENGTH:
         return
     return text
Example #27
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)

    table = doc.find("//div[@id='viewcontainer']/table")
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            headers = [
                slugify(c.text_content(), "_") for c in row.findall("./th")
            ]
            continue
        cells = [
            collapse_spaces(c.text_content()) for c in row.findall("./td")
        ]
        cells = dict(zip(headers, cells))
        cells.pop(None, None)

        full_name = name = cells.pop("name")
        registration_number = None
        for splitter in REG_NRS:
            if splitter in name:
                name, registration_number = name.split(splitter, 1)
                registration_number = registration_number.replace(")", "")

        country = cells.pop("nationality")
        country = country.replace("Non ADB Member Country", "")
        country = country.replace("Rep. of", "")
        entity = context.make("LegalEntity")
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", cells.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("grounds"))
        sanction.add("program", cells.pop("sanction_type"))
        date_range = cells.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            start_date, end_date = date_range.split("|")
            sanction.add("startDate", h.parse_date(start_date.strip(),
                                                   FORMATS))
            sanction.add("endDate", h.parse_date(end_date.strip(), FORMATS))

        address = h.make_address(context,
                                 full=cells.pop("address"),
                                 country=country)
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
Example #28
def get_countries(context):
    doc = context.fetch_html(COUNTRIES_URL)
    path = ".//select[@id='arrestWarrantCountryId']//option"
    options = []
    for option in doc.findall(path):
        code = stringify(option.get("value"))
        if code is None:
            continue
        label = collapse_spaces(option.text_content())
        options.append((code, label))
    return sorted(options)
Example #29
def parse_alias(entity, node):
    names = node.findtext('./ALIAS_NAME')
    if names is None:
        return

    for name in names.split('; '):
        name = collapse_spaces(name)
        if not len(name):
            continue

        alias = entity.create_alias(name=name)
        alias.quality = QUALITY[node.findtext('./QUALITY')]
Example #30
def text_chunks(texts, sep=" ", max_chunk=25000):
    """Pre-chew text snippets for NLP and pattern matching."""
    for text in texts:
        text = collapse_spaces(text)
        if text is None or len(text) < 5:
            continue
        # Crudest text splitting code in documented human history.
        # Most of the time, a single page of text is going to be
        # 3000-4000 characters, so this really only kicks in if
        # something weird is happening in the first place.
        for idx in range(0, len(text), max_chunk):
            yield text[idx : idx + max_chunk]
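A quick usage sketch of the generator above, assuming text_chunks and collapse_spaces from this module are in scope:

pages = ["word " * 10000, "  tiny  "]
chunks = list(text_chunks(pages, max_chunk=25000))
# The long page (~50,000 chars) yields two chunks; "tiny" is dropped because
# it is shorter than 5 characters after collapsing.
assert len(chunks) == 2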