def get_pages_by_ward_name(self, ward):
    """
    Return the pages of the first section whose heading matches ``ward``.

    Pages in ``self.pages`` are scanned in order. A top page whose cleaned
    heading contains the cleaned ward name (or, when the name contains "/",
    any one of its "/"-separated parts) starts the match. Every following
    non-top page is collected as a continuation until the next top page.

    :param ward: ward name to look for; "/" separates alternative names
    :return: list of matched pages, or None when no heading matches
    """
    # Split once, up front. Reusing ``ward`` as the inner loop variable used
    # to leave only the last alternative available for later top pages.
    ward_names = clean_text(ward).split("/")
    matched_pages = []
    for page in self.pages:
        if not page.is_top_page:
            # Continuation pages only count once a section has matched.
            if matched_pages:
                matched_pages.append(page)
            continue
        if matched_pages:
            # The next top page ends the matched section.
            return matched_pages
        search_text = clean_text(page.get_page_heading())
        # any() adds the page once even when several alternatives match.
        if any(name in search_text for name in ward_names):
            matched_pages.append(page)
    # Preserve the original contract: None (not an empty list) on no match.
    return matched_pages or None
Example #2
0
def extract_ballot_table(ballot, parse_flavor="lattice"):
    """
    Given a Ballot, update or create a ParsedSOPN model with the contents of
    the table found in the ballot's SOPN document, stored as a JSON string.

    The tables on the document's relevant pages are read with camelot,
    ordered by page and position, and joined into a single table before
    being saved in ``raw_data``.

    :type ballot: candidates.models.Ballot
    :param parse_flavor: camelot parsing flavor passed to ``read_pdf``
    :return: the ParsedSOPN, or None when no tables were found or the
        combined table is empty
    :raises ValueError: if the document has no relevant pages recorded
    :raises NoTextInDocumentError: if camelot raises NotImplementedError or
        AttributeError while reading the PDF
    """
    # Imported locally so this function does not depend on a module-level
    # pandas import being present.
    import pandas as pd

    document = ballot.sopn
    if not document.relevant_pages:
        raise ValueError(
            "Pages for table not known for document, extract page numbers first"
        )

    try:
        tables = camelot.read_pdf(
            document.uploaded_file.url,
            pages=document.relevant_pages,
            flavor=parse_flavor,
        )
    except (NotImplementedError, AttributeError) as err:
        # * NotImplementedError is thrown if the PDF is an image or generally
        #   unreadable.
        # * AttributeError is thrown on some PDFs saying they need a password.
        #   Assume this is a bug in camelot, and ignore these PDFs
        raise NoTextInDocumentError() from err

    # Tables can span pages, camelot assumes they're different tables, so we
    # need to join them back together
    table_list = sorted(tables, key=lambda t: (t.page, t.order))
    if not table_list:
        return None

    frames = [table_list[0].df]
    for table in table_list[1:]:
        # It's possible to have the "situation of poll" document on the SOPN.
        # SOPN tables don't contain "polling station", so stop at the first
        # table whose first row does: it and everything after it is left out.
        first_row = table.df.iloc[0].to_string()
        if "polling station" in clean_text(first_row):
            break
        frames.append(table.df)

    if len(frames) > 1:
        # Join the continuation tables onto the first one in a single call
        # (DataFrame.append no longer exists in pandas 2.x). ignore_index is
        # needed so that e.g. table 2 row 1 doesn't replace table 1 row 1.
        table_data = pd.concat(frames, ignore_index=True)
    else:
        table_data = frames[0]

    if table_data.empty:
        return None

    parsed, _ = ParsedSOPN.objects.update_or_create(
        sopn=document,
        defaults={"raw_data": json.dumps(table_data.to_dict())},
    )
    return parsed
Example #3
0
 def test_clean_text(self):
     """clean_text should reduce a messy heading sample to "candidates"."""
     raw_heading = "\n C andidates (Namés)"
     expected = "candidates"
     self.assertEqual(clean_text(raw_heading), expected)
Example #4
0
def contains_header_like_strings(row):
    """
    Return True if the cleaned string form of ``row`` contains any of the
    NAME_FIELDS entries, otherwise False.
    """
    cleaned = clean_text(row.to_string())
    return any(field in cleaned for field in NAME_FIELDS)
Example #5
0
def clean_row(row):
    """Return a new list with ``clean_text`` applied to every cell of ``row``."""
    cleaned_cells = [clean_text(cell) for cell in row]
    return cleaned_cells