Beispiel #1
0
def import_promises():
    """Rebuild the Promise table from the CSV at ``settings.PROMISES_CSV``.

    Every existing Promise is deleted and one is re-created per usable row:

    * rows without a ``gss_code`` are skipped;
    * rows whose ``gss_code`` matches no Council are reported on stderr and
      skipped;
    * rows with a ``source_url`` become a Promise with ``has_promise=True``;
    * rows without a ``source_url`` whose scope is "No promise" become a
      Promise with ``has_promise=False``.
    """
    # Read the sheet *before* clearing the table, so that a missing or
    # unreadable CSV raises without having wiped the existing promises.
    df = pd.read_csv(settings.PROMISES_CSV)

    # just refresh everything
    Promise.objects.all().delete()

    # Matches text that starts with four digits and captures those digits;
    # compiled once here rather than once per row.
    leading_year = re.compile(r"^(\d{4}).*$")

    for _, row in df.iterrows():
        scope = row["scope"]
        gss_code = row["gss_code"]
        if pd.isnull(gss_code) or gss_code == "nan":
            continue

        try:
            council = Council.objects.get(gss_code=gss_code)
        except Council.DoesNotExist:
            print(
                "Could not find council to import promise: %s" % row["council"],
                file=sys.stderr,
            )
            continue

        if not pd.isnull(row["source_url"]):
            # the sheet's "Council operations" is stored under "Council only"
            if scope == "Council operations":
                scope = "Council only"

            # needs to be a string for the regexp to work
            target = str(char_from_text(row["target"]))

            # some of the entries in the sheet are not years or have slight
            # clarifications so remove those; anything that is not exactly
            # four characters afterwards is treated as "no target year"
            target = leading_year.sub(r"\1", target)
            target_year = target if len(target) == 4 else None

            Promise.objects.create(
                council=council,
                scope=PlanDocument.scope_code(scope),
                source=char_from_text(row["source_url"]),
                source_name=char_from_text(row["source_name"]),
                target_year=target_year,
                text=char_from_text(row["wording"]),
                notes=char_from_text(row["notes"]),
                has_promise=True,
            )
        elif scope == "No promise":
            Promise.objects.create(
                council=council, scope=PlanDocument.scope_code(scope), has_promise=False
            )
Beispiel #2
0
def add_text_to_csv(get_all):
    """Add a ``text`` column to the processed CSV and fill it for PDF rows.

    The CSV at ``settings.PROCESSED_CSV`` is read, updated and written back
    in place.

    Args:
        get_all: when truthy, ``generate_text`` is called for every PDF row.
            Otherwise the text of an already stored PlanDocument (matched on
            URL hash and council name) is reused, and ``generate_text`` is
            only called when no such document exists.
    """
    df = pd.read_csv(settings.PROCESSED_CSV)
    rows = len(df["council"])

    # add a text column to the CSV
    df["text"] = pd.Series([None] * rows, index=df.index)

    # convert each PDF to text and add the text to the column
    pdf_rows = df["file_type"] == "pdf"
    for index, row in df[pdf_rows].iterrows():
        filename = basename(row["plan_path"])
        url = row["url"]
        url_hash = PlanDocument.make_url_hash(url)
        council = row["council"]
        pdf_path = join(settings.PLANS_DIR, filename)
        if get_all:
            # presumably writes the extracted text into df at this index;
            # verify against generate_text
            generate_text(pdf_path, df, index)
        else:
            try:
                plan_document = PlanDocument.objects.get(url_hash=url_hash,
                                                         council__name=council)
                df.at[index, "text"] = plan_document.text
            except PlanDocument.DoesNotExist:
                generate_text(pdf_path, df, index)

    # save the CSV; the with-block guarantees the handle is flushed and
    # closed even if writing fails part-way
    with open(settings.PROCESSED_CSV, "w") as csv_file:
        df.to_csv(csv_file, index=False, header=True)
Beispiel #3
0
def update_plan(row, index, df, get_all):
    """Make sure ``df`` has file details for the plan described by ``row``.

    With ``get_all`` set the plan is always fetched via ``get_plan``.
    Otherwise, if a PlanDocument with the same URL hash and council name is
    already stored, its charset, file type and local path are copied into
    ``df`` at ``index``; if not, the plan is fetched via ``get_plan``.
    """
    plan_url = row["url"]
    council_name = row["council"]
    hashed_url = PlanDocument.make_url_hash(plan_url)
    base_name = PlanDocument.plan_filename(council_name, plan_url)

    if get_all:
        get_plan(row, index, df)
        return

    # If we've already loaded a document from this URL, don't get the file again
    try:
        stored = PlanDocument.objects.get(url_hash=hashed_url,
                                          council__name=council_name)
    except PlanDocument.DoesNotExist:
        print(f"fetching: {plan_url} ({hashed_url}) ")
        get_plan(row, index, df)
        return

    df.at[index, "charset"] = stored.charset
    df.at[index, "file_type"] = stored.file_type
    df.at[index, "plan_path"] = join(settings.PLANS_DIR,
                                     base_name + "." + stored.file_type)
Beispiel #4
0
    def get_plan_defaults_from_row(self, row):
        """Build the PlanDocument field values for one CSV ``row``.

        Returns a dict keyed by field name. ``title`` is taken from the row
        only when its ``title_checked`` cell is "y" (case-insensitive);
        otherwise it is the empty string.
        """
        first_year, last_year = PlanDocument.start_and_end_year_from_time_period(
            row["time_period"])

        # coded fields: sheet text is mapped to a code by PlanDocument
        plan_fields = {
            "document_type": PlanDocument.document_type_code(row["type"]),
            "scope": PlanDocument.scope_code(row["scope"]),
            "status": PlanDocument.status_code(row["status"]),
        }
        # columns whose field name matches the column name
        plan_fields.update(
            (column, boolean_from_text(row[column]))
            for column in ("well_presented", "baseline_analysis"))
        plan_fields.update(
            (column, char_from_text(row[column]))
            for column in ("notes", "file_type", "charset", "text"))

        plan_fields["start_year"] = first_year
        plan_fields["end_year"] = last_year
        plan_fields["date_last_found"] = date_from_text(row["date_retrieved"])

        title_confirmed = char_from_text(row["title_checked"]).lower() == "y"
        plan_fields["title"] = char_from_text(row["title"]) if title_confirmed else ""

        return plan_fields
Beispiel #5
0
def get_plan(row, index, df):
    """Download the plan at ``row["url"]`` into ``settings.PLANS_DIR``.

    On success the file attributes are set on ``df`` via
    ``set_file_attributes`` and ``plan_path`` at ``index`` is set to the
    saved file. On any ``requests`` error the problem is printed and the
    ``url`` cell at ``index`` is set to NaN.
    """
    plan_url = row["url"]
    council_name = row["council"]
    base_name = PlanDocument.plan_filename(council_name, plan_url)
    # only the extension of the URL path is needed
    extension = splitext(urlparse(plan_url).path)[1]

    try:
        # NOTE: certificate verification is switched off for this request
        response = requests.get(
            plan_url,
            headers={
                "User-Agent": "mySociety Council climate action plans search",
            },
            verify=False,
            timeout=10,
        )
        response.raise_for_status()
        set_file_attributes(df, index, response.headers.get("content-type"),
                            extension)
        destination = join(settings.PLANS_DIR,
                           base_name + "." + df.at[index, "file_type"])
        df.at[index, "plan_path"] = destination
        with open(destination, "wb") as plan_file:
            plan_file.write(response.content)
    except requests.exceptions.RequestException as err:
        print(f"Error {council_name} {plan_url}: {err}")
        df.at[index, "url"] = numpy.nan
Beispiel #6
0
 def test_whitespace_removed(self):
     # " Approved" has a leading space and must still map to APPROVED
     code = PlanDocument.status_code(" Approved")
     self.assertEqual(PlanDocument.APPROVED, code)
Beispiel #7
0
 def test_invalid_entry(self):
     # unrecognised status text must give None
     code = PlanDocument.status_code("Something else")
     self.assertEqual(None, code)
Beispiel #8
0
 def test_capitalisation_normalised(self):
     # mixed-case status text must still map to APPROVED
     code = PlanDocument.status_code("aPProVed")
     self.assertEqual(PlanDocument.APPROVED, code)
Beispiel #9
0
 def test_dates_then_text_parsed(self):
     # a year range followed by free text must still give (2020, 2030)
     time_period = (
         '2020-2030: "aim to become carbon neutral by 2030, and 80% by 2025."'
     )
     years = PlanDocument.start_and_end_year_from_time_period(time_period)
     self.assertEqual((2020, 2030), years)
Beispiel #10
0
 def test_whitespace_removed(self):
     # "Council only " has a trailing space and must still map to the
     # scope constant COUNCIL_ONLY. The expectation previously named the
     # document-type constant ACTION_PLAN, which is not a scope code.
     expected = PlanDocument.COUNCIL_ONLY
     actual = PlanDocument.scope_code("Council only ")
     self.assertEqual(expected, actual)
Beispiel #11
0
 def test_capitalisation_normalised(self):
     # mixed-case scope text must still map to WHOLE_AREA
     code = PlanDocument.scope_code("WhOle AreA")
     self.assertEqual(PlanDocument.WHOLE_AREA, code)
Beispiel #12
0
 def test_simple_case(self):
     # exact scope text must map to COUNCIL_ONLY
     code = PlanDocument.scope_code("Council only")
     self.assertEqual(PlanDocument.COUNCIL_ONLY, code)
Beispiel #13
0
 def test_whitespace_removed(self):
     # "Action plan " has a trailing space and must still map to ACTION_PLAN
     code = PlanDocument.document_type_code("Action plan ")
     self.assertEqual(PlanDocument.ACTION_PLAN, code)
Beispiel #14
0
 def test_capitalisation_normalised(self):
     # mixed-case document type text must still map to CLIMATE_STRATEGY
     code = PlanDocument.document_type_code("Climate STraTegy")
     self.assertEqual(PlanDocument.CLIMATE_STRATEGY, code)
Beispiel #15
0
 def test_simple_case(self):
     # exact document type text must map to ACTION_PLAN
     code = PlanDocument.document_type_code("Action plan")
     self.assertEqual(PlanDocument.ACTION_PLAN, code)
Beispiel #16
0
    def update_database(self):
        """Bring Council and PlanDocument rows in line with the processed CSV.

        For every row of ``settings.PROCESSED_CSV``:

        * the Council identified by authority code and country is fetched or
          created, and any of its fields that differ from the sheet are
          updated and saved;
        * if the row has a URL and its index is in ``self.plans_to_process``,
          the matching PlanDocument is created or updated from the row and
          the file at ``plan_path``.

        Afterwards, plan documents for councils not listed in
        ``self.councils_with_plan_in_sheet``, the URLs listed per council in
        ``self.plans_to_delete``, and councils not listed in
        ``self.councils_in_sheet`` are deleted, and the ``end_*`` counters
        are recomputed.
        """
        df = pd.read_csv(settings.PROCESSED_CSV)
        for index, row in df.iterrows():
            council_url = char_from_text(row["website_url"])
            twitter_url = char_from_text(row["twitter_url"])
            twitter_name = char_from_text(row["twitter_name"])
            region = char_from_text(row["region"])
            county = char_from_text(row["county"])
            council, _ = Council.objects.get_or_create(
                authority_code=char_from_text(row["authority_code"]),
                country=Council.country_code(row["country"]),
                defaults={
                    "authority_type": char_from_text(row["authority_type"]),
                    "name": row["council"],
                    "slug": PlanDocument.council_slug(row["council"]),
                    "gss_code": char_from_text(row["gss_code"]),
                    "whatdotheyknow_id": integer_from_text(row["wdtk_id"]),
                    "mapit_area_code": char_from_text(row["mapit_area_code"]),
                    "website_url": council_url,
                    "twitter_url": twitter_url,
                    "twitter_name": twitter_name,
                    "county": county,
                    "region": region,
                },
            )

            # check the council things that might change
            changed = False

            if char_from_text(row["authority_type"]) != council.authority_type:
                council.authority_type = char_from_text(row["authority_type"])
                changed = True

            if row["council"] != council.name:
                council.name = row["council"]
                council.slug = PlanDocument.council_slug(row["council"])
                changed = True

            if char_from_text(row["gss_code"]) != council.gss_code:
                council.gss_code = char_from_text(row["gss_code"])
                changed = True

            # an empty URL in the sheet never overwrites a stored one
            if council_url != "" and council.website_url != council_url:
                council.website_url = council_url
                changed = True

            # Only flag a change when a twitter value really differs. The
            # previous extra test `council.twitter_name != ""` made this
            # branch fire for every council that had a twitter name, forcing
            # a save that assigned the values already stored.
            if (council.twitter_name != twitter_name
                    or council.twitter_url != twitter_url):
                council.twitter_url = twitter_url
                council.twitter_name = twitter_name
                changed = True

            if council.region != region:
                council.region = region
                changed = True

            if council.county != county:
                council.county = county
                changed = True

            if changed:
                council.save()

            if not pd.isnull(row["url"]) and index in self.plans_to_process:
                # Keep the source file open only while the document is being
                # saved, so handles are not leaked across a long import.
                with open(row["plan_path"], "rb") as document_file:
                    defaults = {"file": File(document_file)}
                    defaults.update(self.get_plan_defaults_from_row(row))

                    plan_document, created = PlanDocument.objects.update_or_create(
                        url=row["url"],
                        url_hash=PlanDocument.make_url_hash(row["url"]),
                        council=council,
                        defaults=defaults,
                    )
                    if created:
                        # date_first_found is only set when the document is new
                        plan_document.date_first_found = date_from_text(
                            row["date_retrieved"])
                        plan_document.save()

        # drop documents of councils that no longer have a plan in the sheet
        PlanDocument.objects.exclude(
            council__gss_code__in=self.councils_with_plan_in_sheet).delete()

        # drop the individual plan URLs flagged for deletion, per council
        for council_code, urls in self.plans_to_delete.items():
            council = Council.objects.get(gss_code=council_code)
            PlanDocument.objects.filter(council=council, url__in=urls).delete()

        # drop councils that are no longer in the sheet
        Council.objects.exclude(gss_code__in=self.councils_in_sheet).delete()

        self.end_council_plan_count = (Council.objects.annotate(
            num_plans=Count("plandocument")).filter(
                Q(plandocument__document_type=PlanDocument.ACTION_PLAN)
                | Q(plandocument__document_type=PlanDocument.CLIMATE_STRATEGY),
                num_plans__gt=0,
            ).count())
        self.end_document_count = PlanDocument.objects.count()
        self.end_plan_count = PlanDocument.objects.filter(
            Q(document_type=PlanDocument.ACTION_PLAN)
            | Q(document_type=PlanDocument.CLIMATE_STRATEGY)).count()
Beispiel #17
0
 def test_simple_case(self):
     # a plain "start-end" range must give both years as integers
     years = PlanDocument.start_and_end_year_from_time_period("2020-2030")
     self.assertEqual((2020, 2030), years)
Beispiel #18
0
 def test_simple_case(self):
     # exact status text must map to DRAFT
     code = PlanDocument.status_code("Draft")
     self.assertEqual(PlanDocument.DRAFT, code)
Beispiel #19
0
 def test_no_dates_set_to_none(self):
     # text without any years must give (None, None)
     time_period = "Not yet published"
     years = PlanDocument.start_and_end_year_from_time_period(time_period)
     self.assertEqual((None, None), years)