Example #1
0
    def parse_text(self, soup: BeautifulSoup) -> pd.Series:
        """Extract vaccination figures from the page text.

        Combines the national-programme counts with the WHO-EUL counts and
        derives people_fully_vaccinated from the two reported percentages.
        """
        pattern_national = r"As of ([\d]+ [A-Za-z]+ 20\d{2}), we have administered a total of ([\d,]+) doses of COVID-19 vaccines under the national vaccination programme \(Pfizer-BioNTech Comirnaty and Moderna\), covering ([\d,]+) individuals"
        nat_date_raw, nat_doses_raw, nat_people_raw = re.search(
            pattern_national, soup.text).groups()
        nat_date = clean_date(nat_date_raw, fmt="%d %B %Y", lang="en_US", loc="en_US")
        nat_doses = clean_count(nat_doses_raw)
        nat_people = clean_count(nat_people_raw)

        pattern_eul = r"In addition, ([\d,]+) doses of other vaccines recognised in the World Health Organization.s Emergency Use Listing \(WHO EUL\) have been administered as of ([\d]+ [A-Za-z]+ 20\d{2}), covering ([\d,]+) individuals\. In total, (\d+)% of our population has completed their full regimen/ received two doses of COVID-19 vaccines, and (\d+)% has received at least one dose"
        (eul_doses_raw, eul_date_raw, eul_people_raw,
         pct_full_raw, pct_any_raw) = re.search(pattern_eul, soup.text).groups()
        eul_doses = clean_count(eul_doses_raw)
        eul_date = clean_date(eul_date_raw, fmt="%d %B %Y", lang="en_US", loc="en_US")
        eul_people = clean_count(eul_people_raw)

        people_vaccinated = nat_people + eul_people
        # Estimate fully-vaccinated people by scaling the combined count
        # with the ratio of the two published population shares.
        people_fully_vaccinated = round(
            people_vaccinated * (int(pct_full_raw) / int(pct_any_raw)))

        return pd.Series({
            "date": max(nat_date, eul_date),
            "total_vaccinations": nat_doses + eul_doses,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
        })
Example #2
0
 def _propose_df(self):
     """Build a dataframe of vaccination tweets.

     Bug fix: the date parsed from the tweet text was previously computed
     and then immediately overwritten by the tweet's creation date. The
     report date from the text is now kept as the record's "date"; the
     creation date is still used for the stop-search cut-off.
     """
     regex = r"COVID-19 update: As at (\d{1,2} [a-zA-Z]+ 202\d), .* a total of ([\d ]+) people have been vaccinated"
     data = []
     for tweet in self.tweets:
         match = re.search(regex, tweet.full_text)
         if match:
             # Date the report refers to (from the tweet text).
             report_date = clean_date(match.group(1), "%d %B %Y")
             total_vaccinations = clean_count(match.group(2))
             # Stop the scan based on when the tweet was posted.
             if self.stop_search(clean_date(tweet.created_at)):
                 break
             data.append({
                 "date": report_date,
                 "people_vaccinated": total_vaccinations,
                 "text": tweet.full_text,
                 "source_url": self.build_post_url(tweet.id),
                 "media_url": tweet.entities["media"][0]["media_url_https"]
                 if "media" in tweet.entities else None,
             })
     return pd.DataFrame(data)
Example #3
0
 def _propose_df(self):
     """Collect Nigerian vaccination-update tweets.

     Three progressively looser patterns are tried in order: regex_1
     carries full dose counts, regex_2 only a date, and regex_3 merely
     flags the tweet for later inspection.
     """
     # Full update: month/day/year plus first- and second-dose counts.
     regex_1 = (
         r"COVID-19 Vaccination Update:\n\n1st and second dose — (([a-zA-Z]+) (\d{1,2})(?:th|nd|rd|st) (202\d)), in 36 States \+ the FCT\. \n\n([0-9,]+) eligible "
         r"Nigerians have been vaccinated with first dose while ([0-9,]+) of Nigerians vaccinated with 1st dose have collected their 2nd dose\."
     )
     # Date-only update (counts presumably in the attached media — verify).
     regex_2 = r"COVID-19 Vaccination Update for (([a-zA-Z]+) (\d{1,2})(?:th|nd|rd|st),? (202\d)), in 36 States \+ the FCT\. "
     # Fallback: any vaccination-update tweet, undated.
     regex_3 = r"COVID-19 Vaccination Update"
     data = []
     for tweet in self.tweets:
         match_1 = re.search(regex_1, tweet.full_text)
         match_2 = re.search(regex_2, tweet.full_text)
         match_3 = re.search(regex_3, tweet.full_text)
         if match_1:
             people_vaccinated = clean_count(match_1.group(5))
             people_fully_vaccinated = clean_count(match_1.group(6))
             # Groups 2-4 are month name, day and year.
             dt = clean_date(" ".join(match_1.group(2, 3, 4)), "%B %d %Y")
             if self.stop_search(dt):
                 break
             data.append(
                 {
                     "date": dt,
                     "total_vaccinations": people_vaccinated
                     + people_fully_vaccinated,
                     "people_vaccinated": people_vaccinated,
                     "people_fully_vaccinated": people_fully_vaccinated,
                     "text": tweet.full_text,
                     "source_url": self.build_post_url(tweet.id),
                     "media_url": tweet.extended_entities["media"][0][
                         "media_url_https"
                     ],
                 }
             )
         elif match_2:
             dt = clean_date(" ".join(match_2.group(2, 3, 4)), "%B %d %Y")
             if self.stop_search(dt):
                 break
             data.append(
                 {
                     "date": dt,
                     "text": tweet.full_text,
                     "source_url": self.build_post_url(tweet.id),
                     "media_url": tweet.extended_entities["media"][0][
                         "media_url_https"
                     ],
                 }
             )
         elif match_3:
             # No parseable date here, so no stop_search cut-off either.
             data.append(
                 {
                     "text": tweet.full_text,
                     "source_url": self.build_post_url(tweet.id),
                     "media_url": tweet.extended_entities["media"][0][
                         "media_url_https"
                     ],
                 }
             )
     df = pd.DataFrame(data)
     return df
Example #4
0
 def _propose_df(self):
     """Find recent tweets whose attached image is dominated by the
     expected dashboard colour and collect them as dataframe records.
     """
     max_iter = 30  # only scan the most recent tweets
     dist_th = 8.7  # max RGB distance to the reference colour
     col_dominant = [160, 194, 195]  # reference dominant colour
     records = []
     for tweet in self.tweets[:max_iter]:
         cond = "media" in tweet.entities  # and len(tweet.full_text) < 30
         if cond:
             url = tweet.extended_entities["media"][0]["media_url_https"]
             im = Image.open(requests.get(url, stream=True).raw, formats=["jpeg"])
             # Sample at most the first 100k pixels to keep this fast.
             pixel_values = [x for i, x in enumerate(im.getdata()) if i < 100000]
             # Most frequent pixel colour. Fix: the top-level
             # pd.value_counts() is deprecated (removed in pandas 2.x);
             # use the Series method instead.
             h = pd.Series(pixel_values).value_counts(normalize=True).index[0]
             dist = np.linalg.norm(np.array(h) - np.array(col_dominant))
             if dist < dist_th:
                 dt = clean_date(tweet.created_at)
                 if self.stop_search(dt):
                     break
                 records.append(
                     {
                         "date": dt,
                         "text": tweet.full_text,
                         "source_url": self.build_post_url(tweet.id),
                         "media_url": url,
                     }
                 )
     df = pd.DataFrame(records)
     return df
Example #5
0
    def _parse_data(self, worksheet):
        """Scan the worksheet rows for the labelled metrics and build a series.

        Raises:
            ValueError: if any expected label was not found in the worksheet.
        """
        # Initialise so missing labels are detected explicitly instead of
        # crashing with UnboundLocalError/NameError below.
        total_vaccinations = None
        people_fully_vaccinated = None
        unique_doses = None
        date_str = None

        for row in worksheet.values():
            for value in row:
                if "Total dosis aplicadas al " in str(value):
                    total_vaccinations = row[-1]
                    if not isinstance(total_vaccinations, int):
                        total_vaccinations = clean_count(total_vaccinations)
                    # The label itself ends with a dd-mm-YYYY date.
                    date_raw = re.search(r"[\d-]{10}$", value).group(0)
                    date_str = clean_date(date_raw, "%d-%m-%Y")
                elif value == "Esquemas completos segundas + únicas dosis":
                    people_fully_vaccinated = row[-1]
                    if not isinstance(people_fully_vaccinated, int):
                        people_fully_vaccinated = clean_count(people_fully_vaccinated)
                elif value == "Total únicas dosis acumuladas":
                    unique_doses = row[-1]
                    if not isinstance(unique_doses, int):
                        unique_doses = clean_count(unique_doses)

        if total_vaccinations is None or people_fully_vaccinated is None or unique_doses is None:
            raise ValueError("Date is not where it is expected be! Check worksheet")
        return pd.Series(
            {
                "date": date_str,
                "total_vaccinations": total_vaccinations,
                "people_fully_vaccinated": people_fully_vaccinated,
                # Single-dose regimens count as fully vaccinated, hence the
                # unique-doses correction.
                "people_vaccinated": total_vaccinations
                - people_fully_vaccinated
                + unique_doses,
            }
        )
Example #6
0
def read(source: str) -> pd.Series:
    """Scrape the vaccination dashboard with headless Chrome."""
    options = Options()
    options.add_argument("--headless")

    with webdriver.Chrome(options=options) as driver:
        driver.get(source)
        time.sleep(3)  # let the dynamic content render

        for heading in driver.find_elements_by_tag_name("h5"):
            text = heading.text
            # Each metric value sits in the <div> right before its heading.
            if "Primera dosis" in text:
                people_vaccinated = clean_count(
                    heading.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Total dosis aplicadas" in text:
                total_vaccinations = clean_count(
                    heading.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Población completamente vacunada" in text:
                people_fully_vaccinated = clean_count(
                    heading.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Acumulados al" in text:
                date = clean_date(text, "Acumulados al %d de %B de %Y",
                                  "es")

    return pd.Series(data={
        "date": date,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": total_vaccinations,
    })
Example #7
0
 def _propose_df(self):
     """Collect relevant tweets: test-result posts (kept with their media)
     and vaccination-count posts (parsed for the total).
     """
     regex_media = r"Results of COVID-19 tests .*"
     regex_count = r"against COVID-19: ([\d,]+)"
     data = []
     for tweet in self.tweets:
         dt = clean_date(tweet.created_at)
         if self.stop_search(dt):
             break
         if re.search(regex_media, tweet.full_text):
             if "media" in tweet.entities:
                 data.append({
                     "date": dt,
                     "text": tweet.full_text,
                     "source_url": self.build_post_url(tweet.id),
                     "media_url": tweet.entities["media"][0]["media_url_https"],
                 })
         else:
             # Search once and reuse the match (the original ran the same
             # regex twice per tweet).
             match = re.search(regex_count, tweet.full_text)
             if match:
                 data.append({
                     "date": dt,
                     "total_vaccinations": clean_count(match.group(1)),
                     "text": tweet.full_text,
                     "source_url": self.build_post_url(tweet.id),
                 })
     return pd.DataFrame(data)
Example #8
0
def parse_data(last_update: str, max_iter: int = 10):
    """Collect daily records newer than ``last_update``.

    Args:
        last_update: Date string (same format produced by ``clean_date``)
            of the most recent stored record.
        max_iter: Maximum number of days to look back.

    Returns:
        DataFrame of parsed records, or None when nothing new was found.
    """
    records = []
    # Bug fix: the loop previously ignored ``max_iter`` and always scanned
    # a hard-coded 10 days.
    for days in range(max_iter):
        date_it = clean_date(datetime.now() - timedelta(days=days))
        if date_it > last_update:
            source = _get_source_url(date_it.replace("-", ""))
            try:
                df_ = pd.read_excel(source, index_col=0)
            except HTTPError:
                # Best effort: that day's file may not be published yet.
                print("No available!")
            else:
                _check_vaccine_names(df_)
                ds = _parse_ds_data(df_, source)
                records.append(ds)
        else:
            # Dates are scanned newest-first, so we can stop here.
            break
    if records:
        return pd.DataFrame(records)
    return None
Example #9
0
def connect_parse_data(source: str) -> pd.Series:
    """Scrape dose counters and the as-of date with headless Chrome."""
    options = Options()
    options.add_argument("--headless")

    with webdriver.Chrome(options=options) as driver:
        driver.get(source)
        time.sleep(10)  # let the dynamic counters render

        raw_date = (driver.find_element_by_class_name("as_of")
                    .find_element_by_tag_name("span").text)
        date = clean_date(raw_date, "%d.%m.%Y")

        for block in driver.find_elements_by_class_name("counter_block"):
            block_text = block.text
            if "1 ДОЗУ" in block_text:
                people_vaccinated = block.find_element_by_tag_name("h2").text
            if "2 ДОЗИ" in block_text:
                people_fully_vaccinated = block.find_element_by_tag_name("h2").text

    return pd.Series(data={
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    })
Example #10
0
 def _parse_date(self, text: str):
     """Parse a Thai-calendar date from ``text`` and return it cleaned.

     Thai month names (including OCR variants with ำ) map to month
     numbers; the Buddhist-era year is converted to the Gregorian
     calendar by subtracting 543.
     """
     thai_months = {
         "มกราคม": 1,
         "กุมภาพันธ์": 2,
         "มีนาคม": 3,
         "เมษายน": 4,
         "พฤษภาคม": 5,
         "พฤษภำคม": 5,
         "มิถุนายน": 6,
         "มิถุนำยน": 6,
         "กรกฎาคม": 7,
         "กรกฎำคม": 7,
         "สิงหาคม": 8,
         "สิงหำคม": 8,
         "กันยายน": 9,
         "ตุลาคม": 10,
         "พฤศจิกายน": 11,
         "ธันวาคม": 12,
     }
     match = re.search(self.regex_date, text)
     day = clean_count(match.group(1))
     month = thai_months[match.group(2)]
     year = clean_count(match.group(3)) - 543
     return clean_date(datetime(year, month, day))
Example #11
0
 def parse_data(self, soup: BeautifulSoup) -> dict:
     """Parse the newest situation-report PDF linked from ``soup``.

     Returns a plain dict of metrics. (The annotation previously said
     ``pd.Series`` but the body returns a dict; annotation corrected.)
     """
     # Get path to newest pdf
     pdf_path = self._parse_last_pdf_link(soup)
     # Get text from pdf
     text = self._extract_text_from_pdf(pdf_path)
     # Get vaccine table from text
     df_vax = self._parse_vaccines_table_as_df(text)
     people_vaccinated = df_vax.doses_1.sum()
     people_fully_vaccinated = df_vax.doses_2.sum()
     total_vaccinations = people_vaccinated + people_fully_vaccinated
     # Map raw vaccine names to canonical names and join them.
     vaccine = ", ".join(df_vax.vaccine.map(vaccines_mapping))
     # Get date: 10 dotted characters after "Situation Report".
     regex = r"Situation Report\s+([\d\.]{10})"
     date = re.search(regex, text).group(1)
     date = clean_date(date, "%d.%m.%Y")
     # Build data series
     return {
         "total_vaccinations": total_vaccinations,
         "people_vaccinated": people_vaccinated,
         "people_fully_vaccinated": people_fully_vaccinated,
         "date": date,
         "source_url": pdf_path,
         "vaccine": vaccine,
         "location": self.location,
     }
Example #12
0
 def read(self):
     """Fetch the first feature's attributes from the ArcGIS-style
     endpoint and map them to the output series.
     """
     data = requests.get(self.source_url).json()["features"][0]["attributes"]
     # NOTE(review): "Vaccine_total_last24" reads like a last-24-hours
     # counter, yet it is reported as people_fully_vaccinated — confirm
     # against the upstream service schema.
     return pd.Series(
         {
             "total_vaccinations": data["Vaccine_total"],
             "people_fully_vaccinated": data["Vaccine_total_last24"],
             # Timestamp arrives in milliseconds since the epoch.
             "date": clean_date(datetime.fromtimestamp(data["Date"] / 1000)),
         }
     )
Example #13
0
def parse_date(filename):
    """Return the cleaned date printed on the first page of the PDF."""
    # Read pdf (for date)
    with open(filename, mode="rb") as f:
        page_text = PyPDF2.PdfFileReader(f).getPage(0).extractText()
    # The date stands alone on its own line, e.g. "\n1.2.2021\n".
    match = re.search(r"\n(?P<count>\d{1,2}.\d{1,2}.\d{4})\n", page_text)
    return clean_date(match.group("count"), "%d.%m.%Y")
Example #14
0
 def _parse_date(self, soup: BeautifulSoup) -> str:
     """Find the single paragraph matching the date regex and clean it.

     Raises:
         ValueError: if zero or multiple matching paragraphs are found,
             which indicates the source layout has changed.
     """
     pattern = re.compile(self.regex["date"])
     matches = [elem for elem in soup.find_all("p") if elem.find(text=pattern)]
     # Bug fix: the original only rejected >1 matches, so an empty result
     # crashed with IndexError instead of this clear error.
     if len(matches) != 1:
         raise ValueError("Format of source has changed")
     return clean_date(matches[0].text, "ажурирано %d.%m.%Y")
Example #15
0
 def read(self) -> pd.Series:
     """Read the parsed payload and map it into the output series."""
     data = self._parse_data()
     record = {
         "total_vaccinations": data["Doses_Administered"],
         "people_vaccinated": data["Administered_Dose1_Recip"],
         "people_fully_vaccinated": data["Series_Complete_Yes"],
         "date": clean_date(data["Date"], "%Y-%m-%d"),
         "vaccine": self._parse_vaccines(data),
     }
     return pd.Series(record)
Example #16
0
def read(source: str) -> pd.Series:
    """Scrape dose counters and the update date from the page."""
    soup = get_soup(source)

    counters = soup.find_all(class_="text-brand-blue")
    people_vaccinated = clean_count(counters[1].text)
    people_fully_vaccinated = clean_count(counters[2].text)
    assert people_vaccinated >= people_fully_vaccinated

    # The page shows e.g. "Updated 5. March" without a year; append the
    # current year before parsing.
    raw = soup.find(class_="text-gray-500").text.replace("Updated ", "")
    date = clean_date(raw + str(datetime.date.today().year),
                      fmt="%d. %B%Y", lang="en")

    return pd.Series({
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    })
Example #17
0
def parse_data(data: dict) -> pd.Series:
    """Map the API payload into the standard vaccination series."""
    return pd.Series(
        data={
            "date": clean_date(data["updated"], "%Y/%m/%d"),
            "people_vaccinated": data["progress"],
            "people_fully_vaccinated": data["completed"],
            "vaccine": ", ".join(_get_vaccine_names(data, translate=True)),
        }
    )
Example #18
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read the odometer counters and the dotted date from the page."""
    odometers = soup.find_all(class_="odometer")

    # Date is 10 dotted characters inside the "counter" element.
    raw_date = re.search(r"[\d\.]{10}", soup.find(class_="counter").text).group(0)

    record = {
        "total_vaccinations": int(odometers[0]["data-count"]),
        "people_vaccinated": int(odometers[1]["data-count"]),
        "people_fully_vaccinated": int(odometers[2]["data-count"]),
        "date": clean_date(raw_date, "%d.%m.%Y"),
    }
    return pd.Series(data=record)
Example #19
0
 def parse_data(self, soup: BeautifulSoup) -> pd.Series:
     """Extract date, total doses and people counts via the configured regexes."""
     record = {}
     title_match = re.search(self.regex["title"], soup.text)
     if title_match:
         # The title only carries day + month; attach the current year.
         date_str = title_match.group(1)
         record["date"] = clean_date(
             f"{date_str} {datetime.now().year}", "%d de %B %Y", lang="es"
         )
         record["total_vaccinations"] = clean_count(title_match.group(2))
     data_match = re.search(self.regex["data"], soup.text)
     if data_match:
         record["people_vaccinated"] = clean_count(data_match.group(1))
         record["people_fully_vaccinated"] = clean_count(data_match.group(3))
     return pd.Series(record)
Example #20
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read vaccination counters and the update date from the page.

    Improvement: ``soup.find_all(class_="counter")`` now runs once
    instead of once per counter.
    """
    counters = soup.find_all(class_="counter")
    people_vaccinated = clean_count(counters[3].text)
    people_fully_vaccinated = clean_count(counters[4].text)
    assert people_vaccinated >= people_fully_vaccinated
    # Only two-dose figures are published, so doses = dose1 + dose2.
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    # Date appears as dd-mm-YYYY inside the "actualiza" element.
    date = soup.find(class_="actualiza").text
    date = re.search(r"\d{2}-\d{2}-\d{4}", date).group(0)
    date = clean_date(date, "%d-%m-%Y")

    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)
Example #21
0
def read(source: str) -> pd.Series:
    """Scrape the vaccination counters from the page's data paragraph.

    Browser-like headers are sent; presumably the site rejects the default
    ``requests`` user agent — verify before simplifying.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    soup = BeautifulSoup(requests.get(source, headers=headers).content, "html.parser")

    text = soup.find("div", id="data").find("p").text

    # Date appears as "На сегодня (dd.mm.yy)" — note the two-digit year.
    date = re.search(r"На сегодня \(([\d\.]{8})\)", text).group(1)
    date = clean_date(date, "%d.%m.%y")

    # "привито хотя бы одним компонентом" = at least one dose received.
    people_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения[^)]*\) - привито хотя бы одним компонентом вакцины",
        text,
    ).group(1)
    people_vaccinated = clean_count(people_vaccinated)

    # "полностью привито" = fully vaccinated.
    people_fully_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения,?[^)]*\) - полностью привито", text
    ).group(1)
    people_fully_vaccinated = clean_count(people_fully_vaccinated)

    # "всего прививок сделано" = total doses administered.
    total_vaccinations = re.search(
        r"([\d\s]+) шт\. - всего прививок сделано", text
    ).group(1)
    total_vaccinations = clean_count(total_vaccinations)

    return pd.Series(
        {
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
        }
    )
Example #22
0
 def _propose_df(self):
     """Collect daily-update tweets (matched by their Amharic opening
     text) as dataframe records for later manual extraction.
     """
     regex = r"ባለፉት 24 .*"
     data = []
     for tweet in self.tweets:
         if re.search(regex, tweet.full_text):
             dt = clean_date(tweet.created_at)
             if self.stop_search(dt):
                 break
             data.append({
                 "date":
                 dt,
                 "text":
                 tweet.full_text,
                 "source_url":
                 self.build_post_url(tweet.id),
                 # NOTE(review): index [1] takes the *second* media item,
                 # unlike sibling scrapers which use [0] — confirm this is
                 # intentional. There is also no "media" presence guard, so
                 # a matching tweet without media raises KeyError.
                 "media_url":
                 tweet.extended_entities["media"][1]["media_url_https"],
             })
     return pd.DataFrame(data)
Example #23
0
 def _propose_df(self):
     """Collect situation-report tweets, dated from the French date in the text."""
     pattern = r"Recevez la situation .* au (\d{1,2} [a-z]+ 202\d)\."
     records = []
     for tweet in self.tweets:
         found = re.search(pattern, tweet.full_text)
         if not found:
             continue
         report_date = clean_date(found.group(1), "%d %B %Y", lang="fr")
         if self.stop_search(report_date):
             break
         media_url = None
         if "media" in tweet.entities:
             media_url = tweet.entities["media"][0]["media_url_https"]
         records.append({
             "date": report_date,
             "text": tweet.full_text,
             "source_url": self.build_post_url(tweet.id),
             "media_url": media_url,
         })
     return pd.DataFrame(records)
Example #24
0
 def _propose_df(self):
     """Collect numbered "Comunicado" tweets, deduplicated by number."""
     pattern = r"Comunicado N° (\d{3,4}).*"
     records = []
     for tweet in self.tweets:
         found = re.search(pattern, tweet.full_text)
         if not found:
             continue
         # Tweet timestamps are converted to Panama local time first.
         local_date = clean_date(
             from_tz_to_tz(tweet.created_at, to_tz="America/Panama"))
         if self.stop_search(local_date):
             break
         records.append({
             "date": local_date,
             "text": tweet.full_text,
             "source_url": self.build_post_url(tweet.id),
             "num": found.group(1),
         })
         self.tweets_relevant.append(tweet)
     # Keep only the latest record for each comunicado number.
     return pd.DataFrame(records).drop_duplicates(subset=["num"], keep="last")
Example #25
0
 def _propose_df(self):
     """Collect "Vaccination Updates" tweets dated from the dotted date."""
     pattern = r"COVID-19 : Vaccination Updates\n\n(\d{1,2}\.\d{1,2}\.202\d).*"
     records = []
     for tweet in self.tweets:
         found = re.search(pattern, tweet.full_text)
         if not found:
             continue
         report_date = clean_date(found.group(1), "%d.%m.%Y")
         if self.stop_search(report_date):
             break
         media_url = None
         if "media" in tweet.entities:
             media_url = tweet.entities["media"][0]["media_url_https"]
         records.append({
             "date": report_date,
             "text": tweet.full_text,
             "source_url": self.build_post_url(tweet.id),
             "media_url": media_url,
         })
     return pd.DataFrame(records)
Example #26
0
def _parse_ds_data(df: pd.DataFrame, source: str) -> pd.Series:
    """Build the country-level series from the "Totales" row of ``df``.

    The date is the most recent registered-vaccine date across all rows
    except "Sanidad Exterior".

    Bug fix: the original computed this max() on its first line and
    discarded the result, then recomputed it inside the Series; the dead
    statement has been removed.
    """
    last_date = df.loc[
        ~df.index.isin(["Sanidad Exterior"]),
        "Fecha de la última vacuna registrada (2)",
    ].dropna().max()
    return pd.Series(
        data={
            "total_vaccinations":
            df.loc["Totales", "Dosis administradas (2)"].item(),
            "people_vaccinated":
            df.loc["Totales", "Nº Personas con al menos 1 dosis"].item(),
            "people_fully_vaccinated":
            df.loc["Totales",
                   "Nº Personas vacunadas(pauta completada)"].item(),
            "date": clean_date(last_date),
            "source_url": source,
            "vaccine": ", ".join(_get_vaccine_names(df, translate=True)),
        })
Example #27
0
 def _propose_df(self):
     """Collect the minister's #COVID19 update tweets, dated from the text."""
     pattern = r"Minister of Health Lizzie Nkosi's #COVID19 update on (\d{1,2} [a-zA-Z]+ 202\d)"
     records = []
     for tweet in self.tweets:
         found = re.search(pattern, tweet.full_text)
         if not found:
             continue
         report_date = clean_date(found.group(1), "%d %B %Y")
         if self.stop_search(report_date):
             break
         records.append({
             "date": report_date,
             "text": tweet.full_text,
             "source_url": self.build_post_url(tweet.id),
             "media_url": tweet.extended_entities["media"][0]["media_url_https"],
         })
     return pd.DataFrame(records)
Example #28
0
def main(paths):
    """Scrape Guatemala's vaccination dashboard and record the increment."""
    data = {
        "location": "Guatemala",
        "source_url":
        "https://gtmvigilanciacovid.shinyapps.io/3869aac0fb95d6baf2c80f19f2da5f98",
        "vaccine": "Moderna, Oxford/AstraZeneca",
    }

    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.maximize_window()  # For maximizing window
        driver.implicitly_wait(20)  # gives an implicit wait for 20 seconds
        driver.get(data["source_url"])
        # Open the vaccination tab (syringe icon), then read the date from
        # the "logo" element and the two dose counters by element id.
        driver.find_element_by_class_name("fa-syringe").click()
        date = driver.find_element_by_class_name("logo").text
        dose1 = (driver.find_element_by_id(
            "dosisaplicadas1").find_element_by_tag_name("h3").text)
        dose2 = (driver.find_element_by_id(
            "dosisaplicadas2").find_element_by_tag_name("h3").text)

    data["people_vaccinated"] = clean_count(dose1)
    data["people_fully_vaccinated"] = clean_count(dose2)
    # Total doses = dose1 + dose2; assumes all regimens are two-dose —
    # TODO confirm against the dashboard.
    data["total_vaccinations"] = (data["people_vaccinated"] +
                                  data["people_fully_vaccinated"])

    # The logo text contains a d/m/202x date.
    date = re.search(r"\d+/\d+/202\d", date).group(0)
    data["date"] = clean_date(date, "%d/%m/%Y")

    increment(
        paths=paths,
        location=data["location"],
        total_vaccinations=data["total_vaccinations"],
        people_vaccinated=data["people_vaccinated"],
        people_fully_vaccinated=data["people_fully_vaccinated"],
        date=data["date"],
        source_url=data["source_url"],
        vaccine=data["vaccine"],
    )
Example #29
0
 def _propose_df(self):
     """Parse vaccination-report tweets into dataframe records.

     Bug fix: ``source_url`` was hard-coded to ``1`` (with the real call
     left in a comment); it now uses ``self.build_post_url`` like the
     sibling scrapers.
     """
     regex = r"VACUNACIÓN #COVID19 \| Reporte del (\d{1,2}\.\d{1,2}\.202\d) - \d{1,2}:\d{1,2}"
     regex_doses = r"Total Dosis Administradas: ([\d\.]+)"
     regex_people = r"Total personas vacunadas: ([\d\.]+)"
     data = []
     for tweet in self.tweets:
         match = re.search(regex, tweet.full_text)
         if match:
             doses_match = re.search(regex_doses, tweet.full_text)
             total_vaccinations = (
                 clean_count(doses_match.group(1)) if doses_match else pd.NA
             )
             people_match = re.search(regex_people, tweet.full_text)
             people_vaccinated = (
                 clean_count(people_match.group(1)) if people_match else pd.NA
             )
             # pd.NA propagates through the subtraction when either count
             # is missing.
             people_fully_vaccinated = total_vaccinations - people_vaccinated
             dt = clean_date(match.group(1), "%d.%m.%Y")
             if self.stop_search(dt):
                 break
             data.append(
                 {
                     "date": dt,
                     "total_vaccinations": total_vaccinations,
                     "people_vaccinated": people_vaccinated,
                     "people_fully_vaccinated": people_fully_vaccinated,
                     "text": tweet.full_text,
                     "source_url": self.build_post_url(tweet.id),
                     "media_url": tweet.extended_entities["media"][0][
                         "media_url_https"
                     ],
                 }
             )
     df = pd.DataFrame(data)
     return df
Example #30
0
 def _propose_df(self):
     """Collect global-update tweets dated from the dd-mm-YYYY date."""
     pattern = (
         r"Trouvez ci-bas les données du \d{1,2} [a-zA-Z]+ et la mise à jour globale à la date du (\d{1,2}-\d{1,2}"
         r"-202\d)\."
     )
     records = []
     for tweet in self.tweets:
         found = re.search(pattern, tweet.full_text)
         if not found:
             continue
         report_date = clean_date(found.group(1), "%d-%m-%Y")
         if self.stop_search(report_date):
             break
         records.append({
             "date": report_date,
             "text": tweet.full_text,
             "source_url": self.build_post_url(tweet.id),
             "media_url": tweet.extended_entities["media"][0]["media_url_https"],
         })
     return pd.DataFrame(records)