def process_variants(self): print("Processing Variants data ...") self.url ="https://www.data.gouv.fr/fr/datasets/r/16f4fd03-797f-4616-bca9-78ff212d06e8" download_url(self.url, "/home/ludo915/code/covsco/data/train/variants/fr/variants_hist_data.csv", chunk_size=128) self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/variants/fr/variants_hist_data.csv", sep=';') self.df2 = pd.read_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv") self.df2["date"] = pd.to_datetime(self.df2["date"]) self.df['dep'] = self.df['dep'].replace({'2A': '201', '2B': '202'}).astype(int) self.df = self.df[self.df['dep'] < 203] self.df = self.df.groupby(['dep', 'semaine' ])[["dep", "semaine", "Nb_susp_501Y_V1", "Nb_susp_501Y_V2_3" ]].sum().drop(columns=["dep"]).reset_index() self.df['jour'] = self.df.apply(self.extract_date, axis=1) self.df.drop(columns='semaine', inplace=True) self.variantstuple = (self.df['dep'], self.df['jour'], self.df["Nb_susp_501Y_V1"],self.df["Nb_susp_501Y_V2_3"] ) self.dicvariant = {(i, j) : (k,l) for (i, j, k, l) in zip(*self.variantstuple)} self.df2[['Nb_susp_501Y_V1','Nb_susp_501Y_V2_3']] = self.df2.apply(self.enriched_variant, axis=1).apply(pd.Series) self.df2.sort_values(by = ["numero","date"], inplace = True) print(self.df2) self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv",\ index=False) return None
def process_mobility(self):
    print("Processing Mobility indices data ...")
    # Fetch the Facebook movement-range archive from HDX.
    Configuration.create(hdx_site='prod', user_agent='A_Quick_Example', hdx_read_only=True)
    dataset = Dataset.read_from_hdx('movement-range-maps')
    resources = dataset.get_resources()
    dic = resources[1]
    url_mobility = dic['download_url']
    self.file_mobility = "/home/ludo915/code/covsco/data/train/mobility/fr/mvt_range.zip"
    download_url(url_mobility, self.file_mobility)

    # Extract the movement-range text file from the zip archive.
    with ZipFile(self.file_mobility, 'r') as zipf:
        zipf.printdir()
        print('Extracting mv_range file now...')
        mvt_range = zipf.namelist()[-1]
        zipf.extract(mvt_range, "/home/ludo915/code/covsco/data/train/mobility/fr/")
        print('Done!')

    # Keep only the French rows, re-attach the header, and write a CSV.
    os.chdir("/home/ludo915/code/covsco/data/train/mobility/fr/")
    os.system("""grep "FRA" """ + mvt_range + """ > mouvement-range-FRA.txt""")
    os.system("""head -n 1 """ + mvt_range + """ > header.txt""")
    os.system("""cat header.txt mouvement-range-FRA.txt > mouvement-range-FRA-final.csv""")
    os.chdir("/home/ludo915/code/covsco/scripts")

    self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/mobility/fr/mouvement-range-FRA-final.csv", sep='\t')
    print(self.df)
    self.df["ds"] = pd.to_datetime(self.df["ds"], dayfirst=True)
    # Repair region names whose accented characters were mangled in the source file.
    self.df['polygon_name'] = self.df['polygon_name'].replace({
        'Ile-de-France': 'Île-de-France',
        '-le-de-France': 'Île-de-France',
        "Auvergne-Rh-ne-Alpes": "Auvergne-Rhône-Alpes",
        "Bourgogne-Franche-Comt-": "Bourgogne-Franche-Comté",
        "Provence-Alpes-C-te d'Azur": "Provence-Alpes-Côte d'Azur"})

    self.df2 = pd.read_csv('/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv')
    self.df2["date"] = pd.to_datetime(self.df2["date"])
    self.df3 = pd.read_csv("/home/ludo915/code/covsco/data/train/pop/fr/regions_departements.csv", sep=";")

    # Reset the indices (equivalent to reset_index(drop=True)) before merging.
    self.df.reset_index(inplace=True)
    self.df2.reset_index(inplace=True)
    self.df3.reset_index(inplace=True)
    self.df.drop(columns=["index"], inplace=True)
    self.df2.drop(columns=["index"], inplace=True)
    self.df3.drop(columns=["index"], inplace=True)

    # Join departement -> region, then attach the mobility indices by region and date.
    self.df2 = self.df2.merge(self.df3, how='inner', left_on="numero",
                              right_on="depnum", suffixes=("", "_y"))
    self.df2 = self.df2.merge(self.df, how="outer", left_on=["Region", "date"],
                              right_on=["polygon_name", "ds"], suffixes=("", "_y")).dropna()
    print(self.df2)
    self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv",
                    index=False)
    print('OK')
    return None
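# --- Note (not part of the original script) -----------------------------------
# The grep/head/cat pipeline above requires a Unix shell. A portable pandas
# equivalent is sketched here; it assumes the extracted movement-range file is
# tab-separated and has a 'country' column holding ISO3 codes such as "FRA".
def filter_movement_range_to_france(src_path, dst_path):
    mvt = pd.read_csv(src_path, sep="\t", dtype=str)
    mvt[mvt["country"] == "FRA"].to_csv(dst_path, sep="\t", index=False)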
def process_hist_vaccination(self):
    # Download the weekly vaccination history and reload the merged dataset.
    self.url = "https://www.data.gouv.fr/es/datasets/r/59aeab47-c364-462c-9087-ce233b6acbbc"
    download_url(self.url,
                 "/home/ludo915/code/covsco/data/train/vaccination/fr/vaccination_hist_data.csv",
                 chunk_size=128)
    self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/vaccination/fr/vaccination_hist_data.csv")
    print(self.df.columns)

    # Recode the Corsican departements as integers and keep metropolitan France only.
    self.df['departement'] = self.df['departement'].replace({'2A': '201', '2B': '202'}).astype(int)
    self.df = self.df[self.df['departement'] < 203]
    self.df["date_debut_semaine"] = pd.to_datetime(self.df["date_debut_semaine"], dayfirst=True)

    self.df2 = pd.read_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv", sep=",")
    self.df2['vac1nb'] = 0
    self.df2['vac2nb'] = 0
    self.df2["date"] = pd.to_datetime(self.df2["date"])

    # Cumulative first and second doses per departement and week.
    self.dfvac1 = self.df[self.df["rang_vaccinal"] == 1].reset_index()
    self.dfvac2 = self.df[self.df["rang_vaccinal"] == 2].reset_index()
    self.cum1 = self.dfvac1.groupby(['departement', 'date_debut_semaine']).sum() \
        .groupby(level=0).cumsum().sort_values("date_debut_semaine") \
        .reset_index().drop(columns="index")
    self.cum2 = self.dfvac2.groupby(['departement', 'date_debut_semaine']).sum() \
        .groupby(level=0).cumsum().sort_values("date_debut_semaine") \
        .reset_index().drop(columns="index")
    self.cum1['7_days'] = self.cum1.apply(self.create_week, axis=1)
    self.cum2['7_days'] = self.cum2.apply(self.create_week, axis=1)

    # Enrich the merged dataset with the cumulative dose counts and write it back.
    self.df2[['vac1nb', 'vac2nb']] = self.df2.apply(self.enriched_vaccin, axis=1).apply(pd.Series)
    print(self.df2)
    self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv",
                    index=False)
def get_listing_urls(br): """ Searches StreetEasy for all rental apartment listings in Williamsburg, caches each page of search results to the directory whose name is stored in the variable SEARCH_RESULTS_DIR, and caches the URLs for the listings (one per line) to the file whose name is stored in the variable LISTING_URLS_FILE. Arguments: br -- Browser object """ if os.path.exists(LISTING_URLS_FILE): return makedir(os.path.dirname(LISTING_URLS_FILE)) br.open(SEARCH_URL) br.select_form(nr=1) # print br.form br.form['area[]'] = ['302'] response = br.submit() results_url = response.geturl() with safe_write(LISTING_URLS_FILE) as f: while True: filename = download_url(br, results_url, SEARCH_RESULTS_DIR) soup = BeautifulSoup(file(filename).read()) results = soup.findAll('div', attrs={'class': 'details_title'}) urls = [] for r in results: r = r.find('h5') r = r.find('a') r = r.get('href') urls.append('http://streeteasy.com' + r) # urls = ['http://www.streeteasy.com' + r.find('h5').find('a').get('href') for r in soup.findAll('div', attrs={'class': 'details_title' })] f.write('\n'.join(urls)) f.write('\n') f.flush() nav = soup.find('a', attrs={'class': 'next_page'}) try: results_url = 'http://www.streeteasy.com' + nav.get('href') except AttributeError: break
def get_listing_urls(br): """ Searches StreetEasy for all rental apartment listings in Williamsburg, caches each page of search results to the directory whose name is stored in the variable SEARCH_RESULTS_DIR, and caches the URLs for the listings (one per line) to the file whose name is stored in the variable LISTING_URLS_FILE. Arguments: br -- Browser object """ if os.path.exists(LISTING_URLS_FILE): return makedir(os.path.dirname(LISTING_URLS_FILE)) br.open(SEARCH_URL) br.select_form(nr=1) # print br.form br.form['area[]'] = ['302'] response = br.submit() results_url = response.geturl() with safe_write(LISTING_URLS_FILE) as f: while True: filename = download_url(br, results_url, SEARCH_RESULTS_DIR) soup = BeautifulSoup(file(filename).read()) results = soup.findAll('div', attrs={'class': 'details_title' }) urls = [] for r in results: r = r.find('h5') r = r.find('a') r = r.get('href') urls.append('http://streeteasy.com' + r) # urls = ['http://www.streeteasy.com' + r.find('h5').find('a').get('href') for r in soup.findAll('div', attrs={'class': 'details_title' })] f.write('\n'.join(urls)) f.write('\n') f.flush() nav = soup.find('a', attrs={'class': 'next_page'}) try: results_url = 'http://www.streeteasy.com' + nav.get('href') except AttributeError: break
def get_listing_pages(br): """ Caches the contents of each URL in the file whose name is stored in the variable LISTING_URLS_FILE to the directory whose name is stored on the variable LISTING_PAGES_DIR. The contents of each URL will be stored in a file whose name is that URL's md5 hash. Arguments: br -- Browser object """ listing_urls = [url.strip() for url in file(LISTING_URLS_FILE)] for url in iterview(listing_urls): try: download_url(br, url, LISTING_PAGES_DIR) except Exception as e: print >> sys.stderr, '\n', (url, e)
def process_covid_positive_test(self):
    print("Processing Covid Positive Tests (Previous day) ...")
    # Download the daily positive-test history and reload the merged dataset.
    self.url1 = "https://www.data.gouv.fr/en/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675"
    download_url(self.url1,
                 "/home/ludo915/code/covsco/data/train/covidpostest/fr/covid_pos_test_hist_data.csv",
                 chunk_size=128)
    self.df = pd.read_csv("/home/ludo915/code/covsco/data/train/covidpostest/fr/covid_pos_test_hist_data.csv", sep=";")
    self.df2 = pd.read_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv", sep=",")

    # Recode the Corsican departements as integers and keep metropolitan France only.
    self.df['dep'] = self.df['dep'].replace({'2A': '201', '2B': '202'}).astype(int)
    self.df = self.df[self.df["dep"] < 203]
    self.df["jour"] = pd.to_datetime(self.df["jour"], dayfirst=True)
    self.df2["date"] = pd.to_datetime(self.df2["date"])

    # Daily positive tests per departement, plus their cumulative total.
    self.df = self.df.groupby(["dep", "jour"]).sum().sort_values(["dep", "jour"]).reset_index()
    dftotalcovidcasescumulated = self.df.groupby(['dep', 'jour']).sum() \
        .groupby(level=0).cumsum().sort_values(["dep", "jour"]).reset_index()
    print(dftotalcovidcasescumulated)
    self.df = self.df[["dep", "jour", "P"]]
    self.df["totalcovidcasescumulated"] = dftotalcovidcasescumulated["P"]

    # Build a (departement, date) -> (P, cumulative total) lookup table.
    self.covpostesttuple = (self.df['dep'], self.df['jour'],
                            self.df["P"], self.df["totalcovidcasescumulated"])
    self.diccovpostest = {(i, j): (k, l) for (i, j, k, l) in zip(*self.covpostesttuple)}

    self.df2 = self.CovidPosTest_to_df(self.df2)
    print(self.df2)
    self.df2.to_csv("/home/ludo915/code/covsco/data/train/all_data_merged/fr/Enriched_Covid_history_data.csv",
                    index=False)
    print('OK')
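# --- Note (not part of the original code) --------------------------------------
# Illustration of the (dep, jour) -> (P, cumulative) lookup table built above,
# using dummy values:
covpostesttuple = ([75, 75],
                   [pd.Timestamp("2021-01-14"), pd.Timestamp("2021-01-15")],
                   [120, 135],
                   [5000, 5135])
diccovpostest = {(i, j): (k, l) for (i, j, k, l) in zip(*covpostesttuple)}
assert diccovpostest[(75, pd.Timestamp("2021-01-15"))] == (135, 5135)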
def get_file(self):
    self.url = 'https://www.data.gouv.fr/es/datasets/r/59aeab47-c364-462c-9087-ce233b6acbbc'
    download_url(self.url, '/home/ludo915/code/covsco/data/prediction/live_vaccins.csv')
    return self
def get(repository: str, dp: frictionless.package.Package, isForced: bool = False):
    """
    Retrieve (meta)data and check whether an update is necessary.

    Parameters
    ----------
    repository : str
        URL of the Gitlab repository (raw).
    dp : frictionless.package.Package
        Existing dp or None.
    isForced : bool, optional
        Forced update. The default is False.

    Returns
    -------
    data_enermaps : DataFrame
        df in EnerMaps format.
    dp : frictionless.package.Package
        Datapackage corresponding to the data.

    """
    new_dp = frictionless.Package(repository + "datapackage.json")
    isChangedStats = False  # initialize check

    # Prepare df containing paths to rasters
    rasters = []
    for resource_idx in range(len(new_dp["resources"])):
        if "temporal" in new_dp["resources"][resource_idx]:
            start_at = pd.to_datetime(
                new_dp["resources"][resource_idx]["temporal"]["start"])
        else:
            start_at = None

        if "unit" in new_dp["resources"][resource_idx]:
            unit = new_dp["resources"][resource_idx]["unit"]
        else:
            unit = None

        if new_dp["resources"][resource_idx]["format"] == "tif":
            logging.info(new_dp["resources"][resource_idx]["path"])
            utilities.download_url(
                repository + new_dp["resources"][resource_idx]["path"],
                os.path.basename(new_dp["resources"][resource_idx]["path"]),
            )
            raster = {
                "value": os.path.basename(new_dp["resources"][resource_idx]["path"]),
                "variable": new_dp["resources"][resource_idx]["title"],
                "start_at": start_at,
                "z": Z,
                "unit": unit,
                "dt": DT,
            }
            rasters.append(raster)

        # check statistics for each resource
        if dp is not None and "stats" in new_dp["resources"][resource_idx]:
            if (dp["resources"][resource_idx]["stats"]
                    != new_dp["resources"][resource_idx]["stats"]):
                isChangedStats = True

    rasters = pd.DataFrame(rasters)

    if dp is not None:  # Existing dataset
        # check stats
        isChangedVersion = dp["version"] != new_dp["version"]
        if isChangedStats or isChangedVersion:
            logging.info("Data has changed")
            data_enermaps = utilities.prepareRaster(rasters, delete_orig=True)
        elif isForced:
            logging.info("Forced update")
            data_enermaps = utilities.prepareRaster(rasters, delete_orig=True)
        else:
            logging.info("Data has not changed. Use --force if you want to reupload.")
            return None, None
    else:  # New dataset
        data_enermaps = utilities.prepareRaster(rasters, delete_orig=True)

    # Move rasters into the data directory
    if not os.path.exists("data"):
        os.mkdir("data")
    if not os.path.exists(os.path.join("data", str(ds_id))):
        os.mkdir(os.path.join("data", str(ds_id)))
    for i, row in data_enermaps.iterrows():
        shutil.move(row.fid, os.path.join("data", str(ds_id), row.fid))

    return data_enermaps, new_dp
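# --- Note (not part of the original script) -----------------------------------
# Hypothetical invocation of get(); the repository URL and datapackage path are
# placeholders, and ds_id, Z and DT are module-level constants in the original
# EnerMaps-style script.
def update_dataset(repository_url, dp_path="datapackage.json", force=False):
    existing_dp = frictionless.Package(dp_path) if os.path.exists(dp_path) else None
    data_enermaps, new_dp = get(repository_url, existing_dp, isForced=force)
    if data_enermaps is not None:
        logging.info("Prepared %d raster rows", len(data_enermaps))
    return data_enermaps, new_dp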
def get_file(self):
    self.file_name = "/home/ludo915/code/covsco/data/prediction/mvt_range.zip"
    download_url(self.url, self.file_name)
    return self
def get_file(self):
    self.url = 'https://www.data.gouv.fr/fr/datasets/r/16f4fd03-797f-4616-bca9-78ff212d06e8'
    self.file_name = '/home/ludo915/code/covsco/data/prediction/live_variants.csv'
    download_url(self.url, self.file_name)
    return self