Example #1
0
    def download_raw_dataset(self):
        """
        Fetch the raw Kaggle archive and unpack it into the cache location.

        The Kaggle client is created and authenticated while the environment
        is temporarily updated with ``self.kaggle_username`` and
        ``self.kaggle_key``, so credentials passed in by the user are picked
        up when authenticating.  Everything is then downloaded into
        ``self.raw_temp_path``, the archive named ``self.archive_filename``
        is extracted there, and finally the temp directory is renamed to
        ``self.raw_dataset_path``.
        """
        credentials = dict(KAGGLE_USERNAME=self.kaggle_username,
                           KAGGLE_KEY=self.kaggle_key)
        with self.update_env(**credentials):
            # Authenticate explicitly so the freshly exported credentials are
            # the ones the client picks up.
            api = create_kaggle_client()
            api.authenticate()
        os.makedirs(self.raw_temp_path, exist_ok=True)

        # Competitions and datasets use different download endpoints; both
        # download every file of the given competition/dataset.
        download_func = (api.competition_download_files
                         if self.is_kaggle_competition
                         else api.dataset_download_files)
        download_func(self.competition_name, path=self.raw_temp_path)

        archive_zip = os.path.join(self.raw_temp_path, self.archive_filename)
        with ZipFile(archive_zip, 'r') as archive:
            archive.extractall(self.raw_temp_path)
        os.rename(self.raw_temp_path, self.raw_dataset_path)
Example #2
0
def download_kaggle_dataset(dataset_url, data_dir, force=False, dry_run=False):
    """Prompt for Kaggle credentials, then download and unzip a dataset.

    The username and key are always requested and exported through the
    ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` environment variables, even on a
    dry run.

    Args:
        dataset_url: URL handed to ``get_kaggle_dataset_id`` to obtain the
            dataset id (presumably "owner/slug" -- confirm against that helper).
        data_dir: Parent directory; files land in a sub-folder named after
            the second '/'-separated component of the dataset id.
        force: Forwarded to ``api.dataset_download_files``.
        dry_run: When true, skip authentication and download entirely.
    """
    print("Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds")
    os.environ['KAGGLE_USERNAME'] = click.prompt("Your Kaggle username")
    os.environ['KAGGLE_KEY'] = _get_kaggle_key()

    dataset_id = get_kaggle_dataset_id(dataset_url)
    if dry_run:
        print("This is a dry run, skipping..")
        return

    # Imported lazily, after the credentials have been exported above.
    from kaggle import api
    api.authenticate()
    destination = os.path.join(data_dir, dataset_id.split('/')[1])
    api.dataset_download_files(
        dataset_id, destination, force=force, quiet=False, unzip=True)
def download_coronavirus_data(path='New\\ Data/', verbose=False):
    """Download the Kaggle novel-corona-virus-2019 dataset and find its main CSV.

    The dataset is downloaded and unzipped into ``path`` through the
    ``kaggle`` Python API. If the ``kaggle`` package cannot be imported, an
    attempt is made to install it with pip before importing it again.

    Args:
        path (str): Folder to extract the dataset into; created if missing.
            A trailing '/' is accepted but not required.
        verbose (bool): If True, print extra progress details.

    Returns:
        str: Path of the first downloaded file whose name contains
        ``covid_19_data.csv``.

    Raises:
        ImportError: If ``kaggle`` is still not importable after the install
            attempt.
        IndexError: If no ``covid_19_data.csv`` file is found in ``path``.
    """
    # NOTE(review): the default path contains a literal backslash (it looks
    # like a shell-escaped "New Data/"). The runtime value is kept unchanged
    # for backward compatibility; only the invalid escape sequence is fixed.
    import glob
    import importlib
    import os
    import subprocess
    import sys

    os.makedirs(path, exist_ok=True)

    try:
        import kaggle.api as kaggle
    except ImportError:
        # Install with the pip that belongs to the running interpreter so the
        # package lands in the environment this code is executing in.
        # check=False: a failed install surfaces as ImportError just below.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "--upgrade", "kaggle"],
            check=False)
        # Make the import system notice the newly installed package.
        importlib.invalidate_caches()
        try:
            from IPython.display import clear_output
        except ImportError:
            pass  # not running under IPython; there is no output to clear
        else:
            clear_output()
        if verbose:
            print('\t- Installed kaggle command line tool.')
        import kaggle.api as kaggle

    kaggle.authenticate()
    kaggle.dataset_download_files(
        'sudalairajkumar/novel-corona-virus-2019-dataset',
        path=path,
        force=True,
        unzip=True)

    print('[i] Extraction Complete.')
    # os.path.join keeps working whether or not ``path`` ends with a separator.
    file_list = glob.glob(os.path.join(path, "*.csv"))

    main_file = [file for file in file_list if 'covid_19_data.csv' in file]
    if verbose:
        print(f"[i] The main file name is {main_file}")
    if not main_file:
        # Same exception type the bare ``main_file[0]`` would raise, but with
        # a message that says what is actually missing.
        raise IndexError(f"covid_19_data.csv not found in {path!r}")
    return main_file[0]
Example #4
0
def download_kaggle_dataset(dataset_url, data_dir, force=False, dry_run=False):
    """Download a Kaggle dataset or competition into ``data_dir``.

    Files are placed in ``data_dir/<name>`` where ``<name>`` is the second
    '/'-separated component of the id returned by ``get_kaggle_dataset_id``.
    The download is skipped when that directory already holds files and
    ``force`` is false. Credentials are prompted for only when
    ``read_kaggle_creds()`` reports none.

    Args:
        dataset_url: URL resolved to an id by ``get_kaggle_dataset_id``.
        data_dir: Parent directory for the download folder.
        force: Re-download even if files are already present.
        dry_run: When true, do everything except authenticate and download.
    """
    dataset_id = get_kaggle_dataset_id(dataset_url)
    id_parts = dataset_id.split('/')
    name = id_parts[1]
    target_dir = os.path.join(data_dir, name)

    already_downloaded = os.path.exists(target_dir) and os.listdir(target_dir)
    if not force and already_downloaded:
        skip_message = 'Skipping, found downloaded files in "{}" (use force=True to force download)'
        print(skip_message.format(target_dir))
        return

    if not read_kaggle_creds():
        print(
            "Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds"
        )
        os.environ['KAGGLE_USERNAME'] = click.prompt("Your Kaggle username")
        os.environ['KAGGLE_KEY'] = _get_kaggle_key()

    if dry_run:
        print("This is a dry run, skipping..")
        return

    # Imported lazily, after any prompted credentials have been exported.
    from kaggle import api
    api.authenticate()

    if id_parts[0] not in ('competitions', 'c'):
        api.dataset_download_files(dataset_id,
                                   target_dir,
                                   force=force,
                                   quiet=False,
                                   unzip=True)
        return

    # Competition downloads are not unzipped by the call below, so the
    # archive is extracted and then removed here.
    api.competition_download_files(name, target_dir, force=force, quiet=False)
    zip_fname = '{}/{}.zip'.format(target_dir, name)
    extract_archive(zip_fname, target_dir)
    try:
        os.remove(zip_fname)
    except OSError as e:
        print('Could not delete zip file, got' + str(e))
Example #5
0
import kaggle.api
from kaggle.api.kaggle_api_extended import KaggleApi

# Create a Kaggle API client and authenticate it before issuing any request.
api = KaggleApi()
api.authenticate()

# Download all files of a dataset
# Signature: dataset_download_files(dataset, path=None, force=False, quiet=True, unzip=False)
# api.dataset_download_files('avenn98/world-of-warcraft-demographics')

# Download a single file
# Signature: dataset_download_file(dataset, file_name, path=None, force=False, quiet=True)

# The dataset identifier must have the form "<owner>/<dataset-slug>". A leading
# slash yields an empty owner segment, which the client rejects as invalid.
api.dataset_download_files(
    'shashikant9198/nlp-and-glove-word-embeddings-sentimental-analysis',
    path='/Users/fred/OneDrive - Adobe/Data/NLP_sentiment/Kaggle_Files')
def get_kaggle(dsname, fpath):
    """Authenticate with Kaggle and download ``dsname`` unzipped into ``temp/``.

    Args:
        dsname: Dataset identifier forwarded to ``dataset_download_files``.
        fpath: Not used by the visible body of this function.
    """
    import kaggle.api as kaggle_api

    destination = 'temp/'
    kaggle_api.authenticate()
    kaggle_api.dataset_download_files(dsname, path=destination, unzip=True)
    def download_coronavirus_data(self, path=None, verbose=None):
        """Download and unzip the Kaggle novel-corona-virus-2019 dataset.

        The dataset is fetched through the ``kaggle`` Python API and unzipped
        into ``path``. If the ``kaggle`` package cannot be imported, an
        attempt is made to install it with pip before importing it again.
        Afterwards ``self.get_data_fpath(path)`` is called.

        Args:
            path (str): Folder to extract the dataset into; created if
                missing. Defaults to ``self._data_folder``.
            verbose (bool): Whether to print progress messages. Defaults to
                the instance's private ``__verbose`` setting.

        Returns:
            None. The result of ``self.get_data_fpath(path)`` is not returned.

        Raises:
            ImportError: If ``kaggle`` is still not importable after the
                install attempt.
        """
        import importlib
        import os
        import subprocess
        import sys

        if verbose is None:
            verbose = self.__verbose

        if verbose:
            print('[i] DOWNLOADING DATA USING KAGGLE API')
            print(
                "\thttps://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset"
            )

        if path is None:
            path = self._data_folder

        os.makedirs(path, exist_ok=True)

        try:
            import kaggle.api as kaggle
        except ImportError:
            # Install with the pip that belongs to the running interpreter so
            # the package lands in the environment this code is executing in.
            # check=False: a failed install surfaces as ImportError just below.
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "--upgrade", "kaggle"],
                check=False)
            # Make the import system notice the newly installed package.
            importlib.invalidate_caches()
            try:
                from IPython.display import clear_output
            except ImportError:
                pass  # not running under IPython; there is no output to clear
            else:
                clear_output()
            if verbose:
                print('\t- Installed kaggle command line tool.')
            import kaggle.api as kaggle

        kaggle.authenticate()
        kaggle.dataset_download_files(
            'sudalairajkumar/novel-corona-virus-2019-dataset',
            path=path,
            force=True,
            unzip=True)

        # Honour the resolved ``verbose`` value so that a caller-supplied
        # override also controls this message.
        if verbose:
            print(f'\t- Downloaded dataset .zip and extracted to:"{path}"')

        self.get_data_fpath(path)