def test_dataset():
    """Load a local tabular dataset (no workspace connection) and check its shape."""
    env = AzureMLEnvironment(connect_workspace=False,
                             datastore_path='tests/resources/datasets')
    admissions = env.load_tabular_dataset('student-admission',
                                          cloud_storage=False)
    # Take the last 20 rows; the dataset is expected to have 3 columns.
    sample = admissions.tail(20)
    assert sample.shape == (20, 3)
def test_partitions_header():
    """Partition load with first_row_header=True should expose named columns."""
    env = AzureMLEnvironment(connect_workspace=False,
                             datastore_path='tests/resources/datastore')
    stocks = env.load_tabular_partition('stock_header_AT*',
                                        first_row_header=True,
                                        cloud_storage=False)
    # Header row supplies the column names, 24 data rows remain.
    assert 'Isin' in stocks.columns
    assert stocks.shape == (24, 9)
    # Three distinct Isin values across the merged partitions.
    assert len(Counter(stocks['Isin'])) == 3
def test_partitions_noheader_and_columns():
    """Without a header row, explicitly supplied column names are applied."""
    env = AzureMLEnvironment(connect_workspace=False,
                             datastore_path='tests/resources/datastore')
    column_names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
    stocks = env.load_tabular_partition('stock_header_AT*',
                                        first_row_header=False,
                                        columns=column_names,
                                        cloud_storage=False)
    assert 'C' in stocks.columns
    # Header lines now count as data, so 27 rows instead of 24.
    assert stocks.shape == (27, 9)
def test_partitions_notexisting():
    """A partition pattern matching no files should yield None, not a DataFrame."""
    work_env = AzureMLEnvironment(connect_workspace=False,
                                  datastore_path='tests/resources/datastore')
    partition_df = work_env.load_tabular_partition('stock_BE*',
                                                   columns=[
                                                       'Close', 'High', 'Isin',
                                                       'ItemDate', 'Low',
                                                       'Market', 'Open',
                                                       'Ticker', 'Volume'
                                                   ],
                                                   cloud_storage=False)
    # Fix: compare to None with `is`, not `==` (PEP 8).  `== None` on a
    # pandas DataFrame would return an element-wise boolean frame whose
    # truthiness is ambiguous and raises ValueError instead of testing
    # for the sentinel.
    assert partition_df is None
def test_partitions():
    """Matching partitions are concatenated with the given column names."""
    env = AzureMLEnvironment(connect_workspace=False,
                             datastore_path='tests/resources/datastore')
    column_names = ['Close', 'High', 'Isin', 'ItemDate', 'Low',
                    'Market', 'Open', 'Ticker', 'Volume']
    stocks = env.load_tabular_partition('stock_AT*',
                                        columns=column_names,
                                        cloud_storage=False)
    # Three partition files of 10 rows each merged into one frame.
    assert stocks.shape == (30, 9)
    # Each partition contributes a distinct Isin value.
    assert len(Counter(stocks['Isin'])) == 3
# Example #6 (score: 0)
    def copy_to_azureml(self, environment: aml.AzureMLEnvironment, dataset_name:str, user_name:str = None, user_key:str = None, use_key_vault:bool = True, local_path:str = None, force_download: bool = False):
        '''Downloads a kaggle dataset and stores it into an AzureML dataset

        Args:
            environment (aml.AzureMLEnvironment): The environment in which the dataset should be created
            dataset_name (str): The name of the kaggle dataset
            user_name (str): The kaggle user name (or the secret name in the KeyVault to it).
            user_key (str): The kaggle secret key (or the secret name in the KeyVault to it).
            use_key_vault (bool): Recommended, will retrieve the kaggle credentials from Key Vault
            local_path (str): The local folder in which to persist the downloaded Kaggle data.
                Falls back to the current working directory when omitted.
            force_download (bool): Will redownload and overwrite existing files
        '''
        # Bug fix: local_path defaults to None, but os.path.join(None, ...)
        # raises TypeError.  Fall back to the current directory so the
        # documented default actually works.
        base_path = local_path if local_path else '.'
        # Kaggle dataset names look like 'owner/dataset'; sanitize them into
        # a single safe directory name.
        local_path = os.path.join(base_path, dataset_name.replace('/', '_').replace(' ', '-'))

        # Skip download entirely when the data is already present, unless forced.
        if force_download or not os.path.exists(local_path):
            if use_key_vault:
                self.__logger.info('Using KeyVault for kaggle authentication')
                # When no user_name / user_key is given, we'll take the default kaggle authentication names
                if not user_name: user_name = 'KAGGLE-USERNAME'
                if not user_key: user_key = 'KAGGLE-KEY'
                # When using key vault, we will replace the user_name & user_key values with the values from the secrets in key vault
                user_name = environment.get_secret(user_name)
                user_key = environment.get_secret(user_key)
                self.__logger.info(f'Authentication to kaggle with user {user_name}')

            # Kaggle authentication happens through environment variables (or a json file)
            if user_name:
                os.environ['KAGGLE_USERNAME'] = user_name
            if user_key:
                os.environ['KAGGLE_KEY'] = user_key

            # Imported lazily: the kaggle package authenticates at import time
            # from the environment variables set above.
            import kaggle
            kaggle.api.authenticate()
            self.__logger.info('Successfully authenticated to kaggle.com')

            kaggle.api.dataset_download_files(dataset_name, path=local_path, unzip=True)
            self.__logger.info('Dataset successfully downloaded locally')

            environment.upload_dataset(dataset_name, local_path, overwrite=force_download, tags={'source': 'kaggle', 'url': f'https://www.kaggle.com/{dataset_name}'})
            self.__logger.info('Dataset successfully uploaded to AzureML Dataset')
# Example #7 (score: 0)
                    help='Kaggle user secret')
def _parse_bool(value):
    '''argparse-safe boolean parser.

    Bug fix: `type=bool` makes every non-empty string truthy, so the user
    passing `--use_keyvault False` would still get True.  Parse the text
    explicitly instead.
    '''
    return str(value).strip().lower() in ('true', '1', 'yes', 'y')

parser.add_argument('--kaggle_dataset',
                    type=str,
                    dest='dataset',
                    default=None,
                    help='Kaggle data set name')
parser.add_argument('--use_keyvault',
                    type=_parse_bool,
                    dest='usekeyvault',
                    default=True,
                    help='Indicate to use Key Vault')

# parse_known_args tolerates extra AzureML-injected arguments.
args, unknown = parser.parse_known_args()
kaggle_user = args.user
kaggle_secret = args.key
kaggle_dataset = args.dataset
use_key_vault = args.usekeyvault

# Load the environment from the Run context, so you can access any dataset
aml_environment = AzureMLEnvironment.CreateFromContext()
collector = KaggleDataCollector()
collector.copy_to_azureml(aml_environment,
                          kaggle_dataset,
                          local_path='kaggle_data',
                          user_name=kaggle_user,
                          user_key=kaggle_secret,
                          use_key_vault=use_key_vault,
                          force_download=True)

print('Training finished')
# Example #8 (score: 0)
def test_dataset():
    """Smoke test: the changesets dataset loads with the expected column count."""
    environment = aml.Create()
    changesets = environment.load_tabular_dataset('smart-devops-changesets')
    sample = changesets.tail(20)
    # 20 sampled rows across the dataset's 16 columns.
    assert sample.shape == (20, 16)