import datetime
import json
import shutil
from pathlib import Path

import dateparser
from slugify import slugify


def process_data(logger):
    # `temp_files_path` (the staging folder holding the raw JSON files) and
    # `api` (an authenticated Kaggle client) are assumed to be defined at
    # module level; a setup sketch for `api` follows this snippet.
    output_folder = Path("data")

    # Drop the per-year output folders left over from the previous run.
    for element in output_folder.glob('**/*'):
        if element.is_dir():
            shutil.rmtree(element)

    for json_file in sorted(temp_files_path.glob("*.json")):
        deck = json.load(json_file.open())

        # Normalize the submission date, falling back to datetime.min when
        # the field is missing, and store it back in ISO format.
        date = (dateparser.parse(deck["submission date"])
                if "submission date" in deck else datetime.datetime.min)
        deck["submission date"] = date.isoformat()

        # Slugify every key ("submission date" -> "submission_date").
        keys = list(deck.keys())
        for k in keys:
            deck[slugify(k, separator="_")] = deck.pop(k)

        # Append the record to data/YYYY/MM.jsonl, one JSON object per line.
        output_file = Path(output_folder, f"{date.year:04}", f"{date.month:02}.jsonl")
        if not output_file.parent.exists():
            output_file.parent.mkdir(parents=True)
        with open(output_file, "a") as fd:
            json.dump(deck, fd)
            fd.write("\n")

    api.dataset_create_version("data", "Daily dataset update",
                               dir_mode="zip", quiet=False)
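# This snippet (and the ones below) call a module-level `api` object that the
# original file never shows being created. A minimal sketch of that shared
# setup, assuming the official `kaggle` package with credentials stored in
# ~/.kaggle/kaggle.json (or the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables):

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()  # reads ~/.kaggle/kaggle.json or the environment variables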
import shutil


def main(log_file):
    # `download_urls`, `download_articles` and `extract` are project helpers
    # defined elsewhere; `api` is the shared authenticated Kaggle client.
    logger = setup_logger(log_file)

    logger.info("downloading urls")
    download_urls("urls.txt", 1)

    logger.info("downloading articles")
    download_articles("urls.txt", "raw")

    logger.info("processing articles")
    extract("raw", "articulos")

    logger.info("compressing articles")
    shutil.make_archive('data/articulos', 'zip', "articulos")

    logger.info("creating new dataset version")
    api.dataset_create_version("data", "Daily dataset update",
                               dir_mode="zip", quiet=False)
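# `setup_logger` is another project helper this snippet does not define.
# A minimal sketch, assuming main() only needs a file-plus-console logger;
# the log file name in the driver below is hypothetical:

import logging


def setup_logger(log_file):
    logger = logging.getLogger('dataset_update')
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    for handler in (logging.FileHandler(log_file), logging.StreamHandler()):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger


if __name__ == '__main__':
    main('update.log')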
import json
import os
import shutil
import time


def kernel_submit(self):
    self.info('kernel_submit updating dataset')
    folder = 'submit'
    os.makedirs(folder, exist_ok=True)
    shutil.copy(self.file, os.path.join(folder, self.file_name))

    config = api.read_config_file()
    username = config['username']

    # Describe the dataset that will carry the prediction file.
    title = f'{self.competition}-{self.kernel_suffix}-dataset'
    dataset_meta = {
        'title': title,
        'id': f'{username}/{title}',
        'licenses': [{'name': 'CC0-1.0'}],
    }
    with open(f'{folder}/dataset-metadata.json', 'w') as f:
        json.dump(dataset_meta, f)

    # Create the dataset on first use, then push a new version.
    res = api.dataset_status(dataset_meta['id'])
    if res != 'ready':
        res = api.dataset_create_new(folder=folder)
        if res.status == 'error':
            raise Exception('dataset_create_new Error: ' + res.error)
    res = api.dataset_create_version(folder, 'Updated')
    if res.status == 'error':
        raise Exception('dataset_create_version Error: ' + res.error)
    self.info('dataset updated')

    # Give Kaggle time to process the new dataset version.
    seconds_to_sleep = 20
    self.info(f'sleeping {seconds_to_sleep} seconds')
    time.sleep(seconds_to_sleep)

    # Describe the scoring kernel.
    slug = f'{self.competition}-{self.kernel_suffix}'
    kernel_meta = {
        'id': f'{username}/{slug}',
        'title': slug,
        'code_file': 'code.py',
        'language': 'python',
        'kernel_type': 'script',
        'is_private': 'true',
        'enable_gpu': 'false',
        'enable_internet': 'false',
        'dataset_sources': [dataset_meta['id']],
        'competition_sources': [self.competition],
        'name': slug,
    }
    with open(f'{folder}/kernel-metadata.json', 'w') as f:
        json.dump(kernel_meta, f)

    # Template for the kernel script. It is deliberately not an f-string: the
    # {self.*} placeholders survive verbatim and are filled in with
    # str.replace() below. The script merges the uploaded predictions into the
    # competition's sample submission.
    code = """
import pandas as pd

DATA_DIR = '../input/{self.competition}'
CSV_FILE = '../input/{self.competition}-' + \
    '{self.kernel_suffix}-dataset/{self.file_name}'

df = pd.read_csv(DATA_DIR + '/sample_submission.csv')
df_predict = pd.read_csv(CSV_FILE)

keys = [c for c in df.columns if c != '{self.predict_column}']
predict_values = dict()
for index, row in df_predict.iterrows():
    key = tuple([row[k] for k in keys])
    predict_values[key] = row

res = []
for index, row in df.iterrows():
    key = tuple([row[k] for k in keys])
    if key in predict_values:
        res.append(predict_values[key])
    else:
        res.append(row)

res = pd.DataFrame(res)
res.to_csv('submission.csv', index=False)
""".replace('{self.competition}', self.competition).replace(
        '{self.kernel_suffix}', self.kernel_suffix).replace(
        '{self.file_name}', self.file_name).replace(
        '{self.predict_column}', self.predict_column)
    with open(f'{folder}/code.py', 'w') as f:
        f.write(code)
    self.info('kernel data created')

    api.kernels_push(folder)
    self.info('kernel is pushed. waiting for the end of the commit')
    self.info(f'kernel address: https://www.kaggle.com/{username}/{slug}')

    # Poll once per second until the kernel completes or the budget runs out.
    seconds = self.wait_seconds
    for i in range(seconds):
        response = api.kernel_status(username, slug)
        if response['status'] == 'complete':
            self.info(f'kernel has completed successfully. '
                      f'Please go to '
                      f'https://www.kaggle.com/{username}/{slug} '
                      f'and press the "Submit to the competition" button')
            return
        if response['status'] == 'error':
            raise Exception(
                f'Kernel failed. Msg = {response["failureMessage"]}')
        time.sleep(1)
        self.wait_seconds -= 1
    self.info(f'kernel did not finish after {seconds} seconds')
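# The method above expects an object carrying several attributes plus an
# info() logger. A hypothetical driver, assuming `kernel_submit` is defined at
# module level as shown; the class and the argument values in the commented
# call are illustrative only, not part of the source:

import os


class KernelSubmitter:
    def __init__(self, competition, kernel_suffix, file, predict_column,
                 wait_seconds=600):
        self.competition = competition
        self.kernel_suffix = kernel_suffix
        self.file = file
        self.file_name = os.path.basename(file)
        self.predict_column = predict_column
        self.wait_seconds = wait_seconds

    def info(self, msg):
        print(msg)

    # Attach the module-level function above as a method (the right-hand side
    # resolves to the global `kernel_submit` at class-body execution time).
    kernel_submit = kernel_submit


# KernelSubmitter('titanic', 'v1', 'predictions.csv', 'Survived').kernel_submit()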
import json
import os
import shutil
import time


def kernel_submit(self):
    self.info('kernel_submit creating dataset')
    folder = os.path.expanduser(f'~/.kaggle/competitions/{self.competition}')
    shutil.rmtree(folder, ignore_errors=True)
    os.makedirs(folder, exist_ok=True)

    # Refuse to upload when the payload exceeds the configured size limit.
    # `du` and `zip_folder` are project helpers; sketches follow this snippet.
    total_size = sum(du(f) for f in self.folders)
    total_size += sum(du(f) for f in self.files)
    if self.max_size:
        assert total_size < self.max_size, \
            f'max_size = {self.max_size} Gb. Current size = {total_size}'

    config = api.read_config_file()
    username = config['username']
    competition = self.competition

    dataset_meta = {
        'competition': competition,
        'id': f'{username}/{competition}-api-dataset',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': 'API auto',
    }
    with open(f'{folder}/dataset-metadata.json', 'w') as f:
        json.dump(dataset_meta, f)

    self.info('\tzipping folders')
    dst = os.path.join(folder, 'dataset.zip')
    zip_folder(folders=self.folders, dst=dst, files=self.files)
    self.info('\tfolders are zipped. uploading dataset')

    # Create the dataset if it does not exist yet, otherwise push a new version.
    if not any(d.ref == dataset_meta['id']
               for d in api.dataset_list(user=username)):
        api.dataset_create_new(folder)
    else:
        res = api.dataset_create_version(folder, 'Updated')
        if res.status == 'error':
            raise Exception('dataset_create_version Error: ' + res.error)
    self.info('dataset uploaded. starting kernel')

    # Give Kaggle time to process the new dataset version.
    time.sleep(30)

    slug = 'predict'

    def push_notebook(file: str, slug: str):
        shutil.copy(file, os.path.join(folder, 'predict.ipynb'))
        datasets = [dataset_meta['id']] + list(self.datasets)
        kernel_meta = {
            'id': f'{username}/{slug}',
            'code_file': 'predict.ipynb',
            'language': 'python',
            'kernel_type': 'notebook',
            'is_private': 'true',
            'enable_gpu': 'true',
            'enable_internet': 'false',
            'dataset_sources': datasets,
            'competition_sources': [competition],
            'title': slug,
            'kernel_sources': [],
        }
        with open(f'{folder}/kernel-metadata.json', 'w') as f:
            json.dump(kernel_meta, f)
        api.kernels_push(folder)

    push_notebook('predict.ipynb', 'predict')
    self.info('kernel is pushed. waiting for the end of the commit')
    self.info(f'kernel address: https://www.kaggle.com/{username}/{slug}')
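# `du` and `zip_folder` are project helpers this snippet does not define.
# Minimal sketches, assuming `du` reports sizes in gigabytes (to match the
# max_size assertion, which is stated in Gb) and `zip_folder` packs whole
# directory trees plus individual files into a single archive:

import os
import zipfile


def du(path):
    # Total size of a file or directory tree, in gigabytes.
    if os.path.isfile(path):
        size = os.path.getsize(path)
    else:
        size = sum(os.path.getsize(os.path.join(root, name))
                   for root, _, names in os.walk(path) for name in names)
    return size / 1024 ** 3


def zip_folder(folders, dst, files=()):
    # Individual files go to the archive root; folders keep their top-level
    # directory name so the dataset unpacks with the same layout.
    with zipfile.ZipFile(dst, 'w', zipfile.ZIP_DEFLATED) as zf:
        for file in files:
            zf.write(file, os.path.basename(file))
        for folder in folders:
            for root, _, names in os.walk(folder):
                for name in names:
                    path = os.path.join(root, name)
                    zf.write(path, os.path.relpath(path, os.path.dirname(folder)))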