Code example #1
# The original excerpt omits its imports and module-level setup; the names
# below (plus the authenticated kaggle `api` client) are what it relies on.
import datetime
import json
import shutil
from pathlib import Path

import dateparser
from slugify import slugify


def process_data(logger):
    output_folder = Path("data")
    # Clear out previous output subfolders before regenerating them.
    # Materialize the glob so we are not iterating a tree while deleting it.
    for element in list(output_folder.glob('**/*')):
        if element.is_dir():
            shutil.rmtree(element)
    # temp_files_path is defined elsewhere in the source project.
    for json_file in sorted(temp_files_path.glob("*.json")):
        with json_file.open() as fh:
            deck = json.load(fh)
        if "submission date" in deck:
            date = dateparser.parse(deck["submission date"])
        else:
            date = datetime.datetime.min
        deck["submission date"] = date.isoformat()

        # Normalize every key to a snake_case slug.
        keys = list(deck.keys())
        for k in keys:
            deck[slugify(k, separator="_")] = deck.pop(k)

        output_file = Path(output_folder, f"{date.year:04}",
                           f"{date.month:02}.jsonl")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "a") as fd:
            json.dump(deck, fd)
            fd.write("\n")
    api.dataset_create_version("data",
                               "Daily dataset update",
                               dir_mode="zip",
                               quiet=False)
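
A note on the call above: dataset_create_version() looks for a
dataset-metadata.json inside the target folder to know which dataset to
update. A minimal sketch, assuming an authenticated `api` object as in the
examples on this page (the title and id values are placeholders):

import json
from pathlib import Path

metadata = {
    "title": "My dataset",             # display title of the existing dataset
    "id": "username/my-dataset",       # owner/slug that receives the version
    "licenses": [{"name": "CC0-1.0"}],
}
Path("data/dataset-metadata.json").write_text(json.dumps(metadata))

api.dataset_create_version("data", "Daily dataset update",
                           dir_mode="zip", quiet=False)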
Code example #2
File: main.py Project: m16-datasets/mananeras
# shutil, the kaggle `api` object, and the helpers (setup_logger,
# download_urls, download_articles, extract) are imported elsewhere in the
# source project; the excerpt omits them.
def main(log_file):
    logger = setup_logger(log_file)

    logger.info("downloading urls")
    download_urls("urls.txt", 1)
    logger.info("downloading articles")
    download_articles("urls.txt", "raw")
    logger.info("processing articles")
    extract("raw", "articulos")
    logger.info("compressing articles")
    shutil.make_archive('data/articulos', 'zip', "articulos")
    logger.info("creating new dataset version")
    api.dataset_create_version("data", "Daily dataset update", dir_mode="zip", quiet=False)
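
All four examples call a module-level `api` object without showing where it
comes from. A minimal sketch of the usual setup with the official kaggle
package (credentials are read from ~/.kaggle/kaggle.json or the
KAGGLE_USERNAME / KAGGLE_KEY environment variables):

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()  # raises if no credentials are found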
Code example #3
File: kaggle.py Project: shlemph/mlcomp
    # Excerpted from a class: os, shutil, json, time, and the kaggle `api`
    # object are imported at module level in the source project.
    def kernel_submit(self):
        self.info('kernel_submit updating dataset')

        folder = 'submit'
        os.makedirs(folder, exist_ok=True)

        shutil.copy(self.file, os.path.join(folder, self.file_name))

        config = api.read_config_file()
        username = config['username']
        title = f'{self.competition}-{self.kernel_suffix}-dataset'
        dataset_meta = {
            'title': title,
            'id': f'{username}/{title}',
            'licenses': [{
                'name': 'CC0-1.0'
            }]
        }
        with open(f'{folder}/dataset-metadata.json', 'w') as f:
            json.dump(dataset_meta, f)

        res = api.dataset_status(dataset_meta['id'])
        if res != 'ready':
            res = api.dataset_create_new(folder=folder)
            if res.status == 'error':
                raise Exception('dataset_create_new Error: ' + res.error)

        res = api.dataset_create_version(folder, 'Updated')
        if res.status == 'error':
            raise Exception('dataset_create_version Error: ' + res.error)

        self.info('dataset updated')

        seconds_to_sleep = 20
        self.info(f'sleeping {seconds_to_sleep} seconds')
        time.sleep(seconds_to_sleep)

        slug = f'{self.competition}-{self.kernel_suffix}'
        kernel_meta = {
            'id': f'{username}/{slug}',
            'title': slug,
            'code_file': 'code.py',
            'language': 'python',
            'kernel_type': 'script',
            'is_private': 'true',
            'enable_gpu': 'false',
            'enable_internet': 'false',
            'dataset_sources': [dataset_meta['id']],
            'competition_sources': [self.competition],
            'name': slug
        }
        with open(f'{folder}/kernel-metadata.json', 'w') as f:
            json.dump(kernel_meta, f)

        code = """
import pandas as pd

DATA_DIR = '../input/{self.competition}'
CSV_FILE = '../input/{self.competition}-' + \
           '{self.kernel_suffix}-dataset/{self.file_name}'

df = pd.read_csv(DATA_DIR + '/sample_submission.csv')
df_predict = pd.read_csv(CSV_FILE)

keys = [c for c in df.columns if c != '{self.predict_column}']
predict_values = dict()
for index, row in df_predict.iterrows():
    key = tuple([row[k] for k in keys])
    predict_values[key] = row

res = []
for index, row in df.iterrows():
    key = tuple([row[k] for k in keys])
    if key in predict_values:
        res.append(predict_values[key])
    else:
        res.append(row)

res = pd.DataFrame(res)
res.to_csv('submission.csv', index=False)
        """.replace('{self.competition}', self.competition).replace(
            '{self.kernel_suffix}', self.kernel_suffix).replace(
                '{self.file_name}',
                self.file_name).replace('{self.predict_column}',
                                        self.predict_column)

        with open(f'{folder}/code.py', 'w') as f:
            f.write(code)

        self.info('kernel data created')
        api.kernels_push(folder)
        self.info('kernel is pushed. waiting for the end of the commit')
        self.info(f'kernel address: https://www.kaggle.com/{username}/{slug}')

        seconds = self.wait_seconds
        for i in range(seconds):
            response = api.kernel_status(username, slug)
            if response['status'] == 'complete':
                self.info(f'kernel has completed successfully. '
                          f'Please go to '
                          f'https://www.kaggle.com/{username}/{slug} '
                          f'and push the button "Submit to the competition"')
                return
            if response['status'] == 'error':
                raise Exception(
                    f'Kernel failed. Msg = {response["failureMessage"]}')
            time.sleep(1)
            self.wait_seconds -= 1

        self.info(f'kernel did not finish after {seconds} seconds')
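
The polling loop at the end of this example is a pattern worth factoring
out. A sketch that reuses api.kernel_status() exactly as the example does;
the helper name, timeout, and interval are illustrative, not part of the
original code:

import time

def wait_for_kernel(api, username, slug, timeout=600, interval=5):
    """Poll a pushed kernel until it completes, errors out, or times out."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        response = api.kernel_status(username, slug)
        if response['status'] == 'complete':
            return response
        if response['status'] == 'error':
            raise RuntimeError(response.get('failureMessage', 'unknown error'))
        time.sleep(interval)
    raise TimeoutError(f'{username}/{slug} did not finish within {timeout}s')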
Code example #4
File: kaggle.py Project: lightforever/mlcomp
    # As in example #3, module-level imports (os, shutil, json, time, api)
    # live elsewhere in mlcomp; so do the helpers du() and zip_folder(),
    # sketched after this example.
    def kernel_submit(self):
        self.info('kernel_submit creating dataset')

        folder = os.path.expanduser(
            f'~/.kaggle/competitions/{self.competition}'
        )
        shutil.rmtree(folder, ignore_errors=True)
        os.makedirs(folder, exist_ok=True)

        total_size = sum(du(f) for f in self.folders)
        total_size += sum(du(f) for f in self.files)

        if self.max_size:
            assert total_size < self.max_size, \
                f'max_size = {self.max_size} Gb. Current size = {total_size}'

        config = api.read_config_file()
        username = config['username']
        competition = self.competition
        dataset_meta = {
            'competition': competition,
            'id': f'{username}/{competition}-api-dataset',
            'licenses': [{
                'name': 'CC0-1.0'
            }],
            'title': 'API auto'
        }
        with open(f'{folder}/dataset-metadata.json', 'w') as f:
            json.dump(dataset_meta, f)

        self.info('\tzipping folders')

        dst = os.path.join(folder, 'dataset.zip')
        zip_folder(folders=self.folders, dst=dst, files=self.files)

        self.info('\tfolders are zipped. uploading dataset')
        if not any(d.ref == dataset_meta['id'] for d in
                   api.dataset_list(user=username)):
            api.dataset_create_new(folder)
        else:
            res = api.dataset_create_version(folder, 'Updated')
            if res.status == 'error':
                raise Exception('dataset_create_version Error: ' + res.error)

        self.info('dataset uploaded. starting kernel')

        # dataset update time
        time.sleep(30)

        slug = 'predict'

        def push_notebook(file: str, slug: str):
            shutil.copy(file, os.path.join(folder, 'predict.ipynb'))

            datasets = [dataset_meta['id']] + list(self.datasets)
            kernel_meta = {
                'id': f'{username}/{slug}',
                'code_file': 'predict.ipynb',
                'language': 'python',
                'kernel_type': 'notebook',
                'is_private': 'true',
                'enable_gpu': 'true',
                'enable_internet': 'false',
                'dataset_sources': datasets,
                'competition_sources': [competition],
                'title': slug,
                'kernel_sources': []
            }
            with open(f'{folder}/kernel-metadata.json', 'w') as f:
                json.dump(kernel_meta, f)

            api.kernels_push(folder)

        push_notebook('predict.ipynb', 'predict')

        self.info('kernel is pushed. waiting for the end of the commit')

        self.info(f'kernel address: https://www.kaggle.com/{username}/{slug}')
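
du() and zip_folder() above are mlcomp helpers that this excerpt does not
include. A rough, hypothetical reconstruction of what they would need to do
for the surrounding code to work (names, signatures, and the GB unit are
assumptions inferred from the call sites):

import os
import zipfile

def du(path):
    """Approximate disk usage of a file or folder, in gigabytes (assumed)."""
    if os.path.isfile(path):
        return os.path.getsize(path) / 1024 ** 3
    total = 0
    for root, _, files in os.walk(path):
        total += sum(os.path.getsize(os.path.join(root, f)) for f in files)
    return total / 1024 ** 3

def zip_folder(folders, dst, files=()):
    """Pack the given folders plus standalone files into a single zip."""
    with zipfile.ZipFile(dst, 'w', zipfile.ZIP_DEFLATED) as zf:
        for folder in folders:
            base = os.path.dirname(os.path.abspath(folder))
            for root, _, names in os.walk(folder):
                for name in names:
                    full = os.path.join(root, name)
                    zf.write(full, os.path.relpath(full, base))
        for f in files:
            zf.write(f, os.path.basename(f))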