Ejemplo n.º 1
0
def _insert_batch(model, batch):
    """Insert ``batch`` (a list of row dicts) into ``model`` in one statement.

    If the database rejects the rows with ``peewee.DataError``, the batch is
    pretty-printed so the offending rows can be inspected, and the process
    exits with status -1.
    """
    try:
        model.insert_many(batch).execute()
    except peewee.DataError:
        pprint(batch)
        # Raise SystemExit directly instead of calling the interactive
        # ``quit`` helper, which only exists when ``site`` has been loaded.
        raise SystemExit(-1) from None


def _load_rows(model, rows, label: str, total: int, batch_size: int) -> int:
    """Insert ``(id, data)`` pairs from ``rows`` into ``model`` in batches.

    Args:
        model: model class exposing ``insert_many`` and ``select``.
        rows: iterable of ``(id, data)`` pairs; ``data`` is a mapping that is
            merged with ``{"id": id}`` to form one row.
        label: plural name used in the progress messages (e.g. ``"runs"``).
        total: number of rows of this kind seen before this call.
        batch_size: number of rows per ``insert_many`` call.

    Returns:
        ``total`` plus the number of rows consumed from ``rows``.
    """
    batch = []
    for _id, data in rows:
        batch.append({"id": _id, **data})
        total += 1

        if len(batch) >= batch_size:
            print(len(batch))
            print(f"--- inserting batch of {label}")
            _insert_batch(model, batch)
            print(f"--- n_{label}={total}; row count={model.select().count()}")
            batch = []

    # Flush whatever is left over after the last full batch.
    if batch:
        _insert_batch(model, batch)

    return total


def main():
    """Rebuild the database tables and load them from the archive at TAR_PATH.

    The ``Sample``, ``Experiment`` and ``Run`` tables are dropped and
    recreated. Every member of the gzip-compressed tar archive is then
    visited; a member whose name contains ``"experiment"``, ``"sample"`` or
    ``"run"`` is parsed as XML, passed to the matching ``load_*_data``
    function, and the resulting rows are inserted in batches of 1000.
    A name containing more than one of those substrings is loaded once per
    match. Progress is reported on standard output.

    Raises:
        SystemExit: with status -1 when an insert fails with
            ``peewee.DataError`` (the rejected batch is printed first).
    """
    db = get_database()

    # clear pre-existing data
    print("--- dropping existing data")
    db.drop_tables([Sample, Experiment, Run])
    print("--- migrating up!")
    db.create_tables([Experiment, Sample, Run])

    batch_size = 1000

    # (substring looked for in the member name, XML loader, model, log label)
    sources = (
        ("experiment", load_experiment_data, Experiment, "experiments"),
        ("sample", load_sample_data, Sample, "samples"),
        ("run", load_run_data, Run, "runs"),
    )
    # Running row totals per kind, carried across archive members.
    totals = {label: 0 for _, _, _, label in sources}

    with tarfile.open(TAR_PATH, "r:gz") as tar:
        for member in tar:
            print(f"--- loading {member.name}")

            for keyword, loader, model, label in sources:
                if keyword not in member.name:
                    continue

                # A fresh handle per kind, so each parse starts at the
                # beginning of the member's data.
                handle = tar.extractfile(member)
                if handle is None:
                    # No file payload for this member (e.g. a directory);
                    # that holds for every kind, so stop checking.
                    break

                with handle:
                    root = xml.etree.ElementTree.parse(handle)
                    totals[label] = _load_rows(
                        model, loader(root), label, totals[label], batch_size
                    )