def import_coverage(coverage_id):
    """
    Import coverage as regions.
    """
    logger.info('Started task: import_coverage(%d)', coverage_id)
    current_task.update_state(state='PROGRESS', meta={'percentage': 0})

    coverage = Coverage.query.get(coverage_id)
    if coverage is None:
        raise TaskError('coverage_not_found', 'Coverage not found')

    if coverage.task_done:
        raise TaskError('coverage_imported', 'Coverage already imported')

    # If running eagerly, the task id cannot have been stored yet. But
    # perhaps this is not a good check anyway...
    if (not current_task.request.is_eager and
            coverage.task_uuid and
            coverage.task_uuid != current_task.request.id):
        raise TaskError('coverage_importing',
                        'Coverage is being imported by another task '
                        'instance')

    data_source = coverage.data_source

    # Calculate the data digest if it is not yet known (see the sketch of
    # the digest contract after this function).
    if not data_source.checksum:
        with data_source.data() as data:
            data_source.checksum, data_source.records = digest(data)
        db.session.commit()

    # Check that no imported data source has the same checksum.
    if (DataSource.query.filter_by(checksum=data_source.checksum)
            .join(Coverage).filter_by(task_done=True).count() > 0):
        raise TaskError('duplicate_data_source',
                        'Identical data source already imported')

    def delete_regions():
        coverage.regions.delete()
        db.session.commit()
    current_task.register_cleanup(current_task.request.id, delete_regions)

    # In case we are retrying after a failed import, delete any existing
    # regions for this coverage.
    delete_regions()

    try:
        data = data_source.data()
    except DataUnavailable as e:
        raise TaskError(e.code, e.message)

    try:
        with data as regions:
            old_percentage = -1
            for i, (record, chromosome, begin, end) in enumerate(
                    read_regions(regions, filetype=data_source.filetype)):
                # Integer arithmetic avoids the Python 2 floor-division
                # pitfall in `record / records`.
                percentage = min(record * 100 // data_source.records, 99)
                if percentage > old_percentage:
                    current_task.update_state(state='PROGRESS',
                                              meta={'percentage': percentage})
                    old_percentage = percentage
                db.session.add(Region(coverage, chromosome, begin, end))
                # Flush and commit periodically to keep the session small.
                if i % DB_BUFFER_SIZE == DB_BUFFER_SIZE - 1:
                    db.session.flush()
                    db.session.commit()
    except ReadError as e:
        raise TaskError('invalid_regions', str(e))

    current_task.update_state(state='PROGRESS', meta={'percentage': 100})

    coverage.task_done = True
    db.session.commit()

    logger.info('Finished task: import_coverage(%d)', coverage_id)
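# `digest` is defined elsewhere in this codebase and is expected to return a
# (checksum, number_of_records) tuple for an open data stream. The function
# below is only a hypothetical sketch of that contract, assuming a SHA-1
# checksum over the raw bytes and newline-separated records; the real
# implementation may differ.
def digest_sketch(data, chunk_size=65536):
    """
    Hypothetical sketch of the `digest` contract: compute a checksum and a
    record count for a file-like object opened in binary mode.
    """
    import hashlib

    sha1 = hashlib.sha1()
    records = 0
    for chunk in iter(lambda: data.read(chunk_size), b''):
        sha1.update(chunk)
        records += chunk.count(b'\n')
    return sha1.hexdigest(), records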
def import_variation(variation_id):
    """
    Import variation as observations.
    """
    logger.info('Started task: import_variation(%d)', variation_id)
    current_task.update_state(state='PROGRESS', meta={'percentage': 0})

    variation = Variation.query.get(variation_id)
    if variation is None:
        raise TaskError('variation_not_found', 'Variation not found')

    if variation.task_done:
        raise TaskError('variation_imported', 'Variation already imported')

    # If running eagerly, the task id cannot have been stored yet. But
    # perhaps this is not a good check anyway...
    if (not current_task.request.is_eager and
            variation.task_uuid and
            variation.task_uuid != current_task.request.id):
        raise TaskError('variation_importing',
                        'Variation is being imported by another task '
                        'instance')

    data_source = variation.data_source

    # Calculate the data digest if it is not yet known.
    # Todo: Can we somehow factor this out into a separate (singleton) task
    #     on which we wait? Waiting synchronously is not a good idea, since
    #     that would hold up the worker process, but retrying after some
    #     countdown could be the solution (see the retry sketch after this
    #     function):
    #
    #         self.apply_async(countdown=SOME_CONFIGURATION_VARIABLE)
    if not data_source.checksum:
        with data_source.data() as data:
            data_source.checksum, data_source.records = digest(data)
        db.session.commit()

    # Check that no imported data source has the same checksum.
    if (DataSource.query.filter_by(checksum=data_source.checksum)
            .join(Variation).filter_by(task_done=True).count() > 0):
        raise TaskError('duplicate_data_source',
                        'Identical data source already imported')

    def delete_observations():
        variation.observations.delete()
        db.session.commit()
    current_task.register_cleanup(current_task.request.id,
                                  delete_observations)

    # In case we are retrying after a failed import, delete any existing
    # observations for this variation.
    delete_observations()

    try:
        data = data_source.data()
    except DataUnavailable as e:
        raise TaskError(e.code, e.message)

    # Note that this buffered-commit approach only makes sense if autocommit
    # and autoflush are off, which is the default for Flask-SQLAlchemy.
    # Related discussion:
    #
    #     https://groups.google.com/forum/?fromgroups=#!topic/sqlalchemy/ZD5RNfsmQmU
    #
    # An alternative would be to dump all observations to a file and import
    # from that. It would not have memory problems and is probably faster,
    # but really not portable.
    #
    # A better option is probably to bypass part of the ORM, as discussed in
    # this presentation (see the bulk-insert sketch after this function):
    #
    #     https://speakerdeck.com/rwarren/a-brief-intro-to-profiling-in-python
    #
    # It is not clear that this would solve any memory problems, but it is
    # probably a lot faster than what we do now.
    try:
        with data as observations:
            old_percentage = -1
            for i, (record, chromosome, position, reference, observed,
                    zygosity, support) in enumerate(read_observations(
                        observations,
                        filetype=data_source.filetype,
                        skip_filtered=variation.skip_filtered,
                        use_genotypes=variation.use_genotypes,
                        prefer_genotype_likelihoods=variation.prefer_genotype_likelihoods)):
                # Task progress is updated in whole percentages, so at most
                # 100 times per task. Integer arithmetic avoids the Python 2
                # floor-division pitfall in `record / records`.
                percentage = min(record * 100 // data_source.records, 99)
                if percentage > old_percentage:
                    current_task.update_state(state='PROGRESS',
                                              meta={'percentage': percentage})
                    old_percentage = percentage
                observation = Observation(variation, chromosome, position,
                                          reference, observed,
                                          zygosity=zygosity, support=support)
                db.session.add(observation)
                if i % DB_BUFFER_SIZE == DB_BUFFER_SIZE - 1:
                    db.session.flush()
                    db.session.commit()
                    # Todo: In principle, calling session.flush() once every
                    #     N records should work perfectly, and we could then
                    #     simply call session.rollback() on error.
                    #     Unfortunately, session.flush() does not stop
                    #     memory usage from growing (even with expire_all()
                    #     or expunge_all() calls), so in practice we cannot
                    #     use it (tested with psycopg2 2.4.5 and SQLAlchemy
                    #     0.7.8). As an alternative we call
                    #     session.commit(), but then a simple
                    #     session.rollback() on error is not enough. That is
                    #     why we use the CleanTask base class to register a
                    #     cleanup handler (see the sketch after this
                    #     function).
    except ReadError as e:
        raise TaskError('invalid_observations', str(e))

    current_task.update_state(state='PROGRESS', meta={'percentage': 100})

    variation.task_done = True
    db.session.commit()

    logger.info('Finished task: import_variation(%d)', variation_id)
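# The Todo in import_variation above considers factoring the digest
# calculation out into a separate task and retrying after a countdown
# instead of blocking the worker. Below is a minimal sketch of that pattern
# using Celery's documented retry mechanism (self.retry rather than a raw
# self.apply_async); the task name and DIGEST_RETRY_COUNTDOWN are
# illustrative, not names from this codebase.
from celery import shared_task

DIGEST_RETRY_COUNTDOWN = 30  # seconds; hypothetical configuration value

@shared_task(bind=True)
def wait_for_digest_sketch(self, data_source_id):
    """
    Hypothetical sketch: re-enqueue this task until the data source
    checksum has been filled in by another task, instead of holding the
    worker process in a synchronous wait.
    """
    data_source = DataSource.query.get(data_source_id)
    if not data_source.checksum:
        # Not done yet; schedule another attempt after the countdown.
        raise self.retry(countdown=DIGEST_RETRY_COUNTDOWN)
    return data_source.checksum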
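# The pre-loop comment in import_variation suggests bypassing part of the
# ORM for speed. Below is a minimal sketch of that idea using SQLAlchemy
# Core's executemany-style insert, assuming Observation is a declarative
# model (so Observation.__table__ exists) and that rows are plain dicts of
# column values; the function name and buffering scheme are illustrative.
def bulk_insert_observations_sketch(rows, buffer_size=DB_BUFFER_SIZE):
    """
    Hypothetical sketch: insert observation rows through a single Core
    insert per buffer instead of one ORM object per record.
    """
    buffer = []
    for row in rows:
        # Each row would look something like:
        # {'variation_id': ..., 'chromosome': ..., 'position': ..., ...}
        buffer.append(row)
        if len(buffer) >= buffer_size:
            db.session.execute(Observation.__table__.insert(), buffer)
            db.session.commit()
            buffer = []
    if buffer:
        db.session.execute(Observation.__table__.insert(), buffer)
        db.session.commit()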
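# Both tasks above call current_task.register_cleanup(), provided by the
# CleanTask base class mentioned in the Todo comment. Below is a minimal
# sketch of such a base class built on Celery's documented on_failure and
# on_success hooks; the actual CleanTask in this codebase may differ.
from celery import Task

class CleanTaskSketch(Task):
    """
    Hypothetical sketch: a task base class that runs registered cleanup
    handlers when a task fails, so partially committed imports can be
    undone even though session.rollback() alone is not enough.
    """
    abstract = True

    # Cleanup handlers per task id, local to this worker process.
    _cleanups = {}

    def register_cleanup(self, task_id, cleanup):
        self._cleanups.setdefault(task_id, []).append(cleanup)

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        # Celery calls this hook when the task raises; run and discard all
        # cleanup handlers registered for this task id.
        for cleanup in self._cleanups.pop(task_id, []):
            cleanup()

    def on_success(self, retval, task_id, args, kwargs):
        # On success, simply forget the registered handlers.
        self._cleanups.pop(task_id, None)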