def extract_chunk(self, since, until):
    """Run one chunked extract for this domain and bulk-load it into Postgres.

    :param since: lower datetime bound for records, or None for "forever".
    :param until: upper datetime bound, or None to fetch up to "now".
    """
    # If `until` is None, the API returns records up to the moment the call
    # started, so capture that moment to record in the extract log.
    api_call_start = datetime.datetime.now()
    try:
        logger.info(
            "%s doing chunked extract for domain %s, requesting records since %s until %s" % (
                self.__class__.__name__,
                self.domain,
                since if since else 'forever',
                until if until else 'forever'))
        writer = PgCopyWriter(self.engine.connect(), self.api_client.project)
        env = BuiltInEnv() | CommCareHqEnv(self.api_client, since, until) | JsonPathEnv({})
        # Evaluating the query emits tables into `env` as a side effect.
        result = self._get_query.eval(env)
        if self._get_table_name in [t['name'] for t in env.emitted_tables()]:
            with writer:
                for table in env.emitted_tables():
                    if table['name'] == self._get_table_name:
                        writer.write_table(table, self._get_attribute_db_cols, self._get_hstore_db_col)
        else:
            # logger.warn() is a deprecated alias for warning()
            logger.warning('no table emitted with name %s' % self._get_table_name)
        self.extract_log.extract_end = until if until else api_call_start
    finally:
        # Persist the log even when the extract raised, but only if an end
        # time was ever recorded (possibly by a previous successful run).
        # NOTE: the original wrapped this in a bare `except: raise`, which is
        # a no-op — the exception propagates either way and `finally` still
        # runs — so it has been removed.
        if self.extract_log.extract_end:
            self.extract_log.save()
def test_compile_mapped_field(self):
    # Environment that resolves JsonPath references against a fixed document.
    env = BuiltInEnv() | JsonPathEnv({'foo': {'bar': 'a', 'baz': 'b'}})
    mapping = {'a': 'mapped from a'}

    # 'foo.bar' resolves to 'a', which the mapping translates.
    mapped = compile_mapped_field(mapping, Reference('foo.bar'))
    assert mapped.eval(env) == 'mapped from a'

    # 'foo.baz' resolves to 'b', which has no mapping entry, so the raw
    # JsonPath datum passes through unchanged.
    unmapped = compile_mapped_field(mapping, Reference('foo.baz'))
    assert list(unmapped.eval(env))[0].value == 'b'
def handle(self, project, query, output_format, output, limit, **options):
    """Run a commcare-export query against a local CommCare HQ instance.

    Compiles the query file, optionally resumes from the last checkpoint,
    then streams results through an EmitterEnv into the chosen writer.
    """
    # note: this is heavily copy/paste/modified from commcare_export.cli
    commcare_hq = 'local_commcare_export'
    try:
        # local development only
        sys.path.append(os.path.join(os.getcwd(), 'lib', 'commcare-export'))
        import commcare_export  # noqa
    except ImportError:
        raise CommandError(
            'This command requires commcare-export to be installed! '
            'Please run: pip install commcare-export. You may also need to run: '
            'pip install openpyxl==2.6.0b1 '
            'afterwards to run CommCare due to version incompatibilities.')

    # Imported here (not at module top) because commcare-export may be absent.
    from commcare_export import misc
    from commcare_export.checkpoint import CheckpointManager
    from commcare_export.cli import _get_writer, _get_query_from_file
    from commcare_export.commcare_minilinq import CommCareHqEnv
    from commcare_export.env import BuiltInEnv, JsonPathEnv, EmitterEnv

    print('commcare-export is installed.')

    writer = _get_writer(output_format, output, strict_types=False)
    query_obj = _get_query_from_file(
        query,
        None,  # missing_value
        writer.supports_multi_table_write,
        writer.max_column_length,
        writer.required_columns)

    # Only checkpoint-capable writers can resume from a previous run; the
    # query file's md5 ties checkpoints to this exact query.
    checkpoint_manager = None
    if writer.support_checkpoints:
        md5 = misc.digest_file(query)
        checkpoint_manager = CheckpointManager(
            output,
            query,
            md5,
            project,
            commcare_hq,
        )
        since = checkpoint_manager.get_time_of_last_checkpoint()
    else:
        since = None

    commcarehq_base_url = commcare_hq
    api_client = LocalCommCareHqClient(
        url=commcarehq_base_url,
        project=project,
        limit=limit,
        checkpoint_manager=checkpoint_manager)

    # The checkpoint stores the timestamp as a string; parse it back.
    if since is not None:
        since = dateutil.parser.parse(since)

    env = (BuiltInEnv({'commcarehq_base_url': commcarehq_base_url})
           | CommCareHqEnv(api_client, since=since)
           | JsonPathEnv({})
           | EmitterEnv(writer))

    with env:
        try:
            lazy_result = query_obj.eval(env)
            if lazy_result is not None:
                # evaluate lazy results — forcing each iterator drives the
                # actual data fetch and emission
                for r in lazy_result:
                    list(r) if r else r
        except KeyboardInterrupt:
            print('\nExport aborted')
            return

    if checkpoint_manager:
        checkpoint_manager.set_final_checkpoint()
def main_with_args(args):
    """Legacy CLI entry point: compile the query, evaluate it against
    CommCare HQ, and write results with the selected writer.

    NOTE(review): this block used Python 2-only syntax (`print` statements,
    `raw_input`) and contained a redaction-scrubbed credential/API-client
    span ('******' residue). It has been modernised to Python 3 and the
    scrubbed span reconstructed from the surrounding code — the marked
    lines below must be verified against the original.
    """
    # Reads as excel if it is a file name that looks like excel, otherwise reads as JSON,
    # falling back to parsing arg directly as JSON, and finally parsing stdin as JSON
    if args.query:
        if os.path.exists(args.query):
            if os.path.splitext(args.query)[1] in ['.xls', '.xlsx']:
                import openpyxl
                workbook = openpyxl.load_workbook(args.query)
                query = excel_query.compile_workbook(workbook)
            else:
                with open(args.query) as fh:
                    query = MiniLinq.from_jvalue(json.loads(fh.read()))
        else:
            query = MiniLinq.from_jvalue(json.loads(args.query))
    else:
        query = MiniLinq.from_jvalue(json.loads(sys.stdin.read()))

    if args.dump_query:
        print(json.dumps(query.to_jvalue(), indent=4))
        exit(0)

    if not args.username:
        args.username = input('Please provide a username: ')
    # --- reconstructed from redacted source: verify against upstream ---
    if not args.password:
        import getpass
        args.password = getpass.getpass('Please enter your password: ')
    # TODO(review): confirm client construction — redacted in source
    api_client = CommCareHqClient(args.commcare_hq, args.project).authenticated(
        username=args.username, password=args.password)
    # --- end reconstructed span ---

    if args.output_format == 'xlsx':
        writer = writers.Excel2007TableWriter(args.output)
    elif args.output_format == 'xls':
        writer = writers.Excel2003TableWriter(args.output)
    elif args.output_format == 'csv':
        writer = writers.CsvTableWriter(args.output)
    elif args.output_format == 'json':
        writer = writers.JValueTableWriter()
    elif args.output_format == 'markdown':
        writer = writers.StreamingMarkdownTableWriter(sys.stdout)
    elif args.output_format == 'sql':
        writer = writers.SqlTableWriter(args.output)  # Output should be a connection URL

    env = BuiltInEnv() | CommCareHqEnv(api_client, since=dateutil.parser.parse(args.since) if args.since else None) | JsonPathEnv({})
    results = query.eval(env)

    # Assume that if any tables were emitted, that is the idea, otherwise print the output
    if len(list(env.emitted_tables())) > 0:
        with writer:
            for table in env.emitted_tables():
                logging.debug('Writing %s', table['name'])
                writer.write_table(table)
        if args.output_format == 'json':
            print(json.dumps(writer.tables, indent=4, default=RepeatableIterator.to_jvalue))
    else:
        print(json.dumps(list(results), indent=4, default=RepeatableIterator.to_jvalue))
def main_with_args(args):
    """Main CLI entry point: build the query, resolve credentials and
    checkpoints, evaluate against CommCare HQ, and return an exit status.

    NOTE(review): the span between the username prompt and the start-over
    warning was redaction-scrubbed in the source ('******' residue); the
    password prompt and since/until derivation below are reconstructions
    and must be verified against the original.
    """
    logger.info("CommCare Export Version {}".format(__version__))
    writer = _get_writer(args.output_format, args.output, args.strict_types)

    if args.query is None and args.users is False and args.locations is False:
        print(
            'At least one the following arguments is required: '
            '--query, --users, --locations',
            file=sys.stderr)
        return EXIT_STATUS_ERROR

    # Optionally force organization columns onto every emitted table.
    column_enforcer = None
    if args.with_organization:
        column_enforcer = builtin_queries.ColumnEnforcer()

    commcarehq_base_url = commcare_hq_aliases.get(args.commcare_hq, args.commcare_hq)
    api_client = _get_api_client(args, commcarehq_base_url)
    lp = LocationInfoProvider(api_client, page_size=args.batch_size)
    try:
        query = get_queries(args, writer, lp, column_enforcer)
    except DataExportException as e:
        print(e.message, file=sys.stderr)
        return EXIT_STATUS_ERROR

    if args.dump_query:
        print(json.dumps(query.to_jvalue(), indent=4))
        return

    # Only checkpoint-capable writers (e.g. SQL) can resume a previous run.
    checkpoint_manager = None
    if writer.support_checkpoints:
        checkpoint_manager = _get_checkpoint_manager(args)

    if not args.username:
        args.username = input('Please provide a username: ')
    # --- reconstructed from redacted source: verify against upstream ---
    if not args.password:
        import getpass
        args.password = getpass.getpass('Please enter your password: ')

    # TODO(review): confirm how since/until are derived — redacted in source
    since, until = get_date_params(args)
    if args.start_over:
        if checkpoint_manager:
            logger.warning(
                'Ignoring all checkpoints and re-fetching all data from CommCare.'
            )
    # --- end reconstructed span ---
    elif since:
        logger.debug('Starting from %s', args.since)

    cm = CheckpointManagerProvider(checkpoint_manager, since, args.start_over)
    static_env = {
        'commcarehq_base_url': commcarehq_base_url,
        'get_checkpoint_manager': cm.get_checkpoint_manager,
        'get_location_info': lp.get_location_info,
        'get_location_ancestor': lp.get_location_ancestor
    }
    env = (BuiltInEnv(static_env)
           | CommCareHqEnv(api_client, until=until, page_size=args.batch_size)
           | JsonPathEnv({})
           | EmitterEnv(writer))

    exit_status = evaluate_query(env, query)

    # The JValue writer accumulates tables in memory; dump them at the end.
    if args.output_format == 'json':
        print(
            json.dumps(
                list(writer.tables.values()),
                indent=4,
                default=default_to_json))

    return exit_status
def main_with_args(args):
    """CLI entry point (sql-run-tracking era): compile the query, resume
    `--since` from the commcare_export_runs table when possible, evaluate,
    and record this run on success.

    NOTE(review): the credential/API-client span was redaction-scrubbed in
    the source ('******' residue); the marked lines below are reconstructions
    and must be verified. `logger.warn` was also replaced with the
    non-deprecated `logger.warning`.
    """
    # Grab the timestamp here so that anything that comes in while this runs
    # will be grabbed next time.
    run_start = datetime.utcnow()

    # Reads as excel if it is a file name that looks like excel, otherwise reads as JSON,
    # falling back to parsing arg directly as JSON, and finally parsing stdin as JSON
    if args.query:
        if os.path.exists(args.query):
            query_file_md5 = misc.digest_file(args.query)
            if os.path.splitext(args.query)[1] in ['.xls', '.xlsx']:
                import openpyxl
                workbook = openpyxl.load_workbook(args.query)
                query = excel_query.compile_workbook(workbook)
            else:
                with io.open(args.query, encoding='utf-8') as fh:
                    query = MiniLinq.from_jvalue(json.loads(fh.read()))
        else:
            print('Query file not found: %s' % args.query)
            exit(1)
    else:
        query = MiniLinq.from_jvalue(json.loads(sys.stdin.read()))

    if args.dump_query:
        print(json.dumps(query.to_jvalue(), indent=4))
        exit(0)

    if not args.username:
        args.username = input('Please provide a username: ')
    # --- reconstructed from redacted source: verify against upstream ---
    if not args.password:
        import getpass
        args.password = getpass.getpass('Please enter your password: ')
    # TODO(review): confirm client construction — redacted in source
    api_client = CommCareHqClient(args.commcare_hq, args.project).authenticated(
        username=args.username, password=args.password)
    # --- end reconstructed span ---

    if args.output_format == 'xlsx':
        writer = writers.Excel2007TableWriter(args.output)
    elif args.output_format == 'xls':
        writer = writers.Excel2003TableWriter(args.output)
    elif args.output_format == 'csv':
        if not args.output.endswith(".zip"):
            print("WARNING: csv output is a zip file, but "
                  "will be written to %s" % args.output)
            print("Consider appending .zip to the file name to avoid confusion.")
        writer = writers.CsvTableWriter(args.output)
    elif args.output_format == 'json':
        writer = writers.JValueTableWriter()
    elif args.output_format == 'markdown':
        writer = writers.StreamingMarkdownTableWriter(sys.stdout)
    elif args.output_format == 'sql':
        # Output should be a connection URL
        # Writer had bizarre issues so we use a full connection instead of passing in a URL or engine
        import sqlalchemy
        engine = sqlalchemy.create_engine(args.output)
        writer = writers.SqlTableWriter(engine.connect())

        if not args.since and not args.start_over and os.path.exists(args.query):
            connection = sqlalchemy.create_engine(args.output)

            # Grab the current list of tables to see if we have already run & written to it
            metadata = sqlalchemy.MetaData()
            metadata.bind = connection
            metadata.reflect()

            if 'commcare_export_runs' in metadata.tables:
                cursor = connection.execute(
                    sqlalchemy.sql.text('SELECT time_of_run FROM commcare_export_runs WHERE query_file_md5 = :query_file_md5 ORDER BY time_of_run DESC'),
                    query_file_md5=query_file_md5)
                # Rows are newest-first, so the first row is the last run.
                for row in cursor:
                    args.since = row[0]
                    logger.debug('Last successful run was %s', args.since)
                    break
                cursor.close()
            else:
                logger.warning('No successful runs found, and --since not specified: will import ALL data')

    if args.since:
        logger.debug('Starting from %s', args.since)

    env = BuiltInEnv() | CommCareHqEnv(api_client, since=dateutil.parser.parse(args.since) if args.since else None) | JsonPathEnv({})
    results = query.eval(env)

    # Assume that if any tables were emitted, that is the idea, otherwise print the output
    if len(list(env.emitted_tables())) > 0:
        with writer:
            for table in env.emitted_tables():
                logger.debug('Writing %s', table['name'])
                writer.write_table(table)

            # Record this run so the next invocation can resume from it.
            if args.output_format == 'sql' and os.path.exists(args.query):
                writer.write_table({
                    'name': 'commcare_export_runs',
                    'headings': ['id', 'query_file_name', 'query_file_md5', 'time_of_run'],
                    'rows': [
                        [uuid.uuid4().hex, args.query, query_file_md5, run_start.isoformat()]
                    ]
                })

        if args.output_format == 'json':
            print(json.dumps(writer.tables, indent=4, default=RepeatableIterator.to_jvalue))
    else:
        print(json.dumps(list(results), indent=4, default=RepeatableIterator.to_jvalue))
def main_with_args(args):
    """CLI entry point (checkpointer era): compile the query (honouring
    `--missing-value`), resume `--since` from commcare_export_runs when
    possible, evaluate with since/until bounds, and checkpoint on success.

    NOTE(review): the credential/base-url/API-client span was
    redaction-scrubbed in the source ('******' residue); the marked lines
    below are reconstructions and must be verified. `logger.warn` was also
    replaced with the non-deprecated `logger.warning`.
    """
    # Grab the timestamp here so that anything that comes in while this runs
    # will be grabbed next time.
    run_start = datetime.utcnow()

    # Reads as excel if it is a file name that looks like excel, otherwise reads as JSON,
    # falling back to parsing arg directly as JSON, and finally parsing stdin as JSON
    if args.query:
        if os.path.exists(args.query):
            query_file_md5 = misc.digest_file(args.query)
            if os.path.splitext(args.query)[1] in ['.xls', '.xlsx']:
                import openpyxl
                workbook = openpyxl.load_workbook(args.query)
                query = excel_query.compile_workbook(workbook, args.missing_value)
            else:
                with io.open(args.query, encoding='utf-8') as fh:
                    query = MiniLinq.from_jvalue(json.loads(fh.read()))
        else:
            print('Query file not found: %s' % args.query)
            exit(1)
    else:
        query = MiniLinq.from_jvalue(json.loads(sys.stdin.read()))

    if args.dump_query:
        print(json.dumps(query.to_jvalue(), indent=4))
        exit(0)

    if not args.username:
        args.username = input('Please provide a username: ')
    # --- reconstructed from redacted source: verify against upstream ---
    if not args.password:
        import getpass
        args.password = getpass.getpass('Please enter your password: ')
    # TODO(review): confirm base-url resolution and client construction —
    # redacted in source; `commcarehq_base_url` is required by BuiltInEnv below
    commcarehq_base_url = commcare_hq_aliases.get(args.commcare_hq, args.commcare_hq)
    api_client = CommCareHqClient(commcarehq_base_url, args.project).authenticated(
        username=args.username, password=args.password)
    # --- end reconstructed span ---

    if args.output_format == 'xlsx':
        writer = writers.Excel2007TableWriter(args.output)
    elif args.output_format == 'xls':
        writer = writers.Excel2003TableWriter(args.output)
    elif args.output_format == 'csv':
        if not args.output.endswith(".zip"):
            print("WARNING: csv output is a zip file, but "
                  "will be written to %s" % args.output)
            print(
                "Consider appending .zip to the file name to avoid confusion.")
        writer = writers.CsvTableWriter(args.output)
    elif args.output_format == 'json':
        writer = writers.JValueTableWriter()
    elif args.output_format == 'markdown':
        writer = writers.StreamingMarkdownTableWriter(sys.stdout)
    elif args.output_format == 'sql':
        # Output should be a connection URL
        # Writer had bizarre issues so we use a full connection instead of passing in a URL or engine
        import sqlalchemy
        engine = sqlalchemy.create_engine(args.output)
        # MySQL needs a binary collation for case-sensitive key columns.
        is_mysql = 'mysql' in args.output
        collation = 'utf8_bin' if is_mysql else None
        writer = writers.SqlTableWriter(engine.connect(), args.strict_types, collation=collation)

        if not args.since and not args.start_over and os.path.exists(args.query):
            connection = sqlalchemy.create_engine(args.output)

            # Grab the current list of tables to see if we have already run & written to it
            metadata = sqlalchemy.MetaData()
            metadata.bind = connection
            metadata.reflect()

            if 'commcare_export_runs' in metadata.tables:
                cursor = connection.execute(
                    sqlalchemy.sql.text('SELECT time_of_run FROM commcare_export_runs WHERE query_file_md5 = :query_file_md5 ORDER BY time_of_run DESC'),
                    query_file_md5=query_file_md5)
                # Rows are newest-first, so the first row is the last run.
                for row in cursor:
                    args.since = row[0]
                    logger.debug('Last successful run was %s', args.since)
                    break
                cursor.close()
            else:
                logger.warning(
                    'No successful runs found, and --since not specified: will import ALL data'
                )

    if args.since:
        logger.debug('Starting from %s', args.since)

    since = dateutil.parser.parse(args.since) if args.since else None
    until = dateutil.parser.parse(args.until) if args.until else None
    env = BuiltInEnv({
        'commcarehq_base_url': commcarehq_base_url
    }) | CommCareHqEnv(api_client, since=since, until=until) | JsonPathEnv({})
    results = query.eval(env)

    # Assume that if any tables were emitted, that is the idea, otherwise print the output
    if len(list(env.emitted_tables())) > 0:
        with writer:
            api_client.set_checkpointer(writer, query=args.query, query_md5=query_file_md5)
            for table in env.emitted_tables():
                logger.debug('Writing %s', table['name'])
                if table['name'] != table['name'].lower():
                    logger.warning("Caution: Using upper case letters in a "
                                   "table name is not advised: {}".format(
                                       table['name']))
                writer.write_table(table)
            # Mark this run complete so the next invocation resumes from it.
            if os.path.exists(args.query):
                writer.set_checkpoint(args.query, query_file_md5, run_start, True)

        if args.output_format == 'json':
            print(
                json.dumps(
                    writer.tables,
                    indent=4,
                    default=RepeatableIterator.to_jvalue))
    else:
        print(
            json.dumps(
                list(results),
                indent=4,
                default=RepeatableIterator.to_jvalue))