Example #1
0
    def check_digest(self, contents):
        with tempfile.NamedTemporaryFile(prefix='commcare-export-test-', mode='wb') as file:
            file.write(contents) 
            file.flush()
            file_digest = misc.digest_file(file.name)

        assert file_digest == hashlib.md5(contents).hexdigest() # Make sure the chunking does not mess with stuff
Example #2
0
    def check_digest(self, contents):
        with tempfile.NamedTemporaryFile(prefix='commcare-export-test-',
                                         mode='wb') as file:
            file.write(contents)
            file.flush()
            file_digest = misc.digest_file(file.name)

        assert file_digest == hashlib.md5(contents).hexdigest(
        )  # Make sure the chunking does not mess with stuff
Example #3
0
def get_checkpoint_manager(args, require_query=True):
    md5 = None
    try:
        md5 = misc.digest_file(args.query)
    except Exception:
        if require_query:
            raise

    return CheckpointManager(
        args.output, args.query, md5,
        args.project, args.commcare_hq, args.checkpoint_key
    )
Example #4
0
def get_checkpoint_manager(args, require_query=True):
    md5 = None
    try:
        md5 = misc.digest_file(args.query)
    except Exception:
        if require_query:
            raise

    return CheckpointManager(
        args.output, args.query, md5,
        args.project, args.commcare_hq, args.checkpoint_key
    )
Example #5
0
    def handle(self, project, query, output_format, output, limit, **options):
        # note: this is heavily copy/paste/modified from commcare_export.cli
        commcare_hq = 'local_commcare_export'
        try:
            # local development only
            sys.path.append(os.path.join(os.getcwd(), 'lib',
                                         'commcare-export'))
            import commcare_export  # noqa
        except ImportError:
            raise CommandError(
                'This command requires commcare-export to be installed! '
                'Please run: pip install commcare-export. You may also need to run: '
                'pip install openpyxl==2.6.0b1 '
                'afterwards to run CommCare due to version incompatibilities.')
        from commcare_export import misc
        from commcare_export.checkpoint import CheckpointManager
        from commcare_export.cli import _get_writer, _get_query_from_file
        from commcare_export.commcare_minilinq import CommCareHqEnv
        from commcare_export.env import BuiltInEnv, JsonPathEnv, EmitterEnv

        print('commcare-export is installed.')

        writer = _get_writer(output_format, output, strict_types=False)

        query_obj = _get_query_from_file(
            query,
            None,  # missing_value
            writer.supports_multi_table_write,
            writer.max_column_length,
            writer.required_columns)
        checkpoint_manager = None
        if writer.support_checkpoints:
            md5 = misc.digest_file(query)
            checkpoint_manager = CheckpointManager(
                output,
                query,
                md5,
                project,
                commcare_hq,
            )
            since = checkpoint_manager.get_time_of_last_checkpoint()

        else:
            since = None

        commcarehq_base_url = commcare_hq
        api_client = LocalCommCareHqClient(
            url=commcarehq_base_url,
            project=project,
            limit=limit,
            checkpoint_manager=checkpoint_manager)
        if since is not None:
            since = dateutil.parser.parse(since)

        env = (BuiltInEnv({'commcarehq_base_url': commcarehq_base_url})
               | CommCareHqEnv(api_client, since=since)
               | JsonPathEnv({})
               | EmitterEnv(writer))

        with env:
            try:
                lazy_result = query_obj.eval(env)
                if lazy_result is not None:
                    # evaluate lazy results
                    for r in lazy_result:
                        list(r) if r else r
            except KeyboardInterrupt:
                print('\nExport aborted')
                return

        if checkpoint_manager:
            checkpoint_manager.set_final_checkpoint()
Example #6
0
def main_with_args(args):
    # Grab the timestamp here so that anything that comes in while this runs will be grabbed next time.
    run_start = datetime.utcnow()

    # Reads as excel if it is a file name that looks like excel, otherwise reads as JSON,
    # falling back to parsing arg directly as JSON, and finally parsing stdin as JSON
    if args.query:
        if os.path.exists(args.query):
            query_file_md5 = misc.digest_file(args.query)
            if os.path.splitext(args.query)[1] in [".xls", ".xlsx"]:
                import openpyxl

                workbook = openpyxl.load_workbook(args.query)
                query = excel_query.compile_workbook(workbook)
            else:
                with io.open(args.query, encoding="utf-8") as fh:
                    query = MiniLinq.from_jvalue(json.loads(fh.read()))
        else:
            print("Query file not found: %s" % args.query)
            exit(1)
    else:
        query = MiniLinq.from_jvalue(json.loads(sys.stdin.read()))

    if args.dump_query:
        print(json.dumps(query.to_jvalue(), indent=4))
        exit(0)

    if not args.username:
        args.username = input("Please provide a username: "******"xlsx":
        writer = writers.Excel2007TableWriter(args.output)
    elif args.output_format == "xls":
        writer = writers.Excel2003TableWriter(args.output)
    elif args.output_format == "csv":
        writer = writers.CsvTableWriter(args.output)
    elif args.output_format == "json":
        writer = writers.JValueTableWriter()
    elif args.output_format == "markdown":
        writer = writers.StreamingMarkdownTableWriter(sys.stdout)
    elif args.output_format == "sql":
        writer = writers.SqlTableWriter(args.output)  # Output should be a connection URL

        if not args.since and not args.start_over and os.path.exists(args.query):
            connection = sqlalchemy.create_engine(args.output)

            # Grab the current list of tables to see if we have already run & written to it
            metadata = sqlalchemy.MetaData()
            metadata.bind = connection
            metadata.reflect()

            if "commcare_export_runs" in metadata.tables:
                cursor = connection.execute(
                    sqlalchemy.sql.text(
                        "SELECT time_of_run FROM commcare_export_runs WHERE query_file_md5 = :query_file_md5 ORDER BY time_of_run DESC"
                    ),
                    query_file_md5=query_file_md5,
                )
                for row in cursor:
                    args.since = row[0]
                    logger.debug("Last successful run was %s", args.since)
                    break
                cursor.close()
            else:
                logger.warn("No successful runs found, and --since not specified: will import ALL data")

    if args.since:
        logger.debug("Starting from %s", args.since)
    env = (
        BuiltInEnv()
        | CommCareHqEnv(api_client, since=dateutil.parser.parse(args.since) if args.since else None)
        | JsonPathEnv({})
    )
    results = query.eval(env)

    # Assume that if any tables were emitted, that is the idea, otherwise print the output
    if len(list(env.emitted_tables())) > 0:
        with writer:
            for table in env.emitted_tables():
                logger.debug("Writing %s", table["name"])
                writer.write_table(table)

            if args.output_format == "sql" and os.path.exists(args.query):
                writer.write_table(
                    {
                        "name": "commcare_export_runs",
                        "headings": ["id", "query_file_name", "query_file_md5", "time_of_run"],
                        "rows": [[uuid.uuid4().hex, args.query, query_file_md5, run_start.isoformat()]],
                    }
                )

        if args.output_format == "json":
            print(json.dumps(writer.tables, indent=4, default=RepeatableIterator.to_jvalue))
    else:
        print(json.dumps(list(results), indent=4, default=RepeatableIterator.to_jvalue))
    def handle(self, project, query, output_format, output, limit, **options):
        # note: this is heavily copy/paste/modified from commcare_export.cli
        commcare_hq = 'local_commcare_export'
        try:
            # local development only
            sys.path.append(os.path.join(os.getcwd(), 'lib', 'commcare-export'))
            import commcare_export  # noqa
        except ImportError:
            raise CommandError(
                'This command requires commcare-export to be installed! '
                'Please run: pip install commcare-export. You may also need to run: '
                'pip install openpyxl==2.6.0b1 '
                'afterwards to run CommCare due to version incompatibilities.'
            )
        from commcare_export import misc
        from commcare_export.checkpoint import CheckpointManager
        from commcare_export.cli import _get_writer, _get_query_from_file
        from commcare_export.commcare_minilinq import CommCareHqEnv
        from commcare_export.env import BuiltInEnv, JsonPathEnv, EmitterEnv

        print('commcare-export is installed.')

        writer = _get_writer(output_format, output, strict_types=False)

        query_obj = _get_query_from_file(
            query,
            None,  # missing_value
            writer.supports_multi_table_write,
            writer.max_column_length,
            writer.required_columns
        )
        checkpoint_manager = None
        if writer.support_checkpoints:
            md5 = misc.digest_file(query)
            checkpoint_manager = CheckpointManager(
                output,
                query,
                md5,
                project,
                commcare_hq,
            )
            since = checkpoint_manager.get_time_of_last_checkpoint()

        else:
            since = None

        commcarehq_base_url = commcare_hq
        api_client = LocalCommCareHqClient(
            url=commcarehq_base_url,
            project=project,
            limit=limit,
            checkpoint_manager=checkpoint_manager
        )
        if since is not None:
            since = dateutil.parser.parse(since)

        env = (
            BuiltInEnv({'commcarehq_base_url': commcarehq_base_url})
            | CommCareHqEnv(api_client, since=since)
            | JsonPathEnv({})
            | EmitterEnv(writer)
        )

        with env:
            try:
                lazy_result = query_obj.eval(env)
                if lazy_result is not None:
                    # evaluate lazy results
                    for r in lazy_result:
                        list(r) if r else r
            except KeyboardInterrupt:
                print('\nExport aborted')
                return

        if checkpoint_manager:
            checkpoint_manager.set_final_checkpoint()
Example #8
0
def main_with_args(args):
    # Grab the timestamp here so that anything that comes in while this runs will be grabbed next time.
    run_start = datetime.utcnow()
    
    # Reads as excel if it is a file name that looks like excel, otherwise reads as JSON, 
    # falling back to parsing arg directly as JSON, and finally parsing stdin as JSON
    if args.query:
        if os.path.exists(args.query):
            query_file_md5 = misc.digest_file(args.query)
            if os.path.splitext(args.query)[1] in ['.xls', '.xlsx']:
                import openpyxl
                workbook = openpyxl.load_workbook(args.query)
                query = excel_query.compile_workbook(workbook)
            else:
                with io.open(args.query, encoding='utf-8') as fh:
                    query = MiniLinq.from_jvalue(json.loads(fh.read()))
        else:
            print('Query file not found: %s' % args.query)
            exit(1)
    else:
        query = MiniLinq.from_jvalue(json.loads(sys.stdin.read()))

    if args.dump_query:
        print(json.dumps(query.to_jvalue(), indent=4))
        exit(0)

    if not args.username:
        args.username = input('Please provide a username: '******'xlsx':
        writer = writers.Excel2007TableWriter(args.output)
    elif args.output_format == 'xls':
        writer = writers.Excel2003TableWriter(args.output)
    elif args.output_format == 'csv':
        if not args.output.endswith(".zip"):
            print("WARNING: csv output is a zip file, but "
                  "will be written to %s" % args.output)
            print("Consider appending .zip to the file name to avoid confusion.")
        writer = writers.CsvTableWriter(args.output)
    elif args.output_format == 'json':
        writer = writers.JValueTableWriter()
    elif args.output_format == 'markdown':
        writer = writers.StreamingMarkdownTableWriter(sys.stdout) 
    elif args.output_format == 'sql':
        # Output should be a connection URL
        # Writer had bizarre issues so we use a full connection instead of passing in a URL or engine
        import sqlalchemy
        engine = sqlalchemy.create_engine(args.output)
        writer = writers.SqlTableWriter(engine.connect())

        if not args.since and not args.start_over and os.path.exists(args.query):
            connection = sqlalchemy.create_engine(args.output)

            # Grab the current list of tables to see if we have already run & written to it
            metadata = sqlalchemy.MetaData()
            metadata.bind = connection
            metadata.reflect()

            if 'commcare_export_runs' in metadata.tables:
                cursor = connection.execute(sqlalchemy.sql.text('SELECT time_of_run FROM commcare_export_runs WHERE query_file_md5 = :query_file_md5 ORDER BY time_of_run DESC'), query_file_md5=query_file_md5)
                for row in cursor:
                    args.since = row[0]
                    logger.debug('Last successful run was %s', args.since)
                    break
                cursor.close()
            else:
                logger.warn('No successful runs found, and --since not specified: will import ALL data')

    if args.since:
        logger.debug('Starting from %s', args.since)
    env = BuiltInEnv() | CommCareHqEnv(api_client, since=dateutil.parser.parse(args.since) if args.since else None) | JsonPathEnv({}) 
    results = query.eval(env)

    # Assume that if any tables were emitted, that is the idea, otherwise print the output
    if len(list(env.emitted_tables())) > 0:
        with writer:
            for table in env.emitted_tables():
                logger.debug('Writing %s', table['name'])
                writer.write_table(table)

            if args.output_format == 'sql' and os.path.exists(args.query):
                writer.write_table({
                    'name': 'commcare_export_runs',
                    'headings': ['id', 'query_file_name', 'query_file_md5', 'time_of_run'],
                    'rows': [ [uuid.uuid4().hex, args.query, query_file_md5, run_start.isoformat()] ]
                })

        if args.output_format == 'json':
            print(json.dumps(writer.tables, indent=4, default=RepeatableIterator.to_jvalue))
    else:
        print(json.dumps(list(results), indent=4, default=RepeatableIterator.to_jvalue))
Example #9
0
def main_with_args(args):
    # Grab the timestamp here so that anything that comes in while this runs will be grabbed next time.
    run_start = datetime.utcnow()

    # Reads as excel if it is a file name that looks like excel, otherwise reads as JSON,
    # falling back to parsing arg directly as JSON, and finally parsing stdin as JSON
    if args.query:
        if os.path.exists(args.query):
            query_file_md5 = misc.digest_file(args.query)
            if os.path.splitext(args.query)[1] in ['.xls', '.xlsx']:
                import openpyxl
                workbook = openpyxl.load_workbook(args.query)
                query = excel_query.compile_workbook(workbook,
                                                     args.missing_value)
            else:
                with io.open(args.query, encoding='utf-8') as fh:
                    query = MiniLinq.from_jvalue(json.loads(fh.read()))
        else:
            print('Query file not found: %s' % args.query)
            exit(1)
    else:
        query = MiniLinq.from_jvalue(json.loads(sys.stdin.read()))

    if args.dump_query:
        print(json.dumps(query.to_jvalue(), indent=4))
        exit(0)

    if not args.username:
        args.username = input('Please provide a username: '******'xlsx':
        writer = writers.Excel2007TableWriter(args.output)
    elif args.output_format == 'xls':
        writer = writers.Excel2003TableWriter(args.output)
    elif args.output_format == 'csv':
        if not args.output.endswith(".zip"):
            print("WARNING: csv output is a zip file, but "
                  "will be written to %s" % args.output)
            print(
                "Consider appending .zip to the file name to avoid confusion.")
        writer = writers.CsvTableWriter(args.output)
    elif args.output_format == 'json':
        writer = writers.JValueTableWriter()
    elif args.output_format == 'markdown':
        writer = writers.StreamingMarkdownTableWriter(sys.stdout)
    elif args.output_format == 'sql':
        # Output should be a connection URL
        # Writer had bizarre issues so we use a full connection instead of passing in a URL or engine
        import sqlalchemy
        engine = sqlalchemy.create_engine(args.output)
        is_mysql = 'mysql' in args.output
        collation = 'utf8_bin' if is_mysql else None
        writer = writers.SqlTableWriter(engine.connect(),
                                        args.strict_types,
                                        collation=collation)

        if not args.since and not args.start_over and os.path.exists(
                args.query):
            connection = sqlalchemy.create_engine(args.output)

            # Grab the current list of tables to see if we have already run & written to it
            metadata = sqlalchemy.MetaData()
            metadata.bind = connection
            metadata.reflect()

            if 'commcare_export_runs' in metadata.tables:
                cursor = connection.execute(sqlalchemy.sql.text(
                    'SELECT time_of_run FROM commcare_export_runs WHERE query_file_md5 = :query_file_md5 ORDER BY time_of_run DESC'
                ),
                                            query_file_md5=query_file_md5)
                for row in cursor:
                    args.since = row[0]
                    logger.debug('Last successful run was %s', args.since)
                    break
                cursor.close()
            else:
                logger.warn(
                    'No successful runs found, and --since not specified: will import ALL data'
                )

    if args.since:
        logger.debug('Starting from %s', args.since)
    since = dateutil.parser.parse(args.since) if args.since else None
    until = dateutil.parser.parse(args.until) if args.until else None
    env = BuiltInEnv({
        'commcarehq_base_url': commcarehq_base_url
    }) | CommCareHqEnv(api_client, since=since, until=until) | JsonPathEnv({})
    results = query.eval(env)

    # Assume that if any tables were emitted, that is the idea, otherwise print the output
    if len(list(env.emitted_tables())) > 0:
        with writer:
            api_client.set_checkpointer(writer,
                                        query=args.query,
                                        query_md5=query_file_md5)
            for table in env.emitted_tables():
                logger.debug('Writing %s', table['name'])
                if table['name'] != table['name'].lower():
                    logger.warning("Caution: Using upper case letters in a "
                                   "table name is not advised: {}".format(
                                       table['name']))
                writer.write_table(table)

            if os.path.exists(args.query):
                writer.set_checkpoint(args.query, query_file_md5, run_start,
                                      True)

        if args.output_format == 'json':
            print(
                json.dumps(writer.tables,
                           indent=4,
                           default=RepeatableIterator.to_jvalue))
    else:
        print(
            json.dumps(list(results),
                       indent=4,
                       default=RepeatableIterator.to_jvalue))