def bulk_import_async(import_id, config, domain, excel_id):
    task = bulk_import_async
    excel_ref = DownloadBase.get(excel_id)
    spreadsheet = importer_util.get_spreadsheet(excel_ref, config.named_columns)
    if not spreadsheet:
        return {'error': 'EXPIRED'}
    if spreadsheet.has_errors:
        return {'error': 'HAS_ERRORS'}

    row_count = spreadsheet.get_num_rows()
    columns = spreadsheet.get_header_columns()
    match_count = created_count = too_many_matches = 0
    blank_external_ids = []
    invalid_dates = []
    prime_offset = 1  # used to prevent back-to-back priming

    user = CouchUser.get_by_user_id(config.couch_user_id, domain)
    username = user.username
    user_id = user._id

    for i in range(row_count):
        DownloadBase.set_progress(task, i, row_count)

        # skip first row if it is a header field
        if i == 0 and config.named_columns:
            continue

        priming_progress = match_count + created_count + prime_offset
        if priming_progress % PRIME_VIEW_FREQUENCY == 0:
            prime_views(POOL_SIZE)
            # increment so we can't possibly prime on next iteration
            prime_offset += 1

        row = spreadsheet.get_row(i)
        search_id = importer_util.parse_search_id(config, columns, row)
        if config.search_field == 'external_id' and not search_id:
            # do not allow blank external id since we save this
            blank_external_ids.append(i + 1)
            continue

        case, error = importer_util.lookup_case(config.search_field, search_id, domain)

        try:
            fields_to_update = importer_util.populate_updated_fields(
                config,
                columns,
                row
            )
        except importer_util.InvalidDateException:
            invalid_dates.append(i + 1)
            continue

        if case:
            match_count += 1
        elif error == LookupErrors.NotFound:
            if not config.create_new_cases:
                continue
            created_count += 1
        elif error == LookupErrors.MultipleResults:
            too_many_matches += 1
            continue

        if 'owner_id' in fields_to_update:
            owner_id = fields_to_update['owner_id']
            del fields_to_update['owner_id']
        else:
            owner_id = user_id

        if not case:
            id = uuid.uuid4().hex

            caseblock = CaseBlock(
                create=True,
                case_id=id,
                version=V2,
                user_id=user_id,
                owner_id=owner_id,
                case_type=config.case_type,
                external_id=search_id if config.search_field == 'external_id' else '',
                update=fields_to_update
            )
            submit_case_block(caseblock, domain, username, user_id)
        elif case and case.type == config.case_type:
            caseblock = CaseBlock(
                create=False,
                case_id=case._id,
                owner_id=owner_id,
                version=V2,
                update=fields_to_update
            )
            submit_case_block(caseblock, domain, username, user_id)

    return {
        'created_count': created_count,
        'match_count': match_count,
        'too_many_matches': too_many_matches,
        'blank_externals': blank_external_ids,
        'invalid_dates': invalid_dates,
    }
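# A minimal, standalone sketch (not part of the importer) of the prime_offset
# throttle used in the loops above and below. Priming fires when the running
# progress count crosses a multiple of the frequency; bumping the offset after
# each prime guarantees that a following row which leaves the counts untouched
# (for example, a row that is skipped with `continue`) cannot hit the same
# multiple and prime again back-to-back. PRIME_VIEW_FREQUENCY and prime_views
# here are stand-ins for the real constants and helpers.

PRIME_VIEW_FREQUENCY = 500


def prime_views(pool_size):
    # placeholder for the real view-priming call
    print("priming views with pool size %s" % pool_size)


def simulate(rows):
    match_count = created_count = 0
    prime_offset = 1  # used to prevent back-to-back priming
    for i in range(rows):
        priming_progress = match_count + created_count + prime_offset
        if priming_progress % PRIME_VIEW_FREQUENCY == 0:
            prime_views(4)
            # without this increment a skipped row (which leaves the counts
            # unchanged) would hit the same multiple again next iteration
            prime_offset += 1
        if i % 3 == 0:
            continue  # pretend the row had an error and was skipped
        match_count += 1


if __name__ == '__main__':
    simulate(2000)  # primes roughly every PRIME_VIEW_FREQUENCY processed rows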
def do_import(spreadsheet, config, domain, task=None, chunksize=CASEBLOCK_CHUNKSIZE):
    if not spreadsheet:
        return {"errors": "EXPIRED"}
    if spreadsheet.has_errors:
        return {"errors": "HAS_ERRORS"}

    row_count = spreadsheet.get_num_rows()
    columns = spreadsheet.get_header_columns()
    match_count = created_count = too_many_matches = num_chunks = 0
    errors = importer_util.ImportErrorDetail()
    prime_offset = 1  # used to prevent back-to-back priming

    user = CouchUser.get_by_user_id(config.couch_user_id, domain)
    username = user.username
    user_id = user._id

    # keep a cache of id lookup successes to help performance
    id_cache = {}
    name_cache = {}

    caseblocks = []
    ids_seen = set()

    def _submit_caseblocks(caseblocks):
        if caseblocks:
            submit_case_blocks(
                [ElementTree.tostring(cb.as_xml()) for cb in caseblocks],
                domain,
                username,
                user_id,
            )

    for i in range(row_count):
        if task:
            DownloadBase.set_progress(task, i, row_count)

        # skip first row if it is a header field
        if i == 0 and config.named_columns:
            continue

        if not is_bigcouch():
            priming_progress = match_count + created_count + prime_offset
            if priming_progress % PRIME_VIEW_FREQUENCY == 0:
                prime_views(POOL_SIZE)
                # increment so we can't possibly prime on next iteration
                prime_offset += 1

        row = spreadsheet.get_row(i)
        search_id = importer_util.parse_search_id(config, columns, row)
        if config.search_field == "external_id" and not search_id:
            # do not allow blank external id since we save this
            errors.add(ImportErrors.BlankExternalId, i + 1)
            continue

        try:
            fields_to_update = importer_util.populate_updated_fields(
                config,
                columns,
                row,
                spreadsheet.workbook.datemode
            )
            if not any(fields_to_update.values()):
                # if the row was blank, just skip it, no errors
                continue
        except importer_util.InvalidDateException:
            errors.add(ImportErrors.InvalidDate, i + 1)
            continue

        external_id = fields_to_update.pop("external_id", None)
        parent_id = fields_to_update.pop("parent_id", None)
        parent_external_id = fields_to_update.pop("parent_external_id", None)
        parent_type = fields_to_update.pop("parent_type", config.case_type)
        parent_ref = fields_to_update.pop("parent_ref", "parent")
        to_close = fields_to_update.pop("close", False)

        if any([lookup_id and lookup_id in ids_seen
                for lookup_id in [search_id, parent_id, parent_external_id]]):
            # clear out the queue to make sure we've processed any potential
            # cases we want to look up
            # note: these three lines are repeated a few places, and could be converted
            # to a function that makes use of closures (and globals) to do the same thing,
            # but that seems sketchier than just being a little RY
            _submit_caseblocks(caseblocks)
            num_chunks += 1
            caseblocks = []
            # also clear ids_seen, since all the cases will now be in the database
            ids_seen = set()

        case, error = importer_util.lookup_case(config.search_field, search_id,
                                                domain, config.case_type)

        if case:
            if case.type != config.case_type:
                continue
        elif error == LookupErrors.NotFound:
            if not config.create_new_cases:
                continue
        elif error == LookupErrors.MultipleResults:
            too_many_matches += 1
            continue

        uploaded_owner_name = fields_to_update.pop("owner_name", None)
        uploaded_owner_id = fields_to_update.pop("owner_id", None)

        if uploaded_owner_name:
            # If an owner name was provided, replace the provided
            # uploaded_owner_id with the id of the provided group or owner
            try:
                uploaded_owner_id = importer_util.get_id_from_name(
                    uploaded_owner_name, domain, name_cache)
            except SQLLocation.MultipleObjectsReturned:
                errors.add(ImportErrors.DuplicateLocationName, i + 1)
                continue

            if not uploaded_owner_id:
                errors.add(ImportErrors.InvalidOwnerName, i + 1)
                continue

        if uploaded_owner_id:
            # If an owner_id mapping exists, verify it is a valid user
            # or case sharing group
            if importer_util.is_valid_id(uploaded_owner_id, domain, id_cache):
                owner_id = uploaded_owner_id
                id_cache[uploaded_owner_id] = True
            else:
                errors.add(ImportErrors.InvalidOwnerId, i + 1)
                id_cache[uploaded_owner_id] = False
                continue
        else:
            # if they didn't supply an owner_id mapping, default to current
            # user
            owner_id = user_id

        extras = {}
        if parent_id:
            try:
                parent_case = CommCareCase.get(parent_id)
                if parent_case.domain == domain:
                    extras["index"] = {parent_ref: (parent_case.type, parent_id)}
            except ResourceNotFound:
                errors.add(ImportErrors.InvalidParentId, i + 1)
                continue
        elif parent_external_id:
            parent_case, error = importer_util.lookup_case("external_id", parent_external_id,
                                                           domain, parent_type)
            if parent_case:
                extras["index"] = {parent_ref: (parent_type, parent_case._id)}

        if not case:
            id = uuid.uuid4().hex
            if config.search_field == "external_id":
                extras["external_id"] = search_id

            try:
                caseblock = CaseBlock(
                    create=True,
                    case_id=id,
                    owner_id=owner_id,
                    user_id=user_id,
                    case_type=config.case_type,
                    update=fields_to_update,
                    **extras
                )
                caseblocks.append(caseblock)
                created_count += 1
                if external_id:
                    ids_seen.add(external_id)
            except CaseBlockError:
                errors.add(ImportErrors.CaseGeneration, i + 1)
        else:
            if external_id:
                extras["external_id"] = external_id
            if uploaded_owner_id:
                extras["owner_id"] = owner_id
            if to_close == "yes":
                extras["close"] = True

            try:
                caseblock = CaseBlock(
                    create=False,
                    case_id=case._id,
                    update=fields_to_update,
                    **extras
                )
                caseblocks.append(caseblock)
                match_count += 1
            except CaseBlockError:
                errors.add(ImportErrors.CaseGeneration, i + 1)

        # check if we've reached a reasonable chunksize
        # and if so submit
        if len(caseblocks) >= chunksize:
            _submit_caseblocks(caseblocks)
            num_chunks += 1
            caseblocks = []

    # final purge of anything left in the queue
    _submit_caseblocks(caseblocks)
    num_chunks += 1

    return {
        "created_count": created_count,
        "match_count": match_count,
        "too_many_matches": too_many_matches,
        "errors": errors.as_dict(),
        "num_chunks": num_chunks,
    }
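# A self-contained sketch of the accumulate-and-flush pattern that do_import
# uses for case blocks: rows are buffered and submitted in chunks, and the
# buffer is force-flushed whenever a row refers to an id that is still sitting
# in the buffer (otherwise the subsequent lookup would miss it). The names
# `submit_chunk`, `import_rows`, and `rows` are illustrative only, not part of
# the importer API.

def submit_chunk(chunk):
    # placeholder for submit_case_blocks(...)
    print("submitting %d blocks" % len(chunk))


def import_rows(rows, chunksize=25):
    buffered = []
    ids_seen = set()
    num_chunks = 0
    for row_id, payload in rows:
        if row_id in ids_seen:
            # the row depends on something still in the buffer: flush first so
            # a later lookup against the database can find it
            submit_chunk(buffered)
            num_chunks += 1
            buffered = []
            ids_seen = set()
        buffered.append(payload)
        ids_seen.add(row_id)
        if len(buffered) >= chunksize:
            submit_chunk(buffered)
            num_chunks += 1
            buffered = []
    # final purge of anything left in the queue
    if buffered:
        submit_chunk(buffered)
        num_chunks += 1
    return num_chunks


if __name__ == '__main__':
    demo = [('case-%d' % (i % 40), {'n': i}) for i in range(100)]
    print(import_rows(demo))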
def do_import(spreadsheet, config, domain, task=None, chunksize=CASEBLOCK_CHUNKSIZE):
    if not spreadsheet:
        return {'error': 'EXPIRED'}
    if spreadsheet.has_errors:
        return {'error': 'HAS_ERRORS'}

    row_count = spreadsheet.get_num_rows()
    columns = spreadsheet.get_header_columns()
    match_count = created_count = too_many_matches = errors = num_chunks = 0
    blank_external_ids = []
    invalid_dates = []
    owner_id_errors = []
    prime_offset = 1  # used to prevent back-to-back priming

    user = CouchUser.get_by_user_id(config.couch_user_id, domain)
    username = user.username
    user_id = user._id

    # keep a cache of id lookup successes to help performance
    id_cache = {}

    caseblocks = []
    ids_seen = set()

    def _submit_caseblocks(caseblocks):
        if caseblocks:
            submit_case_blocks(
                [ElementTree.tostring(cb.as_xml(format_datetime=json_format_datetime))
                 for cb in caseblocks],
                domain,
                username,
                user_id,
            )

    for i in range(row_count):
        if task:
            DownloadBase.set_progress(task, i, row_count)

        # skip first row if it is a header field
        if i == 0 and config.named_columns:
            continue

        if not is_bigcouch():
            priming_progress = match_count + created_count + prime_offset
            if priming_progress % PRIME_VIEW_FREQUENCY == 0:
                prime_views(POOL_SIZE)
                # increment so we can't possibly prime on next iteration
                prime_offset += 1

        row = spreadsheet.get_row(i)
        search_id = importer_util.parse_search_id(config, columns, row)
        if config.search_field == 'external_id' and not search_id:
            # do not allow blank external id since we save this
            blank_external_ids.append(i + 1)
            continue

        try:
            fields_to_update = importer_util.populate_updated_fields(
                config,
                columns,
                row
            )
            if not any(fields_to_update.values()):
                # if the row was blank, just skip it, no errors
                continue
        except importer_util.InvalidDateException:
            invalid_dates.append(i + 1)
            continue

        external_id = fields_to_update.pop('external_id', None)
        parent_id = fields_to_update.pop('parent_id', None)
        parent_external_id = fields_to_update.pop('parent_external_id', None)
        parent_type = fields_to_update.pop('parent_type', config.case_type)
        parent_ref = fields_to_update.pop('parent_ref', 'parent')
        to_close = fields_to_update.pop('close', False)

        if any([lookup_id and lookup_id in ids_seen
                for lookup_id in [search_id, parent_id, parent_external_id]]):
            # clear out the queue to make sure we've processed any potential
            # cases we want to look up
            # note: these three lines are repeated a few places, and could be converted
            # to a function that makes use of closures (and globals) to do the same thing,
            # but that seems sketchier than just being a little RY
            _submit_caseblocks(caseblocks)
            num_chunks += 1
            caseblocks = []
            # also clear ids_seen, since all the cases will now be in the database
            ids_seen = set()

        case, error = importer_util.lookup_case(
            config.search_field,
            search_id,
            domain,
            config.case_type
        )

        if case:
            if case.type != config.case_type:
                continue
        elif error == LookupErrors.NotFound:
            if not config.create_new_cases:
                continue
        elif error == LookupErrors.MultipleResults:
            too_many_matches += 1
            continue

        uploaded_owner_id = fields_to_update.pop('owner_id', None)
        if uploaded_owner_id:
            # If an owner_id mapping exists, verify it is a valid user
            # or case sharing group
            if importer_util.is_valid_id(uploaded_owner_id, domain, id_cache):
                owner_id = uploaded_owner_id
                id_cache[uploaded_owner_id] = True
            else:
                owner_id_errors.append(i + 1)
                id_cache[uploaded_owner_id] = False
                continue
        else:
            # if they didn't supply an owner_id mapping, default to current
            # user
            owner_id = user_id

        extras = {}
        if parent_id:
            try:
                parent_case = CommCareCase.get(parent_id)
                if parent_case.domain == domain:
                    extras['index'] = {
                        parent_ref: (parent_case.type, parent_id)
                    }
            except ResourceNotFound:
                continue
        elif parent_external_id:
            parent_case, error = importer_util.lookup_case(
                'external_id',
                parent_external_id,
                domain,
                parent_type
            )
            if parent_case:
                extras['index'] = {
                    parent_ref: (parent_type, parent_case._id)
                }

        if not case:
            id = uuid.uuid4().hex
            if config.search_field == 'external_id':
                extras['external_id'] = search_id

            try:
                caseblock = CaseBlock(
                    create=True,
                    case_id=id,
                    version=V2,
                    owner_id=owner_id,
                    user_id=user_id,
                    case_type=config.case_type,
                    update=fields_to_update,
                    **extras
                )
                caseblocks.append(caseblock)
                created_count += 1
                if external_id:
                    ids_seen.add(external_id)
            except CaseBlockError:
                errors += 1
        else:
            if external_id:
                extras['external_id'] = external_id
            if uploaded_owner_id:
                extras['owner_id'] = owner_id
            if to_close == 'yes':
                extras['close'] = True

            try:
                caseblock = CaseBlock(
                    create=False,
                    case_id=case._id,
                    version=V2,
                    update=fields_to_update,
                    **extras
                )
                caseblocks.append(caseblock)
                match_count += 1
            except CaseBlockError:
                errors += 1

        # check if we've reached a reasonable chunksize
        # and if so submit
        if len(caseblocks) >= chunksize:
            _submit_caseblocks(caseblocks)
            num_chunks += 1
            caseblocks = []

    # final purge of anything left in the queue
    _submit_caseblocks(caseblocks)
    num_chunks += 1

    return {
        'created_count': created_count,
        'match_count': match_count,
        'too_many_matches': too_many_matches,
        'blank_externals': blank_external_ids,
        'invalid_dates': invalid_dates,
        'owner_id_errors': owner_id_errors,
        'errors': errors,
        'num_chunks': num_chunks,
    }
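# A hedged sketch of how the parent-index `extras` above are assembled before
# being splatted into CaseBlock(**extras). Everything here is plain Python;
# `find_parent` stands in for CommCareCase.get / importer_util.lookup_case and
# only illustrates the precedence: a direct parent_id wins over
# parent_external_id, and a parent from another domain drops the index.

def build_extras(row, domain, default_case_type, find_parent):
    extras = {}
    parent_ref = row.pop('parent_ref', 'parent')
    parent_type = row.pop('parent_type', default_case_type)
    parent_id = row.pop('parent_id', None)
    parent_external_id = row.pop('parent_external_id', None)

    if parent_id:
        parent = find_parent(case_id=parent_id)
        if parent and parent['domain'] == domain:
            extras['index'] = {parent_ref: (parent['type'], parent_id)}
    elif parent_external_id:
        parent = find_parent(external_id=parent_external_id)
        if parent:
            extras['index'] = {parent_ref: (parent_type, parent['case_id'])}
    return extras


if __name__ == '__main__':
    fake_db = {'abc123': {'domain': 'demo', 'type': 'mother', 'case_id': 'abc123'}}

    def find_parent(case_id=None, external_id=None):
        return fake_db.get(case_id)

    print(build_extras({'parent_id': 'abc123'}, 'demo', 'child', find_parent))
    # -> {'index': {'parent': ('mother', 'abc123')}}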
def do_import(spreadsheet, config, domain, task=None, chunksize=CASEBLOCK_CHUNKSIZE):
    row_count = spreadsheet.get_num_rows()
    columns = spreadsheet.get_header_columns()
    match_count = created_count = too_many_matches = num_chunks = 0
    errors = importer_util.ImportErrorDetail()
    prime_offset = 1  # used to prevent back-to-back priming

    user = CouchUser.get_by_user_id(config.couch_user_id, domain)
    username = user.username
    user_id = user._id

    # keep a cache of id lookup successes to help performance
    id_cache = {}
    name_cache = {}

    caseblocks = []
    ids_seen = set()

    def _submit_caseblocks(domain, case_type, caseblocks):
        err = False
        if caseblocks:
            try:
                form, cases = submit_case_blocks(
                    [cb.as_string() for cb in caseblocks],
                    domain,
                    username,
                    user_id,
                )
                if form.is_error:
                    errors.add(
                        error=ImportErrors.ImportErrorMessage,
                        row_number=form.problem
                    )
            except Exception:
                err = True
                errors.add(
                    error=ImportErrors.ImportErrorMessage,
                    row_number=caseblocks[0]._id
                )
            else:
                properties = set().union(*map(
                    lambda c: set(c.dynamic_case_properties().keys()), cases))
                add_inferred_export_properties.delay(
                    'CaseImporter',
                    domain,
                    case_type,
                    properties,
                )
        return err

    for i in range(row_count):
        if task:
            DownloadBase.set_progress(task, i, row_count)

        # skip first row if it is a header field
        if i == 0 and config.named_columns:
            continue

        if not is_bigcouch():
            priming_progress = match_count + created_count + prime_offset
            if priming_progress % PRIME_VIEW_FREQUENCY == 0:
                prime_views(POOL_SIZE)
                # increment so we can't possibly prime on next iteration
                prime_offset += 1

        row = spreadsheet.get_row(i)
        search_id = importer_util.parse_search_id(config, columns, row)
        if config.search_field == 'external_id' and not search_id:
            # do not allow blank external id since we save this
            errors.add(ImportErrors.BlankExternalId, i + 1)
            continue

        try:
            fields_to_update = importer_util.populate_updated_fields(
                config, columns, row, spreadsheet.workbook.datemode)
            if not any(fields_to_update.values()):
                # if the row was blank, just skip it, no errors
                continue
        except importer_util.InvalidDateException as e:
            errors.add(ImportErrors.InvalidDate, i + 1, e.column)
            continue
        except importer_util.InvalidIntegerException as e:
            errors.add(ImportErrors.InvalidInteger, i + 1, e.column)
            continue

        external_id = fields_to_update.pop('external_id', None)
        parent_id = fields_to_update.pop('parent_id', None)
        parent_external_id = fields_to_update.pop('parent_external_id', None)
        parent_type = fields_to_update.pop('parent_type', config.case_type)
        parent_ref = fields_to_update.pop('parent_ref', 'parent')
        to_close = fields_to_update.pop('close', False)

        if any([
            lookup_id and lookup_id in ids_seen
            for lookup_id in [search_id, parent_id, parent_external_id]
        ]):
            # clear out the queue to make sure we've processed any potential
            # cases we want to look up
            # note: these three lines are repeated a few places, and could be converted
            # to a function that makes use of closures (and globals) to do the same thing,
            # but that seems sketchier than just being a little RY
            _submit_caseblocks(domain, config.case_type, caseblocks)
            num_chunks += 1
            caseblocks = []
            # also clear ids_seen, since all the cases will now be in the database
            ids_seen = set()

        case, error = importer_util.lookup_case(config.search_field, search_id,
                                                domain, config.case_type)

        if case:
            if case.type != config.case_type:
                continue
        elif error == LookupErrors.NotFound:
            if not config.create_new_cases:
                continue
def bulk_import_async(import_id, config, domain, excel_id):
    task = bulk_import_async
    excel_ref = DownloadBase.get(excel_id)
    spreadsheet = importer_util.get_spreadsheet(excel_ref, config.named_columns)
    if not spreadsheet:
        return {'error': 'EXPIRED'}
    if spreadsheet.has_errors:
        return {'error': 'HAS_ERRORS'}

    row_count = spreadsheet.get_num_rows()
    columns = spreadsheet.get_header_columns()
    match_count = created_count = too_many_matches = errors = 0
    blank_external_ids = []
    invalid_dates = []
    owner_id_errors = []
    prime_offset = 1  # used to prevent back-to-back priming

    user = CouchUser.get_by_user_id(config.couch_user_id, domain)
    username = user.username
    user_id = user._id

    # keep a cache of id lookup successes to help performance
    id_cache = {}

    for i in range(row_count):
        DownloadBase.set_progress(task, i, row_count)

        # skip first row if it is a header field
        if i == 0 and config.named_columns:
            continue

        priming_progress = match_count + created_count + prime_offset
        if priming_progress % PRIME_VIEW_FREQUENCY == 0:
            prime_views(POOL_SIZE)
            # increment so we can't possibly prime on next iteration
            prime_offset += 1

        row = spreadsheet.get_row(i)
        search_id = importer_util.parse_search_id(config, columns, row)
        if config.search_field == 'external_id' and not search_id:
            # do not allow blank external id since we save this
            blank_external_ids.append(i + 1)
            continue

        case, error = importer_util.lookup_case(
            config.search_field,
            search_id,
            domain,
            config.case_type
        )

        try:
            fields_to_update = importer_util.populate_updated_fields(
                config,
                columns,
                row
            )
        except importer_util.InvalidDateException:
            invalid_dates.append(i + 1)
            continue

        if case:
            pass
        elif error == LookupErrors.NotFound:
            if not config.create_new_cases:
                continue
        elif error == LookupErrors.MultipleResults:
            too_many_matches += 1
            continue

        uploaded_owner_id = fields_to_update.pop('owner_id', None)
        if uploaded_owner_id:
            # If an owner_id mapping exists, verify it is a valid user
            # or case sharing group
            if importer_util.is_valid_id(uploaded_owner_id, domain, id_cache):
                owner_id = uploaded_owner_id
                id_cache[uploaded_owner_id] = True
            else:
                owner_id_errors.append(i + 1)
                id_cache[uploaded_owner_id] = False
                continue
        else:
            # if they didn't supply an owner_id mapping, default to current
            # user
            owner_id = user_id

        external_id = fields_to_update.pop('external_id', None)

        if not case:
            id = uuid.uuid4().hex

            try:
                caseblock = CaseBlock(
                    create=True,
                    case_id=id,
                    version=V2,
                    user_id=user_id,
                    owner_id=owner_id,
                    case_type=config.case_type,
                    update=fields_to_update
                )
                if config.search_field == 'external_id':
                    caseblock['external_id'] = search_id

                submit_case_block(caseblock, domain, username, user_id)
                created_count += 1
            except CaseBlockError:
                errors += 1
        elif case and case.type == config.case_type:
            extras = {}
            if external_id:
                extras['external_id'] = external_id

            try:
                caseblock = CaseBlock(
                    create=False,
                    case_id=case._id,
                    owner_id=owner_id,
                    version=V2,
                    update=fields_to_update,
                    **extras
                )
                submit_case_block(caseblock, domain, username, user_id)
                match_count += 1
            except CaseBlockError:
                errors += 1

    return {
        'created_count': created_count,
        'match_count': match_count,
        'too_many_matches': too_many_matches,
        'blank_externals': blank_external_ids,
        'invalid_dates': invalid_dates,
        'owner_id_errors': owner_id_errors,
        'errors': errors,
    }
def do_import(spreadsheet, config, domain, task=None, chunksize=CASEBLOCK_CHUNKSIZE):
    if not spreadsheet:
        return {'error': 'EXPIRED'}
    if spreadsheet.has_errors:
        return {'error': 'HAS_ERRORS'}

    row_count = spreadsheet.get_num_rows()
    columns = spreadsheet.get_header_columns()
    match_count = created_count = too_many_matches = errors = num_chunks = 0
    blank_external_ids = []
    invalid_dates = []
    owner_id_errors = []
    prime_offset = 1  # used to prevent back-to-back priming

    user = CouchUser.get_by_user_id(config.couch_user_id, domain)
    username = user.username
    user_id = user._id

    # keep a cache of id lookup successes to help performance
    id_cache = {}

    caseblocks = []
    ids_seen = set()

    def _submit_caseblocks(caseblocks):
        if caseblocks:
            submit_case_blocks(
                [
                    ElementTree.tostring(
                        cb.as_xml(format_datetime=json_format_datetime))
                    for cb in caseblocks
                ],
                domain,
                username,
                user_id,
            )

    for i in range(row_count):
        if task:
            DownloadBase.set_progress(task, i, row_count)

        # skip first row if it is a header field
        if i == 0 and config.named_columns:
            continue

        if not is_bigcouch():
            priming_progress = match_count + created_count + prime_offset
            if priming_progress % PRIME_VIEW_FREQUENCY == 0:
                prime_views(POOL_SIZE)
                # increment so we can't possibly prime on next iteration
                prime_offset += 1

        row = spreadsheet.get_row(i)
        search_id = importer_util.parse_search_id(config, columns, row)
        if config.search_field == 'external_id' and not search_id:
            # do not allow blank external id since we save this
            blank_external_ids.append(i + 1)
            continue

        try:
            fields_to_update = importer_util.populate_updated_fields(
                config, columns, row)
            if not any(fields_to_update.values()):
                # if the row was blank, just skip it, no errors
                continue
        except importer_util.InvalidDateException:
            invalid_dates.append(i + 1)
            continue

        external_id = fields_to_update.pop('external_id', None)
        parent_id = fields_to_update.pop('parent_id', None)
        parent_external_id = fields_to_update.pop('parent_external_id', None)
        parent_type = fields_to_update.pop('parent_type', config.case_type)
        parent_ref = fields_to_update.pop('parent_ref', 'parent')
        to_close = fields_to_update.pop('close', False)

        if any([
            lookup_id and lookup_id in ids_seen
            for lookup_id in [search_id, parent_id, parent_external_id]
        ]):
            # clear out the queue to make sure we've processed any potential
            # cases we want to look up
            # note: these three lines are repeated a few places, and could be converted
            # to a function that makes use of closures (and globals) to do the same thing,
            # but that seems sketchier than just being a little RY
            _submit_caseblocks(caseblocks)
            num_chunks += 1
            caseblocks = []
            # also clear ids_seen, since all the cases will now be in the database
            ids_seen = set()

        case, error = importer_util.lookup_case(config.search_field, search_id,
                                                domain, config.case_type)

        if case:
            if case.type != config.case_type:
                continue
        elif error == LookupErrors.NotFound:
            if not config.create_new_cases:
                continue