def _task_error_context(row_object):
    """
    Return ``(column_value, company_name)`` for error reporting on a row.

    When the row has no dict ``column_value``, a fresh empty dict is attached
    to the row so the caller can always store a ``result`` message, and the
    company name falls back to '未知单位'.
    """
    column_value = getattr(row_object, 'column_value', None)
    if not isinstance(column_value, dict):
        column_value = {}
        setattr(row_object, 'column_value', column_value)
        return column_value, '未知单位'
    return column_value, str(column_value.get('公司', '未知单位')).strip()


def handle_data_task(row_object, **kwargs):
    """
    Validate and process a single row, converting any failure into row state.

    The row is passed through ``check_row_object`` and ``handle_data``. Any
    exception is caught here: the row status is set to
    ``RowStatus.ERROR.value``, a message is stored under
    ``column_value['result']`` and the failure is logged, so one bad row does
    not propagate an exception to the code that collects the task result.

    :param row_object: row object
    :param kwargs: forwarded unchanged to ``check_row_object`` / ``handle_data``
    :return: the row returned by ``handle_data`` on success, otherwise the
        input ``row_object`` marked as failed
    """
    try:
        check_row_object(row_object, **kwargs)
        _row_object = handle_data(row_object, **kwargs)
        return _row_object
    except AssertionError as e:
        row_object.status = RowStatus.ERROR.value
        nlogger.error("{fn} Params error: {e}".format(
            fn='handle_data_task', e=traceback.format_exc()))
        column_value, _company_name = _task_error_context(row_object)
        column_value['result'] = f'Params AssertionError: {str(e)}'
        flogger.error("handle_data_task failed:{n},AssertionError:{e}".format(
            n=_company_name, e=repr(e)))
        return row_object
    except HandleDataError as e:
        row_object.status = RowStatus.ERROR.value
        column_value, _company_name = _task_error_context(row_object)
        flogger.error("handle_data_task failed:{n},HandleDataError:{e}".format(
            n=_company_name, e=repr(e)))
        column_value['result'] = f'HandleDataError: {str(e)}'
        return row_object
    except Exception as e:
        row_object.status = RowStatus.ERROR.value
        column_value, _company_name = _task_error_context(row_object)
        # Use tolerant lookups here: raising inside this handler (e.g. a
        # KeyError for a missing '公司' column) would escape the task.
        nlogger.error(
            "{fn} position:{p}, company name:{n}, undefined error: {e}".format(
                fn='handle_data_task',
                p=getattr(row_object, 'position', None),
                n=column_value.get('公司', '未知单位'),
                e=traceback.format_exc()))
        column_value['result'] = f'handle_data_task undefined error: {str(e)}'
        # The "HandleDataError" label is kept as-is so existing log searches
        # keep matching.
        flogger.error(
            "handle_data_task undefined failed:{n},HandleDataError:{e}".format(
                n=_company_name, e=repr(e)))
        return row_object
def exec_func(check_file,
              file_name=None,
              sheet_name=None,
              start_point=None,
              end_point=None,
              **kwargs):
    """
    Run the whole pipeline: load samples, process the data sheet, write results.

    Every exception is caught here, logged through ``nlogger`` and printed;
    nothing is re-raised and nothing is returned.

    :param check_file: passed through to ``get_row_object_iterator``
    :param file_name: Excel file name of the data to process
    :param sheet_name: sheet name of the data to process
    :param start_point: first row to process
    :param end_point: last row to process
    :param kwargs: forwarded to every helper called here
    :return: None
    """
    try:
        # Step 1: build the samples object from the 'listing' sheet of the
        # samples workbook.
        samples_path = check_file_name(SAMPLES_FILE, **kwargs)
        _, samples_rows = get_row_object_iterator(
            check_file, samples_path, 'listing', **kwargs)
        samples_object = get_samples_object(samples_rows, **kwargs)
        nlogger.info("get_samples_object has been completed")

        # Step 2: open the source workbook and get its rows.
        data_path = check_file_name(file_name, **kwargs)
        data_xls, data_rows = get_row_object_iterator(
            check_file, data_path, sheet_name, start_point, end_point,
            **kwargs)

        # Step 3: process the rows, then write the results to the workbook.
        nlogger.info("handle_data_thread start")
        processed_rows = handle_data_thread(
            row_object_iterator=data_rows,
            samples_object=samples_object,
            **kwargs)
        nlogger.info("write_result_to_xls start")
        write_result_to_xls(data_xls, processed_rows)
    except (GetRowIterError, HandleDataError, ThreadTaskError,
            WriteResultError) as e:
        nlogger.error(f'exec_func Custom error: {e!r}')
        print(f'Custom error: {e!r}')
    except AssertionError as e:
        nlogger.error(f'exec_func Assertion error: {traceback.format_exc()}')
        print(repr(e))
    except Exception as e:
        nlogger.error(f'exec_func error: {traceback.format_exc()}')
        print(f'Undefined error: {e!r}')
def handle_data_thread(row_object_iterator, **kwargs):
    """
    Process every row with ``handle_data_task`` on a thread pool.

    One task is submitted per row; results are collected in completion order
    and falsy results are dropped. The pool is always shut down (waiting for
    running tasks) before this function returns or raises.

    :param row_object_iterator: iterable of row objects
    :param kwargs: forwarded to ``handle_data_task``
    :return: list of the truthy task results
    :raises ThreadTaskError: if not all tasks complete within
        ``TASK_WAITING_TIME`` or any other error occurs while collecting
    """
    # Imported locally and caught alongside the builtin: before Python 3.11
    # concurrent.futures.TimeoutError is a different class from TimeoutError,
    # and it is the one as_completed() raises.
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    # The context manager calls shutdown(wait=True) on every exit path, so the
    # worker threads are released on success and on failure alike.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        try:
            data_result = []
            all_task = [
                executor.submit(handle_data_task, row_object, **kwargs)
                for row_object in row_object_iterator
            ]
            # as_completed raises a timeout error if the whole set of futures
            # has not finished before the given timeout.
            for future in as_completed(all_task, timeout=TASK_WAITING_TIME):
                data = future.result()
                if data:
                    data_result.append(data)
            nlogger.info(f'Handle data completed, {len(data_result)} rows')
            return data_result
        except (FuturesTimeoutError, TimeoutError) as e:
            nlogger.error("{fn} TimeoutError: {e}".format(
                fn='handle_data_thread', e=repr(e)))
            raise ThreadTaskError('{fn} TimeoutError: {e}'.format(
                fn='handle_data_thread', e=repr(e))) from e
        except Exception as e:
            nlogger.error("{fn} error: {e}".format(
                fn='handle_data_thread', e=traceback.format_exc()))
            flogger.error("{fn} error: {e}".format(
                fn='handle_data_thread', e=repr(e)))
            raise ThreadTaskError('{fn} error: {e}'.format(
                fn='handle_data_thread', e=repr(e))) from e
def get_row_object_iterator(check_file,
                            file_name,
                            sheet_name=None,
                            start_point=None,
                            end_point=None,
                            **kwargs):
    """
    Open a workbook and build the iterator over its row objects.

    :param check_file: forwarded to ``generate_row_object_iterator``
    :param file_name: workbook file name handed to ``HandleXLSX``
    :param sheet_name: sheet name handed to ``HandleXLSX`` and the iterator
    :param start_point: forwarded to ``generate_row_object_iterator``
    :param end_point: forwarded to ``generate_row_object_iterator``
    :param kwargs: forwarded to ``generate_row_object_iterator``
    :return: tuple ``(HandleXLSX instance, row object iterator)``
    :raises GetRowIterError: on any failure while opening or iterating
    """
    try:
        workbook = HandleXLSX(file_name, sheet_name)
        rows = workbook.generate_row_object_iterator(
            check_file, sheet_name, start_point, end_point, **kwargs)
    except Exception as e:
        nlogger.error(
            f'get_row_object_iterator error: {traceback.format_exc()}')
        raise GetRowIterError(f'get_row_object_iterator error: {e!r}')
    else:
        return workbook, rows
def handle_data(row_object, **kwargs):
    """
    Resolve the company of one row against the samples object.

    The '公司' cell is read from ``row_object.column_value``, normalised with
    ``extract_company_name`` and passed as ``company_name`` (together with the
    remaining kwargs) to ``set_row_object_company_info``.

    :param row_object: row object whose ``column_value`` holds the '公司' cell
    :param kwargs: must contain a truthy ``samples_object``
    :return: the value returned by ``set_row_object_company_info``
    :raises AssertionError: if the company name or ``samples_object`` is
        missing/empty
    :raises HandleDataError: on any other failure
    """
    try:
        _raw_company_name = row_object.column_value.get('公司')
        # A missing cell must not be stringified: str(None) is the truthy
        # text 'None' and would slip through the validation below.
        if _raw_company_name is None:
            raise AssertionError("company name is invalid")
        _source_company_name = str(_raw_company_name).strip()
        kwargs['company_name'] = extract_company_name(_source_company_name)
        # Raised explicitly rather than via ``assert`` so the checks still run
        # under ``python -O``; the exception type is unchanged.
        if not kwargs.get('company_name'):
            raise AssertionError("company name is invalid")
        if not kwargs.get('samples_object'):
            raise AssertionError("company dict is invalid")
        _row_object = set_row_object_company_info(row_object, **kwargs)
        return _row_object
    except AssertionError:
        raise
    except Exception as e:
        nlogger.error("{fn} error: {e}".format(fn='handle_data',
                                               e=traceback.format_exc()))
        raise HandleDataError("{fn} error: {e}".format(
            fn='handle_data', e=repr(e))) from e
def add_sample_object_property(samples_object, row_object, **kwargs):
    """
    Register one sample row (full name, abbreviation, simplified name).

    The row's '单位名称' and '简称' cells are read, the name type is obtained
    from ``get_name_type(row_object)``, and the names are stored in the
    ``<type>_name`` / ``<type>_full_name`` / ``<type>_abbr_name`` properties
    of ``samples_object`` as well as in its ``all_name``, ``all_full_name``
    and ``all_abbr_name`` mappings. Every alias entry points back to the full
    name.

    :param samples_object: samples container to update
    :param row_object: row whose ``column_value`` holds the sample cells
    :param kwargs: unused
    :return: the updated ``samples_object``
    :raises AssertionError: if '单位名称' or '简称' is missing or empty
    """
    try:
        raw_full_name = row_object.column_value.get('单位名称')
        raw_abbr_name = row_object.column_value.get('简称')
        # A missing cell must become '' and not the truthy text 'None', or the
        # emptiness checks below could never fire for it.
        full_name = '' if raw_full_name is None else str(raw_full_name).strip()
        abbr_name = '' if raw_abbr_name is None else str(raw_abbr_name).strip()
        # Raised explicitly rather than via ``assert`` so the checks still run
        # under ``python -O``; the exception type is unchanged.
        if not full_name:
            raise AssertionError("单位名称 can't be empty")
        if not abbr_name:
            raise AssertionError("简称 can't be empty")
        brief_name = simplify_company_name(full_name)

        # Property names depend on the row's name type.
        _name_type = get_name_type(row_object)
        _property_name = f'{_name_type}_name'
        _property_full_name = f'{_name_type}_full_name'
        _property_abbr_name = f'{_name_type}_abbr_name'
        _property_name_value = samples_object.get_property(_property_name)
        _property_full_name_value = samples_object.get_property(
            _property_full_name)
        _property_abbr_name_value = samples_object.get_property(
            _property_abbr_name)

        # Terminal record for the full name itself.
        _property_full_name_value[full_name] = {
            'termination': True,
            'full_name': raw_full_name,
            'company_type': row_object.column_value.get('单位类别'),
        }
        samples_object.set_property(_property_full_name,
                                    _property_full_name_value)

        # The full name also resolves to itself in the name lookups.
        _property_name_value[full_name] = {_property_full_name: full_name}
        samples_object.set_property(_property_name, _property_name_value)
        samples_object.all_full_name[full_name] = {
            _property_full_name: full_name}
        samples_object.all_name[full_name] = {_property_full_name: full_name}

        # The abbreviation, plus the simplified name when it differs, are
        # aliases of the full name. Each entry gets its own dict.
        aliases = [abbr_name]
        if brief_name != abbr_name:
            aliases.append(brief_name)
        for alias in aliases:
            _property_abbr_name_value[alias] = {
                _property_full_name: full_name}
            samples_object.set_property(_property_abbr_name,
                                        _property_abbr_name_value)
            _property_name_value[alias] = {_property_full_name: full_name}
            samples_object.set_property(_property_name, _property_name_value)
            samples_object.all_abbr_name[alias] = {
                _property_full_name: full_name}
            samples_object.all_name[alias] = {_property_full_name: full_name}
        return samples_object
    except Exception as e:
        nlogger.error('{fn} Undefined error: {e}'.format(
            fn='add_sample_object_property', e=traceback.format_exc()))
        print(f'Undefined error: {repr(e)}')
        raise
def write_result_to_xls(source_xls, data_result):
    """
    Write the processed rows into the workbook and save it as a new file.

    For each output column the header position is looked up with
    ``recursive_get_index``; a header cell is written in row 1 when that
    position lies beyond the columns present at the start. Then, for every
    row, the ``result`` value is always written (default 'unknown') and the
    remaining values only when they are truthy. The workbook is saved as
    ``result_<YYYYmmdd-HHMMSS>.xlsx``.

    :param source_xls: workbook wrapper providing ``get_column_name_list``,
        ``write_sheet_rows_value``, ``sheet`` and ``save``
    :param data_result: iterable of processed row objects
    :return: None
    :raises WriteResultError: on any failure while writing or saving
    """
    # (header text in the sheet, key in row_object.column_value), in the
    # order the columns are resolved and written.
    optional_columns = (
        ('name', 'company_name'),
        ('guess_name', 'guess_name'),
        ('full_name', 'company_full_name'),
        ('type', 'company_type'),
        ('similarity', 'similarity'),
    )
    try:
        column_name_list = source_xls.get_column_name_list()
        columns_number = len(column_name_list)

        # NOTE(review): presumably recursive_get_index yields a position past
        # the existing columns for a header that is not present yet - confirm.
        column_index = {}
        for header in ('result',) + tuple(h for h, _ in optional_columns):
            y_header = recursive_get_index(column_name_list, header) + 1
            if y_header > columns_number:
                source_xls.write_sheet_rows_value(
                    sheet_name=source_xls.sheet.title,
                    values=(1, y_header, header))
            column_index[header] = y_header

        for row_object in data_result:
            x = row_object.position
            source_xls.write_sheet_rows_value(
                sheet_name=row_object.sheet_name,
                values=(x, column_index['result'],
                        row_object.column_value.get('result', 'unknown')))
            for header, key in optional_columns:
                value = row_object.column_value.get(key)
                if value:
                    source_xls.write_sheet_rows_value(
                        sheet_name=row_object.sheet_name,
                        values=(x, column_index[header], value))

        # No ':' in the timestamp: colons are not allowed in Windows file
        # names and would make the save fail after all the work is done.
        _result_file_name = "result_{d}.xlsx".format(
            d=datetime.now().strftime('%Y%m%d-%H%M%S'))
        source_xls.save(_result_file_name)
        nlogger.info(
            f'Write result completed, output file: {_result_file_name}')
    except Exception as e:
        nlogger.error("{fn} error: {e}".format(fn='write_result_to_xls',
                                               e=traceback.format_exc()))
        raise WriteResultError("{fn} error: {e}".format(
            fn='write_result_to_xls', e=repr(e))) from e