def calculation_A(company, batch_id, file_output):
    """Aggregate every mapped sheet of a batch into a daily balance series
    plus monthly inflow/outflow sums.

    Returns (balance_all, in_all, out_all).
    """
    sheet_infos = mongo.show_datas('sheet_info',
                                   query={'company': company, 'batch_id': batch_id},
                                   db='Info')
    expanded_frames = []
    for sheet in sheet_infos:
        print(sheet['file'], sheet['table'])
        cached = mongo.show_datas('mapped_df',
                                  {'file': sheet['file'],
                                   'table': sheet['table'],
                                   'batch_id': batch_id},
                                  'Cache')
        frame = pd.read_json(cached[0]['data'])
        expanded_frames.append(expand_date(frame, sheet['start_date'], sheet['end_date']))

    # Element-wise sum across all sheets, treating missing cells as 0.
    all_df = pd.DataFrame(columns=['in', 'out', 'balance'])
    for frame in expanded_frames:
        all_df = all_df.add(frame, fill_value=0)
    print(all_df)

    balance_all = all_df['balance']
    # NOTE(review): `loffset` is deprecated in recent pandas — confirm the
    # installed version before upgrading.
    one_day = datetime.timedelta(days=1)
    in_all = all_df['in'].resample('1M', label='left', loffset=one_day,
                                  closed='left').sum()
    out_all = all_df['out'].resample('1M', label='left', loffset=one_day,
                                    closed='left').sum()
    print(balance_all, in_all, out_all, sep='\n')
    return balance_all, in_all, out_all
def save_info(self, batch_id):
    """Sync this sheet's metadata with the `sheet_info` collection.

    DB values fill in fields that are missing or empty locally (so manual
    edits made in the DB propagate back to the sheet), then the merged
    record is written back (update or insert).

    Returns [still-missing required field names, merged info dict].
    """
    self.info = {
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'batch_id': batch_id,
        'self_name': self.self_name,
        'self_account': self.self_account,
        'self_bank': self.self_bank,
        'currency': self.currency,
        'start_date': self.start_date,
        'end_date': self.end_date,
        'gen_date': self.gen_date,
        'transactions_num': self.transaction_num,
        'init_balance': self.init_balance,
    }
    query = {
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'batch_id': batch_id
    }
    # Query once (the original fetched the same record twice and hid all
    # errors behind a bare except).
    existing = mongo.show_datas('sheet_info', query, 'Info')
    if existing:
        for k, v in existing[0].items():
            if k not in self.info or self.info[k] == '':  # keep values we already have
                self.info[k] = v
        self.info.pop('_id', None)  # never write Mongo's internal id back
        mongo.update_datas(query, {'$set': self.info}, 'sheet_info', 'Info')
    else:
        mongo.insert_data(self.info, 'sheet_info', 'Info')
    # Required fields still absent or empty after the merge.
    self.necessary_unmatched = [
        i for i in self.necessary_items
        if i not in self.info or self.info[i] == ''
    ]
    return [self.necessary_unmatched, self.info]
def get_infos(self):
    """Load file paths, date ranges and own account numbers for the company
    from the `sheet_info` collection. Always returns True.
    """
    records = mongo.show_datas('sheet_info', {'company': self.company}, 'Info')
    for rec in records:
        self.file_paths.append([rec['file'], rec['table']])
        if rec['start_date'] and rec['end_date']:
            self.dates.append([rec['start_date'], rec['end_date']])
        if rec['self_account']:
            self.self_accounts.append(rec['self_account'])
        self.path2account[rec['file'] + rec['table']] = rec['self_account']

    # Dates are stored as strings; convert both endpoints to int in place
    # whenever both are present.
    for pair in self.dates:
        if pair[0] and pair[1]:
            pair[0] = int(pair[0])
            pair[1] = int(pair[1])

    print(self.file_paths)
    print(self.dates)
    print(self.self_accounts)
    return True
def benford_check(self, file_path):
    """Run a Benford's-law analysis over all in/out amounts of one sheet.

    file_path: [file, table] pair identifying the cached mapped DataFrame.
    Returns (benford coefficient, sample count), or the string
    'benford_check failed' when the amount columns cannot be cast to int.
    """
    datas = mongo.show_datas('mapped_df',
                             {'file': file_path[0], 'table': file_path[1]},
                             'Cache')
    cur_df = pd.read_json(datas[-1]['data'])  # latest cached version
    income = cur_df['流入金额'].values
    out = cur_df['流出金额'].values
    # Sanity-check that the amount columns are numeric before sampling.
    try:
        cur_df['流入金额'] = cur_df['流入金额'].astype(int)
        cur_df['流出金额'] = cur_df['流出金额'].astype(int)
    except Exception as e:
        print(e)
        print('failed to convert datatype to in for income and out money')
        return 'benford_check failed'
    # Pool every non-NaN inflow then every non-NaN outflow into one sample
    # list (renamed from `all`, which shadowed the builtin).
    samples = [v for v in income if not np.isnan(v)]
    samples += [v for v in out if not np.isnan(v)]
    res = md.benford(samples)
    print('benford coefficient: ', res[0])
    print('total samples: ', len(samples))
    return res[0], len(samples)
def cross_validation(self):
    """Cross-check transfers between the company's own accounts.

    For every transaction whose counterparty is the company itself, look for
    the mirror entry (same date, inflow matching the outflow and vice versa)
    in the counterparty account's own sheet.

    Returns the list of transactions (pandas Series rows) with no mirror entry.
    """
    invalid_accounts = []
    account2df = {}
    # Open every account's mapped sheet up front.
    for path in self.file_paths:
        datas = mongo.show_datas('mapped_df',
                                 {'file': path[0], 'table': path[1]},
                                 'Cache')
        cur_df = pd.read_json(datas[-1]['data'])
        account2df[self.path2account[path[0] + path[1]]] = cur_df

    # Collect, per own account, the rows that transfer to the company itself.
    account2trans = {}
    for account in self.self_accounts:
        cur_df = account2df[account]
        accounts = []  # counterparty account numbers seen in this sheet
        for index in cur_df.index:
            if cur_df.loc[index, '对方名称'] == self.company:
                cur_account = cur_df.loc[index, '对方账号']
                accounts.append(cur_account)
                if cur_account not in self.self_accounts:
                    invalid_accounts.append(cur_account)
                cur_trans = cur_df.loc[index]
                account2trans.setdefault(account, []).append(cur_trans)

    unmatched_trans = []
    for from_acc, trans in account2trans.items():
        for tran in trans:
            tran_date = tran.loc['交易日期']
            tran_in = tran.loc['流入金额']
            tran_out = tran.loc['流出金额']
            out_acc = tran.loc['对方账号']
            if out_acc in account2df:
                to_df = account2df[out_acc]
            else:
                print('not existed account: ', out_acc)
                continue
            matched = False
            # BUG FIX: search the counterparty's sheet (to_df). The original
            # iterated `cur_df`, a stale leftover from the loop above — its own
            # comment even questioned where cur_df came from.
            for index in to_df.index:
                if to_df.loc[index, '对方账号'] == from_acc and to_df.loc[index, '交易日期'] == tran_date:
                    if to_df.loc[index, '流入金额'] == tran_out or to_df.loc[index, '流出金额'] == tran_in:
                        print('Get one matched transaction.', from_acc, out_acc)
                        matched = True
                        break
            if not matched:
                print('---- not matched!----\n', tran)
                unmatched_trans.append(tran)
    return unmatched_trans
def add_rules(request, company, rule_name):
    """Merge `request` into the (company, rule_name) user-rule document.

    The existing document (if any) is removed and re-inserted with the new
    key/value pairs merged in. Returns a short status string.
    """
    query = {'company': company, 'rule_name': rule_name}
    try:
        user_rules = mongo.show_datas('user_rule', query, 'Mapping')[0]
        mongo.delete_datas(query, 'user_rule', 'Mapping')
    except IndexError:
        # No rule document yet — seed a new one from the query itself.
        # (Narrowed from a bare except that also swallowed DB errors.)
        user_rules = query
    user_rules.update(request)
    mongo.insert_data(user_rules, 'user_rule', 'Mapping')
    print(user_rules)
    return 'success update ' + str(request)
def manual_mapping(self):
    """Interactively resolve unmatched target headers and missing required info.

    First offers the stored rule templates from the `user_rule` collection;
    after that, prompts the user column by column and records each answer as
    a new rule via add_rules(). Finally prompts for any still-missing
    required fields and stores them via add_stats().
    """
    asked_template = False
    while self.target_unmatched:  # handle still-unmatched target headers one by one
        # Offer the stored rule templates once, before any manual mapping.
        if not asked_template:
            templates = mongo.show_datas('user_rule',
                                         {'company': self.company},
                                         'Mapping')
            print('现有的规则模版为:')
            rule_name_all = []
            for i in templates:
                del i['_id']
                del i['company']
                rule_name_all.append(i['rule_name'])
                print(i)
            rule_name = input('使用规则模版:')
            if rule_name:
                if rule_name not in rule_name_all:
                    print('无此模版。')
                    continue
                self.mapping(rule_name)
                asked_template = True
                continue
            asked_template = True
        # Record a new mapping rule for the first unmatched header.
        cur_tar = self.target_unmatched[0]
        print('Options: ')
        for i in range(0, len(self.option_list), 4):  # show four options per line
            print(self.option_list[i:i + 4])
        selected = input('与"{}"对应的是:'.format(cur_tar))
        if selected == '':
            selected = 'none'
        if selected not in self.option_list:
            print('错误!不存在此选项')
            continue
        if cur_tar in self.target_unmatched:  # only remove if not matched already
            self.target_unmatched.remove(cur_tar)
        self.reversed_mapping[cur_tar] = selected
        add_rules({cur_tar: selected}, self.company, self.rule_name)
    # Ask for the required bookkeeping fields that are still empty.
    while self.necessary_unmatched:
        cur_tar = self.necessary_unmatched[0]
        val = input('{} = '.format(cur_tar))
        add_stats({cur_tar: val}, self.company, self.title, self.table,
                  self.batch_id)
        self.necessary_unmatched.remove(cur_tar)
def output_excel(company, batch_id, file_output):
    """Concatenate every mapped DataFrame of a batch and write the result to
    one Excel sheet at `file_output`.
    """
    datas = mongo.show_datas('mapped_df',
                             {'company': company, 'batch_id': batch_id},
                             'Cache')
    # Build the combined frame in a single concat instead of growing it
    # pairwise (quadratic) in a loop.
    frames = [pd.read_json(d['data']) for d in datas]
    final_df = pd.concat(frames, ignore_index=True)
    print(final_df)
    # Context manager replaces the deprecated ExcelWriter.save() and
    # guarantees the file is closed even on error.
    with pd.ExcelWriter(file_output) as writer:
        final_df.to_excel(writer, sheet_name='Sheet1')
    print('DataFrame is written successfully to the Excel File.')
def inner_account_check(self):
    """Collect counterparty accounts that claim to be the company itself but
    are not among our known own accounts. Returns that list.
    """
    invalid_accounts = []
    for file_name, table_name in self.file_paths:
        records = mongo.show_datas('mapped_df',
                                   {'file': file_name, 'table': table_name},
                                   'Cache')
        sheet = pd.read_json(records[-1]['data'])
        accounts = []  # counterparty account numbers seen in this sheet
        # Walk the sheet row by row, looking for transfers to the company.
        for row_idx in sheet.index:
            if sheet.loc[row_idx, '对方名称'] != self.company:
                continue
            counter_acc = sheet.loc[row_idx, '对方账号']
            accounts.append(counter_acc)
            if counter_acc not in self.self_accounts:
                invalid_accounts.append(counter_acc)
    print('missing accounts:', invalid_accounts)
    return invalid_accounts
def get_dfs_by_company(company, batch_id):
    """Return one DataFrame containing every mapped sheet of the batch, with
    English column names plus derived 'year' and 'month' columns.

    'year'/'month' are sliced from the string form of the 'date' column
    (yyyymmdd-style values).
    """
    datas = mongo.show_datas('mapped_df',
                             query={'company': company, 'batch_id': batch_id},
                             db='Cache')
    # DataFrame.append was removed in pandas 2.0 — build the frame with one
    # concat over all cached sheets instead.
    df = pd.concat([pd.read_json(data['data']) for data in datas])
    df.rename(columns=mydata.english_mapping, inplace=True)
    df['year'] = df['date'].apply(lambda x: str(x)[:4])
    df['month'] = df['date'].apply(lambda x: str(x)[:6])
    return df
def add_rules(query, user):
    """Merge `query` into the user's rule document in mapping.user_rule.

    The document is fetched (or initialized when absent), updated, and
    rewritten via delete + insert. Returns 'success'.
    """
    user_rules = {}
    try:
        user_rules = mongo.show_datas('user_rule', {
            'type': 'user_rule',
            'name': user
        }, 'mapping')[0]
    except IndexError:
        # BUG FIX: the type tag was "user_rules" (plural), so a freshly
        # created document could never be found by the singular lookup above.
        user_rules["type"] = "user_rule"
        user_rules['name'] = user
        print('no user rules yet.')
    user_rules.update(query)
    # Replace the old document wholesale.
    mongo.delete_datas({'name': user}, 'user_rule', 'mapping')
    mongo.insert_data(user_rules, 'user_rule', 'mapping')
    return 'success'
def main_mg(company, batch_id):
    """Label every mapped DataFrame of the batch with the in/out rules and
    write the labeled version back to the cache.
    """
    in_map, out_map = get_rules(rulePath)
    # Drop placeholder keys produced by the rule file, when present.
    try:
        del in_map['nan']
        del out_map['']
        del out_map['nan']
    except Exception as e:
        print(e)
    datas = mg.show_datas('mapped_df',
                          query={'company': company, 'batch_id': batch_id},
                          db='Cache')
    for record in datas:
        frame = pd.read_json(record['data'])
        labeled = process_file(frame, '', in_map, out_map,
                               show_plot=False, write_excel=False)
        record['data'] = labeled.to_json(orient='columns', force_ascii=False)
        # Replace the cached entry with the labeled version.
        mg.delete_datas({'batch_id': batch_id,
                         'file': record['file'],
                         'table': record['table']},
                        'mapped_df', 'Cache')
        mg.insert_data(record, 'mapped_df', 'Cache')
def info_missing_check(self, file_path):
    """Count rows whose abstract (摘要) or counterparty name (对方名称) is
    missing in one mapped sheet. Returns [abstract_num, receiver_num].
    """
    records = mongo.show_datas('mapped_df',
                               {'file': file_path[0], 'table': file_path[1]},
                               'Cache')
    sheet = pd.read_json(records[-1]['data'])
    abstracts = sheet['摘要'].values
    receiver_names = sheet['对方名称'].values
    # Missing cells show up as non-str values (NaN) after the JSON round-trip.
    abstract_num = sum(1 for v in abstracts if type(v) != str)
    receiver_num = sum(1 for v in receiver_names if type(v) != str)
    print('缺失的对方名称有:', receiver_num)
    print('缺失的摘要有:', abstract_num)
    return [abstract_num, receiver_num]
def add_stats(query, path):
    """Merge `query` into the per-path 'necessary' info document and rewrite
    it (delete + insert). Returns 'success'.
    """
    selector = {'type': 'necessary', 'path': path}
    try:
        necc_info = mongo.show_datas('necessary', selector, 'mapping')[0]
    except:
        # No stored info yet: start from a skeleton document.
        necc_info = {'type': 'necessary', 'path': path}
    necc_info.update(query)
    # Replace the previous document wholesale.
    mongo.delete_datas(selector, 'necessary', 'mapping')
    mongo.insert_data(necc_info, 'necessary', 'mapping')
    return 'success'
def add_stats(request, company, file, table, batch_id):
    """Merge `request` into the sheet_info record identified by
    (company, file, table, batch_id).

    An existing record is removed and re-inserted with the merged fields;
    when none exists, the query itself seeds the new record.
    """
    query = dict(company=company, file=file, table=table, batch_id=batch_id)
    try:
        # Reuse the stored record so previously saved fields survive the merge.
        necc_info = mongo.show_datas('sheet_info', query, 'Info')[0]
        mongo.delete_datas(query, 'sheet_info', 'Info')
    except:
        necc_info = query
    necc_info.update(request)
    mongo.insert_data(necc_info, 'sheet_info', 'Info')
    print(necc_info)
    return 'success update ' + str(request)
def get_infos(self):
    """Load the form records for this mapping and populate file paths, date
    ranges and own accounts. Returns False when no forms exist, else True.
    """
    forms = mongo.show_datas(self.name, {'type': 'form'}, 'mapping')
    if not forms:
        return False
    for form in forms:
        self.file_paths.append(form['path'])
        self.dates.append(form['dates'])
        self.self_accounts.append(form['account'])
        self.path2account[form['path']] = form['account']
    self.company_name = forms[0]['company_name']
    # Dates arrive as strings; coerce both endpoints to int in place.
    for pair in self.dates:
        pair[0] = int(pair[0])
        pair[1] = int(pair[1])
    print(self.file_paths)
    print(self.dates)
    print(self.self_accounts)
    return True
def upload_mysql(company, batch_id):
    """Concatenate every mapped sheet of a batch and append it to the MySQL
    `liushui` table, tagging each row with the batch id.
    """
    datas = mongo.show_datas('mapped_df',
                             {'company': company, 'batch_id': batch_id},
                             'Cache')
    # SECURITY: credentials are hard-coded in the connection string — move
    # them to configuration / environment variables.
    db = create_engine(
        'mysql+pymysql://bank_dev:[email protected]:3306/bank_dev'
    )
    # Build the combined frame in one concat instead of repeated pairwise
    # concats (quadratic).
    final_df = pd.concat([pd.read_json(d['data']) for d in datas],
                         ignore_index=True)
    # Sheets not yet renamed (no 'type' column) get English headers.
    if 'type' not in final_df.columns.ravel():
        final_df.rename(columns=data.english_mapping, inplace=True)
    df = final_df.iloc[:, 1:]  # drop the first (index-like) column
    df['batch_id'] = batch_id
    print(df)
    df.to_sql('liushui', db, index=False, if_exists='append')
def add_stats(query, path):
    """Update the per-path 'necessary' document with the fields in `query`.

    The stored document (or a fresh skeleton when none exists) is merged with
    `query`, then the collection entry is replaced via delete + insert.
    """
    necc_info = {}
    try:
        necc_info = mongo.show_datas('necessary',
                                     {'type': 'necessary', 'path': path},
                                     'mapping')[0]
    except:
        # No stored info yet: tag a fresh skeleton document.
        necc_info["type"] = "necessary"
        necc_info['path'] = path
    necc_info.update(query)
    # Drop the previous document and insert the merged one.
    mongo.delete_datas({'type': 'necessary', 'path': path},
                       'necessary', 'mapping')
    mongo.insert_data(necc_info, 'necessary', 'mapping')
    return 'success'
def balance_check(self, error_tolerance, file_path):
    """Verify row by row that balance[i-1] +/- the transaction amount equals
    balance[i] within `error_tolerance`.

    file_path: [file, table] pair of the cached mapped sheet.
    Returns the transaction dates of every inconsistent row.
    """
    datas = mongo.show_datas('mapped_df',
                             {'file': file_path[0], 'table': file_path[1]},
                             'Cache')
    cur_df = pd.read_json(datas[-1]['data'])
    invalid = []
    cur_df['流入金额'].fillna(0, inplace=True)
    cur_df['流出金额'].fillna(0, inplace=True)
    skip = False
    try:
        cur_df['流入金额'] = cur_df['流入金额'].astype(int)
        cur_df['流出金额'] = cur_df['流出金额'].astype(int)
    except Exception as e:
        print(e)
        print('failed to convert datatype to in for income and out money')
        skip = True  # non-numeric amounts: skip the per-row check entirely
    income = cur_df['流入金额'].values
    out = cur_df['流出金额'].values
    balance = cur_df['交易后余额'].values
    if not skip:
        for i in range(1, len(income)):
            try:
                if income[i] is not None and not pd.isna(income[i]) and income[i] != 0:
                    if abs(balance[i - 1] + income[i] - balance[i]) > error_tolerance:
                        invalid.append(i)
                elif out[i] is not None and not pd.isna(out[i]) and out[i] != 0:
                    # BUG FIX: was abs(balance[i-1] - out[i] != balance[i]),
                    # which compares a boolean (0/1) against the tolerance
                    # instead of the actual balance difference.
                    if abs(balance[i - 1] - out[i] - balance[i]) > error_tolerance:
                        invalid.append(i)
            except Exception as e:
                print(income, i)
                print(e)
                print(type(income[i]))
                print(income[i])
                exit(1)
    # Dates of every row whose recorded balance is inconsistent.
    invalid_dates = cur_df.loc[invalid]['交易日期'].values.tolist()
    print('ratio of invalid balance: ', len(invalid_dates) / len(income))
    return invalid_dates
def save_df(self):
    """Persist the generated DataFrame to the mapped_df cache, replacing any
    previously stored copy for the same (company, file, table, batch).
    """
    key = {
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'batch_id': self.batch_id
    }
    record = dict(key)
    record['data'] = self.generated_df.to_json(orient='columns',
                                               force_ascii=False)
    # Delete-then-insert so the cache never holds two copies of one sheet.
    if mongo.show_datas('mapped_df', key, 'Cache'):
        mongo.delete_datas(key, 'mapped_df', 'Cache')
    mongo.insert_data(record, 'mapped_df', 'Cache')
    print('batch_id is ', self.batch_id)
def mapping(self, rule_name):
    """Build the header mapping for this sheet from the base rules plus one
    named user-rule template.

    Fills self.reversed_mapping, prunes self.target_unmatched, and returns
    [target_unmatched, option_list, reversed_mapping].
    """
    try:
        self.user_rules = mongo.show_datas('user_rule', {
            'company': self.company,
            'rule_name': rule_name
        }, 'Mapping')[0]
    except:
        # No stored template under this name — start with an empty one.
        self.user_rules = {'company': self.company, 'rule_name': rule_name}
    self.target_unmatched = self.target_headers.copy()
    # Seed the mapping from the base rules.
    self.option_list.append('none')
    for key in self.option_list:
        if key in self.base_rules:  # a base rule already maps this source header
            val = self.base_rules[key]
            self.reversed_mapping[val] = key
            self.target_unmatched.remove(val)
    # Drop targets whose values were captured at upload time, outside the sheet.
    if self.self_name:
        self.target_unmatched.remove('本方名称')
    if self.self_account:
        self.target_unmatched.remove('本方账号')
    # Merge the user-rule template into the reverse mapping.
    self.reversed_mapping.update(
        self.user_rules)
    target_unmatched = []
    for i in self.target_unmatched:  # re-check each still-unmatched target
        # user_rules went into reversed_mapping but target_unmatched was not
        # updated; rebuild the list instead of removing while iterating
        # (removal during the for-loop would skip elements).
        if i not in self.reversed_mapping:
            target_unmatched.append(i)
    self.target_unmatched = target_unmatched
    print(self.target_unmatched, self.option_list, self.reversed_mapping)
    return [self.target_unmatched, self.option_list, self.reversed_mapping]
def mapping(self):
    """Build the header mapping and required-info state from the base rules,
    the user's rules, and the stored per-path 'necessary' document.

    Returns [target_unmatched, option_list, necessary_unmatched,
    necessary_info].
    """
    # Fetch base rule and rule summary from mongodb.
    self.base_rules_summary = mongo.show_datas('base_rule',
                                               {'type': 'rule_summary'},
                                               'mapping')[0]
    self.base_rules = mongo.show_datas('base_rule', {'type': 'base_rule'},
                                       'mapping')[0]
    try:
        self.user_rules = mongo.show_datas('user_rule', {
            'type': 'user_rule',
            'name': self.user_name
        }, 'mapping')[0]
    except:
        # No stored user rules yet — tag a fresh document.
        self.user_rules["type"] = "user_rule"
        self.user_rules['name'] = self.user_name
    try:
        self.necessary_info = \
            mongo.show_datas('necessary', {'type': 'necessary', 'path': self.output_path}, 'mapping')[0]
    except:
        self.necessary_info = {
            'type': 'necessary',
            'path': self.output_path,
        }
    # .copy() so the shared headers list is not mutated.
    self.target_unmatched = self.base_rules_summary['target_headers'].copy(
    )
    self.necessary_unmatched = self.necessary_items.copy()
    self.option_list.append('none')
    for key in self.option_list:
        if key in self.base_rules:  # a base rule already maps this source header
            val = self.base_rules[key]
            self.reversed_mapping[val] = key
            self.target_unmatched.remove(val)
    # Drop targets whose values were captured at upload time, outside the sheet.
    if self.self_name:
        self.target_unmatched.remove('本方名称')
    if self.self_account:
        self.target_unmatched.remove('本方账号')
    # Three steps: DB fills the sheet, sheet fills the DB, then find gaps.
    # Step 1: DB-side necessary info updates the sheet attributes.
    # NOTE(review): assumes every DB key other than type/path/_id is in
    # necessary_items — remove() raises ValueError otherwise; confirm.
    for key, val in self.necessary_info.items():
        if key not in ['type', 'path', '_id'] and val:
            exec('self.{} = "{}"'.format(key, val))
            self.necessary_unmatched.remove(key)
    # Step 2: sheet attributes update the DB document.
    for i in self.necessary_unmatched:
        # Values interpolated into the exec string must be quoted as str.
        exec('self.necessary_info["{}"] = self.{}'.format(i, i))
    # Step 3: determine which required fields are still unmatched.
    for i, val in self.necessary_info.items():
        if i in self.necessary_unmatched and val:
            self.necessary_unmatched.remove(i)
    mongo.delete_datas({
        'type': 'necessary',
        'path': self.output_path
    }, 'necessary', 'mapping')
    mongo.insert_data(self.necessary_info, 'necessary', 'mapping')
    # Build the reverse mapping: merge user rules over the base mapping.
    self.reversed_mapping.update(
        self.user_rules)
    target_unmatched = []
    for i in self.target_unmatched:  # re-check each still-unmatched target
        # user_rules went into reversed_mapping but target_unmatched was not
        # updated; rebuild the list instead of removing while iterating.
        if i not in self.reversed_mapping:
            target_unmatched.append(i)
    self.target_unmatched = target_unmatched
    return [
        self.target_unmatched, self.option_list, self.necessary_unmatched,
        self.necessary_info
    ]
def mapping(self):
    """Build the header mapping and required-info state from the base rules,
    the user's rules, and the stored per-path 'necessary' document.

    Variant that returns [target_unmatched, option_list,
    necessary_unmatched] (without necessary_info).
    """
    # Fetch base rule and rule summary from mongodb.
    self.base_rules_summary = mongo.show_datas('base_rule',
                                               {'type': 'rule_summary'},
                                               'mapping')[0]
    self.base_rules = mongo.show_datas('base_rule', {'type': 'base_rule'},
                                       'mapping')[0]
    try:
        self.user_rules = mongo.show_datas('user_rule', {
            'type': 'user_rule',
            'name': self.user_name
        }, 'mapping')[0]
    except:
        # No stored user rules yet — tag a fresh document.
        self.user_rules["type"] = "user_rule"
        self.user_rules['name'] = self.user_name
    try:
        self.necessary_info = mongo.show_datas('necessary', {
            'type': 'necessary',
            'path': self.output_path
        }, 'mapping')[0]
    except:
        self.necessary_info = {
            'type': 'necessary',
            'path': self.output_path,
        }
    # .copy() so the shared headers list is not mutated.
    self.target_unmatched = self.base_rules_summary['target_headers'].copy(
    )
    self.necessary_unmatched = self.necessary_items.copy()
    self.option_list.append('none')
    for key in self.option_list:
        if key in self.base_rules:  # a base rule already maps this source header
            val = self.base_rules[key]
            self.reversed_mapping[val] = key
            self.target_unmatched.remove(val)
    # Drop targets whose values were captured at upload time, outside the sheet.
    if self.self_name:
        self.target_unmatched.remove('本方名称')
    if self.self_account:
        self.target_unmatched.remove('本方账号')
    # Three steps: DB fills the sheet, sheet fills the DB, then find gaps.
    # Step 1: DB-side necessary info updates the sheet attributes.
    # NOTE(review): assumes every DB key other than type/path/_id is in
    # necessary_items — remove() raises ValueError otherwise; confirm.
    for key, val in self.necessary_info.items():
        if key not in ['type', 'path', '_id'] and val:
            exec('self.{} = "{}"'.format(key, val))
            self.necessary_unmatched.remove(key)
    # Step 2: sheet attributes update the DB document.
    for i in self.necessary_unmatched:
        # Values interpolated into the exec string must be quoted as str.
        exec('self.necessary_info["{}"] = self.{}'.format(i, i))
    # Step 3: determine which required fields are still unmatched.
    for i, val in self.necessary_info.items():
        if i in self.necessary_unmatched and val:
            self.necessary_unmatched.remove(i)
    print(self.necessary_info, self.necessary_unmatched)
    mongo.delete_datas({
        'type': 'necessary',
        'path': self.output_path
    }, 'necessary', 'mapping')
    mongo.insert_data(self.necessary_info, 'necessary', 'mapping')
    # Build the reverse mapping: merge user rules over the base mapping.
    self.reversed_mapping.update(
        self.user_rules)
    target_unmatched = []
    for i in self.target_unmatched:  # re-check each still-unmatched target
        # user_rules went into reversed_mapping but target_unmatched was not
        # updated; rebuild the list instead of removing while iterating.
        if i not in self.reversed_mapping:
            target_unmatched.append(i)
    self.target_unmatched = target_unmatched
    return [
        self.target_unmatched, self.option_list, self.necessary_unmatched
    ]
def info_extractor(self):
    """Locate the header row of the raw sheet and extract sheet-level info.

    Scans the raw DataFrame for header keywords, re-reads the Excel sheet
    with the detected header row, then extracts own name / bank / account /
    date range from the title, table name and first cell, and finally caches
    the resulting DataFrame as JSON in the `unmapped_df` collection.
    Returns True on success, False when no header row was found.
    """
    row_num_found = False
    row_num = 0
    keywords_dict = data.keywords_dict
    for index in self.raw_df.index:  # scan row by row for header keywords
        # Check whether row 0 (the column labels) already matches.
        cols = self.raw_df.columns.ravel().tolist()
        for i in cols:
            if i in keywords_dict['header_key']:
                row_num = 0
                row_num_found = True
                break
        if row_num_found:
            break
        for i in range(self.raw_df.shape[1]):  # header row must be located first
            cell = self.raw_df.loc[index].values[i]
            if cell in keywords_dict['header_key']:  # keyword marks the header position
                row_num = index + 1
                row_num_found = True
                break
            for key in keywords_dict:  # pre-header summary info cells
                if (cell in keywords_dict[key]):
                    print(cell)
                    # The value sits one cell to the right of the matched label.
                    exec('self.{} = self.raw_df.loc[index].values[i + 1]'.
                         format(key))
                    break
    if row_num_found:
        # Re-read the sheet with the detected header row (note: per-table!).
        self.target_df = pd.read_excel(
            self.file_path, sheet_name=self.table, header=row_num)
        cols = self.target_df.columns.ravel()
        unnamed = [i for i in cols if re.search(r'Unnamed.*', i)]
        for i in unnamed:
            self.target_df = self.target_df.drop(columns=i)
        self.option_list = self.target_df.columns.ravel().tolist(
        )  # header list
        self.transaction_num = self.target_df.shape[0]
    else:
        print('titles not found!')
        return False
    print(self.target_df)
    # Extract the company name from the title.
    self.name_mapping = data.name_mapping
    if not self.self_name:
        for name in self.name_mapping:
            if name in self.title:
                self.self_name = name
    # Extract the bank from the table name.
    if '银行' in self.table:
        self.self_bank = self.table
    # Extract the account number from the first cell.
    cell = self.raw_df.columns.ravel()[0]
    match = re.findall(r'(\d{16,19})', cell)
    if match:
        print('Found self account number: ', match[0])
        self.self_account = str(match[0])
    # Extract the date range from the title.
    if not self.start_date or not self.end_date:
        res = re.findall(r'(20[12]\d)(\d*)-?(\d*)', self.title)
        if res:
            res = res[0]
            if not res[1] and not res[2]:  # only a year matched
                self.start_date = res[0] + '0101'
                self.end_date = res[0] + '1231'
            elif not res[2]:
                self.start_date = res[0] + res[1] + '01'
                self.end_date = res[0] + res[1] + '30'
            elif len(res[1]) == 2:
                if len(res[2]) == 6:
                    self.start_date = res[0] + res[1] + '01'
                    self.end_date = res[2] + '30'
                if len(res[2]) == 2:
                    self.start_date = res[0] + res[1] + '01'
                    self.end_date = res[0] + res[2] + '30'
                if len(res[2]) == 1:
                    self.start_date = res[0] + res[1] + '01'
                    self.end_date = res[0] + '0' + res[2] + '30'
    # Store the extracted table as JSON in the cache.
    df_json = self.target_df.to_json(orient='columns', force_ascii=False)
    df_data = {
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'data': df_json
    }
    # NOTE(review): key is 'batchid' here but 'batch_id' elsewhere (e.g.
    # save_df) — likely a typo; confirm against the stored documents
    # before changing.
    query = {
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'batchid': self.batch_id
    }
    if mongo.show_datas('unmapped_df', query, 'Cache'):
        mongo.delete_datas(query, 'unmapped_df', 'Cache')
        mongo.insert_data(df_data, 'unmapped_df', 'Cache')
    else:
        mongo.insert_data(df_data, 'unmapped_df', 'Cache')
    return True