def update_rule(self, query):
    """Apply user-chosen header matches and persist them.

    Args:
        query: dict of the form ``{'target': 'option'}``. Each option must be
            present in ``self.option_list``.

    Returns:
        ``False`` (after printing an error) if any option is unknown, in which
        case nothing is modified; ``True`` otherwise.
    """
    # Validate everything up front so that one bad entry cannot leave the
    # in-memory rules and the database half-updated.
    for selected in query.values():
        if selected not in self.option_list:
            print('错误!不存在此选项')
            return False

    if not query:
        # Nothing to apply, so there is nothing to write back either.
        return True

    for key, selected in query.items():
        # A target that is being matched now is no longer "unmatched".
        if key in self.target_unmatched:
            self.target_unmatched.remove(key)
        self.user_rules[key] = selected
        # Keep the reverse mapping current; it is used later when the
        # output sheet is generated.
        self.reversed_mapping[key] = selected

    # Replace this user's stored rules once, after all entries are applied.
    mongo.delete_datas({'name': self.user_name}, 'user_rule', 'mapping')
    mongo.insert_data(self.user_rules, 'user_rule', 'mapping')
    return True
def rule_setup(self):
    """Rebuild the ``base_rule`` collection of the ``mapping`` database.

    The collection is dropped and two documents are written: a
    ``rule_summary`` document listing the target headers and summary fields,
    and a ``base_rule`` document mapping source column names to target
    headers.
    """
    collection, database = 'base_rule', 'mapping'

    # Start from an empty collection on every run.
    mongo.delete_col(collection, database)

    summary_doc = {
        'type': 'rule_summary',
        'target_headers': [
            '交易日期',
            '交易时间',
            '本方名称',
            '本方账号',
            '本方银行',
            '对方名称',
            '对方账号',
            '交易类型',
            '摘要',
            '流入金额',
            '流出金额',
            '交易后余额',
            '系统分类',
        ],
        'target_summary': [
            '开始日期',
            '结束日期',
            '货币种类',
            '流水条数',
            '流入总额',
            '流出总额',
        ],
    }
    mongo.insert_datas([summary_doc], collection, database)

    # Many-to-one is allowed: once a sheet is matched the mapping is reversed,
    # which yields a one-to-one mapping, so the entries do not conflict.
    header_rules = {
        'type': 'base_rule',
        '交易日': '交易日期',
        '交易时间': '交易时间',
        '收/付方名称': '对方名称',
        '收/付方帐号': '对方账号',
        '交易类型': '交易类型',
        '摘要': '摘要',
        '备注': '摘要',
        '贷方金额': '流入金额',
        '借方金额': '流出金额',
        '余额': '交易后余额',
        '收取金额': '流入金额',
        '汇入金额': '流入金额',
        '汇出金额': '流出金额',
        '支出金额': '流出金额',
        '账户余额': '交易后余额',
        '对方户名': '对方名称',
        '对方账号': '对方账号',
    }
    mongo.insert_data(header_rules, collection, database)
def database_input(self):
    """Store a summary record for the generated file in the ``mapping`` database.

    The collection is chosen from the company name: its entry in
    ``data.name_mapping`` when there is one, ``'temp'`` when no name is known,
    otherwise the name itself. Any earlier record for the same output path is
    deleted before the new one is inserted.
    """
    self.name_mapping = data.name_mapping

    # Fall back to the file title when no company name is known yet.
    if not self.self_name:
        for name in self.name_mapping:
            if name in self.title:
                # No break: the last matching name wins.
                self.self_name = name

    if self.self_name in self.name_mapping:
        comp_id = self.name_mapping[self.self_name]
    elif not self.self_name:
        comp_id = 'temp'
    else:
        comp_id = self.self_name

    mongo.delete_datas({'path': self.output_path}, comp_id, 'mapping')

    # Transactions are rows, so count shape[0]; shape[1] is the number of
    # columns and previously overwrote the row count with a wrong value.
    self.transaction_num = self.target_df.shape[0]

    info = {
        'type': 'form',
        'path': self.output_path,
        'company_name': self.self_name,
        'dates': [self.start_date, self.end_date],
        'account': self.self_account,
        'currency': self.currency,
        'gen_date': self.gen_date,
        'transactions_num': self.transaction_num,
    }
    # NOTE(review): misspelled duplicate of 'transactions_num'. It is kept
    # because existing readers may rely on it - confirm, then remove.
    info['transctions_num'] = self.transaction_num
    mongo.insert_data(info, comp_id, 'mapping')
def save_info(self, batch_id):
    """Merge this sheet's metadata with its stored record and persist it.

    ``self.info`` is built from the instance attributes. When a ``sheet_info``
    record already exists for the same company/file/table/batch, its values
    fill every field that is missing or ``''`` in ``self.info`` (non-empty
    sheet values are never overwritten) and the record is updated. Otherwise
    a new record is inserted.

    Args:
        batch_id: identifier of the upload batch this sheet belongs to.

    Returns:
        ``[necessary_unmatched, info]``, where ``necessary_unmatched`` lists
        the entries of ``self.necessary_items`` that are still missing or
        ``''`` in ``info``.
    """
    self.info = {
        # TODO: change how the analyser writes dates and account.
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'batch_id': batch_id,
        'self_name': self.self_name,
        'self_account': self.self_account,
        'self_bank': self.self_bank,
        'currency': self.currency,
        'start_date': self.start_date,
        'end_date': self.end_date,
        'gen_date': self.gen_date,
        'transactions_num': self.transaction_num,
        'init_balance': self.init_balance,
    }
    query = {
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'batch_id': batch_id,
    }

    # One lookup serves both the merge and the update-vs-insert decision.
    # Assumes show_datas returns a list-like of documents; the original relied
    # on the same truthiness check - confirm against the mongo wrapper.
    records = mongo.show_datas('sheet_info', query, 'Info')
    if records:
        # Stored values only fill gaps; data already on the sheet is kept.
        for field, stored in records[0].items():
            if field == '_id':
                continue
            if field not in self.info or self.info[field] == '':
                self.info[field] = stored
        mongo.update_datas(query, {'$set': self.info}, 'sheet_info', 'Info')
    else:
        # Insert a copy: if the wrapper forwards to PyMongo's insert, the
        # inserted dict gains an '_id' that must not leak into the result.
        mongo.insert_data(dict(self.info), 'sheet_info', 'Info')

    self.necessary_unmatched = [
        item for item in self.necessary_items
        if item not in self.info or self.info[item] == ''
    ]
    return [self.necessary_unmatched, self.info]
def rule_setup(self):
    """Rebuild the ``base_rule`` collection from the tables in ``data``.

    Drops the collection, then writes a ``rule_summary`` document built from
    ``data.target_headers`` and ``data.target_summary``, followed by
    ``data.mapping_rules``.
    """
    collection, database = 'base_rule', 'mapping'

    # Start from an empty collection on every run.
    mongo.delete_col(collection, database)

    summary_doc = {
        'type': 'rule_summary',
        'target_headers': data.target_headers,
        'target_summary': data.target_summary,
    }
    mongo.insert_datas([summary_doc], collection, database)
    mongo.insert_data(data.mapping_rules, collection, database)
def add_rules(request, company, rule_name):
    """Merge ``request`` into the stored rule set for a company and rule name.

    The existing ``user_rule`` record (if any) is loaded, updated with the
    entries of ``request`` and written back in place of the old one. When no
    record exists yet, a new one is created from the identifying fields.

    Args:
        request: dict of rule entries to add or overwrite.
        company: company the rule set belongs to.
        rule_name: name of the rule set.

    Returns:
        A confirmation string that echoes ``request``.
    """
    query = {'company': company, 'rule_name': rule_name}
    try:
        # Only the lookup is guarded. The wrapper's return type is not visible
        # here, so the usual "nothing at index 0" errors all mean "no record".
        user_rules = mongo.show_datas('user_rule', query, 'Mapping')[0]
    except (IndexError, KeyError, TypeError):
        user_rules = dict(query)
    else:
        # Delete only after a successful fetch, and outside the guard: a
        # failed delete used to discard the fetched rules silently.
        mongo.delete_datas(query, 'user_rule', 'Mapping')

    user_rules.update(request)
    mongo.insert_data(user_rules, 'user_rule', 'Mapping')
    print(user_rules)
    return 'success update ' + str(request)
def main_mg(company, batch_id):
    """Label every cached mapped sheet of a batch and store it back.

    Each ``mapped_df`` record of the given company and batch is loaded from
    the ``Cache`` database, passed through ``process_file`` with the rule
    maps from ``get_rules(rulePath)``, and re-stored with the labelled data.

    Args:
        company: company whose cached sheets are processed.
        batch_id: upload batch to process.
    """
    in_map, out_map = get_rules(rulePath)

    # Remove placeholder keys one by one. Sharing a single try block meant
    # the first missing key skipped the remaining removals.
    for rule_map, placeholder in ((in_map, 'nan'), (out_map, ''), (out_map, 'nan')):
        try:
            del rule_map[placeholder]
        except Exception as e:
            print(e)

    datas = mg.show_datas(
        'mapped_df',
        query={'company': company, 'batch_id': batch_id},
        db='Cache',
    )
    for data in datas:
        cur_df = pd.read_json(data['data'])
        labeled_df = process_file(
            cur_df, '', in_map, out_map, show_plot=False, write_excel=False)
        data['data'] = labeled_df.to_json(orient='columns', force_ascii=False)

        # Replace the cached record with the labelled version.
        mg.delete_datas(
            {'batch_id': batch_id, 'file': data['file'], 'table': data['table']},
            'mapped_df', 'Cache')
        mg.insert_data(data, 'mapped_df', 'Cache')
def add_rules(query, user):
    """Merge ``query`` into the stored mapping rules of ``user``.

    Args:
        query: dict of rule entries to add or overwrite.
        user: name identifying the rule owner.

    Returns:
        The string ``'success'``.
    """
    try:
        # The wrapper's return type is not visible here, so the usual
        # "nothing at index 0" errors are all treated as "no record yet".
        user_rules = mongo.show_datas('user_rule', {
            'type': 'user_rule',
            'name': user
        }, 'mapping')[0]
    except (IndexError, KeyError, TypeError):
        # The type must match the lookup filter above. It used to be
        # "user_rules", so a freshly created record was never found again
        # and earlier rules were lost on the next call.
        user_rules = {'type': 'user_rule', 'name': user}
        print('no user rules yet.')

    user_rules.update(query)
    # Replace the user's previous record with the merged one.
    mongo.delete_datas({'name': user}, 'user_rule', 'mapping')
    mongo.insert_data(user_rules, 'user_rule', 'mapping')
    return 'success'
def add_stats(query, path):
    """Merge ``query`` into the stored ``necessary`` record for ``path``.

    Args:
        query: dict of fields to add or overwrite.
        path: output path identifying the record.

    Returns:
        The string ``'success'``.
    """
    selector = {'type': 'necessary', 'path': path}
    try:
        # The wrapper's return type is not visible here, so the usual
        # "nothing at index 0" errors are all treated as "no record yet".
        # Other failures are no longer swallowed by a bare except.
        necc_info = mongo.show_datas('necessary', dict(selector), 'mapping')[0]
    except (IndexError, KeyError, TypeError):
        necc_info = dict(selector)

    necc_info.update(query)
    # Replace the previous record for this path with the merged one.
    mongo.delete_datas(dict(selector), 'necessary', 'mapping')
    mongo.insert_data(necc_info, 'necessary', 'mapping')
    return 'success'
def add_stats(request, company, file, table, batch_id):
    """Merge ``request`` into the stored ``sheet_info`` record of one sheet.

    The record is identified by company, file, table and batch. It is loaded
    if present, updated with ``request`` and written back in place of the old
    one. When none exists, a new record is created from the identifying
    fields.

    Args:
        request: dict of fields to add or overwrite.
        company: company the sheet belongs to.
        file: file name of the workbook.
        table: sheet name inside the workbook.
        batch_id: upload batch identifier.

    Returns:
        A confirmation string that echoes ``request``.
    """
    query = {
        'company': company,
        'file': file,
        'table': table,
        'batch_id': batch_id
    }
    try:
        # Only the lookup is guarded. The wrapper's return type is not visible
        # here, so the usual "nothing at index 0" errors all mean "no record".
        necc_info = mongo.show_datas('sheet_info', query, 'Info')[0]
    except (IndexError, KeyError, TypeError):
        necc_info = dict(query)
    else:
        # Delete only after a successful fetch, and outside the guard: a
        # failed delete used to discard the fetched record silently.
        mongo.delete_datas(query, 'sheet_info', 'Info')

    necc_info.update(request)
    mongo.insert_data(necc_info, 'sheet_info', 'Info')
    print(necc_info)
    return 'success update ' + str(request)
def add_stats(query, path):
    """Merge ``query`` into the stored ``necessary`` record for ``path``.

    Args:
        query: dict of fields to add or overwrite.
        path: output path identifying the record.

    Returns:
        The string ``'success'``.
    """
    selector = {'type': 'necessary', 'path': path}
    try:
        # The wrapper's return type is not visible here, so the usual
        # "nothing at index 0" errors are all treated as "no record yet".
        # Other failures are no longer swallowed by a bare except.
        necc_info = mongo.show_datas('necessary', dict(selector), 'mapping')[0]
    except (IndexError, KeyError, TypeError):
        necc_info = dict(selector)

    necc_info.update(query)
    # Replace the previous record for this path with the merged one.
    mongo.delete_datas(dict(selector), 'necessary', 'mapping')
    mongo.insert_data(necc_info, 'necessary', 'mapping')
    return 'success'
def database_input(self):
    """Store a summary record for the generated file in the ``mapping`` database.

    The collection is chosen from the company name: its entry in the name
    table below when there is one, ``'temp'`` when no name is known,
    otherwise the name itself. Any earlier record for the same output path is
    deleted before the new one is inserted.
    """
    # TODO: consider deriving the collection name from the pinyin of the
    # first four characters of the company name.
    self.name_mapping = {
        '上海爱钛技术咨询有限公司': 'aitai',
        '宜昌华昊新材料科技有限公司': 'huahao',
        '浙江亿控自动化设备有限公司': 'yikong',
        '爱钛': 'aitai',
        '华昊': 'huahao',
        '亿控': 'yikong',
        '同普': 'tongpu',
    }

    # Fall back to the file title when no company name is known yet.
    if not self.self_name:
        for name in self.name_mapping:
            if name in self.title:
                # No break: the last matching name wins.
                self.self_name = name

    if self.self_name in self.name_mapping:
        comp_id = self.name_mapping[self.self_name]
    elif not self.self_name:
        comp_id = 'temp'
    else:
        comp_id = self.self_name
    print(comp_id)

    mongo.delete_datas({'path': self.output_path}, comp_id, 'mapping')
    print(self.raw_df)
    print(self.target_df)

    info = {
        'type': 'form',
        'path': self.output_path,
        'company_name': self.self_name,
        'dates': [self.start_date, self.end_date],
        'account': self.self_account,
        'currency': self.currency,
        'gen_date': self.gen_date,
        # Transactions are rows, so count shape[0]; shape[1] is the number
        # of columns.
        'transactions_num': self.target_df.shape[0],
    }
    mongo.insert_data(info, comp_id, 'mapping')
def save_df(self):
    """Cache the generated DataFrame as JSON in ``Cache.mapped_df``.

    The record is keyed by company, file, table and batch id. An existing
    record with the same key is deleted before the new one is inserted.
    """
    collection, database = 'mapped_df', 'Cache'

    df_json = self.generated_df.to_json(orient='columns', force_ascii=False)
    query = {
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'batch_id': self.batch_id,
    }
    # The stored document is the key fields plus the serialised frame.
    df_data = dict(query, data=df_json)

    # Insert happens either way; only the delete depends on a prior record.
    if mongo.show_datas(collection, query, database):
        mongo.delete_datas(query, collection, database)
    mongo.insert_data(df_data, collection, database)

    print('batch_id is ', self.batch_id)
def mapping(self):
    """Match the sheet's headers against the stored rules.

    Loads the base rules, the user's rules and the stored ``necessary`` record
    from the ``mapping`` database, builds ``self.reversed_mapping`` (target
    header -> sheet header), reconciles the necessary fields between the
    instance and the database, and writes the ``necessary`` record back.

    Returns:
        ``[target_unmatched, option_list, necessary_unmatched]``.
    """
    # The wrapper's return type is not visible here, so the usual "nothing at
    # index 0" errors are all treated as "no record stored yet".
    lookup_errors = (IndexError, KeyError, TypeError)

    self.base_rules_summary = mongo.show_datas(
        'base_rule', {'type': 'rule_summary'}, 'mapping')[0]
    self.base_rules = mongo.show_datas(
        'base_rule', {'type': 'base_rule'}, 'mapping')[0]

    try:
        self.user_rules = mongo.show_datas('user_rule', {
            'type': 'user_rule',
            'name': self.user_name
        }, 'mapping')[0]
    except lookup_errors:
        # Assumes self.user_rules was initialised as a dict elsewhere.
        self.user_rules["type"] = "user_rule"
        self.user_rules['name'] = self.user_name

    try:
        self.necessary_info = mongo.show_datas('necessary', {
            'type': 'necessary',
            'path': self.output_path
        }, 'mapping')[0]
    except lookup_errors:
        self.necessary_info = {
            'type': 'necessary',
            'path': self.output_path,
        }

    # Work on copies so the shared header and item lists are not modified.
    self.target_unmatched = self.base_rules_summary['target_headers'].copy()
    self.necessary_unmatched = self.necessary_items.copy()

    # 'none' is the empty choice; add it once even if mapping() runs again.
    if 'none' not in self.option_list:
        self.option_list.append('none')

    for key in self.option_list:
        if key in self.base_rules:  # a base rule matches this sheet header
            val = self.base_rules[key]
            self.reversed_mapping[val] = key
            # Rules are many-to-one, so the target may already be removed.
            if val in self.target_unmatched:
                self.target_unmatched.remove(val)

    # Targets already supplied by information found alongside the sheet.
    if self.self_name and '本方名称' in self.target_unmatched:
        self.target_unmatched.remove('本方名称')
    if self.self_account and '本方账号' in self.target_unmatched:
        self.target_unmatched.remove('本方账号')

    # Step 1: stored values overwrite the instance attributes. Only known
    # necessary items are accepted, so database content cannot set arbitrary
    # attributes (this replaces exec on a formatted string).
    for key, val in self.necessary_info.items():
        if key in ['type', 'path', '_id'] or not val:
            continue
        if key in self.necessary_unmatched:
            setattr(self, key, str(val))
            self.necessary_unmatched.remove(key)

    # Step 2: instance values fill the items the database did not provide.
    for item in self.necessary_unmatched:
        self.necessary_info[item] = getattr(self, item)

    # Step 3: whatever is still empty remains unmatched.
    self.necessary_unmatched = [
        item for item in self.necessary_unmatched
        if not self.necessary_info.get(item)
    ]
    print(self.necessary_info, self.necessary_unmatched)

    mongo.delete_datas({
        'type': 'necessary',
        'path': self.output_path
    }, 'necessary', 'mapping')
    mongo.insert_data(self.necessary_info, 'necessary', 'mapping')

    # User rules override the matches derived from the base rules.
    self.reversed_mapping.update(self.user_rules)

    # User rules were merged after target_unmatched was built, so filter it
    # again against the final mapping.
    self.target_unmatched = [
        target for target in self.target_unmatched
        if target not in self.reversed_mapping
    ]
    return [
        self.target_unmatched, self.option_list, self.necessary_unmatched
    ]
def info_extractor(self):
    """Locate the header row, load the table and extract sheet metadata.

    Scans ``self.raw_df`` for a cell listed in
    ``data.keywords_dict['header_key']``. Cells above the header that match
    another keyword group store the value to their right on the attribute
    named after that group. The sheet is then re-read with the detected
    header row, ``Unnamed`` columns are dropped, and name, bank, account and
    date range are derived from the title, sheet name and first column label.
    Finally the table is cached as JSON in ``Cache.unmapped_df``.

    Returns:
        ``True`` on success, ``False`` when no header row was found.
    """
    keywords_dict = data.keywords_dict
    header_keys = keywords_dict['header_key']
    n_cols = self.raw_df.shape[1]
    row_num = 0
    row_num_found = False

    # The column labels may already be the header row (loop-invariant check).
    header_in_columns = any(
        label in header_keys for label in self.raw_df.columns.tolist())

    for index in self.raw_df.index:
        if header_in_columns:
            row_num = 0
            row_num_found = True
            break

        row = self.raw_df.loc[index].values
        for i in range(n_cols):
            cell = row[i]
            if cell in header_keys:
                # raw_df row `index` is sheet row `index + 1`, because the
                # first sheet row was consumed as column labels.
                # Assumes a default integer index - TODO confirm.
                row_num = index + 1
                row_num_found = True
                break
            for key in keywords_dict:
                if cell in keywords_dict[key]:
                    print(cell)
                    # The value sits in the cell to the right. Skip it when
                    # the keyword is in the last column instead of raising.
                    if i + 1 < n_cols:
                        setattr(self, key, row[i + 1])
                    break
        if row_num_found:
            break

    if not row_num_found:
        print('titles not found!')
        return False

    # Re-read the sheet with the detected header row.
    self.target_df = pd.read_excel(
        self.file_path, sheet_name=self.table, header=row_num)
    # Labels may be non-strings, so convert before matching.
    unnamed = [
        label for label in self.target_df.columns
        if re.search(r'Unnamed.*', str(label))
    ]
    self.target_df = self.target_df.drop(columns=unnamed)
    self.option_list = self.target_df.columns.tolist()
    self.transaction_num = self.target_df.shape[0]
    print(self.target_df)

    # Company name from the file title (the last matching name wins).
    self.name_mapping = data.name_mapping
    if not self.self_name:
        for name in self.name_mapping:
            if name in self.title:
                self.self_name = name

    # Bank from the sheet name.
    if '银行' in self.table:
        self.self_bank = self.table

    # Account number from the first column label.
    match = re.findall(r'(\d{16,19})', str(self.raw_df.columns[0]))
    if match:
        print('Found self account number: ', match[0])
        self.self_account = str(match[0])

    # Date range from the file title.
    # NOTE(review): month ends are always written as day '30' - confirm
    # whether downstream code needs real calendar month ends.
    if not self.start_date or not self.end_date:
        res = re.findall(r'(20[12]\d)(\d*)-?(\d*)', self.title)
        if res:
            year, first, second = res[0]
            if not first and not second:  # only a year was found
                self.start_date = year + '0101'
                self.end_date = year + '1231'
            elif not second:
                self.start_date = year + first + '01'
                self.end_date = year + first + '30'
            elif len(first) == 2:
                if len(second) == 6:
                    self.start_date = year + first + '01'
                    self.end_date = second + '30'
                if len(second) == 2:
                    self.start_date = year + first + '01'
                    self.end_date = year + second + '30'
                if len(second) == 1:
                    self.start_date = year + first + '01'
                    self.end_date = year + '0' + second + '30'

    # Cache the table as JSON. The lookup key used to be spelled 'batchid'
    # and the stored document had no batch field at all, so the existence
    # check never matched and old entries were never replaced.
    df_json = self.target_df.to_json(orient='columns', force_ascii=False)
    query = {
        'company': self.company,
        'file': self.title,
        'table': self.table,
        'batch_id': self.batch_id,
    }
    df_data = dict(query, data=df_json)
    if mongo.show_datas('unmapped_df', query, 'Cache'):
        mongo.delete_datas(query, 'unmapped_df', 'Cache')
    mongo.insert_data(df_data, 'unmapped_df', 'Cache')
    return True
def mapping(self):
    """Match the sheet's headers against the stored rules.

    Loads the base rules, the user's rules and the stored ``necessary`` record
    from the ``mapping`` database, builds ``self.reversed_mapping`` (target
    header -> sheet header), reconciles the necessary fields between the
    instance and the database, and writes the ``necessary`` record back.

    Returns:
        ``[target_unmatched, option_list, necessary_unmatched,
        necessary_info]``.
    """
    # The wrapper's return type is not visible here, so the usual "nothing at
    # index 0" errors are all treated as "no record stored yet".
    lookup_errors = (IndexError, KeyError, TypeError)

    self.base_rules_summary = mongo.show_datas(
        'base_rule', {'type': 'rule_summary'}, 'mapping')[0]
    self.base_rules = mongo.show_datas(
        'base_rule', {'type': 'base_rule'}, 'mapping')[0]

    try:
        self.user_rules = mongo.show_datas('user_rule', {
            'type': 'user_rule',
            'name': self.user_name
        }, 'mapping')[0]
    except lookup_errors:
        # Assumes self.user_rules was initialised as a dict elsewhere.
        self.user_rules["type"] = "user_rule"
        self.user_rules['name'] = self.user_name

    try:
        self.necessary_info = mongo.show_datas('necessary', {
            'type': 'necessary',
            'path': self.output_path
        }, 'mapping')[0]
    except lookup_errors:
        self.necessary_info = {
            'type': 'necessary',
            'path': self.output_path,
        }

    # Work on copies so the shared header and item lists are not modified.
    self.target_unmatched = self.base_rules_summary['target_headers'].copy()
    self.necessary_unmatched = self.necessary_items.copy()

    # 'none' is the empty choice; add it once even if mapping() runs again.
    if 'none' not in self.option_list:
        self.option_list.append('none')

    for key in self.option_list:
        if key in self.base_rules:  # a base rule matches this sheet header
            val = self.base_rules[key]
            self.reversed_mapping[val] = key
            # Rules are many-to-one, so the target may already be removed.
            if val in self.target_unmatched:
                self.target_unmatched.remove(val)

    # Targets already supplied by information found alongside the sheet.
    if self.self_name and '本方名称' in self.target_unmatched:
        self.target_unmatched.remove('本方名称')
    if self.self_account and '本方账号' in self.target_unmatched:
        self.target_unmatched.remove('本方账号')

    # Step 1: stored values overwrite the instance attributes. Only known
    # necessary items are accepted, so database content cannot set arbitrary
    # attributes (this replaces exec on a formatted string).
    for key, val in self.necessary_info.items():
        if key in ['type', 'path', '_id'] or not val:
            continue
        if key in self.necessary_unmatched:
            setattr(self, key, str(val))
            self.necessary_unmatched.remove(key)

    # Step 2: instance values fill the items the database did not provide.
    for item in self.necessary_unmatched:
        self.necessary_info[item] = getattr(self, item)

    # Step 3: whatever is still empty remains unmatched.
    self.necessary_unmatched = [
        item for item in self.necessary_unmatched
        if not self.necessary_info.get(item)
    ]

    mongo.delete_datas({
        'type': 'necessary',
        'path': self.output_path
    }, 'necessary', 'mapping')
    mongo.insert_data(self.necessary_info, 'necessary', 'mapping')

    # User rules override the matches derived from the base rules.
    self.reversed_mapping.update(self.user_rules)

    # User rules were merged after target_unmatched was built, so filter it
    # again against the final mapping.
    self.target_unmatched = [
        target for target in self.target_unmatched
        if target not in self.reversed_mapping
    ]
    return [
        self.target_unmatched, self.option_list, self.necessary_unmatched,
        self.necessary_info
    ]