def calculation_A(company, batch_id, file_output):
    """Combine all cached sheets of a batch into balance and monthly flow series.

    Every ``sheet_info`` record matching ``company`` and ``batch_id`` is paired
    with its ``mapped_df`` cache entry. Each cached frame is passed through
    ``expand_date`` together with the sheet's ``start_date`` and ``end_date``,
    and the frames are then added element-wise with missing cells counted as 0.

    Args:
        company: Company name used to select the sheet records.
        batch_id: Batch identifier shared by the sheet records and the cache.
        file_output: Not used by this function; kept so existing callers work.

    Returns:
        tuple: ``(balance_all, in_all, out_all)`` -- the summed ``balance``
        column and the monthly sums of the ``in`` and ``out`` columns.
    """

    def _monthly_sum(series):
        """Sum ``series`` per month, labelling each bin one day after its left edge."""
        # ``resample(loffset=...)`` was deprecated in pandas 1.1 and removed in
        # 2.0; shifting the resulting index is the documented replacement and
        # gives the same labels on older versions too.
        monthly = series.resample('1M', label='left', closed='left').sum()
        monthly.index = monthly.index + datetime.timedelta(days=1)
        return monthly

    infos = mongo.show_datas('sheet_info',
                             query={
                                 'company': company,
                                 'batch_id': batch_id
                             },
                             db='Info')
    dfs = []
    for info in infos:
        print(info['file'], info['table'])
        data = mongo.show_datas('mapped_df', {
            'file': info['file'],
            'table': info['table'],
            'batch_id': batch_id
        }, 'Cache')
        # data[0] fails if the sheet has no cache entry, exactly as before.
        cur_df = pd.read_json(data[0]['data'])
        dfs.append(expand_date(cur_df, info['start_date'], info['end_date']))

    # Start from an empty frame so the three expected columns always exist.
    all_df = pd.DataFrame(columns=['in', 'out', 'balance'])
    for df in dfs:
        all_df = all_df.add(df, fill_value=0)
    print(all_df)

    balance_all = all_df['balance']
    in_all = _monthly_sum(all_df['in'])
    out_all = _monthly_sum(all_df['out'])
    print(balance_all, in_all, out_all, sep='\n')
    return balance_all, in_all, out_all
    def save_info(self, batch_id):
        """Synchronise this sheet's metadata with the ``sheet_info`` collection.

        The metadata dict is rebuilt from the instance attributes. If a stored
        record exists for the same company/file/table/batch, its values fill in
        every key that is missing or empty locally (values already present on
        the instance are never overwritten) and the merged dict is written back
        with ``$set``. Otherwise the dict is inserted as a new record.

        Args:
            batch_id: Batch identifier stored with, and used to look up, the record.

        Returns:
            list: ``[necessary_unmatched, info]`` -- the names from
            ``self.necessary_items`` that are still missing or empty, and the
            merged metadata dict.
        """
        self.info = {
            # TODO: change how the analyser writes dates / account
            'company': self.company,
            'file': self.title,
            'table': self.table,
            'batch_id': batch_id,
            'self_name': self.self_name,
            'self_account': self.self_account,
            'self_bank': self.self_bank,
            'currency': self.currency,
            'start_date': self.start_date,
            'end_date': self.end_date,
            'gen_date': self.gen_date,
            'transactions_num': self.transaction_num,
            'init_balance': self.init_balance,
        }
        query = {
            'company': self.company,
            'file': self.title,
            'table': self.table,
            'batch_id': batch_id
        }

        # One lookup decides both the merge and update-vs-insert. Previously a
        # bare ``except: pass`` hid every error in the merge and the collection
        # was queried a second time.
        stored = mongo.show_datas('sheet_info', query, 'Info')
        if stored:
            # Database values only fill gaps; local values win.
            for key, value in stored[0].items():
                if key not in self.info or self.info[key] == '':
                    self.info[key] = value
            # The stored document's id must not be part of the $set payload.
            self.info.pop('_id', None)
            mongo.update_datas(query, {'$set': self.info}, 'sheet_info',
                               'Info')
        else:
            mongo.insert_data(self.info, 'sheet_info', 'Info')

        # Required items that are still missing after the merge.
        self.necessary_unmatched = [
            item for item in self.necessary_items
            if item not in self.info or self.info[item] == ''
        ]

        return [self.necessary_unmatched, self.info]
    def get_infos(self):
        """Load this company's ``sheet_info`` records into the instance lists.

        For every record the ``[file, table]`` pair is appended to
        ``self.file_paths`` and ``self.path2account`` is keyed by
        ``file + table``. Date ranges and own-account numbers are collected
        only when they are non-empty. Afterwards every complete date pair in
        ``self.dates`` is converted to integers in place.

        Returns:
            bool: Always ``True``.
        """
        records = mongo.show_datas('sheet_info', {'company': self.company}, 'Info')

        for record in records:
            self.file_paths.append([record['file'], record['table']])
            if record['start_date'] and record['end_date']:
                self.dates.append([record['start_date'], record['end_date']])
            own_account = record['self_account']
            if own_account:
                self.self_accounts.append(own_account)
            self.path2account[record['file'] + record['table']] = own_account

        # Convert the date strings to int; incomplete pairs are left untouched.
        for span in self.dates:
            if span[0] and span[1]:
                span[0] = int(span[0])
                span[1] = int(span[1])

        print(self.file_paths)
        print(self.dates)
        print(self.self_accounts)
        return True
 def benford_check(self, file_path):
     """Run ``md.benford`` over the inflow and outflow amounts of one cached sheet.

     Args:
         file_path: ``[file, table]`` pair identifying the ``mapped_df`` entry;
             the last matching cache entry is used.

     Returns:
         ``(res[0], sample_count)`` where ``res`` is the value returned by
         ``md.benford``, or the string ``'benford_check failed'`` when the
         amount columns cannot be converted to ``int``.
     """
     cached = mongo.show_datas('mapped_df', {'file': file_path[0], 'table': file_path[1]}, 'Cache')
     frame = pd.read_json(cached[-1]['data'])
     inflow = frame['流入金额'].values
     outflow = frame['流出金额'].values

     # The integer conversion acts as a validity gate for both columns.
     # NOTE(review): astype(int) raises on NaN, so a sheet that still contains
     # NaN amounts exits here -- confirm NaNs are filled upstream.
     try:
         frame['流入金额'] = frame['流入金额'].astype(int)
         frame['流出金额'] = frame['流出金额'].astype(int)
     except Exception as e:
         print(e)
         print('failed to convert datatype to in for income and out money')
         return 'benford_check failed'

     # Pool every non-NaN amount: inflows first, then outflows.
     samples = [value for value in inflow if not np.isnan(value)]
     samples += [value for value in outflow if not np.isnan(value)]

     res = md.benford(samples)
     print('benford coefficient: ', res[0])
     print('total samples: ', len(samples))
     return res[0], len(samples)
    def cross_validation(self):
        """Find transfers between the company's own accounts that lack a counterpart.

        The cached sheet of every path in ``self.file_paths`` is loaded and
        keyed by its own account number. For each own account, every row whose
        counterparty name equals ``self.company`` is treated as an internal
        transfer. Such a transfer is matched when the counterparty account's
        sheet has a row on the same date that points back to the originating
        account and carries the mirrored amount (its inflow equals this
        outflow, or its outflow equals this inflow).

        Returns:
            list: The unmatched transfer rows (one pandas Series per row).
            Transfers to accounts without a loaded sheet are reported on
            stdout and skipped.
        """
        # Load the sheet of every known own account first.
        account2df = {}
        for path in self.file_paths:
            datas = mongo.show_datas('mapped_df', {'file': path[0], 'table': path[1]}, 'Cache')
            account2df[self.path2account[path[0] + path[1]]] = pd.read_json(datas[-1]['data'])

        # Collect, per own account, the rows that transfer to the company itself.
        account2trans = {}
        for account in self.self_accounts:
            cur_df = account2df[account]
            for index in cur_df.index:
                if cur_df.loc[index, '对方名称'] == self.company:
                    account2trans.setdefault(account, []).append(cur_df.loc[index])

        unmatched_trans = []
        for from_acc, trans in account2trans.items():
            for tran in trans:
                tran_date = tran.loc['交易日期']
                tran_in = tran.loc['流入金额']
                tran_out = tran.loc['流出金额']
                out_acc = tran.loc['对方账号']
                if out_acc not in account2df:
                    print('not existed account: ', out_acc)
                    continue
                to_df = account2df[out_acc]

                # Bug fix: search the counterparty's sheet. The old code scanned
                # ``cur_df``, a leftover from the loop above that always held
                # the sheet of the last own account.
                matched = False
                for index in to_df.index:
                    if to_df.loc[index, '对方账号'] == from_acc and to_df.loc[index, '交易日期'] == tran_date:
                        if to_df.loc[index, '流入金额'] == tran_out or to_df.loc[index, '流出金额'] == tran_in:
                            print('Get one matched transaction.', from_acc, out_acc)
                            matched = True
                            break
                if not matched:
                    print('---- not matched!----\n', tran)
                    unmatched_trans.append(tran)

        return unmatched_trans
def add_rules(request, company, rule_name):
    """Merge ``request`` into the stored rule template of a company.

    The existing ``user_rule`` document for ``company``/``rule_name`` (if any)
    is removed and re-inserted with ``request`` merged in. When none exists, a
    new document containing the two key fields plus ``request`` is inserted.

    Args:
        request: Mapping of rule entries to add or overwrite.
        company: Company the template belongs to.
        rule_name: Name of the template.

    Returns:
        str: ``'success update '`` followed by ``str(request)``.
    """
    query = {'company': company, 'rule_name': rule_name}
    # Test for "no template yet" explicitly. The former bare ``except`` also
    # covered the delete, so a failed delete silently discarded the stored
    # rules and inserted a fresh document beside the old one.
    existing = mongo.show_datas('user_rule', query, 'Mapping')
    if existing:
        user_rules = existing[0]
        mongo.delete_datas(query, 'user_rule', 'Mapping')
    else:
        user_rules = dict(query)
    user_rules.update(request)
    mongo.insert_data(user_rules, 'user_rule', 'Mapping')
    print(user_rules)
    return 'success update ' + str(request)
    def manual_mapping(self):
        """Interactively resolve unmatched target headers and required items.

        While ``self.target_unmatched`` is non-empty the user is first offered
        the company's stored rule templates (until one is applied or the
        prompt is answered with an empty line), then asked which sheet column
        corresponds to the first unmatched target. Every accepted choice is
        recorded in ``self.reversed_mapping`` and persisted through
        ``add_rules``. Afterwards each entry of ``self.necessary_unmatched``
        is prompted for and persisted through ``add_stats``.
        """
        asked_template = False
        while self.target_unmatched:  # handle the still-unmatched targets one at a time
            # use rule template
            if not asked_template:
                templates = mongo.show_datas('user_rule',
                                             {'company': self.company},
                                             'Mapping')
                print('现有的规则模版为:')
                rule_name_all = []
                for i in templates:
                    # hide bookkeeping fields before printing the template
                    del i['_id']
                    del i['company']
                    rule_name_all.append(i['rule_name'])
                    print(i)
                rule_name = input('使用规则模版:')
                if rule_name:
                    if rule_name not in rule_name_all:
                        print('无此模版。')
                        # asked_template stays False, so the prompt is repeated
                        continue
                    self.mapping(rule_name)
                    asked_template = True
                    # re-evaluate the loop condition against the refreshed list
                    continue
                asked_template = True

            # write new rule
            cur_tar = self.target_unmatched[0]
            print('Options: ')
            for i in range(0, len(self.option_list), 4):  # show four options per line
                print(self.option_list[i:i + 4])
            selected = input('与"{}"对应的是:'.format(cur_tar))
            if selected == '':
                # an empty answer means "no corresponding column"
                selected = 'none'

            if selected not in self.option_list:
                print('错误!不存在此选项')
                continue

            if cur_tar in self.target_unmatched:  # still unmatched
                self.target_unmatched.remove(cur_tar)
            self.reversed_mapping[cur_tar] = selected
            add_rules({cur_tar: selected}, self.company, self.rule_name)

        # prompt for every required item that is still missing
        while self.necessary_unmatched:
            cur_tar = self.necessary_unmatched[0]
            val = input('{} = '.format(cur_tar))
            add_stats({cur_tar: val}, self.company, self.title, self.table,
                      self.batch_id)
            self.necessary_unmatched.remove(cur_tar)
def output_excel(company, batch_id, file_output):
    """Write all cached sheets of a batch into one Excel worksheet.

    Args:
        company: Company whose ``mapped_df`` cache entries are exported.
        batch_id: Batch identifier of the entries.
        file_output: Target passed to ``pandas.ExcelWriter``.

    Raises:
        IndexError: If no cache entry matches (unchanged from the previous
            behaviour, assuming ``show_datas`` returns a list).
    """
    datas = mongo.show_datas('mapped_df', {
        'company': company,
        'batch_id': batch_id
    }, 'Cache')
    frames = [pd.read_json(entry['data']) for entry in datas]
    if len(frames) > 1:
        # One concat instead of re-copying the growing frame per sheet.
        final_df = pd.concat(frames, ignore_index=True)
    else:
        # A single sheet keeps its own index, as before.
        final_df = frames[0]
    print(final_df)
    # ``ExcelWriter.save()`` was removed in pandas 2.0. The context manager
    # writes and closes the file, including when ``to_excel`` raises.
    with pd.ExcelWriter(file_output) as writer:
        final_df.to_excel(writer, sheet_name='Sheet1')
    print('DataFrame is written successfully to the Excel File.')
 def inner_account_check(self):
     """List counterparty accounts of internal transfers that are not known own accounts.

     Every cached sheet in ``self.file_paths`` is scanned for rows whose
     counterparty name equals ``self.company``. The counterparty account of
     such a row is reported when it is not in ``self.self_accounts``.

     Returns:
         list: The unknown account numbers, one entry per offending row.
     """
     invalid_accounts = []
     for path in self.file_paths:
         cached = mongo.show_datas('mapped_df', {'file': path[0], 'table': path[1]}, 'Cache')
         frame = pd.read_json(cached[-1]['data'])
         for row_label in frame.index:
             # Only rows that transfer to the company itself are of interest.
             if not (frame.loc[row_label, '对方名称'] == self.company):
                 continue
             counterparty = frame.loc[row_label, '对方账号']
             if counterparty not in self.self_accounts:
                 invalid_accounts.append(counterparty)
     print('missing accounts:', invalid_accounts)
     return invalid_accounts
def get_dfs_by_company(company, batch_id):
    """Return all cached sheets of a batch as one frame with English column names.

    The frames are stacked with their original indexes, renamed through
    ``mydata.english_mapping`` and extended with ``year`` and ``month``
    columns holding the first 4 and first 6 characters of ``str(date)``.

    Args:
        company: Company whose ``mapped_df`` cache entries are loaded.
        batch_id: Batch identifier of the entries.

    Returns:
        pandas.DataFrame: The combined frame.

    Raises:
        IndexError: If no cache entry matches (unchanged from the previous
            behaviour, assuming ``show_datas`` returns a list).
    """
    datas = mongo.show_datas('mapped_df',
                             query={
                                 'company': company,
                                 'batch_id': batch_id
                             },
                             db='Cache')
    frames = [pd.read_json(entry['data']) for entry in datas]
    if len(frames) > 1:
        # ``DataFrame.append`` was removed in pandas 2.0. ``concat`` with its
        # defaults keeps the original indexes just as ``append`` did.
        df = pd.concat(frames)
    else:
        df = frames[0]
    df.rename(columns=mydata.english_mapping, inplace=True)
    date_text = df['date'].apply(str)
    df['year'] = date_text.str[:4]
    df['month'] = date_text.str[:6]
    return df
def add_rules(query, user):
    """Merge ``query`` into the stored mapping rules of ``user``.

    The user's ``user_rule`` document is loaded (or started fresh), updated
    with ``query``, and written back by deleting every document with that
    user name and inserting the merged one.

    Args:
        query: Mapping of rule entries to add or overwrite.
        user: Name identifying the owner of the rules.

    Returns:
        str: ``'success'``.
    """
    lookup = {'type': 'user_rule', 'name': user}
    stored = mongo.show_datas('user_rule', dict(lookup), 'mapping')
    if stored:
        user_rules = stored[0]
    else:
        # Bug fix: the fallback used type "user_rules" while the lookup filters
        # on 'user_rule'. A document created here was never found again, so
        # every later call started from scratch and dropped earlier rules.
        user_rules = dict(lookup)
        print('no user rules yet.')
    user_rules.update(query)
    # Replace the whole document: remove the old one, then insert the merge.
    mongo.delete_datas({'name': user}, 'user_rule', 'mapping')
    mongo.insert_data(user_rules, 'user_rule', 'mapping')
    return 'success'
def main_mg(company, batch_id):
    """Label every cached sheet of a batch and store the result back in the cache.

    The rule maps are loaded via ``get_rules(rulePath)``. Each ``mapped_df``
    entry of the batch is run through ``process_file`` and its ``data`` field
    is replaced by the labelled frame, serialised as column-oriented JSON.

    Args:
        company: Company whose cache entries are processed.
        batch_id: Batch identifier of the entries.
    """
    in_map, out_map = get_rules(rulePath)
    # Drop the placeholder keys independently. With the three ``del``
    # statements in one ``try``, the first missing key left the others in
    # place.
    in_map.pop('nan', None)
    out_map.pop('', None)
    out_map.pop('nan', None)

    datas = mg.show_datas('mapped_df', query={'company': company, 'batch_id': batch_id}, db='Cache')
    for data in datas:
        cur_df = pd.read_json(data['data'])
        labeled_df = process_file(cur_df, '', in_map, out_map, show_plot=False, write_excel=False)
        data['data'] = labeled_df.to_json(orient='columns', force_ascii=False)
        # Replace the cache entry: remove the old document, insert the new one.
        mg.delete_datas({'batch_id': batch_id, 'file': data['file'], 'table': data['table']}, 'mapped_df', 'Cache')
        mg.insert_data(data, 'mapped_df', 'Cache')
 def info_missing_check(self, file_path):
     """Count rows of one cached sheet that lack a summary or a counterparty name.

     A cell counts as missing when its value is not a ``str``.

     Args:
         file_path: ``[file, table]`` pair identifying the ``mapped_df`` entry;
             the last matching cache entry is used.

     Returns:
         list: ``[missing_summaries, missing_counterparty_names]``.
     """
     cached = mongo.show_datas('mapped_df', {'file': file_path[0], 'table': file_path[1]}, 'Cache')
     frame = pd.read_json(cached[-1]['data'])
     summaries = frame['摘要'].values
     counterparties = frame['对方名称'].values

     # Exact type test kept on purpose: it matches the original counting rule.
     missing_summaries = sum(1 for cell in summaries if type(cell) is not str)
     missing_counterparties = sum(1 for cell in counterparties if type(cell) is not str)

     print('缺失的对方名称有:', missing_counterparties)
     print('缺失的摘要有:', missing_summaries)
     return [missing_summaries, missing_counterparties]
def add_stats(query, path):
    """Merge ``query`` into the stored "necessary" info document of ``path``.

    Args:
        query: Mapping of fields to add or overwrite.
        path: Path value identifying the document.

    Returns:
        str: ``'success'``.
    """
    lookup = {'type': 'necessary', 'path': path}
    # Explicit "nothing stored yet" test instead of a bare ``except``, which
    # also hid unrelated errors raised by the lookup.
    stored = mongo.show_datas('necessary', dict(lookup), 'mapping')
    necc_info = stored[0] if stored else dict(lookup)
    necc_info.update(query)
    # Replace the document: delete the old one, then insert the merged one.
    mongo.delete_datas(dict(lookup), 'necessary', 'mapping')
    mongo.insert_data(necc_info, 'necessary', 'mapping')
    return 'success'
def add_stats(request, company, file, table, batch_id):
    """Merge ``request`` into the ``sheet_info`` record of one sheet.

    The record identified by company/file/table/batch is removed and
    re-inserted with ``request`` merged in. When no record exists, a new one
    containing the four key fields plus ``request`` is inserted.

    Args:
        request: Mapping of fields to add or overwrite.
        company: Company the sheet belongs to.
        file: File title of the sheet.
        table: Table (worksheet) name.
        batch_id: Batch identifier.

    Returns:
        str: ``'success update '`` followed by ``str(request)``.
    """
    query = {
        'company': company,
        'file': file,
        'table': table,
        'batch_id': batch_id
    }
    # Test for "no record yet" explicitly. The former bare ``except`` also
    # covered the delete, so a failed delete silently discarded the stored
    # fields and inserted a reduced document beside the old one.
    stored = mongo.show_datas('sheet_info', query, 'Info')
    if stored:
        necc_info = stored[0]
        mongo.delete_datas(query, 'sheet_info', 'Info')
    else:
        necc_info = dict(query)
    necc_info.update(request)
    mongo.insert_data(necc_info, 'sheet_info', 'Info')
    print(necc_info)
    return 'success update ' + str(request)
# Example #16
# 0
 def get_infos(self):
     """Load the stored form records of ``self.name`` into the instance lists.

     Each record contributes its path, date pair and account to
     ``self.file_paths``, ``self.dates``, ``self.self_accounts`` and
     ``self.path2account``. The company name is taken from the first record.
     Finally both entries of every date pair are converted to ``int`` in place.

     Returns:
         bool: ``False`` when no form record exists, otherwise ``True``.
     """
     forms = mongo.show_datas(self.name, {'type': 'form'}, 'mapping')
     if not forms:
         return False

     for form in forms:
         sheet_path = form['path']
         self.file_paths.append(sheet_path)
         self.dates.append(form['dates'])
         account = form['account']
         self.self_accounts.append(account)
         self.path2account[sheet_path] = account
     self.company_name = forms[0]['company_name']

     # Dates arrive as strings; store them as integers.
     for span in self.dates:
         span[0] = int(span[0])
         span[1] = int(span[1])

     print(self.file_paths)
     print(self.dates)
     print(self.self_accounts)
     return True
def upload_mysql(company, batch_id):
    """Append all cached sheets of a batch to the ``liushui`` MySQL table.

    The cached frames are combined, renamed through ``data.english_mapping``
    unless a ``type`` column is already present, stripped of their first
    column, tagged with ``batch_id`` and appended to the table.

    Args:
        company: Company whose ``mapped_df`` cache entries are uploaded.
        batch_id: Batch identifier; also written to the ``batch_id`` column.

    Raises:
        IndexError: If no cache entry matches (unchanged from the previous
            behaviour, assuming ``show_datas`` returns a list).
    """
    datas = mongo.show_datas('mapped_df', {
        'company': company,
        'batch_id': batch_id
    }, 'Cache')
    frames = [pd.read_json(entry['data']) for entry in datas]
    if len(frames) > 1:
        # One concat instead of re-copying the growing frame per sheet.
        final_df = pd.concat(frames, ignore_index=True)
    else:
        final_df = frames[0]
    # NOTE(review): credentials are hard-coded in the URL below (and look
    # mangled by the source this file was copied from) -- move them to
    # configuration.
    db = create_engine(
        'mysql+pymysql://bank_dev:[email protected]:3306/bank_dev'
    )
    if 'type' not in final_df.columns:
        final_df.rename(columns=data.english_mapping, inplace=True)
    # Copy the slice so adding a column neither touches ``final_df`` nor
    # triggers pandas' chained-assignment warning.
    df = final_df.iloc[:, 1:].copy()
    df['batch_id'] = batch_id
    print(df)
    df.to_sql('liushui', db, index=False, if_exists='append')
# Example #18
# 0
def add_stats(query, path):
    """Merge ``query`` into the stored "necessary" info document of ``path``.

    The document of type ``'necessary'`` for ``path`` is loaded, or started
    fresh with just those two fields. It is updated with ``query`` and written
    back by deleting the old document and inserting the merged one.

    Args:
        query: Mapping of fields to add or overwrite.
        path: Path value identifying the document.

    Returns:
        str: ``'success'``.
    """
    lookup = {'type': 'necessary', 'path': path}
    # An empty result means "nothing stored yet". The earlier bare ``except``
    # treated every error, including unrelated ones, as that case.
    stored = mongo.show_datas('necessary', dict(lookup), 'mapping')
    if stored:
        necc_info = stored[0]
    else:
        necc_info = dict(lookup)
    necc_info.update(query)
    # Always remove the previous document before inserting the merged one.
    mongo.delete_datas(dict(lookup), 'necessary', 'mapping')
    mongo.insert_data(necc_info, 'necessary', 'mapping')
    return 'success'
 def balance_check(self, error_tolerance, file_path):
     """Report the dates of rows whose running balance does not add up.

     For each row after the first, the previous balance plus the inflow (or
     minus the outflow when the inflow is 0) is compared with the row's own
     balance. Rows deviating by more than ``error_tolerance`` are invalid.

     Args:
         error_tolerance: Largest accepted absolute deviation.
         file_path: ``[file, table]`` pair identifying the ``mapped_df`` entry;
             the last matching cache entry is used.

     Returns:
         list: Values of the transaction-date column for the invalid rows.
         The list is empty when the amount columns cannot be converted to
         ``int``, because the check is skipped in that case.

     Raises:
         SystemExit: If comparing a row fails (diagnostics are printed first).
     """
     datas = mongo.show_datas('mapped_df', {'file': file_path[0], 'table': file_path[1]}, 'Cache')
     cur_df = pd.read_json(datas[-1]['data'])
     # Assign the filled columns back. ``df[col].fillna(..., inplace=True)``
     # works on a temporary and does not update the frame under pandas'
     # copy-on-write behaviour.
     cur_df['流入金额'] = cur_df['流入金额'].fillna(0)
     cur_df['流出金额'] = cur_df['流出金额'].fillna(0)
     skip = False
     try:
         # NOTE(review): int conversion truncates fractional amounts -- confirm
         # amounts are whole numbers or that the tolerance accounts for this.
         cur_df['流入金额'] = cur_df['流入金额'].astype(int)
         cur_df['流出金额'] = cur_df['流出金额'].astype(int)
     except Exception as e:
         print(e)
         print('failed to convert datatype to in for income and out money')
         skip = True
     income = cur_df['流入金额'].values
     out = cur_df['流出金额'].values
     balance = cur_df['交易后余额'].values

     invalid = []  # positional row numbers of inconsistent rows
     if not skip:
         # Both amount columns are plain ints here, so testing against 0 is
         # enough to tell whether the row carries an amount.
         for i in range(1, len(income)):
             try:
                 if income[i] != 0:
                     expected = balance[i - 1] + income[i]
                 elif out[i] != 0:
                     # Bug fix: this used ``abs(prev - out != balance)``, i.e.
                     # abs() of a boolean, so the tolerance was never applied
                     # to outflow rows.
                     expected = balance[i - 1] - out[i]
                 else:
                     continue
                 if abs(expected - balance[i]) > error_tolerance:
                     invalid.append(i)
             except Exception as e:
                 print(income, i)
                 print(e)
                 print(type(income[i]))
                 print(income[i])
                 raise SystemExit(1)

     # ``invalid`` holds positions, so select by position rather than by label.
     invalid_dates = cur_df['交易日期'].iloc[invalid].values.tolist()
     total = len(income)
     # An empty sheet previously raised ZeroDivisionError here.
     print('ratio of invalid balance: ', len(invalid_dates) / total if total else 0.0)
     return invalid_dates
 def save_df(self):
     """Store ``self.generated_df`` in the ``mapped_df`` cache, replacing any old entry.

     The frame is serialised as column-oriented JSON and stored with the
     sheet's company, file title, table name and batch id. The batch id is
     printed only when no previous entry existed.
     """
     key = {
         'company': self.company,
         'file': self.title,
         'table': self.table,
         'batch_id': self.batch_id
     }
     document = dict(key,
                     data=self.generated_df.to_json(orient='columns',
                                                    force_ascii=False))

     had_entry = bool(mongo.show_datas('mapped_df', key, 'Cache'))
     if had_entry:
         # Replace by delete-and-insert.
         mongo.delete_datas(key, 'mapped_df', 'Cache')
     mongo.insert_data(document, 'mapped_df', 'Cache')
     if not had_entry:
         print('batch_id is ', self.batch_id)
    def mapping(self, rule_name):
        """Match sheet columns to target headers using base rules and a rule template.

        Starts from all of ``self.target_headers``, removes every target that
        a sheet column maps to through ``self.base_rules``, removes the
        own-name and own-account targets when those values are already known,
        then merges the stored template ``rule_name`` into
        ``self.reversed_mapping`` and drops the targets it covers.

        Args:
            rule_name: Name of the company's rule template to apply.

        Returns:
            list: ``[target_unmatched, option_list, reversed_mapping]``.
        """
        # An empty result means "no template yet". The former bare ``except``
        # hid every other failure of the lookup as well.
        stored = mongo.show_datas('user_rule', {
            'company': self.company,
            'rule_name': rule_name
        }, 'Mapping')
        if stored:
            self.user_rules = stored[0]
        else:
            self.user_rules = {'company': self.company, 'rule_name': rule_name}

        self.target_unmatched = self.target_headers.copy()

        def _mark_matched(target):
            """Drop ``target`` from the unmatched list if it is still there."""
            # ``list.remove`` raised ValueError when two columns mapped to the
            # same target or the target was already gone.
            if target in self.target_unmatched:
                self.target_unmatched.remove(target)

        # Fill the mapping from the base rules. 'none' is the "no column"
        # option; add it once so repeated calls do not pile up duplicates.
        if 'none' not in self.option_list:
            self.option_list.append('none')
        for key in self.option_list:
            if key in self.base_rules:  # a base rule exists for this column
                val = self.base_rules[key]
                self.reversed_mapping[val] = key
                _mark_matched(val)
        # Values already supplied with the input sheet need no column.
        if self.self_name:
            _mark_matched('本方名称')
        if self.self_account:
            _mark_matched('本方账号')

        # Merge the user's template on top of the base-rule mapping.
        # NOTE(review): this also copies the template's bookkeeping keys
        # (company, rule_name, _id if present) into reversed_mapping.
        self.reversed_mapping.update(self.user_rules)
        # Build a new list instead of removing while iterating.
        self.target_unmatched = [
            target for target in self.target_unmatched
            if target not in self.reversed_mapping
        ]
        print(self.target_unmatched, self.option_list, self.reversed_mapping)
        return [self.target_unmatched, self.option_list, self.reversed_mapping]
    def mapping(self):
        """Match sheet columns to target headers and reconcile the required info.

        Loads the base rules, the rule summary, the user's rules and the
        stored "necessary" info for ``self.output_path``. Targets covered by a
        base rule, by known own name/account, or by a user rule are removed
        from ``self.target_unmatched``. Stored necessary values are copied
        onto the instance, remaining instance values are copied into the
        stored info, and the info document is rewritten.

        Returns:
            list: ``[target_unmatched, option_list, necessary_unmatched,
            necessary_info]``.
        """
        # Base rule and rule summary come from MongoDB.
        self.base_rules_summary = mongo.show_datas('base_rule',
                                                   {'type': 'rule_summary'},
                                                   'mapping')[0]
        self.base_rules = mongo.show_datas('base_rule', {'type': 'base_rule'},
                                           'mapping')[0]

        # Explicit "nothing stored" checks replace two bare ``except`` clauses.
        stored_rules = mongo.show_datas('user_rule', {
            'type': 'user_rule',
            'name': self.user_name
        }, 'mapping')
        if stored_rules:
            self.user_rules = stored_rules[0]
        else:
            self.user_rules["type"] = "user_rule"
            self.user_rules['name'] = self.user_name

        necessary_query = {'type': 'necessary', 'path': self.output_path}
        stored_info = mongo.show_datas('necessary', dict(necessary_query),
                                       'mapping')
        self.necessary_info = stored_info[0] if stored_info else dict(
            necessary_query)

        # Copies, so the shared header/item lists are not modified.
        self.target_unmatched = self.base_rules_summary['target_headers'].copy()
        self.necessary_unmatched = self.necessary_items.copy()

        def _mark_matched(target):
            """Drop ``target`` from the unmatched targets if still present."""
            if target in self.target_unmatched:
                self.target_unmatched.remove(target)

        self.option_list.append('none')
        for key in self.option_list:
            if key in self.base_rules:  # a base rule exists for this column
                val = self.base_rules[key]
                self.reversed_mapping[val] = key
                _mark_matched(val)
        # Values already supplied with the input sheet need no column.
        if self.self_name:
            _mark_matched('本方名称')
        if self.self_account:
            _mark_matched('本方账号')

        # Step 1: stored values update the instance. ``setattr`` replaces
        # ``exec`` on a formatted string, which broke on (and could be abused
        # through) quotes or backslashes in stored values.
        for key, val in self.necessary_info.items():
            if key not in ('type', 'path', '_id') and val:
                setattr(self, key, '{}'.format(val))
                if key in self.necessary_unmatched:
                    self.necessary_unmatched.remove(key)

        # Step 2: remaining instance values update the stored info.
        for item in self.necessary_unmatched:
            self.necessary_info[item] = getattr(self, item)

        # Step 3: whatever now has a value is no longer unmatched.
        for item, val in self.necessary_info.items():
            if item in self.necessary_unmatched and val:
                self.necessary_unmatched.remove(item)
        mongo.delete_datas(dict(necessary_query), 'necessary', 'mapping')
        mongo.insert_data(self.necessary_info, 'necessary', 'mapping')

        # Merge the user rules into the reverse mapping, then keep only the
        # targets they do not cover (new list: no removal while iterating).
        self.reversed_mapping.update(self.user_rules)
        self.target_unmatched = [
            target for target in self.target_unmatched
            if target not in self.reversed_mapping
        ]
        return [
            self.target_unmatched, self.option_list, self.necessary_unmatched,
            self.necessary_info
        ]
# Example #23
# 0
    def mapping(self):
        """Match sheet columns to target headers and reconcile the required info.

        Loads the base rules, the rule summary, the user's rules and the
        stored "necessary" info for ``self.output_path``. Targets covered by a
        base rule, by known own name/account, or by a user rule are removed
        from ``self.target_unmatched``. Stored necessary values are copied
        onto the instance, remaining instance values are copied into the
        stored info, and the info document is rewritten.

        Returns:
            list: ``[target_unmatched, option_list, necessary_unmatched]``.
        """
        # Base rule and rule summary come from MongoDB.
        self.base_rules_summary = mongo.show_datas('base_rule',
                                                   {'type': 'rule_summary'},
                                                   'mapping')[0]
        self.base_rules = mongo.show_datas('base_rule', {'type': 'base_rule'},
                                           'mapping')[0]

        # An empty lookup result means "nothing stored yet". This replaces two
        # bare ``except`` clauses that swallowed every kind of error.
        found_rules = mongo.show_datas('user_rule', {
            'type': 'user_rule',
            'name': self.user_name
        }, 'mapping')
        if found_rules:
            self.user_rules = found_rules[0]
        else:
            self.user_rules["type"] = "user_rule"
            self.user_rules['name'] = self.user_name

        info_key = {'type': 'necessary', 'path': self.output_path}
        found_info = mongo.show_datas('necessary', dict(info_key), 'mapping')
        if found_info:
            self.necessary_info = found_info[0]
        else:
            self.necessary_info = dict(info_key)

        # Copies, so the shared header/item lists are not modified.
        self.target_unmatched = self.base_rules_summary['target_headers'].copy()
        self.necessary_unmatched = self.necessary_items.copy()

        def _discard_target(target):
            """Remove ``target`` from the unmatched targets when present."""
            # Plain ``list.remove`` raised ValueError for repeated targets.
            if target in self.target_unmatched:
                self.target_unmatched.remove(target)

        # 'none' serves as the "no corresponding column" option.
        self.option_list.append('none')
        for key in self.option_list:
            if key in self.base_rules:  # a base rule exists for this column
                val = self.base_rules[key]
                self.reversed_mapping[val] = key
                _discard_target(val)
        # Values already supplied with the input sheet need no column.
        if self.self_name:
            _discard_target('本方名称')
        if self.self_account:
            _discard_target('本方账号')

        # Three steps: stored info updates the instance, the instance updates
        # the stored info, then the still-empty items are determined.
        # ``setattr``/``getattr`` replace ``exec`` on formatted strings, which
        # failed on quotes or backslashes in stored values and executed
        # database content as code.
        for key, val in self.necessary_info.items():
            if key not in ('type', 'path', '_id') and val:
                setattr(self, key, '{}'.format(val))
                if key in self.necessary_unmatched:
                    self.necessary_unmatched.remove(key)

        for item in self.necessary_unmatched:
            self.necessary_info[item] = getattr(self, item)

        for item, val in self.necessary_info.items():
            if item in self.necessary_unmatched and val:
                self.necessary_unmatched.remove(item)
        print(self.necessary_info, self.necessary_unmatched)
        mongo.delete_datas(dict(info_key), 'necessary', 'mapping')
        mongo.insert_data(self.necessary_info, 'necessary', 'mapping')

        # Merge the user rules into the reverse mapping, then keep only the
        # targets they do not cover (new list: no removal while iterating).
        self.reversed_mapping.update(self.user_rules)
        self.target_unmatched = [
            target for target in self.target_unmatched
            if target not in self.reversed_mapping
        ]
        return [
            self.target_unmatched, self.option_list, self.necessary_unmatched
        ]
    def info_extractor(self):
        """Locate the header row of the raw sheet and extract the sheet's metadata.

        The raw frame is scanned for a cell (or column label) listed in
        ``data.keywords_dict['header_key']``. Rows above the header are
        searched for the other keywords, and the cell to the right of a hit is
        stored on the instance under the keyword's group name. The sheet is
        then re-read with the detected header row, ``Unnamed`` columns are
        dropped, and own name, bank, account number and date range are derived
        from the file title, table name and first column label where not
        already set. The cleaned frame is stored in the ``unmapped_df`` cache.

        Returns:
            bool: ``True`` on success, ``False`` when no header row was found.
        """
        import calendar  # local import: only needed for month-end dates here

        def _month_end(year, month):
            """Return the last day of the month as two digits ('30' if not a month)."""
            try:
                return '{:02d}'.format(
                    calendar.monthrange(int(year), int(month))[1])
            except ValueError:
                # The digits are not a month 1-12: keep the historical value.
                return '30'

        keywords_dict = data.keywords_dict
        header_keys = keywords_dict['header_key']
        row_num_found = False
        row_num = 0
        # The column labels may already be the header row (loop-invariant).
        header_in_columns = any(
            col in header_keys for col in self.raw_df.columns.tolist())

        for index in self.raw_df.index:  # scan row by row for keywords
            if header_in_columns:
                row_num = 0
                row_num_found = True
                break

            row_values = self.raw_df.loc[index].values
            for i in range(self.raw_df.shape[1]):
                cell = row_values[i]
                if cell in header_keys:  # header row located
                    row_num = index + 1
                    row_num_found = True
                    break

                for key in keywords_dict:  # metadata listed above the header
                    if cell in keywords_dict[key]:
                        print(cell)
                        # The value sits one cell to the right. ``setattr``
                        # replaces ``exec``, and the bounds check avoids an
                        # IndexError for a keyword in the last column.
                        if i + 1 < len(row_values):
                            setattr(self, key, row_values[i + 1])
                        break
            if row_num_found:
                break

        if not row_num_found:
            print('titles not found!')
            return False

        # Re-read the sheet with the detected header row (same table!).
        self.target_df = pd.read_excel(
            self.file_path, sheet_name=self.table, header=row_num)
        # Column labels can be non-strings (numbers, dates); only string
        # labels can be pandas' "Unnamed: N" placeholders.
        unnamed = [
            col for col in self.target_df.columns
            if isinstance(col, str) and re.search(r'Unnamed.*', col)
        ]
        self.target_df = self.target_df.drop(columns=unnamed)
        self.option_list = self.target_df.columns.tolist()  # header list
        self.transaction_num = self.target_df.shape[0]
        print(self.target_df)

        # Own name from the file title (the last matching name wins).
        self.name_mapping = data.name_mapping
        if not self.self_name:
            for name in self.name_mapping:
                if name in self.title:
                    self.self_name = name

        # Bank from the table name.
        if '银行' in self.table:
            self.self_bank = self.table

        # Account number from the first column label; str() guards against
        # non-string labels, on which ``re.findall`` raised TypeError.
        cell = self.raw_df.columns[0]
        match = re.findall(r'(\d{16,19})', str(cell))
        if match:
            print('Found self account number: ', match[0])
            self.self_account = str(match[0])

        # Date range from the file title. End dates use the real last day of
        # the month; the fixed '30' produced dates such as Feb 30 and cut
        # 31-day months short.
        if not self.start_date or not self.end_date:
            res = re.findall(r'(20[12]\d)(\d*)-?(\d*)', self.title)
            if res:
                year, first, second = res[0]
                if not first and not second:  # only a year was found
                    self.start_date = year + '0101'
                    self.end_date = year + '1231'
                elif not second:  # year + month
                    self.start_date = year + first + '01'
                    self.end_date = year + first + _month_end(year, first)
                elif len(first) == 2:
                    if len(second) == 6:  # YYYYMM-YYYYMM
                        self.start_date = year + first + '01'
                        self.end_date = second + _month_end(
                            second[:4], second[4:])
                    elif len(second) == 2:  # YYYYMM-MM
                        self.start_date = year + first + '01'
                        self.end_date = year + second + _month_end(
                            year, second)
                    elif len(second) == 1:  # YYYYMM-M
                        self.start_date = year + first + '01'
                        self.end_date = year + '0' + second + _month_end(
                            year, second)

        # Store the cleaned frame as JSON. Bug fix: the lookup used the key
        # 'batchid', which the stored documents never contained, so an old
        # entry was not found and duplicates accumulated. The batch id is now
        # stored and queried under 'batch_id', as in the mapped_df cache.
        query = {
            'company': self.company,
            'file': self.title,
            'table': self.table,
            'batch_id': self.batch_id
        }
        df_data = dict(query,
                       data=self.target_df.to_json(orient='columns',
                                                   force_ascii=False))
        if mongo.show_datas('unmapped_df', query, 'Cache'):
            mongo.delete_datas(query, 'unmapped_df', 'Cache')
        mongo.insert_data(df_data, 'unmapped_df', 'Cache')
        return True