def read_config_table(file_path, dtype=str):
    df = pd.DataFrame([])
    df_workbook = None
    # Read the data file; only the first visible sheet is parsed
    if file_path.endswith('.xlsx'):
        df_workbook = pd.ExcelFile(file_path)
        sheet_name_list = df_workbook.book.sheetnames
        for sheet in sheet_name_list:
            if df_workbook.book.get_sheet_by_name(sheet).sheet_state == 'visible':
                df = df_workbook.parse(sheet, dtype=dtype)
                break
    else:
        try:
            df = pd.read_excel(file_path)
        except Exception:
            # read_html returns a list of DataFrames; take the first table
            df = pd.read_html(file_path, header=0)[0]
    if df.empty:
        enter_exit(f'Cannot read any visible table in "{file_path}"')
    if df_workbook is not None:
        df_workbook.close()
    return df
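# A minimal usage sketch for read_config_table; "demo_config.xlsx" is a
# hypothetical file name used only for illustration.
def _demo_read_config_table():
    config_df = read_config_table('demo_config.xlsx')
    print(config_df.head())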
def run_vba(original_path, new_path, macro, macro_sub_name):
    # Check that the source file exists and the output folder can be created
    if not os.path.isfile(original_path):
        enter_exit(f"File not found: {original_path}")
    check_create_new_folder(new_path)
    xlapp = None
    try:
        # Open the workbook and run the VBA. If dispatch errors occur, search
        # for the cached gen_py files, delete them, and let win32com rebuild them
        xlapp = client.gencache.EnsureDispatch('Excel.Application')
        # xlapp = client.dynamic.Dispatch('Excel.Application')
        # xlapp = client.DispatchEx('Excel.Application')
        xlapp.Visible = 0
        xlapp.DisplayAlerts = False
        xlwb = xlapp.Workbooks.Open(original_path, ReadOnly=1)
        xlwb.VBProject.VBComponents.Add(1)
        for i in xlwb.VBProject.VBComponents:
            # '模块1' is the default module name Excel creates on a Chinese locale
            if i.name == '模块1':
                module = xlwb.VBProject.VBComponents.Item(i.name).CodeModule
                module.AddFromString(macro)
        xlwb.Application.Run(macro_sub_name)
        xlwb.SaveAs(new_path, FileFormat=51, ConflictResolution=2)
        xlwb.Close(True)
        print("{} Draw completed".format(new_path.split('\\')[-1]))
    except Exception:
        logging.error(traceback.format_exc())
    finally:
        if xlapp is not None:
            xlapp.Quit()
        del xlapp
def get_sql_data(engine_text, table_name, sql, save_path, how='normal'):
    """Run the SQL against the target database and save the result to a file."""
    db = create_engine(engine_text, poolclass=NullPool)
    conn = db.connect()
    if how == 'normal':
        # execute returns a ResultProxy; fetchall returns an empty list for no rows
        fetchall_result = execute_fetchall(conn, sql)
        result_df = convert_fetchall2df(fetchall_result)
        result_df.to_excel(save_path, index=False)
    elif how == 'mysql_dump':
        # Dump to a temporary location first. mysqldump cannot handle non-ASCII
        # paths, so write to a top-level path such as C:// to be safe
        temp_path = get_most_upper_level_path('df_temp_file.csv')
        # Fetch the header once; the SQL must not end with a semicolon
        if ';' == sql.strip()[-1]:
            sql = sql.strip()[:-1]
        try:
            first_row = conn.execute(sql)
            header = list(first_row.keys())
        except Exception:
            logging.error(traceback.format_exc())
            print(sql, 'data extraction failed')
            conn.close()
            db.dispose()
            return None
        dump_sql = r"""
        {}
        INTO OUTFILE '{}'
        CHARACTER SET 'utf8mb4'
        FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '\\'
        LINES TERMINATED BY '\r\n';
        """.format(sql, temp_path)
        if os.path.exists(temp_path):
            os.remove(temp_path)
        dump_sql = text(dump_sql)
        conn.execute(dump_sql)
        # Copy the dump to the target folder, converting to Excel if required
        if save_path.split('.')[-1] == 'xlsx':
            # The OUTFILE dump has no header row, so supply the column names
            result_df = pd.read_csv(temp_path, header=None, names=header)
            result_df.to_excel(save_path, index=False)
            os.remove(temp_path)
        else:
            # If the target is not XLSX, copy the CSV result over as-is
            copyfile(temp_path, save_path)
        print(save_path, 'data saved')
    else:
        enter_exit(how, 'is not a recognized export mode')
    conn.close()
    db.dispose()
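# A minimal usage sketch for get_sql_data; the connection string, table name
# and SQL below are placeholders, not real credentials or schema.
def _demo_get_sql_data():
    engine_text = 'mysql://user:password@localhost:3306/web_data?charset=utf8'
    get_sql_data(engine_text, 'ecommerce_data',
                 'SELECT * FROM ecommerce_data LIMIT 100',
                 'sql_result.xlsx', how='normal')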
def group_basic_agg(df, group_column, agg_func, value_column=None,
                    keyword_list=[], stopword_list=[], group_index=False):
    """
    Compute basic per-group statistics.
    :param df : input df
    :param group_column : groupby column, list
    :param agg_func : same as df agg func
    :return : dataframe containing the agg result
    """
    df_copy = df.copy()
    if type(group_column) != list:
        group_column = [group_column]
    # If no value column is given, only a simple count can be produced
    if value_column is None:
        agg_func_column = [agg_func]
        df_copy[agg_func_column] = True
    else:
        agg_func_column = [value_column]
    find_lack_columns(df_copy, agg_func_column + group_column, 'Group statistic')
    df_copy = df_copy.loc[:, agg_func_column + group_column]
    df_copy = df_copy.fillna(0)
    if 'word' in agg_func:
        # agg_func_column is a list; word counting needs its first element
        df_copy_agg = word_agg_func(df_copy, group_column, agg_func,
                                    agg_func_column[0], keyword_list, stopword_list)
    else:
        try:
            df_copy_agg = df_copy.groupby(group_column).agg(agg_func)\
                .sort_values(by=agg_func_column, ascending=False).fillna(0)
        except AttributeError:
            enter_exit(f'"{agg_func}" function not found')
        except Exception:
            enter_exit(f'Failed to execute calc function: "{agg_func}" in "{agg_func_column[0]}"')
    if not group_index:
        df_copy_agg = df_copy_agg.reset_index()
    return df_copy_agg
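# A minimal sketch of group_basic_agg on an inline frame. Without a
# value_column the function falls back to a plain count per group; with one
# it applies the given agg function to that column.
def _demo_group_basic_agg():
    demo_df = pd.DataFrame({'brand': ['OPPO', 'OPPO', 'vivo'],
                            'price': [999, 1299, 1099]})
    print(group_basic_agg(demo_df, 'brand', 'count'))
    print(group_basic_agg(demo_df, 'brand', 'sum', value_column='price'))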
def get_sql_connection(engine_text):
    try:
        print(f'Creating connection to {engine_text}')
        db = create_engine(engine_text, poolclass=NullPool)
        conn = db.connect()
    except Exception:
        logging.error(traceback.format_exc())
        enter_exit(f"Failed to connect database with config:\n {engine_text}")
    return conn, db
def df_query(df, condition):
    if type(condition) == str and condition.strip() != "":
        try:
            df = df.query(condition, engine='python')
        except Exception:
            write_format_columns('Filter condition-Result when error.xlsx', df, 'content')
            enter_exit(f'Unable to compile the following filter condition:\n"{condition}"')
    return df
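# A minimal sketch of df_query; engine='python' is what allows string methods
# such as .str.contains inside the condition string.
def _demo_df_query():
    demo_df = pd.DataFrame({'model': ['A5', 'A9', 'X7'], 'qty': [3, 5, 2]})
    print(df_query(demo_df, 'qty > 2 and model.str.contains("A")'))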
def find_lack_columns(df, require_columns, error_func=''):
    lack_column_list = []
    for r in require_columns:
        if r not in df.columns:
            lack_column_list.append(r)
    if lack_column_list:
        error_msg = ','.join(lack_column_list)
        enter_exit(f"{error_func} Error - Missing columns:{error_msg}")
    return lack_column_list
def get_sqlite_connection(db_path):
    if db_path.strip() == '':
        db_path = ":memory:"
    try:
        print('Creating connection to {}'.format(db_path.strip(':')))
        conn = sqlite3.connect(db_path)
        # Raise the busy timeout so a second query does not fail with
        # "table is locked" while data is still being written
        conn.execute("PRAGMA busy_timeout=1800000")
        conn.commit()
    except Exception:
        logging.error(traceback.format_exc())
        enter_exit(f"Failed to connect database with config:\n {db_path}")
    return conn
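# A minimal sketch of get_sqlite_connection; an empty path falls back to an
# in-memory database, so nothing touches the disk here.
def _demo_get_sqlite_connection():
    conn = get_sqlite_connection('')
    conn.execute('CREATE TABLE demo (id INTEGER PRIMARY KEY, name TEXT)')
    conn.execute("INSERT INTO demo (name) VALUES ('test')")
    print(conn.execute('SELECT * FROM demo').fetchall())
    conn.close()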
def normalize_dates(df_worksheet, date_columns, table_path='default', sheet='default'):
    """
    Check that dates are not empty and can be converted to pandas datetime.
    """
    if type(date_columns) != list:
        date_columns = [date_columns]
    for date_column in date_columns:
        # Convert the post-time column; the dates may contain errors
        check_date_error = 0
        check_date_empty = df_worksheet.loc[df_worksheet[date_column].isna() == True, :]
        # Record the rows with empty dates; blanks are not allowed
        if not check_date_empty.empty:
            empty_date_row = [str(x + 2) for x in check_date_empty.index]
            empty_date_row = ','.join(empty_date_row)
            enter_exit('Date must not be empty: File:{}, Sheet:{}, Row:{}. '
                       'Please add a valid date then try again!'
                       .format(table_path, sheet, empty_date_row))
        try:
            # If nothing is wrong, convert straight to datetime
            df_worksheet[date_column] = pd.to_datetime(df_worksheet[date_column])
        except Exception:
            # The post time may have been read as an int/float Excel serial
            # (Excel shows e.g. "Jan 2" in the cell, with the real date inside)
            df_worksheet[date_column] = df_worksheet[date_column].apply(
                lambda x: datetime.datetime.strftime(xldate_as_datetime(x, 0), '%Y-%m-%d')
                if (type(x) == int or type(x) == float) else x)
            try:
                df_worksheet[date_column] = pd.to_datetime(df_worksheet[date_column])
                check_date_error += 1
            except (ValueError, OutOfBoundsDatetime):
                for i, t in zip(df_worksheet.index, df_worksheet[date_column]):
                    try:
                        pd.to_datetime(t)
                    except ValueError:
                        print('Invalid date format: "{}"'.format(t),
                              'row {} of the file'.format(i + 2))
            # The post-time column itself may be missing
            except KeyError:
                print('Cannot map the "{}" column; check the table: "{}", "{}"'
                      .format(date_column, table_path, sheet))
                input('Press Enter to exit')
            if check_date_error < 1:
                print('Invalid date format: "{}", "{}". Fix the dates above and rerun'
                      .format(table_path, sheet))
                input('Press Enter to exit')
                sys.exit()
    return df_worksheet
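# A minimal sketch of normalize_dates on an inline frame; well-formed date
# strings convert to datetime64 on the first attempt, while empty or
# malformed cells trigger the row-by-row error reporting above.
def _demo_normalize_dates():
    demo_df = pd.DataFrame({'time': ['2020-01-01', '2020-01-02']})
    print(normalize_dates(demo_df, 'time').dtypes)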
def stack_columns_to_multi_row(df, target_stack_name=None, regex_format=None):
    """Fold multiple fault columns, keyed by _id, into a single column.
    By default the trailing columns whose headers carry a 0,1,2,3... suffix
    are the fault-phenomenon columns that should be stacked.
    :param target_stack_name: name of the resulting column
    :param regex_format: optional extra pattern a column must also match
    :return: result df
    """
    # Find the columns that carry a numeric suffix
    stack_column_pat = '([^0-9]+)[0-9]{1,2}'
    df_columns = df.columns
    if regex_format is None:
        stack_columns = [x for x in df_columns
                         if re.match(stack_column_pat, x) is not None]
    else:
        stack_columns = [x for x in df_columns
                         if re.match(stack_column_pat, x) is not None
                         and re.match(regex_format, x) is not None]
    if target_stack_name is None and stack_columns:
        target_stack_name = re.match(stack_column_pat, stack_columns[0]).group(1)
    # Keep only the last run of consecutively numbered columns
    stack_columns = get_list_partial_sorted(stack_columns)
    if len(stack_columns) < 2:
        enter_exit('No multi-fault columns with numeric suffixes were found '
                   'in the e-commerce crawler data')
    else:
        # Columns found: join each row's fault values into one sorted string
        stack_line_list = []
        for i, row in df.loc[:, stack_columns].iterrows():
            stack_line = ','.join(sorted(
                [x for x in row.values if type(x) == str and x != 'nan']))
            stack_line_list.append(stack_line)
        df[target_stack_name] = stack_line_list
        # Put the stacked column last
        df_columns = [x for x in df_columns if x not in stack_columns] + [target_stack_name]
        df = df.loc[:, df_columns]
    return df
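# A minimal sketch of stack_columns_to_multi_row, assuming the helper
# get_list_partial_sorted keeps both suffixed columns: fault1/fault2 are
# folded into a single comma-joined "fault" column.
def _demo_stack_columns_to_multi_row():
    demo_df = pd.DataFrame({'_id': [1, 2],
                            'fault1': ['screen', 'battery'],
                            'fault2': ['camera', None]})
    print(stack_columns_to_multi_row(demo_df, target_stack_name='fault'))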
def get_enddate(txt_path):
    try:
        with open(txt_path, 'r', encoding='utf-8') as file:
            enddate_str = file.read()
            # Guard against hand-written dates in a bad format
            try:
                enddate = pd.to_datetime(enddate_str)
                enddate_str = datetime.datetime.strftime(enddate, '%Y%m%d')
            except Exception:
                enter_exit('The date is invalid')
    except Exception:
        enter_exit(f'Cannot read the {txt_path} file; make sure it is in the input data folder')
    return enddate_str
def word_agg_func(df, group_column, agg_func, agg_func_column, keyword_list, stopword_list):
    # For keyword statistics, clear the stopword list
    if 'keyword' in agg_func:
        stopword_list = []
        # An empty keyword list is not allowed
        if len(keyword_list) == 0:
            enter_exit('Can not find keywords with empty keyword list, '
                       'please check the keyword files!')
    # Count each word at most once per record; incompatible with sum logic
    if 'unique' in agg_func and 'count' in agg_func:
        df[agg_func_column] = df[agg_func_column].swifter.progress_bar(
            enable=True, desc=agg_func).apply(
                lambda x: ' '.join(set(process_text_eng(x, keyword_list, stopword_list)))
                if type(x) == str else x)
    if agg_func.startswith('keyword_count_en'):
        df_agg = group_word_count(df, group_column, agg_func, agg_func_column,
                                  keyword_list=keyword_list, stopword_list=[],
                                  count_keywords_only=True)
    elif agg_func.startswith('word_count_en'):
        df_agg = group_word_count(df, group_column, agg_func, agg_func_column,
                                  keyword_list, stopword_list,
                                  count_keywords_only=False)
    elif agg_func.startswith('keyword_sum'):
        # Same computation as word_sum: sum the numbers inside the dicts
        df_agg = group_dict_sum(df, group_column, agg_func_column)
    elif agg_func.startswith('word_sum'):
        df_agg = group_dict_sum(df, group_column, agg_func_column)
    else:
        # Chinese word segmentation is not implemented yet
        enter_exit(f'{agg_func} not supported yet.')
    return df_agg
def normalize_dates_single(date):
    # Simplified version of normalize_dates for a single value
    try:
        date = pd.to_datetime(date)
    except Exception:
        if type(date) == int or type(date) == float:
            # Excel serial number: convert through xlrd first
            date = datetime.datetime.strftime(xldate_as_datetime(date, 0), '%Y-%m-%d')
            try:
                date = pd.to_datetime(date)
            except Exception:
                enter_exit(f'Error when converting "{date}" to datetime')
        else:
            enter_exit(f'Error when converting "{date}" to datetime')
    return date
def highlight_kw(path, keyword_dict, keyword_format_dict):
    content_path = choose_file(path)
    try:
        xlrd_wb, xlrd_ws, highlight_column = choose_sheet_column(content_path)
    except Exception:
        enter_exit(f'Unknown Error raised when reading {content_path}')
    new_path = re.match(r'(.*)\.(xlsx|xls|xlsm)$', content_path).group(1) + '_Macro.xlsx'
    xlsxwriter_wb = Workbook(new_path)
    xlsxwriter_wb = highlight(xlsxwriter_wb, xlrd_ws, highlight_column,
                              keyword_dict, keyword_format_dict)
    # The xlrd workbook is passed in only so it can be released here
    xlrd_wb.release_resources()
    save_xlsxwriter_wb(xlsxwriter_wb, new_path)
def highlight_kw(path, keyword_dict, keyword_format_dict):
    content_path = choose_file(path)
    # By default read the first two columns of the first sheet
    xlrd_wb = xlrd.open_workbook(content_path)
    xlrd_ws = xlrd_wb.sheet_by_index(0)
    if xlrd_ws.ncols < 2:
        enter_exit("Error: Input file has fewer than 2 columns (Content, Keyword_type)!")
    highlight_column = xlrd_ws.cell(0, 0).value
    highlight_column_list = xlrd_ws.col_values(0)
    keyword_type_list = []
    for row_index in range(xlrd_ws.nrows):
        row_values = xlrd_ws.row_values(row_index)
        value = row_values[1]
        if value is not None and str(value).strip() != '':
            keyword_type_list.append(value)
        else:
            keyword_type_list.append(None)
    new_path = re.match(r'(.*)\.(xlsx|xls|xlsm)$', content_path).group(1) + '_HL_Type.xlsx'
    xlsxwriter_wb = Workbook(new_path)
    xlsxwriter_wb = write2wb(xlsxwriter_wb, highlight_column_list, keyword_type_list,
                             keyword_dict, keyword_format_dict, highlight_column)
    # The xlrd workbook is opened above only so it can be released here
    xlrd_wb.release_resources()
    save_xlsxwriter_wb(xlsxwriter_wb, new_path)
def get_keyword_list(path, re_file_name):
    found_tag = 0
    while found_tag <= 0:
        try:
            wb = xlrd.open_workbook(path)
            found_tag += 1
        except FileNotFoundError:
            input('\n "{0}.xlsx" File Not Found in Current Folder!\n'
                  ' Please put the {0}.xlsx to current folder and Press Enter to continue'
                  .format(re_file_name))
            continue
    ws = wb.sheet_by_index(0)
    # Only the first column of split keywords is read
    keyword_list = [x for x in ws.col_values(colx=0, start_rowx=0)[1:]
                    if type(x) == str and x.strip() != '']
    if keyword_list:
        return keyword_list
    else:
        enter_exit('Cannot read any keyword from the file (first sheet & first column)')
    for value in ws_values:
        current_value = value[1]
        superior = value[2]
        # If there is a parent level, add a children node to the parent's dict
        result_dict = find_dict_hirarchy(result_dict, superior, current_value, zero_level_name)
    result_dict = add_empty_children_value(result_dict)
    return result_dict


# Entry point
if __name__ == '__main__':
    input_dir = '.\\input_dir'
    path_list = [x for x in os.listdir(input_dir) if '~$' not in x and '.xlsx' in x]
    if not path_list:
        enter_exit('No Excel file found in input_dir')
    else:
        for path_original in path_list:
            zero_level_name = path_original.replace('.xlsx', '')
            path = os.path.join(input_dir, path_original)
            workbook = open_workbook(path)
            ws = workbook.sheet_by_index(0)
            try:
                # Both cases can drop the first header row, but only the
                # second one needs fill_empty
                ws_values = convert2values(ws)
                if if_convert_to_tw:
                    ws_values = convert2tw(ws_values)
                result_dict = {'name': zero_level_name, 'children': []}
def generate_complete_index(df, group_columns):
    # Build a complete index from the columns that will become the index
    number_column = ''
    original_group_columns = group_columns[:]
    # Only the min/max of numeric columns matter; text columns are not
    # cross-joined into new data
    column_min = 0
    column_max = 0
    num_unique_values = []
    total_df_list = []
    for c in group_columns:
        try:
            # Numeric column: take its min/max (float columns are normally
            # not used as an index)
            df[c] = df[c].astype(float)
            # After the float cast, check whether the column is entirely empty
            df_not_na = df.loc[df[c].isna() == False, :]
            c_min = int(df[c].fillna(0).min(skipna=True))
            c_max = int(df[c].fillna(0).max(skipna=True))
            if c_max > column_max:
                column_max = c_max
                number_column = c
            if c_min < column_min:
                column_min = c_min
            if column_max < 5:
                # Pad to at least 5 (week_1 would break otherwise)
                num_unique_values = [x for x in range(0, 11)]
            elif column_max >= 5 and column_max <= 15:
                # Keep the padded range at the day/week scale; beyond that the
                # generated cross product gets too large
                num_unique_values = [x for x in range(column_min, column_max + 1)]
            else:
                num_unique_values = [x for x in range(0, 16)]
        except Exception:
            # Not a numeric column; nothing to pad
            pass
    # If a numeric column exists, generate the complete df
    if num_unique_values:
        try:
            group_columns.remove(number_column)
        except ValueError:
            enter_exit('Error when trying to generate complete date index, '
                       'please check datetime column value in config files!')
        df_temp = df.loc[:, group_columns]
        df_temp = df_temp.drop_duplicates()
        for i in num_unique_values:
            df_temp[number_column] = i
            total_df_list.append(df_temp.copy())
        df_temp_total = pd.concat(total_df_list, axis=0, ignore_index=True)
    else:
        find_lack_columns(df, group_columns)
        df_temp_total = df.loc[:, group_columns]
        df_temp_total = df_temp_total.drop_duplicates()
    df_temp_total = df_temp_total.set_index(original_group_columns)
    return df_temp_total
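# A minimal sketch of generate_complete_index: the numeric "week" column has
# a max below 5, so it is padded to the 0..10 range and crossed with every
# distinct "brand" value.
def _demo_generate_complete_index():
    demo_df = pd.DataFrame({'brand': ['OPPO', 'vivo'], 'week': [1, 3]})
    print(generate_complete_index(demo_df, ['brand', 'week']))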
require_table_dict = get_require_files(input_dir, require_table_list, matched_part=None)
bi_aftersale_path = require_table_dict['印度BI售后失效数据']
model_info_path = require_table_dict['产品信息表']
print('Model info table found:', model_info_path)

require_table_list_1 = ['售后类别统计映射', '售后机型过滤']
require_table_dict_1 = get_require_files(base_dir, require_table_list_1, matched_part=None)
type_name_path = require_table_dict_1['售后类别统计映射']
filt_model_path = require_table_dict_1['售后机型过滤']

# Check that the file was found, and whether it ends in CSV
if bi_aftersale_path is None:
    enter_exit('Cannot find the "印度BI售后失效数据" file')
else:
    print('File found: {}'.format(bi_aftersale_path))

# Read the file
if re.search(r'.*\.csv$', bi_aftersale_path) is not None:
    bi_aftersale_df = pd.read_csv(bi_aftersale_path)
elif re.search(r'.*\.xlsx$', bi_aftersale_path) is not None or re.search(r'.*\.xls$', bi_aftersale_path) is not None:
    bi_aftersale_df = pd.read_excel(bi_aftersale_path)

# Some special models need to be removed
special_delete_model = process_aftersale_filt(filt_model_path)
# Read the model info table
model_info_df = pd.read_excel(model_info_path)
model_info_df = model_info_df.rename(rename_dict, axis=1)
rename_dict_reverse = dict([(x, y) for y, x in rename_dict.items()])
h = lambda x, y: y[x]  # map a renamed column back to its original name
tag = 0
for u in info_usecols:
    if u not in model_info_df.columns:
        print('Product info table is missing the column: {} (original name: {}); '
              'add it before continuing'.format(u, h(u, rename_dict_reverse)))
        tag += 1
if tag > 0:
    enter_exit('Add the columns above, then continue')
model_info_df = model_info_df.loc[:, info_usecols]

# Drop models not included in the word-of-mouth report
report_contains_condition = model_info_df['if_report_contains']\
    .apply(lambda x: True if type(x) == str and re.search('^是', x) is not None else False)
model_info_df = model_info_df.loc[report_contains_condition, :]

# Strip OPPO, Samsung, etc. from the model names
brand_list = ['vivo', 'OPPO', 'xiaomi', 'samsung']


def replace_brand(input_word):
    for b in brand_list:
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            logging.error(traceback.format_exc())
            enter_exit(f'Error: Calling function: {func.__name__}')
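# The fragment above reads like the inner function of an error-trapping
# decorator; a sketch of the full pattern under that assumption (the name
# "exit_on_error" is hypothetical):
import functools

def exit_on_error(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            logging.error(traceback.format_exc())
            enter_exit(f'Error: Calling function: {func.__name__}')
    return wrapper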
def get_keyword_dict(path_list):
    # Maps each keyword column to the set of words it should colour
    keyword_dict = defaultdict(set)
    # Maps each keyword column to its colour/format name
    keyword_format_dict = defaultdict(str)
    seen_keywords = set()
    for path in path_list:
        wb = xlrd.open_workbook(path)
        # The sheet name carries the colour
        sheet_names = wb.sheet_names()
        for sn in sheet_names:
            ws = wb.sheet_by_name(sn)
            # Headers decide whether a keyword is written red, blue or bold
            header_list = []
            try:
                for x in ws.row(0):
                    if type(x.value) == str and x.value.strip() != '':
                        header = x.value.strip()
                    elif type(x.value) == float or type(x.value) == int:
                        header = str(x.value).rstrip('0').rstrip('.').strip()
                    else:
                        # Keep a None placeholder so an empty header cell
                        # between two columns does not shift the indices
                        header = None
                    header_list.append(header)
                if not any(header_list):
                    enter_exit(f'Error when reading keywords:\n{path}-"{sn}" should have '
                               'at least one table header (keyword column names).')
            except IndexError:
                enter_exit(f'Error when reading keywords:\n{path}-"{sn}" should have '
                           'at least one table header (keyword column names).')
            for row in list(ws.get_rows())[1:]:
                for i, format_word in enumerate(header_list):
                    if format_word is not None:
                        keyword_value = row[i].value
                        if type(keyword_value) == float and math.ceil(keyword_value) == keyword_value:
                            keyword = str(keyword_value).rstrip('0').rstrip('.').strip()
                        else:
                            # Strip special characters that would confuse the regex
                            keyword = replace_re_special(str(keyword_value).strip().lower())
                        if keyword not in seen_keywords and keyword != "":
                            keyword_dict[format_word].add(keyword)
                            seen_keywords.add(keyword)
            # Record the keyword class each colour corresponds to
            for h in header_list:
                if h is not None:
                    keyword_format_dict[h] = sn.strip().lower()
        wb.release_resources()
    return keyword_dict, keyword_format_dict
class sql_handler():
    def __init__(self):
        self.read_or_write = read_or_write
        self.save_name = save_name
        self.input_dir = input_dir
        if self.read_or_write.lower().strip() == 'read':
            try:
                with open('query_sql.txt', 'r') as file:
                    sql = file.read()
                    # Strip the invisible UTF-8 BOM if present
                    sql = sql.replace('\ufeff', '')
            except FileNotFoundError:
                enter_exit('query_sql.txt Not exists!')
            t1 = time.perf_counter()
            df = execute_fetchall_engine(engine_text=engine_text, sql=sql)
            t2 = time.perf_counter()
            print('Results get in', round(t2 - t1, 0), 'seconds')
            df_length = df.shape[0]
            save_name = 'sql_result'
            save_path = "{0}.xlsx".format(save_name)
            print('Writing to excel')
            if df_length > seperate_batch:
                counter = 0
                for i in range(0, df_length, seperate_batch):
                    counter += 1
                    df_batch = df.iloc[i:i + seperate_batch, :]
                    print('Getting the {}th Row to {}th Row'.format(i, i + seperate_batch))
                    # Create a new file for each batch
                    save_path = "{0}_{1}.xlsx".format(save_name, counter)
                    writer = pd.ExcelWriter(save_path, engine='xlsxwriter',
                                            options={'strings_to_urls': False,
                                                     'strings_to_formulas': False})
                    xlsxwriter_wb = writer.book
                    write_pct_columns(xlsxwriter_wb, 'result', df_batch, pct_columns=['占比'])
                    save_xlsxwriter_wb(writer, save_path)
                    t3 = time.perf_counter()
                    print(round(t3 - t2, 0), 'seconds used')
            else:
                # Row count within the batch limit: extract and write directly
                writer = pd.ExcelWriter(save_path, engine='xlsxwriter',
                                        options={'strings_to_urls': False,
                                                 'strings_to_formulas': False})
                xlsxwriter_wb = writer.book
                write_pct_columns(xlsxwriter_wb, 'result', df, pct_columns=['占比'])
                save_xlsxwriter_wb(writer, save_path)
                t3 = time.perf_counter()
                print(round(t3 - t2), 'seconds used')
        elif self.read_or_write.lower().strip() == 'write':
            list_dir
        else:
            enter_exit()
def write_fault2sql(fault_phen_pathes, conn):
    usecols = ['_id', 'model_id', 'source_content_id', 'website_id',
               'brand', 'model_name', 'Series', 'time', '故障现象', '故障类别',
               '上市时间', '上市价格', '价格区间']
    rename_dict = {'Series': 'series', '故障现象': 'fault_phen', 'time': 'comment_time',
                   '故障类别': 'fault_type', '上市时间': 'launch_date',
                   '上市价格': 'launch_price', '价格区间': 'price_range',
                   '年': 'comment_year', '月': 'comment_month',
                   'model_name': 'model', 'Correct standard_model': 'model_name'}
    not_null_cols = ['_id', 'model_id', 'brand', 'model', 'series', 'comment_time',
                     'fault_phen', 'fault_type', 'launch_date', 'launch_price', 'price_range']
    fault_df_list = []
    for f_path in fault_phen_pathes:
        print('Reading', f_path)
        fault_df = pd.read_excel(f_path)
        # Normalize the possible variants of the model-name column to model_name first
        fault_df = fault_df.rename({'Correct standard_model': 'model_name',
                                    '机型': 'model_name',
                                    '机型名称': 'model_name',
                                    'model': 'model_name',
                                    '修正的机型名称': 'model_name'}, axis=1)
        for u in usecols:
            if u not in fault_df.columns:
                enter_exit("**Missing required column: {}; cannot write**".format(u))
            if fault_df[u].dtype == object:
                fault_df[u] = fault_df[u].str.strip()
        print('\nDropping non-report rows based on the "是否包含在口碑报告中" column\n'
              '(ideally delete the rows not included in the report by hand first)')
        if '是否包含在口碑报告中' in fault_df.columns:
            fault_df = fault_df.loc[fault_df['是否包含在口碑报告中'].str.strip()
                                    .apply(lambda x: True if x[0] == '是' else False) == True, :]
        fault_df = fault_df.loc[:, usecols]
        fault_df = fault_df.rename(rename_dict, axis=1)
        # Check columns for nulls + strip whitespace around string values
        for column in not_null_cols:
            check_non_df = fault_df.loc[fault_df[column].isna(), :]
            fault_df[column] = fault_df[column].apply(lambda x: x.strip() if type(x) == str else x)
            if not check_non_df.empty:
                enter_exit("Column {} contains empty values".format(column))
        start_position = datetime.datetime.strftime(fault_df['comment_time'].min(), '%Y-%m-%d')
        end_position = datetime.datetime.strftime(fault_df['comment_time'].max(), '%Y-%m-%d')
        # Confirm before writing: Enter confirms, closing the window aborts
        input('\n Read {} rows from {} to {} (included in the report)\n'
              ' Press Enter to write them to the database (otherwise close this window)'
              .format(len(fault_df), start_position, end_position))
        print('Writing...')
        t1 = time.perf_counter()
        fault_df['enddate'] = fault_df['comment_time'].apply(lambda x: last_day_of_month(x))
        # Strip the brand out of the model name
        fault_df['model'] = fault_df['model'].apply(
            lambda x: re.sub(pattern='(samsung|OPPO|vivo|xiaomi) ', repl='', string=x, flags=re.I))
        # Guard against models with surrounding whitespace
        fault_df['model'] = fault_df['model'].apply(lambda x: x.strip())
        fault_df = merge_report_fault_type(fault_df)
        fault_df['comment_year'] = fault_df['comment_time'].dt.year
        fault_df['comment_month'] = fault_df['comment_time'].dt.month
        # Write to the database first
        write2table(engine_text, fault_df, 'ecommerce_data', how='mysql_load')
        print('Written to MySQL:', f_path)
        t2 = time.perf_counter()
        print('Took', round(t2 - t1, 0), 'seconds\n')
    writer_ws.write(0, 0, headers[0], bold)
    writer_ws.write(0, 1, headers[1], bold)
    for i, text in enumerate(column_series):
        target_text, drop_text = split_keywords(keyword_list, text)
        writer_ws.write(i + 1, 0, target_text)
        writer_ws.write(i + 1, 1, drop_text)
    writer_ws.set_column(0, 0, 40)
    writer_ws.set_column(1, 1, 40)
    save_xlsxwriter_wb(writer_wb, save_path)
    xlrd_wb.release_resources()


require_tables = get_require_files('.\\', ['split_keyword'])
keyword_path = require_tables['split_keyword']
keyword_list = get_keyword_list(keyword_path, 'Split_keyword')
content_path = choose_file(path=r'.\\')
new_path = re.match(r'(.*)\.(xlsx|xls)$', content_path).group(1) + '_Split.xlsx'
split_write(new_path, content_path, keyword_list)
enter_exit('')
def write2table(engine_text, df, table_name, how='normal'):
    # engine_text format: "mysql://*****:*****@localhost:3306/web_data?charset=utf8"
    """
    Three write modes:
    1. normal: append directly;
    2. complete_rewrite: truncate the table, then write the new data in full;
    3. mysql_load: write via MySQL LOAD DATA (tested to be very fast, and
       rebuilding indexes afterwards is also fast). Suitable for very large
       datasets; the MySQL table structure must already exist.
    """
    db = create_engine(engine_text, poolclass=NullPool)
    conn = db.connect()
    if how == 'normal':
        df.to_sql(table_name, con=conn, if_exists='append', index=False, chunksize=100000)
    elif how == 'complete_rewrite':
        try:
            truncate_statement = "truncate {};".format(table_name)
            conn.execute(truncate_statement)
        except Exception:
            print('truncate table "{}" failed!'.format(table_name))
        df.to_sql(table_name, con=conn, if_exists='append', index=False, chunksize=100000)
    # mysql_load: bulk load through a temporary CSV
    elif how == 'mysql_load':
        # If the table has an auto-increment primary key, drop it from the header
        auto_increment_key = ''
        header_column_sql = text(" describe {} ;".format(table_name))
        result = conn.execute(header_column_sql).fetchall()
        if result:
            header_columns = [x[0] for x in list(result)]
        else:
            enter_exit('Table not found:', table_name)
        # Build a LOAD-able CSV from the MySQL table structure; check whether
        # the table has an auto-increment primary key
        auto_increment_key_sql = """
        SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = DATABASE()
          AND TABLE_NAME = '{}'
          AND DATA_TYPE = 'int'
          AND COLUMN_DEFAULT IS NULL
          AND IS_NULLABLE = 'NO'
          AND EXTRA like '%auto_increment%';
        """.format(table_name)
        auto_increment_key_sql = text(auto_increment_key_sql)
        result_auto = conn.execute(auto_increment_key_sql).fetchone()
        if result_auto is not None:
            auto_increment_key = list(result_auto)[0]
            header_columns = [x for x in header_columns if x != auto_increment_key]
        for h in header_columns:
            if h not in df.columns:
                df[h] = None
        df = df.loc[:, header_columns]
        # Dump the data to a temporary location
        temp_path = get_most_upper_level_path('df_temp_file.csv')
        df.to_csv(temp_path, encoding='utf8', sep=',', quotechar='"',
                  escapechar='\\', index=False, header=None)
        if auto_increment_key != '':
            load_infile_sql = r"""
            LOAD DATA INFILE '{0}' INTO TABLE {1}
            CHARACTER SET 'utf8mb4'
            FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '\\'
            LINES TERMINATED BY '\r\n'
            ({2})
            SET {3} = NULL; -- setting the auto-increment key to NULL lets it populate normally
            """.format(temp_path, table_name, ','.join(header_columns), auto_increment_key)
        else:
            load_infile_sql = r"""
            LOAD DATA INFILE '{0}' INTO TABLE {1}
            CHARACTER SET 'utf8mb4'
            FIELDS TERMINATED BY ',' ENCLOSED BY '"' ESCAPED BY '\\'
            LINES TERMINATED BY '\r\n';
            """.format(temp_path, table_name)
        load_infile_sql = text(load_infile_sql).execution_options(autocommit=True)
        try:
            conn.execute(load_infile_sql)
        except Exception:
            logging.error(traceback.format_exc())
        finally:
            conn.close()
            db.dispose()
        # Remove the temporary CSV file
        os.remove(temp_path)
    else:
        print(how, 'unknown write mode; data not written')
    conn.close()
    db.dispose()
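# A minimal usage sketch for write2table; the connection string and table are
# placeholders. 'mysql_load' additionally requires the server to allow
# LOAD DATA INFILE and the target table to exist with a matching schema.
def _demo_write2table():
    engine_text = 'mysql://user:password@localhost:3306/web_data?charset=utf8'
    demo_df = pd.DataFrame({'model': ['A5'], 'fault_phen': ['screen']})
    write2table(engine_text, demo_df, 'ecommerce_data', how='normal')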