Example #1
def read_config_table(file_path, dtype=str):
    df = pd.DataFrame([])
    df_workbook = None
    # Read the data file; only parse the first visible sheet
    if file_path.endswith('.xlsx'):
        df_workbook = pd.ExcelFile(file_path)
        sheet_name_list = df_workbook.book.sheetnames
        for sheet in sheet_name_list:
            if df_workbook.book[sheet].sheet_state == 'visible':
                df = df_workbook.parse(sheet, dtype=dtype)
                break
    else:
        try:
            df = pd.read_excel(file_path, dtype=dtype)
        except Exception:
            # read_html returns a list of tables; take the first one
            df = pd.read_html(file_path, header=0)[0]

    if df.empty:
        enter_exit(f'Cannot read any visible table in "{file_path}"')

    if df_workbook is not None:
        df_workbook.close()

    return df
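A minimal usage sketch for the helper above (the file name here is hypothetical); it returns the first visible sheet as a DataFrame of strings:

# Hypothetical config file; read_config_table is the function above.
config_df = read_config_table('machine_config.xlsx')
print(config_df.head())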
Example #2
def run_vba(original_path, new_path, macro, macro_sub_name):
    # Check that the input file exists and that the output folder is ready
    if not os.path.isfile(original_path):
        enter_exit(f"File not found: {original_path}")

    check_create_new_folder(new_path)

    xlapp = None

    try:
        # Open the workbook and run the VBA. If this raises an error, search
        # for the gen_py cache folder (e.g. with Everything), delete it, and
        # let win32com regenerate it
        xlapp = client.gencache.EnsureDispatch('Excel.Application')
        # xlapp = client.dynamic.Dispatch('Excel.Application')
        # xlapp = client.DispatchEx('Excel.Application')
        xlapp.Visible = 0
        xlapp.DisplayAlerts = False
        xlwb = xlapp.Workbooks.Open(original_path, ReadOnly=1)
        xlwb.Visible = 0
        # Add a standard module (component type 1 = vbext_ct_StdModule)
        xlwb.VBProject.VBComponents.Add(1)

        for i in xlwb.VBProject.VBComponents:
            if i.name == '模块1':  # default new-module name in Chinese-locale Excel
                module = xlwb.VBProject.VBComponents.Item(i.name).CodeModule
                module.AddFromString(macro)
                xlwb.Application.Run(macro_sub_name)

        xlwb.SaveAs(new_path, FileFormat=51, ConflictResolution=2)
        xlwb.Close(True)
        print("{} Draw completed".format(new_path.split('\\')[-1]))
    except Exception as e:
        logging.error(traceback.format_exc())
    finally:
        if xlapp is not None:
            xlapp.Quit()
        del xlapp
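A usage sketch, assuming Windows with Excel and pywin32 installed; the paths and macro below are hypothetical. Note that the loop above looks for '模块1', the default name of a newly added module in a Chinese-locale Excel:

# Hypothetical macro: bold the header row of the first worksheet.
macro = '''
Sub FormatHeader()
    Worksheets(1).Rows(1).Font.Bold = True
End Sub
'''
run_vba(r'C:\data\report.xlsx', r'C:\data\out\report_out.xlsx',
        macro, 'FormatHeader')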
Example #3
def get_sql_data(engine_text, table_name, sql, save_path, how='normal'):
    """通过SQL获取到目标数据并保存到文档"""
    db = create_engine(engine_text, poolclass=NullPool)
    conn = db.connect()

    if how == 'normal':
        # execute returns a ResultProxy; fetchall returns an empty list when there are no rows
        fetchall_result = execute_fetchall(conn, sql)
        result_df = convert_fetchall2df(fetchall_result)
        result_df.to_excel(save_path, index=False)
    elif how == 'mysql_dump':
        # Dump to a temporary location. MySQL OUTFILE cannot handle non-ASCII
        # (e.g. Chinese) paths, so write to a top-level path such as C:/ first
        temp_path = get_most_upper_level_path('df_temp_file.csv')

        # Fetch the header once first; the SQL must not end with a semicolon
        if sql.strip().endswith(';'):
            sql = sql.strip()[:-1]

        try:
            first_row = conn.execute(sql)
            header = list(first_row.keys())
        except Exception as e:
            logging.error(traceback.format_exc())
            print(sql, 'data extraction failed')
            conn.close()
            db.dispose()
            return None

        dump_sql = r"""  {}
                        INTO OUTFILE '{}' 
                        CHARACTER SET 'utf8mb4'
                        FIELDS TERMINATED BY ',' ENCLOSED BY '"'
                        ESCAPED BY '\\' 
                        LINES TERMINATED BY '\r\n' ; 
                    """.format(sql, temp_path)

        if os.path.exists(temp_path):
            os.remove(temp_path)

        dump_sql = text(dump_sql)
        conn.execute(dump_sql)
        # After the dump, copy the result to the target folder, converting to
        # Excel if requested. OUTFILE writes no header row, so supply it here
        if save_path.split('.')[-1] == 'xlsx':
            result_df = pd.read_csv(temp_path, header=None, names=header)
            result_df.to_excel(save_path, index=False)
            os.remove(temp_path)
        else:  # Not saving as xlsx: copy the CSV result straight to the target folder
            copyfile(temp_path, save_path)

        print(save_path, 'data saved')
    else:
        enter_exit(how, 'unrecognized export method')

    conn.close()
    db.dispose()
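A sketch of the 'normal' path with a hypothetical DSN and query (execute_fetchall and convert_fetchall2df are assumed to come from the same module):

engine_text = 'mysql://user:password@localhost:3306/web_data?charset=utf8'
sql = 'SELECT model, fault_phen FROM ecommerce_data LIMIT 100'
get_sql_data(engine_text, 'ecommerce_data', sql, 'result.xlsx', how='normal')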
Example #4
def group_basic_agg(df,
                    group_column,
                    agg_func,
                    value_column=None,
                    keyword_list=[],
                    stopword_list=[],
                    group_index=False):
    """
	根据分类统计基本的类别数量
	:param df : input df 
	:param group_column : groupby column, list
	:param agg_function : same as df agg func
	:return :  dataframe contains agg result
	"""
    df_copy = df.copy()

    if not isinstance(group_column, list):
        group_column = [group_column]

    # If no value column is given, only a simple count can be computed
    if value_column is None:
        agg_func_column = [agg_func]
        df_copy[agg_func_column] = True
    else:
        agg_func_column = [value_column]

    lack_column_list = find_lack_columns(df_copy,
                                         agg_func_column + group_column,
                                         'Group statistic')

    df_copy = df_copy.loc[:, agg_func_column + group_column]

    df_copy = df_copy.fillna(0)

    if 'word' in agg_func:
        # agg_func_column is a list; word-frequency stats use only its first element
        df_copy_agg = word_agg_func(df_copy, group_column, agg_func,
                                    agg_func_column[0], keyword_list,
                                    stopword_list)
    else:
        try:
            df_copy_agg = df_copy.groupby(group_column).agg(
                agg_func).sort_values(by=agg_func_column,
                                      ascending=False).fillna(0)
        except AttributeError:
            enter_exit(f'"{agg_func}" function not found')
        except Exception:
            enter_exit(
                f'Failed to execute calc function: "{agg_func}" in "{agg_func_column[0]}"'
            )

    if not group_index:
        df_copy_agg = df_copy_agg.reset_index()

    return df_copy_agg
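A toy sketch of the numeric path (data and column names are made up):

import pandas as pd

df = pd.DataFrame({'brand': ['A', 'A', 'B'], 'sales': [10, 20, 5]})
# Sums 'sales' per brand; the result comes back sorted in descending order
by_brand = group_basic_agg(df, 'brand', 'sum', value_column='sales')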
Example #5
def get_sql_connection(engine_text):

    try:
        print(f'Creating connection to {engine_text}')
        db = create_engine(engine_text, poolclass=NullPool)
        conn = db.connect()
    except Exception as e:
        logging.error(traceback.format_exc())
        enter_exit(f"Failed to connect database with config:\n  {engine_text}")

    return conn, db
Example #6
def df_query(df, condition):
    if isinstance(condition, str) and condition.strip() != "":
        try:
            df = df.query(condition, engine='python')
        except Exception:
            write_format_columns('Filter condition-Result when error.xlsx', df,
                                 'content')
            enter_exit(
                f'Unable to compile the following filter condition:\n"{condition}"'
            )

    return df
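A quick sketch with toy data; engine='python' lets query evaluate expressions the default numexpr engine cannot:

import pandas as pd

df = pd.DataFrame({'price': [100, 250], 'brand': ['A', 'B']})
filtered = df_query(df, 'price > 150 and brand == "B"')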
Example #7
def find_lack_columns(df, require_columns, error_func=''):

    lack_column_list = []
    for r in require_columns:
        if r not in df.columns:
            lack_column_list.append(r)

    if lack_column_list:
        error_msg = ','.join(lack_column_list)
        enter_exit(f"{error_func} Error - Missing columns:{error_msg}")

    return lack_column_list
Example #8
def get_sqlite_connection(db_path):
    if db_path.strip() == '':
        db_path = ":memory:"
    try:
        print('Creating connection to {}'.format(db_path.strip(':')))
        conn = sqlite3.connect(db_path)
        # Increase the busy timeout so a concurrent query does not hit
        # "table is locked" before a pending write finishes
        conn.execute("PRAGMA busy_timeout=1800000")
        conn.commit()
    except Exception as e:
        logging.error(traceback.format_exc())
        enter_exit(f"Failed to connect database with config:\n  {db_path}")
    return conn
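A usage sketch: an empty path falls back to an in-memory database, and the PRAGMA above makes a blocked connection wait up to 30 minutes instead of failing immediately:

conn = get_sqlite_connection('')  # empty path -> ':memory:'
conn.execute('CREATE TABLE t (x INTEGER)')
conn.execute('INSERT INTO t VALUES (1)')
conn.commit()
print(conn.execute('SELECT x FROM t').fetchall())
conn.close()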
Example #9
def normalize_dates(df_worksheet, date_columns, table_path='default', sheet='default'):
    """
    Check for empty dates, and for dates written so badly that they cannot
    be converted to pandas datetime.
    """
    if not isinstance(date_columns, list):
        date_columns = [date_columns]

    for date_column in date_columns:
        # Convert the post-time column; date errors may occur here
        check_date_error = 0

        check_date_empty = df_worksheet.loc[df_worksheet[date_column].isna(), :]
        # Record rows with an empty date; blanks are not allowed
        if not check_date_empty.empty:
            empty_date_row = [str(x + 2) for x in check_date_empty.index]
            empty_date_row = ','.join(empty_date_row)
            enter_exit(
                'Date must not be empty: File:{}, Sheet:{}, Row:{}. Please add a valid date and try again!'
                .format(table_path, sheet, empty_date_row))

        try:  # If everything parses, convert straight to datetime
            df_worksheet[date_column] = pd.to_datetime(
                df_worksheet[date_column])

        except Exception:
            # The post time may be stored as an int/float Excel serial date
            # (Excel shows e.g. "Jan 2"; the cell actually holds a serial number)
            df_worksheet[date_column] = df_worksheet[date_column].apply(
                lambda x: datetime.datetime.strftime(xldate_as_datetime(x, 0), '%Y-%m-%d')
                if isinstance(x, (int, float)) else x)
            try:
                df_worksheet[date_column] = pd.to_datetime(
                    df_worksheet[date_column])
                check_date_error += 1
            except (ValueError, OutOfBoundsDatetime) as e:
                for i, t in zip(df_worksheet.index, df_worksheet[date_column]):
                    try:
                        convert_t = pd.to_datetime(t)
                    except ValueError:
                        print('Invalid date format: "{}"'.format(t), 'row {} of the file'.format(i + 2))
            # The post-time column could not be mapped
            except KeyError:
                print('Cannot map the "{}" column; check the table it comes from: "{}", "{}"'.format(
                    date_column, table_path, sheet))
                input('Press Enter to exit')

            if check_date_error < 1:
                print('Invalid date format in "{}", "{}"; fix the dates above and rerun'.format(table_path, sheet))
                input('Press Enter to exit')
                sys.exit()

    return df_worksheet
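A minimal sketch with well-formed dates; the table_path/sheet arguments only appear in error messages, and malformed or empty dates would abort with a row-numbered message:

import pandas as pd

df = pd.DataFrame({'post_time': ['2020-01-02', '2020-02-03']})
df = normalize_dates(df, 'post_time', table_path='demo.xlsx', sheet='Sheet1')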
Example #10
def stack_columns_to_multi_row(df, target_stack_name=None, regex_format=None):
    """Collapse multiple spread-out fault columns into one, keyed by _id.
    By default, trailing columns whose headers carry a numeric suffix
    (0, 1, 2, 3, ...) are the fault columns to be stacked.
    :param target_stack_name: name of the resulting stacked column
    :param regex_format: optional regex to further filter the stackable columns
    :return: result df
    """
    # Find the columns that carry a numeric suffix
    stack_column_pat = '([^0-9]+)[0-9]{1,2}'

    df_columns = df.columns
    stack_columns = []

    if regex_format is None:
        stack_columns = [
            x for x in df_columns if re.match(stack_column_pat, x) is not None
        ]
    else:
        stack_columns = [
            x for x in df_columns if re.match(stack_column_pat, x) is not None
            and re.match(regex_format, x) is not None
        ]

        if target_stack_name is None and stack_columns:
            target_stack_name = re.match(regex_format,
                                         stack_columns[0]).group()

    # Keep only the trailing run of sequentially ordered columns
    stack_columns = get_list_partial_sorted(stack_columns)

    stack_number_list = [
        re.match(stack_column_pat, x).group() for x in df_columns
        if re.match(stack_column_pat, x) is not None
        and (regex_format is None or re.match(regex_format, x) is not None)
    ]
    # Collect the stacked values
    stack_line_list = []
    if len(stack_columns) < 2:
        enter_exit('No fault columns with a numeric suffix were found in the e-commerce crawler data')
    else:  # Matching columns were found
        stack_header_name = re.match(stack_column_pat,
                                     stack_columns[0]).group(1)
        if target_stack_name is None:
            target_stack_name = stack_header_name
        for i, row in df.loc[:, stack_columns].iterrows():
            stack_line = ','.join(
                sorted(
                    [x for x in row.values if isinstance(x, str) and x != 'nan']))
            stack_line_list.append(stack_line)

    df[target_stack_name] = stack_line_list
    df_columns = [x for x in df_columns if x not in stack_columns
                  ] + [target_stack_name]

    # Put the stacked column at the end
    df = df.loc[:, df_columns]
    return df
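A toy sketch, assuming the get_list_partial_sorted helper is available; the suffixed fault1/fault2 columns collapse into one comma-joined column:

import pandas as pd

df = pd.DataFrame({'_id': [1, 2],
                   'fault1': ['screen', 'battery'],
                   'fault2': ['battery', float('nan')]})
out = stack_columns_to_multi_row(df)  # row 1 becomes 'battery,screen'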
Example #11
def get_enddate(txt_path):
    try:
        with open(txt_path, 'r', encoding='utf-8') as file:
            enddate_str = file.read()
            # Guard against a malformed hand-written date
            try:
                enddate = pd.to_datetime(enddate_str)
                enddate_str = datetime.datetime.strftime(enddate, '%Y%m%d')
            except Exception:
                enter_exit('The date is invalid')
    except Exception:
        enter_exit(f'Cannot read {txt_path}; make sure the file is in the input data folder')

    return enddate_str
Example #12
def word_agg_func(df, group_column, agg_func, agg_func_column, keyword_list,
                  stopword_list):

    # When counting keywords, clear the stopword list
    if 'keyword' in agg_func:
        stopword_list = []
        # The keyword list must not be empty
        if len(keyword_list) == 0:
            enter_exit(
                'Cannot count keywords with an empty keyword list; please check the keyword files!'
            )

    # Count each record only once even if a word appears several times; this does not apply to the sum logic
    if 'unique' in agg_func and 'count' in agg_func:
        df[agg_func_column] = df[agg_func_column].swifter.progress_bar(
            enable=True, desc=agg_func).apply(lambda x: ' '.join(
                set(process_text_eng(x, keyword_list, stopword_list)))
                                              if isinstance(x, str) else x)

    if agg_func.startswith('keyword_count_en'):
        df_agg = group_word_count(df,
                                  group_column,
                                  agg_func,
                                  agg_func_column,
                                  keyword_list=keyword_list,
                                  stopword_list=[],
                                  count_keywords_only=True)

    elif "word_count_en" == agg_func[:len('word_count_en')]:
        df_agg = group_word_count(df,
                                  group_column,
                                  agg_func,
                                  agg_func_column,
                                  keyword_list,
                                  stopword_list,
                                  count_keywords_only=False)

    elif agg_func.startswith('keyword_sum'):  # like word_sum: sum the numbers in the dict
        df_agg = group_dict_sum(df, group_column, agg_func_column)

    elif agg_func.startswith('word_sum'):
        df_agg = group_dict_sum(df, group_column, agg_func_column)

    else:  # Placeholder for now; Chinese word segmentation is not implemented yet
        enter_exit(f'{agg_func} not supported yet.')

    return df_agg
Example #13
def normalize_dates_single(date):
    # Simplified version of the above, normalizing a single date
    try:
        date = pd.to_datetime(date)
    except Exception:
        if isinstance(date, (int, float)):
            # Treat the value as an Excel serial date
            date = datetime.datetime.strftime(xldate_as_datetime(date, 0),
                                              '%Y-%m-%d')
            try:
                date = pd.to_datetime(date)
            except Exception:
                enter_exit(f'Error when converting "{date}" to datetime')
        else:
            enter_exit(f'Error when converting "{date}" to datetime')

    return date
Example #14
def highlight_kw(path, keyword_dict, keyword_format_dict):

    content_path = choose_file(path)

    try:
        xlrd_wb, xlrd_ws, highlight_column = choose_sheet_column(content_path)
    except Exception:
        enter_exit(f'Unknown Error raised when reading {content_path}')

    new_path = re.match(r'(.*)\.(xlsx|xls|xlsm)$',
                        content_path).group(1) + '_Macro.xlsx'

    xlsxwriter_wb = Workbook(new_path)
    xlsxwriter_wb = highlight(xlsxwriter_wb, xlrd_ws, highlight_column,
                              keyword_dict, keyword_format_dict)

    # Passed in only so its resources can be released here
    xlrd_wb.release_resources()

    save_xlsxwriter_wb(xlsxwriter_wb, new_path)
Example #15
def highlight_kw(path, keyword_dict, keyword_format_dict):

    content_path = choose_file(path)

    # By default, read the first two columns of the first sheet
    xlrd_wb = xlrd.open_workbook(content_path)
    xlrd_ws = xlrd_wb.sheet_by_index(0)

    if xlrd_ws.ncols < 2:
        enter_exit(
            "Error: input file has fewer than 2 columns (Content, Keyword_type)!"
        )

    highlight_column = xlrd_ws.cell(0, 0).value

    highlight_column_list = xlrd_ws.col_values(0)

    keyword_type_list = []
    for row_index in range(xlrd_ws.nrows):
        row_values = xlrd_ws.row_values(row_index)
        cell = row_values[1]
        # Keep non-empty cells (xlrd yields '' for empty cells, not None)
        if cell is not None and (not isinstance(cell, str) or cell.strip() != ''):
            keyword_type_list.append(cell)
        else:
            keyword_type_list.append(None)

    new_path = re.match(r'(.*)\.(xlsx|xls|xlsm)$',
                        content_path).group(1) + '_HL_Type.xlsx'

    xlsxwriter_wb = Workbook(new_path)
    xlsxwriter_wb = write2wb(xlsxwriter_wb, highlight_column_list,
                             keyword_type_list, keyword_dict,
                             keyword_format_dict, highlight_column)

    # Passed in only so its resources can be released here
    xlrd_wb.release_resources()

    save_xlsxwriter_wb(xlsxwriter_wb, new_path)
Example #16
def get_keyword_list(path, re_file_name):
    found_tag = 0
    while found_tag <= 0:
        try:
            wb = xlrd.open_workbook(path)
            found_tag += 1
        except FileNotFoundError:
            input('\n "{0}.xlsx" File Not Found in Current Folder!\n \
Please put the {0}.xlsx to current folder and Press Enter to continue'.format(
                re_file_name))
            continue
    ws = wb.sheet_by_index(0)

    # Only read the split keywords from the first column
    keyword_list = [
        x for x in ws.col_values(colx=0, start_rowx=0)[1:]
        if isinstance(x, str) and x.strip() != ''
    ]

    if keyword_list:
        return keyword_list
    else:
        enter_exit(
            'Cannot read any keyword from the file(first sheet&first column)')
Example #17
	for value in ws_values:
		current_value = value[1]
		superior = value[2]
		#If there is a parent level, append to the parent dict's children node
		result_dict = find_dict_hirarchy(result_dict,superior,current_value,zero_level_name)
	result_dict = add_empty_children_value(result_dict)
	return result_dict

#Entry point
if __name__ == '__main__':

	input_dir = '.\\input_dir'
	path_list = [ x for x in os.listdir(input_dir) if '~$' not in x and '.xlsx' in x ]

	if not path_list:
		enter_exit('No Excel files found in input_dir')
	else:
		for path_original in path_list:
			zero_level_name = path_original.replace('.xlsx','')
			path = os.path.join(input_dir,path_original)

			workbook = open_workbook(path)
			ws = workbook.sheet_by_index(0)

			try:
				#Both cases drop the first row, but only the second case needs fill_empty
				ws_values = convert2values(ws)
				if if_convert_to_tw :
					ws_values = convert2tw(ws_values)

				result_dict = {'name':zero_level_name,'children':[ ]}
Example #18
def generate_complete_index(df, group_columns):
    # Build a complete index from the columns that will form the index
    number_column = ''

    original_group_columns = group_columns[:]
    # Only the min/max of numeric columns matters; text columns need no
    # cross-generated values
    column_min = 0
    column_max = 0
    num_unique_values = []

    total_df_list = []
    for c in group_columns:
        try:  # Numeric column? If so, take min/max (floats are rarely used as an index)
            df[c] = df[c].astype(float)
            # After converting to float, skip the column if it is entirely empty
            df_not_na = df.loc[df[c].notna(), :]
            if df_not_na.empty:
                continue
            c_min = int(df[c].fillna(0).min(skipna=True))
            c_max = int(df[c].fillna(0).max(skipna=True))
            if c_max > column_max:
                column_max = c_max
                number_column = c
            if c_min < column_min:
                column_min = c_min

            if column_max < 5:  # Guarantee at least 5 (week_1 raised errors below that)
                num_unique_values = [x for x in range(0, 11)]
            # Cap the filled range: day/week granularity rarely exceeds this,
            # and a larger range generates too much data
            elif column_max >= 5 and column_max <= 15:
                num_unique_values = [
                    x for x in range(column_min, column_max + 1)
                ]
            else:
                num_unique_values = [x for x in range(0, 16)]

        except Exception:  # Not a numeric column: take all of its unique values
            pass

    # If a numeric column exists, generate the complete df
    if num_unique_values:
        try:
            group_columns.remove(number_column)
        except ValueError:  # list.remove raises ValueError when the item is missing
            enter_exit(
                'Error when trying to generate complete date index, please check datetime column value in config files!'
            )
        df_temp = df.loc[:, group_columns]
        df_temp = df_temp.drop_duplicates()

        for i in num_unique_values:
            df_temp[number_column] = i
            total_df_list.append(df_temp.copy())

        df_temp_total = pd.concat(total_df_list, axis=0, ignore_index=True)
    else:
        find_lack_columns(df, group_columns)
        df_temp_total = df.loc[:, group_columns]
        df_temp_total = df_temp_total.drop_duplicates()

    df_temp_total = df_temp_total.set_index(original_group_columns)

    return df_temp_total
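A toy sketch (column names made up): the numeric 'week' column is padded to a full range, so every model gets every week in the index. Note the function mutates the group_columns list it is given:

import pandas as pd

df = pd.DataFrame({'model': ['A', 'A', 'B'], 'week': [1, 3, 2]})
full_index = generate_complete_index(df, ['model', 'week'])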
Example #19
	require_table_dict = get_require_files(input_dir,require_table_list,matched_part=None)

	bi_aftersale_path = require_table_dict['印度BI售后失效数据']

	model_info_path = require_table_dict['产品信息表']
	print('Model info table found:',model_info_path)

	require_table_list_1 = ['售后类别统计映射','售后机型过滤']
	require_table_dict_1 = get_require_files(base_dir,require_table_list_1,matched_part=None)

	type_name_path = require_table_dict_1['售后类别统计映射']
	filt_model_path = require_table_dict_1['售后机型过滤']

	#Check that the file was found and whether it ends with .csv
	if bi_aftersale_path is None :
		enter_exit('Cannot find the "印度BI售后失效数据" file')
	else:
		print('File found: {}'.format(bi_aftersale_path))

	#Read the file
	if re.search(r'.*\.csv$',bi_aftersale_path) is not None:
		bi_aftersale_df = pd.read_csv(bi_aftersale_path)
	elif re.search(r'.*\.xlsx$',bi_aftersale_path) is not None or re.search(r'.*\.xls$',bi_aftersale_path) is not None:
		bi_aftersale_df = pd.read_excel(bi_aftersale_path)

	#Some special models need to be removed
	special_delete_model = process_aftersale_filt(filt_model_path)

	#Read the model table
	model_info_df = pd.read_excel(model_info_path)
Example #20
model_info_df = model_info_df.rename(rename_dict, axis=1)

rename_dict_reverse = dict([(x, y) for y, x in rename_dict.items()])

# Map a renamed column back to its original name
h = lambda x, y: y[x]

tag = 0
for u in info_usecols:
    if u not in model_info_df.columns:
        print('Product info table is missing column: {} (original name: {}); add it before continuing'.format(
            u, h(u, rename_dict_reverse)))
        tag += 1

if tag > 0:
    enter_exit('Add the missing columns above before continuing')

model_info_df = model_info_df.loc[:, info_usecols]

# Drop models not included in the word-of-mouth report ('是' means "yes")
report_contains_condition = model_info_df['if_report_contains']\
    .apply(lambda x: isinstance(x, str) and re.search('^是', x) is not None)

model_info_df = model_info_df.loc[report_contains_condition, :]

# Remove brand names such as OPPO and Samsung from the model names
brand_list = ['vivo', 'OPPO', 'xiaomi', 'samsung']


def replace_brand(input_word):
    for b in brand_list:
Example #21
def wrapper(*args, **kwargs):
    # Inner function of an error-catching decorator; the enclosing
    # decorator definition is not included in this snippet
    try:
        return func(*args, **kwargs)
    except Exception:
        logging.error(traceback.format_exc())
        enter_exit(f'Error: Calling function: {func.__name__}')
Example #22
def get_keyword_dict(path_list):
    # For each keyword column, the set of words that need its color
    keyword_dict = defaultdict(set)
    # For each keyword column, its category (the sheet name carries the color)
    keyword_format_dict = defaultdict(str)

    seen_keywords = set()
    for path in path_list:
        wb = xlrd.open_workbook(path)
        # The sheet name carries the color
        sheet_names = wb.sheet_names()
        for sn in sheet_names:
            ws = wb.sheet_by_name(sn)
            # Header row: determines whether to write red, blue, or bold
            header_list = []
            try:
                for x in ws.row(0):
                    if isinstance(x.value, str) and x.value.strip() != '':
                        header = x.value.strip()
                    elif isinstance(x.value, (float, int)):
                        header = str(x.value).rstrip('0').rstrip('.').strip()
                    else:
                        # Guard against an empty header cell between two columns
                        header = None

                    if header is not None:
                        header_list.append(header)

                if not header_list:
                    enter_exit(
                        f'Error when reading keywords:\n{path}-"{sn}" should have at least one table header(keyword column names).'
                    )
            except IndexError:
                enter_exit(
                    f'Error when reading keywords:\n{path}-"{sn}" should have at least one table header(keyword column names).'
                )

            for row in list(ws.get_rows())[1:]:
                for i, format_word in enumerate(header_list):
                    if format_word is not None:
                        keyword_value = row[i].value
                        if isinstance(keyword_value, float) and math.ceil(
                                keyword_value) == keyword_value:
                            keyword = str(keyword_value).rstrip('0').rstrip(
                                '.').strip()
                        else:  # Strip special characters that could break the regex
                            keyword = replace_re_special(
                                str(keyword_value).strip().lower())

                        if keyword not in seen_keywords and keyword != "":
                            keyword_dict[format_word].add(keyword)

                        seen_keywords.add(keyword)

            # Record the keyword category corresponding to each color
            for h in header_list:
                if h is not None:
                    keyword_format_dict[h] = sn.strip().lower()

        wb.release_resources()

    return keyword_dict, keyword_format_dict
Example #23
class sql_handler():
	def __init__(self, read_or_write, save_name, input_dir):
		self.read_or_write = read_or_write
		self.save_name = save_name
		self.input_dir = input_dir

	# NOTE: this block sits at class level in the original snippet but
	# references self, so it belongs inside a method of sql_handler
	if self.read_or_write.lower().strip() == 'read':

		try:
			with open('query_sql.txt','r') as file:
				sql = file.read()
				#Strip the invisible UTF-8 BOM marker, if present
				sql = sql.replace('\ufeff','')
		except FileNotFoundError:
			enter_exit('query_sql.txt does not exist!')

		t1 = time.perf_counter()
		df = execute_fetchall_engine(engine_text=engine_text,sql=sql)

		t2 = time.perf_counter()

		print('Results fetched in',round(t2 - t1,0),'seconds')

		df_length = df.shape[0]
		save_name = 'sql_result'

		save_path = "{0}.xlsx".format(save_name)

		print('Writing to excel')

		if df_length > seperate_batch:
			counter = 0 
			for i in range(0,df_length,seperate_batch):
				counter += 1 
				df_batch = df.iloc[i:i+seperate_batch,:]

				print('Getting rows {} to {}'.format(i,i+seperate_batch))
				#Create a new output file
				save_path  = "{0}_{1}.xlsx".format(save_name,counter)

				writer = pd.ExcelWriter(save_path,engine='xlsxwriter',options={'strings_to_urls': False,'strings_to_formulas': False})

				xlsxwriter_wb = writer.book

				write_pct_columns(xlsxwriter_wb,'result',df_batch,pct_columns=['占比'])

				save_xlsxwriter_wb(writer,save_path)

				t3 = time.perf_counter()
				print(round(t3-t2,0),'seconds used')

		else:#Row count within the batch limit: fetch and write in one go
			writer = pd.ExcelWriter(save_path,engine='xlsxwriter',options={'strings_to_urls': False,'strings_to_formulas': False})

			xlsxwriter_wb = writer.book

			write_pct_columns(xlsxwriter_wb,'result',df,pct_columns=['占比'])

			save_xlsxwriter_wb(writer,save_path)

			t3 = time.perf_counter()
			print(round(t3-t2),'seconds used')

	elif self.read_or_write.lower().strip() == 'write':
		list_dir
	else:
Example #24
def write_fault2sql(fault_phen_pathes,conn):
	usecols = ['_id','model_id','source_content_id','website_id',
		   'brand','model_name','Series','time','故障现象','故障类别',
		   '上市时间','上市价格','价格区间']

	rename_dict = {'Series':'series','故障现象':'fault_phen','time':'comment_time',
				  '故障类别':'fault_type','上市时间':'launch_date',
				  '上市价格':'launch_price','价格区间':'price_range',
				  '年':'comment_year','月':'comment_month',
				  'model_name':'model','Correct standard_model':'model_name'}

	not_null_cols = ['_id','model_id','brand','model','series','comment_time','fault_phen','fault_type',
					'launch_date','launch_price','price_range']

	fault_df_list = [ ] 
	for f_path in fault_phen_pathes:
		print('Reading', f_path)
		fault_df = pd.read_excel(f_path)

		#First normalize other possible model-name variants to a single model_name column
		fault_df = fault_df.rename({'Correct standard_model':'model_name',
											'机型':'model_name',
											'机型名称':'model_name',
											'model':'model_name',
											'修正的机型名称':'model_name'
											},axis=1)
		for u in usecols:
			if u not in fault_df.columns :
				enter_exit("**Missing required column: {}; cannot write**".format(u))
			if fault_df[u].dtype == object:  # string columns have object dtype; == str never matches
				fault_df[u] = fault_df[u].str.strip()
				
		print('\n  Removing non-report rows based on the "是否包含在口碑报告中" column\n  (ideally, manually delete rows not included in the report beforehand)')

		if '是否包含在口碑报告中' in fault_df.columns:
			fault_df = fault_df.loc[fault_df['是否包含在口碑报告中'].str.strip().apply(lambda x: x[0] == '是'), :]

		fault_df = fault_df.loc[:,usecols]
		fault_df = fault_df.rename(rename_dict,axis=1)

		#Check columns for nulls + strip whitespace from string values
		for column in not_null_cols: 
			check_non_df = fault_df.loc[fault_df[column].isna(),:]
			fault_df[column] = fault_df[column].apply(lambda x : x.strip() if isinstance(x, str) else x )

			if not check_non_df.empty :
				enter_exit("{}字段有空值".format(column))

		start_position = datetime.datetime.strftime(fault_df['comment_time'].min(),'%Y-%m-%d')
		end_position = datetime.datetime.strftime(fault_df['comment_time'].max(),'%Y-%m-%d')

		#Confirm before writing: Enter confirms; close the window to abort
		input('\n   Read data from {} to {}, {} rows in total (included in the word-of-mouth report)\n\
			  Press Enter to confirm writing to the database (otherwise close this window)'.format(start_position,end_position,len(fault_df)))
		print('Writing...')

		t1 = time.perf_counter()

		fault_df['enddate'] =  fault_df['comment_time'].apply(lambda x: last_day_of_month(x))
		#Strip the brand from the model name
		fault_df['model'] = fault_df['model']\
					.apply(lambda x :re.sub(pattern='(samsung|OPPO|vivo|xiaomi) ',repl='',string=x,flags=re.I))

		#Guard against leading/trailing spaces around the model name
		fault_df['model'] = fault_df['model'].apply(lambda x: x.strip())

		fault_df = merge_report_fault_type(fault_df)
		fault_df['comment_year'] = fault_df['comment_time'].dt.year 
		fault_df['comment_month'] = fault_df['comment_time'].dt.month 

		#Write to the database first
		write2table(engine_text,fault_df,'ecommerce_data',how='mysql_load')

		print('Written to MySQL:', f_path)
		t2 = time.perf_counter()
		print('Took', round(t2-t1,0), 'seconds\n')
Example #25
    writer_ws.write(0, 0, headers[0], bold)
    writer_ws.write(0, 1, headers[1], bold)

    for i, text in enumerate(column_series):
        target_text, drop_text = split_keywords(keyword_list, text)

        writer_ws.write(i + 1, 0, target_text)
        writer_ws.write(i + 1, 1, drop_text)

    writer_ws.set_column(0, 0, 40)
    writer_ws.set_column(1, 1, 40)  # set_column takes (first_col, last_col, width)

    save_xlsxwriter_wb(writer_wb, save_path)

    xlrd_wb.release_resources()


require_tables = get_require_files('.\\', ['split_keyword'])

keyword_path = require_tables['split_keyword']

keyword_list = get_keyword_list(keyword_path, 'Split_keyword')

content_path = choose_file(path=r'.\\')

new_path = re.match(r'(.*)\.(xlsx|xls)$', content_path).group(1) + '_Split.xlsx'

split_write(new_path, content_path, keyword_list)

enter_exit('')
Example #26
def write2table(engine_text, df, table_name, how='normal'):
    # engine_text format : "mysql://*****:*****@localhost:3306/web_data?charset=utf8"
    """
    4种方式写入: 1.normal: 直接写入,可选参数是否清空原有表;
                2.complete_rewrite: 删除原有的所有数据,并整个写入新数据
                3.mysql_load:直接通过mysql_load 方式写入 (这个方式经测试已经非常快,不用再重建索引也很快)
               (适用于特别大的数据集, 都需要确保MYSQL已经有完整的表结构)
    """
    db = create_engine(engine_text, poolclass=NullPool)
    conn = db.connect()

    if how == 'normal':
        df.to_sql(table_name,
                  con=conn,
                  if_exists='append',
                  index=False,
                  chunksize=100000)

    elif how == 'complete_rewrite':
        try:
            truncate_statement = "truncate {};".format(table_name)
            conn.execute(truncate_statement)
        except Exception:
            print('truncate table "{}" failed!'.format(table_name))

        df.to_sql(table_name,
                  con=conn,
                  if_exists='append',
                  index=False,
                  chunksize=100000)

    # The mysql_load path
    elif how == 'mysql_load':
        # If the table has an auto-increment primary key, drop it from the header columns
        auto_increment_key = ''

        header_column_sql = text(" describe {} ;".format(table_name))
        result = conn.execute(header_column_sql).fetchall()
        if result:
            header_columns = [x[0] for x in list(result)]
        else:
            enter_exit('Table not found:', table_name)
        # Build a LOAD-able CSV from the MySQL table structure; handle the auto-increment key, if any
        auto_increment_key_sql = """ SELECT COLUMN_NAME
                                     FROM INFORMATION_SCHEMA.COLUMNS  
                                     WHERE TABLE_SCHEMA = DATABASE()  AND TABLE_NAME = '{}'  AND DATA_TYPE = 'int'
                                     AND COLUMN_DEFAULT IS NULL AND IS_NULLABLE = 'NO' AND EXTRA like '%auto_increment%';
                                 """.format(table_name)

        auto_increment_key_sql = text(auto_increment_key_sql)
        result_auto = conn.execute(auto_increment_key_sql).fetchone()

        if result_auto is not None:
            auto_increment_key = list(result_auto)[0]

        header_columns = [x for x in header_columns if x != auto_increment_key]

        for h in header_columns:
            if h not in df.columns:
                df[h] = None

        df = df.loc[:, header_columns]
        # Save the data to a temporary location
        temp_path = get_most_upper_level_path('df_temp_file.csv')

        df.to_csv(temp_path,
                  encoding='utf8',
                  sep=',',
                  quotechar='"',
                  escapechar='\\',
                  index=False,
                  header=False)

        if auto_increment_key != '':
            load_infile_sql = r"""  LOAD DATA INFILE '{0}'
                                    INTO TABLE {1} 
                                    CHARACTER SET 'utf8mb4'
                                    FIELDS TERMINATED BY ',' ENCLOSED BY '"'
                                    ESCAPED BY '\\'
                                    LINES TERMINATED BY '\r\n'
                                    ({2})
                                    SET {3} = NULL; -- setting the auto-increment key to NULL lets MySQL populate it
                                    """.format(temp_path, table_name,
                                               ','.join(header_columns),
                                               auto_increment_key)
        else:
            load_infile_sql = r"""  LOAD DATA INFILE '{0}'
                        INTO TABLE {1} 
                        CHARACTER SET 'utf8mb4'
                        FIELDS TERMINATED BY ',' ENCLOSED BY '"'
                        ESCAPED BY '\\'
                        LINES TERMINATED BY '\r\n';
                        """.format(temp_path, table_name)

        load_infile_sql = text(load_infile_sql).execution_options(
            autocommit=True)
        try:
            conn.execute(load_infile_sql)
        except Exception as e:
            logging.error(traceback.format_exc())
        finally:
            conn.close()
            db.dispose()
        # Delete the temp CSV file
        os.remove(temp_path)
    else:
        print(how, 'unknown write method; no data was written')

    conn.close()
    db.dispose()
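A hedged sketch of the mysql_load path; the DSN and table are hypothetical, the target table must already exist, and the server must allow LOAD DATA INFILE from the temp path (secure_file_priv must permit it):

import pandas as pd

engine_text = 'mysql://user:password@localhost:3306/web_data?charset=utf8'
df = pd.DataFrame({'model': ['A1'], 'fault_phen': ['screen broken']})
write2table(engine_text, df, 'ecommerce_data', how='mysql_load')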