def copy_result(deviceid, exetime):
    # Copy <deviceid> of DaVinci (Crosswalk xml result) from <test_suite>/tests
    # to <test_result_dir> defined in config.json
    if not common.find_dir(testresultdir):
        common.mk_dir(testresultdir)
    try:
        common.copy_tree(os.path.join(TESTPATH, deviceid),
                         os.path.join(testresultdir, deviceid))
        l('Copied ' + deviceid + ' to: ' + testresultdir)
    except Exception as ex:
        # Fall back to a flat file copy if the recursive tree copy fails
        common.copy_files(os.path.join(TESTPATH, deviceid),
                          os.path.join(testresultdir, deviceid))
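# A minimal usage sketch, assuming TESTPATH and testresultdir are configured
# module globals; the device id and timestamp below are hypothetical.
def _demo_copy_result():
    copy_result('emulator-5554', '20150101120000')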
def update_serial_number():
    common.mk_dir('CACHE')
    global _serial_number
    n = _serial_number
    SERIAL_NUMBER_FILE = os.path.join('CACHE', 'serial.number')
    if os.path.exists(SERIAL_NUMBER_FILE):
        s = open(SERIAL_NUMBER_FILE, 'rt').read()
        print '%s contains "%s"' % (SERIAL_NUMBER_FILE, s)
        n = int(s)
        print 'int=%d' % n
    _serial_number = n + 1
    open(SERIAL_NUMBER_FILE, 'wt').write(str(_serial_number))
    print '_serial_number = %d' % _serial_number
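# A short self-check of the persistence round trip (a sketch; it assumes
# CACHE/serial.number does not exist yet, so the counter starts at 1).
def _demo_serial_numbers():
    update_serial_number()  # writes '1' to CACHE/serial.number
    first = int(open(os.path.join('CACHE', 'serial.number'), 'rt').read())
    update_serial_number()  # reads '1' back, writes '2'
    second = int(open(os.path.join('CACHE', 'serial.number'), 'rt').read())
    assert second == first + 1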
def csv_reader(version, deviceid, arch, filepath, exetime):
    if common.find_file(filepath):
        p = os.path.join(TESTPATH, deviceid)
        common.mk_dir(p)
        p = os.path.join(TESTPATH, deviceid, exetime)
        common.mk_dir(p)
        q = os.path.join(p, common.parse_c_json(JSONPATH, 'test_result_xml_name'))
        generate_xml_report(version, deviceid, arch, q)
        dreader = csv.DictReader(open(filepath))
        #print len(list(dreader))
        for c in dreader:
            # Strip the spreadsheet HYPERLINK() wrapper from the report link
            reportlink = str(c['Link']).replace('=HYPERLINK("', '').replace('")', '').replace('\\\\', '\\')
            #testtime = c['TestTime']
            testtime = ''
            l(c['Install'] + ' ' + c['Launch'] + ' ' + c['Random'] + ' ' +
              c['Back'] + ' ' + c['Uninstall'] + ' ' + c['Logcat'] + ' ' +
              c['Result'] + ' ' + c['Reason'].strip())
            devicemode = ''
            try:
                devicemode = c['Device Model']
            except Exception as ex:
                lr(str(ex))
            application = ''
            try:
                application = c['Application'].decode('utf-8').replace('.apk', '')
            except Exception as ex:
                # The first column header may carry a UTF-8 BOM
                application = c['\xef\xbb\xbfApplication'].decode('utf-8').replace('.apk', '')
            applicationname = ''
            try:
                applicationname = c['App name'].decode('utf-8')
            except Exception as ex:
                lr(str(ex))
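# A hedged invocation sketch. The column names listed are the ones the
# parser above actually reads; the version, device id, arch, path and
# timestamp are hypothetical placeholders.
EXPECTED_COLUMNS = ['Application', 'App name', 'Device Model', 'Install',
                    'Launch', 'Random', 'Back', 'Uninstall', 'Logcat',
                    'Result', 'Reason', 'Link']

def _demo_csv_reader():
    # common.find_file() guards the call, so a missing file is a no-op.
    csv_reader('1.0.0', 'emulator-5554', 'x86',
               os.path.join(TESTPATH, 'Smoke_Test_Report.csv'), '20150101120000')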
def main(p_id):
    """Main loop for real-time labelling."""
    logger.info('Program started.')
    # Get the start date
    start_date = cm.string2date(configs['date']['start_date'])
    # Initialise the global cache
    global_cache = cache.load_global_cache(p_id)
    # Initialise the text processor
    pro = Processor(configs['type']['news_type_set'])
    while True:
        try:
            # step0: at the start of a new day, clear some cached data
            # Fetch the record of already-processed files
            processed_files = global_cache.get('processed_files', set())
            # Fetch the monitor date
            monitor_date = global_cache.get('monitor_date', start_date)
            # step1: load the crawler file
            spider_df, spider_file = cp.load_spider_df(
                configs['path']['monitor'], monitor_date, processed_files)
            # step2: load the rule base
            # Reload the latest rules before processing each file
            pro.load_total_rules(global_cache)
            # step3: label every row of the file
            logger.info('Start processing file {}.'.format(spider_file.full_path))
            # Code to add when testing directly against raw crawler files:
            # spider_df['date'] = spider_df.apply(lambda row: str(dt.datetime.fromtimestamp(int(row['date']))), axis=1)
            # spider_df['ffdCreate'] = spider_df.apply(lambda row: str(dt.datetime.fromtimestamp(int(row['ffdCreate']))), axis=1)
            # spider_df.loc[i, 'keyword'], spider_df.loc[i, 'businessType'] = pro.mark(spider_df.iloc[i])  # needs revision
            # Only process when both the data and the rules are non-empty
            # If the data is non-empty
            if len(spider_df) > 0:
                # If the rules are non-empty
                if len(pro.title_lfreq) > 0 or len(pro.content_lfreq) > 0:
                    spider_df[['keyword', 'businessType']] = \
                        spider_df.apply(lambda row: pro.mark(row['title'], row['content'],
                                                             row['recType'], row['ffdCreate']), axis=1)
                else:
                    spider_df['keyword'] = ''
                    spider_df['businessType'] = ''
                # step4: save the result file
                out_temp_file_path = configs['path']['temp_out']
                out_file_path = os.path.join(configs['path']['out'],
                                             spider_file.parent_folder.name)
                # Create the directory for the temporary output file
                cm.mk_dir(out_temp_file_path)
                spider_df.to_csv(os.path.join(out_temp_file_path, spider_file.name),
                                 sep='|', header=True, index=False, encoding='utf-8',
                                 quoting=csv.QUOTE_NONE, escapechar='\\')
                # Move the temporary output file into the final output folder
                cm.mk_dir(out_file_path)  # create the directory for the final file
                shutil.move(os.path.join(out_temp_file_path, spider_file.name),
                            os.path.join(out_file_path, spider_file.name))
            # For single-file efficiency testing:
            # break
            # Add the file just processed to the cache record
            processed_files.add(spider_file)
            # Once fairly recent files are being processed, drop stale cache entries
            expected_monitor_date = spider_file.parent_folder.date - dt.timedelta(
                days=configs['date']['monitor_days'])
            if expected_monitor_date > monitor_date:
                processed_files = {
                    file for file in processed_files
                    if file.parent_folder.date >= expected_monitor_date
                }
                global_cache.monitor_date = expected_monitor_date
                # Clean up empty folders older than the monitor date
                cp.clean_empty_folder(configs['path']['monitor'], expected_monitor_date)
            global_cache.processed_files = processed_files
            # step5: remove the fully processed file
            # For efficiency testing:
            # sys.exit(0)
            os.remove(spider_file.full_path)
            logger.info('Processed {} rows, generated result file {}, and deleted the temporary file.'.format(
                len(spider_df), os.path.join(out_file_path, spider_file.name)))
            # step6: release resources (memory monitoring)
        except Exception as e:
            logger.error('Error in streaming processing of public-opinion data.')
            logger.error(traceback.format_exc())
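# A minimal entry-point sketch, assuming the process id is passed on the
# command line; main() loops forever, so it is normally run as a daemon.
# The default p_id of '0' is a hypothetical placeholder.
if __name__ == '__main__':
    import sys
    main(sys.argv[1] if len(sys.argv) > 1 else '0')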
"""
Reduce Heritage .csv files
    Claims.csv 2,668,990 rows
"""
from __future__ import division
import math
import os
import re
import time
import csv
import pickle
import common

DATA_DIRECTORY = 'data'
common.mk_dir(DATA_DIRECTORY)
DERIVED_PREFIX = os.path.join(DATA_DIRECTORY, 'derived_')

_serial_number = 0

def update_serial_number():
    common.mk_dir('CACHE')
    global _serial_number
    n = _serial_number
    SERIAL_NUMBER_FILE = os.path.join('CACHE', 'serial.number')
    if os.path.exists(SERIAL_NUMBER_FILE):
        s = open(SERIAL_NUMBER_FILE, 'rt').read()
        print '%s contains "%s"' % (SERIAL_NUMBER_FILE, s)
        n = int(s)
        print 'int=%d' % n
    _serial_number = n + 1
    open(SERIAL_NUMBER_FILE, 'wt').write(str(_serial_number))
    print '_serial_number = %d' % _serial_number
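# A sketch of how the derived-file prefix above is meant to be used,
# assuming reduced outputs mirror the raw names, e.g.
# Claims.csv -> data/derived_Claims.csv. The helper name is hypothetical.
def derived_path(raw_name):
    # derived_path('Claims.csv') == os.path.join('data', 'derived_Claims.csv')
    return DERIVED_PREFIX + raw_name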