def test_demo_log_files(psr):
    psr()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    txt_data = cst.read_data(cst.TXT_JSON_PATH)
    # Apart from the update-time fields, the .log and .txt copies of the
    # demo log should parse to identical results.
    for k in cst.PARSE_KEYS:
        if k not in ['last_update_time', 'last_update_timestamp']:
            assert log_data[k] == txt_data[k]
    # 2019-01-01T00_00_01.log
    # 2019-01-01T00_00_02.txt
    for case, data in zip(['log', 'txt'], [log_data, txt_data]):
        cst.check_demo_data(data)
        if case == 'log':
            job = cst.JOB
            ext = 'log'
        else:
            job = cst.JOB_TXT
            ext = 'txt'
        assert data['log_path'].endswith('%s.%s' % (job, ext))
        assert data['json_path'].endswith('%s.json' % job)
        assert data['json_url'].endswith('%s.json' % job)
        assert data['json_url'].startswith('http://%s' % cst.SCRAPYD_SERVER)
        assert data['size'] == cst.SIZE
        assert data['position'] == cst.SIZE
        assert data['status'] == cst.STATUS
        assert data['_head'] == cst.LOG_HEAD_LINES
        assert data['logparser_version'] == cst.LOGPARSER_VERSION

def test_chunk_size(psr):
    parser = psr(execute_main=False)
    os.remove(cst.TXT_PATH)
    assert not os.path.exists(cst.TXT_PATH)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert data['first_log_time'] == '2018-10-23 18:28:34'
    assert data['latest_log_time'] == '2018-10-23 18:29:42'
    cst.check_demo_data(data)
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == cst.SIZE

    parser = psr(execute_main=False, chunk_size=10000)  # 15,862 = 9,924 + 5,938; 15,683 = 9,938 + 5,745
    os.remove(cst.TXT_PATH)
    assert not os.path.exists(cst.TXT_PATH)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    cst.json_dumps(data)
    assert data['first_log_time'] == '2018-10-23 18:28:34'
    assert data['latest_log_time'] == '2018-10-23 18:29:42'
    cst.check_demo_data(data)
    # The appended log size depends on the platform's line separator
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == (5938 if len(os.linesep) == 2 else 5745)

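# Note on the sizes asserted above (an inference from the arithmetic in the
# comment, not documented behaviour): with chunk_size=10000 the demo log is
# presumably consumed in line-aligned chunks, leaving only the tail beyond the
# last full chunk in APPENDED_LOG_PATH; hence the 15,862-byte CRLF file splits
# into 9,924 + 5,938 bytes and the 15,683-byte LF file into 9,938 + 5,745 bytes.
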
def test_new_size_read_data(psr):
    appended_log = u'test'
    appended_log_length = len(appended_log)
    parser = psr()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['logparser_version'] == cst.LOGPARSER_VERSION
    cst.check_demo_data(log_data)
    last_update_timestamp = log_data['last_update_timestamp']

    # Valid but short appended log
    cst.write_text(cst.LOG_PATH, appended_log, append=True)
    time.sleep(2)
    parser.main()
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == 0
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['last_update_timestamp'] > last_update_timestamp
    assert log_data['size'] == cst.SIZE + appended_log_length
    assert log_data['position'] == cst.SIZE
    cst.check_demo_data(log_data)  # Previous parsed result is not affected by short appended log

    # Mismatching version
    log_data['logparser_version'] = '0.0.0'
    cst.write_text(cst.LOG_JSON_PATH, cst.json_dumps(log_data))
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['logparser_version'] == '0.0.0'
    cst.write_text(cst.LOG_PATH, appended_log, append=True)
    now_size = cst.SIZE + appended_log_length * 2
    parser.main()
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == now_size
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['logparser_version'] == cst.LOGPARSER_VERSION
    assert log_data['size'] == now_size
    assert log_data['position'] == now_size
    cst.check_demo_data(log_data)

    # Broken json file
    cst.write_text(cst.LOG_JSON_PATH, appended_log, append=True)
    cst.write_text(cst.LOG_PATH, appended_log, append=True)
    now_size = cst.SIZE + appended_log_length * 3
    parser.main()
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == now_size
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['size'] == now_size
    assert log_data['position'] == now_size
    cst.check_demo_data(log_data)

def test_demo_log():
    modified_logstats = FRONT.replace(
        "Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min)",
        "Crawled 1 pages (at 2 pages/min), scraped 3 items (at 4 items/min)")
    for case, text in zip(['without_stats_dumped', 'whole_log', 'modified_logstats'],
                          [FRONT, FRONT + END, modified_logstats + END]):
        data = parse(text, headlines=50, taillines=100)  # 180 lines in total
        # cst.json_dumps(data)
        if case == 'without_stats_dumped':
            cst.check_demo_data(data, without_stats_dumped=True)
        elif case == 'modified_logstats':  # to test update_data_with_crawler_stats()
            cst.check_demo_data(data, without_stats_dumped=False, modified_logstats=True)
        else:
            cst.check_demo_data(data, without_stats_dumped=False)

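# For reference, a minimal standalone sketch of the parse() call exercised
# above, outside the test fixtures ('demo.log' is just a placeholder path;
# the key names follow the assertions in these tests):
#
#     from logparser import parse
#     data = parse(open('demo.log').read(), headlines=50, taillines=100)
#     print(data['first_log_time'], data['latest_log_time'])
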
def test_new_file_read_data(psr):
    psr()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    last_update_timestamp = log_data['last_update_timestamp']

    # Skip parsing since data with same size found
    # Old file with old size
    parser = psr(execute_main=False, reset_logs=False)
    for i in range(2):
        time.sleep(2)
        parser.main()
        log_data = cst.read_data(cst.LOG_JSON_PATH)
        assert log_data['last_update_timestamp'] == last_update_timestamp
        cst.check_demo_data(log_data)

    # Old logfile with smaller size
    cst.write_text(cst.LOG_PATH, FRONT + END.replace('memory', ''))
    parser.main()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['last_update_timestamp'] == last_update_timestamp
    cst.check_demo_data(log_data)
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert cst.PROJECT not in stats['datas']  # -> parse in next round

    parser.main()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['last_update_timestamp'] > last_update_timestamp
    cst.check_demo_data(log_data)
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert cst.PROJECT in stats['datas']

    # Read data fail
    time.sleep(2)
    cst.write_text(cst.LOG_JSON_PATH, u'')
    psr(reset_logs=False)
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['last_update_timestamp'] > last_update_timestamp
    cst.check_demo_data(log_data)

def test_log_categories_limit(psr):
    log_categories_limit = 3
    psr(log_categories_limit=log_categories_limit)
    data = cst.read_data(cst.LOG_JSON_PATH)
    cst.check_demo_data(data, log_categories_limit=log_categories_limit)