def crawling(self, category_name):
    # Multi Process PID
    print(category_name + " PID: " + str(os.getpid()))

    writer = Writer(category_name=category_name, date=self.date)
    wcsv = writer.get_writer_csv()
    wcsv.writerow([
        "date", "time", "category", "company", "author", "headline",
        "sentence", "content_url", "image_url"
    ])

    # Article URL format
    url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
        self.categories.get(category_name)) + "&date="

    # Generate the list-page URLs for the requested year/month/day.
    day_urls = self.make_news_page_url(url, self.date['year'],
                                       self.date['month'], self.date['day'])
    print(category_name + " Urls are generated")
    print("The crawler starts")

    with concurrent.futures.ThreadPoolExecutor() as pool:
        futureWorkers = [
            pool.submit(self.get_page_and_write_row, category_name, writer, URL)
            for URL in day_urls
        ]
        for future in concurrent.futures.as_completed(futureWorkers):
            print(future.result())

    writer.close()

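# The crawler above (and the sequential variant further below) assumes a thin
# CSV-backed Writer. A minimal sketch of such a class, as a hypothetical
# reconstruction only -- the project's real Writer and its file-naming scheme
# may differ:
import csv

class CsvWriter:
    def __init__(self, category_name, date):
        # invented naming scheme; `date` keys match the crawling() snippet
        path = f"{category_name}_{date['year']}{date['month']}{date['day']}.csv"
        self._file = open(path, 'w', encoding='utf-8', newline='')
        self._csv = csv.writer(self._file)

    def get_writer_csv(self):
        # expose the underlying csv.writer, as crawling() expects
        return self._csv

    def close(self):
        self._file.close()
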
def total_content(Syear, Smonth, Sday, Fyear, Fmonth, Fday, unique_number):
    length = 0
    first = ""
    start_lst = list()
    finish_lst = list()

    # Find the row range of each site block in the thresholded statistics file.
    with open(f"{PATH}/{Syear}{Smonth}{Sday}_{Fyear}{Fmonth}{Fday}_TStatistic.csv",
              newline='') as csvfile:
        reader = csv.reader(csvfile)
        for i, line in enumerate(reader):
            length += 1
            if first == "":
                first = line[0]
                start_lst.append(i)
            elif first != line[0]:
                first = line[0]
                start_lst.append(i)
                finish_lst.append(i)
    finish_lst.append(length)

    # Split each site block into 10 shards; this process takes shard
    # `unique_number`. Shards with index < (block size % 10) get one extra row.
    final_lst = list()
    for i in range(len(start_lst)):
        dif = finish_lst[i] - start_lst[i]
        mine = dif // 10
        left = dif % 10
        start = start_lst[i] + mine * unique_number
        if left > unique_number:
            start += unique_number
            mine += 1
        else:
            start += left
        if mine != 0:
            final_lst.append((start, start + mine))

    setter = Setter(Syear, Smonth, Sday, Fyear, Fmonth, Fday, unique_number)
    with Pool(processes=50) as pool:
        a = pool.map(setter.multi_wrapper, final_lst)

    site_lst = list()
    for i in a:
        try:
            site = i[0][1].replace('/', '')
            site_lst.append(site)
            success_writer = Writer(
                f'{Syear}{Smonth}{Sday}_{Fyear}{Fmonth}{Fday}_{site}url{unique_number}')
            for j in i:
                success_writer.write_line(j)
            success_writer.close()
        except Exception:
            print(f"Fail {i}")

    # Marker file signalling that this shard has finished.
    writer = Writer(
        f'{Syear}{Smonth}{Sday}_{Fyear}{Fmonth}{Fday}_check{unique_number}')
    writer.close()
    return site_lst

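# The block-splitting arithmetic above (and its duplicate in total_statistic
# below) can be read as a standalone helper. A minimal sketch, using the
# hypothetical name shard_range and the original's hard-coded 10 shards as
# the default:
def shard_range(start, finish, shard, shards=10):
    """Return the [begin, end) row slice that `shard` owns within the block
    [start, finish); the first (finish - start) % shards shards each
    receive one extra row."""
    size, extra = divmod(finish - start, shards)
    begin = start + size * shard + min(extra, shard)
    if shard < extra:
        size += 1
    return begin, begin + size

# e.g. a 23-row block split 10 ways:
# shard_range(0, 23, 0) == (0, 3), shard_range(0, 23, 5) == (13, 15)
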
def split(Syear, Smonth, Sday, Fyear, Fmonth, Fday, threshold, setting):
    total_lst = list()
    target_lst = dict()
    startTime = datetime.now()

    if setting:
        # Merge the ten per-shard setting files into a single setting.json.
        total_dict = dict()
        for i in range(0, 10):
            with open(f"{PATH}/setting/setting{i}.json", "r",
                      encoding='utf-8') as f:
                tmp_dict = json.load(f)
            if i == 0:
                total_dict = tmp_dict
            else:
                for j in tmp_dict['application'].keys():
                    for k in tmp_dict['application'][j].keys():
                        total_dict['application'][j][k] = \
                            tmp_dict['application'][j][k]
            os.remove(f"{PATH}/setting/setting{i}.json")
        with open(f"{PATH}/setting/setting.json", 'w', encoding='utf-8') as f:
            json.dump(total_dict, f, ensure_ascii=False, indent=2)
        del total_dict

    # Merge the ten per-shard statistics files. Rows at or above the
    # threshold (and not in except_lst) are additionally grouped by site.
    for i in range(0, 10):
        with open(f"{PATH}/{Syear}{Smonth}{Sday}_{Fyear}{Fmonth}{Fday}_Statistic{i}.csv",
                  newline='') as csvfile:
            reader = csv.reader(csvfile)
            for line in reader:
                if int(line[2]) >= int(threshold) and line[1] not in except_lst:
                    site = line[0]
                    target_lst.setdefault(site, []).append(line)
                total_lst.append(line)
        os.remove(
            f"{PATH}/{Syear}{Smonth}{Sday}_{Fyear}{Fmonth}{Fday}_Statistic{i}.csv")

    writer = Writer(f'{Syear}{Smonth}{Sday}_{Fyear}{Fmonth}{Fday}_Statistic')
    for i in total_lst:
        writer.write_line(i)
    writer.close()

    writer = Writer(f'{Syear}{Smonth}{Sday}_{Fyear}{Fmonth}{Fday}_TStatistic')
    for i in target_lst:
        for j in target_lst[i]:
            writer.write_line(j)
    writer.close()
    endTime = datetime.now()

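# The setting merge above copies second-level keys one at a time and assumes
# every top-level 'application' key already exists in shard 0's file. A toy
# illustration of the same merge (with a setdefault guard the original does
# not have, and invented key names):
base = {'application': {'siteA': {'k1': 1}}}
extra = {'application': {'siteA': {'k2': 2}, 'siteB': {'k3': 3}}}
for site, keys in extra['application'].items():
    base['application'].setdefault(site, {}).update(keys)
# base == {'application': {'siteA': {'k1': 1, 'k2': 2}, 'siteB': {'k3': 3}}}
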
def train(nbSticks, nbGames, plotValueFunction, players, debugMode=False):
    writer = Writer() if debugMode else None
    game = Game(nbSticks, players, {
        ConfigKey.TRAIN: True,
        ConfigKey.DEBUG: debugMode
    })

    # stats for trained players
    valueFunctionStats = {}
    for p in (p for p in players if type(p) is TrainedPlayer):
        valueFunctionStats[p] = {i: [] for i in range(1, nbSticks + 1)}

    print("\nTraining...")
    for i in range(0, nbGames):
        if writer:
            writer.beginGame(i, game.players[0], game.players[1])
        game.start()  # play a game
        if writer:
            writer.endGame(game.players[0], game.players[1])

        # update stats
        for p in valueFunctionStats:
            for s, v in p.v.items():
                valueFunctionStats[p][s].append(v)

        # decrease exploration over time
        if i % 10 == 0:
            for p in players:
                p.epsilon = max(p.epsilon * 0.996, 0.05)

        game.reset()

    if writer:
        writer.close()

    for p in (p for p in players if type(p) is TrainedPlayer):
        print(f'\n=== Value function of {p}: ===')
        PrettyPrinter().pprint(p.v)

    if plotValueFunction:
        plot(valueFunctionStats)

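# A quick sanity check on the exploration schedule above: epsilon is
# multiplied by 0.996 once every 10 games, floored at 0.05. Assuming an
# initial epsilon of 1.0 (the snippet does not say what players start with),
# the floor is reached after roughly 10 * ln(0.05) / ln(0.996) games:
import math
games_to_floor = 10 * math.log(0.05) / math.log(0.996)  # ~7474 games
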
def total_statistic(Syear, Smonth, Sday, Fyear, Fmonth, Fday, unique_number,
                    setting):
    length = 0
    first = ""
    start_lst = list()
    finish_lst = list()

    # Find the row range of each site block in the input file.
    with open(PATH, newline='') as csvfile:
        reader = csv.reader(csvfile)
        for i, line in enumerate(reader):
            length += 1
            if first == "":
                first = line[0]
                start_lst.append(i)
            elif first != line[0]:
                first = line[0]
                start_lst.append(i)
                finish_lst.append(i)
    finish_lst.append(length)

    # Same 10-way shard split as in total_content: this process takes shard
    # `unique_number`; shards with index < (block size % 10) get one extra row.
    final_lst = list()
    for i in range(len(start_lst)):
        dif = finish_lst[i] - start_lst[i]
        mine = dif // 10
        left = dif % 10
        start = start_lst[i] + mine * unique_number
        if left > unique_number:
            start += unique_number
            mine += 1
        else:
            start += left
        if mine != 0:
            final_lst.append((start, start + mine))

    setter = Setter(Syear, Smonth, Sday, Fyear, Fmonth, Fday, setting)
    with Pool(processes=50) as pool:
        a = pool.map(setter.multi_wrapper, final_lst)

    success_writer = Writer(
        f'{Syear}{Smonth}{Sday}_{Fyear}{Fmonth}{Fday}_Statistic{unique_number}')
    merged_dict = dict()
    log_dict = dict()
    for i in a:
        # each worker result: (rows, settings dict, total count, fail count, log key)
        for j in i[0]:
            success_writer.write_line(j)
        if setting:
            merged_dict = {**merged_dict, **i[1]}
        if str(i[4]) in log_dict:
            log_dict[str(i[4])]['total'] += i[2]
            log_dict[str(i[4])]['fail'] += i[3]
        else:
            log_dict[str(i[4])] = {'total': i[2], 'fail': i[3]}
    success_writer.close()

    if setting:
        json_object = dict()
        json_object['last-modified'] = str(datetime.now().date())
        json_object['application'] = merged_dict
        with open(f"{PATH2}/setting{unique_number}.json", 'w',
                  encoding='utf-8') as f:
            json.dump(json_object, f, ensure_ascii=False, indent=2)
    return log_dict

def run_train(train_params=None, test_params=None):
    opt = TrainOptions().parse(train_params)
    testopt = TestOptions().parse(test_params)
    testopt.timestamp = opt.timestamp
    testopt.batch_size = 30

    # model init
    model = SketchModel(opt)
    model.print_detail()
    writer = Writer(opt)

    # data load
    trainDataloader = load_data(opt, datasetType='train',
                                permutation=opt.permutation,
                                shuffle=opt.shuffle)
    validDataloader = load_data(opt, datasetType='valid')
    testDataloader = load_data(opt, datasetType='test')

    # train epochs
    # with torchsnooper.snoop():
    ii = 0
    min_test_avgloss = 100  # sentinel; any real average loss is smaller
    min_test_avgloss_epoch = 0
    for epoch in range(opt.epoch):
        for i, data in enumerate(trainDataloader):
            model.step(data)
            if ii % opt.plot_freq == 0:
                writer.plot_train_loss(model.loss, ii)
            if ii % opt.print_freq == 0:
                writer.print_train_loss(epoch, i, model.loss)
            ii += 1

        model.update_learning_rate()
        if opt.plot_weights:
            writer.plot_model_wts(model, epoch)

        # periodic validation
        if epoch % opt.run_test_freq == 0:
            model.save_network('latest')
            loss_avg, P_metric, C_metric = run_eval(opt=testopt,
                                                    loader=validDataloader,
                                                    dataset='valid',
                                                    write_result=False)
            writer.print_test_loss(epoch, 0, loss_avg)
            writer.plot_test_loss(loss_avg, epoch)
            writer.print_eval_metric(epoch, P_metric, C_metric)
            writer.plot_eval_metric(epoch, P_metric, C_metric)
            if loss_avg < min_test_avgloss:
                min_test_avgloss = loss_avg
                min_test_avgloss_epoch = epoch
                print('saving the model at the end of epoch {} '
                      'with best avgLoss {}'.format(epoch, min_test_avgloss))
                model.save_network('bestloss')

    # final evaluation with the latest and best-loss checkpoints
    testopt.which_epoch = 'latest'
    testopt.metric_way = 'wlen'
    loss_avg, P_metric, C_metric = run_eval(opt=testopt,
                                            loader=testDataloader,
                                            dataset='test',
                                            write_result=False)
    testopt.which_epoch = 'bestloss'
    testopt.metric_way = 'wlen'
    loss_avg_2, P_metric_2, C_metric_2 = run_eval(opt=testopt,
                                                  loader=testDataloader,
                                                  dataset='test',
                                                  write_result=False)

    record_list = {
        'p_metric': round(P_metric * 100, 2),
        'c_metric': round(C_metric * 100, 2),
        'loss_avg': round(loss_avg, 4),
        'best_epoch': min_test_avgloss_epoch,
        'p_metric_2': round(P_metric_2 * 100, 2),
        'c_metric_2': round(C_metric_2 * 100, 2),
        'loss_avg_2': round(loss_avg_2, 4),
    }
    writer.train_record(record_list=record_list)
    writer.close()
    return record_list, opt.timestamp

        # tail of the opcode table generator: emit the template arguments
        # for the current opcode, e.g. "<CC, op1, op2>"
        count = 0
        if opcode.cc is not None:
            line += "<" + opcode.cc.value
            count += 1
        for operand in opcode.operands:
            line += (", " if count > 0 else "<") + operand.template_value()
            count += 1
        if count > 0:
            line += ">"
        line += "},"
        writer.write(line)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: %s <8086|x86> output_filename" % sys.argv[0])
        sys.exit(1)

    writer = Writer(sys.argv[2])
    if sys.argv[1] == "8086":
        MODULE = opcodes_8086
        DISPATCH_FUNCTION_NAME = "CPU_8086::Instructions::DispatchInstruction"
        gen_dispatch(writer)
    elif sys.argv[1] == "x86":
        MODULE = opcodes_x86
        DISPATCH_FUNCTION_NAME = "CPU_X86::Interpreter::Dispatch"
        gen_dispatch(writer)
        gen_handler_table(writer, "CPU_X86::Interpreter", "s_handler_functions")
    writer.close()

def crawling(self, category_name):
    # Multi Process PID
    print(category_name + " PID: " + str(os.getpid()))

    writer = Writer(category_name=category_name, date=self.date)

    # Article URL format
    url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
        self.categories.get(category_name)) + "&date="

    # Collect articles from start_year/start_month through end_year/end_month.
    day_urls = self.make_news_page_url(url, self.date['start_year'],
                                       self.date['end_year'],
                                       self.date['start_month'],
                                       self.date['end_month'])
    print(category_name + " Urls are generated")
    print("The crawler starts")

    for URL in day_urls:
        regex = re.compile(r"date=(\d+)")
        news_date = regex.findall(URL)[0]
        request = self.get_url_data(URL)
        document = BeautifulSoup(request.content, 'html.parser')

        # html - newsflash_body - type06_headline, type06
        # collect the article entries on this page
        post_temp = document.select('.newsflash_body .type06_headline li dl')
        post_temp.extend(document.select('.newsflash_body .type06 li dl'))

        # store the URL of every article on this page in the post list
        post = []
        for line in post_temp:
            post.append(line.a.get('href'))
        del post_temp

        for content_url in post:  # article URL
            # crawl delay
            sleep(0.01)

            # fetch the article HTML
            request_content = self.get_url_data(content_url)
            try:
                document_content = BeautifulSoup(request_content.content,
                                                 'html.parser')
            except Exception:
                continue

            try:
                # extract the headline
                tag_headline = document_content.find_all(
                    'h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                text_headline = ''  # headline text
                text_headline = text_headline + ArticleParser.clear_headline(
                    str(tag_headline[0].find_all(text=True)))
                if not text_headline:  # skip the article if empty
                    continue

                # extract the article body
                tag_content = document_content.find_all(
                    'div', {'id': 'articleBodyContents'})
                text_sentence = ''  # body text
                text_sentence = text_sentence + ArticleParser.clear_content(
                    str(tag_content[0].find_all(text=True)))
                if not text_sentence:  # skip the article if empty
                    continue

                # extract the publisher
                tag_company = document_content.find_all(
                    'meta', {'property': 'me2:category1'})
                text_company = ''  # publisher name
                text_company = text_company + str(tag_company[0].get('content'))
                if not text_company:  # skip the article if empty
                    continue

                # write the CSV row
                wcsv = writer.get_writer_csv()
                wcsv.writerow([news_date, category_name, text_company,
                               text_headline, text_sentence, content_url])

                del text_company, text_sentence, text_headline
                del tag_company
                del tag_content, tag_headline
                del request_content, document_content

            except Exception as ex:  # UnicodeEncodeError ..
                # wcsv.writerow([ex, content_url])
                del request_content, document_content

    writer.close()

class LoggerImpl(object):
    def __init__(self):
        self.TRACE = 0
        self.DEBUG = 1
        self.INFO = 2
        self.WARN = 3
        self.ERROR = 4
        self.FATAL = 5
        self.REPORT = 6
        self.__all_log_level = {
            'trace': self.TRACE,
            'debug': self.DEBUG,
            'info': self.INFO,
            'warn': self.WARN,
            'error': self.ERROR,
            'fatal': self.FATAL,
            'report': self.REPORT,
        }
        self.__process = None
        self.__report = None
        self.__current_log_level = self.INFO

    def init(self, target, file_name, file_size=100 * 1024 * 1024,
             max_file_count=-1, multiprocess=False):
        try:
            if multiprocess:
                lock = multiprocessing.RLock()
            else:
                lock = threading.RLock()
            if not os.path.exists(target):
                os.makedirs(target)
            if not os.path.exists(target):
                return False
            # process log and report log share one lock
            self.__process = Writer(lock, target, file_name + '.process.log',
                                    file_size, max_file_count)
            if not self.__process.open():
                return False
            self.__report = Writer(lock, target, file_name + '.report.log',
                                   file_size, max_file_count)
            if not self.__report.open():
                return False
            if not SysLog.open():
                return False
            return True
        except Exception:
            print(traceback.format_exc())
            return False

    def set_level(self, level):
        if isinstance(level, int):
            if level < self.TRACE or level > self.REPORT:
                return False
            self.__current_log_level = level
            return True
        level_value = self.__all_log_level.get(level, None)
        if level_value is None:
            return False
        self.__current_log_level = level_value
        return True

    def close(self):
        try:
            if self.__process:
                self.__process.close()
                self.__process = None
            if self.__report:
                self.__report.close()
                self.__report = None
            SysLog.close()
        except Exception:
            print(traceback.format_exc())

    def trace(self, content):
        try:
            if self.TRACE < self.__current_log_level:
                return
            self.__process.write(content)
        except Exception:
            SysLog.error(traceback.format_exc())

    def debug(self, content):
        try:
            if self.DEBUG < self.__current_log_level:
                return
            self.__process.write(content)
        except Exception:
            SysLog.error(traceback.format_exc())

    def info(self, content):
        try:
            if self.INFO < self.__current_log_level:
                return
            self.__process.write(content)
        except Exception:
            SysLog.error(traceback.format_exc())

    def warn(self, content):
        try:
            if self.WARN < self.__current_log_level:
                return
            self.__process.write(content)
        except Exception:
            SysLog.error(traceback.format_exc())

    def error(self, content):
        try:
            if self.ERROR < self.__current_log_level:
                return
            self.__process.write(content)
        except Exception:
            SysLog.error(traceback.format_exc())

    def fatal(self, content):
        try:
            if self.FATAL < self.__current_log_level:
                return
            self.__process.write(content)
        except Exception:
            SysLog.error(traceback.format_exc())

    def report(self, content):
        try:
            if self.REPORT < self.__current_log_level:
                return
            self.__report.write(content)
        except Exception:
            SysLog.error(traceback.format_exc())

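# A minimal usage sketch for LoggerImpl (the directory, file name, and
# messages below are illustrative, not from the source):
logger = LoggerImpl()
if logger.init('./logs', 'myapp', multiprocess=False):
    logger.set_level('debug')
    logger.info('service started')
    logger.report('daily summary line')  # goes to the .report.log writer
    logger.close()
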
def _main():
    indentation = [""]
    prevRelLine = [None]  # previous relevant line
    startOfBlock = False
    line_no = 1
    writer = Writer(outFileName)
    jOut = open(jsonFileName, "w")
    jOut.write("[")

    for line in open(inFileName, "r"):
        if not len(line.strip()):
            continue
        line = processComments(line)
        if line_no != 1:
            jOut.write(",")

        assert whitespaceValid(line, indentation, startOfBlock), \
            "Error: Invalid whitespace on line: %i" % line_no

        if startOfBlock:
            indentation.append(getLeadingWhitespace(line))
            prevRelLine.append(None)

        # unwind the indentation stack until it matches this line
        ws = getLeadingWhitespace(line)
        while indentation and ws != indentation[-1]:
            indentation.pop()
            temp = prevRelLine.pop()
            assert temp is None or temp[0] not in ["do"], \
                "Error, failed to properly close \"%s\" on line %i" % (temp[0], temp[1])

        if prevRelLine[-1] is not None and prevRelLine[-1][0] == "do":
            writer.formatLine(line.replace("times", "").strip())
        elif prevRelLine[-1] is not None and prevRelLine[-1][0] == "python":
            writer.inline = False

        try:
            writer.convert(line)
        except Exception:
            # print("Error parsing line: %i" % line_no)
            pass

        if not writer.inline:
            jsonObj = toJson(line)
            jsonObj.id = line_no
            jsonObj.prev = line_no - 1  # hack: link each line to the one before it
            jsonObj.indent = len(ws)
            jOut.write(str(jsonObj))

        if prevRelLine[-1] is not None:
            prevRelLine.pop()

        try:
            command = getCommand(line)
        except Exception:
            # print("Error parsing line: %i" % line_no)
            pass
        if command == "python":
            writer.inline = True
        # print(prevRelLine[-2] if len(prevRelLine) > 1 else "Top")
        startOfBlock = startsBlock(command)
        if startOfBlock:
            prevRelLine.append((command, line_no))
        line_no += 1

    writer.printLines()
    writer.close()
    jOut.write("]")
    jOut.close()