def write_bible_chapter(book_abbrev, chapter, words_in_chapter, rows,
                        custom_paragraphing=False):
    description = (f"KJV Bible Chapter Word Frequencies:"
                   f" {bible_book_names[book_abbrev]} {chapter}")
    keywords = [
        "KJV",
        "Bible",
        bible_book_names[book_abbrev],
        f"{bible_book_names[book_abbrev]} {chapter}",
        "chapter",
        "word frequency",
    ]
    book_num = f"{str(get_book_nums()[book_abbrev]).zfill(2)}"
    html_folder = os.path.join(os.getcwd(), "public_html",
                               f"{book_num}-{book_abbrev.lower()}")
    os.makedirs(html_folder, exist_ok=True)
    csv_file_name = (f"{book_abbrev.lower()}{str(chapter).zfill(3)}"
                     "-word-freq.csv")
    # Include the top 7 words in the page's keywords meta tag.
    keywords += get_top_7_words(os.path.join(html_folder, csv_file_name))
    base_template_args = get_base_template_args(description,
                                                ",".join(keywords),
                                                description)
    bible_chapter_text = get_bible_chapter_text(
        book_num,
        book_abbrev,
        chapter,
        custom_paragraphing=custom_paragraphing,
    )
    new_template_args = {
        "images_path": "../images",
        "styles_path": "../styles",
        "bible_book_name": bible_book_names[book_abbrev],
        "book_abbrev": book_abbrev,
        "chapters_in_book": get_verse_counts()[f"{book_abbrev} {chapter}"],
        "chapter": chapter,
        "words_in_bible": "790,663",
        "words_in_chapter": words_in_chapter,
        "csv_file_name": csv_file_name,
        "bible_chapter_text": bible_chapter_text,
        "rows": rows,
    }
    write_html(
        base_template_args,
        new_template_args,
        "templates/bible_chapter.mako",
        html_folder,
        # str() before zfill: chapter may be passed as an int.
        f"{book_abbrev.lower()}{str(chapter).zfill(3)}-word-freq.html",
    )
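# The five-argument write_html used by these page writers is not shown.
# A minimal sketch, assuming Mako templates and that the shared and
# page-specific template arguments are merged before rendering:
import os
from mako.template import Template

def write_html(base_template_args, new_template_args, template_path,
               html_folder, html_file_name):
    # Merge the shared (title/description/keywords) arguments with the
    # page-specific ones, render the Mako template, and write the page.
    template_args = {**base_template_args, **new_template_args}
    html = Template(filename=template_path).render(**template_args)
    with open(os.path.join(html_folder, html_file_name), "w",
              encoding="utf-8") as out_file:
        out_file.write(html)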
def first_query(self):
    get_rows = self.data.map(lambda row: (row[0], float(row[13])))
    total = get_rows.reduceByKey(lambda x, y: x + y)
    print("first query, file 2")
    # print(get_rows.collect())
    print(total.collect())
    xxx = get_ejex_ejey(total)
    print(str(xxx[0]))
    print(str(xxx[1]))
    graph_js("archivo2_reporte1", str(xxx[0]), str(xxx[1]), 'pie',
             'labels', 'values')
    write_html("archivo2_reporte1", "Revenue of all regions")
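# These Spark queries lean on helpers that are not shown. A plausible
# minimal sketch inferred from the call sites; the Plotly-style payload
# and the file layout are assumptions:
def get_ejex_ejey(rdd):
    # Split an RDD of (key, value) pairs into parallel x/y lists.
    pairs = rdd.collect()
    return [k for k, _ in pairs], [v for _, v in pairs]

def graph_js(name, x_values, y_values, kind, x_key='x', y_key='y'):
    # Emit a JS file that hands the data to a charting library. Only the
    # signature comes from the call sites above.
    js = (f"var data = [{{{x_key}: {x_values}, {y_key}: {y_values},"
          f" type: '{kind}'}}];\n"
          f"Plotly.newPlot('{name}', data);\n")
    with open(f"{name}.js", "w") as out_file:
        out_file.write(js)

def write_html(name, title):
    # Wrap the generated JS in a minimal page; purely illustrative.
    page = (f"<html><head><title>{title}</title>"
            f"<script src='https://cdn.plot.ly/plotly-latest.min.js'>"
            f"</script></head><body><h1>{title}</h1><div id='{name}'></div>"
            f"<script src='{name}.js'></script></body></html>")
    with open(f"{name}.html", "w") as out_file:
        out_file.write(page)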
def first_query(self):
    get_rows = self.data.map(lambda row: (row[3], 1))
    total_race = get_rows.reduceByKey(lambda x, y: x + y)
    total_sort = total_race.sortBy(lambda row: row[1], ascending=False)
    print("first query, file 3")
    print(total_sort.collect()[:3])
    xxx = get_ejex_ejey(total_sort)
    print(str(xxx[0][:3]))
    print(str(xxx[1][:3]))
    graph_js("archivo3_reporte1", str(xxx[0][:3]), str(xxx[1][:3]), 'bar')
    write_html("archivo3_reporte1", "Top races among victims")
def third_query(self):
    get_rows = self.data.map(lambda row: (row[2], 1))
    total = get_rows.reduceByKey(lambda x, y: x + y)
    total_ordenado = total.sortBy(lambda row: row[1], ascending=False)
    print("third query, file 2")
    # print(get_rows.collect())
    print(total_ordenado.collect()[0:5])
    xxx = get_ejex_ejey(total_ordenado)
    print(str(xxx[0][:5]))
    print(str(xxx[1][:5]))
    graph_js("archivo1_reporte3", str(xxx[0][:5]), str(xxx[1][:5]), 'bar')
    write_html("archivo1_reporte3", "Top 5 platforms with the most releases")
def homewort_query(self):
    get_rows = self.data.map(
        lambda row: (row[2], float(row[11]), float(row[12]), float(row[13])))
    # Keep only the item types of interest (set membership instead of a
    # chain of or-comparisons, and avoid shadowing the built-in `filter`).
    item_types = {'baby food', 'cereal', 'fruits', 'meat',
                  'vegetables', 'beverages', 'snacks'}
    filtered = get_rows.filter(lambda x: x[0].lower() in item_types)
    renueve = filtered.map(lambda x: (x[0], x[1])).reduceByKey(
        lambda x, y: x + y)
    costo = filtered.map(lambda x: (x[0], x[2])).reduceByKey(
        lambda x, y: x + y)
    profit = filtered.map(lambda x: (x[0], x[3])).reduceByKey(
        lambda x, y: x + y)
    xxx = get_ejex_ejey(renueve)
    print(str(xxx[0]))
    print(str(xxx[1]))
    strring = graph_js_apilda('renueve', str(xxx[0]), str(xxx[1]),
                              'bar', 'x', 'y', '1')
    xxx = get_ejex_ejey(costo)
    print(str(xxx[0]))
    print(str(xxx[1]))
    strring += graph_js_apilda('costo', str(xxx[0]), str(xxx[1]),
                               'bar', 'x', 'y', '2')
    xxx = get_ejex_ejey(profit)
    print(str(xxx[0]))
    print(str(xxx[1]))
    strring += graph_js_apilda('profit', str(xxx[0]), str(xxx[1]),
                               'bar', 'x', 'y', '3')
    write_js_tarea('tarea_reporte', strring)
    write_html("tarea_reporte", "Homework")
def second_query(self):
    get_rows = self.data.map(lambda row: (row[5], row[4], 1))
    rows_nintendo = get_rows.filter(
        lambda row: row[0].lower() == "nintendo")
    rows_final = rows_nintendo.map(lambda row: (row[1], row[2]))
    total = rows_final.reduceByKey(lambda x, y: x + y)
    print("second query, file 2")
    # print(rows_nintendo.collect())
    print(total.collect())
    xxx = get_ejex_ejey(total)
    print(str(xxx[0]))
    print(str(xxx[1]))
    graph_js("archivo1_reporte2", str(xxx[0]), str(xxx[1]), 'pie',
             'labels', 'values')
    write_html("archivo1_reporte2", "Total genres published by Nintendo")
def write_examples():
    description = "KJV Bible Chapter Word Frequencies: Examples"
    base_template_args = get_base_template_args(
        description,
        ",".join([
            "KJV",
            "Bible",
            "chapter",
            "word frequency",
            "relative frequency",
            "examples",
        ]),
        description,
    )
    with open("examples.md", "r") as read_file:
        examples_source = read_file.read()
    examples_html = markdown2.markdown(examples_source, extras=["tables"])
    # The align attribute that markdown2 puts on th and td elements is
    # obsolete and fails validation with the W3C's
    # [Nu Html Checker](https://validator.w3.org/), so replace it with
    # CSS styling.
    examples_html = examples_html.replace('align="right"',
                                          'class="numerical"')
    new_template_args = {
        "images_path": "./images",
        "styles_path": "./styles",
        "examples_html": examples_html,
    }
    html_folder = os.path.join(os.getcwd(), "public_html")
    write_html(
        base_template_args,
        new_template_args,
        "templates/examples.mako",
        html_folder,
        "examples.html",
    )
def zhihu_spider():
    url = 'https://www.zhihu.com/question/61170968'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # 'accept-encoding': 'gzip, deflate, br',  # enabling this line garbles the response
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': r'https://www.zhihu.com/signin?next=%2F',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '******',
        'upgrade-insecure-requests': '1'
    }
    cookie_list = CookiePool.get_cookies(Zhihu.cookie_table_name)
    cookie = cookie_list[0][1]
    headers['cookie'] = cookie
    response = requests.get(url, headers=headers)
    # print(response.content)
    write_html(response)
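# The one-argument write_html this spider calls is not shown. A minimal
# sketch under the assumption that it dumps the response body to a local
# file for offline inspection (the default file name is hypothetical):
def write_html(response, file_name='zhihu.html'):
    # Persist the raw response body so the markup can be inspected offline.
    with open(file_name, 'wb') as out_file:
        out_file.write(response.content)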
def write_bible_book_index(book_abbrev):
    bible_books = get_bible_books()
    bible_book_names = {
        bible_books[bible_book][0]: bible_book
        for bible_book in bible_books
    }
    bible_book_name = bible_book_names[book_abbrev]
    book_length = bible_books[bible_book_name][1]
    description = f"KJV Bible Chapter Word Frequencies: {bible_book_name}"
    base_template_args = get_base_template_args(
        description,
        ",".join(
            ["KJV", "Bible", bible_book_name, "chapter", "word frequency"]),
        description,
    )
    new_template_args = {
        "images_path": "../images",
        "styles_path": "../styles",
        "bible_book_name": bible_book_name,
        "book_abbrev": book_abbrev,
        "chapters_in_book": book_length,
    }
    book_num = f"{str(get_book_nums()[book_abbrev]).zfill(2)}"
    html_folder = os.path.join(os.getcwd(), "public_html")
    chapter_folder = os.path.join(html_folder,
                                  f"{book_num}-{book_abbrev.lower()}")
    os.makedirs(chapter_folder, exist_ok=True)
    write_html(
        base_template_args,
        new_template_args,
        "templates/bible_book_index.mako",
        chapter_folder,
        f"{book_abbrev.lower()}-index.html",
    )
def first_query(self):
    get_rows = self.data.map(lambda row: (row[4], float(row[10])))
    # Keep only the genres of interest (set membership instead of a chain
    # of or-comparisons).
    genres = {'action', 'sports', 'fighting', 'shooter',
              'racing', 'adventure', 'strategy'}
    genre_filters = get_rows.filter(lambda row: row[0].lower() in genres)
    total = genre_filters.reduceByKey(lambda x, y: x + y)
    # print(genre_filters.collect())
    print(total.collect())
    xxx = get_ejex_ejey(total)
    print(str(xxx[0]))
    print(str(xxx[1]))
    graph_js("archivo1_reporte1", str(xxx[0]), str(xxx[1]), 'bar')
    write_html("archivo1_reporte1", "Global sales of the selected genres")
def second_query(self):
    # Uses the sorted order.
    get_rows = self.data.map(
        lambda row: (row[1], row[5].split("/")[2], int(row[8]))) \
        .filter(lambda row: row[0].lower() == 'guatemala')
    # Note: an "x" is appended to each year key.
    total = get_rows.map(lambda row: (row[1] + "x", row[2])).reduceByKey(
        lambda x, y: x + y)
    orden = total.sortBy(lambda row: row[1], ascending=False)
    # ventas_anio = get_rows.filter(lambda row: row[1] == "2019")  # Guatemala
    print("second query, file 2")
    # print(get_rows.collect())
    # print(total.collect())
    print(orden.collect())
    xxx = get_ejex_ejey(orden)
    print(str(xxx[0]))
    print(str(xxx[1]))
    graph_js("archivo2_reporte2", str(xxx[0]), str(xxx[1]), 'bar')
    write_html("archivo2_reporte2", "Year with the most units sold")
def third_query(self):
    get_rows = self.data.map(
        lambda row: (row[5].split("/")[2], row[0], float(row[13]), row[3]))
    year2010 = get_rows.filter(
        lambda row: row[0] == "2010" and row[3].lower() == "online")
    total = year2010.map(lambda row: (row[1], row[2])).reduceByKey(
        lambda x, y: x + y)
    total_ordenado = total.sortBy(lambda row: row[1], ascending=False)
    print("third query, file 2")
    # print(get_rows.collect())
    # print(year2010.collect())
    # print(total.collect())
    print(total_ordenado.collect())
    xxx = get_ejex_ejey(total_ordenado)
    print(str(xxx[0]))
    print(str(xxx[1]))
    graph_js("archivo2_reporte3", str(xxx[0]), str(xxx[1]), 'bar')
    write_html("archivo2_reporte3", "2010 online sales by region")
print("Iteration: %08d/%08d" % (iterations + 1, max_iter)) write_loss(iterations, trainer, train_writer) # Write images if (iterations + 1) % config['image_save_iter'] == 0: with torch.no_grad(): test_image_outputs = trainer.sample( test_display_images_a, test_display_images_b) train_image_outputs = trainer.sample( train_display_images_a, train_display_images_b) write_2images(test_image_outputs, display_size, image_directory, 'test_%08d' % (iterations + 1)) write_2images(train_image_outputs, display_size, image_directory, 'train_%08d' % (iterations + 1)) # HTML write_html(output_directory + "/index.html", iterations + 1, config['image_save_iter'], 'images') if (iterations + 1) % config['image_display_iter'] == 0: with torch.no_grad(): image_outputs = trainer.sample(train_display_images_a, train_display_images_b) write_2images(image_outputs, display_size, image_directory, 'train_current') # Save network weights if (iterations + 1) % config['snapshot_save_iter'] == 0: trainer.save(checkpoint_directory, iterations) iterations += 1 if iterations >= max_iter: sys.exit('Finish training')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str,
                        default='configs/edges2handbags_folder.yaml',
                        help='Path to the config file.')
    parser.add_argument('--output_path', type=str, default='.',
                        help="outputs path")
    # Note: with a truthy default, --resume is effectively always on.
    parser.add_argument("--resume", action="store_true", default=True)
    parser.add_argument('--trainer', type=str, default='MUNIT',
                        help="MUNIT|UNIT")
    opts = parser.parse_args()

    cudnn.benchmark = True

    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    display_size = config['display_size']
    config['vgg_model_path'] = opts.output_path

    # Setup model and data loader
    if opts.trainer == 'MUNIT':
        trainer = MUNIT_Trainer(config)
    elif opts.trainer == 'UNIT':
        trainer = UNIT_Trainer(config)
    else:
        sys.exit("Only support MUNIT|UNIT")
    trainer.cuda()
    train_loader_a, train_loader_b, test_loader_a, test_loader_b = \
        get_all_data_loaders(config)
    train_display_images_a = torch.stack(
        [train_loader_a.dataset[i] for i in range(display_size)]).cuda()
    train_display_images_b = torch.stack(
        [train_loader_b.dataset[i] for i in range(display_size)]).cuda()
    test_display_images_a = torch.stack(
        [test_loader_a.dataset[i] for i in range(display_size)]).cuda()
    test_display_images_b = torch.stack(
        [test_loader_b.dataset[i] for i in range(display_size)]).cuda()

    # Setup logger and output folders
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    train_writer = tensorboardX.SummaryWriter(
        os.path.join(opts.output_path + "/logs", model_name))
    output_directory = os.path.join(opts.output_path + "/outputs", model_name)
    checkpoint_directory, image_directory = prepare_sub_folder(
        output_directory)
    # Copy the config file to the output folder.
    shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml'))

    # Start training
    iterations = trainer.resume(checkpoint_directory,
                                hyperparameters=config) if opts.resume else 0
    while True:
        for it, (images_a, images_b) in enumerate(
                zip(train_loader_a, train_loader_b)):
            trainer.update_learning_rate()
            images_a = images_a.cuda().detach()
            images_b = images_b.cuda().detach()

            with Timer("Elapsed time in update: %f"):
                # Main training code
                trainer.dis_update(images_a, images_b, config)
                trainer.gen_update(images_a, images_b, config)
                torch.cuda.synchronize()

            # Dump training stats in log file
            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            # Write images
            if (iterations + 1) % config['image_save_iter'] == 0:
                with torch.no_grad():
                    test_image_outputs = trainer.sample(
                        test_display_images_a, test_display_images_b)
                    train_image_outputs = trainer.sample(
                        train_display_images_a, train_display_images_b)
                write_2images(test_image_outputs, display_size,
                              image_directory, 'test_%08d' % (iterations + 1))
                write_2images(train_image_outputs, display_size,
                              image_directory,
                              'train_%08d' % (iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')

            if (iterations + 1) % config['image_display_iter'] == 0:
                with torch.no_grad():
                    image_outputs = trainer.sample(train_display_images_a,
                                                   train_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              'train_current')

            # Save network weights
            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations)

            iterations += 1
            if iterations >= max_iter:
                sys.exit('Finish training')
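# The four-argument write_html(filename, iterations, image_save_iterations,
# image_directory) used by these training loops is not shown. A minimal
# sketch of such an index generator, inferred from the call sites; the
# page layout and the image file-name pattern are assumptions:
def write_html(filename, iterations, image_save_iterations, image_directory):
    # Regenerate an index page linking every periodically saved image grid,
    # newest first, so training progress can be browsed in one place.
    with open(filename, "w") as html_file:
        html_file.write("<html><head><title>Training progress</title>"
                        "</head><body>\n")
        for it in range(iterations, 0, -image_save_iterations):
            for prefix in ("train", "test"):
                html_file.write("<h3>iteration %08d (%s)</h3>\n"
                                % (it, prefix))
                html_file.write('<img src="%s/%s_%08d.jpg"/><br/>\n'
                                % (image_directory, prefix, it))
        html_file.write("</body></html>\n")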
def main(argv):
    (opts, args) = parser.parse_args(argv)
    cudnn.benchmark = True
    model_name = os.path.splitext(os.path.basename(opts.config))[0]

    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    display_size = config['display_size']

    # Setup model and data loader
    # (Python 2 / pre-0.4 PyTorch idioms kept: izip, Variable(volatile=True).)
    trainer = MUNIT_Trainer(config)
    trainer.cuda()
    train_loader_a, train_loader_b, test_loader_a, test_loader_b = \
        get_all_data_loaders(config)
    test_display_images_a = Variable(torch.stack(
        [test_loader_a.dataset[i] for i in range(display_size)]).cuda(),
        volatile=True)
    test_display_images_b = Variable(torch.stack(
        [test_loader_b.dataset[i] for i in range(display_size)]).cuda(),
        volatile=True)
    train_display_images_a = Variable(torch.stack(
        [train_loader_a.dataset[i] for i in range(display_size)]).cuda(),
        volatile=True)
    train_display_images_b = Variable(torch.stack(
        [train_loader_b.dataset[i] for i in range(display_size)]).cuda(),
        volatile=True)

    # Setup logger and output folders
    train_writer = tensorboard.SummaryWriter(
        os.path.join(opts.log, model_name))
    output_directory = os.path.join(opts.outputs, model_name)
    checkpoint_directory, image_directory = prepare_sub_folder(
        output_directory)
    # Copy the config file to the output folder.
    shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml'))

    # Start training
    iterations = trainer.resume(checkpoint_directory) if opts.resume else 0
    while True:
        for it, (images_a, images_b) in enumerate(
                izip(train_loader_a, train_loader_b)):
            trainer.update_learning_rate()
            images_a = Variable(images_a.cuda())
            images_b = Variable(images_b.cuda())

            # Main training code
            trainer.dis_update(images_a, images_b, config)
            trainer.gen_update(images_a, images_b, config)

            # Dump training stats in log file
            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            # Write images
            if (iterations + 1) % config['image_save_iter'] == 0:
                # Test set images
                image_outputs = trainer.sample(test_display_images_a,
                                               test_display_images_b)
                write_images(image_outputs, display_size,
                             '%s/gen_test%08d.jpg' % (image_directory,
                                                      iterations + 1))
                # Train set images
                image_outputs = trainer.sample(train_display_images_a,
                                               train_display_images_b)
                write_images(image_outputs, display_size,
                             '%s/gen_train%08d.jpg' % (image_directory,
                                                       iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')
                # Snapshot of the latest test-set outputs.
                image_outputs = trainer.sample(test_display_images_a,
                                               test_display_images_b)
                write_images(image_outputs, display_size,
                             '%s/gen.jpg' % image_directory)

            # Save network weights
            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations)

            iterations += 1
            if iterations >= max_iter:
                return
                    image_directory,
                    "test_%08d" % (iterations + 1),
                    comet_exp=comet_exp,
                )
                write_2images(
                    train_image_outputs,
                    display_size,
                    image_directory,
                    "train_%08d" % (iterations + 1),
                    comet_exp=comet_exp,
                )
                # HTML
                write_html(
                    output_directory + "/index.html",
                    iterations + 1,
                    config["image_save_iter"],
                    "images",
                    comet_exp=comet_exp,
                )

            if (iterations + 1) % config["image_display_iter"] == 0:
                with torch.no_grad():
                    image_outputs = trainer.sample(train_display_images_a,
                                                   train_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              "train_current")

            # Save network weights
            if (iterations + 1) % config["snapshot_save_iter"] == 0:
                trainer.save(checkpoint_directory, iterations)
def process_data(self):
    print("#" * 10, "Processing data", "#" * 10)
    grades = []
    # Build the comment objects.
    for raw in zip(self.grade, self.owned, self.reviews, self.ingame_hours,
                   self.helpful, self.funny, self.texts):
        # Pass the text and the grade to the new comment.
        comm = Comment(raw[0], raw[-1])
        self.comments.append(comm)
        grades.append(comm.grade)
        if raw[0] == "1":
            Comments.positive_comments.append(comm)
        else:
            Comments.negative_comments.append(comm)

    # Save the counts of positive and negative comments.
    comments_count = {
        "pos_comm": len(Comments.positive_comments),
        "neg_comm": len(Comments.negative_comments)
    }
    with open('data/comments_count.json', 'w') as file:
        json.dump(comments_count, file)

    Comments.grades = np.array(grades)
    np.save("data/grades.npy", Comments.grades)

    count = 1
    if ut.path_exists('data/words_count.json'):
        with open('data/words_count.json', 'r', encoding='utf-8') as file:
            print("Loading words_count from json")
            Comments.features_count = json.load(file)
            count = 0

    # Compute the feature vector for each comment.
    # (from utils import Watcher is used to track progress.)
    Comments.stupid_comments = ''
    wt = Watcher(len(self.comments))
    for comm in self.comments:
        wt.display_load(self.comments.index(comm), "making vector")
        comm.make_vector()
        # Count this comment's features and add them to the shared totals.
        if count:
            comm.count()  # counts the features across all comments
    if not ut.path_exists('data/words_count.json'):
        with open('data/words_count.json', 'w') as file:
            json.dump(Comments.features_count, file, ensure_ascii=False)

    count = 1
    if ut.path_exists('data/idf.json'):
        with open('data/idf.json', 'r', encoding='utf-8') as file:
            print("Loading idf from json")
            Comments.idf = json.load(file)
        for comm in Comments.comments:
            wt.display_load(Comments.comments.index(comm), "loading tf-idf")
            comm.load_tf_idf()
        count = 0
    if count:
        for comm in self.comments:
            wt.display_load(self.comments.index(comm), "counting tf-idf")
            # Compute tf-idf and immediately drop the useless words.
            comm.count_tf_idf()
            # pprint(comm.tf_idf)
    if not ut.path_exists('data/idf.json'):
        with open('data/idf.json', 'w') as file:
            json.dump(Comments.idf, file, ensure_ascii=False)

    # Dump the useless words.
    ut.write_html("data/tf-idf-useless.txt", Comment.tf_idf_words)
    print("OUTPUTTED DATA/TF-IDF USELESS")

    # Build the target_names array: count the distinct words.
    target_names = set()
    for comm in self.comments:
        wt.display_load(self.comments.index(comm), "creating target names")
        for feature in comm.features:
            target_names.add(feature)
    Comments.target_names = sorted(list(target_names))
    target_names_text = ""
    for name in Comments.target_names:
        target_names_text += name + "\n::|::\n"
    ut.write_html("data/target_names.txt", target_names_text)
    print("Comments.target_names len is:\t{}".format(
        len(Comments.target_names)))

    wt = Watcher(len(Comments.target_names))
    # Load cached results to avoid recomputing everything.
    count = 1
    if ut.path_exists('data/target_names_indexes.json'):
        with open('data/target_names_indexes.json', 'r',
                  encoding='utf-8') as file:
            print("Loading target_names_indexes from json")
            Comments.target_names_dict = json.load(file)
            count = 0
    oldLetter = ''
    endIndex = 0
    startIndex = 0
    if count:
        # Work by indexes: keep a *current letter* variable and, when the
        # letter changes, record the previous letter's end index; no inner
        # loop is needed.
        for j in range(0, len(Comments.target_names)):
            name = Comments.target_names[j]
            if name == '':
                continue
            wt.display_load(j, "counting indexes")
            letter = name[0]
            if letter != oldLetter:
                endIndex = j
                if j != 0:
                    Comments.target_names_dict[oldLetter] = \
                        str(startIndex) + ":" + str(endIndex)
                startIndex = j
                oldLetter = letter
    # Save what was just computed, since the process is very expensive.
    if not ut.path_exists('data/target_names_indexes.json'):
        with open('data/target_names_indexes.json', 'w') as file:
            json.dump(Comments.target_names_dict, file, ensure_ascii=False)
        print("Ended saving file")

    wt = Watcher(len(Comments.comments))
    # TODO: load the logarithmic term instead of recomputing it.
    # Compute delta tf-idf.
    for comm in self.comments:
        wt.display_load(self.comments.index(comm), "counting delta tf-idf")
        comm.count_values()
    # Save the logarithmic term.
    ut.write_html('data/delta_tf_idf_log.txt', Comments.output)
    with open('data/delta_tf_idf_frac.json', 'w') as file:
        json.dump(Comments.delta_tf_idf_frac, file, ensure_ascii=False)
    print("Ended saving file")

    data = sparse.lil_matrix((len(Comments.comments),
                              len(Comments.target_names)))
    comments_len = len(Comments.comments)
    for i in range(0, comments_len):
        wt.display_load(i, "editing matrix")
        comment = Comments.comments[i]
        for feature in comment.features:
            if feature == '':
                continue
            if feature not in comment.values:
                continue
            # Find the column index for each feature via the first-letter
            # index ranges.
            first_letter = feature[0]
            if first_letter not in Comments.target_names_dict:
                continue
            start_ind = int(
                Comments.target_names_dict[first_letter].split(':')[0])
            end_ind = int(
                Comments.target_names_dict[first_letter].split(':')[-1])
            for j in range(start_ind, end_ind):
                if Comments.target_names[j] == feature:
                    data[i, j] = comment.values[feature]
    Comments.data = data
    print("Data shape:\t{}".format(Comments.data.shape))
    print("Grade shape:\t{}".format(Comments.grades.shape))
    print("Target_name len:\t{}".format(len(Comments.target_names)))
    # Save the matrix.
    # np.save("data/data.npy", Comments.data)
    # sparse.save_npz('data/data.npz', data)
    Comments.save_sparse_matrix(Comments, "data/data.npz", Comments.data)
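# The first-letter indexing above maps each initial letter to a
# "start:end" slice of the sorted vocabulary, so a feature lookup scans
# only one bucket instead of the whole list. A self-contained sketch of
# the same technique, with hypothetical data:
vocabulary = sorted(["apple", "axe", "bear", "bolt", "cat"])

ranges = {}
for j, word in enumerate(vocabulary):
    letter = word[0]
    if letter not in ranges:
        ranges[letter] = [j, j]  # start index; end filled in below
    ranges[letter][1] = j + 1    # end is exclusive

def find_index(word):
    # Scan only the bucket for the word's first letter.
    lo, hi = ranges.get(word[0], (0, 0))
    for j in range(lo, hi):
        if vocabulary[j] == word:
            return j
    return -1

print(find_index("bolt"))  # 3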
def parse_json_comments(self, response):
    print("==============\nstart parsing json\n===============")
    # Regular expression for extracting a number.
    num = re.compile(r'[0-9]+\.?[0-9]*')
    data = json.loads(response.body)
    ut.write_html(self.dest + "comments.html", data['html'])
    # Replace <br> so each comment stays in one piece.
    html = data['html'].replace('<br>', '\n')
    selector = Selector(text=html)
    selector.remove_namespaces()
    output = ""
    # Use a regex test so only complete review_box divs are matched.
    review_boxes = selector.xpath(
        "//div[re:test(@class, '\Areview_box\s*\Z')]")
    for review in review_boxes:
        output += "\n=======================\n"
        if not review.css('div.persona_name'):
            continue  # Skip the review if this element does not exist.
        persona_name = review.css('div.persona_name')
        if persona_name.css('a::text').extract_first() is None:
            name = "i have to search in span"
            continue
        else:
            name = str(persona_name.css('a::text').extract_first())
        if persona_name.css('a::attr(href)').extract_first() is None:
            url = "have to search in another place"
            continue
        else:
            url = str(persona_name.css('a::attr(href)').extract_first())
        if url != "None" and url is not None:
            person_id = url.split('/')[-2]
        else:
            person_id = "Doesn't exist"
        if review.css('div.num_owned_games a::text').extract_first() is None:
            num_owned_games = "Didn't find"
            continue
        else:
            num_owned_games = str(
                review.css('div.num_owned_games a::text').extract_first()
            ).split(' ')[-1]
            num_owned_games = num_owned_games.replace(',', '')
            num_owned_games = num_owned_games.replace('.', '')
        if review.css('div.num_reviews a::text').extract_first() is None:
            num_reviews = "Didn't find"
            continue
        else:
            num_reviews_text = review.css(
                'div.num_reviews a::text').extract_first().strip()
            if num.match(num_reviews_text):
                num_reviews = (num.findall(num_reviews_text))[0].strip()
                num_reviews = num_reviews.replace(',', '')
                num_reviews = num_reviews.replace('.', '')
            else:
                num_reviews = "0"
        if review.xpath('.//div[contains(@class, "title ellipsis")]/text()'
                        ).extract_first() is None:
            grade = "Didn't find"
            continue
        else:
            grade = review.xpath(
                './/div[contains(@class, "title ellipsis")]/text()'
            ).extract_first()
            # "Рекомендую" is Steam's Russian UI text for "Recommended".
            if grade == "Рекомендую":
                grade = "1"
            else:
                grade = "0"
        if review.xpath('.//div[contains(@class, "hours ellipsis")]/text()'
                        ).extract_first() is None:
            hours = "Didn't find"
            continue
        else:
            hours = review.xpath(
                './/div[contains(@class, "hours ellipsis")]/text()'
            ).extract_first()
            hours = hours.split(' ')[-2].replace('.', '')
            hours = hours.replace(',', '')
        if review.css('div.vote_info::text').extract_first() is None:
            num_useful = "Didn't find"
            num_funny = "Didn't find"
            continue
        else:
            useful = "Not found"
            funny = "Not found"
            num_useful = '0'
            num_funny = '0'
            votes_info = review.css('div.vote_info::text').extract()
            for _ in votes_info:
                votes = _.splitlines()
                for vote in votes:
                    # Match the Russian UI strings for "helpful"/"funny".
                    if 'полезным' in vote:
                        useful = vote.strip()
                        num_useful = num.findall(useful)[0].strip()
                    elif 'забавным' in vote:
                        funny = vote.strip()
                        num_funny = num.findall(funny)[0].strip()
        if review.css('div.content::text').extract_first() is None:
            text = "None"
            continue
        else:
            text = review.css('div.content::text').extract_first()
        output += "Name\tis:\t{}\n".format(name)
        output += "Url\tis:\t{}\n".format(url)
        output += "Id \tis:\t{}\n".format(person_id)
        output += "Owned games:\t{}\n".format(num_owned_games)
        output += "Num reviews:\t{}\n".format(num_reviews)
        output += "Grade\tis:\t{}\n".format(grade)
        output += "Ingame hours:\t{}\n".format(hours)
        output += "People think it helpful:\t{}\n".format(num_useful)
        output += "People think it funny:\t\t{}\n".format(num_funny)
        # output += "Text:\n{}\n".format(text)
        Comments.add_comment(Comments, text, num_owned_games, num_reviews,
                             grade, hours, num_useful, num_funny)
        output += "=======================\n"
    ut.write_html(self.dest + "reviewers.txt", output)
    # output = ""
    # comments = selector.css('div.review_box').css('div.content::text').extract()
    # for comment in comments:
    #     comment = comment.strip()
    #     if not comment:
    #         continue  # Skip empty lines.
    #     output += "\n=============================\n"
    #     output += comment
    #     output += "\n=============================\n"
    # ut.write_html(self.dest + 'comments.txt', output)
    print("==============\nended parsing json\n===============")
            os.path.join(
                image_directory,
                'gen_b2a_' + 'train_%08d' % (iterations + 1) + '.jpg'),
            filename='train_gen_b2a_im-iteration: ' + str(iterations) +
                     '.jpg')
        telegram_bot_send_document(
            os.path.join(
                image_directory,
                'gen_b2a_' + 'test_%08d' % (iterations + 1) + '.jpg'),
            filename='test_gen_b2a_im-iteration: ' + str(iterations) +
                     '.jpg')
        # HTML
        write_html(output_directory + "/index.html", iterations + 1,
                   config['image_save_iter'], 'images',
                   do_a2b=config['do_a2b'], do_b2a=config['do_b2a'])

    if (iterations + 1) % config['image_display_iter'] == 0:
        with torch.no_grad():
            image_outputs = trainer.sample(train_display_images_a,
                                           train_display_images_b)
        write_2images(image_outputs,
                      display_size * config['council']['council_size'],
                      image_directory, 'train_current',
                      do_a2b=config['do_a2b'], do_b2a=config['do_b2a'])
            # Dump training stats in log file
            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            # Write images
            if (iterations + 1) % config['image_save_iter'] == 0:
                # Test set images
                image_outputs = trainer.sample(test_display_images_a,
                                               test_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              'test_%08d' % (iterations + 1))
                # Train set images
                image_outputs = trainer.sample(train_display_images_a,
                                               train_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              'train_%08d' % (iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')

            if (iterations + 1) % config['image_display_iter'] == 0:
                train_display_images_a = Variable(
                    torch.stack([train_loader_a.dataset[i]
                                 for i in range(display_size)]).cuda(),
                    volatile=True)
                train_display_images_b = Variable(
                    torch.stack([train_loader_b.dataset[i]
                                 for i in range(display_size)]).cuda(),
                    volatile=True)
                image_outputs = trainer.sample(train_display_images_a,
                                               train_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              'train_current')

            # Save network weights
            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations)

            iterations += 1
            if iterations >= max_iter:
                sys.exit('Finish training')
def main():
    cudnn.benchmark = True

    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    display_size = config['display_size']
    config['vgg_model_path'] = opts.output_path

    # Setup model and data loader
    trainer = UNIT_Trainer(config)
    if torch.cuda.is_available():
        trainer.cuda(config['gpuID'])
    train_loader_a, train_loader_b, test_loader_a, test_loader_b = \
        get_all_data_loaders(config)

    # Setup logger and output folders
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    writer = SummaryWriter(os.path.join(opts.output_path + "/logs",
                                        model_name))
    output_directory = os.path.join(opts.output_path + "/outputs",
                                    model_name)
    checkpoint_directory, image_directory = prepare_sub_folder(
        output_directory)
    # Copy the config file to the output folder.
    shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml'))
    print('start training !!')

    # Start training
    iterations = trainer.resume(checkpoint_directory,
                                hyperparameters=config) if opts.resume else 0
    TraindataA = data_prefetcher(train_loader_a)
    TraindataB = data_prefetcher(train_loader_b)
    testdataA = data_prefetcher(test_loader_a)
    testdataB = data_prefetcher(test_loader_b)
    while True:
        dataA = TraindataA.next()
        dataB = TraindataB.next()
        if dataA is None or dataB is None:
            # An epoch ended; restart the training prefetchers.
            TraindataA = data_prefetcher(train_loader_a)
            TraindataB = data_prefetcher(train_loader_b)
            dataA = TraindataA.next()
            dataB = TraindataB.next()
        with Timer("Elapsed time in update: %f"):
            # Main training code
            for _ in range(3):
                trainer.content_update(dataA, dataB, config)
            trainer.dis_update(dataA, dataB, config)
            trainer.gen_update(dataA, dataB, config)
            # torch.cuda.synchronize()
        trainer.update_learning_rate()

        # Dump training stats in log file
        if (iterations + 1) % config['log_iter'] == 0:
            print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
            write_loss(iterations, trainer, writer)

        if (iterations + 1) % config['image_save_iter'] == 0:
            testa = testdataA.next()
            testb = testdataB.next()
            # Restart the test prefetchers if a test epoch ended or the
            # fetched batch is short (check the test batches, not dataA/B).
            if testa is None or testb is None or \
                    testa.size(0) != display_size or \
                    testb.size(0) != display_size:
                testdataA = data_prefetcher(test_loader_a)
                testdataB = data_prefetcher(test_loader_b)
                testa = testdataA.next()
                testb = testdataB.next()
            with torch.no_grad():
                test_image_outputs = trainer.sample(testa, testb)
                train_image_outputs = trainer.sample(dataA, dataB)
            if test_image_outputs is not None and \
                    train_image_outputs is not None:
                write_2images(test_image_outputs, display_size,
                              image_directory, 'test_%08d' % (iterations + 1))
                write_2images(train_image_outputs, display_size,
                              image_directory,
                              'train_%08d' % (iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')

        if (iterations + 1) % config['image_display_iter'] == 0:
            with torch.no_grad():
                image_outputs = trainer.sample(dataA, dataB)
            if image_outputs is not None:
                write_2images(image_outputs, display_size, image_directory,
                              'train_current')

        # Save network weights
        if (iterations + 1) % config['snapshot_save_iter'] == 0:
            trainer.save(checkpoint_directory, iterations)

        iterations += 1
        if iterations >= max_iter:
            writer.close()
            sys.exit('Finish training')
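# data_prefetcher is not defined in this snippet. A minimal sketch of the
# usual CUDA-stream prefetching pattern behind that name; only the .next()
# interface and the None-at-epoch-end behavior come from the loop above,
# the implementation details are assumptions:
import torch

class data_prefetcher:
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self.next_batch = None
        self.preload()

    def preload(self):
        try:
            batch = next(self.loader)
        except StopIteration:
            self.next_batch = None
            return
        # Copy the upcoming batch to the GPU on a side stream so the copy
        # overlaps with the current iteration's compute.
        with torch.cuda.stream(self.stream):
            self.next_batch = batch.cuda(non_blocking=True)

    def next(self):
        # Make the default stream wait for the async copy, hand out the
        # batch, then start prefetching the one after it.
        torch.cuda.current_stream().wait_stream(self.stream)
        batch = self.next_batch
        if batch is not None:
            batch.record_stream(torch.cuda.current_stream())
            self.preload()
        return batch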
def main():
    from utils import (get_all_data_loaders, prepare_sub_folder, write_html,
                       write_loss, get_config, write_2images, Timer)
    import argparse
    from torch.autograd import Variable
    from trainer import MUNIT_Trainer, UNIT_Trainer
    import torch.backends.cudnn as cudnn
    import torch
    # try:
    #     from itertools import izip as zip
    # except ImportError:  # will be 3.x series
    #     pass
    import os
    import sys
    import tensorboardX
    import shutil

    os.environ["CUDA_VISIBLE_DEVICES"] = str(0)
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str,
                        default='configs/edges2handbags_folder.yaml',
                        help='Path to the config file.')
    parser.add_argument('--output_path', type=str, default='.',
                        help="outputs path")
    parser.add_argument("--resume", action="store_true")
    parser.add_argument('--trainer', type=str, default='MUNIT',
                        help="MUNIT|UNIT")
    opts = parser.parse_args()

    cudnn.benchmark = True
    '''
    Note: https://www.pytorchtutorial.com/when-should-we-set-cudnn-benchmark-to-true/
    In most cases this flag lets cuDNN's built-in auto-tuner find the most
    efficient algorithms for the current configuration:
    1. If the input dimensions or types change little between iterations,
       setting torch.backends.cudnn.benchmark = True improves throughput;
    2. If the inputs change every iteration, cuDNN re-tunes each time,
       which can actually reduce throughput.
    '''

    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    display_size = config['display_size']
    config['vgg_model_path'] = opts.output_path

    # Setup model and data loader
    if opts.trainer == 'MUNIT':
        trainer = MUNIT_Trainer(config)
    elif opts.trainer == 'UNIT':
        trainer = UNIT_Trainer(config)
    else:
        sys.exit("Only support MUNIT|UNIT")
    trainer.cuda()
    train_loader_a, train_loader_b, test_loader_a, test_loader_b = \
        get_all_data_loaders(config)
    train_display_images_a = torch.stack(
        [train_loader_a.dataset[i] for i in range(display_size)]).cuda()
    train_display_images_b = torch.stack(
        [train_loader_b.dataset[i] for i in range(display_size)]).cuda()
    test_display_images_a = torch.stack(
        [test_loader_a.dataset[i] for i in range(display_size)]).cuda()
    test_display_images_b = torch.stack(
        [test_loader_b.dataset[i] for i in range(display_size)]).cuda()

    # Setup logger and output folders
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    train_writer = tensorboardX.SummaryWriter(
        os.path.join(opts.output_path + "/logs", model_name))
    output_directory = os.path.join(opts.output_path + "/outputs", model_name)
    checkpoint_directory, image_directory = prepare_sub_folder(
        output_directory)
    # Copy the config file to the output folder.
    shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml'))

    # Start training
    iterations = trainer.resume(checkpoint_directory,
                                hyperparameters=config) if opts.resume else 0
    while True:
        for it, (images_a, images_b) in enumerate(
                zip(train_loader_a, train_loader_b)):
            trainer.update_learning_rate()
            images_a = images_a.cuda().detach()
            images_b = images_b.cuda().detach()

            with Timer("Elapsed time in update: %f"):
                # Main training code
                trainer.dis_update(images_a, images_b, config)
                trainer.gen_update(images_a, images_b, config)
                torch.cuda.synchronize()

            # Dump training stats in log file
            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            # Write images
            if (iterations + 1) % config['image_save_iter'] == 0:
                with torch.no_grad():
                    test_image_outputs = trainer.sample(
                        test_display_images_a, test_display_images_b)
                    train_image_outputs = trainer.sample(
                        train_display_images_a, train_display_images_b)
                write_2images(test_image_outputs, display_size,
                              image_directory, 'test_%08d' % (iterations + 1))
                write_2images(train_image_outputs, display_size,
                              image_directory,
                              'train_%08d' % (iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')

            if (iterations + 1) % config['image_display_iter'] == 0:
                with torch.no_grad():
                    image_outputs = trainer.sample(train_display_images_a,
                                                   train_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              'train_current')

            # Save network weights
            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations)

            iterations += 1
            if iterations >= max_iter:
                sys.exit('Finish training')
                v_input, v_output, v_target = [], [], []
                visual_images = []
                for index, val_data in enumerate(val_loader):
                    if index < config['display_num']:
                        model.feed_data(val_data)
                        model.test()
                        visuals = model.get_current_visuals()
                        v_input.append(visuals['input'])
                        v_output.append(visuals['output'])
                        v_target.append(visuals['target'])
                    else:
                        break
                visual_images.extend(v_input)
                visual_images.extend(v_output)
                visual_images.extend(v_target)
                _write_images(visual_images, config['display_num'],
                              '%s/val_%08d.jpg' % (image_dir, current_step))
                # HTML
                write_html(output_dir + '/index.html', current_step,
                           config['save_image_iter'], 'images')

            # save models
            if current_step % config['save_model_iter'] == 0:
                print('Saving models.')
                model.save(current_step)

    print('Saving the final model.')
    model.save('latest')
    print('End of training.')
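# _write_images is not shown. A plausible sketch that tiles the collected
# input/output/target tensors into a single grid with display_num images
# per row, assuming torchvision is available and each entry is a CHW
# tensor (names and layout are assumptions):
import torch
import torchvision.utils as vutils

def _write_images(image_list, display_num, file_name):
    # Stack the collected tensors and save one grid with display_num images
    # per row, so the input/output/target rows line up vertically.
    image_tensor = torch.stack(image_list)
    grid = vutils.make_grid(image_tensor, nrow=display_num, normalize=True)
    vutils.save_image(grid, file_name)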
def main(opts):
    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    # Override the batch size if specified.
    if opts.batch_size != 0:
        config['batch_size'] = opts.batch_size

    trainer = Trainer(config)
    trainer.cuda()
    if opts.multigpus:
        ngpus = torch.cuda.device_count()
        config['gpus'] = ngpus
        print("Number of GPUs: %d" % ngpus)
        trainer.model = torch.nn.DataParallel(trainer.model,
                                              device_ids=range(ngpus))
    else:
        config['gpus'] = 1

    loaders = get_train_loaders(config)
    train_content_loader = loaders[0]
    train_class_loader = loaders[1]
    test_content_loader = loaders[2]
    test_class_loader = loaders[3]

    # Setup logger and output folders
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    train_writer = SummaryWriter(
        os.path.join(opts.output_path + "/logs", model_name))
    output_directory = os.path.join(opts.output_path + "/outputs", model_name)
    checkpoint_directory, image_directory = make_result_folders(
        output_directory)
    shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml'))
    iterations = trainer.resume(checkpoint_directory,
                                hp=config,
                                multigpus=opts.multigpus) if opts.resume else 0

    while True:
        for it, (co_data, cl_data) in enumerate(
                zip(train_content_loader, train_class_loader)):
            with Timer("Elapsed time in update: %f"):
                d_acc = trainer.dis_update(co_data, cl_data, config)
                g_acc = trainer.gen_update(co_data, cl_data, config,
                                           opts.multigpus)
                torch.cuda.synchronize()
                print('D acc: %.4f\t G acc: %.4f' % (d_acc, g_acc))

            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            if ((iterations + 1) % config['image_save_iter'] == 0
                    or (iterations + 1) % config['image_display_iter'] == 0):
                if (iterations + 1) % config['image_save_iter'] == 0:
                    key_str = '%08d' % (iterations + 1)
                    write_html(output_directory + "/index.html",
                               iterations + 1, config['image_save_iter'],
                               'images')
                else:
                    key_str = 'current'
                with torch.no_grad():
                    for t, (val_co_data, val_cl_data) in enumerate(
                            zip(train_content_loader, train_class_loader)):
                        if t >= opts.test_batch_size:
                            break
                        val_image_outputs = trainer.test(
                            val_co_data, val_cl_data, opts.multigpus)
                        write_1images(val_image_outputs, image_directory,
                                      'train_%s_%02d' % (key_str, t))
                    for t, (test_co_data, test_cl_data) in enumerate(
                            zip(test_content_loader, test_class_loader)):
                        if t >= opts.test_batch_size:
                            break
                        test_image_outputs = trainer.test(
                            test_co_data, test_cl_data, opts.multigpus)
                        write_1images(test_image_outputs, image_directory,
                                      'test_%s_%02d' % (key_str, t))

            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations,
                             opts.multigpus)
                print('Saved model at iteration %d' % (iterations + 1))

            iterations += 1
            if iterations >= max_iter:
                print("Finish Training")
                sys.exit(0)