Code Example #1
def write_bible_chapter(book_abbrev,
                        chapter,
                        words_in_chapter,
                        rows,
                        custom_paragraphing=False):

    description = (f"KJV Bible Chapter Word Frequencies:"
                   f" {bible_book_names[book_abbrev]} {chapter}")

    keywords = [
        "KJV",
        "Bible",
        bible_book_names[book_abbrev],
        f"{bible_book_names[book_abbrev]} {chapter}",
        "chapter",
        "word frequency",
    ]

    book_num = str(get_book_nums()[book_abbrev]).zfill(2)
    html_folder = os.path.join(os.getcwd(), "public_html",
                               f"{book_num}-{book_abbrev.lower()}")
    os.makedirs(html_folder, exist_ok=True)
    csv_file_name = (f"{book_abbrev.lower()}{str(chapter).zfill(3)}"
                     "-word-freq.csv")
    # Include the top 7 words in the page's keywords meta tag
    keywords += get_top_7_words(os.path.join(html_folder, csv_file_name))

    base_template_args = get_base_template_args(description,
                                                ",".join(keywords),
                                                description)

    bible_chapter_text = get_bible_chapter_text(
        book_num,
        book_abbrev,
        chapter,
        custom_paragraphing=custom_paragraphing,
    )

    new_template_args = {
        "images_path": "../images",
        "styles_path": "../styles",
        "bible_book_name": bible_book_names[book_abbrev],
        "book_abbrev": book_abbrev,
        "chapters_in_book": get_verse_counts()[f"{book_abbrev} {chapter}"],
        "chapter": chapter,
        "words_in_bible": "790,663",
        "words_in_chapter": words_in_chapter,
        "csv_file_name": csv_file_name,
        "bible_chapter_text": bible_chapter_text,
        "rows": rows,
    }

    write_html(
        base_template_args,
        new_template_args,
        "templates/bible_chapter.mako",
        html_folder,
        f"{book_abbrev.lower()}{chapter.zfill(3)}-word-freq.html",
    )
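
Examples #1, #7, and #9 pass base template arguments, page-specific arguments, a Mako template path, a folder, and a file name to write_html, whose body is not included in this listing. A minimal sketch of what such a helper could look like, assuming Mako rendering and that page-specific arguments override the shared ones:

import os

from mako.template import Template


def write_html(base_template_args, new_template_args, template_path,
               html_folder, file_name):
    # Merge shared and page-specific template arguments; the page-specific
    # ones win on key collisions.
    template_args = {**base_template_args, **new_template_args}
    html = Template(filename=template_path).render(**template_args)
    with open(os.path.join(html_folder, file_name), "w") as out_file:
        out_file.write(html)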
Code Example #2
    def first_query(self):

        get_rows = self.data.map(lambda row: (row[0], float(row[13])))
        total = get_rows.reduceByKey(lambda x, y: x + y)
        print("primera consulta archivo 2")
        #print(get_rows.collect())
        print(total.collect())

        xxx = get_ejex_ejey(total)
        print(str(xxx[0]))
        print(str(xxx[1]))

        graph_js("archivo2_reporte1", str(xxx[0]), str(xxx[1]), 'pie',
                 'labels', 'values')
        write_html("archivo2_reporte1", "Ingresos de todas las regiones")
Code Example #3
    def first_query(self):
        get_rows = self.data.map(lambda row: (row[3], 1))

        total_race = get_rows.reduceByKey(lambda x, y: x + y)

        total_sort = total_race.sortBy(lambda row: row[1], ascending=False)

        print("primera consulta archivo 3")
        print(total_sort.collect()[:3])

        xxx = get_ejex_ejey(total_sort)
        print(str(xxx[0][:3]))
        print(str(xxx[1][:3]))

        graph_js("archivo3_reporte1", str(xxx[0][:3]), str(xxx[1][:3]), 'bar')
        write_html("archivo3_reporte1", "Top de razas victimas")
Code Example #4
    def third_query(self):

        get_rows = self.data.map(lambda row: (row[2], 1))

        total = get_rows.reduceByKey(lambda x, y: x + y)

        total_ordenado = total.sortBy(lambda row: row[1], ascending=False)

        print("tercera consulta archivo 2")
        #print(get_rows.collect())
        print(total_ordenado.collect()[0:5])

        xxx = get_ejex_ejey(total_ordenado)
        print(str(xxx[0][:5]))
        print(str(xxx[1][:5]))

        graph_js("archivo1_reporte3", str(xxx[0][:5]), str(xxx[1][:5]), 'bar')
        write_html("archivo1_reporte3",
                   "Top 5 de plataformas con mas lanzamientos")
Code Example #5
    def homewort_query(self):
        get_rows = self.data.map(lambda row: (row[2], float(row[11]),
                                              float(row[12]), float(row[13])))

        categories = {'baby food', 'cereal', 'fruits', 'meat',
                      'vegetables', 'beverages', 'snacks'}
        # A single membership test replaces the chained equality checks;
        # the name also avoids shadowing the built-in filter().
        filtered = get_rows.filter(lambda x: x[0].lower() in categories)

        renueve = filtered.map(lambda x: (x[0], x[1])).reduceByKey(
            lambda x, y: x + y)
        costo = filtered.map(lambda x: (x[0], x[2])).reduceByKey(
            lambda x, y: x + y)
        profit = filtered.map(lambda x: (x[0], x[3])).reduceByKey(
            lambda x, y: x + y)

        xxx = get_ejex_ejey(renueve)
        print(str(xxx[0]))
        print(str(xxx[1]))

        strring = graph_js_apilda('renueve', str(xxx[0]), str(xxx[1]), 'bar',
                                  'x', 'y', '1')

        xxx = get_ejex_ejey(costo)
        print(str(xxx[0]))
        print(str(xxx[1]))

        strring += graph_js_apilda('costo', str(xxx[0]), str(xxx[1]), 'bar',
                                   'x', 'y', '2')

        xxx = get_ejex_ejey(profit)
        print(str(xxx[0]))
        print(str(xxx[1]))

        strring += graph_js_apilda('profit', str(xxx[0]), str(xxx[1]), 'bar',
                                   'x', 'y', '3')

        write_js_tarea('tarea_reporte', strring)
        write_html("tarea_reporte", "Tarea")
Code Example #6
    def second_query(self):

        get_rows = self.data.map(lambda row: (row[5], row[4], 1))

        rows_nintendo = get_rows.filter(
            lambda row: row[0].lower() == "nintendo")

        rows_final = rows_nintendo.map(lambda row: (row[1], row[2]))

        total = rows_final.reduceByKey(lambda x, y: x + y)

        print("segunda consulta archivo 2")
        #print(rows_nintendo.collect())
        print(total.collect())
        xxx = get_ejex_ejey(total)
        print(str(xxx[0]))
        print(str(xxx[1]))

        graph_js("archivo1_reporte2", str(xxx[0]), str(xxx[1]), 'pie',
                 'labels', 'values')
        write_html("archivo1_reporte2",
                   "Total de generos publicados por nintendo")
Code Example #7
def write_examples():

    description = "KJV Bible Chapter Word Frequencies: Examples"
    base_template_args = get_base_template_args(
        description,
        ",".join([
            "KJV",
            "Bible",
            "chapter",
            "word frequency",
            "relative frequency",
            "examples",
        ]),
        description,
    )

    with open("examples.md", "r") as read_file:
        examples_source = read_file.read()
    examples_html = markdown2.markdown(examples_source, extras=["tables"])
    examples_html = examples_html.replace('align="right"', 'class="numerical"')
    #   The align attribute which markdown2 puts on th and td elements is
    #   obsolete and will fail HTML validation by the W3C's
    #   [Nu Html Checker](https://validator.w3.org/), so replace it with
    #   CSS styling.
    new_template_args = {
        "images_path": "./images",
        "styles_path": "./styles",
        "examples_html": examples_html,
    }

    html_folder = os.path.join(os.getcwd(), "public_html")
    write_html(
        base_template_args,
        new_template_args,
        "templates/examples.mako",
        html_folder,
        "examples.html",
    )
Code Example #8
File: spider.py Project: ryan147k/spiders
def zhihu_spider():
    url = 'https://www.zhihu.com/question/61170968'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # 'accept-encoding': 'gzip, deflate, br',  # adding this line garbles the response body
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': r'https://www.zhihu.com/signin?next=%2F',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '******',
        'upgrade-insecure-requests': '1'
    }
    cookie_list = CookiePool.get_cookies(Zhihu.cookie_table_name)
    cookie = cookie_list[0][1]
    headers['cookie'] = cookie
    response = requests.get(url, headers=headers)
    # print(response.content)
    write_html(response)
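
Here write_html receives a requests response rather than template arguments, so it presumably just persists the fetched page for offline inspection. A minimal sketch under that assumption (the output file name is a guess):

def write_html(response):
    # Hypothetical sketch: save the raw response body; writing bytes
    # sidesteps any encoding guesswork.
    with open("zhihu.html", "wb") as out_file:
        out_file.write(response.content)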
Code Example #9
def write_bible_book_index(book_abbrev):

    bible_books = get_bible_books()
    bible_book_names = {
        bible_books[bible_book][0]: bible_book
        for bible_book in bible_books
    }
    bible_book_name = bible_book_names[book_abbrev]
    book_length = bible_books[bible_book_name][1]

    description = f"KJV Bible Chapter Word Frequencies: {bible_book_name}"
    base_template_args = get_base_template_args(
        description,
        ",".join(
            ["KJV", "Bible", bible_book_name, "chapter", "word frequency"]),
        description,
    )

    new_template_args = {
        "images_path": "../images",
        "styles_path": "../styles",
        "bible_book_name": bible_book_name,
        "book_abbrev": book_abbrev,
        "chapters_in_book": book_length,
    }

    book_num = f"{str(get_book_nums()[book_abbrev]).zfill(2)}"
    html_folder = os.path.join(os.getcwd(), "public_html")
    chapter_folder = os.path.join(html_folder,
                                  f"{book_num}-{book_abbrev.lower()}")
    os.makedirs(chapter_folder, exist_ok=True)
    write_html(
        base_template_args,
        new_template_args,
        "templates/bible_book_index.mako",
        chapter_folder,
        f"{book_abbrev.lower()}-index.html",
    )
Code Example #10
    def first_query(self):

        get_rows = self.data.map(lambda row: (row[4], float(row[10])))

        genres = {"action", "sports", "fighting", "shooter",
                  "racing", "adventure", "strategy"}
        # A single membership test replaces the chained equality checks.
        genre_filters = get_rows.filter(lambda row: row[0].lower() in genres)

        total = genre_filters.reduceByKey(lambda x, y: x + y)

        #print(genre_filters.collect())
        print(total.collect())

        xxx = get_ejex_ejey(total)
        print(str(xxx[0]))
        print(str(xxx[1]))

        graph_js("archivo1_reporte1", str(xxx[0]), str(xxx[1]), 'bar')
        write_html("archivo1_reporte1", "Ventas globales de la sig categorias")
Code Example #11
    def second_query(self):
        # use the order
        get_rows = self.data.map(lambda row: (row[1], row[5].split("/")[2], int(row[8]))) \
            .filter(lambda row: row[0].lower() == 'guatemala')

        total = get_rows.map(lambda row: (row[1] + "x", row[2])).reduceByKey(
            lambda x, y: x + y)

        orden = total.sortBy(lambda row: row[1], ascending=False)

        #ventas_anio = get_rows.filter(lambda row: row[1]=="2019")
        # GUATEMALA
        print("segunda consulta archivo 2")
        #print(get_rows.collect())
        #print(total.collect())
        print(orden.collect())

        xxx = get_ejex_ejey(orden)
        print(str(xxx[0]))
        print(str(xxx[1]))

        graph_js("archivo2_reporte2", str(xxx[0]), str(xxx[1]), 'bar')
        write_html("archivo2_reporte2", "Año con mas unidades venididas")
Code Example #12
    def third_query(self):

        get_rows = self.data.map(
            lambda row: (row[5].split("/")[2], row[0], float(row[13]), row[3]))
        year2010 = get_rows.filter(
            lambda row: row[0] == "2010" and row[3].lower() == "online")

        total = year2010.map(lambda row: (row[1], row[2])).reduceByKey(
            lambda x, y: x + y)
        total_ordenado = total.sortBy(lambda row: row[1], ascending=False)

        print("tercera consulta archivo 2")
        #print(get_rows.collect())
        #print(year2010.collect())
        #print(total.collect())
        print(total_ordenado.collect())

        xxx = get_ejex_ejey(total_ordenado)
        print(str(xxx[0]))
        print(str(xxx[1]))

        graph_js("archivo2_reporte3", str(xxx[0]), str(xxx[1]), 'bar')
        write_html("archivo2_reporte3", "Año 2010 ventas online x region")
Code Example #13
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            # Write images
            if (iterations + 1) % config['image_save_iter'] == 0:
                with torch.no_grad():
                    test_image_outputs = trainer.sample(
                        test_display_images_a, test_display_images_b)
                    train_image_outputs = trainer.sample(
                        train_display_images_a, train_display_images_b)
                write_2images(test_image_outputs, display_size,
                              image_directory, 'test_%08d' % (iterations + 1))
                write_2images(train_image_outputs, display_size,
                              image_directory, 'train_%08d' % (iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')

            if (iterations + 1) % config['image_display_iter'] == 0:
                with torch.no_grad():
                    image_outputs = trainer.sample(train_display_images_a,
                                                   train_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              'train_current')

            # Save network weights
            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations)

            iterations += 1
            if iterations >= max_iter:
                sys.exit('Finish training')
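
The MUNIT-style training loops in this and the following examples call write_html(path, iterations, image_save_iter, image_directory) to regenerate an index page linking the image grids saved so far. The sketch below is not the project's actual utils.write_html, just a simplified illustration of the idea:

def write_html(filename, iterations, image_save_iter, image_directory):
    # Simplified sketch (not the real MUNIT utils.write_html): link the
    # periodically saved test/train image grids, newest first.
    with open(filename, "w") as html_file:
        html_file.write("<html><body>\n")
        for i in range(iterations, 0, -image_save_iter):
            html_file.write("<h3>iteration %08d</h3>\n" % i)
            html_file.write('<img src="%s/test_%08d.jpg"><br>\n'
                            % (image_directory, i))
            html_file.write('<img src="%s/train_%08d.jpg"><br>\n'
                            % (image_directory, i))
        html_file.write("</body></html>\n")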
Code Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config',
                        type=str,
                        default='configs/edges2handbags_folder.yaml',
                        help='Path to the config file.')
    parser.add_argument('--output_path',
                        type=str,
                        default='.',
                        help="outputs path")
    # Resume option. Note: with action="store_true" the string default
    # '150000' is always truthy, so resume is effectively always enabled;
    # remove the default (argparse then uses False) to make --resume a
    # true opt-in flag.
    parser.add_argument("--resume", default='150000', action="store_true")
    parser.add_argument('--trainer',
                        type=str,
                        default='MUNIT',
                        help="MUNIT|UNIT")
    opts = parser.parse_args()

    cudnn.benchmark = True

    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    display_size = config['display_size']
    config['vgg_model_path'] = opts.output_path

    # Setup model and data loader
    if opts.trainer == 'MUNIT':
        trainer = MUNIT_Trainer(config)
    elif opts.trainer == 'UNIT':
        trainer = UNIT_Trainer(config)
    else:
        sys.exit("Only support MUNIT|UNIT")
    trainer.cuda()
    train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders(
        config)
    train_display_images_a = torch.stack(
        [train_loader_a.dataset[i] for i in range(display_size)]).cuda()
    train_display_images_b = torch.stack(
        [train_loader_b.dataset[i] for i in range(display_size)]).cuda()
    test_display_images_a = torch.stack(
        [test_loader_a.dataset[i] for i in range(display_size)]).cuda()
    test_display_images_b = torch.stack(
        [test_loader_b.dataset[i] for i in range(display_size)]).cuda()

    # Setup logger and output folders
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    train_writer = tensorboardX.SummaryWriter(
        os.path.join(opts.output_path + "/logs", model_name))
    output_directory = os.path.join(opts.output_path + "/outputs", model_name)
    checkpoint_directory, image_directory = prepare_sub_folder(
        output_directory)
    shutil.copy(opts.config, os.path.join(
        output_directory, 'config.yaml'))  # copy config file to output folder

    # Start training
    iterations = trainer.resume(checkpoint_directory,
                                hyperparameters=config) if opts.resume else 0
    while True:
        for it, (images_a,
                 images_b) in enumerate(zip(train_loader_a, train_loader_b)):
            trainer.update_learning_rate()
            images_a, images_b = images_a.cuda().detach(), images_b.cuda(
            ).detach()

            with Timer("Elapsed time in update: %f"):
                # Main training code
                trainer.dis_update(images_a, images_b, config)
                trainer.gen_update(images_a, images_b, config)
                torch.cuda.synchronize()

            # Dump training stats in log file
            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            # Write images
            if (iterations + 1) % config['image_save_iter'] == 0:
                with torch.no_grad():
                    test_image_outputs = trainer.sample(
                        test_display_images_a, test_display_images_b)
                    train_image_outputs = trainer.sample(
                        train_display_images_a, train_display_images_b)
                write_2images(test_image_outputs, display_size,
                              image_directory, 'test_%08d' % (iterations + 1))
                write_2images(train_image_outputs, display_size,
                              image_directory, 'train_%08d' % (iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')

            if (iterations + 1) % config['image_display_iter'] == 0:
                with torch.no_grad():
                    image_outputs = trainer.sample(train_display_images_a,
                                                   train_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              'train_current')

            # Save network weights
            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations)

            iterations += 1
            if iterations >= max_iter:
                sys.exit('Finish training')
Code Example #15
def main(argv):
    (opts, args) = parser.parse_args(argv)
    cudnn.benchmark = True
    model_name = os.path.splitext(os.path.basename(opts.config))[0]

    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    display_size = config['display_size']

    # Setup model and data loader
    trainer = MUNIT_Trainer(config)
    trainer.cuda()
    train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders(
        config)
    # Note: Variable(..., volatile=True) and izip below are legacy
    # Python 2 / pre-0.4 PyTorch idioms retained from this older code.
    test_display_images_a = Variable(torch.stack(
        [test_loader_a.dataset[i] for i in range(display_size)]).cuda(),
                                     volatile=True)
    test_display_images_b = Variable(torch.stack(
        [test_loader_b.dataset[i] for i in range(display_size)]).cuda(),
                                     volatile=True)
    train_display_images_a = Variable(torch.stack(
        [train_loader_a.dataset[i] for i in range(display_size)]).cuda(),
                                      volatile=True)
    train_display_images_b = Variable(torch.stack(
        [train_loader_b.dataset[i] for i in range(display_size)]).cuda(),
                                      volatile=True)

    # Setup logger and output folders
    train_writer = tensorboard.SummaryWriter(os.path.join(
        opts.log, model_name))
    output_directory = os.path.join(opts.outputs, model_name)
    checkpoint_directory, image_directory = prepare_sub_folder(
        output_directory)
    shutil.copy(opts.config, os.path.join(
        output_directory, 'config.yaml'))  # copy config file to output folder

    # Start training
    iterations = trainer.resume(checkpoint_directory) if opts.resume else 0
    while True:
        for it, (images_a,
                 images_b) in enumerate(izip(train_loader_a, train_loader_b)):
            trainer.update_learning_rate()
            images_a, images_b = Variable(images_a.cuda()), Variable(
                images_b.cuda())

            # Main training code
            trainer.dis_update(images_a, images_b, config)
            trainer.gen_update(images_a, images_b, config)

            # Dump training stats in log file
            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            # Write images
            if (iterations + 1) % config['image_save_iter'] == 0:
                # Test set images
                image_outputs = trainer.sample(test_display_images_a,
                                               test_display_images_b)
                write_images(
                    image_outputs, display_size,
                    '%s/gen_test%08d.jpg' % (image_directory, iterations + 1))
                # Train set images
                image_outputs = trainer.sample(train_display_images_a,
                                               train_display_images_b)
                write_images(
                    image_outputs, display_size,
                    '%s/gen_train%08d.jpg' % (image_directory, iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')
            if (iterations + 1) % config['image_save_iter'] == 0:
                image_outputs = trainer.sample(test_display_images_a,
                                               test_display_images_b)
                write_images(image_outputs, display_size,
                             '%s/gen.jpg' % image_directory)

            # Save network weights
            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations)

            iterations += 1
            if iterations >= max_iter:
                return
Code Example #16
File: train.py Project: vict0rsch/MUNIT
                image_directory,
                "test_%08d" % (iterations + 1),
                comet_exp=comet_exp,
            )
            write_2images(
                train_image_outputs,
                display_size,
                image_directory,
                "train_%08d" % (iterations + 1),
                comet_exp=comet_exp,
            )
            # HTML
            write_html(
                output_directory + "/index.html",
                iterations + 1,
                config["image_save_iter"],
                "images",
                comet_exp=comet_exp,
            )

        if (iterations + 1) % config["image_display_iter"] == 0:
            with torch.no_grad():
                image_outputs = trainer.sample(train_display_images_a,
                                               train_display_images_b)
            write_2images(image_outputs, display_size, image_directory,
                          "train_current")

        # Save network weights
        if (iterations + 1) % config["snapshot_save_iter"] == 0:
            trainer.save(checkpoint_directory, iterations)
Code Example #17
File: comments.py Project: Kipparis/textanalys
    def process_data(self):
        print("#" * 10, "Processing data", "#" * 10)

        grades = []
        # Create the comment objects
        for raw in zip(self.grade, self.owned, self.reviews,
                self.ingame_hours, self.helpful, self.funny, self.texts):
            # Pass the text and the grade into the new comment
            comm = Comment(raw[0], raw[-1])
            self.comments.append(comm)
            grades.append(comm.grade)
            if raw[0] == "1":
                Comments.positive_comments.append(comm)
            else:
                Comments.negative_comments.append(comm)

        # Save the number of positive and negative comments
        
        comments_count = {
            "pos_comm": len(Comments.positive_comments),
            "neg_comm": len(Comments.negative_comments)
            }
        with open('data/comments_count.json', 'w') as file:
            json.dump(comments_count, file)

        Comments.grades = np.array(grades)
        np.save("data/grades.npy", Comments.grades)
        
        count = 1
        if ut.path_exists('data/words_count.json'):
            with open('data/words_count.json', 'r', encoding='utf-8') as file:
                print("Loading words_count from json")
                Comments.features_count = json.load(file)
                count = 0

        # Compute a feature vector for every comment
        # (from utils import Watcher is used to track progress)
        Comments.stupid_comments = ''

        wt = Watcher(len(self.comments))
        for comm in self.comments:
            wt.display_load(self.comments.index(comm), "making vector")
            comm.make_vector()
            # Count this comment's features and add them to the running
            # totals across all comments
            if count: comm.count()


        if not ut.path_exists('data/words_count.json'):
            with open('data/words_count.json', 'w') as file:
                json.dump(Comments.features_count, file, ensure_ascii=False)
        
        count = 1
        if ut.path_exists('data/idf.json'):
            with open('data/idf.json', 'r', encoding='utf-8') as file:
                print("Loading idf from json")
                Comments.idf = json.load(file)
                for comm in Comments.comments:
                    wt.display_load(Comments.comments.index(comm), "loading tf-idf")
                    comm.load_tf_idf()
                count = 0
        if count:
            for comm in self.comments:
                wt.display_load(self.comments.index(comm), "counting tf-idf")
                # Compute tf-idf and immediately drop the useless words
                comm.count_tf_idf()
                # pprint(comm.tf_idf)

        if not ut.path_exists('data/idf.json'):
            with open('data/idf.json', 'w') as file:
                json.dump(Comments.idf, file, ensure_ascii=False)

        # Output the useless words
        ut.write_html("data/tf-idf-useless.txt", Comment.tf_idf_words)
        print("OUTPUTED DATA/TF-IDF USELESS")

        # Build the target_names array
        # (count the number of distinct words)
        target_names = set()
        for comm in self.comments:
            wt.display_load(self.comments.index(comm), "creating target names")
            # print("Comm {} from {}\tCreating target names".format(self.comments.index(comm), len(self.comments)))
            for feature in comm.features:
                target_names.add(feature)

        Comments.target_names = sorted(list(target_names))

        target_names_text = ""
        for name in Comments.target_names:
            target_names_text += name + "\n::|::\n" 
        ut.write_html("data/target_names.txt", target_names_text)

        print("Comments.target_names len is:\t{}".format(len(Comments.target_names)))

        
        wt = Watcher(len(Comments.target_names))

        # Load from cache so everything is not recomputed
        count = 1
        if ut.path_exists('data/target_names_indexes.json'):
            with open('data/target_names_indexes.json', 'r', encoding='utf-8') as file:
                print("Loading target_names_indexes from json")
                Comments.target_names_dict = json.load(file)
                count = 0

        
        oldLetter = ''
        endIndex = 0
        startIndex = 0
        if count:
            # Work by indexes: once a letter's end index is found, close
            # that range. No inner loop is needed: keep a *current letter*
            # variable and, whenever the letter changes, set the end index
            # because a new group starts there.
            for j in range(0, len(Comments.target_names)):
                name = Comments.target_names[j]
                if name == '':
                    continue
                wt.display_load(j, "counting indexes")
                letter = name[0]

                if letter != oldLetter:
                    endIndex = j
                    if j != 0:
                        Comments.target_names_dict[oldLetter] = str(startIndex) + ":" + str(endIndex)
                    startIndex = j
                    oldLetter = letter

                

        # Save what was just computed, since the process is extremely expensive
        if not ut.path_exists('data/target_names_indexes.json'):
            with open('data/target_names_indexes.json', 'w') as file:
                json.dump(Comments.target_names_dict, file, ensure_ascii=False)
                print("Ended saving file")
        
        wt = Watcher(len(Comments.comments))

        # TODO: implement loading of the logarithmic term from cache

        # Compute delta tf-idf
        for comm in self.comments:
            wt.display_load(self.comments.index(comm), "counting delta tf-idf")
            comm.count_values()

        ut.write_html('data/delta_tf_idf_log.txt', Comments.output)

        # Save the logarithmic term
        with open('data/delta_tf_idf_frac.json', 'w') as file:
            json.dump(Comments.delta_tf_idf_frac, file, ensure_ascii=False)
            print("Ended saving file")


        data = sparse.lil_matrix((len(Comments.comments), len(Comments.target_names)))
        comments_len = len(Comments.comments)

        for i in range(0, comments_len):
            wt.display_load(i, "editing matrix")
            comment = Comments.comments[i]

            for feature in comment.features:
                if feature == '':
                    continue
                if feature not in comment.values:
                    continue
                
                # find the index range for each feature
                first_letter = feature[0]

                if first_letter not in Comments.target_names_dict:
                    continue

                start_ind = int(Comments.target_names_dict[first_letter].split(':')[0])
                end_ind = int(Comments.target_names_dict[first_letter].split(':')[-1])

                for j in range(start_ind, end_ind):
                    if Comments.target_names[j] == feature:
                        data[i, j] = comment.values[feature]


        Comments.data = data

        print("Data shape:\t{}".format(Comments.data.shape))
        print("Grade shape:\t{}".format(Comments.grades.shape))
        print("Target_name len:\t{}".format(len(Comments.target_names)))


        # Save the matrix
        # np.save("data/data.npy", Comments.data)
        # sparse.save_npz('data/data.npz', data)
        Comments.save_sparse_matrix(Comments, "data/data.npz", Comments.data)
Code Example #18
    def parse_json_comments(self, response):

        print("==============\nstart parsing json\n===============")

        num = re.compile(
            r'[0-9]+\.?[0-9]*')  # Regular expression for detecting a number

        data = json.loads(response.body)
        ut.write_html(self.dest + "comments.html", data['html'])

        html = data['html'].replace('<br>',
                                    '\n')  # Replace <br> to keep comments intact

        selector = Selector(text=html)
        selector.remove_namespaces()

        output = ""

        # Use a regular expression to keep only complete review blocks
        review_boxes = selector.xpath(
            r"//div[re:test(@class, '\Areview_box\s*\Z')]")
        for review in review_boxes:
            output += "\n=======================\n"

            if review.css('div.persona_name') is None:
                continue  # Skip if it does not exist

            persona_name = review.css('div.persona_name')

            if persona_name.css('a::text').extract_first() is None:
                name = "i have to search in span"
                continue
            else:
                name = str(persona_name.css('a::text').extract_first())

            if persona_name.css('a::attr(href)').extract_first() is None:
                url = "have to search in another place"
                continue
            else:
                url = str(persona_name.css('a::attr(href)').extract_first())

            if url != "None" and url is not None:
                person_id = url.split('/')[-2]
            else:
                person_id = "Doesn't exist"

            if review.css(
                    'div.num_owned_games a::text').extract_first() is None:
                num_owned_games = "Didn't find"
                continue
            else:
                num_owned_games = str(
                    review.css('div.num_owned_games a::text').extract_first()
                ).split(' ')[-1]
                num_owned_games = num_owned_games.replace(',', '')
                num_owned_games = num_owned_games.replace('.', '')

            if review.css('div.num_reviews a::text').extract_first() is None:
                num_reviews = "Didn't find"
                continue
            else:
                num_reviews_text = review.css(
                    'div.num_reviews a::text').extract_first().strip()
                if num.match(num_reviews_text):
                    num_reviews = (num.findall(num_reviews_text))[0].strip()
                    num_reviews = num_reviews.replace(',', '')
                    num_reviews = num_reviews.replace('.', '')
                else:
                    num_reviews = "0"

            if review.xpath('.//div[contains(@class, "title ellipsis")]/text()'
                            ).extract_first() is None:
                grade = "Didn't find"
                continue
            else:
                grade = review.xpath(
                    './/div[contains(@class, "title ellipsis")]/text()'
                ).extract_first()
                if grade == "Рекомендую":
                    grade = "1"
                else:
                    grade = "0"

            if review.xpath('.//div[contains(@class, "hours ellipsis")]/text()'
                            ).extract_first() is None:
                hours = "Didn't find"
                continue
            else:
                hours = review.xpath(
                    './/div[contains(@class, "hours ellipsis")]/text()'
                ).extract_first()
                hours = hours.split(' ')[-2].replace('.', '')
                hours = hours.replace(',', '')

            if review.css('div.vote_info::text').extract_first() is None:
                num_useful = "Didn't find"
                num_funny = "Didn't find"
                continue
            else:
                useful = "Not found"
                funny = "Not found"

                num_useful = '0'
                num_funny = '0'

                votes_info = review.css('div.vote_info::text').extract()

                for _ in votes_info:
                    votes = _.splitlines()
                    for vote in votes:
                        if 'полезным' in vote:
                            useful = vote.strip()
                            num_useful = num.findall(useful)[0].strip()
                        elif 'забавным' in vote:
                            funny = vote.strip()
                            num_funny = num.findall(funny)[0].strip()

            if review.css('div.content::text').extract_first() is None:
                text = "None"
                continue
            else:
                text = review.css('div.content::text').extract_first()

            output += "Name\tis:\t{}\n".format(name)
            output += "Url\tis:\t{}\n".format(url)
            output += "Id \tis:\t{}\n".format(person_id)
            output += "Owned games:\t{}\n".format(num_owned_games)
            output += "Num reviews:\t{}\n".format(num_reviews)
            output += "Grade\tis:\t{}\n".format(grade)
            output += "Ingame hours:\t{}\n".format(hours)

            output += "People think it helpful:\t{}\n".format(num_useful)
            output += "People think it funny:\t\t{}\n".format(num_funny)

            # output += "Text:\n{}\n".format(text)

            Comments.add_comment(Comments, text, num_owned_games, num_reviews,
                                 grade, hours, num_useful, num_funny)

            output += "=======================\n"

        ut.write_html(self.dest + "reviewers.txt", output)

        # output = ""
        # comments = selector.css('div.review_box').css('div.content::text').extract()
        # for comment in comments:
        #     comment = comment.strip()
        #     if not comment:
        #         continue    # Skip empty lines
        #     output += "\n=============================\n"
        #     output += comment
        #     output += "\n=============================\n"

        # ut.write_html(self.dest + 'comments.txt', output)

        print("==============\nended parsing json\n===============")
Code Example #19
                            os.path.join(
                                image_directory, 'gen_b2a_' + 'train_%08d' %
                                (iterations + 1) + '.jpg'),
                            filename='train_gen_b2a_im-iteration: ' +
                            str(iterations) + '.jpg')
                        telegram_bot_send_document(
                            os.path.join(
                                image_directory, 'gen_b2a_' + 'test_%08d' %
                                (iterations + 1) + '.jpg'),
                            filename='test_gen_b2a_im-iteration: ' +
                            str(iterations) + '.jpg')

                # HTML
                write_html(output_directory + "/index.html",
                           iterations + 1,
                           config['image_save_iter'],
                           'images',
                           do_a2b=config['do_a2b'],
                           do_b2a=config['do_b2a'])

            if (iterations + 1) % config['image_display_iter'] == 0:
                with torch.no_grad():
                    image_outputs = trainer.sample(train_display_images_a,
                                                   train_display_images_b)

                write_2images(image_outputs,
                              display_size * config['council']['council_size'],
                              image_directory,
                              'train_current',
                              do_a2b=config['do_a2b'],
                              do_b2a=config['do_b2a'])
Code Example #20
File: train.py Project: phonx/MUNIT
        # Dump training stats in log file
        if (iterations + 1) % config['log_iter'] == 0:
            print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
            write_loss(iterations, trainer, train_writer)

        # Write images
        if (iterations + 1) % config['image_save_iter'] == 0:
            # Test set images
            image_outputs = trainer.sample(test_display_images_a, test_display_images_b)
            write_2images(image_outputs, display_size, image_directory, 'test_%08d' % (iterations + 1))
            # Train set images
            image_outputs = trainer.sample(train_display_images_a, train_display_images_b)
            write_2images(image_outputs, display_size, image_directory, 'train_%08d' % (iterations + 1))
            # HTML
            write_html(output_directory + "/index.html", iterations + 1, config['image_save_iter'], 'images')

        if (iterations + 1) % config['image_display_iter'] == 0:
            train_display_images_a = Variable(torch.stack([train_loader_a.dataset[i] for i in range(display_size)]).cuda(), volatile=True)
            train_display_images_b = Variable(torch.stack([train_loader_b.dataset[i] for i in range(display_size)]).cuda(), volatile=True)
            image_outputs = trainer.sample(train_display_images_a, train_display_images_b)
            write_2images(image_outputs, display_size, image_directory, 'train_current')

        # Save network weights
        if (iterations + 1) % config['snapshot_save_iter'] == 0:
            trainer.save(checkpoint_directory, iterations)

        iterations += 1
        if iterations >= max_iter:
            sys.exit('Finish training')
Code Example #21
def main():
    cudnn.benchmark = True
    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    display_size = config['display_size']
    config['vgg_model_path'] = opts.output_path

    # Setup model and data loader
    trainer = UNIT_Trainer(config)
    if torch.cuda.is_available():
        trainer.cuda(config['gpuID'])
    train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders(
        config)

    # Setup logger and output folders
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    writer = SummaryWriter(os.path.join(opts.output_path + "/logs",
                                        model_name))
    output_directory = os.path.join(opts.output_path + "/outputs", model_name)
    checkpoint_directory, image_directory = prepare_sub_folder(
        output_directory)
    shutil.copy(opts.config, os.path.join(
        output_directory, 'config.yaml'))  # copy config file to output folder

    print('start training !!')
    # Start training
    iterations = trainer.resume(checkpoint_directory,
                                hyperparameters=config) if opts.resume else 0

    TraindataA = data_prefetcher(train_loader_a)
    TraindataB = data_prefetcher(train_loader_b)
    testdataA = data_prefetcher(test_loader_a)
    testdataB = data_prefetcher(test_loader_b)

    while True:
        dataA = TraindataA.next()
        dataB = TraindataB.next()
        if dataA is None or dataB is None:
            TraindataA = data_prefetcher(train_loader_a)
            TraindataB = data_prefetcher(train_loader_b)
            dataA = TraindataA.next()
            dataB = TraindataB.next()
        with Timer("Elapsed time in update: %f"):
            # Main training code
            for _ in range(3):
                trainer.content_update(dataA, dataB, config)
            trainer.dis_update(dataA, dataB, config)
            trainer.gen_update(dataA, dataB, config)
            # torch.cuda.synchronize()
        trainer.update_learning_rate()
        # Dump training stats in log file
        if (iterations + 1) % config['log_iter'] == 0:
            print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
            write_loss(iterations, trainer, writer)
        if (iterations + 1) % config['image_save_iter'] == 0:
            testa = testdataA.next()
            testb = testdataB.next()
            # Validate the freshly fetched test batches
            if testa is None or testb is None or testa.size(
                    0) != display_size or testb.size(0) != display_size:
                testdataA = data_prefetcher(test_loader_a)
                testdataB = data_prefetcher(test_loader_b)
                testa = testdataA.next()
                testb = testdataB.next()
            with torch.no_grad():
                test_image_outputs = trainer.sample(testa, testb)
                train_image_outputs = trainer.sample(dataA, dataB)
            if test_image_outputs is not None and train_image_outputs is not None:
                write_2images(test_image_outputs, display_size,
                              image_directory, 'test_%08d' % (iterations + 1))
                write_2images(train_image_outputs, display_size,
                              image_directory, 'train_%08d' % (iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')

        if (iterations + 1) % config['image_display_iter'] == 0:
            with torch.no_grad():
                image_outputs = trainer.sample(dataA, dataB)
            if image_outputs is not None:
                write_2images(image_outputs, display_size, image_directory,
                              'train_current')

        # Save network weights
        if (iterations + 1) % config['snapshot_save_iter'] == 0:
            trainer.save(checkpoint_directory, iterations)

        iterations += 1
        if iterations >= max_iter:
            writer.close()
            sys.exit('Finish training')
Code Example #22
def main():
    from utils import get_all_data_loaders, prepare_sub_folder, write_html, write_loss, get_config, write_2images, Timer
    import argparse
    from torch.autograd import Variable
    from trainer import MUNIT_Trainer, UNIT_Trainer
    import torch.backends.cudnn as cudnn
    import torch

    # try:
    #     from itertools import izip as zip
    # except ImportError:  # will be 3.x series
    #     pass

    import os
    import sys
    import tensorboardX
    import shutil

    os.environ["CUDA_VISIBLE_DEVICES"] = str(0)

    parser = argparse.ArgumentParser()
    parser.add_argument('--config',
                        type=str,
                        default='configs/edges2handbags_folder.yaml',
                        help='Path to the config file.')
    parser.add_argument('--output_path',
                        type=str,
                        default='.',
                        help="outputs path")
    parser.add_argument("--resume", action="store_true")
    parser.add_argument('--trainer',
                        type=str,
                        default='MUNIT',
                        help="MUNIT|UNIT")
    opts = parser.parse_args()

    cudnn.benchmark = True
    '''
    Note: https://www.pytorchtutorial.com/when-should-we-set-cudnn-benchmark-to-true/
        In most cases, setting this flag lets cuDNN's built-in auto-tuner
        search for the algorithms best suited to the current configuration,
        improving runtime efficiency:
        1.  if the network's input dimensions and types change little,
            setting torch.backends.cudnn.benchmark = True can improve
            efficiency;
        2.  if the inputs change on every iteration, cuDNN re-runs the
            search each time, which can actually reduce efficiency.
    '''

    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    display_size = config['display_size']
    config['vgg_model_path'] = opts.output_path

    # Setup model and data loader
    if opts.trainer == 'MUNIT':
        trainer = MUNIT_Trainer(config)
    elif opts.trainer == 'UNIT':
        trainer = UNIT_Trainer(config)
    else:
        sys.exit("Only support MUNIT|UNIT")
    trainer.cuda()
    train_loader_a, train_loader_b, test_loader_a, test_loader_b = get_all_data_loaders(
        config)
    train_display_images_a = torch.stack(
        [train_loader_a.dataset[i] for i in range(display_size)]).cuda()
    train_display_images_b = torch.stack(
        [train_loader_b.dataset[i] for i in range(display_size)]).cuda()
    test_display_images_a = torch.stack(
        [test_loader_a.dataset[i] for i in range(display_size)]).cuda()
    test_display_images_b = torch.stack(
        [test_loader_b.dataset[i] for i in range(display_size)]).cuda()

    # Setup logger and output folders
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    train_writer = tensorboardX.SummaryWriter(
        os.path.join(opts.output_path + "/logs", model_name))
    output_directory = os.path.join(opts.output_path + "/outputs", model_name)
    checkpoint_directory, image_directory = prepare_sub_folder(
        output_directory)
    shutil.copy(opts.config, os.path.join(
        output_directory, 'config.yaml'))  # copy config file to output folder

    # Start training
    iterations = trainer.resume(checkpoint_directory,
                                hyperparameters=config) if opts.resume else 0
    while True:
        for it, (images_a,
                 images_b) in enumerate(zip(train_loader_a, train_loader_b)):
            trainer.update_learning_rate()
            images_a, images_b = images_a.cuda().detach(), images_b.cuda(
            ).detach()

            with Timer("Elapsed time in update: %f"):
                # Main training code
                trainer.dis_update(images_a, images_b, config)
                trainer.gen_update(images_a, images_b, config)
                torch.cuda.synchronize()

            # Dump training stats in log file
            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            # Write images
            if (iterations + 1) % config['image_save_iter'] == 0:
                with torch.no_grad():
                    test_image_outputs = trainer.sample(
                        test_display_images_a, test_display_images_b)
                    train_image_outputs = trainer.sample(
                        train_display_images_a, train_display_images_b)
                write_2images(test_image_outputs, display_size,
                              image_directory, 'test_%08d' % (iterations + 1))
                write_2images(train_image_outputs, display_size,
                              image_directory, 'train_%08d' % (iterations + 1))
                # HTML
                write_html(output_directory + "/index.html", iterations + 1,
                           config['image_save_iter'], 'images')

            if (iterations + 1) % config['image_display_iter'] == 0:
                with torch.no_grad():
                    image_outputs = trainer.sample(train_display_images_a,
                                                   train_display_images_b)
                write_2images(image_outputs, display_size, image_directory,
                              'train_current')

            # Save network weights
            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations)

            iterations += 1
            if iterations >= max_iter:
                sys.exit('Finish training')
Code Example #23
File: train.py Project: Zheng222/DMFN
            v_input, v_output, v_target = [], [], []
            visual_images = []
            for index, val_data in enumerate(val_loader):
                if index < config['display_num']:
                    model.feed_data(val_data)
                    model.test()
                    visuals = model.get_current_visuals()
                    v_input.append(visuals['input'])
                    v_output.append(visuals['output'])
                    v_target.append(visuals['target'])
                else:
                    break

            visual_images.extend(v_input)
            visual_images.extend(v_output)
            visual_images.extend(v_target)
            _write_images(visual_images, config['display_num'],
                          '%s/val_%08d.jpg' % (image_dir, current_step))
            # HTML
            write_html(output_dir + '/index.html', current_step,
                       config['save_image_iter'], 'images')

        # save models
        if current_step % config['save_model_iter'] == 0:
            print('Saving models.')
            model.save(current_step)

print('Saving the final model.')
model.save('latest')
print('End of training.')
Code Example #24
def main(opts):
    # Load experiment setting
    config = get_config(opts.config)
    max_iter = config['max_iter']
    # Override the batch size if specified.
    if opts.batch_size != 0:
        config['batch_size'] = opts.batch_size

    trainer = Trainer(config)
    trainer.cuda()
    if opts.multigpus:
        ngpus = torch.cuda.device_count()
        config['gpus'] = ngpus
        print("Number of GPUs: %d" % ngpus)
        trainer.model = torch.nn.DataParallel(trainer.model,
                                              device_ids=range(ngpus))
    else:
        config['gpus'] = 1

    loaders = get_train_loaders(config)
    train_content_loader = loaders[0]
    train_class_loader = loaders[1]
    test_content_loader = loaders[2]
    test_class_loader = loaders[3]

    # Setup logger and output folders
    model_name = os.path.splitext(os.path.basename(opts.config))[0]
    train_writer = SummaryWriter(
        os.path.join(opts.output_path + "/logs", model_name))
    output_directory = os.path.join(opts.output_path + "/outputs", model_name)
    checkpoint_directory, image_directory = make_result_folders(
        output_directory)
    shutil.copy(opts.config, os.path.join(output_directory, 'config.yaml'))

    iterations = trainer.resume(checkpoint_directory,
                                hp=config,
                                multigpus=opts.multigpus) if opts.resume else 0

    while True:
        for it, (co_data, cl_data) in enumerate(
                zip(train_content_loader, train_class_loader)):
            with Timer("Elapsed time in update: %f"):
                d_acc = trainer.dis_update(co_data, cl_data, config)
                g_acc = trainer.gen_update(co_data, cl_data, config,
                                           opts.multigpus)
                torch.cuda.synchronize()
                print('D acc: %.4f\t G acc: %.4f' % (d_acc, g_acc))

            if (iterations + 1) % config['log_iter'] == 0:
                print("Iteration: %08d/%08d" % (iterations + 1, max_iter))
                write_loss(iterations, trainer, train_writer)

            if ((iterations + 1) % config['image_save_iter'] == 0
                    or (iterations + 1) % config['image_display_iter'] == 0):
                if (iterations + 1) % config['image_save_iter'] == 0:
                    key_str = '%08d' % (iterations + 1)
                    write_html(output_directory + "/index.html",
                               iterations + 1, config['image_save_iter'],
                               'images')
                else:
                    key_str = 'current'
                with torch.no_grad():
                    for t, (val_co_data, val_cl_data) in enumerate(
                            zip(train_content_loader, train_class_loader)):
                        if t >= opts.test_batch_size:
                            break
                        val_image_outputs = trainer.test(
                            val_co_data, val_cl_data, opts.multigpus)
                        write_1images(val_image_outputs, image_directory,
                                      'train_%s_%02d' % (key_str, t))
                    for t, (test_co_data, test_cl_data) in enumerate(
                            zip(test_content_loader, test_class_loader)):
                        if t >= opts.test_batch_size:
                            break
                        test_image_outputs = trainer.test(
                            test_co_data, test_cl_data, opts.multigpus)
                        write_1images(test_image_outputs, image_directory,
                                      'test_%s_%02d' % (key_str, t))

            if (iterations + 1) % config['snapshot_save_iter'] == 0:
                trainer.save(checkpoint_directory, iterations, opts.multigpus)
                print('Saved model at iteration %d' % (iterations + 1))

            iterations += 1
            if iterations >= max_iter:
                print("Finish Training")
                sys.exit(0)