Ejemplo n.º 1
0
            word2vec.get_similar_words(u"France")))
        # Continues the series of per-word probes: for each literal word, log
        # the value returned by word2vec.get_word_count() next to the value
        # returned by word2vec.get_similar_words() for that same word.
        logger.info(u"Words similar to phone ({}) : {}".format(
            word2vec.get_word_count(u"phone"),
            word2vec.get_similar_words(u"phone")))
        logger.info(u"Words similar to ask ({}) : {}".format(
            word2vec.get_word_count(u"ask"),
            word2vec.get_similar_words(u"ask")))
        logger.info(u"Words similar to september ({}) : {}".format(
            word2vec.get_word_count(u"september"),
            word2vec.get_similar_words(u"september")))
        logger.info(u"Words similar to blue ({}) : {}".format(
            word2vec.get_word_count(u"blue"),
            word2vec.get_similar_words(u"blue")))

        # Test relatedness
        # Metrics.relatedness() is called with the wordsim353 object and the
        # model and returns a pair: the score and a second value reported
        # below as a number of words (presumably the pairs actually scored;
        # verify against Metrics.relatedness).
        relatedness, relatedness_words = Metrics.relatedness(
            wordsim353, word2vec)
        # NOTE(review): this result goes to stdout via print while the probes
        # above go through the logger.
        print(u"Relatedness : {}, on {} words".format(relatedness,
                                                      relatedness_words))

        # If we want a figure
        if args.image is not None:
            # Order by word count
            word_counters = list()
            word_counts = word2vec.get_word_counts()
            for word_text in word_counts.keys():
                word_counters.append((word_text, word_counts[word_text]))
            # end for
            word_counters = sorted(word_counters,
                                   key=lambda tup: tup[1],
                                   reverse=True)
                                    spectral_radius=rc_spectral_radius, w_sparsity=rc_w_sparsity)

    # Add examples
    # Every training document of both authors is passed to the classifier;
    # the label is the author's position in the (author1, author2) pair.
    for author_index, author_id in enumerate((args.author1, args.author2)):
        # str() mirrors the test-set loop below so that both loops build the
        # author directory the same way, whatever type the argument has.
        author_path = os.path.join(args.dataset, "total", str(author_id))
        for file_index in training_set_indexes:
            file_path = os.path.join(author_path, str(file_index) + ".txt")
            # The context manager closes the handle as soon as the text is
            # read instead of leaving one open file per document.
            with io.open(file_path, 'r') as text_file:
                classifier.train(text_file.read(), author_index)
            # end with
        # end for
    # end for

    # Finalize model training
    classifier.finalize(verbose=True)

    # Init test epoch
    # List of (document text, author index) pairs handed to the metric below
    test_set = list()

    # Get text
    for author_index, author_id in enumerate((args.author1, args.author2)):
        author_path = os.path.join(args.dataset, "total", str(author_id))
        for file_index in test_set_indexes:
            file_path = os.path.join(author_path, str(file_index) + ".txt")
            with io.open(file_path, 'r') as text_file:
                test_set.append((text_file.read(), author_index))
            # end with
        # end for
    # end for

    # Success rate
    success_rate = Metrics.success_rate(classifier, test_set, verbose=True, debug=True)
    print(u"Success rate : {}".format(success_rate))

# end if
    # Run the same sequence of checks once per distance measure name
    for distance_measure in ['euclidian', 'cosine', 'cosine_abs']:
        # Banner: the measure name framed by two rules of 100 '#'
        banner_rule = u"#" * 100
        print(banner_rule)
        print(u"# " + distance_measure)
        print(banner_rule)

        # Similarities
        # The word list is rebuilt on every pass, as before, so the callee
        # always receives a fresh list object.
        probe_words = [
            u"he", u"computer", u"million", u"Toronto", u"France", u"phone",
            u"ask", u"september", u"blue", u"king", u"man", u"woman",
        ]
        Visualization.similar_words(
            probe_words,
            word2vec,
            distance_measure=distance_measure,
            limit=args.n_similar_words)

        # Word computing
        Visualization.king_man_woman(
            word2vec, u"king", u"man", u"woman",
            distance_measure=distance_measure)

        # Test relatedness
        relatedness, relatedness_words = Metrics.relatedness(
            wordsim353,
            word2vec,
            distance_measure=distance_measure)
        print(u"Relatedness : {}, on {} words".format(
            relatedness, relatedness_words))
    # end for

    # If we want a figure
    # Two figures are requested only when an image path was supplied: one via
    # top_words_figure(), one via words_figure() for the fixed word list,
    # written under the image path with the "_words" suffix appended.
    if args.image is not None:
        selected_words = [
            u"switzerland", u"france", u"italy", u"spain", u"germany",
            u"canada", u"belgium", u"bern", u"paris", u"rome", u"madrid",
            u"berlin", u"ottawa", u"brussels",
        ]
        Visualization.top_words_figure(
            word2vec, word_embeddings, args.image, args.fig_size,
            args.count_limit)
        Visualization.words_figure(
            selected_words, word2vec, word_embeddings,
            args.image + u"_words", args.fig_size, reduction='PCA')
    # end if

# end if
                    test_set.append((io.open(file_path,
                                             'r').read(), author_index))
                else:
                    # Sentence success rate
                    nlp = spacy.load(args.lang)
                    doc = nlp(io.open(file_path, 'r').read())
                    for sentence in doc.sents:
                        test_set.append((sentence, author_index))
                    # end for
                # end if
            # end for
        # end for

        # Success rate
        # Score the classifier on the collected test samples, forwarding the
        # verbosity and debug switches from the command-line arguments.
        success_rate = Metrics.success_rate(
            classifier, test_set, verbose=args.verbose, debug=args.debug)
        rate_message = u"\t{} - Success rate : {}".format(k, success_rate)
        logger.info(rate_message)

        # Save result
        # Stored under the current k so the rates can be averaged afterwards
        success_rates[k] = success_rate

        # Reset
        # Called once the score is stored, before the enclosing loop moves on
        classifier.reset()
    # end for

    # Overall success rate
    logger.info(u"All - Success rate : {}".format(np.average(success_rates)))

# end if