Exemple #1
0
def check(in_filename, sen_repr_path, db_path, word_repr, dictionary):
    """Cross-check the generated DB against the sentence/word representations.

    For every sentence that survives filtering, the sentence vector and the
    last-word vector stored in the DB are rendered with ``c.vector2string``
    and compared with the vectors looked up from the representation file and
    from the dictionary. A pass/fail summary, followed by every mismatching
    pair, is printed.

    Args:
        in_filename: path passed to ``r.read_files``; each filtered entry is
            used as ``(line number in sen_repr_path, list of word ids)``.
        sen_repr_path: text file holding one whitespace-separated float
            vector per line.
        db_path: path passed to ``r.read_files_float``.
        word_repr: word representation path passed to ``r.read_dictionary``.
        dictionary: dictionary path passed to ``r.read_dictionary``.
    """
    # read the data set
    db = r.read_files_float(db_path)
    # read dictionary
    d = r.read_dictionary(dictionary, word_repr)

    # read the original sentences indices and filter them
    orig_sent = r.read_files(in_filename)
    f1_sent = f.remove_long_short_sentences(orig_sent)
    gc.collect()
    f2_sent = f.remove_unknown(f1_sent)
    gc.collect()

    # read the representations; `with` closes the file even if reading fails
    with open(sen_repr_path) as fid:
        sen_repr_lines = fid.readlines()

    # Mismatch reports are collected in lists and joined once at the end
    # instead of growing a string inside the loop.
    word_mismatches = []
    sen_mismatches = []

    # testing
    for i, sentence in enumerate(f2_sent):
        sen_index, word_ids = sentence[0], sentence[1]
        # The last word is the one under test (use index 0 to test the first
        # word instead).
        target_word = len(word_ids) - 1

        # NOTE(review): only every second DB row (i * 2) is inspected;
        # presumably the DB stores two rows per sentence - confirm.
        db_vector = db[i * 2][1]
        # Element 0 is skipped; elements 1..1000 are compared as the sentence
        # vector and elements 1001..2000 as the word vector.
        sen_from_db = c.vector2string(db_vector[1:1001])
        w_from_db = c.vector2string(db_vector[1001:2001])

        # Word ids are shifted down by one to index the dictionary.
        w_target = c.vector2string(d[word_ids[target_word] - 1][1])
        sen_target = c.vector2string([float(x) for x in sen_repr_lines[sen_index].split()])

        if w_from_db != w_target:
            word_mismatches.append("From DB: " + w_from_db + "\n" + "Target: " + w_target + "\n\n")

        if sen_from_db != sen_target:
            sen_mismatches.append("From DB: " + sen_from_db + "\n" + "Target: " + sen_target + "\n\n")

    log_word = "".join(word_mismatches)
    log_sen = "".join(sen_mismatches)
    word_test_flag = not word_mismatches
    sentence_test_flag = not sen_mismatches

    # test summary
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    if sentence_test_flag and word_test_flag:
        print("Test pass!")
    elif word_test_flag:
        print("Word test pass, sentence test failed.")
        print(log_sen)
    elif sentence_test_flag:
        print("Sentence test pass, word test failed.")
        print(log_word)
    else:
        print("Both sentence and word tests failed.")
        print("SENTENCE:")
        print(log_sen)
        print("WORD:")
        print(log_word)
    # NOTE(review): everything from here on reads like the body of a separate
    # command-line entry point whose `def` line is not visible; as indented it
    # runs as the tail of the enclosing function - confirm against the
    # original file.

    # Command-line interface: three positional paths plus two optional paths
    # that default to files under ../data.
    parser = argparse.ArgumentParser(
        description="Create DB for the representation analysis, i.e. remove long/short sentences, remove "
        "sentences with known words"
    )
    parser.add_argument(
        "in_filename", help="The path to the train/test/val file, it should be in index format not" " exact words"
    )
    parser.add_argument("out_filename", help="The output path should be dir")
    parser.add_argument("file_name", help="the file name to be created for each test")
    parser.add_argument(
        "--words_repr", help="The path to the words representation file", default="../data/enc_dec_100/word_rep.txt"
    )
    parser.add_argument("--dictionary", help="The path to the dictionary", default="../data/orig/dictionary.txt")
    args = parser.parse_args()

    # Load the dictionary from the two paths given on the command line and
    # report how many entries it has.
    dictionary = r.read_dictionary(args.dictionary, args.words_repr)
    print "Dictionary size is: ", len(dictionary)

    # Load the input sentences and report how many were read.
    sent = r.read_files(args.in_filename)
    print "Number of original sentences is: ", len(sent)

    # =========== FIRST WORD =========== #
    print ("\nCreate first word db ...")
    # The output directory is built by plain string concatenation, so
    # out_filename must already end with a path separator.
    first_word_path = args.out_filename + "first_word/"
    first_word_filename = first_word_path + args.file_name
    # Create the "first_word" sub-directory on first use; os.mkdir does not
    # create missing parent directories.
    if not os.path.exists(first_word_path):
        os.mkdir(args.out_filename + "first_word")
    # NOTE(review): `db` is used here as an object exposing
    # create_first_word_db, while earlier in the enclosing function the same
    # name is bound to the result of r.read_files_float and indexed like a
    # sequence - likely a name clash caused by the missing `def`; confirm.
    db.create_first_word_db(first_word_filename, sent)
    print ("Done.")
    # ================================== #