Example 1
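These snippets rely on module-level imports and a logger that this page does not show; based on the names used below, something like the following is assumed at the top of the module (the project may organize this differently):

import argparse
import logging
import os

import numpy
import scipy.stats

logger = logging.getLogger(__name__)
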
def create_sentence_dicts(fst_split_csv_path, snd_split_csv_path,
                          fst_split_npy_path, snd_split_npy_path, vocab):
    """
    Creates a mapping between the original STS benchmark sentences
    and their modified versions, in which every out-of-vocabulary word
    is removed.
    """
    fst_sentence_dict = {}
    sentence_states = numpy.load(fst_split_npy_path)

    with open(fst_split_csv_path, 'r') as f:
        for index, line in enumerate(f):
            fst_sentence_dict[' '.join([
                word for word in tokenize_sentence(line.strip().split())
                if word in vocab
            ])] = sentence_states[index]

    del sentence_states
    snd_sentence_dict = {}
    sentence_states = numpy.load(snd_split_npy_path)

    with open(snd_split_csv_path, 'r') as f:
        for index, line in enumerate(f):
            snd_sentence_dict[' '.join([
                word for word in tokenize_sentence(line.strip().split())
                if word in vocab
            ])] = sentence_states[index]

    del sentence_states
    return fst_sentence_dict, snd_sentence_dict
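
The tokenize_sentence helper used above is not shown on this page. A minimal sketch of what it is assumed to do (lower-casing each whitespace-split token and stripping punctuation so tokens can be matched against the vocabulary) could look like this; the regular expression is an illustrative assumption, not the project's actual tokenizer:

import re


def tokenize_sentence(words):
    # Hypothetical stand-in for the tokenizer used throughout these examples:
    # lower-case each whitespace-split token and strip punctuation. The
    # callers above filter out the empty strings this may produce.
    return [re.sub(r'[^\w]+', '', word.lower()) for word in words]
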
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str)
    parser.add_argument('-o', '--output', type=str)
    parser.add_argument('-v', '--vocab', type=str)

    args = parser.parse_args([
        '-i', '/media/patrik/1EDB65B8599DD93E/Downloads'
        '/stsbenchmark/sts-train.csv', '-o',
        '/media/patrik/1EDB65B8599DD93E/GitHub'
        '/Seq2seqChatbots/sts_benchmark', '-v',
        '/media/patrik/1EDB65B8599DD93E/GitHub/'
        'Seq2seqChatbots/data_dir/DailyDialog/'
        'base_both_avg_embedding_clustering/vocab'
    ])

    assert os.path.isdir(args.output)

    file = os.path.splitext(os.path.basename(args.input))

    split_input_fst, split_input_snd = split_sts_data(args.input, file,
                                                      args.output)

    # Collect the words occurring in the data and record their frequencies.
    word_count = 0
    vocab = {}
    with open(split_input_fst, 'r') as f:
        for line in f:
            for word in tokenize_sentence(line.strip().split()):
                vocab[word] = vocab.get(word, 0) + 1
                word_count += 1

    with open(split_input_snd, 'r') as f:
        for line in f:
            for word in tokenize_sentence(line.strip().split()):
                vocab[word] = vocab.get(word, 0) + 1
                word_count += 1

    for word in vocab:
        vocab[word] /= word_count

    # Iterate through the provided word vector vocabulary and pair each word
    # of the data with its frequency and embedding vector.
    dictionary = {}
    with open(args.vocab, 'r') as v:
        for line in v:
            line_as_list = line.strip().split()
            if line_as_list[0] in vocab:
                dictionary[line_as_list[0]] = (vocab[line_as_list[0]],
                                               numpy.array([
                                                   float(num)
                                                   for num in line_as_list[1:]
                                               ]))
                del vocab[line_as_list[0]]

    del vocab

    create_benchmark(args.input, dictionary)
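
The split_sts_data helper is also assumed rather than shown. Based on how its return values are used above, a sketch under the assumption that it writes the first and second sentence of every STS pair (columns 5 and 6 of the tab-separated file) into two separate files could be:

import os


def split_sts_data(sts_file_path, file, output_dir):
    # Hypothetical sketch: split the tab-separated STS benchmark file into two
    # files, one holding the first sentence of each pair (column 5) and one
    # holding the second sentence (column 6). The '-first-split' and
    # '-second-split' suffixes are illustrative assumptions.
    fst_path = os.path.join(output_dir,
                            '{}-first-split{}'.format(file[0], file[1]))
    snd_path = os.path.join(output_dir,
                            '{}-second-split{}'.format(file[0], file[1]))
    with open(sts_file_path, 'r') as f, \
            open(fst_path, 'w') as fst, \
            open(snd_path, 'w') as snd:
        for line in f:
            line_as_list = line.split('\t')
            fst.write(line_as_list[5].strip() + '\n')
            snd.write(line_as_list[6].strip() + '\n')
    return fst_path, snd_path
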
Example 3
def create_benchmark(sts_file_path, fst_dict, snd_dict, vocab):
    """
    Creates the STS benchmark by using the sentence dictionaries that
    map the original sentences to the encoder states produced by
    the Seq2Seq model.
    """
    target_correlation = []
    predicted_correlation = []
    with open(sts_file_path, 'r') as f:
        for line in f:
            line_as_list = line.split('\t')
            first_sentence = [
                word
                for word in tokenize_sentence(line_as_list[5].strip().split())
                if word in vocab and word != ''
            ]
            second_sentence = [
                word
                for word in tokenize_sentence(line_as_list[6].strip().split())
                if word in vocab and word != ''
            ]
            if len(first_sentence) > 2 and len(second_sentence) > 2:
                predicted_correlation.append(
                    calculate_correlation(fst_dict[' '.join(first_sentence)],
                                          snd_dict[' '.join(second_sentence)]))
                target_correlation.append(float(line_as_list[4].strip()))

    target_correlation = numpy.array(target_correlation)
    predicted_correlation = numpy.array(predicted_correlation).reshape(-1)
    predicted_correlation = process_correlations(predicted_correlation)

    corr, pvalue = scipy.stats.spearmanr(target_correlation,
                                         predicted_correlation)

    error = (
        numpy.sqrt(numpy.sum(
            (target_correlation - predicted_correlation)**2)) /
        len(predicted_correlation))

    logger.info('RNNState correlation error: {}, '
                'Spearman correlation: {}, p-value: {}'.format(
                    error, corr, pvalue))
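
calculate_correlation is not defined on this page either. The comments in the last example describe it as a cosine similarity between the two sentence vectors, so a sketch under that assumption is:

import numpy


def calculate_correlation(first_vector, second_vector):
    # Assumed implementation: cosine similarity of the two sentence vectors,
    # which the caller later compares against the gold STS scores.
    first_vector = numpy.asarray(first_vector).reshape(-1)
    second_vector = numpy.asarray(second_vector).reshape(-1)
    return (numpy.dot(first_vector, second_vector) /
            (numpy.linalg.norm(first_vector) *
             numpy.linalg.norm(second_vector)))
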
Example 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        help='path of the STS benchmark data')
    parser.add_argument('-o', '--output', type=str, help='output directory')
    parser.add_argument('-v',
                        '--vocab',
                        type=str,
                        help='vocabulary that has been used '
                             'by the model during training')

    args = parser.parse_args([
        '-i', '/media/patrik/1EDB65B8599DD93E/Downloads'
        '/stsbenchmark/sts-train.csv', '-o',
        '/media/patrik/1EDB65B8599DD93E/GitHub'
        '/Seq2seqChatbots/sts_benchmark', '-v',
        '/media/patrik/1EDB65B8599DD93E'
        '/GitHub/Seq2seqChatbots/data_dir'
        '/DailyDialog/base_with_numbers'
        '/vocab.chatbot.16384'
    ])

    assert os.path.isdir(args.output)
    file = os.path.splitext(os.path.basename(args.input))

    # Output files for the model's decoded output; the corresponding encoder
    # states are loaded later from the matching .npy files.
    output_file_path_fst = os.path.join(args.output,
                                        '{}-first{}'.format(file[0], file[1]))
    output_file_path_snd = os.path.join(args.output,
                                        '{}-second{}'.format(file[0], file[1]))

    with open(args.vocab, 'r', encoding='utf-8') as v:
        vocab = {line.strip() for line in v if line.strip() != ''}

    temp_fst = os.path.join(args.output,
                            '{}-first-temp{}'.format(file[0], file[1]))
    temp_snd = os.path.join(args.output,
                            '{}-second-temp{}'.format(file[0], file[1]))

    split_input_fst, split_input_snd = split_sts_data(args.input, file,
                                                      args.output)

    fst_sentence_dict = {}
    with open(temp_fst, 'w') as temp_f:
        with open(split_input_fst, 'r', encoding='utf-8') as f:
            for line in f:
                reduced_sentence = ' '.join([
                    word for word in tokenize_sentence(line.strip().split())
                    if word in vocab and word.strip() != ''
                ])
                fst_sentence_dict[reduced_sentence] = line.strip()
                temp_f.write(reduced_sentence + '\n')

    generate_states(temp_fst, output_file_path_fst)
    os.remove(temp_fst)

    transformed_output = []
    with open(output_file_path_fst, 'r', encoding='utf-8') as f:
        for line in f:
            transformed_output.append(line.strip())

    with open(output_file_path_fst, 'w', encoding='utf-8') as f:
        for line in transformed_output:
            f.write(fst_sentence_dict[line] + '\n')

    snd_sentence_dict = {}
    with open(temp_snd, 'w') as temp_f:
        with open(split_input_snd, 'r', encoding='utf-8') as f:
            for line in f:
                reduced_sentence = ' '.join([
                    word for word in tokenize_sentence(line.strip().split())
                    if word in vocab and word.strip() != ''
                ])
                snd_sentence_dict[reduced_sentence] = line.strip()
                temp_f.write(reduced_sentence + '\n')

    generate_states(temp_snd, output_file_path_snd)
    os.remove(temp_snd)

    transformed_output = []
    with open(output_file_path_snd, 'r', encoding='utf-8') as f:
        for line in f:
            transformed_output.append(line.strip())

    with open(output_file_path_snd, 'w', encoding='utf-8') as f:
        for line in transformed_output:
            f.write(snd_sentence_dict[line] + '\n')

    os.remove(split_input_fst)
    os.remove(split_input_snd)

    fst_dict, snd_dict = create_sentence_dicts(
        output_file_path_fst, output_file_path_snd,
        os.path.splitext(output_file_path_fst)[0] + '.npy',
        os.path.splitext(output_file_path_snd)[0] + '.npy', vocab)

    create_benchmark(args.input, fst_dict, snd_dict, vocab)
def create_benchmark(sts_file_path, vocab):
    """
    Creates a benchmark for the provided STS file.
    Each sentence is represented by the frequency-weighted average of the
    embedding vectors of its in-vocabulary words.
    """

    # Inverse frequency weight: frequent words are down-weighted.
    def w_avg(freq):
        return 0.001 / (0.001 + freq)

    target_correlation = []
    predicted_correlation = []
    with open(sts_file_path, 'r') as f:
        for line in f:
            line_as_list = line.split('\t')

            valid_words = 0
            vectors = []

            # The STS data is tab-separated; columns 5 and 6 hold the
            # sentences and column 4 holds the gold similarity score.
            first_sentence = None
            for word in tokenize_sentence(line_as_list[5].strip().split()):

                # Each sentence is split into words; the vector of each
                # in-vocabulary word is weighted by its frequency and summed.
                vector = vocab.get(word)
                if vector is not None:
                    vectors.append(vector[1] * w_avg(vector[0]))
                    valid_words += 1

            if valid_words != 0:
                # If any in-vocabulary words were found, represent the
                # sentence by the average of their weighted vectors.
                first_sentence = numpy.sum(numpy.array(vectors),
                                           axis=0) / valid_words

            vectors = []
            valid_words = 0
            second_sentence = None
            for word in tokenize_sentence(line_as_list[6].strip().split()):
                vector = vocab.get(word)
                if vector is not None:
                    vectors.append(vector[1] * w_avg(vector[0]))
                    valid_words += 1

            if valid_words != 0:
                second_sentence = numpy.sum(numpy.array(vectors),
                                            axis=0) / valid_words

            if first_sentence is not None and second_sentence is not None:
                # If both sentences contain at least one in-vocabulary word,
                # calculate the cosine similarity of their vectors.
                predicted_correlation.append(
                    calculate_correlation(first_sentence, second_sentence))
                target_correlation.append(float(line_as_list[4].strip()))

    # The predicted similarities and the target similarities are compared
    # with Spearman rank correlation.
    target_correlation = numpy.array(target_correlation)
    predicted_correlation = numpy.array(predicted_correlation).reshape(-1)
    predicted_correlation = process_correlations(predicted_correlation)

    corr, pvalue = scipy.stats.spearmanr(target_correlation,
                                         predicted_correlation)

    error = (
        numpy.sqrt(numpy.sum(
            (target_correlation - predicted_correlation)**2)) /
        len(predicted_correlation))

    logger.info('Average embedding correlation error: {}, '
                'Spearman correlation: {}, p-value: {}'.format(
                    error, corr, pvalue))
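
process_correlations is likewise assumed. Since cosine similarities lie in [-1, 1] while the STS gold scores lie in [0, 5], one plausible sketch, purely an assumption, rescales the predictions onto the gold scale so the error term compares values of the same magnitude (Spearman correlation is unaffected by such a monotonic rescaling):

def process_correlations(predicted_correlation):
    # Hypothetical rescaling of cosine similarities from [-1, 1] onto the
    # 0-5 scale of the STS gold scores.
    return (predicted_correlation + 1.0) * 2.5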