import argparse
import logging
import os

import numpy
import scipy.stats

# tokenize_sentence, split_sts_data, generate_states, calculate_correlation
# and process_correlations are helpers defined elsewhere in the repository.
# Module-level logger (the repository may configure this differently).
logger = logging.getLogger(__name__)


def create_sentence_dicts(fst_split_csv_path, snd_split_csv_path,
                          fst_split_npy_path, snd_split_npy_path, vocab):
  """
  Creates mappings from the STS benchmark sentences, reduced to their
  in-vocabulary words, to the encoder states generated for them.
  """
  fst_sentence_dict = {}
  sentence_states = numpy.load(fst_split_npy_path)
  with open(fst_split_csv_path, 'r') as f:
    for index, line in enumerate(f):
      # Row index of the .npy array corresponds to the line index of the csv.
      fst_sentence_dict[' '.join([
          word for word in tokenize_sentence(line.strip().split())
          if word in vocab
      ])] = sentence_states[index]
  del sentence_states

  snd_sentence_dict = {}
  sentence_states = numpy.load(snd_split_npy_path)
  with open(snd_split_csv_path, 'r') as f:
    for index, line in enumerate(f):
      snd_sentence_dict[' '.join([
          word for word in tokenize_sentence(line.strip().split())
          if word in vocab
      ])] = sentence_states[index]
  del sentence_states

  return fst_sentence_dict, snd_sentence_dict
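
# The repository's tokenize_sentence helper is used throughout but not shown
# here. A minimal, purely illustrative stand-in, assuming it only lowercases
# the word list and strips surrounding punctuation (the real helper may do
# more, e.g. number or contraction handling):
import string


def tokenize_sentence_sketch(words):
  """Hypothetical stand-in for tokenize_sentence."""
  return [word.lower().strip(string.punctuation) for word in words]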
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('-i', '--input', type=str)
  parser.add_argument('-o', '--output', type=str)
  parser.add_argument('-v', '--vocab', type=str)
  args = parser.parse_args([
      '-i', '/media/patrik/1EDB65B8599DD93E/Downloads'
            '/stsbenchmark/sts-train.csv',
      '-o', '/media/patrik/1EDB65B8599DD93E/GitHub'
            '/Seq2seqChatbots/sts_benchmark',
      '-v', '/media/patrik/1EDB65B8599DD93E/GitHub/'
            'Seq2seqChatbots/data_dir/DailyDialog/'
            'base_both_avg_embedding_clustering/vocab'
  ])
  assert os.path.isdir(args.output)
  file = os.path.splitext(os.path.basename(args.input))
  split_input_fst, split_input_snd = split_sts_data(
      args.input, file, args.output)

  # Collect the words occurring in the data and record their relative
  # frequencies.
  word_count = 0
  vocab = {}
  with open(split_input_fst, 'r') as f:
    for line in f:
      for word in tokenize_sentence(line.strip().split()):
        vocab[word] = vocab.get(word, 0) + 1
        word_count += 1
  with open(split_input_snd, 'r') as f:
    for line in f:
      for word in tokenize_sentence(line.strip().split()):
        vocab[word] = vocab.get(word, 0) + 1
        word_count += 1
  for word in vocab:
    vocab[word] /= word_count

  # Iterate through the provided word-vector vocabulary, pairing each word
  # of the data with its relative frequency and embedding vector.
  dictionary = {}
  with open(args.vocab, 'r') as v:
    for line in v:
      line_as_list = line.strip().split()
      if line_as_list[0] in vocab:
        dictionary[line_as_list[0]] = (
            vocab[line_as_list[0]],
            numpy.array([float(num) for num in line_as_list[1:]]))
        del vocab[line_as_list[0]]
  del vocab

  create_benchmark(args.input, dictionary)
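
# split_sts_data is provided elsewhere in the repository. Judging from its
# call sites, it writes the two sentence columns of the tab-separated STS
# file into two one-sentence-per-line files in the output directory and
# returns their paths. A minimal sketch under that assumption (name and
# output file naming are hypothetical, the real helper may differ):
def split_sts_data_sketch(sts_file_path, file, output_dir):
  """Hypothetical sketch: split the STS file into two sentence files."""
  fst_path = os.path.join(output_dir,
                          '{}-first-split{}'.format(file[0], file[1]))
  snd_path = os.path.join(output_dir,
                          '{}-second-split{}'.format(file[0], file[1]))
  with open(sts_file_path, 'r') as sts, \
       open(fst_path, 'w') as fst, \
       open(snd_path, 'w') as snd:
    for line in sts:
      columns = line.split('\t')
      # Columns 5 and 6 hold the two sentences of each STS pair.
      fst.write(columns[5].strip() + '\n')
      snd.write(columns[6].strip() + '\n')
  return fst_path, snd_path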
def create_benchmark(sts_file_path, fst_dict, snd_dict, vocab):
  """
  Creates the STS benchmark by using the sentence dictionaries that map
  the reduced sentences to the encoder states produced by the Seq2Seq model.
  """
  target_correlation = []
  predicted_correlation = []
  with open(sts_file_path, 'r') as f:
    for line in f:
      line_as_list = line.split('\t')
      first_sentence = [
          word for word in tokenize_sentence(line_as_list[5].strip().split())
          if word in vocab and word != ''
      ]
      second_sentence = [
          word for word in tokenize_sentence(line_as_list[6].strip().split())
          if word in vocab and word != ''
      ]
      if len(first_sentence) > 2 and len(second_sentence) > 2:
        predicted_correlation.append(
            calculate_correlation(fst_dict[' '.join(first_sentence)],
                                  snd_dict[' '.join(second_sentence)]))
        target_correlation.append(float(line_as_list[4].strip()))

  target_correlation = numpy.array(target_correlation)
  predicted_correlation = numpy.array(predicted_correlation).reshape(-1)
  predicted_correlation = process_correlations(predicted_correlation)
  corr, pvalue = scipy.stats.spearmanr(target_correlation,
                                       predicted_correlation)
  # Root of the summed squared differences, normalized by the number of
  # sentence pairs.
  error = (
      numpy.sqrt(numpy.sum(
          (target_correlation - predicted_correlation) ** 2)) /
      len(predicted_correlation))
  logger.info('RNNState correlation error: {}, '
              'Spearman correlation: {}, p-value: {}'.format(
                  error, corr, pvalue))
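
# calculate_correlation and process_correlations come from elsewhere in the
# repository. The comments in the average-embedding benchmark further below
# indicate that calculate_correlation is a cosine similarity; a minimal
# sketch of that, together with a purely hypothetical min-max rescaling of
# the predictions to the 0-5 gold range standing in for process_correlations
# (the real helpers may differ):
def calculate_correlation_sketch(first_vector, second_vector):
  """Cosine similarity between two state (or embedding) vectors."""
  return (numpy.dot(first_vector, second_vector) /
          (numpy.linalg.norm(first_vector) *
           numpy.linalg.norm(second_vector)))


def process_correlations_sketch(predicted):
  """Hypothetical post-processing: rescale predictions into [0, 5]."""
  minimum, maximum = predicted.min(), predicted.max()
  return (predicted - minimum) / (maximum - minimum) * 5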
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('-i', '--input', type=str,
                      help='path of the STS benchmark data')
  parser.add_argument('-o', '--output', type=str, help='output directory')
  parser.add_argument('-v', '--vocab', type=str,
                      help='vocabulary that has been used by the model '
                           'during training')
  args = parser.parse_args([
      '-i', '/media/patrik/1EDB65B8599DD93E/Downloads'
            '/stsbenchmark/sts-train.csv',
      '-o', '/media/patrik/1EDB65B8599DD93E/GitHub'
            '/Seq2seqChatbots/sts_benchmark',
      '-v', '/media/patrik/1EDB65B8599DD93E'
            '/GitHub/Seq2seqChatbots/data_dir'
            '/DailyDialog/base_with_numbers'
            '/vocab.chatbot.16384'
  ])
  assert os.path.isdir(args.output)
  file = os.path.splitext(os.path.basename(args.input))

  # Files that will hold the model's decoding output.
  output_file_path_fst = os.path.join(
      args.output, '{}-first{}'.format(file[0], file[1]))
  output_file_path_snd = os.path.join(
      args.output, '{}-second{}'.format(file[0], file[1]))

  with open(args.vocab, 'r', encoding='utf-8') as v:
    vocab = {line.strip() for line in v if line.strip() != ''}

  temp_fst = os.path.join(args.output,
                          '{}-first-temp{}'.format(file[0], file[1]))
  temp_snd = os.path.join(args.output,
                          '{}-second-temp{}'.format(file[0], file[1]))

  split_input_fst, split_input_snd = split_sts_data(
      args.input, file, args.output)

  # Write the reduced (in-vocabulary) sentences to a temporary file and
  # remember which original sentence each reduced sentence came from.
  fst_sentence_dict = {}
  with open(temp_fst, 'w') as temp_f:
    with open(split_input_fst, 'r', encoding='utf-8') as f:
      for line in f:
        reduced_sentence = ' '.join([
            word for word in tokenize_sentence(line.strip().split())
            if word in vocab and word.strip() != ''
        ])
        fst_sentence_dict[reduced_sentence] = line.strip()
        temp_f.write(reduced_sentence + '\n')

  generate_states(temp_fst, output_file_path_fst)
  os.remove(temp_fst)

  # Replace the reduced sentences in the output file with their originals.
  transformed_output = []
  with open(output_file_path_fst, 'r', encoding='utf-8') as f:
    for line in f:
      transformed_output.append(line.strip())
  with open(output_file_path_fst, 'w', encoding='utf-8') as f:
    for line in transformed_output:
      f.write(fst_sentence_dict[line] + '\n')

  snd_sentence_dict = {}
  with open(temp_snd, 'w') as temp_f:
    with open(split_input_snd, 'r', encoding='utf-8') as f:
      for line in f:
        reduced_sentence = ' '.join([
            word for word in tokenize_sentence(line.strip().split())
            if word in vocab and word.strip() != ''
        ])
        snd_sentence_dict[reduced_sentence] = line.strip()
        temp_f.write(reduced_sentence + '\n')

  generate_states(temp_snd, output_file_path_snd)
  os.remove(temp_snd)

  transformed_output = []
  with open(output_file_path_snd, 'r', encoding='utf-8') as f:
    for line in f:
      transformed_output.append(line.strip())
  with open(output_file_path_snd, 'w', encoding='utf-8') as f:
    for line in transformed_output:
      f.write(snd_sentence_dict[line] + '\n')

  os.remove(split_input_fst)
  os.remove(split_input_snd)

  fst_dict, snd_dict = create_sentence_dicts(
      output_file_path_fst,
      output_file_path_snd,
      os.path.splitext(output_file_path_fst)[0] + '.npy',
      os.path.splitext(output_file_path_snd)[0] + '.npy',
      vocab)

  create_benchmark(args.input, fst_dict, snd_dict, vocab)
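
# generate_states is the repository's hook into the trained Seq2Seq model:
# judging from how its outputs are consumed above, it reads the reduced
# sentences from its first argument, writes the processed sentences to its
# second argument, and saves one encoder state per sentence to an .npy file
# with the same base name. A small, optional sanity check of that assumed
# alignment, to be run after generate_states has finished:
def check_state_alignment(text_path):
  """Verify that the states file has exactly one row per sentence line."""
  states = numpy.load(os.path.splitext(text_path)[0] + '.npy')
  with open(text_path, 'r', encoding='utf-8') as f:
    num_lines = sum(1 for _ in f)
  assert len(states) == num_lines, 'states and sentences are misaligned'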
def create_benchmark(sts_file_path, vocab):
  """
  Creates a benchmark for the provided STS file. Each sentence is
  represented by the weighted average of the embedding vectors of its words.
  """
  # Inverse frequency weight.
  def w_avg(freq):
    return 0.001 / (0.001 + freq)

  target_correlation = []
  predicted_correlation = []
  with open(sts_file_path, 'r') as f:
    for line in f:
      line_as_list = line.split('\t')
      valid_words = 0
      vectors = []
      # The STS data is tab-separated; columns 5 and 6 hold the sentences
      # and column 4 holds the gold similarity score.
      first_sentence = None
      for word in tokenize_sentence(line_as_list[5].strip().split()):
        # Each sentence is split into words, and the vector corresponding
        # to each word is weighted and collected.
        vector = vocab.get(word)
        if vector is not None:
          vectors.append(vector[1] * w_avg(vector[0]))
          valid_words += 1
      if valid_words != 0:
        # If any word of the sentence has a vector, represent the sentence
        # by the average of these weighted vectors.
        first_sentence = numpy.sum(numpy.array(vectors), axis=0) / valid_words

      vectors = []
      valid_words = 0
      second_sentence = None
      for word in tokenize_sentence(line_as_list[6].strip().split()):
        vector = vocab.get(word)
        if vector is not None:
          vectors.append(vector[1] * w_avg(vector[0]))
          valid_words += 1
      if valid_words != 0:
        second_sentence = numpy.sum(numpy.array(vectors), axis=0) / valid_words

      if first_sentence is not None and second_sentence is not None:
        # If both sentences contain at least one in-vocabulary word,
        # calculate the cosine similarity of their vectors.
        predicted_correlation.append(
            calculate_correlation(first_sentence, second_sentence))
        target_correlation.append(float(line_as_list[4].strip()))

  # The predicted similarities are compared to the gold scores with
  # Spearman rank correlation.
  target_correlation = numpy.array(target_correlation)
  predicted_correlation = numpy.array(predicted_correlation).reshape(-1)
  predicted_correlation = process_correlations(predicted_correlation)
  corr, pvalue = scipy.stats.spearmanr(target_correlation,
                                       predicted_correlation)
  # Root of the summed squared differences, normalized by the number of
  # sentence pairs.
  error = (
      numpy.sqrt(numpy.sum(
          (target_correlation - predicted_correlation) ** 2)) /
      len(predicted_correlation))
  logger.info('Average embedding correlation error: {}, '
              'Spearman correlation: {}, p-value: {}'.format(
                  error, corr, pvalue))
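
# The w_avg weight above matches the smooth inverse frequency scheme of
# Arora et al. (2017), w = a / (a + p(word)) with a = 0.001: frequent words
# are down-weighted, rare words dominate the average. A quick illustration:
def sif_weight_example():
  """Illustrative only: show how w_avg down-weights frequent words."""
  for frequency in (0.01, 0.001, 0.0001):
    weight = 0.001 / (0.001 + frequency)
    print('relative frequency {:.4f} -> weight {:.2f}'.format(
        frequency, weight))
  # relative frequency 0.0100 -> weight 0.09
  # relative frequency 0.0010 -> weight 0.50
  # relative frequency 0.0001 -> weight 0.91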