import os
from os import listdir
from typing import Dict, List, Optional

import numpy as np
import pandas as pd

import vocabulary_embeddings_extractor


def string_to_indices(string: str, word_to_indices_map_param: Dict, nb_words: Optional[int] = None) -> List[int]:
    """
    Tokenizes a string and converts the tokens to the indices given in word_to_indices_map_param;
    also performs OOV replacement.
    :param string: input string
    :param word_to_indices_map_param: map from word to embedding index
    :param nb_words: all words with a higher index are treated as OOV
    :return: a list of word indices
    """
    tokens = vocabulary_embeddings_extractor.tokenize(string)

    # convert tokens to indices; use 2 for OOV
    word_indices_list = [word_to_indices_map_param.get(word, 2) for word in tokens]

    # limit words to max nb_words (set the rest to OOV = 2)
    if nb_words:
        word_indices_list = [2 if word_index >= nb_words else word_index
                             for word_index in word_indices_list]

    return word_indices_list
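# A minimal sketch of the index-mapping convention used above, assuming a toy whitespace
# tokenizer and a hypothetical toy vocabulary in place of vocabulary_embeddings_extractor
# and the real word_to_indices_map; index 2 stands for OOV, as in string_to_indices.
def _toy_string_to_indices(string, word_to_indices_map, nb_words=None):
    tokens = string.lower().split()                              # toy tokenizer
    indices = [word_to_indices_map.get(w, 2) for w in tokens]    # 2 = OOV
    if nb_words:
        indices = [2 if i >= nb_words else i for i in indices]
    return indices

# toy_map = {'the': 3, 'cat': 4, 'sat': 5000}
# _toy_string_to_indices('the cat sat quietly', toy_map)                 -> [3, 4, 5000, 2]
# _toy_string_to_indices('the cat sat quietly', toy_map, nb_words=1000)  -> [3, 4, 2, 2]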
def load_test_dataset(output, embeddings):
    # Load the linguistic features
    print("Loading linguistic features from %s" % output)
    # model, input_dir and word_to_indices_map are module-level globals here
    ling_feat_spmatrix, docids = load_ling_features('new_test_data', output, '', output,
                                                    model.features.shape[1] - len(embeddings[0]))
    print('Loaded libSVM data')

    X = []
    test_ids = []
    a = []  # raw argument texts; collected but not returned

    for file_name in listdir(input_dir):
        if file_name.split('.')[-1] != 'csv':
            print("Skipping file without .csv suffix: %s" % os.path.join(input_dir, file_name))
            continue

        data = pd.read_csv(os.path.join(input_dir, file_name), delimiter='\t', na_values=[])
        data = data.fillna('N/A')

        ids = data['#id'].values
        a1 = data['argument'].values

        # tokenize every argument and map tokens to indices (2 = OOV)
        a1_tokens = [vocabulary_embeddings_extractor.tokenize(a1_line) for a1_line in a1]
        a1_indices = [[word_to_indices_map.get(word, 2) for word in a1_tokens_line]
                      for a1_tokens_line in a1_tokens]
        # prepend 1 as the start_of_sequence marker
        Xa1 = np.array([[1] + a1_indices_line for a1_indices_line in a1_indices])

        # keep only arguments whose ids also appear in the linguistic-feature docids
        valid_args = np.in1d(ids, docids)
        a1 = a1[valid_args]
        Xa1 = Xa1[valid_args]
        ids = ids[valid_args]

        a.extend(a1)
        X.extend(Xa1)
        test_ids.extend(ids)

    # map document ids to their position in the sorted docids
    docid_to_idx_map = np.argsort(docids).flatten()

    test_items_feat, uids = concat_feature_sets(test_ids, [X], ling_feat_spmatrix,
                                                embeddings, docid_to_idx_map)

    return test_items_feat, uids
def load_single_file(directory, file_name, word_to_indices_map, nb_words=None):
    """
    Loads a single file and returns a tuple of x vectors, y scores and argument ids
    :param directory: dir
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; higher indices are treated as OOV
    :return: tuple (list of index vectors, list of float scores, list of argument ids)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()

    # remove the first line with comments
    del lines[0]

    x_vectors = []
    y_labels = []
    id_vector = []

    for line in lines:
        arg_id, score, a1 = line.split('\t')
        id_vector.append(arg_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)

        # convert tokens to indices; use 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]

        # prepend 1 as start_of_sequence and append 1 as a trailing separator
        x = [1] + a1_indices + [1]

        # the target is a real-valued score
        y = float(score)

        x_vectors.append(x)
        y_labels.append(y)

    # replace all word indices larger than nb_words with OOV (= 2)
    if nb_words:
        x_vectors = [[2 if word_index >= nb_words else word_index for word_index in x]
                     for x in x_vectors]

    train_instances = x_vectors
    train_labels = y_labels

    return train_instances, train_labels, id_vector
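# A minimal sketch of the tab-separated line format this loader expects, as assumed from the
# parsing above (hypothetical id and score): one comment line, then
# "<arg_id>\t<score>\t<argument text>" per line. A toy whitespace tokenizer and toy vocabulary
# stand in for vocabulary_embeddings_extractor and the real index map.
toy_line = "arg123\t0.75\tthe cat sat"
toy_vocab = {'the': 3, 'cat': 4, 'sat': 5}

toy_arg_id, toy_score, toy_text = toy_line.split('\t')
toy_x = [1] + [toy_vocab.get(w, 2) for w in toy_text.split()] + [1]   # -> [1, 3, 4, 5, 1]
toy_y = float(toy_score)                                              # -> 0.75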
def load_single_file(directory, file_name, word_to_indices_map, nb_words=None, reduced_label_set=False):
    """
    Loads a single file and returns a tuple of x vectors and y labels
    :param directory: dir
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; higher indices are treated as OOV
    :param reduced_label_set: if True, use only the super-labels ("o5", "o6", "o7")
    :return: tuple of lists (index vectors, multi-hot label vectors, argument ids)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()

    # remove the first line with comments
    del lines[0]

    x_vectors = []
    y_labels = []
    id_vector = []

    for line in lines:
        arg_id, label, a1, a2 = line.split('\t')
        id_vector.append(arg_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)
        a2_tokens = vocabulary_embeddings_extractor.tokenize(a2)

        # convert tokens to indices; use 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]
        a2_indices = [word_to_indices_map.get(word, 2) for word in a2_tokens]

        # join them into one vector, starting with 1 for start_of_sequence and 1 in between
        x = [1] + a1_indices + [1] + a2_indices

        # map the class label(s) to a multi-hot vector
        all_labels = ["o5_1", "o5_2", "o5_3", "o6_1", "o6_2", "o6_3", "o7_1", "o7_2",
                      "o7_3", "o7_4", "o8_1", "o8_4", "o8_5", "o9_1", "o9_2", "o9_3", "o9_4"]
        if reduced_label_set:
            all_labels = ["o5", "o6", "o7"]

        # zeros vector y; the label field may contain several comma-separated labels
        y = np.zeros(len(all_labels))
        for l in label.split(','):
            # with the reduced label set, only the super-label before '_' is looked up
            key = l.split('_')[0] if reduced_label_set else l
            y[all_labels.index(key)] = 1

        x_vectors.append(x)
        y_labels.append(y)

    # replace all word indices larger than nb_words with OOV (= 2)
    if nb_words:
        x_vectors = [[2 if word_index >= nb_words else word_index for word_index in x]
                     for x in x_vectors]

    train_instances = x_vectors
    train_labels = y_labels

    return train_instances, train_labels, id_vector
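# A minimal sketch of how one label field becomes a multi-hot vector (hypothetical label
# string; the label inventory is the one hard-coded above). With reduced_label_set=True the
# comma-separated labels are first collapsed to their super-label before the underscore.
import numpy as np

toy_labels = ["o5", "o6", "o7"]
toy_label_field = "o5_2,o7_4"

toy_vec = np.zeros(len(toy_labels))
for toy_l in toy_label_field.split(','):
    toy_vec[toy_labels.index(toy_l.split('_')[0])] = 1
# toy_vec -> array([1., 0., 1.])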
def load_single_file_separate_args(directory, file_name, word_to_indices_map, nb_words=None):
    """
    Loads a single file and returns the two argument sides separately
    :param directory: dir
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; higher indices are treated as OOV
    :return: tuple (a1 index vectors, a2 index vectors, labels, argument ids, turker ids,
             raw a1 texts, raw a2 texts)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()

    # remove the first line with comments
    del lines[0]

    x_vectors_a1 = []
    x_vectors_a2 = []
    train_a1 = []
    train_a2 = []
    y_labels = []
    id_vector = []
    turkerids = []

    # note: the first remaining line is skipped here as well
    for line in lines[1:]:
        toks = line.split('\t')
        if len(toks) != 5:
            raise ValueError("Expected 5 tab-separated fields but got %d: %s" % (len(toks), line))
        arg_id, turker_id, label, a1, a2 = toks

        id_vector.append(arg_id)
        turkerids.append(turker_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)
        a2_tokens = vocabulary_embeddings_extractor.tokenize(a2)

        # convert tokens to indices; use 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]
        a2_indices = [word_to_indices_map.get(word, 2) for word in a2_tokens]

        train_a1.append(a1)
        train_a2.append(a2)

        # keep the two arguments as separate vectors, each starting with 1 for start_of_sequence
        x1 = [1] + a1_indices
        x2 = [1] + a2_indices

        # map the class label to an integer: a1 wins -> 2, a2 wins -> 0, otherwise -> 1
        if 'a1' == label:
            y = 2
        elif 'a2' == label:
            y = 0
        else:
            y = 1

        x_vectors_a1.append(x1)
        x_vectors_a2.append(x2)
        y_labels.append(y)

    # replace all word indices larger than nb_words with OOV (= 2)
    if nb_words:
        x_vectors_a1 = [[2 if word_index >= nb_words else word_index for word_index in x]
                        for x in x_vectors_a1]
        x_vectors_a2 = [[2 if word_index >= nb_words else word_index for word_index in x]
                        for x in x_vectors_a2]

    train_instances_a1 = x_vectors_a1
    train_instances_a2 = x_vectors_a2
    train_labels = y_labels

    return train_instances_a1, train_instances_a2, train_labels, id_vector, turkerids, train_a1, train_a2
def load_single_file(directory, file_name, word_to_indices_map, nb_words=None):
    """
    Loads a single file and returns a tuple of x vectors and y labels
    :param directory: dir
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; higher indices are treated as OOV
    :return: tuple (index vectors, labels, argument ids, reversed index vectors, reversed labels)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()

    # remove the first line with comments
    del lines[0]

    x_vectors = []
    y_labels = []
    id_vector = []
    x_vectors_reversed = []
    y_labels_reversed = []

    for line in lines:
        # take the argument texts from the end of the row so that extra middle columns
        # do not break the parsing
        toks = line.split('\t')
        arg_id = toks[0]
        label = toks[1]
        a1 = toks[-2]
        a2 = toks[-1]

        id_vector.append(arg_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)
        a2_tokens = vocabulary_embeddings_extractor.tokenize(a2)

        # convert tokens to indices; use 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]
        a2_indices = [word_to_indices_map.get(word, 2) for word in a2_tokens]

        # join them into one vector, starting with 1 for start_of_sequence and 1 in between
        x = [1] + a1_indices + [1] + a2_indices

        # oversampling trick: also create the instance with the two arguments swapped
        x_reverse = [1] + a2_indices + [1] + a1_indices

        # map the class label to an integer and mirror it for the swapped instance
        if 'a1' == label:
            y = 0
            y_reverse = 2
        elif 'a2' == label:
            y = 2
            y_reverse = 0
        else:
            y = 1
            y_reverse = 1

        x_vectors.append(x)
        y_labels.append(y)
        x_vectors_reversed.append(x_reverse)
        y_labels_reversed.append(y_reverse)

    # replace all word indices larger than nb_words with OOV (= 2)
    if nb_words:
        x_vectors = [[2 if word_index >= nb_words else word_index for word_index in x]
                     for x in x_vectors]
        x_vectors_reversed = [[2 if word_index >= nb_words else word_index for word_index in x]
                              for x in x_vectors_reversed]

    train_instances = x_vectors
    train_labels = y_labels

    return train_instances, train_labels, id_vector, x_vectors_reversed, y_labels_reversed
def load_single_file(directory, file_name, word_to_indices_map, nb_words=None):
    """
    Loads a single file and returns a tuple of x vectors and y labels
    :param directory: dir
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; higher indices are treated as OOV
    :return: tuple (index vectors, labels, argument ids, reversed index vectors, reversed labels)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()

    # remove the first line with comments
    del lines[0]

    x_vectors = []
    y_labels = []
    id_vector = []
    x_vectors_reversed = []
    y_labels_reversed = []

    for line in lines:
        arg_id, label, a1, a2 = line.split('\t')
        id_vector.append(arg_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)
        a2_tokens = vocabulary_embeddings_extractor.tokenize(a2)

        # convert tokens to indices; use 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]
        a2_indices = [word_to_indices_map.get(word, 2) for word in a2_tokens]

        # join them into one vector, starting with 1 for start_of_sequence and 1 in between
        x = [1] + a1_indices + [1] + a2_indices

        # oversampling trick: also create the instance with the two arguments swapped
        x_reverse = [1] + a2_indices + [1] + a1_indices

        # map the class label to an integer and mirror it for the swapped instance
        if 'a1' == label:
            y = 0
            y_reverse = 1
        elif 'a2' == label:
            y = 1
            y_reverse = 0
        else:
            y = 2
            y_reverse = 2

        x_vectors.append(x)
        y_labels.append(y)
        x_vectors_reversed.append(x_reverse)
        y_labels_reversed.append(y_reverse)

    # replace all word indices larger than nb_words with OOV (= 2)
    if nb_words:
        x_vectors = [[2 if word_index >= nb_words else word_index for word_index in x]
                     for x in x_vectors]
        x_vectors_reversed = [[2 if word_index >= nb_words else word_index for word_index in x]
                              for x in x_vectors_reversed]

    train_instances = x_vectors
    train_labels = y_labels

    return train_instances, train_labels, id_vector, x_vectors_reversed, y_labels_reversed
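# A minimal sketch of the argument-swapping oversampling used above, on hypothetical toy
# index vectors, mirroring the label encoding of the function directly above (the previous
# variant encodes ties differently). The swapped pair gets the mirrored label, doubling
# the training data.
toy_a1_idx, toy_a2_idx, toy_pair_label = [3, 4], [5, 6], 'a1'

toy_pair_x = [1] + toy_a1_idx + [1] + toy_a2_idx        # -> [1, 3, 4, 1, 5, 6]
toy_pair_x_rev = [1] + toy_a2_idx + [1] + toy_a1_idx    # -> [1, 5, 6, 1, 3, 4]
toy_pair_y, toy_pair_y_rev = ((0, 1) if toy_pair_label == 'a1'
                              else (1, 0) if toy_pair_label == 'a2' else (2, 2))
# (toy_pair_x, toy_pair_y) and (toy_pair_x_rev, toy_pair_y_rev) would both be added to training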