from typing import Dict, List, Optional


def string_to_indices(string: str,
                      word_to_indices_map_param: Dict,
                      nb_words: Optional[int] = None) -> List:
    """
    Tokenizes a string and converts the tokens to the indices given in
    word_to_indices_map_param; also performs OOV replacement
    :param string: the input string
    :param word_to_indices_map_param: map from word to embedding index
    :param nb_words: words with an index >= nb_words are treated as OOV
    :return: a list of indices
    """
    tokens = vocabulary_embeddings_extractor.tokenize(string)

    # now convert tokens to indices; set to 2 for OOV
    word_indices_list = [
        word_to_indices_map_param.get(word, 2) for word in tokens
    ]

    # limit words to max nb_words (set them to OOV = 2):
    if nb_words:
        word_indices_list = [
            2 if word_index >= nb_words else word_index
            for word_index in word_indices_list
        ]

    return word_indices_list
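
# A minimal, self-contained sketch of the mapping and clipping logic above
# (the toy map and tokens are hypothetical; by the conventions used here,
# index 1 marks start-of-sequence and index 2 is the OOV slot):
demo_map = {'the': 3, 'cat': 4, 'sat': 5, 'mat': 6}
demo_tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
demo_indices = [demo_map.get(w, 2) for w in demo_tokens]  # 'on' is OOV -> 2
print(demo_indices)                                       # [3, 4, 5, 2, 3, 6]
# with nb_words=5, indices >= 5 are clipped to OOV as well
print([2 if i >= 5 else i for i in demo_indices])         # [3, 4, 2, 2, 3, 2]
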
def load_test_dataset(output, embeddings):
    # Load the linguistic features
    print(("Loading linguistic features from %s" % output))
    ling_feat_spmatrix, docids = load_ling_features(
        'new_test_data', output, '', output,
        model.features.shape[1] - len(embeddings[0]))

    print('Loaded libSVM data')

    X = []          # token-index vectors, one per argument
    test_ids = []   # document ids
    a = []          # raw argument texts

    for file_name in listdir(input_dir):
        if file_name.split('.')[-1] != 'csv':
            print("Skipping files without .csv suffix: %s" % input_dir + '/' +
                  file_name)
            continue

        data = pd.read_csv(os.path.join(input_dir, file_name),
                           delimiter='\t',
                           na_values=[])
        data = data.fillna('N/A')

        ids = data['#id'].values
        a1 = data['argument'].values

        a1_tokens = [
            vocabulary_embeddings_extractor.tokenize(a1_line) for a1_line in a1
        ]
        a1_indices = [[
            word_to_indices_map.get(word, 2) for word in a1_tokens_line
        ] for a1_tokens_line in a1_tokens]
        # prepend the start-of-sequence marker 1 to each index vector
        Xa1 = np.array([[1] + a1_indices_line
                        for a1_indices_line in a1_indices])

        # keep only the arguments whose id also has linguistic features
        valid_args = np.in1d(ids, docids)
        a1 = a1[valid_args]
        Xa1 = Xa1[valid_args]
        ids = ids[valid_args]

        a.extend(a1)
        X.extend(Xa1)
        test_ids.extend(ids)

    # order documents by id and assemble the final feature sets
    docid_to_idx_map = np.argsort(docids).flatten()
    test_items_feat, uids = concat_feature_sets(
        test_ids, [X], ling_feat_spmatrix, embeddings, docid_to_idx_map)

    return test_items_feat, uids
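
# A minimal sketch of the np.in1d filtering used above: only rows whose id
# also appears among the docids with linguistic features survive (the arrays
# below are made up):
import numpy as np
demo_ids = np.array(['arg1', 'arg2', 'arg3'])
demo_docids = np.array(['arg3', 'arg1'])
demo_mask = np.in1d(demo_ids, demo_docids)  # array([ True, False,  True])
print(demo_ids[demo_mask])                  # ['arg1' 'arg3']
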
def load_single_file(directory, file_name, word_to_indices_map, nb_words=None):
    """
    Loads a single file and returns a tuple of x vectors and y labels
    :param directory: directory containing the file
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; otherwise treated as OOV
    :return: tuple of (index vectors, float scores, ids)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()
    # remove the first line (header/comments)
    del lines[0]

    x_vectors = []
    y_labels = []
    id_vector = []

    for line in lines:
        arg_id, score, a1 = line.split('\t')

        id_vector.append(arg_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)

        # now convert tokens to indices; set to 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]

        # prepend 1 for start_of_sequence and append 1 as a closing delimiter
        x = [1] + a1_indices + [1]

        # convert score to float
        y = float(score)

        x_vectors.append(x)
        y_labels.append(y)

    # replace all word indices larger than nb_words with OOV
    if nb_words:
        x_vectors = [[2 if word_index >= nb_words else word_index for word_index in x] for x in x_vectors]

    train_instances = x_vectors
    train_labels = y_labels

    return train_instances, train_labels, id_vector
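
# A minimal sketch of the input this loader expects: after a header line, each
# row is "<id>\t<score>\t<argument text>" (the rows and the call below are
# made up for illustration):
#
#   arg_42    0.85    We should invest in renewable energy because ...
#   arg_43    0.10    This is true because it is true.
#
# train_x, train_y, ids = load_single_file('data/', 'train.tsv',
#                                          word_to_indices_map, nb_words=20000)
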
def load_single_file(directory,
                     file_name,
                     word_to_indices_map,
                     nb_words=None,
                     reduced_label_set=False):
    """
    Loads a single file and returns a tuple of x vectors and y labels
    :param directory: directory containing the file
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; otherwise treated as OOV
    :param reduced_label_set: if True, labels are collapsed to the "o5"/"o6"/"o7" super-labels
    :return: tuple of (index vectors, multi-hot label vectors, ids)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()
    # remove the first line (header/comments)
    del lines[0]

    x_vectors = []
    y_labels = []
    id_vector = []

    for line in lines:
        arg_id, label, a1, a2 = line.split('\t')

        id_vector.append(arg_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)
        a2_tokens = vocabulary_embeddings_extractor.tokenize(a2)

        # now convert tokens to indices; set to 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]
        a2_indices = [word_to_indices_map.get(word, 2) for word in a2_tokens]

        # join them into one vector, start with 1 for start_of_sequence, add also 1 in between
        x = [1] + a1_indices + [1] + a2_indices

        # map the (possibly comma-separated) label to a multi-hot vector
        all_labels = [
            "o5_1", "o5_2", "o5_3", "o6_1", "o6_2", "o6_3", "o7_1", "o7_2",
            "o7_3", "o7_4", "o8_1", "o8_4", "o8_5", "o9_1", "o9_2", "o9_3",
            "o9_4"
        ]
        if reduced_label_set:
            all_labels = ["o5", "o6", "o7"]

        # zeros vector y with one slot per label
        y = np.zeros(len(all_labels))
        for l in label.split(','):
            # with the reduced set, collapse e.g. "o5_2" to its super-label
            # "o5"; with the full set, look up the label itself
            key = l.split('_')[0] if reduced_label_set else l
            # and set that slot to one
            y[all_labels.index(key)] = 1

        x_vectors.append(x)
        y_labels.append(y)

    # replace all word indices larger than nb_words with OOV
    if nb_words:
        x_vectors = [[
            2 if word_index >= nb_words else word_index for word_index in x
        ] for x in x_vectors]

    train_instances = x_vectors
    train_labels = y_labels

    return train_instances, train_labels, id_vector
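
# A minimal sketch of the multi-hot encoding above with the reduced label set
# (the label string is made up): each comma-separated label switches on one
# slot of the vector.
import numpy as np
demo_labels = ["o5", "o6", "o7"]
demo_y = np.zeros(len(demo_labels))
for demo_l in "o5_2,o7_1".split(','):
    demo_y[demo_labels.index(demo_l.split('_')[0])] = 1
print(demo_y)  # [1. 0. 1.]
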
def load_single_file_separate_args(directory, file_name, word_to_indices_map, nb_words=None):
    """
    Loads a single file and returns a tuple of x vectors and y labels
    :param directory: directory containing the file
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; otherwise treated as OOV
    :return: tuple of (a1 index vectors, a2 index vectors, labels, ids,
        turker ids, raw a1 texts, raw a2 texts)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()
    # remove the first line (header/comments)
    del lines[0]

    x_vectors_a1 = []
    x_vectors_a2 = []
    train_a1 = []
    train_a2 = []
    y_labels = []
    id_vector = []
    turkerids = []

    for line in lines:  # the header/comment line was already removed above
        toks = line.split('\t')
        if len(toks) != 5:
            raise ValueError('expected 5 tab-separated fields, got %d: %r'
                             % (len(toks), line))
        arg_id, turker_id, label, a1, a2 = toks

        id_vector.append(arg_id)
        turkerids.append(turker_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)
        a2_tokens = vocabulary_embeddings_extractor.tokenize(a2)

        # now convert tokens to indices; set to 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]
        a2_indices = [word_to_indices_map.get(word, 2) for word in a2_tokens]
        
        train_a1.append(a1)
        train_a2.append(a2)

        # keep the two arguments separate, each starting with 1 for start_of_sequence
        x1 = [1] + a1_indices
        x2 = [1] + a2_indices

        # map the label to a class index: a1 -> 2, a2 -> 0, otherwise 1
        if 'a1' == label:
            y = 2
        elif 'a2' == label:
            y = 0
        else:
            y = 1

        x_vectors_a1.append(x1)
        x_vectors_a2.append(x2)
        y_labels.append(y)

    # replace all word indices larger than nb_words with OOV
    if nb_words:
        x_vectors_a1 = [[2 if word_index >= nb_words else word_index for word_index in x] for x in x_vectors_a1]
        x_vectors_a2 = [[2 if word_index >= nb_words else word_index for word_index in x] for x in x_vectors_a2]

    train_instances_a1 = x_vectors_a1
    train_instances_a2 = x_vectors_a2
    train_labels = y_labels

    return train_instances_a1, train_instances_a2, train_labels, id_vector, turkerids, train_a1, train_a2
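
# A minimal sketch of the five-column rows this loader expects (the line below
# is made up): id, turker id, label, and the two argument texts.
demo_line = "arg_1\tturker_7\ta1\tfirst argument text\tsecond argument text\n"
demo_toks = demo_line.split('\t')
assert len(demo_toks) == 5
demo_id, demo_turker, demo_label, demo_a1, demo_a2 = demo_toks
print(demo_id, demo_turker, demo_label)  # arg_1 turker_7 a1
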
def load_single_file(directory, file_name, word_to_indices_map, nb_words=None):
    """
    Loads a single file and returns a tuple of x vectors and y labels
    :param directory: directory containing the file
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; otherwise treated as OOV
    :return: tuple of (index vectors, labels, ids, reversed index vectors,
        reversed labels)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()
    # remove the first line (header/comments)
    del lines[0]

    x_vectors = []
    y_labels = []
    id_vector = []

    x_vectors_reversed = []
    y_labels_reversed = []

    for line in lines:
        # take the id and label from the front and the two argument texts from
        # the end, so any extra middle columns are ignored
        toks = line.split('\t')
        arg_id = toks[0]
        label = toks[1]
        a1 = toks[-2]
        a2 = toks[-1]

        id_vector.append(arg_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)
        a2_tokens = vocabulary_embeddings_extractor.tokenize(a2)

        # now convert tokens to indices; set to 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]
        a2_indices = [word_to_indices_map.get(word, 2) for word in a2_tokens]

        # join them into one vector, start with 1 for start_of_sequence, add also 1 in between
        x = [1] + a1_indices + [1] + a2_indices

        # oversampling trick: also add the pair in reversed order with labels swapped
        x_reverse = [1] + a2_indices + [1] + a1_indices

        # map the label to a class index: a1 -> 0, a2 -> 2, tie -> 1
        if 'a1' == label:
            y = 0
            y_reverse = 2
        elif 'a2' == label:
            y = 2
            y_reverse = 0
        else:
            y = 1
            y_reverse = 1

        x_vectors.append(x)
        y_labels.append(y)

        x_vectors_reversed.append(x_reverse)
        y_labels_reversed.append(y_reverse)

    # replace all word indices larger than nb_words with OOV
    if nb_words:
        x_vectors = [[2 if word_index >= nb_words else word_index for word_index in x] for x in x_vectors]
        x_vectors_reversed = [[2 if word_index >= nb_words else word_index for word_index in x] for x in
                              x_vectors_reversed]

    train_instances = x_vectors
    train_labels = y_labels

    return train_instances, train_labels, id_vector, x_vectors_reversed, y_labels_reversed
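
# A minimal sketch of the reversal-based oversampling above (the index vectors
# are made up): each pair is added twice, once in reversed order with the
# a1/a2 labels swapped, doubling the training data.
demo_a1, demo_a2 = [3, 4], [5, 6]
demo_x = [1] + demo_a1 + [1] + demo_a2        # [1, 3, 4, 1, 5, 6]
demo_x_rev = [1] + demo_a2 + [1] + demo_a1    # [1, 5, 6, 1, 3, 4]
demo_label = 'a1'
if demo_label == 'a1':
    demo_y, demo_y_rev = 0, 2
elif demo_label == 'a2':
    demo_y, demo_y_rev = 2, 0
else:
    demo_y, demo_y_rev = 1, 1
print(demo_x, demo_y)          # [1, 3, 4, 1, 5, 6] 0
print(demo_x_rev, demo_y_rev)  # [1, 5, 6, 1, 3, 4] 2
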
def load_single_file(directory, file_name, word_to_indices_map, nb_words=None):
    """
    Loads a single file and returns a tuple of x vectors and y labels
    :param directory: directory containing the file
    :param file_name: file name
    :param word_to_indices_map: map from word to index
    :param nb_words: maximum word index to be kept; otherwise treated as OOV
    :return: tuple of (index vectors, labels, ids, reversed index vectors,
        reversed labels)
    """
    with open(os.path.join(directory, file_name), 'r') as f:
        lines = f.readlines()
    # remove the first line (header/comments)
    del lines[0]

    x_vectors = []
    y_labels = []
    id_vector = []

    x_vectors_reversed = []
    y_labels_reversed = []

    for line in lines:
        arg_id, label, a1, a2 = line.split('\t')

        id_vector.append(arg_id)

        a1_tokens = vocabulary_embeddings_extractor.tokenize(a1)
        a2_tokens = vocabulary_embeddings_extractor.tokenize(a2)

        # now convert tokens to indices; set to 2 for OOV
        a1_indices = [word_to_indices_map.get(word, 2) for word in a1_tokens]
        a2_indices = [word_to_indices_map.get(word, 2) for word in a2_tokens]

        # join them into one vector, start with 1 for start_of_sequence, add also 1 in between
        x = [1] + a1_indices + [1] + a2_indices

        # oversampling trick: also add the pair in reversed order with labels swapped
        x_reverse = [1] + a2_indices + [1] + a1_indices

        # map the label to a class index: a1 -> 0, a2 -> 1, otherwise 2
        if 'a1' == label:
            y = 0
            y_reverse = 1
        elif 'a2' == label:
            y = 1
            y_reverse = 0
        else:
            y = 2
            y_reverse = 2

        x_vectors.append(x)
        y_labels.append(y)

        x_vectors_reversed.append(x_reverse)
        y_labels_reversed.append(y_reverse)

    # replace all word indices larger than nb_words with OOV
    if nb_words:
        x_vectors = [[2 if word_index >= nb_words else word_index for word_index in x] for x in x_vectors]
        x_vectors_reversed = [[2 if word_index >= nb_words else word_index for word_index in x] for x in
                              x_vectors_reversed]

    train_instances = x_vectors
    train_labels = y_labels

    return train_instances, train_labels, id_vector, x_vectors_reversed, y_labels_reversed
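
# Note that the two reversed-pair variants use different class encodings: the
# earlier one maps a1 -> 0, tie -> 1, a2 -> 2, while this one maps a1 -> 0,
# a2 -> 1, otherwise 2. A consumer has to match the convention of the variant
# it uses; reference tables (the dict names are ours, for illustration only):
LABEL_CODES_EARLIER_VARIANT = {'a1': 0, 'tie': 1, 'a2': 2}
LABEL_CODES_THIS_VARIANT = {'a1': 0, 'a2': 1, 'other': 2}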