Code example #1
    @classmethod
    def from_word_vectors(cls, word_vectors, unique_labels):
        """Instantiate the vectorizer"""
        review_vocab = word_vectors
        rating_vocab = Indexer()

        # Add ratings
        for label in unique_labels:
            rating_vocab.add_and_get_index(label)

        return cls(review_vocab, rating_vocab)
Code example #2
def read_word_embeddings(embeddings_file: str) -> WordEmbeddings:
    """
    Loads the given embeddings (ASCII-formatted) into a WordEmbeddings object. Augments this with an UNK embedding
    that is the 0 vector. Reads in all embeddings with no filtering -- you should only use this for relativized
    word embedding files.
    :param embeddings_file: path to the file containing embeddings
    :return: WordEmbeddings object reflecting the words and their embeddings
    """
    word_indexer = Indexer()
    vectors = []
    # Make position 0 a PAD token, which can be useful if you need to pad
    # inputs out to a fixed length.
    word_indexer.add_and_get_index("PAD")
    # Make position 1 the UNK token
    word_indexer.add_and_get_index("UNK")
    with open(embeddings_file) as f:
        for line in f:
            if line.strip() != "":
                space_idx = line.find(' ')
                word = line[:space_idx]
                numbers = line[space_idx + 1:]
                float_numbers = [
                    float(number_str) for number_str in numbers.split()
                ]
                vector = np.array(float_numbers)
                word_indexer.add_and_get_index(word)
                # Prepend zero vectors for PAD and UNK on the first data line;
                # this has to happen here because the embedding dimension is
                # only known once the first line of the file has been read.
                if len(vectors) == 0:
                    vectors.append(np.zeros(vector.shape[0]))
                    vectors.append(np.zeros(vector.shape[0]))
                vectors.append(vector)
    # Turn vectors into a 2-D numpy array
    return WordEmbeddings(word_indexer, np.array(vectors))
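A brief usage sketch, not from the original code: it writes a tiny ASCII embeddings file in the format parsed above (one word followed by its space-separated vector components per line) and loads it. It assumes numpy, Indexer, and WordEmbeddings are importable as in the module this function comes from; the file name and values are made up.

# Hypothetical demo file with a 3-dimensional embedding per word.
with open("tiny_vectors.txt", "w") as demo:
    demo.write("the 0.418 0.24968 -0.41242\n")
    demo.write("movie 0.013441 0.23682 -0.16899\n")

embeddings = read_word_embeddings("tiny_vectors.txt")
# Indices 0 and 1 hold the zero PAD and UNK vectors, so "the" gets index 2
# and the resulting embedding matrix has shape (4, 3).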
Code example #3
class CharTokenizer:
    """
    Class to create char tokens
    """
    def __init__(self, max_word_length):
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        self.char_vocab_index.add_and_get_index(PAD_TOKEN)  # PAD is 0
        self.char_vocab_index.add_and_get_index(
            UNK_TOKEN)  # Unknown token is 1
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)

        self.max_word_length = max_word_length

    def convert_words_to_charids(self, words):
        word_charids = []
        for w in words:
            charids = []
            for c in w:
                cid = self.char_vocab_index.index_of(c)
                # Fall back to UNK for characters outside the a-z/space
                # vocabulary (assumes index_of returns -1 for unseen objects).
                if cid < 0:
                    cid = self.char_vocab_index.index_of(UNK_TOKEN)
                charids.append(cid)
            # Truncate to max_word_length, then pad with PAD (id 0).
            charids = charids[:self.max_word_length]
            if len(charids) < self.max_word_length:
                charids.extend([0] * (self.max_word_length - len(charids)))
            word_charids.append(charids)

        return word_charids
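A short usage sketch, illustrative only; PAD_TOKEN, UNK_TOKEN, and Indexer are assumed to come from the same utility module used above.

tokenizer = CharTokenizer(max_word_length=5)
char_ids = tokenizer.convert_words_to_charids(["cat", "elephant"])
# "cat" yields 3 character ids padded with PAD (0) out to length 5;
# "elephant" is truncated to its first 5 characters.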
Code example #4
class CharBaselineReader(nn.Module):
    """
    Baseline QA Model
    [Architecture]
        0) Inputs: passages and questions
        1) Embedding Layer: converts words to vectors
        2) Context2Query: computes weighted sum of question embeddings for
               each position in passage.
        3) Passage Encoder: LSTM or GRU.
        4) Question Encoder: LSTM or GRU.
        5) Question Attentive Sum: computes weighted sum of question
               hidden states.
        6) Start Position Pointer: computes scores (logits) over passage
               conditioned on the question vector.
        7) End Position Pointer: computes scores (logits) over passage
               conditioned on the question vector.

    Args:
        args: `argparse` object.

    Inputs:
        batch: a dictionary containing batched tensors.
            {
                'passages': LongTensor [batch_size, p_len],
                'questions': LongTensor [batch_size, q_len],
                'start_positions': Not used in `forward`,
                'end_positions': Not used in `forward`,
            }

    Returns:
        Logits for start positions and logits for end positions.
        Tuple: ([batch_size, p_len], [batch_size, p_len])
    """
    def __init__(self, args):
        super().__init__()

        self.args = args
        self.pad_token_id = args.pad_token_id

        # Initialize embedding layer (1)
        self.embedding = nn.Embedding(args.vocab_size, args.embedding_dim)

        # Initialize char embedding layer
        self.char_embedding = nn.Embedding(args.char_vocab_size,
                                           args.char_embedding_dim)

        # Initialize char CNN, used when char_embedding_type is not 'average'.
        # Registering it here makes its weights trainable model parameters.
        self.char_conv = nn.Conv1d(args.char_embedding_dim,
                                   args.char_embedding_dim,
                                   kernel_size=3)

        # Initialize Context2Query (2)
        self.aligned_att = AlignedAttention(args.embedding_dim,
                                            args.char_embedding_dim)

        rnn_cell = nn.LSTM if args.rnn_cell_type == 'lstm' else nn.GRU

        # Initialize passage encoder (3)
        self.passage_rnn = rnn_cell(
            args.embedding_dim * 2,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        # Initialize question encoder (4)
        self.question_rnn = rnn_cell(
            args.embedding_dim,
            args.hidden_dim,
            bidirectional=args.bidirectional,
            batch_first=True,
        )

        self.dropout = nn.Dropout(self.args.dropout)

        # Adjust hidden dimension if bidirectional RNNs are used
        _hidden_dim = (args.hidden_dim *
                       2 if args.bidirectional else args.hidden_dim)

        # Initialize attention layer for question attentive sum (5)
        self.question_att = SpanAttention(_hidden_dim)

        # Initialize bilinear layer for start positions (6)
        self.start_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize bilinear layer for end positions (7)
        self.end_output = BilinearOutput(_hidden_dim, _hidden_dim)

        # Initialize char indexer
        vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
        self.char_vocab_index = Indexer()
        for char in vocab:
            self.char_vocab_index.add_and_get_index(char)

    def load_pretrained_embeddings(self, vocabulary, path):
        """
        Loads GloVe vectors and initializes the embedding matrix.

        Args:
            vocabulary: `Vocabulary` object.
            path: Embedding path, e.g. "glove/glove.6B.300d.txt".
        """

        if self.args.embedding == 'glove':
            embedding_map = load_cached_embeddings(path)

            # Create embedding matrix. By default, embeddings are randomly
            # initialized from Uniform(-0.1, 0.1).
            embeddings = torch.zeros(
                (len(vocabulary),
                 self.args.embedding_dim)).uniform_(-0.1, 0.1)

            # Initialize pre-trained embeddings.
            num_pretrained = 0
            for (i, word) in enumerate(vocabulary.words):
                if word in embedding_map:
                    embeddings[i] = torch.tensor(embedding_map[word])
                    num_pretrained += 1

            # Place embedding matrix on GPU.
            self.embedding.weight.data = cuda(self.args, embeddings)
        else:
            # Load fastText embeddings.
            embedding_map = load_fasttext_embeddings(path)

            # Create embedding matrix. By default, embeddings are randomly
            # initialized from Uniform(-0.1, 0.1).
            embeddings = torch.zeros(
                (len(vocabulary),
                 self.args.embedding_dim)).uniform_(-0.1, 0.1)

            # Initialize pre-trained embeddings.
            num_pretrained = 0
            for (i, word) in enumerate(vocabulary.words):
                embeddings[i] = torch.tensor(
                    embedding_map.get_word_vector(word))
                num_pretrained += 1

            # Place embedding matrix on GPU.
            self.embedding.weight.data = cuda(self.args, embeddings)

        return num_pretrained

    def sorted_rnn(self, sequences, sequence_lengths, rnn):
        """
        Sorts and packs inputs, then feeds them into RNN.

        Args:
            sequences: Input sequences, [batch_size, len, dim].
            sequence_lengths: Lengths for each sequence, [batch_size].
            rnn: Registered LSTM or GRU.

        Returns:
            All hidden states, [batch_size, len, hid].
        """
        # Sort input sequences
        sorted_inputs, sorted_sequence_lengths, restoration_indices = _sort_batch_by_length(
            sequences, sequence_lengths)
        # Pack input sequences
        packed_sequence_input = pack_padded_sequence(
            sorted_inputs,
            sorted_sequence_lengths.data.long().tolist(),
            batch_first=True)
        # Run RNN
        packed_sequence_output, _ = rnn(packed_sequence_input, None)
        # Unpack hidden states
        unpacked_sequence_tensor, _ = pad_packed_sequence(
            packed_sequence_output, batch_first=True)
        # Restore the original order in the batch and return all hidden states
        return unpacked_sequence_tensor.index_select(0, restoration_indices)

    def forward(self, batch):
        # Obtain masks and lengths for passage and question.
        passage_mask = (batch['passages'] != self.pad_token_id
                        )  # [batch_size, p_len]
        question_mask = (batch['questions'] != self.pad_token_id
                         )  # [batch_size, q_len]
        passage_lengths = passage_mask.long().sum(-1)  # [batch_size]
        question_lengths = question_mask.long().sum(-1)  # [batch_size]

        # 1) Embedding Layer: Embed the passage and question.
        passage_embeddings = self.embedding(
            batch['passages'])  # [batch_size, p_len, p_dim]
        question_embeddings = self.embedding(
            batch['questions'])  # [batch_size, q_len, q_dim]

        passage_char_embeddings = self.char_embedding(
            batch['char_passages']
        )  # [batch_size, p_len, word_length, char_dim]
        question_char_embeddings = self.char_embedding(
            batch['char_questions']
        )  # [batch_size, q_len, word_length, char_dim]

        if self.args.char_embedding_type == 'average':
            # Average char embeddings baseline: collapse the per-character
            # dimension by taking the mean, giving one vector per word.
            passage_char_embeddings_avg = passage_char_embeddings.mean(dim=2)
            question_char_embeddings_avg = question_char_embeddings.mean(dim=2)

            passage_final_embeddings = torch.cat(
                [passage_embeddings, passage_char_embeddings_avg], dim=2)
            question_final_embeddings = torch.cat(
                [question_embeddings, question_char_embeddings_avg], dim=2)

        else:
            # Conv1D char embeddings. Conv1d expects [N, channels, length]:
            # fold the batch and sequence dimensions together, then move the
            # char-embedding dimension into the channel slot (a plain reshape
            # to (-1, dim, word_len) would scramble positions and channels,
            # so transpose explicitly).
            passage_char_embeddings_conv1d_input = passage_char_embeddings.reshape(
                (-1, passage_char_embeddings.shape[2],
                 passage_char_embeddings.shape[3])).transpose(1, 2)
            question_char_embeddings_conv1d_input = question_char_embeddings.reshape(
                (-1, question_char_embeddings.shape[2],
                 question_char_embeddings.shape[3])).transpose(1, 2)

            # Char CNN followed by ReLU; self.char_conv is registered in
            # __init__ so its weights are learned with the rest of the model.
            passage_char_embeddings_tmp1 = torch.relu(
                self.char_conv(passage_char_embeddings_conv1d_input))
            # Last dimension of conv1d output we want to collapse using global max pooling
            passage_char_embeddings_final = torch.nn.functional.max_pool1d(
                passage_char_embeddings_tmp1,
                passage_char_embeddings_tmp1.shape[2]).squeeze(2).reshape(
                    passage_char_embeddings.shape[0],
                    passage_char_embeddings.shape[1], -1)

            question_char_embeddings_tmp1 = torch.relu(
                self.char_conv(question_char_embeddings_conv1d_input))
            # Last dimension of conv1d output we want to collapse using global max pooling
            question_char_embeddings_final = torch.nn.functional.max_pool1d(
                question_char_embeddings_tmp1,
                question_char_embeddings_tmp1.shape[2]).squeeze(2).reshape(
                    question_char_embeddings.shape[0],
                    question_char_embeddings.shape[1], -1)

            passage_final_embeddings = torch.cat(
                [passage_embeddings, passage_char_embeddings_final], dim=2)
            question_final_embeddings = torch.cat(
                [question_embeddings, question_char_embeddings_final], dim=2)

        # 2) Context2Query: Compute weighted sum of question embeddings for
        #        each passage word and concatenate with passage embeddings.
        aligned_scores = self.aligned_att(
            passage_final_embeddings, question_final_embeddings,
            ~question_mask)  # [batch_size, p_len, q_len]
        aligned_embeddings = aligned_scores.bmm(
            question_embeddings)  # [batch_size, p_len, q_dim]
        passage_embeddings = cuda(
            self.args,
            torch.cat((passage_embeddings, aligned_embeddings), 2),
        )  # [batch_size, p_len, p_dim + q_dim]

        # 3) Passage Encoder
        passage_hidden = self.sorted_rnn(
            passage_embeddings, passage_lengths,
            self.passage_rnn)  # [batch_size, p_len, p_hid]
        passage_hidden = self.dropout(
            passage_hidden)  # [batch_size, p_len, p_hid]

        # 4) Question Encoder: Encode question embeddings.
        question_hidden = self.sorted_rnn(
            question_embeddings, question_lengths,
            self.question_rnn)  # [batch_size, q_len, q_hid]

        # 5) Question Attentive Sum: Compute weighted sum of question hidden
        #        vectors.
        question_scores = self.question_att(question_hidden, ~question_mask)
        question_vector = question_scores.unsqueeze(1).bmm(
            question_hidden).squeeze(1)
        question_vector = self.dropout(question_vector)  # [batch_size, q_hid]

        # 6) Start Position Pointer: Compute logits for start positions
        start_logits = self.start_output(passage_hidden, question_vector,
                                         ~passage_mask)  # [batch_size, p_len]

        # 7) End Position Pointer: Compute logits for end positions
        end_logits = self.end_output(passage_hidden, question_vector,
                                     ~passage_mask)  # [batch_size, p_len]

        return start_logits, end_logits  # [batch_size, p_len], [batch_size, p_len]
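The conv1d branch above turns per-character embeddings into one fixed-size vector per word via a character CNN and global max pooling. Below is a standalone sketch of just that shape transformation with made-up dimensions; it is an illustration, not code from the model.

import torch

batch_size, p_len, word_len, char_dim = 2, 4, 16, 64
char_embs = torch.randn(batch_size, p_len, word_len, char_dim)

# Fold batch and passage positions together, then move char_dim into the
# channel slot expected by Conv1d: [batch_size * p_len, char_dim, word_len].
conv_in = char_embs.reshape(-1, word_len, char_dim).transpose(1, 2)

conv = torch.nn.Conv1d(char_dim, char_dim, kernel_size=3)
conv_out = torch.relu(conv(conv_in))  # [batch*p_len, char_dim, word_len - 2]

# Global max pooling over the remaining character positions.
pooled = torch.nn.functional.max_pool1d(conv_out, conv_out.shape[2]).squeeze(2)

# One char-CNN vector per word: [batch_size, p_len, char_dim].
word_char_vectors = pooled.reshape(batch_size, p_len, -1)
print(word_char_vectors.shape)  # torch.Size([2, 4, 64])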
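sorted_rnn above follows the standard sort / pack / run / unpack / restore recipe for variable-length batches. A minimal self-contained illustration of those steps with a toy GRU (all dimensions arbitrary):

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Toy batch: 3 padded sequences of true lengths 4, 2, 3 with feature dim 5.
sequences = torch.randn(3, 4, 5)
lengths = torch.tensor([4, 2, 3])
rnn = torch.nn.GRU(5, 7, batch_first=True)

# Sort by length (descending), pack, run the RNN, unpack, restore order.
sorted_lengths, sort_idx = lengths.sort(descending=True)
restore_idx = sort_idx.argsort()
packed = pack_padded_sequence(sequences[sort_idx],
                              sorted_lengths.tolist(),
                              batch_first=True)
packed_out, _ = rnn(packed)
hidden, _ = pad_packed_sequence(packed_out, batch_first=True)
hidden = hidden[restore_idx]  # [3, max_len, 7], back in original batch order
print(hidden.shape)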
Code example #5
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report

include_test = True

tknr = TweetTokenizer()
indexer = get_indexer('indexer_15_dups.csv')
word_indexer = Indexer()
word_indexer.add_and_get_index("UNK")

train_data = get_train_data_from_csv('data/train_15_ds.csv')[0:1000]
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')[:200]
test_data = get_test_data_from_csv('data/test_15_ds.csv')[0:200]

X_train = []
Y_train = []
X_dev = []
Y_dev = []
Y_dev_true = []
X_test = []
Y_test = []
Y_test_true = []

for d in train_data:
Code example #6
                  columns=['mId', 'tmdbId', 'title'],
                  index=False)
    ''' create genres
	mId2Genre: 45463 lines, each line includes (mId, num of genres, gIds)
	Genre2Id:  20 lines, each line includes (gId, genre name)
	gId ranges from 45843 to 45862
	'''
    f = open("processed_data/mId2Genre.txt", "w")
    genreIdx = Indexer()
    for idx, row in movies.iterrows():
        mId, raw_genres = row['mId'], row['genres']
        raw_genres = raw_genres.replace("\'", "\"")
        genres_l = json.loads(raw_genres)
        f.write("%d %d" % (mId, len(genres_l)))
        for g in genres_l:
            f.write(" %d" % (genreIdx.add_and_get_index(g['name']) + id_base))
        f.write("\n")
    f.close()

    f = open("processed_data/Genre2Id.txt", "w")
    num_genres = len(genreIdx)
    for i in range(num_genres):
        f.write("%d %s\n" % (i + id_base, genreIdx.get_object(i)))
    f.close()
    id_base += num_genres
    ''' create credits
	mId2CC.txt: 45476 lines
	each line includes (mId, num of crew/casts, cIds)
	'''
    credits = readCreditData(args, tmid2mid)
    print("credits.shape %s" % (str(credits.shape)))