def encode_sentences(desc, sentence_list, model, imdb_key=None, is_qa=False):
    """Encode a list of sentences given the model.
    """

    if desc == 'skipthought':
        # encode a sentence list directly
        features = skipthoughts.encode(model, sentence_list, verbose=False)

    elif desc == 'vis-text-embed':
        # normalize sentence lists
        norm_sentence_list = [
            utils.normalize_alphanumeric(sentence.lower())
            for sentence in sentence_list
        ]
        # the model can encode the sentence list directly
        features = model.encode(norm_sentence_list)

    elif desc.startswith('tfidf'):
        desc_dim = len(model.vocab)
        midx = model.doc_names.index(imdb_key)
        # use scipy sparse matrix when encoding stories, otherwise too huge!
        if is_qa:
            features = np.zeros((len(sentence_list), desc_dim),
                                dtype='float32')
        else:
            features = sps.dok_matrix((len(sentence_list), desc_dim),
                                      dtype='float32')

        for s, sentence in enumerate(sentence_list):
            # NOTE: use both alphanumeric and stemming normalization
            sentence = utils.normalize_stemming(
                utils.normalize_alphanumeric(sentence.lower())).split(' ')
            # for each word in the normalized sentence
            for word in sentence:
                if word not in model.vocab: continue
                widx = model.vocab.index(word)
                features[s, widx] = model.tfidf[widx][midx]

            if is_qa:  # if not sparse, use numpy.linalg.norm
                features[s] /= (np.linalg.norm(features[s]) + 1e-6)
            else:  # if sparse, use scipy.sparse.linalg.norm
                features[s] /= (sps.linalg.norm(features[s]) + 1e-6)

    elif desc == 'word2vec':
        desc_dim = model.get_vector(model.vocab[-1]).shape[0]
        features = np.zeros((len(sentence_list), desc_dim), dtype='float32')
        for s, sentence in enumerate(sentence_list):
            # NOTE: use only alphanumeric normalization, no stemming
            sentence = utils.normalize_alphanumeric(
                sentence.lower()).split(' ')
            # for each word in the normalized sentence
            for word in sentence:
                if word not in model.vocab: continue
                features[s] += model.get_vector(word)

            features[s] /= (np.linalg.norm(features[s]) + 1e-6)

    return features
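
# Illustrative usage sketch (not part of the original source): encode a couple of
# questions with the TF-IDF branch. The document type 'plot' and the imdb key below
# are assumed placeholder values.
def _demo_encode_sentences_tfidf():
    tfidf_model = encode_tfidf_model('plot', word_thresh=2)
    questions = ['who kills the shark', 'where does the story take place']
    # is_qa=True -> dense numpy array with one L2-normalized row per sentence
    qa_feats = encode_sentences('tfidf', questions, tfidf_model,
                                imdb_key='tt0073195', is_qa=True)
    print qa_feats.shape  # (2, len(tfidf_model.vocab))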
def encode_tfidf_model(document_type, word_thresh=1):
    """Load TF-IDF model.
    """

    tfidf_fname = utils.TFIDF_TEMPLATE % (document_type, word_thresh)
    check_save_directory(filename=tfidf_fname)

    if os.path.exists(tfidf_fname):
        with open(tfidf_fname, 'rb') as fid:
            TFIDF = pickle.load(fid)

    else:
        # read the story and gather words
        story, _ = mqa.get_story_qa_data('full', document_type)
        sorted_movies = sorted(story.keys())
        all_words_use = []
        for imdb_key in sorted_movies:
            all_words_use.append([])
            for sentence in story[imdb_key]:
                norm_sentence = utils.normalize_stemming(
                    utils.normalize_alphanumeric(sentence.lower()))
                all_words_use[-1].extend(norm_sentence.split(' '))

        # compute TFIDF
        TFIDF = tfidfcalc.TFIDF(sorted_movies)
        TFIDF.get_filtered_vocabulary(all_words_use, word_thresh=word_thresh)
        TFIDF.compute_tfidf(all_words_use)

        # dump to pickle file for future
        with open(tfidf_fname, 'wb') as fid:
            pickle.dump(TFIDF, fid)

    return TFIDF
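
# Illustrative note (not from the original source): encode_tfidf_model caches the fitted
# TFIDF object in a pickle whose path comes from utils.TFIDF_TEMPLATE, so the first call
# builds it from the full story set and later calls just reload it. 'split_plot' below
# is an assumed document type.
def _demo_tfidf_model_cache():
    tfidf = encode_tfidf_model('split_plot', word_thresh=2)         # built and pickled on the first run
    tfidf_cached = encode_tfidf_model('split_plot', word_thresh=2)  # reloaded from the pickle
    print len(tfidf.vocab), len(tfidf_cached.doc_names)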
def encode_w2v_gensim(sentence):
    """Encode a sentence as the L2-normalized sum of its word2vec (gensim) vectors."""
    embedding = np.zeros(300)
    sentence = utils.normalize_alphanumeric(sentence.lower())
    word_list = sentence.split()
    word_size = 0  # number of words actually found in the word2vec vocabulary
    for word in word_list:
        if word in ignore_word_list: continue
        try:
            embedding = embedding + gensim_model[word]
            if nan_check(embedding):
                print 'nan word >> ', word
                embed()  # drop into IPython to inspect the offending word
            word_size += 1
        except KeyError:
            # word is not in the gensim vocabulary, skip it
            pass

    # L2-normalize the summed embedding, matching the other encoders
    embedding = embedding / (np.linalg.norm(embedding) + 1e-6)
    assert embedding.shape == (300, )
    return embedding
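
# Illustrative sketch (not part of the original source): encode_w2v_gensim relies on
# module-level globals -- a loaded gensim word2vec model, an ignore_word_list of stop
# words, and a nan_check helper -- none of which are defined above. A hedged setup,
# with the model file name and stop-word list as assumptions:
def _demo_setup_w2v_gensim():
    global gensim_model, ignore_word_list, nan_check
    import gensim
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin', binary=True)  # assumed 300-dim model file
    ignore_word_list = set(['a', 'an', 'the', 'of'])         # assumed stop-word list
    nan_check = lambda v: bool(np.any(np.isnan(v)))
    return encode_w2v_gensim('who kills the shark at the end of the movie')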
def create_vocabulary(QAs, stories, v2i, w2v_vocab=None, word_thresh=2):
    """Create the vocabulary by taking all words in stories, questions, and answers taken together.
    Also, keep only words that appear in the word2vec model vocabulary (if provided with one).
    """

    print "Creating vocabulary.",
    if w2v_vocab is not None:
        print "Adding words based on word2vec"
    else:
        print "Adding all words"
    # Get all story words
    all_words = [word for story in stories for sent in story for word in sent]

    # Parse QAs to get actual words
    QA_words = []
    for QA in QAs:
        QA_words.append({})
        QA_words[-1]['q_w'] = utils.normalize_alphanumeric(QA.question.lower()).split(' ')
        QA_words[-1]['a_w'] = [utils.normalize_alphanumeric(answer.lower()).split(' ') for answer in QA.answers]

    # Append question and answer words to all_words
    for QAw in QA_words:
        all_words.extend(QAw['q_w'])
        for answer in QAw['a_w']:
            all_words.extend(answer)

    # threshold vocabulary, at least N instances of every word
    vocab = Counter(all_words)
    vocab = [k for k in vocab.keys() if vocab[k] >= word_thresh]

    # create vocabulary index
    for w in vocab:
        if w not in v2i:
            if w2v_vocab is None:
                # if no word2vec model is provided, add the word unconditionally
                v2i[w] = len(v2i)
            elif w in w2v_vocab:
                # otherwise keep only words that the word2vec model knows
                v2i[w] = len(v2i)

    print "Created a vocabulary of %d words. Threshold removed %.2f %% words" \
            % (len(v2i), 100. * (len(set(all_words)) - len(v2i)) / len(set(all_words)))

    return QA_words, v2i
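
# Illustrative sketch (not in the original source): build a vocabulary index from a toy
# story and QA. The QA objects only need .question and .answers attributes here, so a
# namedtuple stands in for whatever class the real data loader uses; the 'UNK' entry in
# v2i is an assumed special token.
def _demo_create_vocabulary():
    from collections import namedtuple
    ToyQA = namedtuple('ToyQA', ['question', 'answers'])
    # one movie, two sentences, already lower-cased and tokenized
    stories = [[['the', 'shark', 'attacks', 'the', 'boat'],
                ['the', 'boat', 'sinks']]]
    QAs = [ToyQA('What attacks the boat?',
                 ['the shark', 'a whale', 'a storm', 'nothing', 'a submarine'])]
    v2i = {'UNK': 0}
    QA_words, v2i = create_vocabulary(QAs, stories, v2i, word_thresh=1)
    print sorted(v2i.keys())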
def normalize_documents(stories, normalize_for=('lower', 'alphanumeric'), max_words=40):
    """Normalize all stories in the dictionary, get list of words per sentence.
    """

    for movie in stories.keys():
        for s, sentence in enumerate(stories[movie]):
            sentence = sentence.lower()
            if 'alphanumeric' in normalize_for:
                sentence = utils.normalize_alphanumeric(sentence)
            sentence = sentence.split(' ')[:max_words]
            stories[movie][s] = sentence
    return stories
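
# Illustrative sketch (not in the original source): normalize a toy stories dict into
# per-sentence word lists of the form expected by create_vocabulary above.
def _demo_normalize_documents():
    stories = {'tt0000001': ['The shark attacks the boat!', 'The boat sinks.']}
    stories = normalize_documents(stories, max_words=40)
    print stories['tt0000001'][0]  # e.g. ['the', 'shark', 'attacks', 'the', 'boat']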
def answer_length(QA):
    """Hasty student answering questions based on the length of the answers.
    """

    shortest, longest, different = {}, {}, {}
    for qa in QA:
        # get all answer lengths
        ans_length = np.zeros(5)
        for k, ans in enumerate(qa.answers):
            ans_length[k] = len(utils.normalize_stemming(utils.normalize_alphanumeric(ans)))
        # pick the shortest answer
        shortest[qa.qid] = np.argmin(ans_length)
        # pick the longest answer
        longest[qa.qid] = np.argmax(ans_length)
        # pick the answer whose length differs most from the mean
        mean_length = np.mean(ans_length)
        different[qa.qid] = np.argmax(np.abs(ans_length - mean_length))

    answer_options = {'hasty-shortest': shortest,
                      'hasty-longest': longest,
                      'hasty-different': different}
    return answer_options
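
# Illustrative sketch (not in the original source): score the hasty-student heuristics,
# assuming each QA object also carries a ground-truth correct_index field.
def _demo_answer_length_accuracy(QA):
    answer_options = answer_length(QA)
    for name, predictions in answer_options.items():
        correct = sum(1 for qa in QA if predictions[qa.qid] == qa.correct_index)
        print name, ':', 100. * correct / len(QA), '% accuracy'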