Esempio n. 1
0
def get_embedding(paragraph, question, representation_name, vocab=None):
    """Embed paragraphs and questions with the chosen text representation.

    Args:
        paragraph: Collection of paragraph texts. It is used to fit the
            vectorizer ("tfidf") or to build the corpus vocabulary ("glove").
        question: Collection of question texts, embedded with the same
            representation as the paragraphs.
        representation_name: One of "tfidf", "glove" or "bert".
        vocab: Optional fixed vocabulary for the "tfidf" representation.
            When falsy, the vectorizer is capped at 15000 features instead.
            Ignored by the other representations.

    Returns:
        A ``(paragraphs_vectors, questions_vectors)`` tuple. Both entries are
        empty lists when ``representation_name`` is not recognised (an error
        message is printed in that case).
    """
    paragraphs_vectors = []
    questions_vectors = []

    if representation_name == "tfidf":
        if vocab:
            print("vocab specified")
            vectorizer = TfidfVectorizer(vocabulary=vocab, ngram_range=(1, 3))
        else:
            vectorizer = TfidfVectorizer(max_features=15000,
                                         ngram_range=(1, 3))
        # The helper hands back the vectorizer it used, so the questions are
        # projected with that same object rather than a freshly built one.
        paragraphs_vectors, vectorizer = getTfIdfReprentation(
            paragraph, vectorizer)
        questions_vectors = vectorizer.transform(question)

    elif representation_name == "glove":
        # Only the tokens that occur in the paragraphs are kept from the
        # GloVe table, which shrinks the lookup passed to the encoder.
        corpus_vocab = CountVectorizer().fit(paragraph).vocabulary_

        glove_dict = get_gloves_dict("glove.42B.300d.txt")
        key_set = set(corpus_vocab.keys()) & set(glove_dict.keys())
        glove_dict_vocab_corpus = {key: glove_dict[key] for key in key_set}

        paragraphs_vectors = get_plong_corpus(paragraph,
                                              glove_dict_vocab_corpus)
        # Fix: the questions were previously embedded from `paragraph`, so
        # the question vectors were just a copy of the paragraph vectors.
        questions_vectors = get_plong_corpus(question,
                                             glove_dict_vocab_corpus)

    elif representation_name == "bert":
        vectorizer = Vectorizer()

        vectorizer.bert(paragraph)
        paragraphs_vectors = vectorizer.vectors

        # NOTE(review): this assumes `bert()` replaces `vectorizer.vectors`
        # on each call rather than appending to it -- confirm against the
        # Vectorizer implementation in use.
        vectorizer.bert(question)
        questions_vectors = vectorizer.vectors

    else:
        # Best-effort fallback: report the problem and return empty results.
        print("error representation_name")

    return paragraphs_vectors, questions_vectors