Beispiel #1
0
    def test_conversion(self):
        """Check that word2vec2tensor output agrees with the source word2vec text file.

        The test converts ``self.datapath``, then reads back ``self.metadata_file``
        and ``self.tensor_file`` and verifies that:

        * both files have as many rows as the word count in the word2vec header,
        * every tensor row has as many components as the header's vector size,
        * every tensor row matches the vector stored for the same word in the
          original KeyedVectors model (to 5 decimals).

        NOTE(review): ``datapath``, ``output_folder``, ``metadata_file`` and
        ``tensor_file`` are presumably prepared in ``setUp`` -- not visible here.
        """
        word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

        with utils.open(self.metadata_file, 'rb') as f:
            metadata = f.readlines()

        with utils.open(self.tensor_file, 'rb') as f:
            vectors = f.readlines()

        # the first line of the word2vec text file holds two integers:
        # the number of words and the vector size
        with utils.open(self.datapath, 'rb') as f:
            first_line = f.readline().strip()

        number_words, vector_size = map(int, first_line.split(b' '))
        self.assertTrue(len(metadata) == len(vectors) == number_words,
            ('Metadata file %s and tensor file %s imply different number of rows.'
                % (self.metadata_file, self.tensor_file)))

        # grab metadata and vectors from written file
        metadata = [word.strip() for word in metadata]
        vectors = [vector.replace(b'\t', b' ') for vector in vectors]

        # get the original vector KV model
        orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

        # check that the KV model and tensor files have the same values key-wise
        for word, vector in zip(metadata, vectors):
            word_string = word.decode("utf8")
            vector_string = vector.decode("utf8")
            vector_array = np.array([float(value) for value in vector_string.split()])
            # the header's vector size was parsed above but previously never checked
            self.assertEqual(
                len(vector_array), vector_size,
                'Vector for word %r in tensor file %s has %d components, expected %d.'
                % (word_string, self.tensor_file, len(vector_array), vector_size))
            np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
    def testConversion(self):
        """Verify the files written by word2vec2tensor against the input model.

        After converting ``self.datapath`` the test reads the metadata and
        tensor files back and asserts that their row counts equal the word
        count announced in the word2vec header, that each tensor row has the
        header's vector size, and that each row equals (to 5 decimals) the
        vector the original KeyedVectors model stores for the same word.

        NOTE(review): the ``self.*`` paths are presumably created by the test
        fixture's ``setUp`` -- not visible in this chunk.
        """
        word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

        with smart_open(self.metadata_file, 'rb') as f:
            metadata = f.readlines()

        with smart_open(self.tensor_file, 'rb') as f:
            vectors = f.readlines()

        # header line of the word2vec text file: "<number of words> <vector size>"
        with smart_open(self.datapath, 'rb') as f:
            first_line = f.readline().strip()

        number_words, vector_size = map(int, first_line.split(b' '))
        self.assertTrue(len(metadata) == len(vectors) == number_words,
            ('Metadata file %s and tensor file %s imply different number of rows.'
                % (self.metadata_file, self.tensor_file)))

        # grab metadata and vectors from written file
        metadata = [word.strip() for word in metadata]
        vectors = [vector.replace(b'\t', b' ') for vector in vectors]

        # get the original vector KV model
        orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

        # check that the KV model and tensor files have the same values key-wise
        for word, vector in zip(metadata, vectors):
            word_string = word.decode("utf8")
            vector_string = vector.decode("utf8")
            vector_array = np.array(list(map(float, vector_string.split())))
            # vector_size used to be parsed and then ignored; check it explicitly
            self.assertEqual(
                vector_array.shape, (vector_size,),
                'Tensor file %s row for word %r has shape %s, expected (%d,).'
                % (self.tensor_file, word_string, vector_array.shape, vector_size))
            np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
Beispiel #3
0
def main():
    """Train a Word2Vec model on the cleaned wiki corpus and export it.

    Steps, in order:

    1. Obtain the sentence corpus from ``Wiki().clean_corpora(should_save=True)``.
    2. Build a ``Word2Vec`` model from it (size=150, window=5, min_count=5).
    3. Call ``train`` on the same sentences for 50 epochs.
    4. Save the word vectors in binary word2vec format to ``word2vec.model``.
    5. Convert that file with ``word2vec2tensor`` using the name ``fungi_w2v.tsv``.

    NOTE(review): the model is both constructed with the sentences and then
    trained explicitly -- confirm the second pass is intended.
    """
    model_path = "word2vec.model"

    # clean_corpora returns three values; only the sentences are used here.
    sentences, _dict_fname, _corpora_fname = Wiki().clean_corpora(should_save=True)

    w2v = Word2Vec(sentences, size=150, window=5, min_count=5)

    print('Training')
    w2v.train(
        sentences=sentences,
        total_examples=w2v.corpus_count,
        epochs=50,
        total_words=w2v.corpus_total_words,
    )
    w2v.wv.save_word2vec_format(model_path, binary=True)

    print('Plotting')
    print(model_path)

    word2vec2tensor.word2vec2tensor(model_path, "fungi_w2v.tsv")
def main():
    """Train a 50-dimensional Doc2Vec model on an XML data file and export it.

    The data file is given on the command line via ``-f`` / ``--data-file``
    (required). The function:

    1. Collects records from the XML file through ``fast_iter`` with the
       ``get_text_and_metadata`` callback.
    2. Builds a training corpus from each record's ``'text'`` value.
    3. Trains a Doc2Vec model (size=50, min_count=2, iter=3).
    4. Saves it in word2vec format as ``<data file stem>_doc2vec_50dim.w2v``.
    5. Runs ``word2vec2tensor`` on that file with the name ``chemistry``.
    6. Writes ``chemistry_metadata.tsv`` with one "title<TAB>target" row per
       record under a ``Title``/``Indexed`` header.

    NOTE(review): step 6 runs after the word2vec2tensor call and presumably
    replaces a metadata file that call produces -- confirm.
    """
    globals.SPLIT_WITH_DATE = False
    globals.VOCAB_LOWERCASE = True

    # Kept from the original; neither name is assigned inside this function.
    global NUM_POS
    global NUM_NEG

    # Command-line handling: a single required option naming the data file.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-f", "--data-file", help="location of data file", required=True)
    cli_args = arg_parser.parse_args()
    globals.XML_FILE = cli_args.data_file

    source_path = globals.XML_FILE

    print("All arguments: ", cli_args)

    # fast_iter fills `records` via the get_text_and_metadata callback.
    records = []
    with open(source_path, "rb") as xml_handle:
        parse_events = etree.iterparse(
            xml_handle, events=('start', 'end'), encoding='utf-8')
        fast_iter(parse_events, get_text_and_metadata, records)

    raw_texts = [str(record['text']) for record in records]
    documents = list(read_corpus(raw_texts))

    print(documents[:2])

    d2v = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=3)
    d2v.build_vocab(documents)
    d2v.train(documents, total_examples=d2v.corpus_count, epochs=d2v.iter)

    print("Training done!")

    # Output name is derived from the data file's base name without extension.
    file_stem = os.path.splitext(os.path.basename(globals.XML_FILE))[0]
    w2v_path = file_stem + "_doc2vec_50dim.w2v"

    d2v.save_word2vec_format(w2v_path)
    print("Model saved!")

    word2vec2tensor.word2vec2tensor(w2v_path, "chemistry")

    # Binary mode with explicit UTF-8 encoding of every row.
    with open("chemistry_metadata.tsv", "wb") as tsv:
        tsv.write("Title\tIndexed\n".encode("utf8"))
        for record in records:
            row = record['article_title'] + "\t" + record['target'] + "\n"
            tsv.write(row.encode("utf8"))