Example #1
def build_glove(word2vec, target_files, output_path):
    word2vec1 = KeyedVectors(vector_size=300)
    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    buf1 = []
    buf2 = []
    contains = set()

    def add_buffer(w, f):
        nonlocal buf1, buf2
        if w not in contains:
            buf1.append(w)
            buf2.append(f)
            contains.add(w)

    def clear_buffer():
        nonlocal buf1, buf2
        buf1 = []
        buf2 = []

    for f in target_files:
        for i, s in enumerate(load_json(f), 1):
            sentence = s['description']

            for w in tokenize(sentence):
                w = w.lower()
                if w in word2vec:
                    add_buffer(w, word2vec[w])
            if i % 10 == 0 and len(buf1) > 0:
                word2vec1.add(buf1, buf2, replace=False)
                clear_buffer()
    if len(buf1) > 0:
        word2vec1.add(buf1, buf2, replace=False)

    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    KeyedVectors.save_word2vec_format(word2vec1, output_path, binary=True)
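A minimal invocation sketch, assuming gensim 3.x and purely hypothetical file names (load_json and tokenize are the project's own helpers used inside build_glove):

from gensim.models import KeyedVectors

# Hypothetical paths, for illustration only.
pretrained = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)
build_glove(pretrained,
            target_files=['train.json', 'val.json'],
            output_path='filtered_vectors.bin')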
Example #2
def new_w2v():
    with open("%s/data.para" % TPS, 'rb') as pkl_file:
        vocab_u, vocab_i = load_vocabulary(pkl_file)
    # print(vocab_u)
    print(len(vocab_u))
    print(len(vocab_i))
    print(vocab_u['love'])
    all_words = set()
    all_words = all_words.union(set(vocab_u.keys()))
    print(len(all_words))
    all_words = all_words.union(set(vocab_i.keys()))
    print(len(all_words))
    length = len(all_words)
    w2v_model = KeyedVectors.load_word2vec_format('E:/embedding/GoogleNews-vectors-negative300.bin', binary=True)
    word_list = list(all_words)
    embeds_list = []
    miss = set()
    for w in word_list:
        if w in w2v_model:
            # in_set.add(w)
            embeds = w2v_model[w]
        else:
            miss.add(w)
            embeds = np.random.uniform(-0.25, 0.25, 300)
        embeds_list.append(embeds)
    print("miss:", len(miss)/len(all_words))
    new_w2v = KeyedVectors(300)
    new_w2v.add(word_list, embeds_list)
    new_w2v.save_word2vec_format("%s/google.w2v.bin" % TPS, binary=True)
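The saved binary can be reloaded like any other word2vec file; a minimal sketch, assuming the same TPS directory placeholder as above:

from gensim.models import KeyedVectors

TPS = '.'  # placeholder directory, adjust as needed
w2v = KeyedVectors.load_word2vec_format("%s/google.w2v.bin" % TPS, binary=True)
print(w2v.vector_size)   # 300
print('love' in w2v)     # True if 'love' appeared in the review vocabulary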
Example #3
def make_bert_sentence_file(filename, bert_sent_model, labels, vec_size=300):
    # Get sentence-BERT embeddings for all labels
    embeddings = get_sentence_bert(bert_sent_model, labels)
    kv = KeyedVectors(vector_size=vec_size)
    vec_id_list = range(0, len(labels))
    kv.add(vec_id_list, embeddings)
    kv.save_word2vec_format(filename, binary=False)
    return
Example #4
def make_word2vec_file(filename, model, labels):
    # Get mean word2vec vector for all labels and write them to a file.

    kv = KeyedVectors(vector_size=model.wv.vector_size)
    vec_id_list = range(0, len(labels))

    vectors = []
    for label in labels:
        vec = get_mean_vector(model, label)
        vectors.append(vec)
    kv.add(vec_id_list, vectors)
    kv.save_word2vec_format(filename, binary=False)
    return
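get_mean_vector is not shown in this example; a minimal sketch of what such a helper might look like, assuming a whitespace-tokenized label and a trained gensim Word2Vec model:

import numpy as np

def get_mean_vector(model, label):
    # Hypothetical helper (the original is not shown): average the vectors of
    # the label's in-vocabulary tokens, falling back to a zero vector.
    words = [w for w in label.lower().split() if w in model.wv]
    if not words:
        return np.zeros(model.wv.vector_size)
    return np.mean([model.wv[w] for w in words], axis=0)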
Example #5
def reduce_word2vec_vocab(input_path, output_path, vocab):
    """
    Downsamples the vocabulary in word2vec embeddings to less storage overhead.
    Given the input path of the embeddings and the vocabulary needed, create
    a new word2vec model removing words not in the voabulary. Save this resulting
    model in the output_path.
    """
    input_model = KeyedVectors.load_word2vec_format(input_path, binary=True)
    output_model = KeyedVectors(input_model.vector_size)  # match the input dimensionality
    for word in vocab:
        if word in input_model.vocab:
            output_model.add([word], [input_model[word]])

    output_model.save_word2vec_format(output_path, binary=True)
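A usage sketch with hypothetical paths and a toy vocabulary:

# Hypothetical paths, for illustration only.
vocab = {'protein', 'cell', 'enzyme'}
reduce_word2vec_vocab('full_embeddings.bin', 'reduced_embeddings.bin', vocab)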
Example #6
def main(kv_filepath, vocab_filepath, output_filepath):

    model = KeyedVectors.load_word2vec_format(kv_filepath, binary=True)
    vocab = Vocab(vocab_filepath)
    short_kv = KeyedVectors(vector_size=model.vector_size)

    for word in vocab.word2int.keys():
        try:
            short_kv.add(word, model[word])
        except KeyError:
            continue

    short_kv.save_word2vec_format(
        os.path.join(output_filepath, 'short-vectors.bin'))
Example #7
    def save(self, filename="gensim_KeyedVectors.txt"):
        '''
        Saves the model to the specified filename as a gensim KeyedVectors in the
        text format so you can load it separately.
        '''

        # Creates an empty KeyedVectors with our embedding size
        kv = KeyedVectors(vector_size=self.hidden_layer_size)
        vectors = []
        words = []
        # Get the list of words/vectors in a consistent order
        for index, word in enumerate(self.index_to_word):
            vectors.append(self.W[index].copy())
            words.append(word)

        # Fills the KV object with our data in the right order
        kv.add(words, vectors)
        kv.save_word2vec_format(filename, binary=False)
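As the docstring notes, the resulting text-format file can be loaded back independently; a minimal sketch:

from gensim.models import KeyedVectors

# Reload the vectors written by save() above.
kv = KeyedVectors.load_word2vec_format("gensim_KeyedVectors.txt", binary=False)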
Example #8
def readOneUsernameTextFile(filename, saveToFile_vectors, saveToFile_model,
                            subreddit, username, model, count):

    update = False
    c = 0
    for batch in usernameSentenceIterator(filename):
        # A batch is a list of sentences; a sentence is a list of words.
        update = True
        if (c == 0 and count == 0):
            update = False  # don't update the first model
        c += 1
        model.build_vocab(batch, update=update)
        model.train(batch, total_examples=model.corpus_count, epochs=100)
    KeyedVectors.save_word2vec_format(
        model.wv, saveToFile_vectors,
        binary=False)  # save the vectors for ease of use
    model.save(saveToFile_model)  # save the model information
    return model
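usernameSentenceIterator is not defined in this example; a minimal sketch of what such an iterator might look like, assuming one whitespace-tokenized sentence per line and a fixed batch size:

def usernameSentenceIterator(filename, batch_size=1000):
    # Hypothetical helper (the original is not shown): yield batches of
    # tokenized sentences, where each sentence is a list of words.
    batch = []
    with open(filename, encoding='utf-8') as fh:
        for line in fh:
            words = line.strip().split()
            if words:
                batch.append(words)
            if len(batch) >= batch_size:
                yield batch
                batch = []
    if batch:
        yield batch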
Example #9
def compute_doc_vecs(experiment,
                     data_dir='./data',
                     workers=None,
                     override=False,
                     dense_vector_size=300,
                     sparse_vector_size=500000,
                     gpu=None):
    """

    Examples:

    python cli.py compute_doc_vecs wikisource --override=1 --gpu 0
    python cli.py compute_doc_vecs ocb --override=1 --gpu 1


    :param data_dir: Path to data (for input and output)
    :param experiment: Experiment name (ocb or wikisource)
    :param workers: Number of workers
    :param override: Override existing output
    :param dense_vector_size: Size of dense document vectors (avg word2vec, graph embeddings, ...)
    :param sparse_vector_size: Size of sparse document vectors (TF-IDF)
    :param gpu: Use CUDA device for Transformer models
    :return:
    """
    env = get_env()
    data_dir = Path(data_dir)

    logger.info(f'Experiment: {experiment}')

    exp = Experiment(name=experiment, env=env, data_dir=data_dir)

    exp.load_data()
    exp.filter_docs()

    models_dir = exp.models_dir
    common_kwargs = exp.get_common_kwargs()

    if not workers:
        workers = env['workers']

    logger.info(f'Using {workers} workers')

    if gpu is not None:
        logger.info(f'Using CUDA device={gpu}')
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    # TF-IDF
    out_fp = models_dir / 'tfidf.pickle'
    if override or not os.path.exists(out_fp):
        rs = TfIdfRecSys(vector_size=sparse_vector_size, **common_kwargs)
        rs.train(exp.texts)
        rs.save_to_disk(out_fp, override=override)

    # Doc2Vec
    out_fp = models_dir / 'doc2vec.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'doc2vec_512.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.get_limited_texts(512))
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'doc2vec_4096.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.get_limited_texts(4096))
        rs.save_word2vec_format(out_fp, override=override)

    # Avg GloVe
    out_fp = models_dir / 'avg_glove.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('glove'),
                                          **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    # With custom GloVe embeddings
    out_fp = models_dir / 'avg_glove_custom.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('glove_custom'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom_512.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.get_limited_texts(512))
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom_4096.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.get_limited_texts(4096))
        rs.save_word2vec_format(out_fp, override=override)

    # Transformers
    # BERT standard pooled
    out_fp = models_dir / 'bert-base-cased.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = TransformerRecSys(model_name_or_path=env['bert_dir'] +
                               '/bert-base-cased',
                               **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    # All "MEAN" transformers
    for tf_name in [
            'bert-base-cased', 'bert-large-cased', 'roberta-base',
            'roberta-large', 'legal-bert'
    ]:
        out_fp = models_dir / f'{tf_name}_mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            rs = TransformerRecSys(model_name_or_path=env['bert_dir'] + '/' +
                                   tf_name,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)

    # Longformer
    if transformers.__version__ == '2.0.0':

        from longformer.longformer import Longformer
        from transformers import RobertaTokenizer

        out_fp = models_dir / 'longformer-base-4096-mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            lf_lm = Longformer.from_pretrained(env['bert_dir'] +
                                               '/longformer-base-4096')
            lf_tokenizer = RobertaTokenizer.from_pretrained(env['bert_dir'] +
                                                            '/roberta-base')
            lf_tokenizer.max_len = lf_lm.config.max_position_embeddings

            rs = TransformerRecSys(language_model=lf_lm,
                                   tokenizer=lf_tokenizer,
                                   max_length=4096,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)

        out_fp = models_dir / 'longformer-large-4096-mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            lf_lm = Longformer.from_pretrained(env['bert_dir'] +
                                               '/longformer-large-4096')
            lf_tokenizer = RobertaTokenizer.from_pretrained(env['bert_dir'] +
                                                            '/roberta-large')
            lf_tokenizer.max_len = lf_lm.config.max_position_embeddings

            rs = TransformerRecSys(language_model=lf_lm,
                                   tokenizer=lf_tokenizer,
                                   max_length=4096,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)
    else:
        # Wait for https://github.com/allenai/longformer/pull/14
        logger.warning('Cannot run LongFormer with transformers!=2.0.0')

    # Sentence transformer
    if LooseVersion(transformers.__version__) >= LooseVersion('2.8.0'):
        # See https://github.com/UKPLab/sentence-transformers/blob/master/requirements.txt#L1
        st_models = [
            'bert-base-nli-mean-tokens',
            'bert-large-nli-mean-tokens',
            'roberta-base-nli-mean-tokens',
            'roberta-large-nli-mean-tokens',
            'bert-base-nli-stsb-mean-tokens',
            'bert-large-nli-stsb-mean-tokens',
            'roberta-base-nli-stsb-mean-tokens',
            'roberta-large-nli-stsb-mean-tokens',
        ]
        st_dir = env['datasets_dir'] + '/sentence_transformers/'

        for st_model_name in st_models:
            out_fp = models_dir / f's{st_model_name}.w2v.txt'
            if override or not os.path.exists(out_fp):
                rs = SentenceTransformerRecSys(model_name_or_path=st_dir +
                                               st_model_name,
                                               **common_kwargs)
                rs.train(exp.texts)
                rs.save_word2vec_format(out_fp, override=override)
        #    break
    else:
        logger.warning(
            'Cannot run sentence-transformers with transformers==%s' %
            transformers.__version__)

    # Citation graph embeddings

    # DeepWalk
    out_fp = models_dir / 'deepwalk.pickle'
    if override or not os.path.exists(out_fp):
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='karateclub.DeepWalk',
                                  graph_model_kwargs=dict(
                                      dimensions=dense_vector_size,
                                      workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # Diff2Vec
    """
    out_fp = models_dir / 'diff2vec.pickle'
    if override or not os.path.exists(out_fp):
        diff2vec = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            graph_model_cls='karateclub.Diff2Vec',
            graph_model_kwargs=dict(dimensions=dense_vector_size, workers=workers),
            **common_kwargs
        )
        diff2vec.train(exp.cits)
        diff2vec.save_to_disk(out_fp, override=override)
    """

    # Walklets
    out_fp = models_dir / 'walklets.pickle'
    if override or not os.path.exists(out_fp):
        walklets_window_size = 5  # or 3
        walklets_dim = int(dense_vector_size /
                           walklets_window_size)  # must be int
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='karateclub.Walklets',
                                  graph_model_kwargs=dict(
                                      dimensions=walklets_dim,
                                      window_size=walklets_window_size,
                                      workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # Node2Vec
    out_fp = models_dir / 'node2vec.pickle'
    if override or not os.path.exists(out_fp):
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='node2vec.Node2Vec',
                                  graph_model_kwargs=dict(
                                      dimensions=dense_vector_size,
                                      workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # NodeSketch
    """
    out_fp = models_dir / 'nodesketch.pickle'
    if override or not os.path.exists(out_fp):
        nodesketch = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            graph_model_cls='karateclub.NodeSketch',
            graph_model_kwargs=dict(dimensions=dense_vector_size),
            **common_kwargs
        )
        nodesketch.train(exp.cits)
        nodesketch.save_to_disk(out_fp, override=override)
    """

    # BoostNE
    out_fp = models_dir / 'boostne.pickle'
    if override or not os.path.exists(out_fp):
        boostne_iters = 9  # 14
        boostne_dim = 30  # 20

        assert boostne_dim * (boostne_iters + 1) == dense_vector_size

        boostne = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            # vector_size=dense_vector_size,
            graph_model_cls='karateclub.BoostNE',
            graph_model_kwargs=dict(
                dimensions=boostne_dim,  # 8
                order=2,  # 2
                iterations=boostne_iters,  # 16
                alpha=0.01,
            ),
            # Take only embedding from last boosting
            # node_embedding_slice=slice(dense_vector_size * boostne_iters, dense_vector_size * (boostne_iters + 1)),
            **common_kwargs)
        boostne.train(exp.cits)
        boostne.save_to_disk(out_fp, override=override)

    # Poincare
    from gensim.models.poincare import PoincareModel
    out_fp = models_dir / 'poincare.w2v.txt'
    if override or not os.path.exists(out_fp):
        poincare_model = PoincareModel(
            exp.cits,
            size=300,
            alpha=0.1,
            negative=10,
            workers=1,
            epsilon=1e-05,
            regularization_coeff=1.0,
            burn_in=10,
            burn_in_alpha=0.01,
            init_range=(-0.001, 0.001),
        )
        poincare_model.train(epochs=50)
        # init empty model
        poincare = KeyedVectors(vector_size=poincare_model.kv.vector_size)

        # ignore items not part of gold standard
        for doc_id in list(poincare_model.kv.vocab.keys()):
            if doc_id in exp.get_included_seeds():
                poincare.add(doc_id, poincare_model.kv.get_vector(doc_id))
        poincare.save_word2vec_format(out_fp)

    logger.info('Done')
Example #10
File: data_cli.py Project: j5bd/q
def build_avg_word_vectors(hf_dataset, w2v_path, output_path, override=False):
    """

    Run with: $ ./data_cli.py build_avg_word_vectors paperswithcode_aspects ./output/fasttext.w2v.txt ./output/pwc_doc_id2avg_fasttext.w2v.txt

    :param hf_dataset:
    :param w2v_path:
    :param output_path:
    :param override:
    :return:
    """
    stop_words = 'english'
    count_vector_size = 100000

    if os.path.exists(output_path):
        if override:
            logger.debug(f'Override {output_path}')
            os.remove(output_path)
        else:
            logger.info(
                f'Stop. Output file exists already (override disabled): {output_path}'
            )
            return

    w2v_model = KeyedVectors.load_word2vec_format(w2v_path)
    doc_model = KeyedVectors(vector_size=w2v_model.vector_size)

    count_vec = CountVectorizer(stop_words=stop_words,
                                analyzer='word',
                                lowercase=True,
                                ngram_range=(1, 1),
                                max_features=count_vector_size)

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Extract plain text
    texts = []
    doc_id2idx = {}
    idx2doc_id = {}

    for idx, doc in enumerate(docs_ds):
        # Extract plain text
        texts.append(doc['title'] + ': ' + doc['abstract'])
        doc_id2idx[doc['paper_id']] = idx
        idx2doc_id[idx] = doc['paper_id']

    # Transforms the data into a bag of words
    count_train = count_vec.fit(texts)
    idx2bow = count_vec.transform(texts)
    vidx2word = {v: k for k, v in count_train.vocabulary_.items()}

    assert len(vidx2word) == len(count_train.vocabulary_)

    logger.info(f'Vocab size: {len(count_train.vocabulary_)}')

    for idx, text in enumerate(
            tqdm(texts, total=len(texts), desc='Converting docs to vectors')):
        bow = idx2bow[idx].A[0]

        vectors = []
        weights = []

        for _idx, count in enumerate(bow):
            if count > 0:
                word = vidx2word[_idx]
                try:
                    v = w2v_model.get_vector(word)
                    vectors.append(v)
                    weights.append(count)
                except KeyError:
                    # unknown word
                    pass

        # Check if at least one document term exists as word vector
        if vectors and weights:
            # Weighted average
            doc = np.average(np.array(vectors),
                             axis=0,
                             weights=np.array(weights))

            # Add to model with doc_id
            doc_model.add([str(idx2doc_id[idx])], [doc])
        else:
            logger.debug(
                f'Cannot add document {idx2doc_id[idx]} due to missing word vectors'
            )

    # Save to disk
    doc_model.save_word2vec_format(output_path)
    logger.info(f'Saved to: {output_path}')
Example #11
File: data_cli.py Project: j5bd/q
def build_specter_vectors(hf_dataset: str,
                          specter_path: str,
                          output_path: str,
                          cuda_device: int = -1,
                          batch_size: int = 32,
                          vector_size: int = 768,
                          override=False):
    """
    Run with: $ ./data_cli.py build_specter_vectors paperswithcode_aspects ./specter_archive ./output/pwc_doc_id2specter.w2v.txt --cuda_device=5

    Download specter:
    $ wget https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/specter/archive.tar.gz
    $ tar -xzvf archive.tar.gz

    :param vector_size:
    :param output_path: ./output
    :param override:
    :param cuda_device:
    :param batch_size:
    :param hf_dataset:
    :param specter_path: Path to specter
    :return:
    """
    from specter.predict_command import predictor_from_archive
    from allennlp.models import load_archive

    # load to register
    from specter.model import Model
    from specter.data import DataReader, DataReaderFromPickled
    from specter.predictor import SpecterPredictor

    if Model and DataReader and SpecterPredictor:
        pass

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        return

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')
    papers_to_embed = [doc for doc in docs_ds]

    # Specter settings
    archive_path = os.path.join(specter_path, 'model.tar.gz')
    metadata_path = os.path.join(specter_path, 'metadata_sample.json')
    included_text_fields = 'abstract title'
    vocab_dir = os.path.join(specter_path, 'data/vocab/')

    cuda_device = int(cuda_device)

    overrides = f"{{'model':{{'predict_mode':'true','include_venue':'false'}},'dataset_reader':{{'type':'specter_data_reader','predict_mode':'true','paper_features_path':'{metadata_path}','included_text_fields': '{included_text_fields}'}},'vocabulary':{{'directory_path':'{vocab_dir}'}}}}"

    logger.info(f'SPECTER overrides: {overrides}')

    archive = load_archive(archive_path,
                           cuda_device=cuda_device,
                           overrides=overrides)

    predictor = predictor_from_archive(archive,
                                       predictor_name='specter_predictor',
                                       paper_features_path=metadata_path)

    # Batches
    def chunks(lst, chunk_size):
        """Splits a longer list to respect batch size"""
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

    batches_count = int(len(papers_to_embed) / batch_size)
    batch_embed_papers = []

    # 30min on GPU
    for batch in tqdm(chunks(papers_to_embed, batch_size),
                      total=batches_count):
        batch_out = predictor.predict_batch_json(batch)
        batch_embed_papers += batch_out

    # To keyed vectors
    doc_model = KeyedVectors(vector_size=vector_size)

    for embed_paper in tqdm(batch_embed_papers):
        doc_model.add([embed_paper['paper_id']], [embed_paper['embedding']])

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
Example #12
File: data_cli.py Project: j5bd/q
def build_transformers_vectors(hf_dataset: str,
                               model_name_or_path: str,
                               output_path: str,
                               pooling: str,
                               batch_size: int = 16,
                               override: bool = False):
    """

    $ ./data_cli.py build_transformers_vectors paperswithcode_aspects scibert-scivocab-uncased ./output/scibert-cls --pooling=cls --batch_size=16

    :param hf_dataset:
    :param model_name_or_path:
    :param output_path:
    :param pooling:
    :param batch_size:
    :param override:
    :return:
    """

    env = get_env()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pooling_strategies = ['cls', 'mean']

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        sys.exit(1)

    if pooling not in pooling_strategies:
        raise ValueError(f'Invalid pooling: {pooling}')

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Model
    model = AutoModel.from_pretrained(model_name_or_path)
    model = model.to(device)

    # Tokenize docs
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    texts = [doc['title'] + ': ' + doc['abstract'] for doc in docs_ds]

    inputs = tokenizer(texts,
                       add_special_tokens=True,
                       return_tensors='pt',
                       padding=True,
                       max_length=model.config.max_position_embeddings,
                       truncation=True,
                       return_token_type_ids=False,
                       return_attention_mask=True)

    ds = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    dl = DataLoader(ds, shuffle=False, batch_size=batch_size)

    # Vectors
    doc_model = KeyedVectors(vector_size=model.config.hidden_size)

    with torch.no_grad():
        for batch_idx, batch_data in enumerate(tqdm(dl, desc='Inference')):
            batch_data = tuple(t.to(device) for t in batch_data)

            outputs = model(*batch_data, return_dict=True)

            if pooling == 'cls':
                batch_embeddings = outputs['pooler_output'].detach().cpu(
                ).numpy()
            elif pooling == 'mean':
                batch_embeddings = np.mean(
                    outputs['last_hidden_state'].detach().cpu().numpy(),
                    axis=1)
            else:
                raise NotImplementedError()

            batch_ids = docs_ds[batch_idx * batch_size:batch_idx * batch_size +
                                batch_size]['paper_id']
            doc_model.add(batch_ids, batch_embeddings)

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
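Note that the 'mean' strategy above averages over padding positions as well. A sketch of masked mean pooling (an alternative, not part of the original) that restricts the average to real tokens via the attention mask:

import numpy as np

def masked_mean_pooling(last_hidden_state, attention_mask):
    # last_hidden_state: (batch, seq_len, hidden); attention_mask: (batch, seq_len)
    hidden = last_hidden_state.detach().cpu().numpy()
    mask = attention_mask.detach().cpu().numpy()[:, :, None]
    summed = (hidden * mask).sum(axis=1)
    counts = np.clip(mask.sum(axis=1), 1, None)  # avoid division by zero
    return summed / counts

Inside the inference loop, this could replace the np.mean(...) call, with batch_data[1] supplying the attention mask for the current batch.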
Example #13
import json
import numpy as np
from gensim.models import KeyedVectors as KV
from tqdm import tqdm
from gensim.models import Word2Vec

with open('index_title.json') as f:
    i_t = json.loads(f.read())
f = open('nv77k', 'r')
num_of_nodes, dim = [int(x) for x in f.readline().split()]
nv = KV(vector_size=dim)
for line in tqdm(f.readlines()):
    splits = line.split()
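    # Item-style assignment requires a gensim KeyedVectors that implements
    # __setitem__; with older gensim, collect keys/vectors and call nv.add() instead.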
    nv[i_t[splits[0]]] = np.array([float(x) for x in splits[1:]])

nv.save_word2vec_format('nv77k.emb')
Example #14
    def save(self, kvs: KeyedVectors):
        filepath = path.join(self._folder_path, kvs.name)
        kvs.save_word2vec_format(filepath, binary=True)
Example #15
def convert_word2vec_from_bin_to_txt(file_path, save_path):
    model = KeyedVectors.load_word2vec_format(file_path, binary=True)
    KeyedVectors.save_word2vec_format(model, save_path)
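A usage sketch with hypothetical paths:

# Hypothetical paths, for illustration only.
convert_word2vec_from_bin_to_txt('GoogleNews-vectors-negative300.bin',
                                 'GoogleNews-vectors-negative300.txt')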