Example #1
def test_encode_paper():
    tokenizer, model = oagbert("oagbert-v2-test")
    title = "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
    abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation..."
    authors = [
        "Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"
    ]
    venue = "north american chapter of the association for computational linguistics"
    affiliations = ["Google"]
    concepts = [
        "language model", "natural language inference", "question answering"
    ]
    # encode paper
    paper_info = model.encode_paper(
        title=title,
        abstract=abstract,
        venue=venue,
        authors=authors,
        concepts=concepts,
        affiliations=affiliations,
        reduction="max",
    )

    assert len(paper_info) == 5
    assert paper_info["text"][0]["type"] == "TEXT"
    assert len(paper_info["authors"]) == 4
    assert len(paper_info["venue"][0]["token_ids"]) == 9
    assert tuple(paper_info["text"][0]["sequence_output"].shape) == (43, 768)
    assert len(paper_info["text"][0]["pooled_output"]) == 768
Example #2
    def __init__(self, app: str, model: str, **kwargs):
        super(OAGBertInferencePipepline, self).__init__(app, model=model, **kwargs)

        # Whether to load the pretrained weights; defaults to True when not given.
        load_weights = kwargs.get("load_weights", True)

        from cogdl.oag import oagbert

        # Build the tokenizer and OAG-BERT model for the requested checkpoint.
        self.tokenizer, self.bert_model = oagbert(model, load_weights=load_weights)
Example #3
def test_oagbert():
    tokenizer, bert_model = oagbert("oagbert-test", load_weights=False)

    sequence = "CogDL is developed by KEG, Tsinghua."
    tokens = tokenizer(sequence, return_tensors="pt")
    outputs = bert_model(**tokens, checkpoint_activations=True)

    assert len(outputs) == 2
    assert tuple(outputs[0].shape) == (1, 14, 32)
    assert tuple(outputs[1].shape) == (1, 32)
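
    # A hedged follow-up (not part of the original test): the shape checks above
    # suggest outputs[0] is the per-token sequence output and outputs[1] a single
    # pooled vector per input sequence, usable as a sentence-level embedding.
    sentence_embedding = outputs[1][0]  # hidden-state vector for the one input sequence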
Example #4
def test_oagbert_v2():
    tokenizer, model = oagbert("oagbert-v2-test")
    sequence = "CogDL is developed by KEG, Tsinghua."
    span_prob, token_probs = model.calculate_span_prob(
        title=sequence,
        decode_span_type="FOS",
        decode_span="data mining",
        mask_propmt_text="Field of Study:",
        debug=False,
    )
    assert span_prob >= 0 and span_prob <= 1
    results = model.decode_beamsearch(title=sequence,
                                      decode_span_type="FOS",
                                      decode_span_length=2,
                                      beam_width=2,
                                      force_forward=False)
    assert len(results) == 2
    model.generate_title(
        abstract=
        "To enrich language models with domain knowledge is crucial but difficult. Based on the world's largest public academic graph Open Academic Graph (OAG), we pre-train an academic language model, namely OAG-BERT, which integrates massive heterogeneous entities including paper, author, concept, venue, and affiliation. To better endow OAG-BERT with the ability to capture entity information, we develop novel pre-training strategies including heterogeneous entity type embedding, entity-aware 2D positional encoding, and span-aware entity masking. For zero-shot inference, we design a special decoding strategy to allow OAG-BERT to generate entity names from scratch. We evaluate the OAG-BERT on various downstream academic tasks, including NLP benchmarks, zero-shot entity inference, heterogeneous graph link prediction, and author name disambiguation. Results demonstrate the effectiveness of the proposed pre-training approach to both comprehending academic texts and modeling knowledge from heterogeneous entities. OAG-BERT has been deployed to multiple real-world applications, such as reviewer recommendations for NSFC (National Nature Science Foundation of China) and paper tagging in the AMiner system. It is also available to the public through the CogDL package.",
        max_length=20,
    )
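
    # A hedged follow-up sketch (not part of the original test): since
    # calculate_span_prob scores one candidate span at a time, several candidate
    # fields of study can be ranked by calling it once per candidate.
    # The candidate list below is illustrative only.
    candidates = ["data mining", "machine learning"]
    scored = []
    for span in candidates:
        prob, _ = model.calculate_span_prob(
            title=sequence,
            decode_span_type="FOS",
            decode_span=span,
            mask_propmt_text="Field of Study:",
            debug=False,
        )
        scored.append((span, prob))
    # Highest-probability candidate first.
    scored.sort(key=lambda x: x[1], reverse=True)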
Example #5
import torch
from cogdl.oag import oagbert

tokenizer, bert_model = oagbert()
bert_model.eval()

sequence = [
    "CogDL is developed by KEG, Tsinghua.",
    "OAGBert is developed by KEG, Tsinghua."
]
tokens = tokenizer(sequence, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = bert_model(**tokens)

print(outputs[0])
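
# A hedged follow-up sketch (not in the original snippet): assuming, as in the
# oagbert-test example above, that outputs[1] holds one pooled vector per input
# sequence, the two sentences can be compared by cosine similarity.
sentence_embeddings = outputs[1]  # shape: (batch_size, hidden_size)
similarity = torch.nn.functional.cosine_similarity(
    sentence_embeddings[0], sentence_embeddings[1], dim=0
)
print(similarity.item())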

# Encode a paper's metadata (title, abstract, authors, venue, ...) with oagbert-v2.
from cogdl.oag import oagbert

tokenizer, model = oagbert("oagbert-v2")
title = "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation..."
authors = ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"]
venue = "north american chapter of the association for computational linguistics"
affiliations = ["Google"]
concepts = ["language model", "natural language inference", "question answering"]
# encode paper
paper_info = model.encode_paper(
    title=title,
    abstract=abstract,
    venue=venue,
    authors=authors,
    concepts=concepts,
    affiliations=affiliations,
    reduction="max",
)

for name, content in paper_info.items():
    print(name)
    print(content)
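
# A hedged follow-up sketch (not in the original snippet): based on the structure
# checked in Example #1, the pooled 768-dimensional text representation can be
# read out like this and used as a dense embedding of the whole paper.
paper_embedding = paper_info["text"][0]["pooled_output"]
print(len(paper_embedding))  # expected: 768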

# Compute paper representations with the oagbert-v2-sim similarity model.
import os
from cogdl.oag import oagbert
import torch
import torch.nn.functional as F
import numpy as np

# load the pretrained similarity model and tokenizer
tokenizer, model = oagbert("oagbert-v2-sim")
model.eval()

# Paper 1
title = "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation..."
authors = [
    "Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"
]
venue = "north american chapter of the association for computational linguistics"
affiliations = ["Google"]
concepts = [
    "language model", "natural language inference", "question answering"
]

# encode first paper
(
    input_ids,
    input_masks,
    token_type_ids,
    masked_lm_labels,
    position_ids,
    position_ids_second,
    masked_positions,