Example #1
def test_encode_paper():
    tokenizer, model = oagbert("oagbert-v2-test")
    title = "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
    abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation..."
    authors = ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"]
    venue = "north american chapter of the association for computational linguistics"
    affiliations = ["Google"]
    concepts = ["language model", "natural language inference", "question answering"]
    # encode paper
    paper_info = model.encode_paper(
        title=title,
        abstract=abstract,
        venue=venue,
        authors=authors,
        concepts=concepts,
        affiliations=affiliations,
        reduction="max",
    )

    assert len(paper_info) == 5
    assert paper_info["text"][0]["type"] == "TEXT"
    assert len(paper_info["authors"]) == 4
    assert len(paper_info["venue"][0]["token_ids"]) == 9
    assert tuple(paper_info["text"][0]["sequence_output"].shape) == (43, 768)
    assert len(paper_info["text"][0]["pooled_output"]) == 768
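For reference, the `pooled_output` checked above is a 768-dimensional text vector that can serve as a whole-paper embedding. A minimal sketch of comparing two papers with it, assuming a second paper has been encoded the same way into a hypothetical `paper_info_2` and that both vectors convert cleanly to NumPy arrays:

import numpy as np

# pooled text vectors of the two papers (768-dimensional, per the assertion above)
vec_a = np.asarray(paper_info["text"][0]["pooled_output"], dtype=np.float32)
vec_b = np.asarray(paper_info_2["text"][0]["pooled_output"], dtype=np.float32)

# cosine similarity between the two paper embeddings
cosine = float(vec_a @ vec_b / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
print("paper similarity: %.4f" % cosine)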
Example #2
    def __init__(self, app: str, model: str, **kwargs):
        super(OAGBertInferencePipepline, self).__init__(app,
                                                        model=model,
                                                        **kwargs)

        # default to loading pre-trained weights unless the caller disables it
        load_weights = kwargs.get("load_weights", True)
        self.tokenizer, self.bert_model = oagbert(model, load_weights=load_weights)
Example #3
def test_oagbert():
    tokenizer, bert_model = oagbert("oagbert-test", load_weights=False)

    sequence = "CogDL is developed by KEG, Tsinghua."
    tokens = tokenizer(sequence, return_tensors="pt")
    outputs = bert_model(**tokens, checkpoint_activations=True)

    assert len(outputs) == 2
    assert tuple(outputs[0].shape) == (1, 14, 32)
    assert tuple(outputs[1].shape) == (1, 32)
Example #4
def test_oagbert_v2():
    tokenizer, model = oagbert("oagbert-v2-test")
    sequence = "CogDL is developed by KEG, Tsinghua."
    span_prob, token_probs = model.calculate_span_prob(
        title=sequence,
        decode_span_type='FOS',
        decode_span='data mining',
        mask_propmt_text='Field of Study:',
        debug=False)
    assert span_prob >= 0 and span_prob <= 1
    results = model.decode_beamsearch(title=sequence,
                                      decode_span_type='FOS',
                                      decode_span_length=2,
                                      beam_width=2,
                                      force_forward=False)
    assert len(results) == 2
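The two returned candidates can also be printed for inspection. A small sketch, assuming each entry behaves like a (decoded span, score) pair; the test above only checks the list length:

# print the decoded Field-of-Study spans with their scores (assumed tuple layout)
for span, score in results:
    print('%s\t%.4f' % (span, score))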
Example #5
from cogdl import oagbert

tokenizer, bert_model = oagbert()

sequence = "CogDL is developed by KEG, Tsinghua."
tokens = tokenizer(sequence, return_tensors="pt")
outputs = bert_model(**tokens)

print(outputs[0].shape)
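Here `outputs[0]` holds the token-level representations and `outputs[1]` a single pooled sentence vector. A minimal sketch of comparing two sentences via the pooled vectors, assuming the default model keeps this two-element output format:

import torch
import torch.nn.functional as F

# encode a second sentence the same way
tokens_2 = tokenizer("OAG-BERT is pre-trained on the Open Academic Graph.", return_tensors="pt")
outputs_2 = bert_model(**tokens_2)

# cosine similarity of the two pooled sentence vectors
with torch.no_grad():
    similarity = F.cosine_similarity(outputs[1], outputs_2[1])
print(similarity.item())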
Example #6
from cogdl import oagbert
from cogdl.oag.utils import colored
import math

tokenizer, model = oagbert("oagbert-v2-zh")
model.eval()

title = '基于随机化矩阵分解的网络嵌入方法'
abstract = '''随着互联网的普及,越来越多的问题以社交网络这样的网络形式出现.网络通常用图数据表示,由于图数据处理的挑战性,如何从图中学习到重要的信息是当前被广泛关注的问题.网络嵌入就是通过分析图数据得到反映网络结构的特征向量,利用它们进而实现各种数据挖掘任务,例如边预测、节点分类、网络重构、标签推荐和异常检测.最近,基于矩阵分解的网络嵌入方法NetMF被提出,它在理论上统一了多种网络嵌入方法,并且在处理实际数据时表现出很好的效果.然而,在处理大规模网络时,NetMF需要极大的时间和空间开销.本文使用快速随机化特征值分解和单遍历奇异值分解技术对NetMF进行改进,提出一种高效率、且内存用量小的矩阵分解网络嵌入算法eNetMF.首先,我们提出了适合于对称稀疏矩阵的随机化特征值分解算法freigs,它在处理实际的归一化网络矩阵时比传统的截断特征值分解算法快近10倍,且几乎不损失准确度.其次,我们提出使用单遍历奇异值分解处理NetMF方法中高次近似矩阵从而避免稠密矩阵存储的技术,它大大减少了网络嵌入所需的内存用量.最后,我们提出一种简洁的、且保证分解结果对称的随机化单遍历奇异值分解算法,将它与上述技术结合得到eNetMF算法.基于5个实际的网络数据集,我们评估了eNetMF学习到的网络低维表示在多标签节点分类和边预测上的有效性.实验结果表明,使用eNetMF替代NetMF后在后续得到的多标签分类性能指标上几乎没有损失,但在处理大规模数据时有超过40倍的加速与内存用量节省.在一台32核的机器上,eNetMF仅需约1.3 h即可对含一百多万节点的YouTube数据学习到网络嵌入,内存用量仅为120GB,并得到较高质量的分类结果.此外,最近被提出的网络嵌入算法NetSMF由于图稀疏化过程的内存需求太大,无法在256 GB内存的机器上处理两个较大的网络数据,而ProNE算法则在多标签分类的结果上表现不稳定,得到的Macro-F1值都比较差.因此,eNetMF算法在结果质量上明显优于NetSMF和ProNE算法.在边预测任务上,eNetMF算法也表现出与其它方法差不多甚至更好的性能.'''

# calculate the probability of the Field-of-Study spans '机器学习' (machine learning), '网络嵌入' (network embedding) and '随机化特征值分解' (randomized eigenvalue decomposition) for this paper
print('=== Span Probability ===')
for span in ['机器学习', '网络嵌入', '随机化特征值分解']:
    span_prob, token_probs = model.calculate_span_prob(
        title=title,
        abstract=abstract,
        decode_span_type='FOS',
        decode_span=span,
        mask_propmt_text='Field of Study:',
        debug=False)
    print('%s probability: %.4f' % (span.ljust(30), span_prob))
print()

# decode a list of Field-Of-Study using beam search
concepts = []
print('=== Generated FOS ===')
for i in range(4):
    candidates = []
    for span_length in range(1, 5):
        # NOTE: feeding the already decoded concepts back in (concepts=concepts)
        # and reading each result as a (span, score) pair are assumptions about
        # decode_beamsearch; the remaining arguments mirror Example #4
        results = model.decode_beamsearch(title=title,
                                          abstract=abstract,
                                          concepts=concepts,
                                          decode_span_type='FOS',
                                          decode_span_length=span_length,
                                          beam_width=2,
                                          force_forward=False)
        candidates.append(results[0])
    # keep the best-scoring candidate across span lengths as the next concept
    candidate = sorted(candidates, key=lambda x: -x[1])[0]
    concepts.append(candidate[0])
    print('%2d. %s  %.4f' % (i + 1, candidate[0], candidate[1]))
Example #7
from cogdl import oagbert

tokenizer, model = oagbert('oagbert-v2-lm')
model.eval()

for seq, prob in model.generate_title(
        abstract=
        "To enrich language models with domain knowledge is crucial but difficult. Based on the world's largest public academic graph Open Academic Graph (OAG), we pre-train an academic language model, namely OAG-BERT, which integrates massive heterogeneous entities including paper, author, concept, venue, and affiliation. To better endow OAG-BERT with the ability to capture entity information, we develop novel pre-training strategies including heterogeneous entity type embedding, entity-aware 2D positional encoding, and span-aware entity masking. For zero-shot inference, we design a special decoding strategy to allow OAG-BERT to generate entity names from scratch. We evaluate the OAG-BERT on various downstream academic tasks, including NLP benchmarks, zero-shot entity inference, heterogeneous graph link prediction, and author name disambiguation. Results demonstrate the effectiveness of the proposed pre-training approach to both comprehending academic texts and modeling knowledge from heterogeneous entities. OAG-BERT has been deployed to multiple real-world applications, such as reviewer recommendations for NSFC (National Nature Science Foundation of China) and paper tagging in the AMiner system. It is also available to the public through the CogDL package."
):
    print('Title: %s' % seq)
    print('Perplexity: %.4f' % prob)
Example #8
import os
from cogdl import oagbert
import torch
import torch.nn.functional as F
import numpy as np

# load the pre-trained Chinese similarity model
tokenizer, model = oagbert("oagbert-v2-zh-sim")
model.eval()

# Paper 1
title = "国内外尾矿坝事故致灾因素分析"
abstract = "通过搜集已有尾矿坝事故资料,分析了国内外尾矿坝事故与坝高、筑坝工艺及致灾因素的关系。对147起尾矿坝事故的分析研究表明, 引起尾矿坝事故的主要因素为降雨,其次为地震、管理等;"

# Encode first paper
input_ids, input_masks, token_type_ids, masked_lm_labels, position_ids, position_ids_second, masked_positions, num_spans = model.build_inputs(
    title=title, abstract=abstract)
_, paper_embed_1 = model.bert.forward(
    input_ids=torch.LongTensor(input_ids).unsqueeze(0),
    token_type_ids=torch.LongTensor(token_type_ids).unsqueeze(0),
    attention_mask=torch.LongTensor(input_masks).unsqueeze(0),
    output_all_encoded_layers=False,
    checkpoint_activations=False,
    position_ids=torch.LongTensor(position_ids).unsqueeze(0),
    position_ids_second=torch.LongTensor(position_ids_second).unsqueeze(0))

# Positive Paper 2
title = "尾矿库工程特性及其安全监控系统研究"
abstract = "总结了尾矿坝工程的特殊性和复杂性.为了保证尾矿坝在全生命周期(包括运行期及其闭库后)的安全,发展尾矿库安全监控系统具有重要意义.提出了尾矿库安全监控的基础框架,分析了尾矿库安全监测的主要内容及关键问题,为保证尾矿库的安全提供强有力的科学和技术依据."
# Encode second paper
input_ids, input_masks, token_type_ids, masked_lm_labels, position_ids, position_ids_second, masked_positions, num_spans = model.build_inputs(
    title=title, abstract=abstract)
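_, paper_embed_2 = model.bert.forward(
    input_ids=torch.LongTensor(input_ids).unsqueeze(0),
    token_type_ids=torch.LongTensor(token_type_ids).unsqueeze(0),
    attention_mask=torch.LongTensor(input_masks).unsqueeze(0),
    output_all_encoded_layers=False,
    checkpoint_activations=False,
    position_ids=torch.LongTensor(position_ids).unsqueeze(0),
    position_ids_second=torch.LongTensor(position_ids_second).unsqueeze(0))

# A minimal sketch of the comparison step: L2-normalize the two pooled paper
# embeddings and take their cosine similarity. This normalize/mm recipe is one
# reasonable choice for the similarity model, not necessarily the reference one.
paper_embed_1 = F.normalize(paper_embed_1, p=2, dim=1)
paper_embed_2 = F.normalize(paper_embed_2, p=2, dim=1)
similarity = torch.mm(paper_embed_1, paper_embed_2.transpose(0, 1))
print('similarity between paper 1 and paper 2: %.4f' % similarity.item())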