Example #1
def keyphrase(text: str,
              k: int = 10,
              stopword: Optional[List[str]] = None) -> List[str]:
    """关键句抽取
    Args:
      text: str, 输入文本
      k: int, 返回 topk 个关键词
      stopword: List[str], 关键词列表,默认 None
    Return:
      List[str]
    """
    text = text.strip()
    if stopword is None:
        stopword = []

    docs = []
    for sent in split_text(text):
        words = []
        for word, _ in xmnlp.tag(sent):
            if word not in stopword:
                words.append(word)
        docs.append(words)

    tr = TextRank(docs)
    res = []
    for idx in tr.topk(k):
        res.append(''.join(docs[idx]))

    return res
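
This function is shown without its imports: it relies on "from typing import List, Optional", "import xmnlp", and on split_text and TextRank being defined in the surrounding module. A minimal usage sketch under those assumptions (the sample text and stopwords are made up for illustration):

text = ("自然语言处理是人工智能和语言学领域的分支学科。"
        "自然语言生成系统把计算机数据转化为自然语言。")
# Ask for the top 2 key sentences, treating a few function words as stopwords.
for sentence in keyphrase(text, k=2, stopword=['的', '是', '把']):
    print(sentence)

Note that each returned item is a sentence reassembled from its non-stopword tokens, not a single keyword.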
Example #2
def test_tag(lexical_data):
    res = [[('结婚', 'v'), ('的', 'u'), ('和', 'c'), ('尚未', 'd'), ('结婚', 'v'), ('的', 'u'), ('都', 'd'), ('成', 'v'), ('了', 'u'), ('和尚', 'nn')],
           [('我', 'r'), ('喜欢', 'v'), ('《', 'w'), ('瓦尔登', 'nr'), ('湖', 'n'), ('》', 'w'), ('这', 'r'), ('本书', 'r'), (',', 'w'), ('如果', 'c'), ('你', 'r'), ('也', 'd'), ('喜欢', 'v'), (',', 'w'), ('欢迎', 'v'), ('联系', 'vn'), ('我', 'r'), ('xmlee97', 'x'), ('@', 'w'), ('gmail', 'x'), ('.', 'w'), ('com', 'x'), ('一起', 's'), ('交流', 'vn')],  # NOQA
           [('<', 'w'), ('h1', 'x'), ('>', 'w'), ('谷歌', 'nt'), ('<', 'w'), ('/', 'w'), ('h1', 'x'), ('>', 'w'), ('的', 'u'), ('网址', 'n'), ('是', 'v'), ('https', 'x'), (':', 'w'), ('/', 'w'), ('/', 'w'), ('google', 'x'), ('.', 'w'), ('com', 'x')],  # NOQA
           [('现在', 't'), ('时间', 'n'), ('是', 'v'), ('2021年2月', 't')],
           [('现任', 'v'), ('美国', 'ns'), ('总统', 'nn'), ('是', 'v'), ('拜登', 'nr')]]
    preds = [xmnlp.tag(data) for data in lexical_data]
    for (y_pred, y_true) in zip(preds, res):
        assert y_pred == y_true
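
The lexical_data fixture itself is not shown here; it lives in the test suite's conftest.py. Reconstructing the raw sentences from the expected (word, tag) pairs above, a plausible, purely illustrative version could look like this (the real fixture may differ):

import pytest

@pytest.fixture
def lexical_data():
    # Sentences reassembled from the expected token sequences above.
    return [
        '结婚的和尚未结婚的都成了和尚',
        '我喜欢《瓦尔登湖》这本书,如果你也喜欢,欢迎联系我xmlee97@gmail.com一起交流',
        '<h1>谷歌</h1>的网址是https://google.com',
        '现在时间是2021年2月',
        '现任美国总统是拜登',
    ]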
Example #3
def test_tag(postag_data):
    res = [[('结婚', 'v'), ('的', 'uj'), ('和', 'c'), ('尚未', 'd'), ('结婚', 'v'),
            ('的', 'uj'), ('都', 'd'), ('成', 'n'), ('了', 'ul'), ('和尚', 'nr')],
           [('工信处', 'n'), ('女干事', 'n'), ('每月', 'r'), ('经过', 'p'), ('下属', 'v'),
            ('科室', 'n'), ('都', 'd'), ('要', 'v'), ('亲口', 'n'), ('交代', 'n'),
            ('24', 'm'), ('口', 'q'), ('交换机', 'n'), ('等', 'u'), ('技术性', 'n'),
            ('器件', 'n'), ('的', 'uj'), ('安装', 'v'), ('工作', 'vn')],
           [('他', 'r'), ('正在', 't'), ('量', 'n'), ('和服', 'nz'), ('尺寸', 'n')]]
    preds = [xmnlp.tag(data) for data in postag_data]
    for (y_pred, y_true) in zip(preds, res):
        assert y_pred == y_true
Example #4
def test_tag(postag_data):
    res = [[('结婚', 'v'), ('的', 'uj'), ('和', 'c'), ('尚未', 'd'), ('结婚', 'v'),
            ('的', 'uj'), ('都', 'd'), ('成', 'n'), ('了', 'ul'), ('和尚', 'nr')],
           [('我', 'r'), ('喜欢', 'v'), ('《瓦尔登湖》', 'book'), ('这', 'r'),
            ('本书', 'r'), (',', 'w'), ('如果', 'c'), ('你', 'r'), ('也', 'd'),
            ('喜欢', 'v'), ('欢迎', 'v'), ('联系', 'n'), ('我', 'r'),
            ('*****@*****.**', 'email')],
           [('<h1>', 'html'), ('谷歌', 'n'), ('</h1>', 'html'), ('的', 'uj'),
            ('网址', 'n'), ('是', 'v'), ('https://google.com', 'url')],
           [('现在', 't'), ('时间', 'n'), ('是', 'v'), ('2019年10月', 'datetime')]]
    preds = [xmnlp.tag(data) for data in postag_data]
    for (y_pred, y_true) in zip(preds, res):
        assert y_pred == y_true
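
Unlike the previous test, the expected output here mixes plain POS tags with entity-style labels (book, email, html, url, datetime). A small, hypothetical helper for pulling just those entity tokens out of the tagger's output, assuming xmnlp.tag returns the labels shown above:

ENTITY_TAGS = {'book', 'email', 'html', 'url', 'datetime'}

def extract_entities(text):
    # Keep only the (word, tag) pairs whose tag is one of the entity-style labels.
    return [(word, tag) for word, tag in xmnlp.tag(text) if tag in ENTITY_TAGS]

print(extract_entities('<h1>谷歌</h1>的网址是https://google.com'))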
Example #5
# -*- coding: utf-8 -*-

import xmnlp
import time

time_start = time.time()

print("开始处理")
persons = []
with open('分词处理/人民的名义.txt') as f1:
    chapter = f1.read()
    for word, tag in xmnlp.tag(chapter):
        if tag == "nr":
            print(word)
            persons.append(word)
f1.close()

persons = list(set(persons))
with open('xmnlp.txt', "w") as wf:
    for word in persons:
        wf.writelines("{}\n".format(word))
wf.close()
time_end = time.time()
print('总共耗时:', str(time_end - time_start) + 's')
print('处理完成!')
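
Converting the list to a set throws away how often each name occurs. A small variation (not part of the original script) would count the names collected in the tagging loop before deduplicating, keeping only the most frequent ones:

from collections import Counter

# Count the occurrences gathered in the tagging loop above
# (i.e., skip the set() deduplication) and keep the 20 most frequent names.
name_counts = Counter(persons)
for name, count in name_counts.most_common(20):
    print(name, count)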
Example #6

import sys
sys.path.append("..")

# Python 2 compatibility: force the default encoding to UTF-8.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')

import xmnlp
xmnlp.set_userdict('./userdict.txt')
xmnlp.set_stopword('./stopword.txt')

doc = """自然语言处理: 是人工智能和语言学领域的分支学科。
在这此领域中探讨如何处理及运用自然语言;自然语言认知则是指让电脑“懂”人类的语言。 
自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。"""

out = []
tagset = set()
for word, tag in xmnlp.tag(doc, hmm=True):
    out.append(word + ' ' + tag)
    tagset.add(tag)
print(' / '.join(out))
print()

for tag in tagset:
    print(tag, ':', xmnlp.tag_mean(tag))
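
To keep the tag legend around instead of only printing it, the same loop can be folded into a dictionary comprehension (a trivial variation, assuming tag_mean behaves as used above):

# Map each tag seen in the document to its Chinese description.
tag_legend = {tag: xmnlp.tag_mean(tag) for tag in tagset}
print(tag_legend)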
Example #7
    tagset.add(tag)
print(' / '.join(out))
print()
"""
 1.1  Print the Chinese meaning of each POS tag

"""
for tag in tagset:
    print(tag, ':', xmnlp.tag_mean(tag))

print('\n++++++++++++++++++++++++ usage 2 ++++++++++++++++++++++++\n')
"""
 2. Use the package by importing it directly (the system stopword list is not applied)

"""
import xmnlp

out = []
tagset = set()
for word, tag in xmnlp.tag(doc, hmm=False):
    out.append(word + ' ' + tag)
    tagset.add(tag)
print(' / '.join(out))
print()
"""
 2.1  Print the Chinese meaning of each POS tag

"""
for tag in tagset:
    print(tag, ':', xmnlp.tag_mean(tag))
Example #8
# -*- coding: utf-8 -*-

import xmnlp
from pymongo import MongoClient

# Create a MongoDB client (default local connection)
mongo = MongoClient()
# Select the target database and collection
db = mongo['jinyong']['xiaoshuo']
save_path = "F:/jinyong/data/persons.txt"

print("start processing...")
persons = []
# Walk every book and chapter, keeping tokens tagged 'nr' (person name).
for book_obj in db.find():
    for chapter in book_obj["chapters"]:
        for word, tag in xmnlp.tag(chapter['content']):
            if tag == "nr":
                persons.append(word)
print("save to {}...".format(save_path))
# Deduplicate the collected names and write them to disk.
persons = list(set(persons))
with open(save_path, "w") as wf:
    for word in persons:
        wf.write("{}\n".format(word))
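
Each find() call above pulls back entire book documents even though only the chapter text is used. A hypothetical refinement using a standard pymongo projection would transfer only that field; the document layout is assumed to match the code above:

# Fetch only the chapters' content field to reduce the data pulled from MongoDB;
# the rest of the processing stays the same as in the script above.
for book_obj in db.find({}, {"chapters.content": 1, "_id": 0}):
    for chapter in book_obj["chapters"]:
        for word, tag in xmnlp.tag(chapter['content']):
            if tag == "nr":
                persons.append(word)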