Code Example #1
File: __init__.py Project: mapix/Pinyin
    def __init__(self):

        # map each character to every pinyin it can take
        self.word_to_pinyins = defaultdict(list)
        f = open(FILE_WORDS, 'rb')
        for line in f:
            pinyin, words = line.strip().decode("utf-8").split()
            for item in words:
                self.word_to_pinyins[item].append(pinyin)
        f.close()

        # map each word to a single pinyin
        self.word_to_pinyin = {}
        f = open(FILE_WORD, 'rb')
        for line in f:
            word, pinyin = line.strip().decode("utf-8").split(",")
            self.word_to_pinyin[word] = pinyin
        f.close()

        # map each term to its pinyin sequence
        self.term_to_pinyin = {}
        f = open(FILE_TERM, 'rb')
        for line in f:
            term, pinyin = line.strip().decode("utf-8").split("#")
            self.term_to_pinyin[term] = pinyin.split("@")
        f.close()

        # quiet jieba, then load the user dictionary used for segmentation
        f = open(FILE_USER_DICT, 'rb')
        jieba.setLogLevel(logging.INFO)
        jieba.initialize()
        jieba.load_userdict(f)
        f.close()
Code Example #2
File: algorithm.py Project: JOHNKYON/PDF_Crawler
def init(jieba_parallel=False):
    # Load English and Chinese stop words, from nltk and zhon respectively
    global english_stopwords, chinese_stopwords
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    chinese_stopwords = {word[:-1] for word in codecs.open("stopwords.txt", "r", encoding="utf-8")}

    # Set the jieba log level
    jieba.setLogLevel("INFO")
    # Set the jieba dictionary file
    jieba.set_dictionary("./jieba_dict.txt")
    # Change jieba's temporary working directory
    jieba.tmp_dir = os.getcwd()
    # Enable parallel segmentation; the number of processes equals the number of CPU cores
    if jieba_parallel:
        jieba.enable_parallel()

    config.log.info("module algorithm has initialized successfully.")
Code Example #3
import pickle
import jieba
import json
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

jieba.setLogLevel('WARN')

num_words = 80000
maxlen = 400

tokenizer_fact = Tokenizer(num_words=num_words)

# train tokenizer
# for i in range(18):
#     print('start big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))
#     with open('./data_deal/data_cut/big_fact_cut_%d_%d_new.pkl' % (i * 100000, i * 100000 + 100000), mode='rb') as f:
#         big_fact_cut = pickle.load(f)
#     texts_cut_len = len(big_fact_cut)
#     n = 0
#     # train in batches
#     while n < texts_cut_len:
#         tokenizer_fact.fit_on_texts(texts=big_fact_cut[n:n + 10000])
#         n += 10000
#         if n < texts_cut_len:
#             print('tokenizer finish fit %d samples' % n)
#         else:
#             print('tokenizer finish fit %d samples' % texts_cut_len)
#     print('finish big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))
#
Code Example #4
def jieba_cut(sequences):
    jieba.setLogLevel(20)
    jieba.enable_parallel(8)
    for sequence in sequences:
        data = jieba.cut(sequence)
        yield ' '.join(data)
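
The examples on this page pass a mix of bare numbers and logging constants to jieba.setLogLevel. jieba hands the value to the standard logging module, so the two forms are interchangeable; a minimal sketch of the values seen in these snippets:

import logging
import jieba

jieba.setLogLevel(20)               # same as logging.INFO; hides the DEBUG "Building prefix dict..." messages
jieba.setLogLevel(logging.WARNING)  # 30: also hides INFO-level output
jieba.setLogLevel(60)               # above CRITICAL (50): silences jieba's logger entirely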
Code Example #5
File: leancloud_api.py Project: 00nanhai/picwall
def get_tag_list(filename):
    txt = filename.split('.')[0]
    jieba.setLogLevel(60)
    seg_list = jieba.cut(txt)
    return [i for i in seg_list if len(i) >= 2]
Code Example #6
File: features.py Project: leahic/beautifulsentence
	def __init__(self , datapath = '../data'):
		self.datapath = datapath if datapath.endswith('/') else datapath + '/'
		jieba.setLogLevel('NOTSET')
Code Example #7
File: extractor.py Project: SatanWoo/Carder
# -*- coding:utf-8 -*-
# Created Time: Fri Jun  5 11:59:09 2015
# Purpose: extract web article
# Mail: [email protected]
__author__ = "Wayne Ho"

import re
import os
import sys
import json
from bs4 import BeautifulSoup
import urllib, cStringIO
from PIL import Image
import jieba
import jieba.analyse
jieba.setLogLevel(60)  # disable loading messages
import logging
logging.basicConfig(
    format='%(asctime)s [%(levelname)s]  %(message)s',
    level=logging.INFO,
    filename="backtest.log",
    filemode='w'
)
logging.getLogger().addHandler(logging.StreamHandler())


def readFile(fname):
    """read a file, or via stdin
    @param fname: file name
    @type fname: str
    @return: str
Code Example #8
File: cut_sentence.py Project: kenzzuli/chat_service
"""
Word segmentation
"""
import jieba.posseg as psg
import jieba
import logging
import config
import string
from lib import stopwords

# Set jieba's log level
jieba.setLogLevel(logging.INFO)
# All lowercase letters abcdefghijklmnopqrstuvwxyz, plus "+"
lower_letters = string.ascii_lowercase + "+"
# All punctuation marks
punctuations = [" ", "?", ",", "。", "!", ":", "?", ",", ".", "!", ":"]
# Load the user dictionary
jieba.load_userdict(config.user_dict_path)


def _cut_sentence_by_word(sentence, with_pos, use_stopwords):
    """
    Split both English and Chinese at the word level
    "python和c++哪个难?" --> ["python","和","c++","哪个", "难", "?"]
    """
    if with_pos:
        ret = psg.lcut(sentence)  # the result is a list of jieba's pair objects
        ret = [(i.word, i.flag) for i in ret]  # convert the pair objects to tuples
        if use_stopwords:
            ret = [i for i in ret if i[0] not in stopwords]
        return ret
Code Example #9
File: test_wording.py Project: pipiBRH/pname_wording
def test_cut():
    jieba.setLogLevel(logging.INFO)
    # logging.basicConfig(level=logging.INFO)
    c = Cut()

    test_cases = [
        {
            "cht": {
                "word": [
                    "吋"
                ]
            },
            "eng": {
                "ansh": [],
                "dash": [],
                "hash": [],
                "nash": [
                    "475mm"
                ],
                "num": [
                    "19"
                ],
                "word": [
                    "nwb"
                ]
            },
            "input": "日本nwb 三節式雨刷 19吋/475mm",
            "num_list": [
                "19"
            ],
            "tag_list": [
                "475mm",
                "nwb",
                "三節式",
                "日本",
                "雨刷"
            ]
        },
        {

            "cht": {
                "word": [
                    "跨域",
                    "全",
                    "音域"
                ]
            },
            "eng": {
                "ansh": [
                    "o3"
                ],
                "dash": [],
                "hash": [],
                "nash": [],
                "num": [],
                "word": [
                    "s",
                    "spearx",
                    "t"
                ]
            },
            "input": "【spearx 聲特科技】 spearx 跨域美聲 t+s o3全音域留聲耳機-黑色",
            "num_list": [],
            "tag_list": [
                "o3",
                "s",
                "spearx",
                "t",
                "留聲",
                "科技",
                "美聲",
                "耳機",
                "聲特",
                "黑色"
            ]
        }
    ]

    for test_case in test_cases:
        assert test_case == c.cut(test_case['input'])
Code Example #10
"""
@author:XuMing([email protected])
@description: Configure the tokenizer
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import os

import jieba
from jieba import posseg

jieba.setLogLevel(log_level="ERROR")


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


def segment(sentence, cut_type='word', pos=False):
    """
    Segment a sentence into words
    :param sentence:
Code Example #11
File: train_eval.py Project: qiuhere/bigdata__
# -*- coding: utf-8 -*-
import re
import time
import random
import jieba
import torch
import logging
import torch.nn as nn
from torchnet import meter
from model import EncoderRNN, LuongAttnDecoderRNN
from utils.greedysearch import GreedySearchDecoder
from dataload import get_dataloader
from config import Config
jieba.setLogLevel(logging.INFO)  # suppress jieba's output messages


def maskNLLLoss(inp, target, mask):
    '''
    inp: shape [batch_size, voc_length]
    target: shape [batch_size]; view turns it into [batch_size, 1] so it has the same number of
        dimensions as inp and can be used with gather.
        target indexes inp along dim=1, giving a tensor shaped like target, [batch_size, 1];
        that dimension is then squeezed away to get [batch_size] and the negative log is taken.
        Only positions whose mask value is 1 are used when computing and averaging the loss,
        so the loss is the mean over the batch column: the average loss of a sentence at one
        position (t), and nTotal is the number of sentences that actually have a token there.
    mask: shape [batch_size]
    loss: average per-sentence loss at position t
    '''
    nTotal = mask.sum()  # padding is 0 and real tokens are 1, so the sum is the token count
    crossEntropy = -torch.log(
        torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
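
The snippet above is cut off before the masking step that the docstring describes. For reference, a minimal self-contained sketch of the usual completion (it follows the common PyTorch seq2seq pattern, not necessarily this project's exact code):

import torch

def mask_nll_loss(inp, target, mask):
    # inp: [batch_size, voc_length] probabilities; target, mask: [batch_size]
    n_total = mask.sum()
    cross_entropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    # keep only the non-padding positions, then average
    loss = cross_entropy.masked_select(mask.bool()).mean()
    return loss, n_total.item()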
Code Example #12
    obj = pickle.load(open(file_name, 'rb'))
    return obj


#------------------------------------------------------------------------------
if __name__ == '__main__':
    '''
    DEBUG
    INFO
    WARNING
    ERROR
    CRITICAL
    '''

    import logging
    jieba.setLogLevel(logging.WARNING)

    def save_ngram(n):
        global gram_level
        print('count_ngram', n, gram_level)
        count_ngram = wikichs_count_ngram(n)
        count_ngram = Counter({k: v
                               for k, v in count_ngram.items()
                               if v > 10})  #save memory
        save_obj(count_ngram, 'data/count_ngram' + str(n) + gram_level)

    gram_level = 'char'
    save_ngram(1)
    save_ngram(2)

    gram_level = 'word'
Code Example #13
'''
This class handles data conversion,
turning raw data into data the model can read.
'''

import jieba
import logging
import joblib
import numpy as np

jieba.setLogLevel(logging.INFO)  # suppress jieba logging


class DataSet:
    def __init__(self, sentence_len, dictionary_path):
        # load the Chinese dictionary
        self.__Chinese_dict = joblib.load(dictionary_path)
        self.sentence_len = int(sentence_len)  # sentence length, in words

    # convert the list of sentences into a list of encoded sequences
    def data_to_train(self, list_str):
        Chinese_dict = self.__Chinese_dict  # get the Chinese dictionary
        Chinese_reverse = {v: k for k, v in Chinese_dict.items()}  # invert the dictionary

        # encode the sentence list
        list_coding = []  # holds the entire encoded table
        # iterate over each sentence
        for row in list_str:
            text_coding = []  # holds the encoded sentence

            # tokenize
Code Example #14
import json
import jieba
import random
import plotly.express as px
import argparse
import sys

jieba.setLogLevel(20)  # disable initialization info

parser = argparse.ArgumentParser()

parser.add_argument(
    "-v",
    "--visualize",
    required=False,
    default="False",
    help="whether to visualize results",
)
parser.add_argument(
    "-l",
    "--location",
    required=False,
    default="data/known.txt",
    help="location for known user vocabulary",
)
parser.add_argument(
    "-t",
    "--typesplit",
    required=False,
    default="wo",
    help=
Code Example #15
#coding=utf8
import sys
import os
import pickle as pkl
from os.path import exists as os_exists
import time

from tqdm import tqdm
import jieba
import numpy as np
from torch.utils.data import Dataset, DataLoader

jieba.setLogLevel(log_level=0)

from PublicConfig import PublicConfig

public_config = PublicConfig()


def tokenizer(sentence, cut="jieba"):
    if not isinstance(sentence, str):
        return [public_config.UNK]

    if cut == "jieba":
        return list(jieba.cut(sentence))
    elif cut == "char":
        return list(sentence)
    else:
        raise NotImplementedError("tokenizer must be 'jieba' or 'char'")
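
A quick usage sketch for the tokenizer helper above (hypothetical calls; they assume jieba's default dictionary is available):

# hypothetical usage of tokenizer()
print(tokenizer("今天天气不错"))              # word-level tokens from jieba
print(tokenizer("今天天气不错", cut="char"))  # character-level tokens
print(tokenizer(None))                        # non-str input falls back to [public_config.UNK]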

Code Example #16
File: get_embed.py Project: WangMeng2018/ALSA
import argparse
import os
import numpy as np
import pandas as pd
import pickle
import jieba

jieba.setLogLevel(20)


def set_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_data_path', default='data\\train.csv')
    parser.add_argument('--test_data_path', default='data\\test_public.csv')
    parser.add_argument('--embed_data_path',
                        default='data\\Tencent_AILab_ChineseEmbedding.txt')
    parser.add_argument('--out_embed_dir', default='output')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = set_args()

    # load data, get vocab
    print('start load data.....')
    train_contents = pd.read_csv(args.train_data_path)['content'].tolist()
    test_contents = pd.read_csv(args.test_data_path)['content'].tolist()
    contents = train_contents + test_contents
    wordset = set()
    for content in contents:
Code Example #17
File: svm_full.py Project: fendaq/cail-1
    fin.close()

    return alltext, accu_label, law_label, time_label


def train_SVC(vec, label, class_weight=None):
    SVC = LinearSVC(class_weight=class_weight)

    #SVC = LinearSVC()
    SVC.fit(vec, label)
    return SVC


if __name__ == '__main__':
    import logging
    jieba.setLogLevel(logging.CRITICAL)
    logfilename = "train.log"
    root_logger = create_logger()
    logger = create_logger(logfilename)

    dim = int(sys.argv[1])
    seg_method = sys.argv[2]
    ngram = int(sys.argv[3])
    min_df = int(sys.argv[4])
    train_fname = sys.argv[5]
    test_filename = sys.argv[6]
    class_weight = sys.argv[7]
    if class_weight == 'none':
        class_weight = None

    #train
Code Example #18
File: __main__.py Project: aelfric/summarize
                    help="don't use the Hidden Markov Model")
parser.add_argument("-q",
                    "--quiet",
                    action="store_true",
                    default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V",
                    '--version',
                    action='version',
                    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
    jieba.setLogLevel(60)
delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
Code Example #19
    ''' Cut Chinese string into multiple sentences. '''
    para = re.sub('([,;:。!?\?])([^』」”’])', r"\1\n\2", para)
    para = re.sub('(\.{6})([^』」”’])', r"\1\n\2", para)
    para = re.sub('(\…{2})([^』」”’])', r"\1\n\2", para)
    para = re.sub('([。!?\?][』」”’])([^,。!?\?])', r'\1\n\2', para)
    para = para.rstrip()
    return para.split('\n')


if __name__ == '__main__':
    assert not os.path.exists('finished_files')
    cmd = os.popen('mkdir finished_files/')
    cmd = os.popen('mkdir finished_files/train')
    cmd = os.popen('mkdir finished_files/val')
    cmd = os.popen('mkdir finished_files/test')
    jieba.setLogLevel(logging.ERROR)
    jieba.initialize()

    # Count entries in LCSTS
    filename = 'LCSTS/DATA/PART_I.txt'
    cmd = os.popen('cat %s | grep -c "</doc>"' % filename)
    data_count = int(cmd.read().strip())

    # Data counter
    data_iter = 0
    train_iter = 0
    val_iter = 0
    test_iter = 0
    vocab_counter = Counter()

    # Collect lines for soup
Code Example #20
File: utils.py Project: alanguo001/soph
# coding: utf-8

import re
import jieba
import logging
from functools import partial


jieba.setLogLevel(logging.INFO)

PUNCTS_PATTERN = re.compile(ur"[.,;:!?'\"~\[\]\(\)\{\}_—。….,;、:!?‘’“”〕《》【】〖〗()「」~]")
SPACES_PATTERN = re.compile(ur"[\r\n\t\u00a0 ]")
SENT_SEP = u'。,!?~;:.,!?:;'


def encode_from_unicode(text):
    """Convert text to str format"""
    return text.encode('utf-8') if isinstance(text, unicode) else text


def decode_to_unicode(text):
    """Convert text to unicode format"""
    return text.decode('utf-8') if isinstance(text, str) else text


def to_halfwidth(text):
    """Convert full-width characters in the text to half-width"""
    text = decode_to_unicode(text)

    res = u''
    for uchar in text:
Code Example #21
# -*- coding: UTF-8 -*-

import sqlite3
import jieba
import logging
jieba.setLogLevel(logging.INFO)  # suppress jieba output

conn = sqlite3.connect('./QA_data/QA.db')

cursor = conn.cursor()
stop_words = []
with open('./QA_data/stop_words.txt', encoding='gbk') as f:
    for line in f.readlines():
        stop_words.append(line.strip('\n'))


def match(input_question):
    res = []
    cnt = {}
    question = list(jieba.cut(input_question, cut_all=False))  # tokenize the query string
    for word in reversed(question):  # remove stop words
        if word in stop_words:
            question.remove(word)
    for tag in question:  # build and run a query for each tag
        keyword = "'%" + tag + "%'"
        result = cursor.execute("select * from QA where tag like " + keyword)
        for row in result:
            if row[0] not in cnt.keys():
                cnt[row[0]] = 0
            cnt[row[0]] += 1  # count how many times each record is matched
    try:
Code Example #22
File: preprocess.py Project: csdaiwei/nlpwork
# [email protected]
# featurization text data in ./data folder and save as .npz file

# do not import this module

import os
import pdb
import logging
import numpy as np
from time import time

import jieba
import jieba.analyse

jieba.setLogLevel(logging.ERROR)

#########################################################
#### functions ##########################################
#########################################################

def load_data(type):
	assert type == 'train' or type == 'test'

	label_file = 'data/' + type +'2.rlabelclass'
	sample_path = 'data/' + type + '2'

	dataset = []	#list of data, each element is a tuple (filename, label, content)

	label_dict = {}
	lines = open(label_file).read().strip().split('\n')
Code Example #23
 def __init__(self, input_queue):
     self._input_queue = input_queue
     self._stopWordList()
     jieba.setLogLevel(30)
Code Example #24
File: __init__.py Project: leahic/beautifulsentence
# -*- coding: utf-8 -*-
# Licensed under the GNU GPLv2 - http://www.gnu.org/licenses/gpl-2.0.html

import sys
import time
import numpy as np
import jieba
import jieba.posseg as pseg
import cPickle as pickle

jieba.setLogLevel('NOTSET')

def property(val):
	if isinstance(val , str) or isinstance(val , unicode):
		cutlist = pseg.cut(val , HMM=False)
		return ''.join( [ flag for word , flag in cutlist ] )
	else:
		raise TypeError("param must be unicode")

def scalemax(vec):
	vec = np.array(vec , dtype = np.float32)
	if vec.max() != 0:
		vec =  vec / vec.max()
	return list(vec)

def normalize(vec):
	vec = np.array(vec , dtype = np.float32)
	denominator = np.sum( vec )
	if  denominator != 0:
		vec  = vec / denominator
	return list(vec)
Code Example #25
from typing import List
from pathlib import Path

import jieba
from torch import tensor

from ad_detector.logger import Logger
from ad_detector.config import device

jieba.setLogLevel('INFO')


def sentence2tensor(sentence: str,
                    content_size: int,
                    word2idx: dict,
                    stop_words: List[str] = None) -> tensor:
    words = jieba.lcut(sentence)  # tokenize
    if stop_words is not None:
        words = [i for i in words if i not in stop_words]  # delete stop words
    ret = list()
    for i in words:  # word -> idx
        if i not in word2idx.keys():
            word2idx[i] = len(word2idx) + 1
        ret.append(word2idx[i])
    # if len(ret) > content_size:
    #     Logger('sentence2tensor').warning('content length out of size, result will be truncated.')
    while len(ret) < content_size:  # padding
        ret.append(0)
    ret = ret[:content_size]
    return tensor(ret, device=device)
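
A quick usage sketch for sentence2tensor (hypothetical call; assumes the ad_detector imports above resolve):

# hypothetical usage of sentence2tensor()
word2idx = {}
vec = sentence2tensor("这是一条广告", content_size=8, word2idx=word2idx)
print(vec)       # a length-8 index tensor, zero-padded, on the configured device
print(word2idx)  # indices are assigned on the fly, starting from 1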
Code Example #26
File: __main__.py Project: WilliamRen/jieba
                    nargs='?', const=' ',
                    help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
parser.add_argument("-a", "--cut-all",
                    action="store_true", dest="cutall", default=False,
                    help="full pattern cutting")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                    default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
		    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
	jieba.setLogLevel(60)
delim = unicode(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

jieba.initialize()
ln = fp.readline()
while ln:
	l = ln.rstrip('\r\n')
	print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
	ln = fp.readline()

fp.close()
Code Example #27
'''
@Author: randolph
@Date: 2020-06-01 23:58:59
@LastEditors: randolph
@LastEditTime: 2020-06-03 19:49:16
@version: 1.0
@Contact: [email protected]
@Description: Use jieba to process the "Three Hundred Tang Poems" assignment
'''
import logging
import re
from collections import Counter

import jieba  # natural language processing library
import jieba.posseg as pseg

jieba.setLogLevel(logging.INFO)     # raise jieba's log level to suppress its debug output
jieba.initialize()                  # initialize jieba manually to speed up later calls

POEM_FILE = 'e:/randolph/husky_pywork/poem_300/poem.txt'        # path to the classical poetry source file


def route(ori_data):
    '''Decide which handler to run based on user input
    '''
    flag = input('')
    if flag == "作者":                          # "author"
        n = int(input(''))
        count_authors(ori_data, n)              # count author name frequencies
    elif flag == "人物":                        # "person"
        n = int(input(''))
        count_names(n)                          # count person name frequencies