Example 1
def draw_1(s):
    m = s
    l = fool.cut(s)[0]
    print(l)
    p = product_grammar(m)
    grammar = CFG.fromstring("""
    S ->NP V NP U L|NP U NP V L| NP U L V NP|L U NP V NP|L V NP U NP|NP V L U NP
    NP -> N N|r NP|NP A NP|M Q NP|N|NP U NP|A U NP|N NP|NP C NP|NP U|M NP
    VP ->V|V NP|V VP|A VP|VP NP|VP U|VP C VP|VP P|VP uguo
    V -> v|vi|vshi
    N ->n|nr|t|ns|f|nx|nz
    R ->r
    C ->c
    P ->p
    L ->R|R NP
    U ->ude|y
    A ->a|d|ad
    M ->m
    Q ->q
    """ + p)
    cp = nltk.ChartParser(grammar)
    trees = cp.parse(l)
    stree = []
    for t in trees:
        st = []
        # t.draw()
        for i in range(len(t)):
            st.append([t[i].label(), ''.join(t[i].leaves())])
        stree.append(st)
    return stree
Example 2
    def create_batches(self, train_file, batch_size, sequence_length):

        self.x_data = []
        self.y_data = []
        padding_index = self.vocab_size - 1
        for line in open(train_file):
            # Python 2 style: decode the raw bytes and drop the newline
            line = line.decode('utf-8').replace('\n', '')
            text, label = line.strip().split('\t')
            # Replace alphanumeric runs with the placeholder token 'L' before segmenting
            tokens = fool.cut(re.sub(r'\w+', ' L', text))
            seq_ids = [self.token_dictionary.get(token) for token in tokens[0] if token not in self.stop_words and
                       self.token_dictionary.get(token) is not None and not chinese.is_other_all(token)]
            # Truncate to sequence_length, then pad with the padding index
            seq_ids = seq_ids[:sequence_length]
            for _ in range(len(seq_ids), sequence_length):
                seq_ids.append(padding_index)

            self.x_data.append(seq_ids)
            self.y_data.append(self.label_dictionary.get(label))

        self.num_batches = int(len(self.x_data) / batch_size)
        self.x_data = self.x_data[:self.num_batches * batch_size]
        self.y_data = self.y_data[:self.num_batches * batch_size]

        self.x_data = np.array(self.x_data, dtype=int)
        self.y_data = np.array(self.y_data, dtype=int)
        self.x_batches = np.split(self.x_data.reshape(batch_size, -1), self.num_batches, 1)
        self.y_batches = np.split(self.y_data.reshape(batch_size, -1), self.num_batches, 1)
        self.pointer = 0
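The create_batches method above truncates every token-id sequence to sequence_length and pads shorter ones with a reserved index. A minimal, self-contained sketch of just that step (with a hypothetical pad_sequence helper and padding index):

def pad_sequence(seq_ids, sequence_length, padding_index):
    # Cut sequences that are too long, then fill the remainder with the padding index
    seq_ids = seq_ids[:sequence_length]
    return seq_ids + [padding_index] * (sequence_length - len(seq_ids))

print(pad_sequence([3, 7, 1], 5, 0))           # [3, 7, 1, 0, 0]
print(pad_sequence([3, 7, 1, 4, 9, 2], 5, 0))  # [3, 7, 1, 4, 9]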
Example 3
def word2vec(line):
    word2id_list = [0] * len(vocab_dict)
    entities = {}
    for x in person_names:
        if x in line:
            line = line.replace(x, " nnt ")
            entities[0] = x
    for x in movie_names:
        if x in line:
            line = line.replace(x, " nm ")
            entities[1] = x
    for x in genre_names:
        if x in line:
            line = line.replace(x, " ng ")
            entities[2] = x
    words, ner = fool.analysis(line)
    for entity in ner[0]:
        if (entity[2] == "person" or entity[2] == "company"):
            line = line.replace(entity[3], " nnt ")

    for word in fool.cut(line)[0]:
        # for word in list(jieba.cut(line)):
        try:
            word2id_list[int(vocab_dict[word])] = 1
        except KeyError:
            # word not in vocab_dict: skip it
            pass
    return word2id_list, entities
Example 4
def tcut():
    text = "我在北京天安门"
    words, ners = fool.analysis(text)
    print(ners)
    words = fool.pos_cut(text)
    print(words)
    fool.delete_userdict()
    print(fool.cut(text))
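Example 4 and most of the snippets below rely on the shapes of the FoolNLTK return values: fool.cut and fool.pos_cut return one result list per input sentence, and fool.analysis returns a (words, ners) pair whose ners entries contain (start, end, type, text) tuples. A small sketch (assuming FoolNLTK is installed) that makes these shapes explicit:

import fool

sentence = "我在北京天安门"
tokens = fool.cut(sentence)[0]        # token list for the single input sentence
tagged = fool.pos_cut(sentence)[0]    # list of (word, pos) tuples
words, ners = fool.analysis(sentence)
print(tokens)
print(tagged)
print(ners[0])                        # (start, end, entity_type, entity_text) tuples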
Example 5
def get_segmentation(line, print_=False):
    '''
    Return the word-segmented tokens of a line.
    '''
    res = fool.cut(line.strip())
    if print_:
        print(','.join(res[0]))
    return res[0]
Example 6
class clf_model:

    def __init__(self):
        self.model = None
        self.vectorizer = None
    def train(self):
        d_train = pd.read_excel("data_train.xlsx")

        d_train.sentence_train = d_train.sentence_train.apply(self.fun_clean)
        print("Training samples = %d" % len(d_train))

        # Specify token_pattern explicitly; otherwise sklearn silently drops single-character tokens
        self.vectorizer = TfidfVectorizer(analyzer="word", token_pattern=r"(?u)\b\w+\b")
        features = self.vectorizer.fit_transform(d_train.sentence_train)
        print("Training feature matrix shape: " + str(features.shape))

        self.model = LogisticRegression(C=10)
        self.model.fit(features, d_train.label)


    def predict_model(self, sentence):

        if sentence in ["好的", "需要", "是的", "要的", "好", "要", "是"]:
            return 1, 0.8


        sent_features = self.vectorizer.transform([sentence])
        pre_test = self.model.predict_proba(sent_features).tolist()[0]
        clf_result = pre_test.index(max(pre_test))
        score = max(pre_test)
        return clf_result, score


    def predict_rule(self, sentence):

        sentence = sentence.replace(' ', '')
        if re.findall(r'不需要|不要|停止|终止|退出|不买|不定|不订', sentence):
            return 2, 0.8
        elif re.findall(r'订|定|预定|买|购', sentence) or sentence in ["好的","需要","是的","要的","好","要","是"]:
            return 1, 0.8
        else:
            return 0, 0.8


    def fun_clean(self, sentence):

        # Entity recognition with FoolNLTK
        words, ners = fool.analysis(sentence)

        # Sort recognised entities by surface length, longest first, so longer
        # mentions are replaced before any substrings they contain
        ners = sorted(ners[0], key=lambda x: len(x[-1]), reverse=True)
        if ners:
            for ner in ners:
                sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')

        # Segment and drop stop words
        word_lst = [w for w in fool.cut(sentence)[0] if w not in stopwords]
        output_str = ' '.join(word_lst)
        output_str = re.sub(r'\s+', ' ', output_str)
        return output_str.strip()
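The fun_clean method replaces each recognised entity mention with its entity-type label so that different mentions collapse into one shared feature. A minimal stand-alone sketch of that step, assuming the (start, end, type, text) tuple layout shown in these examples and using sorted() so the longest mentions are replaced first:

import fool

def replace_entities(sentence):
    words, ners = fool.analysis(sentence)
    # Longest surface strings first, so substrings of longer mentions are not replaced prematurely
    for ner in sorted(ners[0], key=lambda x: len(x[-1]), reverse=True):
        sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')
    return sentence

print(replace_entities("我想订一张去北京的机票"))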
Example 7
def get_segmentation(line, print_=False):
    '''
    Return the word-segmented tokens of a line.
    '''
    load_dict('F:\\114代码\\i\\wordSegment\\kw.txt')
    res = fool.cut(line.strip())
    if print_:
        print(','.join(res[0]))
    return res[0]
Example 8
def classify(word, dict):

    corpus = []
    sql = "select * from T_Keywords"
    results = mysql.select(sql)
    for category in categories:
        words = ""
        for result in results:
            if result[2] == category:
                fool.load_userdict(dict)
                line = " ".join(fool.cut(result[3])[0])  #将每一类的分词拼接成一个字符串
                words = words + line
        corpus.append(words)

    exp = get_parses(word)  # fetch the explanation of the current word

    fool.load_userdict(dict)
    expwords = " ".join(fool.cut(exp)[0])  # segment the explanation
    corpus.append(expwords)

    vectorizer = CountVectorizer()
    csr_mat = vectorizer.fit_transform(corpus)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(csr_mat)
    y = np.array(categories)

    model = SVC()
    length = len(categories)
    model.fit(tfidf[0:length], y)
    predicted = model.predict(tfidf[length:])

    # insert the newly classified word into the database
    sql = "insert into T_Keywords(keyword,category,weight,explanation) values('%s','%s','%s','%s')" % (
        word, predicted[0], 1, exp)
    kid = mysql.exec(sql)

    # crawl the related links and insert them
    hrefs = get_policy(word)
    for href in hrefs:
        title = href.get('title')
        url = href.get('url')
        sql = "insert into T_Links(title,href,kid) values('%s','%s','%s')" % (
            title, url, kid)
        mysql.exec(sql)
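Example 8 classifies a new keyword by building one document per category, appending the keyword's explanation as the last document, and training an SVC on the TF-IDF rows of the category documents. A toy sketch of that core idea, with hypothetical English documents so it runs without a database:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
import numpy as np

categories = ["finance", "tech"]                      # hypothetical category labels
corpus = [
    "tax bank loan interest",                         # words collected for each category
    "chip software cloud compiler",
    "a cloud computing software platform",            # explanation of the new keyword
]
counts = CountVectorizer().fit_transform(corpus)
tfidf = TfidfTransformer().fit_transform(counts)
model = SVC().fit(tfidf[:len(categories)], np.array(categories))
print(model.predict(tfidf[len(categories):]))         # expected to print ['tech']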
Example 9
def processSentence(sentence):
    #print(fool.cut(sentence))
    #print(fool.pos_cut(sentence))
    try:
        print(fool.cut(sentence))
        print(fool.pos_cut(sentence))
        words, ners = fool.analysis(sentence)
        print(words,ners)
    except:
        pass
Example 10
	def test_seg(self):
		# Jiagu segmentation
		jiagu_result = []
		for sen in sentence:
			jiagu_result.append(jiagu.seg(sen))

		# Jieba segmentation
		jieba_result = []
		for sen in sentence:
			jieba_result.append(jieba.cut(sen))
		
		# HIT LTP (pyltp)
		pyltp_result = []
		for sen in sentence:
			pyltp_result.append(self.ltpseg.segment(sen))
			
		# HanLP
		pyhanlp_result = []
		for sen in sentence:
			words = []
			for term in pyhanlp.HanLP.segment(sen):
				words.append(term.word)
			pyhanlp_result.append(words)
		
		# THULAC (Tsinghua segmenter)
		thulac_result = []
		for sen in sentence:
			thulac_result.append(self.thu1.cut(sen, text=True).split())
			
		# NLPIR
		pynlpir_result = []
		for sen in sentence:
			pynlpir_result.append(pynlpir.segment(sen, pos_tagging=False))
			
		# SnowNLP
		snownlp_result = []
		for sen in sentence:
			snownlp_result.append(snownlp.SnowNLP(sen).words)
			
		# FoolNLTK
		fool_result = fool.cut(sentence)

		for sen, jgr, jbr, ltp, hanlp, thu, nlpir, snow, fnltk in zip(sentence, jiagu_result,
					jieba_result, pyltp_result, pyhanlp_result,
					thulac_result, pynlpir_result, snownlp_result, fool_result):
			print('Sentence:\t' + sen + '\n')
			print('Jieba:\t\t' + ' '.join(jbr))
			print('HanLP:\t\t' + ' '.join(hanlp))
			print('SnowNLP:\t' + ' '.join(snow))
			print('FoolNLTK:\t' + ' '.join(fnltk))
			print('Jiagu:\t\t' + ' '.join(jgr))
			print('LTP:\t\t' + ' '.join(ltp))
			print('THULAC:\t\t' + ' '.join(thu))
			print('NLPIR:\t\t' + ' '.join(nlpir))
			print('\n')
Example 11
def cutNewsTitleByFool(fromfilename, tofilename):
    try:
        ffile = open(fromfilename, 'r', encoding='utf8')
        tfile = open(tofilename, 'w', encoding='utf8')
        title = ffile.readline()
        while title:
            # fool.cut returns a list with one token list per input, hence [0]
            tfile.write(' '.join(fool.cut(title.strip())[0]) + '\n')
            title = ffile.readline()
        ffile.close()
        tfile.close()
    except Exception as e:
        print(e)
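Because fool.cut always returns a list of token lists, the per-line result needs the [0] index before joining. The same file-to-file segmentation can also be written with context managers (file names here are placeholders passed in by the caller):

import fool

def cut_news_titles(fromfilename, tofilename):
    # with-blocks close both files even if segmentation raises an exception
    with open(fromfilename, 'r', encoding='utf8') as ffile, \
         open(tofilename, 'w', encoding='utf8') as tfile:
        for title in ffile:
            tfile.write(' '.join(fool.cut(title.strip())[0]) + '\n')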
Example 12
    def transform_raw(self, text, sequence_length):

        if not isinstance(text, unicode):
            text = text.decode('utf-8')
        tokens = fool.cut(re.sub(r'\w+', ' L', text))[0]
        x = [self.token_dictionary.get(token) for token in tokens if not chinese.is_other_all(token)
             and token not in self.stop_words]
        x = x[:sequence_length]
        padding_index = self.vocab_size - 1
        for _ in range(len(x), sequence_length):
            x.append(padding_index)
        self.words = [token for token in tokens if not chinese.is_other_all(token)]
        return x
Example 13
def before_data_clean():
    comment_data = pd.read_excel('F:/learning/weibo/Result.xlsx')
    print(comment_data)
    text = ",".join(comment_data[0])
    text = str(text)
    print(text)

    a = fool.cut(text)
    print(a)
    cut_text = ' '.join(a[0])
    instance = pd.DataFrame(a[0], columns=["instance"])
    pd.DataFrame(instance).to_excel('F:/learning/weibo/instance.xls', encoding='utf_8_sig')
    c = Counter(a[0])
    c.most_common(30)
    pd.DataFrame(c.most_common(30)).to_excel('F:/learning/weibo/enci.xls', encoding='utf_8_sig')
Example 14
def getWordStatsWithFool(data):
    """

    :param data: tuple类型的数据,data【n】【0】是弹幕数据
    :return:
    """
    wordFrequency = {}

    for i in range(len(data)):
        barrage = data[i][0]
        # print(fool.cut(barrage))
        for word in fool.cut(barrage)[0]:
            # print(word)
            if word in wordFrequency:
                wordFrequency[word] += 1
            else:
                wordFrequency[word] = 1
    return wordFrequency
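The frequency loop above can also be expressed with collections.Counter, which handles the "seen before or not" branching itself. A sketch assuming the same data layout, i.e. data[n][0] holds the comment text:

import collections
import fool

def get_word_stats(data):
    counts = collections.Counter()
    for row in data:
        counts.update(fool.cut(row[0])[0])   # one token list per comment
    return counts

print(get_word_stats([("我在北京天安门",), ("我在北京晒太阳",)]).most_common(5))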
Example 15
def segmentation_conversion_helper(fn,
                                   list_line,
                                   sub_folder,
                                   phrase_syllable="phrase"):
    list_line_char = [[line[0], line[1], ' '.join(fool.cut(line[2])[0])]
                      for line in list_line
                      if len(line[2].replace(" ", "")) > 0]
    list_line_pinyin = [[
        line[0], line[1],
        pinyin.get(line[2], format='strip', delimiter=' ')
    ] for line in list_line if len(line[2].replace(" ", "")) > 0]
    write_line(filename=os.path.join(mandarin_kugou_root, sub_folder,
                                     fn + '_' + phrase_syllable + '_char.txt'),
               list_line=list_line_char)
    write_line(filename=os.path.join(
        mandarin_kugou_root, sub_folder,
        fn + '_' + phrase_syllable + '_pinyin.txt'),
               list_line=list_line_pinyin)
Example 16
 def fun_clean(self, sentence):
     """
     预处理函数
     :输入 用户输入语句:
     :输出 预处理结果:
     """
     # 使用foolnltk进行实体识别
     words, ners = fool.analysis(sentence)
     # 对识别结果按长度倒序排序
     ners = ners[0].sort(key=lambda x: len(x[-1]), reverse=True)
     # 如果有实体被识别出来,就将实体的字符串替换成实体类别的字符串(目的是看成一类单词,看成一种共同的特征)
     if ners:
         for ner in ners:
             sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')
     # 分词,并去除停用词
     word_lst = [w for w in fool.cut(sentence)[0] if w not in stopwords]
     output_str = ' '.join(word_lst)
     output_str = re.sub(r'\s+', ' ', output_str)
     return output_str.strip()
Example 17
def draw_1(s):
    m = s
    l = fool.cut(s)[0]
    print(l)
    p = product_grammar(m)
    grammar = CFG.fromstring("""
	S -> NP L NP|NP vshi NP y|NP L P NP|NP L P NP F|NP vshi R|T vshi R
	NP -> nr nr| nr ude n| nr n|NP ude NP|NP NP|z ude n|a ude n|v ude n|nr|n|b ude|ns ude|ns|ns ude NP|m n|m q n|A\
    |d m|m|NP c NP|NP p NP
	VP -> v NP|v VP
	L ->vshi d vshi
	P ->p|vi p
	F ->f
	T ->t
	R ->r|r NP|r ude NP
	A ->a|d a|m q|d a ude
	""" + p)
    cp = nltk.ChartParser(grammar)
    trees = cp.parse(l)
    for s in trees:
        print(s)
Example 18
    def create_dictionary(self, train_file, save_dir):
        """
        从原始文本文件中创建字典
        :param train_file: 原始训练文件文档
        :param save_dir: 词典保存路径
        :return: token_dictionary, label_dictionary, labels, vocab_size, n_classes
        """
        token_dictionary = {}
        token_index = 0
        label_dictionary = {}
        label_index = 0
        labels = []

        for line in open(train_file):
            # Python 2 style: decode the raw bytes and drop the newline
            line = line.decode('utf-8').replace('\n', '')
            text, label = line.strip().split('\t')
            # Replace alphanumeric runs with the placeholder token 'L' before segmenting
            tokens = fool.cut(re.sub(r'\w+', ' L', text))
            # print(tokens)
            if label not in label_dictionary:
                label_dictionary[label] = label_index
                labels.append(label)
                label_index += 1

            for token in tokens[0]:
                if token not in token_dictionary and not chinese.is_other_all(token) and token not in self.stop_words:
                    token_dictionary[token] = token_index
                    token_index += 1

        token_dictionary['</s>'] = token_index
        token_index += 1
        self.vocab_size = len(token_dictionary)
        self.n_classes = len(label_dictionary)
        print('Corpus Vocabulary:{0}, Classes:{1}'.format(self.vocab_size, self.n_classes))

        with open(save_dir + 'dictionary', 'w') as f:
            pickle.dump((token_dictionary, label_dictionary), f)

        self.token_dictionary = token_dictionary
        self.label_dictionary = label_dictionary
        self.labels = labels
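The heart of create_dictionary is assigning a fresh index to every token that has not been seen yet and reserving one extra index for padding. A condensed sketch of that idea, with a hypothetical stop_words set and without the file and label handling:

import fool

def build_vocab(texts, stop_words=frozenset()):
    token_dictionary = {}
    for text in texts:
        for token in fool.cut(text)[0]:
            if token not in token_dictionary and token not in stop_words:
                token_dictionary[token] = len(token_dictionary)
    token_dictionary['</s>'] = len(token_dictionary)   # padding / end-of-sequence marker
    return token_dictionary

print(build_vocab(["我在北京天安门", "我在北京晒太阳"]))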
Example 19
    def fun_clean(self, sentence):
        # Goal: preprocess the sentence by mapping the relevant entities to shared
        # placeholder symbols (helps classification accuracy) and removing stop words
        # input: sentence (the user's input)
        # output: sentence (the preprocessed result)
        """
        Preprocessing: replace the relevant entities with unified placeholder symbols
        (helps classification accuracy) and remove stop words, etc.
        """
        words, ners = fool.analysis(sentence)
        # Sort the recognised entities by surface length, longest first
        ners = sorted(ners[0], key=lambda x: len(x[-1]), reverse=True)

        if ners:
            for ner in ners:
                # str.replace returns a new string, so reassign the result
                sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')

        wordslist = fool.cut(sentence)[0]

        wordslist = [word for word in wordslist if word not in stopwords]

        sentence = ' '.join(wordslist)
        sentence = re.sub(r'\s+', ' ', sentence).strip()

        return sentence
    def predict_model(self, sentence):
        # Goal: predict the intent with the trained intent-classification model
        # input: sentence (the user's input)
        # output: clf_result (intent class), score (intent score)

        # --------------
        # Special-case replies that do not appear in the training samples
        if sentence in ["好的", "需要", "是的", "要的", "好", "要", "是"]:
            return 1, 0.8
        # --------------

        """
        TODO: run intent recognition with the trained intent-classification model
        """

        sent = self.fun_clean(' '.join(fool.cut(sentence)[0]))
        inputs = self.vectorizer.transform([sent])
        scores = self.model.predict_proba(inputs)[0]
        clf_result = np.argmax(scores, axis=0)
        score = scores[clf_result]

        return clf_result, score
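The predict_model method picks the class with the highest predicted probability and returns it together with that probability. A compact sketch of just the prediction step, assuming a fitted TfidfVectorizer and LogisticRegression as in the training code shown earlier:

import numpy as np

def predict_intent(vectorizer, model, cleaned_sentence):
    features = vectorizer.transform([cleaned_sentence])
    scores = model.predict_proba(features)[0]    # one probability per class
    clf_result = int(np.argmax(scores))
    return clf_result, float(scores[clf_result])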
Example 21
def segment_lyric_convert_pinyin_mir1k():
    openCC = OpenCC('tw2s')
    folder_lyrics_mir1k = os.path.join(mir1k_root, 'Lyrics')
    filenames_lyrics_mir1k = list(
        set(get_filenames_in_folder(folder_lyrics_mir1k)))
    for fn in filenames_lyrics_mir1k:
        fn_txt = os.path.join(folder_lyrics_mir1k, fn + '.txt')
        try:
            list_line = read_mir1k_lyrics(fn_txt)
            line_simplified = openCC.convert(list_line[0])
            line_pinyin = pinyin.get(line_simplified,
                                     format='strip',
                                     delimiter=' ')
            line_char = ' '.join(fool.cut(line_simplified)[0])
            write_lyrics_one_line(filename=os.path.join(
                mir1k_root, 'annotation', fn + '_phrase_char.txt'),
                                  line=line_char)
            write_lyrics_one_line(filename=os.path.join(
                mir1k_root, 'annotation', fn + '_phrase_pinyin.txt'),
                                  line=line_pinyin)
        except UnicodeDecodeError:
            print(fn)
Example 22
import fool
import xlrd
import xlwt

workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('My Worksheet')

data = xlrd.open_workbook('zwcg.xls')

table = data.sheet_by_name(u'Sheet1')
nrows = table.nrows

for i in range(nrows):
    text = table.row_values(i)
    # fool.cut returns a list with one token list per input, hence [0]
    worklist = fool.cut(text[0])[0]
    for j in range(len(worklist)):
        worksheet.write(i, j, label=worklist[j])

workbook.save('result.xls')
Example 23
"""
    https://github.com/rockyzhengwu/FoolNLTK
"""
import q
from pyhanlp import HanLP, JClass

with open("../test_data/1.txt", "r") as rf:
    text = rf.read()

text = text[:502]

import fool

result = fool.cut(text)

print(" ".join(result[0]))

import fool

words, ners = fool.analysis(text)
print(ners)
"""
[[
    (0, 5, 'company', '新浪科技'),
    (6, 9, 'location', '北京'),
    (10, 18, 'time', '4月29日晚间'),
    (20, 25, 'company', '搜狗公司'),
    (24, 27, 'time', '今天'),
    (31, 37, 'time', '3月31日'),
    (37, 47, 'time', '2019年第一季度'),
    (60, 65, 'time', '第一季度'),
Example 24
 def foolnltk(self, text):
     # FoolNLTK
     fool_result = fool.cut(text)
     return fool_result
Example 25
'''
Original tutorial from:

https://github.com/rockyzhengwu/FoolNLTK/blob/master/README_CH.md

'''
import fool
path=r"C:\Users\lenvov\Desktop\my_diy_dic.txt"  # text file holding the user's local custom dictionary; each line has the format: word weight
fool.load_userdict(path)  # load the custom dictionary
# The dictionary can only define word weights, not POS tags, so it does not help POS tagging
#fool.delete_userdict()  # remove the user-defined dictionary

text="习近平觉得张构架的趣多多比希斯罗机场的巧克力味的奥利奥要贵得多。"
words, ners = fool.analysis(text)  # words holds the POS-tagged segmentation (built-in dictionary only, the custom dictionary is not applied); ners holds the recognised entities (segmentation can be inaccurate even when NER is correct, which a custom dictionary can fix)
# The words list produced by entity recognition is not affected by the custom dictionary and is rarely used

print('Segmentation:', fool.cut(text), '\n')
print('Segmentation with POS tags:', fool.pos_cut(text), '\n')
print('words:', words, '\n')
print('Named entities:', ners, '\n')
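The custom dictionary referenced above is a plain text file with one "word weight" pair per line; a larger weight makes FoolNLTK more likely to keep that word as a single token. A hypothetical end-to-end sketch (file name and contents are made up for illustration):

import fool

# Write a throwaway custom dictionary: one "word weight" entry per line
with open("my_diy_dic.txt", "w", encoding="utf-8") as f:
    f.write("趣多多 100\n希斯罗机场 100\n")

fool.load_userdict("my_diy_dic.txt")
print(fool.cut("习近平觉得张构架的趣多多比希斯罗机场的巧克力味的奥利奥要贵得多。"))
fool.delete_userdict()   # drop the custom entries and fall back to the built-in dictionary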
Example 26
#!/usr/bin/env python
# -*-coding:utf-8-*-

import fool

text = "我在北京天安门看你难受香菇,一一千四百二十九"

print("no dict:", fool.cut(text))
fool.load_userdict("./test_dict.txt")
print("use dict: ", fool.cut(text))
fool.delete_userdict()
print("delete dict:", fool.cut(text))

words, ners = fool.analysis(text)
print("ners: ", ners)
Example 27
#!/usr/bin/env python
# -*-coding:utf-8-*-

import fool

text = ["我在北京天安门看你难受香菇,一一千四百二十九", "我在北京晒太阳你在非洲看雪", "千年不变的是什么", "我在北京天安门。"]

print("no dict:", fool.cut(text, ignore=True))
fool.load_userdict("./test_dict.txt")
print("use dict: ", fool.cut(text))
fool.delete_userdict()
print("delete dict:", fool.cut(text))

pos_words = fool.pos_cut(text)
print("pos result", pos_words)

words, ners = fool.analysis(text)
print("ners: ", ners)

ners = fool.ner(text)
print("ners:", ners)
Example 28
    metavar="DELIM",
    nargs='?',
    const='_',
    help=
    "enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter"
)
parser.add_argument("-D", "--dict", help="use DICT as dictionary")
parser.add_argument(
    "-u",
    "--user-dict",
    help=
    "use USER_DICT together with the default dictionary or DICT (if specified)"
)

parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

delim = args.delimiter

fp = open(args.filename, 'r') if args.filename else sys.stdin
ln = fp.readline()

while ln:
    l = ln.rstrip('\r\n')
    # fool.cut returns a list with one token list per input, hence [0]
    result = delim.join(fool.cut(l)[0])
    print(result)
    ln = fp.readline()

fp.close()
Example 29
import fool

text = "2017年12月29日,上海嘉定公安机关接到报警电话,市民称其接到一家自称为某装饰公司员工的电话,对方在向其推销房屋装潢工程时能准确说出其姓名、手机号、房产地址等个人信息,该市民感觉个人信息被侵犯,于是报警。"
print(fool.cut(text))
# fool.cut returns one token list per input string; the README sample output for "一个傻子在北京" is [['一个', '傻子', '在', '北京']]
Example 30
    def text_to_vec(self, text):

        words = fool.cut(text)[0]

        return list(map(self.to_num, words))
Example 31
def fool_cut(text):
    cut = fool.cut(text)
    return " ".join(cut[0])
Example 32
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

delim = args.delimiter
plim = args.pos

batch_size = args.batch_size

if args.user_dict:
    fool.load_userdict(args.user_dict)

fp = open(args.filename, 'r') if args.filename else sys.stdin
lines = fp.readlines(batch_size)


while lines:
    lines = [ln.strip("\r\n") for ln in lines]
    if args.pos:
        result_list = fool.pos_cut(lines)
        for res in result_list:
            out_str = [plim.join(p) for p in res]
            print(delim.join(out_str))
    else:
        result_list = fool.cut(lines)
        for res in result_list:
            print(delim.join(res))
    lines = fp.readlines(batch_size)

fp.close()
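Note that fp.readlines(batch_size) treats its argument as an approximate byte hint rather than a number of lines, so batches vary in line count. If a fixed number of lines per batch is wanted, itertools.islice is one alternative (a sketch, not part of the original script):

import itertools
import sys

def iter_line_batches(fp, lines_per_batch):
    while True:
        batch = [ln.rstrip('\r\n') for ln in itertools.islice(fp, lines_per_batch)]
        if not batch:
            break
        yield batch

for batch in iter_line_batches(sys.stdin, 64):
    print(batch)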
Example 33
#!/usr/bin/env python
# -*-coding:utf-8-*-

import fool

text = ["我在北京天安门看你难受香菇,一一千四百二十九", "我在北京晒太阳你在非洲看雪", "千年不变的是什么", "我在北京天安门。"]

print("no dict:", fool.cut(text, ignore=True))
fool.load_userdict("./test_dict.txt")
print("use dict: ", fool.cut(text))
fool.delete_userdict()
print("delete dict:", fool.cut(text))

pos_words = fool.pos_cut(text)
print("pos result", pos_words)

words, ners = fool.analysis(text)
print("ners: ", ners)

ners = fool.ner(text)
print("ners:", ners)
Example 34
    All_Dict['main2_' + str(i)] = list()
    All_Dict['year_' + str(i)] = list()

user_dict = "./_reference/thulac/THUOCL_it_space.txt"
fool.load_userdict(user_dict)
count = 0
unstructured = list()
with open(raw_cn, 'r', encoding='UTF-8') as raw:
    for line in raw:
        temp = line.split('\t')
        if len(temp) == 3:
            current_year = str(temp[2].strip())
            All_Dict['title_' + current_year].append(temp[0])
            All_Dict['year_' + current_year].append(current_year)
            #All_Dict['main1_' + current_year].extend(fool.cut(temp[1]))
            # fool.cut returns a list with one token list per input, hence [0]
            All_Dict['main2_' + current_year].extend(fool.cut(temp[1])[0])
        else:
            unstructured.append(count)
        count += 1

        if count % 1000 == 0:
            print("Time for 1000 sentences: %.2f" % (time.time() - TempTime))
            TempTime = time.time()

print("unstructured sample:")
print(unstructured)

for i in range(1998, 2018):
    with open('./structured_data/fool/com_cn_title_' + str(i) + '.txt',
              'w',
              encoding='UTF-8') as f:
Example 35
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

delim = args.delimiter
plim = args.pos

batch_size = args.batch_size

if args.user_dict:
    fool.load_userdict(args.user_dict)

fp = open(args.filename, 'r') if args.filename else sys.stdin
lines = fp.readlines(batch_size)

while lines:
    lines = [ln.strip("\r\n") for ln in lines]
    if args.pos:
        result_list = fool.pos_cut(lines)
        for res in result_list:
            out_str = [plim.join(p) for p in res]
            print(delim.join(out_str))
    else:
        result_list = fool.cut(lines)
        for res in result_list:
            print(delim.join(res))
    lines = fp.readlines(batch_size)

fp.close()