def preprocess(filename):
    """Convert 'word/tag' annotated text into token-level BIO-tagged lines."""
    # map source tags to BIO entity types; 'o' tokens are written as plain 'O'
    tag_map = {'ns': 'LOC', 'nt': 'ORG', 'nr': 'PER'}
    pynlpir.open()
    with open(filename, 'r', encoding='utf-8') as f, \
            open('data/char_test.txt', 'w', encoding='utf-8') as f_save:
        for line in f:
            for item in line.rstrip().split(' '):
                c, t = item.split('/')
                if t == 'o':
                    for x in pynlpir.segment(c, pos_tagging=False):
                        f_save.write(x + ' O\n')
                elif t in tag_map:
                    for i, x in enumerate(pynlpir.segment(c, pos_tagging=False)):
                        prefix = 'B-' if i == 0 else 'I-'
                        f_save.write(x + ' ' + prefix + tag_map[t] + '\n')
            f_save.write('\n')  # blank line between sentences
    pynlpir.close()
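For context, a minimal usage sketch; the file name and the sample annotation below are illustrative assumptions, and the parser above expects whitespace-separated word/tag pairs with 'o' for ordinary words and 'ns'/'nt'/'nr' for locations, organizations and person names:

# Hypothetical input line in the expected word/tag format:
#   江泽民/nr 在/o 北京/ns 会见/o 联合国/nt 代表/o
# preprocess() segments each annotated word with NLPIR and writes one token per line,
# tagged O, B-LOC/I-LOC, B-ORG/I-ORG or B-PER/I-PER, with a blank line between sentences.
preprocess('data/source_test.txt')  # writes data/char_test.txt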
Example #2
 def test_segment(self):
     """Tests that the segment() function works as expected."""
     s = "我们都是美国人。"
     seg_s = pynlpir.segment(s, pos_tagging=False)
     pos_seg_s = pynlpir.segment(s, pos_tagging=True, pos_names="child")
     npos_seg_s = pynlpir.segment(s, pos_tagging=True, pos_names=None)
     ppos_seg_s = pynlpir.segment(s, pos_tagging=True, pos_names="all")
     expected_seg_s = ["我们", "都", "是", "美国", "人", "。"]
     expected_pos_seg_s = [
         ("我们", "personal pronoun"),
         ("都", "adverb"),
         ("是", "verb 是"),
         ("美国", "transcribed toponym"),
         ("人", "noun"),
         ("。", "period"),
     ]
     expected_npos_seg_s = [("我们", "rr"), ("都", "d"), ("是", "vshi"), ("美国", "nsf"), ("人", "n"), ("。", "wj")]
     expected_ppos_seg_s = [
         ("我们", "pronoun:personal pronoun"),
         ("都", "adverb"),
         ("是", "verb:verb 是"),
         ("美国", "noun:toponym:transcribed toponym"),
         ("人", "noun"),
         ("。", "punctuation mark:period"),
     ]
     self.assertEqual(expected_seg_s, seg_s)
     self.assertEqual(expected_pos_seg_s, pos_seg_s)
     self.assertEqual(expected_npos_seg_s, npos_seg_s)
     self.assertEqual(expected_ppos_seg_s, ppos_seg_s)
Example #3
def build_list_and_idmap(train_dict, context_dict, path=None):
    question_list = []
    question_idmap = {}

    context_list = []
    context_idmap = {}

    with tqdm(total=len(train_dict)) as pbar:
        pbar.set_description('build list_and_idmap of train_dict')
        for index, item in enumerate(train_dict.items()):
            _id, _item = item
            question = _item['question']
            question_list.append(pynlpir.segment(question, pos_tagging=False))
            question_idmap[_id] = str(index)
            question_idmap[str(index)] = _id
            pbar.update(1)

    if path and os.path.exists(path.get('context_idmap')) and os.path.exists(
            path.get('context_list')):
        context_list = load_pkl_data(path.get('context_list'))
        context_idmap = load_pkl_data(path.get('context_idmap'))
    else:
        with tqdm(total=len(context_dict)) as pbar:
            pbar.set_description('build list_and_idmap of context_dict')
            for index, item in enumerate(context_dict.items()):
                _id, doc = item
                context_list.append(
                    pynlpir.segment(doc["text"], pos_tagging=False))
                context_idmap[_id] = str(index)
                context_idmap[str(index)] = _id
                pbar.update(1)
        save_pkl_data(context_list, path.get('context_list'))
        save_pkl_data(context_idmap, path.get('context_idmap'))

    return question_list, question_idmap, context_list, context_idmap
Example #4
def segment(path='F:/Data/Chinese/chinese.json',
            json_path='F:/Data/Chinese/chinese_token.json'):
    """
    NLPIR分词+根据词性清洗+去掉为问题或回答空的项
    :param path: 源数据路径
    :param json_path: 结果保存路径
    :return:
    """
    # 启动分词工具
    pynlpir.open()
    # 只保留文本部分,并分词,根据词性过滤
    # 保留以下词性的词,并去除词性标记
    # 词性含义请查看https://github.com/tsroten/pynlpir/blob/master/pynlpir/pos_map.py
    word_filter = {
        'noun', 'time word', 'locative word', 'noun of locality', 'verb',
        'adjective', 'distinguishing word', 'status word', 'numeral'
    }
    # drop records whose segmentation is known to misbehave
    question_id_filter = {294118450, 300106271, 291834409}

    # read, process and write line by line to keep memory usage low
    count = 0
    with open(path, 'r') as f_in, open(json_path, 'w') as f_out:
        for line in f_in:
            q = json.loads(line)
            if q['question_id'] in question_id_filter:
                continue
            # strip embedded newlines, then lowercase
            if '\n' in q['question']:
                print('question:')
                print(q['question'])
                q['question'] = q['question'].replace('\n', ' ')
            q['question'] = [
                w[0] for w in pynlpir.segment(q['question'].lower())
                if w[1] in word_filter and w[0] != u''
            ]

            for a in q['answers']:
                # strip embedded newlines
                if '\n' in a['answer']:
                    print('answer:')
                    print(a['answer'])
                    a['answer'] = a['answer'].replace('\n', ' ')
                a['answer'] = [
                    w[0] for w in pynlpir.segment(a['answer'].lower())
                    if w[1] in word_filter and w[0] != u''
                ]
            # drop empty answers
            q['answers'] = [a for a in q['answers'] if len(a['answer']) > 0]
            count = count + 1
            if count % 1000 == 0:
                print(count)
            # drop items whose question or answer list is empty
            if len(q['question']) > 0 and len(q['answers']) > 0:
                f_out.write(json.dumps(q))
                f_out.write('\n')
    pynlpir.close()
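The function above assumes one JSON object per line; a purely illustrative record (field names taken from the code, values invented) would look like this:

import json

sample = {
    "question_id": 123456789,
    "question": "北京有哪些好玩的地方?",
    "answers": [{"answer": "故宫和长城都值得一去。"}],
}
# Each line of `path` is expected to hold json.dumps(sample)-style data;
# segment() keeps only the whitelisted POS tags and drops empty questions/answers.
print(json.dumps(sample, ensure_ascii=False))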
Example #5
 def test_segment_space(self):
     """Tests that the fix for issue #2 works."""
     s = '这个句子有 空格。'
     seg_s = pynlpir.segment(s, pos_tagging=False)
     pos_seg_s = pynlpir.segment(s)
     expected_seg_s = ['这个', '句子', '有', ' ', '空格', '。']
     expected_pos_seg_s = [('这个', 'pronoun'), ('句子', 'noun'), ('有', 'verb'),
                           (' ', None), ('空格', 'noun'),
                           ('。', 'punctuation mark')]
     self.assertEqual(expected_seg_s, seg_s)
     self.assertEqual(expected_pos_seg_s, pos_seg_s)
def parse_name(row):
    temp_index = False
    try:
        segments = pynlpir.segment(row, pos_names='all')
    except Exception:
        return temp_index
    for segment in segments:
        if segment[1] == u'noun:personal name':
            temp_index = True
    return temp_index
def words_cixing(question, pos=1):
    # pos=1: include POS tags; otherwise segment only
    pynlpir.open()
    if pos:
        pos1 = [
            '{}/{}'.format(k, v) for k, v in pynlpir.segment(
                question, pos_names=None, pos_tagging=pos)
        ]
    else:
        pos0 = pynlpir.segment(question)
    pynlpir.close()
    if pos:
        return pos1
    else:
        return pos0
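A quick, illustrative call of words_cixing; the exact tags depend on the NLPIR version and dictionary, so the outputs in the comments are indicative only:

tagged = words_cixing('我们都是美国人。', pos=1)
# e.g. ['我们/rr', '都/d', '是/vshi', '美国/nsf', '人/n', '。/wj']
plain = words_cixing('我们都是美国人。', pos=0)
# e.g. [('我们', 'pronoun'), ('都', 'adverb'), ('是', 'verb'), ...]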
Example #10
    def parse_skills(self, line):
        skills = []
        pairs = []
        match_list = ['prow', 'eng']
        w_tlist = pseg.cut(line)
        temp = []
        lasttag = 'prow'
        for word, tag in w_tlist:
            if tag in match_list:
                if lasttag in match_list:
                    temp.append((word, tag))
                else:
                    pairs.append(temp)
                    temp = [(word, tag)]
            lasttag = tag
        if temp:
            pairs.append(temp)  # flush the final run of matching tokens

        for line in pairs:
            des = ""
            for index, pair in enumerate(line):
                if pair[1] == 'prow' and (index == 0 or index == len(line)-1)\
                        and pynlpir.segment(pair[0])[0][1] != 'noun':
                    continue
                des += pair[0]
            if len(des) != 0:
                if des not in skills:
                    wash_text = jieba.analyse.extract_tags(des,
                                                           withWeight=True)
                    if sum([pair[1] for pair in wash_text]) > 8.0:
                        skills.append(des)
        return skills
Example #11
def refine_corpus(path):

    if os.path.isdir(path):
        files = [os.path.join(path, file) for file in os.listdir(path)]
    else:
        files = [
            path,
        ]

    titles = []
    corpus = []

    ambiguity = re.compile(r"\{.*?\}")
    redundancy = re.compile(r"\(.*?\)|\d")

    for file in files:
        with open(file) as f:
            poem_set = json.load(f)

        for poem in poem_set:
            paragraphs = "".join(poem["paragraphs"])
            if ambiguity.search(paragraphs):
                continue
            else:
                paragraphs = redundancy.sub("", paragraphs)

            titles.append(poem["title"])
            token_poem = " ".join(
                pynlpir.segment(paragraphs, pos_tagging=False))

            corpus.append(token_poem)

        del poem_set

    return titles, corpus
Example #12
def mysegment(filename2w, filename2seg, srting2strp):
    fr = open(filename2w)
    fl = open(filename2seg, 'w')
    arrayOLines = fr.readlines()
    fr.close()
    length = len(arrayOLines)
    for j in range(length):
        lineArr = arrayOLines[j].strip().split(';')
        if len(lineArr) < 3:
            pass
        else:
            fl.write(str(j))
            fl.write(";")
            fl.write(str(lineArr[1]))
            fl.write(";")
            fl.write(str(lineArr[2]))
            fl.write(";")
            seg = pynlpir.segment(lineArr[1], pos_tagging=False)
            for item in seg:
                if str(item) in srting2strp:
                    pass
                else:
                    fl.write(str(item))
                    fl.write(",")
            fl.write(";\n")
    fl.close()
    pynlpir.close()
Example #13
    def train(self):
        # df_table = {"valid": {"science": 35, "physics": 34, "robot": 57}, "invalid": {"fat": 30, "large": 34, "cheap": 55}}
        # The number of articles containing "science", "physics" or "robot"
        # prior_table = {"valid": 183, "invalid": 244}
        pynlpir.open()
        prior_table = {ele: 0 for ele in self.category_list}
        posterior_table = {ele: dict() for ele in self.category_list}

        i = 0
        for sample in self.training_set_material:
            buffer = sample.split("\t")
            text = buffer[0]
            seg_words = pynlpir.segment(text, pos_tagging=False)
            words_set = set(seg_words)
            try:
                label = buffer[1]
            except IndexError:
                print("Line " + str(i) + " in training set corrupted")
                continue
            prior_table[label] += 1
            for word in words_set:   # all words in the text
                if word in posterior_table[label].keys():
                    posterior_table[label][word] += 1   # posterior count +1 when this word already exists in posterior
                else:
                    posterior_table[label][word] = 1  # posterior count set to 1 when this word does not yet exist in posterior
            i += 1
        return prior_table, posterior_table
Example #14
 def get_tokenised_parts(self):
     pynlpir.open()
     for s in self.sentences:
         sen_parts = re.split('[?!.,。,?!]', s)
         for sen_part in sen_parts:
             tokens = pynlpir.segment(sen_part)
             yield tokens
Example #15
 def get_result(self, paragraph):
     self.paragraph = paragraph
     self.segments = pynlpir.segment(self.paragraph,
                                     pos_names='all',
                                     pos_tagging=False)
     self.key_words = pynlpir.get_key_words(self.paragraph,
                                            weighted=False,
                                            max_words=20)
     self.new_sentence_wordlist = [0] * len(self.key_words)
     key_words = pynlpir.get_key_words(self.paragraph,
                                       max_words=20,
                                       weighted=True)
     self.key_weight = [item[1] for item in key_words]
     sentence_dict = self.cal_text_simliarity()
     keys = list(sentence_dict.keys())
     val = list(sentence_dict.values())
     temp = sorted(
         list(map(val.index, heapq.nlargest(self.maxSumarySize, val))))
     for i in temp[:2]:
         if keys[i] != self.sentence()[0]:
             self.result.append(keys[i])
     self.result.insert(0, self.sentence()[0])
     if len(",".join(self.result)) < self.length:
         self.result.append(keys[temp[2]])
     return ",".join(self.result)
Example #16
def filter_q_by_word_freq(dfs_q_filters, train=None):
    question_f = []
    with tqdm(total=len(train)) as pbar:
        pbar.set_description('filter question by q_dfs')
        for item in train:
            q_cut = pynlpir.segment(item.get('question'), pos_tagging=False)
            question = [word for word in q_cut if word in dfs_q_filters]
            isBad = len(question) == 0
            q = {
                'question': question,
                'isBad': isBad,
                'docid': item['docid'],
                'id': item['id'],
            }
            question_f.append(q)
            pbar.update(1)
    if False:
        q_empty = []
        for item in question_f:
            if len(item['question']) == 0:
                q_empty.append(item)
        if len(q_empty) != 0:
            raise Exception(
                'Empty in q after filtering, have {} empty q'.format(
                    len(q_empty)))

    return question_f
Example #17
def Participle(input_File):
    Total_data = ''
    for root, dirs, files in os.walk(
            input_File):  #'D:\文本数据挖掘\搜狗实验室内容分类数据集\C000024' C000024_Participle
        for filespath in files:
            print(os.path.join(root, filespath))
            p = re.compile(r'/')
            Folder = p.split(os.path.join(root, filespath))
            #print(Folder[-2])
            #File_name=Folder[-2]+'\\'+Folder[-1]
            File_body = readfile(os.path.join(root, filespath))
            File_Participle = pynlpir.segment(File_body, pos_tagging=False)
            # filter out stop words
            File_Participle_delstopwords = ''
            for word in File_Participle:
                # strip whitespace on both sides
                word = word.strip()
                if word not in stopwords:
                    if word >= u'\u4e00' and word <= u'\u9fa5':  # keep only Chinese characters
                        # join the kept tokens with a space separator
                        File_Participle_delstopwords = File_Participle_delstopwords + ' ' + word
            # Save the segmented text as UTF-8 so the later word2vec step can read and use it
            # (very important)
            File_Participle_delstopwords = File_Participle_delstopwords[
                1:len(File_Participle_delstopwords)]
            if not (os.path.exists('文本分词结果' + '/' + Folder[-2])):
                os.mkdir('文本分词结果' + '/' + Folder[-2])
            savefile('文本分词结果' + '/' + Folder[-2] + '/' + Folder[-1],
                     File_Participle_delstopwords.encode("utf-8"))
            Total_data = Total_data + File_Participle_delstopwords + '\n'
    savefile('文本' + '/' + Folder[-2] + '_train.txt',
             Total_data.encode("utf-8"))
Example #18
 def segment(self, sentence):  # word segmentation
     pynlpir.open(license_code=")VhTW_9s02tDm")
     result = pynlpir.segment(sentence)
     wordList = []
     for res in result:
         wordList.append(res[0])
     return wordList
def find_raw_entity(text, entity_dict):
    raw_entities = []

    segments = pynlpir.segment(text, pos_names='all')
    for segment in segments:
        try:
            if (segment[1] == 'noun:other proper noun') | (
                    segment[1] == 'noun:organization/group name'):
                raw_entities.append(segment[0])
            elif segment[1].startswith('noun:personal name'):
                raw_entities.append(segment[0])
            elif segment[1].startswith('noun:toponym'):
                raw_entities.append(segment[0])
            elif (segment[1] == 'noun') and len(segment[0]) > 1:  # added by hand; a subjective choice
                raw_entities.append(segment[0])
        except AttributeError:  # segment[1] is None for whitespace tokens
            continue

    entity_list = list(set(raw_entities))
    entity_id_list = []
    for entity in entity_list:
        try:
            entity_id_list.append(entity_dict[entity])
        except KeyError:
            entity_id_list.append(entity)
    return entity_id_list
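A small, purely illustrative call of find_raw_entity, assuming pynlpir has already been opened and using a made-up entity_dict that maps surface forms to IDs:

import pynlpir

pynlpir.open()
entity_dict = {'北京': 'E001', '联合国': 'E002'}  # hypothetical mapping for illustration
ids = find_raw_entity('联合国代表团抵达北京。', entity_dict)
# entities found in entity_dict come back as their IDs, others as their surface form
pynlpir.close()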
Example #20
def main(input_file, output_file):
    pynlpir.open()
    fw = open(output_file, 'w+', encoding='utf-8')
    pos2id = get_pos_map()
    data = read_corpus(input_file)
    for _sent, _tags in data:
        sent = ''.join(_sent)
        result = pynlpir.segment(sent,
                                 pos_tagging=True,
                                 pos_names='parent',
                                 pos_english=True)
        # print(result)
        i = 0
        for _word, _speech in result:
            for j in range(len(_word)):
                char = _word[j]
                speech = ''
                if _speech is None or _speech not in reserve_pos_list:
                    speech = 'O'
                else:
                    speech = '-'.join(_speech.split(' '))
                    if j == 0:
                        speech = 'B-' + speech
                    else:
                        speech = 'I-' + speech
                if i >= len(_tags):
                    print(i, len(_sent), _sent)
                fw.write(char + ' ' + _tags[i] + ' ' + speech + '\n')
                i += 1
        fw.write('\n')
    fw.close()
    pynlpir.close()
Example #21
def get_train_with_doc(train, bm25_model, context_idmap, k=5):
    save_path = '../data/rank/train_with_doc_top{}_2.pkl'.format(k)
    if os.path.exists(save_path):
        train_with_doc = load_pkl_data(save_path)
    else:
        train_with_doc = []
        with tqdm(total=len(train)) as pbar:
            pbar.set_description('build train with doc in top-{}'.format(k))
            for item in train:
                question = item['question']
                qid = item['qid']
                q_cut = pynlpir.segment(question, pos_tagging=False)
                bm25_score = bm25_model.get_scores(q_cut)
                bm25_score = [[context_idmap[str(index)], score]
                              for index, score in enumerate(bm25_score)]
                bm25_score.sort(key=op.itemgetter(1), reverse=True)
                best_text_id = [item[0] for item in bm25_score[:k]]
                # if item['docid'] in best_doc_id:
                #     answer = item['answer']

                train_sample = {
                    'qid': qid,
                    'question': question,
                    'text_ids': best_text_id,
                    'answer': item['answer'],
                    'answer_span': item['answer_span'],
                    "docid": item['docid']
                }

                train_with_doc.append(train_sample)
                pbar.update(1)
            save_pkl_data(train_with_doc, save_path)
    return train_with_doc
Example #23
 def init(self, filename=TRAINSETFILE, IsTraining=True, IsSegment=True):
     with open(filename, encoding='GB18030') as file:
         filereader = csv.reader(file,
                                 dialect='excel-tab',
                                 quoting=csv.QUOTE_NONE)
         if not IsSegment:
             for item in filereader:
                 self.userlist.append(item)
         else:
             pynlpir.open()
             if IsTraining:
                 infoflag = 4
             else:
                 infoflag = 1
             # count_test =0
             for userquery in filereader:
                 userdict = {}
                 self.userinfo.append(userquery[:infoflag])
                 for item in userquery[infoflag:]:
                     for word in pynlpir.segment(item, pos_tagging=False):
                         if word not in self.dict.keys():
                             self.dict[word] = 0
                         if word in userdict.keys():
                             userdict[word] += 1
                         else:
                             userdict[word] = 1
                 self.userlist.append(userdict)
                 # count_test +=1
                 # if count_test>100:
                 #   break
             pynlpir.close()
     self.IsTraining = IsTraining
     self.IsSegment = IsSegment
     self.IsDF = False
Example #24
def stati_pos(content, lang='zh'):
    """
    :param lang: zh, jp
    统计词性,返回字典,具体词性看相关说明 
    todo: 返回特殊词集合
    """
    pos_count = {}
    if lang == 'zh':
        items = pynlpir.segment(content, pos_english=False)
        for item in items:
            if item[1] is None:
                continue
            pos = item[1]
            pos_count[pos] = pos_count.get(pos, 0) + 1

    elif lang == 'jp':
        res = mecab.parse(content)
        for item in res.split('\n'):
            if item == 'EOS':
                break
            pos = item.split('\t')[3]
            pos_count[pos] = pos_count.get(pos, 0) + 1

    # could be extended with 'en', etc.
    return pos_count
Example #25
def separateWordFromFile(fileName):
	pynlpir.open()
	file = open(fileName,'r')
	lines = file.readlines()
	i = 0
	allSegmentResult = []
	#print type(s)
	label = []
	for line in lines:
		i = i+1
		textsegment = line
		if textsegment == "\n":
			print "skip"
			continue
		# Note: converting gbk to utf-8 goes gbk --> unicode --> utf-8, in two steps:
		#   1. gbk --> unicode:    your_string.decode("gbk")
		#   2. unicode --> utf-8:  your_string.decode("gbk").encode("utf-8")

		segmentResult = pynlpir.segment(textsegment,pos_tagging=True)
		newSegmentResult = removePunctuation(segmentResult)
		allSegmentResult.append(newSegmentResult)

	print(len(allSegmentResult))
	file.close()
	pynlpir.close()
	#print label
	return allSegmentResult
def splitFile(docName, encodingType):
    '''
    default encoding of docName: encodingType
    function: segment the Chinese text of docName and return the token list
    '''
    # everything is kept in memory -- ok? could instead be written to files
    f = open(docName, 'rb')  # read raw bytes so each line can be decoded below
    pynlpir.open(encoding='utf-8')
    contest = []
    line = f.readline()
    cou = 0
    while line:
        line = line.strip()
        cou += 1

        try:
            line = line.decode(encodingType)
            if line.find(testChar) != -1:  #delete the file header
                line = f.readline()
                continue
            temp = pynlpir.segment(line, pos_tagging=False)
            contest += temp
            line = f.readline()
        except:
            line = f.readline()
            # print '.'
            # print "err %s, %d"%(docName, cou)
    f.close()
    pynlpir.close()
    return contest
def fenci(content):  # "fenci" = word segmentation
    result = {}
    # pr.open()
    # dicConf=GetDicConfig()
    # FilePath=dicConf['Testfilepath']
    # DicNews=GetDictFromJsonFile(FilePath)
    # content=DicNews['content']
    pr.open()
    segs = pr.segment(content, pos_english=False, pos_names='child')
    AllList = []
    NamedList = []
    OtherList = []
    for w, c in segs:
        if len(w) < 2:
            continue
        else:
            AllList.append(w)
            if c == '地名' or c == '人名':
                NamedList.append(w)
            else:
                OtherList.append(w)
    #print("NameList=",NamedList)
    #print('OtherList=',OtherList)
    #print('Alllist=',AllList)
    result.update({'NameList': NamedList})
    result.update({'OtherList': OtherList})
    result.update({'AllList': AllList})
    pr.close()
    return result
Example #29
def wordSegmenter(sentence='', pathOfStopWords=''):
    """
    将传入的句子分词并去除停用词
    :param sentence:         传入的句子
    :param pathOfStopWords:  停用词的路径
    :return:                 分词并去除停用词后由空格分隔的字符串
    """
    #打开分词器
    pynlpir.open()
    #分词
    seg_list = []
    for seg in pynlpir.segment(sentence):
        seg_list.append(seg[0])
    # remove stop words
    resultWords = []
    if pathOfStopWords == '':  # fall back to the default stop-word list when none is given
        pathOfStopWords = path.join(ROOT, STOP_WORDS)
    f_stop = open(pathOfStopWords, 'rt', encoding='utf-8')
    try:
        f_stop_text = f_stop.read()
    finally:
        f_stop.close()
    f_stop_words = f_stop_text.split("\n")
    for seg in seg_list:
        seg = seg.strip()
        if re.match(r'[a-zA-Z0-9]+', seg):  # drop English words and digits
            continue
        if len(seg) > 0 and (seg not in f_stop_words):
            resultWords.append(seg)
    return " ".join(resultWords)
Example #30
def process_text(text):
    lowered = text.lower()

    tokens = pynlpir.segment(lowered, pos_names='child')

    filtered = [word[0] for word in tokens if filter(word)]

    return filtered
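process_text above relies on an external predicate named filter (shadowing the builtin); a minimal sketch of what such a predicate might look like, purely as an assumption, is:

# Hypothetical predicate assumed by process_text(); the real project supplies its own.
STOPWORDS = {'的', '了', '是'}

def filter(token):
    # token is a (word, pos) pair from pynlpir.segment(..., pos_names='child')
    word, pos = token
    return pos is not None and word.strip() != '' and word not in STOPWORDS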
Example #31
def cut(data_list):
    """
    分词
    """
    pynlpir.open()
    data_list = [(pynlpir.segment(x)) for x in data_list]
    pynlpir.close()
    return data_list
Example #32
def cut_sentence(sentence):
    try:
        words = pynlpir.segment(sentence, pos_tagging=False)
    except Exception:
        words = jieba.cut(str(sentence))

    words = [word for word in words if word not in stopwords]
    return "#".join(words)
Example #33
def get_question_corpus(train):
    question_corpus = []
    for item in train:
        cut_words = pynlpir.segment(item.get('question'), pos_tagging=False)
        # TODO: stop-word removal could be applied to cut_words here
        question_corpus.append(cut_words)
    return question_corpus
Example #34
 def sentence_seg(sentence):
     """
     对句子进行分词处理
     :param sentence: 需要分词的句子
     :return: 分词后的结果 [(分词1,词性1),(分词2,词性2)...]
     """
     pos_seg_list = pynlpir.segment(sentence + ',')
     return pos_seg_list[:-1]
Example #35
    def segment(self):
        """
        fni:  str;  input file name with path
        fno:  str;  output file name with path
        lang: str;  language code
        pos:  bool; POS tags included
        n:    int;  no. of lines processed
        """
        import copy
        from PyQt5.QtWidgets import QApplication
        from opencc import OpenCC

        openCC = OpenCC('t2s')  # convert from Traditional-to-Simplified
        pynlpir.open(encoding="utf-8")
        print("Finished initializing ITCLAS/NLPIR")
        count = lineCount(self.fni)
        fit  = open(self.fni, "r", encoding="UTF-8")
        fot  = open(self.fno, "w", encoding="UTF-8", newline="\n")

        sep = " " # separator of Chinese tokens (space by default)
        n = 0
        for linet in fit:

            n += 1
            if (linet.strip() == ''): # empty string
                fot.write("\n")
                continue
            lines = openCC.convert(linet.strip())
            lines_seg = pynlpir.segment(lines, pos_tagging=True, pos_names=None) 
            # segment with optional POS-tagging

            # The following segments the zht text according to the
            # segmentation patterns obtained from NLPIR above
            tokens   = []  # initialize list to hold 'words' of segmented zht line
            pos_tags = []  # initialize list to hold pos tags of segmented words
            while len(lines_seg) > 0:  # loop until nothing is left in lines_seg
                t, p = lines_seg.pop(0)  # remove leftmost zhs token and its POS tag
                m = len(t)  # no. of characters in token
                tokens.append(linet[0:m])  # add corresponding zht token to tokens[]
                pos_tags.append(p)
                linet = linet[m:]  # delete token from zht line (from beginning of string)

            #fot.write(sep.join(tokens)+"\n")  # write zht-seg output
            tok_pos = ["{}_{}".format(x, y) for x,y in zip(tokens, pos_tags)]  # list of tok_pos pairs
            fot.write(sep.join(tok_pos)+"\n")
            #if (n == 1): break
            if n % 50 == 0:
                self.window.ui.progressBar.setValue(round(100 * n / self.fi_linecount, 0))
                self.window.ui.progressBar.repaint()
                QApplication.processEvents()
        self.window.ui.progressBar.setValue(100)
        self.window.ui.progressBar.repaint()

        fit.close()
        fot.close()
        pynlpir.close()
        self.numLineProcessed = n
        return n
Example #36
def prep_word_dict():
    
    LINE_NUM = 0  # defined up front so the read-error handler below can reference it
    CURRENT_W = None
    with open(IN_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" %(LINE_NUM) )
                continue
            if not line:
                print("PROCESS DONE!")
                break

            if line[:4] == '[DDv' :
                CURRENT_W = line[5: line.index(']')]
                term_to_id(CURRENT_W)
                continue

            if CURRENT_W and line[0] == '【' and ('=】' in line):
                line_x = line[line.index('】')+1:]
                line_x = line_x.split()
                if line_x:
                    for item in line_x:
                        term_to_id(item)
                continue

    LINE_NUM = 0
    with open(YL_FILE) as fin, open(YLP_FILE, 'w') as fout:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" %(LINE_NUM) )
                continue
            if not line:
                print("PROCESS DONE!")
                break

            LINE_NUM += 1
            if not (LINE_NUM % 5000): print('C:%d' %(LINE_NUM))
            if len(line) > 30: continue

            seg_list = pynlpir.segment(line, pos_tagging=False)
            for i in range(len(seg_list)):
                if is_zhs(seg_list[i]):
                    term_to_id(seg_list[i])
                elif len(seg_list[i]) == 1 and is_punct(seg_list[i]):
                    seg_list[i] = PUNCING
                else:
                    seg_list[i] = PADDING
            fout.write(' '.join(seg_list) + '\n')

    term_to_id(PADDING)
    #term_to_id(PUNCING)
    print('SEN DONE!')
Example #37
 def test_double_slash(self):
     """Tests for issue #7 -- double slashes raises exception."""
     s = '转发微博 //@张明明:霸气全露'
     seg_s = pynlpir.segment(s)
     expected_seg_s = [('转发', 'verb'), ('微', 'adjective'),
                       ('博', 'adjective'), (' ', None),
                       ('//', 'string'), ('@张明明', 'noun'),
                       (':', 'punctuation mark'), ('霸气', 'noun'),
                       ('全', 'adverb'), ('露', 'verb')]
     self.assertEqual(expected_seg_s, seg_s)
Example #38
	def get_feat(self, text):
		tokens = []
		filtered = []
		segs = pynlpir.segment(text, pos_tagging = False)
		for seg in segs:
			if self.validtoken(seg):
				tokens.append(seg)
			else:
				filtered.append(seg)
		return tokens, filtered
Example #39
def word_to_class(word):

    c = "null"
    c_list = []
    try:
        c_list = pynlpir.segment(word)
    except Exception as e:
        print (e)
        return c
    if len(c_list) >= 1 and c_list[0][1] is not None:
        c = c_list[-1][1]
    return c
Example #41
 def query2words(self,query):
   words = []
   segs = query.split(' ')
   for s in segs:
     s = s.strip() ## need regularization
     if s in self.vocab: words.append(s) ## in word2vec vocab
     else:
       pynlpir.open()
       # words.extend(pynlpir.get_key_words(query,max_words=3))
       word_segs = pynlpir.segment(query,pos_tagging=False)
       for word in word_segs:
         if word not in self.stop_list: words.append(word)
       print(words)
   return words
 def _part_document(self):
   pynlpir.open()
   docs = {}
   for dirname, dirnames,filenames in os.walk('dependence/new_data'):
     for filename in filenames:
       path = os.path.join(dirname, filename)
       text = ''
       with io.open(path, 'r',encoding='utf-8') as f:
         text = f.readline()
         words = pynlpir.segment(text,pos_tagging=False)
         clean_words = [w for w in words if w not in self.stop_list and len(w)>1]
         index = filename[:6]
         docs[index] = clean_words
   dictionary = corpora.Dictionary(docs.values())
   corporas = {index: dictionary.doc2bow(docs[index]) for index in docs}
   return docs, dictionary, corporas
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode,
         **kwargs)
      nlpir.Init(nlpir.PACKAGE_DIR, nlpir.UTF8_CODE)
      pynlpir.open(encoding='utf-8')
      seglist = pynlpir.segment(value, pos_tagging=False)  # plain tokens: value.find(w) below expects strings
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos=start_pos+value.find(w)
         if chars:
             t.startchar=start_char+value.find(w)
             t.endchar=start_char+value.find(w)+len(w)
          yield t      # yield a Token for each segmented word
Example #45
    def predict(self, text):
        # words = [word1, word2, word3, ...]
        pynlpir.open()
        seg_words = pynlpir.segment(text, pos_tagging=False)
        words_set = set(seg_words)
        result = dict()
        for category in self.category_list:
            prob = self.comp_prop(category, words_set)
            result[category] = prob

        """
        buffer = [result[my_key] for my_key in result.keys()]
        score_sum = sum(buffer)
        # result = {my_key: result[my_key]/score_sum for my_key in result.keys()}
        """
        buffer = list(result.items())
        buffer.sort(key=lambda x: x[1], reverse=True)
        top_category = buffer[0][0]
        return top_category
def dispatch_me(str_test):

    print("测试语句:%s" %(str_test))
    line_p = hanzi_prep.split_into_sentences(str_test)
    lines = []
    for line_i in line_p:
        lines.extend(line_i)
    str_i = ''.join(lines)
    if USE_SEGMENT == "JIEBA":
        print("==JIEBA分词==")
        jieba_i = ' '.join(jieba.cut(str_i, cut_all=False))
    elif USE_SEGMENT == "ICTCLAS":
        print("==NLPIR分词==")
        jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    else:
        print("ERROR:未知分词系统!")
        return None

    print("分词结果:%s"%(repr(jieba_i)))
    jieba_i = jieba_i.split()
    jieba_len = len(jieba_i)
    result_collect = []
    for i in range(0,jieba_len):
        if i > 0:
            head = jieba_i[i-1]
        else:
            head = None
        if i < jieba_len -1:
            tail = jieba_i[i+1]
        else:
            tail = None
        ret = calc_list_pro(jieba_i[i], head, tail)
        if ret:
            ret_pro = find_max_dict(ret)
            if ret_pro:
                print("词汇:[[%s]], 最大概率义项:%s, 概率:%f" %(jieba_i[i], ret_pro[0], ret_pro[1]))
                print("DEBUG:::"+repr(ret))
                result_collect.append((jieba_i[i], ret_pro[0], ret_pro[1]))
        else:
            print("无计算结果")

    return result_collect
Example #47
 def document2sentences(self,document):
   pynlpir.open()
   words = pynlpir.segment(document,pos_tagging=False)
   sign = ['。', ';', '.', ';']
   pause_position = []
   for i in range(len(words)):
     if words[i] in sign: pause_position.append(i)
   setences = []
   if len(pause_position) == 0:
     clean_d = [s.strip() for s in words if s not in self.stop_list]
     setences.append(' '.join(clean_d)+'\n')
   else:
     for i in range(len(pause_position)):
       setence = []
       if i == 0: setence = words[:pause_position[i]]
       elif i == len(pause_position)-1 and i != 0: break
       else: setence = words[pause_position[i]:pause_position[i+1]]
       clean_s = [s.strip() for s in setence if s not in self.stop_list]
       setences.append(' '.join(clean_s)+'\n')
   return setences
Example #48
def read_lexical_datas(file, compose_func=None):
    pynlpir.open()
    f = open(file, 'r', encoding='utf-8')
    tokens_list = [pynlpir.segment(line.rstrip('\n').replace('幺', '一'), pos_tagging=False) for line in f]
    if compose_func is None:
        word_idx = {}
        for tokens in tokens_list:
            for token in tokens:
                if token not in word_idx:
                    word_idx[token] = len(word_idx)
        array = numpy.zeros([len(tokens_list), len(word_idx)])
        for i, tokens in enumerate(tokens_list):
            for token in tokens:
                array[i][word_idx[token]] = 1.0
    else:
        print('reading word vectors')
        word_vecs = word_vec.read_word_vec(r'../data/vectors_cbow')
        print('reading complete')
        array = numpy.asarray([compose_func(tokens, word_vecs) for tokens in tokens_list])
    return array
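read_lexical_datas leaves compose_func open; one possible composition is to average the available word vectors, sketched below (the function name and the 200-dimension default are assumptions):

import numpy

def average_compose(tokens, word_vecs, dim=200):
    # average the vectors of the tokens that appear in word_vecs; zeros if none do
    vecs = [word_vecs[t] for t in tokens if t in word_vecs]
    if not vecs:
        return numpy.zeros(dim)
    return numpy.mean(vecs, axis=0)

# array = read_lexical_datas('questions.txt', compose_func=average_compose)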
Example #49
def main():
    py.open()
    a = sys.argv[1]
    result = py.segment(a)
    res_str = []
    for r in result:
        if len(r[0]) == 2 and (r[1] == "noun" or r[1] == "verb" or r[1] == "adjective"):
            f_result = fsame.find(r[0])
            ff_result = fsame.ffind(r[0])
            if f_result == r[0] or ff_result == r[0]:
                res_str.append(r[0])
            else:
                if random.randint(0, 1) == 0:
                    res_str.append(f_result)
                else:
                    res_str.append(ff_result)
        else:
            res_str.append(r[0])
    print "".join(res_str)
    py.close()
def extract_news_kws(hot_news):
    pynlpir.open()
    s = hot_news
    kw_list = pynlpir.segment(s, pos_tagging=True, pos_names=None)
    kws = ""
    for kw in kw_list:
        pos = kw[0]
        tagging = kw[1]
        try:
            if tagging:
                # test if tagging is none, which means the pos is a space character
                tagging_first = tagging[0]
            else:
                tagging_first = ""
        except:
            tagging_first = ""
        if tagging_first == "n" and len(pos) > 1:
            if pos != "quot":
                kws = kws + pos + u" "
    kws = kws.strip(u" ")
    return kws
def tokenize(file):
    words = []
    pynlpir.open()
    directory = '\\resources\\original files\\htl_del_4000\\'
    posWords = codecs.open(directory + file + 'Words.txt', 'w+', 'utf-8')
    with codecs.open(directory + file + '.txt', 'r', 'utf-8') as posFile:
        for s in posFile.readlines():
            # print posFile.readline()
            a = pynlpir.segment(s, pos_tagging=False)
            # print a
            for i in range(len(a)):
                # print a[i]
                if i != (len(a) - 1):
                    # print 'i='+str(i)
                    # print 'a='+str(len(a))
                    posWords.write(a[i] + ' ')
                else:
                    posWords.write(a[i] + '\r')
                    # for i in a:
                    #    posWords.write(i + ';')
                    # posWords.write('\0')
    posWords.close()
Example #52
def part_sentence(stop_list):
  pynlpir.open()
  for dirname, dirnames,filenames in os.walk('dependence/ch_corporas/wiki/lost'):
      for filename in filenames:
        lines = []
        read_path = os.path.join(dirname, filename)
        rf = open(read_path,'rb')
        print(filename)
        for line in rf:
          # detector.feed(byte)
          encoding = chardet.detect(line)['encoding']
          if encoding == None: encoding = 'utf-8'
          new_line = line.decode(encoding,'ignore')
          words = pynlpir.segment(new_line,pos_tagging=False)
          clean_words = [w.strip() for w in words if w not in stop_list]
          str_line = ' '.join(clean_words)
          if str_line: lines.append(str_line+'\n')
        rf.close()
        write_path = os.path.join('dependence/ch_corporas/wiki_clean', filename)
        wf = open(write_path, 'w')
        wf.writelines(lines)
        wf.close()
Example #53
    def __init__(self, content, norm="l1_norm"):
        self.norm = norm
        pynlpir.open()
        words = pynlpir.segment(content, pos_tagging=True, pos_names=None)

        kws = ""
        for word in words:
            pos = word[0]
            tagging = word[1]
            try:
                if tagging:
                    # test if tagging is none, which means the pos is a space character
                    tagging_first = tagging[0]
                else:
                    tagging_first = ""
            except:
                tagging_first = ""
            if tagging_first == "n" and len(pos) > 1:
                if pos != "quot":
                    kws = kws + pos + u" "
        result = kws.split(" ")
        self.PoS = result
Example #54
    def test_issue_52(self):
        """
        Tests for issue #52 -- segment(pos_names='all') fails for certain texts
        input.
        """
        # it seems '甲' returns 'Mg', which is not listed in the POS_MAP.
        # thus in this case 'None' needs to be returned for '甲'.
        s = u'其中,新增了甲卡西酮、曲马多、安钠咖等12种新类型毒品的定罪量刑数量标准,' \
            u'并下调了在我国危害较为严重的毒品氯胺酮的定罪量刑数量标准。'

        segments = pynlpir.segment(s=s, pos_tagging=True, pos_names='all')

        expected_segments = [
            (u'其中', 'pronoun:demonstrative pronoun'),
            (u',', 'punctuation mark:comma'), (u'新增', 'verb'),
            (u'了', 'particle:particle 了/喽'), (u'甲', 'numeral'),
            (u'卡', 'noun'), (u'西', 'distinguishing word'), (u'酮', 'noun'),
            (u'、', 'punctuation mark:enumeration comma'),
            (u'曲马多', 'noun:personal name:transcribed personal name'),
            (u'、', 'punctuation mark:enumeration comma'),
            (u'安', 'noun:personal name:Chinese surname'), (u'钠', 'noun'),
            (u'咖', 'noun'), (u'等', 'particle:particle 等/等等/云云'),
            (u'12', 'numeral'), (u'种', 'classifier'), (u'新', 'adjective'),
            (u'类型', 'noun'), (u'毒品', 'noun'),
            (u'的', 'particle:particle 的/底'), (u'定罪', 'verb:noun-verb'),
            (u'量刑', 'verb:noun-verb'), (u'数量', 'noun'), (u'标准', 'noun'),
            (u',', 'punctuation mark:comma'),
            (u'并', 'conjunction:coordinating conjunction'), (u'下调', 'verb'),
            (u'了', 'particle:particle 了/喽'), (u'在', 'preposition'),
            (u'我国', 'noun'), (u'危害', 'verb:noun-verb'), (u'较为', 'adverb'),
            (u'严重', 'adjective'), (u'的', 'particle:particle 的/底'),
            (u'毒品', 'noun'), (u'氯', 'noun'), (u'胺', 'noun'), (u'酮', 'noun'),
            (u'的', 'particle:particle 的/底'), (u'定罪', 'verb:noun-verb'),
            (u'量刑', 'verb:noun-verb'), (u'数量', 'noun'), (u'标准', 'noun'),
            (u'。', 'punctuation mark:period'),
        ]

        self.assertEqual(segments, expected_segments)
writer = csv.writer(of, delimiter=",", quotechar='|', quoting=csv.QUOTE_MINIMAL)
errors = 0
unbounderrors = 0

for f in files:
	infile = "./../" + f
	with open(infile, 'rb') as csvfile:
		count = 0
		reader = csv.reader(csvfile, delimiter=",")
		for row in reader:
			if row[10]!="":
				mid = row[0]
				message = row[6]
				censored = 1
				try:
					segmented = pynlpir.segment(message)
				except UnicodeDecodeError:
					errors += 1
					continue
				except UnboundLocalError:
					unbounderrors += 1
					print "what??"
					continue
				except:
					print "core dump...?"
					continue

				mString = ""
				for segment in segmented:
					mString += segment[0]
					mString += " "
Example #56
 def __init__(self, file):
     with open(file, 'r', encoding="utf-8") as f:
         content = f.read()
         pynlpir.open()
         result = pynlpir.segment(content, pos_tagging=False)
     self.PoS = result
Example #57
if not os.path.exists(FILE_NAME_JIEBA):
    with open(FILE_NAME_PREP) as fin:
        with open(FILE_NAME_JIEBA, "w") as fout:
            for line in fin:
                i = i + 1
                if not i % 1000:
                    print("C:%d" % (i))
                line_p = hanzi_prep.split_into_sentences_e(line)
                for line_i in line_p:
                    # separate each Chinese character/token with spaces
                    str_i = "".join(line_i)
                    str_j = ""
                    if USE_SEGMENT == "JIEBA":
                        str_j = " ".join(jieba.cut(str_i, cut_all=False))
                    elif USE_SEGMENT == "ICTCLAS":
                        str_j = " ".join(pynlpir.segment(str_i, pos_tagging=False))
                    else:
                        print("ERROR:未知分词系统!")
                    fout.write(str_j + "\n")

if USE_SEGMENT == "ICTCLAS":
    print("END:ICTCLAS分词系统")
    pynlpir.close()

elif USE_SEGMENT == "JIEBA":
    print("END:JIEBA分词系统")

else:
    print("END:未知分词系统")

# Compute N-gram word-frequency statistics
Example #58
File: test.py Project: xgli/code
#!/usr/bin/env python
# coding=utf-8

import pynlpir
from pynlpir import nlpir



pynlpir.open()

s = '北京邮电大学是一所美丽的学校'
print(pynlpir.segment(s))

nlpir.Exit()
Example #59
import pynlpir
import MySQLdb
# import sys

# reload(sys)
# sys.setdefaultencoding('utf-8')

conn= MySQLdb.connect(
	host='localhost',
	port = 3306,
	user='******',
	passwd='1111',
	db ='sinadb',
	charset='utf8'
)
cur = conn.cursor()
pynlpir.open()

res = cur.execute("SELECT weibotext FROM hot_1_user_weibo WHERE UNIX_TIMESTAMP(time) > UNIX_TIMESTAMP('2016-4-3 12:00:00')")
info = cur.fetchmany(res)
nlpir_results = pynlpir.segment(info[0][0])
for nlpir_result in nlpir_results:
	print(nlpir_result[0], nlpir_result[1])


# pynlpir.open()
# s = u':【东北衰败宣告了国企城市的破产 】东北衰落的原因很简单,那就是经济被国企吸干了。东北是全球苏联式经济的最佳典范。苏联计划经济已经垮台了,东北国企还在苟延残喘……'
# print s
# for x in pynlpir.segment(s):
# 	print x[0], x[1]
def correct_me(str_test, enhance = True):

    print("")
    print("测试语句:%s" %(str_test))
    line_p = hanzi_prep.split_into_sentences(str_test)
    lines = []
    for line_i in line_p:
        lines.extend(line_i)
    str_i = ''.join(lines)
    if USE_SEGMENT == "JIEBA":
        print("==JIEBA分词==")
        jieba_i = ' '.join(jieba.cut(str_i, cut_all=False))
    elif USE_SEGMENT == "ICTCLAS":
        print("==NLPIR分词==")
        jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    else:
        print("未知分词类型!")
        jieba_i = []
    print("分词结果:%s"%(repr(jieba_i)))
    jieba_i = jieba_i.split()
    jieba_len = len(jieba_i)
    if jieba_len < 3:
        print("词数太小,放弃纠错!")
        return
    jieba_key = []
    jieba_pro = []
    for i in range(1,jieba_len):

        # patterns at the very beginning/end are not considered
        tmp_str = jieba_i[i-1] + jieba_i[i]
        pro = JIEBA_HZ.get(tmp_str)

        jieba_key.append(tmp_str)
        if pro:
            jieba_pro.append(pro)
        else:
            jieba_pro.append(0)

    print("分词表:"+repr(jieba_key))
    print("概率表:"+repr(jieba_pro))

    jieba_pro_t = []
    for i in range(0,jieba_len-2):
        jieba_pro_t.append( jieba_pro[i] + jieba_pro[i+1])

    min_index = jieba_pro_t.index(min(jieba_pro_t)) + 1
    print("可疑位置:[%d]->%s"%(min_index,jieba_i[min_index]))
    to_do = []
    g_check_a = None
    g_check_e = None
    # the position to correct can never be the first or the last token
    to_do.append(jieba_i[min_index-1])
    to_do.append(jieba_i[min_index])
    to_do.append(jieba_i[min_index+1])
    if min_index - 2 >= 0:
        g_check_a = jieba_i[min_index-2]
    if min_index + 2 < jieba_len:
        g_check_e = jieba_i[min_index+2]

    print("需要处理:"+repr(to_do))
    print("辅助检测:%s,%s" %(g_check_a, g_check_e))

    # store the final results
    p_res_stage1 = {}
    p_res_stage2 = {}
    p_res_stage3 = {}

    if enhance:
        # STAGE 1: assume the segmentation itself is correct
        p_res_st1 = sub_correct_me_ext(to_do[0], to_do[1], to_do[2], 1)
        # STAGE 2: assume the first and second tokens should be merged
        p_res_st2 = sub_correct_me_ext(g_check_a, to_do[0]+to_do[1], to_do[2], 2)
        # STAGE 3: assume the second and third tokens should be merged
        p_res_st3 = sub_correct_me_ext(to_do[0], to_do[1]+to_do[2], g_check_e, 3)
    else:
        # STAGE 1: assume the segmentation itself is correct
        p_res_st1 = sub_correct_me(to_do[0], to_do[1], to_do[2], 1)
        # STAGE 2: assume the first and second tokens should be merged
        p_res_st2 = sub_correct_me(g_check_a, to_do[0]+to_do[1], to_do[2], 2)
        # STAGE 3: assume the second and third tokens should be merged
        p_res_st3 = sub_correct_me(to_do[0], to_do[1]+to_do[2], g_check_e, 3)

    # print the correction result
    cor_ret = correct_result(to_do, [p_res_st1, p_res_st2, p_res_st3], True)
    if not cor_ret:
        final_words = ['NONE']
    else:
        if cor_ret['type'] == 1:
            final_words = jieba_i[0:min_index-1] + [ to_do[0], cor_ret['item'], to_do[2] ] + jieba_i[min_index+2:jieba_len]
        elif cor_ret['type'] == 2:
            final_words = jieba_i[0:min_index-1] + [ cor_ret['item'], to_do[2] ] + jieba_i[min_index+2:jieba_len]
        elif cor_ret['type'] == 3:
            final_words = jieba_i[0:min_index-1] + [ to_do[0], cor_ret['item'] ] + jieba_i[min_index+2:jieba_len]
        else:
            final_words = ['NONE']

    return ''.join(final_words)