Example #1
def word_segment(text):
    pynlpir.open()
    segments = nlpir.segment_pos(text)
    segment_result = []
    pos_result = []
    for segment in segments:
        segment_result.append(segment[0])
        pos_result.append(segment[1])
    pynlpir.close()
    return segment_result, pos_result
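
If the project-specific `nlpir.segment_pos` helper used above is unavailable, the same two lists can be produced with pynlpir's public API alone. A minimal sketch, assuming nothing beyond `pynlpir` itself:

import pynlpir

def word_segment_sketch(text):
    pynlpir.open()
    try:
        # pos_tagging=True (the default) yields (word, pos) pairs
        pairs = pynlpir.segment(text, pos_tagging=True)
        words = [w for w, _ in pairs]
        tags = [t for _, t in pairs]
    finally:
        pynlpir.close()  # release NLPIR even if segmentation raises
    return words, tags
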
Example #2
def get_key_words(text):
    pynlpir.open()
    result = []
    keywords = pynlpir.get_key_words(text, weighted=True)
    for keyword, weight in keywords:
        result.append(keyword)
    pynlpir.close()  # close even when no keywords were found
    return result
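
With `weighted=True`, `pynlpir.get_key_words` returns (keyword, weight) tuples, so the weights can be kept rather than discarded as above. A small sketch of that variant:

import pynlpir

def get_weighted_key_words(text, max_words=10):
    pynlpir.open()
    try:
        # each item is a (keyword, weight) tuple because weighted=True
        return pynlpir.get_key_words(text, max_words=max_words, weighted=True)
    finally:
        pynlpir.close()
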
Example #3
def nlpir_keywords(text, n):
    pynlpir.open()
    # keyword extraction test
    key_words = list(pynlpir.get_key_words(text, n, weighted=False))
    # for key_word in key_words:
    #     print(key_word)
    pynlpir.close()

    print(key_words)
    return key_words
def words_cixing(question, pos=1):
    # pos=1: return tokens with their POS tags; otherwise return plain tokens
    pynlpir.open()
    if pos:
        result = ['{}/{}'.format(k, v)
                  for k, v in pynlpir.segment(question, pos_names=None, pos_tagging=True)]
    else:
        result = pynlpir.segment(question, pos_tagging=False)
    pynlpir.close()
    return result
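
The `pos_names` argument controls how the tag in each (word, tag) pair is rendered: `None` keeps the raw NLPIR code, while `'parent'`, `'child'`, and `'all'` map it to readable names. A short sketch; the sample sentence is an arbitrary assumption:

import pynlpir

pynlpir.open()
sentence = '我爱北京天安门'  # arbitrary sample sentence
print(pynlpir.segment(sentence, pos_names=None))      # raw codes such as 'n', 'v'
print(pynlpir.segment(sentence, pos_names='parent'))  # e.g. 'noun', 'verb'
print(pynlpir.segment(sentence, pos_names='all'))     # full hierarchy of POS names
pynlpir.close()
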
Example #5
 def test_license_auto_update(self):
     """Tests that the auto-update of the license works."""
     try:
         # switch old one to the new one
         os.rename(os.path.join(DATA_DIR, LICENSE_NAME),
                   os.path.join(DATA_DIR, "{}.copy".format(LICENSE_NAME)))
         os.rename(os.path.join(DATA_DIR, "{}.old".format(LICENSE_NAME)),
                   os.path.join(DATA_DIR, LICENSE_NAME))
         pynlpir.open()
         pynlpir.close()
     finally:
         # switch back the license
         os.rename(os.path.join(DATA_DIR, LICENSE_NAME),
                   os.path.join(DATA_DIR, "{}.old".format(LICENSE_NAME)))
         os.rename(os.path.join(DATA_DIR, "{}.copy".format(LICENSE_NAME)),
                   os.path.join(DATA_DIR, LICENSE_NAME))
Example #6
    def test_license_expire(self):
        """Tests that a RuntimeError is raised if the license is invalid."""
        temp_dir = tempfile.mkdtemp()
        temp_data_dir = os.path.join(temp_dir, 'Data')
        shutil.copytree(DATA_DIR, temp_data_dir)
        shutil.copy(LICENSE_FILE, temp_data_dir)

        self.assertRaises(RuntimeError, pynlpir.open, temp_dir)

        temp_license_file = os.path.join(temp_data_dir, LICENSE_NAME)
        os.remove(temp_license_file)

        self.assertRaises(RuntimeError, pynlpir.open, temp_dir)

        shutil.rmtree(temp_dir)
        pynlpir.close()
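
Both license tests above boil down to the fact that `pynlpir.open()` raises a RuntimeError when the bundled NLPIR license file is missing or expired. A hedged sketch of guarding against that in application code (the message text is illustrative only):

import pynlpir

def open_nlpir_or_report():
    try:
        pynlpir.open()
        return True
    except RuntimeError as exc:
        # usually means the NLPIR license file has expired or is missing
        print('NLPIR could not be initialized: %s' % exc)
        return False

pynlpir also ships a small command-line helper for refreshing the license file, commonly invoked as `pynlpir update`.
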
Example #7
def main():
    py.open()
    a = sys.argv[1]
    result = py.segment(a)
    res_str = []
    for r in result:
        if len(r[0]) == 2 and (r[1] == "noun" or r[1] == "verb" or r[1] == "adjective"):
            f_result = fsame.find(r[0])
            ff_result = fsame.ffind(r[0])
            if f_result == r[0] or ff_result == r[0]:
                res_str.append(r[0])
            else:
                if random.randint(0, 1) == 0:
                    res_str.append(f_result)
                else:
                    res_str.append(ff_result)
        else:
            res_str.append(r[0])
    print "".join(res_str)
    py.close()
Example #8
	def post(self,request):
		obj_id = request.POST['obj_id']
		school = MySchool.objects.get(id=int(obj_id))
		feeds = []

		# weibo
		# App Key:802677147
		# App Secret:f75be23800d779cc9dbbf6b467b7ff61		
		# Redirect url: https://api.weibo.com/oauth2/default.html
		# code: 4ccb7879bf204466b80e02c106d09727

		# read baidu
		params = {'keyword':school.name}

		# send a 3rd party service request
		baidu_consumer.delay(params)

		# read saved feeds
		feeds = MyBaiduStream.objects.filter(school=school).order_by('-last_updated')[:100]
		content = loader.get_template(self.template_name)
		tieba_html= content.render(Context({
			'obj':school,
			'feeds': feeds,
			}))

		# hot topics
		pynlpir.open() # must have this line!
		topics = feeds[:50]
		content = loader.get_template(self.newsticker_template_name)
		newsticker_html= content.render(Context({
			'objs':topics,
			'keywords': pynlpir.get_key_words(''.join([f.name+f.description for f in feeds]), max_words=50, weighted=True)
			}))
		pynlpir.close()

		return HttpResponse(json.dumps({'bd_html':tieba_html,'news_html':newsticker_html}), 
			content_type='application/javascript')
Example #9
def import_userdict(file_dir):
    pynlpir.open()
    nlpir.import_userdict(file_dir)
    pynlpir.close()
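
When no `import_userdict` wrapper is available, the user dictionary can be loaded through the low-level ctypes binding instead, as Example #13 further down does; note that the path must be passed as bytes. A minimal sketch with a placeholder file name:

import pynlpir

pynlpir.open()
# the low-level binding expects a bytes path; 'userdict.txt' is a placeholder
pynlpir.nlpir.ImportUserDict(b'userdict.txt')
pynlpir.close()
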
Example #10
def Participle(list_datas, filename_stopwords):
    # word segmentation
    time_start = time.time()
    print("正在分词...")

    list_garbagesT.clear()
    list_words_stop = GetListWords(filename_stopwords)

    pynlpir.open()
    for data in list_datas:
        # segments = pynlpir.segment(data.content, pos_names='all',pos_english=False)
        # file_nlp.write('\n')
        # for segment in segments:
        #     file_nlp.write("[ %s : %s ]" % (segment[0], segment[1]))
        #     file_nlp.write('\n')

        if len(data.content) < 8:
            data.error = "内容过短"
            list_garbagesT.append(data)
            continue
        list_words = pynlpir.get_key_words(data.content, max_words=70)
        if len(list_words) == 0:
            data.error = "没有分词结果"
            list_garbagesT.append(data)
            continue
        #print("开始停词")
        for word in list_words:
            if word in list_words_stop:
                #print("停了个词" + word)
                continue
            if word == '':
                data.error = "包含空白分词"
                list_garbagesT.append(data)
                break
            # count term frequency
            contentT = data.content
            count = 0
            while contentT.find(word) > -1:
                contentT = contentT.replace(word, '', 1)
                count += 1
            if count == 0:
                data.error = "分词不属于原文"
                list_garbagesT.append(data)
                break
            # save the term-frequency counts
            data.dict_words_tfidf[word] = count
        if len(data.dict_words_tfidf) == 0:
            data.error = "词频统计结果为空"
            list_garbagesT.append(data)
            continue

    # remove garbage records
    for data in list_garbagesT:
        list_datas.remove(data)
        list_garbages.append(data)
    list_garbagesT.clear()

    pynlpir.close()

    time_end = time.time()
    print("用时 : %.2f s" % (time_end - time_start))
    return list_datas
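
The frequency loop above removes each keyword from a copy of the content one occurrence at a time; `str.count` gives the same non-overlapping count directly. A compact sketch of that part of the pipeline:

import pynlpir

def keyword_frequencies(content, max_words=70):
    """Return {keyword: number of occurrences in content} for the NLPIR key words."""
    pynlpir.open()
    try:
        keywords = pynlpir.get_key_words(content, max_words=max_words)
    finally:
        pynlpir.close()
    # str.count gives the same result as the replace-one-at-a-time loop above
    return {word: content.count(word) for word in keywords if word}
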
Example #11
    def segment(self):
        """
        fni:  str;  input file name with path
        fno:  str;  output file name with path
        lang: str;  language code
        pos:  bool; POS tags included
        n:    int;  no. of lines processed
        """
        import copy
        from PyQt5.QtWidgets import QApplication
        from opencc import OpenCC

        openCC = OpenCC('t2s')  # convert from Traditional-to-Simplified
        pynlpir.open(encoding="utf-8")
        print("Finished initializing ITCLAS/NLPIR")
        count = lineCount(self.fni)
        fit = open(self.fni, "r", encoding="UTF-8")
        fot = open(self.fno, "w", encoding="UTF-8", newline="\n")

        sep = " "  # separator of Chinese tokens (space by default)
        n = 0
        for linet in fit:

            n += 1
            if (linet.strip() == ''):  # empty string
                fot.write("\n")
                continue
            lines = openCC.convert(linet.strip())
            lines_seg = pynlpir.segment(lines,
                                        pos_tagging=True,
                                        pos_names=None)
            # segment with optional POS-tagging

            # The following segments the zht text according to the
            # segmentation patterns obtained from NLPIR above
            tokens = [
            ]  # initialize list to hold 'words' of segmented zht line
            pos_tags = [
            ]  # initialize list to hold pos tags of segmented words
            while len(
                    lines_seg) > 0:  # loop until nothing is left in lines_seg
                t, p = lines_seg.pop(
                    0)  # remove leftmost zhs token/POS pair and save to t, p
                m = len(t)  # no. of characters in token
                tokens.append(
                    linet[0:m])  # add corresponding zht token to tokens[]
                pos_tags.append(p)
                linet = linet[
                    m:]  # delete token from zht line (from beginning of string)

            #fot.write(sep.join(tokens)+"\n")  # write zht-seg output
            tok_pos = ["{}".format(x) for x, y in zip(tokens, pos_tags)
                       ]  # tokens only; the POS tags are dropped here
            fot.write(sep.join(tok_pos) + "\n")
            #if (n == 1): break
            if n % 50 == 0:
                self.window.ui.progressBar.setValue(
                    round(100 * n / self.fi_linecount, 0))
                self.window.ui.progressBar.repaint()
                QApplication.processEvents()
        self.window.ui.progressBar.setValue(100)
        self.window.ui.progressBar.repaint()

        fit.close()
        fot.close()
        pynlpir.close()
        self.numLineProcessed = n
        return n
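
The alignment trick in Example #11 relies on OpenCC's t2s conversion preserving the string length character for character, so segmentation boundaries found on the simplified text can be copied back onto the traditional original. A compact sketch of that idea under the same assumption:

import pynlpir
from opencc import OpenCC

def segment_traditional(line_t):
    """Segment a traditional-Chinese line using boundaries found on its simplified form."""
    cc = OpenCC('t2s')
    pynlpir.open(encoding='utf-8')
    try:
        seg = pynlpir.segment(cc.convert(line_t), pos_tagging=False)
    finally:
        pynlpir.close()
    tokens, i = [], 0
    for tok in seg:
        tokens.append(line_t[i:i + len(tok)])  # copy the boundary onto the traditional text
        i += len(tok)
    return tokens
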
Example #12
def close():
    return pynlpir.close()
Example #13
def partition(input_path, output_path):
    '''
    Segment the text files in input_path and write the results to output_path.
    :param input_path: path of the input text files
    :param output_path: path for the segmentation results
    :return: number of words that raised encoding errors
    '''
    f3 = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8')
    f3_name = f3.name

    stop_set = []
    f_stop_list = open(
        'C:/Users/i-zhanghaoran/Desktop/Extract_main_word&Sentiment_anaylsis/extract_main_word/stop_list.txt',
        'r',
        encoding='utf-8')
    for line in f_stop_list:
        stop_set.append(line.split()[0])
    stop_set = set(stop_set)

    os.chdir(input_path)
    f_lst = os.listdir(os.getcwd())
    cnt1 = 0
    nlpir = pynlpir.nlpir
    pynlpir.open()
    nlpir.ImportUserDict(
        b'C:/Users/i-zhanghaoran/Desktop/Extract_main_word&Sentiment_anaylsis/new_bigdic.txt'
    )
    for item in f_lst:
        ans_lst = []
        f = open(item, 'r', encoding='utf-8')
        s = bytes(f.read(), encoding='utf-8')
        f.close()

        size = ctypes.c_int()
        result = nlpir.ParagraphProcessA(s, ctypes.byref(size), True)
        result_t_vector = ctypes.cast(result, ctypes.POINTER(nlpir.ResultT))
        words = []

        for i in range(0, size.value):
            r = result_t_vector[i]
            word = s[r.start:r.start + r.length]
            words.append((word, r.sPOS))

        f2 = open(output_path + item, 'w', encoding='utf-8')
        for word, pos in words:
            # try:
            if word.decode('utf-8') not in stop_set:
                if pos.decode('utf-8') > b'z'.decode('utf-8') or pos.decode(
                        'utf-8').upper() == pos.decode(
                            'utf-8') and pos.decode('utf-8') != '':
                    ans_lst.append((pos.decode('utf-8'), word.decode('utf-8')))
                f2.write(
                    (word.decode('utf-8') + '  ' + pos.decode('utf-8') + '\n'))
                f3.write(
                    (word.decode('utf-8') + '  ' + pos.decode('utf-8') + '\n'))
                # except:
                #     cnt1+=1
                # else:
                #     f2.write(word.decode('utf-8') + '\n')

        keys = pynlpir.get_key_words(s, max_words=10, weighted=False)
        ans_set = list(set(ans_lst))
        feqrence = [0 for k in range(len(ans_set))]
        for k in range(len(ans_set)):
            for item in ans_lst:
                if item == ans_set[k]:
                    feqrence[k] += 1
        f2.write('\n\nMy tags: ')
        type_lst = []
        for item in ans_set:  # ans_set:  ('COMPANY_OF_INDUSTRY_56', '兴业银行')
            if item[0] not in type_lst:
                type_lst.append(item[0])
        type_lst.sort()

        ans_s = ''
        for k in range(len(type_lst)):
            ans_s += str(type_lst[k]) + ': '
            for l in range(len(ans_set)):
                if ans_set[l][0] == type_lst[k]:
                    # call a helper here to express the relation between stocks and funds
                    ans_s += stock2fund(ans_set, feqrence, l)
                    # ans_s+=' ('+str(ans_set[l][1])+': '+str(feqrence[l])+')'
            ans_s += '\n'
        f2.write(ans_s)
        f2.write('\n\nkeyword: ')

        # count how often the segmenter's keywords occur
        keys_f = [0 for l in range(len(keys))]

        commen_last_name = [
            '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周', '徐', '孙', '马',
            '朱', '胡', '郭', '何', '高', '林', '郑', '谢', '罗', '梁', '宋', '唐', '许',
            '韩', '冯', '邓', '曹', '彭', '曾', '蕭', '田', '董', '袁', '潘', '于', '蒋',
            '蔡', '余', '杜', '叶', '程', '苏', '魏', '吕', '丁', '任', '沈', '姚', '卢',
            '姜', '崔', '钟', '谭', '陆', '汪', '范', '金', '石', '廖', '贾', '夏', '韦',
            '付', '方', '白', '邹', '孟', '熊', '秦', '邱', '江', '尹', '薛', '闫', '段',
            '雷', '侯', '龙', '史', '陶', '黎', '贺', '顾', '毛', '郝', '龚', '邵', '万',
            '钱', '严', '覃', '武', '戴', '莫', '孔', '向', '汤'
        ]
        ans3 = ''

        f3.seek(0)
        for line in f3:
            if len(line.split()) == 2:
                name = line.split()[0]
                pos = line.split()[1]
                for l in range(len(keys)):
                    if name == keys[l]:
                        keys_f[l] += 1
                if name[0] in commen_last_name and name not in [
                        '万元', '周一', '周二', '周三', '周四', '周五', '周六', '周日', '周天'
                ] and len(name) in [2, 3] and pos == 'nr':
                    ans3 += '  ' + name

        ans2 = ''
        for l in range(len(keys)):
            ans2 += str(keys[l]) + ': ' + str(keys_f[l]) + '  '

        f2.write(ans2)
        f2.write('\n\nRelated person: ' + ans3)
        f2.close()

    pynlpir.close()
    return cnt1
Example #14
def segment_tagging(sentence):
    pynlpir.open()  # Initializes the NLPIR API
    sentence_segment_tag = pynlpir.segment(
        sentence)  #Get the result of segment and tagging
    pynlpir.close()  # Exits the NLPIR and frees allocated memory.
    return sentence_segment_tag  #return results
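
This snippet (and Examples #26 and #27 below) opens and closes NLPIR around every single sentence; initialization loads the NLPIR dictionaries, so for batch work it is usually cheaper to open once and reuse the session. A small sketch:

import pynlpir

def segment_many(sentences):
    """Segment a batch of sentences within a single NLPIR session."""
    pynlpir.open()
    try:
        return [pynlpir.segment(s) for s in sentences]
    finally:
        pynlpir.close()
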
    def _convert_examples_to_features(self,
                                      examples,
                                      doc_stride,
                                      is_train=True,
                                      topRate=0.5):
        """Loads a data file into a list of `InputBatch`s."""
        unique_id = 1000000000
        features = []
        pynlpir.open()
        with tqdm(total=len(examples),
                  desc="convert examples to features add bm25:") as pbar:
            for example_id, example in enumerate(examples):
                qid = example.qid
                qusetion = example.qusetion
                docids = example.docids
                contexts = example.contexts
                answer = None
                answer_span = None

                features_temp = []
                sub_doc = []
                for context_index, (docid, context) in enumerate(
                        zip(docids, contexts)):
                    if is_train:
                        answer_span = example.answer_span

                    qusetion_tokens = self.tokenizer.tokenize(
                        qusetion) if len(qusetion) > 0 else []
                    if len(qusetion_tokens
                           ) > self.max_query_length:  # cut at tail
                        qusetion_tokens = qusetion_tokens[0:self.
                                                          max_query_length]

                    token_to_orig_index = []
                    orig_to_token_index = []
                    context_tokens = []
                    for (i, word) in enumerate(context):
                        orig_to_token_index.append(len(context_tokens))
                        sub_tokens = self.tokenizer.tokenize(word)
                        for sub_token in sub_tokens:
                            token_to_orig_index.append(i)
                            context_tokens.append(sub_token)

                    token_start_position = None
                    token_end_position = None
                    if is_train:
                        token_start_position = -1
                        token_end_position = -1
                    if is_train:
                        token_start_position = orig_to_token_index[
                            answer_span[0]]
                        if answer_span[1] < len(context) - 1:
                            token_end_position = orig_to_token_index[
                                answer_span[1] + 1] - 1
                        else:
                            token_end_position = len(context_tokens) - 1
                        (token_start_position,
                         token_end_position) = self._improve_answer_span(
                             context_tokens, token_start_position,
                             token_end_position, example.answer)

                    # We can have documents that are longer than the maximum sequence length.
                    # To deal with this we do a sliding window approach, where we take chunks
                    # of the up to our max length with a stride of `doc_stride`.
                    max_context_length = self.max_seq_length - self.max_query_length - 3
                    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
                        "DocSpan", ["start", "length"])
                    doc_spans = []
                    start_offset = 0
                    while start_offset < len(context_tokens):
                        length = len(context_tokens) - start_offset
                        if length > max_context_length:
                            length = max_context_length
                        doc_spans.append(
                            _DocSpan(start=start_offset, length=length))
                        if start_offset + length == len(context_tokens):
                            break
                        start_offset += min(length, doc_stride)

                    for (doc_span_index, doc_span) in enumerate(doc_spans):
                        token_to_orig_map = {}
                        token_is_max_context = {}
                        tokens = ["[CLS]"] + qusetion_tokens + ["[SEP]"]
                        segment_ids = [0] * len(tokens)
                        for i in range(doc_span.length):
                            split_token_index = doc_span.start + i
                            token_to_orig_map[
                                len(tokens
                                    )] = token_to_orig_index[split_token_index]

                            is_max_context = self._check_is_max_context(
                                doc_spans, doc_span_index, split_token_index)
                            token_is_max_context[len(tokens)] = is_max_context
                            tokens.append(context_tokens[split_token_index])
                            segment_ids.append(1)
                        tokens.append("[SEP]")
                        segment_ids.append(1)

                        input_ids = self.tokenizer.convert_tokens_to_ids(
                            tokens)
                        input_mask = [1] * len(tokens)

                        padding = [0] * (self.max_seq_length - len(input_ids))
                        input_ids += padding
                        input_mask += padding
                        segment_ids += padding

                        assert len(input_ids) == self.max_seq_length
                        assert len(input_mask) == self.max_seq_length
                        assert len(segment_ids) == self.max_seq_length

                        start_position = None
                        end_position = None
                        if is_train:
                            # For training, if our document chunk does not contain an annotation
                            # we throw it out, since there is nothing to predict.
                            doc_start = doc_span.start
                            doc_end = doc_span.start + doc_span.length - 1
                            out_of_span = False
                            if not (token_start_position >= doc_start
                                    and token_end_position <= doc_end):
                                out_of_span = True
                            if not out_of_span:
                                doc_offset = len(qusetion_tokens) + 2
                                start_position = token_start_position - doc_start + doc_offset
                                end_position = token_end_position - doc_start + doc_offset
                            else:
                                continue

                        features_temp.append(
                            InputFeatures(
                                unique_id=unique_id,
                                qid=qid,
                                context_index=context_index,
                                doc_span_index=doc_span_index,
                                tokens=tokens,
                                token_to_orig_map=token_to_orig_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask,
                                segment_ids=segment_ids,
                                docid=docid,
                                answer=answer,
                                start_idx=start_position,
                                end_idx=end_position))
                        if not is_train:
                            sub_doc.append(
                                pynlpir.segment(''.join(tokens),
                                                pos_tagging=False))
                        unique_id += 1
                try:
                    if not is_train:
                        bm25_model = BM25(sub_doc)
                        bm25_score = bm25_model.get_scores(
                            pynlpir.segment(qusetion, False))
                        rankindex = np.argsort(-np.array(bm25_score))
                        features.extend([
                            features_temp[i]
                            for i in rankindex[:int(len(rankindex) * topRate)]
                        ])
                    else:
                        features.extend(features_temp)
                except:
                    print('end')
                pbar.update(1)
        pynlpir.close()
        return features
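
The `BM25` class used above is not defined in this snippet. For experimenting with the same ranking idea, the `rank_bm25` package exposes a comparable interface and can stand in for it; the corpus and query below are placeholders, and the equivalence to the snippet's own BM25 implementation is an assumption:

import pynlpir
from rank_bm25 import BM25Okapi  # assumed stand-in for the snippet's BM25 class

pynlpir.open()
docs = ['北京是中国的首都', '巴黎是法国的首都']  # placeholder corpus
tokenized = [pynlpir.segment(d, pos_tagging=False) for d in docs]
bm25 = BM25Okapi(tokenized)
query = pynlpir.segment('中国的首都是哪里', pos_tagging=False)  # placeholder query
print(bm25.get_scores(query))  # one relevance score per document
pynlpir.close()
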
Example #16
    def getKeywordsAndSave(self, *args, **kwargs):
        import pickle
        freq_lower_bound = int(kwargs["freq_lower_bound"])
        token_len_lower_bound = int(kwargs["token_len_lower_bound"])
        doc_len_lower_bound = int(kwargs["doc_len_lower_bound"])
        doc_len_upper_bound = int(kwargs["doc_len_upper_bound"])

        if str(kwargs["method"]) == "keyword":
            file_keywords = open(
                self.conf_io["prefix"] +
                self.conf_io["output_data_directory"] +
                str(kwargs["target_name"]) + '.fine.keywords', 'w')
        elif str(kwargs["method"]) == "normal":
            file_keywords = open(
                self.conf_io["prefix"] +
                self.conf_io["output_data_directory"] +
                str(kwargs["target_name"]) + '.keywords', 'w')
        tokens = []
        token_indexes = {}
        if bool(kwargs["static_file"]) is True:
            source_name = self.conf_io["prefix"] + self.conf_io[
                "output_data_directory"] + str(kwargs["source_name"])
            with open(source_name, 'r') as f:
                _ind = 0
                for ind, line in enumerate(f):
                    try:
                        with Timer('calculateTokens') as t:
                            tokens.append(
                                self.calculateTokens(
                                    line,
                                    method=str(kwargs["method"]),
                                    doc_len_lower_bound=doc_len_lower_bound,
                                    doc_len_upper_bound=doc_len_upper_bound))
                        # [experimental feature]
                        # this is to be used with LDA
                        # to show what raw doc is associated with each topic
                        token_indexes[ind] = _ind
                        _ind += 1
                    except Exception as e:
                        if isinstance(e, KeyboardInterrupt):
                            break
                        print e
                        print "error with ", line
                        continue
                    else:
                        pass
                for line in tokens:
                    if line is not None:
                        filtered_tokens = [
                            token for token in line.split(',')
                            if self.frequency[token.lower()] > freq_lower_bound
                            and len(token) > token_len_lower_bound
                        ]
                        filtered_tokens = ','.join(filtered_tokens)
                        file_keywords.write('%s\n' %
                                            (filtered_tokens.encode('utf-8')))
                        file_keywords.flush()
            f.close()
            # experimental
            json.dump(token_indexes,
                      open(self.f_token_indexes + "token_indexes.pickle", "w"),
                      ensure_ascii=True)
        else:
            doc_list = args[0]
            for ind, line in enumerate(list(doc_list)):
                try:
                    tokens.append(
                        self.calculateTokens(
                            line,
                            method=str(kwargs["method"]),
                            doc_len_lower_bound=doc_len_lower_bound,
                            doc_len_upper_bound=doc_len_upper_bound))
                except Exception as e:
                    if isinstance(e, KeyboardInterrupt):
                        break
                    print e
                    print "error with ", line
                    continue
                else:
                    pass
            for line in tokens:
                if line is not None:
                    filtered_tokens = [
                        token for token in line.split(',')
                        if self.frequency[token.lower()] > freq_lower_bound
                        and len(token) > token_len_lower_bound
                    ]
                    filtered_tokens = ','.join(filtered_tokens)
                    file_keywords.write('%s\n' %
                                        (filtered_tokens.encode('utf-8')))
                    file_keywords.flush()
        file_keywords.close()
        pynlpir.close()
        return True
def segment_filter():
    """
    原始文件分词并过滤
    :return:
    """
    # get the list of files
    file_list = os.listdir(path)
    # file_list = ['caption_validation_annotations_20170910.json']

    res = []  # result list

    # start the segmenter
    pynlpir.open()
    for file_name in file_list:
        file_path = os.path.join(path, file_name)
        # open the file
        f = open(file_path, 'r')
        # load the JSON; the file contains a single line with one JSON array
        j = json.loads(f.readline())
        # j = j[:100]
        # keep only the caption field
        j = [x['caption'] for x in j]
        # segment each caption
        j = [[y.replace('\n', ' ') for y in x] for x in j]
        j = [[pynlpir.segment(y) for y in x] for x in j]
        # filter words by part of speech:
        # keep only words whose POS tag is listed below, and drop the tag
        # see https://github.com/tsroten/pynlpir/blob/master/pynlpir/pos_map.py for tag meanings
        word_filter = ('noun', 'time word', 'locative word',
                       'noun of locality', 'verb', 'adjective',
                       'distinguishing word', 'status word', 'numeral',
                       'adverb')
        j = [[[z[0] for z in y if z[1] in word_filter] for y in x] for x in j]

        # drop sentences that came out empty
        j = [[y for y in x if len(y) != 0] for x in j]

        # remove duplicate captions
        for x in range(len(j)):
            temp = []
            for i in range(len(j[x])):
                flag = True
                for k in range(i + 1, len(j[x])):
                    temp_set1 = set(j[x][i])
                    temp_set2 = set(j[x][k])
                    if len(temp_set1 | temp_set2) == len(temp_set1
                                                         & temp_set2):
                        flag = False
                        break
                if flag:
                    temp.append(j[x][i])
            j[x] = temp
        # append to the result set
        res[len(res):len(res)] = j
    pynlpir.close()
    print('分词完成')

    # wrap the results for JSON output
    res = [{'caption': x} for x in res]
    # dump to a JSON string
    json_obj = json.dumps(res)
    # write to file
    f = open(json_path, 'w')
    f.write(json_obj)
    f.close()
    print('存入json文件:%s' % json_path)
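
The de-duplication pass above keeps a caption only when no later caption has exactly the same word set (the union and intersection of two sets have equal size only when the sets are equal). Comparing the sets directly states the same thing more plainly; note that this sketch keeps the first of each group of duplicates, whereas the loop above keeps the last:

def dedupe_by_word_set(captions):
    """Keep only captions whose set of words has not been seen before."""
    seen, result = [], []
    for words in captions:
        s = set(words)
        if s not in seen:
            seen.append(s)
            result.append(words)
    return result
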
Example #18
 def tearDown(self):
     pynlpir.close()
Example #19
def get_key_words():
    s = ''
    max_words = MAX_WORDS_DEFAULT
    max_hot_words = MAX_HOT_WORDS_DEFAULT
    update_hot_word = UPDATE_HOT_WORD_DEFAULT
    # get doc
    if request.method == 'POST':
        s = request.form.get('s', type=str, default='')
        update_hot_word = request.form.get(
            'update_hot_word', type=str,
            default=UPDATE_HOT_WORD_DEFAULT)  # whether to update the hot_word table
        try:
            max_words = request.form.get('max_words',
                                         type=str,
                                         default=MAX_WORDS_DEFAULT)
            if max_words != '':  # a max_words value was supplied (may be the default '3')
                print('[POST] max_words yes')
                max_words = int(max_words.strip())
                print('\tmax_words =', max_words)
            else:
                max_words = MAX_WORDS_DEFAULT
                print('[POST] max_words no')
        except:  # malformed max_words value; fall back to the default
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = request.form.get('max_hot_words',
                                             type=str,
                                             default=MAX_HOT_WORDS_DEFAULT)
            if max_hot_words != '':
                max_hot_words = int(max_hot_words.strip())
            else:
                max_hot_words = MAX_HOT_WORDS_DEFAULT
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    elif request.method == 'GET':
        s = request.args.get('s')
        update_hot_word = request.args.get('update_hot_word')
        if update_hot_word != 'False':
            update_hot_word = 'True'
        try:
            max_words = int(request.args.get('max_words').strip())
        except:
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = int(request.args.get('max_hot_words').strip())
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT

    print('[PID]', os.getppid())

    # get key words
    if s == '':  # empty document: nothing to analyse
        return 'null'
    else:  # extract key words
        try:
            pynlpir.open()
            key_word_list = pynlpir.get_key_words(s,
                                                  max_words=max_words,
                                                  weighted=False)
            # temp_str = ''
            for i in range(len(key_word_list)):
                key_word_list[i] = key_word_list[i]
        except:
            key_word_list = []
        finally:
            # always release NLPIR, even if key word extraction failed
            pynlpir.close()
        if update_hot_word == 'True':
            # start a new thread to update the database
            print('[update_hot_word] True')
            t = threading.Thread(target=db_helper.update_tables,
                                 args=(','.join(key_word_list), max_hot_words))
            t.setDaemon(True)
            t.start()
        else:
            print('[update_hot_word] False')
        return ','.join(key_word_list)
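
Both `max_words` and `max_hot_words` go through the same strip-convert-fall-back dance for POST and GET. A small helper keeps that logic in one place; the helper name is made up for illustration:

def int_param(raw, default):
    """Parse an integer from a form/query value, falling back to a default."""
    try:
        return int(str(raw).strip())
    except (TypeError, ValueError):
        return default
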
def textual(uid):
    import torch
    import pynlpir
    import random
    import numpy as np
    torch.set_num_threads(8)
    torch.manual_seed(1)
    random.seed(1)
    # opening embedding file
    print('opening embedding file')
    f = open('sgns_weibo.bigram-char', 'r', encoding='utf8')
    raw = f.readlines()
    f.close()

    # constructing word to index dictionary
    print('constructing word to index dictionary')
    word_to_ix = dict()
    iter = 0
    for line in raw:
        word_to_ix[(line.split())[0]] = iter
        iter = iter + 1

    for i in ['ttttt', 'ggggg', 'uuuuu', 'eeeee', 'ooooo', ' ']:
        word_to_ix[i] = iter
        iter += 1

    model_path = 'mr_best_model_minibatch_acc_7863.model'
    EMBEDDING_DIM = 300
    VOCAB_SIZE = 195203
    HIDDEN_DIM = 100  # TO BE TUNED
    LABEL_SIZE = 2
    BATCH_SIZE = 1  # TO BE TUNED
    EPOCH = 100  # TO BE TUNED
    DROPOUT = 0.5  # TO BE TUNED 0.95* every epoch
    NUM_LAYER = 2  # TO BE TUNED
    model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, LABEL_SIZE,
                           BATCH_SIZE, DROPOUT, NUM_LAYER)
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device('cpu')))

    path = './data/' + uid + '.txt'

    text_set = []
    pynlpir.open()
    f = open(path, 'r', encoding='utf8')
    raw = f.readlines()
    f.close()
    if len(raw) == 0:
        return [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]  #to be determined

    for line in raw:
        #print(line)
        if (len(line) == 0):
            continue
        # tokenizer from chinese academy of sciences
        temp = pynlpir.segment(line, pos_tagging=False)  # pos_tagging=False disables POS tagging
        temp2 = [x for x in temp if x != ' ']  # remove redundant spaces

        sentence = []
        for word in temp2:
            if word in word_to_ix.keys():
                sentence.append(word_to_ix[word])
            else:
                sentence.append(word_to_ix['ooooo'])  # oov

        text_set.append(sentence)
    pynlpir.close()

    scores = []
    model.eval()
    if len(text_set) == 0:
        return [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
    for sent in text_set:
        if (len(sent) == 0):
            continue
        model.hidden = model.init_hidden()  #detach from last example
        #print(sent)
        temp = model(torch.tensor(sent))
        temp = (temp[0][1]).item()
        scores.append(np.e**temp)

    #print(scores)
    feature = []
    # mean >0.8 >0.9 <0.1 <0.2 max min median  continuous>0.75 continuous<0.25
    feature.append(np.mean(scores))
    feature.append(more_than(scores, 0.8))
    feature.append(more_than(scores, 0.9))
    feature.append(less_than(scores, 0.1))
    feature.append(less_than(scores, 0.2))
    feature.append(max(scores))
    feature.append(min(scores))
    feature.append(np.median(scores))

    count = 0
    temp = 0
    for x in scores:
        if x > 0.75:
            temp += 1
        else:
            count = max(count, temp)
            temp = 0
    count = max(count, temp)
    feature.append(count)

    count = 0
    temp = 0
    for x in scores:
        if x < 0.25:
            temp += 1
        else:
            count = max(count, temp)
            temp = 0
    count = max(count, temp)
    feature.append(count)
    return feature
Example #21
def createDocMapAndClickInfo(total_set_file, doc_set_file):
    user_map_r2v = {}
    user_map_v2r = {}
    doc_map_r2v = {}  #doc map(Facilitate the calculation in PLSA)
    doc_map_v2r = {}  #not use
    user_set = set()  #total users
    doc_set = set()  #total documents
    user_click_count = {}
    doc_click_count = {}  #clicks in every document
    user_doc_click_count = {}  #clicks in specific document from specific user

    if os.path.isfile(doc_set_file):
        is_write_need_file = True
    else:
        is_write_need_file = False
    fp_total_set = open(total_set_file, 'r')
    if is_write_need_file == False:
        fp_doc_set = open(doc_set_file, 'w')
        fp_doc_map_r2v = open('../PLSA/data/doc_map_r2v.csv', 'w')
        fp_doc_map_v2r = open('../PLSA/data/doc_map_v2r.csv', 'w')
        fp_doc_click_count = open('../PLSA/data/doc_click_count.csv', 'w')
        fp_user_doc_click_count = open('../PLSA/data/user_doc_click_count.csv',
                                       'w')
    cnt = 0
    cnt1 = 0
    pynlpir.open()
    for line in fp_total_set:
        word = line.split('\t')
        user_set.add(word[0])
        doc_set.add(word[1])
        doc_click_count.setdefault(word[1], 0)
        doc_click_count[word[1]] += 1
        user_click_count.setdefault(word[0], 0)
        user_click_count[word[0]] += 1
        user_doc_click_count.setdefault(word[0], {})
        if user_doc_click_count[word[0]].has_key(word[1]) == False:
            user_doc_click_count[word[0]][word[1]] = 0
        user_doc_click_count[word[0]][word[1]] += 1
        if user_map_r2v.has_key(word[0]) == False:
            user_map_r2v[word[0]] = cnt1
            user_map_v2r[cnt1] = word[0]
            cnt1 += 1
        if doc_map_r2v.has_key(word[1]) == False:
            doc_map_r2v[word[1]] = cnt
            doc_map_v2r[cnt] = word[1]
            cnt += 1
            if is_write_need_file == False:
                title_split_result = pynlpir.nlpir.ParagraphProcess(
                    word[4], True)
                content_split_result = pynlpir.nlpir.ParagraphProcess(
                    word[5], True)
                #make sure that news id map is true
                fp_doc_set.write(
                    '%s\t%s\t%s' %
                    (word[1], title_split_result,
                     content_split_result))  #, content_split_result))

    # doc_map = sorted(doc_map_r2v.items(), key=lambda d:d[1], reverse=False)
    if is_write_need_file == False:
        for d, dtag in doc_map_r2v.items():
            fp_doc_map_r2v.write('%s %d\n' % (d, dtag))
        for dtag, d in doc_map_v2r.items():
            fp_doc_map_v2r.write('%d %s\n' % (dtag, d))
        for d, dclicks in doc_click_count.items():
            fp_doc_click_count.write('%s %d\n' % (d, dclicks))
    user_clicks = 0
    for u, uitem in user_doc_click_count.items():
        for d in uitem.keys():
            if is_write_need_file == False:
                fp_user_doc_click_count.write('%s %s %d\n' % (u, d, uitem[d]))
            user_clicks += uitem[d]
    print 'user clicks = ', user_clicks

    pynlpir.close()
    if is_write_need_file == False:
        fp_doc_set.close()
        fp_total_set.close()
    print 'number of users:', len(user_set)
    print 'number of documents:', len(doc_set)

    print 'createDocMap end'
    #user_set (real_user_id) doc_set(real_news_id)
    #doc_map_r2v (real_news_id -> virtual_news_id)
    #doc_map_v2r (virtual_news_id -> real_news_id)
    #doc_click_count (real_news_id -> clicks)
    #user_doc_click_count (real_user_id, real_news_id -> clicks)
    return user_set, doc_set, user_map_r2v, user_map_v2r, doc_map_r2v, doc_map_v2r, user_click_count, doc_click_count, user_doc_click_count
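
Unlike `pynlpir.segment`, which returns a Python list, the low-level `pynlpir.nlpir.ParagraphProcess(s, True)` used above returns the whole paragraph as a single space-delimited string with word/tag markers. A hedged sketch of the difference; the sample sentence is an assumption, and the explicit encode/decode reflects the usual ctypes bytes handling under Python 3:

import pynlpir

pynlpir.open()
sample = '我爱北京天安门'  # sample sentence (assumption)
# low-level call: one space-delimited string, tokens tagged as word/pos
raw = pynlpir.nlpir.ParagraphProcess(sample.encode('utf-8'), True)
print(raw.decode('utf-8'))
# high-level call: a list of (word, pos) tuples
print(pynlpir.segment(sample))
pynlpir.close()
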
def train():
    # opening embedding file
    print('opening embedding file')
    f = open(path + 'sgns.weibo.bigram-char', 'r', encoding='utf8')
    raw = f.readlines()
    f.close()

    # constructing word to index dictionary
    print('constructing word to index dictionary')
    word_to_ix = dict()
    iter = 0
    for line in raw:
        word_to_ix[(line.split())[0]] = iter
        iter = iter + 1

    for i in ['ttttt', 'ggggg', 'uuuuu', 'eeeee', 'ooooo', ' ']:
        word_to_ix[i] = iter
        iter += 1

    # loading the pre-trained embedding vectors
    print('loading the pre-trained embedding vectors')
    embed_vectors = []
    for line in raw:
        embed_vectors.append([float(j) for j in ((line.split())[1:])])

    for i in ['ttttt', 'ggggg', 'uuuuu', 'eeeee', 'ooooo']:
        embed_vectors.append((torch.zeros(300)).tolist())  #randn or zeros

    embed_vectors.append((torch.zeros(300)).tolist())  #for ' '
    global FINAL
    FINAL = len(embed_vectors) - 1

    # load the train, val, test data
    print('load the train, val, test data')
    df = pd.read_excel(path + 'final data.xlsx')

    # generate matrix with first-col uid second-col botornot
    idbotmat = []
    for i in range(len(df)):
        temp = []
        temp.append(df['uid'][i])
        temp.append(df['botornot'][i])
        idbotmat.append(temp)

    # all text into text_set
    # where every element is [sentence, label]
    # sentence is another list
    text_set = []
    pynlpir.open()  # start the segmenter
    for pair in idbotmat:
        f = open(path + 'data/' + str(pair[0]) + '.txt', 'r', encoding='utf8')
        #print(f)
        raw = f.readlines()
        f.close()
        if len(raw) == 0:
            continue

        for line in raw:
            # tokenizer from chinese academy of sciences
            temp = pynlpir.segment(line, pos_tagging=False)  # pos_tagging=False disables POS tagging
            temp2 = [x for x in temp if x != ' ']  #remove redundant spaces

            sentence = []
            for word in temp2:
                if word in word_to_ix.keys():
                    sentence.append(word_to_ix[word])
                else:
                    sentence.append(word_to_ix['ooooo'])  #oov

            data = [sentence, pair[1]]
            text_set.append(data)
    pynlpir.close()
    print('Total sentences: ' + str(len(text_set)))
    random.shuffle(text_set)

    #determine train dev test
    train_ratio = 0.7
    dev_ratio = 0.1
    test_ratio = 0.2
    total_sample = len(text_set)
    train_set = text_set[:int(total_sample * train_ratio)]
    dev_set = text_set[int(total_sample *
                           train_ratio):int(total_sample *
                                            (train_ratio + dev_ratio))]
    test_set = text_set[int(total_sample * (train_ratio + dev_ratio)):]

    print('declaring model')
    best_dev_acc = 0.0
    EMBEDDING_DIM = len(embed_vectors[0])
    VOCAB_SIZE = len(embed_vectors)
    HIDDEN_DIM = 100  # TO BE TUNED
    LABEL_SIZE = 2
    BATCH_SIZE = 64  # TO BE TUNED
    EPOCH = 100  # TO BE TUNED
    DROPOUT = 0.5  # TO BE TUNED 0.95* every epoch
    NUM_LAYER = 2  # TO BE TUNED

    # declare model
    model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, LABEL_SIZE,
                           BATCH_SIZE, DROPOUT, NUM_LAYER)

    #load the word embedding
    embedtensor = torch.tensor(embed_vectors)
    #embedtensor.to(device)
    model.word_embeddings.weight.data = embedtensor
    #model.to(device)

    #how many parameters in the model
    print('Total params in the network: ' + str(count_parameters(model)))

    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-2)  # TO BE TUNED
    #optimizer = optim.SGD(model.parameters(), lr = 1e-2, momentum = 0.9)

    print('training starts now')
    torch.autograd.set_detect_anomaly(True)
    no_up = 0
    for i in range(EPOCH):
        print('epoch: %d start!' % i)
        random.shuffle(train_set)
        model.lstm.dropout = DROPOUT
        train_epoch(model, train_set, loss_function, optimizer, BATCH_SIZE,
                    i)  #what to do? what to pass?
        DROPOUT = DROPOUT * 0.95  #dropout scheduling
        print('now best dev acc:', best_dev_acc)
        dev_acc = evaluate(model, dev_set, loss_function,
                           'dev')  #what to do? what to pass?
        test_acc = evaluate(model, test_set, loss_function,
                            'test')  #what to do? what to pass?
        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            #!rm '/content/gdrive/My Drive/Colab Notebooks/best_models/mr_best_model_minibatch_acc_*.model' #colab only
            os.system(cmd + ' ' + path +
                      'best_models/mr_best_model_minibatch_acc_*.model')
            print('New Best Dev!!!')
            torch.save(
                model.state_dict(),
                path + 'best_models/mr_best_model_minibatch_acc_' +
                str(int(test_acc * 10000)) + '.model')
            no_up = 0
        else:
            no_up += 1
            if no_up >= 10:
                print("so what")
def nlpir_tokenizer(sentence):
    pynlpir.open()
    segs = pynlpir.segment(sentence, pos_tagging=False)
    pynlpir.close()
    return segs
Example #24
def createDocMapAndClickInfo(total_set_file, doc_set_file):
    doc_map1 = {}  #doc map(Facilitate the calculation in PLSA)
    doc_map2 = {}  #not use
    user_set = set()  #total users
    doc_set = set()  #total documents
    doc_click_count = {}  #clicks in every document
    user_doc_click_count = {}  #clicks in specific document from specific user

    if os.path.isfile(doc_set_file):
        is_write_need_file = True
    else:
        is_write_need_file = False
    fp_total_set = open(total_set_file, 'r')
    if is_write_need_file == False:
        fp_doc_set = open(doc_set_file, 'w')
        fp_doc_map1 = open('data//doc_map1.csv', 'w')
        fp_doc_map2 = open('data//doc_map2.csv', 'w')
        fp_doc_click_count = open('data//doc_click_count.csv', 'w')
        fp_user_doc_click_count = open('data//user_doc_click_count.csv', 'w')
    cnt = 0
    pynlpir.open()
    for line in fp_total_set:
        word = line.split('\t')
        user_set.add(word[0])
        doc_set.add(word[1])
        doc_click_count.setdefault(word[1], 0)
        doc_click_count[word[1]] += 1
        user_doc_click_count.setdefault(word[0], {})
        if user_doc_click_count[word[0]].has_key(word[1]) == False:
            user_doc_click_count[word[0]][word[1]] = 0
        user_doc_click_count[word[0]][word[1]] += 1
        if doc_map1.has_key(word[1]) == False:
            doc_map1[word[1]] = cnt
            doc_map2[cnt] = word[1]
            cnt += 1
            if is_write_need_file == False:
                # title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
                content_split_result = pynlpir.nlpir.ParagraphProcess(word[5], True)
                fp_doc_set.write('%s\t%s' %(word[1], content_split_result))#, content_split_result))

    # doc_map = sorted(doc_map1.items(), key=lambda d:d[1], reverse=False)
    if is_write_need_file == False:
        for d, dtag in doc_map1.items():
            fp_doc_map1.write('%s %d\n' %(d, dtag))
        for dtag, d in doc_map2.items():
            fp_doc_map2.write('%d %s\n' %(dtag, d))
        for d, dclicks in doc_click_count.items():
            fp_doc_click_count.write('%s %d\n' %(d, dclicks))
    user_clicks = 0
    for u, uitem in user_doc_click_count.items():
        for d in uitem.keys():
            if is_write_need_file == False:
                fp_user_doc_click_count.write('%s %s %d\n' %(u, d, uitem[d]))
            user_clicks += uitem[d]
    print 'user clicks = ', user_clicks

    pynlpir.close()
    if is_write_need_file == False:
        fp_doc_set.close()
        fp_total_set.close()
    print 'number of users:', len(user_set)
    print 'number of documents:', len(doc_set)

    print 'createDocMap end'
    return user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count
Example #25
def drive_end():
    pynlpir.close()
Example #26
 def pynlpir_segment(self, sentence):
     # segment with pynlpir
     pynlpir.open()
     sentence = pynlpir.segment(sentence, pos_tagging=False)
     pynlpir.close()
     return ' '.join(sentence)
Example #27
def segment(sentence):
    pynlpir.open()  #Initializes the NLPIR API
    sentence_segment = pynlpir.segment(sentence, pos_tagging=False)
    pynlpir.close()  #Exits the NLPIR and frees allocated memory.
    return sentence_segment
Example #28
def close_pynlpir():
    global pynlpir
    pynlpir.close()
Example #29
 def tearDown(self):
     pynlpir.close()
Example #30
                line_p = hanzi_prep.split_into_sentences_e(line)
                for line_i in line_p:
                    # separate the segmented tokens with spaces
                    str_i = "".join(line_i)
                    str_j = ""
                    if USE_SEGMENT == "JIEBA":
                        str_j = " ".join(jieba.cut(str_i, cut_all=False))
                    elif USE_SEGMENT == "ICTCLAS":
                        str_j = " ".join(pynlpir.segment(str_i, pos_tagging=False))
                    else:
                        print("ERROR:未知分词系统!")
                    fout.write(str_j + "\n")

if USE_SEGMENT == "ICTCLAS":
    print("END:ICTCLAS分词系统")
    pynlpir.close()

elif USE_SEGMENT == "JIEBA":
    print("END:JIEBA分词系统")

else:
    print("END:未知分词系统")

# Compute N-gram frequency information
# if not os.path.exists(FILE_NAME_UNIC_LM):
#    str_cmd = "ngram-count -text %s -order 2 -write %s" %(FILE_NAME_UNIC, FILE_NAME_UNIC_CNT)
#    print("正在执行:%s" %(str_cmd))
#    os.system(str_cmd)
#    str_cmd = "ngram-count -read %s -order 2 -lm %s -gt1min 2 -gt1max 5 -gt2min 2 -gt2max 5 " %(FILE_NAME_UNIC_CNT, FILE_NAME_UNIC_LM)
#    print("正在执行:%s" %(str_cmd))
#    os.system(str_cmd)
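
The fragment above switches between jieba and ICTCLAS/NLPIR behind the USE_SEGMENT flag; both can be asked for plain tokens and joined with spaces, which is what makes them interchangeable here. A side-by-side sketch with an assumed sample sentence:

import jieba
import pynlpir

sentence = '我爱北京天安门'  # sample sentence (assumption)

# jieba: returns a generator of tokens
jieba_line = ' '.join(jieba.cut(sentence, cut_all=False))

# ICTCLAS/NLPIR via pynlpir: returns a list of tokens
pynlpir.open()
nlpir_line = ' '.join(pynlpir.segment(sentence, pos_tagging=False))
pynlpir.close()

print(jieba_line)
print(nlpir_line)
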
Example #31
 def close(self):
     pynlpir.close()
Example #32
def createDocMapAndClickInfo(total_set_file, doc_set_file):
    doc_map1 = {}  #doc map(Facilitate the calculation in PLSA)
    doc_map2 = {}  #not use
    user_set = set()  #total users
    doc_set = set()  #total documents
    doc_click_count = {}  #clicks in every document
    user_doc_click_count = {}  #clicks in specific document from specific user

    if os.path.isfile(doc_set_file):
        is_write_need_file = True
    else:
        is_write_need_file = False
    fp_total_set = open(total_set_file, 'r')
    if is_write_need_file == False:
        fp_doc_set = open(doc_set_file, 'w')
        fp_doc_map1 = open('data//doc_map1.csv', 'w')
        fp_doc_map2 = open('data//doc_map2.csv', 'w')
        fp_doc_click_count = open('data//doc_click_count.csv', 'w')
        fp_user_doc_click_count = open('data//user_doc_click_count.csv', 'w')
    cnt = 0
    pynlpir.open()
    for line in fp_total_set:
        word = line.split('\t')
        user_set.add(word[0])
        doc_set.add(word[1])
        doc_click_count.setdefault(word[1], 0)
        doc_click_count[word[1]] += 1
        user_doc_click_count.setdefault(word[0], {})
        if user_doc_click_count[word[0]].has_key(word[1]) == False:
            user_doc_click_count[word[0]][word[1]] = 0
        user_doc_click_count[word[0]][word[1]] += 1
        if doc_map1.has_key(word[1]) == False:
            doc_map1[word[1]] = cnt
            doc_map2[cnt] = word[1]
            cnt += 1
            if is_write_need_file == False:
                # title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
                content_split_result = pynlpir.nlpir.ParagraphProcess(
                    word[5], True)
                fp_doc_set.write(
                    '%s\t%s' %
                    (word[1], content_split_result))  #, content_split_result))

    # doc_map = sorted(doc_map1.items(), key=lambda d:d[1], reverse=False)
    if is_write_need_file == False:
        for d, dtag in doc_map1.items():
            fp_doc_map1.write('%s %d\n' % (d, dtag))
        for dtag, d in doc_map2.items():
            fp_doc_map2.write('%d %s\n' % (dtag, d))
        for d, dclicks in doc_click_count.items():
            fp_doc_click_count.write('%s %d\n' % (d, dclicks))
    user_clicks = 0
    for u, uitem in user_doc_click_count.items():
        for d in uitem.keys():
            if is_write_need_file == False:
                fp_user_doc_click_count.write('%s %s %d\n' % (u, d, uitem[d]))
            user_clicks += uitem[d]
    print 'user clicks = ', user_clicks

    pynlpir.close()
    if is_write_need_file == False:
        fp_doc_set.close()
        fp_total_set.close()
    print 'number of users:', len(user_set)
    print 'number of documents:', len(doc_set)

    print 'createDocMap end'
    return user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count
Example #33
def tycl_replace(pat_name):
    global tycl_rep
    global stop_pos
    pynlpir.open()
    fp = file(os.path.join('./manual_pattern', pat_name), 'rb')
    fp_out = file(os.path.join('./extend_pattern', pat_name + '_tycl'), 'wb')
    for pat_line in fp:
        fp_out.write('=' * 50 + '\n')
        pat_line = (pat_line.strip()).decode('UTF-8')
        pat_line = pat_line.split()
        pat = pat_line[0]
        c_pat = pat
        for k, v in tycl_rep.iteritems():
            c_pat = c_pat.replace(k, v)
        seg_list = pynlpir.segment(pat, pos_tagging=False)
        seg_line = pynlpir.segment(pat, pos_tagging=True)
        c_seg_line = pynlpir.segment(c_pat, pos_tagging=True)
        for s_i in range(len(seg_line) - 1, -1, -1):
            seg_line[s_i] = list(seg_line[s_i])
            if seg_line[s_i][1] == None:
                seg_line[s_i][1] = u'None'
                if seg_line[s_i][0] == u' ':
                    del seg_line[s_i]
            else:
                seg_line[s_i][1] = seg_line[s_i][1].replace(' ', '-')
        seg_line_str = ' '.join('/'.join(y) for y in seg_line)
        for s_i in range(len(c_seg_line) - 1, -1, -1):
            c_seg_line[s_i] = list(c_seg_line[s_i])
            if c_seg_line[s_i][1] == None:
                c_seg_line[s_i][1] = u'None'
                if c_seg_line[s_i][0] == u' ':
                    del c_seg_line[s_i]
            else:
                c_seg_line[s_i][1] = c_seg_line[s_i][1].replace(' ', '-')
        c_seg_line_str = ' '.join('/'.join(y) for y in c_seg_line)
        fp_out.write(pat.encode('UTF-8') + '\n')
        fp_out.write(c_pat.encode('UTF-8') + '\n')
        fp_out.write('/'.join(y.encode('UTF-8') for y in seg_list))
        fp_out.write('\n')
        fp_out.write(seg_line_str.encode('UTF-8') + '\n')
        fp_out.write(c_seg_line_str.encode('UTF-8') + '\n')
        ss_i = 0  #the relative index of the word
        for s_i in range(len(seg_line)):
            left_sign = seg_line[s_i][0]
            right_sign = seg_line[s_i][1]
            if left_sign == "MED" or left_sign == "DIS" or left_sign == "SYM" or left_sign == "TRE" or \
               left_sign == "<":
                continue
            elif right_sign == None or right_sign == "punctuation-mark" or right_sign == "numeral" or \
               right_sign == "particle":
                ss_i += 1
                continue
            else:
                rep_word = seg_list[s_i]
                rep_ret = set()
                ids_list = sear_num_new(rep_word)
                if ids_list != None:
                    for id_i in ids_list:
                        one_part = sear_words(id_i)
                        assert one_part != None
                        if len(one_part) > 1:
                            #print rep_word, one_part
                            tmp_part = one_part[:]
                            tmp_part.remove(rep_word)
                            #TODO... the replace operations
                            for tmp_i in range(len(tmp_part) - 1, -1, -1):
                                rep_w = tmp_part[tmp_i]
                                new_str = seg_list[:s_i]
                                new_str.append(rep_w)
                                new_str.extend(seg_list[s_i + 1:])
                                new_str = ''.join(new_str)
                                c_new_str = new_str
                                for k, v in tycl_rep.iteritems():
                                    c_new_str = c_new_str.replace(k, v)
                                fp_out.write('*** ' +
                                             c_new_str.encode('UTF-8') + '\t')
                                new_seg = pynlpir.segment(new_str,
                                                          pos_tagging=True)
                                c_new_seg = pynlpir.segment(c_new_str,
                                                            pos_tagging=True)
                                #something to do
                                for n_i in range(len(c_new_seg) - 1, -1, -1):
                                    c_new_seg[n_i] = list(c_new_seg[n_i])
                                    if c_new_seg[n_i][1] == None:
                                        c_new_seg[n_i][1] = u'None'
                                        if c_new_seg[n_i][0] == u' ':
                                            del c_new_seg[n_i]
                                    else:
                                        c_new_seg[n_i][1] = c_new_seg[n_i][
                                            1].replace(' ', '-')
                                c_new_seg_str = ' '.join('/'.join(y)
                                                         for y in c_new_seg)
                                fp_out.write(
                                    c_new_seg_str.encode('UTF-8') + '\n')
                                # candidate segmentation normalized and logged
                                is_continue = False
                                if len(c_new_seg) == len(c_seg_line):
                                    if c_new_seg[ss_i][1] == c_seg_line[ss_i][
                                            1]:  #only compare the pos of the word
                                        pass
                                    else:
                                        tmp_part.remove(rep_w)
                                        continue
                                    for front_i in range(ss_i - 1, -1, -1):
                                        if c_new_seg[front_i][0] == c_seg_line[
                                                front_i][0] and c_new_seg[
                                                    front_i][1] == c_seg_line[
                                                        front_i][1]:
                                            pass
                                        else:
                                            tmp_part.remove(rep_w)
                                            is_continue = True
                                            break
                                    if is_continue:
                                        continue
                                    for back_i in range(
                                            ss_i + 1, len(c_seg_line)):
                                        if c_new_seg[back_i][0] == c_seg_line[
                                                back_i][0] and c_new_seg[
                                                    back_i][1] == c_seg_line[
                                                        back_i][1]:
                                            pass
                                        else:
                                            tmp_part.remove(rep_w)
                                            break
                                else:
                                    tmp_part.remove(rep_w)
                                    continue
                            #DONE
                            for ti in tmp_part:
                                rep_ret.add(ti)
                fp_out.write('>>>>>> %s\n' % rep_word.encode('UTF-8'))
                #for ret_i in rep_ret:
                fp_out.write(' '.join(y.encode('UTF-8') for y in rep_ret))
                fp_out.write('\n')
                ss_i += 1
        '''
        for i in range(1, len(pat_line)):
            fp_out.write(' '+pat_line[i].encode('UTF-8'))
        '''
    fp.close()
    fp_out.close()
    pynlpir.close()
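The replacement loop above accepts a synonym only if re-segmenting the modified sentence keeps the segmentation length, the surrounding tokens and the POS tag of the replaced word unchanged. A condensed, self-contained sketch of that acceptance test (a hypothetical helper, not part of the original code; it assumes pynlpir has already been opened):

def substitution_keeps_context(orig_seg, candidate_text, idx):
    # orig_seg: list of (word, pos) pairs for the original sentence
    # candidate_text: the sentence with the word at position idx replaced
    new_seg = pynlpir.segment(candidate_text, pos_tagging=True)
    if len(new_seg) != len(orig_seg):
        return False
    # the replaced token must keep its POS tag ...
    if new_seg[idx][1] != orig_seg[idx][1]:
        return False
    # ... and every other token must be unchanged in both form and tag
    for i, pair in enumerate(new_seg):
        if i != idx and tuple(pair) != tuple(orig_seg[i]):
            return False
    return True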
def parse_words(s):
    pynlpir.open()
    key_words = pynlpir.get_key_words(s, weighted=True)
    pynlpir.close()
    return key_words
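A minimal usage sketch for parse_words (the sample sentence is illustrative; it assumes a working NLPIR installation with a valid licence):

if __name__ == '__main__':
    # with weighted=True each entry is a (keyword, weight) pair
    for word, weight in parse_words('自然语言处理是人工智能的一个重要方向。'):
        print(word, weight)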
Beispiel #35
0
    def partition(self, input_path, output_path):
        """
        分词,把input _path 里的文本文件分词,结果存在output_path
        :param input_path: 文本文件路径
        :param output_path: 分词结果的路径
        :return: 编码错误的词的错误
        """
        f3 = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8')
        f3_name = f3.name

        stop_set = []
        f_stop_list = open(self.root_dir + '/nlp/stop_list.txt',
                           'r',
                           encoding='utf-8')
        for line in f_stop_list:
            if line.split():
                stop_set.append(line.split()[0])
        stop_set = set(stop_set)

        os.chdir(input_path)
        f_lst = os.listdir(os.getcwd())
        cnt1 = 0
        nlpir = pynlpir.nlpir
        pynlpir.open()

        big_dic = self.root_dir + '/nlp/new_bigdic.txt'
        nlpir.ImportUserDict(big_dic.encode('utf-8'))

        for f_item in f_lst:
            try:
                ans_lst = []
                f = open(f_item, 'r', encoding='utf-8')
                s = bytes(f.read(), encoding='utf-8')
                f.close()

                size = ctypes.c_int()
                result = nlpir.ParagraphProcessA(s, ctypes.byref(size), True)
                result_t_vector = ctypes.cast(result,
                                              ctypes.POINTER(nlpir.ResultT))
                words = []

                for i in range(0, size.value):
                    r = result_t_vector[i]
                    word = s[r.start:r.start + r.length]
                    words.append((word, r.sPOS))

                for word, pos in words:
                    # try:
                    if word.decode('utf-8') not in stop_set:
                        # keep words whose tag sorts after 'z' or whose tag is
                        # fully upper-case and non-empty (custom/entity tags)
                        if pos.decode('utf-8') > b'z'.decode(
                                'utf-8') or pos.decode(
                                    'utf-8').upper() == pos.decode(
                                        'utf-8') and pos.decode('utf-8') != '':
                            ans_lst.append(
                                (pos.decode('utf-8'), word.decode('utf-8')))

                        f3.write((word.decode('utf-8') + '  ' +
                                  pos.decode('utf-8') + '\n'))

                keys = pynlpir.get_key_words(s, max_words=10, weighted=False)
                ans_set = list(set(ans_lst))
                frequency = [0 for k in range(len(ans_set))]
                for k in range(len(ans_set)):
                    for item in ans_lst:
                        if item == ans_set[k]:
                            frequency[k] += 1

                type_lst = []
                for item in ans_set:  # ans_set entries look like ('COMPANY_OF_INDUSTRY_56', '兴业银行')
                    if item[0] not in type_lst:
                        type_lst.append(item[0])
                type_lst.sort()

                ans_s = ''
                main_character = self.select_main_character(
                    f_item, ans_set, frequency)
                # print('return things',main_character)
                if main_character:
                    f2 = open(output_path + f_item, 'w', encoding='utf-8')
                    main_company = main_character[0]
                    main_industry = main_character[1]
                    if main_company:
                        ans_s += 'Main company:  '
                        for _ in range(len(main_company)):
                            ans_s += str(main_company[_][0][1]) + '\t' + str(
                                main_company[_][0][0])
                        ans_s += '\n'

                    if main_industry:
                        ans_s += 'Main industry:  '
                        for _ in range(len(main_industry)):
                            ans_s += str(main_industry[_][0][1]) + '\t' + str(
                                main_industry[_][0][0])
                        ans_s += '\n'

                    f2.write(ans_s)
                    # if both are empty the document has no main subject and is discarded

                    # count the frequency of the keywords returned by the segmenter
                    keys_f = [0 for l in range(len(keys))]

                    # find person names appearing in the text and count keyword frequencies at the same time
                    commen_last_name = [
                        '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周', '徐',
                        '孙', '马', '朱', '胡', '郭', '何', '高', '林', '郑', '谢', '罗',
                        '梁', '宋', '唐', '许', '韩', '冯', '邓', '曹', '彭', '曾', '蕭',
                        '田', '董', '袁', '潘', '于', '蒋', '蔡', '余', '杜', '叶', '程',
                        '苏', '魏', '吕', '丁', '任', '沈', '姚', '卢', '姜', '崔', '钟',
                        '谭', '陆', '汪', '范', '金', '石', '廖', '贾', '夏', '韦', '付',
                        '方', '白', '邹', '孟', '熊', '秦', '邱', '江', '尹', '薛', '闫',
                        '段', '雷', '侯', '龙', '史', '陶', '黎', '贺', '顾', '毛', '郝',
                        '龚', '邵', '万', '钱', '严', '覃', '武', '戴', '莫', '孔', '向',
                        '汤'
                    ]
                    ans3 = ''
                    f3.seek(0)
                    for line in f3:
                        if len(line.split()) == 2:
                            name = line.split()[0]
                            pos = line.split()[1]
                            for l in range(len(keys)):
                                if name == keys[l]:
                                    keys_f[l] += 1
                                    # if name[0] in commen_last_name and name not in ['万元','周一','周二','周三','周四','周五','周六','周日','周天'] and len(name) in [2,3] and pos=='nr':
                                    #     ans3+='  '+name

                    ans2 = 'Key words:  '
                    for l in range(len(keys)):
                        ans2 += str(keys[l]) + ': ' + str(keys_f[l]) + '  '

                    f2.write(ans2)
                    # f2.write('\n\nRelated person: '+ans3)
                    f2.close()
                else:
                    continue
            except Exception as e:
                print('Exception in partition_main_character', e)
        pynlpir.close()
        return cnt1
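A hypothetical driver for partition (the class name Partitioner, the root_dir layout with nlp/stop_list.txt and nlp/new_bigdic.txt, and the select_main_character helper all live outside this excerpt and are assumed here):

# hypothetical usage; class name and paths are placeholders
p = Partitioner()
p.root_dir = '/srv/textmine'  # must contain nlp/stop_list.txt and nlp/new_bigdic.txt
p.partition('/srv/textmine/raw_news/', '/srv/textmine/seg_news/')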
Beispiel #36
0
 def close(self):
     """关闭与释放nlp"""
     pynlpir.close()
     self.postagger.release()
     self.recognizer.release()
     self.parser.release()
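The released postagger/recognizer/parser attributes look like pyltp models; a minimal sketch of the matching initialisation, assuming pyltp and a local LTP model directory (the class name and model paths are placeholders, not taken from the original code):

import os
import pynlpir
from pyltp import Postagger, NamedEntityRecognizer, Parser

class NlpPipeline(object):  # hypothetical enclosing class
    def __init__(self, ltp_model_dir):
        pynlpir.open()
        self.postagger = Postagger()
        self.postagger.load(os.path.join(ltp_model_dir, 'pos.model'))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(ltp_model_dir, 'ner.model'))
        self.parser = Parser()
        self.parser.load(os.path.join(ltp_model_dir, 'parser.model'))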
Beispiel #37
0
def import_userdict(file_dir):
    pynlpir.open()
    nlpir.import_userdict(file_dir)
    pynlpir.close()
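A hedged usage sketch: NLPIR user dictionaries are plain-text files, typically one word per line optionally followed by a POS tag (treat the exact format as an assumption), loaded before segmentation:

# write a tiny user dictionary (format assumed: "word pos" per line)
with open('userdict.txt', 'w', encoding='utf-8') as f:
    f.write('区块链 n\n')

import_userdict('userdict.txt')

Note that import_userdict opens and closes its own NLPIR session; whether the imported entries persist into later sessions depends on the NLPIR build, so importing inside the same session that calls pynlpir.segment() is the safer pattern.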
Beispiel #38
0
    def __init__(self, filename=TRAINSETFILE, IsTraining=True, IsSegment=True):
        # distinguish training set from test set, and choose whether to segment
        # the only difference is that a training record starts with four
        # user-attribute fields,
        # while a test record starts with a single user-attribute field
        # if segmentation is enabled, the useful information kept on the
        # object after reading is:
        # the user attribute list
        # the per-user word-frequency list
        # the overall dictionary
        self.userlist = []
        self.userinfo = []
        self.dict = Counter({})
        self.IsTraining = IsTraining
        self.IsSegment = IsSegment
        self.IsDF = False
        with open(filename, encoding='GB18030') as file:
            filereader = csv.reader(file,
                                    dialect='excel-tab',
                                    quoting=csv.QUOTE_NONE)
            if not IsSegment:
                for item in filereader:
                    self.userlist.append(item)
            else:
                pynlpir.open()
                if IsTraining:
                    infoflag = 4
                else:
                    infoflag = 1
                # count_test =0
                for userquery in filereader:

                    begin = datetime.now()

                    userdict = {}
                    userdictflag = {}  # when computing DF, flag whether the word was already counted for this record

                    self.userinfo.append(userquery[:infoflag])
                    for item in userquery[infoflag:]:

                        # # using Counter, but the extra passes made it slower: 0.194 s for the first record
                        # userdict = Counter(pynlpir.segment(item))
                        # userdict = Counter({word[0]:value for word,value in userdict.items() if word[1] in wordset})
                        # self.dict += Counter({word:1 for word in userdict})

                        # # using Counter with the loop order changed, still slower: 0.161 s for the first record
                        # userdict = [word[0] for word in pynlpir.segment(item) if word[1] in wordset]
                        # userdict = Counter(userdict)
                        # self.dict += Counter({word:1 for word in userdict})

                        # # using the Counter class
                        # pass

                        # the original loop-based counting: 0.149 s for the first record
                        for word in pynlpir.segment(item):
                            if word[1] in wordset:
                                word = word[0]
                                if word in userdict.keys():
                                    userdict[word] += 1
                                    userdictflag[word] = False
                                else:
                                    userdict[word] = 1
                                    userdictflag[word] = True
                                if word not in self.dict.keys():
                                    self.dict[word] = 0
                                if userdictflag[word]:
                                    self.dict[word] += 1

                    self.userlist.append(userdict)

                    end = datetime.now()
                    print(end - begin)

                    # count_test +=1
                    # if count_test>100:
                    #    break
                pynlpir.close()
                self.IsDF = True
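A hypothetical driver for this loader (the class name QueryCorpus, the training-file path and the global wordset of accepted POS names are assumptions about code outside this excerpt):

# assumed globals used by __init__ above
TRAINSETFILE = 'train_queries.csv'       # placeholder: tab-separated, GB18030-encoded
wordset = {'noun', 'verb', 'adjective'}  # POS names kept when counting words

corpus = QueryCorpus(TRAINSETFILE, IsTraining=True, IsSegment=True)
print(len(corpus.userlist), 'records,', len(corpus.dict), 'distinct words')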
Beispiel #39
0
	def __del__(self):
		pynlpir.close()
Beispiel #40
0
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path: path of the raw (unsegmented) corpus
    seg_path: path where the segmented corpus is stored
    '''
    max_seg = 80000
    max_train_seg = 70000
    pynlpir.open()  # initialise the NLPIR segmenter
    catelist = os.listdir(corpus_path)  # list all sub-directories under corpus_path
    '''
    The sub-directory names are the category names, e.g. py_data01/留学/
    '''

    # collect all files under each category directory
    finish = ['产经', '法治', '房产', '教育', '金融', '军事', '能源', '台湾', '文化', '证券']
    for category in catelist:
        '''
        Here category is the class label, e.g. 军事 (military)
        '''
        if category not in finish:

            i = 0
            flag = 0
            #class_path = corpus_path + category + "/"  # build the category sub-directory path, e.g. train_corpus/art/
            class_path = os.path.join(corpus_path, category)
            #seg_dir = seg_path + category + "/"  # build the matching output directory, e.g. train_corpus_seg/art/
            seg_dir = os.path.join(seg_path, category)
            seg_test_dir = os.path.join(
                "D:\\Py_Learn\\textclassify_work\\results\\test_corpus_seg",
                category)
            #print("wh-1",seg_dir)
            if not os.path.exists(seg_dir):  # create the segmented-output directory if it does not exist

                os.makedirs(seg_dir)
            if not os.path.exists(seg_test_dir):  # create the test-set output directory if it does not exist

                os.makedirs(seg_test_dir)

            years = os.listdir(class_path)  # list the year sub-directories of this category

            for year in years:  # iterate over everything under the category directory

                if flag == 1:
                    break
                yearname = os.path.join(
                    class_path, year)  # build the year path, e.g. train_corpus/art/21
                months = os.listdir(yearname)

                for month in months:

                    if flag == 1:
                        break
                    print("cleaning:" + category + "month" + month)
                    month_path = os.path.join(yearname, month)
                    raw_path = os.listdir(month_path)
                    for document in raw_path:
                        i += 1
                        if i > max_seg:
                            flag = 1
                            break
                        fullname = os.path.join(month_path, document)
                        content = readfile(fullname)  # read the file content
                        '''
                        At this point content still holds every character of the
                        raw text, including redundant spaces, blank lines and line
                        breaks; these irrelevant characters are stripped so that
                        only punctuation separates the compact text.
                        '''
                        #content = content.replace('\n', '')  # remove line breaks
                        #content = content.replace(' ', '')  # remove blank lines and redundant spaces
                        try:
                            con_segx = pynlpir.segment(content,
                                                       pos_english=True)
                        except UnicodeDecodeError:
                            print(category + " " + document +
                                  " UnicodeDecodeError_wh")
                            continue  # skip files that cannot be decoded
                        content_seg = [
                            element[0] for element in con_segx
                            if element[1] == 'noun'
                        ]
                        #content_seg = jieba.cut(content)  # segment the file content with jieba instead
                        if i <= max_train_seg:
                            savefile(
                                seg_dir + "\\" + document,
                                " ".join(content_seg))  # 将处理后的文件保存到分词后语料目录
                        else:
                            savefile(seg_test_dir + "\\" + document,
                                     " ".join(content_seg))
    pynlpir.close()

    print("中文语料分词结束!!!")
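A hedged driver for corpus_segment; readfile and savefile are helpers referenced above but not shown in this excerpt, so minimal stand-ins are sketched here (paths are illustrative):

# minimal stand-ins for the helpers assumed by corpus_segment
def readfile(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()

def savefile(path, content):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

corpus_segment("D:\\Py_Learn\\textclassify_work\\py_data01",
               "D:\\Py_Learn\\textclassify_work\\results\\train_corpus_seg")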
Beispiel #41
0
# -*- coding: utf-8 -*-
"""
Created on Mon May  6 19:00:50 2019

@author: 92111
"""

import pynlpir
pynlpir.open()
with open("test.txt", "r", encoding='utf-8') as f1:
    text = f1.read()
    seg_list = pynlpir.segment(text, pos_tagging=False)  # plain tokens, no POS tags
    f2 = open("result.txt", "a", encoding='utf-8')
    for word in seg_list:
        f2.write(word + " ")
    f2.close()
pynlpir.close()
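If POS tags are wanted in the output, an illustrative variant keeps the default tagging and writes word/tag pairs instead (file names are placeholders):

# illustrative variant that keeps POS tags in the output
pynlpir.open()
with open("test.txt", "r", encoding='utf-8') as fin, \
        open("result_pos.txt", "w", encoding='utf-8') as fout:
    for word, pos in pynlpir.segment(fin.read(), pos_tagging=True):
        fout.write('{}/{} '.format(word, pos))
pynlpir.close()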
Beispiel #42
0
def parse_words(s):
    # use pynlpir to extract the string's keywords and their weights
    pynlpir.open()
    key_words = pynlpir.get_key_words(s, weighted=True)
    pynlpir.close()
    return key_words