Beispiel #1
0
def main():
    fp_dict = open('../dict.txt')
    wordDict = {}
    for eachWord in fp_dict:
        wordDict[u(eachWord.split('\t')[0].strip(), 'utf-8')] = 1
    segStr = u'''
    00 19960101
01 第3版/国际
02 ■
03 短讯 阿拉法特视察拉马拉
04 ■
05
  巴勒斯坦自治领导机构主席阿拉法特30日上午视察了拉马拉,受到成千上万
巴勒斯坦市民的热烈欢迎。阿拉法特检阅了仪仗队,并在震耳欲聋的欢呼声中向群
众发表了讲话。
00 19960101
01 第3版/国际
02 ■
03 北约驻波黑维和部队司令表示 拒绝塞族推迟移交塞控区
04 ■
05
  新华社贝尔格莱德12月30日电 北约驻波黑维和部队总司令史密斯30日
致函波黑塞族议会主席克拉伊什尼克,拒绝塞族领导人关于推迟移交萨拉热窝塞族
区的要求。
    '''
    print segStr
    wordList = fwd_mm_seg(wordDict, 4, segStr)
    for wordstr in wordList:
        print wordstr,'/',
Beispiel #2
0
def main():
    fp_dict = open('../dict.txt')
    wordDict = {}
    for eachWord in fp_dict:
        wordDict[u(eachWord.split('\t')[0].strip(), 'utf-8')] = 1
    segStr = u'''
    00 19960101
01 第3版/国际
02 ■
03 短讯 阿拉法特视察拉马拉
04 ■
05
  巴勒斯坦自治领导机构主席阿拉法特30日上午视察了拉马拉,受到成千上万
巴勒斯坦市民的热烈欢迎。阿拉法特检阅了仪仗队,并在震耳欲聋的欢呼声中向群
众发表了讲话。
00 19960101
01 第3版/国际
02 ■
03 北约驻波黑维和部队司令表示 拒绝塞族推迟移交塞控区
04 ■
05
  新华社贝尔格莱德12月30日电 北约驻波黑维和部队总司令史密斯30日
致函波黑塞族议会主席克拉伊什尼克,拒绝塞族领导人关于推迟移交萨拉热窝塞族
区的要求。
    '''
    print segStr
    wordList = fwd_mm_seg(wordDict, 4, segStr)
    for wordstr in wordList:
        print wordstr, '/',
Beispiel #3
0
def main():
    fp_dict = open('../dict.txt')
    wordDict = {}
    for eachWord in fp_dict:
        wordDict[u(eachWord.split('\t')[0].strip(), 'utf-8')] = 1
    segStr = u'巴勒斯坦自治领导机构主席阿拉法特'
    print segStr
    bwd_mm_seg(wordDict, 4, segStr)
    wordList = bwd_mm_seg(wordDict, 4, segStr)
    for wordstr in wordList:
        print wordstr,
Beispiel #4
0
def main():
    fp_dict = open('../dict.txt')
    wordDict = {}
    for eachWord in fp_dict:
        wordDict[u(eachWord.split('\t')[0].strip(), 'utf-8')] = 1
    segStr = u'巴勒斯坦自治领导机构主席阿拉法特'
    print segStr
    bwd_mm_seg(wordDict, 4, segStr)
    wordList = bwd_mm_seg(wordDict, 4, segStr)
    for wordstr in wordList:
        print wordstr,
Beispiel #5
0
    def test__get_diffs(self, mock_is_string, mock_get_data, mock_diff, 
                        mock_open):
        """Exercise ``_get_diffs`` for binary, sensitive and interactive cases.

        Each scenario starts from a fresh copy of the same ``Path`` entry
        and checks which collaborators were called and which attributes
        (``current_bfile``, ``current_bdiff``, ``qtext``) ended up on the
        entry.

        The four ``mock_*`` arguments are presumably injected by patch
        decorators that are outside this view -- confirm against the
        decorator list.
        """
        # Reference entry; never mutated, only deep-copied by reset().
        orig_entry = lxml.etree.Element("Path", name="/test", type="file",
                                        perms='0644', owner='root',
                                        group='root')
        orig_entry.text = "test"
        # Content reported as being on disk; differs from the entry text.
        ondisk = "test2"
        setup = dict(encoding="utf-8", ppath='/', max_copies=5)
        ptool = self.get_obj(posix=get_posix_object(setup=setup))

        def reset():
            # Clear recorded calls on every mock and hand back a pristine
            # copy of the entry so scenarios do not leak into each other.
            # Note: reset_mock() is called without arguments, so return
            # values configured below are left as they are.
            mock_is_string.reset_mock()
            mock_get_data.reset_mock()
            mock_diff.reset_mock()
            mock_open.reset_mock()
            return copy.deepcopy(orig_entry)
        
        # Default mock behaviour shared by the scenarios below.
        mock_is_string.return_value = True
        mock_get_data.return_value = (orig_entry.text, False)
        mock_open.return_value.read.return_value = ondisk
        mock_diff.return_value = ["-test2", "+test"]

        # binary data in the entry
        entry = reset()
        ptool._get_diffs(entry, is_binary=True)
        # No content was passed, so the file named by the entry is read.
        mock_open.assert_called_with(entry.get("name"))
        mock_open.return_value.read.assert_any_call()
        self.assertFalse(mock_diff.called)
        self.assertEqual(entry.get("current_bfile"), b64encode(ondisk))

        # binary data on disk
        entry = reset()
        mock_is_string.return_value = False
        # Content is supplied directly, so the file must not be opened.
        ptool._get_diffs(entry, content=ondisk)
        self.assertFalse(mock_open.called)
        self.assertFalse(mock_diff.called)
        self.assertEqual(entry.get("current_bfile"), b64encode(ondisk))

        # sensitive, non-interactive -- do nothing
        entry = reset()
        # Restore the string check for the remaining text scenarios.
        mock_is_string.return_value = True
        ptool._get_diffs(entry, sensitive=True, interactive=False)
        self.assertFalse(mock_open.called)
        self.assertFalse(mock_diff.called)
        self.assertXMLEqual(entry, orig_entry)

        # sensitive, interactive
        entry = reset()
        ptool._get_diffs(entry, sensitive=True, interactive=True)
        mock_open.assert_called_with(entry.get("name"))
        mock_open.return_value.read.assert_any_call()
        mock_diff.assert_called_with(ondisk, entry.text, difflib.unified_diff,
                                     filename=entry.get("name"))
        self.assertIsNotNone(entry.get("qtext"))
        # After dropping qtext, the attribute names must match the original.
        del entry.attrib['qtext']
        self.assertItemsEqual(orig_entry.attrib, entry.attrib)

        # non-sensitive, non-interactive
        entry = reset()
        ptool._get_diffs(entry, content=ondisk)
        self.assertFalse(mock_open.called)
        mock_diff.assert_called_with(ondisk, entry.text, difflib.ndiff,
                                     filename=entry.get("name"))
        self.assertIsNone(entry.get("qtext"))
        # current_bdiff is the base64 of the mocked diff lines joined by "\n".
        self.assertEqual(entry.get("current_bdiff"),
                         b64encode("\n".join(mock_diff.return_value)))
        del entry.attrib["current_bdiff"]
        self.assertItemsEqual(orig_entry.attrib, entry.attrib)

        # non-sensitive, interactive -- do everything. also test
        # appending to qtext
        entry = reset()
        entry.set("qtext", "test")
        ptool._get_diffs(entry, interactive=True)
        mock_open.assert_called_with(entry.get("name"))
        mock_open.return_value.read.assert_any_call()
        # Both diff flavours are expected; order is not checked.
        self.assertItemsEqual(mock_diff.call_args_list,
                              [call(ondisk, entry.text, difflib.unified_diff,
                                    filename=entry.get("name")),
                               call(ondisk, entry.text, difflib.ndiff,
                                    filename=entry.get("name"))])
        self.assertIsNotNone(entry.get("qtext"))
        # The pre-existing qtext must be kept as a prefix.
        self.assertTrue(entry.get("qtext").startswith("test\n"))
        self.assertEqual(entry.get("current_bdiff"),
                         b64encode("\n".join(mock_diff.return_value)))
        del entry.attrib['qtext']
        del entry.attrib["current_bdiff"]
        self.assertItemsEqual(orig_entry.attrib, entry.attrib)

        # non-sensitive, interactive with unicode data
        entry = reset()
        entry.text = u("tëst")
        # The diff mock is expected to receive the encoded text, which is
        # what the data mock is configured to return here.
        encoded = entry.text.encode(setup['encoding'])
        mock_get_data.return_value = (encoded, False)
        ptool._get_diffs(entry, interactive=True)
        mock_open.assert_called_with(entry.get("name"))
        mock_open.return_value.read.assert_any_call()
        self.assertItemsEqual(mock_diff.call_args_list,
                              [call(ondisk, encoded, difflib.unified_diff,
                                    filename=entry.get("name")),
                               call(ondisk, encoded, difflib.ndiff,
                                    filename=entry.get("name"))])
        self.assertIsNotNone(entry.get("qtext"))
        self.assertEqual(entry.get("current_bdiff"),
                         b64encode("\n".join(mock_diff.return_value)))
        del entry.attrib['qtext']
        del entry.attrib["current_bdiff"]
        self.assertItemsEqual(orig_entry.attrib, entry.attrib)
Beispiel #6
0
def _segment_paragraph(text, wordDict, method):
    '''
    Convert one paragraph of unicode text into its segmented output string.

    The text is scanned left to right:
      * a run matching ``\\w+`` (letters/digits) is copied as-is plus '/';
      * a clause-ending punctuation mark is replaced by a newline;
      * a run of Chinese characters is split with fwd_mm_seg (method 0) or
        bwd_mm_seg (any other method) and each word is emitted plus '/';
      * any other character is dropped.

    @param text:     paragraph to segment
    @param wordDict: dictionary whose keys are the known words
    @param method:   0 for forward matching, anything else for backward
    @return: the CODEC-encoded output for the paragraph
    '''
    word_re = re.compile(r'\w+')
    hanzi_re = re.compile(u'[\u4e00-\u9fa5]+')
    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        # Letters or digits -- copied to the output unchanged.
        m = word_re.match(text, pos)
        if m is not None:
            pieces.append(m.group().encode(CODEC) + '/')
            pos = m.end()
            continue
        # Clause-ending punctuation -- emit a line break.
        if text[pos].encode(CODEC) in [',','。','!','?',':']:
            pieces.append('\n')
            pos += 1
            # Restart the scan so the character after the punctuation is
            # classified normally instead of risking being dropped below.
            continue
        # Chinese characters -- segment and emit word/word/...
        m = hanzi_re.match(text, pos)
        if m is not None:
            run = m.group()
            if method == 0:
                # Forward maximum matching.
                words = fwd_mm_seg(wordDict, 8, run)
            else:
                # Backward maximum matching.
                words = bwd_mm_seg(wordDict, 8, run)
            for word in words:
                pieces.append(word.encode(CODEC) + '/')
            pos = m.end()
            continue
        # Any other character -- skipped.
        pos += 1
    return ''.join(pieces)


def file_seg_process(filename, method):
    '''
    Segment ``corpus/<filename>`` and write the result to
    ``corpus_seg/<filename>``.

    Input lines are grouped into paragraphs: a line that (after strQ2B
    normalisation) starts with two spaces opens a new paragraph, every
    other line is appended to the current one. Each paragraph is segmented
    and written as one output block; paragraphs that produce no output are
    not written.

    @param filename: name of the file inside ``corpus/``
    @param method:   segmentation algorithm { 0: forward, 1: backward }
    '''
    # Load the dictionary into memory; the handle is closed right after.
    wordDict = {}
    with open('dict.txt') as fp_dict:
        for eachWord in fp_dict:
            fields = eachWord.split()
            if not fields:
                # A blank dictionary line has no word to register.
                continue
            wordDict[u(fields[0].strip(), CODEC)] = 1

    with open('corpus/' + filename) as fp_input:
        with open('corpus_seg/' + filename, 'w') as fp_output:

            def flush(parts):
                # Segment the buffered paragraph; skip it when nothing
                # printable comes out.
                line_out = _segment_paragraph(''.join(parts), wordDict,
                                              method)
                if line_out.strip():
                    fp_output.write(line_out + '\n')

            pending = []
            for eachLine in fp_input:
                sub = strQ2B(u(eachLine.strip(), CODEC))
                if not sub.startswith('  '):
                    # Continuation of the current paragraph.
                    pending.append(sub)
                    continue
                # A new paragraph begins: emit the previous one, then start
                # buffering with this line. The new line is always kept,
                # even when the previous paragraph produced no output.
                flush(pending)
                pending = [sub]
            # The last paragraph has no following paragraph marker to
            # trigger it, so it is emitted explicitly.
            flush(pending)