def main():
    fp_dict = open('../dict.txt')
    wordDict = {}
    for eachWord in fp_dict:
        wordDict[unicode(eachWord.split('\t')[0].strip(), 'utf-8')] = 1

    segStr = u'''
00 19960101
01 第3版/国际
02 ■
03 短讯 阿拉法特视察拉马拉
04 ■
05 巴勒斯坦自治领导机构主席阿拉法特30日上午视察了拉马拉,受到成千上万
巴勒斯坦市民的热烈欢迎。阿拉法特检阅了仪仗队,并在震耳欲聋的欢呼声中向群
众发表了讲话。
00 19960101
01 第3版/国际
02 ■
03 北约驻波黑维和部队司令表示 拒绝塞族推迟移交塞控区
04 ■
05 新华社贝尔格莱德12月30日电 北约驻波黑维和部队总司令史密斯30日
致函波黑塞族议会主席克拉伊什尼克,拒绝塞族领导人关于推迟移交萨拉热窝塞族
区的要求。
'''
    print segStr

    wordList = fwd_mm_seg(wordDict, 4, segStr)
    for wordstr in wordList:
        print wordstr, '/',
def main():
    fp_dict = open('../dict.txt')
    wordDict = {}
    for eachWord in fp_dict:
        wordDict[unicode(eachWord.split('\t')[0].strip(), 'utf-8')] = 1

    segStr = u'巴勒斯坦自治领导机构主席阿拉法特'
    print segStr

    wordList = bwd_mm_seg(wordDict, 4, segStr)
    for wordstr in wordList:
        print wordstr,
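# The two main() drivers above rely on fwd_mm_seg and bwd_mm_seg, which are
# defined elsewhere in this repo. The sketch below is an assumption, not the
# repo's implementation: a minimal forward/backward maximum-matching pair
# using the (wordDict, max_len, text) signature seen at the call sites.
def fwd_mm_seg(wordDict, maxLen, segStr):
    # forward maximum matching: take the longest dictionary word starting at i
    wordList = []
    i = 0
    while i < len(segStr):
        j = min(i + maxLen, len(segStr))
        # shrink the window until it is in the dictionary or a single char
        while j > i + 1 and segStr[i:j] not in wordDict:
            j -= 1
        wordList.append(segStr[i:j])
        i = j
    return wordList


def bwd_mm_seg(wordDict, maxLen, segStr):
    # backward maximum matching: take the longest dictionary word ending at j
    wordList = []
    j = len(segStr)
    while j > 0:
        i = max(j - maxLen, 0)
        # shrink the window from the left until it matches or is a single char
        while i < j - 1 and segStr[i:j] not in wordDict:
            i += 1
        wordList.insert(0, segStr[i:j])
        j = i
    return wordList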
def test__get_diffs(self, mock_is_string, mock_get_data, mock_diff, mock_open):
    orig_entry = lxml.etree.Element("Path", name="/test", type="file",
                                    perms='0644', owner='root', group='root')
    orig_entry.text = "test"
    ondisk = "test2"
    setup = dict(encoding="utf-8", ppath='/', max_copies=5)
    ptool = self.get_obj(posix=get_posix_object(setup=setup))

    def reset():
        mock_is_string.reset_mock()
        mock_get_data.reset_mock()
        mock_diff.reset_mock()
        mock_open.reset_mock()
        return copy.deepcopy(orig_entry)

    mock_is_string.return_value = True
    mock_get_data.return_value = (orig_entry.text, False)
    mock_open.return_value.read.return_value = ondisk
    mock_diff.return_value = ["-test2", "+test"]

    # binary data in the entry
    entry = reset()
    ptool._get_diffs(entry, is_binary=True)
    mock_open.assert_called_with(entry.get("name"))
    mock_open.return_value.read.assert_any_call()
    self.assertFalse(mock_diff.called)
    self.assertEqual(entry.get("current_bfile"), b64encode(ondisk))

    # binary data on disk
    entry = reset()
    mock_is_string.return_value = False
    ptool._get_diffs(entry, content=ondisk)
    self.assertFalse(mock_open.called)
    self.assertFalse(mock_diff.called)
    self.assertEqual(entry.get("current_bfile"), b64encode(ondisk))

    # sensitive, non-interactive -- do nothing
    entry = reset()
    mock_is_string.return_value = True
    ptool._get_diffs(entry, sensitive=True, interactive=False)
    self.assertFalse(mock_open.called)
    self.assertFalse(mock_diff.called)
    self.assertXMLEqual(entry, orig_entry)

    # sensitive, interactive
    entry = reset()
    ptool._get_diffs(entry, sensitive=True, interactive=True)
    mock_open.assert_called_with(entry.get("name"))
    mock_open.return_value.read.assert_any_call()
    mock_diff.assert_called_with(ondisk, entry.text, difflib.unified_diff,
                                 filename=entry.get("name"))
    self.assertIsNotNone(entry.get("qtext"))
    del entry.attrib['qtext']
    self.assertItemsEqual(orig_entry.attrib, entry.attrib)

    # non-sensitive, non-interactive
    entry = reset()
    ptool._get_diffs(entry, content=ondisk)
    self.assertFalse(mock_open.called)
    mock_diff.assert_called_with(ondisk, entry.text, difflib.ndiff,
                                 filename=entry.get("name"))
    self.assertIsNone(entry.get("qtext"))
    self.assertEqual(entry.get("current_bdiff"),
                     b64encode("\n".join(mock_diff.return_value)))
    del entry.attrib["current_bdiff"]
    self.assertItemsEqual(orig_entry.attrib, entry.attrib)

    # non-sensitive, interactive -- do everything. also test
    # appending to qtext
    entry = reset()
    entry.set("qtext", "test")
    ptool._get_diffs(entry, interactive=True)
    mock_open.assert_called_with(entry.get("name"))
    mock_open.return_value.read.assert_any_call()
    self.assertItemsEqual(mock_diff.call_args_list,
                          [call(ondisk, entry.text, difflib.unified_diff,
                                filename=entry.get("name")),
                           call(ondisk, entry.text, difflib.ndiff,
                                filename=entry.get("name"))])
    self.assertIsNotNone(entry.get("qtext"))
    self.assertTrue(entry.get("qtext").startswith("test\n"))
    self.assertEqual(entry.get("current_bdiff"),
                     b64encode("\n".join(mock_diff.return_value)))
    del entry.attrib['qtext']
    del entry.attrib["current_bdiff"]
    self.assertItemsEqual(orig_entry.attrib, entry.attrib)

    # non-sensitive, interactive with unicode data
    entry = reset()
    entry.text = u("tëst")
    encoded = entry.text.encode(setup['encoding'])
    mock_get_data.return_value = (encoded, False)
    ptool._get_diffs(entry, interactive=True)
    mock_open.assert_called_with(entry.get("name"))
    mock_open.return_value.read.assert_any_call()
    self.assertItemsEqual(mock_diff.call_args_list,
                          [call(ondisk, encoded, difflib.unified_diff,
                                filename=entry.get("name")),
                           call(ondisk, encoded, difflib.ndiff,
                                filename=entry.get("name"))])
    self.assertIsNotNone(entry.get("qtext"))
    self.assertEqual(entry.get("current_bdiff"),
                     b64encode("\n".join(mock_diff.return_value)))
    del entry.attrib['qtext']
    del entry.attrib["current_bdiff"]
    self.assertItemsEqual(orig_entry.attrib, entry.attrib)
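# Aside on the mock wiring used by test__get_diffs above: the mocks arrive via
# stacked @patch decorators on the test method, and the innermost (bottom)
# decorator supplies the first mock argument after self. Self-contained demo
# of that ordering rule -- the targets here are stdlib functions for
# illustration only, not the real Bcfg2 patch targets.
import os
from mock import patch, MagicMock  # Python 3: from unittest.mock import ...


@patch("os.getcwd")        # outermost decorator -> last mock argument
@patch("os.path.exists")   # innermost decorator -> first mock argument
def demo(mock_exists, mock_getcwd):
    assert isinstance(mock_exists, MagicMock)
    mock_exists.return_value = True
    mock_getcwd.return_value = "/fake"
    assert os.path.exists("/anything") and os.getcwd() == "/fake"


demo()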
def file_seg_process(filename, method):
    '''
    @param filename: input file name
    @param method: segmentation algorithm { 0: forward, 1: backward }
    '''
    # open files
    fp_dict = open('dict.txt')
    fp_input = open('corpus/' + filename)
    fp_output = open('corpus_seg/' + filename, 'w')
    wordDict = {}
    # load the dictionary into memory
    for eachWord in fp_dict:
        wordDict[unicode(eachWord.split()[0].strip(), CODEC)] = 1
    # process the input line by line, accumulating each paragraph into str
    str = ''
    for eachLine in fp_input:
        line_out = ''
        # each accumulated paragraph is fed to the segmenter as one string
        sub = strQ2B(unicode(eachLine.strip(), CODEC))
        if not sub.startswith(' '):
            str += sub
            continue
        strlen = len(str)
        while strlen > 0:
            # ASCII letters or digits -- copy through unchanged
            m = re.match(r'\w+', str)
            if m is not None:
                subStr = m.group()
                line_out += subStr.encode(CODEC) + '/'
                subLen = len(subStr)
                str = str[subLen:]
                strlen = strlen - subLen
                continue
            # end-of-clause punctuation -- emit a line break
            if str[0:1].encode(CODEC) in [',', '。', '!', '?', ':']:
                subStr = str[0:1]
                line_out += '\n'
                subLen = len(subStr)
                str = str[subLen:]
                strlen = strlen - subLen
                continue
            # Chinese characters -- segment and emit word/word/
            m = re.match(ur'[\u4e00-\u9fa5]+', str)
            if m is not None:
                subStr = m.group()
                if method == 0:
                    # forward maximum matching
                    wordList = fwd_mm_seg(wordDict, 8, subStr)
                else:
                    # backward maximum matching
                    wordList = bwd_mm_seg(wordDict, 8, subStr)
                for eachWord in wordList:
                    line_out += eachWord.encode(CODEC) + '/'
                subLen = len(subStr)
                str = str[subLen:]
                strlen = strlen - subLen
                continue
            # any other character -- skip it
            str = str[1:]
            strlen = strlen - 1
        # skip paragraphs that are empty after processing
        if len(line_out.strip()) != 0:
            # write to the output file
            fp_output.write(line_out + '\n')
        # start buffering the next paragraph
        str = sub
    # close files
    fp_input.close()
    fp_dict.close()
    fp_output.close()
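# file_seg_process above assumes a module-level CODEC constant and a strQ2B
# (full-width to half-width) helper that are defined elsewhere in this repo.
# The following is only a sketch under those assumptions; the real corpus
# encoding may differ (e.g. GBK for the People's Daily corpus).
CODEC = 'utf-8'  # assumption: adjust to the actual corpus encoding


def strQ2B(ustring):
    # map full-width (quanjiao) characters to their half-width (banjiao)
    # equivalents so that digits and punctuation match r'\w+' and the
    # punctuation list used by file_seg_process
    out = []
    for uchar in ustring:
        code = ord(uchar)
        if code == 0x3000:               # ideographic (full-width) space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:   # full-width ASCII block
            code -= 0xFEE0
        out.append(unichr(code))
    return u''.join(out)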