class TestBuildLinks(unittest.TestCase):
    """Tests for Cleaner.build_links: stripping wiki [[...]] link markup."""

    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()

    def test_build_links(self):
        """Link markup ([[target|label]] and [[target]]) is replaced by its text."""
        text = "[[印欧语系|西方语言]]中“數學”(μαθηματικά)一詞源自於[[古希臘語]]的μάθημα(máthēma),其" \
               "有“學習”、“學問”、“[[科學]]”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μα" \
               "θηματικός(mathēmatikós),意思為''和學習有關的''或''用功的'',亦會被用來指''數學的''。其在[[英语]]中" \
               "表面上的複數形式,及在[[法语]]中的表面複數形式''les mathématiques'',可溯至[[拉丁文]]的中性複數''mathe" \
               "matica'',由[[西塞罗]]譯自希臘文複數τα μαθηματικά(ta mathēmatiká),此一希臘語被[[亚里士多德]]拿來指" \
               "「[[萬物皆數]]」的概念。"
        expected = "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科" \
                   "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós)," \
                   "意思為''和學習有關的''或''用功的'',亦會被用來指''數學的''。其在英语中表面上的複數形式,及在法语中的表面複數" \
                   "形式''les mathématiques'',可溯至拉丁文的中性複數''mathematica'',由西塞罗譯自希臘文複數τα μαθηματικά(t" \
                   "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        actual, links = self.cleaner.build_links(text)
        self.assertEqual(expected, actual)

    def test_no_links(self):
        """build_links is idempotent: text without markup is returned unchanged,
        and (per the id() check) as the very same string object."""
        text = "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科" \
               "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós)," \
               "意思為''和學習有關的''或''用功的'',亦會被用來指''數學的''。其在英语中表面上的複數形式,及在法语中的表面複數" \
               "形式''les mathématiques'',可溯至拉丁文的中性複數''mathematica'',由西塞罗譯自希臘文複數τα μαθηματικά(t" \
               "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        expected = "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科" \
                   "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós)," \
                   "意思為''和學習有關的''或''用功的'',亦會被用來指''數學的''。其在英语中表面上的複數形式,及在法语中的表面複數" \
                   "形式''les mathématiques'',可溯至拉丁文的中性複數''mathematica'',由西塞罗譯自希臘文複數τα μαθηματικά(t" \
                   "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        # Deliberate double call: the second pass must be a no-op that returns
        # the identical object (asserted below via id()).
        text, links = self.cleaner.build_links(text)
        actual, links = self.cleaner.build_links(text)
        self.assertEqual(expected, actual)
        self.assertEqual(id(actual), id(text))

    def test_category(self):
        """Namespaced links (User:, MediaWiki:, Category:) keep only the label/name."""
        text = "2004年6月28日 [[User:Shizhao|Shizhao]] [[MediaWiki:Categoryarticlecount]]被保护"
        expected = "2004年6月28日 Shizhao Categoryarticlecount被保护"
        # NOTE(review): the original called build_links twice here, a copy-paste
        # of the idempotence pattern from test_no_links; one call suffices.
        actual, links = self.cleaner.build_links(text)
        self.assertEqual(expected, actual)

        text = "[[Category:未被普遍承認的歷史國家]]"
        expected = "未被普遍承認的歷史國家"
        actual, links = self.cleaner.build_links(text)
        self.assertEqual(expected, actual)

        text = "柏拉圖的著作(其中大多數都是對話錄)曾經被以好幾種不同方式出版過;因此對於柏拉圖著作的命名和引用也有數種不同的" \
               "方式。有獨立條目的柏拉圖對話錄介紹可以在[[:Category:柏拉圖對話錄]]找到。"
        expected = "柏拉圖的著作(其中大多數都是對話錄)曾經被以好幾種不同方式出版過;因此對於柏拉圖著作的命名和引用也有數種不同" \
                   "的方式。有獨立條目的柏拉圖對話錄介紹可以在柏拉圖對話錄找到。"
        actual, links = self.cleaner.build_links(text)
        self.assertEqual(expected, actual)
class TestIterate(unittest.TestCase):
    """Tests for iterate(): parsing pages out of a wiki XML dump."""

    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()
        self.current_path = os.path.dirname(os.path.abspath(__file__))
        self.sample_file_path = os.path.join(self.current_path, 'wikis', 'zhwiki-test-pages.xml')

    def read_target(self, name):
        """Read the expected cleaned text for *name* from targets/<name>.txt."""
        path = os.path.join(self.current_path, 'targets', name + '.txt')
        with codecs.open(path, 'r', 'utf8') as reader:
            target = reader.read()
        return target

    def save_temp(self, name, text):
        """Dump the actual output to targets/<name>.tmp for manual diffing on failure."""
        path = os.path.join(self.current_path, 'targets', name + '.tmp')
        with codecs.open(path, 'w', 'utf8') as writer:
            writer.write(text)

    def test_broken(self):
        """Malformed dumps must yield no pages at all."""
        broken_files = ['zhwiki-broken-%d.xml' % i for i in range(1, 5)]
        for broken_file in broken_files:
            path = os.path.join(self.current_path, 'wikis', broken_file)
            for _ in iterate(path):
                # Fixed: was `self.assertTrue(False)` — fail() is the idiomatic
                # way to flag an unexpected yield, with a useful message.
                self.fail('iterate() yielded a page from broken file %s' % broken_file)

    def test_clean(self):
        """Cleaned sample pages must match the stored target files exactly."""
        targets = {
            '数学': 'Mathematics',
            '哲学': 'Philosophy',
            '文學': 'Literature',
        }
        for target_title, target in targets.items():
            found = False
            for title, text in iterate(self.sample_file_path):
                if title == target_title:
                    found = True
                    text = self.cleaner.clean_text(text)
                    actual, _ = self.cleaner.build_links(text)
                    expected = self.read_target(target)
                    if actual != expected:
                        # Keep the actual output around to diff against the target.
                        self.save_temp(target, actual)
                    self.assertEqual(expected, actual, target)
                else:
                    # Still run the pipeline on every other page: it must not raise.
                    text = self.cleaner.clean_text(text)
                    self.cleaner.build_links(text)
            self.assertTrue(found, 'page %r not found in sample dump' % target_title)
class TestCleanText(unittest.TestCase):
    """Tests for Cleaner.clean_text followed by build_links on real zhwiki markup."""

    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()

    def test_case_1(self):
        # {{lang|..}} / {{lang-el|..}} templates collapse to their payload text,
        # and ''italics'' markers are removed by clean_text (unlike build_links alone).
        text = "[[印欧语系|西方语言]]中“數學”({{lang-el|μαθηματικά}})一詞源自於[[古希臘語]]的{{lang|el|μάθημα}}({" \
               "{lang|la|máthēma}}),其有“學習”、“學問”、“[[科學]]”,以及另外還有個較狹義且技術性的意思-「數學研究」," \
               "即使在其語源內。其形容詞{{lang|el|μαθηματικός}}({{lang|la|mathēmatikós}}),意思為''和學習有關的''或" \
               "''用功的'',亦會被用來指''數學的''。其在[[英语]]中表面上的複數形式,及在[[法语]]中的表面複數形式''{{lang|f" \
               "r|les mathématiques}}'',可溯至[[拉丁文]]的中性複數''{{lang|la|mathematica}}'',由[[西塞罗]]譯自希臘" \
               "文複數{{lang|el|τα μαθηματικά}}({{lang|la|ta mathēmatiká}}),此一希臘語被[[亚里士多德]]拿來指「[[萬" \
               "物皆數]]」的概念。"
        expected = "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科" \
                   "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós)," \
                   "意思為和學習有關的或用功的,亦會被用來指數學的。其在英语中表面上的複數形式,及在法语中的表面複數" \
                   "形式les mathématiques,可溯至拉丁文的中性複數mathematica,由西塞罗譯自希臘文複數τα μαθηματικά(t" \
                   "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)

    def test_case_3(self):
        # -{A|zh:..;zh-cn:..;zh-tw:..;zh-hk:..}- language-variant blocks collapse
        # to a single variant (here the zh/zh-cn form).
        text = "例如,[[全球資訊網]]是在[[歐洲核子研究組織]]由-{A|zh:[[蒂姆·伯纳斯-李]];zh-cn:[[蒂姆·伯纳斯-李]];zh-tw:[[提" \
               "姆·柏納-李]];zh-hk:[[添·柏納-李]];}-創始與發展成功的,原先設計目标為向組織內部和全世界的物理學者提供資訊傳播服務。" \
               "廣受歡迎的[[arXiv]]網站也是在類似狀況下創立的。"
        expected = "例如,全球資訊網是在歐洲核子研究組織由蒂姆·伯纳斯-李創始與發展成功的,原先設計目标為向組織內部和全世界的物理學" \
                   "者提供資訊傳播服務。廣受歡迎的arXiv網站也是在類似狀況下創立的。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)

    def test_case_4(self):
        # {{link-en|label|target}} and {{le|label|target}} interlanguage-link
        # templates keep only their label text.
        text = "亚里士多德死后,整个哲学界陷入了独立时期,称为{{link-en|希腊化哲学|Hellenistic_philosophy}}时期。因为整个社会" \
               "和政治陷入混乱。这段时期产生了[[斯多葛学派]]和[[伊壁鸠鲁学派]],以及[[皮浪主义|怀疑主义派]]、[[新柏拉图主义|新柏" \
               "拉图派]]和{{le|新毕达哥拉斯主义|Neopythagoreanism}}。这些学派的共同特点是伦理化。斯多葛学派主要是顺应自然和自制" \
               "。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有[[宗教]]主义的哲学,并逐渐产" \
               "生融化[[基督教]]和希腊哲学于一体的理论,即为后来的[[基督教哲学]]。"
        expected = "亚里士多德死后,整个哲学界陷入了独立时期,称为希腊化哲学时期。因为整个社会和政治陷入混乱。这段时期产生了斯多葛学" \
                   "派和伊壁鸠鲁学派,以及怀疑主义派、新柏拉图派和新毕达哥拉斯主义。这些学派的共同特点是伦理化。斯多葛学派主要是顺应" \
                   "自然和自制。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有宗教主义的哲学," \
                   "并逐渐产生融化基督教和希腊哲学于一体的理论,即为后来的基督教哲学。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)
def load_files(self):
    """Load up to ``self.files_number`` pages from the simplewiki dump.

    For each page: clean the markup, truncate to ``self.characters_per_file``
    characters, tokenize, drop stop words, stem, and record the page title,
    its per-file term Counter, and the growing global bag of words.

    Side effects: appends to ``self.texts`` and ``self.file_dictionaries``
    and extends ``self.bag_of_words``.
    """
    cleaner = Cleaner()
    # Hoisted out of the comprehension: the original built a fresh
    # PorterStemmer for every single word, which is pure overhead.
    stemmer = PorterStemmer()
    i = 0
    for title, text in iterate(
            'wiki/simplewiki-20191120-pages-articles.xml'):
        if i >= self.files_number:
            break
        cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
        # NOTE(review): result is unused; kept in case build_links has
        # internal side effects on the cleaner — confirm and drop if not.
        cleaned_fragment, _ = cleaner.build_links(text)
        self.texts.append(title)
        word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
        cleaned_text = [
            stemmer.stem(w) for w in word_tokens
            if w not in self.stop_words
        ]
        self.file_dictionaries.append(Counter(cleaned_text))
        self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
        i += 1
def load_files(self, dictionary_size=20000):
    """Load pages from the simplewiki dump and build a document-frequency dictionary.

    Reads up to ``self.files_number`` pages, cleans/tokenizes/stems each one,
    then computes per-word document frequency. If the vocabulary exceeds
    ``dictionary_size``, it is truncated to the most common entries.

    :param dictionary_size: maximum vocabulary size kept (default 20000).

    Side effects: fills ``self.texts``, ``self.file_dictionaries``,
    ``self.bag_of_words``, ``self.dictionary`` and ``self.nw_vector``.
    """
    cleaner = Cleaner()
    # Hoisted: the original constructed a new PorterStemmer per word.
    stemmer = PorterStemmer()
    i = 0
    for title, text in iterate(
            'wiki/simplewiki-20191120-pages-articles.xml'):
        if i >= self.files_number:
            break
        cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
        # NOTE(review): result unused; kept for possible side effects — confirm.
        cleaned_fragment, _ = cleaner.build_links(text)
        self.texts.append(title)
        word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
        cleaned_text = [
            stemmer.stem(w) for w in word_tokens
            if w not in self.stop_words
        ]
        self.file_dictionaries.append(Counter(cleaned_text))
        self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
        i += 1
    # Document frequency: number of files each word appears in.
    # Every key of every file Counter is in bag_of_words by construction,
    # so iterating file keys (O(total distinct tokens)) is equivalent to the
    # original O(|vocab| * |files|) scan with `word in file.keys()`.
    self.dictionary = {w: 0 for w in self.bag_of_words}
    for file_counter in self.file_dictionaries:
        for word in file_counter:
            self.dictionary[word] += 1
    if len(self.dictionary) > dictionary_size:
        # Keep only the words that occur in the most files.
        self.dictionary = Counter(
            self.dictionary).most_common(dictionary_size)
        self.bag_of_words = []
        for (word, num) in self.dictionary:
            self.bag_of_words.append(word)
            self.nw_vector.append(num)
    else:
        self.bag_of_words = list(self.dictionary.keys())
        self.nw_vector = list(self.dictionary.values())
# Extraction script: pull plain text lines out of the Romanian Wikipedia dump,
# clean them, and prepare filter tables for a post-cleaning pass.
# (Chunk appears truncated — the post-cleaning loop using these tables follows.)
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()

lines = []
brk = 40000  # debug page limit; the early-break below is disabled
print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-latest-pages-articles.xml')):
    #if brk<=0:
    #    break
    #brk-=1
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)  # get text
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)

print("Post-cleaning extracted text ...")
# Substrings whose presence disqualifies a line: leftover wiki/table/timeline
# markup tokens, file extensions, LaTeX fragments, and Romanian boilerplate
# ("oraș în" = "town in", "este o comun..." = "is a commune...", etc.).
forbidden_in = ["٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în", "comună în", "sat în", ".com", ".it", "o.o.", "px", ".jpg", ".gif", " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.", " ", "::", "[[", "//", ", un ora", "este un municipiu", "este o comun", "este un ora", "{{", "Period", "from:", "till:", "BackgroundColors", "canvas:", "color:", "width:", "align:", "fontsize:", "pos:", "File", "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border", "background", "Wikipedia", "id:", "bar:", "ImageSize", "height:", "DateFormat", "text:", "orientation:", "format:", "position:", "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:", "ScaleMinor", "increment:", "REDIRECT"]
# Prefixes that disqualify a line: redirects, list/category headers, and any
# punctuation character (note string.punctuation is appended wholesale).
forbidden_startswith = ["redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase", "Actori", "Actri", "Localit", "Municipii", "Pagina", "List", "Secole", "Limbi", ":", "«",".",";","?","!","#"] + [x for x in string.punctuation]
# Suffixes that disqualify a line (mostly truncated-sentence tails).
forbidden_endswith = ["Descoperă",")","}","?)","aici",".ro","-lea",";"]

# ^word: regex
re1 = re.compile(r"^\w+:", re.UNICODE)
# \d)$ ex: Coreea, statul Koryo: Kojong (Wang Ch'ol) (rege din dinastia Wang, 1214-1259)
# Dependencies
# pip install wiki-dump-reader
# pip install tqdm

# Extract Bengali Wikipedia text: clean wiki markup, strip link syntax,
# remove Latin letters, and write one page per line to bn_wiki.txt.
from wiki_dump_reader import Cleaner, iterate
from tqdm import tqdm
import re

# Pre-compiled once instead of re.sub() re-resolving the pattern per page.
LATIN_LETTERS = re.compile(r'[A-Za-z]')

cleaner = Cleaner()
# Fixed: the original used a bare open() with no encoding and never closed it
# inside a `with`; the platform default encoding cannot represent Bengali on
# every OS, and an exception would have leaked the handle / lost buffered data.
with open('bn_wiki.txt', 'w', encoding='utf-8') as output:
    for title, text in tqdm(iterate('bnwiki-latest-pages-articles.xml')):
        text = cleaner.clean_text(text)
        cleaned_text, _ = cleaner.build_links(text)
        cleaned_text = LATIN_LETTERS.sub('', cleaned_text)
        output.write(cleaned_text + "\n")