import os
import csv

from tqdm import tqdm
# Cleaner and iterate are provided by the wiki-dump-reader package
from wiki_dump_reader import Cleaner, iterate


def main():
	pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
	pathArticles = os.path.join(PATH_CSV, FILENAME_ARTICLES)
	pathArticlesRedirect = os.path.join(PATH_CSV, FILENAME_REDIRECT)
	pathTemplateRedirect = os.path.join(PATH_CSV, FILENAME_TEMPLATE)

	templateCount = 0
	articleCount = 0
	totalCount = 0
	redirectCount = 0

	# Keep one writer open for the whole run instead of reopening per article;
	# newline='' prevents the csv module from writing extra blank lines on Windows.
	with open(pathArticles, 'w', newline='') as output_file:
		cw = csv.writer(output_file, delimiter='\t')
		cw.writerow(['Title', 'Text'])

		cleaner = Cleaner()
		for title, text in tqdm(iterate(pathWikiXML)):
			totalCount += 1
			text = cleaner.clean_text(text)
			# cleaned_text, links = cleaner.build_links(text)

			if text.startswith("REDIRECT"):
				redirectCount += 1
			elif text.startswith("TEMPLATE"):
				templateCount += 1
			else:
				articleCount += 1
				cw.writerow([title, text])

	print("Total pages: {:,}".format(totalCount))
	print("Template pages: {:,}".format(templateCount))
	print("Article pages: {:,}".format(articleCount))
	print("Redirect pages: {:,}".format(redirectCount))
Example #2
class TestIterate(unittest.TestCase):
    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()
        self.current_path = os.path.dirname(os.path.abspath(__file__))
        self.sample_file_path = os.path.join(self.current_path, 'wikis',
                                             'zhwiki-test-pages.xml')

    def read_target(self, name):
        path = os.path.join(self.current_path, 'targets', name + '.txt')
        with codecs.open(path, 'r', 'utf8') as reader:
            target = reader.read()
        return target

    def save_temp(self, name, text):
        path = os.path.join(self.current_path, 'targets', name + '.tmp')
        with codecs.open(path, 'w', 'utf8') as writer:
            writer.write(text)

    def test_broken(self):
        broken_files = ['zhwiki-broken-%d.xml' % i for i in range(1, 5)]
        for broken_file in broken_files:
            path = os.path.join(self.current_path, 'wikis', broken_file)
            for _ in iterate(path):
                self.fail('iterate() should yield nothing for %s' % broken_file)

    def test_clean(self):
        targets = {
            '数学': 'Mathematics',
            '哲学': 'Philosophy',
            '文學': 'Literature',
        }
        for target_title, target in targets.items():
            found = False
            for title, text in iterate(self.sample_file_path):
                if title == target_title:
                    found = True
                    text = self.cleaner.clean_text(text)
                    actual, _ = self.cleaner.build_links(text)
                    expected = self.read_target(target)
                    if actual != expected:
                        self.save_temp(target, actual)
                    self.assertEqual(expected, actual, target)
                else:
                    text = self.cleaner.clean_text(text)
                    self.cleaner.build_links(text)
            self.assertTrue(found)


class TestCleanText(unittest.TestCase):
    def setUp(self):
        self.maxDiff = None
        self.cleaner = Cleaner()

    def test_case_1(self):
        text = "[[印欧语系|西方语言]]中“數學”({{lang-el|μαθηματικά}})一詞源自於[[古希臘語]]的{{lang|el|μάθημα}}({" \
               "{lang|la|máthēma}}),其有“學習”、“學問”、“[[科學]]”,以及另外還有個較狹義且技術性的意思-「數學研究」," \
               "即使在其語源內。其形容詞{{lang|el|μαθηματικός}}({{lang|la|mathēmatikós}}),意思為''和學習有關的''或" \
               "''用功的'',亦會被用來指''數學的''。其在[[英语]]中表面上的複數形式,及在[[法语]]中的表面複數形式''{{lang|f" \
               "r|les mathématiques}}'',可溯至[[拉丁文]]的中性複數''{{lang|la|mathematica}}'',由[[西塞罗]]譯自希臘" \
               "文複數{{lang|el|τα μαθηματικά}}({{lang|la|ta mathēmatiká}}),此一希臘語被[[亚里士多德]]拿來指「[[萬" \
               "物皆數]]」的概念。"
        expected = "西方语言中“數學”(μαθηματικά)一詞源自於古希臘語的μάθημα(máthēma),其有“學習”、“學問”、“科" \
                   "學”,以及另外還有個較狹義且技術性的意思-「數學研究」,即使在其語源內。其形容詞μαθηματικός(mathēmatikós)," \
                   "意思為和學習有關的或用功的,亦會被用來指數學的。其在英语中表面上的複數形式,及在法语中的表面複數" \
                   "形式les mathématiques,可溯至拉丁文的中性複數mathematica,由西塞罗譯自希臘文複數τα μαθηματικά(t" \
                   "a mathēmatiká),此一希臘語被亚里士多德拿來指「萬物皆數」的概念。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)

    def test_case_3(self):
        text = "例如,[[全球資訊網]]是在[[歐洲核子研究組織]]由-{A|zh:[[蒂姆·伯纳斯-李]];zh-cn:[[蒂姆·伯纳斯-李]];zh-tw:[[提" \
               "姆·柏納-李]];zh-hk:[[添·柏納-李]];}-創始與發展成功的,原先設計目标為向組織內部和全世界的物理學者提供資訊傳播服務。" \
               "廣受歡迎的[[arXiv]]網站也是在類似狀況下創立的。"
        expected = "例如,全球資訊網是在歐洲核子研究組織由蒂姆·伯纳斯-李創始與發展成功的,原先設計目标為向組織內部和全世界的物理學" \
                   "者提供資訊傳播服務。廣受歡迎的arXiv網站也是在類似狀況下創立的。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)

    def test_case_4(self):
        text = "亚里士多德死后,整个哲学界陷入了独立时期,称为{{link-en|希腊化哲学|Hellenistic_philosophy}}时期。因为整个社会" \
               "和政治陷入混乱。这段时期产生了[[斯多葛学派]]和[[伊壁鸠鲁学派]],以及[[皮浪主义|怀疑主义派]]、[[新柏拉图主义|新柏" \
               "拉图派]]和{{le|新毕达哥拉斯主义|Neopythagoreanism}}。这些学派的共同特点是伦理化。斯多葛学派主要是顺应自然和自制" \
               "。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有[[宗教]]主义的哲学,并逐渐产" \
               "生融化[[基督教]]和希腊哲学于一体的理论,即为后来的[[基督教哲学]]。"
        expected = "亚里士多德死后,整个哲学界陷入了独立时期,称为希腊化哲学时期。因为整个社会和政治陷入混乱。这段时期产生了斯多葛学" \
                   "派和伊壁鸠鲁学派,以及怀疑主义派、新柏拉图派和新毕达哥拉斯主义。这些学派的共同特点是伦理化。斯多葛学派主要是顺应" \
                   "自然和自制。伊壁鸠鲁学派则是把快乐作为生活的本质和善的标准。而新柏拉图派和新毕达哥拉斯派都是带有宗教主义的哲学," \
                   "并逐渐产生融化基督教和希腊哲学于一体的理论,即为后来的基督教哲学。"
        actual = self.cleaner.clean_text(text)
        actual, links = self.cleaner.build_links(actual)
        self.assertEqual(expected, actual)
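
Both test classes follow the standard unittest layout, so the module can be run directly; the usual entry point (not shown in the excerpt) is:

if __name__ == '__main__':
    unittest.main()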
Example #4
    def load_files(self):
        cleaner = Cleaner()
        stemmer = PorterStemmer()  # create one stemmer instead of one per token
        i = 0
        for title, text in iterate(
                'wiki/simplewiki-20191120-pages-articles.xml'):
            if i >= self.files_number:
                break
            cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
            # resolve [[...]] link markup in the cleaned fragment
            cleaned_text, _ = cleaner.build_links(cleaned_text)
            self.texts.append(title)

            word_tokens = self.pattern.sub(' ',
                                           cleaned_text.lower()).split(' ')
            cleaned_text = [
                stemmer.stem(w) for w in word_tokens
                if w not in self.stop_words
            ]
            self.file_dictionaries.append(Counter(cleaned_text))
            self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
            i += 1
Example #5
    def load_files(self, dictionary_size=20000):
        cleaner = Cleaner()
        stemmer = PorterStemmer()  # create one stemmer instead of one per token
        i = 0
        for title, text in iterate(
                'wiki/simplewiki-20191120-pages-articles.xml'):
            if i >= self.files_number:
                break
            cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
            # resolve [[...]] link markup in the cleaned fragment
            cleaned_text, _ = cleaner.build_links(cleaned_text)
            self.texts.append(title)

            word_tokens = self.pattern.sub(' ',
                                           cleaned_text.lower()).split(' ')
            cleaned_text = [
                stemmer.stem(w) for w in word_tokens
                if w not in self.stop_words
            ]
            self.file_dictionaries.append(Counter(cleaned_text))
            self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
            i += 1

        # document frequency: how many files each word appears in
        self.dictionary = {w: 0 for w in self.bag_of_words}
        for file in self.file_dictionaries:
            for word in file:
                self.dictionary[word] += 1

        if len(self.dictionary) > dictionary_size:
            self.dictionary = Counter(
                self.dictionary).most_common(dictionary_size)
            self.bag_of_words = []
            for (word, num) in self.dictionary:
                self.bag_of_words.append(word)
                self.nw_vector.append(num)
        else:
            self.bag_of_words = list(self.dictionary.keys())
            self.nw_vector = list(self.dictionary.values())
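
The nw_vector filled above holds each kept word's document frequency. A hypothetical follow-up (not part of the original class) would turn those counts into IDF weights:

import math

def idf_weights(nw_vector, files_number):
    # IDF: log of total documents over documents containing each word
    return [math.log(files_number / nw) for nw in nw_vector]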
from text_cleaner import Cleaner as MyCleaner
# Cleaner and iterate come from the wiki-dump-reader package
from wiki_dump_reader import Cleaner, iterate
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()
lines = []

brk = 40000  # optional page cap for quick debugging runs (break disabled below)
print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-latest-pages-articles.xml')):
    # if brk <= 0:
    #     break
    # brk -= 1

    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text) # get text
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)


print("Post-cleaning extracted text ...")
forbidden_in = [
    "٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în",
    "comună în", "sat în", ".com", ".it", "o.o.", "px", ".jpg", ".gif",
    " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.", "&nbsp;",
    "::", "[[", "//", ", un ora", "este un municipiu", "este o comun",
    "este un ora", "{{", "Period", "from:", "till:", "BackgroundColors",
    "canvas:", "color:", "width:", "align:", "fontsize:", "pos:", "File",
    "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border",
    "background", "Wikipedia", "id:", "bar:", "ImageSize", "height:",
    "DateFormat", "text:", "orientation:", "format:", "position:",
    "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:",
    "ScaleMinor", "increment:", "REDIRECT"
]
forbidden_startswith = [
    "redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase",
    "Actori", "Actri", "Localit", "Municipii", "Pagina", "List", "Secole",
    "Limbi", ":", "«", ".", ";", "?", "!", "#"
] + list(string.punctuation)
forbidden_endswith = ["Descoperă", ")", "}", "?)", "aici", ".ro", "-lea", ";"]
# matches lines that start with a "word:" prefix
re1 = re.compile(r"^\w+:", re.UNICODE)
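
The snippet is cut off before the filtering pass, but the three lists and the regex above suggest its shape; a hypothetical sketch of how they might be applied to cleaned_lines:

kept_lines = []
for line in cleaned_lines:
    if any(s in line for s in forbidden_in):
        continue  # drop lines containing a forbidden substring
    if any(line.startswith(s) for s in forbidden_startswith):
        continue
    if any(line.endswith(s) for s in forbidden_endswith):
        continue
    if re1.match(line):
        continue  # drop lines starting with a "word:" prefix
    kept_lines.append(line)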
    def run(self):
        """Cleans the text fetched from Wikipedia.

        Returns:
            True if the stage execution succeeded, False otherwise.
        """
        self.logger.info("Starting text cleaning...")
        input_file_path = join(constants.TMP_PATH,
                               "{}.raw.txt".format(self.parent.topic))
        output_file_path = join(constants.TMP_PATH,
                                "{}.clean.txt".format(self.parent.topic))
        cleaner = Cleaner()

        with open(input_file_path, "r") as file:
            text = file.read()

        # strip HTML non-breaking-space entities, with or without the ';'
        text = re.sub('&nbsp;?', '', text)

        self.logger.info(
            "Cleaning the markup and applying token-wise operations")
        lemmatizer = WordNetLemmatizer()
        articles = text.split("<<article_end>>")
        for i in range(len(articles)):
            article = articles[i]
            # Removing special tokens
            article = re.sub('<<article_start>>', '', article)
            # Removing wikipedia markup
            article = cleaner.clean_text(article)
            # Removing leftover >
            article = re.sub(">", '', article)
            # Opening up [[...]]
            article = re.sub(r'\[{2}(.*?)(\|[\w\s\|]*)?\]{2}', r'\1', article)
            # Removing |
            article = re.sub(r'\|', ' ', article)

            tokens = word_tokenize(article)
            for j in range(len(tokens)):
                token = tokens[j]
                token = token.lower()
                token = token.encode("ascii", "ignore")
                token = token.decode()
                token = lemmatizer.lemmatize(token)
                tokens[j] = token
            article = " ".join(tokens)

            articles[i] = "<<article_start>> {} <<article_end>>".format(
                article)
        text = " ".join(articles)

        self.logger.info("Changing years to <<year>>")
        text = re.sub(r' \d{4}(-\d+|s)?', ' <<year>>', text)

        self.logger.info("Changing numbers to <<number>>")
        text = re.sub(r' \d[\d.,%]*(st|nd|rd|th| %)?', ' <<number>>', text)
        text = re.sub(r'<<number>>-[\d.,%]+', '<<number>>', text)

        self.logger.info("Section title formatting")
        text = re.sub(r'==+(.*?)==+',
                      r'<<section_title_start>> \1 <<section_title_end>>',
                      text)

        self.logger.info("Removing extra white-spaces")
        text = re.sub(r'\s\s+', ' ', text)

        with open(output_file_path, "w") as file:
            file.write(text)
            num_tokens = len(text.split(" "))
            self.logger.info(
                "Saved the cleaned text. Contains ~ {} tokens".format(
                    num_tokens))
        return True
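
A quick standalone illustration of the year and number placeholders applied above (the sample sentence is made up):

import re

sample = "Founded in 1998, it had 12,500 employees and grew 3.4 % by 2005-06."
sample = re.sub(r' \d{4}(-\d+|s)?', ' <<year>>', sample)
sample = re.sub(r' \d[\d.,%]*(st|nd|rd|th| %)?', ' <<number>>', sample)
print(sample)
# Founded in <<year>>, it had <<number>> employees and grew <<number>> by <<year>>.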