def main():
	pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
	pathArticles = os.path.join(PATH_CSV, FILENAME_ARTICLES)
	pathArticlesRedirect = os.path.join(PATH_CSV, FILENAME_REDIRECT)
	pathTemplateRedirect = os.path.join(PATH_CSV, FILENAME_TEMPLATE)

	templateCount = 0
	articleCount = 0
	totalCount = 0
	redirectCount = 0

	# Open the output once and reuse the writer; newline='' keeps the
	# csv module from adding blank rows on Windows.
	with open(pathArticles, 'w', newline='') as output_file:
		cw = csv.writer(output_file, delimiter='\t')
		cw.writerow(['Title', 'Text'])

		cleaner = Cleaner()
		for title, text in tqdm(iterate(pathWikiXML)):
			totalCount += 1
			text = cleaner.clean_text(text)
			#cleaned_text, links = cleaner.build_links(text)

			# Count redirects and templates; write real articles out.
			if text.startswith("REDIRECT"):
				redirectCount += 1
			elif text.startswith("TEMPLATE"):
				templateCount += 1
			else:
				articleCount += 1
				cw.writerow([title, text])

	print("Total pages: {:,}".format(totalCount))
	print("Template pages: {:,}".format(templateCount))
	print("Article pages: {:,}".format(articleCount))
	print("Redirect pages: {:,}".format(redirectCount))
Example #2
    def load_files(self):
        # Assumes: from wiki_dump_reader import Cleaner, iterate;
        # from nltk.stem import PorterStemmer; from collections import Counter
        cleaner = Cleaner()
        stemmer = PorterStemmer()  # build one stemmer instead of one per word
        i = 0
        for title, text in iterate(
                'wiki/simplewiki-20191120-pages-articles.xml'):
            if i >= self.files_number:
                break
            cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
            cleaned_fragment, _ = cleaner.build_links(text)  # links unused here
            self.texts.append(title)

            # Lowercase, strip punctuation, then stem and drop stop words.
            word_tokens = self.pattern.sub(' ',
                                           cleaned_text.lower()).split(' ')
            cleaned_text = [
                stemmer.stem(w) for w in word_tokens
                if w not in self.stop_words
            ]
            self.file_dictionaries.append(Counter(cleaned_text))
            self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
            i += 1
Example #3
    def load_files(self, dictionary_size=20000):
        cleaner = Cleaner()
        stemmer = PorterStemmer()  # build one stemmer instead of one per word
        i = 0
        for title, text in iterate(
                'wiki/simplewiki-20191120-pages-articles.xml'):
            if i >= self.files_number:
                break
            cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
            cleaned_fragment, _ = cleaner.build_links(text)  # links unused here
            self.texts.append(title)

            word_tokens = self.pattern.sub(' ',
                                           cleaned_text.lower()).split(' ')
            cleaned_text = [
                stemmer.stem(w) for w in word_tokens
                if w not in self.stop_words
            ]
            self.file_dictionaries.append(Counter(cleaned_text))
            self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
            i += 1

        # Document frequency: in how many files does each word occur?
        # Iterating each file's own words avoids a full-vocabulary scan.
        self.dictionary = {w: 0 for w in self.bag_of_words}
        for file in self.file_dictionaries:
            for word in file:
                self.dictionary[word] += 1

        # Keep only the dictionary_size most common words; nw_vector holds
        # the matching document counts.
        if len(self.dictionary) > dictionary_size:
            self.dictionary = Counter(
                self.dictionary).most_common(dictionary_size)
            self.bag_of_words = []
            for (word, num) in self.dictionary:
                self.bag_of_words.append(word)
                self.nw_vector.append(num)
        else:
            self.bag_of_words = list(self.dictionary.keys())
            self.nw_vector = list(self.dictionary.values())
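After this pass, nw_vector holds, for each kept word, the number of files it occurs in: exactly the document frequency an IDF weighting needs. A minimal sketch of how it might be used, keeping the attribute names above (the idf_vector helper itself is an assumption, not part of the source):

    def idf_vector(self):
        # Hypothetical helper (assumes `import math` at module level):
        # inverse document frequency for each kept word, computed from the
        # document counts gathered in load_files().
        n_files = len(self.file_dictionaries)
        return [math.log(n_files / n_w) for n_w in self.nw_vector]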
Example #4
 def setUp(self):
     self.maxDiff = None
     self.cleaner = Cleaner()
Example #5
from wiki_dump_reader import Cleaner, iterate
from text_cleaner import Cleaner as MyCleaner
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()
lines = []

brk = 40000
print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-latest-pages-articles.xml')):
    #if brk<=0:
    #    break
    #brk-=1

    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text) # get text
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)


print("Post-cleaning extracted text ...")
forbidden_in = [
    "٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în",
    "comună în", "sat în", ".com", ".it", "o.o.", "px", ".jpg", ".gif",
    " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.", "&nbsp;", "::",
    "[[", "//", ", un ora", "este un municipiu", "este o comun",
    "este un ora", "{{", "Period", "from:", "till:", "BackgroundColors",
    "canvas:", "color:", "width:", "align:", "fontsize:", "pos:", "File",
    "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border",
    "background", "Wikipedia", "id:", "bar:", "ImageSize", "height:",
    "DateFormat", "text:", "orientation:", "format:", "position:",
    "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:",
    "ScaleMinor", "increment:", "REDIRECT"
]
forbidden_startswith = [
    "redirect", "Reședințe", "Locuri", "Sedii municipale", "Orașe", "Orase",
    "Actori", "Actri", "Localit", "Municipii", "Pagina", "List", "Secole",
    "Limbi", ":", "«", ".", ";", "?", "!", "#"
] + list(string.punctuation)
forbidden_endswith = ["Descoperă", ")", "}", "?)", "aici", ".ro", "-lea", ";"]
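The snippet ends before the three lists are applied. A plausible filtering pass, assuming a line is dropped as soon as it trips any of the three checks (only the list names come from the source; the loop itself is a sketch):

filtered_lines = []
for line in cleaned_lines:
    line = line.strip()
    if any(f in line for f in forbidden_in):
        continue
    if any(line.startswith(f) for f in forbidden_startswith):
        continue
    if any(line.endswith(f) for f in forbidden_endswith):
        continue
    filtered_lines.append(line)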
Example #6
 def setUp(self):
     self.cleaner = Cleaner()
Example #7
 def setUp(self):
     self.maxDiff = None
     self.cleaner = Cleaner()
     self.current_path = os.path.dirname(os.path.abspath(__file__))
     self.sample_file_path = os.path.join(self.current_path, 'wikis',
                                          'zhwiki-test-pages.xml')
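A fixture like this usually pairs with tests that run the reader over the bundled sample dump. A minimal sketch of such a test, assuming iterate is imported from wiki_dump_reader as in the other examples (the method name and assertion are assumptions, not from the source):

 def test_clean_text_on_sample_pages(self):
     # Every page in the sample dump should clean down to a plain string.
     for title, text in iterate(self.sample_file_path):
         cleaned = self.cleaner.clean_text(text)
         self.assertIsInstance(cleaned, str)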
Example #8
    def run(self):
        """Cleans the text fetched from Wikipedia.

        Returns:
            True if the stage execution succeeded, False otherwise.
        """
        self.logger.info("Starting text cleaning...")
        # `join` is os.path.join; `constants` and `self.parent` are supplied
        # by the surrounding pipeline class.
        input_file_path = join(constants.TMP_PATH,
                               "{}.raw.txt".format(self.parent.topic))
        output_file_path = join(constants.TMP_PATH,
                                "{}.clean.txt".format(self.parent.topic))
        cleaner = Cleaner()

        with open(input_file_path, "r") as file:
            text = file.read()

        # Drop stray HTML non-breaking-space entities.
        text = re.sub('&nbsp', '', text)

        self.logger.info(
            "Cleaning the markup and applying token-wise operations")
        lemmatizer = WordNetLemmatizer()
        articles = text.split("<<article_end>>")
        for i in range(len(articles)):
            article = articles[i]
            # Removing special tokens
            article = re.sub('<<article_start>>', '', article)
            # Removing wikipedia markup
            article = cleaner.clean_text(article)
            # Removing leftover '>' characters
            article = re.sub(">", '', article)
            # Opening up [[...]] links, keeping the link target text
            article = re.sub(r'\[{2}(.*?)(\|[\w\s\|]*)?\]{2}', r'\1', article)
            # Removing |
            article = re.sub(r'\|', ' ', article)

            tokens = word_tokenize(article)
            for j in range(len(tokens)):
                token = tokens[j]
                token = token.lower()
                token = token.encode("ascii", "ignore")
                token = token.decode()
                token = lemmatizer.lemmatize(token)
                tokens[j] = token
            article = " ".join(tokens)

            articles[i] = "<<article_start>> {} <<article_end>>".format(
                article)
        text = " ".join(articles)

        self.logger.info("Changing years to <<year>>")
        text = re.sub(r' \d{4}(\-\d+|s)?', ' <<year>>', text)

        self.logger.info("Changing numbers to <<number>>")
        text = re.sub(r' \d[\d\.,%]*(st|nd|rd|th| %)?', ' <<number>>', text)
        text = re.sub(r'<<number>>\-[\d\.,%]+', '<<number>>', text)

        self.logger.info("Section title formatting")
        text = re.sub(r'==+(.*?)==+',
                      r'<<section_title_start>> \1 <<section_title_end>>',
                      text)

        self.logger.info("Removing extra white-spaces")
        text = re.sub(r'\s\s+', ' ', text)

        with open(output_file_path, "w") as file:
            file.write(text)
            num_tokens = len(text.split(" "))
            self.logger.info(
                "Saved the cleaned text. Contains ~ {} tokens".format(
                    num_tokens))
        return True
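The year and number substitutions are easiest to see on a small sample. A quick check with an invented input string:

import re

sample = "The city was founded in 1872 and had 14,310 residents."
sample = re.sub(r' \d{4}(\-\d+|s)?', ' <<year>>', sample)
sample = re.sub(r' \d[\d\.,%]*(st|nd|rd|th| %)?', ' <<number>>', sample)
print(sample)  # The city was founded in <<year>> and had <<number>> residents.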