def main():
    pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
    pathArticles = os.path.join(PATH_CSV, FILENAME_ARTICLES)
    pathArticlesRedirect = os.path.join(PATH_CSV, FILENAME_REDIRECT)
    pathTemplateRedirect = os.path.join(PATH_CSV, FILENAME_TEMPLATE)

    templateCount = 0
    articleCount = 0
    totalCount = 0
    redirectCount = 0

    # Write the CSV header once, truncating any previous output.
    with open(pathArticles, 'w') as output_file:
        cw = csv.writer(output_file, delimiter='\t')
        cw.writerow(['Title', 'Text'])

    cleaner = Cleaner()
    for title, text in tqdm(iterate(pathWikiXML)):
        totalCount += 1
        text = cleaner.clean_text(text)
        # cleaned_text, links = cleaner.build_links(text)

        if text.startswith("REDIRECT"):
            redirectCount += 1
        elif text.startswith("TEMPLATE"):
            templateCount += 1
        else:
            articleCount += 1
            # Append each kept article as a tab-separated row.
            with open(pathArticles, 'a') as output_file:
                cw = csv.writer(output_file, delimiter='\t')
                cw.writerow([title, text])

    print("Total pages: {:,}".format(totalCount))
    print("Template pages: {:,}".format(templateCount))
    print("Article pages: {:,}".format(articleCount))
    print("Redirect pages: {:,}".format(redirectCount))
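The main() snippet above relies on module-level constants and imports that are not shown. The preamble below is only a minimal sketch of what they might look like; the directory and file names are assumptions, not values from the original project.

# Assumed preamble for the main() snippet above; paths and filenames are illustrative only.
import csv
import os

from tqdm import tqdm
from wiki_dump_reader import Cleaner, iterate

PATH_WIKI_XML = 'wiki'                                # assumed input directory
PATH_CSV = 'csv'                                      # assumed output directory
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'    # assumed dump file name
FILENAME_ARTICLES = 'articles.csv'                    # assumed output file names
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'

if __name__ == '__main__':
    main()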
def load_files(self):
    cleaner = Cleaner()
    i = 0
    for title, text in iterate('wiki/simplewiki-20191120-pages-articles.xml'):
        if i >= self.files_number:
            break
        cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
        cleaned_fragment, _ = cleaner.build_links(text)
        self.texts.append(title)
        word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
        cleaned_text = [
            PorterStemmer().stem(w) for w in word_tokens
            if w not in self.stop_words
        ]
        self.file_dictionaries.append(Counter(cleaned_text))
        self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
        i += 1
def load_files(self, dictionary_size=20000):
    cleaner = Cleaner()
    i = 0
    for title, text in iterate('wiki/simplewiki-20191120-pages-articles.xml'):
        if i >= self.files_number:
            break
        cleaned_text = cleaner.clean_text(text)[:self.characters_per_file]
        cleaned_fragment, _ = cleaner.build_links(text)
        self.texts.append(title)
        # Tokenize, drop stop words, and stem.
        word_tokens = self.pattern.sub(' ', cleaned_text.lower()).split(' ')
        cleaned_text = [
            PorterStemmer().stem(w) for w in word_tokens
            if w not in self.stop_words
        ]
        self.file_dictionaries.append(Counter(cleaned_text))
        self.bag_of_words = self.bag_of_words.union(set(cleaned_text))
        i += 1

    # Document frequency: count in how many files each word occurs.
    self.dictionary = {w: 0 for w in self.bag_of_words}
    for file in self.file_dictionaries:
        for word in self.bag_of_words:
            if word in file.keys():
                self.dictionary[word] += 1

    # Keep only the most frequent words when the vocabulary exceeds the limit.
    if len(self.dictionary) > dictionary_size:
        self.dictionary = Counter(self.dictionary).most_common(dictionary_size)
        self.bag_of_words = []
        for (word, num) in self.dictionary:
            self.bag_of_words.append(word)
            self.nw_vector.append(num)
    else:
        self.bag_of_words = list(self.dictionary.keys())
        self.nw_vector = list(self.dictionary.values())
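After load_files completes, each entry of self.file_dictionaries is a Counter of stemmed tokens and self.bag_of_words fixes a vocabulary order. The helper below is hypothetical (vectorize_file does not appear in the snippet) and only illustrates how one document could be mapped onto that vocabulary.

# Hypothetical helper, not part of the original class: turn one document's
# Counter into a count vector ordered by self.bag_of_words.
def vectorize_file(self, file_index):
    counts = self.file_dictionaries[file_index]
    return [counts.get(word, 0) for word in self.bag_of_words]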
def setUp(self):
    self.maxDiff = None
    self.cleaner = Cleaner()
from wiki_dump_reader import Cleaner, iterate
from text_cleaner import Cleaner as MyCleaner
import string, re, os, sys
from tqdm import tqdm

cleaner = Cleaner()
my_cleaner = MyCleaner()
lines = []
brk = 40000

print("Extracting text from xml ...")
for title, text in tqdm(iterate('raw/wiki/rowiki-latest-pages-articles.xml')):
    # if brk <= 0:
    #     break
    # brk -= 1
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)  # get text
    lines.extend(cleaned_text.splitlines())

print("Cleaning extracted text ...")
sys.stdout.flush()
cleaned_lines, stats = my_cleaner.process(lines, min_line_length=30, disable_pbar=False)
my_cleaner.print_stats(stats)

print("Post-cleaning extracted text ...")
forbidden_in = ["٭", "*", "†", "sp.", " f.", ".org", "oraș în", "localitate în",
                "comună în", "sat în", ".com", ".it", "o.o.", "px", ".jpg", ".gif",
                " n. ", ".bmp", "\\", "(n.", "\\left", "\\right", "(d.",
                "\xa0",  # non-breaking space
                "::", "[[", "//", ", un ora", "este un municipiu", "este o comun",
                "este un ora", "{{", "Period", "from:", "till:", "BackgroundColors",
                "canvas:", "color:", "width:", "align:", "fontsize:", "pos:", "File",
                "##", "==", "image:", "ISBN", "\\over", "\\math", "style", "border",
                "background", "Wikipedia", "id:", "bar:", "ImageSize", "height:",
                "DateFormat", "text:", "orientation:", "format:", "position:",
                "columns:", "id:", "value:", "legend:", "ScaleMajor", "increment:",
                "ScaleMinor", "increment:", "REDIRECT"]
forbidden_startswith = ["redirect", "Reședințe", "Locuri", "Sedii municipale",
                        "Orașe", "Orase", "Actori", "Actri", "Localit", "Municipii",
                        "Pagina", "List", "Secole", "Limbi", ":", "«", ".", ";",
                        "?", "!", "#"] + [x for x in string.punctuation]
forbidden_endswith = ["Descoperă", ")", "}", "?)", "aici", ".ro", "-lea", ";"]
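The filtering step that actually consumes these three lists is not included in the snippet above. The loop below is only a hedged sketch of how such a filter could be applied to cleaned_lines; it is an assumption, not the original script's code.

# Hedged sketch (assumption): drop lines that hit any of the forbidden rules above.
post_cleaned_lines = []
for line in cleaned_lines:
    line = line.strip()
    if any(f in line for f in forbidden_in):
        continue
    if any(line.startswith(f) for f in forbidden_startswith):
        continue
    if any(line.endswith(f) for f in forbidden_endswith):
        continue
    post_cleaned_lines.append(line)
print("Kept {:,} of {:,} lines".format(len(post_cleaned_lines), len(cleaned_lines)))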
def setUp(self):
    self.cleaner = Cleaner()
def setUp(self):
    self.maxDiff = None
    self.cleaner = Cleaner()
    self.current_path = os.path.dirname(os.path.abspath(__file__))
    self.sample_file_path = os.path.join(self.current_path, 'wikis',
                                         'zhwiki-test-pages.xml')
def run(self):
    """Cleans the text retrieved from Wikipedia.

    Returns:
        True if the stage execution succeeded, False otherwise.
    """
    self.logger.info("Starting text cleaning...")
    input_file_path = join(constants.TMP_PATH,
                           "{}.raw.txt".format(self.parent.topic))
    output_file_path = join(constants.TMP_PATH,
                            "{}.clean.txt".format(self.parent.topic))

    cleaner = Cleaner()
    with open(input_file_path, "r") as file:
        text = file.read()
        text = re.sub('\xa0', '', text)  # remove non-breaking spaces

    self.logger.info("Cleaning the markup and applying token-wise operations")
    lemmatizer = WordNetLemmatizer()
    articles = text.split("<<article_end>>")
    for i in range(len(articles)):
        article = articles[i]
        # Removing special tokens
        article = re.sub('<<article_start>>', '', article)
        # Removing wikipedia markup
        article = cleaner.clean_text(article)
        # Removing left-over >
        article = re.sub(">", '', article)
        # Opening up [[...]]
        article = re.sub(r'\[{2}(.*?)(\|[\w\s\|]*)?\]{2}', r'\1', article)
        # Removing |
        article = re.sub(r'\|', ' ', article)

        # Lowercase, strip non-ASCII characters, and lemmatize each token.
        tokens = word_tokenize(article)
        for j in range(len(tokens)):
            token = tokens[j]
            token = token.lower()
            token = token.encode("ascii", "ignore")
            token = token.decode()
            token = lemmatizer.lemmatize(token)
            tokens[j] = token
        article = " ".join(tokens)
        articles[i] = "<<article_start>> {} <<article_end>>".format(article)
    text = " ".join(articles)

    self.logger.info("Changing years to <<year>>")
    text = re.sub(r' \d{4}(\-\d+|s)?', ' <<year>>', text)

    self.logger.info("Changing numbers to <<number>>")
    text = re.sub(r' \d[\d\.,%]*(st|nd|rd|th| %)?', ' <<number>>', text)
    text = re.sub(r'<<number>>\-[\d\.,%]+', '<<number>>', text)

    self.logger.info("Section title formatting")
    text = re.sub('==+(.*?)==+',
                  '<<section_title_start>> \\1 <<section_title_end>>', text)

    self.logger.info("Removing extra white-spaces")
    text = re.sub(r'\s\s+', ' ', text)

    with open(output_file_path, "w") as file:
        file.write(text)

    num_tokens = len(text.split(" "))
    self.logger.info(
        "Saved the cleaned text. Contains ~ {} tokens".format(num_tokens))
    return True
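As a quick illustration of the year and number placeholders, the patterns below are the ones used in run(); the sample sentence is made up.

# Illustration only: the regexes are copied from run() above, the sentence is invented.
import re

sample = "The treaty was signed in 1848 and ratified by 12 states."
sample = re.sub(r' \d{4}(\-\d+|s)?', ' <<year>>', sample)
sample = re.sub(r' \d[\d\.,%]*(st|nd|rd|th| %)?', ' <<number>>', sample)
# -> "The treaty was signed in <<year>> and ratified by <<number>> states."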