import re
from string import punctuation

# NOTE: `Wikipedia` is assumed to be the wiki client used elsewhere in this
# project; it must expose .article(title), .plaintext(), and .links.


# Single-level variant: fetch the start article plus every article it links
# to directly, with no depth limit or article cap. Assumes it lives on a
# parser class that provides a text_cleaning() helper taking raw text.
def get_articles(self, start):
    texts = []
    wiki = Wikipedia(language="en")
    article = wiki.article(start)
    texts.append(self.text_cleaning(article.plaintext()))
    for title in article.links:
        article = wiki.article(title)
        texts.append(self.text_cleaning(article.plaintext()))
    return texts

class WikiParser:
    def __init__(self):
        self.wiki = Wikipedia(language="en")
        # Pattern matching any ASCII punctuation character.
        self.punc = re.compile('[%s]' % re.escape(punctuation))

    def get_articles(self, start, depth, max_count):
        """Breadth-first crawl from `start`: follow article links for up to
        `depth` levels beyond the start page, collecting at most `max_count`
        cleaned article texts."""
        iterations = 0
        links = [start]
        list_of_strings = []
        while iterations <= depth and len(list_of_strings) < max_count:
            links_temp = []
            for link in links:
                # Stop early once the article cap is reached.
                if len(list_of_strings) >= max_count:
                    break
                try:
                    article = self.wiki.article(link)
                    text = self.process(article.plaintext())
                    list_of_strings.append(text)
                    # Queue this article's links for the next level.
                    links_temp.extend(article.links)
                    print(f"Processed link {link}")
                except AttributeError:
                    # Missing or malformed article: skip it.
                    print(f"Skipped link {link}")
            links = links_temp
            iterations += 1
        return list_of_strings

    def process(self, text):
        """Lowercase the text, strip punctuation, and collapse whitespace."""
        tokens = text.split()
        return " ".join(self.punc.sub("", token.lower()) for token in tokens)
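
# A minimal usage sketch, assuming the same Wikipedia client is importable
# and network access is available. The seed title, depth, and cap below are
# illustrative values, not part of the original code.
if __name__ == "__main__":
    parser = WikiParser()
    texts = parser.get_articles("Natural language processing", depth=1, max_count=50)
    print(f"Collected {len(texts)} cleaned articles")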