Example #1
def get_articles(self, start):
    # Collect the cleaned text of the start article and of every article
    # it links to directly (one hop of links).
    # `Wikipedia` is assumed to be the client used in the original snippet,
    # exposing .article(title) and .links.
    texts = []
    wiki = Wikipedia(language="en")
    start_article = wiki.article(start)
    texts.append(self.text_cleaning(start_article))
    for title in start_article.links:
        # Use a separate name for the linked article so the loop source
        # (start_article.links) is not shadowed mid-iteration.
        linked_article = wiki.article(title)
        texts.append(self.text_cleaning(linked_article))
    return texts
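
Example #1 calls a text_cleaning method that is not shown. A minimal sketch of what it might look like, assuming it lives on the same class, pulls the article's plain text via .plaintext(), and normalizes it the same way Example #2's process method does (the body below is an assumption, not part of the original):

    import re
    from string import punctuation

    _PUNC = re.compile('[%s]' % re.escape(punctuation))

    def text_cleaning(self, article):
        # Assumed helper: extract the raw text from the article object,
        # then lowercase each token and strip punctuation, mirroring
        # the process() method in Example #2.
        text = article.plaintext()
        tokens = text.split(" ")
        return " ".join(_PUNC.sub("", token.lower().strip()) for token in tokens)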
Example #2
import re
from string import punctuation

# `Wikipedia` is assumed to be the client from the original snippet,
# exposing .article(title), .links and .plaintext().


class WikiParser:
    def __init__(self):
        self.wiki = Wikipedia(language="en")
        # Pattern matching any single punctuation character.
        self.punc = re.compile('[%s]' % re.escape(punctuation))

    def get_articles(self, start, depth, max_count):
        # Breadth-first crawl: process the start article, then the articles
        # it links to, and so on, up to `depth` link levels and at most
        # `max_count` articles (the original's `<= max_count` checks allowed
        # one article too many).
        iterations = 0
        links = [start]
        list_of_strings = []
        while iterations <= depth and len(list_of_strings) < max_count:
            next_links = []
            for link in links:
                if len(list_of_strings) >= max_count:
                    break
                try:
                    article = self.wiki.article(link)
                    text = self.process(article.plaintext())
                    list_of_strings.append(text)
                    # Queue this article's outgoing links for the next level.
                    next_links.extend(article.links)
                    print(f"Processed link {link}")
                except AttributeError:
                    # Missing or malformed article: skip it and move on.
                    print(f"Skipped link {link}")
            links = next_links
            iterations += 1

        return list_of_strings

    def process(self, text):
        # Lowercase each token and strip punctuation before rejoining.
        tokens = text.split(" ")
        return " ".join(self.punc.sub("", token.lower().strip()) for token in tokens)