Example #1
0
 def __init__(self):
     # self.DAO = DAO()
     self.wikipedia = Wikipedia_API()
     self.searcheachpage = 0
     self.keywordlist = 0
     self.html = 0
     self.titlehtml = 0
     self.gettitle = 0
     self.geturl = 0
     self.listofwords = 0
     self.alllinks = 0
     self.getpage = 0
     self.tags = 0
     self.hreflist = 0
     self.badfilter = 0
     self.goodfilter = 0
     self.filtered = 0
     self.baseurl = "https://en.wikipedia.org"
     self.basesearch = "Feces"
     self.linkforwiki = 0
     self.listoftitles = 0
     self.displaytitle = 0
     self.titlenumber = 0
     self.totallinkslist = 0
     self.totalgoodlinks = []
     self.userstart()
Example #2
0
 def __init__(self):
     # self.DAO = DAO()
     self.wikipedia = Wikipedia_API()
     self.searcheachpage = 0
     self.keywordlist = 0
     self.html = 0
     self.titlehtml = 0
     self.gettitle = 0
     self.geturl = 0
     self.listofwords = 0
     self.alllinks = 0
     self.getpage = 0
     self.tags = 0
     self.hreflist = 0
     self.badfilter = 0
     self.goodfilter = 0
     self.filtered = 0
     self.baseurl = "https://en.wikipedia.org"
     self.basesearch = "Feces"
     self.linkforwiki = 0
     self.listoftitles = 0
     self.displaytitle = 0
     self.titlenumber = 0
     self.totallinkslist = 0
     self.totalgoodlinks = []
     self.userstart()
Example #3
0
 def __init__(self, title=None):
     self.wiki = Wikipedia_API.Wikipedia_API()
     if title == None:
         self.cr = cleanrandom.Exponentialsearch()
         self.article = self.cr.repeat()
     else:
         self.article = self.wiki.get_article_info(title)
     self.title = self.article['title'].encode('utf-8')
     self.content = self.article['content'].encode('utf-8')
Example #4
0
class Trainingset:
    def __init__(self):
        self.DAO = DAO()
        self.api = Wikipedia_API()
        self.links = self.read_file("links.txt")
        ## self.random = self.api.get_random(10, [], 0)

    def annotate(self, input):
        print("Beginning annotation of {} titles.".format(len(input)))
        inserted = 0
        for title in input:
            try:
                page_info = self.api.get_article_info(title)
                inserted_id = self.DAO.insert_one(page_info, "topics")
                print("Inserted: {}".format(inserted_id))
                inserted += 1
            except Exception as e:
                print(e)
        print("Inserted: {} article objects.".format(inserted))

    def test(self, input):
        successfully_parsed = 0
        errored_out = []
        for title in input:
            try:
                page_info = self.api.get_article_info(title)
                if page_info["links"] is not None:
                    successfully_parsed += 1
                    print("Parsed {}".format(title))
            except Exception as e:
                print(e)
                errored_out.append(title)

        return successfully_parsed, errored_out

    def read_file(self, file):
        file = open("links.txt", "r+")
        lines = file.readlines()
        file.close()
        return [line[:-1] for line in lines]
Example #5
0
class Trainingset:
    def __init__(self):
        self.DAO = DAO()
        self.api = Wikipedia_API()
        self.links = self.read_file("links.txt")
        ## self.random = self.api.get_random(10, [], 0)
        
    def annotate(self, input):
        print("Beginning annotation of {} titles.".format(len(input)))
        inserted = 0
        for title in input:
            try:
                page_info = self.api.get_article_info(title)
                inserted_id = self.DAO.insert_one(page_info, "topics")
                print("Inserted: {}".format(inserted_id))
                inserted += 1
            except Exception as e:
                print(e)
        print("Inserted: {} article objects.".format(inserted))
            
    def test(self, input):
        successfully_parsed = 0
        errored_out = []
        for title in input:
            try:
                page_info = self.api.get_article_info(title)
                if page_info['links'] is not None:
                    successfully_parsed += 1
                    print("Parsed {}".format(title))
            except Exception as e:
                print(e)
                errored_out.append(title)
                
        return successfully_parsed, errored_out
            
    def read_file(self, file):
        file = open('links.txt', 'r+')
        lines = file.readlines()
        file.close()
        return [line[:-1] for line in lines]
Example #6
0
 def __init__(self):
     self.DAO = DAO()
     self.api = Wikipedia_API()
     self.links = self.read_file("links.txt")
Example #7
0
class Exponentialsearch:
    def __init__(self):
        # self.DAO = DAO()
        self.wikipedia = Wikipedia_API()
        self.searcheachpage = 0
        self.keywordlist = 0
        self.html = 0
        self.titlehtml = 0
        self.gettitle = 0
        self.geturl = 0
        self.listofwords = 0
        self.alllinks = 0
        self.getpage = 0
        self.tags = 0
        self.hreflist = 0
        self.badfilter = 0
        self.goodfilter = 0
        self.filtered = 0
        self.baseurl = "https://en.wikipedia.org"
        self.basesearch = "Feces"
        self.linkforwiki = 0
        self.listoftitles = 0
        self.displaytitle = 0
        self.titlenumber = 0
        self.totallinkslist = 0
        self.totalgoodlinks = []
        self.userstart()

    def start(self, searchexact):
        getpage = wikipedia.page(searchexact)
        self.gettitle = getpage.title
        self.geturl = getpage.url

        urllink = self.geturl
        with urllib.request.urlopen(urllink) as url:
            html = url.read()
            self.html = html
            self.titlehtml = html

    def continuesearch(self, nextlink):
        getpage = wikipedia.page(nextlink)
        self.gettitle = getpage.title
        self.geturl = getpage.url

        urllink = self.geturl
        with urllib.request.urlopen(urllink) as url:
            html = url.read()
            self.html = html
            self.titlehtml = html

    def findhref(self):  # html
        html = self.html
        html = html.decode().split('<a href="')  #
        tags = []
        listofwords = []

        for foreachhref in html:
            tags.append(foreachhref)

        for eachitem in tags:
            cuthalf = eachitem.split('"')  #
            listofwords.append(cuthalf)
            self.listofwords = listofwords

    def titlemaker(self):
        html = self.titlehtml
        html = html.decode().split('title="')
        tags = []
        listoftitles = []

        for eachtitle in html:
            tags.append(eachtitle)

        for eachitem in tags:
            cuthalf = eachitem.split('"')
            listoftitles.append(cuthalf)
            self.listoftitles = listoftitles

    def retreivefirst(self):
        hreflist = []
        for eachlist in self.listofwords:
            hreflist.append(eachlist[0])
        self.hreflist = hreflist

    def retreivefirsttitle(self):
        titlelist = []
        title = []
        for eachlist in self.listoftitles:
            titlelist.append(eachlist[0])

        encode = [x.encode("utf-8") for x in titlelist]
        self.displaytitle = encode

    def randomtitle(self):
        length = len(self.displaytitle)

        self.titlenumber = random.randint(10, length)

    def filterlinks(self):
        self.badfilter = []
        for eachlink in self.hreflist:
            if "\n" in eachlink:
                self.badfilter.append(eachlink)
            if "\t" in eachlink:
                self.badfilter.append(eachlink)
            if "#" in eachlink:
                self.badfilter.append(eachlink)
            if " " in eachlink:
                self.badfilter.append(eachlink)

        self.goodfilter = list(set(self.hreflist) - set(self.badfilter))

    def wikilinker(self):
        linkforwiki = []
        for eachlink in self.goodfilter:
            newurl = self.baseurl + eachlink
            linkforwiki.append(newurl)
            self.linkforwiki = linkforwiki

    def totallinks(self):
        links = self.displaytitle
        self.totalgoodlinks += links
        self.totalgoodlinks = list(set(self.totalgoodlinks))

    def userstart(self):
        # input("Hit enter to start")
        self.start(self.basesearch)
        self.main()
        self.repeat()

    def main(self):
        self.findhref()
        self.retreivefirst()
        self.filterlinks()
        self.wikilinker()
        self.titlemaker()
        self.retreivefirsttitle()
        self.randomtitle()

    def newmain(self, nextlink):
        self.continuesearch(nextlink)
        self.findhref()
        self.retreivefirst()
        self.filterlinks()
        self.titlemaker()
        self.retreivefirsttitle()
        self.randomtitle()

    def repeat(self):
        exit = False
        while exit != True:  # this number should never hit 0, so you need to breka the loop somehow
            try:
                time.sleep(1.0)
                self.randomtitle()
                self.basesearch = self.displaytitle[self.titlenumber]
                # print("Title (encoded in utf-8): ", self.basesearch)
                new_article = self.wikipedia.get_article_info(self.basesearch.decode("utf-8"))
                if new_article["title"] == None or new_article["content"] == None:
                    raise SyntaxError
                exit = True
                return new_article
                # while True:
                # time.sleep(0.1)
                # count = self.DAO.count("new_articles")
                # if count >= 20:
                # pass
                # else:
                # self.DAO.insert_one(new_article,"new_articles")
                # return False
            except:
                exit = False
Example #8
0
class Exponentialsearch():
    def __init__(self):
        # self.DAO = DAO()
        self.wikipedia = Wikipedia_API()
        self.searcheachpage = 0
        self.keywordlist = 0
        self.html = 0
        self.titlehtml = 0
        self.gettitle = 0
        self.geturl = 0
        self.listofwords = 0
        self.alllinks = 0
        self.getpage = 0
        self.tags = 0
        self.hreflist = 0
        self.badfilter = 0
        self.goodfilter = 0
        self.filtered = 0
        self.baseurl = "https://en.wikipedia.org"
        self.basesearch = "Feces"
        self.linkforwiki = 0
        self.listoftitles = 0
        self.displaytitle = 0
        self.titlenumber = 0
        self.totallinkslist = 0
        self.totalgoodlinks = []
        self.userstart()

    def start(self, searchexact):
        getpage = wikipedia.page(searchexact)
        self.gettitle = getpage.title
        self.geturl = getpage.url

        urllink = self.geturl
        with urllib.request.urlopen(urllink) as url:
            html = url.read()
            self.html = html
            self.titlehtml = html

    def continuesearch(self, nextlink):
        getpage = wikipedia.page(nextlink)
        self.gettitle = getpage.title
        self.geturl = getpage.url

        urllink = self.geturl
        with urllib.request.urlopen(urllink) as url:
            html = url.read()
            self.html = html
            self.titlehtml = html

    def findhref(self):#html
        html = self.html
        html = html.decode().split("<a href=\"")#
        tags = []
        listofwords = []

        for foreachhref in html:
            tags.append(foreachhref)

        for eachitem in tags:
            cuthalf = eachitem.split("\"")#
            listofwords.append(cuthalf)
            self.listofwords = listofwords

    def titlemaker(self):
        html = self.titlehtml
        html = html.decode().split("title=\"")
        tags = []
        listoftitles = []

        for eachtitle in html:
            tags.append(eachtitle)

        for eachitem in tags:
            cuthalf = eachitem.split("\"")
            listoftitles.append(cuthalf)
            self.listoftitles = listoftitles

    def retreivefirst(self):
        hreflist = []
        for eachlist in self.listofwords:
            hreflist.append(eachlist[0])
        self.hreflist = hreflist

    def retreivefirsttitle(self):
        titlelist = []
        title = []
        for eachlist in self.listoftitles:
            titlelist.append(eachlist[0])

        encode = [x.encode('utf-8') for x in titlelist]
        self.displaytitle = encode

    def randomtitle(self):
        length = len(self.displaytitle)

        self.titlenumber = random.randint(10, length)

    def filterlinks(self):
        self.badfilter = []
        for eachlink in self.hreflist:
            if "\n" in eachlink:
                self.badfilter.append(eachlink)
            if "\t" in eachlink:
                self.badfilter.append(eachlink)
            if "#" in eachlink:
                self.badfilter.append(eachlink)
            if " " in eachlink:
                self.badfilter.append(eachlink)

        self.goodfilter = list(set(self.hreflist) - set(self.badfilter))

    def wikilinker(self):
        linkforwiki = []
        for eachlink in self.goodfilter:
            newurl = self.baseurl + eachlink
            linkforwiki.append(newurl)
            self.linkforwiki = linkforwiki

    def totallinks(self):
        links = self.displaytitle
        self.totalgoodlinks += links
        self.totalgoodlinks = list(set(self.totalgoodlinks))

    def userstart(self):
        # input("Hit enter to start")
        self.start(self.basesearch)
        self.main()
        self.repeat()


    def main(self):
        self.findhref()
        self.retreivefirst()
        self.filterlinks()
        self.wikilinker()
        self.titlemaker()
        self.retreivefirsttitle()
        self.randomtitle()

    def newmain(self, nextlink):
        self.continuesearch(nextlink)
        self.findhref()
        self.retreivefirst()
        self.filterlinks()
        self.titlemaker()
        self.retreivefirsttitle()
        self.randomtitle()

    def repeat(self):
        exit = False
        while exit != True: #this number should never hit 0, so you need to breka the loop somehow
            try:
                time.sleep(1.0)
                self.randomtitle()
                self.basesearch = self.displaytitle[self.titlenumber]
                # print("Title (encoded in utf-8): ", self.basesearch)
                new_article = self.wikipedia.get_article_info(self.basesearch.decode('utf-8'))
                if new_article['title'] == None or new_article['content'] == None:
                    raise SyntaxError
                exit = True
                return new_article
                # while True:
                    # time.sleep(0.1)
                    # count = self.DAO.count("new_articles")
                    # if count >= 20:
                        # pass
                    # else:
                        # self.DAO.insert_one(new_article,"new_articles")
                        # return False
            except:
                exit = False
Example #9
0
 def __init__(self):
     self.DAO = DAO()
     self.api = Wikipedia_API()
     self.links = self.read_file("links.txt")