Ejemplo n.º 1
0
    def getClosestPages(self):
        """Collect the pages linked from this document's "mw-pages" section.

        Looks up the ``div`` with id ``mw-pages`` in ``self.doc``, walks every
        anchor inside it, and appends a ``page.Page`` (built with
        ``self.depth``) to ``self.subPages`` for each link that
        ``page.isPage`` accepts.

        Failures (including a missing or duplicated section) are reported on
        stdout instead of being raised. Always returns ``None``.
        """
        try:
            pagesBlock = self.doc.find_all("div", id="mw-pages")

            # Exactly one section is expected; anything else is routed to the
            # handler below, which reports it and gives up.
            if len(pagesBlock) != 1:
                raise Exception("No Page Section")

            for pageTag in pagesBlock[0].find_all("a"):
                pageLink = pageTag.get("href")
                if pageLink is None:
                    # An anchor without an href used to blow up on .encode()
                    # and abort the whole loop; skip it so the remaining
                    # links are still processed.
                    continue
                strPage = pageLink.encode("ascii", "ignore")
                # check to make sure the page being added to the subpages list
                # is a valid wikipedia article
                if page.isPage(strPage):
                    self.subPages.append(page.Page(strPage, self.depth))

        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt / SystemExit
            # still propagate. The message is pre-formatted into one string so
            # it prints the same text as the former two-item print statement.
            print("Unexpected error: %s" % (sys.exc_info()[0],))
Ejemplo n.º 2
0
def main():
	already_made = False
	while True:
		made = raw_input("Have you already initialized a prior probability?(Y/N) ")
		if made == "Y":
			already_made = True
			break
		elif made == "N":
			already_made = False
			break

	if not already_made:
		while True:
			catCount = raw_input("How many categories will you be using? ")
			try:
				count = int(catCount)
				break
			except:
				print "not a valid integer"

		while True:
			inpDepth = raw_input("How deep do you want to traverse (increases exponentially)? ")
			try:
				depth = int(inpDepth)
				break
			except:
				print "not a valid integer"

		while True:
			bagOfWords = raw_input("Use link approach or bag of words?(enter 1/0 respectively): ")
			try:
				BOWval = int(bagOfWords)
				useLinks = not not BOWval
				break
			except:
				print "not a valid integer"

		allCategoriesLinks = []
		print 'Please give the input in the form \"/wiki/Category:Example_category\".'
		print 'If a category is not valid input or not recognized, it will be dropped by the classifier.'
		inc = 0
		while inc < count:
			catLink = raw_input("enter your category: ")
			if categories.isCategory(catLink):
				allCategoriesLinks.append(catLink)
				inc += 1
			else:
				print "invalid category format"

		print 'Creating prior probilities for naive bayes clssification...'
		allCatsLinks,occurMatrix,totals,keyDict,useLinks = classify.createClassifier(allCategoriesLinks,useLinks,depth)

		serialize(allCatsLinks,'allCatsLinks.p')
		serialize(occurMatrix,'occurMatrix.p')
		serialize(totals,'totals.p')
		serialize(keyDict,'keyDict.p')
		serialize(useLinks,'useLinks.p')
		print 'Prior probabilities are now stored in serialized files.'

	while True:
		checkPageLink = raw_input("In similar format, give URL suffix of page you would like to classify: ")
		if page.isPage(checkPageLink):
			break
		else:
			print "There was an error connecting to the given page."

	checkPage = page.Page(checkPageLink)

	print 'Getting distribution for page over categories...'
	allCatsLinks = unpack('allCatsLinks.p')
	occurMatrix = unpack('occurMatrix.p')
	totals = unpack('totals.p')
	keyDict = unpack('keyDict.p')
	useLinks = unpack('useLinks.p')
	distribution = classify.naiveBayes(checkPage,allCatsLinks,occurMatrix,totals,keyDict,useLinks)
	for result in distribution:
		print checkPageLink,'is a subpage of', result[0], 'with probability', round(result[1]*100,5),'%'

	return None