Code example #1
def trainClassifier(self):
    # build a list of lists: one list of page texts per layer
    list_layers = []
    for layer in self.layers:
        current_layer = []
        for page in layer:
            current_layer.append(page.text)
        list_layers.append(current_layer)
    self.classifier = QueueClassifier(list_layers)
    print self.classifier.predictQueue("ei ou are one")
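The method above assumes a QueueClassifier that is trained from one list of page texts per layer, and the classify method in example #2 below unpacks predictQueue's return value as a (layer, score) pair. A minimal sketch of such a classifier under those assumptions, here using a TF-IDF + Naive Bayes pipeline (the scikit-learn model choice is an assumption, not the project's actual code):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

class QueueClassifier(object):
    """Hypothetical sketch: the real QueueClassifier may use another model."""

    def __init__(self, list_layers):
        # flatten the list of lists into (text, layer-index) training pairs
        texts, labels = [], []
        for index, layer in enumerate(list_layers):
            for text in layer:
                texts.append(text)
                labels.append(index)
        self.vectorizer = TfidfVectorizer()
        self.model = MultinomialNB()
        self.model.fit(self.vectorizer.fit_transform(texts), labels)

    def predictQueue(self, text):
        # return the most likely layer and its probability as the score
        probs = self.model.predict_proba(self.vectorizer.transform([text]))[0]
        best = probs.argmax()
        return self.model.classes_[best], probs[best]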
Code example #2
import os
import random
from urlparse import urlparse

# project-local helpers; the module paths below are assumptions
from fetcher import Fetcher
from layer_queue import LayerQueue
from page import Page
from queue_classifier import QueueClassifier
from yahoo import Yahoo


class ContextGraph(object):

    def __init__(self):
        # five layers of pages (layer 0 holds the seeds) and six
        # priority queues (index 5 is used for seed URLs, see readSeedsFile)
        self.layers = [[] for _ in range(5)]
        self.queues = [LayerQueue() for _ in range(6)]

        self.fetcher = Fetcher()
        self.visited_urls = set()
        self.urls_already_in_layers = set()

    def addToQueue(self, queue, page, score):
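        # queue is an index into self.queues (0-5)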
        self.queues[queue].putPage(page, score)

    def getNextQueuedPage(self):
        # scan the queues in priority order and return the first page found
        for queue in self.queues:
            if not queue.isEmpty():
                return queue.getPage()
        print "No urls in queue"
        return 'Empty Queue'

    """Pega pagina, classifica bota na queue correta"""
    def classify(self, page_dict):
        layer = self.classifier.predictQueue(page_dict['text'])
        layer, score = self.classifier.predictQueue(page_dict['text'])
        #ele vai adicionar todos os links dessa pagina na queue classificada, n eh soh um addToQueue, sao varios
        for link in page_dict['links']:
            self.addToQueue(layer, link, score)
        return layer

    def store(self, page_dict, layer, page_id):
        # layout: crawl/<layer>/<netloc>/<page_id>
        crawl_directory = 'crawl/'
        path = crawl_directory + str(layer)
        self.check_dir(path)
        path = path + '/' + urlparse(page_dict['url']).netloc
        self.check_dir(path)
        path = path + '/' + str(page_id)
        with open(path, 'w') as page_store:
            page_store.write(page_dict['url'] + '\n')
            page_store.write(page_dict['text'] + '\n')

    def check_dir(self, path):
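        # create the directory if it does not exist yet (one level only)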
        if not os.path.exists(path):
            os.mkdir(path)

    def addSeed(self, _seed):
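        # seeds always start in layer 0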
        self.layers[0].append(_seed)

    def addToLayer(self, layer, page):
        # deduplicate: a URL enters the layer graph at most once
        if page.url not in self.urls_already_in_layers:
            self.urls_already_in_layers.add(page.url)
            self.layers[layer].append(page)

    """ layer = o layer da page passada """
    def constructGraph(self, seed, layer, num_backlinks):
        print "constructing layer:", layer, " of seed:", seed.url
        list_urls = self.getBackLinks(seed.url, num_backlinks, layer)
        if layer<=3:
            for url in list_urls:
                page = Page(url)      #points_to = seed
                print 'fetching backlink page: ', url
                page_dict = self.fetcher.getPage(url)
                page.setText(page_dict['text'])
                self.addToLayer(layer+1, page)

    def initGraph(self):
        # random sampling: build layer 1 from every seed, then grow
        # layers 2-4 by expanding randomly chosen pages from each layer
        # (an exhaustive variant would expand every page in every layer)
        num_backlinks = 10
        num_each = 10
        for seed in self.layers[0]:
            self.constructGraph(seed, 0, num_backlinks)
        for layer in range(1, 4):
            already_used = set()
            for num in range(0, num_backlinks):
                # draw a page index that was not expanded before
                i = random.randint(0, len(self.layers[layer]) - 1)
                while i in already_used:
                    i = random.randint(0, len(self.layers[layer]) - 1)
                already_used.add(i)
                self.constructGraph(self.layers[layer][i], layer, num_each)

    def readLayerZeroFile(self, layer0_txt):
        # every non-empty line of the file is a seed URL for layer 0
        with open(layer0_txt, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.addSeed(Page(line, True))

    def readSeedsFile(self, seeds_txt):
        # seed URLs go into queue 5 with an initial score of 1
        with open(seeds_txt, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.addToQueue(5, line, 1)

    def getBackLinks(self, page_url, num, layer):
        # ask Yahoo for up to `num` pages that link to page_url
        print 'Getting ', page_url, 'backlinks...'
        yahoo = Yahoo(page_url, num)
        return yahoo.lista

    def trainClassifier(self):
        # build a list of lists: one list of page texts per layer
        list_layers = []
        for layer in self.layers:
            current_layer = []
            for page in layer:
                current_layer.append(page.text)
            list_layers.append(current_layer)
        self.classifier = QueueClassifier(list_layers)
        print self.classifier.predictQueue("ei ou are one")

    def fetchSeeds(self):
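        # download every seed page and attach its text to the Page object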
        for page in self.layers[0]:
            page_dict = self.fetcher.getPage(page.url)
            page.setText(page_dict['text'])

    def printLayers(self):
        for i in range(0, 5):
            print "Layer", i, ' - Size: ', len(self.layers[i])
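
Example #2 relies on helper classes whose source is not shown (Fetcher, LayerQueue, Page, Yahoo; a QueueClassifier sketch was given after example #1). The sketches below are assumptions reconstructed purely from how those objects are used above, not the project's actual code.

LayerQueue needs putPage(page, score), getPage(), and isEmpty(); a score-ordered priority queue built on heapq fits that interface:

import heapq

class LayerQueue(object):
    """Hypothetical sketch: the highest-scored page is dequeued first."""

    def __init__(self):
        self.heap = []
        self.counter = 0  # insertion counter breaks score ties

    def putPage(self, page, score):
        # negate the score so heapq's min-heap pops the best page first
        heapq.heappush(self.heap, (-score, self.counter, page))
        self.counter += 1

    def getPage(self):
        return heapq.heappop(self.heap)[2]

    def isEmpty(self):
        return len(self.heap) == 0

Page only needs a url, a text set via setText, and a seed flag (the second constructor argument passed in readLayerZeroFile):

class Page(object):
    """Hypothetical sketch covering only the attributes used above."""

    def __init__(self, url, seed=False):
        self.url = url
        self.text = ''
        self.seed = seed

    def setText(self, text):
        self.text = text

    def isSeed(self):
        return self.seed

Finally, a hypothetical driver showing the order in which the methods above are meant to be called (the file name layer0.txt is an assumption):

graph = ContextGraph()
graph.readLayerZeroFile('layer0.txt')  # one seed URL per line into layer 0
graph.fetchSeeds()                     # download the text of every seed
graph.initGraph()                      # grow layers 1-4 from backlinks
graph.trainClassifier()                # fit QueueClassifier on the layer texts
graph.printLayers()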