Example #1
import threading
from queue import Queue              # thread-safe queue shared by the worker threads
from Spider import spider
from domain import get_domain_name

PROJECT_NAME = 'hindustantimes'      # placeholder; the project (folder) name is not shown in the snippet
#HOMEPAGE='https://thenewboston.com/'
#HOMEPAGE='http://www.health.com/'
HOMEPAGE = 'http://www.hindustantimes.com/'

DOMAIN_NAME = get_domain_name(HOMEPAGE)
#DOMAIN_NAME='health.com/food'
#DOMAIN_NAME='health.com'

QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 100

# queue holds the URLs waiting to be crawled, shared by all worker threads
queue = Queue()
spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


#create worker threads (die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True   # daemon thread: dies automatically when the main thread exits
        t.start()         # the thread starts running work(); with an empty queue it just blocks on queue.get()


# do the next job in the queue
def work():
    while True:
        url = queue.get()
        spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
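The snippet ends before the queue is ever filled. One plausible way to wire up the remaining pieces, mirroring the create_jobs() shown in Example #8 and assuming a file_to_set helper like the one imported in Example #10, is to keep turning queue.txt into jobs until the queue file is empty:

from Functions import file_to_set   # assumed helper, as imported in Example #10

# Move every URL waiting in queue.txt onto the thread queue, then block
# until the workers have crawled them all.
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()          # returns once every queued URL has been task_done()
    crawl()               # check whether the spider discovered new links

# If queue.txt still has links, schedule another round of jobs.
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()

create_workers()
crawl()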
Example #2
		self.spider = spider()
		self.indexer = indexer()
		self.parser = parser()
		self.urllist = urllist
	
	def start(self):
		if len(self.urllist) == 0:
			return False
		self.spider.addurllist(self.urllist)
		self.spider.setparser(self.parser)
		self.spider.setindexer(self.indexer)
		self.spider.run()
		return True

	def cleanup(self):
		self.indexer.closedb()
	

if __name__ == "__main__":

	spider = spider()
	#spider.addurl('http://localhost:9080/setest/test.php')
	spider.addurl('http://hq.booksarefun.com/')
	parserobj = parser()
	indexobj = indexer()
	spider.setparser(parserobj)
	spider.setindexer(indexobj)
	spider.run()
	indexobj.closedb()
	print 'done!'
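One detail worth noting in the __main__ block above: spider = spider() rebinds the class name to an instance, so no further spider objects can be constructed afterwards. A variant of the same driver that keeps the class name intact might look like this (the variable name spiderobj is purely illustrative):

if __name__ == "__main__":
    spiderobj = spider()              # keep the class name 'spider' usable
    spiderobj.addurl('http://hq.booksarefun.com/')
    parserobj = parser()
    indexobj = indexer()
    spiderobj.setparser(parserobj)
    spiderobj.setindexer(indexobj)
    spiderobj.run()
    indexobj.closedb()
    print 'done!'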
Example #3
	def __init__(self,urllist = []):
		self.spider = spider()
		self.indexer = indexer()
		self.parser = parser()
		self.urllist = urllist
Example #4
#!/usr/bin/python
#coding=utf-8

from Spider import spider
from ActorName import ActorNameHelper
import sys



if __name__ == "__main__":

    anh = ActorNameHelper("xiami_music_artist.txt")
    ans = 1
    id,name = anh.getName()
    while name != None:
        print str(ans) + ':\t' + id+'\t'+name
        crawler = spider()
        crawler.run(name,id)
        id,name = anh.getName()
        ans += 1
        sys.stdout.flush()
    anh.close()
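ActorNameHelper itself is not shown. Judging only from how it is called here (getName() returning an (id, name) pair, name becoming None once the input is exhausted, and close() at the end), a minimal file-backed version could look like the sketch below; the tab-separated layout of xiami_music_artist.txt is an assumption.

class ActorNameHelper(object):
    """Minimal sketch: yields (id, name) pairs read from a tab-separated file."""

    def __init__(self, path):
        self.f = open(path)

    def getName(self):
        line = self.f.readline()
        if not line:
            return None, None          # signals the caller's while-loop to stop
        artist_id, name = line.rstrip('\n').split('\t', 1)   # assumed format: id<TAB>name
        return artist_id, name

    def close(self):
        self.f.close()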
Example #5
from Spider import spider

if __name__ == "__main__":
    startURL = "http://baike.baidu.com/item/python"
    crawler = spider(startURL)
    crawler.crawl(size=10)

Example #6
from Spider import spider
from Mongo import Database
from threading import Thread

linkCount = curCount = 0
i = 0
url = "https://www.w3schools.com/"
mongoData = Database("W3", linkCount, url)
while (i < mongoData.linksCount()):
    try:
        spiderLeg = spider(mongoData.getNext(curCount))
        curCount += 1
        spiderLeg.crawl()
        linkCount = mongoData.insertDB(spiderLeg.linkURI, spiderLeg.texts,
                                       spiderLeg.CurLink, spiderLeg.Meta,
                                       linkCount)
        print "Link ", i, " Done!!"
    except Exception as e:   # keep KeyboardInterrupt working; report the failure instead of silently dropping it
        print "Dropped!!", e
    i += 1
Example #7
def crawl():
    spider(urls=[
        "http://www.shopbop.com", "http://www.kilimall.co.ke/",
        "http://www.jumia.co.ke"
    ])
Example #8
import threading
from queue import Queue
from Spider import spider
from domain import *
from source import *

Project_Name = '' # Name of The Directory (input)
home_page = '' # Home Page of The Site You Want to Crawl (input)
domain_name = get_domain(home_page)
queue_file = Project_Name + '/queue.txt'
crawled_file = Project_Name + '/crawled.txt'
number_of_threads = 8
thread_queue = Queue()
spider(home_page, Project_Name, domain_name)

def create_workers():
    for x in range(number_of_threads):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


def work():
    while True:
        url = thread_queue.get()
        spider.crawl_page(threading.current_thread().name, url)
        thread_queue.task_done()

def create_jobs():
    for link in file_to_set(queue_file):
        thread_queue.put(link)
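The worker/queue handshake these snippets rely on is all standard library: daemon threads block on Queue.get(), each task_done() pairs with one get(), and queue.join() returns only when every queued item has been marked done. A self-contained demonstration, with no crawler involved:

import threading
from queue import Queue

demo_queue = Queue()

def demo_work():
    while True:
        item = demo_queue.get()              # blocks until a job is available
        print(threading.current_thread().name, 'processed', item)
        demo_queue.task_done()               # one task_done() per get()

for _ in range(4):
    t = threading.Thread(target=demo_work)
    t.daemon = True                          # daemon: dies when the main thread exits
    t.start()

for job in range(10):
    demo_queue.put(job)

demo_queue.join()                            # returns once all 10 jobs are done
print('all jobs finished')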
Example #9
from Spider import spider
from Database import *
from threading import Thread
"""
        Developed by: Prateek Jha, 15 May 2017
"""

linkCount = curCount = 0
i = 0
url = "***************"
initialize(linkCount, url)
while (i < 5):
    spiderLeg = spider(getNext(curCount))
    curCount += 1
    spiderLeg.crawl()
    linkCount = insertDB(spiderLeg.linknText, spiderLeg.headings, url,
                         linkCount)
    print "Test Completed Successfully!!"
    i += 1
Example #10
import threading
from Functions import file_to_set
from queue import Queue
from Spider import spider
from domain import get_domain_name

#Each iteration of the program is a new project
PROJECT_NAME = 'Web Crawler'  #Name of the current project
HOMEPAGE = 'https://www.reuters.com/'  #Starting page for the crawl
DOMAIN_NAME = get_domain_name(HOMEPAGE)  #Extract the domain name from the homepage
QUEUE_FILE = PROJECT_NAME + '/queue.txt' 
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8

queue = Queue()
spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)  # First spider is called 


# creating worker threads
def create_workers():
    for _ in range(NUMBER_OF_THREADS):  #one worker thread per iteration
        t = threading.Thread(target=work) 
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get() 
        spider.crawl_page(threading.current_thread().name, url)  #crawl the page in the current worker thread
        queue.task_done()  #mark the job finished so queue.join() can return
Example #11
 def __init__(self, urllist=[]):
     self.spider = spider()
     self.indexer = indexer()
     self.parser = parser()
     self.urllist = urllist
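The urllist=[] default in this constructor (and the identical one in Example #3) is Python's mutable-default pitfall: the same list object is shared by every instance created without an argument, so anything later appended to one instance's urllist shows up in the others. The usual fix, sketched here with the same attribute names:

    def __init__(self, urllist=None):
        # a fresh list per instance instead of one shared default
        self.spider = spider()
        self.indexer = indexer()
        self.parser = parser()
        self.urllist = urllist if urllist is not None else []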
Example #12
        self.spider = spider()
        self.indexer = indexer()
        self.parser = parser()
        self.urllist = urllist

    def start(self):
        if len(self.urllist) == 0:
            return False
        self.spider.addurllist(self.urllist)
        self.spider.setparser(self.parser)
        self.spider.setindexer(self.indexer)
        self.spider.run()
        return True

    def cleanup(self):
        self.indexer.closedb()


if __name__ == "__main__":

    spider = spider()
    #spider.addurl('http://localhost:9080/setest/test.php')
    spider.addurl('http://hq.booksarefun.com/')
    parserobj = parser()
    indexobj = indexer()
    spider.setparser(parserobj)
    spider.setindexer(indexobj)
    spider.run()
    indexobj.closedb()
    print 'done!'