# Crawl seed URLs read from urls.txt using a custom-configured Nutch instance.
# NOTE(review): this script references `sc`, `sv`, `Nutch`, and `JobClient`
# without importing them here — presumably the nutch-client imports/setup from
# a sibling script precede it; confirm before running standalone.

# Custom configuration for the Nutch crawl (agent identity + fetcher tuning).
custom_configFile = {
    'http.agent.name': "Team 33 : CSCI 572",
    'http.agent.email': "*****@*****.**",
    'http.agent.rotate': "true",
    'db.url.normalizers': "true",
    'generate.update.crawldb': "true",
    'http.content.limit': "-1",                      # -1 = no content size limit
    'http.enable.if.modified.since.header': "false",
    'http.timeout': "20000",                         # ms
    'fetcher.threads.fetch': "20",
    'fetcher.threads.per.queue': "10",
    'http.max.delays': "1000",
}

# Read seed URLs from file.
# Fix: `seed_urls` was never initialized before being appended to (NameError),
# the file handle was never closed, and readlines() left trailing newlines on
# each URL. Use a context manager and strip each line.
seed_urls = []
with open("urls.txt", "r") as fopen:
    for line in fopen:
        url = line.strip()
        if url:  # skip blank lines
            seed_urls.append(url)

# Crawl the seed URLs with the custom-configured Nutch.
sd = sc.create('crawl-seed', seed_urls)
nt = Nutch(custom_configFile)
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)
while True:
    # progress() returns the current job, or None when the crawl is finished.
    job = cc.progress()
    if job is None:
        break
# Crawl two ESPN seed URLs with a default-configured Nutch via its REST API.
from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch

sv = Server('http://localhost:8081')   # Nutch REST server endpoint
sc = SeedClient(sv)
seed_urls = ('http://espn.go.com', 'http://www.espn.com')
sd = sc.create('espn-seed', seed_urls)
nt = Nutch('default')
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)
while True:
    # progress() returns the current job, or None when the crawl is finished.
    # Fix: compare to None with `is`, not `==` (PEP 8 identity check).
    job = cc.progress()
    if job is None:
        break
# Crawl seed URLs read from a file given on the command line, one URL per line.
from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch
import sys

sv = Server('http://localhost:8081')   # Nutch REST server endpoint
sc = SeedClient(sv)

# Read the seed file line by line and launch one crawl per URL.
# Fix: the file was never closed (use a context manager), and
# `seed_urls = (line)` was a plain string — parentheses without a trailing
# comma do NOT make a tuple — so a bare string (iterable of characters) was
# passed where a collection of URLs is expected. Also strip the newline.
with open(sys.argv[1], 'r') as inputdata:
    for line in inputdata:
        seed_urls = (line.strip(),)    # single-element tuple needs the comma
        sd = sc.create('seed', seed_urls)
        nt = Nutch('default')
        jc = JobClient(sv, 'test', 'default')
        cc = nt.Crawl(sd, sc, jc)
        while True:
            # progress() returns the current job, or None when finished.
            job = cc.progress()
            if job is None:
                print("no more jobs")  # print() works on both Python 2 and 3
                break
# Crawl a single ESPN seed URL with a default-configured Nutch via its REST API.
from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch

sv = Server('http://localhost:8081')   # Nutch REST server endpoint
sc = SeedClient(sv)
# Fix: ('http://www.espn.com') was a plain string, not a tuple — a
# single-element tuple requires a trailing comma. Without it, a bare string
# (an iterable of characters) was passed where a collection of URLs is expected.
seed_urls = ('http://www.espn.com',)
sd = sc.create('espn-seed', seed_urls)
nt = Nutch('default')
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)
while True:
    # progress() returns the current job, or None when the crawl is finished.
    # Fix: compare to None with `is`, not `==` (PEP 8 identity check).
    job = cc.progress()
    if job is None:
        break
# Crawl a single armslist.com seed URL with a default-configured Nutch.
from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch

sv = Server('http://localhost:5557')   # Nutch REST server endpoint
sc = SeedClient(sv)
# Fix: ('http://armslist.com') was a plain string, not a tuple — a
# single-element tuple requires a trailing comma. Without it, a bare string
# (an iterable of characters) was passed where a collection of URLs is expected.
seed_urls = ('http://armslist.com',)
sd = sc.create('weapon-seed', seed_urls)
nt = Nutch('default')
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)
while True:
    # progress() returns the current job, or None when the crawl is finished.
    job = cc.progress()
    if job is None:
        break