Example #1

from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch

sv = Server('http://localhost:8081')
sc = SeedClient(sv)

# Creating a custom configuration for the Nutch crawl
custom_config = {}
custom_config['http.agent.name'] = "Team 33 : CSCI 572"
custom_config['http.agent.email'] = "*****@*****.**"
custom_config['http.agent.rotate'] = "true"
custom_config['db.url.normalizers'] = "true"
custom_config['generate.update.crawldb'] = "true"
custom_config['http.content.limit'] = "-1"
custom_config['http.enable.if.modified.since.header'] = "false"
custom_config['http.timeout'] = "20000"
custom_config['fetcher.threads.fetch'] = "20"
custom_config['fetcher.threads.per.queue'] = "10"
custom_config['http.max.delays'] = "1000"

# reading seed URLs from a file, one URL per line
seed_urls = []
with open("urls.txt", "r") as fopen:
    for line in fopen:
        line = line.strip()
        if line:
            seed_urls.append(line)

# crawling the seed URLs with the custom-configured Nutch
sd = sc.create('crawl-seed', seed_urls)
nt = Nutch(custom_config)
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)
while True:
    job = cc.progress()  # gets the current job if no progress, else iterates and makes progress
    if job is None:
        break
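If the crawl properties above need to change between runs, they can be kept in a small JSON file rather than hard-coded in the script. The sketch below assumes a hypothetical nutch_config.json holding the same key/value pairs; the loaded dict is then passed to Nutch() the same way custom_config is passed above.

import json

from nutch.nutch import Nutch

# hypothetical file holding the same properties as custom_config above, e.g.
# {"http.agent.name": "Team 33 : CSCI 572", "http.timeout": "20000", ...}
with open("nutch_config.json", "r") as f:
    custom_config = json.load(f)

nt = Nutch(custom_config)  # passed the same way as the inline dict above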

Example #2

from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch

sv = Server('http://localhost:8081')
sc = SeedClient(sv)
seed_urls = ('http://espn.go.com', 'http://www.espn.com')
sd = sc.create('espn-seed', seed_urls)

nt = Nutch('default')
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)
while True:
    job = cc.progress()  # gets the current job if no progress, else iterates and makes progress
    if job is None:
        break
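The progress loops on this page poll the REST server as fast as Python can call it. A small delay between checks, plus an optional cap on total waiting time, is easier on the server; the sketch below assumes the cc crawl command from the example directly above and only adds standard-library timing. The 5-second interval and one-hour cap are arbitrary choices, not library defaults.

import time

POLL_INTERVAL = 5   # seconds between progress checks (arbitrary choice)
MAX_WAIT = 60 * 60  # stop waiting after one hour (arbitrary choice)

start = time.time()
while True:
    job = cc.progress()  # same call as above: advances the crawl one step
    if job is None:      # nothing left to do, the crawl round is finished
        break
    if time.time() - start > MAX_WAIT:
        print("crawl still running after %d seconds, giving up" % MAX_WAIT)
        break
    time.sleep(POLL_INTERVAL)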

Example #3

from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch
import sys

sv = Server('http://localhost:8081')
sc = SeedClient(sv)

# read the seed file named on the command line, one URL per line,
# and run a separate crawl for each URL
with open(sys.argv[1], 'r') as inputdata:
    for line in inputdata:
        seed_urls = [line.strip()]
        sd = sc.create('seed', seed_urls)

        nt = Nutch('default')
        jc = JobClient(sv, 'test', 'default')
        cc = nt.Crawl(sd, sc, jc)
        while True:
            job = cc.progress()  # gets the current job if no progress, else iterates and makes progress
            if job is None:
                print("no more jobs")
                break
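The loop above starts a separate seed and a separate crawl for every line of the input file. If the goal is one crawl over all of the URLs, the lines can be collected into a single list and submitted as one seed. This sketch reuses only the client calls already shown on this page and still expects the seed file path as the first command-line argument.

from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import sys

sv = Server('http://localhost:8081')
sc = SeedClient(sv)

# collect every non-empty line of the seed file into a single list
with open(sys.argv[1], 'r') as inputdata:
    seed_urls = [line.strip() for line in inputdata if line.strip()]

# one seed and one crawl for the whole list
sd = sc.create('seed', seed_urls)
nt = Nutch('default')
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)

while True:
    job = cc.progress()  # gets the current job if no progress, else iterates and makes progress
    if job is None:
        print("no more jobs")
        break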

Example #4

from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch

sv = Server('http://localhost:8081')
sc = SeedClient(sv)
seed_urls = ['http://www.espn.com']
sd = sc.create('espn-seed', seed_urls)

nt = Nutch('default')
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)

while True:
    job = cc.progress() # gets the current job if no progress, else iterates and makes progress
    if job is None:
        break
Example #5

from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch

sv = Server('http://localhost:5557')
sc = SeedClient(sv)
seed_urls = ['http://armslist.com']
sd = sc.create('weapon-seed', seed_urls)

nt = Nutch('default')
jc = JobClient(sv, 'test', 'default')
cc = nt.Crawl(sd, sc, jc)
while True:
    job = cc.progress()  # gets the current job if no progress, else iterates and makes progress
    if job is None:
        break
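All of the examples above repeat the same Server / SeedClient / Nutch / JobClient / Crawl sequence with different seeds and endpoints. As a closing sketch, that sequence can be wrapped in a small helper; it uses only the calls already shown on this page, and 'test' and 'default' are simply the placeholder crawl and configuration ids used throughout these examples.

from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient


def run_crawl(seed_name, seed_urls, endpoint='http://localhost:8081'):
    # create a named seed from the given URL list on the given server
    sv = Server(endpoint)
    sc = SeedClient(sv)
    sd = sc.create(seed_name, seed_urls)

    # run a crawl with the default configuration, as in the examples above
    nt = Nutch('default')
    jc = JobClient(sv, 'test', 'default')
    cc = nt.Crawl(sd, sc, jc)

    while True:
        job = cc.progress()  # advances the crawl; None means it is finished
        if job is None:
            break


# for instance: run_crawl('espn-seed', ['http://espn.go.com', 'http://www.espn.com'])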