Exemple #1
0
__author__ = 'TEAM 33 CSCI 572'
from nutch.nutch import Nutch
from nutch.nutch import SeedClient
from nutch.nutch import Server
from nutch.nutch import JobClient
import nutch
import io

# NOTE(review): this statement is damaged in the visible source. The string
# literal opens with ' but the line ends with ", and the text in between is
# masked (*****), so the file does not parse as written. Restore the original
# Server(...) argument before running.
# The assignments that follow write into custom_configFile, which is never
# defined in the visible source; presumably its definition was lost in the
# same masked span. TODO confirm against the original file.
sv = Server('http://*****:*****@usc.edu"
# Crawl settings written into custom_configFile, one key at a time and in the
# order listed below. Every value is stored as a string, exactly as given.
for _option, _setting in (
    ('http.agent.rotate', "true"),
    ('db.url.normalizers', "true"),
    ('generate.update.crawldb', "true"),
    ('http.content.limit', "-1"),
    ('http.enable.if.modified.since.header', "false"),
    ('http.timeout', "20000"),
    ('fetcher.threads.fetch', "20"),
    ('fetcher.threads.per.queue', "10"),
    ('http.max.delays', "1000"),
):
    custom_configFile[_option] = _setting

# Read the seed URLs from urls.txt and append every line to seed_urls.
# The context manager closes the file even if reading raises; the original
# code opened the handle and never closed it.
with open("urls.txt", "r") as fopen:
    # readlines() keeps each line's trailing newline; the lines are stored
    # exactly as read, matching the original behavior.
    urlList = fopen.readlines()
for seed_line in urlList:
    seed_urls.append(seed_line)
def get_seed_client():
    """Return a SeedClient wrapping a newly created Server.

    The Server is constructed from the module-level name ``url``, which is
    not defined in the visible part of this file. A new Server and a new
    SeedClient are created on every call.
    """
    return SeedClient(Server(url))