import csv
import os


def downloadArchivesList(aList, container, extension='.txt.gz', numThreads=5):
    '''Download the archives of every mailing list whose start URL is listed
    in the CSV file <aList> (one URL in the first column of each row).
    Store the files for each list in a subfolder of <container>.
    '''
    # Set up the downloader threads
    queue = initDownloader(numThreads)

    with open(aList, 'r', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            startURL = row[0]

            # The mailing list name is the second-to-last path component
            mlName = startURL.split('/')[-2]

            spider = Spider(startURL)
            spider.process_page(startURL)

            # Only the links to archive files are interesting:
            # mailing list archive file names end with <extension>
            urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
            if urlList:
                print('%s: %d archives' % (mlName, len(urlList)))
                # Create a folder for the mailing list
                store = os.path.join(container, mlName)
                if not os.path.isdir(store):
                    os.makedirs(store)

                # Queue each archive for download
                addToQ(queue, urlList, store)

    # All downloads queued; stop the worker threads
    stopDownloader(queue, numThreads)
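
# A minimal usage sketch, assuming a hypothetical CSV file 'lists.csv' whose
# first column holds one mailing list start URL per row, e.g.:
#
#   http://lists.example.org/pipermail/dev/
#   http://lists.example.org/pipermail/users/
#
# downloadArchivesList('lists.csv', 'archives')
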
def downloadArchives(startURL, container, lookInsideSubfolders=False, extension='.txt.gz', numThreads=5):
    '''Crawl <startURL> and find all mailing list archives (identified by the
    file name <extension>). Store the files in the folder at the path
    <container>. If <lookInsideSubfolders>, go one level deeper and crawl all
    first-order links as well.
    '''
    # Set up the downloader threads
    queue = initDownloader(numThreads)

    print('Downloading archives from', startURL)
        
    if not lookInsideSubfolders:
        spider = Spider(startURL)
        spider.process_page(startURL)

        # Only the links to archive files are interesting:
        # mailing list archive file names end with <extension>
        urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
        print('%d archives' % len(urlList))

        addToQ(queue, urlList, container)
            
    else:
        spider = Spider(startURL)
        spider.process_page(startURL)

        # Crawl each first-order link; every subfolder is one mailing list
        for link in sorted(spider.URLs):
            subspider = Spider(link)
            subspider.process_page(link)

            # The mailing list name is the second-to-last path component
            mlName = link.split('/')[-2]

            # Only the links to archive files are interesting:
            # mailing list archive file names end with <extension>
            urlList = [x for x in sorted(subspider.URLs) if x.endswith(extension)]
            if urlList:
                print('%s: %d archives' % (mlName, len(urlList)))
                # Create a folder for the mailing list
                store = os.path.join(container, mlName)
                if not os.path.isdir(store):
                    os.makedirs(store)

                addToQ(queue, urlList, store)
                    
    # All downloads queued; stop the worker threads
    stopDownloader(queue, numThreads)
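
# The helpers used above (initDownloader, addToQ, stopDownloader, and the
# Spider crawler) are defined elsewhere in this project. Below is a minimal
# sketch of the threaded downloader they imply, assuming a queue.Queue of
# (url, folder) jobs, one None sentinel per thread as the stop signal, and
# urllib.request for the transfer itself; the real implementation may differ.

import queue
import threading
import urllib.request


def _worker(q):
    # Pull (url, folder) jobs off the queue until a None sentinel arrives
    while True:
        job = q.get()
        if job is None:
            q.task_done()
            break
        url, folder = job
        target = os.path.join(folder, url.split('/')[-1])
        try:
            urllib.request.urlretrieve(url, target)
        finally:
            q.task_done()


def initDownloader(numThreads):
    # Start <numThreads> daemon worker threads sharing one job queue
    q = queue.Queue()
    for _ in range(numThreads):
        threading.Thread(target=_worker, args=(q,), daemon=True).start()
    return q


def addToQ(q, urlList, folder):
    # Queue one (url, folder) job per archive file
    for url in urlList:
        q.put((url, folder))


def stopDownloader(q, numThreads):
    # Send one sentinel per thread, then wait for the queue to drain
    for _ in range(numThreads):
        q.put(None)
    q.join()


# Example call (hypothetical URL):
#   downloadArchives('http://lists.example.org/pipermail/', 'archives',
#                    lookInsideSubfolders=True)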