import codecs
import os

import utfcsv                    # project module providing UnicodeReader
from crawler import curlcrawl


def main(argv):
    # argv[1]: input CSV file, argv[2]: output directory, argv[3]: 1-based row to resume from
    ifile = open(argv[1])
    dumpdir = argv[2]
    startrow = int(argv[3])
    creader = utfcsv.UnicodeReader(ifile)
    rowc = 0
    for row in creader:
        rowc += 1
        if rowc < startrow:
            continue
        context = row[0]
        idt = row[1]
        # Skip rows that carry no URL in the fourth column.
        if len(row) > 3 and len(row[3]) > 0:
            url = row[3]
        else:
            continue

        # One directory per id; skip ids that were already crawled.
        dirc = dumpdir + '/' + idt
        if os.path.exists(dirc):
            continue
        print 'getting url', ' ', url.encode('ascii', 'ignore')
        fetched = curlcrawl.curlcrawl([url], maxlink=10, dumpdir=dirc, mode=0750)
        if os.path.exists(dirc):
            # Record the id alongside the crawled pages.
            idfile = codecs.open(dirc + '/' + 'idfile', encoding="utf-8", mode='wb')
            idfile.write(idt)
            idfile.close()
        print 'got context ', context, ' for id ', idt.encode('ascii', 'ignore'), ' fetched ', fetched
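
# A minimal entry point in case this module is run directly (a sketch; the
# argument order is inferred from how main() indexes argv, and the sys import
# is an addition not present in the original snippet).
if __name__ == '__main__':
    import sys
    # Expected invocation: <script> <input.csv> <dumpdir> <startrow>
    main(sys.argv)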
#!/usr/bin/python
"""
This script crawls each of the sites mentioned in the examples.txt file
of the context TREC 2012.
For each example it creates a directory named after that example number
and stores the crawled pages of that site in that directory.
Pages are stored according to the path in the URL of the crawled page.
For example:
  http://www.cis.udel.edu/home/main.html is saved to the /home directory
  http://www.cis.udel.edu/home/nextdir/text.html is saved to /home/nextdir
"""

import saxparser

from crawler import curlcrawl

# Parse the example definitions and crawl the first three sites.
examplelist = list()
exhandler = saxparser.ExampleFileHandler(examplelist)
saxparser.parse('/usa/arao/trec/contexttrec12/examples.txt', exhandler)
for ex in examplelist[0:3]:
    fetched = curlcrawl.curlcrawl([ex.attribute['url']],
                                  maxlink=10,
                                  dumpdir='/usa/arao/trec/contexttrec12/texamplesites/' + ex.attribute['number'],
                                  mode=0750)
    print ex.attribute['number'], ' ', fetched
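
# To illustrate the URL-to-directory mapping described in the docstring, a
# hypothetical sketch (the helper name url_to_local_path is not part of the
# crawler package, and curlcrawl's actual layout logic may differ):
import os
from urlparse import urlparse    # Python 2, matching the scripts above

def url_to_local_path(url, dumpdir):
    # The URL path minus the file name becomes a subdirectory of dumpdir, e.g.
    # http://www.cis.udel.edu/home/nextdir/text.html -> <dumpdir>/home/nextdir
    path = urlparse(url).path                   # '/home/nextdir/text.html'
    subdir = os.path.dirname(path).lstrip('/')  # 'home/nextdir'
    return os.path.join(dumpdir, subdir)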

