import codecs
import os

import utfcsv                    # project module providing UnicodeReader
from crawler import curlcrawl


def main(argv):
    # argv[1]: input CSV file, argv[2]: output directory, argv[3]: 1-based row to resume from
    ifile = open(argv[1])
    dumpdir = argv[2]
    startrow = int(argv[3])
    creader = utfcsv.UnicodeReader(ifile)
    rowc = 0
    for row in creader:
        rowc += 1
        if rowc < startrow:
            continue
        context = row[0]
        idt = row[1]
        # Skip rows that carry no URL in the fourth column.
        if len(row) > 3 and len(row[3]) > 0:
            url = row[3]
        else:
            continue

        # One directory per id; skip ids that were already crawled.
        dirc = dumpdir + '/' + idt
        if os.path.exists(dirc):
            continue
        print 'getting url', ' ', url.encode('ascii', 'ignore')
        fetched = curlcrawl.curlcrawl([url], maxlink=10, dumpdir=dirc, mode=0750)
        if os.path.exists(dirc):
            # Record the id alongside the crawled pages.
            idfile = codecs.open(dirc + '/' + 'idfile', encoding="utf-8", mode='wb')
            idfile.write(idt)
            idfile.close()
        print 'got context ', context, ' for id ', idt.encode('ascii', 'ignore'), ' fetched ', fetched
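
# A minimal entry point in case this module is run directly (a sketch; the
# argument order is inferred from how main() indexes argv, and the sys import
# is an addition not present in the original snippet).
if __name__ == '__main__':
    import sys
    # Expected invocation: <script> <input.csv> <dumpdir> <startrow>
    main(sys.argv)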
#!/usr/bin/python
"""
This script crawls each of the sites mentioned in the examples.txt file
of the context TREC 2012.
For each example it creates a directory named after that example number
and stores the crawled pages of that site in that directory.
Pages are stored according to the path in the URL of the crawled page.
For example:
  http://www.cis.udel.edu/home/main.html is saved to the /home directory
  http://www.cis.udel.edu/home/nextdir/text.html is saved to /home/nextdir
"""

import saxparser

from crawler import curlcrawl

# Parse the example definitions and crawl the first three sites.
examplelist = list()
exhandler = saxparser.ExampleFileHandler(examplelist)
saxparser.parse('/usa/arao/trec/contexttrec12/examples.txt', exhandler)
for ex in examplelist[0:3]:
    fetched = curlcrawl.curlcrawl([ex.attribute['url']],
                                  maxlink=10,
                                  dumpdir='/usa/arao/trec/contexttrec12/texamplesites/' + ex.attribute['number'],
                                  mode=0750)
    print ex.attribute['number'], ' ', fetched
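
# To illustrate the URL-to-directory mapping described in the docstring, a
# hypothetical sketch (the helper name url_to_local_path is not part of the
# crawler package, and curlcrawl's actual layout logic may differ):
import os
from urlparse import urlparse    # Python 2, matching the scripts above

def url_to_local_path(url, dumpdir):
    # The URL path minus the file name becomes a subdirectory of dumpdir, e.g.
    # http://www.cis.udel.edu/home/nextdir/text.html -> <dumpdir>/home/nextdir
    path = urlparse(url).path                   # '/home/nextdir/text.html'
    subdir = os.path.dirname(path).lstrip('/')  # 'home/nextdir'
    return os.path.join(dumpdir, subdir)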

