#!/usr/bin/python
"""
Crawl the site listed in each row of a CSV file (context, id, ..., url).
For each row a directory named after the id is created under the dump
directory, the site is crawled into it, and an 'idfile' recording the id
is written alongside the pages. startrow allows an interrupted crawl to
be resumed.

Usage: <script> <csvfile> <dumpdir> <startrow>
"""
import sys
import os
import codecs

import utfcsv
from crawler import curlcrawl


def main(argv):
    ifile = open(argv[1])
    dumpdir = argv[2]
    startrow = int(argv[3])
    creader = utfcsv.UnicodeReader(ifile)
    rowc = 0
    for row in creader:
        rowc = rowc + 1
        if rowc < startrow:
            continue
        context = row[0]
        idt = row[1]
        # Skip rows that have no URL in the fourth column.
        if len(row) > 3 and len(row[3]) > 0:
            url = row[3]
        else:
            continue
        dirc = dumpdir + '/' + idt
        # Skip examples already crawled on a previous run.
        if os.path.exists(dirc):
            continue
        print 'getting url', ' ', url.encode('ascii', 'ignore')
        fetched = curlcrawl.curlcrawl([url], maxlink=10, dumpdir=dirc,
                                      mode=0750)
        if os.path.exists(dirc):
            # Record which example id this directory belongs to.
            idfile = codecs.open(dirc + '/' + 'idfile', encoding='utf-8',
                                 mode='wb')
            idfile.write(idt)
            idfile.close()
        print 'got context ', context, ' for id ', \
            idt.encode('ascii', 'ignore'), ' fetched ', fetched


if __name__ == '__main__':
    main(sys.argv)
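# --- Assumed helper: utfcsv ---
# The utfcsv module used above is not shown in this repo dump. The sketch
# below assumes it follows the UnicodeReader recipe from the Python 2 csv
# module documentation (UTF8Recoder is part of that recipe); it is an
# assumption about utfcsv's contents, not its actual source.
import csv
import codecs


class UTF8Recoder:
    """Iterator that reads an encoded stream and re-encodes it as UTF-8."""

    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        return self.reader.next().encode('utf-8')


class UnicodeReader:
    """CSV reader that yields each row as a list of unicode strings."""

    def __init__(self, f, dialect=csv.excel, encoding='utf-8', **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        return [unicode(s, 'utf-8') for s in row]

    def __iter__(self):
        return self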
#!/usr/bin/python
"""
This script crawls each of the sites mentioned in the examples.txt file of
context TREC 2012. For each example it creates a directory named after that
example number and stores the crawled pages of the site in that directory.
Pages are stored according to the path in the URL of the crawled page; for
example, http://www.cis.udel.edu/home/main.html is saved under /home, and
http://www.cis.udel.edu/home/nextdir/text.html under /home/nextdir.
"""
import saxparser
from crawler import curlcrawl

examplelist = list()
exhandler = saxparser.ExampleFileHandler(examplelist)
saxparser.parse('/usa/arao/trec/contexttrec12/examples.txt', exhandler)

# Crawl the first three examples only.
for ex in examplelist[0:3]:
    fetched = curlcrawl.curlcrawl(
        [ex.attribute['url']], maxlink=10,
        dumpdir='/usa/arao/trec/contexttrec12/texamplesites/'
        + ex.attribute['number'],
        mode=0750)
    print ex.attribute['number'], ' ', fetched
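# --- Assumed path mapping ---
# The docstring above says pages are stored according to the path in the
# crawled URL. curlcrawl's real implementation is not shown; the function
# below is an illustrative sketch of that mapping (url_to_local_path and
# its use of urlparse are assumptions, not the crawler's actual code).
import os
import urlparse


def url_to_local_path(url, dumpdir):
    # e.g. http://www.cis.udel.edu/home/main.html -> <dumpdir>/home/main.html
    path = urlparse.urlparse(url).path.lstrip('/')
    if not path or path.endswith('/'):
        path += 'index.html'  # give directory-style URLs a file name
    return os.path.join(dumpdir, path)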