def checkConfig():
    infoprtr = printinfo.printInfo()

    # input directory exists
    if not os.path.exists(runconfig.inputdir):
        infoprtr.printStatus('inputdir exists', 'no')
        return False
    else:
        infoprtr.printStatus('inputdir exists', 'yes')

    # check if the repository folder exists
    if not os.path.exists(runconfig.outputdir):
        infoprtr.printStatus('outputdir exists', 'no')
        return False
    else:
        infoprtr.printStatus('outputdir exists', 'yes')

    # permission to write into the repository folder
    if os.path.exists(runconfig.outputdir + '9999'):
        shutil.rmtree(runconfig.outputdir + '9999')
    try:
        os.makedirs(runconfig.outputdir + '9999')
        shutil.rmtree(runconfig.outputdir + '9999')
    except OSError, e:
        print e
        infoprtr.printStatus('Write permission to outputdir', 'no')
        return False

    # all checks passed
    return True
def main(jobid, verbose=False):
    # record start time
    tic = time.time()

    # create on-screen information printer
    infoprinter = printinfo.printInfo()

    # check configurations
    config = Config_global(jobid)
    config.check_config()
    config.open_logw()
    if verbose:
        print "jobid: %(1)s" % {"1": config.jobid}

    # create the WARC_processor object
    warcproc = WARC_processor(config)

    warcfiles = warcproc.get_warcs()
    if warcfiles:
        for wf in warcfiles:
            # decompressing before extraction increases speed
            wf_path = unzip_gz(wf, verbose=verbose) if wf.endswith(".gz") else wf
            warcproc.extract_all(wf_path, verbose=verbose)
            # delete after extraction
            if config.delete_after_import:
                delete_file(wf_path)

    # close log file
    config.close_logw()

    # print counters on a new line
    warcproc.counters.printCounter()
    warcproc.counters.printCountertoFile(config.sum_file)

    # record end time to calculate processing time
    # because strftime() truncates the time string when converting to the
    # user-defined time format, we add one second to compensate for this loss
    toc = time.time()
    processingtime = time.strftime('%H:%M:%S', time.gmtime(toc - tic + 1))
    infoprinter.printPara("warc files", str(len(warcfiles)))
    infoprinter.printPara('Processing time', processingtime)
    print 'end importing from directory: ', config.crawler_dir
    print 'logfile output to: ', config.log_file
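# The helpers unzip_gz() and delete_file() used by main() above are not defined
# in this section. The sketch below shows the assumed behavior (decompress a
# .gz WARC next to the original file, and remove a file if it exists); the
# names, signatures, and output-path convention are assumptions for
# illustration, not the confirmed implementation.
import gzip
import os
import shutil


def unzip_gz(gzpath, verbose=False):
    # strip the trailing ".gz" to form the output path, e.g. x.warc.gz -> x.warc
    outpath = gzpath[:-3] if gzpath.endswith(".gz") else gzpath + ".unzipped"
    fin = gzip.open(gzpath, "rb")
    fout = open(outpath, "wb")
    shutil.copyfileobj(fin, fout)
    fin.close()
    fout.close()
    if verbose:
        print "decompressed %s -> %s" % (gzpath, outpath)
    return outpath


def delete_file(path):
    # remove the file if it exists; silently skip otherwise
    if os.path.exists(path):
        os.remove(path)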
def checkConfig():
    infoprtr = printinfo.printInfo()

    # crawl repository exists
    if not os.path.exists(runconfig.crawlrepo):
        infoprtr.printStatus('crawlrepo exists', 'no')
        return False
    else:
        infoprtr.printStatus('crawlrepo exists', 'yes')

    # permission to write into the output folder
    testdir = os.path.join(runconfig.cde["outputdir"], '9999')
    if os.path.exists(testdir):
        shutil.rmtree(testdir)
    try:
        os.makedirs(testdir)
        shutil.rmtree(testdir)
    except OSError, e:
        print e
        infoprtr.printStatus('Write permission to outputdir', 'no')
        return False

    # all checks passed
    return True
def startup(verbal=False):
    # record start time
    tic = time.time()

    # create on-screen information print object
    infoprinter = printinfo.printInfo()

    # create document writer
    writer = output.CiteSeerWriter([runconfig.cdilite["docdir"], runconfig.cdilite["crawler"]])

    # create document logger (for this middleware)
    doclogger = Doc_Logger(os.getenv("HOSTNAME"))

    # create general log configers and config logs
    logconfiger = Log_Configer()
    logconfiger.config_loggers()

    # parse log file
    g = create_instance(runconfig.cdilite["logparser"], runconfig.cdilite["doclist"])
    g.extract_info(logsummaryfile=runconfig.cdilite["logsummaryfile"])

    # prepare to write xml file
    impl = getDOMImplementation()
    xDoc = impl.createDocument(None, "response", None)
    root = xDoc.documentElement
    root.setAttribute("location", runconfig.cdilite["docdir"])

    # number counters
    counters = counter.Counter()
    counters.newCounter("all")
    counters.setCounter("all", g.nline["parsed"])
    counters.newCounter("failed_BadURL")
    counters.newCounter("failed_FileNotFound")

    # save the current path
    currentPath = os.getcwd()

    # loop over each information tuple extracted from the document list file;
    # each tuple contains the name of a pdf file
    if verbal:
        print "counters.all = ", counters.all
    for i in range(0, counters.all):
        print ""
        sys.stdout.write("\r")
        sys.stdout.write("%9d/%-9d " % (i + 1, counters.all))
        sys.stdout.write("\n")
        infoprinter.printPara("URL", g.rel_path[i])

        code = None

        # get resource variable "r"
        if verbal:
            print "g.parent_url[i] = ", g.parent_url[i]
            print "g.url[i] = ", g.url[i]
        try:
            r = resource.Resource(code, g.parent_url[i], g.url[i],
                                  g.is_seed[i], g.hop[i], runconfig.batch, g.anchor_text[i])
        except BadResourceError, e:
            infoprinter.printStatus("URL Parse", "fail")
            counters.addCounter("failed_BadURL")
            continue

        r.crawl_date = g.crawl_date[i]
        r.content_type = g.content_type[i]
        infoprinter.printPara("mime-type", r.content_type)

        # where crawled documents are saved
        # retrieve the local hard copy of the document
        infile = os.path.join(currentPath, runconfig.cdilite["docdir"], g.rel_path[i])

        inpdf = infile  # e.g., filepath/file.pdf
        if "%" in inpdf:
            inpdf = urllib.unquote(inpdf)  # unquote escapes, e.g., %7E -> ~

        # if the document file cannot be found, write into the log and skip it
        inpdfpath = inpdf
        if not os.path.exists(inpdfpath):
            msg = doclogger.generator("FileNotFound", infile, r)
            logging.getLogger("document").info(msg)
            counters.addCounter("failed_FileNotFound")
            infoprinter.printStatus("Document file found", "no")
            continue

        # inpdfpath is the "corrected" file path
        inpdf = inpdfpath
        infoprinter.printStatus("Document file found", "yes")

        # load the pdf file content to calculate the hash
        f = open(inpdf, "r")
        data = f.read()
        f.close()

        # calculate SHA1
        r.content_sha1 = hashlib.sha1(data).hexdigest()

        try:
            # only save the metadata file
            writer.save_met(r, inpdf)
        except IOError, e:
            msg = doclogger.generator("IOErrorSave", infile, r)
            logging.getLogger("document").info(msg)
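# create_instance() is used above (and in the importer below) to build the
# configured log parser, but it is not defined in this section. The sketch
# below assumes it is a small factory that resolves a dotted "module.Class"
# name from the config and instantiates it with the given argument; the
# dotted-name convention is an assumption for illustration only.
import importlib


def create_instance(class_path, *args):
    # e.g., create_instance('logparser.HeritrixLogParser', doclist_path)  (hypothetical names)
    module_name, class_name = class_path.rsplit('.', 1)
    module = importlib.import_module(module_name)
    return getattr(module, class_name)(*args)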
def startup(verbal=True):
    # record start time
    tic = time.time()

    # create on-screen information print object
    infoprinter = printinfo.printInfo()

    # check configurations
    if not checkConfig():
        infoprinter.printStatus('Configuration check', 'fail')
        raise SystemExit("Change your configurations in runconfig.py")
    else:
        infoprinter.printStatus('Configuration check', 'ok')

    # create document writer
    writer = output.CiteSeerWriter([runconfig.outputdir, runconfig.crawler])

    # create URL filter
    urlfilter = urlfilters.URLFilter(blacklistfile=runconfig.blacklistfile,
                                     domainblacklistfile=runconfig.domainblacklistfile)

    # create document type filter
    mimetypefilter = Mime_Type_Filter(runconfig.allow_doc_type)

    # create document content filter
    doccontentfilter = filter_doc.Doc_Content_Filter(runconfig.tempdir)

    # create text extractor
    textextractor = textextract.Text_Extractor()

    # create document logger (for this middleware)
    doclogger = Doc_Logger(os.getenv('HOSTNAME'), mimetypefilter)

    # create general log configers and config logs
    logconfiger = Log_Configer()
    logconfiger.config_loggers()

    # parse log file
    print 'parsing log file...'
    g = create_instance(runconfig.logparser, runconfig.logfile)
    g.extract_info(logsummaryfile=runconfig.logsummaryfile,
                   skip=runconfig.skip, nloglines=runconfig.nloglines)
    print 'parsing log file finished'

    # number counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', g.nline['parsed'])
    counters.newCounter('saved_New')
    counters.newCounter('saved_Duplicate')
    counters.newCounter('filtered')
    counters.newCounter('filtered_URLFilter')
    counters.newCounter('filtered_MimetypeFilter')
    counters.newCounter('filtered_DocContentFilter')
    counters.newCounter('failed')
    counters.newCounter('failed_TextExtract')
    counters.newCounter('failed_FileNotFound')     # if inputs are pdf/ps
    counters.newCounter('failed_PDFFileNotFound')  # if inputs are gzipped
    counters.newCounter('failed_BadURL')           # bad URL
    counters.newCounter('failed_SaveError')        # if an error occurs when saving docs

    # create the output directory if it does not exist
    if not os.path.exists(runconfig.outputdir):
        os.makedirs(runconfig.outputdir)

    # create the temp directory if it does not exist
    if not os.path.exists(runconfig.tempdir):
        os.makedirs(runconfig.tempdir)

    # a mapping file is automatically generated if we only export files
    # (no db input)
    if runconfig.toggle_save_doc_separate:
        open(runconfig.tempdir + 'mapping.csv', 'w')

    # if required to visit the database, make sure that the database and tables
    # are created
    if runconfig.toggle_save_to_db:
        cdb = crawldb.CrawlDB()
        # print database name
        infoprinter.printPara('Database name', cdb.dbname)
        # create document and parent tables if they do not exist
        cdb.createTables()

    # save the current path
    savedPath = os.getcwd()

    # loop over each information tuple extracted from the crawler log file
    for i in range(0, counters.all):
        print ''
        sys.stdout.write("\r")
        sys.stdout.write("%9d/%-9d " % (i + 1, counters.all))
        sys.stdout.write("\n")
        infoprinter.printPara('URL', g.url[i])

        # apply the URL filter
        if runconfig.toggle_urlfilter:
            if not urlfilter.check(g.url[i]):
                msg = "%s %s %s" % ('URLRejected', urlfilter.rejectreason, g.url[i])
                logging.getLogger('document').info(msg)
                counters.addCounter('filtered_URLFilter')
                if verbal:
                    infoprinter.printStatus('URL accepted', 'no')
                continue

        # get resource variable "r"
        try:
            code = None
            r = resource.Resource(code, g.parent_url[i], g.url[i],
                                  g.is_seed[i], g.hop[i], runconfig.batch, g.anchor_text[i])
        except BadResourceError, e:
            infoprinter.printStatus('URL Parse', 'fail')
            counters.addCounter('failed_BadURL')
            continue  # url length cannot be longer th

        r.crawl_date = g.crawl_date[i]
        r.content_type = g.content_type[i]
        infoprinter.printPara('mime-type', r.content_type)

        # where crawled documents are saved
        # retrieve the local hard copy of the document
        # If files are downloaded using "lftp", the input file path is
        # constructed by appending the relative file path to "conf.inputdir"
        if runconfig.crawler.lower() == 'lftp':
            infile = runconfig.inputdir + g.rel_path[i]
        elif runconfig.crawler.lower() == 'heritrix' and runconfig.saver.lower() == 'mirror':
            infile = runconfig.inputdir + r.host + r.path
        else:
            infile = runconfig.inputdir + g.rel_path[i]

        # apply the mime type filter, which checks the document mime type
        mimetypefilter_ok = mimetypefilter.check(r)
        if not mimetypefilter_ok:
            msg = doclogger.generator('DocumentTypeNotAccepted', infile, r)
            logging.getLogger('document').info(msg)
            counters.addCounter('filtered_MimetypeFilter')
            if verbal:
                infoprinter.printStatus('Accepted document type', 'no')
            continue
        else:
            if verbal:
                infoprinter.printStatus('Accepted document type', 'yes')
        r.ext = mimetypefilter.ext

        # check if the document is already in the db:
        # if it returns False, continue to the next step;
        # if it returns True, log it and skip processing this one.
        # However, if the overwrite_file toggle is set, we continue to the
        # next step anyway
        if runconfig.toggle_save_to_db:
            recordExist = cdb.checkRecord(runconfig.dbt_document, md5=r.md5)
            if not recordExist:
                infoprinter.printStatus('New document', 'yes')
            else:
                msg = doclogger.generator('saved_Duplicate', infile, r)
                logging.getLogger('document').info(msg)
                counters.addCounter('saved_Duplicate')
                infoprinter.printStatus('New document', 'no')
                if not runconfig.overwrite_file:
                    continue

        # check the existence of the input file; if the name part of "infile"
        # contains wild card characters, e.g., %, try to recover it to normal.
        # "infile" is the original full file path from the crawl log (may contain
        # escape characters and may be in zipped format)
        # "inpdf" contains the original file name saved on disk (no escape characters,
        # and in an acceptable file format, e.g., PDF/postscript)
        # "inpdfpath" contains the corrected path of the input file name, see below.
        # In some cases, url paths are not correctly normalized and need to be
        # corrected. For example, if the last segment does not contain ".", it is
        # taken as a directory and a "/" is added, which is incorrect.
        inpdf = infile  # e.g., filepath/file.pdf
        if '%' in inpdf:
            inpdf = urllib.unquote(inpdf)  # unquote escapes, e.g., %7E -> ~

        # try to remove the last slash from the full path,
        # or try to see if fullpath/index.html exists, maybe that is the file;
        # if the document file still cannot be found, write into the log and skip it
        inpdfpath = inpdf
        if not os.path.exists(inpdfpath):
            inpdfpath = inpdf[:-1]
        if not os.path.exists(inpdfpath):
            inpdfpath = inpdf + 'index.html'
        if not os.path.exists(inpdfpath):
            # try to download the paper using "wget";
            # the downloaded paper is saved to the temporary directory and renamed
            # to "wget.pdf". Note that we just temporarily add an extension
            # of ".pdf", but it may not be a PDF file. If it is not,
            # it will be filtered out by the doc type filter later.
            # Add quotes to the url.
            # If the download is not successful, we mark this document as
            # "FileNotFound"
            wgeturl = '"' + r.url + '"'
            wgetfile = os.path.join(runconfig.tempdir, "wget." + r.ext)
            wgetcmd = "wget " + wgeturl + " -O " + wgetfile

            # first remove the existing "wget.pdf" if it exists
            if os.path.exists(wgetfile):
                rmcmd = "rm -rf " + wgetfile
                cmdoutput = commands.getoutput(rmcmd)

            # download the document using "wget"; time out is 5 min
            cmdoutput = timeoutpython.run(wgetcmd, shell=True, timeout=300)

            # check if the file downloaded successfully;
            # if the call returns "-9", the download timed out and failed
            if (not os.path.exists(wgetfile)) or (cmdoutput[0] == -9):
                msg = doclogger.generator('FileNotFound', infile, r)
                logging.getLogger('document').info(msg)
                counters.addCounter('failed_FileNotFound')
                if verbal:
                    infoprinter.printStatus('Document file found', 'no')
                    infoprinter.printPara('infile', infile)
                continue
            else:
                inpdfpath = wgetfile

        # inpdfpath is the "corrected" file path
        inpdf = inpdfpath
        if verbal:
            infoprinter.printStatus('Document file found', 'yes')
            infoprinter.printPara('Document file path', inpdf)

        # If the input file is in zipped format (assuming it is a .tar.gz file),
        # we do the following things:
        # * copy the .tar.gz file to a temp directory
        # * decompress it using tar -xvzf
        # * find the .pdf file inside the unzipped files
        # * do whatever we want ...
        # * remove everything in the temp directory
        cmd_file = 'file -i "' + infile + '"'
        cmdoutput = commands.getoutput(cmd_file)
        if 'application/x-gzip' in cmdoutput:
            infoprinter.printStatus('MIME-type', 'application/x-gzip')
            cmd_rm = 'rm -rf ' + runconfig.tempdir + '*'
            cmdoutput = commands.getoutput(cmd_rm)
            cmd_cp = 'cp "' + infile + '" ' + runconfig.tempdir
            cmdoutput = commands.getoutput(cmd_cp)

            # sometimes, for some (unknown) reason, the "-C" option
            # does not work well for the "tar" command, so we cd to the
            # temp directory, extract files from the .tar.gz, and return
            # to the main directory
            #
            # obtain the file name from the full path: infilename
            infilename = os.path.split(infile)[1]
            os.chdir(runconfig.tempdir)
            cmd_tar = 'tar -xvzf "' + infilename + '"'
            cmdoutput = commands.getoutput(cmd_tar)
            os.chdir(savedPath)

            # only look for pdf files
            inpdffound = False
            for root, dirs, files in os.walk(runconfig.tempdir):
                for f in files:
                    if f.endswith('pdf'):
                        inpdf = os.path.join(root, f)
                        inpdffound = True
                        break
                if inpdffound:
                    break
            if not inpdffound:
                msg = doclogger.generator('PDFFileNotFound', infile, r)
                logging.getLogger('document').info(msg)
                counters.addCounter('failed_PDFFileNotFound')
                infoprinter.printStatus('PDF Document file found', 'no')
                continue

        # the document file is found;
        # check if we need to use the document content filter
        if runconfig.toggle_doc_content_filter:
            # extract text from the document
            filefmt = mimetypefilter.doctype
            if verbal:
                infoprinter.printPara('Mime type', filefmt)
            # acceptable formats: e.g., "application/pdf", "application/postscript"
            textextractmsg = textextractor.extract(inpdf, filefmt)

            # classify the document if text is extracted successfully
            if 'Success' in textextractmsg:
                infoprinter.printStatus('Extract text', 'success')
                # not a paper: log it and proceed to the next document
                if doccontentfilter.Decider(textextractor.outtxtfile, inpdf) == -1:
                    counters.addCounter('filtered_DocContentFilter')
                    msg = doclogger.generator('NotAcademic', infile, r)
                    logging.getLogger('document').info(msg)
                    infoprinter.printStatus('Accepted document content', 'no')
                    continue
                else:
                    infoprinter.printStatus('Accepted document content', 'yes')
            else:
                # text extraction failed: report the error and write it into the log file
                infoprinter.printStatus('Extract text', 'fail')
                counters.addCounter('failed_TextExtract')
                msg = doclogger.generator(textextractmsg, infile, r)
                logging.getLogger('document').info(msg)
                continue

        # determine the FINAL mime type of this document: if it is
        # "application/pdf", use ".pdf" as the extension; if it is
        # "application/postscript", use ".ps" as the extension.
        # "inpdf" is the final pdf file to be accepted (after re-download, after
        # filters)
        if mimetypefilter.doctype == 'application/pdf':
            r.ext = 'pdf'
        elif mimetypefilter.doctype == 'application/postscript':
            r.ext = 'ps'
        else:
            cmd_file = 'file -i "' + inpdf + '"'
            cmdoutput = commands.getoutput(cmd_file)
            if 'application/postscript' in cmdoutput:
                r.ext = 'ps'
            else:
                infoprinter.printStatus('Recognizable mimetype', 'no')
                sys.exit(cmdoutput)

        # write document information into the database
        # (database settings can be found in settings.py):
        # read the file content and calculate the SHA1 value.
        # In some cases, the actual PDF was downloaded but the URL ends with a
        # slash, for example
        # dial.academielouvain.be/vital/access/services/Download/boreal:12685/PDF_01/
        # The downloaded file is renamed "index.html" though it is a PDF file. In this
        # case, we try "inpdf/index.html" to see if we can actually identify this file.
        # If this does not work, it could be that Heritrix downloaded the file as "PDF_01";
        # this happens for the URL below, where the actual file is named "75" under the
        # 78/ directory:
        # www.br-ie.org/pub/index.php/rbie/article/viewFile/78/75/
        #
        # If we still cannot find any file, we have to skip it
        try:
            f = open(inpdf, 'r')
            data = f.read()
            f.close()
        except IOError:
            # just remove the last "slash"
            try:
                f = open(inpdf[:-1], 'r')
                data = f.read()
                f.close()
            except IOError:
                try:
                    f = open(inpdf + 'index.html', 'r')
                    data = f.read()
                    f.close()
                except IOError:
                    msg = doclogger.generator('FileNotFound', infile, r)
                    logging.getLogger('document').info(msg)
                    counters.addCounter('failed_FileNotFound')
                    infoprinter.printStatus('Document file found', 'no')
                    continue

        # If required to save crawled documents separately,
        # do not save to the db; only save the document to outputdir.
        # Files are named using numbers starting from 1.
        # A mapping file is automatically generated.
        filenamebody = id_to_fname(i + 1, r.ext)
        outdoc = runconfig.outputdir + filenamebody
        if runconfig.toggle_save_doc_separate:
            mappingline = outdoc + ',' + infile  # may not be inpdf
            ff = open(outdoc, 'w')
            ff.write(data)
            ff.close()
            try:
                f = open(outdoc)
                msg = doclogger.generator('saved_New', infile, r)
                logging.getLogger('document').info(msg)
                infoprinter.printStatus('Document saved', 'yes')
                # number of saved documents
                counters.addCounter('saved_New')
            except IOError, e:
                infoprinter.printStatus('Document saved', 'no')
                raise SystemExit(e)
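# id_to_fname() is referenced above but not defined in this section. The sketch
# below shows the assumed behavior (a zero-padded sequential file name such as
# "000000001.pdf"); the exact padding width and naming scheme are assumptions
# for illustration, not the confirmed implementation.
def id_to_fname(docid, ext):
    # e.g., id_to_fname(1, 'pdf') -> '000000001.pdf'  (assumed format)
    return "%09d.%s" % (docid, ext)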
def startup(verbose=False):
    # create on-screen information print object
    infoprinter = printinfo.printInfo()

    # define counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', 0)
    counters.newCounter('healthy')
    counters.newCounter('inrepo')

    # create the output directory if it does not exist
    if not os.path.exists(dochealthcheck_config.outputdir):
        os.makedirs(dochealthcheck_config.outputdir)

    # create database object
    cdb = crawldb.CrawlDB()

    # print database name
    infoprinter.printPara('Database name', cdb.dbname)

    # create lock object
    update_doc_lock = threading.Lock()
    try:
        update_doc_lock.acquire()
        cursor = connection.cursor()
        # select documents to check
        dbquery = "SELECT id FROM " + dochealthcheck_config.dbt_name + " WHERE submission_id=-2"
        print dbquery
        cursor.execute(dbquery)
        rows = cursorutils.dictfetchall(cursor)
        if not rows:
            recordExist = False
            infoprinter.printPara('Number of records', str(0))
            return
        else:
            recordExist = True
            infoprinter.printPara('Number of records', str(len(rows)))
            ids = rows
    finally:
        update_doc_lock.release()

    # open the document size file for writing
    f_docsize = open(dochealthcheck_config.outputdir + dochealthcheck_config.f_docsize, 'w')
    f_docsize.write('crawlid byte\n')

    # open the unhealthy document file for writing
    f_unhealthdoc = open(dochealthcheck_config.outputdir + dochealthcheck_config.f_unhealthdoc, 'w')
    f_unhealthdoc.write('unhealth_crawlid\n')

    # start checking each file
    counters.setCounter('all', len(ids))
    ids_unhealth = []
    for id in ids:
        # construct the full document path from the document ID
        infile = dochealthcheck_config.inputdir + idtopath(id['id'])

        # check if the file exists
        if not os.path.exists(infile):
            infoprinter.printStatus('file exists', 'no')
            continue
        counters.addCounter('inrepo')

        # check the file size in bytes
        statinfo = os.stat(infile)
        s = str(id['id']) + ' ' + str(statinfo.st_size)
        f_docsize.write(s + '\n')

        # check the file type
        cmd_file = 'file -i "' + infile + '"'
        cmdoutput = commands.getoutput(cmd_file)
        if verbose:
            print cmdoutput

        # check each accepted document; documents whose mime types are not
        # in the accepted mime types are identified as "unhealthy"
        healthy = False
        for am in dochealthcheck_config.accepted_mimes:
            if am in cmdoutput:
                healthy = True
                print 'document is healthy', id['id']
                counters.addCounter('healthy')
                break
        if healthy:
            continue

        print "unhealthy document: ", id['id']
        # write the unhealthy document ID to the output file
        f_unhealthdoc.write(str(id['id']) + '\n')
        ids_unhealth.append(id['id'])

        # delete the file folder from the repository
        if dochealthcheck_config.toggle_delete_from_repo:
            infiledir = os.path.dirname(infile)
            cmd_repo = 'rm -rf ' + infiledir
            cmd_repo_output = commands.getoutput(cmd_repo)
            if not os.path.exists(infiledir):
                infoprinter.printStatus(cmd_repo, 'OK')
            else:
                infoprinter.printStatus(cmd_repo, 'FAIL')
                return

        # delete records from the database
        if dochealthcheck_config.toggle_delete_from_db:
            # delete the record from the database
            cmd_db = 'DELETE FROM ' + dochealthcheck_config.dbt_name + ' WHERE id=' + str(id['id'])
            print cmd_db
            cursor.execute(cmd_db)

    # close files
    f_docsize.close()
    f_unhealthdoc.close()

    # commit all transactions after looping over all documents
    if dochealthcheck_config.toggle_delete_from_db:
        transaction.commit_unless_managed()

    # print out counters
    counters.printCounter()
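# idtopath() maps a numeric crawl document ID to its relative path inside the
# document repository. The repository layout is not shown in this section, so
# the sketch below (splitting a zero-padded ID into nested directories with a
# .pdf file at the leaf) is an assumption for illustration only, not the
# confirmed layout.
def idtopath(docid):
    # e.g., idtopath(1234567) -> '001/234/567/001234567/001234567.pdf'  (assumed layout)
    s = "%09d" % docid
    return "%s/%s/%s/%s/%s.pdf" % (s[0:3], s[3:6], s[6:9], s, s)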
def startup():
    # record start time
    tic = time.time()

    # create on-screen information print object
    infoprinter = printinfo.printInfo()

    # check configurations
    if not checkConfig():
        infoprinter.printStatus('Configuration check', 'fail')
        raise SystemExit("Change your configurations in runconfig.py")
    else:
        infoprinter.printStatus('Configuration check', 'ok')

    # create exporter
    exporter = output.CiteSeerExporter([runconfig.cde["outputdir"], runconfig.crawlrepo])

    # create crawldb
    cdb = crawldb.CrawlDB()

    # create general log configers and config logs
    logconfiger = Log_Configer()
    logconfiger.config_loggers()

    # process the DB query; exit if ids is empty
    dbquery = runconfig.cde["dbquery"]
    ids = cdb.queryDocID(dbquery)
    infoprinter.printPara('#docid', str(len(ids)))
    if not ids:
        infoprinter.printStatus('DB query', 'fail')
        sys.exit()

    # number counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', len(ids))
    counters.newCounter('copied')

    # export each queried document
    if runconfig.cde["toggle_export"]:
        i = 0
        for id in ids:
            i = i + 1
            print "%9d/%-9d : %9d" % (i, counters.all, id)
            if exporter.doc_export(id):
                counters.addCounter('copied')
            else:
                infoprinter.printStatus(str(id), 'fail')

    counters.printCounter()
    counters.printCountertoFile(runconfig.cde["summaryfile"])

    # record end time to calculate processing time
    # because strftime() truncates the time string when converting to the
    # user-defined time format, we add one second to compensate for this loss
    toc = time.time()
    processingtime = time.strftime('%H:%M:%S', time.gmtime(toc - tic + 1))
    infoprinter.printPara('Processing time', processingtime)
def startup():
    # define constant parameters
    datestr = datetime.now().strftime("%Y-%m-%d")
    output_file = 'whitelist.' + datestr + '.csv'
    socket.setdefaulttimeout(runconfig.whitelistgen['sockettimeout'])

    # create on-screen information print object
    infoprinter = printinfo.printInfo()

    # print database name
    infoprinter.printPara('DATABASE', settings.DATABASES['default']['NAME'])

    # generate other parameters
    headers = {"User-Agent": runconfig.whitelistgen['user_agent']}

    # create general log configers and config logs
    logconfiger = LogConfiger()
    logconfiger.config_loggers(runconfig.general_logs)

    # create document logger
    doclogger = DocLogger()

    # load the blacklist and remove the trailing '\n'
    blacklisturlstrail = file(runconfig.blacklistfile).readlines()
    blacklisturls = []
    for url in blacklisturlstrail:
        blacklisturls.append(url.strip('\n'))

    # load the domain blacklist file and remove the trailing '\n'
    domainblacklisttrail = file(runconfig.domainblacklistfile).readlines()
    domainblacklisturls = []
    for durl in domainblacklisttrail:
        domainblacklisturls.append(durl.strip('\n'))

    # number counters
    counters = counter.Counter()
    counters.newCounter('all')
    # number of accepted URLs
    counters.newCounter('accepted')

    # ids includes main_crawl_parenturl IDs and main_crawl_submission IDs
    # dbtable = 'parenturl' or 'submission'
    # urls includes urls from main_crawl_parenturl and main_crawl_submission
    ids = []
    dbtable = []
    urls = []

    # retrieve all parent urls
    cursor = connection.cursor()
    dbquery = 'SELECT id,url FROM main_crawl_parenturl;'
    cursor.execute(dbquery)
    rows = cursor.fetchall()
    for row in rows:
        ids.append(row[0])
        dbtable.append('parenturl')
        urls.append(row[1])
    infoprinter.printPara('#parentURLs', str(len(ids)))

    # retrieve all submitted urls
    dbquery = 'SELECT id,url FROM main_crawl_submission;'
    cursor.execute(dbquery)
    rows = cursor.fetchall()
    for row in rows:
        ids.append(row[0])
        dbtable.append('submission')
        urls.append(row[1])
    infoprinter.printPara('#submittedURLs', str(len(ids)))
    infoprinter.printPara('TOTAL #candidate urls', str(len(ids)))
    counters.setCounter('all', len(ids))

    # output results into the result file
    recordWriter = csv.writer(open(output_file, 'wb'), delimiter=',',
                              quotechar='"', quoting=csv.QUOTE_NONE)

    # loop over all urls in the parent url list and check the following
    # requirements:
    # (1) if the url host matches any host names in the blacklist
    # (2) if there are any document urls found in the main_crawl_document table
    # (3) if the url is currently alive
    # (4) write into the log
    # (5) write into the output file, which should contain the following fields:
    #     (*) id of this parent url in the main_crawl_parenturl table
    #     (*) number of document urls found in the main_crawl_document table
    #     (*) host name of this parent URL string
    #     (*) parent url string
    #     fields are enclosed by double quotes and separated by commas
    #
    # The log contains the following fields (logging will slow down the
    # process and should be disabled after debugging and testing):
    # (1) id of this parent url in the main_crawl_parenturl table
    # (2) host name of this parent url
    # (3) whether this parent url host matches any urls in the blacklist
    # (4) number of document urls found in the main_crawl_document table
    # (5) if the url is currently alive
    # (6) parent url string
    record_tuples = []    # the sorted results
    n_documenturls = []   # the number of documents
    for id, dbt, url in zip(ids, dbtable, urls):
        print ''
        print '%(#)9s - %(s)9s - %(url)s' % {'#': id, 's': dbt, 'url': url}
        scheme, host, path, query, fragment = urlsplit(url)

        # check the url structure: it must be complete
        if (not scheme) or (not host):
            infoprinter.printStatus('complete url', 'no')
            continue

        # generate the "parent_url" object for logging and final output
        # parent_url.url_is_alive = 0
        # parent_url.n_documenturl = 0
        # parent_url.pass_blacklist_check = 1
        parent_url = ParentUrl(id, dbt, url, 0, 0, host, 1)

        # CHECK A0: does the host match any domain in the domain blacklist?
        for durl in domainblacklisturls:
            if host.endswith(durl):
                parent_url.pass_blacklist_check = 0
                break
        if not parent_url.pass_blacklist_check:
            infoprinter.printStatus('Blacklist check', 'fail')
            # save into the log
            msg = doclogger.generator_url(parent_url)
            logging.getLogger('whitelistgen').info(msg)
            continue

        # CHECK A1: does the host match any host in the blacklist?
        if host in blacklisturls:
            infoprinter.printStatus('Blacklist check', 'fail')
            parent_url.pass_blacklist_check = 0
            # save into the log
            msg = doclogger.generator_url(parent_url)
            logging.getLogger('whitelistgen').info(msg)
            continue

        # passed the blacklist checks
        infoprinter.printStatus('Blacklist check', 'pass')

        # CHECK B: number of document urls found in the main_crawl_document table;
        # we only check URLs in the parenturl table
        if dbt == 'parenturl':
            dbquery = """SELECT id FROM main_crawl_parenturl WHERE url=%s"""
            cursor.execute(dbquery, (url,))
            rows = cursor.fetchall()
            parentid = rows[0][0]
            infoprinter.printStatus(dbquery, 'ok')

            dbquery = 'SELECT count(*) FROM main_crawl_document WHERE state=1 and parent_id = %(#)s' % {'#': parentid}
            cursor.execute(dbquery)
            rows = cursor.fetchall()
            parent_url.n_documenturl = rows[0][0]
            infoprinter.printStatus(dbquery, 'ok')

            if not parent_url.n_documenturl:
                infoprinter.printStatus('Ingestable document links check', 'fail')
                # save into the log
                msg = doclogger.generator_url(parent_url)
                logging.getLogger('whitelistgen').info(msg)
                continue
            else:
                infoprinter.printStatus('Ingestable document links check', 'pass')
        else:
            parent_url.n_documenturl = 9999
            infoprinter.printStatus('User submitted URL', 'yes')
            infoprinter.printStatus('Ingestable document links check', 'pass')

        # CHECK C: is the url alive?
        if runconfig.whitelistgen['checkurlalive']:
            infoprinter.printStatus('Check URL is alive', 'running')
            parent_url.url_is_alive = checkURLalive(url)
        else:
            # assume the URL is alive if we do not check
            parent_url.url_is_alive = 1
        if not parent_url.url_is_alive:
            infoprinter.printStatus('Check URL is alive', 'fail')
            # save into the log
            msg = doclogger.generator_url(parent_url)
            logging.getLogger('whitelistgen').info(msg)
            continue
        else:
            infoprinter.printStatus('Check URL is alive', 'pass')

        # save into the log
        msg = doclogger.generator_url(parent_url)
        logging.getLogger('whitelistgen').info(msg)

        # save selected urls into a tuple list before sorting them
        if parent_url.pass_blacklist_check and parent_url.url_is_alive and parent_url.n_documenturl:
            record_tuple = (id, dbt, url, parent_url.url_is_alive,
                            parent_url.n_documenturl, host, parent_url.pass_blacklist_check)
            record_tuples.append(record_tuple)
            infoprinter.printStatus('URL included in whitelist', 'yes')
            counters.addCounter('accepted')

    # sort results by the number of documents; user submitted documents are at the top
    record_tuples_sort = sorted(record_tuples, key=itemgetter(4), reverse=True)
    for r in record_tuples_sort:
        parent_url = ParentUrl(r[0], r[1], r[2], r[3], r[4], r[5], r[6])
        record = doclogger.generator_record(parent_url)
        recordWriter.writerow(record)

    # print counters
    counters.printCounter()
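# checkURLalive() is called in CHECK C above but is not defined in this
# section. The sketch below shows the assumed behavior: issue an HTTP request
# with the configured User-Agent, rely on the global socket timeout set in
# startup(), and return 1 if the URL responds, 0 otherwise. This is an
# illustrative assumption, not the confirmed implementation.
import socket
import urllib2


def checkURLalive(url):
    # the socket timeout was already set globally via socket.setdefaulttimeout()
    req = urllib2.Request(url, headers={"User-Agent": runconfig.whitelistgen['user_agent']})
    try:
        urllib2.urlopen(req)
        return 1
    except (urllib2.URLError, socket.error):
        return 0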