Example #1
def checkConfig():
    infoprtr = printinfo.printInfo()

    # input directory exists
    if not os.path.exists(runconfig.inputdir):
        infoprtr.printStatus('inputdir exists','no')
        return False
    else:
        infoprtr.printStatus('inputdir exists','yes')

    # check if repository folder exists
    if not os.path.exists(runconfig.outputdir):
        infoprtr.printStatus('outputdir exists','no')
        return False
    else:
        infoprtr.printStatus('outputdir exists','yes')

    # permission to write into the repository folder
    if os.path.exists(runconfig.outputdir+'9999'):
        shutil.rmtree(runconfig.outputdir+'9999')
    try:
        os.makedirs(runconfig.outputdir+'9999')
        shutil.rmtree(runconfig.outputdir+'9999')
    except OSError,e:
        print e
        infoprtr.printStatus('Write permission to outputdir','no')
        return False
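This snippet stops after the failure branches. For reference, a minimal sketch of the same write-permission probe as a standalone helper, using only the standard library instead of the printInfo reporter; has_write_permission is a hypothetical name, not part of the original module:

import os
import shutil
import tempfile

def has_write_permission(directory):
    # Probe write permission the same way the example does: create a scratch
    # subdirectory inside 'directory' and remove it again.
    if not os.path.isdir(directory):
        return False
    try:
        probe = tempfile.mkdtemp(prefix='permcheck_', dir=directory)
        shutil.rmtree(probe)
        return True
    except OSError:
        return False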
Example #2
def main(jobid, verbose=False):

    # record start time
    tic = time.time()

    # create on-screen information printer
    infoprinter = printinfo.printInfo()

    # check configurations
    config = Config_global(jobid)
    config.check_config()
    config.open_logw()
    if verbose: print "jobid: %(1)s" % {"1": config.jobid}

    # create the WARC_processor object
    warcproc = WARC_processor(config)

    #   from os import listdir
    warcfiles = warcproc.get_warcs()

    if warcfiles:
        for wf in warcfiles:
            # decompressing before extraction increases speed
            wf_path = unzip_gz(wf,
                               verbose=verbose) if wf.endswith(".gz") else wf
            warcproc.extract_all(wf_path, verbose=verbose)
            # delete after extraction
            if config.delete_after_import: delete_file(wf_path)

    # close log file
    config.close_logw()

    # print counters in a new line
    warcproc.counters.printCounter()
    warcproc.counters.printCountertoFile(config.sum_file)

    # record end time to calculate processing time
    # because strftime() will truncate the time string when converting to the
    # user-defined time format, we add "1" second to compensate this loss.
    toc = time.time()
    processingtime = time.strftime('%H:%M:%S', time.gmtime(toc - tic + 1))
    infoprinter.printPara("warc files", str(len(warcfiles)))
    infoprinter.printPara('Processing time', processingtime)
    print 'end importing from directory: ', config.crawler_dir
    print 'logfile output to: ', config.log_file
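The timing block above adds one second because time.gmtime() discards the fractional part of the elapsed interval before strftime() formats it. A small hypothetical helper, format_elapsed, illustrates the same pattern:

import time

def format_elapsed(tic, toc):
    # gmtime() truncates fractions of a second, so add 1 to compensate,
    # exactly as the examples above do.
    return time.strftime('%H:%M:%S', time.gmtime(toc - tic + 1))

# e.g. format_elapsed(0.0, 3661.4) returns '01:01:02'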
Example #3
def checkConfig():
    infoprtr = printinfo.printInfo()

    # crawl repository exists
    if not os.path.exists(runconfig.crawlrepo):
        infoprtr.printStatus('crawlrepo exists','no')
        return False
    else:
        infoprtr.printStatus('crawlrepo exists','yes')

    # permission to write into the output folder
    testdir = os.path.join(runconfig.cde["outputdir"],'9999')
    if os.path.exists(testdir):
        shutil.rmtree(testdir)
    try:
        os.makedirs(testdir)
        shutil.rmtree(testdir)
    except OSError,e:
        print e
        infoprtr.printStatus('Write permission to outputdir','no')
        return False
Example #4
def checkConfig():
    infoprtr = printinfo.printInfo()

    # crawl repository exists
    if not os.path.exists(runconfig.crawlrepo):
        infoprtr.printStatus('crawlrepo exists', 'no')
        return False
    else:
        infoprtr.printStatus('crawlrepo exists', 'yes')

    # permission to write into the output folder
    testdir = os.path.join(runconfig.cde["outputdir"], '9999')
    if os.path.exists(testdir):
        shutil.rmtree(testdir)
    try:
        os.makedirs(testdir)
        shutil.rmtree(testdir)
    except OSError, e:
        print e
        infoprtr.printStatus('Write permission to outputdir', 'no')
        return False
Example #5
def startup(verbal=False):

    # record start time
    tic = time.time()

    # create on-screen information print object
    infoprinter = printinfo.printInfo()

    # create document writer
    writer = output.CiteSeerWriter([runconfig.cdilite["docdir"], runconfig.cdilite["crawler"]])

    # create document logger (for this middleware)
    doclogger = Doc_Logger(os.getenv("HOSTNAME"))

    # create general log configers and config logs
    logconfiger = Log_Configer()
    logconfiger.config_loggers()

    # parse log file
    g = create_instance(runconfig.cdilite["logparser"], runconfig.cdilite["doclist"])
    g.extract_info(logsummaryfile=runconfig.cdilite["logsummaryfile"])

    # prepare to write xml file
    impl = getDOMImplementation()
    xDoc = impl.createDocument(None, "response", None)
    root = xDoc.documentElement
    root.setAttribute("location", runconfig.cdilite["docdir"])

    # number counter
    counters = counter.Counter()
    counters.newCounter("all")
    counters.setCounter("all", g.nline["parsed"])
    counters.newCounter("failed_BadURL")
    counters.newCounter("failed_FileNotFound")

    # save the current path
    currentPath = os.getcwd()

    # loop over each information tuple extracted from document list file
    # each tuple contains the name of the pdf files
    if verbal:
        print "counters.all = ", counters.all
    for i in range(0, counters.all):
        print ""
        sys.stdout.write("\r")
        sys.stdout.write("%9d/%-9d  " % (i + 1, counters.all))
        sys.stdout.write("\n")
        infoprinter.printPara("URL", g.rel_path[i])

        code = None

        # get resource variable "r"
        if verbal:
            print "g.parent_url[i] = ", g.parent_url[i]
        if verbal:
            print "g.url[i] = ", g.url[i]
        try:
            r = resource.Resource(
                code, g.parent_url[i], g.url[i], g.is_seed[i], g.hop[i], runconfig.batch, g.anchor_text[i]
            )
        except BadResourceError, e:
            infoprinter.printStatus("URL Parse", "fail")
            counters.addCounter("failed_BadURL")
            continue

        r.crawl_date = g.crawl_date[i]
        r.content_type = g.content_type[i]
        infoprinter.printPara("mime-type", r.content_type)

        # where crawled documents are saved
        # retrieve the local hard copy of document
        infile = os.path.join(currentPath, runconfig.cdilite["docdir"], g.rel_path[i])

        inpdf = infile  # e.g., filepath/file.pdf
        if "%" in inpdf:
            inpdf = urllib.unquote(inpdf)  # unquote escapes, e.g., %7E -> ~

        # try to remove the last back slash from the full path
        # or try to see if fullpath/index.html exists, maybe that is the file
        # if document file still cannot be found, write into log and skip it
        inpdfpath = inpdf
        if not os.path.exists(inpdfpath):
            msg = doclogger.generator("FileNotFound", infile, r)
            logging.getLogger("document").info(msg)
            counters.addCounter("failed_FileNotFound")
            infoprinter.printStatus("Document file found", "no")
            # the file is missing, so skip this document as the comment above says
            continue

        # inpdfpath is the "corrected" file path
        inpdf = inpdfpath
        infoprinter.printStatus("Document file found", "yes")

        # load pdf file content to compute its SHA1 digest
        f = open(inpdf, "r")
        data = f.read()
        f.close()

        # calculate SHA1
        r.content_sha1 = hashlib.sha1(data).hexdigest()

        try:
            # only save metadata file
            writer.save_met(r, inpdf)
        except IOError, e:
            msg = doclogger.generator("IOErrorSave", infile, r)
            logging.getLogger("document").info(msg)
Example #6
def startup(verbal=False):

  # record start time 
  tic = time.time()

  # create on-screen information print object
  infoprinter = printinfo.printInfo()

  # create document writer
  writer = output.CiteSeerWriter([runconfig.cdilite['docdir'],runconfig.cdilite['crawler']])

  # create document logger (for this middleware)
  doclogger = Doc_Logger(os.getenv('HOSTNAME'))

  # create general log configers and config logs
  logconfiger = Log_Configer()
  logconfiger.config_loggers()

  # parse log file
  g = create_instance(runconfig.cdilite['logparser'],runconfig.cdilite['doclist'])
  g.extract_info(logsummaryfile=runconfig.cdilite['logsummaryfile'])


  # prepare to write xml file
  impl = getDOMImplementation()
  xDoc = impl.createDocument(None, "response", None)
  root = xDoc.documentElement
  root.setAttribute("location", runconfig.cdilite['docdir'])



  # number counter
  counters = counter.Counter()
  counters.newCounter('all')
  counters.setCounter('all',g.nline['parsed'])
  counters.newCounter('failed_BadURL')
  counters.newCounter('failed_FileNotFound')

  # save the current path 
  currentPath = os.getcwd()

  # loop over each information tuple extracted from document list file 
  # each tuple contains the name of the pdf files
  if verbal: print "counters.all = ",counters.all
  for i in range(0,counters.all):
    print ''
    sys.stdout.write("\r")
    sys.stdout.write("%9d/%-9d  " % (i+1,counters.all))
    sys.stdout.write("\n")
    infoprinter.printPara('URL',g.rel_path[i])

    code = None
    
    # get resource variable "r"
    if verbal: print 'g.parent_url[i] = ',g.parent_url[i]
    if verbal: print 'g.url[i] = ',g.url[i]
    try:
        r = resource.Resource(code,g.parent_url[i],g.url[i],\
            g.is_seed[i],g.hop[i],runconfig.batch,g.anchor_text[i])
    except BadResourceError,e:
        infoprinter.printStatus('URL Parse','fail')
        counters.addCounter('failed_BadURL')
        continue

    r.crawl_date = g.crawl_date[i]
    r.content_type = g.content_type[i]
    infoprinter.printPara('mime-type',r.content_type)

    # where crawled documents are saved
    # retrieve the local hard copy of document
    infile = os.path.join(currentPath,runconfig.cdilite['docdir'],g.rel_path[i])

    inpdf = infile # e.g., filepath/file.pdf 
    if '%' in inpdf: 
      inpdf = urllib.unquote(inpdf) #unquote escapes, e.g., %7E -> ~

    # try to remove the last back slash from the full path 
    # or try to see if fullpath/index.html exists, maybe that is the file
    # if document file still cannot be found, write into log and skip it
    inpdfpath = inpdf
    if not os.path.exists(inpdfpath):
        msg = doclogger.generator('FileNotFound',infile,r)
        logging.getLogger('document').info(msg)
        counters.addCounter('failed_FileNotFound')
        infoprinter.printStatus('Document file found','no')
        # the file is missing, so skip this document as the comment above says
        continue

    # inpdfpath is the "corrected" file path
    inpdf = inpdfpath
    infoprinter.printStatus('Document file found','yes')

    # load pdf file content to compute its SHA1 digest
    f = open(inpdf,'r')
    data = f.read()
    f.close()

    # calculate SHA1
    r.content_sha1 = hashlib.sha1(data).hexdigest() 
  
    try:
        # only save metadata file
        writer.save_met(r,inpdf) 
    except IOError,e:
        msg = doclogger.generator('IOErrorSave',infile,r)
        logging.getLogger('document').info(msg)
Example #7
def startup(verbal=True):

  # record start time 
  tic = time.time()

  # create on-screen information print object
  infoprinter = printinfo.printInfo()

  # check configurations
  if not checkConfig():
    infoprinter.printStatus('Configuration check','fail')
    raise SystemExit("Change your configurations in runconfig.py")
  else:
    infoprinter.printStatus('Configuration check','ok')

  # create document writer
  writer = output.CiteSeerWriter([runconfig.outputdir,runconfig.crawler])

  # create URL filter
  urlfilter = urlfilters.URLFilter(blacklistfile=runconfig.blacklistfile,domainblacklistfile=runconfig.domainblacklistfile)

  # create document type filter
  mimetypefilter = Mime_Type_Filter(runconfig.allow_doc_type)

  # create document content filter
  doccontentfilter = filter_doc.Doc_Content_Filter(runconfig.tempdir)

  # create text extractor 
  textextractor = textextract.Text_Extractor()

  # create document logger (for this middleware)
  doclogger = Doc_Logger(os.getenv('HOSTNAME'),mimetypefilter)

  # create general log configers and config logs
  logconfiger = Log_Configer()
  logconfiger.config_loggers()

  # parse log file
  print 'parsing log file...'
  g = create_instance(runconfig.logparser,runconfig.logfile)
  g.extract_info(logsummaryfile=runconfig.logsummaryfile,skip=runconfig.skip,nloglines=runconfig.nloglines)
  print 'parsing log file finished'

  # number counter
  counters = counter.Counter()
  counters.newCounter('all')
  counters.setCounter('all',g.nline['parsed'])
  counters.newCounter('saved_New') 
  counters.newCounter('saved_Duplicate')
  counters.newCounter('filtered')
  counters.newCounter('filtered_URLFilter')
  counters.newCounter('filtered_MimetypeFilter')
  counters.newCounter('filtered_DocContentFilter')
  counters.newCounter('failed')
  counters.newCounter('failed_TextExtract')
  counters.newCounter('failed_FileNotFound')    # if inputs are pdf/ps
  counters.newCounter('failed_PDFFileNotFound') # if inputs are gzipped
  counters.newCounter('failed_BadURL') 		# Bad URL
  counters.newCounter('failed_SaveError')	# if error occurs when saving docs

  # create output directory if it does not exist
  if not os.path.exists(runconfig.outputdir):
      os.makedirs(runconfig.outputdir)

  # create temp directory if it does not exist
  if not os.path.exists(runconfig.tempdir):
      os.makedirs(runconfig.tempdir)

  # a mapping file is automatically generated if only export files 
  # (no db input) 
  if runconfig.toggle_save_doc_separate:
    open(runconfig.tempdir+'mapping.csv','w')

  # if required to visit database, make sure that database and tables 
  # are created
  if runconfig.toggle_save_to_db:
    cdb = crawldb.CrawlDB()
    # print database name
    infoprinter.printPara('Database name',cdb.dbname)
    # create document and parent table if they do not exist
    cdb.createTables()

  # save the current path 
  savedPath = os.getcwd()

  # loop over each information tuple extracted from crawler log file 
  for i in range(0,counters.all):
    print ''
    sys.stdout.write("\r")
    sys.stdout.write("%9d/%-9d  " % (i+1,counters.all))
    sys.stdout.write("\n")
    infoprinter.printPara('URL',g.url[i])

    # apply the URL filter
    if runconfig.toggle_urlfilter:
        if not urlfilter.check(g.url[i]):
            msg = "%s %s %s" % ('URLRejected',urlfilter.rejectreason,g.url[i])
            logging.getLogger('document').info(msg)
            counters.addCounter('filtered_URLFilter')
            if verbal: infoprinter.printStatus('URL accepted','no')
            continue
    
    # get resource variable "r"
    try:
        code = None
        r = resource.Resource(code,g.parent_url[i],g.url[i],\
            g.is_seed[i],g.hop[i],runconfig.batch,g.anchor_text[i])
    except BadResourceError,e:
        infoprinter.printStatus('URL Parse','fail')
        counters.addCounter('failed_BadURL')
        continue

    # URL length cannot be longer than the maximum allowed
    r.crawl_date = g.crawl_date[i]
    r.content_type = g.content_type[i]
    infoprinter.printPara('mime-type',r.content_type)

    # where crawled documents are saved
    # retrieve the local hard copy of document
    # If files are downloaded using "lftp", input file path should be 
    # constructed by appending the relative file path to "conf.inputdir"
    if runconfig.crawler.lower() == 'lftp':
        infile = runconfig.inputdir+g.rel_path[i]   
    elif runconfig.crawler.lower() == 'heritrix' and runconfig.saver.lower() == 'mirror':
        infile = runconfig.inputdir+r.host+r.path   
    else: 
        infile = runconfig.inputdir+g.rel_path[i]

    # apply doctype_filter, which checks the document mimetype
    mimetypefilter_ok = mimetypefilter.check(r)
    if not mimetypefilter_ok: 
      msg = doclogger.generator('DocumentTypeNotAccepted',infile,r)
      logging.getLogger('document').info(msg)
      counters.addCounter('filtered_MimetypeFilter')
      if verbal: infoprinter.printStatus('Accepted document type','no')
      continue
    else:
      if verbal: infoprinter.printStatus('Accepted document type','yes')

    r.ext = mimetypefilter.ext

    # check if document is already in db
    # if it returns False, continue to next step
    # if it returns True,log it and skip processing this one
    # However, if the overwrite_file toggle is set, we need to continue to the
    # next step anyway
    if runconfig.toggle_save_to_db:
        recordExist = cdb.checkRecord(runconfig.dbt_document,md5=r.md5)
        if not recordExist:
            infoprinter.printStatus('New document','yes')
        else:
            msg = doclogger.generator('saved_Duplicate',infile,r)
            logging.getLogger('document').info(msg)
            counters.addCounter('saved_Duplicate')
            infoprinter.printStatus('New document','no')
            if not runconfig.overwrite_file:
                continue
   
    # check existence of input file, if the name part of "infile" 
    # contains wild card characters e.g., %, 
    # try to recover it to normal 
    # "infile" is the original full file path from crawl log (may contain 
    # escape characters and may by in zipped format) 
    # "inpdf" contains original file names saved in disk (no escape characters, 
    # and in acceptable file format, e.g., PDF/postscript)
    # "inpdfpath" contains the correct path of input file name, see below. in 
    # some cases, url paths are not correctly normalized 
    # and need to be corrected. For example, if the last segment does not 
    # contain ".", it is taken as a directory and a "/" is 
    # added, while this is incorrect. 
    inpdf = infile # e.g., filepath/file.pdf 
    if '%' in inpdf: 
      inpdf = urllib.unquote(inpdf) #unquote escapes, e.g., %7E -> ~

    # try to remove the trailing slash from the full path
    # or try to see if fullpath/index.html exists, maybe that is the file
    # if the document file still cannot be found, write into the log and skip it
    inpdfpath = inpdf
    if not os.path.exists(inpdfpath):
        inpdfpath = inpdf[:-1]
        if not os.path.exists(inpdfpath):
            inpdfpath = inpdf+'index.html'
            if not os.path.exists(inpdfpath):
                # try to download the paper using "wget"
                # the downloaded paper is saved to a temporary directory and
                # renamed to "wget.pdf". Note that we just temporarily add an
                # extension of ".pdf", but it may not be a PDF file. If it is
                # not, it will be filtered out by the doc_type_filter later.
                # add quotes to the url
                # if the download is not successful, we mark this document as
                # "FileNotFound"
                wgeturl = '"'+r.url+'"'
                wgetfile = os.path.join(runconfig.tempdir,"wget."+r.ext)
                wgetcmd = "wget "+wgeturl+" -O "+wgetfile

                # first remove the existing "wget.pdf" if it exists
                if os.path.exists(wgetfile):
                    rmcmd = "rm -rf "+wgetfile
                    cmdoutput = commands.getoutput(rmcmd)
                # download the document using "wget"; the timeout is 5 minutes
                cmdoutput = timeoutpython.run(wgetcmd, shell=True, timeout=300)
                # if the call returns "-9", the download failed; skip this doc
                #if cmdoutput[0] == -9:
                #    print cmdoutput
                #cmdoutput = commands.getoutput(wgetcmd)
                #print 'cmdoutput = ',cmdoutput

                # check if the file downloaded successfully
                if (not os.path.exists(wgetfile)) or (cmdoutput[0] == -9):
                    msg = doclogger.generator('FileNotFound',infile,r)
                    logging.getLogger('document').info(msg)
                    counters.addCounter('failed_FileNotFound')
                    if verbal:
                        infoprinter.printStatus('Document file found','no')
                        infoprinter.printPara('infile',infile)
                    continue
                else:
                    inpdfpath = wgetfile

    # inpdfpath is the "corrected" file path
    inpdf = inpdfpath
    if verbal:
        infoprinter.printStatus('Document file found','yes')
        infoprinter.printPara('Document file path',inpdf)
    
    # If input file is in zipped format, assuming it is a .tar.gz file
    # we do the following things
    # * copy the .tar.gz file to a temp directory 
    # * decompress it using tar -xvzf 
    # * find the .pdf file inside the unzipped 
    # * do whatever we want ...
    # * remove everything in the temp directory 
    cmd_file = 'file -i "'+infile+'"'
    cmdoutput = commands.getoutput(cmd_file)
    #t = cmdoutput.split(' ')
    #infilemimetype = t[-1]
    #infoprinter.printStatus('MIME-type',infilemimetype)
    #print cmdoutput
    if 'application/x-gzip' in cmdoutput:
      infoprinter.printStatus('MIME-type','application/x-gzip')
      cmd_rm = 'rm -rf '+runconfig.tempdir+'*'
      cmdoutput = commands.getoutput(cmd_rm)

      cmd_cp = 'cp "'+infile+'" '+runconfig.tempdir
      cmdoutput = commands.getoutput(cmd_cp)

      # sometimes, for some (unknown) reasons, the "-C" option
      # does not work well for "tar" command, so we cd to the
      # temp directory, extract files from the .tar.gz and return
      # to the main directory
      #
      # obtain the file name from the full path: infilename
      infilename = os.path.split(infile)[1]
      os.chdir(runconfig.tempdir)
      cmd_tar = 'tar -xvzf "'+infilename+'"'
      cmdoutput = commands.getoutput(cmd_tar)
      os.chdir(savedPath)
  
      # only look for pdf files
      inpdffound = False
      for root,dirs,files in os.walk(runconfig.tempdir):
        for f in files:
          if f.endswith('pdf'):
            inpdf = os.path.join(root,f)
            inpdffound = True
            break
        if inpdffound:
          break
      if not inpdffound:
        msg = doclogger.generator('PDFFileNotFound',infile,r)
        logging.getLogger('document').info(msg)
        counters.addCounter('failed_PDFFileNotFound')
        infoprinter.printStatus('PDF Document file found','no')
        continue
    
    # document file is found
    # check if need to use doc_content_filter
    if runconfig.toggle_doc_content_filter:
      
      # extract text from documents 
      filefmt = mimetypefilter.doctype

      if verbal: infoprinter.printPara('Mime type',filefmt)
      # acceptable formats: e.g., "application/pdf","application/postscript" 
      textextractmsg = textextractor.extract(inpdf,filefmt) 

      # classify document if text is extracted successfully
      if 'Success' in textextractmsg:
          infoprinter.printStatus('Extract text','success')
          # not a paper, log it and proceed it to the next
          if doccontentfilter.Decider(textextractor.outtxtfile,inpdf) == -1:
              counters.addCounter('filtered_DocContentFilter')
              msg = doclogger.generator('NotAcademic',infile,r)
              logging.getLogger('document').info(msg)
              infoprinter.printStatus('Accepted document content','no')
              continue
          else:
              infoprinter.printStatus('Accepted document content','yes')
      else: # text extraction fails, report error and write it into log file
          infoprinter.printStatus('Extract text','fail')
          counters.addCounter('failed_TextExtract')
          msg = doclogger.generator(textextractmsg,infile,r)
          logging.getLogger('document').info(msg)
          continue

    # determine the FINAL mimetype of this document, if it is 
    # "application/pdf", use ".pdf" as the extension, if it is 
    # "application/postscript", use ".ps" as the extension
    # "inpdf" is the final pdf file to be accepted (after re-download, after
    # filters)
    if mimetypefilter.doctype == 'application/pdf':
        r.ext = 'pdf'
    elif mimetypefilter.doctype == 'application/postscript':
        r.ext = 'ps'
    else:
        cmd_file = 'file -i "'+inpdf+'"'
        cmdoutput = commands.getoutput(cmd_file)
        if 'application/postscript' in cmdoutput:
            r.ext = 'ps'
        else:
            infoprinter.printStatus('Recognizable mimetype','no')
            sys.exit(cmdoutput)


    # write document information into database
    # database settings can be found at settings.py
    # read file content and calculate the SHA1 value
    # read PDF document information
    # In some cases, the actual PDF was downloaded but the URL ends with a 
    # slash: for example
    # dial.academielouvain.be/vital/access/services/Download/boreal:12685/PDF_01/
    # the downloaded file is renamed as "index.html" though it is PDF file. In this case,
    # we try "inpdf/index.html" to see if we can actually identify this file.
    # If this does not work, it could be that Heritrix downloads the file as "PDF_01", this
    # happens for the URL below, when the actual file is named "75" under the 78/ directory
    # www.br-ie.org/pub/index.php/rbie/article/viewFile/78/75/
    #
    # If we still cannot find any file, we have to skip it
    try:
        f = open(inpdf,'r')
        data = f.read()
        f.close()
    except IOError:
        # just remove the last "slash"
        try:
            f = open(inpdf[:-1],'r')
            data = f.read()
            f.close()
        except IOError:
            try:
                f = open(inpdf+'index.html','r')
                data = f.read()
                f.close()
            except IOError:
                msg = doclogger.generator('FileNotFound',infile,r)
                logging.getLogger('document').info(msg)
                counters.addCounter('failed_FileNotFound')
                infoprinter.printStatus('Document file found','no')
                continue

    # If required to save crawled documents separately,
    # do not save to db, only save document to outputdir
    # Files are named using numbers starting from 1
    # A mapping file is automatically generated
    filenamebody = id_to_fname(i+1,r.ext)
    outdoc = runconfig.outputdir+filenamebody
    if runconfig.toggle_save_doc_separate:
      mappingline = outdoc+','+infile # may not be inpdf
      ff = open(outdoc,'w')
      ff.write(data)
      ff.close()
      try:
        f = open(outdoc)
        msg = doclogger.generator('saved_New',infile,r)
        logging.getLogger('document').info(msg)
        infoprinter.printStatus('Document saved','yes')
        # number of saved documents 
        counters.addCounter('saved_New')
      except IOError,e:
        infoprinter.printStatus('Document saved','no')
        raise SystemExit(e)
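The extension decision near the end of this example shells out to `file -i` through commands.getoutput() (the Python 2 commands module used throughout these examples; Python 3 would use subprocess instead). A hypothetical helper, detect_extension, condenses that mapping:

import commands

def detect_extension(path):
    # Map the MIME type reported by `file -i` to the extensions used above:
    # 'pdf' for application/pdf, 'ps' for application/postscript, else None.
    output = commands.getoutput('file -i "' + path + '"')
    if 'application/pdf' in output:
        return 'pdf'
    if 'application/postscript' in output:
        return 'ps'
    return None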
Example #8
def startup(verbose=False):

    # create on-screen information print object
    infoprinter = printinfo.printInfo()
  
    # define counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all',0)
    counters.newCounter('healthy')
    counters.newCounter('inrepo')
  
    # create output directory if it does not exist
    if not os.path.exists(dochealthcheck_config.outputdir):
        os.makedirs(dochealthcheck_config.outputdir)
  
    # create database object
    cdb = crawldb.CrawlDB()
    # print database names
    infoprinter.printPara('Database name',cdb.dbname)

    # create lock object
    update_doc_lock = threading.Lock()
  
    try:
        update_doc_lock.acquire()
        cursor = connection.cursor()
        # select documents to check
        #dbquery = "SELECT id FROM "+dbt_name+" WHERE submission_id=-2"
        dbquery = "SELECT id FROM "+dochealthcheck_config.dbt_name+" WHERE submission_id=-2"
        print dbquery
        cursor.execute(dbquery)
        rows = cursorutils.dictfetchall(cursor)
        if not rows:
            recordExist = False
            infoprinter.printPara('Number of records',str(0))
            return
        else:
            recordExist = True
            infoprinter.printPara('Number of records',str(len(rows)))
            ids = rows
    finally:
        update_doc_lock.release()
  
    # open document size file to write 
    f_docsize = open(dochealthcheck_config.outputdir+dochealthcheck_config.f_docsize,'w') 
    f_docsize.write('crawlid byte\n')
  
    # open unhealthy document to write
    f_unhealthdoc = open(dochealthcheck_config.outputdir+dochealthcheck_config.f_unhealthdoc,'w')
    f_unhealthdoc.write('unhealth_crawlid\n')
  
    # start checking each file
    counters.setCounter('all',len(ids))
    ids_unhealth = []
    for id in ids:
        # construct the full document path from the document ID
        infile = dochealthcheck_config.inputdir+idtopath(id['id'])
      
        # check if file exists
        if not os.path.exists(infile):
            infoprinter.printStatus('file exists','no')
            continue
        counters.addCounter('inrepo')
      
        # check file size in bytes
        statinfo = os.stat(infile)
        s = str(id['id'])+' '+str(statinfo.st_size)
        f_docsize.write(s+'\n')
          
        # check the file type
        cmd_file = 'file -i "'+infile+'"'
        cmdoutput = commands.getoutput(cmd_file)
        if verbose: print cmdoutput
      
        # check each accepted document, documents whose mimetypes are not
        # in the accepted mime types are identified as "unhealthy"
        healthy = False
        for am in dochealthcheck_config.accepted_mimes:
            if am in cmdoutput:
                healthy = True
                print 'document is healthy',id['id']
                counters.addCounter('healthy')
                break
        if healthy:
            continue

        print "unhealthy document: ",id['id']
        # write unheathy document ID to output file
        f_unhealthdoc.write(str(id['id'])+'\n')
        ids_unhealth.append(id['id'])
      
       
        # delete file folder from repository
        if dochealthcheck_config.toggle_delete_from_repo:
            infiledir = os.path.dirname(infile)
            cmd_repo = 'rm -rf '+infiledir
            cmd_repo_output = commands.getoutput(cmd_repo)
            if not os.path.exists(infiledir):
                infoprinter.printStatus(cmd_repo,'OK')
            else:
                infoprinter.printStatus(cmd_repo,'FAIL')
                return
      
        # delete records from database
        if dochealthcheck_config.toggle_delete_from_db:
            # delete the record from database
            cmd_db = 'DELETE FROM '+dochealthcheck_config.dbt_name+' WHERE id='+str(id['id'])
            print cmd_db
            cursor.execute(cmd_db)
      
    # close files
    f_docsize.close()
    f_unhealthdoc.close()

    # commit all transactions after looping over all documents
    if dochealthcheck_config.toggle_delete_from_db:
        transaction.commit_unless_managed()
     
    # print out counters
    counters.printCounter()
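The per-document loop above boils down to an existence check plus a `file -i` MIME test. A hypothetical helper, is_healthy, captures that test in one place (accepted_mimes plays the role of dochealthcheck_config.accepted_mimes):

import os
import commands

def is_healthy(path, accepted_mimes):
    # A document is healthy if it exists on disk and `file -i` reports
    # one of the accepted MIME types, as in the loop above.
    if not os.path.exists(path):
        return False
    output = commands.getoutput('file -i "' + path + '"')
    return any(mime in output for mime in accepted_mimes)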
Example #9
def startup():

  # record start time 
  tic = time.time()

  # create on-screen information print object
  infoprinter = printinfo.printInfo()

  # check configurations
  if not checkConfig():
    infoprinter.printStatus('Configuration check','fail')
    raise SystemExit("Change your configurations in runconfig.py")
  else:
    infoprinter.printStatus('Configuration check','ok')

  # create exporter
  exporter = output.CiteSeerExporter([runconfig.cde["outputdir"],runconfig.crawlrepo])

  # create crawldb
  cdb = crawldb.CrawlDB()

  # create general log configers and config logs
  logconfiger = Log_Configer()
  logconfiger.config_loggers()

  # process DB query, raise error if ids is empty
  dbquery = runconfig.cde["dbquery"]
  ids = cdb.queryDocID(dbquery)
  infoprinter.printPara('#docid',str(len(ids)))
  if not ids:
      infoprinter.printStatus('DB query','fail')
      sys.exit()

  # number counter
  counters = counter.Counter()
  counters.newCounter('all')
  counters.setCounter('all',len(ids))
  counters.newCounter('copied') 

  # export each queried document
  if runconfig.cde["toggle_export"]:
      i = 0
      for id in ids:
	  i = i + 1
	  print "%9d/%-9d : %9d" % (i,counters.all,id)
          if exporter.doc_export(id): 
	  	counters.addCounter('copied')
      	  else:
	  	infoprinter.printStatus(str(id),'fail')

    # log successful
    # check repository to see if output PDF files are there
    #msg = doclogger.generator('saved_New',infile,r)
    #logging.getLogger('document').info(msg)
    #infoprinter.printStatus('Document saved','yes')
    # number of documents which are written into db
    #counters.addCounter('saved_New')
        
  counters.printCounter()
  counters.printCountertoFile(runconfig.cde["summaryfile"])

  # record end time to calculate processing time
  # because strftime() will truncate the time string when converting to the
  # user-defined time format, we add "1" second to compensate this loss. 
  toc = time.time()
  processingtime = time.strftime('%H:%M:%S',time.gmtime(toc-tic+1))
  infoprinter.printPara('Processing time: ',processingtime)
Example #10
def startup():

    # record start time
    tic = time.time()

    # create on-screen information print object
    infoprinter = printinfo.printInfo()

    # check configurations
    if not checkConfig():
        infoprinter.printStatus('Configuration check', 'fail')
        raise SystemExit("Change your configurations in runconfig.py")
    else:
        infoprinter.printStatus('Configuration check', 'ok')

    # create exporter
    exporter = output.CiteSeerExporter(
        [runconfig.cde["outputdir"], runconfig.crawlrepo])

    # create crawldb
    cdb = crawldb.CrawlDB()

    # create general log configers and config logs
    logconfiger = Log_Configer()
    logconfiger.config_loggers()

    # process DB query, raise error if ids is empty
    dbquery = runconfig.cde["dbquery"]
    ids = cdb.queryDocID(dbquery)
    infoprinter.printPara('#docid', str(len(ids)))
    if not ids:
        infoprinter.printStatus('DB query', 'fail')
        sys.exit()

    # number counter
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', len(ids))
    counters.newCounter('copied')

    # export each queried document
    if runconfig.cde["toggle_export"]:
        i = 0
        for id in ids:
            i = i + 1
            print "%9d/%-9d : %9d" % (i, counters.all, id)
            if exporter.doc_export(id):
                counters.addCounter('copied')
            else:
                infoprinter.printStatus(str(id), 'fail')

    # log successful
    # check repository to see if output PDF files are there
    #msg = doclogger.generator('saved_New',infile,r)
    #logging.getLogger('document').info(msg)
    #infoprinter.printStatus('Document saved','yes')
    # number of documents which are written into db
    #counters.addCounter('saved_New')

    counters.printCounter()
    counters.printCountertoFile(runconfig.cde["summaryfile"])

    # record end time to calculate processing time
    # because strftime() will truncate the time string when converting to the
    # user-defined time format, we add "1" second to compensate this loss.
    toc = time.time()
    processingtime = time.strftime('%H:%M:%S', time.gmtime(toc - tic + 1))
    infoprinter.printPara('Processing time: ', processingtime)
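Both versions of this exporter rely on the counter.Counter bookkeeping used throughout these examples. A minimal usage sketch, assuming only the newCounter/setCounter/addCounter/printCounter interface shown above, with made-up numbers:

import counter

counters = counter.Counter()
counters.newCounter('all')
counters.setCounter('all', 3)            # e.g. three queried document ids
counters.newCounter('copied')
for exported_ok in (True, True, False):  # pretend export results
    if exported_ok:
        counters.addCounter('copied')
counters.printCounter()                  # print both counters to the screen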
Example #11
def startup():

  # define constant parameters
  datestr = datetime.now().strftime("%Y-%m-%d")
  output_file = 'whitelist.'+datestr+'.csv'
  socket.setdefaulttimeout(runconfig.whitelistgen['sockettimeout'])

  # create on-screen information print object
  infoprinter = printinfo.printInfo()
  # print database name
  infoprinter.printPara('DATABASE',settings.DATABASES['default']['NAME'])

  # generate other parameters
  headers = {"User-Agent":runconfig.whitelistgen['user_agent']}

  # create general log configers and config logs
  logconfiger = LogConfiger()
  logconfiger.config_loggers(runconfig.general_logs)

  # create document logger
  doclogger = DocLogger()

  # load the blacklist and remove trailing '\n' 
  blacklisturlstrail = file(runconfig.blacklistfile).readlines()
  blacklisturls = []
  for url in blacklisturlstrail: blacklisturls.append(url.strip('\n'))

  # load domain blacklist file and remove the trailing '\n'
  domainblacklisttrail = file(runconfig.domainblacklistfile).readlines()
  domainblacklisturls = []
  for durl in domainblacklisttrail: domainblacklisturls.append(durl.strip('\n'))

  # number counter
  counters = counter.Counter()
  counters.newCounter('all')
  # number of accepted URLs
  counters.newCounter('accepted')

  # ids includes main_crawl_parenturl ID and main_crawl_submission ID
  # dbtable = 'parenturl' or 'submission'
  # URL includes urls from main_crawl_parenturl and main_crawl_submission
  ids = []
  dbtable = []
  urls = [] 

  # retrieve all parent urls 
  cursor = connection.cursor()
  dbquery = 'SELECT id,url FROM main_crawl_parenturl;'
  #infoprinter.printStatus(dbquery,'running')
  cursor.execute(dbquery)
  rows = cursor.fetchall()
  for row in rows:
    ids.append(row[0])
    dbtable.append('parenturl')
    urls.append(row[1])
  #infoprinter.printStatus(dbquery,'OK')
  infoprinter.printPara('#parentURLs',str(len(ids)))

  # retrieve all submitted urls 
  dbquery = 'SELECT id,url FROM main_crawl_submission;'
  #infoprinter.printStatus(dbquery,'running')
  cursor.execute(dbquery)
  rows = cursor.fetchall()
  #infoprinter.printStatus(dbquery,'OK')
  for row in rows:
    ids.append(row[0])
    dbtable.append('submission')
    urls.append(row[1])
  infoprinter.printPara('#submittedURLs',str(len(ids)))
  
  infoprinter.printPara('TOTAL #candidate urls: ',str(len(ids)))
  counters.setCounter('all',len(ids))

  # output result into result file
  recordWriter = csv.writer(open(output_file,'wb'),delimiter=',', \
                            quotechar='"',quoting=csv.QUOTE_NONE)
  
  #print 'id range: ',ids[0],ids[-1]
  # loop over all urls in the parent url list and check the following
  # requirements
  # (1) if the url host matches any host names in the blacklist
  # (2) if there's any document urls found in the main_crawl_document table
  # (3) if the url is currently alive
  # (4) write into log 
  # (5) write into output file, which should contain the following fields
  #     (*) id of this parent url in the main_crawl_parenturl table
  #     (*) number of document url found from main_crawl_document table
  #	(*) host name of this parent URL string
  #     (*) parent url string
  #     fields are enclosed by double quotes and separated by commas
  # 
  # The log contains the following fields (log will slow down the
  # process and should be disabled after debugging and testing)
  # (1) id of this parent url in the main_crawl_parenturl table
  # (2) host name of this parent url
  # (3) whether this parent url host matches any urls in the blacklist
  # (4) number of document urls found from main_crawl_document table
  # (5) if the url is currently alive
  # (6) parent url string
  # 
  record_tuples = [] # the sorted results 
  n_documenturls = [] # the number of documents
  for id,dbt,url in zip(ids,dbtable,urls):
    print ''
    print '%(#)9s - %(s)9s - %(url)s' % {'#':id,'s':dbt,'url':url}
    scheme,host,path,query,fragment = urlsplit(url)
 
    # check the url structure: must be complete
    if (not scheme) or (not host):
      infoprinter.printStatus('complete url','no')
      continue

    # generate the "parent_url" object for logging and final output
    # parent_url.url_is_alive = 0
    # parent_url.n_documenturl = 0
    # parent_url.pass_blacklist_check = 1
    parent_url = ParentUrl(id,dbt,url,0,0,host,1)

    # CHECK A0: if it matches any domain in the domain blacklist
    for durl in domainblacklisturls:
        if host.endswith(durl):
            parent_url.pass_blacklist_check = 0
            break
    if not parent_url.pass_blacklist_check:
        infoprinter.printStatus('Blacklist check','fail')
        # save into log
        msg = doclogger.generator_url(parent_url)
        logging.getLogger('whitelistgen').info(msg)
        continue

    # CHECK A1: if it matches any host in the blacklist
    if host in blacklisturls: 
      infoprinter.printStatus('Blacklist check','fail')
      parent_url.pass_blacklist_check = 0
      # save into log
      msg = doclogger.generator_url(parent_url)
      logging.getLogger('whitelistgen').info(msg)
      continue

    # Pass domain blacklist check
    infoprinter.printStatus('Blacklist check','pass')

    # CHECK B: number of document urls found in the main_crawl_document table
    # we only check URLs in the parenturl table. 
    if dbt == 'parenturl':
        dbquery = """SELECT id FROM main_crawl_parenturl WHERE url=%s""" 

        #infoprinter.printStatus(dbquery,'running')
        cursor.execute(dbquery, (url,))
        rows = cursor.fetchall()
        parentid = rows[0][0]
        infoprinter.printStatus(dbquery,'ok')
    
        dbquery = 'SELECT count(*) FROM main_crawl_document WHERE state=1 and parent_id = %(#)s' % {'#':parentid}
        #infoprinter.printStatus(dbquery,'running')
        cursor.execute(dbquery)
        rows = cursor.fetchall()
        parent_url.n_documenturl = rows[0][0]
        infoprinter.printStatus(dbquery,'ok')
        if not parent_url.n_documenturl:
          infoprinter.printStatus('Ingestable document links check','fail')
          # save into log
          msg = doclogger.generator_url(parent_url)
          logging.getLogger('whitelistgen').info(msg)
          continue
        else:
          infoprinter.printStatus('Ingestable document links check','pass')
    else:
        parent_url.n_documenturl = 9999
        infoprinter.printStatus('User submitted URL','yes')
        infoprinter.printStatus('Ingestable document links check','pass')
    
    # CHECK C: url is alive
    if runconfig.whitelistgen['checkurlalive']:
        infoprinter.printStatus('Check URL is alive','running')
        parent_url.url_is_alive = checkURLalive(url)
    else: # assume the URL is alive if we do not check
        parent_url.url_is_alive = 1

    if not parent_url.url_is_alive:
        infoprinter.printStatus('Check URL is alive','fail')
        # save into log
        msg = doclogger.generator_url(parent_url)
        logging.getLogger('whitelistgen').info(msg)
        continue
    else:
        infoprinter.printStatus('Check URL is alive','pass')
      
    # save into log
    msg = doclogger.generator_url(parent_url)
    logging.getLogger('whitelistgen').info(msg)

    # save selected urls into a tuple list before sorting them
    if (parent_url.pass_blacklist_check) and parent_url.url_is_alive and parent_url.n_documenturl:
      record_tuple = (id,dbt,url,parent_url.url_is_alive,\
			parent_url.n_documenturl,host,parent_url.pass_blacklist_check)
      record_tuples.append(record_tuple)
      infoprinter.printStatus('URL included in whitelist','yes')
      counters.addCounter('accepted')

  # sort results by the number of documents, user submitted documents are at the top
  record_tuples_sort = sorted(record_tuples, key=itemgetter(4),reverse=True)
  for r in record_tuples_sort:
    parent_url = ParentUrl(r[0],r[1],r[2],r[3],r[4],r[5],r[6])
    record = doclogger.generator_record(parent_url)
    recordWriter.writerow(record)
  
  # print counters
  counters.printCounter()
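The two blacklist checks above (CHECK A0 and A1) reduce to a host lookup plus a domain-suffix match. A hypothetical helper, passes_blacklist, sketches that logic with the same urlsplit unpacking used in the example:

from urlparse import urlsplit  # Python 2; use urllib.parse in Python 3

def passes_blacklist(url, blacklist_hosts, domain_blacklist):
    # Reject incomplete URLs, hosts that appear in the blacklist, and hosts
    # that end with a blacklisted domain suffix, as in CHECK A0/A1 above.
    scheme, host, path, query, fragment = urlsplit(url)
    if (not scheme) or (not host):
        return False
    if host in blacklist_hosts:
        return False
    return not any(host.endswith(durl) for durl in domain_blacklist)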