Example #1
0
    def checkRecord(self, dbt_name, md5='', id=0):
        """Return True if table ``dbt_name`` has a row matching the given key.

        Exactly one key must be supplied: either ``md5`` (non-empty string)
        or ``id`` (positive number).

        Raises:
            Exception: when no key is given, or when both keys are given
                (messages unchanged from the original implementation).
            ValueError: when ``md5`` is empty and ``id`` is negative.  The
                original code fell through every branch in that case and
                crashed on an unbound local while building the query.
        """
        if md5 == '' and id == 0:
            # a keyword should be provided
            raise Exception("You do not provide a key")
        if md5 != '' and id > 0:
            # only one keyword
            raise Exception("Only one key is needed")

        # The key value is passed as a query parameter so the driver quotes
        # it; only the table name (not parameterizable) is concatenated.
        if md5 != '':
            where, params = " WHERE md5=%s", (md5,)
        elif id > 0:
            where, params = " WHERE id=%s", (id,)
        else:
            raise ValueError("id must be a positive number, got %r" % (id,))

        # ``with`` releases the lock only if it was actually acquired.
        with self.update_doc_lock:
            cursor = connection.cursor()
            cursor.execute("SELECT * FROM " + dbt_name + where, params)
            rows = cursorutils.dictfetchall(cursor)
        return bool(rows)
Example #2
0
 def save_doc(self, r, data, pid):
     # Look up the row of runconfig.dbt_document whose md5 equals r.md5
     # and, when no such row exists, insert one built from the fields of r.
     # pid is written to the parent_id column of the new row.
     # NOTE(review): `data` is not used in the lines visible here, and the
     # outer `try` has no except/finally clause in this excerpt -- the
     # method appears to continue beyond what is shown.
     # save db first to get the id
     db_entry_updated = False
     # save a new copy of file only if it is new or an updated version
     file_updated = True
     try:
         # the lock is held while the table is read and written
         self.update_doc_lock.acquire()
         cursor = connection.cursor()
         # NOTE(review): r.md5 is spliced directly into the SQL text here,
         # unlike the INSERT below which uses %s placeholders.
         dbquery = "SELECT * FROM "+runconfig.dbt_document+\
      " WHERE md5='"+r.md5+"'"
         cursor.execute(dbquery)
         rows = cursorutils.dictfetchall(cursor)
         # insert a new record
         # note that the cursor cannot convert a Python "None" to
         # a MySQL NULL value.
         # NOTE(review): the expression below nevertheless passes a real
         # None (not the text 'None') when pid is None -- confirm intent.
         parent_idstr = str(pid) if str(pid) != 'None' else None
         if not rows:
             dbquery = "INSERT INTO "+runconfig.dbt_document+\
                 "(url,md5,host,rev_host,content_sha1,discover_date,update_date,parent_id,submission_id,state)"+\
                               " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
             # values in column order; rev_host is r.host reversed, and both
             # discover_date and update_date are set to the crawl date
             dbquerypar = (r.url, r.md5, r.host, r.host[::-1],
                           r.content_sha1, str(r.crawl_date),
                           str(r.crawl_date), parent_idstr, str(r.batch),
                           '0')
             #print dbquery % dbquerypar
             try:
                 cursor.execute(dbquery, dbquerypar)
             # either failure prints the interpolated statement and then
             # terminates via SystemExit
             except TypeError, e:
                 print 'output.py. TypeError. dbquery = ', dbquery % dbquerypar
                 raise SystemExit(e)
             except _mysql_exceptions.OperationalError, e:
                 print 'output.py. MySQL Operationl Error. dbquery = ', dbquery % dbquerypar
                 raise SystemExit(e)
Example #3
0
    def checkRecord(self,dbt_name,md5='',id=0):
        """Tell whether ``dbt_name`` contains a row for the supplied key.

        The caller passes exactly one of ``md5`` (non-empty string) or
        ``id`` (positive number); the row is looked up by that column.

        Returns:
            True when at least one matching row is found, False otherwise.

        Raises:
            Exception: no key supplied, or both keys supplied (original
                messages are kept).
            ValueError: ``md5`` is empty and ``id`` is negative.  Previously
                this combination left the WHERE clause unbound and failed
                with an UnboundLocalError.
        """
        # check which key to use
        has_md5 = md5 != ''
        if not has_md5 and id == 0:
            # a keyword should be provided
            raise Exception("You do not provide a key")
        if has_md5 and id > 0:
            # only one keyword
            raise Exception("Only one key is needed")
        if not has_md5 and not id > 0:
            raise ValueError("id must be a positive number, got %r" % (id,))

        # Bind the key as a query parameter instead of concatenating it
        # into the statement; the table name still has to be concatenated.
        if has_md5:
            dbquery = "SELECT * FROM " + dbt_name + " WHERE md5=%s"
            dbquerypar = (md5,)
        else:
            dbquery = "SELECT * FROM " + dbt_name + " WHERE id=%s"
            dbquerypar = (id,)

        # The context manager only releases a lock it really acquired.
        with self.update_doc_lock:
            cursor = connection.cursor()
            cursor.execute(dbquery, dbquerypar)
            rows = cursorutils.dictfetchall(cursor)
        return bool(rows)
Example #4
0
    def save_doc(self, r, data, pid):
        # Look up the row of runconfig.dbt_document whose md5 equals r.md5
        # and, when no such row exists, insert one built from the fields
        # of r.  pid is written to the parent_id column of the new row.
        # NOTE(review): `data` is not used in the lines visible here, and
        # the outer `try` has no except/finally clause in this excerpt --
        # the method appears to continue beyond what is shown.
        # save db first to get the id
        db_entry_updated = False
	# save a new copy of file only if it is new or an updated version
        file_updated = True
        try:
            # the lock is held while the table is read and written
            self.update_doc_lock.acquire()
	    cursor = connection.cursor()
	    # NOTE(review): r.md5 is spliced directly into the SQL text here,
	    # unlike the INSERT below which uses %s placeholders.
	    dbquery = "SELECT * FROM "+runconfig.dbt_document+\
		" WHERE md5='"+r.md5+"'"
	    cursor.execute(dbquery)
	    rows = cursorutils.dictfetchall(cursor)
	    # insert a new record
 	    # note that the cursor cannot convert a Python "None" to
	    # a MySQL NULL value.
	    # NOTE(review): the expression below nevertheless passes a real
	    # None (not the text 'None') when pid is None -- confirm intent.
	    parent_idstr = str(pid) if str(pid) != 'None' else None
	    if not rows:
		dbquery = "INSERT INTO "+runconfig.dbt_document+\
		    "(url,md5,host,rev_host,content_sha1,discover_date,update_date,parent_id,submission_id,state)"+\
                    " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                # values in column order; rev_host is r.host reversed, and
                # discover_date/update_date are both set to the crawl date
                dbquerypar = (r.url,r.md5,r.host,r.host[::-1],r.content_sha1,str(r.crawl_date),str(r.crawl_date),parent_idstr,str(r.batch),'0')
                #print dbquery % dbquerypar
		try: 
		    cursor.execute(dbquery,dbquerypar)
		# either failure prints the interpolated statement and then
		# terminates via SystemExit
	   	except TypeError,e:
		    print 'output.py. TypeError. dbquery = ',dbquery % dbquerypar
		    raise SystemExit(e)
		except _mysql_exceptions.OperationalError,e:
		    print 'output.py. MySQL Operationl Error. dbquery = ',dbquery % dbquerypar
		    raise SystemExit(e)
Example #5
0
 def dbcheck(self, r):
     """Look up ``r.md5`` in ``runconfig.dbt_document``.

     When a row is found, its ``id`` is stored on ``self.docid``.

     Returns:
         False when no row matches; None when a row was found.
         NOTE(review): both results are falsy, so callers cannot use plain
         truthiness to tell them apart -- kept as-is for compatibility.
     """
     # md5 is bound as a parameter rather than spliced into the SQL text
     dbquery = "SELECT * FROM " + runconfig.dbt_document + " WHERE md5=%s"
     # ``with`` releases the lock only if it was actually acquired
     with self.update_doc_lock:
         cursor = connection.cursor()
         cursor.execute(dbquery, (r.md5,))
         rows = cursorutils.dictfetchall(cursor)
         if not rows:
             return False
         self.docid = rows[0]['id']
     return None
Example #6
0
    def dbcheck(self,r):
        """Check whether a document with md5 ``r.md5`` is already recorded.

        Queries ``runconfig.dbt_document`` by md5.  On a hit, the ``id`` of
        the first matching row is saved to ``self.docid``.

        Returns:
            False if there is no matching row, None if there is one.
            NOTE(review): the "found" result is None rather than True, as in
            the original; callers must compare against False explicitly.
        """
        # Let the driver quote the md5 value instead of concatenating it.
        dbquery = "SELECT * FROM " + runconfig.dbt_document + " WHERE md5=%s"
        dbquerypar = (r.md5,)
        # The context manager guarantees release on every exit path and never
        # releases a lock that was not acquired.
        with self.update_doc_lock:
            cursor = connection.cursor()
            cursor.execute(dbquery, dbquerypar)
            rows = cursorutils.dictfetchall(cursor)
            if rows:
                self.docid = rows[0]['id']
                return None
            return False
Example #7
0
    def queryDocID(self, dbquery):
        """Run ``dbquery`` and return the ``id`` value of every row fetched.

        The query runs while ``self.update_doc_lock`` is held.  An empty
        list is returned when the query yields no rows.
        """
        try:
            self.update_doc_lock.acquire()
            cur = connection.cursor()
            cur.execute(dbquery)
            fetched = cursorutils.dictfetchall(cur)
        finally:
            self.update_doc_lock.release()

        # nothing fetched: hand back an empty list
        if not fetched:
            return []
        return [record['id'] for record in fetched]
Example #8
0
    def queryDocID(self,dbquery):
        """Execute ``dbquery`` under the document lock and collect row ids.

        Returns a list holding the ``id`` field of each fetched row, or an
        empty list if no rows came back.
        """
        found_ids = []
        try:
            self.update_doc_lock.acquire()
            db_cursor = connection.cursor()
            db_cursor.execute(dbquery)
            result_rows = cursorutils.dictfetchall(db_cursor)
        finally:
            self.update_doc_lock.release()

        if result_rows:
            found_ids.extend(entry['id'] for entry in result_rows)
        return found_ids
Example #9
0
    def save(self, r, data):
        """Record the parent URL of ``r`` (if any), then save the document.

        When ``r.parent_url`` is set, the parent-URL table
        (``runconfig.dbt_parenturl``) is searched by ``r.parent_md5``:
        a missing parent is inserted, an existing one has its
        ``last_crawl_date`` advanced when ``r.crawl_date`` is newer.  The
        parent row id (or None for a resource without parent) is then
        passed to ``self.save_doc``.

        The parent is saved first, so it is stored even if saving the
        document fails afterwards.
        """
        if r.parent_url is not None:
            table = runconfig.dbt_parenturl
            # ``with`` releases the lock only if it was actually acquired
            with self.update_parent_lock:
                cursor = connection.cursor()
                # values are bound as parameters; only the table name is
                # concatenated into the statements
                cursor.execute("SELECT * FROM " + table + " WHERE md5=%s",
                               (r.parent_md5,))
                rows = cursorutils.dictfetchall(cursor)
                if not rows:
                    # insert a new parent URL
                    dbquery = "INSERT INTO " + table + \
                        " (url,md5,first_crawl_date,last_crawl_date,is_live) " + \
                        " VALUES (%s,%s,%s,%s,%s)"
                    dbquerypar = (r.parent_url.decode('utf8'), r.parent_md5,
                                  str(r.crawl_date), str(r.crawl_date), '1')
                    cursor.execute(dbquery, dbquerypar)
                    pid = cursor.lastrowid
                else:
                    # update an existing parent URL
                    row = rows[0]
                    pid = row['id']
                    if r.crawl_date > row['last_crawl_date']:
                        cursor.execute(
                            "UPDATE " + table +
                            " SET last_crawl_date=%s WHERE id=%s;",
                            (str(r.crawl_date), row['id']))
                # NOTE: no explicit transaction commit is issued here
        else:
            pid = None

        # save document
        # pid is the parent id of the resource URL: None if the resource
        # URL is a seed, otherwise the id in the parent url table.
        self.save_doc(r, data, pid)
Example #10
0
    def save(self, r, data):
        """Store the parent URL of ``r`` and then the document itself.

        If ``r.parent_url`` is set, ``runconfig.dbt_parenturl`` is queried
        by ``r.parent_md5``.  An unknown parent is inserted and committed;
        a known parent gets ``last_crawl_date`` updated (and committed)
        only when ``r.crawl_date`` is more recent than the stored value.
        The resulting parent id -- None when there is no parent URL -- is
        handed to ``self.save_doc`` together with ``r`` and ``data``.

        The parent is written before the document, so it persists even
        when saving the document fails.
        """
        pid = None
        if r.parent_url is not None:
            parent_table = runconfig.dbt_parenturl
            crawl_date = str(r.crawl_date)
            # The context manager never releases an unacquired lock.
            with self.update_parent_lock:
                cursor = connection.cursor()
                # Bind values as parameters instead of building SQL text.
                cursor.execute(
                    "SELECT * FROM " + parent_table + " WHERE md5=%s",
                    (r.parent_md5,))
                rows = cursorutils.dictfetchall(cursor)
                if not rows:
                    # insert a new parent URL
                    dbquery = "INSERT INTO " + parent_table + \
                        " (url,md5,first_crawl_date,last_crawl_date,is_live) " + \
                        " VALUES (%s,%s,%s,%s,%s)"
                    dbquerypar = (r.parent_url.decode('utf8'), r.parent_md5,
                                  crawl_date, crawl_date, '1')
                    cursor.execute(dbquery, dbquerypar)
                    pid = cursor.lastrowid
                    transaction.commit_unless_managed()
                else:
                    # update an existing parent URL
                    row = rows[0]
                    pid = row['id']
                    if r.crawl_date > row['last_crawl_date']:
                        cursor.execute(
                            "UPDATE " + parent_table +
                            " SET last_crawl_date=%s WHERE id=%s;",
                            (crawl_date, row['id']))
                        transaction.commit_unless_managed()

        # save document
        # pid is the parent id of the resource URL: None if the resource
        # URL is a seed, otherwise the id in the parent url table.
        self.save_doc(r, data, pid)
Example #11
0
def startup(verbose=False):

    # create on-screen information print object
    infoprinter = printinfo.printInfo()
  
    # define counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all',0)
    counters.newCounter('healthy')
    counters.newCounter('inrepo')
  
    # create output directory if it does not exist
    if not os.path.exists(dochealthcheck_config.outputdir):
        os.makedirs(dochealthcheck_config.outputdir)
  
    # create database object
    cdb = crawldb.CrawlDB()
    # print database names
    infoprinter.printPara('Database name',cdb.dbname)

    # create lock object
    update_doc_lock = threading.Lock()
  
    try:
        update_doc_lock.acquire()
        cursor = connection.cursor()
     	# select documents to check
    	#dbquery = "SELECT id FROM "+dbt_name+" WHERE submission_id=-2" 
    	dbquery = "SELECT id FROM "+dochealthcheck_config.dbt_name+" WHERE submission_id=-2" 
    	print dbquery
        cursor.execute(dbquery)
        rows = cursorutils.dictfetchall(cursor)
        if not rows:
            recordExist = False
            infoprinter.printPara('Number of records',str(0))
            return
        else:
            recordExist = True
            infoprinter.printPara('Number of records',str(len(rows)))
            ids = rows
    finally:
        update_doc_lock.release()
  
    # open document size file to write 
    f_docsize = open(dochealthcheck_config.outputdir+dochealthcheck_config.f_docsize,'w') 
    f_docsize.write('crawlid byte\n')
  
    # open unhealthy document to write
    f_unhealthdoc = open(dochealthcheck_config.outputdir+dochealthcheck_config.f_unhealthdoc,'w')
    f_unhealthdoc.write('unhealth_crawlid\n')
  
    # start checking each file
    counters.setCounter('all',len(ids))
    ids_unhealth = []
    for id in ids:
     	# construct the full document path from the document ID
        infile = dochealthcheck_config.inputdir+idtopath(id['id'])
      
        # check if file exists
        if not os.path.exists(infile):
            infoprinter.printStatus('file exists','no')
            continue
        counters.addCounter('inrepo')
      
        # check file size in bytes
        statinfo = os.stat(infile)
        s = str(id['id'])+' '+str(statinfo.st_size)
        f_docsize.write(s+'\n')
          
        # check the file type
        cmd_file = 'file -i "'+infile+'"'
        cmdoutput = commands.getoutput(cmd_file)
	if verbose: print cmdoutput
      
        # check each accepted document, documents whose mimetypes are not
        # in the accepted mime types are identified as "unhealthy"
	healthy = False
	for am in dochealthcheck_config.accepted_mimes:
            if am in cmdoutput:
		healthy = True
		print 'document is healthy',id['id']
      	    	counters.addCounter('healthy')
                break
        if healthy:
	    continue

     	print "unhealthy document: ",id['id']
        # write unheathy document ID to output file
        f_unhealthdoc.write(str(id['id'])+'\n')
        ids_unhealth.append(id['id'])
      
       
        # delete file folder from repository
        if dochealthcheck_config.toggle_delete_from_repo:
	    infiledir = os.path.dirname(infile)
            cmd_repo = 'rm -rf '+infiledir
	    cmd_repo_output = commands.getoutput(cmd_repo)
	    if not os.path.exists(infiledir):
             	infoprinter.printStatus(cmd_repo,'OK')
	    else: 
		infoprinter.printStatus(cmd_repo,'FAIL')
		return
      
        # delete records from database
        if dochealthcheck_config.toggle_delete_from_db:
            # delete the record from database
            cmd_db = 'DELETE FROM '+dochealthcheck_config.dbt_name+' WHERE id='+str(id['id'])
            print cmd_db
            cursor.execute(cmd_db)
      
    # close filese
    f_docsize.close()
    f_unhealthdoc.close()

    # commit all transactions after looping over all documents
    if dochealthcheck_config.toggle_delete_from_db:
        transaction.commit_unless_managed()
     
    # print out counters
    counters.printCounter()