class httpHelper:
    """Probe URLs with an HTTP HEAD request to see whether they exist."""

    logger = None

    # Response statuses that httpExists treats as "the URL exists".
    _FOUND_STATUSES = (200, 201, 204, 302)

    def __init__(self):
        self.logger = AppLog()

    @staticmethod
    def _splitTarget(url):
        """Return ``(host, request_target)`` for ``url``.

        The request target is the path plus the query string (if any) and is
        never empty: a URL such as ``http://host`` yields ``"/"`` so that the
        HEAD request line stays valid.
        """
        parts = urlparse.urlsplit(url)
        host = parts[1]
        target = parts[2] or "/"
        if parts[3]:
            # Keep the query string; without it a different resource is probed.
            target = target + "?" + parts[3]
        return host, target

    def httpExists(self, url):
        """Issue a HEAD request for ``url`` and report whether it exists.

        Args:
            url: Absolute URL to probe.

        Returns:
            The response status when it is one of 200, 201, 204 or 302,
            otherwise ``None``. Connection and protocol errors are logged
            through ``self.logger`` and also reported as ``None``.
        """
        host, target = self._splitTarget(url)
        if not host:
            # Nothing to connect to; report "not found" without touching the network.
            self.logger.logInfo('httpExists: URL has no host: %r' % (url,))
            return None

        found = None
        connection = None
        try:
            # NOTE(review): this always speaks plain HTTP, even for https:// URLs.
            connection = httplib.HTTPConnection(host)
            connection.request("HEAD", target)
            status = connection.getresponse().status
            if status in self._FOUND_STATUSES:
                found = status
        except Exception as e:
            logvar = e.__class__, e, url
            self.logger.logInfo(logvar)
        finally:
            # Always release the socket, whether or not the probe succeeded.
            if connection is not None:
                connection.close()
        return found
class httpHelper:
    """Probe URLs with an HTTP HEAD request to see whether they exist."""

    logger = None

    # Response statuses that httpExists treats as "the URL exists".
    _FOUND_STATUSES = (200, 201, 204, 302)

    def __init__(self):
        self.logger = AppLog()

    @staticmethod
    def _splitTarget(url):
        """Return ``(host, request_target)`` for ``url``.

        The request target is the path plus the query string (if any) and is
        never empty: a URL such as ``http://host`` yields ``"/"`` so that the
        HEAD request line stays valid.
        """
        parts = urlparse.urlsplit(url)
        host = parts[1]
        target = parts[2] or "/"
        if parts[3]:
            # Keep the query string; without it a different resource is probed.
            target = target + "?" + parts[3]
        return host, target

    def httpExists(self, url):
        """Issue a HEAD request for ``url`` and report whether it exists.

        Args:
            url: Absolute URL to probe.

        Returns:
            The response status when it is one of 200, 201, 204 or 302,
            otherwise ``None``. Connection and protocol errors are logged
            through ``self.logger`` and also reported as ``None``.
        """
        host, target = self._splitTarget(url)
        if not host:
            # Nothing to connect to; report "not found" without touching the network.
            self.logger.logInfo('httpExists: URL has no host: %r' % (url,))
            return None

        found = None
        connection = None
        try:
            # NOTE(review): this always speaks plain HTTP, even for https:// URLs.
            connection = httplib.HTTPConnection(host)
            connection.request("HEAD", target)
            status = connection.getresponse().status
            if status in self._FOUND_STATUSES:
                found = status
        except Exception as e:
            logvar = e.__class__, e, url
            self.logger.logInfo(logvar)
        finally:
            # Always release the socket, whether or not the probe succeeded.
            if connection is not None:
                connection.close()
        return found
class UrlHelper:
    """URL utilities: syntax check, redirect resolution and page header extraction."""

    logger = None

    # "http://" or "https://" followed by one or more URL characters or %XX
    # escapes. The previous pattern began with "http?", which accepted
    # "htt://" and rejected every https URL.
    _URL_REGEX = r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    def __init__(self):
        self.logger = AppLog()

    def isUrl(self, url):
        """Return True when ``url`` starts with an http:// or https:// URL, else False."""
        # Only the beginning of the string is matched (re.match semantics).
        return re.match(self._URL_REGEX, url) is not None

    def getRealURL(self, url):
        """Open ``url`` and return the address reported by the response.

        Returns:
            The value of ``geturl()`` on the opened response, or ``None`` when
            opening the URL fails for any reason (the failure is logged).
        """
        try:
            # NOTE: this sets the process-wide default socket timeout; it is
            # kept because other code may rely on the side effect.
            socket.setdefaulttimeout(30)
            response = urllib2.urlopen(url)
            try:
                return response.geturl()
            finally:
                # Release the connection instead of waiting for garbage collection.
                response.close()
        except urllib2.HTTPError:
            self.logger.logInfo(
                'Exception in getRealURL method urllib2.HTTPError (helper/urlhelper.py)'
                + str(sys.exc_info()[0]))
            return None
        except urllib2.URLError:
            self.logger.logInfo(
                'Exception in getRealURL method urllib2.URLError (helper/urlhelper.py)'
                + str(sys.exc_info()[0]))
            return None
        except Exception:
            # Was a silent bare "except:"; now KeyboardInterrupt/SystemExit
            # propagate and every other failure leaves a trace in the log.
            self.logger.logInfo(
                'Exception in getRealURL method (helper/urlhelper.py)'
                + str(sys.exc_info()[0]))
            return None

    def getHeaderInfo(self, soup):
        """Collect the page title and description from a parsed document.

        Args:
            soup: Parsed document offering ``find('head')`` and
                ``findAll('meta')``; each meta entry is read with
                ``entry['name']`` and ``entry['content']``.

        Returns:
            A dict with the keys ``'title'`` and ``'description'``; a value is
            ``None`` when it could not be found. A non-empty
            ``<meta name="title">`` overrides the ``<title>`` text.
        """
        title = None
        description = None

        # The title and the meta tags are looked up independently so that a
        # page without <head>/<title> still gets its meta tags examined.
        try:
            title = soup.find('head').title.string
        except Exception:
            self.logger.logInfo(
                'getHeaderInfo some meta tags function not tags (helper/urlhelper.py) '
            )

        try:
            meta = soup.findAll('meta')
        except Exception:
            meta = []
            self.logger.logInfo(
                'getHeaderInfo some meta tags function not tags (helper/urlhelper.py) '
            )

        for m in meta:
            try:
                name = m['name'].lower()
                if name == 'title':
                    metaTitle = m['content']
                    if metaTitle:
                        title = metaTitle
                elif name == 'description':
                    description = m['content']
            except Exception:
                # Meta tags without a name/content attribute end up here.
                self.logger.logInfo(
                    'getHeaderInfo some meta tags function not tags (helper/urlhelper.py)'
                )

        headrDict = {}
        headrDict['title'] = title
        headrDict['description'] = description
        return headrDict
def __init__(self):
    """Give the new instance its own AppLog object as ``self.logger``."""
    self.logger = AppLog()
def __init__(self):
    """Give the new instance its own AppLog object as ``self.logger``."""
    self.logger = AppLog()
#encoding:UTF-8 from dal.urlprovider import Urlprovider from helper.applog import AppLog from helper.stringhelper import stringHelper from helper.urlhelper import UrlHelper import sys logger = AppLog() strObj = stringHelper() ulrObj = UrlHelper() urlpObj = Urlprovider() try: posts = urlpObj.getPosts() for post in posts: postid = post['id'] postText = post['text'] postURLs = strObj.extractURL(postText) for url in postURLs: if url: orgurl = ulrObj.getRealURL(url) if orgurl: urlpObj.addURL(url, orgurl, postid) urlpObj.updatePostUrlExtracted(postid, 1) except:
class Urlprovider:
    """Data-access helper for the ``posts``, ``urls`` and ``urlimgs`` tables.

    Every method opens its own ``MySQLHelper``, logs any failure through
    ``self.logger`` and then returns ``None`` instead of raising.
    """

    logger = None

    def __init__(self):
        self.logger = AppLog()

    def _logFailure(self, method, error):
        """Log a failure of ``method`` together with the exception that caused it."""
        self.logger.logInfo(
            'Exception in %s method (dal/urlprovider.py): %r' % (method, error))

    @staticmethod
    def _escapeSqlString(value):
        """Escape ``value`` for use inside a single-quoted SQL string literal.

        Backslashes are doubled first and single quotes are doubled afterwards,
        so neither character can terminate the literal early.
        """
        # NOTE(review): a parameterized select is preferable; this is used only
        # because the query() helper is called here with a finished SQL string.
        return value.replace("\\", "\\\\").replace("'", "''")

    def getPosts(self):
        """Return the posts whose ``urlextracted`` flag is 0, ordered by id."""
        try:
            dbObj = MySQLHelper()
            query = "select * from posts where urlextracted=0 order by id;"
            return dbObj.query(query)
        except Exception as error:
            self._logFailure('getPosts', error)
            return None

    def addURL(self, url, orgurl, postid):
        """Record one occurrence of ``orgurl``.

        Inserts a new ``urls`` row when ``orgurl`` is not stored yet; otherwise
        increments the ``counter`` of the existing row (a NULL counter counts
        as 0).
        """
        try:
            dbObj = MySQLHelper()
            # orgurl comes from scraped content, so it must be escaped: a plain
            # apostrophe in the URL would otherwise break (or hijack) the query.
            query = ("select id,counter from urls where orgurl='"
                     + self._escapeSqlString(orgurl) + "'")
            result = dbObj.query(query)
            if len(result) == 0:
                tubleData = (url, orgurl, postid)
                query = "insert into urls (url,orgurl,postid) values (%s,%s,%s)"
                dbObj.executequery(query, tubleData)
            else:
                urlId = result[0]['id']
                counter = result[0]['counter']
                if counter is None:
                    counter = 0
                counter += 1
                query = ("update urls set counter=" + str(counter)
                         + " where id=" + str(urlId))
                dbObj.query(query)
        except Exception as error:
            self._logFailure('addURL', error)

    def updatePostUrlExtracted(self, postid, value):
        """Set the ``urlextracted`` flag of post ``postid`` to ``value``."""
        try:
            dbObj = MySQLHelper()
            query = ("update posts set urlextracted=" + str(value)
                     + " where id=" + str(postid))
            return dbObj.query(query)
        except Exception as error:
            self._logFailure('updatePostUrlExtracted', error)
            return None

    def getURLs(self):
        """Return the ``urls`` rows whose ``status`` is 0."""
        try:
            dbObj = MySQLHelper()
            query = "select * from urls where status=0"
            return dbObj.query(query)
        except Exception as error:
            self._logFailure('getURLs', error)
            return None

    def addUrlHeaderInfo(self, urlid, metatitle, metadescription):
        """Store the title and description extracted for the URL ``urlid``."""
        try:
            dbObj = MySQLHelper()
            # The id is bound as a parameter like the other two values.
            tubleData = (metatitle, metadescription, urlid)
            query = "update urls set metatitle=%s,metadescription=%s where id=%s"
            return dbObj.executequery(query, tubleData)
        except Exception as error:
            self._logFailure('addUrlHeaderInfo', error)
            return None

    def saveUrlIMG(self, urlid, filename):
        """Insert a ``urlimgs`` row linking ``filename`` to the URL ``urlid``."""
        try:
            dbObj = MySQLHelper()
            tubleData = (urlid, filename)
            query = "insert into urlimgs (urlid,filename) values (%s,%s)"
            return dbObj.executequery(query, tubleData)
        except Exception as error:
            self._logFailure('saveUrlIMG', error)
            return None

    def updateUrlStatus(self, urlid, value):
        """Set the ``status`` column of the URL ``urlid`` to ``value``."""
        try:
            dbObj = MySQLHelper()
            query = ("update urls set status=" + str(value)
                     + " where id=" + str(urlid))
            return dbObj.query(query)
        except Exception as error:
            self._logFailure('updateUrlStatus', error)
            return None
extension=os.path.splitext(img)[1] newImgName=str(uuid.uuid1())+str(extension) im.thumbnail(newSize, Image.ANTIALIAS) im.save(out_folder+"/"+newImgName,imgformat) os.remove(imgfile) saveImgList.append(newImgName) except: logger.logInfo("in urldatagrapper.py open images : "+str(sys.exc_info()[0])) for saveimg in saveImgList: urlproviderObj.saveUrlIMG(urlid,saveimg) logger=AppLog() urlproviderObj=Urlprovider() urlHelpObj=UrlHelper() try: urls=urlproviderObj.getURLs() for url in urls: if httpExists(url['orgurl']): urlid=url['id'] soup=getUrlHTMLsoup(url['orgurl']) if soup: headrDict=urlHelpObj.getHeaderInfo(soup) urlproviderObj.addUrlHeaderInfo(urlid,headrDict['title'], headrDict['description']) getUrlImgs(url['orgurl'],urlid,soup)
#encoding:UTF-8 from dal.urlprovider import Urlprovider from helper.applog import AppLog from helper.stringhelper import stringHelper from helper.urlhelper import UrlHelper import sys logger=AppLog() strObj=stringHelper() ulrObj=UrlHelper() urlpObj=Urlprovider() try: posts=urlpObj.getPosts() for post in posts: postid=post['id'] postText=post['text'] postURLs=strObj.extractURL(postText) for url in postURLs: if url: orgurl=ulrObj.getRealURL(url) if orgurl: urlpObj.addURL(url, orgurl, postid) urlpObj.updatePostUrlExtracted(postid,1) except:
class Urlprovider:
    """Data-access helper for the ``posts``, ``urls`` and ``urlimgs`` tables.

    Every method opens its own ``MySQLHelper``, logs any failure through
    ``self.logger`` and then returns ``None`` instead of raising.
    """

    logger = None

    def __init__(self):
        self.logger = AppLog()

    def _logFailure(self, method, error):
        """Log a failure of ``method`` together with the exception that caused it."""
        self.logger.logInfo(
            'Exception in %s method (dal/urlprovider.py): %r' % (method, error))

    @staticmethod
    def _escapeSqlString(value):
        """Escape ``value`` for use inside a single-quoted SQL string literal.

        Backslashes are doubled first and single quotes are doubled afterwards,
        so neither character can terminate the literal early.
        """
        # NOTE(review): a parameterized select is preferable; this is used only
        # because the query() helper is called here with a finished SQL string.
        return value.replace("\\", "\\\\").replace("'", "''")

    def getPosts(self):
        """Return the posts whose ``urlextracted`` flag is 0, ordered by id."""
        try:
            dbObj = MySQLHelper()
            query = "select * from posts where urlextracted=0 order by id;"
            return dbObj.query(query)
        except Exception as error:
            self._logFailure('getPosts', error)
            return None

    def addURL(self, url, orgurl, postid):
        """Record one occurrence of ``orgurl``.

        Inserts a new ``urls`` row when ``orgurl`` is not stored yet; otherwise
        increments the ``counter`` of the existing row (a NULL counter counts
        as 0).
        """
        try:
            dbObj = MySQLHelper()
            # orgurl comes from scraped content, so it must be escaped: a plain
            # apostrophe in the URL would otherwise break (or hijack) the query.
            query = ("select id,counter from urls where orgurl='"
                     + self._escapeSqlString(orgurl) + "'")
            result = dbObj.query(query)
            if len(result) == 0:
                tubleData = (url, orgurl, postid)
                query = "insert into urls (url,orgurl,postid) values (%s,%s,%s)"
                dbObj.executequery(query, tubleData)
            else:
                urlId = result[0]['id']
                counter = result[0]['counter']
                if counter is None:
                    counter = 0
                counter += 1
                query = ("update urls set counter=" + str(counter)
                         + " where id=" + str(urlId))
                dbObj.query(query)
        except Exception as error:
            self._logFailure('addURL', error)

    def updatePostUrlExtracted(self, postid, value):
        """Set the ``urlextracted`` flag of post ``postid`` to ``value``."""
        try:
            dbObj = MySQLHelper()
            query = ("update posts set urlextracted=" + str(value)
                     + " where id=" + str(postid))
            return dbObj.query(query)
        except Exception as error:
            self._logFailure('updatePostUrlExtracted', error)
            return None

    def getURLs(self):
        """Return the ``urls`` rows whose ``status`` is 0."""
        try:
            dbObj = MySQLHelper()
            query = "select * from urls where status=0"
            return dbObj.query(query)
        except Exception as error:
            self._logFailure('getURLs', error)
            return None

    def addUrlHeaderInfo(self, urlid, metatitle, metadescription):
        """Store the title and description extracted for the URL ``urlid``."""
        try:
            dbObj = MySQLHelper()
            # The id is bound as a parameter like the other two values.
            tubleData = (metatitle, metadescription, urlid)
            query = "update urls set metatitle=%s,metadescription=%s where id=%s"
            return dbObj.executequery(query, tubleData)
        except Exception as error:
            self._logFailure('addUrlHeaderInfo', error)
            return None

    def saveUrlIMG(self, urlid, filename):
        """Insert a ``urlimgs`` row linking ``filename`` to the URL ``urlid``."""
        try:
            dbObj = MySQLHelper()
            tubleData = (urlid, filename)
            query = "insert into urlimgs (urlid,filename) values (%s,%s)"
            return dbObj.executequery(query, tubleData)
        except Exception as error:
            self._logFailure('saveUrlIMG', error)
            return None

    def updateUrlStatus(self, urlid, value):
        """Set the ``status`` column of the URL ``urlid`` to ``value``."""
        try:
            dbObj = MySQLHelper()
            query = ("update urls set status=" + str(value)
                     + " where id=" + str(urlid))
            return dbObj.query(query)
        except Exception as error:
            self._logFailure('updateUrlStatus', error)
            return None
else: extension = os.path.splitext(img)[1] newImgName = str(uuid.uuid1()) + str(extension) im.thumbnail(newSize, Image.ANTIALIAS) im.save(out_folder + "/" + newImgName, imgformat) os.remove(imgfile) saveImgList.append(newImgName) except: logger.logInfo("in urldatagrapper.py open images : " + str(sys.exc_info()[0])) for saveimg in saveImgList: urlproviderObj.saveUrlIMG(urlid, saveimg) logger = AppLog() urlproviderObj = Urlprovider() urlHelpObj = UrlHelper() try: urls = urlproviderObj.getURLs() for url in urls: if httpExists(url['orgurl']): urlid = url['id'] soup = getUrlHTMLsoup(url['orgurl']) if soup: headrDict = urlHelpObj.getHeaderInfo(soup) urlproviderObj.addUrlHeaderInfo(urlid, headrDict['title'], headrDict['description'])