Example #1
0
class httpHelper:
    """Probe a URL with an HTTP ``HEAD`` request to see whether it exists."""

    logger = None

    def __init__(self):
        """Attach a fresh ``AppLog`` instance as ``self.logger``."""
        self.logger = AppLog()

    def httpExists(self, url):
        """Return the HTTP status of ``url`` if it looks reachable, else ``None``.

        A ``HEAD`` request is sent to the URL's host.  The statuses 200, 201,
        204 and 302 count as "exists" and are returned as-is; any other
        status yields ``None``.

        Any exception raised while connecting or reading the response is
        passed to ``self.logger.logInfo`` as a ``(class, instance, url)``
        tuple and ``None`` is returned.
        """
        scheme, host, path, query = urlparse.urlsplit(url)[0:4]
        if query:
            # Keep the query string: "/page?id=5" and "/page" can be
            # different resources.
            path = path + "?" + query

        found = None
        connection = None
        try:
            # https URLs must be probed over TLS, not plain HTTP on port 80.
            if scheme == "https":
                connection = httplib.HTTPSConnection(host)
            else:
                connection = httplib.HTTPConnection(host)
            connection.request("HEAD", path)
            responseOb = connection.getresponse()

            if responseOb.status in (200, 201, 204, 302):
                found = responseOb.status

        except Exception as e:
            logvar = e.__class__, e, url
            self.logger.logInfo(logvar)
        finally:
            # Release the socket whether or not the probe succeeded.
            if connection is not None:
                connection.close()

        return found
Example #2
0
class httpHelper:
    """Probe a URL with an HTTP ``HEAD`` request to see whether it exists."""

    logger = None

    def __init__(self):
        """Attach a fresh ``AppLog`` instance as ``self.logger``."""
        self.logger = AppLog()

    def httpExists(self, url):
        """Return the HTTP status of ``url`` if it looks reachable, else ``None``.

        A ``HEAD`` request is sent to the URL's host.  The statuses 200, 201,
        204 and 302 count as "exists" and are returned as-is; any other
        status yields ``None``.

        Any exception raised while connecting or reading the response is
        passed to ``self.logger.logInfo`` as a ``(class, instance, url)``
        tuple and ``None`` is returned.
        """
        scheme, host, path, query = urlparse.urlsplit(url)[0:4]
        if query:
            # Keep the query string: "/page?id=5" and "/page" can be
            # different resources.
            path = path + "?" + query

        found = None
        connection = None
        try:
            # https URLs must be probed over TLS, not plain HTTP on port 80.
            if scheme == "https":
                connection = httplib.HTTPSConnection(host)
            else:
                connection = httplib.HTTPConnection(host)
            connection.request("HEAD", path)
            responseOb = connection.getresponse()

            if responseOb.status in (200, 201, 204, 302):
                found = responseOb.status

        except Exception as e:
            logvar = e.__class__, e, url
            self.logger.logInfo(logvar)
        finally:
            # Release the socket whether or not the probe succeeded.
            if connection is not None:
                connection.close()

        return found
Example #3
0
class UrlHelper:
    """URL utilities: syntax check, redirect resolution, header extraction."""

    logger = None

    # Compiled once for all calls.  The scheme part must be 'https?' -- the
    # previous 'http?' accepted "htt://..." and rejected every https URL.
    _URL_PATTERN = re.compile(
        r'https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    def __init__(self):
        """Attach a fresh ``AppLog`` instance as ``self.logger``."""
        self.logger = AppLog()

    def isUrl(self, url):
        """Return ``True`` if ``url`` starts with an http:// or https:// URL."""
        return self._URL_PATTERN.match(url) is not None

    def getRealURL(self, url):
        """Open ``url`` and return the final URL reported by the response.

        Returns ``None`` if the URL cannot be opened; the failure is logged
        through ``self.logger.logInfo``.
        """
        try:
            # NOTE: this sets the process-wide default socket timeout, not
            # just the timeout of this request.
            socket.setdefaulttimeout(30)
            response = urllib2.urlopen(url)
            try:
                return response.geturl()
            finally:
                # Do not leave the connection open until garbage collection.
                response.close()
        except urllib2.HTTPError:
            self.logger.logInfo(
                'Exception in getRealURL method urllib2.HTTPError (helper/urlhelper.py)'
                + str(sys.exc_info()[0]))
            return None
        except urllib2.URLError:
            self.logger.logInfo(
                'Exception in getRealURL method  urllib2.URLError (helper/urlhelper.py)'
                + str(sys.exc_info()[0]))
            return None
        except Exception:
            # Still best-effort, but no longer silent.
            self.logger.logInfo(
                'Exception in getRealURL method (helper/urlhelper.py)'
                + str(sys.exc_info()[0]))
            return None

    def getHeaderInfo(self, soup):
        """Return ``{'title': ..., 'description': ...}`` extracted from ``soup``.

        ``soup`` must provide ``find('head')`` and ``findAll('meta')``.  The
        title is the ``<head>`` title string, overridden by a non-empty
        ``<meta name="title">`` content; the description comes from
        ``<meta name="description">``.  Values that cannot be found are
        ``None``.
        """
        headrDict = {}
        title = None
        meta = []
        description = None

        # The two lookups are independent: a page without <head>/<title>
        # must still have its <meta> tags inspected.
        try:
            title = soup.find('head').title.string
        except Exception:
            self.logger.logInfo(
                'getHeaderInfo some meta tags function not tags (helper/urlhelper.py) '
            )

        try:
            meta = soup.findAll('meta')
        except Exception:
            self.logger.logInfo(
                'getHeaderInfo some meta tags function not tags (helper/urlhelper.py) '
            )

        for m in meta:
            try:
                name = m['name'].lower()
                if name == 'title':
                    metaTitle = m['content']
                    if metaTitle:
                        title = metaTitle
                elif name == 'description':
                    description = m['content']
            except Exception:
                # Tags lacking a 'name' or 'content' attribute end up here.
                self.logger.logInfo(
                    'getHeaderInfo some meta tags function not tags (helper/urlhelper.py)'
                )

        headrDict['title'] = title
        headrDict['description'] = description
        return headrDict
Example #4
0
 def __init__(self):
     """Attach a fresh ``AppLog`` instance to this object as ``self.logger``."""
     self.logger = AppLog()
Example #5
0
 def __init__(self):
     """Attach a fresh ``AppLog`` instance to this object as ``self.logger``."""
     self.logger=AppLog()
Example #6
0
#encoding:UTF-8
from dal.urlprovider import Urlprovider
from helper.applog import AppLog
from helper.stringhelper import stringHelper
from helper.urlhelper import UrlHelper
import sys

# Script: for every post returned by Urlprovider.getPosts(), extract the URLs
# from its text, resolve each one with UrlHelper.getRealURL() and store the
# (url, resolved url, post id) triple through Urlprovider.addURL().
logger = AppLog()
strObj = stringHelper()
ulrObj = UrlHelper()
urlpObj = Urlprovider()

try:

    posts = urlpObj.getPosts()

    for post in posts:

        postid = post['id']
        postText = post['text']
        postURLs = strObj.extractURL(postText)

        for url in postURLs:
            if url:
                # Only URLs that getRealURL() could resolve are stored.
                orgurl = ulrObj.getRealURL(url)
                if orgurl:
                    urlpObj.addURL(url, orgurl, postid)

        # The post is flagged with value 1 whether or not any URL was stored.
        urlpObj.updatePostUrlExtracted(postid, 1)

# NOTE(review): the handler body is not visible in this excerpt.
except:
Example #7
0
class Urlprovider:
    """Data-access helper for the ``posts``, ``urls`` and ``urlimgs`` tables.

    Each method creates its own ``MySQLHelper``.  If anything raises, the
    failure is reported through ``self.logger.logInfo`` and the method
    returns ``None``.
    """

    logger = None

    def __init__(self):
        """Attach a fresh ``AppLog`` instance as ``self.logger``."""
        self.logger = AppLog()

    def getPosts(self):
        """Return the rows of ``posts`` with ``urlextracted=0``, ordered by id."""
        try:
            dbObj = MySQLHelper()
            query = "select * from posts where urlextracted=0 order by id;"
            return dbObj.query(query)
        except Exception:
            self.logger.logInfo('Exception in getPosts method (dal/urlprovider.py)')

    def addURL(self, url, orgurl, postid):
        """Insert ``orgurl`` into ``urls``, or bump its counter if already stored."""
        try:
            dbObj = MySQLHelper()
            # NOTE(review): orgurl is concatenated into the statement, so a
            # URL containing a quote breaks the query and can inject SQL.
            # Switch to a parameterised read once MySQLHelper offers one.
            query = "select id,counter from urls where orgurl='" + orgurl + "'"
            result = dbObj.query(query)
            if len(result) == 0:
                tubleData = (url, orgurl, postid)
                query = "insert into urls (url,orgurl,postid) values (%s,%s,%s)"
                dbObj.executequery(query, tubleData)
            else:
                rowid = result[0]['id']
                counter = result[0]['counter']
                if counter is None:
                    counter = 0
                counter += 1
                query = "update urls set counter=" + str(counter) + " where id=" + str(rowid)
                dbObj.query(query)

        except Exception:
            self.logger.logInfo('Exception in addURL method (dal/urlprovider.py)')

    def updatePostUrlExtracted(self, postid, value):
        """Set ``posts.urlextracted`` to ``value`` for the post ``postid``."""
        try:
            dbObj = MySQLHelper()
            query = "update posts set urlextracted=" + str(value) + " where id=" + str(postid)
            return dbObj.query(query)
        except Exception:
            self.logger.logInfo('Exception in updatePostUrlExtracted method (dal/urlprovider.py)')

    def getURLs(self):
        """Return the rows of ``urls`` with ``status=0``."""
        try:
            dbObj = MySQLHelper()
            query = "select * from urls where status=0"
            return dbObj.query(query)
        except Exception:
            self.logger.logInfo('Exception in getURLs method (dal/urlprovider.py)')

    def addUrlHeaderInfo(self, urlid, metatitle, metadescription):
        """Store ``metatitle`` and ``metadescription`` on the ``urls`` row ``urlid``."""
        try:
            dbObj = MySQLHelper()
            tubleData = (metatitle, metadescription)
            query = "update urls set metatitle=%s,metadescription=%s where id=" + str(urlid)
            return dbObj.executequery(query, tubleData)
        except Exception:
            self.logger.logInfo('Exception in addUrlHeaderInfo method (dal/urlprovider.py)')

    def saveUrlIMG(self, urlid, filename):
        """Insert an ``(urlid, filename)`` row into ``urlimgs``."""
        try:
            dbObj = MySQLHelper()
            tubleData = (urlid, filename)
            query = "insert into urlimgs (urlid,filename) values (%s,%s)"
            return dbObj.executequery(query, tubleData)
        except Exception:
            self.logger.logInfo('Exception in saveUrlIMG method (dal/urlprovider.py)')

    def updateUrlStatus(self, urlid, value):
        """Set ``urls.status`` to ``value`` for the row ``urlid``."""
        try:
            dbObj = MySQLHelper()
            query = "update urls set status=" + str(value) + " where id=" + str(urlid)
            return dbObj.query(query)
        except Exception:
            self.logger.logInfo('Exception in updateUrlStatus method (dal/urlprovider.py)')

            
Example #8
0
                extension=os.path.splitext(img)[1]
                newImgName=str(uuid.uuid1())+str(extension)
                im.thumbnail(newSize, Image.ANTIALIAS)
                im.save(out_folder+"/"+newImgName,imgformat)
                os.remove(imgfile)
                saveImgList.append(newImgName)
        except:
            logger.logInfo("in urldatagrapper.py open images  : "+str(sys.exc_info()[0]))
                
    for saveimg in saveImgList:
        urlproviderObj.saveUrlIMG(urlid,saveimg)
        


                            
# Script: for every row returned by Urlprovider.getURLs() whose 'orgurl'
# passes httpExists(), fetch the page, store its title/description and hand
# the page over to getUrlImgs().
logger=AppLog()
urlproviderObj=Urlprovider()
urlHelpObj=UrlHelper()

# NOTE(review): the handler belonging to this try is not visible in this excerpt.
try:

    urls=urlproviderObj.getURLs()

    for url in urls:
        if httpExists(url['orgurl']):
            urlid=url['id']
            soup=getUrlHTMLsoup(url['orgurl'])
            # A falsy soup skips both the header update and the image step.
            if soup:
                headrDict=urlHelpObj.getHeaderInfo(soup)
                urlproviderObj.addUrlHeaderInfo(urlid,headrDict['title'], headrDict['description'])
                getUrlImgs(url['orgurl'],urlid,soup)
Example #9
0
#encoding:UTF-8
from dal.urlprovider import Urlprovider
from helper.applog import AppLog
from helper.stringhelper import stringHelper
from helper.urlhelper import UrlHelper
import sys

# Script: for every post returned by Urlprovider.getPosts(), extract the URLs
# from its text, resolve each one with UrlHelper.getRealURL() and store the
# (url, resolved url, post id) triple through Urlprovider.addURL().
logger=AppLog()
strObj=stringHelper()
ulrObj=UrlHelper()
urlpObj=Urlprovider()

try:
    
    posts=urlpObj.getPosts()
    
    for post in posts:
        
        postid=post['id']
        postText=post['text']
        postURLs=strObj.extractURL(postText)
        
        for url in postURLs:
            if url:
                # Only URLs that getRealURL() could resolve are stored.
                orgurl=ulrObj.getRealURL(url)
                if orgurl:
                    urlpObj.addURL(url, orgurl, postid)
                    
        # The post is flagged with value 1 whether or not any URL was stored.
        urlpObj.updatePostUrlExtracted(postid,1)
        
# NOTE(review): the handler body is not visible in this excerpt.
except:
Example #10
0
class Urlprovider:
    """Data-access helper for the ``posts``, ``urls`` and ``urlimgs`` tables.

    Each method creates its own ``MySQLHelper``.  If anything raises, the
    failure is reported through ``self.logger.logInfo`` and the method
    returns ``None``.
    """

    logger = None

    def __init__(self):
        """Attach a fresh ``AppLog`` instance as ``self.logger``."""
        self.logger = AppLog()

    def getPosts(self):
        """Return the rows of ``posts`` with ``urlextracted=0``, ordered by id."""
        try:
            dbObj = MySQLHelper()
            query = "select * from posts where urlextracted=0 order by id;"
            return dbObj.query(query)
        except Exception:
            self.logger.logInfo(
                'Exception in getPosts method (dal/urlprovider.py)')

    def addURL(self, url, orgurl, postid):
        """Insert ``orgurl`` into ``urls``, or bump its counter if already stored."""
        try:
            dbObj = MySQLHelper()
            # NOTE(review): orgurl is concatenated into the statement, so a
            # URL containing a quote breaks the query and can inject SQL.
            # Switch to a parameterised read once MySQLHelper offers one.
            query = "select id,counter from urls where orgurl='" + orgurl + "'"
            result = dbObj.query(query)
            if len(result) == 0:
                tubleData = (url, orgurl, postid)
                query = "insert into urls (url,orgurl,postid) values (%s,%s,%s)"
                dbObj.executequery(query, tubleData)
            else:
                rowid = result[0]['id']
                counter = result[0]['counter']
                if counter is None:
                    counter = 0
                counter += 1
                query = "update urls set counter=" + str(
                    counter) + " where id=" + str(rowid)
                dbObj.query(query)

        except Exception:
            self.logger.logInfo(
                'Exception in addURL method (dal/urlprovider.py)')

    def updatePostUrlExtracted(self, postid, value):
        """Set ``posts.urlextracted`` to ``value`` for the post ``postid``."""
        try:
            dbObj = MySQLHelper()
            query = "update posts set urlextracted=" + str(
                value) + " where id=" + str(postid)
            return dbObj.query(query)
        except Exception:
            self.logger.logInfo(
                'Exception in updatePostUrlExtracted method (dal/urlprovider.py)'
            )

    def getURLs(self):
        """Return the rows of ``urls`` with ``status=0``."""
        try:
            dbObj = MySQLHelper()
            query = "select * from urls where status=0"
            return dbObj.query(query)
        except Exception:
            self.logger.logInfo(
                'Exception in getURLs method (dal/urlprovider.py)')

    def addUrlHeaderInfo(self, urlid, metatitle, metadescription):
        """Store ``metatitle`` and ``metadescription`` on the ``urls`` row ``urlid``."""
        try:
            dbObj = MySQLHelper()
            tubleData = (metatitle, metadescription)
            query = "update urls set metatitle=%s,metadescription=%s where id=" + str(
                urlid)
            return dbObj.executequery(query, tubleData)
        except Exception:
            self.logger.logInfo(
                'Exception in addUrlHeaderInfo method (dal/urlprovider.py)')

    def saveUrlIMG(self, urlid, filename):
        """Insert an ``(urlid, filename)`` row into ``urlimgs``."""
        try:
            dbObj = MySQLHelper()
            tubleData = (urlid, filename)
            query = "insert into urlimgs (urlid,filename) values (%s,%s)"
            return dbObj.executequery(query, tubleData)
        except Exception:
            self.logger.logInfo(
                'Exception in saveUrlIMG method (dal/urlprovider.py)')

    def updateUrlStatus(self, urlid, value):
        """Set ``urls.status`` to ``value`` for the row ``urlid``."""
        try:
            dbObj = MySQLHelper()
            query = "update urls set status=" + str(
                value) + " where id=" + str(urlid)
            return dbObj.query(query)
        except Exception:
            self.logger.logInfo(
                'Exception in updateUrlStatus method (dal/urlprovider.py)')
Example #11
0
            else:
                extension = os.path.splitext(img)[1]
                newImgName = str(uuid.uuid1()) + str(extension)
                im.thumbnail(newSize, Image.ANTIALIAS)
                im.save(out_folder + "/" + newImgName, imgformat)
                os.remove(imgfile)
                saveImgList.append(newImgName)
        except:
            logger.logInfo("in urldatagrapper.py open images  : " +
                           str(sys.exc_info()[0]))

    for saveimg in saveImgList:
        urlproviderObj.saveUrlIMG(urlid, saveimg)


# Script: for every row returned by Urlprovider.getURLs() whose 'orgurl'
# passes httpExists(), fetch the page and store its title/description.
logger = AppLog()
urlproviderObj = Urlprovider()
urlHelpObj = UrlHelper()

# NOTE(review): the rest of this try block and its handler are not visible
# in this excerpt.
try:

    urls = urlproviderObj.getURLs()

    for url in urls:
        if httpExists(url['orgurl']):
            urlid = url['id']
            soup = getUrlHTMLsoup(url['orgurl'])
            # A falsy soup skips the header update.
            if soup:
                headrDict = urlHelpObj.getHeaderInfo(soup)
                urlproviderObj.addUrlHeaderInfo(urlid, headrDict['title'],
                                                headrDict['description'])