Beispiel #1
0
def getUrlImgs(url, urlid, soup):
    """Download the images referenced by a page and pass them to imageProcessing.

    For every ``<img>`` tag in *soup* whose ``src`` ends in one of the
    accepted image extensions, the image is fetched with ``retriveImg`` into
    the folder read from the ``files``/``img`` configuration entry.

    A ``src`` accepted by ``UrlHelper.isUrl`` is fetched as-is.  Any other
    ``src`` is tried as a replacement for the path component of *url*: first
    on its own, then prefixed with each non-empty path segment of *url* in
    turn.  The first candidate for which ``httpExists`` returns 1 is fetched
    and the remaining candidates are skipped, so an image is downloaded and
    recorded at most once per tag.

    Args:
        url: Address of the page that *soup* was parsed from.
        urlid: Identifier forwarded unchanged to ``imageProcessing``.
        soup: Parsed document exposing ``findAll("img")``.

    Returns:
        None.  The collected file names are run through ``f7`` and handed to
        ``imageProcessing`` together with *urlid*.
    """
    config = Config()
    myConfiguration = config.getConfParser()
    out_folder = myConfiguration.get('files', 'img')

    allowedExtensions = ('.gif', '.jpg', '.png', '.bmp', '.jpeg', '.tif',
                         '.tiff')

    imagelist = []
    urlHelpObj = UrlHelper()
    parsed = list(urlparse.urlparse(url))
    pathParsed = parsed[2]
    # The page path never changes, so split it once instead of once per image.
    virDirs = [segment for segment in pathParsed.split('/') if segment]

    for image in soup.findAll("img"):
        try:
            src = image['src']
            if not src:
                continue

            filename = src.split("/")[-1]
            outpath = os.path.join(out_folder, filename)
            extension = os.path.splitext(outpath)[1]
            if extension.lower() not in allowedExtensions:
                continue

            if urlHelpObj.isUrl(src):
                retriveImg(src, outpath)
                imagelist.append(filename)
                continue

            # Relative src: try it as the path itself, then under each
            # path segment of the page URL, in that order.
            candidatePaths = [src] + [
                segment + "/" + src for segment in virDirs
            ]
            for candidatePath in candidatePaths:
                parsed[2] = candidatePath
                candidateUrl = urlparse.urlunparse(parsed)
                if httpExists(candidateUrl) == 1:
                    retriveImg(candidateUrl, outpath)
                    imagelist.append(filename)
                    # Stop at the first hit; continuing would re-download
                    # into the same outpath and append a duplicate name.
                    break
        except Exception:
            # Best-effort per image: a missing src attribute or a failed
            # existence check/download is logged and the loop moves on.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            logger.logInfo("no src found in img tag  : " +
                           str(sys.exc_info()[0]))

    imagelist = f7(imagelist)
    imageProcessing(urlid, imagelist)
Beispiel #2
0
def getUrlImgs(url, urlid, soup):
    """Download the images referenced by a page and pass them to imageProcessing.

    For every ``<img>`` tag in *soup* whose ``src`` ends in one of the
    accepted image extensions, the image is fetched with ``retriveImg`` into
    the folder read from the ``files``/``img`` configuration entry.

    A ``src`` accepted by ``UrlHelper.isUrl`` is fetched as-is.  Any other
    ``src`` is tried as a replacement for the path component of *url*: first
    on its own, then prefixed with each non-empty path segment of *url* in
    turn.  The first candidate for which ``httpExists`` returns 1 is fetched
    and the remaining candidates are skipped, so an image is downloaded and
    recorded at most once per tag.

    Args:
        url: Address of the page that *soup* was parsed from.
        urlid: Identifier forwarded unchanged to ``imageProcessing``.
        soup: Parsed document exposing ``findAll("img")``.

    Returns:
        None.  The collected file names are run through ``f7`` and handed to
        ``imageProcessing`` together with *urlid*.
    """
    config = Config()
    myConfiguration = config.getConfParser()
    out_folder = myConfiguration.get('files', 'img')

    allowedExtensions = ('.gif', '.jpg', '.png', '.bmp', '.jpeg', '.tif',
                         '.tiff')

    imagelist = []
    urlHelpObj = UrlHelper()
    parsed = list(urlparse.urlparse(url))
    pathParsed = parsed[2]
    # The page path never changes, so split it once instead of once per image.
    virDirs = [segment for segment in pathParsed.split('/') if segment]

    for image in soup.findAll("img"):
        try:
            src = image['src']
            if not src:
                continue

            filename = src.split("/")[-1]
            outpath = os.path.join(out_folder, filename)
            extension = os.path.splitext(outpath)[1]
            if extension.lower() not in allowedExtensions:
                continue

            if urlHelpObj.isUrl(src):
                retriveImg(src, outpath)
                imagelist.append(filename)
                continue

            # Relative src: try it as the path itself, then under each
            # path segment of the page URL, in that order.
            candidatePaths = [src] + [
                segment + "/" + src for segment in virDirs
            ]
            for candidatePath in candidatePaths:
                parsed[2] = candidatePath
                candidateUrl = urlparse.urlunparse(parsed)
                if httpExists(candidateUrl) == 1:
                    retriveImg(candidateUrl, outpath)
                    imagelist.append(filename)
                    # Stop at the first hit; continuing would re-download
                    # into the same outpath and append a duplicate name.
                    break
        except Exception:
            # Best-effort per image: a missing src attribute or a failed
            # existence check/download is logged and the loop moves on.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            logger.logInfo("no src found in img tag  : "+str(sys.exc_info()[0]))

    imagelist = f7(imagelist)
    imageProcessing(urlid, imagelist)
Beispiel #3
0
#encoding:UTF-8
from dal.urlprovider import Urlprovider
from helper.applog import AppLog
from helper.stringhelper import stringHelper
from helper.urlhelper import UrlHelper
import sys

# Helper instances used by the script below (logger is not referenced in the
# visible part of this excerpt).
logger = AppLog()
strObj = stringHelper()
ulrObj = UrlHelper()
urlpObj = Urlprovider()

# For every post returned by the provider: extract the URLs from its text,
# resolve each one with getRealURL and hand the pair to addURL.
try:

    posts = urlpObj.getPosts()

    for post in posts:

        postid = post['id']
        postText = post['text']
        postURLs = strObj.extractURL(postText)

        for url in postURLs:
            # Skip falsy entries from extractURL.
            if url:
                orgurl = ulrObj.getRealURL(url)
                # Only pairs with a truthy resolved URL are passed on.
                if orgurl:
                    urlpObj.addURL(url, orgurl, postid)

        # Called once per post after its URL loop; the 1 is presumably an
        # "extracted" flag -- confirm against Urlprovider.
        urlpObj.updatePostUrlExtracted(postid, 1)

# NOTE(review): bare except; its handler body is not visible in this excerpt.
except:
Beispiel #4
0
                im.thumbnail(newSize, Image.ANTIALIAS)
                im.save(out_folder+"/"+newImgName,imgformat)
                os.remove(imgfile)
                saveImgList.append(newImgName)
        except:
            logger.logInfo("in urldatagrapper.py open images  : "+str(sys.exc_info()[0]))
                
    for saveimg in saveImgList:
        urlproviderObj.saveUrlIMG(urlid,saveimg)
        


                            
# Helper instances used by the script below (logger is not referenced in the
# visible part of this excerpt).
logger=AppLog()
urlproviderObj=Urlprovider()
urlHelpObj=UrlHelper()

# For every URL record from the provider that passes httpExists: fetch and
# parse the page, store its header info, collect its images, update status.
try:

    urls=urlproviderObj.getURLs()

    for url in urls:
        # NOTE(review): any truthy httpExists result is accepted here, while
        # getUrlImgs compares the result to 1 -- confirm the return contract.
        if httpExists(url['orgurl']):
            urlid=url['id']
            soup=getUrlHTMLsoup(url['orgurl'])
            if soup:
                headrDict=urlHelpObj.getHeaderInfo(soup)
                urlproviderObj.addUrlHeaderInfo(urlid,headrDict['title'], headrDict['description'])
                getUrlImgs(url['orgurl'],urlid,soup)
                # The 1 is presumably a "processed" status -- confirm against
                # Urlprovider.
                urlproviderObj.updateUrlStatus(urlid,1)
            # Branch for a falsy soup; its body is not visible in this excerpt.
            else:
#encoding:UTF-8
from dal.urlprovider import Urlprovider
from helper.applog import AppLog
from helper.stringhelper import stringHelper
from helper.urlhelper import UrlHelper
import sys

# Helper instances used by the script below (logger is not referenced in the
# visible part of this excerpt).
logger=AppLog()
strObj=stringHelper()
ulrObj=UrlHelper()
urlpObj=Urlprovider()

# For every post returned by the provider: extract the URLs from its text,
# resolve each one with getRealURL and hand the pair to addURL.
try:
    
    posts=urlpObj.getPosts()
    
    for post in posts:
        
        postid=post['id']
        postText=post['text']
        postURLs=strObj.extractURL(postText)
        
        for url in postURLs:
            # Skip falsy entries from extractURL.
            if url:
                orgurl=ulrObj.getRealURL(url)
                # Only pairs with a truthy resolved URL are passed on.
                if orgurl:
                    urlpObj.addURL(url, orgurl, postid)
                    
        # Called once per post after its URL loop; the 1 is presumably an
        # "extracted" flag -- confirm against Urlprovider.
        urlpObj.updatePostUrlExtracted(postid,1)
        
# NOTE(review): bare except; its handler body is not visible in this excerpt.
except:
Beispiel #6
0
                newImgName = str(uuid.uuid1()) + str(extension)
                im.thumbnail(newSize, Image.ANTIALIAS)
                im.save(out_folder + "/" + newImgName, imgformat)
                os.remove(imgfile)
                saveImgList.append(newImgName)
        except:
            logger.logInfo("in urldatagrapper.py open images  : " +
                           str(sys.exc_info()[0]))

    for saveimg in saveImgList:
        urlproviderObj.saveUrlIMG(urlid, saveimg)


logger = AppLog()
urlproviderObj = Urlprovider()
urlHelpObj = UrlHelper()

try:

    urls = urlproviderObj.getURLs()

    for url in urls:
        if httpExists(url['orgurl']):
            urlid = url['id']
            soup = getUrlHTMLsoup(url['orgurl'])
            if soup:
                headrDict = urlHelpObj.getHeaderInfo(soup)
                urlproviderObj.addUrlHeaderInfo(urlid, headrDict['title'],
                                                headrDict['description'])
                getUrlImgs(url['orgurl'], urlid, soup)
                urlproviderObj.updateUrlStatus(urlid, 1)