def getUrlImgs(url, urlid, soup):
    """Download every image referenced by ``<img>`` tags in *soup*.

    Resolution order for each ``src`` attribute:
      1. absolute URL  -> fetched directly;
      2. site-root-relative (src spliced into the parsed page URL);
      3. each non-empty virtual directory of the page path, stopping at
         the first candidate that ``httpExists``.

    Saved filenames are de-duplicated with ``f7`` (order preserved) and
    handed to ``imageProcessing`` together with *urlid*.

    :param url:   page URL the images were found on (used to build
                  candidate URLs for relative ``src`` values)
    :param urlid: DB id of the page, forwarded to ``imageProcessing``
    :param soup:  BeautifulSoup document of the page
    """
    config = Config()
    myConfiguration = config.getConfParser()
    out_folder = myConfiguration.get('files', 'img')
    imagelist = []
    urlHelpObj = UrlHelper()
    parsed = list(urlparse.urlparse(url))
    pathParsed = parsed[2]
    # Only fetch files whose extension marks them as raster images.
    allowed_exts = ('.gif', '.jpg', '.png', '.bmp', '.jpeg', '.tif', '.tiff')
    for image in soup.findAll("img"):
        try:
            src = image['src']  # raises KeyError when the tag has no src
            if not src:
                continue
            filename = src.split("/")[-1]
            outpath = os.path.join(out_folder, filename)
            extension = os.path.splitext(outpath)[1]
            if extension.lower() not in allowed_exts:
                continue
            if urlHelpObj.isUrl(src):
                # Already absolute: fetch as-is.
                retriveImg(src, outpath)
                imagelist.append(filename)
                continue
            # Try the src as a site-root-relative path first.
            parsed[2] = src
            candidate = urlparse.urlunparse(parsed)
            if httpExists(candidate) == 1:
                retriveImg(candidate, outpath)
                imagelist.append(filename)
            else:
                # Fall back to prefixing each virtual directory of the
                # page's own path.
                for segment in pathParsed.split('/'):
                    if not segment:
                        continue
                    parsed[2] = segment + "/" + src
                    candidate = urlparse.urlunparse(parsed)
                    if httpExists(candidate) == 1:
                        retriveImg(candidate, outpath)
                        imagelist.append(filename)
                        # First hit wins; keeping going would just
                        # re-download the same file over outpath.
                        break
        except Exception:
            # Missing src attribute or a failed fetch: log and move on,
            # one bad tag must not abort the whole page.
            logger.logInfo("no src found in img tag : " + str(sys.exc_info()[0]))
    imagelist = f7(imagelist)  # order-preserving de-duplication
    imageProcessing(urlid, imagelist)
def getUrlImgs(url, urlid, soup):
    """Fetch the images referenced by the page's <img> tags.

    Absolute src values are fetched directly; relative ones are probed
    first against the site root and then under every virtual directory
    of the page path. Collected filenames are de-duplicated via f7 and
    passed to imageProcessing with urlid.
    """
    cfg = Config().getConfParser()
    img_dir = cfg.get('files', 'img')
    found = []
    helper = UrlHelper()
    parts = list(urlparse.urlparse(url))
    base_path = parts[2]
    for tag in soup.findAll("img"):
        try:
            src = tag['src']
            if src:
                name = tag["src"].split("/")[-1]
                target = os.path.join(img_dir, name)
                ext = os.path.splitext(target)[1]
                if ext.lower() in ['.gif', '.jpg', '.png', '.bmp', '.jpeg', '.tif', '.tiff']:
                    if helper.isUrl(src):
                        # src is already a full URL.
                        retriveImg(src, target)
                        found.append(name)
                    else:
                        # Probe the src relative to the site root.
                        parts[2] = tag["src"]
                        if httpExists(urlparse.urlunparse(parts)) == 1:
                            retriveImg(urlparse.urlunparse(parts), target)
                            found.append(name)
                        else:
                            # Probe under each virtual directory of the page.
                            for piece in base_path.split('/'):
                                if piece:
                                    parts[2] = piece + "/" + tag["src"]
                                    if httpExists(urlparse.urlunparse(parts)) == 1:
                                        retriveImg(urlparse.urlunparse(parts), target)
                                        found.append(name)
        except:
            logger.logInfo("no src found in img tag : " + str(sys.exc_info()[0]))
    found = f7(found)
    imageProcessing(urlid, found)
#encoding:UTF-8 from dal.urlprovider import Urlprovider from helper.applog import AppLog from helper.stringhelper import stringHelper from helper.urlhelper import UrlHelper import sys logger = AppLog() strObj = stringHelper() ulrObj = UrlHelper() urlpObj = Urlprovider() try: posts = urlpObj.getPosts() for post in posts: postid = post['id'] postText = post['text'] postURLs = strObj.extractURL(postText) for url in postURLs: if url: orgurl = ulrObj.getRealURL(url) if orgurl: urlpObj.addURL(url, orgurl, postid) urlpObj.updatePostUrlExtracted(postid, 1) except:
im.thumbnail(newSize, Image.ANTIALIAS) im.save(out_folder+"/"+newImgName,imgformat) os.remove(imgfile) saveImgList.append(newImgName) except: logger.logInfo("in urldatagrapper.py open images : "+str(sys.exc_info()[0])) for saveimg in saveImgList: urlproviderObj.saveUrlIMG(urlid,saveimg) logger=AppLog() urlproviderObj=Urlprovider() urlHelpObj=UrlHelper() try: urls=urlproviderObj.getURLs() for url in urls: if httpExists(url['orgurl']): urlid=url['id'] soup=getUrlHTMLsoup(url['orgurl']) if soup: headrDict=urlHelpObj.getHeaderInfo(soup) urlproviderObj.addUrlHeaderInfo(urlid,headrDict['title'], headrDict['description']) getUrlImgs(url['orgurl'],urlid,soup) urlproviderObj.updateUrlStatus(urlid,1) else:
#encoding:UTF-8 from dal.urlprovider import Urlprovider from helper.applog import AppLog from helper.stringhelper import stringHelper from helper.urlhelper import UrlHelper import sys logger=AppLog() strObj=stringHelper() ulrObj=UrlHelper() urlpObj=Urlprovider() try: posts=urlpObj.getPosts() for post in posts: postid=post['id'] postText=post['text'] postURLs=strObj.extractURL(postText) for url in postURLs: if url: orgurl=ulrObj.getRealURL(url) if orgurl: urlpObj.addURL(url, orgurl, postid) urlpObj.updatePostUrlExtracted(postid,1) except:
newImgName = str(uuid.uuid1()) + str(extension) im.thumbnail(newSize, Image.ANTIALIAS) im.save(out_folder + "/" + newImgName, imgformat) os.remove(imgfile) saveImgList.append(newImgName) except: logger.logInfo("in urldatagrapper.py open images : " + str(sys.exc_info()[0])) for saveimg in saveImgList: urlproviderObj.saveUrlIMG(urlid, saveimg) logger = AppLog() urlproviderObj = Urlprovider() urlHelpObj = UrlHelper() try: urls = urlproviderObj.getURLs() for url in urls: if httpExists(url['orgurl']): urlid = url['id'] soup = getUrlHTMLsoup(url['orgurl']) if soup: headrDict = urlHelpObj.getHeaderInfo(soup) urlproviderObj.addUrlHeaderInfo(urlid, headrDict['title'], headrDict['description']) getUrlImgs(url['orgurl'], urlid, soup) urlproviderObj.updateUrlStatus(urlid, 1)