# Standard-library imports used by the functions below; the project-local
# modules referenced here (infoModule, log, timeout, strip_html, find_title,
# real_title2, body_extractor, html_body_extractor, largestBlock,
# find_all_images, find_all_videos, unicodeMapper) and the helpers
# getSiteContents, throwGenericError, returnStructure, and failOn are
# defined elsewhere in this package.
import json
import pprint
import re
import urllib2

import chardet


def genericImageExtract(url=None, account_id=None):
    ## function for generic image sites. Just find a big image, and make sure
    ## it's bigger than min x and min y
    min_x = 400
    min_y = 400
    biggest = {'url': '', 'width': 0, 'height': 0}
    next_biggest = {'url': '', 'width': 0, 'height': 0}
    if url is None:
        return throwGenericError()
    contents = getSiteContents(url)
    if not contents:
        return throwGenericError()
    image_list = find_all_images.findImages(contents, url)
    #print "images for " + url
    #pprint.pprint(image_list)
    if image_list is not None:
        ## if there is one big image and the rest are x% less big than it, go for it
        ## the requirement for the % calculation is that BOTH dimensions be
        ## greater than 50% of the maximum
        for image in image_list:
            if (image['width'] > min_x and image['height'] > min_y
                    and image['width'] > biggest['width']
                    and image['height'] > biggest['height']):
                # the previous leader becomes the runner-up
                if (biggest['width'] > next_biggest['width']
                        and biggest['height'] > next_biggest['height']):
                    next_biggest = biggest
                biggest = image
            elif (image['width'] > next_biggest['width']
                    and image['height'] > next_biggest['height']):
                next_biggest = image
        if biggest['url'] != '':
            x_ratio = float(next_biggest['width']) / float(biggest['width'])
            y_ratio = float(next_biggest['height']) / float(biggest['height'])
            if y_ratio < 0.51 or x_ratio < 0.51:
                # we have a winner
                #print "biggest image: " + biggest['url']
                #print "next biggest: " + next_biggest['url']
                #print "x ratio = " + str(x_ratio)
                #print "y ratio = " + str(y_ratio)
                return json.dumps(returnStructure(url, biggest['url'], '',
                                                  biggest['width'],
                                                  biggest['height']))
    return throwGenericError()
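# The dominant-image selection loop above is easiest to verify in isolation.
# Below is a minimal sketch of the same heuristic as a pure function;
# pickDominantImage and the sample dicts are hypothetical and exist only for
# illustration, but the dict shape (url/width/height) matches what
# find_all_images.findImages returns above.
def pickDominantImage(image_list, min_x=400, min_y=400, ratio_cap=0.51):
    """Return the image dict that dominates the page, or None. An image wins
    if it clears the minimum size and the runner-up is under ratio_cap of its
    size in at least one dimension."""
    biggest = {'url': '', 'width': 0, 'height': 0}
    next_biggest = {'url': '', 'width': 0, 'height': 0}
    for image in image_list:
        if (image['width'] > min_x and image['height'] > min_y
                and image['width'] > biggest['width']
                and image['height'] > biggest['height']):
            if (biggest['width'] > next_biggest['width']
                    and biggest['height'] > next_biggest['height']):
                next_biggest = biggest
            biggest = image
        elif (image['width'] > next_biggest['width']
                and image['height'] > next_biggest['height']):
            next_biggest = image
    if biggest['url'] == '':
        return None
    x_ratio = float(next_biggest['width']) / float(biggest['width'])
    y_ratio = float(next_biggest['height']) / float(biggest['height'])
    if x_ratio < ratio_cap or y_ratio < ratio_cap:
        return biggest
    return None

if __name__ == '__main__':
    # hypothetical sample data: one hero image plus two thumbnails
    sample = [
        {'url': 'http://example.com/hero.jpg', 'width': 900, 'height': 600},
        {'url': 'http://example.com/thumb1.jpg', 'width': 120, 'height': 90},
        {'url': 'http://example.com/thumb2.jpg', 'width': 160, 'height': 120},
    ]
    print pickDominantImage(sample)  # the hero image dominates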
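# For context, a caller treats genericImageExtract's return value as a JSON
# string. A hedged usage sketch: the URL is hypothetical, and the exact payload
# shape depends on returnStructure and throwGenericError, which are defined
# elsewhere in this module.
#
#     response = genericImageExtract('http://example.com/photo-page')
#     data = json.loads(response)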
def fetchStory(url):
    siteDB = 'peepbuzz'
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)
    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header('User-agent',
                           'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)')
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)

    responseCode = websock.getcode()
    headerInfo = websock.info()
    pprint.pprint(headerInfo)
    log.plog('urllib2 response code: ' + str(responseCode), 2)
    if responseCode not in (200, 301, 302, 303):
        log.plog('got failure response code from server', 4)
        return failOn('got failure response code from server')

    contentType = headerInfo.gettype()
    if contentType not in ('text/html', 'text/html, text/html'):
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return failOn('content type: ' + contentType + '. not fetching')

    # read with a 5-second cap so a stalled socket can't hang the fetch
    readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
    try:
        infoModule.info.page['rawHTML'] = readWithTimeout()
    except timeout.TimeoutFunctionException:
        log.plog("timeout while trying to fetch " + url, 101)
        return failOn('read timeout ' + url)

    redirURL = websock.geturl()
    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        url = redirURL
        #redirected urls need to be blocked too

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    # decode using the detected charset; if the result contains C1-range
    # "smart" punctuation, the page was really windows-1252
    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    cd = chardet.detect(infoModule.info.page['rawHTML'])
    if cd['encoding'] != 'ascii':
        log.plog('Server encoding: ' + cd['encoding'], 2)
        oldHTML = infoModule.info.page['rawHTML']
        infoModule.info.page['rawHTML'] = infoModule.info.page['rawHTML'].decode(cd['encoding'])
        windows_chars_in_html = [trouble for trouble in windows_trouble_list
                                 if infoModule.info.page['rawHTML'].find(trouble) >= 0]
        if len(windows_chars_in_html) > 0:
            log.plog('this is actually windows-1252', 3)
            infoModule.info.page['rawHTML'] = oldHTML.decode('windows-1252')

    # some configuration options
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(r'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
                            infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search is not None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog('======================================= TITLE ================================', 2)
    # get title: set HTMLTitle first
    HTMLTitle = re.search(r'<title>(.*?)</title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle is not None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""

    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    #cd = chardet.detect(infoModule.info.page['title'])
    #if cd['encoding'] != 'ascii':
    #    log.plog('title encoding: ' + cd['encoding'], 2)
    #    oldTitle = infoModule.info.page['title']
    #    infoModule.info.page['title'] = infoModule.info.page['title'].decode(cd['encoding'])
    #    windows_chars_in_html = [trouble for trouble in windows_trouble_list if infoModule.info.page['title'].find(trouble) >= 0]
    #    if len(windows_chars_in_html) > 0:
    #        log.plog('title is actually windows-1252', 3)
    #        infoModule.info.page['title'] = oldTitle.decode('windows-1252')

    log.plog('======================================= OUTLINE ================================', 2)
    ## fetch outline
    #remove special case elements from the html. These are lines or blocks of code that cause
    #problems if left in
    infoModule.info.page['plainText'] = strip_html.removeSpecialCases(infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['plainText'])
    #clearHTML can't take out title, because title gets passed to clearHTML, but it should be removed here
    infoModule.info.page['plainText'] = re.sub(r'<title.*?</title.*?>', '',
                                               infoModule.info.page['plainText'], 0,
                                               re.I | re.S | re.M)
    #this toggle allows for ignoring regex in favor of body_extractor
    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False)
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline is not None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML is not None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        infoModule.info.page['outline'] = outline
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(
            infoModule.info.page['imageHTML'], infoModule.info.page['outline'])
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''
        #return failOn('could not create an outline for this story!')

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if 0 < len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog('======================================= IMAGES ================================', 2)
    #find images
    image_start_marker = ''
    image_end_marker = ''
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'], url)
    if imageArray is None:
        log.plog('could not find image', 3)
        imageArray = []

    log.plog('======================================= VIDEOS ================================', 2)
    ###look for videos
    allVideosJSON = find_all_videos.find_all_videos(infoModule.info.page['imageHTML'])
    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print allVideosJSON

    #if no outline and no images and no videos, then no story
    if infoModule.info.page['outline'] == '' and imageArray == [] and allVideos == []:
        return failOn('nothing found')

    #if there is no outline, the largest image must be at least minImageSize
    #on each side to make this an image page
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        if image['width'] * image['height'] > largestImageDimensions:
            largestImage = image
            largestImageDimensions = image['width'] * image['height']
    print largestImage

    minImageSize = 400
    if infoModule.info.page['outline'] == '' and allVideos == [] and (
            largestImage == [] or largestImage['width'] < minImageSize
            or largestImage['height'] < minImageSize):
        return failOn('no story or video found, and largest image less than min size')

    status = 'OK'
    storyObj = {}
    storyObj['title'] = infoModule.info.page['title']
    storyObj['outline'] = unicodeMapper.clearCurlies(infoModule.info.page['outline'])
    storyObj['url'] = url
    storyObj['images'] = imageArray
    storyObj['videos'] = allVideos
    returnVal = {"status": status, "story": storyObj}
    output = json.dumps(returnVal)
    return output
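# fetchStory returns a JSON string. The success payload is the one built
# above (status plus a story object with title, outline, url, images, and
# videos); the shape of failOn's return value isn't shown in this excerpt.
# A hedged usage sketch with a hypothetical URL:
#
#     payload = json.loads(fetchStory('http://example.com/some-article'))
#     if payload.get('status') == 'OK':
#         story = payload['story']
#         print story['title']
#         print 'images found: ' + str(len(story['images']))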
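# The chardet fallback in fetchStory (decode with the detected encoding, then
# re-decode as windows-1252 if C1-range smart punctuation shows up) is generic
# enough to pull out. A minimal standalone sketch of that technique, assuming
# only that chardet is installed; decodeWithWindowsFallback is a hypothetical
# name, and unlike the code above this sketch also guards against chardet
# returning no encoding and uses 'replace' rather than strict decoding.

# same telltale codepoints as windows_trouble_list in fetchStory: curly quotes
# and dashes that land in the C1 control range when windows-1252 bytes are
# mislabeled as a latin-1 family encoding
WINDOWS_TROUBLE_CHARS = [u'\x91', u'\x92', u'\x93', u'\x94', u'\x96']

def decodeWithWindowsFallback(raw):
    """Decode raw bytes using chardet's guess, re-decoding as windows-1252
    when the telltale smart-punctuation codepoints appear."""
    cd = chardet.detect(raw)
    if cd['encoding'] is None or cd['encoding'] == 'ascii':
        return raw
    text = raw.decode(cd['encoding'], 'replace')
    if any(text.find(ch) >= 0 for ch in WINDOWS_TROUBLE_CHARS):
        # chardet commonly guesses ISO-8859-1 for windows-1252 input; real
        # text almost never contains these C1 codepoints, so their presence
        # means the guess was wrong
        text = raw.decode('windows-1252', 'replace')
    return text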