def fetchStory(url):
    """Download ``url`` and return a JSON string describing the story on it.

    The page is fetched with urllib2, decoded with the encoding guessed by
    chardet, and then searched for a title, a text outline, images and
    videos.  Intermediate results are written to ``infoModule.info.page``
    under the keys ``url``, ``rawHTML``, ``meta_description``, ``HTMLTitle``,
    ``title``, ``plainText``, ``imageHTML`` and ``outline``; the helper
    modules called here are handed no page argument in several places, so
    they presumably read those keys -- verify against the helpers.

    Args:
        url: address of the page to fetch.

    Returns:
        On success, the JSON encoding of ``{"status": "OK", "story": {...}}``
        where the story has the keys ``title``, ``outline``, ``url`` (the
        final URL after redirects), ``images`` and ``videos``.  On every
        failure path, whatever ``failOn(message)`` returns.
    """
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)

    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header(
        'User-agent',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; '
        '.NET CLR 2.0.50727; .NET CLR 3.0.04506.30)')
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)

    # Release the connection on every path, including the early returns.
    try:
        responseCode = websock.getcode()
        headerInfo = websock.info()
        pprint.pprint(headerInfo)
        log.plog('urllib2 response code: ' + str(responseCode), 2)
        if responseCode not in (200, 301, 302, 303):
            log.plog('got failure response code from server', 4)
            return failOn('got failure response code from server')

        contentType = headerInfo.gettype()
        if contentType not in ('text/html', 'text/html, text/html'):
            log.plog('content type: ' + contentType + '. not fetching', 4)
            return failOn('content type: ' + contentType + '. not fetching')

        readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
        try:
            rawHTML = readWithTimeout()
        except timeout.TimeoutFunctionException:
            log.plog("timeout while trying to fetch " + url, 101)
            return failOn('read timeout ' + url)
        redirURL = websock.geturl()
    finally:
        websock.close()

    infoModule.info.page['rawHTML'] = rawHTML
    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        #redirected urls need to be blocked too
        url = redirURL

    if len(rawHTML) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    # U+0091..U+0096 are control characters that should not occur in real
    # text.  Finding them after decoding means bytes 0x91..0x96 were passed
    # through unchanged, and in windows-1252 those bytes are the curly quotes
    # and the en dash.
    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    encoding = chardet.detect(rawHTML)['encoding']
    # chardet reports None when it cannot guess; leave the bytes untouched
    # in that case, exactly as for plain ascii.
    if encoding is not None and encoding != 'ascii':
        log.plog('Server encoding: ' + encoding, 2)
        # 'replace' keeps one stray byte from aborting the whole fetch;
        # input that decodes cleanly gives the same result as strict mode.
        decodedHTML = rawHTML.decode(encoding, 'replace')
        if any(trouble in decodedHTML for trouble in windows_trouble_list):
            log.plog('this is actually windows-1252', 3)
            decodedHTML = rawHTML.decode('windows-1252', 'replace')
        infoModule.info.page['rawHTML'] = decodedHTML

    # some configuration options
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(
        r'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search is not None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog(
            "meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog(
        '======================================= TITLE ================================',
        2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search(r'<title>(.*?)<\/title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle is not None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""

    # NOTE(review): as written, the title returned by findTitle() is always
    # replaced by one of the two branches below; presumably findTitle() is
    # called for its effect on page['potential_title'] -- confirm whether the
    # second test was meant to be an ``elif``.
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if ('potential_title' in infoModule.info.page
            and len(infoModule.info.page['potential_title']) > 0):
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(
        infoModule.info.page['title'])
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog(
        '======================================= OUTLINE ================================',
        2)
    ## fetch outline
    #remove special case elements from the html. These are lines or blocks
    #of code that cause problems if left in
    plainText = strip_html.removeSpecialCases(infoModule.info.page['rawHTML'])
    plainText = strip_html.clearHTML(plainText)
    #clearHTML can't take out title, because title gets passed to clearHTML,
    #but it should be removed here
    plainText = re.sub('<title.*?</title.*?>', '', plainText, 0,
                       re.I | re.S | re.M)
    infoModule.info.page['plainText'] = plainText

    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(plainText, doAsciiConvert=False)
    # Keep the full document for the image/video search; rawHTML may be
    # narrowed to the story body just below.
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline is not None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(
            infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML is not None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(
            infoModule.info.page['imageHTML'], outline)
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if 0 < len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog(
        '======================================= IMAGES ================================',
        2)
    #find images
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'],
                                            url)
    if imageArray is None:
        log.plog('could not find image', 3)
        # '' (not []) is what has always been emitted under "images" here.
        imageArray = ''

    log.plog(
        '======================================= VIDEOS ================================',
        2)
    ###look for videos
    allVideosJSON = find_all_videos.find_all_videos(
        infoModule.info.page['imageHTML'])
    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print(allVideosJSON)

    #if no outline and no images and no videos, then no story
    if (infoModule.info.page['outline'] == '' and not imageArray
            and not allVideos):
        return failOn('nothing found')

    #with no outline and no video, the largest image must be at least
    #minImageSize on each side to make this an image page
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        area = image['width'] * image['height']
        if area > largestImageDimensions:
            largestImage = image
            largestImageDimensions = area
    print(largestImage)

    minImageSize = 400
    if infoModule.info.page['outline'] == '' and not allVideos and (
            not largestImage or largestImage['width'] < minImageSize
            or largestImage['height'] < minImageSize):
        return failOn(
            'no story or video found, and largest image less than min size')

    storyObj = {
        'title': infoModule.info.page['title'],
        'outline': unicodeMapper.clearCurlies(infoModule.info.page['outline']),
        'url': url,
        'images': imageArray,
        'videos': allVideos,
    }
    return json.dumps({"status": 'OK', "story": storyObj})
def fetchStory(url):
    """Download ``url`` and return a JSON string describing the story on it.

    The page is fetched with urllib2, decoded with the encoding guessed by
    chardet, and then searched for a title, a text outline, images and
    videos.  Intermediate results are written to ``infoModule.info.page``
    under the keys ``url``, ``rawHTML``, ``meta_description``, ``HTMLTitle``,
    ``title``, ``plainText``, ``imageHTML`` and ``outline``; the helper
    modules called here are handed no page argument in several places, so
    they presumably read those keys -- verify against the helpers.

    Args:
        url: address of the page to fetch.

    Returns:
        On success, the JSON encoding of ``{"status": "OK", "story": {...}}``
        where the story has the keys ``title``, ``outline``, ``url`` (the
        final URL after redirects), ``images`` and ``videos``.  On every
        failure path, whatever ``failOn(message)`` returns.
    """
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)

    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header(
        'User-agent',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; '
        '.NET CLR 2.0.50727; .NET CLR 3.0.04506.30)')
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)

    # Release the connection on every path, including the early returns.
    try:
        responseCode = websock.getcode()
        headerInfo = websock.info()
        pprint.pprint(headerInfo)
        log.plog('urllib2 response code: ' + str(responseCode), 2)
        if responseCode not in (200, 301, 302, 303):
            log.plog('got failure response code from server', 4)
            return failOn('got failure response code from server')

        contentType = headerInfo.gettype()
        if contentType not in ('text/html', 'text/html, text/html'):
            log.plog('content type: ' + contentType + '. not fetching', 4)
            return failOn('content type: ' + contentType + '. not fetching')

        readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
        try:
            rawHTML = readWithTimeout()
        except timeout.TimeoutFunctionException:
            log.plog("timeout while trying to fetch " + url, 101)
            return failOn('read timeout ' + url)
        redirURL = websock.geturl()
    finally:
        websock.close()

    infoModule.info.page['rawHTML'] = rawHTML
    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        #redirected urls need to be blocked too
        url = redirURL

    if len(rawHTML) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    # U+0091..U+0096 are control characters that should not occur in real
    # text.  Finding them after decoding means bytes 0x91..0x96 were passed
    # through unchanged, and in windows-1252 those bytes are the curly quotes
    # and the en dash.
    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    encoding = chardet.detect(rawHTML)['encoding']
    # chardet reports None when it cannot guess; leave the bytes untouched
    # in that case, exactly as for plain ascii.
    if encoding is not None and encoding != 'ascii':
        log.plog('Server encoding: ' + encoding, 2)
        # 'replace' keeps one stray byte from aborting the whole fetch;
        # input that decodes cleanly gives the same result as strict mode.
        decodedHTML = rawHTML.decode(encoding, 'replace')
        if any(trouble in decodedHTML for trouble in windows_trouble_list):
            log.plog('this is actually windows-1252', 3)
            decodedHTML = rawHTML.decode('windows-1252', 'replace')
        infoModule.info.page['rawHTML'] = decodedHTML

    # some configuration options
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(
        r'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search is not None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog(
            "meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog(
        '======================================= TITLE ================================',
        2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search(r'<title>(.*?)<\/title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle is not None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""

    # NOTE(review): as written, the title returned by findTitle() is always
    # replaced by one of the two branches below; presumably findTitle() is
    # called for its effect on page['potential_title'] -- confirm whether the
    # second test was meant to be an ``elif``.
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if ('potential_title' in infoModule.info.page
            and len(infoModule.info.page['potential_title']) > 0):
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(
        infoModule.info.page['title'])
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog(
        '======================================= OUTLINE ================================',
        2)
    ## fetch outline
    #remove special case elements from the html. These are lines or blocks
    #of code that cause problems if left in
    plainText = strip_html.removeSpecialCases(infoModule.info.page['rawHTML'])
    plainText = strip_html.clearHTML(plainText)
    #clearHTML can't take out title, because title gets passed to clearHTML,
    #but it should be removed here
    plainText = re.sub('<title.*?</title.*?>', '', plainText, 0,
                       re.I | re.S | re.M)
    infoModule.info.page['plainText'] = plainText

    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(plainText, doAsciiConvert=False)
    # Keep the full document for the image/video search; rawHTML may be
    # narrowed to the story body just below.
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline is not None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(
            infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML is not None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(
            infoModule.info.page['imageHTML'], outline)
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if 0 < len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog(
        '======================================= IMAGES ================================',
        2)
    #find images
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'],
                                            url)
    if imageArray is None:
        log.plog('could not find image', 3)
        # '' (not []) is what has always been emitted under "images" here.
        imageArray = ''

    log.plog(
        '======================================= VIDEOS ================================',
        2)
    ###look for videos
    allVideosJSON = find_all_videos.find_all_videos(
        infoModule.info.page['imageHTML'])
    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print(allVideosJSON)

    #if no outline and no images and no videos, then no story
    if (infoModule.info.page['outline'] == '' and not imageArray
            and not allVideos):
        return failOn('nothing found')

    #with no outline and no video, the largest image must be at least
    #minImageSize on each side to make this an image page
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        area = image['width'] * image['height']
        if area > largestImageDimensions:
            largestImage = image
            largestImageDimensions = area
    print(largestImage)

    minImageSize = 400
    if infoModule.info.page['outline'] == '' and not allVideos and (
            not largestImage or largestImage['width'] < minImageSize
            or largestImage['height'] < minImageSize):
        return failOn(
            'no story or video found, and largest image less than min size')

    storyObj = {
        'title': infoModule.info.page['title'],
        'outline': unicodeMapper.clearCurlies(infoModule.info.page['outline']),
        'url': url,
        'images': imageArray,
        'videos': allVideos,
    }
    return json.dumps({"status": 'OK', "story": storyObj})
return False retval = websock.read() return retval if __name__ == '__main__': if len(sys.argv) > 1: url = sys.argv[1] infoModule.info.site['body_extractor_no_date'] = True infoModule.info.page['rawHTML'] = fetchPage(url) htmlTitle() infoModule.info.page['title'] = real_title2.realTitle() print infoModule.info.page['title'] #sys.exit() infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML']) infoModule.info.site['body_extractor_no_date'] = True infoModule.info.page['meta_description'] = '' meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S) if meta_search != None: infoModule.info.page['meta_description'] = meta_search.group(1) print "meta_description: " + infoModule.info.page['meta_description'] outline = extract(infoModule.info.page['plainText'], doAsciiConvert=False) outline = unicodeMapper.clearCurlies(outline) print outline else: unittest.main()
#sys.exit() meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S) if meta_search != None: infoModule.info.page['meta_description'] = meta_search.group(1).decode('utf-8') print "meta_description: " + infoModule.info.page['meta_description'] infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML']) be_results = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False) if be_results != None: print be_results return be_results if __name__ == '__main__': if len(sys.argv) > 1: url = sys.argv[1] be_results = bodyExtract(url) html = infoModule.info.page['rawHTML'] if be_results == None: print "no body extractor results" sys.exit() html = removePreceedingBlocks(html, be_results) print "TITLE: " + infoModule.info.page['title'] html = unicodeMapper.clearCurlies(html) print "BODY: " + html #for i in range(len(html)): # print html[i] + ":" + str(ord(html[i]))
infoModule.info.page['rawHTML'], re.I | re.S) if meta_search != None: infoModule.info.page['meta_description'] = meta_search.group(1).decode( 'utf-8') print "meta_description: " + infoModule.info.page['meta_description'] infoModule.info.page['plainText'] = strip_html.clearHTML( infoModule.info.page['rawHTML']) be_results = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False) if be_results != None: print be_results return be_results if __name__ == '__main__': if len(sys.argv) > 1: url = sys.argv[1] be_results = bodyExtract(url) html = infoModule.info.page['rawHTML'] if be_results == None: print "no body extractor results" sys.exit() html = removePreceedingBlocks(html, be_results) print "TITLE: " + infoModule.info.page['title'] html = unicodeMapper.clearCurlies(html) print "BODY: " + html #for i in range(len(html)): # print html[i] + ":" + str(ord(html[i]))