def scanPage():
    """Fetch the current page and fill in ``infoModule.info.page`` from it.

    Reads ``infoModule.info.page['url']``, downloads that URL and stores the
    response body in ``page['rawHTML']``; when the request was redirected,
    ``page['url']`` is replaced by the final URL.  Afterwards it sets
    ``page['imageSource']``, ``page['author']`` and ``page['plainText']``
    and, when ``find_story.findStoryViaRegex()`` returns something other
    than ``False``, ``page['outline']``.  HTML fragments describing the
    result are printed to stdout along the way.

    Takes no arguments and returns nothing; all input and output goes
    through ``infoModule.info``.

    Raises:
        SystemExit: via ``sys.exit()`` when no URL has been set or when the
            downloaded body is longer than 500000.
    """
    page = infoModule.info.page
    source = infoModule.info.source

    if 'url' not in page:
        log.plog('scan page called without url', 4)
        sys.exit()

    log.plog("fetching " + page['url'], 2)
    # Called ``response`` (not ``socket``) so the stdlib module name is not
    # shadowed inside this function.
    response = urllib.urlopen(page['url'])
    try:
        page['rawHTML'] = response.read()
        redirURL = response.geturl()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    if redirURL != page['url']:
        log.plog('redirected to ' + redirURL, 2)
        page['url'] = redirURL

    # TODO: maybe check the Last-Modified header and skip stories older
    # than 7 days.
    if len(page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    # Single-argument print(...) behaves exactly like the print statement on
    # Python 2, which is what the rest of this file targets.
    print("<b>URL</b> <a href=\"" + page['url'] + "\">" + page['url'] + "</a><br />")

    # NOTE: title detection (find_title.findTitle) and image discovery
    # (find_images.findImages) are currently disabled in this function.

    # Image credit, if the source defines both markers.
    page['imageSource'] = ''
    if ('image_source_start_marker' in source
            and 'image_source_end_marker' in source):
        imageSource = find_credit.findCredit(
            page['rawHTML'],
            source['image_source_start_marker'],
            source['image_source_end_marker'])
        if imageSource is not None and imageSource != False:
            page['imageSource'] = imageSource

    # NOTE: video detection (find_video.youtube) and link parsing
    # (links.linkScoring / links.outboundLinks) are currently disabled.

    # Author, if the source defines both markers.
    author = None
    if 'author_start_marker' in source and 'author_end_marker' in source:
        author = find_author.findAuthor()
    if author is not None and author != False:
        # Escape '<' so the author text cannot open a tag in the HTML printed
        # below; replacing '<' with itself did nothing.
        author = author.replace('<', '&lt;')
        page['author'] = author
        print("<b>Author:</b> " + author + "<br />")
    else:
        page['author'] = ''

    # Featured sources use a different HTML cleaner; the same cleaner is
    # applied to the whole page and to the extracted outline.
    isFeatured = ('featured_source' in source
                  and source['featured_source'] == '1')
    if isFeatured:
        cleaner = strip_html.clearHTMLFeatures
    else:
        cleaner = strip_html.clearHTML
    page['plainText'] = cleaner(page['rawHTML'])

    outline = find_story.findStoryViaRegex()
    if outline != False:
        page['outline'] = cleaner(outline)
        print("<b>Outline:</b> " + page['outline'] + "<br />")
    print("<hr>")
# Driver section: point the shared state at db_celebrifi, load the settings of
# source 6479, download one page and print what find_story extracts from it.
# ``link`` is not defined in this section -- presumably a database connection
# opened earlier in the file; TODO confirm.
infoModule.info.site['database'] = 'db_celebrifi'
infoModule.info.site['dblink'] = link

# Copy every column of the source row into the shared source settings.
sourceQuery = mysql_tools.mysqlQuery(
    "select * from db_sportifi.sources where source_id=6479", link)
sourceRow = sourceQuery.fetch_row(1, 1)[0]
for column, value in sourceRow.items():
    infoModule.info.source[column] = value

# Look up the most recent story URL recorded for that source.
storySql = ("select url from db_celebrifi.subs where source_id="
            + sourceRow['source_id']
            + " order by sub_id desc limit 1")
storyRows = mysql_tools.mysqlQuery(storySql, link).fetch_row(1, 1)
url = storyRows[0]['url']
# NOTE: the URL read from the database is immediately replaced by this fixed
# URL, so the lookup above only matters if this override is removed.
url = 'http://nhlhotstove.com/the-price-was-right-and-price-it-is/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+NHLHotStove+%28The+Hot+Stove%29'
print(url)

# Download the page and hand it to the story finder through the shared state.
pageHandle = urllib.urlopen(url)
html = pageHandle.read()
infoModule.info.page['url'] = url
infoModule.info.page['rawHTML'] = html

outline = find_story.findStoryViaRegex()
if outline != False:
    print(outline)
else:
    print("findStoryViaRegex failed")
def scanPage():
    """Fetch the current page and fill in ``infoModule.info.page`` from it.

    Reads ``infoModule.info.page['url']``, downloads that URL and stores the
    response body in ``page['rawHTML']``; when the request was redirected,
    ``page['url']`` is replaced by the final URL.  Afterwards it sets
    ``page['imageSource']``, ``page['author']`` and ``page['plainText']``
    and, when ``find_story.findStoryViaRegex()`` returns something other
    than ``False``, ``page['outline']``.  HTML fragments describing the
    result (URL, image credit, author, outline) are printed to stdout along
    the way.

    Takes no arguments and returns nothing; all input and output goes
    through ``infoModule.info``.

    Raises:
        SystemExit: via ``sys.exit()`` when no URL has been set or when the
            downloaded body is longer than 500000.
    """
    page = infoModule.info.page
    source = infoModule.info.source

    if 'url' not in page:
        log.plog('scan page called without url', 4)
        sys.exit()

    log.plog("fetching " + page['url'], 2)
    # Called ``response`` (not ``socket``) so the stdlib module name is not
    # shadowed inside this function.
    response = urllib.urlopen(page['url'])
    try:
        page['rawHTML'] = response.read()
        redirURL = response.geturl()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    if redirURL != page['url']:
        log.plog('redirected to ' + redirURL, 2)
        page['url'] = redirURL

    # TODO: maybe check the Last-Modified header and skip stories older
    # than 7 days.
    if len(page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    # Single-argument print(...) behaves exactly like the print statement on
    # Python 2, which is what the rest of this file targets.
    print("<b>URL</b> <a href=\"" + page['url'] + "\">" + page['url'] + "</a><br />")

    # NOTE: title detection (find_title.findTitle) and image discovery
    # (find_images.findImages) are currently disabled in this function.

    # Image credit, if the source defines both markers.
    page['imageSource'] = ''
    if ('image_source_start_marker' in source
            and 'image_source_end_marker' in source):
        imageSource = find_credit.findCredit(
            page['rawHTML'],
            source['image_source_start_marker'],
            source['image_source_end_marker'])
        if imageSource is not None and imageSource != False:
            page['imageSource'] = imageSource
            print("<b>Image Credit:</b> " + imageSource + "<br />")

    # NOTE: video detection (find_video.youtube) and link parsing
    # (links.linkScoring / links.outboundLinks) are currently disabled.

    # Author, if the source defines both markers.
    author = None
    if 'author_start_marker' in source and 'author_end_marker' in source:
        author = find_author.findAuthor()
    if author is not None and author != False:
        # Escape '<' so the author text cannot open a tag in the HTML printed
        # below; replacing '<' with itself did nothing.
        author = author.replace('<', '&lt;')
        page['author'] = author
        print("<b>Author:</b> " + author + "<br />")
    else:
        page['author'] = ''

    # Featured sources use a different HTML cleaner; the same cleaner is
    # applied to the whole page and to the extracted outline.
    isFeatured = ('featured_source' in source
                  and source['featured_source'] == '1')
    if isFeatured:
        cleaner = strip_html.clearHTMLFeatures
    else:
        cleaner = strip_html.clearHTML
    page['plainText'] = cleaner(page['rawHTML'])

    outline = find_story.findStoryViaRegex()
    if outline != False:
        page['outline'] = cleaner(outline)
        print("<b>Outline:</b> " + page['outline'] + "<br />")
    print("<hr>")
# Driver section: point the shared state at db_celebrifi, load the settings of
# source 6479, download one page and print what find_story extracts from it.
# ``link`` is not defined in this section -- presumably a database connection
# opened earlier in the file; TODO confirm.
infoModule.info.site['database'] = 'db_celebrifi'
infoModule.info.site['dblink'] = link

# Copy every column of the source row into the shared source settings.
sourceQuery = mysql_tools.mysqlQuery(
    "select * from db_sportifi.sources where source_id=6479", link)
sourceRow = sourceQuery.fetch_row(1, 1)[0]
for column, value in sourceRow.items():
    infoModule.info.source[column] = value

# Look up the most recent story URL recorded for that source.
storySql = ("select url from db_celebrifi.subs where source_id="
            + sourceRow['source_id']
            + " order by sub_id desc limit 1")
storyRows = mysql_tools.mysqlQuery(storySql, link).fetch_row(1, 1)
url = storyRows[0]['url']
# NOTE: the URL read from the database is immediately replaced by this fixed
# URL, so the lookup above only matters if this override is removed.
url = 'http://nhlhotstove.com/the-price-was-right-and-price-it-is/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+NHLHotStove+%28The+Hot+Stove%29'
print(url)

# Download the page and hand it to the story finder through the shared state.
pageHandle = urllib.urlopen(url)
html = pageHandle.read()
infoModule.info.page['url'] = url
infoModule.info.page['rawHTML'] = html

outline = find_story.findStoryViaRegex()
if outline != False:
    print(outline)
else:
    print("findStoryViaRegex failed")