Ejemplo n.º 1
0
def scanPage():
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()

    log.plog("fetching " + infoModule.info.page['url'], 2)
    socket = urllib.urlopen(infoModule.info.page['url'])
    infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL

    ## maybe check last modified header and don't get stories older than 7 days?

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    print "<b>URL</b> <a href=\"" + infoModule.info.page[
        'url'] + "\">" + infoModule.info.page['url'] + "</a><br />"
    #print "<b>URL</b> " + infoModule.info.page['url'] + "<br />"
    # get title
    #title = find_title.findTitle()
    #if title != False:
    #    infoModule.info.page['title'] = title
    #elif 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
    #    infoModule.info.page['title'] = infoModule.info.page['potential_title']
    #else:
    #    log.plog('no title found!', 3)
    #    sys.exit()

    ##print infoModule.info.page['title']

    #find images
    #if 'image_start_marker' in infoModule.info.source:
    #    image_start_marker = infoModule.info.source['image_start_marker']
    #else:
    #    image_start_marker = ''

    #if 'image_end_marker' in infoModule.info.source:
    #    image_end_marker = infoModule.info.source['image_end_marker']
    #else:
    #    image_end_marker = ''
    #imageArray = find_images.findImages(infoModule.info.page['rawHTML'], image_start_marker, image_end_marker)
    #x = imageArray[0]
    #y = imageArray[1]
    #imageURL = imageArray[2]

    #if imageURL == '':
    #    log.plog('could not find image', 3)
    #else:
    #    log.plog('image found: ' + imageURL, 2)
    #    infoModule.info.page['largestImage'] = imageURL
    #    infoModule.info.page['maxSize'] = x * y

    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(
            infoModule.info.page['rawHTML'],
            infoModule.info.source['image_source_start_marker'],
            infoModule.info.source['image_source_end_marker'])
        if imageSource != False and imageSource != None:
            infoModule.info.page['imageSource'] = imageSource
            #print "<b>Image Credit:</b> " + imageSource + "<br />"

    ###look for videos
    #videoHunter = find_video.youtube()

    #videoLink = videoHunter.getURL(infoModule.info.page['rawHTML'])
    #if videoLink == False:
    #    log.plog('no video found', 2)
    #    infoModule.info.page['vlink'] = ''
    #else:
    #    log.plog('found video embed', 2)
    #    infoModule.info.page['vlink'] = videoLink

    ## parse links in page
    #links.linkScoring(infoModule.info.page['rawHTML'], 'subs')
    #links.linkScoring(infoModule.info.page['rawHTML'], 'newsroom')
    #links.outboundLinks(infoModule.info.page['rawHTML'])

    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = author.replace('<', '&lt;')
            infoModule.info.page['author'] = author
            print "<b>Author:</b> " + author + "<br />"
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''

    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source[
            'featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(
            infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(
            infoModule.info.page['rawHTML'])

    outline = find_story.findStoryViaRegex()
    if outline != False:
        if 'featured_source' in infoModule.info.source and infoModule.info.source[
                'featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(
                outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        print "<b>Outline:</b> " + infoModule.info.page['outline'] + "<br />"

    print "<hr>"
Ejemplo n.º 2
0
	
	
# set database
infoModule.info.site['database'] = 'db_celebrifi'
infoModule.info.site['dblink'] = link

randSourceQ = mysql_tools.mysqlQuery("select * from db_sportifi.sources where source_id=6479", link)
randSource = randSourceQ.fetch_row(1,1)
for key in randSource[0].keys():
    infoModule.info.source[key] = randSource[0][key]
    
#find recent story from that source
randStoryQ = mysql_tools.mysqlQuery("select url from db_celebrifi.subs where source_id=" + randSource[0]['source_id'] + " order by sub_id desc limit 1", link)
randStory = randStoryQ.fetch_row(1,1)
url = randStory[0]['url']
url = 'http://nhlhotstove.com/the-price-was-right-and-price-it-is/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+NHLHotStove+%28The+Hot+Stove%29'
print url

urlSocket = urllib.urlopen(url)
html = urlSocket.read()	

infoModule.info.page['url'] = url
infoModule.info.page['rawHTML'] = html
    
outline = find_story.findStoryViaRegex()
if outline == False:
    print "findStoryViaRegex failed"
else:
    print outline

Ejemplo n.º 3
0
def scanPage():
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()
        
    log.plog("fetching " + infoModule.info.page['url'], 2)
    socket = urllib.urlopen(infoModule.info.page['url'])
    infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL

    ## maybe check last modified header and don't get stories older than 7 days?

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    print "<b>URL</b> <a href=\"" + infoModule.info.page['url'] + "\">" + infoModule.info.page['url'] + "</a><br />"
    # get title
    #title = find_title.findTitle()
    #if title != False:
    #    infoModule.info.page['title'] = title
    #elif 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
    #    infoModule.info.page['title'] = infoModule.info.page['potential_title']
    #else:
    #    log.plog('no title found!', 3)
    #    sys.exit()

    ##print infoModule.info.page['title']        
        
    #find images        
    #if 'image_start_marker' in infoModule.info.source:
    #    image_start_marker = infoModule.info.source['image_start_marker']
    #else:
    #    image_start_marker = ''

    #if 'image_end_marker' in infoModule.info.source:
    #    image_end_marker = infoModule.info.source['image_end_marker']
    #else:
    #    image_end_marker = ''
    #imageArray = find_images.findImages(infoModule.info.page['rawHTML'], image_start_marker, image_end_marker)
    #x = imageArray[0]
    #y = imageArray[1]
    #imageURL = imageArray[2]
    
    #if imageURL == '':
    #    log.plog('could not find image', 3)
    #else:
    #    log.plog('image found: ' + imageURL, 2)
    #    infoModule.info.page['largestImage'] = imageURL
    #    infoModule.info.page['maxSize'] = x * y
    
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker'])
        if imageSource != False and imageSource != None:
            infoModule.info.page['imageSource'] = imageSource
            print "<b>Image Credit:</b> " + imageSource + "<br />"
            

    ###look for videos
    #videoHunter = find_video.youtube()
    
    #videoLink = videoHunter.getURL(infoModule.info.page['rawHTML'])
    #if videoLink == False:
    #    log.plog('no video found', 2)
    #    infoModule.info.page['vlink'] = ''
    #else:
    #    log.plog('found video embed', 2)
    #    infoModule.info.page['vlink'] = videoLink
        
    ## parse links in page
    #links.linkScoring(infoModule.info.page['rawHTML'], 'subs')
    #links.linkScoring(infoModule.info.page['rawHTML'], 'newsroom')
    #links.outboundLinks(infoModule.info.page['rawHTML'])
    
    
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = author.replace('<', '&lt;')
            infoModule.info.page['author'] = author
            print "<b>Author:</b> " + author + "<br />"
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''
        
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    
    outline = find_story.findStoryViaRegex()
    if outline != False:
        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        print "<b>Outline:</b> " + infoModule.info.page['outline'] + "<br />"
            
    print "<hr>"    
Ejemplo n.º 4
0
# set database
infoModule.info.site['database'] = 'db_celebrifi'
infoModule.info.site['dblink'] = link

randSourceQ = mysql_tools.mysqlQuery(
    "select * from db_sportifi.sources where source_id=6479", link)
randSource = randSourceQ.fetch_row(1, 1)
for key in randSource[0].keys():
    infoModule.info.source[key] = randSource[0][key]

#find recent story from that source
randStoryQ = mysql_tools.mysqlQuery(
    "select url from db_celebrifi.subs where source_id=" +
    randSource[0]['source_id'] + " order by sub_id desc limit 1", link)
randStory = randStoryQ.fetch_row(1, 1)
url = randStory[0]['url']
url = 'http://nhlhotstove.com/the-price-was-right-and-price-it-is/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+NHLHotStove+%28The+Hot+Stove%29'
print url

urlSocket = urllib.urlopen(url)
html = urlSocket.read()

infoModule.info.page['url'] = url
infoModule.info.page['rawHTML'] = html

outline = find_story.findStoryViaRegex()
if outline == False:
    print "findStoryViaRegex failed"
else:
    print outline