Code Example #1
import json

# Project-local helpers assumed available in the original codebase:
# find_all_images, getSiteContents, throwGenericError, returnStructure.


def genericImageExtract(url=None, account_id=None):
    ## function for generic image sites.  Just find a big image, and make sure it's bigger than
    ## min x and min y
    min_x = 400
    min_y = 400
    biggest = {'url': '', 'width': 0, 'height': 0}
    next_biggest = {'url': '', 'width': 0, 'height': 0}
    if url is None:
        return throwGenericError()
    contents = getSiteContents(url)
    if not contents:
        return throwGenericError()

    image_list = find_all_images.findImages(contents, url)
    #print "images for " + url
    #pprint.pprint(image_list)
    if image_list is not None:
        ## if there is one big image and the rest are x% less big than it, go for it
        ## the requirement for % calculation is that BOTH dimensions be greater than 50% of maximum
        for image in image_list:
            if (image['width'] > min_x and image['height'] > min_y
                    and image['width'] > biggest['width']
                    and image['height'] > biggest['height']):
                if (biggest['width'] > next_biggest['width']
                        and biggest['height'] > next_biggest['height']):
                    next_biggest = biggest
                biggest = image
            elif (image['width'] > next_biggest['width']
                    and image['height'] > next_biggest['height']):
                next_biggest = image

        if biggest['url'] != '':
            x_ratio = float(next_biggest['width']) / float(biggest['width'])
            y_ratio = float(next_biggest['height']) / float(biggest['height'])
            if y_ratio < 0.51 or x_ratio < 0.51:
                #we have a winner
                #print "biggest image: " + biggest[0]
                #print "next biggest: " + next_biggest[0]
                #print "x ratio = " + str(float(next_biggest[1]) / float(biggest[1]))
                #print "y ratio = " + str(float(next_biggest[2]) / float(biggest[2]))
                return json.dumps(
                    returnStructure(url, biggest['url'], '', biggest['width'],
                                    biggest['height']))

    return throwGenericError()
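
Usage note: a minimal call sketch, assuming the project-local helpers above are importable. The URL is a placeholder, and the result handling is an assumption, since the exact shapes of returnStructure() and throwGenericError() are not shown in the source.

import json

result_json = genericImageExtract('http://example.com/photo-page')  # placeholder URL
# assumes both the success and error paths return a JSON string
result = json.loads(result_json)
print result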
Code Example #2
File: storyFetcher.py Project: ctwiz/sourcereader
import json
import pprint
import re
import urllib2

import chardet

# Project-local modules assumed available in the original codebase: log,
# infoModule, timeout, strip_html, find_title, real_title2, body_extractor,
# html_body_extractor, largestBlock, find_all_images, find_all_videos,
# unicodeMapper, and the failOn() helper.


def fetchStory(url):
    siteDB = 'peepbuzz'
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)
    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header(
        'User-agent',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)'
    )
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)
    responseCode = websock.getcode()
    headerInfo = websock.info()
    pprint.pprint(headerInfo)
    log.plog('urllib2 response code: ' + str(responseCode), 2)
    if responseCode not in (200, 301, 302, 303):
        log.plog('got failure response code from server', 4)
        return failOn('got failure response code from server')
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return failOn('content type: ' + contentType + '. not fetching')

    readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
    #infoModule.info.page['rawHTML'] = websock.read()
    try:
        infoModule.info.page['rawHTML'] = readWithTimeout()
    except timeout.TimeoutFunctionException:
        log.plog("timeout while trying to fetch " + url, 101)
        return failOn('read timeout ' + url)
    redirURL = websock.geturl()
    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        url = redirURL
        #redirected urls need to be blocked too

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    cd = chardet.detect(infoModule.info.page['rawHTML'])
    # chardet may return None for the encoding; guard before decoding
    if cd['encoding'] and cd['encoding'] != 'ascii':
        log.plog('Server encoding: ' + cd['encoding'], 2)
        oldHTML = infoModule.info.page['rawHTML']
        infoModule.info.page['rawHTML'] = infoModule.info.page[
            'rawHTML'].decode(cd['encoding'])
        windows_chars_in_html = [
            trouble for trouble in windows_trouble_list
            if infoModule.info.page['rawHTML'].find(trouble) >= 0
        ]
        if len(windows_chars_in_html) > 0:
            #windows = infoModule.info.page['rawHTML'].find(u'\x93')
            log.plog('this is actually windows-1252', 3)
            infoModule.info.page['rawHTML'] = oldHTML.decode('windows-1252')

    # some configuration options
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(
        'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search is not None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog(
            "meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog(
        '======================================= TITLE ================================',
        2)
    # get title
    #set HTMLTitle first

    HTMLTitle = re.search('<title>(.*?)<\/title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle is not None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title is not False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(
            infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] is False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(
        infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    #cd = chardet.detect(infoModule.info.page['title'])
    #if cd['encoding'] != 'ascii':
    #    log.plog('title encoding: ' + cd['encoding'], 2)
    #    oldTitle = infoModule.info.page['title']
    #    infoModule.info.page['title'] = infoModule.info.page['title'].decode(cd['encoding'])
    #    windows_chars_in_html = [trouble for trouble in windows_trouble_list if infoModule.info.page['title'].find(trouble) >= 0]
    #    if len(windows_chars_in_html) > 0:
    #        #windows = infoModule.info.page['rawHTML'].find(u'\x93')
    #        log.plog('title is actually windows-1252', 3)
    #        infoModule.info.page['title'] = oldTitle.decode('windows-1252')

    log.plog(
        '======================================= OUTLINE ================================',
        2)
    ## fetch outline
    #remove special case elements from the html.  These are lines or blocks of code that cause
    #problems if left in
    infoModule.info.page['plainText'] = strip_html.removeSpecialCases(
        infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = strip_html.clearHTML(
        infoModule.info.page['plainText'])
    #clearHTML can't take out title, because title gets passed to clearHTML, but it should be removed here
    infoModule.info.page['plainText'] = re.sub(
        '<title.*?</title.*?>', '', infoModule.info.page['plainText'], 0,
        re.I | re.S | re.M)
    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor

    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(infoModule.info.page['plainText'],
                                     doAsciiConvert=False)
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline is not None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(
            infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML != None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        infoModule.info.page['outline'] = outline
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(
            infoModule.info.page['imageHTML'], infoModule.info.page['outline'])
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''

        #return failOn('could not create an outline for this story!')

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if len(infoModule.info.page['outline']) > 0 and len(
            infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog(
        '======================================= IMAGES ================================',
        2)
    #find images
    image_start_marker = ''
    image_end_marker = ''
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'],
                                            url)
    if imageArray is None:
        log.plog('could not find image', 3)
        imageArray = []

    log.plog(
        '======================================= VIDEOS ================================',
        2)
    ###look for videos
    allVideosJSON = find_all_videos.find_all_videos(
        infoModule.info.page['imageHTML'])

    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print allVideosJSON

    #if no outline and no images over x by y and no videos, then no story
    #allVideos is the json.loads() result (a list), so compare against []
    if infoModule.info.page['outline'] == '' and imageArray == [] and allVideos == []:
        return failOn('nothing found')

    #largest image if no outline must be at least 450 x 450 to make it an image page
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        if image['width'] * image['height'] > largestImageDimensions:
            largestImage = image
            largestImageDimensions = image['width'] * image['height']

    print largestImage
    minImageSize = 400
    if infoModule.info.page['outline'] == '' and allVideos == [] and (
            largestImage == [] or largestImage['width'] < minImageSize
            or largestImage['height'] < minImageSize):
        return failOn(
            'no story or video found, and largest image less than min size')

    status = 'OK'
    storyObj = {}
    storyObj['title'] = infoModule.info.page['title']

    storyObj['outline'] = unicodeMapper.clearCurlies(
        infoModule.info.page['outline'])
    storyObj['url'] = url
    storyObj['images'] = imageArray
    storyObj['videos'] = allVideos
    returnVal = {"status": status, "story": storyObj}
    output = json.dumps(returnVal)
    return output
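
Usage note: a short sketch of consuming fetchStory's return value, assuming the surrounding project modules are configured. The URL is a placeholder, and the assumption that failOn() also returns a JSON string with a non-'OK' status is mine, not the source's.

import json

story_json = fetchStory('http://example.com/some-article')  # placeholder URL
result = json.loads(story_json)
if result['status'] == 'OK':
    storyObj = result['story']
    # keys set by fetchStory above: title, outline, url, images, videos
    print storyObj['title']
    print storyObj['url']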