Exemple #1
0
def fetchStory(url):
    """Fetch ``url`` and return a JSON string describing the story on it.

    The page is downloaded, decoded to unicode when a non-ascii encoding is
    detected, and then run through the project's title, body, image and
    video extractors.  Intermediate results are stored in the shared
    ``infoModule.info.page`` mapping under the keys ``url``, ``rawHTML``,
    ``meta_description``, ``HTMLTitle``, ``title``, ``plainText``,
    ``imageHTML`` and ``outline``.

    Args:
        url: Address of the page to fetch.

    Returns:
        On success, ``json.dumps`` of ``{"status": "OK", "story": {...}}``
        where the story has the keys ``title``, ``outline``, ``url`` (the
        final URL after any redirect), ``images`` and ``videos``.
        On failure, whatever ``failOn(message)`` returns.  Failures are: the
        URL cannot be opened, an unexpected response code or content type,
        a read timeout, a body larger than 500000 characters, or no usable
        outline / image / video.
    """
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)
    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header(
        'User-agent',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)'
    )
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)

    # Release the connection on every exit path: early failure, timeout or
    # a successful read.
    try:
        responseCode = websock.getcode()
        headerInfo = websock.info()
        pprint.pprint(headerInfo)
        log.plog('urllib2 response code: ' + str(responseCode), 2)
        if responseCode not in (200, 301, 302, 303):
            log.plog('got failure response code from server', 4)
            return failOn('got failure response code from server')
        contentType = headerInfo.gettype()
        if contentType not in ('text/html', 'text/html, text/html'):
            log.plog('content type: ' + contentType + '. not fetching', 4)
            return failOn('content type: ' + contentType + '. not fetching')

        # 5 is the limit handed to TimeoutFunction (presumably seconds).
        readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
        try:
            infoModule.info.page['rawHTML'] = readWithTimeout()
        except timeout.TimeoutFunctionException:
            log.plog("timeout while trying to fetch " + url, 101)
            return failOn('read timeout ' + url)
        redirURL = websock.geturl()
    finally:
        websock.close()

    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        url = redirURL
        #redirected urls need to be blocked too

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    # Code points that show up after decoding when a windows-1252 page was
    # decoded with the wrong (detected) encoding.
    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    cd = chardet.detect(infoModule.info.page['rawHTML'])
    encoding = cd['encoding']
    if encoding is None:
        # chardet could not make a guess (e.g. empty body); decoding with
        # None would raise, so the html is left exactly as received.
        log.plog('could not detect server encoding, leaving html undecoded',
                 3)
    elif encoding != 'ascii':
        log.plog('Server encoding: ' + encoding, 2)
        oldHTML = infoModule.info.page['rawHTML']
        infoModule.info.page['rawHTML'] = oldHTML.decode(encoding)
        if any(trouble in infoModule.info.page['rawHTML']
               for trouble in windows_trouble_list):
            log.plog('this is actually windows-1252', 3)
            infoModule.info.page['rawHTML'] = oldHTML.decode('windows-1252')

    # The pattern only matches when at least four whitespace runs follow
    # content="; group(1) is the text up to and including the fourth run.
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(
        r'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search is not None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog(
            "meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog(
        '======================================= TITLE ================================',
        2)
    # HTMLTitle (the raw <title> element) is set first; it is the fallback
    # when no better title is found.
    HTMLTitle = re.search(r'<title>(.*?)<\/title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle is not None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""

    # NOTE(review): the title stored from findTitle() is always replaced by
    # the if/else that follows (potential_title or realTitle).  Behaviour is
    # kept as-is; confirm whether the second `if` was meant to be `elif`.
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(
            infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(
        infoModule.info.page['title'])
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog(
        '======================================= OUTLINE ================================',
        2)
    #remove special case elements from the html.  These are lines or blocks of code that cause
    #problems if left in
    infoModule.info.page['plainText'] = strip_html.removeSpecialCases(
        infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = strip_html.clearHTML(
        infoModule.info.page['plainText'])
    #clearHTML can't take out title, because title gets passed to clearHTML, but it should be removed here
    infoModule.info.page['plainText'] = re.sub(
        '<title.*?</title.*?>', '', infoModule.info.page['plainText'], 0,
        re.I | re.S | re.M)

    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(infoModule.info.page['plainText'],
                                     doAsciiConvert=False)
    # Keep the full html for image/video discovery; rawHTML may be replaced
    # by the abbreviated body html just below.
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline is not None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(
            infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML is not None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        infoModule.info.page['outline'] = outline
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(
            infoModule.info.page['imageHTML'], infoModule.info.page['outline'])
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if 0 < len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog(
        '======================================= IMAGES ================================',
        2)
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'],
                                            url)
    if imageArray is None:
        log.plog('could not find image', 3)
        # '' (not []) is kept so the "images" value in the returned JSON
        # stays what existing consumers already receive.
        imageArray = ''

    log.plog(
        '======================================= VIDEOS ================================',
        2)
    allVideosJSON = find_all_videos.find_all_videos(
        infoModule.info.page['imageHTML'])

    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print(allVideosJSON)

    # No outline, no images and no videos: there is no story at all.  The
    # failure is returned (it used to be computed and then discarded).
    if (infoModule.info.page['outline'] == '' and not imageArray
            and not allVideos):
        return failOn('nothing found')

    # Without an outline or a video, the page only counts as an image page
    # when its largest image is at least minImageSize in both dimensions.
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        if image['width'] * image['height'] > largestImageDimensions:
            largestImage = image
            largestImageDimensions = image['width'] * image['height']

    print(largestImage)
    minImageSize = 400
    if infoModule.info.page['outline'] == '' and allVideos == [] and (
            largestImage == [] or largestImage['width'] < minImageSize
            or largestImage['height'] < minImageSize):
        return failOn(
            'no story or video found, and largest image less than min size')

    storyObj = {
        'title': infoModule.info.page['title'],
        'outline': unicodeMapper.clearCurlies(infoModule.info.page['outline']),
        'url': url,
        'images': imageArray,
        'videos': allVideos,
    }
    return json.dumps({"status": 'OK', "story": storyObj})
Exemple #2
0
def fetchStory(url):
    """Fetch ``url`` and return a JSON string describing the story on it.

    The page is downloaded, decoded to unicode when a non-ascii encoding is
    detected, and then run through the project's title, body, image and
    video extractors.  Intermediate results are stored in the shared
    ``infoModule.info.page`` mapping under the keys ``url``, ``rawHTML``,
    ``meta_description``, ``HTMLTitle``, ``title``, ``plainText``,
    ``imageHTML`` and ``outline``.

    Args:
        url: Address of the page to fetch.

    Returns:
        On success, ``json.dumps`` of ``{"status": "OK", "story": {...}}``
        where the story has the keys ``title``, ``outline``, ``url`` (the
        final URL after any redirect), ``images`` and ``videos``.
        On failure, whatever ``failOn(message)`` returns.  Failures are: the
        URL cannot be opened, an unexpected response code or content type,
        a read timeout, a body larger than 500000 characters, or no usable
        outline / image / video.
    """
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)
    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)')
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)

    # Release the connection on every exit path: early failure, timeout or
    # a successful read.
    try:
        responseCode = websock.getcode()
        headerInfo = websock.info()
        pprint.pprint(headerInfo)
        log.plog('urllib2 response code: ' + str(responseCode), 2)
        if responseCode not in (200, 301, 302, 303):
            log.plog('got failure response code from server', 4)
            return failOn('got failure response code from server')
        contentType = headerInfo.gettype()
        if contentType not in ('text/html', 'text/html, text/html'):
            log.plog('content type: ' + contentType + '. not fetching', 4)
            return failOn('content type: ' + contentType + '. not fetching')

        # 5 is the limit handed to TimeoutFunction (presumably seconds).
        readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
        try:
            infoModule.info.page['rawHTML'] = readWithTimeout()
        except timeout.TimeoutFunctionException:
            log.plog("timeout while trying to fetch " + url, 101)
            return failOn('read timeout ' + url)
        redirURL = websock.geturl()
    finally:
        websock.close()

    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        url = redirURL
        #redirected urls need to be blocked too

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    # Code points that show up after decoding when a windows-1252 page was
    # decoded with the wrong (detected) encoding.
    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    cd = chardet.detect(infoModule.info.page['rawHTML'])
    encoding = cd['encoding']
    if encoding is None:
        # chardet could not make a guess (e.g. empty body); decoding with
        # None would raise, so the html is left exactly as received.
        log.plog('could not detect server encoding, leaving html undecoded', 3)
    elif encoding != 'ascii':
        log.plog('Server encoding: ' + encoding, 2)
        oldHTML = infoModule.info.page['rawHTML']
        infoModule.info.page['rawHTML'] = oldHTML.decode(encoding)
        if any(trouble in infoModule.info.page['rawHTML'] for trouble in windows_trouble_list):
            log.plog('this is actually windows-1252', 3)
            infoModule.info.page['rawHTML'] = oldHTML.decode('windows-1252')

    # The pattern only matches when at least four whitespace runs follow
    # content="; group(1) is the text up to and including the fourth run.
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(r'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search is not None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog('======================================= TITLE ================================', 2)
    # HTMLTitle (the raw <title> element) is set first; it is the fallback
    # when no better title is found.
    HTMLTitle = re.search(r'<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle is not None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""

    # NOTE(review): the title stored from findTitle() is always replaced by
    # the if/else that follows (potential_title or realTitle).  Behaviour is
    # kept as-is; confirm whether the second `if` was meant to be `elif`.
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog('======================================= OUTLINE ================================', 2)
    #remove special case elements from the html.  These are lines or blocks of code that cause
    #problems if left in
    infoModule.info.page['plainText'] = strip_html.removeSpecialCases(infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['plainText'])
    #clearHTML can't take out title, because title gets passed to clearHTML, but it should be removed here
    infoModule.info.page['plainText'] = re.sub('<title.*?</title.*?>', '', infoModule.info.page['plainText'], 0, re.I | re.S | re.M)

    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False)
    # Keep the full html for image/video discovery; rawHTML may be replaced
    # by the abbreviated body html just below.
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline is not None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML is not None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        infoModule.info.page['outline'] = outline
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(infoModule.info.page['imageHTML'], infoModule.info.page['outline'])
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if 0 < len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog('======================================= IMAGES ================================', 2)
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'], url)
    if imageArray is None:
        log.plog('could not find image', 3)
        # '' (not []) is kept so the "images" value in the returned JSON
        # stays what existing consumers already receive.
        imageArray = ''

    log.plog('======================================= VIDEOS ================================', 2)
    allVideosJSON = find_all_videos.find_all_videos(infoModule.info.page['imageHTML'])

    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print(allVideosJSON)

    # No outline, no images and no videos: there is no story at all.  The
    # failure is returned (it used to be computed and then discarded).
    if infoModule.info.page['outline'] == '' and not imageArray and not allVideos:
        return failOn('nothing found')

    # Without an outline or a video, the page only counts as an image page
    # when its largest image is at least minImageSize in both dimensions.
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        if image['width'] * image['height'] > largestImageDimensions:
            largestImage = image
            largestImageDimensions = image['width'] * image['height']

    print(largestImage)
    minImageSize = 400
    if infoModule.info.page['outline'] == '' and allVideos == [] and (largestImage == [] or largestImage['width'] < minImageSize or largestImage['height'] < minImageSize):
        return failOn('no story or video found, and largest image less than min size')

    storyObj = {
        'title': infoModule.info.page['title'],
        'outline': unicodeMapper.clearCurlies(infoModule.info.page['outline']),
        'url': url,
        'images': imageArray,
        'videos': allVideos,
    }
    return json.dumps({"status": 'OK', "story": storyObj})
        return False
        
    retval = websock.read()
    return retval
    

# Command-line entry point: with a URL argument, fetch that page and print
# its title, meta description (when present) and extracted outline; with no
# argument, run the unit tests instead.
if __name__ == '__main__':
    if len(sys.argv) <= 1:
        unittest.main()
    else:
        url = sys.argv[1]

        infoModule.info.site['body_extractor_no_date'] = True
        infoModule.info.page['rawHTML'] = fetchPage(url)
        htmlTitle()
        infoModule.info.page['title'] = real_title2.realTitle()
        print(infoModule.info.page['title'])

        infoModule.info.page['plainText'] = strip_html.clearHTML(
            infoModule.info.page['rawHTML'])
        infoModule.info.site['body_extractor_no_date'] = True
        infoModule.info.page['meta_description'] = ''
        # The pattern only matches when at least four whitespace runs follow
        # content="; group(1) is the text through the fourth run.
        meta_search = re.search(
            r'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
            infoModule.info.page['rawHTML'],
            re.I | re.S)
        if meta_search is not None:
            infoModule.info.page['meta_description'] = meta_search.group(1)
            print("meta_description: " +
                  infoModule.info.page['meta_description'])

        # Extract the body, then run the result through clearCurlies.
        outline = unicodeMapper.clearCurlies(
            extract(infoModule.info.page['plainText'], doAsciiConvert=False))

        print(outline)
Exemple #4
0
    #sys.exit()
    meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode('utf-8')
        print "meta_description: " + infoModule.info.page['meta_description']
    infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    be_results = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False)
    if be_results != None:
        print be_results
    return be_results

    

# Command-line entry point: when a URL argument is given, run the body
# extractor on it and print the page title and the trimmed body html.
if __name__ == '__main__' and len(sys.argv) > 1:
    url = sys.argv[1]

    be_results = bodyExtract(url)
    html = infoModule.info.page['rawHTML']
    if be_results is None:
        # Nothing to trim against; report and stop.
        print("no body extractor results")
        sys.exit()

    html = removePreceedingBlocks(html, be_results)
    print("TITLE: " + infoModule.info.page['title'])
    html = unicodeMapper.clearCurlies(html)

    print("BODY: " + html)
    # Debug aid: dump each character with its ordinal.
    #for i in range(len(html)):
    #    print html[i] + ":" + str(ord(html[i]))
        
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode(
            'utf-8')
        print "meta_description: " + infoModule.info.page['meta_description']
    infoModule.info.page['plainText'] = strip_html.clearHTML(
        infoModule.info.page['rawHTML'])
    be_results = body_extractor.extract(infoModule.info.page['plainText'],
                                        doAsciiConvert=False)
    if be_results != None:
        print be_results
    return be_results


# Command-line entry point: when a URL argument is given, run the body
# extractor on it and print the page title and the trimmed body html.
if __name__ == '__main__' and len(sys.argv) > 1:
    url = sys.argv[1]

    be_results = bodyExtract(url)
    html = infoModule.info.page['rawHTML']
    if be_results is None:
        # Nothing to trim against; report and stop.
        print("no body extractor results")
        sys.exit()

    html = removePreceedingBlocks(html, be_results)
    print("TITLE: " + infoModule.info.page['title'])
    html = unicodeMapper.clearCurlies(html)

    print("BODY: " + html)
    # Debug aid: dump each character with its ordinal.
    #for i in range(len(html)):
    #    print html[i] + ":" + str(ord(html[i]))