Example #1
def main(url):
    if not url:
        print "No url provided"
        sys.exit()

    #url = 'http://newstatesman.com/politics/2013/10/russell-brand-on-revolution'
    #h = getHTML(url)
    # Fetch the page, parse it into a tree, and extract the article paragraphs.
    html = UnicodeRetriever().fetch_data(url)
    tree = make_tree(html)
    lines = article_text(tree)

    file_type = magic.from_buffer(html, mime=True)
    print "File Type: %s" % file_type
    #print html

    url_obj = urlparse(url)
    if not url_obj.path:
        print "URL is top-level"

    if len(lines) < 1:
        print "No article text found"

    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #print get_attribute(html, 'img', url)

    img = get_attribute(soup, 'img', url)
    title = get_attribute(soup, 'title', url)
    desc = get_attribute(soup, 'description', lines)

    print "Title: %s" % title
    print "Desc: %s" % desc
    print "IMG: %s" % img
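A minimal command-line entry point for the example above might look like the following; the original snippet does not include one, so the argument handling here is an assumption.

import sys

if __name__ == '__main__':
    # Pass the first command-line argument as the URL; main() itself
    # reports the error and exits when no URL is provided.
    main(sys.argv[1] if len(sys.argv) > 1 else None)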
Example #2
def auto_excerpt(self):
    """
    Attempts to detect the text of this page (ignoring all navigation and
    other clutter), returning a list of strings. Each string represents a
    paragraph.
    """
    from ebdata.textmining.treeutils import make_tree
    tree = make_tree(self.html)
    if self.seed.rss_full_entry:
        from ebdata.templatemaker.textlist import html_to_paragraph_list
        paras = html_to_paragraph_list(tree)
    else:
        if self.seed.strip_noise:
            from ebdata.templatemaker.clean import strip_template
            try:
                html2 = self.companion_page().html
            except IndexError:
                pass
            else:
                tree2 = make_tree(html2)
                strip_template(tree, tree2)
        if self.seed.guess_article_text:
            from ebdata.templatemaker.articletext import article_text
            paras = article_text(tree)
        else:
            from ebdata.templatemaker.textlist import html_to_paragraph_list
            paras = html_to_paragraph_list(tree)
    return paras
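The core pipeline shared by these examples is make_tree followed by article_text. A minimal standalone sketch, assuming ebdata is installed; the input file name is illustrative:

from ebdata.textmining.treeutils import make_tree
from ebdata.templatemaker.articletext import article_text

# Read some previously saved HTML (the file name is an assumption).
html = open('page.html').read()

# Parse the raw HTML into a tree, then heuristically extract the article
# paragraphs, ignoring navigation and other clutter.
tree = make_tree(html)
paragraphs = article_text(tree)

for para in paragraphs:
    print(para)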
Example #3
def setAttributesForUrl(pgConn, url_id, html_str=None, url_core=None):
    """ Downloads the HTML if not found in DB
        pulls out the title, description, thumbnail, etc
        saves this meta data to postgres
    """
    printMsg = True

    html = None
    url = None
    soup = None
    lines = []

    if not pgConn:
        if printMsg: print '[setAttributesForUrl]: Error - No postgres connection'
        return False

    if url_core:
        url = url_core

    if not url:
        row = get_url(pgConn, url_id)
        if row:
            url = row['url']

    if url:
        url_obj = urlparse(url)
        if len(url_obj.path) < 5:
            url = None
    if url:
        if html_str:
            html = html_str

        if not html:
            html_row = get_html(pgConn, url_id)
            if html_row:
                html = html_row['html']
            elif url:
                html = getHTML(url)

        if html:
            tree = make_tree(html)
            lines = article_text(tree)
            soup = cleanSoup(BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES))

        if len(lines) < 1:
            html = None


    if not html or not url or not soup:
        if printMsg: print '[setAttributesForUrl]: Error - no html returned %s' % url
        delete_url(pgConn, url_id) # not sure we need to do this
        return False


    # get thumbnail
    # TODO: check to see if this is working correctly
    thumbnail = get_attribute(soup, 'img', url)

    if not len(thumbnail):
        if printMsg: print '[setAttributesForUrl]: Warning - no thumbnail returned - %s' % url

    # get title
    title = get_attribute(soup, 'title')
    if title is None: title = ''

    if not len(title):
        if printMsg: print '[setAttributesForUrl]: Warning - no title returned - %s' % url

    title = " ".join(title.strip().split())

    # get description
    description = get_attribute(soup, 'description', lines)
    if description == 'error':
        #delete_url(pgConn, url_id)
        description = None

    if description is None: description = ''

    try:
        description = " ".join(description.strip().split())
    except:
        pass

    if printMsg:
        print ""
        print "-----------------------------------------"
        print "URL ID: %s" % url_id
        print "Title: %s" % title
        print "Desc: %s" % description
        print "IMG: %s" %  thumbnail
        print "-----------------------------------------"

    if not len(description):
        if printMsg: print '[setAttributesForUrl]: Warning - no description returned - %s' % url

    # save
    if pgConn:
        save_url_attributes(pgConn, url_id, title, description, thumbnail)

    return True  # attributes were extracted and saved
def urlValidation(pgConn, url_long, hosts=[]):
    # TODO: fix: the expanded URL is sometimes not returned
    url_parts = getURLCore(pgConn, url_long) # returns list or None

    valid = False
    url_core = None
    html = None

    if not url_parts:
        print "[addURL]: %s - Error - not able to find a core: %s" % (getCurrentTime(), url_long)
        return url_core, html

    if url_parts and len(url_parts):
        url_core = url_parts[0]
        url_netloc = url_parts[1]

    if url_core and url_netloc:
        #validate here to only the media outlets in hosts

        if len(hosts):
            net_loc_lower = url_netloc.lower().strip()
            ext = tldextract.extract(net_loc_lower)
            normalized_url_netloc = "%s.%s" % (ext.domain, ext.suffix)

            for host in hosts:
                host_parts = host.split('/')  # e.g. spiegel.de/international

                if host_parts[0] == normalized_url_netloc:
                    if len(host_parts) == 2:
                        if host in url_core:
                            valid = True
                    else:
                        valid = True

                    if valid: break

        else:
            valid = True


        if not valid:
            print "[addURL]: %s - Error - url is not in the whitelist of hosts - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc)

        if valid:
            valid = is_url_valid(url_core)

            if not valid:
                print "[addURL]: %s - Error - url seems to be an image, video or audio file - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc)


        # got this far; now check whether there is quality text on the page
        html = None
        if valid:
            html = getHTML(url_core)
            if html:
                tree = make_tree(html)
                lines = article_text(tree)
                if len(lines) < 1:
                    valid = False

            else:
                valid = False

            if not valid:
                print "[addURL]: %s - Error - there seems to be no paragraphs to read - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc)

    if not valid:
        url_core = None
        html = None

    return url_core, html
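Taken together, urlValidation and setAttributesForUrl suggest a two-step flow: validate the URL and fetch its HTML first, then extract and store the metadata. A hypothetical glue snippet, assuming pgConn, url_id, and url_long are already available; the host whitelist is illustrative:

hosts = ['spiegel.de/international', 'newstatesman.com']  # illustrative whitelist

# Step 1: validate the URL against the whitelist and fetch its HTML.
url_core, html = urlValidation(pgConn, url_long, hosts=hosts)

# Step 2: if the URL survived validation, extract and persist its metadata.
if url_core and html:
    setAttributesForUrl(pgConn, url_id, html_str=html, url_core=url_core)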