Example #1
 def auto_excerpt(self):
     """
     Attempts to detect the text of this page (ignoring all navigation and
     other clutter), returning a list of strings. Each string represents a
     paragraph.
     """
     from ebdata.textmining.treeutils import make_tree
     tree = make_tree(self.html)
     if self.seed.rss_full_entry:
         from ebdata.templatemaker.textlist import html_to_paragraph_list
         paras = html_to_paragraph_list(tree)
     else:
         if self.seed.strip_noise:
             from ebdata.templatemaker.clean import strip_template
             try:
                 html2 = self.companion_page().html
             except IndexError:
                 pass
             else:
                 tree2 = make_tree(html2)
                 strip_template(tree, tree2)
         if self.seed.guess_article_text:
             from ebdata.templatemaker.articletext import article_text
             paras = article_text(tree)
         else:
             from ebdata.templatemaker.textlist import html_to_paragraph_list
             paras = html_to_paragraph_list(tree)
     return paras
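The method picks one of three extraction strategies based on the seed's flags: full RSS entries go straight to html_to_paragraph_list(), otherwise the tree is optionally de-noised against a companion page with strip_template() before either article_text() or html_to_paragraph_list() runs. A minimal standalone sketch of the same pipeline, assuming the ebdata packages are importable (excerpt_from_html, raw_html and companion_html are hypothetical names standing in for the model attributes):

from ebdata.textmining.treeutils import make_tree
from ebdata.templatemaker.clean import strip_template
from ebdata.templatemaker.articletext import article_text

def excerpt_from_html(raw_html, companion_html=None):
    tree = make_tree(raw_html)
    if companion_html is not None:
        # Strip markup shared with a sibling page rendered from the same template.
        strip_template(tree, make_tree(companion_html))
    return article_text(tree)  # list of paragraph strings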
Example #2
 def auto_excerpt(self):
     """
     Attempts to detect the text of this page (ignoring all navigation and
     other clutter), returning a list of strings. Each string represents a
     paragraph.
     """
     from ebdata.textmining.treeutils import make_tree
     tree = make_tree(self.html)
     if self.seed.rss_full_entry:
         from ebdata.templatemaker.textlist import html_to_paragraph_list
         paras = html_to_paragraph_list(tree)
     else:
         if self.seed.strip_noise:
             from ebdata.templatemaker.clean import strip_template
             try:
                 html2 = self.companion_page().html
             except IndexError:
                 pass
             else:
                 tree2 = make_tree(html2)
                 strip_template(tree, tree2)
         if self.seed.guess_article_text:
             from ebdata.templatemaker.articletext import article_text
             paras = article_text(tree)
         else:
             from ebdata.templatemaker.textlist import html_to_paragraph_list
             paras = html_to_paragraph_list(tree)
     return paras
Example #3
def main(url):
    if not url:
        print "No url provided"
        sys.exit()

    #url = 'http://newstatesman.com/politics/2013/10/russell-brand-on-revolution'
    #h = getHTML(url)
    html = UnicodeRetriever().fetch_data(url)
    tree = make_tree(html)
    lines = article_text(tree)

    file_type = magic.from_buffer(html, mime=True)
    print "File Type: %s" % file_type
    #print html

    url_obj = urlparse(url)
    if not url_obj.path:
        print "URL is top-level"

    if len(lines) < 1:
        print "URL is top-level"

    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #print get_attribute(html, 'img', url)

    img = get_attribute(soup, 'img', url)
    title = get_attribute(soup, 'title', url)
    desc = get_attribute(soup, 'description', lines)

    print "Title: %s" % title
    print "Desc: %s" % desc
    print "IMG: %s" % img
Example #4
def main(url):
    if not url:
        print "No url provided"
        sys.exit()

    #url = 'http://newstatesman.com/politics/2013/10/russell-brand-on-revolution'
    #h = getHTML(url)
    html = UnicodeRetriever().fetch_data(url)
    tree = make_tree(html)
    lines = article_text(tree)


    file_type = magic.from_buffer(html, mime=True)
    print "File Type: %s" % file_type
    #print html

    url_obj = urlparse(url)
    if not url_obj.path:
        print "URL is top-level"

    if len(lines) < 1:
        print "URL is top-level"

    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    #print get_attribute(html, 'img', url)

    img = get_attribute(soup, 'img', url)
    title = get_attribute(soup, 'title', url)
    desc = get_attribute(soup, 'description', lines)

    print "Title: %s" % title
    print "Desc: %s" % desc
    print "IMG: %s" % img
Example #5
 def get_printer_friendly_page(self, html, url):
     """
     Parses the given detail page and returns the printer-friendly page, or
     None if not found.
     """
     print_link = printer_friendly_link(make_tree(html))
     if print_link:
         print_link = urlparse.urljoin(url, print_link)
         try:
             return self.get_article_page(print_link)
         except Exception, e:
             self.logger.debug('Error retrieving printer-friendly page %s: %s', url, e)
             return None
Example #6
 def get_printer_friendly_page(self, html, url):
     """
     Parses the given detail page and returns the printer-friendly page, or
     None if not found.
     """
     print_link = printer_friendly_link(make_tree(html))
     if print_link:
         print_link = urlparse.urljoin(url, print_link)
         try:
             return self.get_article_page(print_link)
         except Exception, e:
             self.logger.debug(
                 'Error retrieving printer-friendly page %s: %s', url, e)
             return None
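get_printer_friendly_page() resolves the href returned by printer_friendly_link() against the article URL before fetching it, so relative and absolute links are handled the same way. With hypothetical URLs, urljoin behaves like this:

import urlparse  # Python 2 module; urllib.parse on Python 3

base = 'http://example.com/news/2013/story.html'
print(urlparse.urljoin(base, 'story_print.html'))   # http://example.com/news/2013/story_print.html
print(urlparse.urljoin(base, '/print/story.html'))  # http://example.com/print/story.html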
Example #7
 def assertMakeTree(self, html, expected):
     import warnings
     # Note, warnings.catch_warnings() should work but has no effect here?
     warnings.simplefilter('ignore', UnicodeWarning)
     got = etree.tostring(make_tree(html), method='html')
     self.assertEqual(got, expected)
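The comment above points at a common gotcha: warnings.catch_warnings() only saves and restores the warning filter state, it silences nothing by itself. To scope the suppression to the test, a filter has to be installed inside the block (standard library only):

import warnings

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UnicodeWarning)
    # ... code that may emit UnicodeWarning runs here ...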
Example #8
 def assertConverts(self, html, expected):
     self.assertEqual(html_to_paragraph_list(make_tree(html)), expected)
Example #9
def setAttributesForUrl(pgConn, url_id, html_str=None, url_core=None):
    """ Downloads the HTML if not found in DB
        pulls out the title, description, thumbnail, etc
        saves this meta data to postgres
    """
    printMsg = True

    html = None
    url = None
    soup = None
    lines = []

    if not pgConn:
        if printMsg: print '[setAttributesForUrl]: Error - No postgres connection'
        return False

    if url_core:
        url = url_core

    if not url:
        row = get_url(pgConn, url_id)
        if row:
            url = row['url']

    if url:
        url_obj = urlparse(url)
        if len(url_obj.path) < 5:
            url = None
    if url:
        if html_str:
            html = html_str

        if not html:
            html_row = get_html(pgConn, url_id)
            if html_row:
                html = html_row['html']
            elif url:
                html = getHTML(url)

        if html:
            tree = make_tree(html)
            lines = article_text(tree)
            soup = cleanSoup(BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES))

        if len(lines) < 1:
            html = None


    if not html or not url or not soup:
        if printMsg: print '[setAttributesForUrl]: Error - no html returned %s' % url
        delete_url(pgConn, url_id) # not sure we need to do this
        return False


    # get thumbnail
    # TODO: check to see if this is working correctly
    thumbnail = get_attribute(soup, 'img', url)

    if not len(thumbnail):
        if printMsg: print '[setAttributesForUrl]: Warning - no thumbnail returned - %s' % url

    # get title
    title = get_attribute(soup, 'title')
    if title is None: title = ''

    if not len(title):
        if printMsg: print '[setAttributesForUrl]: Warning - no title returned - %s' % url

    title = " ".join(title.strip().split())

    # get description
    description = get_attribute(soup, 'description', lines)
    if description == 'error':
        #delete_url(pgConn, url_id)
        description = None

    if description is None: description = ''

    try:
        description = " ".join(description.strip().split())
    except:
        pass  # leave a non-string description untouched

    if printMsg:
        print ""
        print "-----------------------------------------"
        print "URL ID: %s" % url_id
        print "Title: %s" % title
        print "Desc: %s" % description
        print "IMG: %s" %  thumbnail
        print "-----------------------------------------"

    if not len(description):
        if printMsg: print '[setAttributesForUrl]: Warning - no description returned - %s' % url

    # save
    if pgConn:
        save_url_attributes(pgConn, url_id, title, description, thumbnail)

    return False
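Both the title and the description are normalized with " ".join(value.strip().split()), which trims the string and collapses runs of whitespace (including newlines pulled out of the HTML) into single spaces:

raw = '  Breaking\n  News:   make_tree   in use \t '
print(" ".join(raw.strip().split()))  # 'Breaking News: make_tree in use'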
Example #10
def urlValidation(pgConn, url_long, hosts=[]):
    # TODO: fix periodically not returning  the expanded URL
    url_parts = getURLCore(pgConn, url_long) # returns list or None

    valid = False
    url_core = None
    html = None

    if not url_parts:
        print "[addURL]: %s - Error - not able to find a core: %s" % (getCurrentTime(), url_long)
        return url_core, html

    if url_parts and len(url_parts):
        url_core = url_parts[0]
        url_netloc = url_parts[1]

    if url_core and url_netloc:
        #validate here to only the media outlets in hosts

        if len(hosts):
            net_loc_lower = url_netloc.lower().strip()
            ext = tldextract.extract(net_loc_lower)
            normalized_url_netloc = "%s.%s" % (ext.domain, ext.suffix)

            for host in hosts:
                host_parts = host.split('/')  # e.g. spiegel.de/international

                #if not valid:
                if host_parts[0] == normalized_url_netloc:
                    if len(host_parts) == 2:
                        if host in url_core:
                            valid = True
                    else:
                        valid = True

                    if valid: break

        else:
            valid = True


        if not valid:
            print "[addURL]: %s - Error - url is not in the whitelist of hosts - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc)

        if valid:
            valid = is_url_valid(url_core)

            if not valid:
                print "[addURL]: %s - Error - url seems to be an image, video or audio file - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc)


        # got this far need to check if there is quality text on the page
        html = None
        if valid:
            html = getHTML(url_core)
            if html:
                tree = make_tree(html)
                lines = article_text(tree)
                if len(lines) < 1:
                    valid = False

            else:
                valid = False

            if not valid:
                print "[addURL]: %s - Error - there seems to be no paragraphs to read - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc)

    if not valid:
        url_core = None
        html = None

    return url_core, html
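The whitelist check reduces the URL's netloc to its registered domain with tldextract before comparing it against the configured hosts, so subdomains such as www. or news. do not defeat the match. A small illustration (assuming the tldextract package):

import tldextract

ext = tldextract.extract('www.spiegel.de')
print("%s.%s" % (ext.domain, ext.suffix))  # spiegel.de

ext = tldextract.extract('news.bbc.co.uk')
print("%s.%s" % (ext.domain, ext.suffix))  # bbc.co.uk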
Example #11
def urlValidation(pgConn, url_long, hosts=[]):
    # TODO: fix periodically not returning  the expanded URL
    url_parts = getURLCore(pgConn, url_long)  # returns list or None

    valid = False
    url_core = None
    html = None

    if not url_parts:
        print "[addURL]: %s - Error - not able to find a core: %s" % (
            getCurrentTime(), url_long)
        return url_core, html

    if url_parts and len(url_parts):
        url_core = url_parts[0]
        url_netloc = url_parts[1]

    if url_core and url_netloc:
        #validate here to only the media outlets in hosts

        if len(hosts):
            net_loc_lower = url_netloc.lower().strip()
            ext = tldextract.extract(net_loc_lower)
            normalized_url_netloc = "%s.%s" % (ext.domain, ext.suffix)

            for host in hosts:
                host_parts = host.split('/')  # e.g. spiegel.de/international

                #if not valid:
                if host_parts[0] == normalized_url_netloc:
                    if len(host_parts) == 2:
                        if host in url_core:
                            valid = True
                    else:
                        valid = True

                    if valid: break

        else:
            valid = True

        if not valid:
            print "[addURL]: %s - Error - url is not in the whitelist of hosts - Core: %s - Net: %s" % (
                getCurrentTime(), url_core, url_netloc)

        if valid:
            valid = is_url_valid(url_core)

            if not valid:
                print "[addURL]: %s - Error - url seems to be an image, video or audio file - Core: %s - Net: %s" % (
                    getCurrentTime(), url_core, url_netloc)

        # got this far need to check if there is quality text on the page
        html = None
        if valid:
            html = getHTML(url_core)
            if html:
                tree = make_tree(html)
                lines = article_text(tree)
                if len(lines) < 1:
                    valid = False

            else:
                valid = False

            if not valid:
                print "[addURL]: %s - Error - there seems to be no paragraphs to read - Core: %s - Net: %s" % (
                    getCurrentTime(), url_core, url_netloc)

    if not valid:
        url_core = None
        html = None

    return url_core, html
Example #12
 def assertMakeTree(self, html, expected):
     got = etree.tostring(make_tree(html), method='html')
     self.assertEqual(got, expected)
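These assertion helpers serialize the parsed tree back to markup with lxml so it can be compared to an expected string; method='html' keeps HTML serialization rules (void elements such as <br> stay unclosed) instead of XML ones. A small lxml-only sketch, using lxml.html.fromstring as a stand-in for make_tree:

from lxml import etree
from lxml.html import fromstring

tree = fromstring('<p>Hello<br>world</p>')
print(etree.tostring(tree, method='html'))  # <p>Hello<br>world</p> (bytes on Python 3)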
Example #13
 def assertMakeTree(self, html, expected):
     got = etree.tostring(make_tree(html), method='html')
     self.assertEqual(got, expected)
Example #14
        to_delete = []
        for i, paragraph in enumerate(section):
            if paragraph.lower() in ignored_paragraphs:
                to_delete.append(i)
            elif is_punctuated(paragraph) and len(paragraph) >= MIN_CHARS_IN_PARAGRAPH:
                count += 1
        percent_punctuated = decimal.Decimal(count) / decimal.Decimal(len(section))
        if count >= NUM_PARAGRAPHS_SAFE_GUESS or (count >= MIN_NUM_PUNCTUATED and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
            for i in reversed(to_delete): # Delete in reverse so that index order is preserved.
                del section[i]
            final_sections.append(section)
    return final_sections

def article_text(tree):
    """
    Simple wrapper around article_text_sections() that "flattens" sections into
    a single section.
    """
    result = []
    for section in article_text_sections(tree):
        result.extend(section)
    return result

if __name__ == "__main__":
    from ebdata.retrieval import UnicodeRetriever
    from ebdata.textmining.treeutils import make_tree
    import sys
    html = UnicodeRetriever().fetch_data(sys.argv[1])
    lines = article_text(make_tree(html))
    print lines
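Earlier in this snippet the indices collected in to_delete are removed in reverse order, so deleting one paragraph never shifts the positions of the others still queued for deletion:

section = ['keep', 'advertisement', 'keep', 'share this']
to_delete = [1, 3]
for i in reversed(to_delete):  # delete index 3 first, then index 1
    del section[i]
print(section)  # ['keep', 'keep']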
Example #15
 def assertConverts(self, html, expected):
     self.assertEqual(html_to_paragraph_list(make_tree(html)), expected)
Example #16
def setAttributesForUrl(pgConn, url_id, html_str=None, url_core=None):
    """ Downloads the HTML if not found in DB
        pulls out the title, description, thumbnail, etc
        saves this meta data to postgres
    """
    printMsg = True

    html = None
    url = None
    soup = None
    lines = []

    if not pgConn:
        if printMsg:
            print '[setAttributesForUrl]: Error - No postgres connection'
        return False

    if url_core:
        url = url_core

    if not url:
        row = get_url(pgConn, url_id)
        if row:
            url = row['url']

    if url:
        url_obj = urlparse(url)
        if len(url_obj.path) < 5:
            url = None
    if url:
        if html_str:
            html = html_str

        if not html:
            html_row = get_html(pgConn, url_id)
            if html_row:
                html = html_row['html']
            elif url:
                html = getHTML(url)

        if html:
            tree = make_tree(html)
            lines = article_text(tree)
            soup = cleanSoup(
                BeautifulSoup(html,
                              convertEntities=BeautifulSoup.HTML_ENTITIES))

        if len(lines) < 1:
            html = None

    if not html or not url or not soup:
        if printMsg:
            print '[setAttributesForUrl]: Error - no html returned %s' % url
        delete_url(pgConn, url_id)  # not sure we need to do this
        return False

    # get thumbnail
    # TODO: check to see if this is working correctly
    thumbnail = get_attribute(soup, 'img', url)

    if not len(thumbnail):
        if printMsg:
            print '[setAttributesForUrl]: Warning - no thumbnail returned - %s' % url

    # get title
    title = get_attribute(soup, 'title')
    if title is None: title = ''

    if not len(title):
        if printMsg:
            print '[setAttributesForUrl]: Warning - no title returned - %s' % url

    title = " ".join(title.strip().split())

    # get description
    description = get_attribute(soup, 'description', lines)
    if description == 'error':
        #delete_url(pgConn, url_id)
        description = None

    if description is None: description = ''

    try:
        description = " ".join(description.strip().split())
    except:
        pass  # leave a non-string description untouched

    if printMsg:
        print ""
        print "-----------------------------------------"
        print "URL ID: %s" % url_id
        print "Title: %s" % title
        print "Desc: %s" % description
        print "IMG: %s" % thumbnail
        print "-----------------------------------------"

    if not len(description):
        if printMsg:
            print '[setAttributesForUrl]: Warning - no description returned - %s' % url

    # save
    if pgConn:
        save_url_attributes(pgConn, url_id, title, description, thumbnail)

    return False
Example #17
 def assertPreprocesses(self, html, expected, **kwargs):
     import warnings
     with warnings.catch_warnings():
         tree = make_tree(html)
         got = etree.tostring(preprocess(tree, **kwargs), method='html')
         self.assertEqual(got, expected)
Example #18
 def assertPreprocesses(self, html, expected, **kwargs):
     import warnings
     with warnings.catch_warnings():
         tree = make_tree(html)
         got = etree.tostring(preprocess(tree, **kwargs), method='html')
         self.assertEqual(got, expected)
Example #19
        if count >= NUM_PARAGRAPHS_SAFE_GUESS or (
                count >= MIN_NUM_PUNCTUATED
                and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):

            # Delete in reverse so that index order is preserved.
            for i in reversed(to_delete):
                del section[i]
            final_sections.append(section)
    return final_sections


def article_text(tree):
    """
    Simple wrapper around article_text_sections() that "flattens" sections into
    a single section.
    """
    result = []
    for section in article_text_sections(tree):
        result.extend(section)
    return result


if __name__ == "__main__":
    from ebdata.retrieval import UnicodeRetriever
    from ebdata.textmining.treeutils import make_tree
    import sys
    html = UnicodeRetriever().fetch_data(sys.argv[1])
    lines = article_text(make_tree(html))
    print lines
Example #20
 def assertPreprocesses(self, html, expected, **kwargs):
     tree = make_tree(html)
     got = etree.tostring(preprocess(tree, **kwargs), method='html')
     self.assertEqual(got, expected)
Example #21
 def assertPreprocesses(self, html, expected, **kwargs):
     tree = make_tree(html)
     got = etree.tostring(preprocess(tree, **kwargs), method='html')
     self.assertEqual(got, expected)
Example #22
 def assertMakeTree(self, html, expected):
     import warnings
     # Note, warnings.catch_warnings() should work but has no effect here?
     warnings.simplefilter('ignore', UnicodeWarning)
     got = etree.tostring(make_tree(html), method='html')
     self.assertEqual(got, expected)