Example #1
def run_parse():
    page = urllib.request.urlopen(base_url)
    doc = html.document_fromstring(page.read())
    doc.make_links_absolute(base_url=base_url)
    for link in html.iterlinks(doc):
        if ("forumdisplay.php" in link[2]) and ("f=43" in link[2]):
            v_chapter_name = link[0].text_content()
            v_path = folder_prefix + v_chapter_name
            v_link = link[2]
            # Create the folder
            if not os.path.exists(v_path):
                os.makedirs(v_path)
            page = urllib.request.urlopen(v_link)
            doc = html.document_fromstring(page.read())
            doc.make_links_absolute(base_url=base_url)
            for link_topics in html.iterlinks(doc):
                parsed_url = urllib.parse.urlparse(link_topics[2])
                # print(parsed_url)
                parsed_q = urllib.parse.parse_qs(parsed_url.query)
                # print(parsed_q)
                # Find the link to the first page
                if (parsed_url.path == "/showthread.php") and ("t" in parsed_q) and not ("page" in parsed_q) and \
                        (link_topics[0].text_content() != "1")\
                        and (parsed_q["t"][0] == "1537" ):
                    parse_topic(link_topics, v_path)
                #print(parsed_q)
Example #2
def run_parse():
    page = urllib.request.urlopen(base_url)
    doc = html.document_fromstring(page.read())
    doc.make_links_absolute(base_url=base_url)
    for link in html.iterlinks(doc):
        if ("forumdisplay.php" in link[2]) and ("f=43" in link[2]):
            v_chapter_name = link[0].text_content()
            v_path = folder_prefix + v_chapter_name
            v_link = link[2]
            # Create the folder
            if not os.path.exists(v_path):
                os.makedirs(v_path)
            page = urllib.request.urlopen(v_link)
            doc = html.document_fromstring(page.read())
            doc.make_links_absolute(base_url=base_url)
            for link_topics in html.iterlinks(doc):
                parsed_url = urllib.parse.urlparse(link_topics[2])
                # print(parsed_url)
                parsed_q = urllib.parse.parse_qs(parsed_url.query)
                # print(parsed_q)
                # Find the link to the first page
                if (parsed_url.path == "/showthread.php") and ("t" in parsed_q) and not ("page" in parsed_q) and \
                        (link_topics[0].text_content() != "1")\
                        and (parsed_q["t"][0] == "1537" ):
                    parse_topic(link_topics, v_path)
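Several of these examples index the tuples yielded by iterlinks() by position (link[0], link[2], and so on). For reference, a minimal sketch of the (element, attribute, link, pos) layout, assuming only that lxml is installed and using a made-up inline fragment:

import lxml.html

doc = lxml.html.fromstring('<p><a href="/forumdisplay.php?f=43">Chapter</a></p>')
for element, attribute, link, pos in doc.iterlinks():
    # element is the <a> Element, attribute is 'href',
    # link is '/forumdisplay.php?f=43', and pos is the character offset
    # of the link inside the attribute value (0 for ordinary href links).
    print(element.tag, attribute, link, pos)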
Example #3
    def fix_inner_link(self, page_html):
        """
        Changes inner book links to point to the saved HTML file rather than the
        URL on CNX
        :param page_html: HTML that could contain page links
        :return: HTML with corrected links
        """
        html = lxml.html.document_fromstring(page_html)
        for element, attribute, link, pos in html.iterlinks():
            if '/contents/' in link and element.text is not None:
                print "fix_link: " + link
                content_index = link.find('/contents/')
                pound_index = link.find('#')
                id = ''
                if pound_index == -1:
                    id = link[content_index + 10:]
                else:
                    id = link[content_index + 10:pound_index]
                anchor = ''
                if pound_index > -1:
                    anchor = link[pound_index:]

                if self.remove_version(id) in self.get_page_ids():
                    print "fix_inner_link: " + id
                    page = self.get_page_json(id)
                    title = page['id'] + ".html"
                    link_text = title + anchor
                    element.attrib['href'] = link_text
        return lxml.html.tostring(html)
Example #4
def scrape_ulsterbus():
    """
    For Ulsterbus there is a single index page containing the links
    to the individual service timetables. This index page is
    paginated - 5 pages with <= 100 items on each page. The paginator
    employs 'JS to form POST' buttons, so we need to mimic the
    individual POSTS here.
    """
    fmt = (
        'ctl00$MainRegion$MainContentRegion$MainBodyRegion'
        '$ctl00$rptPageList$ctl00$ctl03$ctl01$ctl%02d'
    )
    for x in [5, 7, 9, 11, 13]:
        done_routes = set()
        payload = {
            '__EVENTTARGET': fmt % x,
            '__EVENTARGUMENT': '',
        }
        response = requests.post(TRANSLINK_ULSTERBUS_INDEX, data=payload)
        html = parse(response.content)
        for link in html.iterlinks():
            url = link[2]
            if valid_ulsterbus_route_url(url) and url not in done_routes:
                href = TRANSLINK_URL + url
                for row in iter_scrape_route(ULSTERBUS_ID, BUS_ROUTE_TYPE, href):
                    yield row
                done_routes.add(url)
Example #5
    def get_for_flickr_photo(flickr_user, flickr_id):
        """ Return license instance """

        try:
            response = urllib2.urlopen(
                'http://www.flickr.com/photos/%s/%s/' %
                (flickr_user, flickr_id)
            ).read()
        except urllib2.URLError as e:
            if e.code == 404:
                return License.objects.get_or_create(
                    name='All Rights Reserved')[0]
            else:
                return None

        html = lxml.html.fromstring(response)
        for element, attribute, link, pos in html.iterlinks():
            if (attribute and link and attribute.lower() == 'href' and
                    link.startswith('http://creativecommons.org/licenses/') and
                    link.endswith('/')):
                return License.objects.get_or_create(
                    creative_commons=True,
                    url=link)[0]

        #for element, attribute, link, pos in html.iterlinks():
            #if (attribute and link and attribute.lower() == 'href' and
                #link.endswith('/help/general/#147')):
                #return License.objects.get_or_create(
                        #name='All Rights Reserved')[0]

        return License.objects.get_or_create(
            name='All Rights Reserved')[0]
Example #6
    def get_for_blendswap_scene(blendswap_id):
        """ Return (license, blendswap url, blendswap username) """

        blendswap_url = 'http://www.blendswap.com/blends/view/%s' % blendswap_id
        print 'Visiting %s...' % blendswap_url

        try:
            response = urllib2.urlopen(blendswap_url).read()
        except urllib2.URLError as e:
            if e.code == 404:
                license, _ = License.objects.get_or_create(name='All Rights Reserved')
                return license, None, None
            else:
                return None, None, None

        html = lxml.html.fromstring(response)
        blendswap_username = html.find_class("user_link")[0].text_content().strip()

        for element, attribute, link, pos in html.iterlinks():
            if (attribute and link and attribute.lower() == 'href' and
                    (link.startswith('http://creativecommons.org/licenses/') or
                     link.startswith('http://creativecommons.org/publicdomain/')) and
                    link.endswith('/')):
                license, _ = License.objects.get_or_create(
                    creative_commons=True, url=link)
                return license, blendswap_url, blendswap_username

        raise ValueError("Could not find license")
Example #7
def html_malware_scan(url, topic):
    """
    Crawls a page to depth 1 and returns the total count of the
    keywords in topic for each of the pages in the crawl.
    """

    response = requests.get(url, timeout=10.0)
    html = lxml.html.fromstring(response.content, base_url=response.url)
    html.make_links_absolute(resolve_base_href=True)

    childs_topic_cnt = 0
    main_page_topic_cnt = response.text.lower().count(topic)

    for url in Bar().iter(
        {link
         for element, attribute, link, pos in html.iterlinks()}):
        #for url in {link for element, attribute, link, pos in html.iterlinks()}:
        childs_topic_cnt += check_content(url, topic)

    if (float(main_page_topic_cnt) / (float(childs_topic_cnt) + 1.0) >= 1.0
            or float(main_page_topic_cnt) /
        (float(childs_topic_cnt) + 1.0) == 0):
        return True
    else:
        return False
Example #8
def image_fixups(content, msgid, archive, richformat, allowimgs):
    "Replace the CID links stored messages"
    html = local_fromstring(content)
    for element, attribute, link, _ in iterlinks(html):
        if not link.startswith('cid:'):
            if not allowimgs and attribute == 'src':
                element.attrib['src'] = '%simgs/blocked.gif' % media_url()
                element.attrib['title'] = link
                if richformat:
                    if archive:
                        displayurl = url('message-preview-archived-with-imgs',
                                        msgid=msgid)
                    else:
                        displayurl = url('message-preview-with-imgs',
                                        msgid=msgid)
                    flash(ugettext('This message contains external'
                        ' images, which have been blocked. ') +
                        literal(link_to(ugettext('Display images'),
                                displayurl)))
        else:
            imgname = link.replace('cid:', '')
            if archive:
                imgurl = url('messages-preview-archived-img',
                            img=imgname.replace('/', '__xoxo__'),
                            msgid=msgid)
            else:
                imgurl = url('messages-preview-img',
                            img=imgname.replace('/', '__xoxo__'),
                            msgid=msgid)
            element.attrib['src'] = imgurl            
    return tostring(html)
Example #9
def html_malware_scan(url, topic):
    """
    Crawls a page to depth 1 and returns the total count of the
    keywords in topic for each of the pages in the crawl.
    """

    response = requests.get(url, timeout=10.0)
    html = lxml.html.fromstring(
        response.content,
        base_url=response.url
    )
    html.make_links_absolute(resolve_base_href=True)

    childs_topic_cnt = 0
    main_page_topic_cnt = response.text.lower().count(topic)


    for url in Bar().iter({link for element, attribute, link, pos in html.iterlinks()}):
    #for url in {link for element, attribute, link, pos in html.iterlinks()}:
            childs_topic_cnt += check_content(url, topic)

    if (float(main_page_topic_cnt)/(float(childs_topic_cnt)+1.0) >= 1.0 or
        float(main_page_topic_cnt)/(float(childs_topic_cnt)+1.0) == 0):
        return True
    else:
        return False
Example #10
def parse_links_xpath(filename):
    """question 2b

    Do the same using xpath and the lxml library from http://lxml.de rather
    than regular expressions.
    
    Which approach is better? (Hint: http://goo.gl/mzl9t)
    """
    
    from lxml.html import iterlinks

    f = open(filename)
    site = f.read()

    html = iterlinks(site)
    values = []
    keys = []

    for a in html:
        keys.append(a[0].text)
        values.append(a[2])

    dct = dict(zip(keys, values))

    f.close()
    return dct
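The docstring above asks for an XPath-based variant; here is a hedged sketch of the same extraction done with an explicit XPath query instead of iterlinks(), assuming the file holds plain HTML (parse_links_xpath_alt is a hypothetical name, not part of the original exercise):

from lxml import html

def parse_links_xpath_alt(filename):
    # Parse the file and collect anchor text -> href pairs via XPath.
    tree = html.parse(filename)
    return {a.text: a.get('href') for a in tree.xpath('//a[@href]')}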
Example #11
def grab_urls(content, url):
    urls = {}
    domain = urlparse(url).netloc
    html = document_fromstring(content)
    html.make_links_absolute(url, resolve_base_href=True)

    for element, attribute, link, pos in html.iterlinks():
        if attribute != "href":
            continue

        # skip if not on our domain
        if urlparse(link).netloc != domain and urlparse(
                link).netloc != "www." + domain:
            continue

        # skip if self referential
        if (url.split("//")[1] + "#") in link:
            continue

        text = element.text_content() if len(
            element) == 0 else element[0].text_content()
        text = text.lstrip() if text is not None else ""
        # compute relevancy here

        relevance[link] = relevancy(link, text, url)
        urls[link] = 1

        if text != "":
            print text
        print link
        print

    return urls.keys()
Example #12
    def _post_process_html(self, content):
        html = lxml.html.fromstring(content)
        if self.links:
            html.rewrite_links(self._map_cid)

            for link in html.iterlinks():
                link[0].set("target", "_blank")
        else:
            html.rewrite_links(lambda x: None)
        safe_attrs = list(defs.safe_attrs) + ["class", "style"]
        cleaner = Cleaner(
            scripts=True,
            javascript=True,
            links=True,
            page_structure=True,
            embedded=True,
            frames=True,
            add_nofollow=True,
            safe_attrs=safe_attrs
        )
        mail_text = lxml.html.tostring(
            cleaner.clean_html(html), encoding="unicode")
        with open("/tmp/output.txt", "w") as fp:
            fp.write(mail_text)
        return smart_text(mail_text)
Example #13
 def _get_desc(self, cr, uid, ids, field_name=None, arg=None, context=None):
     res = dict.fromkeys(ids, '')
     for module in self.browse(cr, uid, ids, context=context):
         path = get_module_resource(module.name,
                                    'static/description/index.html')
         if path:
             with tools.file_open(path, 'rb') as desc_file:
                 doc = desc_file.read()
                 html = lxml.html.document_fromstring(doc)
                 for element, attribute, link, pos in html.iterlinks():
                     if element.get('src') and not '//' in element.get(
                             'src') and not 'static/' in element.get('src'):
                         element.set(
                             'src', "/%s/static/description/%s" %
                             (module.name, element.get('src')))
                 res[module.id] = html_sanitize(lxml.html.tostring(html))
         else:
             overrides = {
                 'embed_stylesheet': False,
                 'doctitle_xform': False,
                 'output_encoding': 'unicode',
                 'xml_declaration': False,
             }
             output = publish_string(source=module.description or '',
                                     settings_overrides=overrides,
                                     writer=MyWriter())
             res[module.id] = html_sanitize(output)
     return res
Example #14
def scrape_ulsterbus():
    """
    For Ulsterbus there is a single index page containing the links
    to the individual service timetables. This index page is
    paginated - 5 pages with <= 100 items on each page. The paginator
    employs 'JS to form POST' buttons, so we need to mimic the
    individual POSTS here.
    """
    fmt = ('ctl00$MainRegion$MainContentRegion$MainBodyRegion'
           '$ctl00$rptPageList$ctl00$ctl03$ctl01$ctl%02d')
    for x in [5, 7, 9, 11, 13]:
        done_routes = set()
        payload = {
            '__EVENTTARGET': fmt % x,
            '__EVENTARGUMENT': '',
        }
        response = requests.post(TRANSLINK_ULSTERBUS_INDEX, data=payload)
        html = parse(response.content)
        for link in html.iterlinks():
            url = link[2]
            if valid_ulsterbus_route_url(url) and url not in done_routes:
                href = TRANSLINK_URL + url
                for row in iter_scrape_route(ULSTERBUS_ID, BUS_ROUTE_TYPE,
                                             href):
                    yield row
                done_routes.add(url)
Example #15
def write_html_sig(sigfile, sig, basedir, is_domain, logger):
    "write html sig"
    cleaner = SignatureCleaner(style=True, remove_tags=UNCLEANTAGS,
                                safe_attrs_only=False)
    html = cleaner.clean_html(sig.signature_content)
    html = fragments_fromstring(html)[0]
    for element, attribute, link, pos in iterlinks(html):
        if link.startswith('/settings/imgs/'):
            view, args, kwargs = resolve(link)
            view = None
            args = None
            img = SignatureImg.objects.get(pk=kwargs['img_id'])
            if is_domain:
                imgfile = '%s/domains/imgs/%s' % (basedir, img.name)
            else:
                imgfile = '%s/users/imgs/%s' % (basedir, img.name)
            element.attrib['src'] = 'cid:%s' % img.name
            imghandle = open(imgfile, 'wb')
            imghandle.write(base64.decodestring(img.image))
            imghandle.close()
            logger.info(_("Wrote img: %(img)s") % dict(img=imgfile))
            # update the sig with image obtained
            sig.image = img
    if 'link' in locals():
        sig.save()
    sighandle = open(sigfile, 'w')
    if not sig.signature_content.startswith('--'):
        sighandle.write('<br/>--<br/>')
    sighandle.write(tostring(html))
    logger.info(_("Wrote html signature: %(sig)s") % dict(sig=sigfile))
Example #16
    def get_links(self, url, starts_with=None, ends_with=None):
        results = []
        content = self.download(url)
        if not content:
            return []

        if not (starts_with or ends_with):
            raise NotImplementedError(
                'get_links requires either `startswith` or `endswith`')

        html = lxml.html.document_fromstring(content)

        path = urlparse.urlparse(url).path

        def url_match(link):
            # The link might be something like "/pub/mobile/nightly/"
            # but we're looking for a path that starts with "nightly".
            # So first we need to remove what's part of the base URL
            # to make a fair comparison.
            if starts_with is not None:
                # If the current URL is http://example.com/some/dir/
                # and the link is /some/dir/mypage/ and the thing
                # we're looking for is "myp" then this should be true
                if link.startswith(path):
                    link = link.replace(path, '')
                return link.startswith(starts_with)
            elif ends_with:
                return link.endswith(ends_with)
            return False

        for _, _, link, _ in html.iterlinks():
            if url_match(link):
                results.append(urlparse.urljoin(url, link))
        return results
Example #17
def get_page():
    # request the page
    r = requests.get(request.args['url'])
    # parse the dom into python objects
    html = lxml.html.document_fromstring(r.content)
    # prase the requested url so we can form the base href
    url = urlparse(request.args['url'])
    # create the base url dom fragment
    base_url = lxml.html.fromstring("<base href='%s://%s'>" % (url.scheme, url.hostname)).find('.//base')
    # find the head element
    head = html.find(".//head")
    # insert the base href in the last place of the head elements
    head.insert(-1, base_url)
    # rewrite urls to have absolute url
    html.resolve_base_href()
    # rewrite links to load through this proxy
    for element, attribute, link, pos in html.iterlinks():
        if element.tag == "a" and attribute == "href":
            link = "http://localhost:8888/translate_url?url=%s" % (link)
            element.set("href", link)
            element.set("target", "_parent")
    # translate through DOM Traversal
    # html = translate_dom_string(html, lxml.html.tostring(html))
    # translate through HTML regex string replacement
    html = translate_html(html, lxml.html.tostring(html))
    # dump the html string for debugging
    # with open('html_dump', 'w') as f:
    #     f.write(lxml.html.tostring(html))
    # a little regex to remove any script tags
    return re.subn(r'<(script).*?</\1>(?s)', '', lxml.html.tostring(html))[0]
Example #18
 def _get_desc(self, cr, uid, ids, field_name=None, arg=None, context=None):
     res = dict.fromkeys(ids, "")
     for module in self.browse(cr, uid, ids, context=context):
         path = get_module_resource(module.name, "static/description/index.html")
         if path:
             with tools.file_open(path, "rb") as desc_file:
                 doc = desc_file.read()
                 html = lxml.html.document_fromstring(doc)
                 for element, attribute, link, pos in html.iterlinks():
                     if (
                         element.get("src")
                         and not "//" in element.get("src")
                         and not "static/" in element.get("src")
                     ):
                         element.set("src", "/%s/static/description/%s" % (module.name, element.get("src")))
                 res[module.id] = html_sanitize(lxml.html.tostring(html))
         else:
             overrides = {
                 "embed_stylesheet": False,
                 "doctitle_xform": False,
                 "output_encoding": "unicode",
                 "xml_declaration": False,
             }
             output = publish_string(
                 source=module.description or "", settings_overrides=overrides, writer=MyWriter()
             )
             res[module.id] = html_sanitize(output)
     return res
Example #19
def xlinks(page):
    links = {}
    #    myparser = lxml.etree.HTMLParser(encoding="utf-8")
    #    html = lxml.etree.HTML(page, parser=myparser)
    html = lxml.html.document_fromstring(page)
    for (f_name_element, attr, f_link, pos) in html.iterlinks():
        #if(attr == 'href'):
        #    for r in html.xpath("//a") :
        #        f_link = l.get("href")

        links[f_link] = 1
        if f_link.find("washingtonpost") > 0:
            continue

        if f_link.find("washpost") > 0:
            continue

        if f_link.find("mailto") >= 0:
            #print "external mail",f_link,pos, attr
            continue
        if f_link.find(".gov") >= 0:

            f_link = f_link.rstrip("/")
            continue
        if f_link.find("wiki") >= 0:
            print "external wiki", f_link, pos, attr
    return links
Example #20
def parse(tid: str, host: str) -> None:
    job = get_current_job()
    print(f"Starting task for {host}")
    job.meta["status"] = "In progress"
    job.save_meta()

    links = []
    r = requests.get(f"https://{host}")
    if r.status_code == 200:
        for elem in iterlinks(r.text):
            el, href, path, n = elem
            #time.sleep(0.2)
            if path.startswith("/"):
                print(path)
                links.append(path)
        json_str = json.dumps(links, ensure_ascii=False, indent=4) + "\n"
        json_bytes = json_str.encode('utf-8')
        with tarfile.open(f"static/{tid}.tar.xz", "w:xz") as xz:
            buf = io.BytesIO(json_bytes)
            info = tarfile.TarInfo(f"{tid}.json")
            info.size = buf.seek(0, io.SEEK_END)
            xz.addfile(info, fileobj=io.BytesIO(buf.getvalue()))
        job.meta["status"] = "Completed"
        job.meta["url"] = f"https://timeweb.com/ru/task/{tid}.tar.xz"
        job.save_meta()
    else:
        job.meta["status"] = f"Error: {r.status_code}"
        job.save_meta()
    print('Task completed')
Example #21
def write_html_sig(sigfile, sig, basedir, is_domain, logger):
    "write html sig"
    cleaner = SignatureCleaner(style=True, remove_tags=UNCLEANTAGS,
                                safe_attrs_only=False)
    html = cleaner.clean_html(sig.signature_content)
    html = fragments_fromstring(html)[0]
    for element, attribute, link, pos in iterlinks(html):
        if link.startswith('/settings/imgs/'):
            view, args, kwargs = resolve(link)
            view = None
            args = None
            img = SignatureImg.objects.get(pk=kwargs['img_id'])
            if is_domain:
                imgfile = '%s/domains/imgs/%s' % (basedir, img.name)
            else:
                imgfile = '%s/users/imgs/%s' % (basedir, img.name)
            element.attrib['src'] = 'cid:%s' % img.name
            imghandle = open(imgfile, 'wb')
            imghandle.write(base64.decodestring(img.image))
            imghandle.close()
            logger.info(_("Wrote img: %(img)s") % dict(img=imgfile))
            # update the sig with image obtained
            sig.image = img
    if 'link' in locals():
        sig.save()
    sighandle = open(sigfile, 'w')
    if not sig.signature_content.startswith('--'):
        sighandle.write('<br/>--<br/>')
    sighandle.write(tostring(html))
    logger.info(_("Wrote html signature: %(sig)s") % dict(sig=sigfile))
	def get_links(self, noReturn = False):
		if self.links == []:
			so = self.get_source()
			self.links = list(iterlinks(so))
		
		if not noReturn:
			return self.links
Example #23
 def _get_desc(self):
     for module in self:
         if not module.name:
             module.description_html = False
             continue
         module_path = modules.get_module_path(module.name, display_warning=False)  # avoid to log warning for fake community module
         if module_path:
             path = modules.check_resource_path(module_path, 'static/description/index.html')
         if module_path and path:
             with tools.file_open(path, 'rb') as desc_file:
                 doc = desc_file.read()
                 html = lxml.html.document_fromstring(doc)
                 for element, attribute, link, pos in html.iterlinks():
                     if element.get('src') and not '//' in element.get('src') and not 'static/' in element.get('src'):
                         element.set('src', "/%s/static/description/%s" % (module.name, element.get('src')))
                 module.description_html = tools.html_sanitize(lxml.html.tostring(html))
         else:
             overrides = {
                 'embed_stylesheet': False,
                 'doctitle_xform': False,
                 'output_encoding': 'unicode',
                 'xml_declaration': False,
                 'file_insertion_enabled': False,
             }
             output = publish_string(source=module.description if not module.application and module.description else '', settings_overrides=overrides, writer=MyWriter())
             module.description_html = tools.html_sanitize(output)
Example #24
    def get_links(self, url, starts_with=None, ends_with=None):

        results = []
        content = self.download(url)
        if not content:
            return []

        if not (starts_with or ends_with):
            raise NotImplementedError("get_links requires either `startswith` or `endswith`")

        html = lxml.html.document_fromstring(content)

        path = urlparse.urlparse(url).path

        def url_match(link):
            # The link might be something like "/pub/mobile/nightly/"
            # but we're looking for a path that starts with "nightly".
            # So first we need to remove what's part of the base URL
            # to make a fair comparison.
            if starts_with is not None:
                # If the current URL is http://example.com/some/dir/
                # and the link is /some/dir/mypage/ and the thing
                # we're looking for is "myp" then this should be true
                if link.startswith(path):
                    link = link.replace(path, "")
                return link.startswith(starts_with)
            elif ends_with:
                return link.endswith(ends_with)
            return False

        for _, _, link, _ in html.iterlinks():
            if url_match(link):
                results.append(urlparse.urljoin(url, link))
        return results
Example #25
 def fix_inner_link(self, page_html):
     """
     Changes inner book links to point to the saved HTML file rather than the
     URL on CNX
     :param page_html: HTML that could contain page links
     :return: HTML with corrected links
     """
     html = lxml.html.document_fromstring(page_html)
     for element, attribute, link, pos in html.iterlinks():
         if '/contents/' in link and element.text is not None:
             print "fix_inner_link: " + link
             content_index = link.find('/contents/')
             at_index = link.find('@')
             pound_index = link.find('#')
             id = ''
             if at_index == -1:
                 id = link[content_index + 10:]
             else:
                 id = link[content_index + 10:at_index]
             anchor = ''
             if pound_index > -1:
                 anchor = link[pound_index:]
             print "fix_inner_link: " + id
             page = self.get_page_json(id)
             title = page['id'] + ".html"
             link_text = title + anchor
             element.attrib['href'] = link_text
     return lxml.html.tostring(html)
Example #26
 def _get_desc(self):
     for module in self:
         path = modules.get_module_resource(
             module.name, 'static/description/index.html')
         if path:
             with tools.file_open(path, 'rb') as desc_file:
                 doc = desc_file.read()
                 html = lxml.html.document_fromstring(doc)
                 for element, attribute, link, pos in html.iterlinks():
                     if element.get('src') and not '//' in element.get(
                             'src') and not 'static/' in element.get('src'):
                         element.set(
                             'src', "/%s/static/description/%s" %
                             (module.name, element.get('src')))
                 module.description_html = tools.html_sanitize(
                     lxml.html.tostring(html))
         else:
             overrides = {
                 'embed_stylesheet': False,
                 'doctitle_xform': False,
                 'output_encoding': 'unicode',
                 'xml_declaration': False,
             }
             output = publish_string(source=module.description or '',
                                     settings_overrides=overrides,
                                     writer=MyWriter())
             module.description_html = tools.html_sanitize(output)
Example #27
    def get_page(self,item):
        links = html.iterlinks(item)
        l = []
        for i in links:
            l.append(i)

        page = r.get(l[0][2])
Example #28
def image_fixups(content, msgid, archive, richformat, allowimgs):
    "Replace the CID links stored messages"
    html = local_fromstring(content)
    for element, attribute, link, _ in iterlinks(html):
        if not link.startswith('cid:'):
            if not allowimgs and attribute == 'src':
                element.attrib['src'] = '%simgs/blocked.gif' % media_url()
                element.attrib['title'] = link
                if richformat:
                    if archive:
                        displayurl = url('message-preview-archived-with-imgs',
                                         msgid=msgid)
                    else:
                        displayurl = url('message-preview-with-imgs',
                                         msgid=msgid)
                    flash(
                        ugettext('This message contains external'
                                 ' images, which have been blocked. ') +
                        literal(link_to(ugettext('Display images'),
                                        displayurl)))
        else:
            imgname = link.replace('cid:', '')
            if archive:
                imgurl = url('messages-preview-archived-img',
                             img=imgname.replace('/', '__xoxo__'),
                             msgid=msgid)
            else:
                imgurl = url('messages-preview-img',
                             img=imgname.replace('/', '__xoxo__'),
                             msgid=msgid)
            element.attrib['src'] = imgurl
    return tostring(html)
Example #29
 def projects_from_url(url):
     """returns list of projects from the index url"""
     projects = []  # XXX should be a set?
     html = urllib2.urlopen(url).read()
     html = lxml.html.fromstring(html)
     for link in html.iterlinks():
         projects.append(link[2].strip('/'))
     return projects
Example #30
 def projects_from_url(url):
     """returns list of projects from the index url"""
     projects = [] # XXX should be a set?
     html = urllib2.urlopen(url).read()
     html = lxml.html.fromstring(html)
     for link in html.iterlinks():
         projects.append(link[2].strip('/'))
     return projects
Example #31
def mask_links(html_text, site_url):
    document = html.fromstring(html_text)
    for el, attr, val, pos in html.iterlinks(document):
        if el.tag.lower() == "a":
            if (attr == 'href' and not is_internal_link(val, site_url)):
                el.attrib['target'] = '_blank'
                el.attrib['rel'] = 'nofollow'
    return html.tostring(document)
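mask_links relies on an is_internal_link helper that is not shown above; a minimal sketch of what such a check could look like, comparing hostnames with urllib.parse (this is an assumption about the helper's contract, not the original implementation):

from urllib.parse import urlparse

def is_internal_link(href, site_url):
    # Relative links and links on the same host count as internal.
    host = urlparse(href).netloc
    return host == '' or host == urlparse(site_url).netloc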
Example #32
 def get_links(html, base_url, tags=[]):
     links = []
     tags = tags
     html = lxml.html.document_fromstring(html)
     html.make_links_absolute(base_url)
     links_html = html.iterlinks()
     links = [x[2] for x in links_html if x[0].tag in tags]
     return links
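A brief usage sketch for the helper above, assuming it is importable as a plain function; the HTML snippet and base URL are invented, and tags must be passed explicitly because the default is an empty list:

sample = '<html><body><a href="/docs">Docs</a><img src="logo.png"></body></html>'
# Only the <a> link passes the tag filter; the <img> src is dropped.
print(get_links(sample, 'http://example.com', tags=['a']))
# ['http://example.com/docs']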
Example #33
	def get_links(html, base_url, tags = []):
		links = []
		tags = tags
		html = lxml.html.document_fromstring(html)
		html.make_links_absolute(base_url)
		links_html = html.iterlinks()
		links = [ x[2] for x in links_html if x[0].tag in tags ]
		return links
Example #34
 def sanitize_html(self, msg):
     "Clean up html"
     cleaner = Cleaner(style=True, remove_tags=UNCLEANTAGS)
     msg = HTMLTITLE_RE.sub('', msg)
     html = cleaner.clean_html(msg)
     html = fromstring(html)
     for element, attribute, link, pos in iterlinks(html):
         element.attrib['src'] = settings.MEDIA_URL + '/imgs/blocked.gif'
     return tostring(html)
Example #35
def getTodos(projects, objects):
  """
    Get todos for each project
  """
  tags_dict = getTags(objects)
  for project in projects:
    for ref_id in project['ref_ids'].split():
      for object in objects:
        if object.attributes['id'].value == ref_id:
          attribute_nodes = object.getElementsByTagName("attribute")
          title        = ""
          content      = ""
          datemodified = ""
          datecreated  = ""
          datecompleted= ""
          tags         = ""           
          for attribute_node in attribute_nodes:
            if attribute_node.attributes['name'].value == 'title':
              if attribute_node.childNodes:
                  title = attribute_node.childNodes[0].nodeValue.encode("utf-8")
                  break
          # Check if todo has a note attached
          if title:
            for attribute_node in attribute_nodes:
              # <attribute name="datemodified" >309306984.40529602766036987305
              if attribute_node.attributes['name'].value == 'datemodified':
                datemodified = convertCocoaEpoch(attribute_node.childNodes[0].\
                    nodeValue.encode("utf-8"))
              # <attribute name="datecreated" >306520491.00000000000000000000
              if attribute_node.attributes['name'].value == 'datecreated':
                datecreated = convertCocoaEpoch(attribute_node.childNodes[0].\
                    nodeValue.encode("utf-8"))
              #<attribute name="datecompleted" type="date">292880221.18648099899291992188
              if attribute_node.attributes['name'].value == 'datecompleted':
                datecompleted = convertCocoaEpoch(attribute_node.childNodes[0].\
                    nodeValue.encode("utf-8"))
              if attribute_node.attributes['name'].value == 'content':
                content = attribute_node.childNodes[0].nodeValue #.encode("utf-8")
                # lets encode in writeOutline               
                # I think we need to translate all this things
                html = content.replace('\\u3c00', '<').replace('\\u3e00', '>') 
                html = html.replace('\u2600', '&')
                html = lxml.html.fromstring(html)
                content = html.text_content().split('\n')
                for l in html.iterlinks():
                    content += [l[2]]
            relationship_nodes = object.getElementsByTagName("relationship")
            for relationship_node in relationship_nodes:
              if relationship_node.attributes['name'].value == 'tags':
                try:
                  tags_id = relationship_node.attributes['idrefs'].value
                  tags = [tags_dict[t_id] for t_id in tags_id.split()]
                except:
                  tags = ""

          project['todos'].append([title, content, datecreated, datemodified, datecompleted, tags])
  return projects
Example #36
 def sanitize_html(self, msg):
     "Clean up html"
     cleaner = Cleaner(style=True, remove_tags=UNCLEANTAGS)
     msg = HTMLTITLE_RE.sub('', msg)
     html = cleaner.clean_html(msg)
     html = fromstring(html)
     for element, attribute, link, pos in iterlinks(html):
         element.attrib['src'] = settings.MEDIA_URL + '/imgs/blocked.gif'
     return tostring(html)
Example #37
def parse_topic_page(doc, path):
    posts = doc.cssselect('table[id^="post"]')
    for post in posts:
        post_content = post.cssselect('div[id^="post_message"]')
        if len(post_content) > 0:
            # links = post_content[0].cssselect('a[href^="http"]')
            # for link in links:
            for link in html.iterlinks(post_content[0]):
                if ("http" in link[2]) and (link[1] == "href"):
                    parse_external_image(link, path)
Example #38
def scrape_goldline():
    done_routes = set()
    html = parse(scrape(TRANSLINK_GOLDLINE_INDEX))
    for link in html.iterlinks():
        url = link[2]
        if valid_goldline_route_url(url) and url not in done_routes:
            href = TRANSLINK_URL + url
            for row in iter_scrape_route(GOLDLINE_ID, BUS_ROUTE_TYPE, href):
                yield row
            done_routes.add(url)
Example #39
def parse_topic_page(doc, path):
    posts = doc.cssselect('table[id^="post"]')
    for post in posts:
        post_content = post.cssselect('div[id^="post_message"]')
        if len(post_content) > 0:
            # links = post_content[0].cssselect('a[href^="http"]')
            # for link in links:
            for link in html.iterlinks(post_content[0]):
                if ("http" in link[2]) and (link[1] == "href"):
                    parse_external_image(link, path)
Example #40
def scrape_enterprise():
    done_routes = set()
    html = parse(scrape(TRANSLINK_ENTERPRISE_INDEX))
    for link in html.iterlinks():
        url = link[2]
        if valid_enterprise_route_url(url) and url not in done_routes:
            href = TRANSLINK_URL + url
            for row in iter_scrape_route(ENTERPRISE_ID, RAIL_ROUTE_TYPE, href):
                yield row
            done_routes.add(url)
Example #41
def get_google_links(query):
    links = []
    url = GOOGLE_SEARCH_URL.format(urllib.parse.quote(query))
    result = get_result(url)
    html = lxml.html.document_fromstring(result)
    for l in html.iterlinks():
        if is_question(l[2]):
            links.append(l[2])
            
    return links
Example #42
def scrape_goldline():
    done_routes = set()
    html = parse(scrape(TRANSLINK_GOLDLINE_INDEX))
    for link in html.iterlinks():
        url = link[2]
        if valid_goldline_route_url(url) and url not in done_routes:
            href = TRANSLINK_URL + url
            for row in iter_scrape_route(GOLDLINE_ID, BUS_ROUTE_TYPE, href):
                yield row
            done_routes.add(url)
Example #43
def scrape_enterprise():
    done_routes = set()
    html = parse(scrape(TRANSLINK_ENTERPRISE_INDEX))
    for link in html.iterlinks():
        url = link[2]
        if valid_enterprise_route_url(url) and url not in done_routes:
            href = TRANSLINK_URL + url
            for row in iter_scrape_route(ENTERPRISE_ID, RAIL_ROUTE_TYPE, href):
                yield row
            done_routes.add(url)
Example #44
def get_pages():
    response_text = requests.get(BASE_URL).content
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(BASE_URL)
    links = []
    i = 0
    for link in html.iterlinks():
        links.append(link[2])

    return links
Example #45
def fetch_data(url: str) -> None:
    r = requests.get(url)
    html = lxml.html.fromstring(r.content)
    tasks = []
    loop = asyncio.get_event_loop()
    for *_, url, _ in html.iterlinks():
        if url.startswith("fileadmin"):
            tasks.append(
                loop.create_task(fetch_zip(os.path.join(BASE_URL, url))))
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
Example #46
    def _extract_links(self, response_text, response_url):
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.scan_tag(e.tag):
                if self.scan_attr(a):
                    link = Link(self.process_attr(l), text=e.text)
                    self.links.append(link)

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        return links
Example #47
    def _extract_links(self, response_text, response_url):
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.scan_tag(e.tag):
                if self.scan_attr(a):
                    link = Link(self.process_attr(l), text=e.text)
                    self.links.append(link)

        links = unique_list(self.links, key=lambda link: link.url) \
                if self.unique else self.links

        return links
Example #48
    def extract_links_from_html(base, body):
        try:
            html = lxml.html.fromstring(body)
            html.make_links_absolute(base)

            for element, attribute, link, pos in html.iterlinks():
                if isinstance(link, str):
                    link = link.encode('utf-8', 'ignore')
                yield link

        except Exception:
            logging.warning("(lxml) html parse error")
            import traceback; traceback.print_exc()
Example #49
    def extract_links_from_html(base, body):
        try:
            html = lxml.html.fromstring(body)
            html.make_links_absolute(base)

            for element, attribute, link, pos in html.iterlinks():
                if isinstance(link, unicode):
                    link = link.encode('utf-8', 'ignore')
                yield link

        except StandardError:
            logging.warning("(lxml) html parse error")
            import traceback
            traceback.print_exc()
Example #50
 def fetch_links_from_web_page(self, page):
     log.debug('')
     try:
         # [ NOTE ]: Pull out all links after resolving them using any
         #           <base> tags found in the document.
         links = [
             link for element, attribute, link, pos in iterlinks(
                 resolve_base_href(page.content))
         ]
     except etree.ParseError:
         # [ NOTE ]: If the document is not HTML content this will return
         #           an empty list.
         links = []
     return list(set(links))
Example #51
def getLinks(url, startswith=None, endswith=None):
    page = urllib2.urlopen(url)
    html = lxml.html.document_fromstring(page.read())
    page.close()

    results = []
    for element, attribute, link, pos in html.iterlinks():
        if startswith:
            if link.startswith(startswith):
                results.append(link)
        elif endswith:
            if link.endswith(endswith):
                results.append(link)
    return results
Example #52
    def get_uri(self, url, html):
        if url is not None and html is not None:
            print(url)
            parsed_uri = urlparse(url)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            html.make_links_absolute(url)
            for l in html.iterlinks():
                parsed_uri = urlparse(l[2])
                curr_domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                if curr_domain == domain:
                    if l[2] not in self.urls:
                        self.pool.put(l[2])

                    self.urls.add(l[2])
Example #53
def getLinks(url, startswith=None, endswith=None, urllib=urllib2):
    page = urllib.urlopen(url)
    html = lxml.html.document_fromstring(page.read())
    page.close()

    results = []
    for element, attribute, link, pos in html.iterlinks():
        if startswith:
            if link.startswith(startswith):
                results.append(link)
        elif endswith:
            if link.endswith(endswith):
                results.append(link)
    return results
Example #54
def getLinks(url, startswith=None, endswith=None):

    html = ''
    results = []
    content = patient_urlopen(url, sleep_time=30)
    html = lxml.html.document_fromstring(content)

    for element, attribute, link, pos in html.iterlinks():
        if startswith:
            if link.startswith(startswith):
                results.append(link)
        elif endswith:
            if link.endswith(endswith):
                results.append(link)
    return results
Example #55
def main():
    url = sys.argv[1]
    html = parse(url).getroot()
    # make the links absolute for those that are local
    html.make_links_absolute(html.base_url, True)
    # remember the domain
    domain = urlparse(html.base_url).netloc

    # print all links on the page
    for element, attribute, link, pos in html.iterlinks():
        # only print if it's a local link and not self-referential
        link_domain = urlparse(link).netloc
        selfReference = (url.split("//")[1] + "#") in link
        if (link_domain == domain or link_domain == "www." + domain) and not selfReference:
            print link
Example #56
 def create_plps(self, category_limit, min_wait_time):
     plps = set([])
     assert self.description == 'homepage', "Must create plp's beginning with the homepage."
     nav_categories = self.tree.xpath('//nav')
     count = 0
     self.last_scrape = time.time()
     for category in nav_categories[2:]:
         for a,b,link,d in html.iterlinks(category):
             self.throttle(min_wait_time)
             if link[0:4] != 'http' and link not in ('/Store/catalog/shopAllBrands.jsp', '#', '/Store/cart/cart.jsp') and count < category_limit:
                 url = self.url + link
                 print count, 'PLPs loaded so far. Loading PLP from ', url
                 plps.add(Page(url,'plp', self.user_agent, baseurl))
                 self.last_scrape = time.time()
                 count += 1
     return plps
Example #57
    def walk(self, link):
        print("DEBUG: walk getting called with url " + link)
        
        try:
            self.base_url
        except AttributeError:
            (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(link)
            self.base_url = scheme + "://" + netloc
            print("DEBUG: self.baseurl: " + self.base_url)

        
        # handle relative urls
        try:
            if link.startswith("/") or (not link.startswith("http://") and not
                                        link.startswith("https://")):
                link = self.base_url + link
        except AttributeError as e:
            print(e)

        if link in self.visited_urls:
            try:
                self.skip_count += 1
            except AttributeError:
                self.skip_count = 1
            print("DEBUG: self.skip_count: " + str(self.skip_count))
            return
        
        print("DEBUG: walk opening url " + link)
        try:
            resp, content = self.httplib2.request(link, "GET")
            del resp
        except socket.error as error:
            print(error)
            return
        
        self.visited_urls.append(link)
        print("DEBUG: self.visited_urls:", len(self.visited_urls))
        
        try:
            html_dom = html.document_fromstring(content)
        except etree.ParserError as e:
            print(e)
            return
        for (element, attribute, link, pos) in html.iterlinks(html_dom):
            for url in self.walk(link):
                yield url
            yield link
Example #58
 def _get_desc(self, cr, uid, ids, field_name=None, arg=None, context=None):
     res = dict.fromkeys(ids, '')
     for module in self.browse(cr, uid, ids, context=context):
         path = get_module_resource(module.name, 'static/description/index.html')
         if path:
             with tools.file_open(path, 'rb') as desc_file:
                 doc = desc_file.read()
                 html = lxml.html.document_fromstring(doc)
                 for element, attribute, link, pos in html.iterlinks():
                     if element.get('src') and not '//' in element.get('src') and not 'static/' in element.get('src'):
                         element.set('src', "/%s/static/description/%s" % (module.name, element.get('src')))
                 res[module.id] = lxml.html.tostring(html)
         else:
             overrides = dict(embed_stylesheet=False, doctitle_xform=False, output_encoding='unicode')
             output = publish_string(source=module.description, settings_overrides=overrides, writer=MyWriter())
             res[module.id] = output
     return res
Example #59
    def _extract_links(self, response_text, response_url, response_encoding):
        links = []
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.tag_func(e.tag):
                if self.attr_func(a):
                    l = safe_url_string(l, response_encoding)
                    text = u''
                    if e.text:
                        text = str_to_unicode(e.text, response_encoding, errors='replace').strip()
                    link = Link(self.process_func(l), text=text)
                    links.append(link)

        links = unique_list(links, key=lambda link: link.url) \
                if self.unique else links

        return links
Example #60
def img_fixups(content, queueid, allowimgs, richformat):
    "Replace the CID links in Queued messages"
    html = local_fromstring(content)
    for element, attribute, link, _ in iterlinks(html):
        if not link.startswith('cid:'):
            if not allowimgs and attribute == 'src':
                element.attrib['src'] = '%simgs/blocked.gif' % media_url()
                element.attrib['title'] = link
                if richformat:
                    flash(ugettext('This message contains external '
                    'images, which have been blocked. ') +
                    literal(link_to(ugettext('Display images'),
                    url('queue-preview-with-imgs', queueid=queueid))))
        else:
            imgname = link.replace('cid:', '')
            element.attrib['src'] = url('queue-preview-img',
                                    imgid=imgname.replace('/', '__xoxo__'),
                                    queueid=queueid)
    return tostring(html)