Example #1
def html_to_etree(in_str, remove_blank_text=True):
    """
    Parses a tree of possibly malformed HTML5, according to WHATWG HTML5 rules.

    Result is either:
     - parsed input, or;
     - if multiple fragments (> 1 top-level tags) are given: parsed input wrapped in either a `div' or `span', or;
     - None for empty input.
    """
    if in_str is None:
        return None

    if not isinstance(in_str, basestring):
        raise ValueError('input must be a string')

    in_str = _nfc(in_str).strip()

    if not in_str:
        return None

    # Double-parse to remove (hopefully irrelevant) whitespace - some not-so-irrelevant whitespace will most likely be removed too
    etree = fromstring(in_str, parser=_html5Parser) # ATTENTION: tag/attributes namespace-info mangled here due to html5lib bugs.
    _etree_mutate_fix_localname(etree)
    if remove_blank_text:
        s = lxmltostring(etree)
        etree = parse(StringIO(s), parser=_xmlParser)
        etree = fromstring(lxmltostring(etree), parser=_html5Parser)
        _etree_mutate_fix_localname(etree)  # and they spawn again after fromstring, so remove them again.

    return etree.getroot() if hasattr(etree, 'getroot') else etree
Example #2
 def run(self, staffName=None):
     self.getstaffdata()
     if staffName:
         if not self.config.has_key(u'名前'):
             self.config[u'名前'] = {}
         print "Recording " + staffName
         self.config[u'名前']['ja'] = staffName
     else:
         mypath = self.filename
         if not self.filename.startswith('http'):
             mypath = os.path.join(self.ls.baseurl, self.filename)
         ifh = urlopen(mypath)
         page = ifh.read()
         self.root = html5parser.fromstring(page)
         
         self.multiscrape(u'名前')
         self.multiscrape(u'ふりがな')
         self.multiscrape(u'役職')
         self.multiscrape(u'所属')
         self.multiscrape(u'所属学会')
         self.multiscrape(u'専門分野')
         self.multiscrape(u'研究テーマ')
         self.multiscrape(u'教員からのメッセージ')
         self.multiscrape(u'略歴')
         self.multiscrape(u'主要業績')
         self.multiscrape(u'リンク')
     applyNamePatches(self.config, self.key)
     self.output()
Example #3
def build_html():
    service = docs.setup_api_service()

    html = etree.fromstring(HTML_TEMPLATE)
    body = html.xpath("/x:html/x:body", namespaces=NSS)[0]


    the_google_docs = _collect_google_docs_inside_folder(service)

    first_doc_created_at = the_google_docs[0]['createdDate']
    last_doc_created_at = the_google_docs[-1]['createdDate']
    end_search_timestamp = _minutes_later(last_doc_created_at, num_minutes=5)

    tweet_search_results = tweets.hashtag_search_in_daterange(first_doc_created_at, end_search_timestamp)

    for i, child in enumerate(the_google_docs):
        log.info(u"Appending slide %s" % child['title'])
        slide = html5parser.fromstring(docs.export_file(service, child))
        created_time = child['createdDate']
        slide_body = slide.xpath("/x:html/x:body", namespaces=NSS)[0]

        the_slide_html_section = _generate_slide_html_section(slide_body, created_time, the_google_docs, i, tweet_search_results)

        body.append(the_slide_html_section)

    return html
Example #4
 def __init__(self, filename):
     StaffBase.__init__(self, filename)
     ConfigPaths.__init__(self)
     print "DOING ug: " + filename
     ifh = urlopen(self.ug.staffUrl(filename))
     page = ifh.read()
     self.root = html5parser.fromstring(page)
Example #5
    def handle(self, *args, **options):
        path = os.path.dirname(__file__)
        page = html5parser.fromstring(
            open(path + '/supported_coins.html', 'r').read())

        for coin_row in page.cssselect('.w-row.tdr'):
            try:
                icon = BASE_URL + coin_row.cssselect('.coin-logo')[0].get(
                    'src')
                name = coin_row.cssselect('.coin-link')[0].text
                code = coin_row.cssselect('.table-data')[1].text

                # Only register currencies that don't already exist
                if not Currencies.objects.filter(code=code).exists():
                    currency = Currencies()
                    currency.name = name
                    currency.code = code
                    currency.save()

                    # Upload the icon
                    response = requests.get(icon, headers=HEADERS)

                    if response.status_code == 200:
                        currency.icon.save(os.path.basename(icon),
                                           ContentFile(response.content),
                                           save=True)

                    currency_gateway = CurrencyGateway()
                    currency_gateway.currency = currency
                    currency_gateway.gateway = 'coinpayments'
                    currency_gateway.save()

                    print('Cadastrando moeda {} -> {}'.format(name, code))
            except Exception as e:
                continue
Example #6
 def get(self, url, follow_redir=True):
     req = webapp2.Request.blank(url)
     self.res = req.get_response(main.app)
     if len(self.res.body) > 0 and self.res.headers['content-type'].split(';')[0].strip() == 'text/html':
         self.tree = html5parser.fromstring(self.res.body, parser=self.parser)
     if follow_redir and self.res.status_code in [301, 302, 303, 304, 307] and 'location' in self.res.headers:
         self.get(self.res.headers['location'][16:])
Example #7
def test_widget_empty():
    f = I18nFormField(widget=I18nTextInput, required=False, localize=True)
    rendered = f.widget.render('foo', [])
    tree = html5parser.fromstring(rendered)
    assert tree[0].attrib == {'lang': 'de', 'name': 'foo_0', 'type': 'text'}
    assert tree[1].attrib == {'lang': 'en', 'name': 'foo_1', 'type': 'text'}
    assert tree[2].attrib == {'lang': 'fr', 'name': 'foo_2', 'type': 'text'}
Example #8
def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
Example #9
 def get(self, url, follow_redir=True):
     req = webapp2.Request.blank(url)
     self.res = req.get_response(main.app)
     if len(self.res.body) > 0 and self.res.headers['content-type'].split(';')[0].strip() == 'text/html':
         self.tree = html5parser.fromstring(self.res.body, parser=self.parser)
     if follow_redir and self.res.status_code in [301, 302, 303, 304, 307] and 'location' in self.res.headers:
         self.get(self.res.headers['location'][16:], follow_redir=True)
Example #10
 def get(self, path, data={}, follow=False):
     self.res = super(Browser, self).get(path, data, follow)
     if len(self.res.content) > 0 and self.res['content-type'].split(';')[0].strip() == 'text/html':
         self.tree = html5parser.fromstring(self.res.content, parser=self.parser)
     if follow and self.res.status_code in [301, 302, 303, 304, 307] and 'location' in self.res:
         self.get(self.res['location'][16:])
     return self.res
Example #11
    def get_entries(self):
        r = requests.get(self.base_url, verify=False)
        assert r.status_code == 200
        tree = html5parser.fromstring(r.text)

        d = []
        for entry1, entry2, entry3 in group(tree.xpath(self.xpath_entry)[1:], 3):
            link = entry1.xpath(self.xpath_link)
            assert len(link) in (0, 1)
            if len(link) == 0:
                break
            title = entry1.xpath(self.xpath_title)
            assert len(title) == 1
            score = entry2.xpath(self.xpath_score)
            assert len(score) in (0, 1)
            if len(score) == 0:
                continue

            link = urljoin(self.base_url, link[0])
            title = title[0]
            score = int(score[0].split()[0])

            d.append((title, score, link))

        assert len(d) > 10
        return d
Example #12
def infections_ons():
    res = requests.get(
        "https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/"
        "conditionsanddiseases/datasets/coronaviruscovid19infectionsurveydata")
    root = html5parser.fromstring(res.text)
    url = urljoin(
        "https://www.ons.gov.uk",
        root.find(
            './/{http://www.w3.org/1999/xhtml}a[@aria-label="Download Coronavirus'
            ' (COVID-19) Infection Survey: 2020 in xlsx format"]').get("href"),
    )

    df = pd.read_excel(requests.get(url).content,
                       sheet_name="2b",
                       skiprows=6,
                       skipfooter=10)
    df = df.drop(columns=[df.columns[4], df.columns[8]])
    df.columns = [
        "date",
        "incidence",
        "incidence_lower",
        "incidence_upper",
        "infections",
        "infections_lower",
        "infections_upper",
        "weekly",
        "weekly_lower",
        "weekly_upper",
    ]
    return df
Example #13
 def _get_content_preview(self):
   if self._content_preview is None:
     doc = html5parser.fromstring(self.content)
     desc = doc.xpath("string()").strip()
     desc = (desc[:38] + '..') if len(desc) > 40 else desc
     self._content_preview = desc.replace('\n', '')
   return self._content_preview
Example #14
def getNameTitle():
    response, content = h.request(URL+"?dt="+str(time.time()), 'POST')
    content = html5parser.fromstring(content)

    name = content.findall("*")[3].text
    title = content.findall("*")[5].text
    
    return name, title
Example #15
async def test_hello_world(client):
    response = await client.get('/')
    assert_that(response.status, equal_to(200))
    text = await response.text()
    value = html5parser.fromstring(text).xpath(
        './/*[@id="value"]/text()').pop()

    assert_that(value, value_matcher)
Example #16
    def _validate(self, url, validator, status_code=200):
        req = webapp2.Request.blank(url)
        res = req.get_response(main.app)

        self.assertEqual(status_code, res.status_code)

        if validator == 'html':
            self.assertEqual('text/html', res.content_type)
            html5parser.fromstring(res.body, parser=self.parser)
        elif validator == 'xml':
            self.assertEqual('text/xml', res.content_type)
            lxml.etree.XML(res.body)
        elif validator == 'json':
            self.assertEqual('application/json', res.content_type)
            json.loads(res.body)
        elif validator == 'text':
            self.assertEqual('text/plain', res.content_type)
Example #17
    def _validate(self, url, validator, status_code=200):
        req = webapp2.Request.blank(url)
        res = req.get_response(main.app)

        self.assertEqual(status_code, res.status_code)

        if validator == 'html':
            self.assertEqual('text/html', res.content_type)
            html5parser.fromstring(res.body, parser=self.parser)
        elif validator == 'xml':
            self.assertEqual('text/xml', res.content_type)
            lxml.etree.XML(res.body)
        elif validator == 'json':
            self.assertEqual('application/json', res.content_type)
            json.loads(res.body)
        elif validator == 'text':
            self.assertEqual('text/plain', res.content_type)
Example #18
def fetch_json_ld(url):
    res = requests.get(url)
    res.raise_for_status()
    root = html5parser.fromstring(res.text)
    return json.loads(
        root.find(
            './/{http://www.w3.org/1999/xhtml}script[@type="application/ld+json"]'
        ).text)
Example #19
def find_category_string(html_text):
    parser = html5parser.fromstring(html_text)
    try:
        category_link = parser.cssselect('.category')[0]
        category_span = category_link.cssselect('[itemprop=genre]')[0]
        return category_span.text
    except IndexError as e:
        __log__.exception('Cannot match category in HTML')
        return None
Example #20
 def test_empty_result(self):
     query = "togetanansweryoufirsthavetoknowthequestion"
     r = requests.get(self.app_url + "search", params={'q': query})
     # check response & redirects
     assert r.ok, "problematic status code: " + str(r.status_code)
     # check contents
     html = html5parser.fromstring(r.content)  # type: _Element
     assert html.cssselect("#result-failure")
     assert html.cssselect("#result-sidebar")
Example #21
def _get_nhs_potential(title):
    url = (
        "https://digital.nhs.uk/data-and-information/publications/statistical"
        "/mi-potential-covid-19-symptoms-reported-through-nhs-pathways-and-111-online/latest"
    )

    text = requests.get(url).text
    et = html5parser.fromstring(text)

    el = et.find('.//{http://www.w3.org/1999/xhtml}a[@title="' + title + '"]')
    return el.get("href")
Example #22
def test_widget_enabled_locales():
    f = I18nFormField(widget=I18nTextInput, required=False)
    f.widget.enabled_locales = ['de', 'fr']
    rendered = f.widget.render('foo', LazyI18nString({'de': 'Hallo', 'en': 'Hello'}))

    tree = html5parser.fromstring(rendered)
    assert tree[0].attrib == {
        'lang': 'de', 'name': 'foo_0', 'type': 'text', 'value': 'Hallo'
    }
    assert tree[1].attrib == {
        'lang': 'fr', 'name': 'foo_2', 'type': 'text'
    }
Example #23
def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
        if html_too_big(s):
            return None

        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
Example #24
def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
        if html_too_big(s):
            return None

        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
Example #25
def parse_icons_from_html(html):
    """Parse an HTML document and return a list of rel icon links."""
    icons = []
    htmlparsed = html5parser.fromstring(html)
    html_links = htmlparsed.xpath('//h:link[@rel]',
        namespaces={'h': 'http://www.w3.org/1999/xhtml'})
    for html_link in html_links:
        attributes = html_link.attrib
        relvalues = attributes['rel'].lower()
        if relvalues in ('icon', 'apple-touch-icon', 'apple-touch-icon-precomposed'):
            icons.append(attributes)
    return icons
Example #26
def parse_icons_from_html(html):
    """Parsing an HTML document and return a list of rel icon links"""
    icons = []
    htmlparsed = html5parser.fromstring(html)
    html_links = htmlparsed.xpath(
        '//h:link[@rel]', namespaces={'h': 'http://www.w3.org/1999/xhtml'})
    for html_link in html_links:
        attributes = html_link.attrib
        relvalues = attributes['rel'].lower()
        if relvalues in ('icon', 'apple-touch-icon',
                         'apple-touch-icon-precomposed'):
            icons.append(attributes)
    return icons
Example #27
 def _lxml_parse_document(self, body, use_html5lib=False,
                          use_BeautifulSoup=False):
     if use_html5lib:
         from lxml.html import html5parser
         return html5parser.fromstring(body)
     elif use_BeautifulSoup:
         from lxml.html import soupparser
         return soupparser.fromstring(body)
     else:
         for parser in [ etree.XML, etree.HTML ]:
             try:
                 return (parser(body))
             except:
                 pass
Example #28
    def update(self):

        r = requests.get(self.page_url, verify=False)
        assert r.status_code == 200
        tree = html5parser.fromstring(r.text)

        self.link_scores = {}
        for article in tree.xpath("//*[local-name()='article']"):
            link = article.xpath(".//*[local-name()='span' and @class='anonymous_reader']/*[local-name()='a']/@href")
            assert len(link) == 1
            score = article.xpath(".//*[local-name()='figure' and @class='score']/text()")
            assert len(score) == 1
            self.link_scores[link[0]] = int(score[0])

        super().update()
Example #29
    def run(self):
        self.getstaffdata()
        ifh = urlopen(os.path.join(self.ug.baseurl, self.filename))
        page = ifh.read()
        self.root = html5parser.fromstring(page)

        self.multiscrape(u'名前')
        self.multiscrape(u'ふりがな')
        self.multiscrape(u'役職')
        self.multiscrape(u'専門分野')
        self.multiscrape(u'研究テーマ')
        self.multiscrape(u'学会')
        self.multiscrape(u'略歴')
        self.multiscrape(u'教員からのメッセージ')
        self.multiscrape(u'主要')
        applyNamePatches(self.config, self.key)
        self.output()
Example #30
def html2text(html):
    """Render html as text, convert line breaks to spaces."""
    if not ishtml(html):
        return re.sub(r'\s+', ' ', html.strip())
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        html = html5parser.fromstring(html, parser=parser)
    for b in BLOCKTAGS:
        for e in html.xpath(f'//{b}'):
            e.text = ' ' + e.text if e.text else ''
            if len(e) > 0:
                lc = e[-1]
                lc.tail = (lc.tail if lc.tail else '') + ' '
            else:
                e.text = e.text + ' '
    text = tostring(html, method='text', encoding='utf-8')
    return re.sub(r'\s+', ' ', text.decode().strip())
Example #31
def extract_next_links(rawDatas):
    global max_outlinks
    global max_outlink_url
    outputLinks = list()
    '''
    rawDatas is a list of objs -> [raw_content_obj1, raw_content_obj2, ....]
    Each obj is of type UrlResponse, declared at L28-42 in datamodel/search/datamodel.py.
    The return of this function should be a list of urls in their absolute form.
    Validation of each link via the is_valid function is done later (see line 42).
    The frontier takes care of that.

    Suggested library: lxml
    '''

    # cleaner = Cleaner(page_structure = False, links = False) # clean(remove) scripts, special tags, css style annotations, etc
    for raw_content_obj in rawDatas:

        if should_extract_urls(raw_content_obj):
            try:

                content = raw_content_obj.content
                # content = cleaner.clean_html(content)

                e = html5parser.fromstring(content)  # Parse html5 content into element
                doc = html.fromstring(
                    html.tostring(e))  # Workaround: html5parser.fromstring and html.fromstring
                # return different element types, so round-trip through a string
                doc.make_links_absolute(raw_content_obj.url, resolve_base_href=True)

                link_count = 0
                for e, a, l, p in doc.iterlinks():  # Get (element, attribute, link, pos) for every link in doc
                    outputLinks.append(l)
                    link_count += 1
                    # print l

                if (link_count > max_outlinks):
                    max_outlinks = link_count
                    max_outlink_url = raw_content_obj.url

            except etree.XMLSyntaxError as e:
                print "Error on url " + raw_content_obj.url + " " + str(e)
                raw_content_obj.bad_url = True

    return outputLinks
Example #32
def grab_iconlink(html):
    """Parsing an HTML document and return a list of rel icon links"""
    htmlparsed = html5parser.fromstring(html)
    html_links = htmlparsed.xpath(
        '//h:link[@rel]', namespaces={'h': 'http://www.w3.org/1999/xhtml'})
    for html_link in html_links:
        attributes = html_link.attrib
        relvalues = attributes['rel'].lower()
        if 'icon' in relvalues:
            if 'href' in attributes:
                iconlink = attributes['href']
            else:
                iconlink = ''
            if 'sizes' in attributes:
                sizevalues = attributes['sizes']
            else:
                sizevalues = ''
            icons.append((relvalues, sizevalues, iconlink))
    return icons
Example #33
 def run(self):
     ifh = urlopen(self.ug.indexurl)
     page = ifh.read()
     ifh.close()
     doc = html5parser.fromstring(page)
     nodes = doc.xpath('//_:th/_:a[@href]', namespaces=NS)
     data = []
     for node in nodes:
         data.append({
             'href': node.attrib['href']
         })
     #if len(data) == 0:
         #print page
         #print len(nodes)
         #print "*** Nothing found. Check the xpath against the source."
     for datum in data:
         if not datum['href'].endswith('.html'): continue
         staffscraper = Undergrad(datum['href'])
         staffscraper.run()
Example #34
def grab_iconlink(html):
    """Parsing an HTML document and return a list of rel icon links"""
    htmlparsed = html5parser.fromstring(html)
    html_links = htmlparsed.xpath('//h:link[@rel]',
        namespaces={'h': 'http://www.w3.org/1999/xhtml'})
    for html_link in html_links:
        attributes = html_link.attrib
        relvalues = attributes['rel'].lower()
        if 'icon' in relvalues:
            if 'href' in attributes:
                iconlink = attributes['href']
            else:
                iconlink = ''
            if 'sizes' in attributes:
                sizevalues = attributes['sizes']
            else:
                sizevalues = ''
            icons.append((relvalues, sizevalues, iconlink))
    return icons
Example #35
def add_post(request):

    if request.method == "POST":

        content = request.POST['content']
        heading = request.POST['heading']
        newpost = Post()
        newpost.writer = request.user
        newpost.heading = heading
        newpost.pub_date = timezone.now()

        # Here we use html5lib to convert the HTML fragment to plain text
        doc = html5parser.fromstring(content)
        newpost.content = doc.xpath("string()")

        newpost.save()

    else:
        return render(request, 'blogg/add_post.html')
    return render(request, 'blogg/index.html')
Example #36
def clean_html(text, to_plaintext=False):
    if isinstance(text, str):
        text = unicode(text, 'utf-8')
    text = text.strip()
    if not len(text):
        return text

    import bleach
    ALLOWED_TAGS = bleach.ALLOWED_TAGS
    ALLOWED_TAGS.append('p')
    html = bleach.clean(text, tags=ALLOWED_TAGS, strip=True)

    from lxml.html import html5parser
    doc = html5parser.fromstring(html)
    plaintext = doc.xpath("string()")

    if plaintext == text:
        return plaintext

    return html
Example #37
 def run(self):
     ifh = urlopen(self.ls.indexurl)
     page = ifh.read()
     ifh.close()
     doc = html5parser.fromstring(page)
     for blockid in self.staffType.keys():
         nodes = doc.xpath('//_:div[@id="%s"]//_:th//_:a' % blockid, namespaces=NS)
         data = []
         for node in nodes:
             data.append({
                 'href': node.attrib['href'],
                 'staffName': node.text,
                 'staffType': self.staffType[blockid]
             })
         for datum in data:
             if datum['href'].endswith('.pdf'):
                 staffscraper = Lawschool(datum)
                 staffscraper.run(staffName=datum['staffName'])
             elif datum['href'].endswith('.html'):
                 staffscraper = Lawschool(datum)
                 staffscraper.run()
Example #38
 def test_query(self):
     query = "test"
     r = requests.get(self.app_url + "search", params={'q': query})
     # check response & redirects
     assert r.ok, "problematic status code: " + str(r.status_code)
     assert r.history[-1].status_code == 302, "you were not redirected"
     # check params
     params = parse_qs(urlparse(r.url).query)
     assert 'sid' in params, "no session-id was given!"
     assert 'step' in params, "the 'step' parameter is missing!"
     assert 'q' in params, "the query parameter is missing!"
     assert params['q'][0] == query
     # check contents
     html = html5parser.fromstring(r.content)  # type: _Element
     assert html.cssselect("#result-list")
     assert len(html.cssselect("#result-list .result-entry")) == 10
     assert html.cssselect("#search-input")[0].attrib['value'] == query
     assert len(html.cssselect("#topic-centroid-list .topic-item")) > 5
     assert len(html.cssselect("#suggestions-list .result-entry")) == 10
     js = list(html.iter('{*}script'))[0].text
     assert "const query" in js
     assert "const topicGraph" in js
Example #39
 def process_channel_message(self, source, target, msg):
     res = []
     c = httplib2.Http()
     for word in msg.split(" "):
         word = word.split("#")[0]
         try:
             match = REG_EXP.match(word)
             if match:
                 url = match.groups()[0]
                 process = False
                 for domain in self.domains:
                     if fnmatch.fnmatch(urlparse.urlparse(url).netloc, domain):
                         process = True
                         break
                 if process:
                     resp, content = c.request(url)
                     tree = html5parser.fromstring(str(content))
                     for element in tree.iter("{http://www.w3.org/1999/xhtml}title"):
                         res.append(self.privmsg_response(target, "%s - \x0302\x1f%s\x0f" % (element.text, url)))
         except:
             pass
     return res
Example #40
def httpfacts(url, now=None):
    if now is None:
        now = int(time.time())

    req = requests.request('TRACE', url)
    yield ('http_trace_status-code', req.status_code)

    req = requests.request('OPTIONS', url)
    yield ('http_options_status-code', req.status_code)

    req = requests.request('GET', url)

    yield ('http_header_status-code', req.status_code)
    for k,v in req.headers.iteritems():
        yield ('http_header_' + k, v)
        if k == 'date' or k == 'last-modified' or k == 'expires':
            ts = http_parse_date(v)
            yield ('http_header_' + k + '_timestamp', http_parse_date(v))
            yield ('http_header_' + k + '_delta', ts - now)

    for k in req.cookies:
        """
Cookie(version=0, name='NID', value='67=Gx1dRLtNcYAkrXO8y3chpipEIQO4-PeWDuQ48QqjwZndKfJCQqvrdElss8vyDA_5wqohA1twy2te7xZqV9AvQVOmWElZJEcFu7VERLrr_tUlwlooIaDyb_UgqZy4zg-3', port=None, port_specified=False, domain='.google.co.jp', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1393118966, discard=False, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
        """

        prefix = 'http_cookie_' + k.name
        yield (prefix, k.value)
        yield (prefix + '_path', k.path)
        yield (prefix + '_domain', k.domain)
        yield (prefix + '_secure', k.secure)
        yield (prefix + '_expires', k.expires)
        yield (prefix + '_version', k.version)
        yield (prefix + '_rfc2109', k.rfc2109)
        httponly = k.has_nonstandard_attr('HttpOnly')
        yield (prefix + '_httponly', httponly)

    tree = html5parser.fromstring(req.text)

    #for el in tree.iterdescendants('meta'):
    XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
    XHTML = "{%s}" % XHTML_NAMESPACE

    # really look for HTML/HEAD
    yield ('html_root_tag', str(tree.tag))
    #yield ('html_root1_tag', str(tree[0].tag))

    # find HEAD element
    #  /html/head[0]
    head = None
    for child in tree.iter(XHTML + 'head'):
        head = child
        break
    if head is None:
        return

    for meta in head.iterchildren(XHTML + 'meta'):
        if meta.get('name') and meta.get('content') is not None:
            yield 'html_meta_name_' + meta.get('name'), meta.get('content')
        elif meta.get('name') and meta.get('value') is not None:
            yield 'html_meta_name_' + meta.get('name'), meta.get('value')
        elif meta.get('property') and meta.get('content') is not None:
            yield 'html_meta_property_' + meta.get('property'), meta.get('content')
        elif meta.get('http-equiv') and meta.get('content') is not None:
            yield 'html_meta_http-equiv_' + meta.get('http-equiv').lower(), meta.get('content')
        elif meta.get('charset') is not None:
            yield 'html_meta_charset', meta.get('charset').lower()
        else:
            yield 'html_meta_unknown', tostring(meta)
Example #41
def get_page(url):
    r = requests.get(url)
    assert r.status_code == 200
    return html5parser.fromstring(r.text)
Example #42
def httpfacts(url, now=None):
    if now is None:
        now = int(time.time())

    req = requests.request('TRACE', url)
    yield ('http_trace_status-code', req.status_code)

    req = requests.request('OPTIONS', url)
    yield ('http_options_status-code', req.status_code)

    req = requests.request('GET', url)

    yield ('http_header_status-code', req.status_code)
    for k, v in req.headers.iteritems():
        yield ('http_header_' + k, v)
        if k == 'date' or k == 'last-modified' or k == 'expires':
            ts = http_parse_date(v)
            yield ('http_header_' + k + '_timestamp', http_parse_date(v))
            yield ('http_header_' + k + '_delta', ts - now)

    for k in req.cookies:
        """
Cookie(version=0, name='NID', value='67=Gx1dRLtNcYAkrXO8y3chpipEIQO4-PeWDuQ48QqjwZndKfJCQqvrdElss8vyDA_5wqohA1twy2te7xZqV9AvQVOmWElZJEcFu7VERLrr_tUlwlooIaDyb_UgqZy4zg-3', port=None, port_specified=False, domain='.google.co.jp', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1393118966, discard=False, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
        """

        prefix = 'http_cookie_' + k.name
        yield (prefix, k.value)
        yield (prefix + '_path', k.path)
        yield (prefix + '_domain', k.domain)
        yield (prefix + '_secure', k.secure)
        yield (prefix + '_expires', k.expires)
        yield (prefix + '_version', k.version)
        yield (prefix + '_rfc2109', k.rfc2109)
        httponly = k.has_nonstandard_attr('HttpOnly')
        yield (prefix + '_httponly', httponly)

    tree = html5parser.fromstring(req.text)

    #for el in tree.iterdescendants('meta'):
    XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
    XHTML = "{%s}" % XHTML_NAMESPACE

    # really look for HTML/HEAD
    yield ('html_root_tag', str(tree.tag))
    #yield ('html_root1_tag', str(tree[0].tag))

    # find HEAD element
    #  /html/head[0]
    head = None
    for child in tree.iter(XHTML + 'head'):
        head = child
        break
    if head is None:
        return

    for meta in head.iterchildren(XHTML + 'meta'):
        if meta.get('name') and meta.get('content') is not None:
            yield 'html_meta_name_' + meta.get('name'), meta.get('content')
        elif meta.get('name') and meta.get('value') is not None:
            yield 'html_meta_name_' + meta.get('name'), meta.get('value')
        elif meta.get('property') and meta.get('content') is not None:
            yield 'html_meta_property_' + meta.get('property'), meta.get(
                'content')
        elif meta.get('http-equiv') and meta.get('content') is not None:
            yield 'html_meta_http-equiv_' + meta.get(
                'http-equiv').lower(), meta.get('content')
        elif meta.get('charset') is not None:
            yield 'html_meta_charset', meta.get('charset').lower()
        else:
            yield 'html_meta_unknown', tostring(meta)
Example #43
def collect_web_data(domain):
    """Collects information after web site analysis"""
    results = {}
    url = 'http://' + domain
    try:
        (status, headers, realurl, data) = fetch_page(url)
    except KeyboardInterrupt:
        #    except pycurl.error:
        return prepareError(ERROR_URLGETERROR, results)

    encoding = None  # Default encoding is UTF-8
    if 'content-type' in headers:
        ptypr, plist = parseContentType(headers['content-type'])
        if 'charset' in plist:
            results['page:enc:server'] = plist['charset']
            encoding = results['page:enc:server']

    resdata = {
        'crawler:processed': datetime.datetime.now().isoformat(),
        'page:data': zlib.compress(data, 9),
        'page:headers': zlib.compress(repr(headers).encode('utf8'), 9)
    }
    try:
        encoding = 'utf-8'
#        edata = decode_html(data) #.decode(encoding, 'ignore')
    except KeyboardInterrupt:
        return prepareError(ERROR_ENCODING, results, resdata)
    try:
        p = fromstring(data)
    except KeyboardInterrupt:
        try:
            p = html5parser.fromstring(data)
        except KeyboardInterrupt:
            p = soupparser.fromstring(data)
            return prepareError(ERROR_PARSEERROR, results, resdata)

    # Setting basic properties
    results['site:url'] = url
    results['site:host'] = url_to_host(url)
    results['site:realurl'] = realurl
    results['site:realhost'] = url_to_host(realurl)
    results['page:status'] = status

    hders = []
    for k, v in list(dict(headers).items()):
        try:
            hders.append({'name': k, 'value': v})
        except KeyboardInterrupt:
            pass  # Do nothing
    results['web:page:headers'] = hders

    # Process blocks of tags
    results['page:scripts'] = tags_to_array(p,
                                            tagname='script',
                                            attrlist=['type', 'src'],
                                            filter='src',
                                            distinct='src')
    results['page:images'] = tags_to_array(
        p,
        tagname='img',
        attrlist=['alt', 'title', 'width', 'height', 'src'],
        filter='src',
        distinct='src')
    results['page:meta'] = tags_to_array(
        p,
        tagname='meta',
        attrlist=['http-equiv', 'content', 'name', 'property'],
        filter=None,
        distinct=None)
    results['page:headlinks'] = tags_to_array(
        p,
        tagname='link',
        attrlist=['rel', 'type', 'title', 'href', 'media'],
        filter=None,
        distinct=None)
    results['page:iframes'] = tags_to_array(p,
                                            tagname='iframe',
                                            attrlist=['name', 'src'],
                                            filter=None,
                                            distinct=None)
    results['page:embeds'] = tags_to_array(
        p,
        tagname='embed',
        attrlist=['src', 'pluginspage', 'type'],
        filter=None,
        distinct=None)
    results['page:objects'] = tags_to_array(
        p,
        tagname='object',
        attrlist=['codetype', 'classid', 'code', 'codebase', 'type', 'data'],
        filter=None,
        distinct=None)
    results['page:forms'] = forms_to_array(p)
    results['page:applets'] = tags_to_array(
        p,
        tagname='applet',
        attrlist=['code', 'codebase', 'src', 'alt', 'title', 'name'],
        filter=None,
        distinct=None)
    results['page:inscripts'] = innerscripts_to_array(p)
    results['page:links'] = links_to_array(p)

    # Processing header keys
    if SERVER_KEY in list(headers.keys()):
        server = parse_server_header(headers[SERVER_KEY])
        results['hdr:server'] = server

    if POWERED_BY_KEY in list(headers.keys()):
        s = parse_powered_by(headers[POWERED_BY_KEY])
        results['hdr:poweredby'] = s

    return (results, resdata)
Example #44
def repair_html(html_str):
	parser = html5parser.HTMLParser(namespaceHTMLElements=False)
	parsed = html5parser.fromstring(html_str, guess_charset=False, parser=parser)
	return html.tostring(parsed, encoding='unicode')
Example #45
def repair_html(html_str):
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    parsed = html5parser.fromstring(html_str,
                                    guess_charset=False,
                                    parser=parser)
    return html.tostring(parsed, encoding='unicode')
Example #46
def text_html5parse(text, encoding='unicode'):
    node = html5parser.fromstring(text, parser=_html5parser)
    etree = node.getroottree()
    parsed = tostring(etree, encoding=encoding)
    return parsed
Example #47
 def query(self, path):
     html = self.res.body
     p = html5parser.fromstring(html, parser=self.parser)
     xml = ET.fromstring(tostring(p))
     path = re.sub(r'/(\w+\d?)', r'/{http://www.w3.org/1999/xhtml}\1', path)
     return xml.findall(path)
Example #48
 def __init__(self, html):
     tree = html5parser.fromstring(html)
     namespace = {'html': 'http://www.w3.org/1999/xhtml'}
     # Extract company name (mandatory)
     xpath = './/html:span[@class="tag-name"]/text()'
     name_text = tree.xpath(xpath, namespaces=namespace)
     if name_text:
         self.name = name_text[0]
     else:
         raise ZS_ParserParseError("Unable to get name text (can't find '{0}')".format(xpath))
     # Extract company address (mandatory)
     xpath = './/html:address/text()'
     address_text = tree.xpath(xpath, namespaces=namespace)
     if address_text:
         self.address = html2text(' '.join(address_text))[:-2]
     else:
         raise ZS_ParserParseError("Unable to get address text (can't find '{0}')".format(xpath))
     # Extract company information (free)
     information_text = tree.xpath('.//html:p[@class="tag-description"]/text()', namespaces=namespace)
     if information_text:
         self.information = html2text(' '.join(information_text))[:-2]
     else:
         self.information = None
     # Extract categories (free)
     xpath = './/html:span[@class="tag-categories"]/text()'
     categories_text = tree.xpath(xpath, namespaces=namespace)
     if categories_text:
         self.categories = categories_text
     else:
         self.categories = None
     # Extract branches (free)
     branches_tree = tree.xpath('.//html:div[@class="row branch ptb20"]', namespaces=namespace)
     branches = []
     for branch_tree in branches_tree:
         id_json = branch_tree.xpath('.//html:a/@data-td', namespaces=namespace)
         try:
             branches.append(loads(id_json[0])['id'])
         except (IndexError, ValueError, KeyError):
             raise ZS_ParserParseError("Unable to get branch data-td json")
     if branches:
         self.branches = branches
     else:
         self.branches = None
     # Load other data (free except at least one main contact)
     self.contact_persons = None
     self.products = None
     self.marks = None
     self.services = None
     self.keywords = None
     self.special_offer = None
     self.payments = None
     sections = tree.xpath('.//html:section[@class="profile-body"]/html:section', namespaces=namespace)
     for section in sections:
         try:
             attrib_class = section.attrib['class']
         except KeyError:
             continue
         # Main and other contacts
         if 'contacts' in attrib_class:
             try:
                 xpath = 'html:h3/text()'
                 heading = section.xpath(xpath, namespaces=namespace)[0]
             except IndexError:
                 raise ZS_ParserParseError("Unable to get heading of profile-body contacts section text (can't find '{0}')".format(xpath))
             # Main contacts
             if heading == 'Kontaktné údaje':
                 rows = section.xpath('.//html:li[@class="dt w100"]', namespaces=namespace)
                 for row in rows:
                     xpath = './/html:span[@class="title dtc w30"]/text()'
                     row_title_text = row.xpath(xpath, namespaces=namespace)
                     row_title = html2text(' '.join(row_title_text))[:-2]
                     xpath = './/html:span[@class="dtc w70"]//text()'
                     contact_text = row.xpath(xpath, namespaces=namespace)
                     if not contact_text:
                         raise ZS_ParserParseError("Unable to get contact from main contacts row '{0}' text (can't find '{1}')".format(row_title, xpath))
                     if not self.contacts:
                         self.contacts = []
                     self.contacts.append((row_title, html2text(' '.join(contact_text))[:-2]))
                 else:
                     raise  ZS_ParserParseError("Unable to get main contacts rows tree (can't find {0}')".format(xpath))
             # Other contacts
             if heading == 'Kontaktné osoby':
                 rows = section.xpath('.//html:li[@class="dt w100"]', namespaces=namespace)
                 for row in rows:
                     xpath = './/html:span[@class="title dtc w50"]/text()'
                     row_title_text = row.xpath(xpath, namespaces=namespace)
                     row_title = html2text(' '.join(row_title_text))[:-2]
                     contacts = []
                     xpath = './/html:span[@class="dtc w50"]/html:span'
                     contacts_tree = row.xpath(xpath, namespaces=namespace)
                     for contact_tree in contacts_tree:
                         contacts.append(html2text(' '.join(contact_tree.xpath('.//text()')))[:-2])
                     else:
                         raise  ZS_ParserParseError("Unable to get contacts from contact persons row '{0}' text (can't find {1}')".format(row_title, xpath))
                     if not self.contact_persons:
                         self.contact_persons = []
                      self.contact_persons.append((row_title, contacts))
                 else:
                     raise  ZS_ParserParseError("Unable to get contact persons rows tree (can't find {0}')".format(xpath))
         if not hasattr(self, 'contacts'):
             raise ZS_ParserParseError("Unable to get at least one main contact from page")
         # Products
         if 'products' in attrib_class:
             xpath = './/html:li[@itemprop="name"]/text()'
             products_texts = section.xpath(xpath, namespaces=namespace)
             if products_texts:
                 self.products = products_texts
             else:
                 raise ZS_ParserParseError("Unable to get products texts (can't find '{0}')".format(xpath))
         # Marks
         if 'marks' in attrib_class:
             xpath = './/html:li[@itemprop="name"]/text()'
             marks_texts = section.xpath(xpath, namespaces=namespace)
             if marks_texts:
                 self.marks = marks_texts
             else:
                 raise ZS_ParserParseError("Unable to get marks texts (can't find '{0}')".format(xpath))
         # Services
         if 'services' in attrib_class:
             xpath = './/html:li[@itemprop="name"]/text()'
             services_texts = section.xpath(xpath, namespaces=namespace)
             if services_texts:
                 self.services = services_texts
             else:
                 raise ZS_ParserParseError("Unable to get services texts (can't find '{0}')".format(xpath))
         # Keywords
         if 'kw' in attrib_class:
             xpath = './/html:span[@itemprop="name"]/text()'
             keywords_texts = section.xpath(xpath, namespaces=namespace)
             if keywords_texts:
                 self.keywords = keywords_texts
             else:
                 raise ZS_ParserParseError("Unable to get keywords texts (can't find '{0}')".format(xpath))
         # Special offers
         if 'special-offer' in attrib_class:
             xpath = './/html:p/text()'
             special_offer_text = section.xpath(xpath, namespaces=namespace)
             if special_offer_text:
                 self.special_offer = html2text(' '.join(special_offer_text))[:-2]
             else:
                 raise ZS_ParserParseError("Unable to get special offer text (can't find '{0}')".format(xpath))
         # Payments
         if 'payments' in attrib_class:
             xpath = './/html:li'
             payment_headings_tree = section.xpath(xpath, namespaces=namespace)
             for payment_heading_tree in payment_headings_tree:
                 xpath = './/html:img/text()'
                 payment_heading_text = section.xpath(xpath, namespaces=namespace)
                 
             else:
                 raise ZS_ParserParseError("Unable to get payment headings tree (can't find {0}')".format(xpath))
         # Gallery
         if 'gallery' in attrib_class:
             pass
         # Documents
         if 'documents' in attrib_class:
             pass
         # Certificates
         if 'certificates' in attrib_class:
             pass
         # Video
         if 'video' in attrib_class:
             pass
         # Virtual tour link
         if 'virtual-tour' in attrib_class:
             pass
         # Reviews
         if 'reviews-box' in attrib_class:
             pass
Example #49
 def call_it(self, *args, **kwargs):
     from lxml.html.html5parser import fromstring
     return fromstring(*args, **kwargs)
Example #50
 def call_it(self, *args, **kwargs):
     from lxml.html.html5parser import fromstring
     return fromstring(*args, **kwargs)
Example #51
def lxml_etree_from_response(response):
    text = response.body_as_unicode()
    node = html5parser.fromstring(text, parser=_html5parser)
    etree = node.getroottree()
    return etree
Example #52
# -*- coding: koi8-r -*-

import re
import urllib2

def separatewords(text):
    splitter = re.compile(' ')
    return [s.lower() for s in splitter.split(text) if s != '']

page = 'http://www.newizv.ru/culture/2012-09-26/170402-novym-hudrukom-masterskoj-petra-fomenko-stanet-evgenij-kamenkovich.html'

c = urllib2.urlopen(page)
content = c.read()
encoding = c.headers['content-type'].split('charset=')[-1]
unicontent = unicode(content, encoding)
from lxml.html import html5parser
doc = html5parser.fromstring(unicontent)
str = doc.xpath("string()")
print str

#from lxml import html
#doc = html.fromstring(content)
import codecs
f = codecs.open('text3.txt', encoding='utf-8', mode='w+')

sepa = separatewords(str)
print sepa
import unicodedata
#str2 = repr(str.decode('unicode-escape'))
#sepa2 = separatewords(str2)
#print sepa2
for elem in sepa: