Example #1
    def get_selected_items(self):
        response = self.session.get(self.url("selected_items"))

        tree = lxml.html.fromstring(response.text)

        item_sel = CSSSelector('div[headers="th_selected_items"]')
        name_sel = CSSSelector("h4.il_ContainerItemTitle")
        icon_sel = CSSSelector("img.ilListItemIcon")

        results = item_sel(tree)

        for result in results:
            item = Item()

            name = name_sel(result)[0]

            try:
                name = CSSSelector("a")(name)[0]
            except IndexError:
                pass

            item.name = name.text
            item.url = name.get("href")

            icon = icon_sel(result)[0]
            item.icon = icon.get("src")

            yield item
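
The pattern this example relies on can be exercised on its own: compile a CSSSelector once, then apply it to any parsed tree (the HTML below is invented for illustration):

import lxml.html
from lxml.cssselect import CSSSelector

html = '<div><h4 class="il_ContainerItemTitle"><a href="/item/1">First</a></h4></div>'
tree = lxml.html.fromstring(html)

name_sel = CSSSelector("h4.il_ContainerItemTitle")  # compiled once, reusable
for heading in name_sel(tree):
    link = heading.find("a")  # the nested <a> holds the name and URL
    print(link.text, link.get("href"))  # -> First /item/1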
Example #2
    def _fetch_from_cache(language, url):
        from . import utils

        cms_url = utils.get_cms_url(language, url)

        if cms_url in cache:
            html = cache.get(cms_url)
        else:
            html = utils.get_cms_page(language, url)
            cache.set(cms_url, html)

        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser).getroot()
        toc = CSSSelector('.toc')

        # Remove all table-of-contents blocks
        for table in toc(tree):
            table.getparent().remove(table)

        title = CSSSelector('.page-title')(tree)[0]
        title.getparent().remove(title)

        elements = list(CSSSelector('.cms-content')(tree)[0])

        headers = [i for i, e in enumerate(elements) if CSSSelector('.section-header')(e)]
        title_icons = list(CSSSelector('.title-icon')(tree))

        page_contents = []

        for i, h in enumerate(headers):
            icon = ""
            if i < len(title_icons) and 'src' in title_icons[i].attrib:
                icon = title_icons[i].attrib['src']

            element = elements[h]
            if (i + 1) == len(headers):
                contents = elements[h + 1:]
            else:
                contents = elements[h + 1:headers[i + 1]]

            for e in elements:
                if 'dir' in e.attrib:
                    del e.attrib['dir']

            section_title = CSSSelector('a[name]')(element)[0].text
            section_body = ""
            for c in contents:
                section_body += etree.tostring(c, pretty_print=True, method="html")

            page_contents.append({
                "is_important": True if CSSSelector('.important')(element) else False,
                "title": section_title,
                "body": section_body,
                "icon": icon
            })

        return {
            "title": title.text,
            "contents": page_contents
        }
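
A distilled sketch of the element-removal pattern used above, on an invented fragment:

from io import StringIO
from lxml import etree
from lxml.cssselect import CSSSelector

html = '<div><p class="toc">skip me</p><p>keep me</p></div>'
tree = etree.parse(StringIO(html), etree.HTMLParser()).getroot()

for el in CSSSelector('.toc')(tree):
    el.getparent().remove(el)  # detach each match from its parent

print(etree.tostring(tree, method="html"))  # the .toc paragraph is gone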
Example #3
    def process_html(self, html, path):
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.fromstring(html, parser).getroottree()
        page = tree.getroot()

        if page is None:
            print(repr(html))
            raise ParserError('Could not parse the html')

        lines = html.decode('utf-8').splitlines()
        body, = CSSSelector('body')(page)
        self._bodies.append(body)
        if self.optimize_lookup:
            for each in body.iter():
                identifier = each.attrib.get('id')
                if identifier:
                    self._all_ids.add(identifier)
                classes = each.attrib.get('class')
                if classes:
                    for class_ in classes.split():
                        self._all_classes.add(class_)

        for style in CSSSelector('style')(page):
            first_line = style.text.strip().splitlines()[0]
            for i, line in enumerate(lines):
                if line.count(first_line):
                    key = (i + 1, path)
                    self.blocks[key] = style.text
                    break
Example #4
 def detect_withdrawn(self, tree, url):
     comment = CSSSelector(".tablecell.comments")(tree)
     if comment:
         comment = comment[0].text_content()
         if "withdrawn" in comment.lower():
             print("Paper", url, "appears to be withdrawn!")
             return True
     return False
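
A minimal way to exercise this kind of check (markup invented):

import lxml.html
from lxml.cssselect import CSSSelector

tree = lxml.html.fromstring(
    '<div class="tablecell comments">This paper has been withdrawn.</div>')
comment = CSSSelector(".tablecell.comments")(tree)
print(bool(comment) and "withdrawn" in comment[0].text_content().lower())  # True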
Example #5
def post_node(title, datetime, content):
    post = copy(POST)
    CSSSelector('.title .text')(post)[0].text = title
    CSSSelector('.datetime')(post)[0].text = datetime.strftime("%H:%M on %A the %%s of %B, %Y") % niceday(datetime)
    content_css = CSSSelector('.content')(post)[0]
    for fragment in fragments_fromstring(cleaner_trusted.clean_html(content)):
        content_css.append(fragment)

    return post
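
Note the trick in the datetime line: %% survives strftime as a literal %, so %%s becomes a %s placeholder that is then filled with the ordinal day. A sketch, with niceday assumed to return an ordinal like '14th':

import datetime

def niceday(dt):  # assumed helper: ordinal day of the month
    n = dt.day
    suffix = 'th' if 11 <= n % 100 <= 13 else {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '%d%s' % (n, suffix)

dt = datetime.datetime(2015, 3, 14, 9, 26)
print(dt.strftime("%H:%M on %A the %%s of %B, %Y") % niceday(dt))
# -> 09:26 on Saturday the 14th of March, 2015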
Example #6
def get_or_create_head(root):
    """Ensures that `root` contains a <head> element and returns it.
    """
    head = CSSSelector('head')(root)
    if not head:
        head = etree.Element('head')
        body = CSSSelector('body')(root)[0]
        body.getparent().insert(0, head)
        return head
    else:
        return head[0]
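
A small usage sketch (document invented; lxml's HTML parser does not synthesize a missing <head> by itself):

import lxml.html

root = lxml.html.fromstring('<html><body><p>hi</p></body></html>')
head = get_or_create_head(root)
print(lxml.html.tostring(root))  # <head> now precedes <body>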
Example #7
    def process_html(self, html, url):
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.fromstring(html.encode('utf-8'), parser).getroottree()
        page = tree.getroot()

        if page is None:
            print(repr(html))
            raise ParserError('Could not parse the html')

        lines = html.splitlines()
        body, = CSSSelector('body')(page)
        self._bodies.append(body)
        if self.optimize_lookup:
            for each in body.iter():
                identifier = each.attrib.get('id')
                if identifier:
                    self._all_ids.add(identifier)
                classes = each.attrib.get('class')
                if classes:
                    for class_ in classes.split():
                        self._all_classes.add(class_)

        for style in CSSSelector('style')(page):
            try:
                first_line = style.text.strip().splitlines()[0]
            except IndexError:
                # meaning the inline style tag was just whitespace
                continue
            except AttributeError:
                # happens when the style tag has absolutely nothing in it,
                # not even whitespace
                continue
            for i, line in enumerate(lines):
                if line.count(first_line):
                    key = (i + 1, url)
                    self.blocks[key] = style.text
                    break

        for link in CSSSelector('link')(page):
            if (
                link.attrib.get('rel', '') == 'stylesheet' or
                link.attrib['href'].lower().split('?')[0].endswith('.css')
            ):
                link_url = self.make_absolute_url(url, link.attrib['href'])
                key = (link_url, link.attrib['href'])
                self.blocks[key] = self.download(link_url)
                if self.preserve_remote_urls:
                    self.blocks[key] = self._rewrite_urls(
                        self.blocks[key],
                        link_url
                    )
Example #8
    def get_submission_dates(self, arxiv_tree, queried_version):
        links = CSSSelector("div.submission-history")(arxiv_tree)[0]
        versions = {}
        #print "Parsing", links.text_content()
        for line in links.text_content().split("\n"):
            match = self.version_re.match(line)
            if match:
                version, d = match.group(1), match.group(2)
                d = datetime.datetime.strptime(d,'%a, %d %b %Y').date()
                versions[version] = d
                if queried_version == version:
                    return {version: d}
                #print version, date

        return versions
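
The version_re used here isn't shown; a plausible stand-in (an assumption, not the project's actual pattern) that fits the '%a, %d %b %Y' parsing above:

import re

# Hypothetical: matches lines like "[v1] Mon, 2 Apr 2012 10:30:00 GMT (245kb)"
version_re = re.compile(r"\[(v\d+)\]\s+(\w+, \d+ \w+ \d+)")

m = version_re.match("[v1] Mon, 2 Apr 2012 10:30:00 GMT (245kb)")
print(m.group(1), m.group(2))  # -> v1 Mon, 2 Apr 2012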
Example #9
    def process_html(self, html, url):
        parser = etree.HTMLParser()
        tree = etree.fromstring(html, parser).getroottree()
        page = tree.getroot()

        if page is None:
            print repr(html)
            raise ParserError("Could not parse the html")

        lines = html.splitlines()
        body, = CSSSelector('body')(page)
        self._bodies.append(body)
        if self.optimize_lookup:
            for each in body.iter():
                id = each.attrib.get('id')
                if id:
                    self._all_ids.add(id)
                classes = each.attrib.get('class')
                if classes:
                    for class_ in classes.split():
                        self._all_classes.add(class_)

        for style in CSSSelector('style')(page):
            first_line = style.text.strip().splitlines()[0]
            for i, line in enumerate(lines):
                if line.count(first_line):
                    key = (i + 1, url)
                    self.blocks[key] = style.text
                    break

        for link in CSSSelector('link')(page):
            if (
                link.attrib.get('rel', '') == 'stylesheet' or
                link.attrib['href'].lower().split('?')[0].endswith('.css')
            ):
                link_url = self.make_absolute_url(url, link.attrib['href'])
                key = (link_url, link.attrib['href'])
                self.blocks[key] = self._download(link_url)
                if self.preserve_remote_urls:
                    self.blocks[key] = self._rewrite_urls(
                        self.blocks[key],
                        link_url
                    )
Example #10
def load_stations(file="stations-converted.json"):
    global STATIONS

    with open(file) as f:
        STATIONS = anyjson.deserialize(f.read())

    for station in STATIONS.values():
        try:
            uri = "http://hydro.chmi.cz/isarrow/object.php?seq=2000855701&chemie=1&biota=1&ukol_p=1&id_objekt=&vod_typ=R&nadmh_sign=%3E&rickm_sign=%3E&rok_od=2007&rok_do=2012&objekty_chemdata=1&matrice=2000868184&typodb=41"
            seq = CSSSelector("form input[name='seq']")(fromstring(urllib2.urlopen(uri).read().decode("cp1250")))[
                0
            ].value

            # print 'seq is ' + seq

            uri = (
                "http://hydro.chmi.cz/isarrow/object.php?agenda=POV&objekty_chemdata=1&objekty_biodata=&taxon_tree=&seq="
                + seq
                + "&data_sel=chemdata&chemie=1&biota=1&rok_od=2007&rok_do=2012&matrice=2000868184&typodb=41&tscongrp=&tscon=&data_mez_stanovitelnosti=&data_od=&data_do=&taxon=&send=Chemick%E9+vzorky"
            )
            tree = fromstring(urllib2.urlopen(uri).read().decode("cp1250"))

            link = CSSSelector("table.tbl a")(tree)[-1]

            uri = "http://hydro.chmi.cz/isarrow/" + link.get("href")
            tree = fromstring(urllib2.urlopen(uri).read().decode("cp1250"))

            csv_link = tree.xpath("//form[1]//a")[0]

            uri = "http://hydro.chmi.cz/isarrow/" + csv_link.get("href")

            # FIXME: CSV export is now broken on IS ARROW
            # wait for them to fix it or parse from table -- and store relevant data into structure
            reader = csv.reader(urllib2.urlopen(uri))
            for row in reader:
                print row

        except Exception:
            print "Failed to retrieve values for station " + station["id"]
            import traceback

            traceback.print_exc()
Example #11
 def make_emoji_img_elem(emoji_span_elem: "lxml.html.HtmlElement") -> "lxml.html.HtmlElement":
     # Convert the emoji spans to img tags.
     classes = emoji_span_elem.get('class')
     match = re.search(r'emoji-(?P<emoji_code>\S+)', classes)
     # re.search is capable of returning None,
     # but since the parent function should only be called with a valid css element
     # we assert that it does not.
     assert match is not None
     emoji_code = match.group('emoji_code')
     emoji_name = emoji_span_elem.get('title')
     alt_code = emoji_span_elem.text
     image_url = base_url + '/static/generated/emoji/images-%(emojiset)s-64/%(emoji_code)s.png' % {
         'emojiset': emojiset,
         'emoji_code': emoji_code
     }
     img_elem = lxml.html.fromstring(
         '<img alt="%(alt_code)s" src="%(image_url)s" title="%(title)s">' % {
             'alt_code': alt_code,
             'image_url': image_url,
             'title': emoji_name,
         })
     img_elem.set('style', 'height: 20px;')
     img_elem.tail = emoji_span_elem.tail
     return img_elem
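
A hedged usage sketch; base_url and emojiset are module globals in the original, so stand-in values are defined here:

import lxml.html

base_url = 'https://example.com'  # stand-in for the module global
emojiset = 'google'               # stand-in for the module global

span = lxml.html.fromstring(
    '<span class="emoji emoji-1f600" title="grinning">:grinning:</span>')
img = make_emoji_img_elem(span)
print(img.get('src'))
# -> https://example.com/static/generated/emoji/images-google-64/1f600.png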
Example #12
# coding: utf-8
import lxml.html
import requests
from lxml.cssselect import CSSSelector

keyword = '비오는'
r = requests.get("http://music.naver.com/search/search.nhn?query=" + keyword +
                 "&x=0&y=0")
_html = lxml.html.fromstring(r.text)

sel = CSSSelector('table[summary] > tbody > ._tracklist_move')
# Apply the selector to the DOM tree.
nodes = sel(_html)

_selName = CSSSelector('.name > a.title')
_selArtist = CSSSelector('._artist.artist')
_selAlbum = CSSSelector('.album > a')
for node in nodes:
    #print lxml.html.tostring(item)
    _name = _selName(node)
    _artist = _selArtist(node)
    _album = _selAlbum(node)
    if _name:
        print _artist[0].text_content().strip(),
        print "---",
        print _name[0].text_content(),
        print "---",
        print _album[0].text_content()
Example #13
def csstext(target, selector):
    from lxml.cssselect import CSSSelector
    return ' '.join(e.text_content()
                    for e in CSSSelector(selector)(target)).strip()
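
For example (fragment invented):

import lxml.html

page = lxml.html.fromstring('<p class="intro">Hello <b>world</b></p>')
print(csstext(page, 'p.intro'))  # -> Hello world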
Example #14
def _parse_html_for_content(html):
    """
    This function takes in the HTML from transifex and looks for the special tags that
    break down the anchors into two separate divs see function above
    :param html:
    :return:
    """
    p = re.compile(r'<.*?>')
    if p.findall(html):
        h = html_parser.HTMLParser()

        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)

        a = CSSSelector('div.former-anchor')
        translatable_a = CSSSelector('div.former-anchor-translatable')
        img = CSSSelector('div.former-image')
        phones = CSSSelector('div.former-tel')

        anchors = a(tree)
        for anchor in anchors:
            attributes = [(k.replace('data-a-', ''), h.unescape(v))
                          for k, v in dict(anchor.attrib).iteritems()
                          if 'data-a-' in k]

            ht_st = "<a>{}</a>".format(stringify_children(anchor))
            div = etree.parse(StringIO(fix_html_fragment(ht_st))).getroot()

            for k, v in attributes:
                div.attrib[k] = v

            swap_element_inbound(div, anchor)

        anchors = translatable_a(tree.getroot())
        for anchor in anchors:
            attributes = [(k.replace('data-a-', ''), h.unescape(v))
                          for k, v in dict(anchor.attrib).iteritems()
                          if 'data-a-' in k]

            content = etree.Element('div')
            link = etree.Element('div')

            for c in anchor:
                if 'class' in c.attrib:
                    if c.attrib['class'] == 'text':
                        content = c
                    if c.attrib['class'] == 'href':
                        link = c

            ht_st = "<a>{}</a>".format(stringify_children(content))
            div = etree.parse(StringIO(fix_html_fragment(ht_st))).getroot()

            for k, v in attributes:
                div.attrib[k] = v

            href = stringify_children(link)

            if href:
                div.attrib['href'] = h.unescape(href)
            swap_element_inbound(div, anchor)

        images = img(tree.getroot())
        for image in images:
            attributes = [(k.replace('data-img-', ''), h.unescape(v))
                          for k, v in dict(image.attrib).iteritems()
                          if 'data-img-' in k]
            div = etree.Element('img')

            for k, v in attributes:
                div.attrib[k] = h.unescape(v)

            swap_element_inbound(div, image)

        tels = phones(tree.getroot())
        for tel in tels:
            if 'class' in tel.attrib:
                classes = tel.attrib['class'].split(' ')
                tag_format = "{}"
                if 'has-b' in classes:
                    tag_format = "<b>{}</b>".format(tag_format)
                if 'has-u' in classes:
                    tag_format = "<u>{}</u>".format(tag_format)
                if 'has-strong' in classes:
                    tag_format = "<strong>{}</strong>".format(tag_format)
                if 'has-em' in classes:
                    tag_format = "<em>{}</em>".format(tag_format)
                if 'has-i' in classes:
                    tag_format = "<i>{}</i>".format(tag_format)

                tag_format = "<span class=\"tel\">{}</span>".format(tag_format)
                div = etree.parse(
                    StringIO(tag_format.format(
                        tel.attrib['data-tel-number']))).getroot()

                swap_element_inbound(div, tel)
        html = etree.tostring(tree)

    soup = BeautifulSoup(html)
    return unicode(soup.prettify())
Example #15
def pull_from_transifex(slug,
                        language,
                        project=settings.TRANSIFEX_PROJECT_SLUG,
                        retry=True):
    from django.contrib.auth import get_user_model

    User = get_user_model()

    # cache.add fails if the key already exists
    acquire_lock = lambda: cache.add('publishing-translation', 'true', 60 * 5)
    # memcache delete is very slow, but we have to use it to take
    # advantage of using add() for atomic locking
    release_lock = lambda: cache.delete('publishing-translation')

    try:
        if language == 'en':
            return
        import cms.api

        internal_language = language if language not in SHIM_LANGUAGE_DICTIONARY else SHIM_LANGUAGE_DICTIONARY[
            language]

        while True:
            if acquire_lock():
                break
            time.sleep(5)

        staging = Title.objects.filter(language='en', slug='staging')
        if staging:
            staging = staging[0].page
        titles = Title.objects.filter(language='en',
                                      slug=slug,
                                      page__in=staging.get_descendants())

        if not titles:
            logger.info('Page not found. Ignoring.')
            return

        page = titles[0].page.get_draft_object()

        password = settings.TRANSIFEX_PASSWORD
        user = settings.TRANSIFEX_USER

        transifex_language = language
        transifex_url_data = {
            "project": project,
            "slug": page.get_slug('en'),
            "language": transifex_language
        }
        fetch_format = "http://www.transifex.com/api/2/project/{project}/resource/{slug}html/translation/{language}/?mode=default"

        logger.info("Trying to request: %s" %
                    fetch_format.format(**transifex_url_data))
        logger.info("With creds: %s %s" % (user, password))

        r = requests.get(fetch_format.format(**transifex_url_data),
                         auth=(user, password))

        translation = r.json()

        text = translation['content'].strip()
        text = _parse_html_for_content(text)
        soup = BeautifulSoup(text)

        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(unicode(soup.prettify())), parser)
        selector = CSSSelector('div[data-id]')
        title_selector = CSSSelector('div.title')
        """
        Directions are handled application-wise
        """
        dir_selector = CSSSelector('[dir]')

        for element in dir_selector(tree.getroot()):
            del element.attrib['dir']

        content = selector(tree.getroot())
        title = title_selector(tree.getroot())
        if title:
            try:
                title = title[0].text
                title_obj = page.get_title_obj(internal_language,
                                               fallback=False)
                if type(title_obj).__name__ == 'EmptyTitle':
                    logger.info('Creating new title')
                    en_title_obj = page.get_title_obj('en')
                    title_obj = cms.api.create_title(
                        language=internal_language,
                        title=en_title_obj.title.strip(),
                        page=page,
                        slug=en_title_obj.slug.strip(),
                    )
                    title_obj.save()
                title_obj.page_title = title.strip()
                title_obj.save()
            except Exception as e:
                logger.exception('Error updating the application.')

        dict_list = []

        for div in content:
            plugin_dict = {
                'id': div.attrib['data-id'],
                'type': div.attrib['data-type'],
                'parent': div.attrib['data-parent'],
                'position': div.attrib['data-position'],
                'translated': (div.text or '') + u''.join([
                    etree.tostring(a, pretty_print=True, method="html")
                    for a in div
                ]),
            }
            dict_list.append(plugin_dict)
        blame = User.objects.filter(is_staff=True, is_superuser=True)[0]

        _translate_page(dict_list, internal_language, page)

        cms.api.publish_page(page, blame, internal_language)
    except Exception as e:
        if retry:
            time.sleep(5)
            pull_from_transifex.delay(slug, language, project, False)
        else:
            traceback.print_exc()
            logger.info('Tried to retry it but it still erred out.')
            raise e
    finally:
        release_lock()
Example #16
def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)

    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]
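
For example, given an invented fragment:

html = '''<div class="comment-replies-header">
  <button class="load-comments" data-cid="abc123">View replies</button>
</div>'''
print(extract_reply_cids(html))  # -> ['abc123']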
Example #17
import lxml.html
from lxml.cssselect import CSSSelector
import requests

response = requests.get('http://www.google.com/search?q=python')
tree = lxml.html.fromstring(response.text)
link = CSSSelector('a[href]')
lines = link(tree)
print len(lines)

for line in lines:
    save = lxml.html.tostring(line)
    if 'https://' in save:
        print save
        print
Example #18
def split_html(html_filename, split_at_level=0):
    """ Split aggregated and rendered HTML document at
        some <hX> tag(s). split_at_level=0 -> split at
        H1 tags, split_at_level=1 -> split at H1 and H2
        tags.
        Returns a list of dicts with keys 'html' referring
        to the subdocument and 'level' indicating the split
        point.
    """

    destdir = os.path.dirname(html_filename)
    soup = BeautifulSoup(file(html_filename).read())
    fp = StringIO(soup.__str__(prettyPrint=True))
    docs = list()
    current_doc = list()
    for line in fp:
        line = line.rstrip()
        for level in range(split_at_level + 1):
            if '<h%d' % (level + 1) in line.lower():
                html = '\n'.join(current_doc)
                root = lxml.html.fromstring(unicode(html, 'utf-8'))
                title = u''
                h1_nodes = root.xpath('//h1')
                if h1_nodes:
                    title = h1_nodes[0].text_content().strip()

                # count tables and images
                number_tables = len(root.xpath('//table'))
                number_images = len(CSSSelector('div.image-caption')(root))

                # find all linkable nodes with an ID attribute
                node_ids = list()
                for node in root.xpath('.//*'):
                    node_id = node.get('id')
                    if node_id:
                        node_ids.append(node_id)

                html = lxml.html.tostring(root, encoding=unicode)
                docs.append(
                    dict(html=html,
                         level=level,
                         title=title,
                         node_ids=node_ids,
                         number_images=number_images,
                         number_tables=number_tables))
                current_doc = []
                break

        current_doc.append(line)

    # now deal with the remaining part of the document
    html = '\n'.join(current_doc)
    root = lxml.html.fromstring(unicode(html, 'utf-8'))
    title = u''
    h1_nodes = root.xpath('//h1')
    if h1_nodes:
        title = h1_nodes[0].text_content().strip()

    # count tables and images
    number_tables = len(root.xpath('//table'))
    number_images = len(CSSSelector('div.image-caption')(root))

    # find all linkable nodes with an ID attribute
    node_ids = list()
    for node in root.xpath('.//*'):
        node_id = node.get('id')
        if node_id:
            node_ids.append(node_id)

    html = lxml.html.tostring(root, encoding=unicode)
    docs.append(
        dict(html=html,
             level=0,
             title=title,
             node_ids=node_ids,
             number_images=number_images,
             number_tables=number_tables))

    # now store files on the filesystem
    ini_filename = os.path.join(destdir, 'documents.ini')
    fp_ini = codecs.open(ini_filename, 'w', 'utf-8')

    for count, d in enumerate(docs[1:]):
        filename = os.path.join(
            destdir, 'split-0/%d-level-%d.html' % (count, d['level']))
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))
        file(filename, 'w').write(d['html'].encode('utf-8'))

        print >> fp_ini, '[%d]' % count
        print >> fp_ini, 'filename = %s' % filename
        print >> fp_ini, 'title = %s' % d['title']
        print >> fp_ini, 'number_tables= %d' % d['number_tables']
        print >> fp_ini, 'number_images = %d' % d['number_images']
        print >> fp_ini, 'node_ids = '
        for node_id in d['node_ids']:
            print >> fp_ini, '    ' + node_id
        print >> fp_ini

    fp_ini.close()
    return docs[1:]
Example #19
def getView(document, css):
	"""
	document
		a DOM document, currently an lxml HTML document
	css
		a CSS StyleSheet string
	
	returns style view
		a dict of {DOMElement: css.CSSStyleDeclaration} for html
	"""
	from lxml.cssselect import CSSSelector
	sheet = cssutils.parseString(css)
	
	view = {}
	specificities = {} # needed temporarily 

	# TODO: filter rules simpler?, add @media
	rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)	
	for rule in rules:
		for selector in rule.selectorList:
			#log(0, 'SELECTOR', selector.selectorText)
			# TODO: make this a callback to be able to use other stuff than lxml
			try:
				cssselector = CSSSelector(selector.selectorText)
			except:
				continue
			matching = cssselector.evaluate(document)
			
			for element in matching:
				#if element.tag in ('div',):
					# add styles for all matching DOM elements
					#log(1, 'ELEMENT', id(element), element.text)
					
					if element not in view:	
						# add initial empty style declaration
						view[element] = cssutils.css.CSSStyleDeclaration() # @UndefinedVariable
						specificities[element] = {}					
															
					for p in rule.style:
						# update style declaration
						if p not in view[element]:
							# setProperty needs a new Property object and
							# MUST NOT reuse the existing Property
							# which would be the same for all elements!
							# see Issue #23
							view[element].setProperty(p.name, p.value, p.priority)
							specificities[element][p.name] = selector.specificity
							#log(2, view[element].getProperty('color'))
							
						else:
							#log(2, view[element].getProperty('color'))
							sameprio = (p.priority == 
										view[element].getPropertyPriority(p.name))
							if not sameprio and bool(p.priority) or (
							   sameprio and selector.specificity >= 
											specificities[element][p.name]):
								# later, more specific or higher prio 
								view[element].setProperty(p.name, p.value, p.priority)
					

	return view
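
For orientation, selector.specificity in cssutils is a 4-tuple that compares lexicographically; a quick check:

import cssutils

sheet = cssutils.parseString('p { color: red } p.note { color: blue }')
for rule in sheet:
    for sel in rule.selectorList:
        print(sel.selectorText, sel.specificity)
# -> p (0, 0, 0, 1)
#    p.note (0, 0, 1, 1)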
Example #20
    lasturl = ""

while True:
    req = urllib2.Request(url)
    req.add_header("User-Agent", useragent)
    if lasturl:
        req.add_header("Referer", lasturl)
    html = unicode(urllib2.urlopen(req).read(), errors="ignore")
    doc = etree.HTML(html)
    rtr = CSSSelector("ol#rtr")(doc)
    if rtr:
        numresults = len(rtr[0].getchildren())
    else:
        numresults = 0
    print "hit " + url + " got " + str(numresults) + " results"
    rhscol = CSSSelector("div#rhscol")(doc)[0]
    links = [a for a in rhscol.getiterator("a")]
    if len(links) != 3 or "Older" not in links[1].text or "Newer" not in links[2].text:
        print "Cant find older and newer links here, backing up"
        oldurl = page.url
        match = re.search("mbl_hs:(\d+),mbl_he:(\d+),mbl_rs:(\d+),mbl_re:(\d+)", oldurl)
        mbl_hs = int(match.group(1)) + 600
        mbl_he = int(match.group(2)) + 600
        mbl_rs = int(match.group(3)) + 600
        mbl_re = int(match.group(4)) + 600
        url = oldurl.replace(
            match.group(0),
            "mbl_hs:" + str(mbl_hs) + ",mbl_he:" + str(mbl_he) + ",mbl_rs:" + str(mbl_rs) + ",mbl_re:" + str(mbl_re),
        )
        lasturl = oldurl
        time.sleep(10)
Example #21
File: try.py Project: kgn/cssutils
    document = etree.HTML(html)
    e = etree.Element('pre', {'class': 'cssutils'})
    e.text = css
    document.find('body').append(e)

    sheet = cssutils.parseString(css)

    view = {}
    specificities = {} # temporarily needed
    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet.cssRules if rule.type == rule.STYLE_RULE)

    for rule in rules:
        for selector in rule.selectorList:
            cssselector = CSSSelector(selector.selectorText)
            elements = cssselector.evaluate(document)
            for element in elements:
                # add styles for all matching DOM elements
                if element not in view:
                    # add initial
                    view[element] = cssutils.css.CSSStyleDeclaration()
                    specificities[element] = {}

                for p in rule.style:
                    # update styles
                    if p not in view[element]:
                        view[element].setProperty(p)
                        specificities[element][p.name] = selector.specificity
                    else:
                        sameprio = (p.priority ==
                                    view[element].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                           sameprio and selector.specificity >=
                                        specificities[element][p.name]):
                            # later, more specific or higher prio
                            view[element].setProperty(p.name, p.value, p.priority)
Example #22
sel = CSSSelector('table tbody tr')
rows = sel(tree)
print "Row results: ", len(rows)
num_operating = 0
for row in rows:
    # This is unreliable; I don't know how to get just the text 'Operating':
    #  phase = CSSSelector('td:nth-of-type(4)')(row)[0]
    #  lxml.html.tostring(phase)
    #  '<td><span class="hide">3</span>Operating</td>'
    phase = CSSSelector('td:nth-of-type(4)')(row)[0].text_content() # '3Operating'
    if 'Operating' in phase:
        num_operating += 1
    # Show phase because we may have stale ones in iSat
    division = CSSSelector('td:nth-of-type(1)')(row)[0]
    try:
        division = division.text.strip()
    except AttributeError, e:
        division = 'NOTFOUND'
    mission = CSSSelector('td:nth-of-type(2) > a')(row)[0]
    mission_name = mission.text.strip()
    mission_url = mission.get('href') # /missions/xmm-newton/
    mission_slug = mission_url.split('/')[2]
    try:
        print '%-30s\t%-40s\t%-20s\t%-20s' % (mission_slug, mission_name.encode('ascii', 'ignore'), division, phase)
    except UnicodeEncodeError, e:
        print "F*****g unicode problem: ", e
        import pdb; pdb.set_trace()
print 'Operating:', num_operating

Example #23
    def getView(self, document, sheet, media='all', name=None, styleCallback=None):
        """
        document
            a DOM document, currently an lxml HTML document
        sheet
            a CSS StyleSheet object, currently cssutils sheet
        media: optional
            TODO: view for which media it should be
        name: optional
            TODO: names of sheets only
        styleCallback: optional
            should return css.CSSStyleDeclaration of inline styles, for html
            a style declaration for ``element@style``. Gets one parameter
            ``element`` which is the relevant DOMElement

        returns style view
            a dict of {DOMElement: css.CSSStyleDeclaration} for html
        """

        styleCallback = styleCallback or self.styleattribute

        _unmergable_rules = CSSStyleSheet()

        view = {}
        specificities = {}  # needed temporarily

        # TODO: filter rules simpler?, add @media
        rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
        for rule in rules:
            for selector in rule.selectorList:
                self.log(0, 'SELECTOR', selector.selectorText)
                # TODO: make this a callback to be able to use other stuff than lxml
                try:
                    cssselector = CSSSelector(selector.selectorText)
                except (ExpressionError, NotImplementedError) as e:
                    _unmergable_rules.add(CSSStyleRule(selectorText=selector.selectorText,
                                                       style=rule.style))
                    continue

                matching = cssselector.evaluate(document)

                for element in matching:
                    if element.tag in self.NONVISUAL_TAGS:
                        continue

                    # add styles for all matching DOM elements
                    self.log(1, 'ELEMENT', id(element), element.text)

                    if element not in view:
                        # add initial empty style declaration
                        view[element] = CSSStyleDeclaration()
                        specificities[element] = {}

                        # and add inline @style if present
                        inlinestyle = styleCallback(element)
                        if inlinestyle:
                            for p in inlinestyle:
                                # set inline style specificity
                                view[element].setProperty(p)
                                specificities[element][p.name] = (1, 0, 0, 0)

                    for p in rule.style:
                        # update style declaration
                        if p not in view[element]:
                            # setProperty needs a new Property object and
                            # MUST NOT reuse the existing Property
                            # which would be the same for all elements!
                            # see Issue #23
                            view[element].setProperty(p.name, p.value, p.priority)
                            specificities[element][p.name] = selector.specificity
                            self.log(2, view[element].getProperty('color'))
                        else:
                            self.log(2, view[element].getProperty('color'))
                            sameprio = (p.priority ==
                                        view[element].getPropertyPriority(p.name))
                            if not sameprio and bool(p.priority) or (
                               sameprio and selector.specificity >=
                                    specificities[element][p.name]):
                                # later, more specific or higher prio
                                view[element].setProperty(p.name, p.value, p.priority)

        _unmergable_css = _unmergable_rules.cssText
        if _unmergable_css:
            e = etree.Element('style')
            # print __name__, _unmergable_css.__repr__()
            e.text = to_unicode(_unmergable_css, 'utf-8')
            body = document.find('body')
            if body is None:  # an element with no children is falsy, so avoid `or`
                body = document
            body.insert(0, e)  # add <style> right into body

        return view
Example #24
def cssselect(expr, tree):
    return CSSSelector(expr)(tree)
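
For example:

import lxml.html

tree = lxml.html.fromstring('<ul><li>a</li><li>b</li></ul>')
print([li.text for li in cssselect('li', tree)])  # -> ['a', 'b']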
Example #25
def Parse(reading):
    result = { "url": reading["url"] }
    text = reading["text"]

    text = re.sub("<p<", "", text)   # this error too severe for parser to handle
    doc = lxml.html.parse(StringIO(text))
    root = doc.getroot()
    #body = h.find(".//body")
    maindiv = CSSSelector("#divMiddleLeftCentreBottomRight")(root)[0]    

    heading = CSSSelector("#divHeading h1")(maindiv)[0].text
    intro = CSSSelector("#divIntroduction h2")(maindiv)[0]
    h2 = lxml.etree.tounicode(intro)
    #print [heading, h2]
    
    mheading = re.match(u"([\w\s\-']*?)\s*(?:\u2013\s*(?:PPC for (.*?)$)?|$)", heading)
    result["name"] = mheading.group(1)

    mmpfor = re.search(u'(?:<br\s*/>)?\s*MP for (.*?)\s*<br\s*/>', h2)
    if mmpfor:
        result["MP for"] = mmpfor.group(1)
        result["MP for"] = result["MP for"]   # needs to be regularized for the 2005 boundaries

        
    mcandidate = re.search(u'Liberal Democrat candidate for <a href="in_your_area_detail.aspx.*?">(.*?)</a>', h2)
    if mcandidate:
        result["constituency"] = RegularizeConstituency(mcandidate.group(1))
    elif mheading.group(2):
        result["constituency"] = RegularizeConstituency(mheading.group(2))
    elif "MP for" in result:
        result["constituency"] = RegularizeConstituency(result["MP for"])
    else:
        assert False, (h2, heading)
    
    divImage = maindiv.cssselect("#divIntroduction a img")
    if divImage:
        result["image"] = divImage[0].get("src")
        
        
    #print maindiv.cssselect("#divAboutMe h2")[0].text, "About Me"
    
    for traboutme in maindiv.cssselect("#divAboutMe tr"):
        key = traboutme.cssselect("th")[0].text[:-1]
        assert key in ["Marital Status", "Occupation", "Education"]
        value = traboutme.cssselect("td")[0].text
        if value:
            value = re.sub(u"\u2019", "'", value).strip()
            value = re.sub(u"\u2013", "-", value)
            value = re.sub("\xae", "", value)
            value = re.sub("\s*\n\s*", "; ", value)
            result[key] = value
        
    divBiography = maindiv.cssselect("#divBiography")
    if divBiography:
        result["bio"] = SimplifyHTML(divBiography[0])
        result["bio"] = re.sub("^Biography\s+", "", result["bio"])  # clean out leading title


    contacttext = lxml.etree.tounicode(maindiv.cssselect("#divIndividualContactInfo")[0])
    
    memail = re.search('<strong>Email:</strong> <a href="(?:mailto:)?(.*?)">', contacttext)
    if memail:
        result["email"] = memail.group(1)
        
    mwebsite = re.search('<strong>Website:</strong> <a href="(.*?)">', contacttext)
    if mwebsite:
        result["website"] = mwebsite.group(1)
        
    mphone = re.search('<strong>Telephone:</strong> ([\d\s]+)', contacttext)
    if mphone:
        result["phone"] = mphone.group(1).strip()
        
    address = "; ".join([ addressline.text  for addressline in maindiv.cssselect("#divIndividualContactInfo ul li") ])
    if address:
        result["address"] = address.encode("ascii", "replace")  # the database doesn't seem to be unicode.  it should be
        
    return result
Example #26
    def get_anime_info(self, obj):
        """Returns an AnimeInfoObject. A url,
        or any meta object can be passed"""
        url = ''
        if isinstance(obj, basestring):
            if obj[:1] == '/':
                url = BASE_URL[:-1] + obj
            else:
                url = obj  # Can take absolute url
        else:
            url = obj.get_url()  # Any anime meta object
        content = self.conn.scrape.get(url)
        tree = lxml.html.fromstring(content.text)
        listing = CSSSelector('.bigBarContainer')(tree)
        if listing is None or len(listing) < 2:
            return None
        pgraphs = listing[0].cssselect('p')
        if pgraphs is None or len(pgraphs) < 5:
            return None
        extra_info = []
        if len(pgraphs) == 6:  # some animes don't have an air date
            extra_info = [x.strip() for x in pgraphs[3].itertext()]
        else:
            extra_info = [x.strip() for x in pgraphs[2].itertext()]
        if len(extra_info) < 5:  # Not valid at all?
            return None
        title = listing[0].cssselect('.bigChar')[0]
        if title is None:
            title = 'N/A'
        else:
            title = title.text
        alt_names = [
            x.text.strip().encode('utf-8') for x in pgraphs[0].cssselect('a')
        ]
        tags = [
            x.text.strip().encode('utf-8') for x in pgraphs[1].cssselect('a')
        ]
        airdate = 'N/A'
        if len(pgraphs) == 6:  # Only if we have 6 <p>'s, airdate is present
            airdate = "".join([x.strip() for x in pgraphs[2].itertext()])[11:]
        status = extra_info[2]
        views = extra_info[4]
        summary = 'N/A'
        if len(pgraphs) == 6:  # Due to airdate, summary can be shifted
            summary = "".join([x.strip() for x in pgraphs[5].itertext()])
        else:
            summary = "".join([x.strip() for x in pgraphs[4].itertext()])

        ep_list = listing[1].cssselect('tr')
        ep_meta = []
        if len(ep_list) > 2:
            for i in range(2, len(ep_list)):  # First two are junk
                episode = ep_list[i]
                info = episode.cssselect('td')
                if len(info) < 2:
                    continue
                ep_name = "".join([x.strip() for x in info[0].itertext()])
                ep_url = BASE_URL[:-1]
                ep_url += info[0].cssselect('a')[0].get('href')
                ep_rel = info[1].text.strip()  # Remove whitespace
                ep_meta.append(AnimeEpisodeInfoObject(ep_name, ep_url, ep_rel))
        return AnimeEpisodeMetaObject(title, alt_names, tags, airdate, status,
                                      views, summary, ep_meta)
Example #27
File: ganji.py Project: ptphp/PyLib
    def buy(self,url):
        self.fd['city'] = self.citycode       
        self.fd['house_flag'] = 3
#        self.fd['belong']="1"
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        tree = etree.HTML(response)
        soup =BeautifulSoup(response)
        
        detail_mer = soup.find('div',{'class':'detail_mer'})
        
        # Not an individual listing; return
        if u"个人房源" not in str(detail_mer): return
        
        Dname = detail_mer.find('span',{'class':'Dname'})
        if Dname:
            self.fd['owner_name'] = Dname.string
        else:
            self.fd['owner_name'] = None
            
        ganji_phone_call_class = detail_mer.find('span',{'class':'ganji_phone_call_class'})
        
        if ganji_phone_call_class:
            self.fd['owner_phone'] = ganji_phone_call_class.contents[0]
            if str(ganji_phone_call_class).find('src='):                
                self.fd['owner_phone'] = 'http://'+urlparse(url)[1]+ganji_phone_call_class.img['src']
            else:
                self.fd['owner_phone'] = None            
        else:
            self.fd['owner_phone'] = None
            
            
        # No contact info; return
        if not self.fd['owner_phone']: return
        
        if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
            cityname=re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
            self.fd['cityname'] = cityname
        else:
            return   
        
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0 
        self.fd['house_type'] = 0
        self.fd['house_age'] = 0
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0

        
        if re.search(self.house_totalarea_regex_qiu, response):
            house_totalarea=re.search(self.house_totalarea_regex_qiu, response).group(1)
            self.fd['house_totalarea'] = house_totalarea
            self.fd['house_totalarea_max'] = house_totalarea
            self.fd['house_totalarea_min'] = house_totalarea
        else:
            self.fd['house_totalarea'] = 0
            self.fd['house_totalarea_max'] = 0
            self.fd['house_totalarea_min'] = 0
            
        if re.search(self.house_price_regex_gou, response):
            house_price_zu = re.search(self.house_price_regex_gou, response).group(1)
            house_price_zu = house_price_zu.replace('万','')
            if house_price_zu.find("以上") != -1:
                self.fd['house_price_max'] = 0
                self.fd['house_price_min'] = house_price_zu.replace('以上','')
                self.fd['house_price'] = self.fd['house_price_min']
            elif house_price_zu.find("以下") != -1:
                self.fd['house_price_max'] = house_price_zu.replace('以下','')
                self.fd['house_price_min'] = 0
                self.fd['house_price'] = self.fd['house_price_max']
            elif house_price_zu.find("-") != -1:
                self.fd['house_price_max'] = house_price_zu.split('-')[1]
                self.fd['house_price_min'] = house_price_zu.split('-')[0]
                self.fd['house_price'] = house_price_zu.split('-')[1]
            else:
                self.fd['house_price_max'] = 0
                self.fd['house_price_min'] = 0
                self.fd['house_price'] = 0
        else:
            self.fd['house_price_max'] = 0
            self.fd['house_price_min'] = 0
            self.fd['house_price'] = 0
            
        pub_time = CSSSelector('span.pub_time')(tree)
        posttime = pub_time[0].text.strip() if pub_time else None
        if posttime:
            Y=int(time.strftime('%Y', time.localtime()))
            M=int(posttime.split(' ')[0].split('-')[0])
            D=int(posttime.split(' ')[0].split('-')[1])
            s = datetime.datetime(Y,M,D,0,0)
            posttime=int(time.mktime(s.timetuple()))
            self.fd['posttime'] =posttime 
        else:
            self.fd['posttime'] =None
            
        if re.search(self.house_room_regex, response):
            house_room=re.search(self.house_room_regex, response).group(1)
            self.fd['house_room'] = house_room
        else:
            self.fd['house_room'] = '0'
            
        if re.search(self.house_hall_regex, response):
            house_hall=re.search(self.house_hall_regex, response).group(1)
            self.fd['house_hall'] = house_hall
        else:
            self.fd['house_hall'] = '0'
        
        if re.search(self.house_toilet_regex, response):
            house_toilet=re.search(self.house_toilet_regex, response).group(1)
            self.fd['house_toilet'] = house_toilet
        else:
            self.fd['house_toilet'] = '0'

        title_el = CSSSelector("div.detail_title h1")(tree)
        house_title = title_el[0].text.strip() if title_el else None
        self.fd['house_title'] = house_title
        
        # Description
        detail_box = soup.find('div',{'class':'detail_box'})
        if detail_box:
            house_desc = str(detail_box('p')[1])
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
        else:
            self.fd['house_desc'] = None

        d_i = soup.find('ul',{'class':'d_i'})
        
        # Neighborhood name
        # Handle the JS-rendered case first
        if re.search(self.xiaoqu_regex, response):
            borough_name=re.search(self.xiaoqu_regex, response).group(1)
            self.fd['borough_name'] = borough_name
            if re.search(self.address_regex, response):
                house_addr=re.search(self.address_regex, response).group(1)
                self.fd['house_addr'] = house_addr
        else:            
            if d_i.find(text="小区: "):
                borough_box = d_i.find(text="小区: ").parent        
                borough_name = borough_box.find("a")
                if borough_name:
                    self.fd['borough_name'] = borough_name.string
                else:
                    self.fd['borough_name'] = None                         
            else:
                if re.search(self.borough_name_regex_reg, response):
                    borough_name=re.search(self.borough_name_regex_reg, response).group(1)
                    self.fd['borough_name'] = borough_name
            if re.search(self.house_addr_regex_reg, response):
                house_addr=re.search(self.house_addr_regex_reg, response).group(1)
                self.fd['house_addr'] = house_addr
            else:
                self.fd['house_addr'] = ''
            
        # District
        area_box = d_i.find(text="区域: ").parent
        area_a = area_box('a')
        if area_a and len(area_a)>1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = area_a[1].string
        elif area_a and len(area_a)==1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = None
        else:
            self.fd['cityarea'] = None
            self.fd['section'] = None
Example #28
 def find_by_css(self, selector):
     xpath = CSSSelector(selector).path
     return self.find_by_xpath(xpath,
                               original_find="css",
                               original_query=selector)
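
CSSSelector(...).path exposes the XPath expression the CSS selector compiles to, which is what gets handed to find_by_xpath; roughly (exact output varies by cssselect version):

from lxml.cssselect import CSSSelector

print(CSSSelector('div.content a').path)
# e.g. descendant-or-self::div[@class and contains(
#   concat(' ', normalize-space(@class), ' '), ' content ')]/descendant-or-self::*/a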
Example #29
File: ganji.py Project: ptphp/PyLib
    def rent(self,url):
#        self.fd['house_city'] = urlparse(url)[1].replace('.ganji.com',"")
        hc= urlparse(url)[1].replace('.ganji.com',"") 
        hc2=citynameDict_sf.get(hc)
        if hc2:
            self.fd['house_city']=hc2
        else:
            self.fd['house_city']=hc
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            raise
        tree = etree.HTML(response)
        if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
            cityname=re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
            self.fd['cityname'] = cityname
        else:
            raise
        
        self.fd['house_flag'] = 2
        self.fd['house_type'] = 6
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0 
        
        soup =BeautifulSoup(response)
        detail_mer = soup.find('div',{'class':'detail_mer'})
        
        # Not an individual listing; bail out
        if u"个人房源" not in str(detail_mer): raise
        
        Dname = detail_mer.find('span',{'class':'Dname'})
        if Dname:
            self.fd['owner_name'] = str(Dname.string)
        else:
            self.fd['owner_name'] = ""
            
        ganji_phone_call_class = detail_mer.find('span',{'class':'ganji_phone_call_class'})
        
        if ganji_phone_call_class:
            self.fd['owner_phone_pic'] = ganji_phone_call_class.contents[0]
            if str(ganji_phone_call_class).find('src='):                
                self.fd['owner_phone_pic'] = 'http://'+urlparse(url)[1]+ganji_phone_call_class.img['src']
            else:
                self.fd['owner_phone_pic'] = None            
        else:
            self.fd['owner_phone_pic'] = None
            
            
        # No contact info; bail out
        if not self.fd['owner_phone_pic']: raise
        
        if re.search(self.house_totalarea_regex, response):
            house_totalarea=re.search(self.house_totalarea_regex, response).group(1)
            self.fd['house_area'] = house_totalarea
        else:
            self.fd['house_area'] = None
        
        if re.search(self.house_price_regex_2, response):
            house_price=re.search(self.house_price_regex_2, response).group(1)
            if house_price=="面议":
                house_price=0
            self.fd['house_price'] = int(house_price)
        else:
            self.fd['house_price'] = 0
    #    house_price=tree.xpath("/html/body/div[2]/div/div/ul/li/span") and tree.xpath("/html/body/div[2]/div/div/ul/li/span")[0].text.strip() or None    
    #    v['house_price'] = house_price
        
        pub_time = CSSSelector('span.pub_time')(tree)
        posttime = pub_time[0].text.strip() if pub_time else None
        if posttime:
            Y=int(time.strftime('%Y', time.localtime()))
            M=int(posttime.split(' ')[0].split('-')[0])
            D=int(posttime.split(' ')[0].split('-')[1])
            H=int(time.strftime('%H',time.localtime(time.time())))
            Min=int(time.strftime('%M',time.localtime(time.time())))
            s = datetime.datetime(Y,M,D,H,Min)
            posttime=str(int(time.mktime(s.timetuple())))
            self.fd['house_posttime'] =posttime 
        else:
            s=time.localtime(time.time())
            self.fd['house_posttime'] =str(int(time.mktime(s)))
            
        title_el = CSSSelector("div.detail_title h1")(tree)
        house_title = title_el[0].text.strip() if title_el else ""
        self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")
        

        if re.search(self.house_room_regex, response):
            house_room=re.search(self.house_room_regex, response).group(1)
            self.fd['house_room'] = house_room
        else:
            self.fd['house_room'] = 0
            
        if re.search(self.house_hall_regex, response):
            house_hall=re.search(self.house_hall_regex, response).group(1)
            self.fd['house_hall'] = house_hall
        else:
            self.fd['house_hall'] = 0
        
        if re.search(self.house_toilet_regex, response):
            house_toilet=re.search(self.house_toilet_regex, response).group(1)
            self.fd['house_toilet'] = house_toilet
        else:
            self.fd['house_toilet'] = 0
            
        if re.search(self.house_veranda_regex, response):
            house_veranda=re.search(self.house_veranda_regex, response).group(1)
            self.fd['house_veranda'] = house_veranda
        else:
            self.fd['house_veranda'] = 0
            
            
        if re.search(self.house_floor_regex, response):
            house_floor=re.search(self.house_floor_regex, response).group(1)
            house_topfloor=re.search(self.house_floor_regex, response).group(2)
            self.fd['house_floor']    = int(house_floor)
            self.fd['house_topfloor'] = int(house_topfloor)
        else:
            self.fd['house_floor'] = 0
            self.fd['house_topfloor'] = 0
          
            
        
        # Description
        detail_box = soup.find('div',{'class':'detail_box'})
        if detail_box:
            house_desc = str(detail_box('p')[1])
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
        else:
            self.fd['house_desc'] = None

        d_i = soup.find('ul',{'class':'d_i'})        
        # Neighborhood name
        # Handle the JS-rendered case first
        if re.search(self.xiaoqu_regex, response):
            borough_name=re.search(self.xiaoqu_regex, response).group(1)
            self.fd['borough_name'] = borough_name
            if re.search(self.address_regex, response):
                house_addr=re.search(self.address_regex, response).group(1)
                self.fd['house_addr'] = house_addr
        else:            
            if d_i.find(text="小区: "):
                borough_box = d_i.find(text="小区: ").parent        
                borough_name = borough_box.find("a")
                if borough_name:
                    self.fd['borough_name'] = borough_name.string
                else:
                    self.fd['borough_name'] = None            
                # Address
                if borough_name and borough_name.nextSibling:
                    house_addr = borough_name.nextSibling.string
                    self.fd['house_addr'] = re.sub("\(|\)| ","",house_addr)
                else:
                    self.fd['house_addr'] = None
            else:
                if re.search(self.borough_name_regex, response):
                    borough_name=re.search(self.borough_name_regex, response).group(1)
                    self.fd['borough_name'] = re.sub("\(.*\)| ","",borough_name)
            
        # District
        area_box = d_i.find(text="区域: ").parent
        area_a = area_box('a')
        if area_a and len(area_a)>1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif area_a and len(area_a)==1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""
        
        if re.search(self.house_age_regex, response):
            house_age=re.search(self.house_age_regex, response).group(1)
            Y=int(time.strftime('%Y', time.localtime()))
            house_age=Y-int(house_age)
            self.fd['house_age'] = house_age
        else:
            self.fd['house_age'] = 0
            
        # Orientation (facing direction)
        if re.search(self.house_toward_regex, response):
            house_toward=re.search(self.house_toward_regex, response).group(1)
            self.fd['house_toward'] = toward(house_toward)
        else:
            self.fd['house_toward'] = 0        
            
        if re.search(self.house_fitment_regex, response):
            house_fitment=re.search(self.house_fitment_regex, response).group(1)
            self.fd['house_fitment'] = fitment(house_fitment)
        else:
            self.fd['house_fitment'] = 2
            
        if re.search(self.house_deposit_regex, response):
            house_deposit=re.search(self.house_deposit_regex, response).group(1)
            self.fd['house_deposit'] = deposit(house_deposit)
        else:
            self.fd['house_deposit'] = None
        # drop references so the parsed objects can be garbage-collected
        request = response = soup = tree = None
Ejemplo n.º 30
0
import sys
import urllib
import urllib2

import lxml.etree
from lxml.cssselect import CSSSelector
from BeautifulSoup import BeautifulSoup

if len(sys.argv) < 2:
    print >>sys.stderr, 'usage: weather.py CITY, STATE'
    exit(2)

data = urllib.urlencode({'inputstring': ' '.join(sys.argv[1:])})
info = urllib2.urlopen('http://forecast.weather.gov/zipcity.php', data)
content = info.read()

# Solution #1
parser = lxml.etree.HTMLParser(encoding='utf-8')
tree = lxml.etree.fromstring(content, parser)
big = CSSSelector('td.big')(tree)[0]
if big.find('font') is not None:
    big = big.find('font')
print 'Condition:', big.text.strip()
print 'Temperature:', big.findall('br')[1].tail
tr = tree.xpath('.//td[b="Humidity"]')[0].getparent()
print 'Humidity:', tr.findall('td')[1].text
print

# Solution #2
soup = BeautifulSoup(content)  # doctest: +SKIP
big = soup.find('td', 'big')
if big.font is not None:
    big = big.font
print 'Condition:', big.contents[0].string.strip()
temp = big.contents[3].string or big.contents[4].string  # can be either
Ejemplo n.º 31
0
Archivo: ganji.py Proyecto: ptphp/PyLib
    def require(self,url):
        hc= urlparse(url)[1].replace('.ganji.com',"") 
        hc2=citynameDict_sf.get(hc)
        if hc2:
            self.fd['house_city']=hc2
        else:
            self.fd['house_city']=hc
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd = {}
            raise Exception("listing not parseable")
        tree = etree.HTML(response)
        if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
            cityname=re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
            self.fd['cityname'] = cityname
        else:
            raise
        
        self.fd['house_flag'] = 4
        self.fd['house_type'] = 6
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0 
        self.fd['house_area']=0
        self.fd['house_age'] = 0
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0
        self.fd['house_deposit'] = 0
#        self.fd['house_totalarea_max'] = 0
#        self.fd['house_totalarea_min'] = 0
        
        soup =BeautifulSoup(response)
        detail_mer = soup.find('div',{'class':'detail_mer'})
        
        # Not a private (owner-posted) listing: bail out
        if u"个人房源" not in str(detail_mer):
            raise Exception("not a private listing")
        
        Dname = detail_mer.find('span',{'class':'Dname'})
        if Dname:
            self.fd['owner_name'] = Dname.string
        else:
            self.fd['owner_name'] = None
            
        ganji_phone_call_class = detail_mer.find('span',{'class':'ganji_phone_call_class'})
        
        if ganji_phone_call_class:
            # .find() returns -1 (truthy) when absent, so test membership instead
            if 'src=' in str(ganji_phone_call_class):
                self.fd['owner_phone_pic'] = 'http://'+urlparse(url)[1]+ganji_phone_call_class.img['src']
            else:
                self.fd['owner_phone_pic'] = None
        else:
            self.fd['owner_phone_pic'] = None
            
            
        # No contact info: bail out
        if not self.fd['owner_phone_pic']:
            raise Exception("no contact info")
        
        if re.search(self.house_price_regex_zu, response):
            house_price_zu = re.search(self.house_price_regex_zu, response).group(1)
            house_price_zu = house_price_zu.replace('元/月','')
            if house_price_zu.find("以上") != -1:
                self.fd['house_price_max'] = 0
                self.fd['house_price'] = int(house_price_zu.replace('以上',''))
            elif house_price_zu.find("以下") != -1:
                self.fd['house_price_max'] = int(house_price_zu.replace('以下',''))
                self.fd['house_price'] = 0
            elif house_price_zu.find("-") != -1:
                self.fd['house_price_max'] = int(house_price_zu.split('-')[1])
                self.fd['house_price'] = int(house_price_zu.split('-')[0])
            else:
                self.fd['house_price_max'] = 0
                self.fd['house_price'] = 0
        else:
            self.fd['house_price_max'] = 0
            self.fd['house_price'] = 0
        
        pub_time = CSSSelector('span.pub_time')(tree)
        posttime = pub_time[0].text.strip() if pub_time else None
        if posttime:
            Y=int(time.strftime('%Y', time.localtime()))
            M=int(posttime.split(' ')[0].split('-')[0])
            D=int(posttime.split(' ')[0].split('-')[1])
            H=int(time.strftime('%H',time.localtime(time.time())))
            Min=int(time.strftime('%M',time.localtime(time.time())))
            s = datetime.datetime(Y,M,D,H,Min)
            posttime=str(int(time.mktime(s.timetuple())))
            self.fd['house_posttime'] =posttime 
        else:
            s=time.localtime(time.time())
            self.fd['house_posttime'] =str(int(time.mktime(s)))
            
        title_elems = CSSSelector("div.detail_title h1")(tree)
        house_title = title_elems[0].text.strip() if title_elems else ""
        # strip the listing-type suffixes: （求购）/（求租）/（出售）
        self.fd['house_title'] = house_title.replace("（求购）","").replace("（求租）","").replace("（出售）","")
        

        if re.search(self.house_room_regex, response):
            house_room=re.search(self.house_room_regex, response).group(1)
            self.fd['house_room'] = int(house_room)
        else:
            self.fd['house_room'] = 0
            
        if re.search(self.house_hall_regex, response):
            house_hall=re.search(self.house_hall_regex, response).group(1)
            self.fd['house_hall'] = int(house_hall)
        else:
            self.fd['house_hall'] = 0
        
        if re.search(self.house_toilet_regex, response):
            house_toilet=re.search(self.house_toilet_regex, response).group(1)
            self.fd['house_toilet'] = int(house_toilet)
        else:
            self.fd['house_toilet'] = 0

        if re.search(self.house_veranda_regex, response):
            house_veranda=re.search(self.house_veranda_regex, response).group(1)
            self.fd['house_veranda'] = int(house_veranda)
        else:
            self.fd['house_veranda'] = 0

        
        # Description
        detail_box = soup.find('div',{'class':'detail_box'})
        if detail_box:
            house_desc = str(detail_box('p')[1])
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
        else:
            self.fd['house_desc'] = ""

        d_i = soup.find('ul',{'class':'d_i'})        
        # Neighborhood (xiaoqu) name
        # Handle the JS-embedded variant first
        if re.search(self.xiaoqu_regex, response):
            borough_name=re.search(self.xiaoqu_regex, response).group(1)
            self.fd['borough_name'] = borough_name
            if re.search(self.address_regex, response):
                house_addr=re.search(self.address_regex, response).group(1)
                self.fd['house_addr'] = house_addr
        else:            
            if re.search(self.borough_name_regex_reg, response):
                borough_name=re.search(self.borough_name_regex_reg, response).group(1)
                self.fd['borough_name'] = borough_name
            if re.search(self.house_addr_regex_reg, response):
                house_addr=re.search(self.house_addr_regex_reg, response).group(1)
                self.fd['house_addr'] = house_addr
            else:
                self.fd['house_addr'] = ''
                
            
        # District / area
        area_box = d_i.find(text="区域: ").parent
        area_a = area_box('a')
        if area_a and len(area_a)>1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif area_a and len(area_a)==1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""
        # drop references so the parsed objects can be garbage-collected
        request = response = soup = tree = None
Ejemplo n.º 32
0
import sys

import requests
import lxml.html
from lxml.cssselect import CSSSelector


# get page
url = sys.argv[1]
page = requests.get(url).text
page = page.replace('\xa0', ' ')
tree = lxml.html.fromstring(page)


# get title
title_tag = CSSSelector('div#main h1')(tree)[0]
title = title_tag.text_content()
fb2 = title.find(' (fb2)')
if fb2 != -1:
    title = title[:fb2]


# get text
text_tag = CSSSelector('div#main div._ga1_on_')(tree)[0]
text = text_tag.text_content().strip()


# get refs
ref_sup_tags = CSSSelector('sup')(text_tag)
ref_tags = [CSSSelector('a')(ref_sup_tag)[1] for ref_sup_tag in ref_sup_tags]
refs = [ref_tag.get('title').strip() for ref_tag in ref_tags]
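
# To finish the script off, the extracted pieces can simply be printed;
# a minimal sketch reusing the variables defined above:
print(title)
print(text[:200])  # first 200 characters of the article body
for i, ref in enumerate(refs, 1):
    print(i, ref)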
Ejemplo n.º 33
0
 def autocomplete_input(self, et):
     return CSSSelector('input.autocomplete')(et)[0]
Ejemplo n.º 34
0
def _parse_html_for_translation(html):
    """
    This function breaks down anchors and strips them into two divs. This will show up as two strings on transifex.
    :param html:
    :return:
    """
    p = re.compile(r'<.*?>')
    if p.findall(html):
        html = unicode(BeautifulSoup(html).prettify())
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)
        a = CSSSelector('a')
        translatable_a = CSSSelector('a.translatable')
        img = CSSSelector('img:not(.image-translatable)')

        # Translatable anchors are split into text and links
        anchors = translatable_a(tree.getroot())
        logger.info(str(anchors))

        for anchor in anchors:
            attributes = [("data-a-{}".format(k), v)
                          for k, v in dict(anchor.attrib).iteritems()]
            div = etree.Element('div')

            content = etree.parse(
                StringIO("<div class=\"text\">{}</div>".format(
                    stringify_children(anchor)))).getroot()
            href_format = """<div class=\"href\">{}</div>"""
            href_html = fix_html_fragment(
                href_format.format(anchor.attrib['href']))

            link = etree.parse(StringIO(href_html)).getroot()

            for k, v in attributes:
                div.attrib[k] = v

            div.attrib['class'] = 'former-anchor-translatable'
            div.append(content)
            div.append(link)

            swap_element(div, anchor)

        # Anchors are just the text
        anchors = a(tree.getroot())
        for anchor in anchors:
            attributes = [("data-a-{}".format(k), v)
                          for k, v in dict(anchor.attrib).iteritems()]

            anchor_format = "<div class=\"former-anchor\">{}</div>"
            anchor_html = fix_html_fragment(
                anchor_format.format(stringify_children(anchor)))

            div = etree.parse(StringIO(anchor_html)).getroot()

            for k, v in attributes:
                div.attrib[k] = v

            swap_element(div, anchor)

        # Images are just copies of the attributes
        images = img(tree.getroot())
        for image in images:
            div = etree.Element('div')
            attributes = [("data-img-{}".format(k), v)
                          for k, v in dict(image.attrib).iteritems()]

            for k, v in attributes:
                div.attrib[k] = v
            div.attrib['class'] = 'former-image'

            swap_element(div, image)
        html = etree.tostring(tree)

    # Chicken coop de grass
    # Massive regex that takes in phone numbers and puts them in divs
    # only to be post-processed below and disappear from the translations
    p = re.compile(
        r'((?:\+\s*)*\d+(?:\s+\(*\d+\)*)*\d+(?:\s+\d+\(*\)*)+|\d+(?:\s+\d+)+|00\d+(?:\s+\d+)+)'
    )
    html = p.sub(r'<div class="former-tel">\g<1></div>', html)

    soup = BeautifulSoup(html)
    for div in soup.find_all('div'):
        tag_format = None
        while div.parent and div.parent.name in [
                'b', 'em', 'i', 'strong', 'u'
        ]:
            if div.parent.name == "b":
                div.parent.unwrap()
                tag_format = "<b>{}</b>"
            if div.parent.name == "strong":
                div.parent.unwrap()
                tag_format = "<strong>{}</strong>"
            if div.parent.name == "em":
                div.parent.unwrap()
                tag_format = "<em>{}</em>"
            if div.parent.name == "i":
                div.parent.unwrap()
                tag_format = "<i>{}</i>"
            if div.parent.name == "u":
                div.parent.unwrap()
                tag_format = "<u>{}</u>"

            if tag_format:
                children = "".join([unicode(c) for c in div.contents])
                div.clear()

                child_soup = BeautifulSoup(tag_format.format(children))
                if child_soup.body:
                    child_frag = child_soup.body.next
                elif child_soup.html:
                    child_frag = child_soup.html.next
                else:
                    child_frag = child_soup
                div.append(child_frag)

    for n in soup.select('u, b, i, em, strong'):
        if not n.text.strip():
            n.extract()

    for tel in soup.select('div.former-tel'):
        number = tel.text
        classes = ['former-tel']
        if tel.select('b'):
            classes.append('has-b')
        if tel.select('em'):
            classes.append('has-em')
        if tel.select('strong'):
            classes.append('has-strong')
        if tel.select('i'):
            classes.append('has-i')
        if tel.select('u'):
            classes.append('has-u')

        tel.attrs['data-tel-number'] = number
        tel.attrs['class'] = classes
        tel.clear()

    return soup.prettify()
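
# As a quick sanity check of the phone-number regex used above, here it is
# run against an invented sample string (the number is made up):
import re

p = re.compile(
    r'((?:\+\s*)*\d+(?:\s+\(*\d+\)*)*\d+(?:\s+\d+\(*\)*)+|\d+(?:\s+\d+)+|00\d+(?:\s+\d+)+)'
)
sample = "Call us on +44 20 7946 0000 for details"
print(p.sub(r'<div class="former-tel">\g<1></div>', sample))
# -> Call us on <div class="former-tel">+44 20 7946 0000</div> for details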
Ejemplo n.º 35
0
def main():
	global COUNT

	site_url = "http://www.sanskritlibrary.org/"
	seed_url = "http://www.sanskritlibrary.org/textsList.html"
	titus_url = "http://titus.uni-frankfurt.de"
	p = Page(seed_url)

	a_tags = CSSSelector('a')
	div_tags = CSSSelector('div')
	span_tags = CSSSelector('span')
	body_tags = CSSSelector('body')

	div = [e for e in div_tags(p.dom) if e.get("class")=="text"]
	div = div[0]
	links = [site_url + i.get("href") for i in div.getchildren() if i.tag=='a']
	print "Links of texts:", len(links)
	source_links = list()


	#Creating list of links
	for l in links:
		lpage = Page(l)
		slinks = [i.get("href") for i in a_tags(lpage.dom) if i.get("target")=="source"]
		source_links += slinks

	
	print "Links of sources:", len(source_links) #134
	source_links = list(set(source_links))
	print "Unique links of sources:", len(source_links) #94

	#Considering only ramayana and mahabharat links
	source_links = [i for i in source_links if ("/mbh" in i or "/ram" in i)]
	pp.pprint(source_links)

	b = p.selenium_load()
	for link in source_links:
		lp = link

		print "SOURCE_LINK",link
		while lp:
			try:
				b.get(lp)
				sleep(0.25)

				b.switch_to_frame(b.find_elements_by_tag_name("frame")[0])
				bdom=html.fromstring(b.page_source, parser=html.HTMLParser(encoding='utf-8'))
				bt = body_tags(bdom)
				if len(bt)==0:
					print "No body tag for " + lp
					lp = None  # don't retry the same frame forever
					continue
				body = bt[0]
				f = open("download/" + lp[lp.rfind("/")+1:]+".txt", 'w')
				f.write(body.text_content().encode('utf-8'))
				f.close()
				print "File no. " + str(COUNT) + " created"
				COUNT += 1
				anchors = a_tags(bdom)

				lp = None


				for i in range(len(anchors)-1, max(0, len(anchors)-5), -1):
				
					if len(anchors[i].getchildren())==1 and anchors[i].getchildren()[0].tag=="img" and "arribar" in anchors[i].getchildren()[0].get("src"):
						href = anchors[i].get("href")
						lp = titus_url+href
						print i, len(anchors)-i
						print "New frame:", lp
						break
			except:
				lp = None
Ejemplo n.º 36
0
def select_all(tree, expr):
  sel = CSSSelector(expr)
  return sel(tree)
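
# A small usage sketch, assuming select_all from above is in scope (the HTML
# fragment is invented). Note that select_all compiles the selector on every
# call; when the same expression is reused, it is cheaper to build the
# CSSSelector once and keep it around, as several other examples here do.
import lxml.html
from lxml.cssselect import CSSSelector

tree = lxml.html.fromstring('<ul><li class="x">a</li><li>b</li></ul>')
for li in select_all(tree, 'li.x'):
    print(li.text)  # -> a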
Ejemplo n.º 37
0
class GoogleNews:
    SELECTOR = {
        "title": CSSSelector(".l.lLrAF"),
        "summary": CSSSelector("div.st"),
        "date": CSSSelector(".f.nsa.fwzPFf"),
        "source": CSSSelector(".xQ82C.e8fRJf"),
        "source_url": CSSSelector(".top.NQHJEb.dfhHve"),
        "image_url": CSSSelector("img.th.BbeB2d")
    }

    def __init__(self, title, summary, date, source, source_url, image_url):
        self._title = title
        self._summary = summary
        self._date = date
        self._source = source
        self._source_url = source_url
        self._image_url = image_url

    @classmethod
    def from_source(cls, news_source):
        return cls(
            title=cls._get_element_content(news_source, "title"),
            summary=cls._get_element_content(news_source, "summary"),
            date=cls._get_element_content(news_source, "date"),
            source=cls._get_element_content(news_source, "source"),
            source_url=cls._get_element_attribute(news_source, "source_url", "href"),
            image_url=cls._get_element_attribute(news_source, "image_url", "src")
        )

    @staticmethod
    def _get_element_content(source, element_name):
        selected_area = GoogleNews._select_area(source, element_name)

        if selected_area is not None:
            return selected_area.text_content()

        return None

    @staticmethod
    def _get_element_attribute(source, element_name, attribute_name):
        selected_area = GoogleNews._select_area(source, element_name)

        if selected_area is not None:
            return selected_area.get(attribute_name)

        return None

    @staticmethod
    def _select_area(source, selector_key):
        selected_area = GoogleNews.SELECTOR[selector_key](source)

        # if len(selected_area) > 1:
        #     # raise GoogleNewsError("More than one selections match to current criteria")
        #     return ""
        if len(selected_area) < 1:
            return None
            # raise GoogleNewsError("No selection match to current criteria")

        return selected_area.pop()

    def display(self):
        print(f"Date: {self._date}")
        print(self._title)
        print(self._summary)
        print(self._source)
        print(self._source_url)

    def as_json(self):
        json_representation = {
            "title": self._title,
            "summary": self._summary,
            "date": self._date,
            "source": self._source,
            "source_url": self._source_url,
            "image_url": self._image_url
        }

        return json_representation
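
# A usage sketch: the selectors above are Google's obfuscated class names
# (which change often), so this hand-built fragment just mimics a few of them
# for illustration.
import lxml.html

fragment = lxml.html.fromstring(
    '<div>'
    '<a class="l lLrAF">Example headline</a>'
    '<div class="st">Example summary</div>'
    '<span class="f nsa fwzPFf">2019-01-01</span>'
    '</div>'
)
news = GoogleNews.from_source(fragment)
print(news.as_json())  # source/source_url/image_url come back as None here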
Ejemplo n.º 38
0
 def _select(self, selector_str):
     """ use css selector string to query corresponding etree elements """
     sel = CSSSelector(selector_str)
     return (e for e in sel(self._tree.getroot()))
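
# The method is just a compiled selector wrapped in a generator; a standalone
# equivalent of what it does:
from io import StringIO
from lxml import etree
from lxml.cssselect import CSSSelector

tree = etree.parse(StringIO('<div><p class="x">hi</p></div>'))
gen = (e for e in CSSSelector('p.x')(tree.getroot()))
for el in gen:
    print(el.text)  # -> hi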
Ejemplo n.º 39
0
class GoogleSearch:
    BASE_URL = "https://www.google.com/search"
    BASE_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Accept-Language": "pl,en-US;q=0.7,en;q=0.3"
    }

    SELECTORS = {
        "news": CSSSelector("div.g"),
        "unsuccessful_search": CSSSelector("div.mnr-c")
    }

    @classmethod
    def get_news_as_json(cls, query, start_date, end_date, limit=10):
        returned_news = cls.get_news(
            query=query,
            start_date=start_date,
            end_date=end_date,
            limit=limit
        )

        return json.dumps([news.as_json() for news in returned_news])

    @classmethod
    def get_news(cls, query, start_date, end_date, limit=10):
        """
        Performs Google News search with given criteria.
        :param query: Main query to search for (content of search box)
        :param start_date: (Optional) start day of the search
        :param end_date: (Optional) end day of the search
        :param limit: (default=10) max number of news to return
        :return: list of GoogleNews
        """
        parsed_news = list()
        page_index = 0

        while True:
            raw_search_page = GoogleSearch._get_search_page(
                query=query,
                search_type="nws",
                start_date=start_date,
                end_date=end_date,
                page_index=page_index
            )

            if not GoogleSearch.page_search_successful(raw_search_page):
                return parsed_news

            raw_all_news_on_page = GoogleSearch.SELECTORS["news"](raw_search_page)

            for raw_news in raw_all_news_on_page:
                if len(parsed_news) >= limit:
                    return parsed_news

                parsed_news.append(GoogleNews.from_source(raw_news))

            page_index += 1

    @staticmethod
    def page_search_successful(raw_search_page):
        """
        Checks if given page contains valid search results
        :param raw_search_page: parsed html page
        :return: True if results are valid; False otherwise
        """
        selection = GoogleSearch.SELECTORS["unsuccessful_search"](raw_search_page)

        return not len(selection)

    @classmethod
    def get_images(cls):
        raise NotImplementedError()

    @staticmethod
    def _get_search_page(query, search_type, start_date, end_date, page_index):
        custom_date_range = f"cdr:1,cd_min:{start_date},cd_max:{end_date}"

        payload = {
            "q": query,
            "tbs": custom_date_range,
            "tbm": search_type,
            "start": page_index * 10
        }

        response = requests.get(
            GoogleSearch.BASE_URL,
            params=payload,
            headers=GoogleSearch.BASE_HEADERS
        )

        if response.status_code != 200:
            raise GoogleSearchError(f"Response status code was {response.status_code}")

        return html.fromstring(response.text)
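
# A usage sketch; the cdr date format below (m/d/yyyy, as in Google's own
# date-range picker) is an assumption, and live scraping like this tends to
# break whenever Google changes its markup.
if __name__ == "__main__":
    for article in GoogleSearch.get_news(
            query="python",
            start_date="1/1/2019",   # assumed m/d/yyyy, per Google's UI
            end_date="1/31/2019",
            limit=5):
        article.display()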
Ejemplo n.º 40
0
import re
import urllib.parse
import time
import random
import unicodedata

from lxml.cssselect import CSSSelector

PATTERN = re.compile(r"^\s*(\d+)\s+(.+?)$", re.UNICODE)
FN_VERB_LIST = "./all-verbs-count.txt"
VERB_LIST = []
# with open(FN_VERB_LIST, "r") as fh:
#     VERB_LIST = [item.rstrip() for item in fh.readlines()]
with open(FN_VERB_LIST, "rb") as fh:
    VERB_LIST = [item.decode("utf-8").rstrip() for item in fh.readlines()]
LENGTH = len(VERB_LIST)
TRANSLATE_STUB = "http://www.spanishdict.com/translate/"
BIG_DICT = {"verbs": {}}
MISMATCH_CSS = CSSSelector(".mismatch")

TEST1 = re.compile(r"represents different", re.UNICODE)
GET_INF1 = re.compile(r"\*\*.+?\*\* represents .+? \*\*(.+?)\*\*", re.UNICODE)

TEST2 = re.compile(r"\*\*.+?\*\* is the", re.UNICODE)
GET_INF2 = re.compile(
    r"\*\*.+?\*\* is the (\w+) form of \*\*(.+?)\*\* in the (\w+ \w+) (\w+)",
    re.UNICODE)

BK_REGEX = re.compile(r'<div variation-type="mismatch-verb.+?>(.+?)</div>',
                      re.UNICODE)


def parse_line(line):
    mat = PATTERN.search(line)
Ejemplo n.º 41
0
 def _modify_nodes_inplace(root: etree._Element, css_selector_str: str,
                           fn: Callable):
     sel = CSSSelector(css_selector_str, translator='html')
     for w in sel(root):
         fn(w)
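
# A sketch of the callback pattern, assuming the helper above is in scope as a
# plain function; here the callback strips dir attributes from every paragraph.
from lxml import etree

root = etree.HTML('<p dir="ltr">one</p><p dir="rtl">two</p>')

def drop_dir(el):
    el.attrib.pop('dir', None)

_modify_nodes_inplace(root, 'p', drop_dir)
print(etree.tostring(root))  # the dir attributes are gone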
Ejemplo n.º 42
0
    def check_css(self, html, *selectors):
        '''
        Checks if a series of CSS selectors are present in the HTML. For example:

            self.check_css(html,
                ('h1', 'Admin'),              # first H1 is Admin
                ('h1', 'is', 'Admin'),        # first H1 has text Admin
                ('img', 'is', {'src': 'X'}),  # first img has src="X"
                ('h1', 1, 'X'),               # second H1 should have text X
                ('h1', -1, 'X'),              # last H1 should have text X
                ('h1', 'has', 'X'),           # any H1 should have text X
                ('h1', 'all', 'X'),           # all H1 should have text X
            )
        '''
        import re
        import lxml.html
        import six
        from lxml.cssselect import CSSSelector
        from nose.tools import ok_
        tree = lxml.html.fromstring(html)

        for selector in selectors:
            if len(selector) == 2:
                (css, val), how = selector, 'is'
            elif len(selector) == 3:
                css, how, val = selector
            else:
                raise ValueError('Selector %s must be a (css, how, val) triple' % selector)
            # Check all matching nodes. At least one node must exist
            nodes = CSSSelector(css)(tree)
            ok_(len(nodes) > 0, 'CSS %s missing' % css)

            # val must be a dict. Convert text values to dict. Raise error for rest
            if isinstance(val, six.string_types):
                val = {'@text': val}
            elif not isinstance(val, dict):
                raise ValueError('CSS %s has invalid value %s' % (css, val))

            for attr, v in val.items():
                if attr == '@text':
                    actuals = [node.text for node in nodes]
                else:
                    actuals = [node.get(attr, None) for node in nodes]

                # Try substring search. Else try regexp search
                regex = re.compile(v)
                match = lambda x: x is not None and (v in x or regex.search(x))  # noqa

                # First or specified selector should match v
                if how == 'is' or isinstance(how, int):
                    actual = actuals[0 if how == 'is' else how]
                    if not match(actual):
                        self.fail('CSS %s@%s = %s != %s' % (css, attr, actual, v))
                # Any selector should match v
                elif how in {'has', 'any'}:
                    if not any(match(actual) for actual in actuals):
                        self.fail('CSS %s@%s has no %s' % (css, attr, v))
                # All selectors should match v
                elif how == 'all':
                    if not all(match(actual) for actual in actuals):
                        self.fail('CSS %s@%s is not all %s' % (css, attr, v))
                else:
                    raise ValueError('CSS %s: invalid how: "%s"' % (css, how))
        return tree
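
# Called from a test case that mixes in this helper, usage looks like the
# following sketch (the HTML is invented):
html = '<h1>Admin</h1><h1>Other</h1><img src="X">'
self.check_css(
    html,
    ('h1', 'Admin'),              # first H1 is Admin
    ('h1', -1, 'Other'),          # last H1 is Other
    ('img', 'is', {'src': 'X'}),  # first img has src="X"
)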
Ejemplo n.º 43
0
    def handle(self, *args, **options):
        if not args:
            return

        page_id, = args

        parser = etree.HTMLParser()
        selector = CSSSelector('body')

        # content = selector(tree.getroot())
        dict_list = []

        page = Page.objects.get(id=page_id)
        page = page.get_draft_object()
        for placeholder in page.get_placeholders():
            for plugin in placeholder.get_plugins('en'):
                instance, t = plugin.get_plugin_instance()
                typename = type(t).__name__
                if typename == 'TextPlugin':
                    tree = etree.parse(StringIO.StringIO(instance.body), parser).getroot()
                    for child in instance.get_children():
                        child_instance, child_type = child.get_plugin_instance()
                        child_type_name = type(child_type).__name__

                        img = CSSSelector('[id=plugin_obj_{}]'.format(child_instance.id))(tree)
                        if not img:
                            child.delete()
                            continue

                        img = img[0]
                        parent = img.getparent()
                        element = None

                        if child_type_name == "LinkPlugin":
                            element = etree.Element('a', attrib={
                                "target": "_blank",
                                "href": child_instance.url
                            })
                            element.text = child_instance.name
                        elif child_type_name == "CMSLinkButtonPlugin":
                            element = etree.Element('a', attrib={
                                "class": "link-button",
                                "target": "_blank",
                                "href": child_instance.url
                            })
                            element.text = child_instance.name

                        if element is not None:
                            parent.insert(parent.index(img), element)
                            parent.remove(img)

                            child.delete()


                    body = selector(tree)[0]

                    out = (body.text or '') + '\n'.join(
                        [etree.tostring(h, pretty_print=True, method="html") for h in list(body)]
                    )

                    instance.body = out
                    instance.save()
Ejemplo n.º 44
0
    def transform(self, pretty_print=True):
        """change the self.html and return it with CSS turned into style
        attributes.
        """
        if etree is None:
            return self.html

        parser = etree.HTMLParser()
        stripped = self.html.strip()
        tree = etree.fromstring(stripped, parser).getroottree()
        page = tree.getroot()
        # lxml inserts a doctype if none exists, so only include it in
        # the root if it was in the original html.
        root = tree if stripped.startswith(tree.docinfo.doctype) else page

        if page is None:
            print repr(self.html)
            raise PremailerError("Could not parse the html")
        assert page is not None

        ##
        ## style selectors
        ##

        rules = []
        index = 0

        for element in CSSSelector('style,link[rel~=stylesheet]')(page):
            # If we have a media attribute whose value is anything other than
            # 'screen', ignore the ruleset.
            media = element.attrib.get('media')
            if media and media != 'screen':
                continue

            is_style = element.tag == 'style'
            if is_style:
                css_body = element.text
            else:
                href = element.attrib.get('href')
                if not href:
                    continue
                css_body = self._load_external(href)

            these_rules, these_leftover = self._parse_style_rules(
                css_body, index)
            index += 1
            rules.extend(these_rules)

            parent_of_element = element.getparent()
            if these_leftover:
                if is_style:
                    style = element
                else:
                    style = etree.Element('style')
                    style.attrib['type'] = 'text/css'

                style.text = '\n'.join([
                    '%s {%s}' % (k, make_important(v))
                    for (k, v) in these_leftover
                ])
                if self.method == 'xml':
                    style.text = etree.CDATA(style.text)

                if not is_style:
                    element.addprevious(style)
                    parent_of_element.remove(element)

            elif not self.keep_style_tags or not is_style:
                parent_of_element.remove(element)

        if self.external_styles:
            for stylefile in self.external_styles:
                css_body = self._load_external(stylefile)
                these_rules, these_leftover = self._parse_style_rules(
                    css_body, index)
                index += 1
                rules.extend(these_rules)

        # rules is a tuple of (specificity, selector, styles), where specificity is a tuple
        # ordered such that more specific rules sort larger.
        rules.sort(key=operator.itemgetter(0))

        first_time = []
        first_time_styles = []
        for __, selector, style in rules:
            new_selector = selector
            class_ = ''
            if ':' in selector:
                new_selector, class_ = re.split(':', selector, 1)
                class_ = ':%s' % class_
            # Keep filter-type selectors untouched.
            if class_ in FILTER_PSEUDOSELECTORS:
                class_ = ''
            else:
                selector = new_selector

            sel = CSSSelector(selector)
            for item in sel(page):
                old_style = item.attrib.get('style', '')
                if item not in first_time:
                    first_time.append(item)
                    first_time_styles.append((item, old_style))
                new_style = merge_styles(old_style, style, class_)
                item.attrib['style'] = new_style
                self._style_to_basic_html_attributes(item,
                                                     new_style,
                                                     force=True)

        # Re-apply initial inline styles.
        for item, inline_style in first_time_styles:
            old_style = item.attrib.get('style', '')
            if not inline_style:
                continue
            new_style = merge_styles(old_style, inline_style, class_)
            item.attrib['style'] = new_style
            self._style_to_basic_html_attributes(item, new_style, force=True)

        if self.remove_classes:
            # now we can delete all 'class' attributes
            for item in page.xpath('//@class'):
                parent = item.getparent()
                del parent.attrib['class']

        ##
        ## URLs
        ##
        if self.base_url:
            for attr in ('href', 'src'):
                for item in page.xpath("//@%s" % attr):
                    parent = item.getparent()
                    if attr == 'href' and self.preserve_internal_links \
                           and parent.attrib[attr].startswith('#'):
                        continue
                    if not self.base_url.endswith('/'):
                        self.base_url += '/'
                    parent.attrib[attr] = urlparse.urljoin(
                        self.base_url, parent.attrib[attr].strip('/'))

        out = etree.tostring(root,
                             method=self.method,
                             pretty_print=pretty_print)
        if self.method == 'xml':
            out = _cdata_regex.sub(
                lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), out)
        if self.strip_important:
            out = _importants.sub('', out)
        return out
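
# Assuming the enclosing class follows premailer's usual interface
# (constructed with the HTML string; the class name here is a guess),
# usage looks like:
html = """<html><head>
<style>h1 { color: red }</style>
</head><body><h1>Hi!</h1></body></html>"""

p = Premailer(html)   # hypothetical constructor
print(p.transform())  # <h1 style="color:red">Hi!</h1>, with the <style> tag removed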
Ejemplo n.º 45
0
    def extract_user_node(self, block):
        """
        Step 1: Look for all css classes corresponding to possible user blocks
        Step 2: Find valid user nodes out of possible nodes (needs to contain link to user profile)
        Step 3: Once valid block identified, extract and return corresponding information
        :param block: lxml node corresponding to a block
        :return: a tuple containing:
        - the lxml node corresponding the user block inside the input block
        - the corresponding user css class
        - the link to the user's profile
        - the text inside the user block (usually the user name)
        """
        block_copy = deepcopy(block)
        block_string = str(etree.tostring(block_copy))
        # Step 1
        css_classes_list = extract_css_class(block_string).split(' ')
        user_css_class = None
        user_css_classes = []
        for css_class in css_classes_list:
            if 'user' in css_class or 'author' in css_class:
                user_css_classes.append(css_class)

        if len(user_css_classes) > 0:

            # Step 2
            # There could be possible user_nodes, we only take the one the satisfies extra criteria
            user_nodes = []
            for user_css_class in user_css_classes:
                user_nodes += list(CSSSelector('.{}'.format(user_css_class))(block_copy))
            valid_user_nodes = []  # list of tuples containing (user node, list of links in user node)
            for _user_node in user_nodes:
                links = _user_node.iterlinks()
                filtered_links = []
                # Filter links 1) on site 2) a tags 3) duplicates 4) text content not null 5) contains no dates
                # Idea is when a user is mentioned, there is always a link to his profile
                for link in links:
                    if link[0].tag == 'a':
                        if urlparse(link[2]).netloc == urlparse(self.url).netloc:
                            if link[1] not in [link[1] for link in filtered_links]:
                                if len(self.extract_text_content(link[0])) > 0:
                                    if not self.contains_date(link[0]):
                                        filtered_links.append(link)
                if len(filtered_links) > 0:
                    valid_user_nodes.append((_user_node, filtered_links))
                    break

            # Step 3
            if len(valid_user_nodes) == 0:
                user_node = None
                user_link = None
                user_text = None
            else:  # No valid user nodes
                user_node_pair = valid_user_nodes[0]  # Take the first node
                user_node = user_node_pair[0]
                filtered_links = user_node_pair[1]
                if len(filtered_links) > 1:
                    self.logger.debug('multiple user  links found')
                user_link = filtered_links[0][2]
                user_text = self.extract_text_content(filtered_links[0][0])
        else:  # No valid user nodes
            user_node = None
            user_link = None
            user_text = None
        return user_node, user_css_class, user_link, user_text
Ejemplo n.º 46
0
from concurrent import futures
import logging
from multiprocessing import cpu_count
from lxml import html
from lxml.cssselect import CSSSelector
from urlparse import urljoin
from .. import SITE_URL
from ..cache import cached_storage

TALKS_LIST_URL_FMT = "http://www.ted.com/talks/quick-list?page=%d"

_PAGINATION_INFO_SELECTOR = CSSSelector('div.pagination a:nth-last-of-type(1)')

_TALKS_URLS_SELECTOR = CSSSelector('div.quick-list__row div.title span a')

TALKS_URLS_BLACKLIST = [
    # No downloads
    'http://www.ted.com/talks/rokia_traore_sings_m_bifo.html',
    'http://www.ted.com/talks/rokia_traore_sings_kounandi.html',
    'http://www.ted.com/talks/andrew_stanton_the_clues_to_a_great_story.html',
]


def _parse_page(page_num):
    return html.parse(TALKS_LIST_URL_FMT % page_num)


def _get_num_pages():
    logging.debug('Trying to find out the number of talk list pages...')
    elements = _PAGINATION_INFO_SELECTOR(_parse_page(1))
    num_pages = int(elements[0].text_content())
Ejemplo n.º 47
0
# coding: utf-8
import lxml.html
from lxml.cssselect import CSSSelector
import requests
link = requests.get(
    'http://www.ieee.org/conferences_events/conferences/search/index.html')

html = lxml.html.fromstring(link.text)
study = CSSSelector('div.content-r-full table.nogrid-nopad tr p>a[href]')
lines = study(html)
n = 0
for line in lines:
    if n % 3 == 0:
        print "Conference name: ", line.text
        print "============="
        n += 1
    elif n % 3 == 1:
        print "Conference Date: ", line.text
        print "============="
        n += 1
    elif n % 3 == 2:
        print "Location: ", line.text
        print "============="
        n += 1
Ejemplo n.º 48
0
BASE_URL = 'https://london.hackspace.org.uk/'

cookiejar = cookielib.CookieJar()
processor = urllib2.HTTPCookieProcessor(cookiejar)
opener = urllib2.build_opener(processor)
urllib2.install_opener(opener)


def browse(url, params=None):
    if params is not None:
        params = urlencode(params)
    page = urllib2.urlopen(BASE_URL + url, params)
    return etree.HTML(page.read())


find_exception = CSSSelector('.alert-danger')

if len(sys.argv) > 1:
    print 'Checking for card... (scan card on the RFID reader attached to this computer)'

    uid = None
    while uid is None:
        try:
            with rfid.Pcsc.reader() as reader:
                for tag in reader.pn532.scan():
                    uid = tag.uid.upper()
                    break
        except rfid.NoCardException:
            pass

        time.sleep(0.1)
Ejemplo n.º 49
0
class DocumentParser(RISParser):
    """"""

    # an easy way to get there selectors is to use firefox and copy the unique selector from the dev tools
    #
    #adoption_css = CSSSelector("#rismain table.risdeco tbody tr td table.tk1 tbody tr td table.tk1 tbody tr td table tbody tr.zl12 td.text3")
    #adoption_css = CSSSelector("table.risdeco tr td table.tk1 tr td.ko1 table.tk1 tr td table tr.zl12 td.text3")
    adoption_css = CSSSelector("tr.zl12:nth-child(3) > td:nth-child(5)") # selects the td which holds status information such as "beschlossen"
    top_css = CSSSelector("tr.zl12:nth-child(3) > td:nth-child(7) > form:nth-child(1) > input:nth-child(1)") # selects the td which holds the link to the TOP with transcript
    table_css = CSSSelector(".ko1 > table:nth-child(1)") # table with info block
    attachments_css = CSSSelector("table.tk1:nth-child(23)")
    #main_css = CSSSelector("#rismain table.risdeco")

    MD5_FIELDS = ['docs', 'betreff', 'federführend']
    city = "Aachen"

    def __init__(self, url,
            tzinfo = timezone('Europe/Berlin'), 
            months = 12,
            **kwargs):
        self.utc = pytz.utc
        self.tzinfo = tzinfo
        self.consultation_list_start = False 
        super(DocumentParser, self).__init__(url, **kwargs)

        # this will be moved to the second stage
        #self.db.documents.remove()

    @classmethod
    def construct_instance(cls, args):
        """construct the parse instance"""
        bu = args.base_url
        if not bu.endswith("/"):
            bu = bu + "/"
        url = bu+"vo020.asp?VOLFDNR=%s"
        return cls(url,
            city = args.city,
            mongodb_host = args.mongodb_host,
            mongodb_port = args.mongodb_port,
            mongodb_name = args.mongodb_name,
            force = args.force
        )

    def before_save(self, data):
        """hook which is called with all the data for a document just before it's
        saved to the database. You have to return a data object yourself
        """
        return data

    def preprocess_text(self, text):
        """preprocess the incoming text, e.g. do some encoding etc."""
        return text

    def process(self, force = True):
        """process documents"""

        # get all the ids of the documents we need to parse
     
        agenda_items = self.db.agenda_items.find({
            "city" : self.city,
        })
        print "processing %s agenda items" %agenda_items.count()
        document_ids = [item['volfdnr'] for item in agenda_items if "volfdnr" in item]
        print "processing %s documents" %len(document_ids)
        #self.process_document("11768", True) # had wrong last_discussed
        #self.process_document("10745", True) # street is "Ludwig Forum"
        #self.process_document("12811", True) # street is "Hof" but shouldn't be
        #return
        #print document_ids
        for document_id in document_ids:
            self.process_document(document_id, force = self.force)
        return

    def process_document(self, document_id, force = False):
        """process a single document

        :param document_id: id of document to parse
        :param force: if True then reread the document regardless of whether 
            we have it already in the db or not
        """
        print "trying document %s:%s" %(self.city, document_id)
        found = False
        try:
            data = self.db.documents.find_one({
                '_id' : "%s:%s" %(self.city, document_id),
                'document_id' : str(document_id),
                'city' : self.city,
            })
            found = True
        except Exception, e:
            print "problem when trying to find document id %s: %s" %(document_id, e)
            # we did not find any old data, so lets create an empty one
            found = False
        if data is None:
            data = {
                '_id'            : "%s:%s" %(self.city, document_id),
                'document_id'    : document_id,
                'document_url'   : self.url %document_id,
                'last_discussed' : TIME_MARKER,            # date of last appearance in a meeting
                'created'        : datetime.datetime.now(),# for our own reference
            }
            found = False
        if found and not force: 
            print "%s already read" %document_id
            return
        url = self.url %document_id
        print "reading", url

        self.response = response = requests.get(url)
        if "noauth" in response.url:
            print "*** no permission to read %s" %url
            print 
            return
        text = self.preprocess_text(response.text)
        doc = html.fromstring(text)

        # Check info block
        try:
            table = self.table_css(doc)[0] # lets hope we always have this table
        except: # for some reason on some runs this can't be found but the next one it can so we are saving it for now.
            print "**** INFO TABLE NOT FOUND, ABORTING document processing"
            fn = "/tmp/pyallris-error-%s-%s.html" %(self.city, document_id)
            fp = codecs.open(fn, "w", "utf-8")
            fp.write(text)
            fp.close()
            return
        self.consultation_list_start = False
        for line in table:
            headline = line[0].text
            if headline:
                headline = headline.split(":")[0].lower()
                if headline[-1]==":":
                    headline = headline[:-1]
                if headline == "betreff":
                    e = etree.tostring(line[1], encoding="utf-8")
                    e = unicode(e, "utf-8") # as etree does not return unicode
                    value = html2text.html2text(e)
                    data[headline] = value
                elif headline in ['status', 'verfasser', u'federführend']:
                    data[headline] = line[1].text.strip()
                elif headline == "beratungsfolge":
                    # the actual list will be in the next row inside a table, so we only set a marker
                    data = self.parse_consultation_list_headline(line, data) # for parser which have the consultation list here
                elif self.consultation_list_start:
                    data = self.parse_consultation_list(line, data) # for parser which have the consultation list in the next tr
                    self.consultation_list_start = False # set the marker to False again as we have read it
                # we simply ignore the rest (there might not be much more actually)

        # the actual text comes after the table in a div, but it's not valid XML or HTML, so we pull it out with a regex
        docs = body_re.findall(self.response.text)
        data['docs'] = docs
        data = utils.update_md5(data, self.MD5_FIELDS)
        data['city'] = self.city
        plaintext = data.get("betreff", "").lower()
        md = ""
        for d in data.get("docs"):
            plaintext = plaintext + " " + html2text.html2text(d.lower())
            md = md + "\n\n\n--------------------------------------------------------------------------------\n\n\n" + html2text.html2text(d)
        data['markdown'] = md
        streets = {} # this stores official street name => street._id
        geolocations = []
        geolocation = None
        for street in self.streets.keys():
            if re.search(r"\b" + re.escape(street) + r"\b", plaintext):
                s = self.streets[street]
                streets[s['original']] = s['_id']
                if "lat" in s:
                    sname = s['original'].replace(".",":") # we have to replace dots for mongodb keys. So we use a :
                    loc = {
                        'name' : s['original'],
                        'lat' : s["lat"], 
                        'lon' : s["lng"]
                    }
                    geolocations.append(loc)
                    # we now store the location of the first street in our database for the geo index
                    if geolocation is None:
                        geolocation = {'lat' : s["lat"], 'lon' : s["lng"]}
        #data['streets'] = streets
        data['geolocations'] = geolocations
        data['geolocation'] = geolocation
        data = self.before_save(data)
        #pprint.pprint(data)
        self.db.documents.save(data)
        time.sleep(1)
        return # we do attachments later, for now we save that stuff without

        # get the attachments if possible
        attachments = self.attachments_css(doc)
        if len(attachments)>0 and attachments[0][1][0].text.strip() == "Anlagen:":
            for tr in attachments[0][3:]:
                nummer = tr[1].text
                link = tr[2][0]
                href = link.attrib["href"]
                name = link.text
                # TODO: save it
        return
Ejemplo n.º 50
0
def Parse(reading):
    result = {"url": reading["url"]}
    text = reading["text"]

    text = re.sub("<p<", "",
                  text)  # this error is too severe for the parser to handle
    doc = lxml.html.parse(StringIO(text))
    root = doc.getroot()
    #body = h.find(".//body")
    maindiv = CSSSelector("#divMiddleLeftCentreBottomRight")(root)[0]

    heading = CSSSelector("#divHeading h1")(maindiv)[0].text
    intro = CSSSelector("#divIntroduction h2")(maindiv)[0]
    h2 = lxml.etree.tounicode(intro)
    #print [heading, h2]

    mheading = re.match(u"([\w\s\-']*?)\s*(?:\u2013\s*(?:PPC for (.*?)$)?|$)",
                        heading)
    result["name"] = mheading.group(1)

    mmpfor = re.search(u'(?:<br\s*/>)?\s*MP for (.*?)\s*<br\s*/>', h2)
    if mmpfor:
        result["MP for"] = mmpfor.group(1)
        result["MP for"] = result[
            "MP for"]  # needs to be regularized for the 2005 boundaries

    mcandidate = re.search(
        u'Liberal Democrat candidate for <a href="in_your_area_detail.aspx.*?">(.*?)</a>',
        h2)
    if mcandidate:
        result["constituency"] = RegularizeConstituency(mcandidate.group(1))
    elif mheading.group(2):
        result["constituency"] = RegularizeConstituency(mheading.group(2))
    elif "MP for" in result:
        result["constituency"] = RegularizeConstituency(result["MP for"])
    else:
        assert False, (h2, heading)

    divImage = maindiv.cssselect("#divIntroduction a img")
    if divImage:
        result["image"] = divImage[0].get("src")

    #print maindiv.cssselect("#divAboutMe h2")[0].text, "About Me"

    for traboutme in maindiv.cssselect("#divAboutMe tr"):
        key = traboutme.cssselect("th")[0].text[:-1]
        assert key in ["Marital Status", "Occupation", "Education"]
        value = traboutme.cssselect("td")[0].text
        if value:
            value = re.sub(u"\u2019", "'", value).strip()
            value = re.sub(u"\u2013", "-", value)
            value = re.sub("\xae", "", value)
            value = re.sub("\s*\n\s*", "; ", value)
            result[key] = value

    divBiography = maindiv.cssselect("#divBiography")
    if divBiography:
        result["bio"] = SimplifyHTML(divBiography[0])
        result["bio"] = re.sub("^Biography\s+", "",
                               result["bio"])  # clean out leading title

    contacttext = lxml.etree.tounicode(
        maindiv.cssselect("#divIndividualContactInfo")[0])

    memail = re.search('<strong>Email:</strong> <a href="(?:mailto:)?(.*?)">',
                       contacttext)
    if memail:
        result["email"] = memail.group(1)

    mwebsite = re.search('<strong>Website:</strong> <a href="(.*?)">',
                         contacttext)
    if mwebsite:
        result["website"] = mwebsite.group(1)

    mphone = re.search('<strong>Telephone:</strong> ([\d\s]+)', contacttext)
    if mphone:
        result["phone"] = mphone.group(1).strip()

    address = "; ".join([
        addressline.text
        for addressline in maindiv.cssselect("#divIndividualContactInfo ul li")
    ])
    if address:
        result["address"] = address.encode(
            "ascii", "replace"
        )  # the database doesn't seem to be unicode.  it should be

    return result
Ejemplo n.º 51
0
def getView(document, css, media='all', name=None, 
            styleCallback=lambda element: None):
    """
    document
        a DOM document, currently an lxml HTML document
    css
        a CSS StyleSheet string
    media: optional
        TODO: view for which media it should be
    name: optional
        TODO: names of sheets only
    styleCallback: optional
        should return css.CSSStyleDeclaration of inline styles, for html
        a style declaration for ``element@style``. Gets one parameter 
        ``element`` which is the relevant DOMElement
    
    returns style view
        a dict of {DOMElement: css.CSSStyleDeclaration} for html
    """
    sheet = cssutils.parseString(css)
    
    view = {}
    specificities = {} # needed temporarily 

    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)    
    for rule in rules:
        for selector in rule.selectorList:
            log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            cssselector = CSSSelector(selector.selectorText)
            matching = cssselector.evaluate(document)
            for element in matching:
                #if element.tag in ('div',):
                    # add styles for all matching DOM elements
                    log(1, 'ELEMENT', id(element), element.text)
                    
                    if element not in view:    
                        # add initial empty style declaration
                        view[element] = cssutils.css.CSSStyleDeclaration()
                        specificities[element] = {}                    
                        
                        # and add inline @style if present
                        inlinestyle = styleCallback(element)
                        if inlinestyle:
                            for p in inlinestyle:
                                # set inline style specificity
                                view[element].setProperty(p)
                                specificities[element][p.name] = (1,0,0,0)
                                                            
                    for p in rule.style:
                        # update style declaration
                        if p not in view[element]:
                            # setProperty needs a new Property object and
                            # MUST NOT reuse the existing Property
                            # which would be the same for all elements!
                            # see Issue #23
                            view[element].setProperty(p.name, p.value, p.priority)
                            specificities[element][p.name] = selector.specificity
                            log(2, view[element].getProperty('color'))
                            
                        else:
                            log(2, view[element].getProperty('color'))
                            sameprio = (p.priority == 
                                        view[element].getPropertyPriority(p.name))
                            if not sameprio and bool(p.priority) or (
                               sameprio and selector.specificity >= 
                                            specificities[element][p.name]):
                                # later, more specific or higher prio 
                                view[element].setProperty(p.name, p.value, p.priority)
                    
                   
    #pprint(view)
    return view                        
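
# A usage sketch for getView; the styleCallback below parses each element's
# @style attribute with cssutils, and the module-level log helper used inside
# getView is assumed to be in scope.
import lxml.html
import cssutils

document = lxml.html.fromstring(
    '<div style="color: blue">one</div><p>two</p>')
css = 'div { color: red; font-weight: bold } p { color: green }'

def inline_styles(element):
    style = element.get('style')
    return cssutils.parseStyle(style) if style else None

view = getView(document, css, styleCallback=inline_styles)
for element, declaration in view.items():
    # inline blue wins over the rule's red for the div
    print(element.tag, declaration.getPropertyValue('color'))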
Ejemplo n.º 52
0
    def transform(self, pretty_print=True, **kwargs):
        """change the self.html and return it with CSS turned into style
        attributes.
        """
        if etree is None:
            return self.html

        if self.method == 'xml':
            parser = etree.XMLParser(ns_clean=False, resolve_entities=False)
        else:
            parser = etree.HTMLParser()
        stripped = self.html.strip()
        tree = etree.fromstring(stripped, parser).getroottree()
        page = tree.getroot()
        # lxml inserts a doctype if none exists, so only include it in
        # the root if it was in the original html.
        root = tree if stripped.startswith(tree.docinfo.doctype) else page

        if page is None:
            print(repr(self.html))
            raise ValueError("Could not parse the html")

        ## style tags
        for element in CSSSelector('style,link[rel~=stylesheet]')(page):
            # If we have a media attribute whose value is anything other than
            # 'screen', ignore the ruleset.
            media = element.attrib.get('media')
            if media and media != 'screen':
                continue

            is_style = element.tag == 'style'
            if is_style:
                css_body = element.text
            else:
                href = element.attrib.get('href')
                if not href:
                    continue
                css_body = self._load_external(href)

            self._parse_style_rules(css_body)

            parent_of_element = element.getparent()
            if not self.keep_style_tags or not is_style:
                parent_of_element.remove(element)

        ## explicitly defined external style file
        if self.external_styles:
            for stylefile in self.external_styles:
                css_body = self._load_external(stylefile)
                self._parse_style_rules(css_body)

        for tag_classes in page.xpath('//@class'):
            tag = tag_classes.getparent()
            tag_classes = [
                '.' + c.strip() for c in tag_classes.split(' ') if c.strip()
            ]
            for tag_class in tag_classes:
                if tag_class in self.rules:
                    old_style = tag.attrib.get('style', '')
                    new_style = self.rules[tag_class]
                    if old_style:
                        new_style = '; '.join([old_style, new_style])
                    tag.attrib['style'] = new_style

        if self.remove_classes:
            # now we can delete all 'class' attributes
            for item in page.xpath('//@class'):
                parent = item.getparent()
                del parent.attrib['class']

        kwargs.setdefault('method', self.method)
        kwargs.setdefault('pretty_print', pretty_print)
        out = etree.tostring(root, **kwargs)
        if self.method == 'xml':
            out = _cdata_regex.sub(
                lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), out)
        if self.strip_important:
            out = _importants.sub('', out)
        return out
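A usage sketch for the transform() method above — it matches premailer's public API, so a call through the Premailer class should look roughly like this (the sample HTML is an assumption):

from premailer import Premailer

html = """<html><head>
<style>p.tip { color: green }</style>
</head><body><p class="tip">Saved!</p></body></html>"""

# turns the <style> rules into inline style="" attributes
print(Premailer(html).transform())
# expected (roughly): <p class="tip" style="color:green">Saved!</p>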
Ejemplo n.º 53
0
_htmlTree = etree.HTML(_html)
result = etree.tostring(_htmlTree, pretty_print=True, method="html")
print(len(result))
nodes = _htmlTree.xpath('//*[@href]')
print(len(nodes))
for i, node in enumerate(nodes):
    if i < 20:
        print(i, node.attrib)

import lxml.html
from lxml.cssselect import CSSSelector
import requests
r = requests.get('http://python.org/')

html = lxml.html.fromstring(r.text)
sel = CSSSelector('a[href]')
# Apply the selector to the DOM tree.
nodes = sel(html)
print(len(nodes))
for i, node in enumerate(nodes):
    # print(lxml.html.tostring(node))
    if i < 20:
        print(i, node.get('href'), node.text)

Ejemplo n.º 54
0
import requests
import re
import json
from lxml import html, etree
from lxml.cssselect import CSSSelector

verbose = True

SITE_URL = 'http://www.filmweb.pl'
select_topics_links = CSSSelector('.topics-list h3 a')
select_first_post = CSSSelector('.firstPost')
select_post_author = CSSSelector('.userName')
select_date_time = CSSSelector('.cap')
select_points = CSSSelector('.plusCount')
select_post_info = CSSSelector('.postInfo')
select_post_text = CSSSelector('.text')
select_title = CSSSelector('h1 a')


def get_opinion(opinion_url):
    response = requests.get(opinion_url)
    tree = html.fromstring(response.content)
    first_post = select_first_post(tree)[0]
    rating_match = re.search(rb'(\d+) <i',
                             etree.tostring(select_post_info(first_post)[0]))
    post_text_el = select_post_text(first_post)[0]
    etree.strip_elements(post_text_el, "*", with_tail=False)
    opinion = {
        'author': select_post_author(first_post)[0].text.strip(),
        'date': select_date_time(first_post)[0].get('title'),
        'rating': int(rating_match.group(1)) if rating_match else None,
    }
    return opinion
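A hedged usage sketch for get_opinion(); the topic path below is a made-up placeholder, not a real filmweb URL:

# illustrative only: any forum-topic URL on the site would do
opinion = get_opinion(SITE_URL + '/film/Example/discussion/Some+topic,12345')
if verbose:
    print(json.dumps(opinion, ensure_ascii=False, indent=2))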
Ejemplo n.º 55
0
    def sell(self, url):
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd = {}
            return
        tree = etree.HTML(response)
        soup = BeautifulSoup(response)

        self.fd['house_flag'] = 1
        self.fd['belong'] = 0

        detail_mer = soup.find('div', {'class': 'detail_mer'})
        # skip non-private listings ("个人房源" = posted by an individual owner)
        if u"个人房源" not in str(detail_mer): return
        
        Dname = detail_mer.find('span', {'class': 'Dname'})
        if Dname:
            self.fd['owner_name'] = Dname.string
        else:
            self.fd['owner_name'] = None

        ganji_phone_call_class = detail_mer.find('span', {'class': 'ganji_phone_call_class'})

        if ganji_phone_call_class:
            # str.find() returns -1 (truthy) when absent, so test membership instead
            if 'src=' in str(ganji_phone_call_class):
                # the phone number is rendered as an image; store its absolute URL
                self.fd['owner_phone'] = 'http://' + urlparse(url)[1] + ganji_phone_call_class.img['src']
            else:
                self.fd['owner_phone'] = None
        else:
            self.fd['owner_phone'] = None

        # no contact info -> return
        if not self.fd['owner_phone']: return
        
        if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
            cityname=re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
            self.fd['cityname'] = cityname
        else:
            return   
        
        if re.search(self.house_floor_regex, response):
            house_floor=re.search(self.house_floor_regex, response).group(1)
            house_topfloor=re.search(self.house_floor_regex, response).group(2)
            self.fd['house_floor']    = house_floor
            self.fd['house_topfloor'] = house_topfloor
        else:
            self.fd['house_floor'] = None
            self.fd['house_topfloor'] = None   
        
        if re.search(self.house_totalarea_regex, response):
            house_totalarea=re.search(self.house_totalarea_regex, response).group(1)
            self.fd['house_totalarea'] = house_totalarea
        else:
            self.fd['house_totalarea'] = None
            
        # house type
        if re.search(self.house_type_regex, response):
            house_type = re.search(self.house_type_regex, response).group(1)
            self.fd['house_type'] = housetype(house_type)
        else:
            self.fd['house_type'] = None

        if re.search(self.house_price_regex, response):
            house_price = re.search(self.house_price_regex, response).group(1)
            if house_price == "面议":  # "面议" = price negotiable
                house_price = "0"
            self.fd['house_price'] = house_price
        else:
            self.fd['house_price'] = None
    
        # the selector returns a list, which is never None; check emptiness instead
        pub_time_nodes = CSSSelector('span.pub_time')(tree)
        posttime = pub_time_nodes[0].text.strip() if pub_time_nodes else None
        if posttime:
            # posttime looks like "M-D HH:MM"; assume the current year
            Y = int(time.strftime('%Y', time.localtime()))
            M = int(posttime.split(' ')[0].split('-')[0])
            D = int(posttime.split(' ')[0].split('-')[1])
            s = datetime.datetime(Y, M, D, 0, 0)
            posttime = int(time.mktime(s.timetuple()))
            self.fd['posttime'] = posttime
        else:
            self.fd['posttime'] = None
            
        if re.search(self.house_room_regex, response):
            house_room = re.search(self.house_room_regex, response).group(1)
            self.fd['house_room'] = house_room
        else:
            self.fd['house_room'] = '0'

        if re.search(self.house_hall_regex, response):
            house_hall = re.search(self.house_hall_regex, response).group(1)
            self.fd['house_hall'] = house_hall
        else:
            self.fd['house_hall'] = '0'

        if re.search(self.house_toilet_regex, response):
            house_toilet = re.search(self.house_toilet_regex, response).group(1)
            self.fd['house_toilet'] = house_toilet
        else:
            self.fd['house_toilet'] = '0'

        # guard against an empty selector result instead of indexing blindly
        title_nodes = CSSSelector("div.detail_title h1")(tree)
        house_title = title_nodes[0].text.strip() if title_nodes else None
        if house_title:
            # strip the "(wanted to buy)/(wanted to rent)/(for sale)" tags
            self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
        else:
            self.fd['house_title'] = None
        
        # description
        detail_box = soup.find('div', {'class': 'detail_box'})
        if detail_box:
            house_desc = str(detail_box('p')[1])
            # strip tags, whitespace, and the site's "please mention Ganji" boilerplate
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的", "", house_desc)
        else:
            self.fd['house_desc'] = None

        d_i = soup.find('ul', {'class': 'd_i'})

        # neighborhood name: try the JS-embedded value first
        if re.search(self.xiaoqu_regex, response):
            borough_name = re.search(self.xiaoqu_regex, response).group(1)
            self.fd['borough_name'] = borough_name
            if re.search(self.address_regex, response):
                house_addr = re.search(self.address_regex, response).group(1)
                self.fd['house_addr'] = house_addr
        else:
            if d_i.find(text="小区: "):  # "小区" = residential complex
                borough_box = d_i.find(text="小区: ").parent
                borough_name = borough_box.find("a")
                if borough_name:
                    self.fd['borough_name'] = borough_name.string
                else:
                    self.fd['borough_name'] = None
                # address
                if borough_name and borough_name.nextSibling:
                    house_addr = borough_name.nextSibling.string
                    self.fd['house_addr'] = re.sub("\(|\)| ", "", house_addr)
                else:
                    self.fd['house_addr'] = None
            else:
                if re.search(self.borough_name_regex, response):
                    borough_name = re.search(self.borough_name_regex, response).group(1)
                    self.fd['borough_name'] = re.sub("\(.*\)| ", "", borough_name)
            
        # district
        area_box = d_i.find(text="区域: ").parent
        area_a = area_box('a')
        if area_a and len(area_a) > 1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = area_a[1].string
        elif area_a and len(area_a) == 1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = None
        else:
            self.fd['cityarea'] = None
            self.fd['section'] = None
        
        if re.search(self.house_age_regex, response):
            house_age = re.search(self.house_age_regex, response).group(1)
            self.fd['house_age'] = house_age
        else:
            self.fd['house_age'] = None

        # orientation (which way the flat faces)
        if re.search(self.house_toward_regex, response):
            house_toward = re.search(self.house_toward_regex, response).group(1)
            self.fd['house_toward'] = toward(house_toward)
        else:
            self.fd['house_toward'] = None

        # renovation / fitment level
        if re.search(self.house_fitment_regex, response):
            house_fitment = re.search(self.house_fitment_regex, response).group(1)
            self.fd['house_fitment'] = fitment(house_fitment)
        else:
            self.fd['house_fitment'] = 2
        # release references to the fetched page and parse trees
        del tree, request, response, soup
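For clarity, here is the pub_time-to-timestamp conversion used in sell() as a standalone sketch; the "M-D HH:MM" sample string is an assumed format, since the listing page only shows month and day:

import time
import datetime

def parse_pub_time(posttime):
    # "pub_time" strings look like "5-17 09:30"; the year is assumed current
    Y = int(time.strftime('%Y', time.localtime()))
    M = int(posttime.split(' ')[0].split('-')[0])
    D = int(posttime.split(' ')[0].split('-')[1])
    # midnight of that day, as a Unix timestamp
    return int(time.mktime(datetime.datetime(Y, M, D, 0, 0).timetuple()))

print(parse_pub_time('5-17 09:30'))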