def test_get_list_identities(self, client):
        res = client.get("/identity")
        html = document_fromstring(res.get_data())

        header_exists = html.xpath("boolean(//tr/th)")
        self.assertTrue(header_exists)

        rows = html.xpath("boolean(//tr/td)")
        self.assertFalse(rows)

        joe = Identity()
        joe.save()
        john = Identity()
        john.save()
        jim = Identity()
        jim.save()

        res = client.get("/identity")
        html = document_fromstring(res.get_data())

        rows = html.xpath("boolean(//tr/td)")
        self.assertTrue(rows)

        ids = html.xpath("//tr/td[1]//text()")
        self.assertIn(str(joe.id), ids)
Example #2
def get_url_html(url, cache=True, cookiejar=None):

	print("GET :: ", url)
	
	if cache:
		data = get_from_cache(url)
		if data:
			return lh.document_fromstring(data)
	
	attempts = 0
	while True:
	
		try:
			html = urlopen(url).read().decode('utf8')
			save_to_cache(url, html)
			return lh.document_fromstring(html)

		except Exception:
			attempts = attempts + 1
			if attempts > MAX_ATTEMPTS:
				raise

			print("HTTP error, retrying in 1 second")
			time.sleep(1)    # pause 1 second before retrying
			continue
Example #3
    def parse_detail(self, page, list_record):
        t = document_fromstring(page)
        record = {}
        for tr in t.xpath("//td[@class='displayvalue']/parent::*"):
            key = tr[1].text_content() or ''
            value = tr[2].text_content() or ''
            record[key.strip()] = value.strip()

        # If there's no filing date, this detail page is related to another
        # license. Go get the dates from that page.
        if 'Filing Date:' not in record:
            a = t.xpath("//div[@class='instructions']//a")[0]
            page = self.get_html('http://www.trans.abc.state.ny.us' + a.get('href'))
            t = document_fromstring(page)
            parent_record = {}
            for tr in t.xpath("//td[@class='displayvalue']/parent::*"):
                key = tr[1].text_content() or ''
                value = tr[2].text_content() or ''
                parent_record[key.strip()] = value.strip()
            dates = {
                'Filing Date:': parent_record['Filing Date:'],
                'Effective Date:': parent_record['Effective Date:'],
                'Expiration Date:': parent_record['Expiration Date:'],
            }
            record.update(dates)
        return record
Example #4
def getCollectionFics(url):
  try:
    wrlog('Начинаю скачивать фанфики со сборника %s' % url)
    # All fanfics from a single collection
    fics = [] # the list of fanfics to build
    counter = 1 # page counter
    r = req.post(url, cookies = cookies) 
    doc = html.document_fromstring(r.text)
    if checkColecttionIsOpen(doc):
        while True:
            r = req.get(url + "?sort=author&p=" + str(counter), cookies=cookies) 
            doc = html.document_fromstring(r.text)
            # Check whether the page has a block of fics.
            # Ficbook does not return a 404 for a page with a non-existent number;
            # such a page is simply empty.
            if not doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/div/table/tr/td[1]/table/tr[1]/td/*'):
                break
            # Links to the fanfics on the current page
            cur_page_fics = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/div/table/tr/td[1]/table/tr[1]/td/a/@href')
            for fic in cur_page_fics:
                fics.append("http://ficbook.net" + fic)
            counter += 1
    else:
        wrlog("Закрытый сборник. URL: %s" % url)
        QtGui.QMessageBox.information(None, 'Ошибка', 'Сборник является приватным. Если это ваш сборник, или вы имеете к нему доступ -- авторизируйтесь.')
        return

    return fics
  except Exception as e:
    showErrorMessage(e)
Example #5
    def data_retrieval(self, datasets):
        """
        Retrieve a list of datasets from the ESO archive.

        Parameters
        ----------
        datasets : list of strings
            List of datasets strings to retrieve from the archive.

        Returns
        -------
        files : list of strings
            List of files that have been locally downloaded from the archive.

        """
        from lxml import html
        datasets_to_download = []
        files = []
        # First: Detect datasets already downloaded
        for dataset in datasets:
            local_filename = dataset + ".fits"
            if self.cache_location is not None:
                local_filename = os.path.join(self.cache_location,
                                              local_filename)
            if os.path.exists(local_filename):
                print("Found {0}.fits...".format(dataset))
                files += [local_filename]
            elif os.path.exists(local_filename + ".Z"):
                print("Found {0}.fits.Z...".format(dataset))
                files += [local_filename + ".Z"]
            else:
                datasets_to_download += [dataset]
        # Second: Download the other datasets
        if datasets_to_download:
            data_retrieval_form = self.request("GET", "http://archive.eso.org/cms/eso-data/eso-data-direct-retrieval.html")
            print("Staging request...")
            with suspend_cache(self):  # Never cache staging operations
                data_confirmation_form = self._activate_form(data_retrieval_form, form_index=-1, inputs={"list_of_datasets": "\n".join(datasets_to_download)})
                root = html.document_fromstring(data_confirmation_form.content)
                login_button = root.xpath('//input[@value="LOGIN"]')
                if login_button:
                    raise LoginError("Not logged in.  You must be logged in to download data.")
                # TODO: There may be another screen for Not Authorized; that should be included too
                data_download_form = self._activate_form(data_confirmation_form, form_index=-1)
                root = html.document_fromstring(data_download_form.content)
                state = root.xpath("//span[@id='requestState']")[0].text
                while state != 'COMPLETE':
                    time.sleep(2.0)
                    data_download_form = self.request("GET",
                                                      data_download_form.url)
                    root = html.document_fromstring(data_download_form.content)
                    state = root.xpath("//span[@id='requestState']")[0].text
            print("Downloading files...")
            for fileId in root.xpath("//input[@name='fileId']"):
                fileLink = fileId.attrib['value'].split()[1]
                fileLink = fileLink.replace("/api", "").replace("https://", "http://")
                filename = self.request("GET", fileLink, save=True)
                files += [system_tools.gunzip(filename)]
        print("Done!")
        return files
Example #6
def run_parse():
    page = urllib.request.urlopen(base_url)
    doc = html.document_fromstring(page.read())
    doc.make_links_absolute(base_url=base_url)
    for link in html.iterlinks(doc):
        if ("forumdisplay.php" in link[2]) and ("f=43" in link[2]):
            v_chapter_name = link[0].text_content()
            v_path = folder_prefix + v_chapter_name
            v_link = link[2]
            # Create the folder
            if not os.path.exists(v_path):
                os.makedirs(v_path)
            page = urllib.request.urlopen(v_link)
            doc = html.document_fromstring(page.read())
            doc.make_links_absolute(base_url=base_url)
            for link_topics in html.iterlinks(doc):
                parsed_url = urllib.parse.urlparse(link_topics[2])
                # print(parsed_url)
                parsed_q = urllib.parse.parse_qs(parsed_url.query)
                # print(parsed_q)
                # Find the link to the first page
                if (parsed_url.path == "/showthread.php") and ("t" in parsed_q) and not ("page" in parsed_q) and \
                        (link_topics[0].text_content() != "1")\
                        and (parsed_q["t"][0] == "1537" ):
                    parse_topic(link_topics, v_path)
                #print(parsed_q)
Example #7
def parse_topic(page_link, v_base_path):
    v_topic_name = page_link[0].text_content()
    log(v_topic_name + "---" + page_link[2])
    logging.log(logging.INFO, v_topic_name + "---" + page_link[2])
    v_full_path = v_base_path + "\\" + slugify(v_topic_name)
    v_url = page_link[2]
    # Create the folder
    #if not os.path.exists(v_full_path):
    #    os.makedirs(v_full_path)
    # TODO: temporary
    if True:
        # Find all posts
        page = urllib.request.urlopen(v_url)
        doc = html.document_fromstring(page.read())
        menu_controls = doc.cssselect('td[class="vbmenu_control"]')
        # TODO: temporary!
        #parse_topic_page(doc, v_full_path)
        for menu_control in menu_controls:
            v_page_text = menu_control.text_content()
            if re.match(r'Страница \d из \d', v_page_text):
                page_count = int(re.sub(r'(Страница \d из )(\d)', r'\2', v_page_text))
                for i in range(page_count-1):
                    # TODO: temporary!
                    if (i+2>=32):
                        url = "{}&page={}".format(page_link[2],i+2)
                        log(v_topic_name + "---" + url)
                        page = urllib.request.urlopen(url)
                        doc = html.document_fromstring(page.read())
                        parse_topic_page(doc, v_full_path)
                break
    else:
        log("{} already exists. Skipped".format(v_full_path))
Example #8
def to_doc(text, parser=scraper.LXML_HTML, whole_doc=True):
    """Parse an HTML text. Return value: lxml.html.HtmlElement document.
    
    parser: which parser to use. 
    whole_doc: parse to complete HTML document (with <html> around), or parse just a fragment of HTML."""
    doc = None
    
    if parser == scraper.LXML_HTML:
        if whole_doc:
            doc = html.document_fromstring(text)
        else:
            doc = html.fromstring(text)
    elif parser == scraper.HTML5PARSER:
        # html5parser was broken for me, bug report is here: https://bugs.launchpad.net/lxml/+bug/780642
        #if whole_doc:
        #    doc = html5parser.document_fromstring(text)
        #else:
        #    doc = html5parser.fromstring(text)
        # Here is my workaround:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
        etree_doc = parser.parse(text)  # returns an ElementTree
        doc = html.document_fromstring(elementtree_to_string(etree_doc))
        # ^ this double conversion makes it slow ^
    elif parser == scraper.BEAUTIFULSOUP:
        # soupparser has no document_fromstring method
        doc = soupparser.fromstring(text)
    else:
        print >>sys.stderr, "Warning: you want to use an unknown parser in lx.py."
        # doc is None
        
    return doc  # lxml.html.HtmlElement
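
# A small self-contained illustration (added here, not part of the original
# snippet) of the whole_doc distinction handled above: document_fromstring
# always wraps its input in a full <html> document, while fromstring may
# return just the fragment's root element.
from lxml import html as _lxml_html

full = _lxml_html.document_fromstring("<p>hi</p>")
print(full.tag)   # 'html' -- the fragment was wrapped in a document

frag = _lxml_html.fromstring("<p>hi</p>")
print(frag.tag)   # 'p' -- parsed as a bare fragment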
Example #9
def scrape_wiki_codes():
    data = {}
    base_url = 'http://en.wikipedia.org/wiki/List_of_ISO_639'
    #639-1
    resp = web.get(base_url + '-1_codes')
    h = html.document_fromstring(resp)
    table = h.find_class('wikitable')[0]
    for row in table.findall('tr')[1:]:
        name = row.findall('td')[2].find('a').text
        code = row.findall('td')[4].text
        data[code] = name
    #639-2
    resp = web.get(base_url + '-2_codes')
    h = html.document_fromstring(resp)
    table = h.find_class('wikitable')[0]
    for row in table.findall('tr')[1:]:
        name = row.findall('td')[3].find('a')
        if name:
            name = name.text
        else:
            continue
        code_list = row.findall('td')[0].text.split(' ')
        if len(code_list) == 1:
            code = code_list[0]
        else:
            for i in code_list:
                if '*' in i:
                    code = i.replace('*', '')
                    break
        data[code] = name

    return data
Example #10
    def scrape_balance(username, password):
        session = requests.Session()

        login_response = session.get(LOGIN_URL)

        login_page = html.document_fromstring(login_response.content)
        login_page.make_links_absolute(LOGIN_URL)

        login_form = login_page.forms[0]
        login_form_data = dict(login_form.form_values())
        login_form_data['username'] = username
        login_form_data['password'] = password

        logged_in_response = session.request(
            login_form.method,
            login_form.action,
            data=login_form_data,
        )

        logged_in_page = html.document_fromstring(logged_in_response.content)
        if logged_in_page.cssselect('form.login-form'):
            raise InvalidCredentialsException()

        points_elem = logged_in_page.cssselect('.points')
        if not points_elem:
            # We don't know what went wrong here.
            raise RuntimeError()

        points_text = ''.join(points_elem[0].itertext())
        points_match = RE_POINTS.search(points_text)
        if not points_match:
            # We don't know what went wrong here either.
            raise RuntimeError()

        return int(points_match.group('points'))
Example #11
	def consume(self):
		while not self.__stop:
			try:
				data = self.__fetcher.fetch()
				if data=="DIE":
					self.logger.info("收到死亡信号了")
					self.pusher.push("DIE")
					self.stop()
					continue
				# import pdb
				# pdb.set_trace()
				url = BASEURL+data
				self.logger.info("正在访问 : %s" % url)
				time.sleep(1)
				self.__browser.visit(url)
				page = html.document_fromstring(self.__browser.html)
				top50 = [BASEURL+link.get('href')+"/about" for link in page.xpath(u"//h3[@class='zm-item-answer-author-wrap']/a[@class='zm-item-link-avatar']")]
				# TODO: the code should check the length of top50 here instead of hard-coding 5
				for i in range(5):
					time.sleep(2)
					self.logger.info("正在访问 : %s" % top50[i])
					self.__browser.visit(top50[i])
					self.__pusher.push(html.document_fromstring(self.__browser.html))
			except QueueEmpty as e:
				time.sleep(5)
				self.logger.error("队列为空")
			except Exception as ex:
				# invoke the relevant error handling / components
				self.logger.error("严重异常 :%s" % traceback.format_exc())
				raise ex
Example #12
    def parse_html(self, in_html=False):
        """
        Parses the imported HTML document. Currently, lxml's document_fromstring is used.

        :param in_html: HTML file to override the one given by the super class
        :return: the parsed content

        # Alternative used etree HTMLParser, but this requires an extra
        # two calls, one making it a StringIO, and then acquiring the root
        # element tree, but I don't see a difference?
        # parser = etree.HTMLParser()
        # tree = etree.parse(StringIO(html), parser)
        #
        # return tree.getroot()
        """

        if not in_html:
            parsed_html = document_fromstring(self.raw_html)
            self.parsed_html = parsed_html
        else:
            parsed_html = document_fromstring(in_html)

        logger.debug('Parsed HTML. {0}'.format(parsed_html))

        return parsed_html
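
# The alternative mentioned in the docstring, written out as a standalone
# sketch (an illustration, not part of the original class): parsing through
# etree.HTMLParser yields an equivalent element tree for well-formed input.
from io import StringIO
from lxml import etree

_parser = etree.HTMLParser()
_tree = etree.parse(StringIO("<html><body><p>hi</p></body></html>"), _parser)
print(_tree.getroot().tag)   # 'html'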
Example #13
 def _do_fetch_all(self):
     logging.debug("Fetching all data from : %s" % self["url"])
     
     response = self.http_get(self["url"])
     tree = HTMLNode(html.document_fromstring(response.text))
     
     self["files"] = []
     files_div = tree.find("div", **{"id": "files"})
     if len(files_div) == 1:
         files_table = files_div[0].find("table")
         if len(files_table) == 1:
             for tr in files_table[0].find("tr")[1:]:
                 filename, size = tr.find("td")
                 self["files"].append(ResultFile(**{
                     "filename": filename.getContent().decode("utf-8").strip(),
                     "size": self._plugin.parse_size(size.getContent().decode("utf-8").replace(",", "").strip())
                 }))
     
     logging.debug("Fetching all data from : %s" % self["download_link_page"])
     
     response = self.http_get(self["download_link_page"])
     tree = HTMLNode(html.document_fromstring(response.text))
     
     download_link =[a for a in tree.find("a") if a.prop("href") is not None and a.prop("href").startswith("download.php")][0]
     self["download_link"] = urllib.parse.urljoin(self["download_link_page"], download_link.prop("href"))
Example #14
    def _get_works(self):
        req_opere = requests.get(self.scheda.link_opere)
        if req_opere.ok:
            doc = html.document_fromstring(req_opere.text)
            risultati = doc.xpath("//*[@id='corpo_opac']/div[1]/div[2]/div[2]/div[1]")[0].text_content().strip()
            resmatch = RISULTATI.match(risultati)
            if resmatch:
                res_start = int(resmatch.group(1))
                res_stop = int(resmatch.group(2))
                res_tot = int(resmatch.group(3))

                url = 'http://opac.sbn.it/opacsbn/opaclib' \
                      '?db=solr_iccu&resultForward=opac/iccu/brief.jsp&from=1&nentries={res_tot}' \
                      '&searchForm=opac/iccu/error.jsp&do_cmd=search_show_cmd&item:5032:BID={code}'.format(
                      res_tot=res_tot, code='IT\\ICCU\\'+self.code)

            req_opere_tot = requests.get(url)
            if req_opere_tot.ok:
                doc = html.document_fromstring(req_opere_tot.text)
                topere = doc.xpath("//div[@id='colonna_risultati']/table[@id='records']/tbody")[0].getchildren()
                topere = [row.getchildren()[3].getchildren() for row in topere]
                for op in topere:
                    opera = self.Work()
                    for div in op:
                        if div.get('class') == 'rectitolo':
                            opera.url = self.BASE_SBN_URL + div.getchildren()[0].get('href')
                            opera.titolo = div.getchildren()[0].text.strip()
                        elif div.get('class') == 'rec_3a_linea':
                            sourceline = div.sourceline
                            raw_source = req_opere_tot.text.split('\n')[sourceline-1:sourceline][0]
                            opera.edizione = remove_tags(remove_br_tags(raw_source, '\n'))
                        else:
                            opera.autori = div.text

                    self.opere.append(opera)
Example #15
    def _getFavorites(self):
        """
        Returns dict by name of topic_id

        :param username:
            string of username, ex. 'some_user'
        :return:
            dict(name) = id
        """
        url = self._genFavoritesUrlByUser(self._username)
        doc = html.document_fromstring(requests.get(url).text)
        out = dict()
        pages = get_pages(doc)
        favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']")
        for f in favs:
            # out[f.text] = str(f.attrib['href']).split('/')[-2]
            # topic_id =
            out[f.text] = str(f.attrib['href']).split('/')[-2]
        for p in range(2, pages):
            url = 'http://{0}/users/{1}/favorites/page{2}/'.format(self._domain, self._username, p)
            # if show_progress:
            # print('parsing page{0}... url={1}'.format(p, url))
            doc = html.document_fromstring(requests.get(url).text)
            favs = doc.xpath("//div[@class='user_favorites']//a[@class='post_title']")
            for f in favs:
                # out[f.text] = f.attrib['href'][-7:-1]
                out[f.text] = str(f.attrib['href']).split('/')[-2]
        return out
Example #16
    def get_tree(self):
        """
        Return the DOM for the article content.

        Note this actually returns the XPATH method on the tree, so
        you can do: a.tree(<xpath>) directly.
        """
        quoted_url = urllib.quote(self.url, safe='')
        html_file = CACHE.child(quoted_url)
        self.log.info(self.url)
        if not html_file.exists():
            self.log.debug("  Downloading")
            response, self.content = HTTP.request(self.url)
            status_code = int(response['status'])

            if not (200 <= status_code < 400):
                self.log.error("Got HTTP status code %d" % status_code)

            # cache content
            with open(html_file, 'w') as fp:
                fp.write(self.content)

            return document_fromstring(self.content).xpath
        else:
            self.log.debug("  Using cache ('%s...')" % html_file.name[:60])
            with open(html_file) as fp:
                self.content = fp.read()
                return document_fromstring(self.content).xpath
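
# Small self-contained illustration (added) of the "returns .xpath" pattern
# described in the docstring above: the caller receives the parsed document's
# xpath method and can query it directly.
from lxml.html import document_fromstring as _dfs

_xpath = _dfs("<html><body><h1>T</h1></body></html>").xpath
print(_xpath("//h1/text()"))   # ['T']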
Example #17
 def data_retrieval(self, datasets):
     """ Retrieve a list of datasets form the ESO archive.
     
     Parameters
     ----------
     datasets : list of strings
         List of datasets strings to retrieve from the archive.
     
     Returns
     -------
     files : list of strings
         List of files that have been locally downloaded from the archive.
     
     """
     data_retrieval_form = self.session.get("http://archive.eso.org/cms/eso-data/eso-data-direct-retrieval.html")
     data_confirmation_form = self._activate_form(data_retrieval_form, form_index=-1, inputs={"list_of_datasets": "\n".join(datasets)})
     data_download_form = self._activate_form(data_confirmation_form, form_index=-1)
     root = html.document_fromstring(data_download_form.content)
     state = root.xpath("//span[@id='requestState']")[0].text
     while state != 'COMPLETE':
         time.sleep(2.0)
         data_download_form = self.session.get(data_download_form.url)
         root = html.document_fromstring(data_download_form.content)
         state = root.xpath("//span[@id='requestState']")[0].text
     files = []
     for fileId in root.xpath("//input[@name='fileId']"):
         fileLink = fileId.attrib['value'].split()[1]
         fileLink = fileLink.replace("/api","").replace("https://","http://")
         files += [self._download_file(fileLink)]
     print("Done!")
     return files
Example #18
 def wrapper(*args, **kw):
     try:
         doc = lhtml.document_fromstring(get(url, cache=True, **kw))
         return fn(doc, *args, **kw)
     except Exception:
         write_cache(url, None)
         doc = lhtml.document_fromstring(get(url, cache=True, **kw))
         return fn(doc, *args, **kw)
Example #19
    def scrape_balance(username, password):
        session = requests.Session()

        login_response = session.get(LOGIN_URL)

        login_page = html.document_fromstring(login_response.content)
        login_page.make_links_absolute(LOGIN_URL)

        login_form_data = {
            'catalogId': '10051',
            'reLogonURL': 'MSSparksLandingPage',
            'myAcctMain': '',
            'fromOrderId': '*',
            'toOrderId': '.',
            'deleteIfEmpty': '*',
            'continue': '1',
            'createIfEmpty': '1',
            'calculationUsageId': '-1',
            'updatePrices': '0',
            'errorViewName': 'MSSparksLandingPage',
            'forgotPasswordURL': 'MSSparksLandingPage',
            'previousPage': 'logon',
            'rememberMe': 'true',
            'resetConfirmationViewName': 'ResetPasswordForm',
            'URL': '/MSNorth',
            'logonId': username,
            'logonPassword': password,
        }

        logged_in_response = session.post(
            LOGIN_ENDPOINT,
            data=login_form_data,
        )

        # Populate the auth token cookie for the API
        session.get(AUTH_TOKEN_ENDPOINT)

        auth_token = None
        for cookie, value in session.cookies.items():
            if cookie.startswith('MS_AUTH_TOKEN_'):
                auth_token = value

        if not auth_token:
            raise InvalidCredentialsException()

        logged_in_page = html.document_fromstring(logged_in_response.content)
        if logged_in_page.cssselect('form.login-form'):
            raise InvalidCredentialsException()

        offers_response = session.get(OFFERS_API, headers={
            'Authorization': 'MNSAuthToken %s' % auth_token,
        })

        offers = offers_response.json()
        return int(offers['sparks'])
Example #20
def domain_to_graph(fname, type="zss"):
  ''' A wrapper function that turns an HTML file into a DOM graph. '''
  with open(fname, 'r') as fh:
    content = fh.read()

  if type == "zss":
    html_tag = html.document_fromstring(content)
    return make_html_zssgraph(html_tag)
  if type == "nx":
    html_tag = html.document_fromstring(content)
    return make_html_nxgraph(html_tag)
Example #21
    def process_results_page(self, url):
        """
        Parameters
        ----------
        url : str
            URL of the lead-in results page
        """
        r = requests.get(url)
        if r.status_code != 200:
            raise RuntimeError("Could not retrieve {}".format(url))
        leadin_doc = html.document_fromstring(r.content)
        tables = leadin_doc.cssselect('.participant-list')

        # Get any following pages.
        links = leadin_doc.cssselect('.pagination a[rel]')
        while True:
            if len(links) == 0:
                break

            lst = [link for link in links if link.text.startswith('Next')]
            if len(lst) == 0:
                break

            anchor = lst[0]
            next_rel_url = anchor.get('href')
            print('\t\t{}'.format(next_rel_url))
            r = requests.get('http://results.active.com' + next_rel_url)
            doc = html.document_fromstring(r.content)
            table = doc.cssselect('.participant-list')[0]
            tables.append(table)

            links = doc.cssselect('.pagination a[rel]')

        # Search the tables.                
        lst = []
        for table in tables:
            trs = table.cssselect('tr')
            # first row has stuff we don't want
            for tr in trs[1:]:
                tds = tr.getchildren()
                if len(tds) < 2:
                    continue
                for regex in self.regex:
                    if regex.match(tds[2].text_content()):
                        lst.append(tr)

        if len(lst) > 0:
            # Ok we found some results.  Insert the header for the first table.
            header_row = tables[0].cssselect('tr')[0]
            lst.insert(0, header_row)
            self.webify_results(leadin_doc, lst, url)
Example #22
    def assertTreeDiff(self, html1, html2, expected):
        """
        Asserts that the given HTML strings will produce a tree_diff of the
        expected HTML string.
        """
        # The test strings should *not* have <html> and <body> tags, for the
        # sake of brevity.
        tree1 = document_fromstring("<html><body>%s</body></html>" % html1)
        tree2 = document_fromstring("<html><body>%s</body></html>" % html2)
        expected = "<html><body>%s</body></html>" % expected

        result_tree = tree_diff(preprocess(tree1), preprocess(tree2), self.algorithm)
        got = etree.tostring(result_tree)
        self.assertEqual(got, expected)
Example #23
def vote_ids_for_house(congress, session_year, options):
    vote_ids = []

    index_page = "http://clerk.house.gov/evs/%s/index.asp" % session_year
    group_page = r"ROLL_(\d+)\.asp"
    link_pattern = r"http://clerk.house.gov/cgi-bin/vote.asp\?year=%s&rollnumber=(\d+)" % session_year

    # download index page, find the matching links to the paged listing of votes
    page = utils.download(
        index_page,
        "%s/votes/%s/pages/house.html" % (congress, session_year),
        options)

    if not page:
        logging.error("Couldn't download House vote index page, aborting")
        return None

    # extract matching links
    doc = html.document_fromstring(page)
    links = doc.xpath(
        "//a[re:match(@href, '%s')]" % group_page,
        namespaces={"re": "http://exslt.org/regular-expressions"})

    for link in links:
        # get some identifier for this inside page for caching
        grp = re.match(group_page, link.get("href")).group(1)

        # download inside page, find the matching links
        page = utils.download(
            urlparse.urljoin(index_page, link.get("href")),
            "%s/votes/%s/pages/house_%s.html" % (congress, session_year, grp),
            options)

        if not page:
            logging.error("Couldn't download House vote group page (%s), aborting" % grp)
            continue

        doc = html.document_fromstring(page)
        votelinks = doc.xpath(
            "//a[re:match(@href, '%s')]" % link_pattern,
            namespaces={"re": "http://exslt.org/regular-expressions"})

        for votelink in votelinks:
            num = re.match(link_pattern, votelink.get("href")).group(1)
            vote_id = "h" + num + "-" + str(congress) + "." + session_year
            if not should_process(vote_id, options):
                continue
            vote_ids.append(vote_id)

    return utils.uniq(vote_ids)
Example #24
def main():
    source = urllib.urlopen(_URL).read()
    tree = html.document_fromstring(source)

    for img in get_image_top_news(tree):
        print img

    url = get_text_top_news(tree)
    print "\n", url
    detail_tree = html.document_fromstring(urllib.urlopen(url).read())
    title, snippets = get_complete_page(detail_tree)

    print "\n", title
    for snippet in snippets:
        print "\n", snippet
Example #25
    def assertStrips(self, html1, html2, expected, num_removals, check_ids=False):
        """
        Asserts that strip_template(html1, html2) will result in the expected
        HTML string, and that the return value is num_removals.
        """
        # The test strings should *not* have <html> and <body> tags, for the
        # sake of brevity.
        tree1 = document_fromstring('<html><body>%s</body></html>' % html1)
        tree2 = document_fromstring('<html><body>%s</body></html>' % html2)
        expected = '<html><body>%s</body></html>' % expected

        got_removals = strip_template(tree1, tree2, check_ids=check_ids)
        got_tree = etree.tostring(tree1, method='html')
        self.assertEqual(got_tree, expected)
        self.assertEqual(got_removals, num_removals)
Example #26
    def _parse_data(self):
        # Find the javascript and get the content
        try:
            text = self.url_response.text
            root = HTML.document_fromstring(text)
            js_list = root.xpath('head/script/@src')
            js_url = self.BASE_URL + filter(lambda x:x[-3:]=='.js', js_list)[0]
        except IndexError:
            return QueryResult(False, err='javascript file is not found')

        r = retry_requests(js_url)
        if r.ok is False:
            return QueryResult(False, err=r.error_msg)
        text = r.text.encode('utf8')

        # Get the image list
        urls_pattern = r'picAy\[\d+\] = "(.*?)"'
        urls = re.findall(urls_pattern, text)
        self.urls = [self.BASE_URL + comic_url for comic_url in urls]

        # Get the comic name
        name_pattern = 'comicName = "(.*?)"'
        self.name = re.findall(name_pattern, text)[0]

        # Get the links of prev/next chapter
        def _get_url(pattern):
            url = re.findall(pattern, text)[0]
            if 'javascript' in url:
                return None
            return self.BASE_URL + url

        prev_pattern = 'preVolume="(.*?)"'
        next_pattern = 'nextVolume="(.*?)"'
        self.prev_url = _get_url(prev_pattern)
        self.next_url = _get_url(next_pattern)
Example #27
def parsestring(s):
    ht={}
    import string
    from lxml import html
    import re
    doc = html.document_fromstring(s)
    def getWords(text):
        return re.compile('\w+').findall(text)

    text_doc = doc.text_content()
    #print text_doc
    s = text_doc.lower() # all lowercase
    s = re.sub('<[^>]*>', '', s) # removes <something> tags.
    #print s
    s = s.translate(string.maketrans("",""), string.digits)
    s = ' '.join(getWords(s)) # separates out only words
    #print s
    #s = s.translate(string.maketrans("",""), string.punctuation)


    #s=' '.join(s.split(','))
    #s= ' '.join(s.split('.'))
    s= s.split()
    #print s
    list_stopw=getstopwords(s) #remove stopwords
    #print list_stopw
    for i in s:
        if i not in list_stopw:
            #i=stem(i) #stemming algorithm
            ht[i]=ht.get(i,0)+1
    return ht
Example #28
def sanitize(input, cleaner=DocumentCleaner, wrap='p'):
    """Cleanup markup using a given cleanup configuration.
       Unwrapped text will be wrapped with wrap parameter.
    """
    if 'body' not in cleaner.allow_tags:
        cleaner.allow_tags.append('body')

    input = six.u("<html><body>%s</body></html>") % input
    document = html.document_fromstring(input)
    bodies = [e for e in document if html._nons(e.tag) == 'body']
    body = bodies[0]

    cleaned = cleaner.clean_html(body)
    remove_empty_tags(cleaned)
    strip_outer_breaks(cleaned)

    if wrap is not None:
        if wrap in html.defs.tags:
            wrap_text(cleaned, wrap)
        else:
            raise ValueError(
                'Invalid html tag provided for wrapping the sanitized text')

    output = six.u('').join([etree.tostring(fragment, encoding=six.text_type)
        for fragment in cleaned.iterchildren()])
    if wrap is None and cleaned.text:
        output = cleaned.text + output

    return output
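
# A hedged, self-contained sketch (added, not from the original module) of the
# core idea using lxml's Cleaner directly; sanitize() above additionally relies
# on project helpers (DocumentCleaner, remove_empty_tags, strip_outer_breaks,
# wrap_text) that are assumed to be defined elsewhere.
from lxml import html as _html
from lxml.html.clean import Cleaner as _Cleaner

_cleaner = _Cleaner(scripts=True, javascript=True, style=True)
_doc = _html.document_fromstring("<html><body>hi <script>x()</script><b>there</b></body></html>")
_clean_body = _cleaner.clean_html(_doc.find('body'))
print(_html.tostring(_clean_body))   # the <script> element is gone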
Example #29
def make_data_and_cookies():
    """make the post data(including vcode) and get cookies"""

    vcode = ''
    while len(vcode) != 4:
        r = requests.get(MAIN_URL)
        doc = html.document_fromstring(r.text)
        vcode_link = doc.cssselect('form img')[0].get('src')
        #print vcode_link
        vcv = doc.cssselect('input[name="vcv"]')[0].get('value')
        img_url = BASE_URL + vcode_link
        #print vcv
        img = requests.get(img_url)

        # write the image bytes to the file (binary mode, since img.content is bytes)
        with open(IMG_PATH, 'wb') as f:
            f.write(img.content)
        fh = open(IMG_PATH, 'rb')
        imgstring = fh.read()
        fh.close()
        data = {
              "picstring" : imgstring
            }
        resp = requests.post("http://202.117.120.235/server.php", data=data)
        vcode = resp.text

    data = {
            "account": USERNAME,
            "password": PASSWORD,
            "vcode": vcode,
            "vcv": vcv
            }
    return data, r.cookies
Example #30
def home(request):
    if request.method == 'POST':
        form = URLForm(request.POST)
        if form.is_valid():
            url = form.cleaned_data['url']
            return redirect('/?q=' + url)
    else:
        url = request.GET.get('q')
        url = check_url(url)
        if url:
            page, content = download_page(url)
            if content == 'text/html':
                form = URLForm()
                doc = html.document_fromstring(page)
                title = get_title(doc)
                doc = replace_links(doc, url)
                head = get_head(doc)
                body = get_body(doc)
                context = {'form': form, 'head': head, 'body': body, 'title': title}
                return render(request, 'page.html', context, context_instance=RequestContext(request))
            else:
                return HttpResponse(page, content_type=content)
        else:
            form = URLForm()
            return render(request, 'home.html', {'form': form}, context_instance=RequestContext(request))
Example #31
def ParseHtml(story, corpus):
    """Parses the HTML of a news story.

  Args:
    story: The raw Story to be parsed.
    corpus: Either 'cnn' or 'dailymail'.

  Returns:
    A Story containing URL, paragraphs and highlights.
  """

    parser = html.HTMLParser(encoding=chardet.detect(story.html)['encoding'])
    tree = html.document_fromstring(story.html, parser=parser)

    # Elements to delete.
    delete_selectors = {
        'cnn': [
            '//blockquote[contains(@class, "twitter-tweet")]',
            '//blockquote[contains(@class, "instagram-media")]'
        ],
        'dailymail': [
            '//blockquote[contains(@class, "twitter-tweet")]',
            '//blockquote[contains(@class, "instagram-media")]'
        ]
    }

    # Paragraph exclusions: ads, links, bylines, comments
    cnn_exclude = (
        'not(ancestor::*[contains(@class, "metadata")])'
        ' and not(ancestor::*[contains(@class, "pullquote")])'
        ' and not(ancestor::*[contains(@class, "SandboxRoot")])'
        ' and not(ancestor::*[contains(@class, "twitter-tweet")])'
        ' and not(ancestor::div[contains(@class, "cnnStoryElementBox")])'
        ' and not(contains(@class, "cnnTopics"))'
        ' and not(descendant::*[starts-with(text(), "Read:")])'
        ' and not(descendant::*[starts-with(text(), "READ:")])'
        ' and not(descendant::*[starts-with(text(), "Join us at")])'
        ' and not(descendant::*[starts-with(text(), "Join us on")])'
        ' and not(descendant::*[starts-with(text(), "Read CNNOpinion")])'
        ' and not(descendant::*[contains(text(), "@CNNOpinion")])'
        ' and not(descendant-or-self::*[starts-with(text(), "Follow us")])'
        ' and not(descendant::*[starts-with(text(), "MORE:")])'
        ' and not(descendant::*[starts-with(text(), "SPOILER ALERT:")])')

    dm_exclude = ('not(ancestor::*[contains(@id,"reader-comments")])'
                  ' and not(contains(@class, "byline-plain"))'
                  ' and not(contains(@class, "byline-section"))'
                  ' and not(contains(@class, "count-number"))'
                  ' and not(contains(@class, "count-text"))'
                  ' and not(contains(@class, "video-item-title"))'
                  ' and not(ancestor::*[contains(@class, "column-content")])'
                  ' and not(ancestor::iframe)')

    paragraph_selectors = {
        'cnn': [
            '//div[contains(@class, "cnnContentContainer")]//p[%s]' %
            cnn_exclude,
            '//div[contains(@class, "l-container")]//p[%s]' % cnn_exclude,
            '//div[contains(@class, "cnn_strycntntlft")]//p[%s]' % cnn_exclude
        ],
        'dailymail':
        ['//div[contains(@class, "article-text")]//p[%s]' % dm_exclude]
    }

    # Highlight exclusions.
    he = ('not(contains(@class, "cnnHiliteHeader"))'
          ' and not(descendant::*[starts-with(text(), "Next Article in")])')
    highlight_selectors = {
        'cnn': [
            '//*[contains(@class, "el__storyhighlights__list")]//li[%s]' % he,
            '//*[contains(@class, "cnnStryHghLght")]//li[%s]' % he,
            '//*[@id="cnnHeaderRightCol"]//li[%s]' % he
        ],
        'dailymail': ['//h1/following-sibling::ul//li']
    }

    def ExtractText(selector):
        """Extracts a list of paragraphs given a XPath selector.

    Args:
      selector: A XPath selector to find the paragraphs.

    Returns:
      A list of raw text paragraphs with leading and trailing whitespace.
    """

        xpaths = map(tree.xpath, selector)
        elements = list(chain.from_iterable(xpaths))
        paragraphs = [e.text_content().encode('utf-8') for e in elements]

        # Remove editorial notes, etc.
        if corpus == 'cnn' and len(
                paragraphs) >= 2 and '(CNN)' in paragraphs[1]:
            paragraphs.pop(0)

        paragraphs = map(str.strip, paragraphs)
        paragraphs = [s for s in paragraphs if s and not str.isspace(s)]

        return paragraphs

    for selector in delete_selectors[corpus]:
        for bad in tree.xpath(selector):
            bad.getparent().remove(bad)

    paragraphs = ExtractText(paragraph_selectors[corpus])
    highlights = ExtractText(highlight_selectors[corpus])

    content = '\n\n'.join(paragraphs)

    return Story(story.url, content, highlights)
Example #32
 def _activate_form(self, response, form_index=0, inputs={}):
     from lxml import html
     # Extract form from response
     root = html.document_fromstring(response.content)
     form = root.forms[form_index]
     # Construct base url
     if "://" in form.action:
         url = form.action
     elif form.action[0] == "/":
         url = '/'.join(response.url.split('/', 3)[:3]) + form.action
     else:
         url = response.url.rsplit('/', 1)[0] + '/' + form.action
     # Identify payload format
     if form.method == 'GET':
         fmt = 'get'  # get(url, params=payload)
     elif form.method == 'POST':
         if 'enctype' in form.attrib:
             if form.attrib['enctype'] == 'multipart/form-data':
                 fmt = 'multipart/form-data'  # post(url, files=payload)
             elif form.attrib[
                     'enctype'] == 'application/x-www-form-urlencoded':
                 fmt = 'application/x-www-form-urlencoded'  # post(url, data=payload)
         else:
             fmt = 'post'  # post(url, params=payload)
     # Extract payload from form
     payload = []
     for form_input in form.inputs:
         key = form_input.name
         value = None
         is_file = False
         if isinstance(form_input, html.InputElement):
             value = form_input.value
             if 'type' in form_input.attrib:
                 is_file = (form_input.attrib['type'] == 'file')
         elif isinstance(form_input, html.SelectElement):
             if isinstance(form_input.value, html.MultipleSelectOptions):
                 value = []
                 for v in form_input.value:
                     value += [v]
             else:
                 value = form_input.value
                 if value is None:
                     value = form_input.value_options[0]
         if key in inputs.keys():
             value = "{0}".format(inputs[key])
         if (key is not None) and (value is not None):
             if fmt == 'multipart/form-data':
                 if is_file:
                     payload += [(key, ('', '', 'application/octet-stream'))
                                 ]
                 else:
                     if type(value) is list:
                         for v in value:
                             payload += [(key, ('', v))]
                     else:
                         payload += [(key, ('', value))]
             else:
                 if type(value) is list:
                     for v in value:
                         payload += [(key, v)]
                 else:
                     payload += [(key, value)]
     # Send payload
     if fmt == 'get':
         response = self.request("GET", url, params=payload)
     elif fmt == 'post':
         response = self.request("POST", url, params=payload)
     elif fmt == 'multipart/form-data':
         response = self.request("POST", url, files=payload)
     elif fmt == 'application/x-www-form-urlencoded':
         response = self.request("POST", url, data=payload)
     return response
Example #33
 def page_html(self):
     return html.document_fromstring(self.response.text)
Example #34
args = parser.parse_args()
year = args.year

headers = {
    'User-Agent':
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0'
}

try:
    os.mkdir(f'{year}_AER')
except FileExistsError:
    pass

url = 'https://www.aeaweb.org/journals/aer/issues'

a = html.document_fromstring(requests.get(url, headers=headers).text)

issues = [
    ref for ref, vol in zip(a.xpath('//a[@href]/@href'),
                            a.xpath('//a[@href]/text()'))
    if 'issues' in ref and str(year) in vol
]

for issue in issues:
    url2 = 'https://www.aeaweb.org' + issue
    print(url2)

    b = html.document_fromstring(requests.get(url2, headers=headers).text)

    [x for x in b.xpath('//a[@href]/text()') if 'articles?' in x]
Example #35
def get_csrf_token(session, base_url):
    csrf_response = session.get(base_url + '/login')
    tree = html.document_fromstring(csrf_response.content)
    return tree.xpath("//input[contains(@name, '_csrf_token')]")[0].value
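
# Hedged usage sketch (added; the URL and credential field names below are
# illustrative assumptions, not from the original source): the token fetched
# above would typically be posted back together with the login form data.
#
# session = requests.Session()
# token = get_csrf_token(session, 'https://example.org')
# session.post('https://example.org/login',
#              data={'_username': 'user', '_password': 'pass', '_csrf_token': token})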
Example #36
    async def parse_ea44(self, session: aiohttp.ClientSession,
                         order: Data) -> Data:
        """ парсит данные одной закупки """

        # проверка на наличие записи в базе
        data = self.db_session.query(Data).filter(Data.id == order.id).all()
        if data:
            return data[0]

        text = await self._get_request(session, order.tender_link)

        order_document = document_fromstring(text)
        # parse the main procurement info: number, price, customer, date

        card_info_container = order_document.cssselect('.cardMainInfo')[0]
        tender_id = card_info_container.cssselect(
            '.cardMainInfo__purchaseLink')

        if not tender_id:
            tender_id = ''
        else:
            tender_id = self._normalizer(tender_id[0].text_content())

        tender_object = card_info_container.xpath(
            './div[1]/div[2]/div[1]/span[2]')
        if not tender_object:
            tender_object = ''
        else:
            tender_object = self._normalizer(tender_object[0].text_content())

        customer = card_info_container.xpath('./div[1]/div[2]/div[2]/span[2]')
        if not customer:
            customer = ''
        else:
            customer = self._normalizer(customer[0].text_content())

        tender_price = card_info_container.cssselect('.cost')
        if not tender_price:
            tender_price = ''
        else:
            tender_price = self._normalizer(tender_price[0].text_content())

        tender_date = card_info_container.xpath(
            './div[2]/div[2]/div[1]/div[1]/span[2]')
        if not tender_date:
            tender_date = ''
        else:
            tender_date = self._normalizer(tender_date[0].text_content())

        # general procurement info: electronic platform address and procurement object
        general_information_container = order_document.xpath(
            '//div[@class="wrapper"]/div[2]')

        tender_adress = general_information_container[0].xpath(
            './/div[@class="col"]/section[3]/span[2]')
        if not tender_adress:
            tender_adress = ''
        else:
            tender_adress = self._normalizer(tender_adress[0].text_content())

        # contract conditions
        condition_container = self.get_cotract_conditions_container(
            order_document.xpath('//div[@id="custReqNoticeTable"]/div'))
        if condition_container is not None:
            tender_delivery_adress = condition_container.xpath(
                './/div[@class="col"]/section[2]/span[2]')
            if not tender_delivery_adress:
                tender_delivery_adress = ''
            else:
                tender_delivery_adress = self._normalizer(
                    tender_delivery_adress[0].text_content())

            tender_term = condition_container.xpath(
                './/div[@class="row"]/section[3]/span[2]')
            if not tender_term:
                tender_term = ''
            else:
                tender_term = self._normalizer(tender_term[0].text_content())
        else:
            tender_delivery_adress = ''
            tender_term = ''

        # parse information about the procurement object
        tender_object_info = self.parse_tender_object_info(order_document)

        # parse the winner
        try:
            winner = await self.parse_tender_winner(session, order.tender_link)
        except Exception:
            winner = []
        if len(winner) < 3:
            winner = ['', '', '']

        # parse the document links
        term_document_link = order.tender_link.replace('common-info',
                                                       'documents')
        term_document_data = await self._get_request(session,
                                                     term_document_link)
        term_document_links = document_fromstring(term_document_data).xpath(
            '//span[@class="section__value"]/a[@title]/@href')
        order.tender_object = tender_object
        order.customer = customer
        order.tender_price = self._tender_price_handler(tender_price)
        order.tender_date = self._tender_date_handler(tender_date)
        order.tender_adress = tender_adress
        order.tender_delivery = tender_delivery_adress
        order.tender_term = tender_term
        for object_info in self._handle_tender_objects(tender_object_info):
            order.objects.append(object_info)
        for document_link_data in term_document_links:
            tender_link = TenderLinks()
            tender_link.link = document_link_data
            tender_link.data_id = order.id
            order.document_links.append(tender_link)
        order.winner.append(
            Winners(name=winner[0], position=winner[1], price=winner[2]))
        order.type = 'fz44'
        return order
Example #37
# -*- coding: utf-8 -*-

import sys
import os
import time
from lxml import etree, html

# set up a UTF-8 unicode environment (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

# path to the .htm file; read its contents
path = "1.htm"
content = open(path, "rb").read()
page = html.document_fromstring(content)  # parse the file
text = page.text_content()  # strip all tags
print text  # print the result with all tags removed
Example #38
 def s_lxml(self):
     tree = html.document_fromstring(self.page)
     self.tree = tree  #xpath treee
     return tree
Example #39
def document_fromstring(string):
    return _html.document_fromstring(string, parser=utf8parser)
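
# `utf8parser` is defined elsewhere in the original module; a minimal sketch of
# what it might look like (an assumption, not the original definition):
#
#   utf8parser = _html.HTMLParser(encoding='utf-8')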
Example #40
from lxml import html
import urllib.request
import requests

URL = "http://goalkicker.com/"

response = requests.get(URL)
sourceCode = response.content
htmlElem = html.document_fromstring(sourceCode)

books = htmlElem.cssselect('[class="bookContainer grow"]')

for book in books:
    urlSuffix = book[0].get('href')
    response = requests.get(URL + urlSuffix)
    sourceCode = response.content
    htmlElem = html.document_fromstring(sourceCode)
    download = htmlElem.cssselect('[id="footer"]')
    pdfTitle = download[0][0].get('onclick')[15:-1]

    link = URL + urlSuffix + pdfTitle
    downloadDir = pdfTitle
    urllib.request.urlretrieve(link, downloadDir)  #download the pdf
    print(pdfTitle)
Example #41
def getFanficInfo(url):
  try:
    ffinfo = {}
    r = req.get(url)
    if not check404(r):
        doc = html.document_fromstring(r.text)
        # Straightforward: pull the needed info out via xpath
        ffinfo['id'] = url[url.find('ficbook.net/readfic/')+20:]
        ffinfo['name'] = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/h1[1]/text()')[0].strip()
        ffinfo['author'] = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a[1]/text()')[0]
        ffinfo['authorlink'] = 'http://ficbook.net' + doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a[1]/@href')[0]
        ffinfo['likes'] = int(doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/div[1]/div[1]/text()')[0].strip().replace('+',''))
        ffinfo['description'] = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[2]/span[1]/text()')[0].strip()       
        warings = []
        genders = []
        # Split into genres and warnings.
        # Not sure why this is needed, but keep it.
        genders_buf = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a')
        for g in genders_buf:
            try:
                href = g.attrib['href']
                title = g.attrib['title']
            except KeyError:
                continue
            if not href in ratings and href.startswith('/ratings/'):
                title = title[3:title.find('</b>')]
                if title in genders_list:
                    genders.append(title)
                elif title in warings_list:
                    warings.append(title)
        ffinfo['genders'] = genders
        ffinfo['warings'] = warings
        # And here: fandom, size and rating
        # -------------------------------
        # -- One NC, please.
        # -- An original, or a Lucky Star one?
        # -- And what genres are there for LS?
        # -- Only yuri.
        # -- Whoa, make it two!
        # -------------------------------
        buf_list = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a')
        for item in buf_list:
            buf_link = '/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/table/tr/td[1]/a[@href = "' + str(item.get('href')) + '"]'
            if doc.xpath(buf_link + '/text()'):
                value = doc.xpath(buf_link + '/text()')[0] .strip()
            if '/fanfiction/' in str(item.get('href')):
                ffinfo['fandom'] = value
            elif str(item.get('href')) in ratings:
                ffinfo['rating'] = value
            elif '/sizes/' in str(item.get('href')):
                ffinfo['size'] = value
        
        if doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/div[@class="part_list"]'):
            parts = doc.xpath('/html/body/table[2]/tr[2]/td[1]/table/tr[2]/td[2]/div[@class="part_list"]/a/@href')
            parts_urls = list(map(lambda x: 'http://ficbook.net' + x.replace('#part_content', ''), parts))
        else:
            parts_urls = [url]

        ffinfo['parts'] = parts_urls
        return ffinfo
    else:
        QtGui.QMessageBox.information(None, 'Error 404', 
        'Такой страницы не существует.')

  except Exception as e:
    showErrorMessage(e)
Example #42
# SPDX-License-Identifier: MIT
# Copyright (C) 2020 Tobias Gruetzmacher

from lxml import html

from dosagelib.xml import NS

import httpmocks

tree = html.document_fromstring(httpmocks.content('zp-222'))


class TestXML:
    def xpath(self, path):
        return tree.xpath(path, namespaces=NS)

    def test_class_ext(self):
        assert len(self.xpath('//li[d:class("menu-item-3773")]')) == 1
        assert len(self.xpath('//ul[d:class("menu")]')) == 1
        assert len(self.xpath('//li[d:class("menu-item-object-custom")]')) == 2
        assert len(self.xpath('//li[d:class("menu-item")]')) == 25

    def test_re_ext(self):
        assert len(self.xpath(r'//img[re:test(@src, "posters.*jpg")]')) == 1
Example #43
def search_title(title):
    if not LMXL:
        return None
    data = get_page("/index.php?first=no&what=&kp_query=", 1,
                    title.decode('utf8'))
    doc = html.document_fromstring(data)
    search_results = []
    # Check whether we got the right page (i.e. the search results page)
    regexp = re.compile(unicode("Скорее всего, вы ищете:", "utf8"), re.DOTALL)
    result = regexp.search(data)
    #if result == None:
    # If not, parse the film page that Kinopoisk redirected us to
    #    titlestr = doc.xpath("//h1[@class='moviename-big']") [0].text.strip()
    #    try:
    #        title = '%s' % (normilize_string(titlestr))
    #    except:
    #        title = 'None'
    #    try:
    #        idstr = '\nid:%s' % (doc.xpath("//link[@rel='canonical']") [0].attrib["href"].split("/")[-2])
    #    except:
    #        idstr = '\nid:n/a'
    #    cur_movie = (title.encode("utf-8"), idstr.encode("utf-8"))
    #    search_results.append(cur_movie)
    #print '%s' % (search_results)
    #    return search_results
    if result:
        titleNodes = doc.xpath(
            "//div[@class='search_results' or @class='search_results search_results_simple']/div[@class='element most_wanted' or @class='element']/div[@class='info']"
        )
        for titleNode in titleNodes:
            yearInfo = titleNode.xpath("p//span[@class='year']/text()")
            titleInfo = titleNode.xpath("p[@class='name']/a")
            #rateInfo = titleNode.xpath("div[@class='rating']/text()")
            genreInfo = titleNode.xpath("span[@class='gray']/text()")
            try:
                year = yearInfo[0]
            except:
                year = 'n/a'
            try:
                title = '%s (%s)' % (normilize_string(titleInfo[0].text), year)
            except:
                title = 'none'
            try:
                id = '\nid:%s' % (titleInfo[0].attrib["data-id"])
            except:
                id = '\nid:n/a'
            #try:
            #    rate = '\n%s' % (rateInfo[0])
            #except:
            #    rate = ''
            try:
                genre = '\n%s\n%s\n%s' % (
                    normilize_string(genreInfo[0]),
                    normilize_string(genreInfo[1].replace(',', '').replace(
                        '...', '')), normilize_string(genreInfo[3]))
            except:
                genre = ''
            search = (title.encode("utf-8"), id.encode("utf-8"),
                      genre.encode("utf-8"))
            search_results.append(search)
            #print '%s %s %s\n' % (title, id, genre)
    return search_results
Beispiel #44
0
 def load_offline(self, path):
     with open(path, 'r') as f:
         s = f.read()
         self.html = html.document_fromstring(s)
Beispiel #45
0
    }

    return obj_template


base_url = 'https://www.tripadvisor.com/Restaurants-g274967-Riga_Riga_Region.html'
opened_page = 1

# linux chromedriver path: /mnt/c/chromedriver.exe
# windows chromedriver path: C:\\chromedriver.exe

browser = webdriver.Chrome("C:\\chromedriver.exe")
browser.get(base_url)
time.sleep(5)
content = browser.page_source
trip_content = html.document_fromstring(content)
contents_seen = 0

pages_total = get_number_of_pages(trip_content)
main_window = browser.current_window_handle

while opened_page <= pages_total:
    contents_seen = 0
    # get the number of restaurant links on given search results page
    page_content_count = get_number_of_page_links(trip_content)

    while contents_seen < page_content_count:
        # get the list of currently opened page restaurant list items
        all_restaurants_link_elements = browser.find_elements(
            By.XPATH, '//a[@class="photo_link"]')
Beispiel #46
0
    def parse(self, html_src: str) -> 'list[Result]':
        """ Parses an html document for a given XPath expression. Any resulting node can optionally be filtered against a regular expression """

        # The following XPath extension functions can be used in the xpath fields of a task's selectors.
        def textify(node):
            return (str(node.text) if hasattr(node, "text") else str(node)).strip()

        def merge_lists(context, *args):
            """ Merge the items of lists at same positions. If one list is shorter, its last element is repeated """
            try:
                return [" ".join([textify(arg[min(i, len(arg) - 1)]) for arg in args]) for i in range(max(map(len, args)))]
            except Exception as e:
                return [""]

        def exe(context, nodes, path):
            """ Executes a given xpath with each node in the first xpath as context node """
            try:
                return [textify(node.xpath(path).pop()) if node.xpath(path) else "" for node in nodes]
            except Exception as e:
                return [""]

        def all(context, nodes):
            return [" ".join(textify(node) for node in nodes)]

        ns = etree.FunctionNamespace(None)
        ns['merge_lists'] = merge_lists
        ns['exe'] = exe
        ns['all'] = all

        if not self.selectors.all():
            return html_src  # nothing to do

        parsed_tree = html.document_fromstring(html_src)

        selectors_results = []
        for selector in self.selectors.all():
            nodes = parsed_tree.xpath(selector.xpath)
            nodes = [textify(node) for node in nodes]

            if selector.regex:
                # Apply regex to every single node #
                selector_results = []
                for node in nodes:
                    node = str(node)
                    regex_result = re.search(selector.regex, node, re.DOTALL | re.UNICODE)
                    if regex_result:
                        if regex_result.groups():
                            selector_results += [regex_result.groups()[-1]]
                        else:
                            selector_results += [regex_result.group()]
                    else:
                        selector_results += [None]
            else:
                selector_results = nodes

            selector_results = [selector.cast(data) if data is not None else None for data in selector_results]  # cast to type

            selectors_results.append(selector_results)

        # convert selector results from a tuple of lists to a list of tuples #
        results = []
        # take as many results as there are results for the longest key selector
        key_result_counts = [
            len(selectors_results[list(self.selectors.all()).index(key_selector)])
            for key_selector in self.selectors.all() if key_selector.is_key]
        for y in range(max(key_result_counts)):
            result = Result(task_id=self.name)
            for x, selector in enumerate(self.selectors.all()):
                selectors_results[x] = selectors_results[x] or [None]  # Guarantee that an element is there
                setattr(result, selector.name, selectors_results[x][min(y, len(selectors_results[x]) - 1)])

            result.key = result.get_key()
            if result.key:
                results.append(result)

        return results
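# Hedged illustration (not from the original project): with merge_lists, exe
# and all registered above, a selector's xpath field can call them like any
# other XPath function, e.g.
#   merge_lists(//tr/td[1], //tr/td[2])   -> one string per row, cells joined
#   exe(//div[@class="item"], './/span[@class="price"]/text()')
#   all(//p/text())                       -> every paragraph joined into one string
# A tiny standalone check of the same registration mechanism, using a demo
# extension of my own that returns a single string:
from lxml import etree as _etree, html as _html

_demo_ns = _etree.FunctionNamespace(None)
_demo_ns['join_text'] = lambda context, nodes: ' '.join(
    (n.text or '') if hasattr(n, 'text') else str(n) for n in nodes)

_demo_tree = _html.document_fromstring('<p>first</p><p>second</p>')
assert _demo_tree.xpath('join_text(//p)') == 'first second'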
Beispiel #47
0
def page(request, slug):
    my_list = []
    page = HelpPages.objects.get(slug=slug)
    left_menu = get_left_menu(page)
    page_title = page.title

    anchors = None
    if page.content:
        content_html = html.document_fromstring(page.content)
        anchors = content_html.cssselect("h2")
        setted_anchors = []
        for anchor in anchors:
            if anchor.text and anchor.text != "":
                anchor_link = slugify(anchor.text, allow_unicode=True)
                if anchor_link in setted_anchors:
                    anchor_link = "{}_".format(anchor_link)

                anchor.attrib['id'] = anchor_link
                anchor.insert(
                    0,
                    etree.XML('<a href="#{}" class="anchor">#</a>'.format(
                        anchor_link)))
                setted_anchors.append(anchor_link)

        page.content = html.tostring(
            content_html,
            encoding='unicode',
            pretty_print=True,
        )

    sub_pages = HelpPages.objects.filter(parent_page=page)

    # Next Page
    if HelpPages.objects.filter(parent_page_id=page.parent_page_id,
                                status=1,
                                tree_id__gt=page.tree_id).exists():
        next_page = HelpPages.objects.filter(
            parent_page_id=page.parent_page_id,
            status=1,
            tree_id__gt=page.tree_id)[0]
    elif page.parent_page and HelpPages.objects.filter(
            parent_page=None, status=1,
            tree_id__gt=page.parent_page.tree_id).exists():
        next_page = HelpPages.objects.filter(
            parent_page=None, status=1,
            tree_id__gt=page.parent_page.tree_id)[0]
    if HelpPages.objects.filter(parent_page_id=page.id, status=1).exists():
        next_page = HelpPages.objects.filter(parent_page_id=page.id,
                                             status=1)[0]
    # Previous Page
    if HelpPages.objects.filter(parent_page_id=page.parent_page_id,
                                status=1,
                                tree_id__lt=page.tree_id).exists():
        prev_page = HelpPages.objects.filter(
            parent_page_id=page.parent_page_id,
            status=1,
            tree_id__lt=page.tree_id).last()
        if HelpPages.objects.filter(parent_page_id=prev_page.id,
                                    status=1).exists():
            prev_page = HelpPages.objects.filter(parent_page_id=prev_page.id,
                                                 status=1).last()
    elif page.parent_page:
        prev_page = page.parent_page
    else:
        prev_page = {'title': "Помощь"}

    return render(request, "page/index.html", locals())
Beispiel #48
0
        self._task = task
        self._body = body
        self._body_type = None
        self.items = dict()
        self.tasks = list()
        self._md5_mk = hashlib.md5()
        if (body[0] == '{' and body[-1] == '}') or \
                (body[0] == '[' and body[-1] == ']'):
            try:
                self._json_dict = json.loads(body)
            except Exception, e:
                print(e)
            else:
                self._body_type = self.JSON
        else:
            self._doc = html.document_fromstring(body)
            self._body_type = self.HTML
        self._parse()

    def _parse(self):
        pass

    def _xpath(self, xp):
        if self._body_type == self.HTML:
            return self._doc.xpath(xp)
        return None

    def _get(self, key, default=None):
        if self._body_type == self.JSON:
            return self._json_dict.get(key, default)
        return None
from lxml import html
import requests
import math


def prod(l):
    a = 1
    for term in l:
        a *= int(term)
    return a


page = requests.get('https://projecteuler.net/problem=8', verify=False)
data = html.document_fromstring(page.text)
num = data.xpath(
    '//p[@style="font-family:courier new;text-align:center;"]/text()')
num = ''.join(num).replace('\r', '').replace('\n', '')
res = []
for i, n in enumerate(num):
    if i + 13 <= len(num):  # include the final 13-digit window
        r = num[i:i + 13]
        res.append(prod(r))
    else:
        break
print max(res)
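# Cross-check of the same sliding-window logic as a single expression (my own
# sketch, not part of the original snippet): every 13-digit window num[i:i+13]
# exists for i from 0 up to len(num) - 13 inclusive.
best = max(prod(num[i:i + 13]) for i in range(len(num) - 12))
assert best == max(res)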
def update_page_info_module(course_id, page_name):
    # Use the Canvas API to GET the page
    #GET /api/v1/courses/:course_id/pages/:url

    url = baseUrl + '%s/pages/%s' % (course_id, page_name)
    if Verbose_Flag:
        print(url)
    payload = {}
    r = requests.get(url, headers=header, data=payload)
    if r.status_code == requests.codes.ok:
        page_response = r.json()
        if Verbose_Flag:
            print("body: {}".format(page_response["body"]))

        document = html.document_fromstring(page_response["body"])
        raw_text = document.text_content()
        print("raw_text: {}".format(raw_text))

        title = page_response["title"]
    else:
        print("No page {}".format(page_name))
        return False

    # transform page

    GQMContent = document.xpath('//p[@class="GQMContent"]')
    if len(GQMContent) > 0:
        text_of_GQMContent = GQMContent[0].text
        print("Existing information as text is {}".format(text_of_GQMContent))

        information_for_on_page = json.loads(text_of_GQMContent)
        print("Existing information is {}".format(information_for_on_page))

        document2 = deepcopy(document)
        # trim off GQMContent paragraph before processing the raw_text
        for elem in document2.xpath('//p[@class="GQMContent"]'):
            elem.getparent().remove(elem)

        raw_text = document2.text_content()
        print("raw_text: {}".format(raw_text))

    information_for_on_page["Words"] = len(raw_text.split())
    information_for_on_page["Characters"] = len(raw_text)
    # see http://www.erinhengel.com/software/textatistic/
    information_for_on_page["Textatistic.counts"] = Textatistic(
        raw_text).counts
    information_for_on_page["Textatistic.statistics"] = Textatistic(
        raw_text).dict()

    if len(GQMContent) == 0:
        #no GQMContent found on this page so add some
        print("No GQMContent found - adding some")
        body = document.find('.//body')
        if body is None:
            print("page has no <body>")
        else:
            GQMContent_string = '<p class="GQMContent">' + json.dumps(
                information_for_on_page) + "</p>"
            body.append(html.etree.XML(GQMContent_string))
            print("initial updated document {}".format(html.tostring(document)))
    else:
        GQMContent[0].text = json.dumps(information_for_on_page)
        print("updated document {}".format(html.tostring(document)))

    # Use the Canvas API to insert the page
    #PUT /api/v1/courses/:course_id/pages/:uid
    #    wiki_page[title]
    #    wiki_page[published]
    #    wiki_page[body]

    url = baseUrl + '%s/pages/%s' % (course_id, page_name)
    if Verbose_Flag:
        print(url)
    payload = {
        'wiki_page[title]':
        title,
        'wiki_page[published]':
        False,
        'wiki_page[body]':
        str(html.tostring(document, pretty_print=True, method="html"), 'utf-8')
    }
    r = requests.put(url, headers=header, data=payload)
    write_to_log(r.text)
    print("status code {}".format(r.status_code))
    if r.status_code == requests.codes.ok:
        return True
    else:
        print("Unable to update page {}".format(page_name))
        return False
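# Hedged usage sketch (not part of the original script; the course id and the
# page slug are hypothetical placeholders, and baseUrl/header must already be
# configured as elsewhere in the module):
if __name__ == '__main__':
    if update_page_info_module('11111', 'course-introduction'):
        print("Page statistics updated")
    else:
        print("Update failed")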
Beispiel #51
0
 def compile(self, content):
     self._parser_content = HTML.document_fromstring(content)
Beispiel #52
0
    def parse_ea44(self, link):
        inform_request = self.session.get(link)
        inform_request.raise_for_status()

        order_document = document_fromstring(inform_request.text)

        # parse the main procurement info: number, price, customer, date
        card_info_container = order_document.cssselect('.cardMainInfo')[0]
        tender_id = card_info_container.cssselect(
            '.cardMainInfo__purchaseLink')

        if not tender_id:
            tender_id = ''
        else:
            tender_id = self._normalizer(tender_id[0].text_content())

        tender_object = card_info_container.xpath(
            './div[1]/div[2]/div[1]/span[2]')
        if not tender_object:
            tender_object = ''
        else:
            tender_object = self._normalizer(tender_object[0].text_content())

        customer = card_info_container.xpath('./div[1]/div[2]/div[2]/span[2]')
        if not customer:
            customer = ''
        else:
            customer = self._normalizer(customer[0].text_content())

        tender_price = card_info_container.cssselect('.cost')
        if not tender_price:
            tender_price = ''
        else:
            tender_price = self._normalizer(tender_price[0].text_content())

        tender_date = card_info_container.xpath(
            './div[2]/div[2]/div[1]/div[1]/span[2]')
        if not tender_date:
            tender_date = ''
        else:
            tender_date = self._normalizer(tender_date[0].text_content())

        # general procurement info: address of the electronic trading platform and the procurement object
        general_information_container = order_document.xpath(
            '//div[@class="wrapper"]/div[2]')
        
        tender_adress = general_information_container[0].xpath(
            './/div[@class="col"]/section[3]/span[2]')
        if not tender_adress:
            tender_adress = ''
        else:
            tender_adress = self._normalizer(tender_adress[0].text_content())

        # contract terms
        condition_container = self._get_cotract_conditions_container(
            order_document.xpath('//div[@id="custReqNoticeTable"]/div'))
        if condition_container is not None:
            tender_delivery_adress = condition_container.xpath(
                './/div[@class="col"]/section[2]/span[2]')
            if not tender_delivery_adress:
                tender_delivery_adress = ''
            else:
                tender_delivery_adress = self._normalizer(
                    tender_delivery_adress[0].text_content())

            tender_term = condition_container.xpath(
                './/div[@class="row"]/section[3]/span[2]')
            if not tender_term:
                tender_term = ''
            else:
                tender_term = self._normalizer(tender_term[0].text_content())
        else:
            tender_delivery_adress = ''
            tender_term = ''

        # parse information about the procurement object
        tender_object_info = self._parse_tender_object_info(order_document)

        # parse the winner
        winner = self._parse_tender_winner(link)
        if len(winner) < 3:
            winner = ['', '', '']

        # parse the document links
        term_document_link = link.replace('common-info', 'documents')
        term_document_data = self.session.get(term_document_link)
        term_document_data.raise_for_status()
        term_document_links = document_fromstring(term_document_data.text).xpath(
            '//span[@class="section__value"]/a[@title]/@href')

        return {
            'tender_id': tender_id, 'tender_object': tender_object, 'customer': customer,
            'tender_price': tender_price, 'tender_date': tender_date, 'tender_adress': tender_adress,
            'tender_delivery': tender_delivery_adress, 'tender_term': tender_term,
            'tender_object_info': tender_object_info, 'document_links': term_document_links,
            'tender_winner': winner, 'type': 'fz44', 'link': link
        }
Beispiel #53
0
	return json.dumps({"error": message}, indent=4, sort_keys=True)

base_wikipedia_url = "https://en.wikipedia.org"

# Throw an error for command-line arguments.
if len(sys.argv) != 3:
	print error_message("Requires 2 command-line arguments: url and number.")
	sys.exit()

# Extract command-line arguments.
url = sys.argv[1]
number = int(sys.argv[2])

# Make request and extract table.
response = requests.get(url)
tree = html.document_fromstring(response.text)
table = tree.xpath('//table[@class="infobox"]')

# Throw an error if table is not found.
if len(table) == 0:
	print error_message("No cabinet tables found for this presidency.")
	sys.exit()

# Retrieve appropriate table.
# For all presidents, this will be the first table except for Grover Cleveland.
# Because he had two non-consecutive terms, we must check which term we're referring to.
# His second term is on a second table.
#
# TODO: Make this more generic such that non-consecutive terms can be handled generically.
else:
	table = table[ 1 if number == 24 else 0 ]
Beispiel #54
0
def main():
    with open(filename) as bookmarks_file:
        data = bookmarks_file.read()

    geolocator = Nominatim()

    kml = simplekml.Kml()

    lst = list()

    # Hacky and doesn't work for all of the stars:
    lat_re = re.compile('markers:[^\]]*latlng[^}]*lat:([^,]*)')
    lon_re = re.compile('markers:[^\]]*latlng[^}]*lng:([^}]*)')
    coords_in_url = re.compile('\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')

    doc = document_fromstring(data)
    for element, attribute, url, pos in doc.body.iterlinks():
        if 'maps.google' in url:
            description = element.text or ''
            print description.encode('UTF8')

            if coords_in_url.search(url):
                # Coordinates are in URL itself
                latitude = coords_in_url.search(url).groups()[0]
                longitude = coords_in_url.search(url).groups()[1]
            else:
                # Load map and find coordinates in source of page
                try:
                    sock = urlopen(url.replace(' ', '+').encode('UTF8'))
                except Exception, e:
                    print 'Connection problem:'
                    print repr(e)
                    print 'Waiting 2 minutes and trying again'
                    time.sleep(120)
                    sock = urlopen(url.replace(' ', '+').encode('UTF8'))
                content = sock.read()
                sock.close()
                time.sleep(3)  # Don't annoy server
                try:
                    latitude = lat_re.findall(content)[0]
                    longitude = lon_re.findall(content)[0]
                except IndexError:
                    try:
                        lines = content.split(
                            '\n')  # --> ['Line 1', 'Line 2', 'Line 3']
                        for line in lines:
                            if re.search('cacheResponse\(', line):
                                splitline = line.split('(')[1].split(
                                    ')')[0] + '"]'
                                # in the future we can extract the coordinates from here
                                null = None
                                values = eval(splitline)
                                print values[8][0][1]
                                longitude = str(values[0][0][1])
                                latitude = str(values[0][0][2])
                                continue
                    except IndexError:
                        print '[Coordinates not found]'
                        continue
                    print

            print latitude, longitude
            try:
                location = geolocator.reverse(latitude + ", " + longitude)
                print(location.address)
            except ValueError:
                print '[Invalid coordinates]'
            print
            kml.newpoint(name=description,
                         coords=[(float(longitude), float(latitude))])
            lst.append({
                'latitude':
                latitude,
                'longitude':
                longitude,
                'name':
                description,
                'url':
                url.encode(encoding='utf-8', errors='replace'),
                'address':
                location.address.encode(encoding='utf-8', errors='replace')
                if location else 'error'
            })

            # this is here because there's a tendency for this script to fail part way through...
            # so at least you can get a partial result
            kml.save("GoogleBookmarks.kml")
            with open('GoogleBookmarks.json', mode='w') as listdump:
                listdump.write(json.dumps(lst))
        sys.stdout.flush()
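# Small self-contained check of the coords_in_url idea above, using a made-up
# maps URL (my own example, not taken from the bookmarks file):
_demo_coords = re.compile('\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')
print _demo_coords.search('https://maps.google.com/maps?q=56.9496, 24.1052').groups()
# -> ('56.9496', '24.1052')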
Beispiel #55
0
    def parse_html(self, html_source):
        import lxml.html as HTML
        root = HTML.document_fromstring(html_source)
        with open('test.html','w') as f:
            f.write(html_source)
        class_name = re.findall(r'<li id="result_\d+".*? class="(.*?)"', html_source)
        if class_name:
            class_name = class_name[0]
        else:
            class_name = "s-result-item  celwidget"
        print (class_name)
        pdivs = root.xpath(".//li[@class='"+class_name+"']")
        print ('len', len(pdivs))
        products = []
        if len(pdivs)==0: 
            return products
        for pdiv in pdivs:
            try:
                product = {}
                ASIN = pdiv.xpath("./@data-asin")
            
                link = pdiv.xpath(".//a[@class='a-link-normal a-text-normal']/@href")
            
                image = pdiv.xpath(".//a[@class='a-link-normal a-text-normal']/img/@src")

                title = pdiv.xpath(".//h2[@class='a-size-base a-color-null s-inline s-access-title a-text-normal']/text()")

                price = pdiv.xpath(".//span[@class='a-color-price']/text()|.//span[@class='a-size-base a-color-price s-price a-text-bold']/text()")

                isfba = len(pdiv.xpath(".//i[@class='a-icon a-icon-prime a-icon-small s-align-text-bottom']"))
                review=pdiv.xpath(".//a[@class='a-size-small a-link-normal a-text-normal']/text()")
                
                try:
                    product['ASIN'] = ASIN[0]
                except:
                    product['ASIN'] = ''
                    
                    
                try:
                    product['link'] = link[0]
                except:
                    product['link'] = ''
                    
                try:
                    product['image'] = image[0]
                except:
                    product['image'] = ''
                try:
                    product['title'] = title[0]
                except:
                    product['title'] = ''               
                
                try:
                    product['price'] = price[0]
                except:
                    product['price'] = '0'
                    
                product['iamge_list'] = []
                


                product['isfba'] = isfba
                # could not find this code (2017-01-17)
                # product_count=MarketProductsCandidates.objects.filter(product_id=ASIN[0],market__market_name="Amazon.com")
                # if product_count.exists():
                #     product['in_db']='True'
                # else:
                #     product['in_db']='False'
                # productas=MarketProductAssignment.objects.filter(product__product_id=ASIN[0],product__market__market_name="Amazon.com")
                # if productas.exists():
                #     product['in_assign']='True'
                # else:
                #     product['in_assign']='False'
                try:  
                    if review[-1]==' ':
                        product['review']=0
                    else:              
                        if len(review)==0:
                            product['review']=0
                        else:
                            product['review']=int(review[-1].replace(',',''))
                except:
                    product['review']=0
                products.append(product)
            except Exception as e:
                print (e, 'location: parse_html')
        return products
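# Hedged usage sketch (not from the original module): feed a saved search
# results page into the parser; `scraper` stands for whatever object this
# method belongs to.
#
#     with open('test.html') as f:
#         for p in scraper.parse_html(f.read()):
#             print(p['ASIN'], p['title'], p['price'])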
Beispiel #56
0
    def get_headers(self, product_ids):
        """
        Get the headers associated to a list of data product IDs

        This method returns a `~astropy.table.Table` where the rows correspond
        to the provided data product IDs, and the columns are from each of
        the Fits headers keywords.

        Note: The additional column ``'DP.ID'`` found in the returned table
        corresponds to the provided data product IDs.

        Parameters
        ----------
        product_ids : either a list of strings or a `~astropy.table.Column`
            List of data product IDs.

        Returns
        -------
        result : `~astropy.table.Table`
            A table where: columns are header keywords, rows are product_ids.

        """
        from lxml import html
        _schema_product_ids = schema.Schema(
            schema.Or(Column, [six.string_types]))
        _schema_product_ids.validate(product_ids)
        # Get all headers
        result = []
        for dp_id in product_ids:
            response = self.request(
                "GET", "http://archive.eso.org/hdr?DpId={0}".format(dp_id))
            root = html.document_fromstring(response.content)
            hdr = root.xpath("//pre")[0].text
            header = {'DP.ID': dp_id}
            for key_value in hdr.split('\n'):
                if "=" in key_value:
                    [key, value] = key_value.split('=', 1)
                    key = key.strip()
                    value = value.split('/', 1)[0].strip()
                    if key[0:7] != "COMMENT":  # drop comments
                        if value == "T":  # Convert boolean T to True
                            value = True
                        elif value == "F":  # Convert boolean F to False
                            value = False
                        # Convert to string, removing quotation marks
                        elif value[0] == "'":
                            value = value[1:-1]
                        elif "." in value:  # Convert to float
                            value = float(value)
                        else:  # Convert to integer
                            value = int(value)
                        header[key] = value
                elif key_value.find("END") == 0:
                    break
            result += [header]
        # Identify all columns
        columns = []
        column_types = []
        for header in result:
            for key in header.keys():
                if key not in columns:
                    columns += [key]
                    column_types += [type(header[key])]
        # Add all missing elements
        for i in range(len(result)):
            for (column, column_type) in zip(columns, column_types):
                if column not in result[i]:
                    result[i][column] = column_type()
        # Return as Table
        return Table(result)
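# Hedged usage sketch (assumes this method is exposed on an instantiated query
# class such as astroquery's Eso; the data product ID below is a made-up
# placeholder, so a real archive ID is needed for a meaningful result):
#
#     from astroquery.eso import Eso
#     headers = Eso().get_headers(['ADP.2020-01-01T00:00:00.000'])
#     print(headers['DP.ID'])          # the IDs that were queried
#     print(sorted(headers.colnames))  # every header keyword found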
Beispiel #57
0
    def perform_url(self, url):
        """
        Perform an article document by designated url

        :param url: web-page url of the document
        """
        self.url = url
        self.title = self.image_url = self.language = self.description = \
            self.clean_html = self.error_msg = self._charset = None

        if not self.url:
            self.error_msg = 'Empty or null URL to perform'
            return

        # get the page (bytecode)
        try:
            web_page = requests.get(self.url, headers=self._headers)

            # perform http status codes
            if web_page.status_code not in [200, 301, 302]:
                self.error_msg = str('HTTP Error. Status: %s' %
                                     web_page.status_code)
                return

            self.url = web_page.url

            raw_html = web_page.content

            self._charset = get_encoding(raw_html)
            raw_html_str = raw_html.decode(self._charset)

            # getting and cleaning the document
            self._source_html = document_fromstring(raw_html_str)
            self._source_html = html_cleaner.clean_html(self._source_html)

            # making links absolute
            self._source_html.make_links_absolute(self.url,
                                                  resolve_base_href=True)

        except (ConnectionError, Timeout, TypeError, Exception) as e:
            self.error_msg = str(e)
        finally:
            if self.error_msg:
                return

        if self._source_html is not None:

            # obtaining title
            self.title = shorten_title(self._source_html)

            # obtaining image url
            self.image_url = get_image_url(self._source_html, self.url)
            if self.image_url is not None:
                image_url_node = "<meta itemprop=\"image\" content=\"%s\">" % self.image_url
                image_url_img = "<img src=\"%s\" />" % self.image_url
            else:
                image_url_node = image_url_img = ""

            # clean html
            self.clean_html = self._article_extractor.get_clean_html(
                source_html=self._source_html)

            # summarized description, requires clean_html
            if self.clean_html:
                self.description, self.language = get_plain_text(
                    etree.XML(self.clean_html), self._summary_sentences_qty)
                description_node = "<meta name=\"description\" content=\"%s\">" if self.description else ""

                # filling the template
                self.clean_html = ARTICLE_TEMPLATE % {
                    'language': self.language,
                    'title': self.title,
                    'image_url_node': image_url_node,
                    'image_url_img': image_url_img,
                    'description_node': description_node,
                    'clean_html': self.clean_html
                }
Beispiel #58
0
    def authenticate(self):
        config = ConfigParser.ConfigParser()
        config.read(TOKENS_FILE)

        if config.has_option("hubic", "refresh_token"):
            oauth_token = self._refresh_access_token()
        else:
            r = requests.get(
                OAUTH_ENDPOINT + 'auth/?client_id={0}&redirect_uri={1}'
                '&scope=credentials.r,account.r&response_type=code&state={2}'.
                format(
                    quote(self._client_id),
                    quote_plus(self._redirect_uri),
                    pyrax.utils.random_ascii()  # csrf ? wut ?..
                ),
                allow_redirects=False)
            if r.status_code != 200:
                raise exc.AuthenticationFailed("Incorrect/unauthorized "
                                               "client_id (%s)" %
                                               str(self._parse_error(r)))

            try:
                from lxml import html as lxml_html
            except ImportError:
                lxml_html = None

            if lxml_html:
                oauth = lxml_html.document_fromstring(
                    r.content).xpath('//input[@name="oauth"]')
                oauth = oauth[0].value if oauth else None
            else:
                oauth = re.search(
                    r'<input\s+[^>]*name=[\'"]?oauth[\'"]?\s+[^>]*value=[\'"]?(\d+)[\'"]?>',
                    r.content)
                oauth = oauth.group(1) if oauth else None

            if not oauth:
                raise exc.AuthenticationFailed(
                    "Unable to get oauth_id from authorization page")

            if self._email is None or self._password is None:
                raise exc.AuthenticationFailed(
                    "Cannot retrieve email and/or password. "
                    "Please run expresslane-hubic-setup.sh")

            r = requests.post(OAUTH_ENDPOINT + 'auth/',
                              data={
                                  'action': 'accepted',
                                  'oauth': oauth,
                                  'login': self._email,
                                  'user_pwd': self._password,
                                  'account': 'r',
                                  'credentials': 'r',
                              },
                              allow_redirects=False)

            try:
                query = urlparse.urlsplit(r.headers['location']).query
                code = dict(urlparse.parse_qsl(query))['code']
            except:
                raise exc.AuthenticationFailed(
                    "Unable to authorize client_id, "
                    "invalid login/password ?")

            oauth_token = self._get_access_token(code)

        if oauth_token['token_type'].lower() != 'bearer':
            raise exc.AuthenticationFailed("Unsupported access token type")

        r = requests.get(
            API_ENDPOINT + 'account/credentials',
            auth=BearerTokenAuth(oauth_token['access_token']),
        )

        swift_token = r.json()
        self.authenticated = True
        self.token = swift_token['token']
        self.expires = swift_token['expires']
        self.services['object_store'] = Service(
            self, {
                'name': 'HubiC',
                'type': 'cloudfiles',
                'endpoints': [{
                    'public_url': swift_token['endpoint']
                }]
            })
        self.username = self.password = None
Beispiel #59
0
    def data_retrieval(self, datasets):
        """
        Retrieve a list of datasets form the ESO archive.

        Parameters
        ----------
        datasets : list of strings
            List of datasets strings to retrieve from the archive.

        Returns
        -------
        files : list of strings
            List of files that have been locally downloaded from the archive.

        """
        from lxml import html
        datasets_to_download = []
        files = []
        # First: Detect datasets already downloaded
        for dataset in datasets:
            local_filename = dataset + ".fits"
            if self.cache_location is not None:
                local_filename = os.path.join(self.cache_location,
                                              local_filename)
            if os.path.exists(local_filename):
                print("Found {0}.fits...".format(dataset))
                files += [local_filename]
            elif os.path.exists(local_filename + ".Z"):
                print("Found {0}.fits.Z...".format(dataset))
                files += [local_filename + ".Z"]
            else:
                datasets_to_download += [dataset]
        # Second: Download the other datasets
        if datasets_to_download:
            data_retrieval_form = self.request(
                "GET",
                "http://archive.eso.org/cms/eso-data/eso-data-direct-retrieval.html"
            )
            print("Staging request...")
            with suspend_cache(self):  # Never cache staging operations
                data_confirmation_form = self._activate_form(
                    data_retrieval_form,
                    form_index=-1,
                    inputs={
                        "list_of_datasets": "\n".join(datasets_to_download)
                    })
                root = html.document_fromstring(data_confirmation_form.content)
                login_button = root.xpath('//input[@value="LOGIN"]')
                if login_button:
                    raise LoginError(
                        "Not logged in.  You must be logged in to download data."
                    )
                # TODO: There may be another screen for Not Authorized; that should be included too
                data_download_form = self._activate_form(
                    data_confirmation_form, form_index=-1)
                root = html.document_fromstring(data_download_form.content)
                state = root.xpath("//span[@id='requestState']")[0].text
                while state != 'COMPLETE':
                    time.sleep(2.0)
                    data_download_form = self.request("GET",
                                                      data_download_form.url)
                    root = html.document_fromstring(data_download_form.content)
                    state = root.xpath("//span[@id='requestState']")[0].text
            print("Downloading files...")
            for fileId in root.xpath("//input[@name='fileId']"):
                fileLink = fileId.attrib['value'].split()[1]
                fileLink = fileLink.replace("/api",
                                            "").replace("https://", "http://")
                filename = self.request("GET", fileLink, save=True)
                files += [system_tools.gunzip(filename)]
        print("Done!")
        return files
def search_data(uid):
    def addMultiValues(dataNode, xpathTuple):
        result = ''
        temp_list = []
        for xpathString in xpathTuple:
            if len(dataNode) and len(dataNode.xpath(xpathString)):
                for node in dataNode.xpath(xpathString):
                    if node.text != "...":
                        temp_list.append(node.text)
        result = ",".join(temp_list)
        return result

    try:
        filmdata = {
            'title': '',
            'countries': '',
            'year': '',
            'directors': '',
            'cast': '',
            'genre': '',
            'duplicate': '',
            'user_rating': '',
            'rating_count': '',
            'movie_rating': '',
            'plot': '',
            'runtime': ''
            #'url' : '',
            #'coverart' : '',
            #'fanart' : ''
        }

        data = get_page("/level/1/film/" + uid, 1)
        doc = html.document_fromstring(data)
        titleNodes = doc.xpath("//h1[@class='moviename-big']")
        if len(titleNodes):
            try:
                filmdata['title'] = titleNodes[0].text.strip()
            except:
                filmdata['title'] = ''
        userRatingNodes = doc.xpath("//div[@id='block_rating']/div/div/a/span")
        if len(userRatingNodes):
            try:
                filmdata['user_rating'] = userRatingNodes[0].text
            except:
                filmdata['user_rating'] = ''
        countRatingNodes = doc.xpath("//span[@class='ratingCount']")
        if len(countRatingNodes):
            try:
                filmdata['rating_count'] = normilize_string(
                    countRatingNodes[0].text)
            except:
                filmdata['rating_count'] = ''
        infoNodes = doc.xpath("//table[@class='info']/*")
        for infoNode in infoNodes:
            dataNodes = infoNode.xpath("td")
            if dataNodes[0].text == u"год":
                try:
                    filmdata['year'] = dataNodes[0].xpath(
                        "//table[@class='info']//td//div//a/text()")[0]
                except:
                    filmdata['year'] = ''
            elif dataNodes[0].text == u"страна":
                try:
                    filmdata['countries'] = addMultiValues(
                        dataNodes[1], ("div/a", "/a"))
                except:
                    filmdata['countries'] = ''
            elif dataNodes[0].text == u"режиссер":
                try:
                    filmdata['directors'] = addMultiValues(
                        dataNodes[1], ("a", "/a"))
                except:
                    filmdata['directors'] = ''
            elif dataNodes[0].text == u"жанр":
                try:
                    film_data = addMultiValues(dataNodes[1], ("span/a", "/a"))
                    filmdata['genre'] = film_data.replace('музыка',
                                                          '').replace(
                                                              'слова', '')
                except:
                    filmdata['genre'] = ''
            elif dataNodes[0].text == u"время":
                try:
                    filmdata['runtime'] = dataNodes[1].text.split()[0]
                except:
                    filmdata['runtime'] = ''
            elif dataNodes[0].text == u"рейтинг MPAA":
                try:
                    filmdata['movie_rating'] = dataNodes[1].xpath(
                        "a")[0].attrib["href"].split("/")[-2]
                except:
                    filmdata['movie_rating'] = ''
        actorNodes = doc.xpath("//div[@id='actorList']/ul")
        if len(actorNodes):
            try:
                filmdata['cast'] = addMultiValues(actorNodes[0], ("a", "li/a"))
            except:
                filmdata['cast'] = ''
        duplicatedNodes = doc.xpath("//div[@id='actorList']/ul")
        if len(duplicatedNodes):
            try:
                duplicated = addMultiValues(duplicatedNodes[1], ("a", "li/a"))
                filmdata['duplicate'] = duplicated.replace(
                    'показать всех', '').replace('»', '')
            except:
                filmdata['duplicate'] = ''
        descNodes = doc.xpath(
            "//div[@class='brand_words film-synopsys' or @class='brand_words']['description']"
        )
        if len(descNodes):
            try:
                filmdata['plot'] = normilize_string(descNodes[0].text)
            except:
                filmdata['plot'] = ''

        #posters = search_poster(uid)
        #if len(posters):
        #    try:
        #        filmdata['coverart'] = posters[0]
        #    except:
        #        filmdata['coverart'] = ''

        #fanarts = search_fanart(uid)
        #if len(fanarts):
        #    try:
        #        filmdata['fanart'] = fanarts[0]
        #    except:
        #        filmdata['fanart'] = ''

        #filmdata['url'] = "http://www.kinopoisk.ru/level/1/film/"+uid


#        print("""\
#            Title:%(title)s
#            Year:%(year)s
#            Director:%(directors)s
#            Plot:%(plot)s
#            UserRating:%(user_rating)s
#            RatingCount:%(rating_count)s
#            Cast:%(cast)s
#            Duplicate:%(duplicate)s
#            Genres:%(genre)s
#            Countries:%(countries)s
#            Runtime:%(runtime)s
#            MovieRating:%(movie_rating)s
#            Coverart:%(coverart)s
#            Fanart:%(fanart)s
#            URL:%(url)s
#    """ % filmdata)

        return filmdata

    except:
        print_exception(traceback.format_exc())
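# Hedged usage sketch (the film id below is a placeholder; get_page and the
# surrounding module configuration are assumed to be in place):
#
#     info = search_data('12345')
#     print '%s (%s) - %s' % (info['title'], info['year'], info['genre'])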