Example #1
def parse_html(html):
    soup = soupparser.fromstring(html)
    body = soup.xpath("body")[0]
    div_wenshu = body.xpath("div[@id='wrap']/div[@id='wenshu']")[0]
    inner_table = div_wenshu.xpath("table[1]/tbody[1]/tr[1]/td[1]/div[@id='ws']/table")[0]


    try:
        first_tr, second_tr, third_tr, fourth_tr = inner_table.xpath("tr")

        idx = html.find("<HTML>")
        html = html[idx:]
        soup = soupparser.fromstring(html)
        contents =  soup.xpath("//text()")[:-50]
        contents = [content.encode('utf-8') for content in contents]
        content = "\n".join(contents).strip().encode('utf-8')

    except:
        first_tr, second_tr, third_tr, fourth_tr = inner_table.xpath("tr")[:4]

        idx = html.find("PrintArea")
        last_idx = html.find("td", idx)
        html = html[idx-74:last_idx+3].encode("utf-8")
        soup = soupparser.fromstring(html)
        contents =  soup.xpath("//text()")
        contents = [content.encode('utf-8').strip() for content in contents if len(content.strip()) != 0]
        content = "\n".join(contents).strip().encode('utf-8')

    title = first_tr.xpath("td/div[@id='wsTitle']/text()")[0].encode('utf-8')
    time = second_tr.xpath("td/div[@id='wsTime']/span/text()")[0].encode('utf-8')[-10:]
    court = content.split('\n')[0].encode('utf-8')
    return title, time, court, content
Example #2
def getElementFromTemplateInHtml(html, knownHtml, knownTextDict):
    res = {}
    for i in knownTextDict.keys():
        # print i
        # print 'finding:',knownTextDict[i]
        res[i] = getElementFromTemplateInHtmlElem(fromstring(html), fromstring(knownHtml), knownTextDict[i])
    return res
Example #3
def get_deviations(args):
	resp = opener.open('http://my.deviantart.com/gallery/?offset=0')
	print "Opened your dA gallery management page"
	got_all_devs=False
	dev_page = soupparser.fromstring(resp.read())
	print dev_page.xpath('//tbody')
	#print dev_page.getchildren('tbody')
	while not got_all_devs:
		for dev in dev_page.xpath('//tbody/tr'):
			dev_link_cell=dev.xpath('./td/a')[0]
			#dev_link_date=dev.xpath('./td')[3]
			#print dev_link_cell.attrib['href']
			#print etree.tostring(dev_link_cell)
			try:
				get_deviation(dev_link_cell.attrib['href'],args)
			except IndexError:
				print ("Probably tried to get a deviation that"
				" didn't have download of the original enabled")
		next_page=dev_page.cssselect('.pagination ul.pages li.next a')
		if not ('href' in next_page[0].attrib.keys()):
			got_all_devs=True
		else:
			print "proceeding to {}".format(next_page[0].attrib['href'])
			resp=opener.open(next_page[0].attrib['href'])
			dev_page = soupparser.fromstring(resp.read())
Example #4
    def test_delete_msgid(self):
        self.browser.go('http://localhost/portal/portal_i18n/manage_messages?lang=de')
        dom = fromstring(re.sub(r'\s+', ' ', self.browser.get_html()))
        results = dom.xpath('//table[@id="message_results"]/tr/td/small/a')
        found = None
        for result in results:
            if result.text == '${cnt} cats':
                found = result
                break
        self.browser.go(found.attrib['href'])

        form = self.browser.get_form('translate_message')
        self.browser.clicked(form,
                  self.browser.get_form_field(form, 'translation:utf8:ustring'))
        self.browser.submit(fieldname='manage_delMessage:method')

        self.browser.go('http://localhost/portal/portal_i18n/manage_messages?lang=de')
        dom = fromstring(re.sub(r'\s+', ' ', self.browser.get_html()))
        results = dom.xpath('//table[@id="message_results"]/tr/td/small/a')
        found = None
        for result in results:
            if result.text == '${cnt} cats':
                found = result
                break
        self.assertTrue(found is None)
Example #5
    def test_all_sorts_result_in_one_arrow_present(self):
        """Assert only one sort class is present in the decision list view"""

        # Assumes CSS will be correctly displaying the sort status
        decision_list = DecisionList()
        sort_options = deepcopy(decision_list.sort_table_headers)
        unsortable_headers = decision_list.unsortable_fields[:]

        for header_list in sort_options.values():
            for header in unsortable_headers:
                index = header_list.index(header)
                header_list.pop(index)

        self.create_decisions_with_different_statuses()

        # Test Ascending Sort
        for page, sort_queries in sort_options.iteritems():
            for sort_query in sort_queries:
                response = self.client.get(reverse('publicweb_item_list', args=[self.bettysorg.slug, page]), {'sort': sort_query})
                html = fromstring(response.content)
                sort_selector = CSSSelector('table.summary-list .sort-asc')
                sorts = sort_selector(html)
                self.assertEquals(len(sorts), 1, 'Number of ascending sort arrows should be 1. But is ' + str(len(sorts))
                                                 + ' for page=' + page + ' sort_query=' + sort_query)
        # Test Descending Sort
        for page, sort_queries in sort_options.iteritems():
            for sort_query in sort_queries:
                response = self.client.get(reverse('publicweb_item_list', args=[self.bettysorg.slug, page]), {'sort': '-' + sort_query})
                html = fromstring(response.content)
                sort_selector = CSSSelector('table.summary-list .sort-desc')
                sorts = sort_selector(html)
                self.assertEquals(len(sorts), 1, 'Number of descending sort arrows should be 1. But is ' + str(len(sorts))
                                                 + ' for page=' + page + ' sort_query=' + sort_query)
Example #6
def getFormatHtml(htmlContent):
    try:
        dom = soupparser.fromstring(htmlContent)
    except Exception, e:
        # fall back to parsing a cleaned copy of the markup
        cleaner = Cleaner()
        htmlContent = cleaner.clean_html(htmlContent)
        dom = soupparser.fromstring(htmlContent)
    return dom
Example #7
    def __get_paper_from_acm (self, entry_url):
        resp_body = self.op.open (entry_url).read ()
        root = sp.fromstring (resp_body)

        divmain = root.xpath ("//div[@id='divmain']")[0]

        title = divmain.xpath ("div/h1/strong")[0].text
        
        # use regex to extract abstract link
        abst_url = re.compile (r"tab.abstract.cfm[^']*").search (resp_body).group (0)
        abst_url = 'http://dl.acm.org/' + abst_url
        abst_body = self.op.open (abst_url).read ()
        
        # extract all text node from this dom tree
        abst = ''.join (sp.fromstring (abst_body).xpath ('//div/p/div/p/descendant-or-self::*/text()'))
        
        # instantiate a Paper class
        paper = Paper (title, abst)

        # locate the author table block
        author_table = divmain.xpath ("table/tr/td/table")[1]
        
        # add each author
        for author_row in author_table.xpath ('tr'):
            name = author_row.xpath ('td/a/text()')[0]
            affn = author_row.xpath ('td/a/small/text()')[0]
            paper.add_author (Author (name, affn))

        return paper
Example #8
    def GetUserInfo(self,html):
        try:
            if '搜索结果为空' in html:  # "search result is empty"
                print (u'weibo user does not exist!')
                return False
            if '您当前访问的用户状态异常' in html:  # "the user you are visiting is in an abnormal state"
                #print (u'weibo user status is abnormal!')
                return False
            html = self.GetHtmlInfo(html, '{\"pid\":\"pl_user_feedList\"')
            root = fromstring(html)
            usersDivise = root.xpath("//div[@class='list_person clearfix']")
            if len(usersDivise) > 0:
                users = []
                for div in usersDivise:
#                     user = dict.fromkeys(sinaSetting.USER_KEY, '')
                    user={}  # define and initialize a dict for this user
                    div = tostring(div , encoding='utf-8')
                    div = fromstring(div)
                    try:
                        iu_node = div.xpath("//div[@class='person_pic']/a/img")[0]
                        user['Imgurl'] = iu_node.get("src")
                        user['nickname'] = div.xpath("//div[@class='person_detail']/p[@class='person_name']")[0].text_content()
                        user['uid'] = iu_node.get("uid")
                        
                        sex_node = div.xpath("//div[@class='person_detail']/p[@class='person_addr']/span[@class='male m_icon']")
                        sex = ''
                        if sex_node:
                            sex = sex_node[0].get('title')
                        user['sex'] = sex
                        addr_node = div.xpath("//div[@class='person_detail']/p[@class='person_addr']")
                        addr = ''
                        if addr_node:
                            addr = addr_node[0].text_content()
                        user['addr'] = addr
                        num_node = div.xpath("//div[@class='person_detail']/p[@class='person_num']")
                        num = ''
                        if num_node:
                            num = num_node[0].text_content()
                        user['num'] = num
                        intro_node = div.xpath("//div[@class='person_detail']/div[@class='person_info']")
                        intro = ''
                        if intro_node:
                            intro = intro_node[0].text_content()
                        user['intro'] = intro
                        users.append(user)
                    except:
                        pass
                self.result['users'] = users
            else:
                return False
        except Exception:
            s=sys.exc_info()
            msg = (u"GetUserInfo Error %s happened on line %d" % (s[1],s[2].tb_lineno))
            loggerSearch.error(msg)
            return False
        return True       
Example #9
 def getUserInfo(self, html):
     try:
         if '搜索结果为空' in html:  # "search result is empty"
             #print (u'weibo user does not exist!')
             return False
         if '您当前访问的用户状态异常' in html:  # "the user you are visiting is in an abnormal state"
             #print (u'weibo user status is abnormal!')
             return False
         html = self.getPanelInfo(html, '{\"pid\":\"pl_user_feedList\"')
         root = fromstring(html)
         user_divs = root.xpath("//div[@class='list_person clearfix']")
         if len(user_divs) > 0:
             users = []
             for div in user_divs:
                 user = {}
                 div = tostring(div , encoding='utf-8')
                 div = fromstring(div)
                 try:
                     iu_node = div.xpath("//div[@class='person_pic']/a/img")[0]
                     user['iu'] = iu_node.get("src")
                     user['sn'] = div.xpath("//div[@class='person_detail']/p[@class='person_name']")[0].text_content()
                     user['uid'] = iu_node.get("uid")
                     
                     sx_node = div.xpath("//div[@class='person_detail']/p[@class='person_addr']/span[@class='male m_icon']")
                     sx = ''
                     if sx_node:
                         sx = sx_node[0].get('title')
                     user['sx'] = sx
                     ad_node = div.xpath("//div[@class='person_detail']/p[@class='person_addr']")
                     ad = ''
                     if ad_node:
                         ad = ad_node[0].text_content()
                     user['ad'] = ad
                     num_node = div.xpath("//div[@class='person_detail']/p[@class='person_num']")
                     num = ''
                     if num_node:
                         num = num_node[0].text_content()
                     user['num'] = num
                     de_node = div.xpath("//div[@class='person_detail']/div[@class='person_info']")
                     de = ''
                     if de_node:
                         de = de_node[0].text_content()
                     user['de'] = de
                     users.append(user)
                 except:
                     pass
             self.result['users'] = users
         else:
             return False
     except Exception:
         s=sys.exc_info()
         msg = (u"getUserMsgInfo Error %s happened on line %d" % (s[1],s[2].tb_lineno))
         logger.error(msg)
         return False
     return True
Example #10
def main(argv=None):
    if argv is None:
        argv = sys.argv
        # print argv
    if len(argv) < 2:
        pass
    f1 = open("50dbb0570e45771ca4d7c2204cd2649f")
    f2 = open("38d54af2d8f7d8628acfae40933675a1")
    d1 = f1.read()
    d2 = f2.read()
    print getElementFromTemplateInHtmlElem(
        fromstring(d2), fromstring(d1), "Alarms are wrongly mapped or reported in BTS when using CTU2D"
    )
    print getElementFromTemplateInHtml(d2, d1, {"res": "Alarms are wrongly mapped or reported in BTS when using CTU2D"})
Example #11
def scrape(resource, **args):
    session = login()

    if resource == 'uid':
        # we effectively only need the first user, so don't scrape all pages
        search = session.get(
            'http://www.erepublik.com/en/main/search/%s/' %
                args['query'].replace(' ', '_')
        )
        doc = fromstring(search.text)
        uid = doc.xpath('//div[@class="nameholder"]/a/@href')[0].split('/')[-1].strip()
        return uid
    elif resource == 'citizen.profile':
        profile = session.get(
            'http://www.erepublik.com/en/citizen/profile/%s' % args['citizenId']
        )
        doc = fromstring(profile.text)

        citizen_state = doc.xpath('//div[@class="citizen_state"]/div[@class="is"]/span/img/@src')
        is_dead = citizen_state and 'dead_citizen' in citizen_state[0]

        profile = {
            'general': {
                'avatar': doc.xpath('//img[@class="citizen_avatar"]/@style')[0].split('(')[1].split(')')[0],
                'level': doc.xpath('//*[@class="citizen_level"]')[0].text,
                'experience_points': doc.xpath('//*[@class="citizen_experience"]/div/p')[0].text.split(' / ')[0].replace(',', ''),
                'name': doc.xpath('//*[@class="citizen_profile_header auth"]/h2')[0].text_content().strip(),
                'is_alive': str(int(not is_dead)),
                'birthDay': doc.xpath('//div[@class="citizen_second"]/p')[1].text.strip(),
                'nationalRank': doc.xpath('//div[@class="citizen_second"]/small/strong')[0].text,

            },
            'location': {
                'citizenship_country_initials': doc.xpath('//div[contains(@class, "citizen_info")]/a/@href')[2].split('/')[-1],
                'residence_country_name': doc.xpath('//div[contains(@class, "citizen_info")]/a/@title')[0],
                'residence_region_name': doc.xpath('//div[contains(@class, "citizen_info")]/a/@title')[1],
            },
            'party': {
                'name': doc.xpath('//div[@class="citizen_activity"]/div/div/span/a')[0].text.strip(),
            },
            'militaryUnit': {
                'id': doc.xpath('//div[@class="citizen_activity"]/div/div/a/@href')[0].split('/')[-1],
                'name': doc.xpath('//div[@class="citizen_activity"]/div/div/a/span')[0].text.strip(),
            },
            'militaryAttributes': {
                'strength': doc.xpath('//div[@class="citizen_military"]/h4')[0].text.replace(',', '').strip(),
                'rank_points': doc.xpath('//div[@class="stat"]/small/strong')[1].text.split(' / ')[0].replace(',', ''),
            },
        }
        return profile
Example #12
    def test_lang_in_path(self):
        self.browser_do_login('admin', '')
        self.portal.gl_add_site_language('es', 'Spanish')
        import transaction; transaction.commit()

        self.browser.go('http://localhost/portal/de')
        doc = fromstring(re.sub(r'\s+', ' ', self.browser.get_html()))
        self.assertEqual(doc.xpath('//div[@id="middle_port"]/h1')[0].text,
                         'Error page')

        self.browser.go('http://localhost/portal/es')
        doc = fromstring(re.sub(r'\s+', ' ', self.browser.get_html()))
        self.assertTrue(doc.attrib.has_key('lang'))
        self.assertEqual(doc.attrib['lang'], 'es')
Example #13
    def scrape(self):
        self.log('starting %s', self.__class__.__name__)
        self.open()

        root = soupparser.fromstring(self.body())

        # get total available books
        totals = TotalAvailableBooksParser()
        total_books = totals.get_value(root)
        total_books_str = str(total_books)

        if not total_books:
            raise AssertionError('Could not parse for total books in %s' % self._url)

        self.log('total books = %d', total_books)

        perpage_inputs = root.cssselect('input[name^="%s"]' % self.perpage_input_name)

        if len(perpage_inputs):
            form = {}

            for ppi in perpage_inputs:
                name = ppi.attrib['name']
                try:
                    value = int(ppi.attrib['value'])
                except (TypeError, ValueError):
                    continue
                if value < total_books:
                    form[name] = total_books_str

            # if a hidden 'per page' input is changed this means there are
            # more than 1 page of results, otherwise all available books are
            # already in this 1 initial page
            if form:
                # load all books and reparse response
                self.submit(form)
                root = soupparser.fromstring(self.body())

        self.log('scraping for book prices')

        books = []
        for cls in self.parser_classes:
            self.log('... using %s', cls.__name__)
            parser = cls()
            pbooks = [self.tag(b) for b in parser.parse(root)]
            books.extend(pbooks)
            self.log('... found %d books (total = %d)', len(pbooks), len(books))
        return books
Example #14
 def downloadlz(self, type='txt', start=1, end=5):
     """
     type: txt or photo
     Downloads the first 5 pages of content by default.
     :param :
     :return:
     """
     self.__mkdirfile(self.tienum)
     num = int(self.getnum())
     print u'The thread starter has posted on %d pages in total!' % num
     if start > end:
         print u"The start page is greater than the end page, please check the arguments!\n"
         sys.exit()
     elif start > num:
         print u"The start page exceeds the last page! This thread has %d pages in total\n" % num
         sys.exit()
     num = num if num < end else end
     for i in xrange(start - 1, num):
         soup = soupparser.fromstring(requests.get(self.url + str(i + 1), verify=False).content)
         if type == "txt":
             self.__get_lz_txt(i + 1, soup)
         elif type == "photo":
             self.__get_lz_jpg(i + 1, soup)
         else:
             print u"Invalid argument: only 'txt' or 'photo' is accepted"
Example #15
 def parseRefeed(self, node):
     node = fromstring(tostring(node))
     #ui
     userNode = node.xpath(self.config.get("RT_USER_XPATH"))
     if userNode:
         userNode = userNode[0]
         ui = userNode.get("usercard", "").replace("id=", "")
         sn = userNode.get("nick-name", " ")
         un = userNode.get("href", "").replace("/", "")
     else:
         return {}
     rtmsg = self.parseCommon(node, "rtmsg")
     if rtmsg:
         rtmsg[COLUMN_USERID] = ui
         rtmsg[COLUMN_USERNAME] = un
         rtmsg[COLUMN_SCRENNAME] = sn
         # URL of the reposted message
         muNode = node.xpath("//div/div/div/div[@class='WB_from']/a[@title]")
         mu = ""
         mid = ""
         if muNode:
             mu = muNode[0].get("href", "").split("/")[-1]
             mid = sinaWburl2ID(mu)
         rtmsg[COLUMN_ID] = mid
         rtmsg[COLUMN_MSGURL] = mu
     return rtmsg
Example #16
    def test_dork_links(self):
        """Objective: Test if a random link from the dork page exists in the database.
        Input: A random link from a created dork page.
        Expected Results: The path of the link should be at least once in the db.
        Notes: Links have the parameters truncated, so multiple entries are likely."""

        pages_dir = tempfile.mkdtemp()
        try:
            (db, engine, dork_generator) = self.dork_generator_chain('sql', pages_dir)
            dork_generator.regular_generate_dork(0)
            sample_file = choice(dork_generator.get_current_pages())
            print "Randomly selected dork page:", sample_file.rsplit('/', 1)[1]
            with open(sample_file, 'r') as sample_data:
                data = fromstring(sample_data)
            links = data.cssselect('a')
            test_link_path = choice(links).get('href')
            print "Randomly selected path:", test_link_path
            from_livedb = db.select_entry(test_link_path)
            #the test database has fewer than 100 entries, so it will be seeded from the dorkdb
            from_dorkdb = db.get_dork_list('inurl', starts_with=test_link_path)
            result_count = len(from_livedb) + len(from_dorkdb)
            print "Done searching for the entry."
            self.assertTrue(result_count > 0)
            print "The dork db returned:",
            print "{0} entries,".format(result_count),
            print "which meets our expectation."
        finally:
            if os.path.isdir(pages_dir):
                shutil.rmtree(pages_dir)
Example #17
 def assertTitle(self, url_lang_sufix, value):
     #title = lambda x: x.xpath('//span[@class="page_title"]')[0].text or ''
     title = lambda x: CSSSelector("span.page_title")(x)[0].text_content()
     url = 'http://localhost/portal/'
     self.browser.go(url + url_lang_sufix)
     doc = fromstring(re.sub(r'\s+', ' ', self.browser.get_html()))
     self.assertEqual(title(doc), value)
Example #18
    def test_dork_page_content(self):
        """Objective: Testing the attack surfaces content.
        Input: An attack surface sample. The structure is defined in a template.
        Expected Results: The attack surface should be a HTML page containing text and links.
        Notes: We extract and count the elements in the HTML document."""

        pages_dir = tempfile.mkdtemp()

        try:
            dork_generator = self.dork_generator_chain('sql', pages_dir)[2]
            dork_generator.regular_generate_dork(0)

            sample_file = choice(dork_generator.get_current_pages())
            with open(sample_file, 'r') as sample_data:
                data = fromstring(sample_data)
            self.assertTrue(len(data.cssselect('a')) > 0)
            self.assertTrue(len(data.cssselect('title')) > 0)
            self.assertTrue(len(data.cssselect('form')) > 0)
            print "The content analysis of a random HTML page returned:"
            print len(data.cssselect('a')), 'links (<a href=""></a>)',
            print len(data.cssselect('title')), 'page title (<title />)',
            print len(data.cssselect('form')), 'form field (<form />)'
            print "which meets our expectation."
        finally:
            if os.path.isdir(pages_dir):
                shutil.rmtree(pages_dir)
Example #19
    def import_day(self, day, month, year):
        tree = fromstring(self._get_html(self._url(day, month, year)))

        titlenodes = tree.xpath("//td[@class='size2']/font[@color='#CD076A']/b")


        for titlenode in titlenodes:
            event = Event()            
            event.name = titlenode.text_content()
            time = titlenode.xpath("./ancestor::table/parent::td/table[2]/tr/td[2]/font[1]/text()")[0]
            event.date_start = datetime.strptime(time, "%H:%M Uhr")
            event.date_start = event.date_start.replace(year=year, day=day, month=month)
            venue = titlenode.xpath("./ancestor::table/parent::td/table[2]/tr[2]/td[2]/font/descendant::text()")[0]
            address = titlenode.xpath("./ancestor::table/parent::td/table[2]/tr[2]/td[2]/text()[preceding-sibling::br]")[0].strip()
            p = re.search(".*[0-9]{5} (.*)$", address)
            city = p.group(1)

            geodata = GooglePlacesLookup.find_geo_data_for_venue(venue, city)
            venue  = geodata['name']
            lat = geodata['lat']
            lon = geodata['lon']
            location, created = Location.objects.get_or_create(name=venue, city=city, latitude=lat, longitude=lon)

            event.location = location
            if not self.is_duplicate_event(event):
                event.save()
Example #20
def ElementFromFile(path, use_html_parser=False):
  """
    Creates a new Element object from an XML file at the given path.
    
    @param path: The path to the XML file
    @type path: string
    @return: Element
  """
  #text = open(path)
  
  tfile = open(path)
  text = tfile.read()
  tfile.close()
  
  if text is not None:
    if use_html_parser:
      try:
        root = html.fromstring(text)
        test = html.tostring(root, encoding=unicode)
        return root
      except:
        return fromstring(text)
    else:
      return etree.fromstring(text)
  else:
    return None
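
A hedged usage sketch for the helper above ('feed.xml' and 'page.html' are placeholder paths):

root = ElementFromFile('feed.xml')                           # strict XML parsing
page = ElementFromFile('page.html', use_html_parser=True)    # lenient HTML parsing with fallback
if page is not None:
    print(page.tag)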
Example #21
    def __get_paperentry_from_acm (self, title, authors):
        QUERY_URL = 'http://dl.acm.org/advsearch.cfm'

        self.br.open (self.__wrapper (QUERY_URL))
        
        # this query form does not have id attribute, nr=0 means the first form
        self.br.select_form (nr=0)
   
        # termzone is a multi choice dropdown menu, the value should be a list
        self.br.form['termzone'] = ['Title']
        self.br.form['allofem'] = title
        self.br.form['peoplezone'] = ['Author']
        self.br.form['people'] = authors 
        self.br.form['peoplehow'] = ['and']
        resp_body = self.__deljs_html (self.br.submit ().read ())

        # check if the rearch result is not empty
        if resp_body.find ('was not found') == -1:
            root = sp.fromstring (resp_body)

            # select the first entry in search result
            entry_url = root.xpath ("//a[@class='medium-text' and @target='_self' and \
                starts-with (@href, 'citation.cfm')]/@href")[0]
            return 'http://dl.acm.org/' + entry_url
        else:
            return ""
Example #22
    def initializeUi(self):
        # search results
        html = str(self.html)
        systems = []
        names = []
        urls = []
        doc = fromstring(html)
        el = doc.xpath("//table[@class='results']")
        for table in el:
            rows = table.getchildren()[2:]
            for row in rows:
                system = row.getchildren()[0].text.strip()
                if system == "":
                    system = systems[-1]
                systems.append(system)
                names.append(row.getchildren()[1].findtext("a"))
                urls.append(GAMEFAQS_URL + row.getchildren()[1].getchildren()[0].attrib["href"])

        # Displaying search results
        model = QStandardItemModel()
        if len(systems) > 0:
            for i in range(0, len(systems)):
                item = QStandardItem("(" + systems[i] + ") " + names[i])
                item.setFlags(Qt.ItemIsUserCheckable | Qt.ItemIsEnabled)
                item.setData(Qt.Unchecked, Qt.CheckStateRole)
                model.appendRow(item)
            model.itemChanged.connect(self.on_item_changed)
        else:
            item = QStandardItem("No game was found")
            model.appendRow(item)
        self.ui.listViewGames.setModel(model)

        self.urls = urls
        self.checked = 0
        self.ui.pushButtonOk.setEnabled(False)
Example #23
def cleaner(infile='', outfile='', soup=True):
    '''clean html'''
    fl = open(infile)
    doc = fl.read()
    fl.close()
    for element in ['span', 'div', 'font', 'u']:
        doc = doc.replace("%s>" % element, "%s> " % element)
    if soup:
        doc = soupparser.fromstring(doc)
    else: #fallback
        doc = html.fromstring(doc)

    safe_attrs = clean.defs.safe_attrs
    clean.defs.safe_attrs = frozenset(['href', 'alt', 'id', 'src', 'width', 'height'])
    c = clean.Cleaner(scripts=True, 
                embedded=True, 
                meta=True, 
                style=True, 
                remove_tags = ['span', 'div', 'font', 'u'],
                safe_attrs_only=True)
    c.safe_attrs=frozenset(['href', 'alt', 'id', 'src', 'width', 'height']) #this seems to work no it doesnt
    d2 = c.clean_html(doc)
    #ps = 
    #for p in ps:
    #	if p.find('a'):#
    #		if p.find('a').find('img'):
    #			print ok
    d2 = squash_paragraph_attributes(d2)    
    flout = open(outfile, 'wb')
    flout.write(etree.tostring(d2, method="html", encoding='utf-8'))
    flout.close()
Example #24
    def __get_author_from_ms (self, entry_url):
        author_id = entry_url.split ('/')[-2]
        entry_url = "http://academic.research.microsoft.com/io.ashx?authorID=%s" % author_id
        resp_body = self.op.open (entry_url).read ()
        json_obj = json.loads (resp_body)
        
        name = json_obj['DisplayName']
        if json_obj['Affiliation']:
            affn = json_obj['Affiliation']['FullName']
        else:
            affn = ''
        print "Finished author"
        return Author (name, affn) 


        # OLD ONE: FETCH OFFICIAL AUTHOR PAGE
        root = sp.fromstring (resp_body)

        name = root.xpath ("//span[@id='ctl00_MainContent_AuthorItem_authorName']")[0].text
        aff_nodes = root.xpath ("//a[@id='ctl00_MainContent_AuthorItem_affiliation']")
        
        # make sure the author page has affiliation 
        if len (aff_nodes) > 0:
            affn = aff_nodes[0].text
        else:
            affn = ""

        return Author (name, affn)
Example #25
def extend_crumbs(crumbs_html, editor_url, extra_crumbs):
    from lxml.html.soupparser import fromstring
    from lxml.html import tostring
    from lxml.builder import E

    crumbs = fromstring(crumbs_html).find('div[@class="breadcrumbtrail"]')

    roles_div = crumbs.find('div[@class="breadcrumbitemlast"]')
    roles_div.attrib["class"] = "breadcrumbitem"
    roles_link = E.a(roles_div.text, href=editor_url)
    roles_div.text = ""
    roles_div.append(roles_link)

    for title, href in extra_crumbs:
        a = E.a(title, {"href": href})
        div = E.div(a, {"class": "breadcrumbitem"})
        crumbs.append(div)

    last_crumb = crumbs.xpath('div[@class="breadcrumbitem"]')[-1]
    last_crumb_text = last_crumb.find("a").text
    last_crumb.clear()
    last_crumb.attrib["class"] = "breadcrumbitemlast"
    last_crumb.text = last_crumb_text

    return tostring(crumbs)
Example #26
def getRandoAtUrl(url):
    print "syncing " + url
    response = urllib2.urlopen(url)
    html = response.read()
    root = fromstring(html)
    content = root.xpath("//td[@class='col_content']")[0]
    name = content.xpath("string(span[@class='rando_titre']/text())")
    start = content.xpath("string(span[@class='rando_depart']/text())").replace(u"Au d\u00E9part de","")
    description = content.xpath("string(p[@class='rando_description']/text())")
    if description=="":
        description = content.xpath("string(span[@class='rando_itineraire']/text())") 
    itinerary = content.xpath("string(span[@class='rando_itineraire']/text())")
    propertiesTable = root.xpath(u"//th[starts-with(.,'Caract\u00E9ristiques')]/../following-sibling::tr/td/table")[0]
    props = propertiesTable.xpath(".//tr/td[2]")
    place = props[0].xpath("string(.)")
    placeInfo = props[1].xpath("string(.)")
    startAltitude = props[2].xpath("string(.)").replace(u"Alt. au d\u00E9p.","")
    rise = props[3].xpath("string(.)").replace(u"Mont\u00E9e","")
    descent = props[4].xpath("string(.)").replace("Descente","")
    duration = props[5].xpath("string(.)").replace(u"Dur\u00E9e","")
    difficulty = props[6].xpath("string(.)").replace(u"Difficult\u00E9e","")
    bestPeriod = props[7].xpath("string(.)").replace(u"P\u00E9riode conseill\u00E9e","")
    howToGetThere = root.xpath(u"string(//th[starts-with(.,'Acc\u00E8s Routier')]/../following-sibling::tr/td[@class='module_texte']/text())")

    rando = {"url":url, "name":name, "start":start, "description":description, "itinerary":itinerary, "place":place, "placeinfo":placeInfo, "startaltitude":startAltitude, "rise":rise, "descent":descent, "duration":duration, "difficulty":difficulty, "bestperiod":bestPeriod, "howtogetthere":howToGetThere}  
    return rando
Example #27
def to_doc(text, parser=scraper.LXML_HTML, whole_doc=True):
    """Parse an HTML text. Return value: lxml.html.HtmlElement document.
    
    parser: which parser to use. 
    whole_doc: parse to complete HTML document (with <html> around), or parse just a fragment of HTML."""
    doc = None
    
    if parser == scraper.LXML_HTML:
        if whole_doc:
            doc = html.document_fromstring(text)
        else:
            doc = html.fromstring(text)
    elif parser == scraper.HTML5PARSER:
        # html5parser was broken for me, bug report is here: https://bugs.launchpad.net/lxml/+bug/780642
        #if whole_doc:
        #    doc = html5parser.document_fromstring(text)
        #else:
        #    doc = html5parser.fromstring(text)
        # Here is my workaround:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
        etree_doc = parser.parse(text)  # returns an ElementTree
        doc = html.document_fromstring(elementtree_to_string(etree_doc))
        # ^ this double conversion makes it slow ^
    elif parser == scraper.BEAUTIFULSOUP:
        # soupparser has no document_fromstring method
        doc = soupparser.fromstring(text)
    else:
        print >>sys.stderr, "Warning: you want to use an unknown parser in lx.py."
        # doc is None
        
    return doc  # lxml.html.HtmlElement
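
A minimal usage sketch, assuming the scraper module constants referenced above are importable:

fragment = '<p>Hello <b>world</b></p>'
doc = to_doc(fragment, parser=scraper.LXML_HTML, whole_doc=False)   # parse just the fragment
soup_doc = to_doc(fragment, parser=scraper.BEAUTIFULSOUP)           # lenient BeautifulSoup-backed parse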
Example #28
 def load_from_sitemap(self, url):
     try:
         res = urlopen(url)
     except ValueError:
         logging.warning('Sitemap URL is not valid')
         return
     content = fromstring(res.read())
     self.urls.extend(content.xpath('//loc/text()'))
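
The same pattern as a standalone sketch (placeholder URL; fromstring is assumed here to be lxml.html.soupparser.fromstring, as elsewhere in these examples):

from urllib2 import urlopen                      # urllib.request.urlopen on Python 3
from lxml.html.soupparser import fromstring

tree = fromstring(urlopen('http://example.com/sitemap.xml').read())
page_urls = tree.xpath('//loc/text()')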
Example #29
def prev_and_next_ep(url):
    """Next episode, followed by previous episode."""
    parser = html5lib.HTMLParser()
    tag_soup = urllib2.urlopen(url).read()
    root = fromstring(tag_soup)
    string = tostring(root, pretty_print=True)
    soup = BeautifulSoup(string)
    div = soup.findAll("div", "grid_7_5 box margin_top_bottom")
    lst = []
    prev_and_next = []
    next = ""
    prev = ""
    for item in div:
        x = item.find("span", "content_title")
        if x:
            if x.find(text=True) == "Episode Info":
                lst.append(item)
    y = lst[0].findAll("h2")
    for item in y:
        prev_and_next.append((item.findAll(text=True)))
    for item in prev_and_next[0][1:]:
        next += str(item)
    for item in prev_and_next[1][1:]:
        prev += str(item)
    return (next, prev)
Example #30
 def parseFeedlist(self, html):
     feedDoc = fromstring(html)
     self.config = self.xpathconfig.getIndexConfig('v1')
     nodeLst = feedDoc.xpath(self.config.get("USER_FEEDLIST_XPATH"))
     moreNode = feedDoc.xpath(self.config.get("MORE_FEEDLIST_XPATH"))
     feedmsgLst = []
     hasMore = 0
     max_id = ""
     for node in nodeLst:
         try:
             msg,rtmsg = self.parseFeed(node)
             if msg:
                 max_id = msg.get("mid")
                 feedmsgLst.append(msg)
             if rtmsg:
                 feedmsgLst.append(rtmsg)
         except:
             #s=sys.exc_info()
             #msg = (u"parseFeedlist Error %s happened on line %d" % (s[1],s[2].tb_lineno))
             #logger.error(msg)
             continue
     if moreNode:
         # there are more items to parse
         hasMore = 1
     return hasMore,feedmsgLst,max_id
Example #31
def from_chocolatey():
    root = fromstring(
        requests.get(
            'https://chocolatey.org/packages/vmwareworkstation').content)
    trs = root.findall('.//tr')
    p_version = re.compile('(?P<version>\d{1,2}\..*)', re.IGNORECASE)

    for entry in trs:
        date = entry.xpath('string(td[3])').strip()
        release = entry.xpath('string(td[1]/a|td[1]/span)')

        version_entry = p_version.search(release)
        if version_entry and date:
            release = version_entry.group('version')

            workstation = {}
            format_str = "%A, %B %d, %Y"
            datetime_obj = datetime.datetime.strptime(date, format_str)
            workstation['date'] = datetime_obj.date().isoformat()

            yield release, workstation
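
A small driver sketch for consuming the generator above (printing is illustrative only):

for release, info in from_chocolatey():
    print('%s released on %s' % (release, info['date']))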
Example #32
def from_clamav():
    urls = ['https://www.clamav.net/downloads', 'https://www.clamav.net/previous_stable_releases']
    for url in urls:
        root = fromstring(requests.get(url).content)
        trs = root.findall('.//tr')
        p_version = re.compile('clamav-(?P<version>\d{1,2}\..*)\.tar\.gz$', re.IGNORECASE)
        
        for entry in trs:
            date = entry.xpath('string(td[2])').strip()
            release = entry.xpath('string(td[1])').strip()
            
            version_entry = p_version.search(release)
            if version_entry and date:
                release = version_entry.group('version')
                
                clamav = {}
                format_str = "%Y-%m-%d %H:%M:%S UTC"
                datetime_obj = datetime.datetime.strptime(date, format_str)
                clamav['date'] = datetime_obj.date().isoformat()
            
                yield release, clamav
Example #33
 def fetch_xici(self):
     '''
     www.xicidaili.com/nn/ first page
     '''
     self.opener = urllib2.build_opener()
     ua = [('User-Agent', random.choice(settings.USER_AGENTS))]
     self.opener.addheaders = ua
     try:
         page = self.opener.open(self.websites_url['xici']).read()
         doc = soupparser.fromstring(page)
         proxy_sels = doc.xpath('//table[@id="ip_list"]//tr')
         for proxy_sel in proxy_sels[1:-1]:
             params = proxy_sel.xpath('./td')
             #                 print params
             proxy = dict()
             proxy['ip'] = params[2].xpath('./text()')[0]
             proxy['port'] = params[3].xpath('./text()')[0]
             proxy['type'] = params[6].xpath('./text()')[0]
             self.proxies.append(proxy)
     except urllib2.URLError as e:
         print e
Example #34
	def download_one(self,problem_name):
		try:
			print 'downloading',problem_name
			if os.path.isfile(problem_name):
				print 'exist,skip'
				return
			self.update_referer(self.PROBLEM_URL)
			print self.SUBMISSION_URL%problem_name
			submission_page = self.opener.open(self.SUBMISSION_URL%problem_name).read()
			results = soupparser.fromstring(submission_page).xpath(self.SUBMISSION_XPATH)
			if not results:
				print 'accepted submission cannot be found on the first page'
				return
			self.update_referer(self.SUBMISSION_URL%problem_name)
			detail_page = self.opener.open(self.SITE_URL+results[0]).read()
			match_results = re.search(self.CPP_REGEX,detail_page)
			code = match_results.group(1).decode('unicode-escape')
			with open(self.folder+'/'+problem_name+'.cpp','w') as w:
				w.write(code)
		except:
			# ignore problems whose submissions cannot be fetched
			pass
Example #35
def get_rates(country, date=None):
    """Retrieve the VAT rates for the specified country.  Returns a
       Rates object on success, or in case of error raises an exception."""

    if date is None:
        date = datetime.date.today()

    req = urllib.request.Request(
        url=TIC_VATRATESEARCH,
        headers={'Content-Type': 'application/x-www-form-urlencoded'})
    req.method = 'POST'
    req.data = urllib.parse.urlencode([('listOfMsa', msa_map[country]),
                                       ('listOfTypes', 'Standard'),
                                       ('listOfTypes', 'Reduced'),
                                       ('listOfTypes', 'Category'),
                                       ('dateFilter', format_date(date))])

    f = urllib.request.urlopen(req)

    status = f.getcode()

    if status != 200:
        raise TICHTTPException(status, f.info(), f.read())

    body = f.read()

    xml = soupparser.fromstring(body)

    row = xml.find('.//div[@id="national"]/table/tbody/tr')
    std_rate = ''.join(row[1].itertext()).strip()

    m = _percent_re.match(std_rate)

    if not m:
        raise TICException("didn't understand rate %s" % std_rate)

    rate = Rate(D(m.group(1)), date)
    rates = Rates({'Standard': rate}, {}, {})

    return rates
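
A hedged usage sketch; 'IE' is assumed here to be a key of the msa_map table referenced above:

import datetime

rates = get_rates('IE', date=datetime.date(2021, 7, 1))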
Example #36
def from_apache():
    root = fromstring(
        requests.get('https://archive.apache.org/dist/httpd/').content)
    trs = root.xpath(
        './/a[starts-with(@href, "apache_") or starts-with(@href, "httpd-")]')

    p_version = re.compile(
        r'(apache_|httpd-)(?P<version>\d\.\d.\d{1,2})\.[^\d]*', re.IGNORECASE)
    for entry in trs:
        release = entry.text
        date = entry.tail.strip().rsplit(' ', 1)[0].strip()

        version_entry = p_version.search(release)
        if version_entry and date:
            release = version_entry.group('version')

            apache = {}
            format_str = "%Y-%m-%d %H:%M"
            datetime_obj = datetime.datetime.strptime(date, format_str)
            apache['date'] = datetime_obj.date().isoformat()

            yield release, apache
Example #37
 def __get_msg_reply(self, url):
     """
     Collect the parameters needed for the POST request, as observed in the browser.
     The only uncertain one is mouse_pwd; in my tests this value was enough to post replies.
     If posting fails, adjust it according to the parameters captured by your own browser.
     :param url:
     :return:
     """
     dictory = {}
     text = requests.get(url=url, allow_redirects=False, verify=False).content
     text2 = requests.get(url="http://tieba.baidu.com/f/user/sign_list?t=" + str(int(time.time() * 10000)),
                          allow_redirects=False, verify=False).content
     soup = soupparser.fromstring(text)
     msg = soup.xpath(".//*[@type='hidden']")[0]
     dictory['kw'] = msg.attrib['value']
     dictory['floor_num'] = re.findall("reply_num:([0-9]*),", text)[0]
     dictory['tid'] = re.findall("thread_id:([0-9]*),", text)[0]
     dictory['fid'] = re.findall('"forum_id":([0-9]*),', text)[0]
     dictory['tbs'] = re.findall('"tbs": "([\w]*)",', text)[0]
     dictory["sign_id"] = json.loads(text2.decode("gbk"))['data']['used_id']
     dictory["mouse_pwd_t"] = int(time.time())
     return dictory
Example #38
def highlightcallback(code):
    try:
        lexer = get_lexer_by_name(code.attrib['lang'])
    except Exception:
        lexer = guess_lexer(etree.tostring(code))
    output = code.text_content(
    )  # same as `etree.tostring(code, method='text')` afaict
    output = highlight(output, lexer, HtmlFormatter())
    # NOTE: emitting the styles like this doesn't feel right
    # if you have multiple entries with source code -> redundant style tags
    # plus, all this style info doesn't really belong in the html
    output = '<style>' + HtmlFormatter().get_style_defs(
        '.highlight') + '</style>' + output
    # newElement has html tags around the actual content!
    newElement = fromstring(output)
    # lxml insists on wrapping with <html>..</html> tags, so the page source would look like:
    # <code><html><style...
    # the easiest fix is to change the html tag to a div: we get rid of the html tag mid-document,
    # and a wrapping div tag is harmless.
    newElement.tag = 'div'
    code.clear()
    code.append(newElement)
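
A hedged usage sketch: the HTML snippet is made up, and fromstring/etree are assumed to be the lxml imports used by the function above:

post = fromstring('<div><code lang="python">print("hi")</code></div>')
for code in post.iter('code'):
    highlightcallback(code)
rendered = etree.tostring(post)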
Example #39
 def get_content(self, url):
     rt_result = []
     dr = re.compile(r'<[^>]+>', re.S)
     html = urllib.urlopen(url).read()
     cur_title = Document(html).short_title().replace(' ', '')
     readable_article = Document(html).summary()
     print readable_article.encode('utf8')
     readable_article = readable_article.replace('&#13;', '')
     cur_list = readable_article.replace('</p>', '\n').split('\n')
     for item in cur_list:
         if '<img' in item and 'src=' in item:
             #print item.split('src=')[1].split('"')[1]
             dom = soupparser.fromstring(item)
             if len(dom) > 0:
                 img_path = dom[0].xpath('.//img')
                 for img in img_path:
                     rt_result.append(['0', img.get('src')])
         else:
             use_item = dr.sub('', item).replace(' ', '')
             if len(use_item) > 10:
                 rt_result.append(['1', use_item])
     return cur_title, rt_result
Example #40
    def open_xml(self):
        """
        Opens the XML file and reads the raw string, using soupparser to handle encoding.

        Removes some text that has no relevance for XML files, such as HTML tags, LaTeX entities

        Note that soupparser.fromstring is called by both open_xml and parse_xml.
        open_xml uses soupparser.fromstring because the html and LaTeX cleanup needs a string, not a parse tree
        
        :return: semi-parsed XML content
        """
        raw_xml = None

        try:
            logger.debug('Opening the file: {0}'.format(self.file_input))

            with open(self.file_input, 'rb') as fp:
                raw_xml = fp.read()

            # use soupparser to properly encode file contents
            #  it could be utf-8, iso-8859, etc.
            parsed_content = soupparser.fromstring(raw_xml)
            # convert to string for ease of clean-up, convert html and LaTeX entities
            raw_xml = tostring(parsed_content)
            raw_xml = re.sub('(<!-- body|endbody -->)', '', raw_xml)
            raw_xml = edef.convertentities(raw_xml)
            raw_xml = re.sub('<\?CDATA.+?\?>', '', raw_xml)

            logger.debug('reading')
            logger.debug('Opened file, trying to massage the input.')

            logger.debug('XML file opened successfully')
            self.raw_xml = raw_xml

        except Exception as err:
            logger.error('Error: {0}'.format(err))
            raise Exception(err)

        return raw_xml
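
The encoding-normalization step from the method above, shown in isolation as a sketch ('sample.xml' is a placeholder and the entity-conversion step is omitted):

import re
from lxml.etree import tostring
from lxml.html import soupparser

with open('sample.xml', 'rb') as fp:
    raw_xml = fp.read()

# let soupparser detect the encoding, then serialize back to a clean string
raw_xml = tostring(soupparser.fromstring(raw_xml), encoding='unicode')
raw_xml = re.sub('(<!-- body|endbody -->)', '', raw_xml)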
Example #41
def from_bucardo():
    root = fromstring(
        requests.get('https://bucardo.org/postgres_all_versions.html').content)
    trs = root.findall('.//td')
    p_version_and_date = re.compile(
        '^(?P<version>\d{1,2}\..*) \((?P<date>[\d]{4}-[\d]{2}-[\d]{2})\)$',
        re.IGNORECASE)

    for entries in trs:
        entries = entries.text_content().splitlines()
        for entry in entries:
            version_and_date = p_version_and_date.search(entry)
            if version_and_date:
                release = version_and_date.group('version')
                date = version_and_date.group('date')

                postgres = {}
                format_str = "%Y-%m-%d"
                datetime_obj = datetime.datetime.strptime(date, format_str)
                postgres['date'] = datetime_obj.date().isoformat()

                yield release, postgres
Example #42
    def set_tieba(self, name):
        """
        Change the tieba (forum) currently being operated on.
        :param name: tieba name
        :return:
        """
        self.name = name
        url_items = [MAIN_URL, "f?kw=", name, "&fr=home"]
        self.url = ''.join(url_items)
        req = requests.get(self.url,
                           headers=HEADERS,
                           allow_redirects=False,
                           verify=False)

        if int(req.status_code) != 200:
            raise TiebaError, 'The tieba "%s" does not exist!' % self.name

        self.html = req.content
        try:
            self.soup = soupparser.fromstring(self.html)
        except ValueError:
            self.set_html_by_js()
Example #43
def from_virten():
    root = fromstring(
        requests.get(
            'https://www.virten.net/vmware/workstation-release-and-build-number-history/'
        ).content)
    trs = root.xpath('.//tr')

    p_version = re.compile('(?P<version>(\d{1,2}\.?){2,3})', re.IGNORECASE)
    for entry in trs:
        release = entry.xpath('string(td[1]/text())')
        date = entry.xpath('string(td[3]/text())')

        version = p_version.search(release)
        if version and date:
            release = version.group('version')

            workstation = {}
            format_str = "%Y-%m-%d"
            datetime_obj = datetime.datetime.strptime(date, format_str)
            workstation['date'] = datetime_obj.date().isoformat()

            yield release, workstation
Example #44
    def run(self):
        temp_status = None
        temp_object = None

        try:
            temp_req = Request(self.obj['url'], headers=request_headers)
            temp_res = urlopen(temp_req)
            temp_code = temp_res.getcode()
            temp_type = temp_res.info()["Content-Type"]

            temp_status = temp_res.getcode()
            temp_object = temp_res

            if temp_code == 200:
                if types in temp_type:
                    temp_content = temp_res.read()

                    #var_dump(temp_content)

                    try:
                        temp_data = fromstring(temp_content)
                        temp_thread = threading.Thread(target=ParseThread, args=(self.obj['url'], temp_data))
                        link_threads.append(temp_thread)
                        temp_thread.start()
                    except (RuntimeError, TypeError, NameError, ValueError):
                        print ('Content could not be parsed, perhaps it is XML? We do not support that yet.')
                        #var_dump(temp_content)
                        pass



        except HTTPError as e:
            temp_status = e.code
            pass

        self.obj['obj'] = temp_object
        self.obj['sta'] = temp_status

        ProcessChecked(self.obj)
Example #45
    def check_all_text_translated(self, view, args):
        old_lang = translation.get_language()

        self.mock_get_text_functions_for_french()

        translation.activate("fr")

        response = self.client.get(reverse(view, args=args), follow=True)
        html = response.content  # pylint: disable=E1103
        root = fromstring(html)
        sel = CSSSelector('*')

        for element in sel(root):
            if self.has_translatable_text(element):
                try:
                    self.assertTrue(
                        self.contains(element.text,
                                      "XXX "), "No translation for element " +
                        element.tag + " with text '" + element.text +
                        "' from view '" + view + "'")
                finally:
                    translation.activate(old_lang)
Example #46
def getDetailsFromUrl(address):
    url = 'http://lesvisiteursdusoir.com' + address
    response = urllib2.urlopen(url)
    html = response.read()
    root = fromstring(html)
    titleArray = root.xpath("//h1[@class='title']/text()")
    yearArray = root.xpath(
        "//div[@class='field field-name-field-annee-realisation field-type-date field-label-inline clearfix']//span/text()"
    )
    durationArray = root.xpath(
        "string(//div[@class='field field-name-field-duree-txt field-type-text field-label-inline clearfix']//div[@class='field-item even']/text())"
    )
    countryArray = root.xpath(
        "//div[@class='field field-name-field-pays field-type-text field-label-inline clearfix']//div[@class='field-item even']/text()"
    )
    directorArray = root.xpath(
        "//div[@class='field field-name-field-realisateur field-type-text field-label-inline clearfix']//div[@class='field-item even']/text()"
    )
    synopsisArray = root.xpath(
        "//div[@class='field field-name-field-synopsis field-type-text-long field-label-above']//p/text()"
    )
    imageArray = root.xpath(
        "//div[@class='field field-name-field-affiche field-type-image field-label-hidden']//img/@src"
    )
    allocineShortcutArray = root.xpath(
        "//div[@class='region region-content']//div[@class='content']/a/@href")
    #print title,  duration, country, director, synopsis, image, allocineShortcut
    details = {
        "title": firstValue(titleArray),
        "duration": str(durationArray),
        "country": firstValue(countryArray),
        "year": firstValue(yearArray),
        "synopsis": firstValue(synopsisArray),
        "director": firstValue(directorArray),
        "url": url,
        "poster": firstValue(imageArray),
        "allocine": firstValue(allocineShortcutArray)
    }
    return details
Example #47
 def _check_response(self, response):
     '''Checks response
     
     :param requests.Response response: response object
     :return: bool (True if valid, False if not valid)
     '''
     response_ct = response.headers.get('Content-Type')
     if not response.ok:
         if response.status_code == 400:
             raise appngizer.errors.HttpClientBadRequest('400 - Bad request ({})'.format(response.url))
         if response.status_code == 409:
             raise appngizer.errors.HttpElementConflict('409 - Conflict ({})'.format(response.url))
         if response.status_code == 403:
             raise appngizer.errors.HttpElementForbidden('403 - Forbidden ({})'.format(response.url))
         if response.status_code == 404:
             raise appngizer.errors.HttpElementNotFound('404 - Not found ({})'.format(response.url))
         if response.status_code == 500:
             # try to get exception message from html error page if exist
             if response.text:
                 html_error = soupparser.fromstring(response.text)
                 pre_childs = html_error.xpath('//pre[contains(text(),"Exception")]')
                 pre_texts = []
                 for pre_text in pre_childs:
                     pre_texts.append(pre_text.text)
                 raise appngizer.errors.HttpServerError('500 - Server error ({}): {}'.format(response.url, ' '.join(pre_texts)))                   
             else:
                 raise appngizer.errors.HttpServerError('500 - Server error ({})'.format(response.url))
         else:
             raise appngizer.errors.ClientError(response.raise_for_status())
     else:
         if self.content_type != response_ct:
             if response.status_code == 204 and response.request.method == 'DELETE':
                 return True
             if response.status_code == 200 and response_ct == None:
                 return True
             else:
                 raise appngizer.errors.ClientError('Unexpected response Content-Type: {0}'.format(response_ct))
     return True
Example #48
def parse_state_schools(schools_html):
    schools = []

    root = fromstring(schools_html)
    schools_form = root.cssselect('form[name="selectHsForm"]')[0]
    for row in schools_form.cssselect('tr'):
        form_input = row.cssselect('input[name="hsCode"]')
        if len(form_input) == 0:
            continue

        school = {}

        school['hs_code'] = form_input[0].get('value')

        cols = row.cssselect('td')

        school['high_school_name'] = cols[1].text_content().strip()
        school['city'] = cols[2].text_content().strip()
        school['state'] = cols[3].text_content().strip()[:2]

        schools.append(school)

    return schools
Example #49
 def extract(self, url, html):
     """Extract the fields of interest from the DOM tree using the configured xpaths"""
     # extract person's url
     result = {}
     # extract city from url
     city = re.search(r'city_py=\w+', url).group(0)
     if len(city) > 8:
         city = city[8:]
     result['city'] = city
     # from lxml import etree
     import lxml.html.soupparser as soupparser
     dom = soupparser.fromstring(html)
     for name, xpath in self.xpaths.items():
         result[name] = []
         r = dom.xpath(xpath)
         for item in r:
             try:
                 uid = re.search(r'\d+', item.strip()).group(0)
                 result[name].append(uid)
                 self.fetched_num += 1
             except Exception, e:
                 # print 'Error occurs: =>', url, ' ==> ', item
                 pass  # always item = '/user/'
     return result
Example #50
def from_wikipedia():
    root = fromstring(
        requests.get(
            'https://en.wikipedia.org/wiki/Google_Chrome_version_history').
        content)
    trs = root.findall('.//tbody/tr')
    p_version = re.compile('(?P<version>\d{1,2}\.[0-9.]*)', re.IGNORECASE)

    for entry in trs:
        release = entry.xpath('string(td[1])').strip()

        # split-trick to keep only the first date occurrence; the other ones are per-OS details
        date = entry.xpath('string(td[2]/text())').strip().split(' ', 2)[0]

        version_entry = p_version.search(release)
        if version_entry and date:
            release = version_entry.group('version')

            chrome = {}
            datetime_obj = datetime.datetime.strptime(date, "%Y-%m-%d")
            chrome['date'] = datetime_obj.date().isoformat()

            yield release, chrome
Example #51
def from_oracle():
    root = fromstring(
        requests.get(
            'https://java.com/en/download/faq/release_dates.xml').content)

    p_java = re.compile(
        'java (?P<version_major>\d*) update (?P<version_minor>\d*)',
        re.IGNORECASE)

    trs = root.findall('.//tr')
    for entry in trs:
        release = entry.xpath('string(td[1]/text())')
        date = entry.xpath('string(td[2]/text())')

        java_entry = p_java.search(release)
        if java_entry:
            java = {}
            version_full = "1.%s.0_%s" % (
                java_entry.group('version_major').strip(),
                java_entry.group('version_minor').strip())
            java['version_major'] = java_entry.group('version_major')
            java['date'] = date.strip()
            yield version_full, java
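
A hedged consumption sketch for the two generators above; it relies only on the (version, info) pairs they yield and prints one line per release:

# Print each Chrome release parsed from the Wikipedia version-history table.
for version, info in from_wikipedia():
    print('Chrome {} released on {}'.format(version, info['date']))

# Print each Java release parsed from the Oracle release-dates table.
for version, info in from_oracle():
    print('Java {} released on {}'.format(version, info['date']))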
Example #52
0
    def from_string(self,
                    string,
                    isHTML=False,
                    encoding=None,
                    remove_blank_text=False):
        if string is None: return None

        if encoding is None:
            ud = UnicodeDammit(str(string), isHTML=isHTML)
            markup = ud.markup.encode('utf-8')
        else:
            markup = str(string).encode(encoding)

        if isHTML:
            try:
                return html.fromstring(markup, parser=html_parser)
            except:
                self._core.log_exception(
                    'Error parsing with lxml, falling back to soupparser')
                return soupparser.fromstring(string)
        else:
            return etree.fromstring(
                markup, parser=(xml_parser if remove_blank_text else None))
Example #53
0
 def scrape(self, i):
     headers = {
         "User-Agent":
         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
     }
     res = requests.get(self.indexes[i], headers=headers)
     tree = fromstring(res.text)
     companies = tree.xpath(
         "//article[@class='nordic-our-listed-companies']//tbody/tr")
     rv = []
     for company in companies:
         fields = company.findall("td")
         name = fields[0].find("a").text
         ticker = fields[1].text
         currency = fields[2].text
         category = fields[4].text
         rv.append(
             NasdaqCompany(name=name,
                           ticker=ticker,
                           currency=currency,
                           category=category,
                           segment=i))
     return rv
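
A usage sketch for the scrape method above; NasdaqScraper and the way self.indexes is populated are assumptions, since the snippet shows neither the class definition nor its constructor:

# Hypothetical wiring: self.indexes maps a segment key to a listing-page URL (placeholder below).
scraper = NasdaqScraper(indexes={'nordic-large-cap': 'https://example.org/listed-companies/nordic-large-cap'})
for company in scraper.scrape('nordic-large-cap'):
    print('{} ({}) trades in {}'.format(company.name, company.ticker, company.currency))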
def search(results,encodedTitle,title,searchTitle,siteNum,lang,searchByDateActor,searchDate,searchSiteID):
    if searchSiteID != 9999:
        siteNum = searchSiteID
    searchString = searchTitle.replace(" ","-")
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
    try:
        searchResults = HTML.ElementFromURL(PAsearchSites.getSearchSearchURL(searchSiteID) + searchString)
    except:
        request = urllib.Request(PAsearchSites.getSearchSearchURL(searchSiteID) + searchString, headers=headers)
        response = urllib.urlopen(request, context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
        htmlstring = response.read()
        searchResults = fromstring(htmlstring)


    for searchResult in searchResults.xpath('//div[contains(@class,"postTag")]'):
        titleNoFormatting = searchResult.xpath('.//div[@class="nazev"]//h2//a')[0].text_content()
        Log('title: ' + titleNoFormatting)
        curID = searchResult.xpath('.//a')[0].get('href').replace("./","_")
        Log('curID: ' + curID)
        releaseDate = parse(searchResult.xpath('.//div[@class="datum"]')[0].text_content().strip()).strftime('%Y-%m-%d')
        Log('releaseDate: ' + releaseDate)
        actors = searchResult.xpath('.//div[@class="nazev"]//div[@class="featuring"]//a')
        actorList = []
        for actor in actors:
            actorName = actor.text_content()
            actorList.append(actorName)
        actorsPrint = ", ".join(actorList)
        Log("actors: " + actorsPrint)

        if searchDate:
            score = 100 - Util.LevenshteinDistance(searchDate, releaseDate)
        else:
            score = 100 - Util.LevenshteinDistance(searchTitle.lower(), titleNoFormatting.lower())

        results.Append(MetadataSearchResult(id = curID + "|" + str(siteNum), name = actorsPrint + " in " + titleNoFormatting + " [" + PAsearchSites.getSearchSiteName(siteNum) + "] " + releaseDate, score = score, lang = lang))
        Log(curID + "|" + str(siteNum) + " // " + titleNoFormatting + " [" + PAsearchSites.getSearchSiteName(siteNum) + "] " + releaseDate + " // " + str(score))
    return results
def get_incidents(year):
    """
    Gets crime incidents from the DC Government XML.
    """
    print 'Downloading year: %s' % year
    
    # Build URL from year.
    # If the year is 2007-2011, download the XML straight from ... my S3 account.
    if year in range(2007, 2012):
        url = 'http://wapo-projects.s3.amazonaws.com/techathon/scraperwiki/xml/crime_incidents_%s_plain.xml' % year
    
    # If the year is 2012, get it from the DC government. This is NOT the whole year.
    if year == 2012:
        url = 'http://data.octo.dc.gov/feeds/crime_incidents/crime_incidents_current.xml'    
    
    # Request the data using the Requests library.
    request = requests.get(url)
    unzipped_request = request.content
    
    # Parse the XML using lxml's BeautifulSoup parser.
    crime_xml_parsed = fromstring(unzipped_request)

    # Return the parsed Element() objects by grabbing the xpath for <entry> tags.
    return crime_xml_parsed.xpath('//entry')
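
A brief usage sketch in the same Python 2 style as the function above; each returned item is an lxml element for an <entry> node in the feed:

for year in (2010, 2012):
    entries = get_incidents(year)
    print '%s: %s incidents downloaded' % (year, len(entries))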
Example #56
0
    def test_save_bookmarks(self):
        expected = """<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
     It will be read and overwritten.
     DO NOT EDIT! -->
<H1>Bookmarks Menu</H1>
<DL><p>
    <DT><H3 ADD_DATE="1518129521" LAST_MODIFIED="1518129615">Subfolder</H3>
    <DL><p>
        <DT><A ADD_DATE="1518129612" HREF="http://www.sub.level.html" LAST_MODIFIED="1518129612">
Sub level link</A>
    </DL>
    <DT><A ADD_DATE="1518129612" HREF="http://www.top.level.html" LAST_MODIFIED="1518129612">
Top level link</A>
</DL>
"""
        tree = bs.reduce_tree(fromstring(expected))

        with tempfile.TemporaryDirectory() as fpd:
            filepath = os.path.join(fpd, 'merged.html')
            bs.save_bookmarks(tree, filepath)
            actual = self.file_to_string(filepath)

        self.assertEqual(actual, expected)
def from_wikipedia():
    root = fromstring(
        requests.get(
            'https://en.wikipedia.org/wiki/Java_version_history').content)

    p_java_until_9 = re.compile(
        r'java se (?P<version_major>\d*) update (?P<version_minor>.*)',
        re.IGNORECASE)
    p_java_9_plus = re.compile(
        r'java se (?P<version_major>\d*?)\.(?P<version_minor>.*)',
        re.IGNORECASE)

    trs = root.findall('.//tbody/tr')
    for entry in trs:
        release = entry.xpath('string(td[1]/text())')
        date = entry.xpath('string(td[2]/text())')

        java_entry = p_java_until_9.search(release)
        if java_entry:
            java = {}
            version_full = "1.%s.0_%s" % (
                java_entry.group('version_major').strip(),
                java_entry.group('version_minor').strip())
            java['version_major'] = java_entry.group('version_major')
            java['date'] = date.strip()
            yield version_full, java

        java_entry = p_java_9_plus.search(release)
        if java_entry:
            java = {}
            version_full = "1.%s.%s" % (
                java_entry.group('version_major').strip(),
                java_entry.group('version_minor').strip().replace('.', '_'))
            java['version_major'] = java_entry.group('version_major')
            java['date'] = date.strip()
            yield version_full, java
def from_chocolatey():
    root = fromstring(
        re.sub(
            r'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+',
            '',
            requests.get('https://chocolatey.org/packages/vlc').content.decode(
                'utf-8')))
    trs = root.findall('.//tr')
    p_version = re.compile(r'(?P<version>\d{1,2}\..*)', re.IGNORECASE)

    for entry in trs:
        date = entry.xpath('string(td[3])').strip()
        release = entry.xpath('string(td[1]/a|td[1]/span)')

        version_entry = p_version.search(release)
        if version_entry and date:
            release = version_entry.group('version')

            vlc = {}
            format_str = "%A, %B %d, %Y"
            datetime_obj = datetime.datetime.strptime(date, format_str)
            vlc['date'] = datetime_obj.date().isoformat()

            yield release, vlc
Example #59
0
    def handle_sncf_message(self, message):
        payload = list(message.walk())[1].get_payload()

        root = fromstring(
            quopri.decodestring(payload).decode("latin1").replace(
                "\t", "").replace("\n", "").replace('\\xa0', ' '))
        departure_city, _, arrival_city, _, seat_info, duration, _ = [
            r.replace("\xa0", " ") for r in root.xpath(
                "//table/tr/td/table/tr/td/table/tr/td/span/text()")
        ]
        departure_time, train_id, ticket_id, arrival_time = [
            r.replace("\xa0", " ") for r in root.xpath(
                "//table/tr/td/table/tr/td/table/tr/td/span/b/text()")
        ]
        departure_date = [
            r.replace("\xa0", " ") for r in root.xpath(
                "//html/body/table/tr/td/table/tr/td/span/text()")
        ]

        c = Calendar()
        e = Event()
        e.name = "%s: %s -> %s [%s]" % (train_id, departure_city, arrival_city,
                                        ticket_id)
        e.begin = dateparser.parse("%s %s CEST" %
                                   (departure_date, departure_time),
                                   languages=["fr"])
        e.end = dateparser.parse("%s %s CEST" %
                                 (departure_date, arrival_time),
                                 languages=["fr"])
        e.location = departure_city
        e.description = "%s" % seat_info
        c.events.add(e)

        with open('my.ics', 'w') as f:
            f.writelines(c)
Example #60
0
def fix_links(site, text):

    f = open('/tmp/links.txt', 'a+')

    from lxml.html.soupparser import fromstring
    e = fromstring(text)

    for img in e.xpath('//img'):
        src = img.get('src')
        f.write((src or '').encode('utf-8') + "\n")

        image = get_image_from_link(site, src)

        if isinstance(image, basestring):
            pass
        else:
            if image is not None:
                url = localize(image, site) + "/@@images/image"
                logger.info("Change image link %s to %s", src, url)
                img.set('src', url)

    for a in e.xpath('//a'):
        href = a.get('href')
        f.write((href or '').encode('utf-8') + "\n")
        if href is not None:
            res = fix_inner_link(site, href)
            if not res:
                continue
            if href != res:
                if not isinstance(res, basestring):
                    res = res.absolute_url()
                logger.info("Change link %s to %s", href, res)
                a.set('href', res)

    f.close()
    return lxml.html.tostring(e, encoding='unicode', pretty_print=True)