def decode_html(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """
    Convert `html` (a byte string containing an HTML page) to Unicode.
    Tries to guess the character encoding from the meta tag.
    """
    if isinstance(html, unicode):
        return html

    if encoding:
        return html.decode(encoding, errors)

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        declared_encoding = match.group(1).decode("ASCII")
        # treat an unknown declared encoding as if it wasn't declared at all
        with ignored(LookupError):
            return html.decode(declared_encoding, errors)

    # no usable declared encoding
    try:
        # try UTF-8 first
        return html.decode("utf8")
    except UnicodeDecodeError:
        # fall back to the default encoding
        try:
            return html.decode(default_encoding, errors)
        except UnicodeDecodeError as e:
            raise JustextError("Unable to decode the HTML to Unicode: " + unicode(e))
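# Illustrative sketch, not part of the original module: decode_html above relies on a
# module-level CHARSET_META_TAG_PATTERN. A pattern along the lines of the assumed one
# below captures the charset declared in an HTML <meta> tag; the library's actual
# regex may differ.
import re

CHARSET_META_TAG_PATTERN = re.compile(
    br"""<meta[^>]+charset=["']?([^'"/>\s]+)""", re.IGNORECASE)

# Assumed usage: pass raw bytes in, get a unicode string back, e.g.
#   decode_html(b'<meta charset="iso-8859-2"><p>Ahoj</p>')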
def extract(html, url, **kwargs):
    """Extract headline, authors and publication date from an article page."""
    logging.debug("*** extracting %s ***" % (url,))

    kw = {'remove_comments': True}
    if 'encoding' in kwargs:
        kw['encoding'] = kwargs['encoding']
        try:
            foo = html.decode(kw['encoding'])
        except UnicodeDecodeError:
            # make it legal
            logging.warning("Invalid %s - cleaning up" % (kw['encoding'],))
            foo = html.decode(kw['encoding'], 'ignore')
            html = foo.encode(kw['encoding'])

    parser = lxml.html.HTMLParser(**kw)
    doc = lxml.html.document_fromstring(html, parser, base_url=url)

    [i.drop_tree() for i in util.tags(doc, 'script', 'style')]

    # drop comment divs - they have a nasty habit of screwing things up
    [i.drop_tree() for i in doc.cssselect('#disqus_thread')]
    [i.drop_tree() for i in doc.cssselect('#comments, .comment')]

    # drop obvious structural cruft
    [i.drop_tree() for i in doc.cssselect('#header, #footer, #sidebar')]

    # NASTY SITE-SPECIFIC HACKS
    # nasty little hacks with no obvious general solutions:

    # Johnston Publishing sites - they have adverts embedded in the headline :-(
    [i.drop_tree() for i in doc.cssselect('.sponsorPanel')]

    # www.shropshirestar.com
    # www.expressandstar.com
    # have annoyingly-well marked up author links to featured articles in the masthead
    [i.drop_tree() for i in doc.cssselect('#masthead-quote')]

    if 'independent.co.uk' in url:
        [i.drop_tree() for i in doc.cssselect('#side, .readNextWidget')]

    # html = UnicodeDammit(html, isHTML=True).markup

    headline_info = extract_headline(doc, url)
    headline_linenum = 0
    headline = None
    headline_node = None
    if headline_info is not None:
        headline_linenum = headline_info['sourceline']
        headline = headline_info['txt']
        headline_node = headline_info['node']

    pubdate, pubdate_node = extract_pubdate(doc, url, headline_linenum)
    authors = byline.extract(doc, url, headline_node, pubdate_node)

    return headline, authors, pubdate
def _request(self, request):
    if self.__last_acttime is not None:
        diff = time.time() - self.__last_acttime
        if diff < self.time_interval:
            time.sleep(self.time_interval - diff)
    self.__last_acttime = time.time()
    html = self.browser.open(request).read()
    try:
        html = html.decode('utf8')
    except UnicodeDecodeError:
        html = html.decode('cp936', 'ignore')
    r = html.lstrip()
    return r
def get_lines(url):
    try:
        html = urlopen(url).read()
        # guess at the document encoding: try UTF-8 first, then Latin-1
        try:
            text = html2text(html.decode("utf-8"))
        except Exception:
            text = html2text(html.decode("latin-1", "ignore"))
        # workaround a html2text bug
        text = text.replace(" _place_holder;", " ")
        return text.split("\n")
    except Exception, err:
        print "Failed to get lines: %s" % err
        return []
def scrape_membernames_generic(self, committee, url, chamber, term):
    html = self.urlopen(url)
    html = html.decode(self.encoding)
    doc = lxml.html.fromstring(html)
    names = doc.xpath('//a/text()')
    names = filter(lambda n: 'Senator' in n, names)
    return names
def getSiteContact(self, account, username, mobile):
    HOST = "dealer.che168.com"
    # if account in config.che168VIPAccountList:
    #     HOST = "dealer.che168.com"
    # else:
    #     HOST = "dealers.che168.com"
    conn = httplib.HTTPConnection(HOST, timeout=timeout_che168)
    headers = copy.copy(self.headers)
    conn.request("GET", "/car/publish/?s=1", headers=headers)
    res = conn.getresponse()
    resHeaders = res.getheaders()
    resRead = res.read()
    html = self.decodeBody(resHeaders, resRead)
    html = html.decode('GB18030')
    html = html.replace("gb2312", "utf-8")
    dom = lxml.html.fromstring(html)
    contactItems = dom.xpath('//*[@id="sh_linkMan_div"]/a/@rel')
    conn.close()
    if len(contactItems) == 0:
        return self.createNewContact(username, mobile)
    logger.debug(str(contactItems))
    for salesid in contactItems:
        # if self.checkCurrentContact(salesid, mobile) is True:
        return salesid
    return self.createNewContact(username, mobile)
def craw_sitemap(url, user_agent, num_retries):
    # download the sitemap file
    sitemap = download(url, user_agent, num_retries)
    # f = open(r'D:\exercise\zhurenwenji.txt', 'w')
    doc = Document()
    # extract the sitemap links
    links = re.findall('<a href="http:(.*?)" title="http', sitemap)
    # print links
    i = 0
    print len(links)
    for link in links:
        link = 'http:' + link
        try:
            html = download(link, user_agent, num_retries)
            # contents = re.findall('<div class="detail">(.*?)</div>', html)
            tree = lxml.html.fromstring(html.decode('utf-8'))
            td = tree.cssselect('div.detail')[0].text_content()
            i += 1
            print i
            # doc.save(contents)
            doc.add_paragraph(td)
        except:
            pass
    doc.save(r'd:\exercise\zhurenwenji.docx')
def _guide(name):
    game_id = name.lower().replace(" ", "-")
    session = utils.web.requests_session()
    page = session.get(GUIDE_URL.format(game_id))
    tree = lxml.html.fromstring(page.text)
    li_elements = tree.cssselect("#col_l .bl_la_main_guide .showhide ul li")
    if li_elements:
        return [x.text_content().strip() for x in li_elements[:5]]
    else:
        elements = tree.cssselect("#col_l .bl_la_main_guide .showhide p")
        if not elements:
            elements = tree.cssselect("#col_l .bl_la_main_guide .showhide div div")
        if elements:
            info = []
            html = lxml.html.tostring(elements[0])
            lines = html.decode("utf-8").split("<br>")
            for line in lines[1:6]:
                span_str = "<span>{0}</span>".format(line)
                span = lxml.html.fragment_fromstring(span_str)
                s = span.text_content().strip()
                if s.startswith("-"):
                    s = s[1:]
                info.append(s)
            return info
def _execute(self, options, args):
    """Compile reStructuredText to standalone HTML files."""
    compiler = self.site.plugin_manager.getPluginByName('rest', 'PageCompiler').plugin_object
    if len(args) != 1:
        print("This command takes only one argument (input file name).")
        return 2
    source = args[0]
    with io.open(source, "r", encoding="utf8") as in_file:
        data = in_file.read()
        output, error_level, deps, shortcode_deps = compiler.compile_string(data, source, True)

    rstcss_path = resource_filename('nikola', 'data/themes/base/assets/css/rst_base.css')
    with io.open(rstcss_path, "r", encoding="utf8") as fh:
        rstcss = fh.read()

    template_path = resource_filename('nikola', 'plugins/command/rst2html/rst2html.tmpl')
    template = Template(filename=template_path)
    template_output = template.render(rstcss=rstcss, output=output)
    parser = lxml.html.HTMLParser(remove_blank_text=True)
    doc = lxml.html.document_fromstring(template_output, parser)
    html = b'<!DOCTYPE html>\n' + lxml.html.tostring(doc, encoding='utf8', method='html', pretty_print=True)
    print(html.decode('utf-8'))
    if error_level < 3:
        return 0
    else:
        return 1
def scrape_members_senate_subcommittees(self, committee, url, chamber, term, cache={}):
    if cache:
        names = cache[committee['subcommittee']]
        return Membernames.scrub(names)

    html = self.urlopen(url)
    html = html.decode(self.encoding)
    doc = lxml.html.fromstring(html)

    # Commence horrific regex-based hackery to get subcommittee members.
    text = doc.xpath('//div[@class="content"]')[0].text_content()
    chunks = re.split(r'\s*Subcommittee.*', text)
    namelists = []
    for c in chunks:
        names = re.sub(r'\s*Members\s*', '', c)
        names = re.split(r'\s*(,|and)\s*', names)
        names = filter(lambda s: s not in [',', 'and'], names)
        names = map(clean, names)
        if filter(None, names):
            namelists.append(names)

    committee_names = doc.xpath('//div[@class="content"]/h3/text()')
    for _committee, _names in zip(map(clean, committee_names), namelists):
        cache[_committee] = _names

    names = cache[committee['subcommittee']]
    return Membernames.scrub(names)
def _execute(self, options, args):
    """Compile reStructuredText to standalone HTML files."""
    compiler = self.site.plugin_manager.getPluginByName('rest', 'PageCompiler').plugin_object
    if len(args) != 1:
        print("This command takes only one argument (input file name).")
        return 2
    source = args[0]
    with io.open(source, "r", encoding="utf8") as in_file:
        data = in_file.read()
        output, error_level, deps, shortcode_deps = compiler.compile_string(data, source, True)

    rstcss_path = resource_filename('nikola', 'data/themes/base/assets/css/rst.css')
    with io.open(rstcss_path, "r", encoding="utf8") as fh:
        rstcss = fh.read()

    template_path = resource_filename('nikola', 'plugins/command/rst2html/rst2html.tmpl')
    template = Template(filename=template_path)
    template_output = template.render(rstcss=rstcss, output=output)
    parser = lxml.html.HTMLParser(remove_blank_text=True)
    doc = lxml.html.document_fromstring(template_output, parser)
    html = b'<!DOCTYPE html>\n' + lxml.html.tostring(doc, encoding='utf8', method='html', pretty_print=True)
    print(html.decode('utf-8'))
    if error_level < 3:
        return 0
    else:
        return 1
def parse_odds(url):
    # pdb.set_trace()
    global M_DB
    html = pget(url)
    if not html:
        return
    MatchId = url.split('/')[-4]
    tree = lxml.html.fromstring(html.decode("utf8"))
    # table = tree.xpath("//div[@id='data_main_content']/table")[0]
    trs = tree.xpath("tr")
    # if MatchId not in M_DB[day]:
    #     M_DB[day][MatchId] = {"odds": {}}
    # data = M_DB[day][MatchId]["odds"]
    data = {}
    # pdb.set_trace()
    for tr in trs:
        tds = tr.xpath("td")
        company = unicode(tds[1].text_content()).strip()
        if company not in M_COMPANY:
            continue
        s_zhu = unicode(tds[2].text_content()).strip().replace(u'↓', '').replace(u'↑', '').strip()
        s_ping = unicode(tds[3].text_content()).strip().replace(u'↓', '').replace(u'↑', '').strip()
        s_ke = unicode(tds[4].text_content()).strip().replace(u'↓', '').replace(u'↑', '').strip()
        n_zhu = unicode(tds[5].text_content()).strip().replace(u'↓', '').replace(u'↑', '').strip()
        n_ping = unicode(tds[6].text_content()).strip().replace(u'↓', '').replace(u'↑', '').strip()
        n_ke = unicode(tds[7].text_content()).strip().replace(u'↓', '').replace(u'↑', '').strip()
        href = tds[5].xpath("a/@href")[0]
        # print href
        odds_change = pase_history(href)
        odds_change.update({"company": company,
                            "s_zhu": s_zhu, "s_ping": s_ping, "s_ke": s_ke,
                            "n_zhu": n_zhu, "n_ping": n_ping, "n_ke": n_ke})
        data[company] = odds_change
    return data
def get_albums(self, html):
    if not html:
        raise ValueError
    doc = lxml.html.fromstring(html.decode('utf8'))
    sections = doc.cssselect('h2 > span.mw-headline')
    albums = {}
    for s in sections:
        try:
            album = s.cssselect('a:first-child')[0].text
        except:
            album = s.text
        if not len(album):
            continue
        album = self.normalizeish(album)
        songlist = s.xpath("./following::ol[1]")
        if not songlist:
            continue
        al = albums.setdefault(album, {})
        for i, e in enumerate(songlist[0].getchildren()):
            try:
                a = e.cssselect('a')[0]
                if '(page does not exist)' not in a.get('title'):
                    al[i + 1] = a
            except (TypeError, IndexError):
                pass
        if not al:
            del albums[album]
    return albums or None
def fetch_oep_entry(id, datastorage):
    oepurl = url_from_id(id)
    html = scraperwiki.scrape(oepurl)
    root = lxml.html.fromstring(html.decode('utf-8'))
    data = {'journalPostId': id}
    for tr in root.cssselect("table.defaultTable tr"):
        vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "")
        value = tr.cssselect("td")[0].text_content().strip()
        # print '"' + vtype + '"', '"' + value + '"'
        if (vtype == 'Record entry date' and value == 'Not stated.') or \
           (vtype == 'Document type' and value == '-') or \
           (vtype == 'Case number' and value == ''):
            return -1
        if vtype in fieldmap:
            vtype = fieldmap[vtype]
        if 'doctype' == vtype:
            value = doctypemap[value]
        if 'caseid' == vtype:
            caseyear, caseseqnr = value.split("/")
            data['caseyear'] = caseyear
            data['caseseqnr'] = caseseqnr
        data[vtype] = value
    # print str(id) + ": " + str(data)
    data['scrapestamputc'] = datetime.datetime.now()
    # print data['scrapestamputc']
    # exit()
    datastorage.append(data)
    # scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
    return 0
def parseNewApplications(url):
    html = scraperwiki.scrape(url)
    html = html.decode("utf8")
    apps = lxml.html.fromstring(html)
    new_cnt = 0
    total_cnt = 0
    global warning
    global scrape_id
    warning = ''
    for app in apps.cssselect("ul[id='highlights'] li"):
        try:
            appAnchor = safeXPath(app.cssselect("a"), 0)
            appHref = safeXPath(appAnchor.xpath("@href"), 0)
            app_ID = appHref.partition('consultations/')[2].partition('.aspx')[0]
            # appTitle = safeXPath(appAnchor.xpath("@title"), 0)
            # appPara = safeXPath(app.cssselect("p"), 0)
            # appDescr = safeXPath(appPara.xpath("text()"), 0)
            if scraperwiki.sqlite.select("* from licence_applications WHERE app_ID=" + str(app_ID)) == []:
                new_yn = 1
                new_cnt = new_cnt + 1
            else:
                new_yn = 0
            parseAppDetail(app_ID, baseURL + appHref, new_yn)
            total_cnt = total_cnt + 1
        except IndexError as ex:
            print "parseNewApplications: ex={1}: url={0} app={2}".format(url, str(ex), app)
            warning = 'Could not parse page'

    # save log
    scrape_id = scraperwiki.sqlite.get_var('last_page')
    log_entry = {"scrape_id": scrape_id, "scrape_date": now, "task": 'parse list', "url": url,
                 "result": str(new_cnt) + ' New records / ' + str(total_cnt) + ' Total records',
                 "warnings": warning}
    scraperwiki.sqlite.save(['scrape_id'], log_entry, 'log')
def scrape_study(url):
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html.decode('utf-8'))
    body = root.cssselect('.large-copy')[0]
    data = {}
    key = None
    value = None
    for child in body:
        if child.tag == 'b':
            if key is not None:
                if len(value) == 1:
                    data[key] = value[0]
                else:
                    data[key] = '\n'.join(value)
            key = make_key(child.text_content().strip())
            value = []
        if child.tag == 'ul':
            value.extend([s.text_content().strip() for s in child.cssselect('li')])
        if child.tail is not None and child.tail.strip():
            content = child.tail.strip()
            content = content.replace(u'•', '')
            content = content.strip()
            value.append(content)
    if len(value) == 1:
        data[key] = value[0]
    else:
        data[key] = '\n'.join(value)
    return data
def parseDetail(uri):
    html = scrape(uri)
    root = lxml.html.fromstring(html.decode('utf-8'))
    root.make_links_absolute(DELIBERE_URI)
    data = {}
    tds = root.cssselect('table td')
    for ix, td in enumerate(tds):
        if ix == 1:
            data['organo'] = td.find('strong').text.strip()
        elif ix == 3:
            data['numero'] = int(td.find('strong').text.strip())
        elif ix == 5:
            data['anno'] = int(td.find('strong').text.strip())
        elif ix == 7:
            date = td.find('strong').text.strip()
            data['data'] = dateutil.parser.parse(date, dayfirst=True).date()
        elif ix == 9:
            data['oggetto'] = td.find('strong').text.strip()
        elif ix == 11:
            date = td.find('strong').text.strip()
            data['data_pubblicazione'] = dateutil.parser.parse(date, dayfirst=True).date()
        elif ix == 13:
            date = td.find('strong').text.strip()
            data['data_esecutivita'] = dateutil.parser.parse(date, dayfirst=True).date()
        elif ix == 15:
            allegati = []
            for li in td.iter('li'):
                # always take the second link, which carries the title
                link = li.findall('a')[1]
                if link.get('href').endswith('.pdf'):
                    allegati.append({'uri': link.get('href'), 'titolo': link.text.strip()})
            data['allegati'] = allegati
    return data
def getEncoding(self):
    html = self.getSOURCE()
    # first look for an encoding declared in the HTML <head>;
    # fall back to chardet detection on the raw bytes below
    dom = lxml.html.fromstring(html.decode('utf8', 'ignore'),
                               parser=lxml.html.HTMLParser(remove_comments=True))
    encs = dom.xpath('.//head/meta[@charset]/@charset')
    encs += [re.findall(r'charset=(.*)', _.get('content'))[0]
             for _ in dom.xpath('.//head/meta[@http-equiv][@content]')
             if _.get('http-equiv').lower() == "content-type" and
             _.get('content').count('charset=') == 1]
    encs = set([_.lower() for _ in encs])
    if set(['gb2312', 'gbk']) <= encs:
        encs.remove('gb2312')
    if set(['gb2312']) == encs:
        encs = set(['gbk'])
    if len(encs) == 1:
        return encs.pop()
    try:
        import chardet
        return chardet.detect(html)['encoding']
    except ImportError, e:
        raise e
def scrape_lower_members(self, committee, url, chamber, term,
                         re_name=re.compile(r'^(Senator|Assemblymember)')):
    try:
        # Some committees display the members at /membersstaff
        html = self.urlopen(url + '/membersstaff')
    except:
        # Others display the members table on the homepage.
        html = self.urlopen(url)

    html = html.decode(self.encoding)
    doc = lxml.html.fromstring(html)
    members = doc.xpath('//table/descendant::td/a/text()')
    members = map(strip, members)
    members = filter(None, members)[::2]
    if not members:
        self.warning('Didn\'t find any committee members at url: %s' % url)
    for member in members:
        if ' - ' in member:
            member, role = member.split(' - ')
        else:
            role = 'member'
        member = re_name.sub('', member)
        member = member.strip()
        committee.add_member(member, role)
    return committee
def process(url, season, internal_matchday):
    html = scraperwiki.scrape(url)
    # convert to unicode before lxml gets it, since the encoding declaration is missing in the html
    html = html.decode("utf-8")
    root = lxml.html.fromstring(html)
    matches = root.xpath(match_detail_xpath)
    for match in matches:
        record = {}
        record["matchday"] = root.xpath("//h3")[0].text
        record["season"] = season
        record["internal_matchday"] = internal_matchday
        # trs = match.xpath('//tr[@class="sup"]')  # info and url
        record["stage"] = match.xpath('tr[@class="sup"]//span[@class="rname"]')[0].text_content()
        try:
            # the report link may be missing for cancelled matches
            record["match_detail_url"] = (
                "http://www.uefa.com"
                + match.xpath('tr[@class="sup"]//span[contains(@class,"report")]/a')[0].attrib["href"]
            )
        except:
            pass
        record["home_team"] = match.xpath('tr[@class=" match_res"]//td[contains(@class,"home")]')[0].text_content()
        record["away_team"] = match.xpath('tr[@class=" match_res"]//td[contains(@class,"away")]')[0].text_content()
        try:
            record["aggregate"] = match.xpath('tr[@class="reasonwin"]//span[contains(@class,"rwa")]')[0].text_content()
        except:
            pass
        try:
            record["aggregate_notes"] = match.xpath('tr[@class="reasonwin"]//span[contains(@class,"woag")]')[0].text_content()
        except:
            pass
        record["home_team_url"] = (
            "http://www.uefa.com"
            + match.xpath('tr[@class=" match_res"]//td[contains(@class,"home")]/a')[0].attrib["href"]
        )
        record["away_team_url"] = (
            "http://www.uefa.com"
            + match.xpath('tr[@class=" match_res"]//td[contains(@class,"away")]/a')[0].attrib["href"]
        )
        record["score"] = match.xpath('tr[@class=" match_res"]//td[contains(@class,"score")]')[0].text_content()
        ref_stadium = re.split(u"\u2013", match.xpath('tr[@class="referee_stadium"]')[0].text_content())
        # print repr(match.xpath('tr[@class="referee_stadium"]')[0][0].text)
        try:
            # record['referee'] = ref_stadium[0].lstrip('Referee: ').strip()
            record["referee"] = ref_stadium[0].replace("Referee: ", "").strip()
        except:
            pass
        try:
            # record['stadium'] = ref_stadium[1].lstrip('Stadium: ').strip()
            record["stadium"] = ref_stadium[1].replace("Stadium: ", "").strip()
        except:
            pass
        # print record
        scraperwiki.sqlite.save(
            unique_keys=["matchday", "season", "score", "home_team", "away_team"], data=record, verbose=1
        )
def parse_html_slow(html):
    'Uses Beautiful Soup to parse messages out of a log file.'
    html = html.decode('utf-8', 'ignore')
    soup = soupify(html, markupMassage=((br_re, lambda m: '<br />'),))
    messages = []
    strptime = datetime.strptime

    for div in soup.findAll(message_divs):
        try:
            buddyname = div.findAll('span', class_buddy)[0].renderContents(None)
            timestamp = parse_timestamp(div['timestamp'])
            message = div.findAll('span', class_msgcontent)[0].renderContents(None)
            type = div['class'].replace('message', '').strip()
            auto = boolify(div.get('auto', 'false'))
        except Exception:
            print_exc()
        else:
            messages.append(Message(buddy=S(name=buddyname),
                                    timestamp=timestamp,
                                    message=message,
                                    type=type,
                                    auto=auto))

    log_info('parse_html_slow with %d bytes returning %d messages', len(html), len(messages))
    return messages
def sanitize_html(html, encoding='utf-8', return_unicode=False):
    html = smart_str(html, encoding=encoding)
    if RE_TAG_START.search(html):
        html = render_html(parse_html(html))
    if return_unicode:
        return html.decode('utf-8')
    else:
        return html
def search2():
    writer = csv.writer(open('countries.csv', 'w'))
    D = Downloader()
    html = D('http://example.webscraping.com/places/default/search?page=0&page_size=1000&search_term=.')
    print(html.decode('utf-8'))
    ajax = json.loads(html)
    for record in ajax['records']:
        writer.writerow([record['country']])
def userstats_api(self, user):
    url = "http://www.bright-shadows.net/userdata.php?"
    html = urllib.request.urlopen(url + urllib.parse.urlencode({"username": user}), timeout=5).read()
    html = html.decode()
    if html == "Unknown User":
        return None
    real_user, rank, users_total, challs_cnt, challs_total = html.split(":")
    return real_user, str(int(challs_cnt)), int(challs_total), str(int(rank)), int(users_total), None, None, None
def get_captcha(html):
    tree = lxml.html.fromstring(html.decode('utf8'))
    img_data = tree.cssselect('div#recaptcha img')[0].get('src')
    img_data = img_data.partition(',')[-1]
    binary_img_data = base64.b64decode(img_data)
    file_like = BytesIO(binary_img_data)
    img = Image.open(file_like)
    return img
def __init__(self, html, encoding='utf-8', cache_xpath=True):
    if isinstance(html, bytes):
        self.html = lxml.html.fromstring(html.decode(encoding))
    elif isinstance(html, lxml.html.HtmlElement):
        self.html = html
    else:
        self.html = lxml.html.fromstring(html)
    self.cache_xpath = cache_xpath
def scrape_membernames_senate_autism(self, committee, url, chamber, term):
    '''The Senate Autism committee has its own weird format.'''
    url = 'http://autism.senate.ca.gov/committeemembers1'
    html = self.urlopen(url)
    html = html.decode(self.encoding)
    doc = lxml.html.fromstring(html)
    return self.scrape_membernames_generic(doc)
def regex_scraper(html):
    results = {}
    if html is not None:
        html = html.decode('utf-8')
        for field in FIELDS:
            print(field)
            results[field] = re.search('<a href="/places/default/index">(.*?)</a>', html).groups()[0]
    return results
def scrape(crno):
    crnostr = "%07d" % crno
    baseurl = "https://www.mobile-cr.gov.hk/mob/cps_criteria.do?queryCRNO="
    url = baseurl + crnostr

    print "trying local", crnostr
    html = load_local(url)
    if html is None:
        print "trying site", crnostr
        html = scraperwiki.scrape(url).decode('utf-8')
        print "storing local", crnostr
        store_local(url, html.encode('utf-8'))
    else:
        html = html.decode('utf-8')

    if '沒有紀錄與輸入的查詢資料相符' in html.encode('utf-8'):
        print 'NO MATCHING RECORD FOUND FOR THE SEARCH INFORMATION INPUT!'
        return None

    root = lxml.html.fromstring(html)  # , encoding="utf-8")
    tds = root.cssselect("tr td tr td")
    namestds = root.cssselect("td.data")

    while tds == []:
        print "trying", crnostr, "again"
        sleep(46)
        html = scraperwiki.scrape(baseurl + crnostr).decode('utf-8')
        root = lxml.html.fromstring(html)  # , encoding="utf-8")
        tds = root.cssselect("tr td tr td")
        namestds = root.cssselect("td.data")

    # for idx, val in enumerate(tds):
    #     print idx, ":", val.text_content().encode('utf-8')

    names = {}
    for nameidx, nameval in enumerate(namestds):
        names["Name" + str(nameidx)] = nameval.text_content()[10:]
        names["Name" + str(nameidx) + "date"] = nameval.text_content()[:10]

    print "got", tds[1].text_content()
    data = {
        'cr': tds[1].text_content(),
        'English Company Name': tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name': tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type': tds[4].text_content()[:-1],
        'Date of incorporation': tds[6].text_content(),
        # 'Company status': tds[8].text_content()[:-1],
        'Active status': tds[8].text_content()[:-1],
        'Remarks': tds[9].text_content().replace(u"備註:", ""),
        'Winding up mode': tds[11].text_content()[:-1],
        'Date of Dissolution': tds[13].text_content(),
        'Register of Charges': tds[15].text_content()[:-1],
        'Important Note': tds[16].text_content().replace(u"重要事項:", "").lstrip('\r\n\t')
    }
    data.update(names)
    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()
def process_html(html):
    '''Tokenize HTML into a list of words.'''
    # map punctuation characters to spaces; both str.maketrans string arguments
    # must be the same length
    punct = ".,?!'\";:-_(){}[]\\|`~#$%^&*<:>/+="
    table = str.maketrans(punct, " " * len(punct))
    # strip tags and newlines before translating
    reg = re.compile('<[^>]*>')
    html = reg.sub('', html.decode().replace('\n', '').replace(' ', ''))
    text = html.translate(table)
    words = text.split()
    return words
def getPage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = request.Request(url=url, headers=headers)
    html = request.urlopen(req).read()
    html = html.decode('utf-8')  # decode the response bytes
    return html
def getArticleHead(url):
    html = requests.get(url).content
    html = html.decode('utf-8')
    # article title
    msg_title = re.findall(r'var msg_title = \"(.*?)\"', html)[0]
    # cover image of the post
    msg_cdn_url = re.findall(r'var msg_cdn_url = \"(.*?)\"', html)[0]
    print("title:" + msg_title)
    print("msg_url:" + msg_cdn_url)
def parse(html):
    tree = lxml.html.fromstring(html.decode("utf8"))
    query = tree.xpath('//input[@id="lst-ib"]/@value')
    if len(query) > 0:
        query = query[0]
    else:
        query = ""
    for href in tree.xpath('//li[@class="g card-section"]//h3/a/@href'):
        yield href, query
def doLogin(self, username, password):
    if self.baseheaders.has_key("Cookie"):
        self.baseheaders.pop("Cookie", None)
        self.cookies = {}
    if self.headers.has_key("Cookie"):
        self.headers.pop("Cookie", None)

    conn = httplib.HTTPConnection("dealer.che168.com", timeout=10)
    conn.request("GET", "/", "", self.baseheaders)
    res = conn.getresponse()
    resHeaders = res.getheaders()
    resRead = res.read()
    self.setCookies(resHeaders)
    html = base.BaseSharer.decodeBody(resHeaders, resRead)
    html = html.decode('GB18030')
    dom = lxml.html.fromstring(html)
    checkCodeImageUrls = dom.xpath('.//span/img[@src]/@src')
    if len(checkCodeImageUrls) == 0:
        return False
    checkCodeImageUrl = checkCodeImageUrls[0]
    conn.close()

    conn = httplib.HTTPConnection("dealer.che168.com", timeout=10)
    conn.request("GET", checkCodeImageUrl, "", self.baseheaders)
    res = conn.getresponse()
    self.setCookies(res.getheaders())
    imageData = res.read()
    conn.close()

    image = StringIO(imageData)
    captcha = base.BaseSharer.getCaptcha(image, imageData)
    if captcha is None:
        return False
    validcode = captcha["text"]

    conn = httplib.HTTPConnection("dealer.che168.com", timeout=10)
    url = "/Handler/Login/Login.ashx?"
    username = urllib.quote(username.encode("GB18030"))
    password = urllib.quote(password.encode("GB18030"))
    url = url + 'name=' + username
    url = url + '&pwd=' + password
    url = url + '&validcode=' + validcode.strip()
    url += '&remember=false'
    url = url + '&req=' + str(random.random())
    conn.request("GET", url, "", self.baseheaders)
    res = conn.getresponse()
    resHeaders = res.getheaders()
    resRead = res.read()
    loginResult = base.BaseSharer.decodeBody(resHeaders, resRead)
    loginResult = loginResult.decode('GB18030')
    if not loginResult.startswith(u"var code='1';"):
        return False
    logger.debug("loginResult=" + loginResult)
    self.setCookies(res.getheaders())
    return True
def strToUnicode(html, decoding=None):
    if not isinstance(html, unicode):
        if not decoding:
            decoding, charJust = '', chardet.detect(html)
            try:
                decoding = 'gbk' if charJust['encoding'].lower() == 'gb2312' else charJust['encoding']
            except Exception, e:
                print 'strToUnicode chardet detect error:', Exception, '->', e
            decoding = 'utf-8' if not decoding else decoding
        if decoding:
            html = html.decode(decoding, 'ignore')
    return html
def fetch_detail(url, detail):
    html = requests.get(url).content
    root = lxml.html.fromstring(html.decode(TARGET_ENCODING))
    summary = root.xpath('//*[@id="mw-content-text"]/h2[1]/following-sibling::node()[not(preceding-sibling::h2[2])]')
    # summary = root.xpath('//*[@id="mw-content-text"]/p[1]')
    # detail['description'] = lxml.html.tostring(summary[0], method='text', encoding=ENCODING)
    # detail['description'] = ''.join(s if issubclass(type(s), str) else s.text_content() for s in summary)
    detail['description'] = ''.join(s.text_content() if hasattr(s, 'text_content') else s for s in summary)
    return detail
def getUrl(url):
    req = request.Request(url)
    # the header must be added before the request is sent
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    )
    html = request.urlopen(req).read()
    html = html.decode('utf-8')
    return html
def unicodeToStr(html, encoding='utf-8'):
    if not isinstance(html, unicode):
        decoding, charJust = '', chardet.detect(html)
        try:
            decoding = 'gbk' if charJust['encoding'].lower() == 'gb2312' else charJust['encoding']
        except Exception, e:
            print 'unicodeToStr chardet detect error:', Exception, '->', e
        if encoding and decoding and decoding != encoding:
            html = html.decode(decoding, 'ignore').encode(encoding, 'ignore')
    else:
        if encoding:
            html = html.encode(encoding, 'ignore')
    return html
def tidy(html):
    """ Pipe html thru w3c tidy. """

    html = parsers.RE_RESTRICTED.sub('', html)
    html = RE_XMLDECL.sub('', html)
    html = parsers.RE_HTML_CHARSET.sub('; charset=utf-8', html)

    # convert to xhtml
    tidy = subprocess.Popen(
        ["tidy",
         "-utf8",
         "-clean",
         "--wrap",             "0",
         # "--drop-font-tags",              "y",
         # "--drop-proprietary-attributes", "y",
         # "--add-xml-space",               "y",
         "--output-xhtml",     "y",
         "--numeric-entities", "y",
         "--merge-divs",       "n",  # keep poetry indentation
         "--merge-spans",      "n",
         "--add-xml-decl",     "n",
         "--doctype",          "strict",
         "--anchor-as-name",   "n",
         "--enclose-text",     "y"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    # print (html.encode ('utf-8'))
    # sys.exit ()

    (html, stderr) = tidy.communicate(html.encode('utf-8'))

    regex = re.compile(r'(Info:|Warning:|Error:)\s*', re.I)

    # pylint: disable=E1103
    msg = stderr.decode(sys.stderr.encoding).strip()
    for line in msg.splitlines():
        match = regex.search(line)
        if match:
            sline = regex.sub("", line)
            g = match.group(1).lower()
            if g == 'info:':
                info("tidy: %s" % sline)
            elif g == 'warning:':
                warning("tidy: %s" % sline)
            elif g == 'error:':
                error("tidy: %s" % sline)
        else:
            error(line)

    if tidy.returncode == 2:
        raise ValueError(stderr)

    return html.decode('utf-8')
def tidy(html):
    """ Pipe html thru w3c tidy. """

    html = parsers.RE_RESTRICTED.sub('', html)
    html = RE_XMLDECL.sub('', html)
    html = parsers.RE_HTML_CHARSET.sub('; charset=utf-8', html)

    # convert to xhtml
    tidy = subprocess.Popen(
        ["tidy",
         "-utf8",
         "-clean",
         "--wrap",             "0",
         # "--drop-font-tags",              "y",
         # "--drop-proprietary-attributes", "y",
         # "--add-xml-space",               "y",
         "--output-xhtml",     "y",
         "--numeric-entities", "y",
         "--merge-divs",       "n",  # keep poetry indentation
         "--merge-spans",      "n",
         "--add-xml-decl",     "n",
         "--doctype",          "strict",
         "--anchor-as-name",   "n",
         "--enclose-text",     "y"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    # print (html.encode ('utf-8'))
    # sys.exit ()

    (html, stderr) = tidy.communicate(html.encode('utf-8'))

    regex = re.compile(r'(Info:|Warning:|Error:)\s*', re.I)

    # pylint: disable=E1103
    msg = stderr.rstrip()
    for line in msg.splitlines():
        match = regex.search(line)
        if match:
            sline = regex.sub("", line)
            g = match.group(1).lower()
            if g == 'info:':
                info("tidy: %s" % sline)
            elif g == 'warning:':
                warn("tidy: %s" % sline)
            elif g == 'error:':
                error("tidy: %s" % sline)
        else:
            error(line)

    if tidy.returncode == 2:
        raise ValueError, stderr

    return html.decode('utf-8')
def read_film_list(page_index):
    website = 'http://www.tasteofcinema.com/category/lists/film-lists/page/' + str(page_index) + '/'
    html = get_url_content(website)
    print("Parsing: ", website)

    from toc_parser.list_parser import TocListParser
    parser = TocListParser()
    parser.feed(html.decode('utf-8'))
    parser.close()
    return parser.get_film_list()
def _url_2_lxml(self, url, base_url='{0.scheme}://{0.netloc}'.format):
    '''
    Fetch the url as a string, convert it to unicode, and parse with lxml.
    '''
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html.decode(self.encoding))
    urldata = urlparse(url)
    doc.make_links_absolute(base_url(urldata))
    return doc
def main():
    remove = re.compile(r' |</br>|\.*')
    baseurl = 'https://movie.douban.com/top250?start='
    datalist = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)
        html = askURL(url)
        html = html.decode('utf-8').replace(u'\xa0', u' ')
        tree = lxml.html.fromstring(html)
        items = tree.cssselect('div.item')  # one entry per film
        for item in items:
            data = []
            # a film may have only a Chinese title and no foreign title
            td = item.cssselect('div.hd a span.title')
            if len(td) == 2:
                ctitle = td[0].text_content()
                data.append(ctitle)  # Chinese title
                otitle = td[1].text_content().replace(u'\xa0', u' ')
                otitle = otitle.replace(" / ", "")  # strip the separator
                data.append(otitle)  # foreign title
            else:
                data.append(td[0].text_content())  # Chinese title
                data.append(' ')  # leave blank
            rating = item.cssselect('span.rating_num')[0]
            data.append(rating.text_content())  # rating
            judgeNum = item.cssselect('div.star span')[3]
            judgeNum = judgeNum.text_content().replace('人评价', '')
            data.append(judgeNum)  # number of ratings
            inq = item.cssselect('p.quote')  # the one-line summary may be missing
            if len(inq) != 0:
                inq = inq[0].text_content().replace("。", "").strip()  # strip the full stop and whitespace
                data.append(inq)  # summary
            else:
                data.append(' ')  # leave blank
            bd = item.cssselect('p')[0]
            bd = bd.text_content().replace(u'\xa0', u'/')
            bd = bd.encode('GBK', 'ignore')
            bd = bd.decode('GBK')
            bd = re.sub(remove, "", bd)
            bd = re.sub('\n', "|", bd)  # turn line breaks into "|" separators
            bd = re.sub(': ', ":", bd)  # normalize ": " to ":"
            bd = re.sub('<br/>', "", bd)  # remove <br>
            bd = re.sub('///', "|", bd)  # replace "///" with "|"
            words = bd.split("|")
            for s in words:
                if len(s) != 0 and s.strip() != '':  # skip blank entries
                    data.append(s)
            # the cast may be missing when the director field is too long
            if len(data) != 10:
                data.insert(6, ' ')  # leave blank
            datalist.append(data)
    return datalist
def scrape(url, season):
    html = scraperwiki.scrape(url)
    html = html.decode('utf-8')
    # print html
    root = lxml.html.fromstring(html)
    matchdays = root.xpath(match_date_xpath)
    number_matchdays = len(matchdays)
    for m in matchdays:
        r = match_detail_regex.search(str(m.attrib['id']))
        url = real_baseurl % (season, r.groups()[0], r.groups()[1])
        process(url, season, r.groups()[1])
def get_listings(city):
    url = "http://{}.craigslist.org/search/apa".format(city)
    resp = requests.get(url)
    listing_ids = parse_listings(resp.content)
    for listing_id in listing_ids:
        filename = '{}{}.html'.format(DATA_PATH, listing_id)
        if os.path.isfile(filename):
            continue
        html = fetch_listing(city, listing_id)
        open(filename, 'w').write(html.decode('utf-8'))
def truncate_html(html, limit, encoding='utf-8'):
    """
    Truncate html data to the specified length and then fix broken tags.
    """
    if not isinstance(html, unicode):
        html = html.decode(encoding)
    truncated_html = html[:limit]
    elem = parse_html(truncated_html, encoding=encoding)
    fixed_html = render_html(elem, encoding=encoding)
    return fixed_html
def download():
    """Fetch the film list and save it to a CSV file."""
    # 114 pages in total
    url = 'https://www.dytt8.net/html/gndy/china/list_4_{}.html'
    urls = [url.format(i) for i in range(1, 115)]
    # proxy and user-agent
    proxy = [get_proxy()]
    # proxy = ['223.85.196.75:9999']
    userAgent = getAgent()
    downloader = Downloader(delay=1, user_agent=userAgent, timeout=100, proxies=proxy, num_retries=3)
    # CSV output
    path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(path, 'myfile')
    file = os.path.join(filepath, 'dytt.csv')
    col_head = ['名称', '日期']
    csvHtml = CsvHtml(file=file, col_head=col_head)
    for url in urls:
        html = downloader(url)
        if html:
            # the pages declare charset=gb2312; decode with gbk
            html = html.decode('gbk', 'ignore')
            tree = lxml.html.fromstring(html)
            titles = tree.cssselect('div.co_content8 b')
            dates = tree.cssselect('div.co_content8 font')
            for e in zip(titles, dates):
                row = [
                    e[0].text_content(),
                    e[1].text_content().split('\r\n')[0]
                ]
                csvHtml(row)
def userstats_api(self, user):
    url = "https://www.securitytraps.pl/wcscore.php?uname=%s&key=%s"
    authkey = self.settings["securitytraps_api_key"]
    html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey), timeout=5).read()
    html = html.decode()
    if html == "0":
        return None
    rank, challs_solved, challs_total, users_total, scoremax = html.split(":")
    return user, str(int(challs_solved)), int(challs_total), str(int(rank)), int(users_total), None, int(scoremax), None
def scrap(url):
    # fetch the page source.
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html.decode(TARGET_ENCODING))
    circles = root.xpath(TARGET_XPATH_CIRCLE)
    tables = root.xpath(TARGET_XPATH_TABLES)
    validate(len(circles) > 0 and len(tables) > 0, ERROR_CODE_NOT_FOUND_TABLE)
    validate(len(circles) == len(tables), ERROR_CODE_NOT_CORRESPONDE_COUNT)

    # process one table per circle.
    for (circle, tables) in zip(circles, tables):
        # parse the table header.
        lines = tables.xpath('tr')
        line_count = len(lines)
        validate(line_count > 1, ERROR_CODE_NO_CIRCLE_RECORD)
        name_to_index_table = {}
        index_to_name_table = {}
        header_rows = lines[0].xpath('td')
        row_count = len(header_rows)
        for x in range(row_count):
            item = header_rows[x]
            name = item.text_content()
            name_to_index_table[name] = x
            index_to_name_table[x] = name

        # read the table records.
        prev_record = {}
        rowspans_count = row_count * [0]
        for y in range(1, line_count):
            rows = lines[y].xpath('td')
            record = {TARGET_CIRCLE: circle.text_content()}
            x = 0
            for i in range(row_count):
                name = index_to_name_table[i]
                if rowspans_count[i] > 0:
                    record[name] = prev_record[name]
                else:
                    item = rows[x]
                    if 'rowspan' in item.attrib:
                        rowspans_count[i] = int(item.attrib['rowspan'])
                    if name in TARGET_TABLE_ITEM_PARSER:
                        f = TARGET_TABLE_ITEM_PARSER[name]
                        record[name] = f(item)
                    else:
                        record[name] = parse_item_default(item)
                    x += 1
            validate_with_msg(validate_record(record), ERROR_CODE_INVALID_RECORD_FOUND, record)
            prev_record = record
            formalized_record = formalize_record(record)
            validate(save_record(formalized_record), ERROR_CODE_FAILED_TO_SAVE)
            rowspans_count = map((lambda n: n - 1), rowspans_count)
def unmunge(html):
    """Clean up Word HTML"""
    if 'mso' in html:  # remove outlook html style
        key = '%s:unmunge' % hash(html)
        out = cache.get(key, namespace="filters")
        if not out:
            html = re.sub(re.compile('p"mso.*?"'), 'p', html)
            html = re.sub(re.compile('( style=".*?")'), '', html)
            out = unmungeHtml(html.decode('utf-8'))
            cache.set(key, out, namespace="filters")
        return out
    return html
def userstats(self, user):
    url = "https://defendtheweb.net/wechall/userscore?username=%s&authkey=%s"
    authkey = self.settings["defendtheweb_auth_key"]
    html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey), timeout=5).read()
    html = html.decode()
    if html == "0":
        return None
    user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(":")
    return user, str(int(challs_solved)), int(challs_total), str(int(rank)), int(users_total), int(score), int(scoremax), None
def userstats_api(self, user):
    url = "https://247ctf.com/wechall_validate_score_service?username=%s&authkey=%s"
    authkey = self.settings["247ctf_api_key"]
    html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey), timeout=5).read()
    html = html.decode()
    if html == "":
        return None
    user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(":")
    return user, str(int(challs_solved)), int(challs_total), str(int(rank)), int(users_total), int(score), int(scoremax), None
def smart_decode(self, html):
    """
    Obviously this function is not smart at all.
    Anyway, it works on most sites in Chinese.
    """
    encodings = ['utf-8', 'gbk', 'big5']
    for enc in encodings:
        try:
            return html.decode(enc)
        except:
            continue
    return html
def get_branches():
    b = Browse()
    lid = 1
    id = 1
    scraperwiki.sqlite.execute('delete from coords_data')
    scraperwiki.sqlite.execute('delete from branch_data')
    data = scraperwiki.sqlite.select('* from regions')
    for d in data:
        html = b.query("http://www.openbank.ru/ru/about/office/" + d['city_url'])
        html = html.decode('windows-1251')
        # print html
        r = lxml.html.document_fromstring(html)
        data = []
        branchName = ''
        address = ''
        for el in r.xpath("//div[@class='body_sec']//*[name()='h4' or name()='ul']"):
            # print el.tag
            if el.tag == 'h4':
                branchName = el.text_content()
                address = ''
            if el.tag == 'ul':
                address = get_xpath_el(el, 'li[1]')
            if branchName != '' and address != '':
                data.append({'id': id, 'branch_name': branchName, 'address': address,
                             'city': d['city'], 'oblasty': d['oblasty']})
                branchName = ''
                address = ''
                id += 1
        print data

        # get coordinates
        latlon = re.findall(r'createObject\("Placemark"\, new YMaps\.GeoPoint\(([\d\.]+?),\s*([\d\.]+?)\),\s*?".*?",\s*?"(.*?)"\)\)',
                            html, re.I | re.U)
        lldata = []
        if latlon != []:
            for l in latlon:
                lldata.append({'lid': lid, 'lat': l[1], 'lon': l[0], 'branch_data': l[2].encode('utf-8')})
                lid += 1
        if data != []:
            scraperwiki.sqlite.save(unique_keys=['id'], data=data, table_name='branch_data')
        if lldata != []:
            scraperwiki.sqlite.save(unique_keys=['lid'], data=lldata, table_name='coords_data')
def getWeibo(self, id, page, cid='107603'):
    # id (str): the blogger's user id; page (int): page number of the feed
    url = ('https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id +
           '&containerid=' + cid + id + '&page=' + str(page))
    print("url:", url)
    response = requests.get(url)
    html = response.text.encode('utf-8', 'ignore')
    html1 = html.decode("utf-8", 'ignore')
    ob_json = json.loads(html1)
    # print("ob_json:", ob_json)
    # ob_json: {'ok': 0, 'msg': '这里还没有内容', 'data': {'cards': []}}
    list_card = ob_json['data']
    list_cards = list_card['cards']
    # print("list_cards:", list_cards)
    return list_cards  # return all cards on this page
def get_url(url):
    req = urllib2.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    )
    response = urllib2.urlopen(req)
    # html = response.read().decode('utf-8')
    html = response.read()
    selector = etree.HTML(html)
    print html.decode('utf-8')
    data = selector.xpath("//a")
    news = []
    for i in data:
        news.append(i.get("href"))
    # //a[@class='single-story-module__headline-link']|//section/section[@id='_up_with_heds_1']/div/article/h3/a

    # # request the URL and get its text
    # wbdata = requests.get(url).text
    # # parse the fetched text
    # soup = BeautifulSoup(wbdata, 'lxml')
    # # locate the target elements with a CSS selector; returns a list
    # news_links = soup.select("h3")
    # # , h3.story-package-module__story__headline > a
    # # .single-story-module__headline-link
    # news = []
    #
    # # iterate over the returned list
    # for n in news_links:
    #     # extract the title and link
    #     link = n.get("href")
    #     news.append(link)

    print news
    return news
def download(url, user_agent='wswp', retries=2):
    print('download:', url)
    # the header dict must be passed to the Request so it is actually sent
    headers = {'User-Agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
        html = html.decode('utf-8')
        print(html)
    except urllib.request.URLError as e:
        print('download error:', e.reason)
        html = None
        if retries > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, retries - 1)
    return html