def check_errors(html):
    # Check error message from acm.timus.ru
    mpos = html.find('color:Red')
    if mpos > -1:
        # Extract red text
        spos = html.find('>', mpos) + 1
        epos = html.find('</', spos)
        raise OnlineJudje(html[spos:epos])
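# A minimal usage sketch for check_errors above. The sample markup and the generic `except
# Exception` are illustrative only; the real OnlineJudje exception class is defined elsewhere
# in the original project and is assumed here to behave like a plain Exception subclass.
def _example_check_errors():
    sample = '<span style="color:Red">Compilation error</span>'
    try:
        check_errors(sample)
    except Exception as err:  # OnlineJudje in the original code
        assert str(err) == 'Compilation error'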
def resolve_ean(ean):
    page = requests.get(SEARCH_URL.format(ean))
    html = lxml.html.document_fromstring(page.text)

    # Jump further
    further_url = "http://www.rebuy.de/" + html.find('.//a[@class="productConversion"]').attrib["href"]
    page = requests.get(further_url)
    html = lxml.html.document_fromstring(page.text)

    result = dict()
    result["title"] = html.find('.//h1/span[@class="loud"]').text_content()
    result["type"] = TYPE_TRANSLATE[html.xpath('.//p[contains(@class, "category-icon")]')[0].text_content()]
    result["imgurl"] = html.find(".//img[@id='cover']").attrib["src"]

    attribs = dict()
    for i in html.findall(".//ul[@id='main-info-facts']/li"):
        name, sep, val = i.text_content().strip().partition(":")
        attribs[name] = val

    result["created"] = defNone(attribs.get("Erscheinungsdatum"), lambda x: toDBDate(x.strip(), "%d.%m.%Y"))
    result["author"] = None
    result["artists"] = None
    result["description"] = None
    result["duration"] = None
    return result
def html_to_text(self, html):
    if html.find('head') is not None:
        html.head.decompose()
    if html.find('script') is not None:
        html.script.decompose()
    without_tags = html.get_text(" ", strip=True)
    return without_tags
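# A small sketch of html_to_text above, assuming the input is a BeautifulSoup tree (the markup
# below is invented): the <head> and the first <script> are dropped, then text is joined with spaces.
def _example_html_to_text():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(
        '<html><head><title>t</title></head>'
        '<body><script>var x;</script><p>Hello</p> <p>world</p></body></html>',
        'html.parser')
    assert html_to_text(None, soup) == 'Hello world'  # self is unused, so None is passed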
def txt_wrap_by(start_str, end, html):
    start = html.find(start_str)
    if start >= 0:
        start += len(start_str)
        end = html.find(end, start)
        if end >= 0:
            return html[start:end].strip()
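# Usage sketch for txt_wrap_by above (sample string invented): it returns the stripped text
# between the first start_str and the following end marker, or None if either marker is missing.
def _example_txt_wrap_by():
    sample = '<title> Hello, world </title>'
    assert txt_wrap_by('<title>', '</title>', sample) == 'Hello, world'
    assert txt_wrap_by('<h2>', '</h2>', sample) is None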
def compass(answers=None):
    answers = answers or def_answers.copy()
    questions = {}
    post_args = {}
    while post_args is not None:
        # Post previous responses, Get new questions (first post is empty, gets page 1)
        html_text = submit_page(post_args)
        html = lxml.html.fromstring(html_text)
        curr_questions = reap_questions(html)
        # If the test isn't done, prepare [post_args] for next page
        if len(curr_questions):
            # Verify test integrity
            if not all(item in def_questions.items()
                       for item in curr_questions.items()):
                raise RuntimeError(
                    "Questions have changed. Answer cache is bad!")
            questions.update(curr_questions)
            # Assemble responses
            post_args = {
                'answer_' + str(key): answers[key]
                for key in curr_questions
            }
            # Print responses
            for num in sorted(curr_questions):
                print(str(num) + ":\t" + curr_questions[num] + "\n\t" +
                      values[int(answers[num])] + '\n')
            submit_tag = html.find(".//input[@type='submit']")
            post_args["submit"] = submit_tag.value  # submit_tag.type == "submit"
            for tag in html.findall(".//input[@type='hidden']"):
                post_args[tag.name] = tag.value
            pageno = post_args["pageno"]
        else:
            post_args = None
            pageno = 'f'
        # with open('/Users/alex/Desktop/page' + pageno + ".html", "a+") as f:
        #     f.write(html_text)
    h2 = html.find(".//h2")
    print(h2.text_content())
    lines = h2.text_content().split('\n')
    x = float(lines[0].split(":")[1])
    y = float(lines[1].split(":")[1])
    pyplot.scatter(x, y)
    pyplot.xlim(-10, 10)
    pyplot.ylim(-10, 10)
    pyplot.title("Political coordinates")
    pyplot.xlabel("Economic Left/Right")
    pyplot.ylabel("Social Libertarian/Authoritarian")
    pyplot.grid()
    pyplot.show()
    return questions
def non_tag_chars(html):
    n = 0
    while n < len(html):
        angle = html.find('<', n)
        if angle == -1:
            yield html[n:]
            n = len(html)
            break
        yield html[n:angle]
        n = angle
        while n < len(html):
            nl = html.find('\n', n)
            angle = html.find('>', n)
            if angle == -1:
                yield ' ' * (len(html) - n)
                n = len(html)
                break
            elif nl == -1 or angle < nl:
                yield ' ' * (angle + 1 - n)
                n = angle + 1
                break
            else:
                yield ' ' * (nl - n) + '\n'
                n = nl + 1
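# Sketch of non_tag_chars above on an invented input: text outside tags passes through unchanged,
# characters inside tags become spaces (newlines are kept), so offsets and total length line up.
def _example_non_tag_chars():
    raw = 'ab<i>\nc</i>d'
    masked = ''.join(non_tag_chars(raw))
    assert masked == 'ab   \nc    d'
    assert len(masked) == len(raw)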
def on_response(self, resp, req):
    ctype = resp.headers.iget('content-type')
    if not ctype:
        return
    ctype = ctype.split(";", 1)[0]

    # if this is an html page, parse it
    if ctype in HTML_CTYPES:
        body = resp.body_string()
        html = lxml.html.fromstring(body)

        # rewrite links to absolute
        html.rewrite_links(self.rewrite_link)

        # add base only if none exists (compare against None: lxml element truthiness
        # depends on child count, so "not old_base" is misleading here)
        old_base = html.find(".//base")
        base = etree.Element("base")
        base.attrib['href'] = self.absolute_path
        if old_base is None:
            head = html.find(".//head")
            head.append(base)

        # modify response
        rewritten_body = lxml.html.tostring(html)
        try:
            resp.headers.ipop('content-length')
        except KeyError:
            pass
        resp.headers['Content-Length'] = str(len(rewritten_body))
        resp._body = StringIO(rewritten_body)
        resp._already_read = False
def mp(id, term):
    """Parse MP from his profile webpage."""
    if term and term not in terms.keys():
        raise ValueError("unknown term '%s'" % term)
    url = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/poslanec&PoslanecID=%s&CisObdobia=%s' % (id, term)
    content = scrapeutils.download(url)
    if 'Unexpected error!' in content:
        raise RuntimeError("MP with id '%s' does not exist in term '%s'" % (id, term))
    html = lxml.html.fromstring(content)

    result = {
        'id': str(id),
        'url': url
    }
    for div in html.findall('.//div[@class="mp_personal_data"]//div[strong]'):
        label = div.findtext('strong')
        value = div.find('span')
        result[label.lower()] = value.text_content() if value is not None else ''

    image_url = html.find('.//div[@class="mp_foto"]/img').get('src')
    image = requests.get(image_url).content
    with open(os.path.join(BASE_DIR, 'dummy-image.jpg'), 'rb') as f:
        dummy_image = f.read()
    result['fotka'] = image_url if image != dummy_image else ''

    result['členstvo'] = []
    ul = html.find('.//span[@id="_sectionLayoutContainer_ctl01_ctlClenstvoLabel"]').getparent().getnext()
    for li in ul.findall('li'):
        m = re.search(r'(.*?)\s*\((.*?)\)', li.text)
        result['členstvo'].append({'meno': m.group(1), 'rola': m.group(2)})

    return scrapeutils.plaintext(result)
def getLinks():
    # its_magick and url_list are module-level names in the original script.
    url_base = open("./url_base.txt", "w+")
    link_files = glob.glob("./links*.html")
    for link_file in link_files:
        filename = os.path.basename(link_file)
        # print(filename)
        f = open(filename, "r")
        html = f.read()
        glob_left_index = html.find("<div class=\"page__b-offers__guru\">") + 35
        # print(html[glob_left_index:glob_left_index+15])
        glob_right_index = html.find("<div class=\"b-offers b-offers_type_guru\">", glob_left_index)
        cursor = html.find("class=\"b-offers b-offers_type_guru\"", glob_left_index, glob_right_index)
        # print(html[cursor+39:cursor+47])
        cursor = cursor + 10
        while (cursor < glob_right_index) and (cursor != its_magick):
            cursor = html.find("class=\"b-offers b-offers_type_guru\"", cursor, glob_right_index)
            # print("More:")
            pid = html[cursor + 39:cursor + 47]
            pid = pid.replace("\"", "")
            # print("http://market.yandex.ru/model.xml?hid=90578&modelid=" + pid)
            cursor = cursor + 10
            url_list.append(pid)
            url_base.write("http://market.yandex.ru/model.xml?hid=90578&modelid=" + pid + "\n")
        url_list.pop()
    url_base.close()
def find_text(self, html, key, num_chars):
    # Find specified value in HTML tags.
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin:pos_end]
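# Sketch of how find_text above slices out a quote-terminated value (sample markup invented);
# self is unused, so None is passed for it here. num_chars skips characters after the key,
# in this case the opening quote.
def _example_find_text():
    sample = '<meta content="hello world">'
    assert find_text(None, sample, 'content=', 1) == 'hello world'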
def process(html):
    html = html[html.find('対戦用のアイテムと効果と入手方法'):html.find('同じわざしか出せなくなる代わりに')]
    parsed_html = lxml.html.fromstring(html)
    items_table = parsed_html.cssselect('table > tbody > tr')
    item_id = 0
    for item_tr in items_table:
        item_dict = {}
        item_id += 1
        item_dict["id"] = item_id
        item_dict["name"] = str.strip(item_tr[0].text_content())
        items_list.append(item_dict)
def extract_mediainfo(self, html):
    """ extract media info from onleihe search result """
    media_info = []
    if html.find('mediaInfo') != -1:  # Suchergebnisse
        LOGGER.info("extracting mediaInfo items from result")
        doc = lxml.html.document_fromstring(html)
        # <article class="list-item">
        for item in doc.xpath("//article[@class='list-item']"):
            # <a class="cover-link" title="Details zum Titel: Die Rache des Kaisers"
            media_item = {}
            details = item.xpath(".//a[@class='cover-link']")
            assert len(details) == 1, "missing article details"
            details = details[0]
            title = details.attrib['title']
            assert title.startswith(TITLE_DETAIL_PREFIX), "failed to extract media title"
            title = title[len(TITLE_DETAIL_PREFIX):].strip()
            media_item["title"] = title
            media_item["href"] = details.attrib['href']

            abstract = item.xpath(".//div[@class='abstract']")
            if abstract:
                assert len(abstract) == 1, "multiple abstracts?"
                abstract = abstract[0]
                media_item['abstract'] = abstract.text_content().strip()
            else:
                media_item['abstract'] = None

            # //div[@class='media-type']
            # <svg class="svg-icon ic_ebook"><use xlink:href="#ic_ebook"></use></svg>

            author = item.xpath(".//div[@class='author']")
            if author:
                assert len(author) == 1
                author_text = author[0].text_content().strip()
                if author_text.startswith('Autor:'):
                    author_text = author_text[6:]
                media_item["author"] = author_text
            else:
                media_item["author"] = None

            # //div[@class='available']
            # <div class="available">Verfügbar</div>

            media_info.append(media_item)
    else:
        assert html.find('Suchergebnisse') == -1
        assert html.find('contentlist resultlist') == -1

    return media_info
def getLatLng(html, info):
    latlng1 = html.find('map.setCenter(new GLatLng(')
    latlng2 = html.find(', 13);')
    gmapsStr = html[latlng1 + 26:latlng2 - 4]
    gmaps = gmapsStr.split(',')
    info['lat'] = gmaps[0]
    info['lng'] = gmaps[1]
    return info
def convert_google_sheet(sid, gid, options):
    html = parse_google_document(
        'https://docs.google.com/spreadsheets/d/{sid}/htmlembed/sheet?gid={gid}&{options}'
        .format(sid=sid, gid=gid, options=options),
        errhelp={'sid': sid, 'gid': gid}
    )
    for script in html.iter('script'):
        v = script.get('src')
        if v is None:
            #pass
            #script.getparent().remove(script)
            script.text = script.text.replace(
                "CHARTS_EXPORT_URI.push('",
                "CHARTS_EXPORT_URI.push('https://docs.google.com")
        else:
            script.set('src', "https://docs.google.com" + v)
    html.find('head/link').rewrite_links(
        lambda s: 'https://docs.google.com' + s)
    html.find('head').append(lxml.html.Element(
        'link',
        rel='stylesheet',
        href=url_for('static', filename='metatable.css'),
    ))
    html.find('body').append(lxml.html.Element(
        'script',
        src="https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"
    ))
    html.find('body').append(lxml.html.Element(
        'script',
        src=url_for('static', filename='metatable.js')
    ))
    script = lxml.html.Element('script')
    script.text = (
        "$(init); "
        "function init() { "
        "$('body').css('overflow', 'hidden'); "
        "var $table = $('#sheets-viewport table').detach(); "
        "var $metatable = create_metatable($table); "
        "$('body').empty().append($metatable); "
        "$metatable.resize(); "
        " }"
        "$('.row-header-wrapper').remove();"
        #"$('td').css('min-width', '100px');"
        "$(window).bind('load', function() {"
        "i=1;"
        "tableWidth=0;"
        "while (true) { idStr = '#0C'+i.toString(); obj = $(idStr); if (obj[0]==undefined) {break;}; wstr=obj[0].style.width.replace('px', ''); tableWidth+=parseInt(wstr); i++; }"
        "tblList = $('table.waffle');"
        "tblList[1].style.width=tableWidth.toString()+'px';"
        "tblList[3].style.width=tableWidth.toString()+'px';"
        "initCharts();"
        "});"
    )
    html.find('body').append(script)
    # with open("output.txt", "w") as text_file:
    #     text_file.write(lxml.html.tostring(html, encoding='utf-8'))
    return b'<!DOCTYPE html>\n<meta charset="UTF-8">\n' + \
        lxml.html.tostring(html, encoding='utf-8')
def compass():
    answers = def_answers.copy()
    questions = {}
    post_args = {}
    while post_args is not None:
        # Post previous responses, Get new questions (first post is empty, gets page 1)
        html_text = submit_page(post_args)
        html = lxml.html.fromstring(html_text)
        curr_questions = reap_questions(html)
        # If the test isn't done, prepare [post_args] for next page
        if len(curr_questions):
            # Verify test integrity
            if not all(item in def_questions.items()
                       for item in curr_questions.items()):
                raise RuntimeError("Questions have changed. Answer cache is bad!")
            questions.update(curr_questions)
            # Assemble responses
            post_args = {'answer_' + str(key): answers[key] for key in curr_questions}
            # Print responses
            for num in sorted(curr_questions):
                print(str(num) + ":\t" + curr_questions[num] + "\n\t" +
                      values[int(answers[num])] + '\n')
            submit_tag = html.find(".//input[@type='submit']")
            post_args["submit"] = submit_tag.value  # submit_tag.type == "submit"
            for tag in html.findall(".//input[@type='hidden']"):
                post_args[tag.name] = tag.value
            pageno = post_args["pageno"]
        else:
            post_args = None
            pageno = 'f'
        # with open('/Users/alex/Desktop/page' + pageno + ".html", "a+") as f:
        #     f.write(html_text)
    h2 = html.find(".//h2")
    print(h2.text_content())
    lines = h2.text_content().split('\n')
    x = float(lines[0][-6:])
    y = float(lines[1][-6:])
    pyplot.scatter(x, y)
    pyplot.xlim(-10, 10)
    pyplot.ylim(-10, 10)
    pyplot.title("Political coordinates")
    pyplot.xlabel("Economic Left/Right")
    pyplot.ylabel("Social Libertarian/Authoritarian")
    pyplot.grid()
    pyplot.show()
    return questions
def get_sector(ticker_old):
    na_count = 0
    ticker = ticker_old.replace('.', '')
    prefix = 'http://finance.yahoo.com/q/pr?s='
    # ticker = 'LMCK'
    response = urllib2.urlopen(prefix + ticker)
    html = response.read()
    response.close()
    # print html
    start_string = 'Sector:</td><td class="yfnc_tabledata1">'
    end_string = 'Full Time Employees'
    start_index = html.find(start_string)
    start_length = len(start_string)
    end_index = html.find(end_string)
    sub_string = html[start_index + start_length:end_index - 1]
    if ticker == r'JW/A':
        sector = 'Services'
        industry = 'Publishing - Books'
    elif ticker == 'PGRE':
        sector = 'Financials'
        industry = ' Real Estate Development & Operations - NEC'
    elif start_index == -1 or end_index == -1 or sub_string[0:3] == 'N/A':
        na_count = 1
        sector = 'N/A'
        industry = 'N/A'
    else:
        sub_string = sub_string[sub_string.find('">') + 2:]
        # print sub_string
        start_string_2 = 'Industry:</td><td class="yfnc_tabledata1">'
        start_index_2 = sub_string.find(start_string_2)
        start_length_2 = len(start_string_2)
        sub_string_2 = sub_string[(start_index_2 + start_length_2):]
        sub_string_2 = sub_string_2[sub_string_2.find('">') + 2:]
        # print sub_string
        h = HTMLParser.HTMLParser()
        sector = h.unescape(sub_string[0:sub_string.find('</a>')])
        industry = h.unescape(sub_string_2[0:sub_string_2.find('</a>')])
    print 'sector = %s' % sector
    print 'industry = %s ' % industry
    tmp_df = pd.DataFrame([{
        'ticker': ticker_old,
        'sector': sector,
        'industry': industry
    }])
    return tmp_df, na_count
def execute(self):
    rewrite, headers = self.rewrite_headers()
    if not headers:
        msg = "HTTP/1.1 502 Gateway Error\r\n\r\n bad request."
        self.resp.send(msg)
        return

    if rewrite:
        body = self.parser.body_string()
        if not body:
            rewritten_body = ''
        else:
            html = lxml.html.fromstring(body)

            # rewrite links to absolute
            html.rewrite_links(self.rewrite_link)

            # add base only if none exists (compare against None: lxml element
            # truthiness depends on child count, so "not old_base" is misleading)
            absolute_path = "%s%s" % (self.local_base, self.extra.get('path', ''))
            old_base = html.find(".//base")
            base = etree.Element("base")
            base.attrib['href'] = absolute_path
            if old_base is None:
                head = html.find(".//head")
                head.append(base)

            # modify response
            rewritten_body = bytes(lxml.html.tostring(html))

        # finally send response.
        headers.extend([
            'Content-Length: %s\r\n' % len(rewritten_body),
            "\r\n"])
        self.resp.writeall(bytes("".join(headers)))
        stream = io.BytesIO(rewritten_body)
        while True:
            data = stream.read(io.DEFAULT_BUFFER_SIZE)
            if not data:
                break
            self.resp.writeall(data)
    else:
        self.resp.writeall(bytes("".join(headers) + "\r\n"))
        body = self.parser.body_file()
        send_body(self.resp, body, self.parser.is_chunked())
def grow(year, quarter):
    pn = 1
    DF = pd.DataFrame()
    ws._write_head()
    while True:
        try:
            ws._write_console()
            url = 'http://vip.stock.finance.sina.com.cn/q/go.php/vFinanceAnalyze/kind/grow/index.phtml?s_i=&s_a=&s_c=&reportdate={0}&quarter={1}&p={2}'.format(year, quarter, pn)
            r = requests.get(url, headers=hds())
            r = r.content.decode('gbk')
            html = BeautifulSoup(r, 'lxml')
            text = html.find(id='dataTable')
            df = pd.read_html(str(text), header=0)[0]
            if df.empty is True:
                break
            else:
                pn = pn + 1
                DF = DF.append(df)
        except:
            break
    DF = DF.applymap(lambda x: np.where(x == '--', np.nan, x))
    DF = DF.set_index('股票代码')
    DF.index = DF.index.map(lambda x: str(x).split('.')[0].zfill(6))
    DF['date'] = str(year) + '_' + str(quarter).zfill(2)
    name = list(DF.columns)
    name.remove('股票名称')
    name.remove('date')
    for label in name:
        DF[label] = DF[label].astype(float)
    return DF
def get_company_info(company_number):
    '''
    Get company info for a given company number.
    Returns a dict with the information, with CompanyNumber set to -1
    if no company is found.
    '''
    html = scraperwiki.scrape(SEARCH_SITE + str(company_number))
    if html.find("The search did not produce any results") != -1:
        info = {"CompanyNumber": -1}
    else:
        root = lxml.html.fromstring(html)
        info = {}
        info["CompanyNumber"] = company_number
        tables = root.cssselect("table")
        table = tables[7]
        tds = table.cssselect("td")
        td = tds[1]
        fonts = td.cssselect("font")
        info["CompanyName"] = fonts[0].text
        brs = td.cssselect("br")
        info["Address"] = lxml.html.tostring(brs[1]).replace("<br> ", "")
        zip_city = lxml.html.tostring(brs[2]).replace("<br> ", "")
        info["PostalCode"] = zip_city.split('-')[0].strip(" ")
        info["City"] = zip_city.split('-')[1].strip(" ")
        info["ScrapedTime"] = datetime.datetime.now()
        status_code = fonts[1].text.split(":")
        info["Status"] = status_code[0]
        if len(status_code) == 2:
            info["LicenceCode"] = status_code[1].strip()
    return info
def render(self, only_op):
    html = E.html(
        E.head(
            E.title(self.title),
            E.link(href='story.css', type='text/css', rel='stylesheet')  # TODO: convert it to a PI.
        ),
        E.body(
            E.h1(self.title)
        ),
        xmlns='http://www.w3.org/1999/xhtml'
    )
    body = html.find('body')
    body.append(self.op.render(display_title=False))

    # calc mean length of replies
    sortmean = [i for i in self.replies if self.postlen(i) > 0]
    # sortmean.sort(key=lambda x: self.postlen(x))
    mean = sum(self.postlen(i) for i in sortmean) / len(sortmean)
    # print('mean reply length:', mean)

    if self.author.name == 'Anonymous' or self.author.name == '' or self.author.name is None:
        only_op = False

    for reply in self.replies:
        # Remove user answers if not wanted.
        if only_op and not reply.is_op(self.op):
            continue
        if not only_op and self.postlen(reply) < mean:
            continue
        body.append(reply.render())

    return html
def next5(self, irc, msg, args, optteam):
    """<optional team>

    Gets next 5 games, for a specific team if passed.
    """
    url = 'http://www.nhl.com/ice/schedulebymonth.htm'
    try:
        sched = utils.web.getUrl(url).decode('utf8')
    except URLError as e:
        print 'An error occurred fetching %s \n %s' % (url, e.reason)
    html = BeautifulSoup(sched, "lxml")
    try:
        table = html.find('table', {'class': 'data schedTbl'})
        rows = table.findAll('tr')
    except AttributeError as e:
        raise ValueError("No valid table found")
    # Get data
    parser = PrivFuncs()
    table_data = parser._parse_rows(rows)
    table_data = table_data[0]
    # # Print data
    # for i in table_data:
    #     print '\t'.join(i)
    irc.reply(table_data)
def scrape_html_title(self, url):
    """Scrape the ``<title>`` tag contents from an HTML page."""
    # Let's see what's on the other end...
    r = requests.get(url.geturl())
    # Only bother with 200 OK
    if r.status_code != requests.codes.ok:
        self.log.debug(u'request failed for ' + url.geturl())
        return None
    if 'html' not in r.headers['Content-Type']:
        self.log.debug(u'Content-Type not HTML-ish ({}): {}'
                       .format(r.headers['Content-Type'], url.geturl()))
        return None
    # Attempt to scrape the HTML for a <title>
    html = lxml.html.document_fromstring(r.text)
    title = html.find('.//title')
    if title is None:
        self.log.debug(u'failed to find <title>: ' + url.geturl())
        return None
    # Normalise title whitespace
    title = ' '.join(title.text.strip().split())
    nsfw = url.netloc.endswith('.xxx')
    # See if the title is in the URL
    if self._filter_title_in_url(url, title):
        return None
    # Return the scraped title
    return 'Title', nsfw, u'"{}"'.format(title)
def FI_sina(code):
    """
    Fetch the financial indicators for an individual stock.
    ---------
    Return
    """
    DF = pd.DataFrame()
    url = 'http://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/stockid/{0}/displaytype/4.phtml'.format(code)
    r = requests.get(url)
    r = r.content.decode('gbk')
    html = lxml.html.parse(StringIO(r))
    urls = html.xpath("//div[@id='con02-1']/table[1]//a/@href")
    for url in urls:
        r = requests.get(url)
        r = r.content.decode('gbk')
        html = BeautifulSoup(r, 'lxml')
        text = html.find(id='BalanceSheetNewTable0')
        df = pd.read_html(str(text), header=0)[0]
        df.columns = df.iloc[0, :]
        df = df.drop(0, axis=0)
        df = df.set_index('报告日期')
        df = df.T
        # print(df)
        DF = DF.append(df)
    return DF
def parse_html(html_file):
    html = lxml.html.parse(html_file)

    # Extract the main text
    main_text = html.find('//div[@class="main_text"]')
    if main_text is None:
        return None

    # Remove ruby annotations
    for rt in main_text.findall('.//rt'):
        rt.getparent().remove(rt)
    for rp in main_text.findall('.//rp'):
        rp.getparent().remove(rp)

    # Remove editorial notes (fullwidth ［＃...］ markers) and surrounding whitespace
    text = re.sub(
        '［＃[^］]+］\n?', '', main_text.text_content(),
        flags=(re.MULTILINE)
    ).strip()

    # Normalize
    text = unicodedata.normalize('NFKC', text)
    text = text.lower()

    return text
def xsjj(begin, end):
    """
    Query restricted-share unlock (限售解禁) data, mainly the first pages of the current data.
    -----------------
    Return: DataFrame
    """
    pn = 1
    DF = pd.DataFrame()
    ws._write_head()
    while True:
        try:
            ws._write_console()
            url = 'http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/xsjj/index.phtml?bdate={0}&edate={1}&showall=%CF%D4%CA%BE%C8%AB%B2%BF&p={2}'.format(begin, end, pn)
            r = requests.get(url)
            # print(url)
            r = r.content.decode('gbk')
            html = BeautifulSoup(r, 'lxml')
            sarr = html.find(id='dataTable')
            df = pd.read_html(str(sarr), header=0)[0]
            DF = DF.append(df)
            pn = pn + 1
            if pn > 50:
                break
            if df.empty is True:
                break
        except:
            break
    DF = DF.applymap(lambda x: np.where(x == '--', np.nan, x))
    DF = DF.drop_duplicates()
    DF['代码'] = DF['代码'].map(lambda x: str(x).split('.')[0].zfill(6))
    return DF
def getActorPhoto(self, htmltree):
    htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
    names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
    t = {}
    for name, a in zip(names, htmla):
        if name.strip() == '他':
            continue
        p = {name.strip(): a.attrib['href']}
        t.update(p)
    o = {}
    for k, v in t.items():
        if '/search_act/' not in v:
            continue
        r = self.getHtml(urljoin('https://www.caribbeancom.com', v), type='object')
        if not r.ok:
            continue
        html = r.text
        pos = html.find('.full-bg')
        if pos < 0:
            continue
        css = html[pos:pos + 100]
        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
        if not cssBGjpgs or not len(cssBGjpgs[0]):
            continue
        p = {k: urljoin(r.url, cssBGjpgs[0])}
        o.update(p)
    return o
def parse_title(self, htmlstring):
    html = lxml.html.document_fromstring(htmlstring)
    title = html.find('*/title')
    if title is None:
        return
    return title.text.encode('iso-8859-1').replace('\n', '').strip()
def add_css_js(html):
    head_sub_elements = [
        # css
        r'<link rel="stylesheet" href="/static/css/word-recall.css" type="text/css">',
        r'<link rel="stylesheet" href="/static/css/popbox.css" type="text/css" media="screen" charset="utf-8">',
        # js
        r'<script type="text/javascript" src="/static/js/jquery.min.js"></script>',
    ]
    body_sub_elements = [
        # js
        r'<script type="text/javascript" src="/static/js/recall-word.js"></script>',
        # html
        loader.render_to_string('recall/box.html'),
    ]
    add_elements_map = {'body': body_sub_elements, 'head': head_sub_elements}

    for add_tag in add_elements_map:
        element = html.find(add_tag)
        # Compare against None: lxml element truthiness depends on child count.
        if element is not None:
            for raw_html in add_elements_map[add_tag]:
                element.append(lxml.html.fromstring(raw_html))
def margin_share(code, begin, end):
    """
    begin/end: dates such as 2017-06-16
    -----------------
    Return: DataFrame
    """
    DF = pd.DataFrame()
    ws._write_head()
    try:
        ws._write_console()
        url = 'http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/rzrq/index.phtml?symbol={0}&bdate={1}&edate={2}'.format(code, begin, end)
        # print(url)
        r = requests.get(url)
        r = r.content.decode('gbk')
        html = BeautifulSoup(r, 'lxml')
        sarr = html.find(id='dataTable')
        df = pd.read_html(str(sarr), header=None, skiprows=3)[0]
        df = df.applymap(lambda x: np.where(x == '--', np.nan, x))
        df = df.drop(0, axis=1)
        # print(df.head())
        df.columns = name
        df = df.set_index('date')
        return df
    except Exception as e:
        print(e)
        pass
def try_accept_cookies():
    # Searches soup for the button on the cookie banner
    cookie_tag = html.find('button', {'id': 'onetrust-accept-btn-handler'})
    if cookie_tag:
        # print('cookie tag found')
        cookie_xpath = xpath_soup(cookie_tag)
        cookie_button = browser.find_element_by_xpath(cookie_xpath).click()
def get_page():
    # request the page
    r = requests.get(request.args['url'])
    # parse the dom into python objects
    html = lxml.html.document_fromstring(r.content)
    # parse the requested url so we can form the base href
    url = urlparse(request.args['url'])
    # create the base url dom fragment
    base_url = lxml.html.fromstring("<base href='%s://%s'>" % (url.scheme, url.hostname)).find('.//base')
    # find the head element
    head = html.find(".//head")
    # insert the base href in the last place of the head elements
    head.insert(-1, base_url)
    # rewrite urls to have absolute url
    html.resolve_base_href()
    # rewrite links to load through this proxy
    for element, attribute, link, pos in html.iterlinks():
        if element.tag == "a" and attribute == "href":
            link = "http://localhost:8888/translate_url?url=%s" % (link)
            element.set("href", link)
            element.set("target", "_parent")
    # translate through DOM Traversal
    # html = translate_dom_string(html, lxml.html.tostring(html))
    # translate through HTML regex string replacement
    html = translate_html(html, lxml.html.tostring(html))
    # dump the html string for debugging
    # with open('html_dump', 'w') as f:
    #     f.write(lxml.html.tostring(html))
    # a little regex to remove any script tags
    # (the (?s) flag is placed at the front; trailing global flags are an error on newer Python)
    return re.subn(r'(?s)<(script).*?</\1>', '', lxml.html.tostring(html))[0]
def dzjy():
    """
    Query block-trade (大宗交易) data, mainly the first 20 pages of the current data.
    -----------------
    Return: DataFrame
    """
    pn = 1
    DF = pd.DataFrame()
    ws._write_head()
    while True:
        try:
            ws._write_console()
            url = 'http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/dzjy/index.phtml?num=60&p={0}'.format(pn)
            r = requests.get(url)
            # print(url)
            r = r.content.decode('gbk')
            html = BeautifulSoup(r, 'lxml')
            sarr = html.find(id='dataTable')
            df = pd.read_html(str(sarr), header=0)[0]
            DF = DF.append(df)
            pn = pn + 1
            if pn > 20:
                break
            if df.empty is True:
                break
        except:
            break
    DF = DF.applymap(lambda x: np.where(x == '--', np.nan, x))
    DF = DF.drop_duplicates()
    return DF
def search(bookname):
    # Search audiobooks.
    # Parameter page (int), e.g. 1
    # Parameter searchword (string): the 56 Tingshu site requires the search term to be
    # GBK-encoded, e.g. %D5%C2%D3%E3
    path = '/search.asp?page=1&searchword=' + urllib.request.quote(bookname.encode('gb2312'))
    html = getHtmlContent(path)
    searchList = html.find_all(class_='list-ov-tw')
    pageCount = html.find(class_='cate-pages').find_all('li')[1].text.split('/')
    nowPageCount = pageCount[0]   # current page
    allPageCount = pageCount[1]   # total number of pages
    bookList = []                 # search results
    for searchItem in searchList:
        bookUrl = searchItem.find(class_='bt').find('a').attrs['href']
        bookImg = searchItem.find('img').attrs['original']
        bookName = searchItem.find(class_='bt').text
        bookAuthor = searchItem.find_all(class_='zz')[0].text + ' ' + searchItem.find_all(class_='zz')[1].text
        bookContent = searchItem.find(class_='nr').text
        book = Book(bookUrl, bookImg, bookName, bookAuthor, bookContent,
                    nowPageCount, allPageCount)
        bookList.append(book)
    return bookList
def main():
    print 'start at %s' % time.asctime()
    users = db.select("users")
    print 'current users count %s ' % len(users)
    for user in users:
        # print 'user %s ' % user.token
        # print 'user %s ' % user.secret
        access_token = OAuthToken(user.token, user.secret)
        if not user.trunk_key:
            continue
        t = Trunkly(user.trunk_key)
        sinat = Sinat(sinaConsumer, access_token=access_token)
        statuses = sinat.statuses__user_timeline('GET')
        for status in statuses:
            weibo = status['text']
            if status.has_key('retweeted_status'):
                weibo = '%s //@%s: %s' % (weibo,
                                          status['retweeted_status']['user']['name'],
                                          status['retweeted_status']['text'])
            # print 'status %s' % status['text']
            urls = p.findall(weibo)
            for url in urls:
                print 'url is %s ' % url
                title = None
                trunk = None
                try:
                    html = lxml.html.parse(url)
                    title = html.find(".//title").text
                    url = html.getroot().base_url
                    print 'title is %s' % title
                    print 'base url is %s ' % url
                    try:
                        try:
                            trunk = t.get_link(parameters={'url': url})
                            print 'url Already exists!!!'
                            continue
                        except:
                            print 'error'
                            pass
                        if title and not trunk:
                            print 'post url to trunk.ly'
                            t.post_link(parameters={'url': url,
                                                    'title': title,
                                                    'tags': '',
                                                    'note': weibo,
                                                    'text': weibo})
                    except:
                        print 'post to trunk error. url %s title %s' % (url, title)
                except:
                    print 'url %s fetch error' % (url)
    print '---------------- end ---------------------'
def get_actor_photo(lx, session):
    htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
    names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
    t = {}
    for name, a in zip(names, htmla):
        if name.strip() == '他':
            continue
        p = {name.strip(): a.attrib['href']}
        t.update(p)
    o = {}
    for k, v in t.items():
        if '/search_act/' not in v:
            continue
        r = session.get(urljoin(G_SITE, v))
        if not r.ok:
            continue
        html = r.text
        pos = html.find('.full-bg')
        if pos < 0:
            continue
        css = html[pos:pos + 100]
        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
        if not cssBGjpgs or not len(cssBGjpgs[0]):
            continue
        p = {k: urljoin(r.url, cssBGjpgs[0])}
        o.update(p)
    return o
def scrape_pacific_plants_names():
    r = requests.get('http://www.hear.org/pier/wralist.htm')
    html = BeautifulSoup(r.text, 'lxml')
    table = html.find('table')

    # remove header row
    rows = table.find_all('tr')[1:]

    # find scientific names
    data = [row.find_all('td')[0] for row in rows]
    plant_scientific_names = []

    # clean parsed data
    for d in data:
        plant_scientific_names.append(parse_html(str(d)))

    # write names to json
    with open('data/plant_scientific_names.json', 'w') as datafile:
        json.dump(plant_scientific_names, datafile, indent=2, separators=(',', ':'))

    # find common names
    data = [row.find_all('td')[2] for row in rows]
    plant_common_names = []

    # clean parsed data
    for d in data:
        plant_common_names.append(parse_html(str(d)))

    # write names to json
    with open('data/plant_common_names.json', 'w') as datafile:
        json.dump(plant_common_names, datafile, indent=2, separators=(',', ':'))
def print_url(url):
    # Get webpage content
    try:
        r = requests.get(url, timeout=2)
    except requests.exceptions.MissingSchema:
        return print_url("http://{}".format(url))
    except:
        return False

    # Print URL and title
    try:
        html = lxml.html.fromstring(r.text)
        print(url)
        try:
            # Some valid webpages don't have titles
            print(html.find(".//title").text.strip())
        except:
            pass
        print('')
        sys.stdout.flush()
        return True
    except:
        return False
def CS_sina(code):
    """
    2017-06-16: testing shows problems.
    Fetch the cash-flow statement for an individual stock.
    -------
    Return
    """
    DF = pd.DataFrame()
    url = 'http://money.finance.sina.com.cn/corp/go.php/vFD_CashFlow/stockid/{0}/ctrl/part/displaytype/4.phtml'.format(code)
    r = requests.get(url)
    r = r.content.decode('gbk')
    html = lxml.html.parse(StringIO(r))
    urls = html.xpath("//div[@id='con02-1']/table[1]//a/@href")
    for url in urls:
        r = requests.get(url)
        r = r.content.decode('gbk')
        html = BeautifulSoup(r, 'lxml')
        text = html.find(id='ProfitStatementNewTable0')
        df = pd.read_html(str(text), header=0)[0]
        df.columns = df.iloc[0, :]
        df = df.drop(0, axis=0)
        df = df.set_index('报表日期')
        df = df.T
        # print(df)
        DF = DF.append(df)
    return DF
def speaker():
    """Parse current speaker (predseda) of the chamber."""
    url = 'http://www.nrsr.sk/web/default.aspx?sid=predseda'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)

    div = html.find(".//div[@id='_sectionLayoutContainer__panelContent']")
    result = {
        'url': url,
        'meno': div.find(".//h1").text_content(),
    }
    image = div.find('.//img')
    if image is not None:
        result['fotka'] = 'http://www.nrsr.sk/web/' + image.get('src')
    born = div.find("div[@class='article']")
    if born is not None:
        result['narodený'] = re.search(r'Narodený: (.*)', born.text_content()).group(1)
    bio = div.find('table')
    if bio is not None:
        result['životopis'] = lxml.html.tostring(bio, encoding='unicode', with_tail=False)
    return scrapeutils.plaintext(result)
def get_predict_share_Sina(code, mtype):
    """
    mtype: eps -- earnings per share, sales -- operating revenue,
           np -- net profit, roe -- return on equity
    """
    pn = 1
    DF = pd.DataFrame()
    ws._write_head()
    while True:
        try:
            ws._write_console()
            url = 'http://vip.stock.finance.sina.com.cn/q/go.php/vPerformancePrediction/kind/{0}/index.phtml?symbol={1}&p={2}'.format(mtype, code, pn)
            r = requests.get(url)
            r = r.content.decode('gbk')
            html = BeautifulSoup(r, 'lxml')
            text = html.find(id='dataTable')
            df = pd.read_html(str(text), header=0)[0]
            if df.empty is True:
                break
            else:
                pn = pn + 1
                DF = DF.append(df)
        except:
            break
    DF = DF.applymap(lambda x: np.where(x == '--', np.nan, x))
    DF = DF.set_index('股票代码')
    DF.index = DF.index.map(lambda x: str(x).split('.')[0].zfill(6))
    return DF
def current_term():
    url = 'http://www.nrsr.sk/web/default.aspx?sid=poslanci'
    content = scrapeutils.download(url)
    html = lxml.html.fromstring(content)
    option = html.find('.//select[@id="_sectionLayoutContainer_ctl01__currentTerm"]/option[@selected]')
    return option.get('value')
def get_actor_photo(browser):
    htmla = browser.page.select(
        '#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a'
    )
    t = {}
    for a in htmla:
        if a.text.strip() == '他':
            continue
        p = {a.text.strip(): a['href']}
        t.update(p)
    o = {}
    for k, v in t.items():
        if '/search_act/' not in v:
            continue
        r = browser.open_relative(v)
        if not r.ok:
            continue
        html = browser.page.prettify()
        pos = html.find('.full-bg')
        if pos < 0:
            continue
        css = html[pos:pos + 100]
        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
        if not cssBGjpgs or not len(cssBGjpgs[0]):
            continue
        p = {k: urljoin(browser.url, cssBGjpgs[0])}
        o.update(p)
    return o
def scrape_haodoo():
    """ Main """
    skip_stage1 = False
    try:
        print(">>> Stage 1 - Collecting all book urls {}<<<".format(datetime.now()))
        if not skip_stage1:
            for url in urls():
                # print(url)
                html = scrape(url)
                parse_books_from_html(html)
                page = 1
                while True:
                    suburl = get_suburl(url, page)
                    # print(suburl)
                    if html.find(urlparse(suburl).query):
                        html = scrape(suburl)
                        if html.find("<strong>404") != -1:
                            break
                        parse_books_from_html(html)
                        page = page + 1
                    else:
                        break

        print(">>> Stage 2 - Analysing all book urls {}<<<".format(datetime.now()))
        p = Pool()
        results = p.map(grab_and_analysis, Page.query_all())
        # results = map(grab_and_analysis, Page.query_all())

        print(">>> Stage 3 - Saving results {}<<<".format(datetime.now()))
        for volumes in results:
            for volume in volumes:
                save_volume(volume)

        print(">>> Stage 4 - done {}<<<".format(datetime.now()))
    except Exception as e:
        print("Got exception:")
        print(e)
        print(traceback.format_exc())
def estrai(html, search, nrline, offset):
    i = html.find(search)
    if i > 0:
        i = i + len(search)
        j = i + offset
        s = html[i:j]
        return s
    else:
        return ''
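# Usage sketch for estrai above (sample string invented): it returns the `offset` characters that
# follow the first match of `search`, or '' when the match is missing (or sits at index 0, because
# of the `i > 0` test). The nrline parameter is unused by the function.
def _example_estrai():
    sample = 'x id="12345678"'
    assert estrai(sample, 'id="', None, 8) == '12345678'
    assert estrai(sample, 'missing', None, 8) == ''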
def from_url(cls, url, tokenizer):
    data = fetch_url(url)
    # form the lxml tree from the data
    html = lxml.html.fromstring(data)
    # find and store the title in the instance
    title = html.find(".//title").text
    return cls(data, tokenizer, url, title)
def main():
    urls = [
        'http://www.haodoo.net/?M=hd&P=wisdom',
        'http://www.haodoo.net/?M=hd&P=history',
        'http://www.haodoo.net/?M=hd&P=martial',
        'http://www.haodoo.net/?M=hd&P=mystery',
        'http://www.haodoo.net/?M=hd&P=romance',
        'http://www.haodoo.net/?M=hd&P=scifi',
        'http://www.haodoo.net/?M=hd&P=fiction',
    ]

    skip_stage1 = False
    try:
        print(">>> Stage 1 - Collecting all book urls <<<")
        if not skip_stage1:
            for url in urls:
                html = scraperwiki.scrape(url)
                page = 1
                while True:
                    suburl = "{0}-{1}".format(url, page)
                    if html.find(suburl[suburl.find('?'):]):
                        html = scraperwiki.scrape(suburl)
                        if html.find("<strong>404") != -1:
                            break
                        parse_books_from_html(html)
                        page = page + 1
                    else:
                        break

        print(">>> Stage 2 - Analysing all book urls <<<")
        for book in scraperwiki.sqlite.select("* from bookpages"):
            # grab html
            html = scraperwiki.scrape(book['url'])
            # analyse and store information into book
            analysis_book_html_and_save(book, html)

        print(">>> Stage 3 - done <<<")
    except Exception as e:
        print("Got exception:")
        print(e)
        print(traceback.format_exc())
def artverify(art, html='', pdf=''):
    """Check whether HTML and PDF documents match abstract text.

    Arguments:
        html (str): HTML text (optional)
        pdf (str): PDF text (optional)
    """
    # Cast article to Article
    art = toart(art)

    # Get article info
    info = artinfo({'xml': art.xml})

    # Quit if no abstract
    if info['abstxt'] is None:
        return None, None

    # Tokenize abstract
    abstxt = info['abstxt']
    abswords = re.split(r'\s+', abstxt)
    abswords = [word.lower() for word in abswords]

    # Ignore punctuation
    for char in ['.', ',', ';', ':']:
        abswords = [word.strip(char) for word in abswords]

    # Load HTML
    if not html:
        html = loadhtml(art, overwrite=True)

    # Load PDF
    if not pdf:
        pdf = loadpdf(art)
        pdf = to_unicode(pdf)

    # To lower-case
    html = html.lower()
    pdf = pdf.lower()

    # Check HTML
    if html:
        htmlwords = [word for word in abswords if html.find(word) > -1]
        htmlprop = float(len(htmlwords)) / len(abswords)
    else:
        htmlprop = None

    # Check PDF
    if pdf:
        pdfwords = [word for word in abswords if pdf.find(word) > -1]
        pdfprop = float(len(pdfwords)) / len(abswords)
    else:
        pdfprop = None

    # Return
    return htmlprop, pdfprop
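# Toy illustration (invented strings) of the overlap metric artverify computes: the fraction of
# abstract words found verbatim, via str.find, in the full document text.
def _example_overlap_metric():
    abswords = ['neural', 'networks', 'learn', 'features']
    fulltext = 'deep neural networks learn hierarchical representations'
    prop = float(len([w for w in abswords if fulltext.find(w) > -1])) / len(abswords)
    assert prop == 0.75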
def extract_set_title(html):
    start_pos = html.find('SetTitle("')
    if start_pos == -1:
        return ("", "")
    start_quote = html.find('"', start_pos)
    if start_quote == -1:
        return ("", "")
    end_quote = html.find('"', start_quote + 1)
    if end_quote == -1:
        return ("", "")
    set_title = html[start_quote + 1:end_quote - 1]
    set_title = set_title.replace('《', ',')
    r = set_title.split(',')
    if len(r) != 2:
        return ("", "")
    return r
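# Sketch of extract_set_title above on an invented SetTitle("作者《書名》") snippet, the pattern the
# helper appears to target: the end_quote-1 slice drops the trailing 》 and 《 becomes the separator.
def _example_extract_set_title():
    html = 'SetTitle("作者《書名》");'
    assert extract_set_title(html) == ['作者', '書名']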
def resolve_ean(ean):
    page = requests.post(SEARCH_URL, data={"form[q]": ean})

    # Check if something was found
    if "keine Artikel gefunden" in page.text:
        return None

    html = lxml.html.document_fromstring(page.text)
    result = dict()
    result["type"] = html.find('.//li[@class="variant"]').text_content().strip()
    if result["type"] == "Audio CD":
        result["type"] = "audiobook"
        result["author"] = html.find('.//a[@class="author"]').text_content().strip()
        result["artists"] = None
    elif result["type"] == "Gebundenes Buch":
        result["type"] = "book"
        result["author"] = html.find('.//a[@class="author"]').text_content().strip()
        result["artists"] = None
    else:
        result["artists"] = result["author"] = None
        result["type"] = "movie"
    result["title"] = html.find('.//h1[@class="headline"]').text

    attr_field = html.find('.//ul[@class="plain"]')
    attrs = dict()
    for li in attr_field.findall(".//li"):
        data = li.text_content()
        if data:
            title, sep, val = data.partition(":")
            attrs[title] = val.strip()

    # Extract description
    description_element = html.find('.//div[@class="product-description"]/div[2]/div[1]')
    # Convert brs to nl
    if description_element is not None:
        for br in description_element.xpath(".//br"):
            br.tail = "\n" + br.tail if br.tail else "\n"
        description = description_element.text_content()
        # Strip trailing crap
        result["description"] = description[:description.find("Bonusmaterial")]
    else:
        # Ignore this hit if there is no description
        return None

    try:
        result["duration"] = int(re.search("Gesamtlaufzeit: (\d+) Min.", page.text).group(1))
    except:
        result["duration"] = None

    result["created"] = defNone(attrs.get("Erscheinungstermin"), lambda x: interpDate(x))
    result["studio"] = attrs.get("Hersteller")
    result["imgurl"] = html.find('.//img[@class="cover"]').attrib["src"]
    return result
def xpath_analyzer(self, content_html):
    """Use XPath to extract stats, and return stats dictionary."""
    html = lxml.html.fromstring(content_html)
    stats = dict()
    for stat_name, selector in self.stats_map.iteritems():
        stat_block = html.find(self.xpath_search_str % (selector))
        assert stat_block is not None
        logging.debug('%s: %s', stat_name, stat_block.text_content())
        stats[stat_name] = int(stat_block.text_content().replace(',', ''))
    return stats
def scrape_html_title(self, url):
    """Scrape the ``<title>`` tag contents from the HTML page at *url*.

    Returns a :class:`LinkInfoResult`.
    """
    make_error = partial(LinkInfoResult, url.geturl(), is_error=True)
    # Let's see what's on the other end...
    with closing(simple_http_get(url.geturl(), stream=True)) as r:
        # Only bother with 200 OK
        if r.status_code != requests.codes.ok:
            return make_error("HTTP request failed: {}".format(r.status_code))
        # Only process HTML-ish responses
        if "Content-Type" not in r.headers:
            return make_error("No Content-Type header")
        elif "html" not in r.headers["Content-Type"]:
            return make_error("Content-Type not HTML-ish: {}".format(
                r.headers["Content-Type"]))
        # Don't try to process massive responses
        if "Content-Length" in r.headers:
            max_size = int(self.config_get("max_response_size"))
            if int(r.headers["Content-Length"]) > max_size:
                return make_error("Content-Length too large: {} bytes, >{}".format(
                    r.headers["Content-Length"], self.config_get("max_response_size")))
        # Get the correct parser
        if "charset=" in r.headers["content-type"]:
            # If present, HTTP Content-Type header charset takes precedence
            parser = lxml.html.HTMLParser(
                encoding=r.headers["content-type"].rsplit("=", 1)[1])
        else:
            parser = lxml.html.html_parser
        # Get only a chunk, in case Content-Length is absent on massive file
        chunk = next(r.iter_content(int(self.config_get("max_response_size"))))
        # Try to trim chunk to a tag end to help the HTML parser out
        try:
            chunk = chunk[:chunk.rindex(b">") + 1]
        except ValueError:
            pass
        # Attempt to get the <title> tag
        html = lxml.etree.fromstring(chunk, parser)
        title = html.find(".//title")
        if title is None:
            return make_error("failed to find <title>")
        # Normalise title whitespace
        title = " ".join(title.text.strip().split())
        # Build result
        result = LinkInfoResult(url, title, nsfw=url.netloc.endswith(".xxx"))
        # See if the title is redundant, i.e. appears in the URL
        result.is_redundant = self._filter_title_in_url(url, title)
        return result
def parse_page_content(self, page_content, cached_soup={}):
    # cached_soup is an intentional mutable default used as a memo cache across calls.
    page_key = hashlib.sha224(page_content.encode("utf-8")).hexdigest()
    try:
        if page_key not in cached_soup:
            import lxml.html.soupparser
            cached_soup[page_key] = lxml.html.soupparser.fromstring(page_content)
        html = deepcopy(cached_soup[page_key])
        body = html.find(".//body")
        if body is None:
            raise UnknownContentException()
        # for simplicity we decided to use the BeautifulSoup parser for now
        # html = etree.XML(page_content, etree.XMLParser())
        # body = html.find('{%s}body' % NS['html'])
        # if body is None:
        #     raise UnknownContentException()
    except (ExpatError, etree.XMLSyntaxError, UnknownContentException):
        raise
        # logging.warning('Was not valid XHTML; trying with BeautifulSoup')
        # try:
        #     import lxml.html.soupparser
        #     html = lxml.html.soupparser.fromstring(page_content)
        #     body = html.find('.//body')
        #     if body is None:
        #         raise
        # except:
        #     # Give up
        #     logging.error("Giving up on this content")
        #     raise UnknownContentException()

    if self.current_anchor is None:
        return html

    elements_to_remove = []
    start_elem, end_elem = find_bounding_elements(
        body,
        self.previous_anchor,
        None if self.current_anchor["id"] is None else self.current_anchor,
        self.next_anchor,
    )
    within_start_and_end_elem = True if start_elem is None else False

    for elem in body.iterdescendants():
        if elem == start_elem:
            within_start_and_end_elem = True
        elif elem == end_elem:
            within_start_and_end_elem = False
        if not within_start_and_end_elem and start_elem not in elem.iterdescendants():
            elements_to_remove.append(elem)

    for elem in elements_to_remove:
        elem.clear()
        try:
            body.remove(elem)
        except ValueError:
            pass

    return html