def getDomFromFile(url):
    #html = requests.get("http://www.autolanka.com/Buy.asp").content
    html = open('index' + str(url), 'r')
    dom = lxml.html.fromstring(html.read())
    #data = minePage(dom)
    return dom
def get_email(self, url, search_word):
    self._clear_variable()
    if url == 'nan':
        self.emails = ''
        return
    if fnmatch.fnmatch(url, '*.txt') or fnmatch.fnmatch(url, '*.pdf'):
        self.emails = ''
        return
    try:
        html = urlopen(url)
        soup = BeautifulSoup(html.read(), "lxml")
        email = soup.find_all(string=re.compile(search_word))
        self._set_emails(email)
        if len(self.emails) > 0:
            print('Found the string {}.\nemails: {}'.format(
                search_word, self.emails))
        else:
            print('No string containing {} was found.'.format(search_word))
    except urllib.error.HTTPError as e:
        print(e)
        if e.code == 403:
            self.emails = None
        else:
            self.emails = ''
def download(url, num_retries=2, headers={'User-agent': 'wswp'}):
    print 'Downloading:' + url
    # a hard-coded 'cookie' header can be added here for sites that require a logged-in session
    request = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(request)
        html = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            html = gzip.GzipFile(fileobj=StringIO.StringIO(html), mode="r")
            try:
                html = html.read()  # .decode('gbk').encode('utf-8')
            except IOError as e1:
                # Amazon responses occasionally fail to decompress; re-fetch instead
                html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Downloading error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1, headers)
    #print html
    return html
def match_walletID_bitaddr(ID_txhash, address_type):
    global idx
    socket.setdefaulttimeout(3)
    for walletId in ID_txhash.keys():
        idx += 1
        print(idx)
        try:
            txhashes = ID_txhash[walletId]
        except Exception as e:
            continue
        for txhash in txhashes:
            try:
                request = urllib.request.urlopen('http://www.qukuai.com/search/zh-CN/BTC/' + txhash)
                html = request.read()
                request.close()
                address = get_address(html, address_type)
                # print('method2 ', walletId, address)
            except Exception as e:
                try:
                    html = urllib.request.urlopen('https://blockchain.info/rawtx/' + txhash)
                    hjson = json.loads(html.read())
                    address = parse_transaction(hjson, address_type)
                    # print('method1 ', address)
                except Exception as e:
                    print('get address failed')
                    continue
            if walletId not in walletId_bitaddr:
                # print('1 ', walletId, address)
                walletId_bitaddr[walletId] = address
            else:
                # print('not 1 ', walletId, address)
                walletId_bitaddr[walletId].extend(address)
def processRounds(roundURLs):
    for roundURL in roundURLs:
        html = urllib2.urlopen(siteURL + roundURL)
        roundPage = lxml.html.fromstring(html.read())
        html.close()
        round = roundPage.cssselect(
            "li[id='tpRound'] a")[0].text_content().replace(
                "round ", "").replace(" Rankings", "").strip()
        print "Round: " + round
        roundRows = roundPage.cssselect("div[id='view_standard'] tr")
        # the page links are specified in the footer
        pageLinks = roundRows[-1].cssselect("a")
        # remove the "next page" link
        del pageLinks[-1]
        for link in pageLinks:
            linkURL = siteURL + link.get("href")
            print linkURL
            scrapePage(linkURL, round)
        calculateExtraStats(round)
def gettitle(url):
    requests.packages.urllib3.disable_warnings()
    req = request.Request(url)
    try:
        resp = request.urlopen(req)
        html = urlopen(url)
        # parse the response body;
        # catch the exception raised when the target tag is missing from the page
        try:
            soup = BeautifulSoup(html.read(), 'lxml')
            title = soup.title.text
            tfw = open("title.txt", "a")
            tfw.write(str(soup.title.text) + "\n")
            tfw.close()
            ufw = open("url.txt", "a")
            ufw.write(str(resp.url) + "\n")
            ufw.close()  # close() is required, otherwise nothing gets written
        except AttributeError as e:
            print(url + " " + "no title")
            efw = open("eception.txt", "a")
            efw.write(url + " no title" + "\n")
    except error.HTTPError as e:
        print(e.code)
        efw = open("eception.txt", "a")
        efw.write(url + " " + str(e.code) + "\n")
    except error.URLError as e:
        print(e.reason)
        efw = open("eception.txt", "a")
        efw.write(url + " " + str(e.reason) + "\n")
def get_html(self):
    try:
        html = urllib2.urlopen(URL)
    except Exception as e:
        self.exit(STATES.UNKNOWN, 'Error while opening url: %s' % str(e))
    if html.getcode() >= 400:
        self.exit(STATES.UNKNOWN, 'HTTP error: %d' % html.getcode())
    return html.read()
def test1():
    j = json.loads('{"one" : "1", "two" : "2", "three" : "3"}')
    html = urlopen(
        "http://www.czce.com.cn/portal/DFSStaticFiles/Future/2017/20171026/FutureDataDaily.xls"
    )
    data = html.read()
    print(data)
    return
def getHtml(self, url):
    if self.testUrl(url) is True:
        html = urllib.request.urlopen(url)
        mybytes = html.read()
        mystr = mybytes.decode("utf8")
        html.close()
        return mystr
    else:
        return None
def yuandaima(ss):
    url = ss
    headers1 = {'GET': url,
                'Host': "www.icpcw.com",
                'User-Agent': "Mozilla/5.0 (Windows NT 6.2; rv:28.0) Gecko/20100101 Firefox/28.0",
                'Referer': url}
    req = urllib.request.Request(url, headers=headers1)
    html = urllib.request.urlopen(req)
    scode = html.read().decode('utf-8', 'ignore')
    return scode
def getSeniority(linkList):
    myList = []
    for link in linkList:
        html = urlopen(link)
        bs = BeautifulSoup(html.read(), 'html.parser')
        seniority = bs.find(
            'div', {'col star-section text-center active'}).findNext('p')
        myList.append(seniority.get_text())
    return myList
def __init__(self, url): print("load codeforces contest %s" % url) base = urlparse(url).netloc html = request.urlopen(url) self.dom = lxml.html.fromstring(html.read()) self.contest_id = CFContest.get_contest_id(url) self.pdf_name = "CF" + self.contest_id + ".pdf" self.problems = [] for problem_a_tag in self.dom.xpath('//table[@class="problems"]/tr[position() > 1]/td[1]/a'): self.problems.append(CFProblem("https://" + base + problem_a_tag.attrib['href']))
def get_all_functions(self, passedurl, topics):
    '''open the function page for parsing'''
    html = urllib.urlopen(passedurl)
    html = html.read()
    maintree = etree.parse(StringIO(html), self.parser)
    mainContent = maintree.xpath("//div[@class='section']")  # scrape main div containing data
    if self.url == 'http://docs.scipy.org/doc/scipy/reference/':
        self.scrape_section(mainContent[0], topics, scipy_first=True)
    else:
        self.scrape_section(mainContent[0], topics)
def get_html(self, url):
    opener = urllib2.build_opener()
    # agence.santemontreal.qc.ca seems to prohibit access (403) to "custom" http agents
    # (like the urllib2 one); by forcing User-agent we work around the problem:
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        html = opener.open(url)
    except Exception as e:
        self.exit(STATES.UNKNOWN, 'Error while opening url: %s' % str(e))
    if html.getcode() >= 400:
        self.exit(STATES.UNKNOWN, 'HTTP error: %d' % html.getcode())
    return html.read()
def request(self, url, params={}, timeout=180):
    error = None
    for x in range(0, settings.http_tries):
        try:
            if params:
                params = urllib.urlencode(params)
                html = urllib2.urlopen(url, params, timeout)
            else:
                html = urllib2.urlopen(url)
            return html.read()
        except Exception as e:
            error = e
    raise error
def getTitleAll(url, t1, t2, t3):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
        price = bsObj.findAll(t1, attrs={t2: t3})
        print(title.get_text())
        for el in price:
            print(el.get_text())
    except AttributeError as e:
        return None
    return price
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
        price = bsObj.findAll("span", attrs={"class": "cost"})
        print(title.get_text())
        for el in price:
            print(el.get_text())
    except AttributeError as e:
        return None
    return price
def search(s):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = request.Request('https://baike.baidu.com/item/' + quote(s, 'utf-8'), headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html.read(), "html.parser")
    bs = bsObj.find_all(name='div', attrs={'class': 'para'})
    content = ""
    for i in bs:
        content = f'{content}{i.text}'
    return content
def expert_prepare(_url):
    dictionary = {}
    db = MySQLdb.connect(host='localhost', user='******', passwd='123qwe',
                         db='infoport', charset='utf8',
                         init_command='SET NAMES UTF8')
    cursor = db.cursor()
    cursor.execute('select interest, article_id from exbd')
    result = cursor.fetchall()
    i = 0
    k = 0
    listkeys = []
    dictkeys = {}
    for record in result:
        if record[i + 1] == _url:  # was !=
            dictkeys[k] = record[i]
            k = k + 1
            #listkeys.append(record[i])
    dictionary['keyword'] = dictkeys
    #dictionary['keyword'] = dictkeys.get('keys')
    #dictionary['keyword'] = listkeys
    #print dictionary['keyword']
    html = urllib.urlopen(_url)
    doc = lxml.html.document_fromstring(html.read().decode('utf-8', 'ignore'))
    post = doc.cssselect('div.main .person-appointment-title')[0]
    dictionary['pos'] = post.text  # .encode('utf-8')
    academictitle = doc.cssselect('div.main .person-appointment-title')[0]
    dictionary['academic_title'] = academictitle.text  # .encode('utf-8')
    fio = doc.cssselect('div.footer__breadcrumbs .b ')[0]  # full name
    dictionary['fio'] = fio.text  # .encode('utf-8')
    items = doc.cssselect('div.g-pic')
    for item in items:
        image = item.get('style')
        s = image.split("'")
        page = 'http://www.hse.ru' + s[1]
        person_id = page.split("/")
        dictionary['person_id'] = person_id[6]
        #print page  # address of the page that holds the photo
    place = doc.cssselect('div.main .person-appointment-title + .link')
    #dictionary['place'] = place[0].text
    #print place[1].text  # prints the CITY
    dictionary['photo'] = page
    #json_data = json.dumps(dictionary)
    #print json_data
    return dictionary
def use(self, rawcommand):
    if not len(rawcommand):
        return
    cooked = urllib.parse.urlencode({"search": rawcommand})
    html = urllib.request.urlopen(
        "http://t-rechnik.info/search.php?" + cooked)
    html = html.read().decode("utf8")
    root = lxml.html.fromstring(html)
    tbl = root.get_element_by_id("table")
    if len(tbl) == 3:
        txt = tbl[2].text_content()
        txt = re.sub(r"\r|\n", " ", txt)
        txt = re.sub(r"\s+", " ", txt)
        self.bot.privmsg(self.bot.sender[0], txt, option="multiline")
        return
    return "nothing found"
def Scrape(tech, city, starting_page: int, ending_page: int):
    generalList = []
    #generalList.append('Job Title;;;Employer Name;;;Salary;;;Link;;;Seniority;;;describtion;;;experience')
    if not city:
        city = 'warszawa'
    for i in range(starting_page, ending_page + 1):
        #print("Trying crawling on page " + str(i) + "/" + str(ending_page))
        if tech:
            url = 'https://nofluffjobs.com/pl/jobs/' + city + '/' + tech + '?criteria=city%3D' + \
                  city + '%20' + tech + '&page=' + str(i)
        else:
            url = 'https://nofluffjobs.com/pl/jobs/' + city + '?criteria=city%3D' + \
                  city + '&page=' + str(i)
        # TODO: handle the case where neither tech nor location is given
        try:
            html = urlopen(url)
            #print("HTML found (1/3)")
        except HTTPError as e:
            #print('HTML does not exist')
            break
        except URLError as e:
            #print("Server not found")
            break
        else:
            pass
            #print("Successfully connected to the server! (2/3)")
        bs = BeautifulSoup(html.read(), 'html.parser')
        title = getTitle(bs)
        employer = getEmployer(bs)
        salary = getSalary(bs)
        link = getLinks(bs)
        seniority = getSeniority(link)
        desc = getDescription(link)
        experience = getExperience(desc)
        for i in range(countOffers(bs)):
            # job offer objects are created here
            jobOffer = Oferta(title[i], employer[i], salary[i], link[i],
                              seniority[i], desc[i], experience[i])
            # jobOffer = "%s;;;%s;;;%s;;;%s;;;%s;;;%s;;;%s" % (title[i], employer[i], salary[i],
            #                                                  link[i], seniority[i], desc[i], experience[i])
            generalList.append(jobOffer)
    return generalList
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "lxml")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title


title = getTitle(url)
if title is None:
    print("Title could not be found")
else:
    print(title)
def main(self):
    '''Scrapes function name, argument list, description for argument,
    URL for description, URL for examples.'''
    html = urllib.urlopen(self.url)
    html = html.read()
    maintree = etree.parse(StringIO(html), self.parser)
    mainContent = maintree.xpath("//div[@class='section']")  # scrape main div containing data
    main_h1 = [child for child in mainContent[0].iterchildren('h1')]  # get its child h1
    contentHTML = etree.tostring(main_h1[0], pretty_print=True)
    tree = etree.parse(StringIO(contentHTML), self.parser)
    title_text = tree.xpath("//text()")[0].strip()  # title_text
    all_content = [child for child in mainContent[0].iterchildren('div')]  # get its child div
    contentHTML = etree.tostring(all_content[0], pretty_print=True)
    tree = etree.parse(StringIO(contentHTML), self.parser)
    all_content_class = tree.xpath("//@class")[0].strip()
    if all_content_class == 'toctree-wrapper compound':
        main_ul = [child for child in all_content[0].iterchildren('ul')]  # get its child ul
    else:
        main_ul = [child for child in all_content[1].iterchildren('ul')]  # get its child ul
    main_li = [child for child in main_ul[0].iterchildren('li')]  # get its child li
    for each_li in main_li:
        main_a = [child for child in each_li.iterchildren('a')]  # get its child a
        sectionHTML = etree.tostring(main_a[0], pretty_print=True)
        tree = etree.parse(StringIO(sectionHTML), self.parser)
        main_topic = ' '.join(tree.xpath("//text()")).encode('utf-8').strip()
        main_topic_link = tree.xpath("//@href")[0].encode('utf-8').strip()
        # main_topic, main_topic_link
        sub_ul = [child for child in each_li.iterchildren('ul')]  # get its child ul
        if len(sub_ul) != 0:
            sub_li = [child for child in sub_ul[0].iterchildren('li')]  # get its children li
            for each_sub_li in sub_li:
                sectionHTML = etree.tostring(each_sub_li, pretty_print=True)
                tree = etree.parse(StringIO(sectionHTML), self.parser)
                sub_topic = ' '.join(tree.xpath("//text()")).encode('utf-8').strip()
                sub_topic_link = tree.xpath("//@href")[0].encode('utf-8').strip()
                topics = {'main_topic': main_topic,
                          'main_topic_link': self.url + main_topic_link,
                          'sub_topic': sub_topic,
                          'sub_topic_link': self.url + sub_topic_link}
                self.get_all_functions(topics['sub_topic_link'], topics)
        else:
            topics = {'main_topic': main_topic,
                      'main_topic_link': self.url + main_topic_link,
                      'sub_topic': '',
                      'sub_topic_link': ''}
            self.get_all_functions(topics['main_topic_link'], topics)
def save_model(self, request, obj, form, change):
    if obj and form.is_valid():
        toc = None
        excerpt = None
        if 'original_file' in form.changed_data:
            if obj.html_file:
                obj.html_file.delete(save=False)
            f = request.FILES['original_file']
            html = _original_file_to_html(f)
            obj.html_file.save(obj.title + '.html', html, save=False)
            obj.html_file.close()
            html.seek(0)
            htmltree = lxml.html.fromstring(html.read().decode('utf-8'))
            toc = get_html_toc(htmltree)
            excerpt = get_html_excerpt(htmltree)
            f.close()
        obj.save(toc, excerpt)
def parse_web_page(url, xpaths=None, links=False):
    """Parse a response returned by a URL.

    The response can be parsed on the basis of xpaths determined by the URL's
    Resource instance or the xpaths given. If the response is to be parsed
    based on the former, the xpaths can be normal or related to link
    extraction, and thus patch-finding/recursion.

    Args:
        url (str): The URL to be parsed.
        xpaths (list[str]): A list of xpaths to parse the response with
            respect to. Defaults to None. If None, the xpaths are taken from
            the URL's corresponding Resource instance.
        links (bool): If True, the links xpaths are used from the
            corresponding Resource, else the normal xpaths are. Defaults to
            False.

    Returns:
        list[str]: A list of strings scraped from the determined or given
            xpaths.

    Raises:
        Exception: If there is an error in opening the given URL.
    """
    logger.info("Opening %s...", url)
    try:
        html = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        raise Exception("Error opening {url}".format(url=url))
    logger.info("Crawled %s", url)
    search_results = []
    if not xpaths:
        if not links:
            xpaths = Resource.get_resource(url).normal_xpaths
        else:
            xpaths = Resource.get_resource(url).links_xpaths
    elements = lxml.html.fromstring(html.read())
    for element in elements:
        if element.tag != "body":
            continue
        for xpath in xpaths:
            search_results.extend(element.xpath(xpath))
        break
    return search_results
def __init__(self, url): print("load codeforces problem %s" % url) html = request.urlopen(url) self.problem_id = CFProblem.get_problem_id(url) self.pdf_name = 'CF' + self.problem_id + '.pdf' self.dom = lxml.html.fromstring(html.read()) self.contest_name = self.dom.xpath('//*[@id="sidebar"]/div[1]/table/tbody/tr[1]/th/a')[0].text base_tag = lxml.html.Element('base', href="https://%s" % urlparse(url).netloc) style_tag = lxml.html.Element('style') style_tag.text = '#pageContent>*:not(.problemindexholder) { display: none !important; } #header { display: none; } #footer { display: none; } .roundbox.menu-box { display: none; } #sidebar { display: none; } #body > br:nth-child(8) { display: none; } #pageContent { margin-right: 0 !important; } #body { padding-top: 0; } #MathJax_Message { display: none !important; }' self.dom.xpath('//html')[0].insert(0, base_tag) self.dom.xpath('//head')[0].append(style_tag) contest_tag = lxml.html.Element('div') contest_tag.text = self.contest_name #contest_tag.attrib['class'] = 'title' contest_tag.attrib['style'] = 'text-align: left;' self.dom.xpath('//*[@class="header"]')[0].insert(0, contest_tag)
def get_prefectures(self):
    # STEP 1: fetch the list of prefectures
    try:
        html = urlopen(URL_TOP)
        soup = BeautifulSoup(html.read(), "lxml")
        links = soup.select("table tr td a")
        for link in links:
            exclusion = str(link).count('HOME') or str(link).count(
                '都道府県') or str(link).count('メール送信')
            if exclusion:
                continue
            href = link.get('href')
            self.pref_list.append({'url': href, 'name': link.text})
    except Exception as e:
        print('-----page not found.-----')
        print(e)
        self.pref_list = None
def _load_html(self, html, parser=lxml.html.parse):
    self.form_files = {}
    if hasattr(html, 'seek'):
        html.seek(0)
    if isinstance(html, (unicode, str)):
        html = StringIO(html)
    if isinstance(html, requests.Response):
        html = StringIO(html.content)
    if len(html.read()) == 0:
        self.document = None
        return None
    else:
        html.seek(0)
        self.document = parser(html)
        return html
def main():
    html = urllib.urlopen(url)
    doc = lxml.html.document_fromstring(html.read().decode('utf-8', 'ignore'))
    post = doc.cssselect('div.main .person-appointment-title')[0]
    print post.text  # position
    post1 = urllib.urlencode(post)
    #print p
    academictitle = doc.cssselect('div.main .person-appointment-title')[1]
    print academictitle.text  # academic title
    academictitle1 = urllib.urlencode(academictitle)
    fio = doc.cssselect('div.footer__breadcrumbs .b ')[0]  # full name
    print fio.text  # full name
    fio1 = urllib.urlencode(fio)
    items = doc.cssselect('div.g-pic')
    for item in items:
        image = item.get('style')
        #print image
        s = image.split("'")
        #print s[1]
        page = 'hse.ru' + s[1]  # address of the page that holds the photo
        print page  # hse.ru/pubs/share/direct/138568616
    # dictionary to be converted to JSON
    dictionary = {'post': post1, 'academic title': academictitle1, 'fio': fio1, 'photo': page}
    print dictionary
    #print dictionary {'academic title': 'class=person-appointment-title', 'post': 'class=person-appointment-title', 'fio': 'class=b', 'photo': 'hse.ru/pubs/share/direct/138568616'}
    #print(json.dumps((d), sort_keys=True))
    json_data = json.dumps(dictionary)
    print (json.dumps(dictionary, sort_keys=True, indent=4, separators=(',', ': ')))
    # Result of printing the dictionary as JSON data:
    # {
    #     "academic title": "class=person-appointment-title",
    #     "fio": "class=b",
    #     "photo": "hse.ru/pubs/share/direct/138568616",
    #     "post": "class=person-appointment-title"
    # }
    # For some reason the printed values are not the field values themselves
    # (e.g. for the full name) but only the name of the class the field lives in.
    # The same happens with all the fields we need.
    elements_json = json.loads(json_data)
    print elements_json["post"]  # access by key  -> class=person-appointment-title
    return json_data
def get_cities(self):
    # STEP 2: fetch the list of cities
    if self.pref_list is None:
        return
    for pref in self.pref_list:
        target_url = URL_TOP + pref['url']
        try:
            df = pd.DataFrame(columns=df_columns)
            html = urlopen(target_url)
            soup = BeautifulSoup(html.read(), "lxml")
            links = soup.select("center table tr td a")
            for link in links:
                if str(link).count('☆'):
                    continue
                href = link.get('href')
                arr = href.split("//")
                domain = arr[1]
                domain = domain[:-1]
                data = {
                    "pref": pref['name'],
                    "name": link.text,
                    "top_url": href,
                    'domain': domain
                }
                df = df.append(data, ignore_index=True)
                print(data)
            self.pref_df = pd.concat([self.pref_df, df])
        except Exception as e:
            print('-----page not found.-----')
            print(e)
def __get_flat_details__(self, link, flat_params):
    """
    Takes the URL of a page with information about a flat.
    Returns a dict mapping parameter name (as shown on the page) -> value.
    """
    url = settings.SITE_ROOT + link
    html = self.__get_url__(url)
    xhtml = lxml.html.fromstring(html.read())
    cells = xhtml.xpath(settings.DETAIL_CELLS_XPATH)
    result = dict()
    result[u"URL"] = url
    for i in range(len(cells) / 2):
        value = cells.pop().text_content()
        name = cells.pop().text_content()
        name = re.sub(":", "", name)
        name = name.strip()
        if name in flat_params:
            result[name] = value
    return result
def __init__(self, url):
    self.url = url
    html = urllib2.urlopen(url)
    self.source = lxml.html.fromstring(html.read())
def accident_records():
    print "reached"
    all_accidents = []
    for file_name in range(29):
        file_name = APP_ROOT + "/accidentApp" + "/try/" + str(file_name) + ".html"
        print file_name
        try:
            html = urllib.urlopen(file_name)
        except:
            continue
        html = html.read()
        i = 1
        while True:
            if i == 1:
                my_iter = 1
                my_iter2 = 3
            else:
                my_iter = 0
                my_iter2 = 0
            root1 = lxml.html.fromstring(html)
            try:
                main_content = root1.cssselect('div#pf' + str(i))
                i += 1
            except:
                break
            print main_content
            if main_content == []:
                break
            node = main_content[0]
            try:
                content_date = node.cssselect('div.x4')[my_iter:]
                content_time = node.cssselect('div.x4 div.t')[my_iter:]
                content_location = node.cssselect('div.x4 div.t')[my_iter:]
                death_1 = node.cssselect('div.x12')[my_iter2:]
                death_2 = node.cssselect('div.x1d')[my_iter:]
                death_3 = node.cssselect('div.x1e')[my_iter:]
                death_4 = node.cssselect('div.x1f')[my_iter:]
                injury_1 = node.cssselect('div.x13')[my_iter2:]
                injury_2 = node.cssselect('div.x20')[my_iter:]
                injury_3 = node.cssselect('div.x21')[my_iter:]
                injury_4 = node.cssselect('div.x22')[my_iter:]
                injury2_1 = node.cssselect('div.x14')[my_iter2:]
                injury2_2 = node.cssselect('div.x23')[my_iter:]
                injury2_3 = node.cssselect('div.x24')[my_iter:]
                injury2_4 = node.cssselect('div.x25')[my_iter:]
                vehicle_1 = node.cssselect('div.x15')
                vehicle_2 = node.cssselect('div.x26')
                vehicle_3 = node.cssselect('div.x27')
                vehicle_4 = node.cssselect('div.x28')
                vehicle_5 = node.cssselect('div.x29')
                vehicle_6 = node.cssselect('div.x2a')
                vehicle_7 = node.cssselect('div.x2b')
                vehicle_8 = node.cssselect('div.x2c')
                vehicle_damaged = node.cssselect('div.x18')[1:]
                rows = zip(content_date, content_time, content_location,
                           death_1, death_2, death_3, death_4,
                           injury_1, injury_2, injury_3, injury_4,
                           injury2_1, injury2_2, injury2_3, injury2_4,
                           vehicle_1, vehicle_2, vehicle_3, vehicle_4,
                           vehicle_5, vehicle_6, vehicle_7, vehicle_8,
                           vehicle_damaged)
            except:
                pass
            for item in rows:
                try:
                    print "------------------------------"
                    accident = {}
                    my_date = map_number(item[0].cssselect("div.t")[0].text_content().split()[0])
                    print my_date
                    accident["year"] = my_date.split(".")[0]
                    accident["month"] = my_date.split(".")[1]
                    accident["day"] = my_date.split(".")[2]
                    time = map_number(item[0].cssselect("div.t")[1].text_content().split()[0])
                    accident["hour"] = time.split(":")[0]
                    accident["minute"] = time.split(":")[1]
                    accident["location"] = item[0].cssselect("div.t")[2].text_content().strip()
                    death = 0
                    for each_death in item[3:7]:
                        death += int(each_death.text_content().strip() or 0)
                    injury = 0
                    for each_injury in item[7:15]:
                        injury += int(each_injury.text_content().strip() or 0)
                    accident["death"] = death
                    accident["injury"] = injury
                    accident["vehicle_damaged"] = int(item[-1].text_content().strip() or 0)
                    all_accidents.append(accident)
                    #print all_accidents
                except:
                    pass
    print all_accidents
    return all_accidents
def get_function_details(self, func_details, topics):
    html = urllib.urlopen(func_details['function_link'])
    html = html.read()
    self.parser = etree.HTMLParser()
    maintree = etree.parse(StringIO(html), self.parser)
    mainContent1 = maintree.xpath("//dl[@class='method']")    # scrape main div containing data
    mainContent2 = maintree.xpath("//dl[@class='function']")  # scrape main div containing data
    if len(mainContent1) == 0 and len(mainContent2) != 0:
        mainContent = mainContent2
    elif len(mainContent2) == 0 and len(mainContent1) != 0:
        mainContent = mainContent1
    elif len(mainContent1) == 0 and len(mainContent2) == 0:
        return
    argument_list = [child for child in mainContent[0].iterchildren('dt')]  # get its child dt
    contentHTML = etree.tostring(argument_list[0], pretty_print=True)
    tree = etree.parse(StringIO(contentHTML), self.parser)
    argument_list = tree.xpath("//text()")
    argument_list = ''.join(argument_list[1:len(argument_list) - 1]).encode('utf-8').strip()
    # getting details for each arg
    split_data = argument_list.split('(')
    full_function_name = split_data[0]
    sec_split_data = split_data[1].split(')')
    args = sec_split_data[:-1]
    arg_dict = {}
    if len(args) != 0:
        args = args[0].split(',')
        for each_arg in args:
            each_split = each_arg.split('=')
            if len(each_split) == 1:
                if each_arg.find('.') == -1:
                    arg_dict[each_arg] = {'optional_flag': 0, 'default_value': ''}
            else:
                if each_split[0].find('.') == -1:
                    arg_dict[each_split[0]] = {'optional_flag': 1, 'default_value': each_split[1]}
    # parsing examples
    examples = ''
    dd = [child for child in mainContent[0].iterchildren('dd')]  # get its child dd
    example_div = [child for child in dd[0].iterchildren('div')]  # get its child div
    if len(example_div) != 0:
        contentHTML = etree.tostring(example_div[0], pretty_print=True)
        tree = etree.parse(StringIO(contentHTML), self.parser)
        example_div_class = tree.xpath("//@class")
        if example_div_class[0] == 'highlight-python':
            examples = tree.xpath("//text()")
            examples = ''.join(examples)
    parameters_table = [child for child in mainContent[0].iterdescendants('table')]  # get its child table
    if len(parameters_table) != 0:
        contentHTML = etree.tostring(parameters_table[0], pretty_print=True)
        tree = etree.parse(StringIO(contentHTML), self.parser)
        table_class = tree.xpath("//@class")
        if table_class[0] == 'docutils field-list':
            all_desc = [child for child in parameters_table[0].iterdescendants('tr')]  # get its child tr
            # for parameters
            argument_desc = [child for child in all_desc[0].iterchildren('td')]  # get its child td
            contentHTML = etree.tostring(argument_desc[0], pretty_print=True)
            tree = etree.parse(StringIO(contentHTML), self.parser)
            argument_desc_list = tree.xpath("//text()")
            para_arg = {}
            para_arg['argument_desc'] = ''.join(argument_desc_list).encode('utf-8').strip()
            # for returns
            if len(all_desc) == 2:
                parameter_desc = [child for child in all_desc[1].iterchildren('td')]  # get its child td
                contentHTML = etree.tostring(parameter_desc[0], pretty_print=True)
                tree = etree.parse(StringIO(contentHTML), self.parser)
                parameter_desc_list = tree.xpath("//text()")
                para_arg['parameter_desc'] = ''.join(parameter_desc_list).encode('utf-8').strip()
            para_arg['parameter_desc'] = para_arg.get('parameter_desc') if para_arg.get('parameter_desc') != None else ''
            # final_data = {'function_name': func_details['function_name'],
            final_data = {'function_name': full_function_name,
                          'function_link': func_details['function_link'],
                          'function_description': func_details['function_desc'],
                          'argument_list': arg_dict,
                          'argument_description': para_arg['argument_desc'],
                          'return_parameter': para_arg['parameter_desc'],
                          'examples': examples,
                          'sub_topic': topics['sub_topic'],
                          'sub_topic_link': topics['sub_topic_link'],
                          'main_topic': topics['main_topic'],
                          'main_topic_link': topics['main_topic_link']}
            # write to mongodb
            self.mongo_obj.write_data(self.table_name, final_data)
    else:
        final_data = {'function_name': full_function_name,
                      'function_link': func_details['function_link'],
                      'function_description': func_details['function_desc'],
                      'argument_list': arg_dict,
                      'argument_description': '',
                      'return_parameter': '',
                      'examples': examples,
                      'sub_topic': topics['sub_topic'],
                      'sub_topic_link': topics['sub_topic_link'],
                      'main_topic': topics['main_topic'],
                      'main_topic_link': topics['main_topic_link']}
        self.mongo_obj.write_data(self.table_name, final_data)
import time
import sys
import codecs
import lxml.html
import urllib2

query = 'http://www39.atwiki.jp/osakahennyu/?cmd=backup&action=source&pageid=<PLACEHOLDER>&num=0'

for line in open(sys.argv[1], 'r'):
    url = query.replace('<PLACEHOLDER>', line.rstrip())
    while True:
        try:
            html = urllib2.urlopen(url)
            code = unicode(html.read(), 'utf-8')
            dom = lxml.html.fromstring(code)
            wiki = dom.xpath('//pre')[0]
            fout = codecs.open(line.rstrip() + '.txt', 'w', 'utf-8')
            fout.write(wiki.text)
            fout.close()
            html.close()
            break
        except urllib2.HTTPError:
            raw_input('>>> error! press continue...')
    time.sleep(1)
import re
import xml.etree.ElementTree as ET
# Blank Python
#import json  # for json decoding
from lxml import etree
from cStringIO import StringIO
import urllib

totalLinks = []
for i in range(21)[1:]:
    strAddr = "http://codingtrying.herobo.com/" + str(i) + ".html"
    html = urllib.urlopen(strAddr)
    html = html.read()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    mainContent = tree.xpath("//th[@class='rowA']/a/@href")
    for content in mainContent:
        if content != "http://www.dlapiper.com/us/people/#":
            totalLinks.append(content)

i = 0
for url in totalLinks:
    if i <= 481:
        i = i + 1
        continue
    try:
        page = scraperwiki.scrape(url)
        html = bs.BeautifulSoup(page)
def scrape_section(self, element, topics, scipy_first=False, all_info=None):
    if scipy_first:
        h1_topic = [child for child in element.iterchildren('h1')]  # get its child h1
        actual_link = [child for child in h1_topic[0].iterchildren('a')]  # get its child a
        if len(actual_link) == 2:
            contentHTML = etree.tostring(actual_link[0], pretty_print=True)
            tree = etree.parse(StringIO(contentHTML), self.parser)
            actual_link = tree.xpath("//@href")[0].split('/')
            if actual_link[0] == '..':
                html = urllib.urlopen(self.url + actual_link[1])
                html = html.read()
                maintree = etree.parse(StringIO(html), self.parser)
                mainContent = maintree.xpath("//div[@class='section']")  # scrape main div containing data
                self.scrape_section(mainContent[0], topics)
            else:
                return
    else:
        main_topics = [child for child in element.iterchildren('div')]  # get its child div
        for each_topic in main_topics:
            contentHTML = etree.tostring(each_topic, pretty_print=True)
            tree = etree.parse(StringIO(contentHTML), self.parser)
            div_class = tree.xpath("//@class")
            if div_class[0] == 'section':
                title = [child for child in each_topic.iterchildren('h2')]  # get its child h2
                mini_title, information = '', ''
                if len(title) == 0:
                    title = [child for child in each_topic.iterchildren('h3')]  # get its child h3
                if len(title) != 0:
                    titleHTML = etree.tostring(title[0], pretty_print=True)
                    title_tree = etree.parse(StringIO(titleHTML), self.parser)
                    mini_title = title_tree.xpath("//text()")[0].encode('utf-8').strip()
                if self.url == 'http://docs.scipy.org/doc/numpy/user/':
                    info = [child for child in each_topic.iterchildren('p')]  # get its child para
                    if len(info) != 0:
                        infoHTML = etree.tostring(info[0], pretty_print=True)
                        info_tree = etree.parse(StringIO(infoHTML), self.parser)
                        information = info_tree.xpath("//text()")[0].encode('utf-8').strip()
                    if all_info != None:
                        info_details = {'mini_title': mini_title,
                                        'mini_info': information,
                                        'parent_title': all_info.get('mini_title'),
                                        'parent_info': all_info.get('mini_info')}
                    else:
                        info_details = {'mini_title': mini_title, 'mini_info': information}
                else:
                    info_details = {'mini_title': mini_title, 'mini_info': information}
                self.scrape_section(each_topic, topics, all_info=info_details)
            else:
                self.get_func_tables(each_topic, topics)  # check if table of functions exists
                # check if there is a section div within the div
                self.scrape_section(each_topic, topics)
        else:
            if self.url == 'http://docs.scipy.org/doc/numpy/user/' and all_info != None:
                final_data = {'sub_topic': topics['sub_topic'],
                              'sub_topic_link': topics['sub_topic_link'],
                              'main_topic': topics['main_topic'],
                              'main_topic_link': topics['main_topic_link']}
                if all_info.get('parent_title') == None and all_info.get('parent_info') == None:
                    final_data['parent_title'] = all_info['mini_title']
                    final_data['parent_info'] = all_info['mini_info']
                    final_data['mini_title'] = ''
                    final_data['mini_info'] = ''
                    self.mongo_obj.write_data(self.table_name, final_data)
                else:
                    final_data['parent_title'] = all_info.get('parent_title')
                    final_data['parent_info'] = all_info.get('parent_info')
                    final_data['mini_title'] = all_info['mini_title']
                    final_data['mini_info'] = all_info['mini_info']
                    self.mongo_obj.write_data(self.table_name, final_data)
#tf.close()
###########################################################################
###### Instagram API ######################################################
client_id = '71b5f772fc5a467fbb4e6066ecbe9536'
access_token = "1451885321.71b5f77.ddac6f3e719c4afb8375ab1dda874fd9"
client_secret = "8fbf9fa995804da09d587ec0a3819e01"
api = InstagramAPI(access_token=access_token, client_secret=client_secret)
result = api.tag_recent_media(100, 10, moviename)
url = result[1]
html = urllib.urlopen(url)  # open the data URL with the urllib module
htmlread = html.read().decode('utf-8')  # read the response into htmlread
jjson = json.loads(htmlread)
data = jjson['data']
try:
    #saveFile = open('result.txt', 'a')
    for i in range(0, len(data)):
        a = data[i]
        tag = a['tags']
        for i in range(0, len(tag)):
            #saveFile.write(str(tag[i].encode('utf-8')) + " ")
            finaldata = finaldata + tag[i] + u" "
from BeautifulSoup import BeautifulSoup
import re
import urllib
import lxml.html
import string
import json
import pickle

for char in string.uppercase:
    movieInfoList = []
    html = urllib.urlopen('http://www.gomolo.com/indian-movies-list-films-database?SearchChar=' + char)
    soup = BeautifulSoup(html.read())
    #print soup.html.head.title.string
    items = soup.findAll("div", attrs={"id": "divMain"})[0].contents[0].contents
    movielinks = []
    for item in items:
        try:
            movielinks.append(item.contents[0].contents[0].attrs[0][1])
        except IndexError:
            print "IndexError"
            pass
    #movielinks = ['http://www.gomolo.com/bal-hanuman-2-movie/39179']
    for link in movielinks:
        movieInfo = {}
        arr = link.split("/")
#!/usr/bin/env python
import scraperwiki
import requests
import lxml.html
from bs4 import BeautifulSoup

html = open('index', 'r')
content = html.read()
print content
soup = BeautifulSoup(content)
print soup.prettify()
    while len(word) > index:
        val = word[index]
        index = index + 1
    else:
        val = "NULL"
    #print var + val + "\n"
    if val == None:
        return
    di[var] = val
    return val


#html = requests.get("http://www.autolanka.com/Buy.asp").content
html = open('index', 'r')
dom = lxml.html.fromstring(html.read())
varia = ["Code:", "Added:", "Make:", "Model:", "No:", "Year:",
         "Location:", "Options:", "Price:", "Info:"]
di = {}
ads = {}
for entry in dom.cssselect('.BuyDataTD'):
    [extract(var, entry, di) for var in varia]
    if len(di) == 10:
        print di
    #if (len(di)==10) and not(di['Code:'].replace("Code:","") in ads):
    #    ads[di['Code:'].replace("Code:","")]=di
#print ads
#! python3
import urllib.request
import lxml.html
import re

url = 'http://em.scnu.edu.cn/article/xueyuantongzhi/zonghe/'
html = urllib.request.urlopen(url)
scode = html.read().decode('utf-8')
doc = lxml.html.document_fromstring(scode)
ss = doc.xpath(
    """//div[@class="c_news"]/ul/li/a/font/text()|//div[@class="c_news"]/ul/li/a/text()""")
bb = doc.xpath("""//div[@class="c_news"]/ul/li/span/text()""")
aa = list(zip(ss, bb))
print(aa)
}
web = {}
web['新闻'] = 'https://searchcloudcomputing.techtarget.com.cn/news/'
for key in web:
    with open('D:/' + key + '.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(('title', 'abstract', 'type', 'content'))
        for i in range(2, 407):
            try:
                print((key + '%.2f' % ((i - 1) / 407 * 100)) + "%")
                req = request.Request(
                    'https://searchcloudcomputing.techtarget.com.cn/interviews/page/3/',
                    headers=headers)
                html = urlopen(req)
                bsObj = BeautifulSoup(html.read(), "html.parser")
                print(bsObj.text)
                bs = bsObj.find_all('h4', attrs={'class': 'newslist'})
                print(bs)
                for j in bs:
                    req = request.Request(j.find('a').get('href'), headers=headers)
                    print(j.find('a').get('href'))
                    html = urlopen(req)
                    bsObj = BeautifulSoup(html.read(), "html.parser")
                    bs = bsObj.find_all(name='div', attrs={'class': 'newslist'})
                    content = ''
                    for i in bs:
                        content = f'{content}{i.text}'
                    title = bsObj.find_all('h1')
def getHtml(url):
    html = urllib2.urlopen(url)
    page = lxml.html.fromstring(html.read())
    html.close()
    return page