def update(self):
    self.lastUpdate = datetime.now()
    for linenumber, bus in self.lines.items():
        bus.reset()
    html = self.downloader.get(self.url)
    for line in xpath.search(html, '//table//tr'):
        counter = 0
        time = ''
        delay = ''
        destination = ''
        linenumber = ''
        details = ''
        for item in xpath.search(line, '/td'):
            if counter == 0:
                hour, minutes = xpath.get(item, '/span').strip().split(':')
                time = Time(hour, minutes)
                delay = xpath.get(
                    item, '/span[@class="block exclamation bold mts"]'
                ).strip().split(':')
                if len(delay) > 1:
                    delay = Time(delay[0], delay[1], "delay")
                else:
                    delay = Time(0, delay[0], "delay")
            elif counter == 1:
                destination = item.strip()
            elif counter == 2:
                linenumber = item.strip()
            elif counter == 4:
                details = re.sub('<span.*</span>', '', item).strip()
            counter += 1
        if delay != '':
            # dict.has_key() is deprecated; use the in operator
            if linenumber + destination in self.lines:
                self.lines[linenumber + destination].find(time).delay = delay
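# A minimal, self-contained sketch of the delay-parsing rule update() relies
# on: a value like "1:05" carries hours and minutes, while a bare "7" means
# minutes only. Time is whatever class the surrounding module defines; this
# sketch only demonstrates the split itself.
def split_delay(text):
    parts = text.strip().split(':')
    if len(parts) > 1:
        return int(parts[0]), int(parts[1])  # hours, minutes
    return 0, int(parts[0])  # minutes only

print split_delay('1:05')  # -> (1, 5)
print split_delay('7')     # -> (0, 7)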
def holders_parse(html, i):
    infos = []
    h = xpath.get(html, r'//table[@class="table"]', remove=None)
    for k in xpath.search(h, r'//tr', remove=None):
        if '</td><td>' in k:
            ms = [common.normalize(m) for m in xpath.search(k, r'//td')]
            infos.append('"' + '","'.join(ms) + '"')
    return infos
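# Usage sketch for holders_parse() with a hypothetical HTML fragment; xpath
# here is the regex-based module from the webscraping package, which returns
# matched fragments as strings rather than element objects.
sample = ('<table class="table"><tr><th>holder</th></tr>'
          '<tr><td>0xabc</td><td>1,000</td></tr></table>')
for quoted_row in holders_parse(sample, 0):
    print quoted_row  # e.g. "0xabc","1,000"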
def download_locations():
    """Find latitude longitude bounding box for this country"""
    D = download.Download(num_retries=1)
    index_url = 'http://download.geonames.org/export/zip/'
    index_html = D.get(index_url)
    for link in xpath.search(index_html, '//pre/a/@href'):
        if link.endswith('.zip') and '_full' not in link and 'allCountries' not in link:
            download_html = D.get(urlparse.urljoin(index_url, link))
            input_zip = StringIO.StringIO()
            input_zip.write(download_html)
            try:
                tsv_data = zipfile.ZipFile(input_zip).read(
                    link.replace('.zip', '.txt'))
            except zipfile.BadZipfile as e:
                print e
                del D.cache[urlparse.urljoin(index_url, link)]
                continue
            output_filename = link.replace('.zip', '_locations.csv')
            writer = csv.writer(open(output_filename, 'w'))
            found = set()
            for row in csv.reader(tsv_data.splitlines(), delimiter='\t'):
                zip_code = row[1] = row[1].split('-')[0]
                try:
                    lat, lng = float(row[9]), float(row[10])
                except ValueError:
                    print 'bad coord:', row[9], row[10]
                else:
                    if lat and lng and zip_code not in found:
                        found.add(zip_code)
                        place = row[2]
                        writer.writerow([place, zip_code, lat, lng])
            print 'Downloaded to', output_filename
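# Sketch of reading back one of the per-country CSVs the function above
# writes; 'US_locations.csv' is an assumed output name (US.zip ->
# US_locations.csv under the renaming rule used above).
import csv
for place, zip_code, lat, lng in csv.reader(open('US_locations.csv')):
    print place, zip_code, lat, lng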
def process_page_bizhi(page):
    try:
        html = urllib2.urlopen(page).read()
        app_list = xpath.search(html, '//div[@id="plistbox"]/span/a/@href')
        if not app_list:
            print "page format changed at: %s" % page
            return
        for app in app_list:
            process_bizhi_url('http://a.3533.com' + app)
        page_next = xpath.search(html, '//div[@class="page"]/ul/li/a[@class="next"]/@href')
        if page_next:
            process_page_bizhi('http://a.3533.com' + page_next[0])
        else:
            print "reached the max page or page format changed: %s" % page
    except urllib2.URLError, e:
        logger.error('process_url URLError Exception at: %s, %s' % (page, e.message))
def scrapeBaramDom():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader('http://www.baramdom.com/')
    content = down.get_content()
    html = unicode(content)
    p = xpath.get(html, '//div[@class="box post"]')
    linkovi = xpath.search(p, '//div[@class="content"]')
    ads = []
    for l in linkovi:
        link = "http://www.baramdom.com" + xpath.get(l, '//div[@class="post-title"]/h2/a/@href')
        title = xpath.get(l, '//div[@class="post-title"]/h2/a')
        imageUrl = xpath.get(l, '//a[@class="grouped"]/img/@src')
        if imageUrl == "":
            imageUrl = "http://www.baramdom.com/img/apartment_noimage.png"
        else:
            imageUrl = "http://www.baramdom.com" + imageUrl
        download = Downloader(link)
        cont = download.get_content()
        cont = unicode(cont)
        description = xpath.get(cont, '//p[@class="post_add_desc"]').strip()
        category = u"Недвижнини"
        ost = xpath.get(l, '//p[@class="add-title"]').strip()
        ost = ost.split(" во ")
        region = ost[1]
        country = u"Македонија"
        k = ost[0].split("ам ")
        subcategory = k[1]
        price = xpath.get(cont, '//div[@class="post-add"]/p[@class="last"]').strip()
        price = price.split(" ")
        if len(price) == 3:
            value = "/"
            currency = "/"
        else:
            value = price[0]
            currency = price[1]
            if currency == "Euro.":
                currency = "EUR"
            elif currency == u"Ден.":
                currency = "MKD"
        date = xpath.get(l, '//div[@class="fl"]').strip()
        date = date.split(">")[1].strip()
        date = date.split(" ")[0]
        date = date.split("-")
        date = date[2] + "-" + date[1] + "-" + date[0]
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

#print scrapeBaramDom()
def scrapeOglasiRs():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader('http://www.oglasi.rs/pretraga/0/0/')
    content = down.get_content()
    html = unicode(content)
    linkovi = xpath.search(html, '//li[@class="clearfix"]')
    ads = []
    for l in linkovi:
        link = xpath.get(l, '//a[@class="ogl_id"]/@href')
        title = xpath.get(l, '//h2/a[@class="ogl_id"].text()')
        imageUrl = "http://oglasi.rs" + xpath.get(l, '//a[@class="ogl_id"]/img/@src')
        price = xpath.get(l, '//div[@class="ad-price"]/h3')
        datum = xpath.get(l, '//div[@class="right-side"]/div/p/strong')
        datum = datum.split(".")
        date = datum[2] + "-" + datum[1] + "-" + datum[0]
        price = price.split(" ")
        price[0] = price[0].replace(".", "")
        currency = price[1]
        value = price[0].split(",")[0]
        download = Downloader(link)
        ad = download.get_content()
        ad = unicode(ad)
        description = xpath.search(ad, '//div[@class="description"]/p')
        description = description[1].strip()
        category = "/"
        subcategory = "/"
        loc = xpath.search(ad, '//div[@class="description"]/ul[@class="clearfix"]')
        lo = xpath.search(loc[0], '//li')
        region = lo[1]
        region = region.split("(")[0].strip()
        country = u"Србија"
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

#print scrapeOglasiRs()
def scrapeBB(gamename):
    BB = download.Download(user_agent=None)
    search = gamename.replace(" ", "+")
    html = BB.fetch(
        "http://www.bestbuy.com/site/searchpage.jsp?_dyncharset=UTF-8&id=pcat17071&type=page&ks=960&st={}&sc=Global&cp=1&sp=&qp=category_facet%3DVideo+Games~abcat0700000&list=y&usc=All+Categories&nrp=15&iht=n&seeAll="
        .format(search))
    if not html:
        nobb = 1
        print("Couldn't connect to Best Buy's servers.")
        return nobb
    gametitle = xpath.search(html, '//h3[@itemprop="name"]//a')
    productlinks = xpath.search(html, '//h3[@itemprop="name"]//a/@href')
    gameprice = xpath.search(html, '//span[@itemprop="price"]')
    return (gametitle, productlinks, gameprice)
def get_external_URL(page_html):
    seen_urls = set()
    urls = xpath.search(page_html, '//section[@class="maincontainer"]//a/@href')
    for url in urls:
        # strip the archive prefix (module-level constant) to normalize URLs
        url = url.replace(archive, '')
        if url not in seen_urls:
            seen_urls.add(url)
    return seen_urls
def scrapeGamestop(gamename):
    GS = download.Download()
    search = gamename.replace(" ", "+")
    html = GS.fetch(
        "http://www.gamestop.com/browse?nav=16k-3-{},28zu0".format(search))
    if not html:
        nogs = 1
        print("Couldn't connect to Gamestop's servers.")
        return nogs
    gametitle = xpath.search(html, '//div[@class="product_info grid_12"]//a[1]')
    productlinks = xpath.search(
        html, '//div[@class="product_info grid_12"]//a[1]/@href')
    gameprice = xpath.search(html, '//p[@class="pricing"]')
    return (gametitle, productlinks, gameprice)
def download_content(outputfile, seen_urls):
    f = open(outputfile, 'a')
    for url in seen_urls:
        page_html = D.get(url)
        contents = xpath.search(page_html, '//div[@id="ba_content"]//div/text()')
        f.write(url + '\n')
        if not contents:
            # fall back to the popup layout when the main container matches nothing
            contents = xpath.search(page_html, '//div[@class="mainpopup"]//div/text()')
        for content in contents:
            f.write(content)
    f.close()
def scrapeAmazon(gamename):
    AMA = download.Download(user_agent=None)
    search = gamename.replace(" ", "+")
    html = AMA.fetch(
        "http://www.amazon.com/gp/search/ref=sr_il_ti_videogames?rh=n%3A468642%2Ck%3A{}&keywords={}&ie=UTF8&qid=1407988315&lo=videogames"
        .format(search, search))
    if not html:
        noamazon = 1
        print("Couldn't connect to Amazon's servers.")
        return noamazon
    gametitle = xpath.search(
        html, '//div[@class="ilt3"]//a//span[@class="lrg bold"]')
    productlinks = xpath.search(html, '//div[@class="ilt3"]//a/@href')
    gameprice = xpath.search(html,
                             '//div[@class="ill3"]//span[@class="red bld"]')
    return (gametitle, productlinks, gameprice)
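# Hypothetical driver tying the three store scrapers above together: each
# returns either a (titles, links, prices) tuple or an int flag when the
# fetch fails, so guard on the type before unpacking.
def compare_prices(gamename):
    for scraper in (scrapeAmazon, scrapeBB, scrapeGamestop):
        result = scraper(gamename)
        if isinstance(result, int):
            continue  # the scraper printed its own connection error
        titles, links, prices = result
        for title, link, price in zip(titles, links, prices):
            print title, price, link

compare_prices('Mass Effect')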
def process_ruanjian_url(url):
    try:
        # e.g. http://a.3533.com/ruanjian/4180.htm (charset=utf-8)
        print url
        data = urllib2.urlopen(url).read()
        down_obj = util.app_info({'market': market})
        down_obj['app_url'] = url
        down_obj['app_url_md5'] = hashlib.md5(url).hexdigest()
        app_name = xpath.search(data, '//div[@class="gametit"]/h1/')
        if app_name:
            down_obj['app_name'] = app_name[0]
        apk_left = xpath.search(data, '//div[@class="apkleft"]/ul/li')
        if apk_left:
            found1 = re.search('([.\d]+)', apk_left[0])
            if found1:
                down_obj['version'] = found1.group(1)
            found2 = re.search('([.\d]+)([MK])', apk_left[4])
            if found2:
                if found2.group(2) == 'M':
                    down_obj['size'] = int(float(found2.group(1)) * 1024 * 1024)
                else:
                    down_obj['size'] = int(float(found2.group(1)) * 1024)
        short_url = xpath.search(data, '//div[@class="apkdown"]/a/@href')
        if short_url:
            # follow the shortened link to the real APK URL
            opener = urllib2.build_opener(util.RedirectHandler)
            apk_url = opener.open(short_url[0]).geturl()
            down_obj['download_link'] = apk_url
            print down_obj
            util.sql_do(down_obj)
            util.put_job(down_obj)
            global cnt_all
            cnt_all += 1
    except urllib2.URLError, e:
        logger.error('process_url Exception at: %s, %s' % (url, e.message))
def scrapeNedviznostiMakedonija():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader(
        'http://www.nedviznostimakedonija.com.mk/Default.aspx?search=1')
    content = down.get_content()
    html = unicode(content)
    linkovi = xpath.search(html, '//div[@class="boxesResultNewTop"]')
    ads = []
    for l in linkovi:
        link = "http://www.nedviznostimakedonija.com.mk/" + xpath.get(
            l, '//a[@class="subjectLook nobackim"]/@href')
        title = xpath.get(l, '//a[@class="subjectLook nobackim"]').strip()
        imageUrl = "http://www.nedviznostimakedonija.com.mk/" + xpath.get(
            l, '//a[@class="nobackim"]/img/@src')
        download = Downloader(link)
        cont = download.get_content()
        cont = unicode(cont)
        description = xpath.get(
            cont, '//span[@id="Body1_DetailControl1_FormView1_Label5"]')
        category = u"Недвижнини"
        subcategory = "/"
        price = xpath.get(
            l, '//div[@style="float:right; color:#1b5474; font-size:14px; font-weight:bold;"]/span')
        price = price.split(" ")
        price[0] = price[0].replace(".", "")
        if price[1] == "€":
            price[1] = "EUR"
        else:
            price[1] = "MKD"
        value = price[0]
        currency = price[1]
        region = xpath.get(
            cont, '//span[@id="Body1_DetailControl1_FormView1_cityDescriptionLabel"]')
        country = u"Македонија"
        date = xpath.get(
            cont, '//span[@id="Body1_DetailControl1_FormView1_LabelDate"]')
        date = date.split(".")
        date = date[2] + "-" + date[1] + "-" + date[0]
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

#print scrapeNedviznostiMakedonija()
def scrapeVipMarket5():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader('http://www.vipmarket5.mk/search/')
    content = down.get_content()
    html = unicode(content)
    linkovi = xpath.search(html, '//tr[@class="frame_content"]')
    ads = []
    for l in linkovi:
        link = "http://www.vipmarket5.mk" + xpath.get(l, '//div[@style="width:365px; height:90%; margin-top:10px;"]/b/a/@href')
        title = xpath.get(l, '//div[@style="width:365px; height:90%; margin-top:10px;"]/b/a')
        imageUrl = xpath.get(l, '//div[@style="overflow:hidden; width:150px; height: 146px; margin: 5px;"]/a/img/@src')
        download = Downloader(link)
        cont = download.get_content()
        cont = unicode(cont)
        description = xpath.get(cont, '//div[@class="feature"]/p').strip()
        if description == "":
            description = "/"
        # NOTE: the site exposes no categories
        category = "/"
        subcategory = "/"
        price = xpath.get(l, '//div[@style="margin-top:5px; margin-left:10px;height:155px; overflow:hidden;"]/h4/a')
        if price == u"Цена:По договор":
            value = "/"
            currency = "/"
        else:
            price = price.split(":")[1]
            price = price.split(" ")
            value = price[0]
            if price[1] == "€":
                currency = "EUR"
            elif price[1] == "ден.":
                currency = "MKD"
        date = xpath.get(l, '//b[@style="font-weight:bold;"]')
        date = date.split(": ")[1]
        date = date.split(".")
        date = date[2] + "-" + date[1] + "-" + date[0]
        country = u"Македонија"
        region = xpath.get(cont, '//div[@style="float:left; width: 140px; overflow:hidden; font-family: Tahoma,Geneva,sans-serif; font-weight:bold"]')
        if region == "":
            region = "/"
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

#print scrapeVipMarket5()
def parse(html, page):
    infos = []
    for i in html.split("<td></td></tr>"):
        ms = xpath.search(i, r"//a[@target='_parent']")
        txhash = ms[0] if len(ms) > 0 else ''
        fm = ms[1] if len(ms) > 1 else ''
        too = ms[2] if len(ms) > 2 else ''
        age = xpath.get(i, r"//span[@rel='tooltip']/@title")
        quantity = common.regex_get(i, r'>([\d\.\,]+)</td>$')
        info = '"' + '","'.join([txhash, age, fm, too, quantity]) + '"'
        infos.append(info)
    return infos
def process_bizhi_url(url):
    try:
        # print url
        data = urllib2.urlopen(url).read()
        down_obj = util.app_info({'market': market})
        down_obj['app_url'] = url
        down_obj['app_url_md5'] = hashlib.md5(url).hexdigest()
        app_name = xpath.search(data, '//div[@class="viewh"]/h1/')
        if app_name:
            down_obj['app_name'] = app_name[0]
        infoleft = xpath.search(data, '//ul[@class="infoleft"]/li')
        if infoleft:
            found = re.search('([.\d]+)([MK])', infoleft[1])
            if found:
                if found.group(2) == 'M':
                    down_obj['size'] = int(float(found.group(1)) * 1024 * 1024)
                else:  # KB
                    down_obj['size'] = int(float(found.group(1)) * 1024)
        short_url = xpath.search(data, '//div[@class="inforight"]/a/@href')
        if short_url:
            # follow the shortened link to the real APK URL
            opener = urllib2.build_opener(util.RedirectHandler)
            apk_url = opener.open(short_url[0]).geturl()
            down_obj['download_link'] = apk_url
            print down_obj
            util.sql_do(down_obj)
            util.put_job(down_obj)
            global cnt_all
            cnt_all += 1
    except urllib2.URLError, e:
        logger.error('process_url Exception at: %s, %s' % (url, e.message))
def parse_html2(html):
    infos = []
    for i in html.split("<td></td></tr>"):
        ms = xpath.search(i, r"//span[@class='address-tag']")
        txhash = common.normalize(ms[0]) if len(ms) > 0 else ''
        fm = common.normalize(ms[1]) if len(ms) > 1 else ''
        too = common.normalize(ms[2]) if len(ms) > 2 else ''
        age = xpath.get(i, r"//span[@rel='tooltip']/@title")
        quantity = common.regex_get(i, r'>([\d\.\,]+)</td>$')
        direction = common.normalize(xpath.get(i, r'//span[@class="label\slabel.+"]'))
        if txhash:
            info = '"' + '","'.join([txhash, age, fm, direction, too, quantity]) + '"'
            infos.append(info)
    return infos
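# Sketch: persisting the rows parse_html2() yields. Each entry is already a
# quoted, comma-joined line, so a plain text file is enough; the header names
# are assumed from the join order above.
def save_transactions(html, path='transactions.csv'):
    out = open(path, 'w')
    out.write('"txhash","age","from","direction","to","quantity"\n')
    for line in parse_html2(html):
        out.write(line + '\n')
    out.close()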
def mal(mal_title, mal_id=False):
    cookies = {
        "incap_ses_224_81958":
        "P6tYbUr7VH9V6shgudAbA1g5FVYAAAAAyt7eDF9npLc6I7roc0UIEQ=="
    }
    response = requests.get(
        "http://myanimelist.net/api/anime/search.xml",
        params={'q': mal_title},
        cookies=cookies,
        auth=("zodman1", "zxczxc"),
        headers={
            'User-Agent':
            'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
        })
    content = response.content
    if mal_id is not False:
        for e in xpath.search(content, "//entry"):
            if mal_id in e:
                content = e
                break
    tqdm.write("%s %s" % (mal_title, mal_id))
    id = xpath.get(content, "//id")
    title = xpath.get(content, "//title")
    title_en = xpath.get(content, "//english")
    type_ = xpath.get(content, "//type")
    synonyms = xpath.get(content, "//synonyms")
    status = xpath.get(content, "//status")
    synopsys = translate(xpath.get(content, "//synopsis"), "es")
    img = xpath.get(content, "//image")
    episodes = xpath.get(content, "//episodes")
    resumen = synopsys.replace("<br />", " ").replace("\n\r", "")
    resumen = translate(resumen, 'es')
    status = translate(status, 'es')
    assert id != "", mal_title
    data = dict(title=title, title_en=title_en, type=type_, status=status,
                resumen=resumen, img=img, episodes=episodes,
                synonyms=synonyms, id=id, synopsys=synopsys)
    return MalResult(**data)
def _loadFinished(self, result):
    frame = self.mainFrame()
    url = str(frame.url().toString())
    html = frame.toHtml()
    html = unicode(html)
    self.data[url] = xpath.search(html, self.xpathFilter)
    self.crawl()

# TEST STUB
#urls = [u'http://www.vrapce.mk/ad/31515', u'http://www.vrapce.mk/ad/15389', u'http://www.vrapce.mk/ad/27998', u'http://www.vrapce.mk/ad/24257', u'http://www.vrapce.mk/ad/19107', u'http://www.vrapce.mk/ad/14938', u'http://www.vrapce.mk/ad/14093', u'http://www.vrapce.mk/ad/14287', u'http://www.vrapce.mk/ad/14285', u'http://www.vrapce.mk/ad/14095', u'http://www.vrapce.mk/ad/14283', u'http://www.vrapce.mk/ad/31674', u'http://www.vrapce.mk/ad/31501', u'http://www.vrapce.mk/ad/18958', u'http://www.vrapce.mk/ad/33154', u'http://www.vrapce.mk/ad/2306', u'http://www.vrapce.mk/ad/32088', u'http://www.vrapce.mk/ad/29153', u'http://www.vrapce.mk/ad/23524', u'http://www.vrapce.mk/ad/20304', u'http://www.vrapce.mk/ad/4108', u'http://www.vrapce.mk/ad/22328', u'http://www.vrapce.mk/ad/3279', u'http://www.vrapce.mk/ad/13233', u'http://www.vrapce.mk/ad/2827', u'http://www.vrapce.mk/ad/24813', u'http://www.vrapce.mk/ad/18957', u'http://www.vrapce.mk/ad/5466', u'http://www.vrapce.mk/ad/31556', u'http://www.vrapce.mk/ad/29668']
# url = [u'http://www.vrapce.mk/']
# urls = []
# r = MultiPageFilterRenderer(url, '//a[@class="advertImage3Inner"]/@href')
# urls = r.data['http://www.vrapce.mk/']
# print urls
# description =
# print description
def incr_database(conn):
    # csi
    D = download.Download(delay=0, read_cache=None, write_cache=None)
    data = []
    csi = []
    src = 'http://www.csindex.com.cn/zh-CN/indices/index-detail/'
    for i in open('stocks.csv'):
        code = i.split('\t')[0]
        if 'CSI' in i or '000985' in i:
            url = src + code
            html = D.get(url)
            trddate = common.regex_get(html, r'截止日期:([^<]+)<')
            if trddate:
                trddate = trddate.replace('-', '')
                m = xpath.search(html, r'//table[@class="table\stc"]/tr/td', remove=None)
                close = m[0] if m else None
                change = m[1] if m and len(m) > 1 else None
                sql = '''
                    REPLACE INTO quote_csi(code, close, date, chg)
                    VALUES('%s',%s,%s,%s);
                ''' % (code, close, trddate, change)
                conn.execute(sql)
        else:
            today = datetime.today().strftime('%Y-%m-%d')
            engine = create_engine(
                'mysql://*****:*****@localhost:3306/dige', echo=False)
            try:
                df = ts.get_k_data(code, ktype='D', index=True, start=today, end=today)
                if not df.empty:
                    sql = '''
                        delete from quote_nocsi where code like '%%%s%%' and date = '%s'
                    ''' % (code, today)
                    conn.execute(sql)
                    df.to_sql('quote_nocsi', engine, if_exists='append')
            except Exception, e:
                print e
def search_animenetwork(title):
    base_url = "http://cdn.animenewsnetwork.com/encyclopedia/api.xml"
    params = {'anime': "~" + title}
    response = requests.get(base_url, params=params)
    animes = xpath.search(response.content, "//anime")
    l = []
    for i in animes:
        id = xpath.search(i, "./@id").pop()
        images = xpath.search(i, "//info/img/@src")
        summary = xpath.get(i, "//info[@type='Plot Summary']")
        genres = xpath.search(i, "//info[@type='Genres']")
        openings = xpath.search(i, "//info[@type='Opening Theme']")
        endings = xpath.search(i, "//info[@type='Ending Theme']")
        d = {'summary': summary, 'images': images, 'genres': genres,
             'openings': openings, 'endings': endings, 'id': id}
        l.append(d)
    return l
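# Usage sketch for search_animenetwork(); prints each match's ANN id and the
# start of its plot summary (fields may be empty when ANN lacks the info).
for anime in search_animenetwork('Cowboy Bebop'):
    print anime['id'], (anime['summary'] or '')[:80]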
def mal_search(mal_title, mal_id=False):
    cookies = {
        "incap_ses_224_81958":
        "P6tYbUr7VH9V6shgudAbA1g5FVYAAAAAyt7eDF9npLc6I7roc0UIEQ=="
    }
    response = requests.get(
        "http://myanimelist.net/api/anime/search.xml",
        params={'q': mal_title},
        cookies=cookies,
        auth=("zodman1", "zxczxc"),
        headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'}
    )
    content = response.content
    if mal_id is not False:
        for e in xpath.search(content, "//entry"):
            if mal_id in e:
                content = xpath.get(e, "//anime/entry")
                break
    else:
        content = xpath.get(content, "//anime/entry")
    english_title = xpath.get(content, '//english')
    title = xpath.get(content, '//title')
    synonyms = xpath.get(content, '//synonyms')
    id = xpath.get(content, "//id")
    return {'title': title, 'english_title': english_title,
            'synonyms': synonyms, 'id': id}
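# Usage sketch for mal_search(); note it reuses the hardcoded cookie and
# credentials above, which MyAnimeList may well reject today.
result = mal_search('Cowboy Bebop')
print result['id'], result['title'], result['english_title']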
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 11 10:41:19 2015

@author: justin.malinchak
"""
from webscraping import download, xpath

D = download.Download()
html = D.get('https://www.hedgefundresearch.com/hfrx_reg/index.php')
# target table: <b class=tenpx>HFRX Global Hedge Fund Index</b>
for row in xpath.search(html, '//table[@class="spad"]/tbody/tr'):
    cols = xpath.search(row, '/td')
    print 'HFRX row: %s, %s' % (cols[1], cols[2])
def scrapeMobileBg():
    # cp1251 support
    reload(sys)
    sys.setdefaultencoding('cp1251')
    now = datetime.now()
    down = Downloader('http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71wxzy&f1=1')
    #http://www.mobile.bg/71ydeh
    #http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71wxzy&f1=1
    content = down.get_content()
    html = unicode(content)
    linkovi = xpath.search(html, '//form[@name="search"]/table[@class="tablereset"]')
    linkovi = linkovi[3:len(linkovi) - 4]
    links = [
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71xw69&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71xwi1&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71xwr0&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71xx7g&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71xxjy&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71xzyr&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y06e&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y0dk&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y0q6&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y16v&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y1ep&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y2ih&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y2x5&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y34p&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y3ex&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y3wj&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y449&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y4wz&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y5qh&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y5yv&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y6az&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y6kg&f1=1',
        'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=71y6qz&f1=1',
    ]
    for link in links:
        dole = Downloader(link)
        content = dole.get_content()
        html = unicode(content)
        lin = xpath.search(html, '//form[@name="search"]/table[@class="tablereset"]')
        lin = lin[3:len(lin) - 4]
        for li in lin:
            linkovi.append(li)
    ads = []
    for l in linkovi:
        link = xpath.get(l, '//td[@class="valgtop"]/a[@class="mmm"]/@href')
        title = xpath.get(l, '//td[@class="valgtop"]/a[@class="mmm"]').strip()
        imageUrl = xpath.get(l, '//a[@class="photoLink"]/img/@src')
        download = Downloader(link)
        cont = download.get_content()
        cont = unicode(cont)
        description = xpath.get(cont, '//td[@style="font-size:13px;"]').strip()
        description = description.split("<a href")[0]
        if description == "» ":
            description = "/"
        else:
            description = description[0:len(description) - 19]
            description = description.replace("\"", "")
        category = u"Возила"
        subcategory = "/"
        price = xpath.get(l, '//span[@class="price"]').strip()
        if price == u"Договаряне":
            value = "/"
            currency = "/"
        else:
            price = price.split(" ")
            if len(price) == 2:
                value = price[0]
                currency = price[1]
            elif len(price) == 3:
                currency = price[2]
                value = price[0] + price[1]
            else:
                currency = price[3]
                value = price[0] + price[1] + price[2]
            if currency == "лв.":
                currency = "BGN"
        region = xpath.get(cont, '//td[@style="padding:10px"]').strip()
        region = region.split("Регион: ")[1]
        region = region.split(" ")[0]
        region = region.replace("<a", "").strip()
        date = str(now.year) + "-" + str(now.month) + "-" + str(now.day)
        country = u"Бугарија"
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

#print scrapeMobileBg()
writer = common.UnicodeWriter('articles.csv')
writer.writerow(['Title', 'Num reads', 'URL'])
seen_urls = set()  # track which article URLs were already seen, to prevent duplicates
D = download.Download()

# iterate each of the categories
for category_link in ('/developer/knowledge-base?page=%d',
                      '/developer/articles?page=%d'):
    # iterate the pages of a category
    for page in itertools.count():
        category_html = D.get(urlparse.urljoin(DOMAIN, category_link % page))
        article_links = xpath.search(category_html, '//div[@class="morelink"]/a/@href')
        num_new_articles = 0
        for article_link in article_links:
            # scrape each article
            url = urlparse.urljoin(DOMAIN, article_link)
            if url not in seen_urls:
                num_new_articles += 1
                seen_urls.add(url)
                html = D.get(url)
                title = xpath.get(html, '//div[@class="feed-header-wrap"]/h2')
                num_reads = xpath.get(html, '//li[@class="statistics_counter last"]/span')
                row = title, num_reads, url
                writer.writerow(row)
        if num_new_articles == 0:
            break  # no new articles on this page, so stop paginating
def scrapeReklama5():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    down = Downloader('https://www.reklama5.mk/Search')
    html = down.get_content()
    html = unicode(html)
    requestedWebPageUrl = 'https://www.reklama5.mk'
    adverts = xpath.search(html, '//div[@class="OglasResults"]')
    ads = []
    for advert in adverts:
        link = requestedWebPageUrl + xpath.get(
            advert, '//a[@class="SearchAdTitle"]/@href')
        title = xpath.get(
            advert, '//a[@class="SearchAdTitle"].text()').strip().replace("\"", "")
        description = getDescription(
            link, '//div[@class="oglasTitle"]/p[@class="oglasTitle"]').strip().replace("\"", "")
        subcategory = "/"
        imageUrl = xpath.get(advert, '//img[@class="thumbnail thumbs"]/@src')
        if imageUrl == "/Content/images/noImage2.jpg":
            imageUrl = requestedWebPageUrl + imageUrl
        price = xpath.get(advert, '//div[@class="text-left text-success"]')
        price = re.sub('\s+', ' ', price).strip()
        price = price.split(" ")
        if price[0] == "По":
            price[0] = "/"
        if price[1] == "Договор":
            price[1] = "/"
        value = price[0]
        currency = price[1]
        if currency == "€":
            currency = "EUR"
        if currency == u"МКД":
            currency = "MKD"
        region = xpath.get(advert, '//p[@class="clear-margin"]')
        region = region.split(">")[0].strip()
        country = u"Македонија"
        date = xpath.get(advert, '//div[@class="text-center clear-padding adDate"]')
        date = re.sub('\s+', ' ', date).strip()
        time = xpath.get(advert, '//div[@class="text-center clear-padding adDate"]')
        time = re.sub('\s+', ' ', time).strip()
        if date.split()[0] == u"Денес" and time.split()[0]:
            date = datetime.now()
            datum = str(date.year) + "-" + str(date.month) + "-" + str(date.day)
            vreme = time.split(" ")[1]
            date = datum + " " + vreme
        category = xpath.get(advert, '//p[@class="adCategoryName"]/a')
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        #print link, title, imageUrl, description, category, subcategory, value, currency, region, date
        ads.append(ad)
    return adsToJson(ads)
def scrapeMobile24():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    # each vehicle category page lists its ads in alternating t0/t1 table rows
    categories = ['avtomobili', 'motocikli', 'kombinja', 'kamioni',
                  'prikolki', 'avtobusi', 'gumiiavtodelovi']
    linkovi = []
    for cat in categories:
        down = Downloader('http://www.mobile24.mk/%s/' % cat)
        content = down.get_content()
        html = unicode(content)
        linkovi.extend(xpath.search(html, '//tr[@class="t0"]'))
        linkovi.extend(xpath.search(html, '//tr[@class="t1"]'))
    ads = []
    for l in linkovi:
        link = xpath.get(l, '//a[@class="listing-title"]/@href')
        title = xpath.get(l, '//a[@class="listing-title"]/b')
        imageUrl = xpath.get(l, '//td[@class="image"]/a/img/@src')
        download = Downloader(link)
        cont = download.get_content()
        cont = unicode(cont)
        desc = xpath.search(
            cont, '//div[@class="item-left"]/div[@class="fieldset rounded4"]/div')
        if len(desc) == 4:
            description = desc[1]
        else:
            description = desc[0]
        category = u"Возила"
        subcategory = "/"
        price = xpath.get(l, '//td[@class="price"].text()')
        value = xpath.get(l, '//td[@class="price"]/span')
        value = value.replace(",", "")
        price = price.split("span>")[2]
        price = price.split("<")[0]
        currency = price
        if currency == u"денари":
            currency = "MKD"
        if value == u"По договор":
            value = "/"
            currency = "/"
        region = xpath.get(l, '//span[@class="city"]')
        date = str(now.year) + "-" + str(now.month) + "-" + str(now.day)
        country = u"Македонија"
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

# print scrapeMobile24()
def scrapePobarajOglasi():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader('http://www.pobaraj.com.mk/lista_na_oglasi/all/1')
    content = down.get_content()
    html = unicode(content)
    site = xpath.get(html, '//ul[@class="lista_na_oglasi"]')
    linkovi = xpath.search(site, '//li')
    # Macedonian month abbreviations as the site prints them
    meseci = {u"Јан": 1, u"Фев": 2, u"Мар": 3, u"Апр": 4, u"Мај": 5, u"Јун": 6,
              u"Јул": 7, u"Авг": 8, u"Сеп": 9, u"Окт": 10, u"Ное": 11, u"Дек": 12}
    ads = []
    for l in linkovi:
        link = "http://www.pobaraj.com.mk" + xpath.get(l, '//a[@class="title"]/@href')
        title = xpath.get(l, '//a[@class="title"]')
        imageUrl = xpath.get(l, '//a[@class="photo"]/img/@src')
        download = Downloader(link)
        cont = download.get_content()
        cont = unicode(cont)
        description = xpath.get(cont, '//div[@class="oglas_prikaz_opis"]').strip()
        if description == "":
            description = "/"
        kategorii = xpath.search(cont, '//a[@class="pateka"]')
        category = kategorii[1]
        if len(kategorii) > 2:
            subcategory = kategorii[2]
        else:
            subcategory = "/"
        price = xpath.get(l, '//div[@class="price"]').strip()
        price = price.split("<div ")[0].strip()
        price = price.split("Цена: ")[1]
        if price == u"по договор":
            value = "/"
            currency = "/"
        else:
            price = price.split(" ")
            value = price[0]
            if price[1] == u"денари":
                currency = "MKD"
            elif price[1] == u"евра":
                currency = "EUR"
            else:
                currency = price[1]
        region = xpath.get(cont, '//div[@class="oglas_prikaz_left"]').strip()
        region = region.split("Град:<")[1]
        region = region.split("<b class")[0]
        region = region.split("b>")[1].strip()
        country = u"Македонија"
        datum = xpath.get(l, '//div[@class="oglas_date"]').strip()
        datum = datum.split(": ")[1]
        datum = datum.split(", ")
        vreme = datum[1]
        datum = datum[0]
        if datum == u"Денес":
            date = str(now.year) + "-" + str(now.month) + "-" + str(now.day) + " " + vreme
        elif datum == u"Вчера":
            da = datetime.now() - timedelta(days=1)
            date = str(da.year) + "-" + str(da.month) + "-" + str(da.day) + " " + vreme
        else:
            datum = datum.split(" ")
            # map the Macedonian month abbreviation to its number
            datum = str(now.year) + "-" + str(meseci[datum[1]]) + "-" + datum[0]
            date = datum + " " + vreme
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

#print scrapePobarajOglasi()
def scrapeKoli():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    down = Downloader('http://koli.com.mk/polovni_lista.aspx')
    html = down.get_content()
    html = unicode(html)
    requestedWebPageUrl = 'http://koli.com.mk/polovni_lista.aspx'
    adverts = xpath.search(html, '//table[@id="dlRezultati"]')
    ads = []
    links = xpath.search(html, '//a[@class="linkovi_desno_golemi"]/@href')
    da = datetime.now()
    for l in links:
        link = "http://koli.com.mk/" + l
        d = Downloader(link)
        ad = d.get_content()
        ad = unicode(ad)
        description = (u"Опрема: " + xpath.get(ad, '//span[@id="lblOprema"]') +
                       " \nOpis: " + xpath.get(ad, '//span[@id="lblOpis"]'))
        title = xpath.get(ad, '//span[@id="lblMarkaModel"].text()').strip()
        imageUrl = 'http://koli.com.mk/' + xpath.get(ad, '//img[@id="slika"]/@src')
        subcategory = "/"
        category = u"Возила"
        region = xpath.get(ad, '//span[@id="lblGrad"].text()')
        country = u"Македонија"
        value = xpath.get(ad, '//span[@id="lblMomentalnaCena"]').strip()
        currency = "EUR"
        date = ""
        d = xpath.get(ad, '//span[@id="lblDenovi"]').strip()
        d = d.split(" ")
        if len(d) == 1:
            if d[0] in (u"минута", u"час", u"секунда"):
                date = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
            elif d[0] == u"ден":
                da = datetime.now() - timedelta(days=1)
                date = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
            elif d[0] == u"месец":
                da = datetime.now() - timedelta(days=30)
                date = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
        else:
            if d[1] == u"месеци":
                # d[0] is a string, so convert before multiplying
                da = datetime.now() - timedelta(days=int(d[0]) * 30)
                date = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
            elif d[1] == u"дена":
                da = datetime.now() - timedelta(days=int(d[0]))
                date = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
            elif d[1] in (u"минути", u"часа", u"секунди"):
                date = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)
def get_followees():
    # build the follow URLs to find the next layer of users
    user_url_followees = user_url + '/followees'
    user_url_followers = user_url + '/followers'
    try:
        # fetch the followees list page
        html_followees = D.get(user_url_followees, delay=0.1, opener=opener,
                               read_cache=False, write_cache=False)
        # fetch the followers list page
        html_followers = D.get(user_url_followers, delay=0.1, opener=opener,
                               read_cache=False, write_cache=False)
    except Exception, e:
        print 'Exception in download. {}'.format(str(e))
    else:
        if html_followees and html_followers:
            # parse the pages with xpath
            # parse the followees
            followees_list = xpath.search(html_followees, '//div[@class="zh-general-list clearfix"]//div[@class="zm-profile-card zm-profile-section-item zg-clear no-hovercard"]//h2[@class="zm-list-content-title"]')
            for i in range(len(followees_list)):
                # extract each profile link and push it onto the zhihu_url_main set
                zhihu_url_main.put(common.regex_get(followees_list[i], r'href="(.*?)" '))
            # parse the followers
            followers_list = xpath.search(html_followers, '//div[@class="zh-general-list clearfix"]//div[@class="zm-profile-card zm-profile-section-item zg-clear no-hovercard"]//h2[@class="zm-list-content-title"]')
            for i in range(len(followers_list)):
                # extract each profile link and push it onto the zhihu_url_main set
                zhihu_url_main.put(common.regex_get(followers_list[i], r'href="(.*?)" '))

# run the harvesting function
get_followees()
# pull a URL from the main set, making sure it has not been read before,
# i.e. it is not in the copy set
new_user_url = zhihu_url_main.get()
while zhihu_url_copy.ismember(new_user_url) == 1:
    new_user_url = zhihu_url_main.get()
# store it into the copy set
def scrapeHaloOglasi():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader(
        'http://www.halooglasi.com/naslovna.240.html?search_text=&sortColumn=VremeDodavanja'
    )
    content = down.get_content()
    html = unicode(content)
    celo = xpath.get(html, '//div[@class="results_container"]')
    linkovi = xpath.search(celo, '//div[@class="result_brza"]')
    # Serbian month abbreviations as the site prints them
    meseci = {"Jan": "1", "Feb": "2", "Mar": "3", "Apr": "4", "Maj": "5",
              "Jun": "6", "Jul": "7", "Avg": "8", "Sep": "9", "Okt": "10",
              "Nov": "11", "Dec": "12"}
    ads = []
    for l in linkovi:
        link = xpath.get(l, '//div[@style="height:auto;"]/h2/a/@href')
        link = "http://www.halooglasi.com" + link
        download = Downloader(link)
        cont = download.get_content()
        cont = unicode(cont)
        title = xpath.get(cont, '//div[@class="detail_bar_nek"]/h2').strip()
        if title == "":
            title = xpath.get(cont, '//div[@class="detail_bar"]/h2').strip()
        imageUrl = xpath.get(l, '//a[@class="thumb"]/img/@src')
        imageUrl = "http://www.halooglasi.com" + imageUrl
        description = xpath.get(l, '//div[@class="text_ogl"]/p')
        kategorija = xpath.get(l, '//div[@class="brza_link"]').strip()
        kategorija = kategorija.split("\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t")
        kategorija = kategorija[1]
        kategorija = kategorija.split(" > ")
        category = kategorija[0]
        if len(kategorija) > 2:
            subcategory = kategorija[1]
        else:
            kategorija = kategorija[1].split("'>")[1]
            subcategory = kategorija.split("<")[0]
        price = xpath.get(cont, '//div[@class="price"]').strip()  # price
        if price == "":
            price = xpath.get(cont, '//div[@class="price deal"]').strip()  # price deal
        price = price.replace(".", "")
        price = price.replace("din", " DIN")
        price = price.replace("€", " EUR")
        if price == "Dogovor":
            value = "/"
            currency = "/"
        else:
            price = price.split(" ")
            value = price[0]
            currency = price[1]
        date_loc = xpath.search(l, '//div[@class="datum_grad"]/h6/span')
        date_loc[0] = date_loc[0].strip()
        date = date_loc[0].split("\r\n")[0]
        date = date.replace(".", "")
        date = date.split(" ")
        # map the month abbreviation to its number
        date = date[2] + "-" + meseci.get(date[1], date[1]) + "-" + date[0]
        l = date_loc[1].strip()
        l = l.split(" ")
        region = l[0]
        country = u"Србија"
        ad = Ad(link, title, imageUrl, description, category, subcategory,
                value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)

#print scrapeHaloOglasi()
import csv
import sys
from webscraping import download, xpath

D = download.Download()

csv_input = open('urls-test2.csv', 'rb')
csv_file = csv.reader(csv_input, delimiter=',')
names = []
for data in csv_file:
    names.append(data[0])
csv_input.close()

for name in names:
    html = D.get(name)
    # flatten <br /> line breaks so cell text stays on one CSV row
    html2 = html.replace("<br />", " | ")
    print name
    c = csv.writer(open("darkgrey.csv", "a"))
    for row in xpath.search(html2, '//table/tr[@class="bgdarkgrey"]'):
        cols = xpath.search(row, '/td')
        if len(cols) >= 5:
            c.writerow([cols[0], cols[1], cols[2], cols[3], cols[4]])
    q = csv.writer(open("lightgrey.csv", "a"))
    for row2 in xpath.search(html2, '//table/tr[@class="bglightgrey"]'):
        cols2 = xpath.search(row2, '/td')
        if len(cols2) >= 5:
            q.writerow([cols2[0], cols2[1], cols2[2], cols2[3], cols2[4]])
def scrapeKupujemProdajem():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    ads = []
    try:
        down = Downloader(
            'http://www.kupujemprodajem.com/search.php?action=list&data[category_id]=&data[group_id]=&data[location_id]=&data[keywords]=&submit[search]=Tra%C5%BEi'
        )
        content = down.get_content()
        html = unicode(content)
        link = ""
        title = ""
        imageUrl = ""
        description = "/"
        category = "/"
        subcategory = "/"
        value = "/"
        currency = "/"
        region = "/"
        date = str(now.year) + "-" + str(now.month) + "-" + str(now.day)
        linkovi = xpath.search(html, '//div[@class="item clearfix"]')
        highlighted = xpath.search(html, '//div[@class="item clearfix adHighlighted"]')
        for h in highlighted:
            linkovi.append(h)
        for l in linkovi:
            try:
                link = "http://www.kupujemprodajem.com/" + xpath.get(
                    l, '//a[@class="adName"]/@href')
                title = xpath.get(l, '//a[@class="adName"]')
                region = xpath.get(l, '//section[@class="locationSec"]').strip()
                region = region.split(" | ")[0]
                price = xpath.get(l, '//span[@class="adPrice"]')
                price = price.split(" ")
                if len(price) == 2:
                    value = price[0].replace(".", "")
                    value = value.split(",")[0]
                    currency = price[1]
                else:
                    value = "/"
                    currency = "/"
                if currency == "€":
                    currency = "EUR"
                elif currency == "din":
                    currency = "DIN"
                down = Downloader(link)
                content = down.get_content()
                category = xpath.get(content, '//a[@class="crumbs"]')
                description = xpath.get(
                    l, '//section[@class="nameSec"]/p[@class="adDescription"]')
                category = category.split("|")[0].strip()
                imageUrl = xpath.get(
                    content, '//div[@class="adThumbnailHolder"]/a/img/@src')
                imageUrl = imageUrl.replace("//", "/")
                imageUrl = imageUrl[1:]
                if imageUrl == "":
                    imageUrl = "/"
                description = description.replace("...<p>", "").strip()
                country = u"Србија"
                ad = Ad(link, title, imageUrl, description, category, subcategory,
                        value, currency, region, date, country)
                ads.append(ad)
            except:
                pass
    except:
        pass
    return adsToJson(ads)

#print scrapeKupujemProdajem()
from webscraping import download, xpath
import json
import csv
import sys
import codecs

# Download instance
D = download.Download()
# get page
html = D.get('http://pydata.org/madrid2016/schedule/')
talks_pydata = []
# get the td element where the information is located
for row in xpath.search(html, '//td[@class="slot slot-talk"]'):
    speakers = xpath.search(row, '//span[@class="speaker"]/text()')
    urls = xpath.search(row, '//span[@class="title"]//a/@href')
    talks = xpath.search(row, '//span[@class="title"]//a/text()')
    for speaker in speakers:
        print speaker.strip()
        print urls[0]
        print talks[0]
        details = D.get('http://pydata.org/' + urls[0])
        description = xpath.search(details, '//div[@class="description"]//p/text()')[0]
        print description
        hour = xpath.search(details, '//div[@class="col-md-8"]//h4/text()')[0].replace(
            "\n", "").strip()
        print hour
        if talks[0] is not None and speaker is not None and description is not None and hour is not None:
            # assumption: gather the scraped fields into talks_pydata
            talks_pydata.append({'talk': talks[0], 'speaker': speaker.strip(),
                                 'url': urls[0], 'description': description,
                                 'hour': hour})
def scrapeAvtooglasi():
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader('http://www.avtooglasi.com.mk/rezultati/show/?vid=0&orderby=0')
    content = down.get_content()
    html = unicode(content)
    sliki = xpath.search(html, '//div[@class="resultLeft"]')
    ostanato = xpath.search(html, '//div[@class="oglasInfoTopContent"]')
    ceni = xpath.search(html, '//a[@class="btn btn-info btn-xs oglasInfoAdditionalPrice"]')
    link = {}
    title = {}
    imageUrl = {}
    description = {}
    category = {}
    subcategory = {}
    value = {}
    currency = {}
    region = {}
    date = {}
    i = 0
    ads = []
    for slika in sliki:
        imageUrl[i] = xpath.search(slika, '//a[@class="thumbnail resultImg"]/img/@src')[0]
        i = i + 1
    i = 0
    for cena in ceni:
        price = xpath.get(cena, '//span/span').strip()
        price = price.split(" ")
        if len(price) > 1:
            if price[0] == "По":
                price[0] = "/"
            if price[1] == "договор":
                price[1] = "/"
            value[i] = price[0]
            currency[i] = price[1]
            if currency[i] == "€":
                currency[i] = "EUR"
        i = i + 1
    i = 0
    for advert in ostanato:
        link[i] = xpath.get(advert, '//a[@class="resultMainLink"]/@href')
        title[i] = xpath.get(advert, '//a[@class="resultMainLink"]/span').strip().replace("\"", "")
        path = xpath.search(getDescription(link[i], '//div[@class="centerC"]'),
                            '/div/div[@class="padded"]')
        description[i] = path[1]
        subcategory[i] = "/"
        category[i] = u"Возила"
        dodatok = xpath.get(advert, '//span[@class="oglasInfoAdditionalInfo"]')
        dodatok = dodatok.split(" | ")
        region[i] = dodatok[0]
        country = u"Македонија"
        description[i] = (dodatok[1] + u" година, " + dodatok[2] + ", " + dodatok[3] +
                          ", " + dodatok[4] + ", " + dodatok[5] + ", " + description[i])
        description[i] = description[i].strip().replace("\"", "")
        date[i] = ""
        #print description[i]
        datum = dodatok[6].strip()
        datum = datum.split(" ")
        if datum[0] == "Денес":
            datum[0] = str(now.year) + "-" + str(now.month) + "-" + str(now.day)
            date[i] = datum[0] + " " + datum[2]
        elif datum[0] == "Вчера":
            da = datetime.now() - timedelta(days=1)
            datum[0] = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
            date[i] = datum[0] + " " + datum[2]
        elif datum[0] == "пред":
            if datum[2] == "дена":
                da = datetime.now() - timedelta(days=int(datum[1]))
                date[i] = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
            else:
                if datum[1] == "1":
                    da = datetime.now() - timedelta(days=30)
                else:
                    da = datetime.now() - timedelta(days=60)
                date[i] = str(da.year) + "-" + str(da.month) + "-" + str(da.day)
        else:
            date[i] = datum[0] + " " + datum[1]
        #print date[i]
        i = i + 1
    for i in link:
        ad = Ad(link[i], title[i], imageUrl[i], description[i], category[i],
                subcategory[i], value[i], currency[i], region[i], date[i], country)
        ads.append(ad)
    return adsToJson(ads)

# print scrapeAvtooglasi()
from webscraping import download, xpath

D = download.Download()
html = D.get('http://www.abv.bg')
for row in xpath.search(html, '//table[@class="spad"]/tbody/tr'):
    cols = xpath.search(row, '/td')
    print 'Sunrise: %s, Sunset: %s' % (cols[1], cols[2])
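# A slightly defensive variant of the loop above: xpath.search can return
# fewer cells on malformed rows, so guard before indexing (same assumed page
# layout as the snippet above).
for row in xpath.search(html, '//table[@class="spad"]/tbody/tr'):
    cols = xpath.search(row, '/td')
    if len(cols) >= 3:
        print 'Sunrise: %s, Sunset: %s' % (cols[1], cols[2])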
from webscraping import download, xpath
import json
import csv
import sys
import codecs

# Download instance
D = download.Download()
# get page
html = D.get('http://2015.es.pycon.org/es/schedule/')
index = 0
talks_pycones = []
# get the div where the information is located
for row in xpath.search(html, '//div[@class="col-xs-12"]'):
    if index % 2 == 0:
        talk = xpath.search(row, '//div[@class="slot-inner"]/h3')
        author = xpath.search(row, '//div[@class="slot-inner"]/p/strong')
        hour = xpath.search(row, '//div[@class="slot-inner"]/strong')
    if index % 2 != 0:
        description = xpath.search(row, '/p')
        if (talk is not None and author is not None and description is not None
                and hour is not None and len(talk) > 0 and len(author) > 0
                and len(description) > 0 and len(hour) > 0):
            talk_pycones = {}
            talk_pycones['talk'] = talk[0].decode('utf-8').encode('cp850', 'replace').decode('cp850')
            talk_pycones['author'] = author[0].decode('utf-8').encode('cp850', 'replace').decode('cp850')
    index += 1