def run(self):
    base_rsc_url = self.URL
    page_number = 1
    rsc_url_para = '?page='
    input_date_format = '%b %d, %Y'
    output_date_format = '%Y.%m.%d'
    # dates appear as "... - Jan 1, 2018" in the link text
    re_input_date = re.compile(r'(?<=\s-\s)\w+\s\d+,\s\d+')
    while True:
        rsc_url = base_rsc_url + rsc_url_para + str(page_number)
        try:
            date_page = p(url=rsc_url)
        except Exception:
            page_number += 1
            continue
        label_a_list = date_page('.information_text a')
        if len(label_a_list) == 0:
            # no more links: we have walked past the last page
            break
        for a in label_a_list:
            date_str = re_input_date.search(p(a).text()).group()
            date = dt.strptime(date_str, input_date_format)
            url = p(a).attr('href')
            _date_store_path = self.rsc_store_path + date.strftime(output_date_format) + '.txt'
            DataFetchThread(date_str, url, _date_store_path).start()
        page_number += 1
def parse(self, response):
    result = response.text
    doc = p(result)
    # the second .zc_contract_top table holds the notice links
    a = p(doc.find('.zc_contract_top')[1]).find('td a')
    for i in a:
        s = 'http://www.ahzfcg.gov.cn' + p(i).attr('href')
        yield scrapy.Request(s, callback=self.parse_)
def aa():
    import json
    import re
    import requests
    import datetime
    from pyquery import PyQuery as p
    url = 'http://www.weather.com.cn/weather/101021200.shtml'
    result = requests.get(url)
    a = p(result.text).find('.sky .tem')
    max_list = []
    min_list = []
    work_list = []
    date_list = []
    for i in a:
        # print(p(i).html())
        max = p(i).find('span').html()
        min = p(i).find('i').html()
        if not max:
            max = '32'
        max = re.findall(re.compile(r'\d+'), max)[0]
        min = re.findall(re.compile(r'\d+'), min)[0]
        print(max, min)
        max_list.append(int(max))
        min_list.append(int(min))
    today = datetime.date.today()
    # https://www.jianshu.com/p/05ccb5783f65
    holiday_url = 'http://api.goseek.cn/Tools/holiday?date='
    work = 0
    for i in range(7):
        s = today.strftime('%Y%m%d')
        date_list.append(s)
        url = holiday_url + s
        res = requests.get(url).text
        print(res)
        res = json.loads(res)
        if 'data' in res:
            if res['data'] == 2 or res['data'] == 0:
                work = 1  # workday
            else:
                work = 0  # day off
        today = today + datetime.timedelta(days=1)
        print(s, work)
        work_list.append(work)
        # time.sleep(0.5)
    return max_list, min_list, work_list, date_list
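# Hedged usage sketch, not part of the original source: aa() returns four
# parallel lists (daily max/min temperature, workday flag, date string), so a
# caller might pair them up per day as below. The name report() is hypothetical.
def report():
    max_list, min_list, work_list, date_list = aa()
    for date, hi, lo, work in zip(date_list, max_list, min_list, work_list):
        label = 'workday' if work else 'day off'
        print(date, hi, lo, label)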
def page_parser(page):
    while True:
        # read the number of the currently active page from the pager
        current = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active>span'))).text
        print('Currently crawling page {}'.format(current))
        next_button = browser.find_element_by_css_selector('li.item.next a')
        next_button.click()
        if int(current) <= page:
            html = p(browser.page_source)
            items = html('#mainsrp-itemlist .items .item').items()
            for item in items:
                product = {
                    'img': item.find('.pic .img').attr('data-src'),
                    'price': item.find('.price').text(),
                    'deal': item.find('.deal-cnt').text(),
                    'shop': item.find('.shop').text(),
                    'location': item.find('.location').text()
                }
                print(product)
                save_mongo(product)
            time.sleep(3)
        else:
            break
def main():
    base_store_path = 'crossword_puzzles/'
    # fetch the url dict of puzzle resources
    base_url = 'http://crosswordgiant.com/browse'
    rsc_page = p(url=base_url)
    label_a_list = rsc_page('.information_text a')
    for a in label_a_list:
        rsc_name = p(a).text()
        base_rsc_url = p(a).attr('href')
        rsc_store_path = base_store_path + rsc_name + '/'
        try:
            os.makedirs(rsc_store_path)
        except Exception:
            # directory already exists
            pass
        # fetch data from each resource
        UrlFetchThread(rsc_name, base_rsc_url, rsc_store_path).start()
def run(self):
    date_store_path = self.Path
    print self.Path
    # begin to fetch and write
    date_url = self.URL
    try:
        puzzle_page = p(url=date_url)
    except Exception:
        return None
def itis_lookup(name, TIMEOUT=10, CACHE=True):
    '''
    Look up "name" on itis.gov. If a standard name can be identified,
    returns that name. Returns False if there is no result or the result
    is ambiguous. If a name matches multiple species that are all members
    of the same genus, itis_lookup will return "Genus sp1/sp2/sp3...".
    '''
    name = name.replace("'", '').lower()
    if name in cache and CACHE:
        return cache[name]
    url = 'http://www.itis.gov/servlet/SingleRpt/SingleRpt'
    values = {'search_topic': 'all',
              'search_kingdom': 'every',
              'search_span': 'containing',
              'search_value': name.decode(),
              'categories': 'All',
              'source': 'html',
              'search_credRating': 'All'}
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req, timeout=TIMEOUT)
    html = response.read()
    # parse results to pull out unique species
    results = [s.tail for s in p(html)('td.body a')]
    results = sum([re.findall('Species: [A-Z][a-z ]*', result) for result in results], [])
    results = [s.split(':')[1].strip() for s in results]
    if results:
        genus = set()
        all_species = []
        result = None
        for this_species in results:
            genus.add(this_species.split()[0])
            if len(genus) > 1:
                # matches span more than one genus: ambiguous
                result = False
                break
            all_species.append(' '.join(this_species.split()[1:]))
        if result is not False:
            result = list(genus)[0] + ' ' + '/'.join(sorted(list(set(all_species))))
        cache[name] = result
    else:
        cache[name] = False
    if CACHE:
        caching.save_cache(cache, 'itis')
    return cache[name]
def itis_lookup(name, TIMEOUT=10):
    global TIMEOUTS
    name = name.replace("'", '').lower()
    if name in itis_cache:
        print "==> itis",
        return itis_cache[name]
    elif TIMEOUTS >= 5:
        # if ITIS seems to be down, do nothing
        raise Exception('ITIS seems to be down.')
    url = 'http://www.itis.gov/servlet/SingleRpt/SingleRpt'
    values = {'search_topic': 'all',
              'search_kingdom': 'every',
              'search_span': 'containing',
              'search_value': name.decode(),
              'categories': 'All',
              'source': 'html',
              'search_credRating': 'All'}
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req, timeout=TIMEOUT)
    html = response.read()
    response.close()
    # parse results to pull out unique species
    results = [s.tail for s in p(html)('td.body a')]
    results = sum([re.findall('Species: [A-Z][a-z ]*', result) for result in results], [])
    results = [s.split(':')[1].strip() for s in results]
    if results:
        genus = set()
        all_species = []
        for this_species in results:
            genus.add(this_species.split()[0])
            if len(genus) > 1:
                return False
            all_species.append(' '.join(this_species.split()[1:]))
        species = list(genus)[0] + ' ' + '/'.join(sorted(list(set(all_species))))
        itis_cache[name] = species
        print "==> itis",
    else:
        itis_cache[name] = False
    # print 'itis_cache = %s' % itis_cache
    pickle.dump(itis_cache, open(os.path.join(DATA_DIR, 'itis.cache'), 'w'), protocol=-1)
    return itis_cache[name]
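# Hedged usage sketch, not part of the original source: either itis_lookup
# variant above returns a normalized species string or False, so callers must
# test for False before using the result. The example names are placeholders.
for raw_name in ['gadus morhua', 'sebastes']:
    canonical = itis_lookup(raw_name)
    if canonical is False:
        print 'no unambiguous ITIS match for %s' % raw_name
    else:
        print '%s -> %s' % (raw_name, canonical)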
def parse(self, response):
    url = 'http://www.ccgp-hunan.gov.cn/mvc/viewNoticeContent.do?noticeId=%s&area_id='
    data = json.loads(response.text)
    rows = data['rows']
    for i in rows:
        id = i['NOTICE_ID']
        title = i['NOTICE_TITLE']
        resp = requests.get(url % (id))
        doc = p(resp.text)
        content = doc.find('table:eq(3)').html()
        yield {
            'id': 'hunan_%s' % (id),
            'title': title,
            'content': content,
            'province': '湖南',
            'source_url': url % (id),
            'publish_time': self.today,
        }
    self.log(data)
def parse_(self, response):
    url = response.url
    id = url.split('newsId=')
    if id and len(id) == 2:
        id = id[1]
        result = response.text
        doc = p(result)
        title = doc.find('.frameNews h1').html()
        publish_time = doc.find('.source span').html().replace('发布日期:', '').split(' ')[0]
        content = doc.find('.frameNews').html()
        yield {
            'id': 'anhui_' + id,
            'title': title,
            'content': content,
            'source_url': url,
            'province': '安徽',
            'publish_time': publish_time
        }
def pathparse(html, url):
    # rewrite every <img src> so relative paths become absolute URLs
    for img in [e.attr.src for e in p(html)("img").items()]:
        if not img:
            continue
        newimg = urljoin(url, img)
        html = html.replace(img, newimg)
    return html
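# Hedged usage sketch, not part of the original source: pathparse resolves
# relative image paths against the page URL. The markup and URL below are
# made up for illustration only.
sample_html = '<div><img src="/img/a.png"><img src="http://cdn.example.com/b.png"></div>'
print(pathparse(sample_html, 'http://example.com/post/1'))
# the relative src becomes http://example.com/img/a.png; the absolute one is unchanged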
def short(self):
    return p(self.content).text()[:40] if self.content else ''
def parse_item(self, response):
    url_2 = 'http://www.zfcg.sh.gov.cn/emeb_bulletin.do?method=showbulletin&bulletin_id='
    result = response.text
    links = p(result).find('#bulletininfotable_table_body a')
    if len(links):
        for a in links:
            href = a.attrib['value']
            title = a.text
            # yield scrapy.Request(url_2+href, callback=self.parse_tender, headers=self.headers)
            resp = requests.get(url_2 + href, headers=self.headers)
            content = resp.text
            if content:
                c = p(resp.text).find('#templateContext')
                e = p(resp.text).find('.newinfotr1')
                drop = '<script(.*?)</script>|<textarea(.*?)>|</textarea>|<input(.*?)type="hidden"(.*?)>'
                if c:
                    content = re.sub(drop, '', c.html())
                elif e:
                    content = '<table><tbody>' + ''.join([_p(_).outerHtml() for _ in e]) + '</tbody></table>'[:50]
                else:
                    content = ''
            yield {
                'id': 'shanghai_' + href,
                'title': title,
                'content': content,
                'source_url': url_2 + href,
                'province': '上海',
                'publish_time': datetime.date.today().strftime('%Y-%m-%d')
            }
            # yield response.follow(url_2+href, callback=self.parse_tender, headers=self.headers)

# def parse_tender(self, response):
#     c = p(response.text).find('#templateContext')
#     e = p(response.text).find('.newinfotr1')
#     drop = '<script(.*?)</script>|<textarea(.*?)>|</textarea>|<input(.*?)type="hidden"(.*?)>'
#     if c:
#         content = re.sub(drop, '', c.html())
#     elif e:
#         content = '<table><tbody>' + ''.join([_p(_).outerHtml() for _ in e]) + '</tbody></table>'[:50]
#     else:
#         content = ''
#     yield {
#         'id': 'shanghai_' +
#         'title': content,
#         'content': content,
#         'source_url': '',
#         'area': '上海',
#         'publish_time': datetime.date.today().strftime('%Y%m%d')
#     }