def run(self):
    base_rsc_url = self.URL
    page_number = 1
    rsc_url_para = '?page='
    input_date_format = '%b %d, %Y'
    output_date_format = '%Y.%m.%d'
    # matches a date such as "Jan 1, 2020" that follows " - " in the link text
    re_input_date = re.compile(r'(?<=\s-\s)\w+\s\d+,\s\d+')
    while True:
        rsc_url = base_rsc_url + rsc_url_para + str(page_number)
        try:
            date_page = p(url=rsc_url)
        except Exception:
            page_number += 1
            continue
        label_a_list = date_page('.information_text a')
        if len(label_a_list) == 0:
            break
        else:
            for a in label_a_list:
                date_str = re_input_date.search(p(a).text()).group()
                date = dt.strptime(date_str, input_date_format)
                url = p(a).attr('href')
                _date_store_path = self.rsc_store_path + date.strftime(output_date_format) + '.txt'
                DataFetchThread(date_str, url, _date_store_path).start()
            page_number += 1
Example #2
    def parse(self, response):
        result = response.text
        doc = p(result)
        # anchors inside the second .zc_contract_top table
        a = p(doc.find('.zc_contract_top')[1]).find('td a')

        for i in a:
            s = 'http://www.ahzfcg.gov.cn' + p(i).attr('href')
            yield scrapy.Request(s, callback=self.parse_)
Example #3
def aa():
    # fetch daily high/low temperatures from weather.com.cn and a 7-day
    # workday flag list from the goseek holiday API
    import json
    import re
    import requests
    import datetime
    from pyquery import PyQuery as p

    url = 'http://www.weather.com.cn/weather/101021200.shtml'
    result = requests.get(url)

    a = p(result.text).find('.sky .tem')

    max_list = []
    min_list = []
    work_list = []
    date_list = []

    for i in a:
        # print(p(i).html())
        max_temp = p(i).find('span').html()
        min_temp = p(i).find('i').html()
        if not max_temp:
            # fall back to a placeholder when the page omits the high temperature
            max_temp = '32'
        max_temp = re.findall(re.compile(r'\d+'), max_temp)[0]
        min_temp = re.findall(re.compile(r'\d+'), min_temp)[0]
        print(max_temp, min_temp)
        max_list.append(int(max_temp))
        min_list.append(int(min_temp))

    today = datetime.date.today()
    holiday_url = 'http://api.goseek.cn/Tools/holiday?date='
    #https://www.jianshu.com/p/05ccb5783f65
    work = 0

    for i in range(7):
        s = today.strftime('%Y%m%d')
        date_list.append(s)

        url = holiday_url + s
        res = requests.get(url).text
        print(res)
        res = json.loads(res)

        if 'data' in res:
            if res['data'] == 2 or res['data'] == 0:
                work = 1  # working day
            else:
                work = 0  # day off

        today = today + datetime.timedelta(days=1)
        print(s, work)
        work_list.append(work)
        # time.sleep(0.5)

    return max_list, min_list, work_list, date_list
def page_parser(page):
    while True:
        current = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active>span'
                 ))).text
        print('Crawling page {}'.format(current))
        if int(current) <= page:
            html = p(browser.page_source)
            items = html('#mainsrp-itemlist .items .item').items()
            for item in items:
                product = {
                    'img': item.find('.pic .img').attr('data-src'),
                    'price': item.find('.price').text(),
                    'deal': item.find('.deal-cnt').text(),
                    'shop': item.find('.shop').text(),
                    'location': item.find('.location').text()
                }
                print(product)
                save_mongo(product)
                time.sleep(3)
            # only advance to the next result page after the current one is parsed
            button = browser.find_element_by_css_selector('li.item.next a')
            button.click()
        else:
            break
def main():
    base_store_path = 'crossword_puzzles/'

    # fetch the url dict of puzzle resources
    base_url = 'http://crosswordgiant.com/browse'
    rsc_page = p(url=base_url)
    label_a_list = rsc_page('.information_text a')
    for a in label_a_list:
        rsc_name = p(a).text()
        base_rsc_url = p(a).attr('href')
        rsc_store_path = base_store_path + rsc_name + '/'
        try:
            os.makedirs(rsc_store_path)
        except Exception:
            pass
        # fetch data from each resource
        UrlFetchThread(rsc_name, base_rsc_url, rsc_store_path).start()
def run(self):
    date_store_path = self.Path
    print(self.Path)
    # begin to fetch and write
    date_url = self.URL
    try:
        puzzle_page = p(url=date_url)
    except Exception:
        return None
Example #7
def itis_lookup(name, TIMEOUT=10, CACHE=True):
    '''
    Look up "name" on itis.gov. If a standard name can be identified, returns
    that name. Returns False if no or ambiguous result.

    If a name matches multiple species that are all members of the same genus,
    itis_lookup will return "Genus sp1/sp2/sp3..."
    '''

    name = name.replace("'", '').lower()
    if name in cache and CACHE:
        return cache[name]

    url = 'http://www.itis.gov/servlet/SingleRpt/SingleRpt'
    values = {'search_topic': 'all', 
              'search_kingdom':'every', 
              'search_span':'containing', 
              'search_value': name.decode(), 
              'categories':'All', 
              'source':'html', 
              'search_credRating': 'All'}
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req, timeout=TIMEOUT)
    html = response.read()

    # parse results to pull out unique species
    results = [s.tail for s in p(html)('td.body a')]
    results = sum([re.findall('Species: [A-Z][a-z ]*', result) for result in results], [])
    results = [s.split(':')[1].strip() for s in results]
    
    if results:
        genus = set()
        all_species = []
        result = None
        for this_species in results:
            genus.add(this_species.split()[0])
            if len(genus) > 1: 
                result = False
                break
            all_species.append(' '.join(this_species.split()[1:]))
        if result is not False:
            result = list(genus)[0] + ' ' + '/'.join(sorted(set(all_species)))
        cache[name] = result
    else:
        cache[name] = False

    if CACHE: caching.save_cache(cache, 'itis')

    return cache[name]
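
A minimal usage sketch for the lookup above, assuming the module-level cache dict and caching helper it depends on exist and itis.gov is reachable; the query string is only illustrative and the real return value depends on the live ITIS database:

# hypothetical call -- illustrative input; the result depends on live ITIS data
standardized = itis_lookup('gadus morhua')
if standardized is False:
    print('no unambiguous ITIS match')
else:
    print('standardized name: %s' % standardized)
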
def itis_lookup(name, TIMEOUT=10):
    global TIMEOUTS

    name = name.replace("'", '').lower()
    if name in itis_cache:
        print "==> itis",
        return itis_cache[name]
    elif TIMEOUTS >= 5:
        # if ITIS seems to be down, do nothing
        raise Exception('ITIS seems to be down.')

    url = 'http://www.itis.gov/servlet/SingleRpt/SingleRpt'
    values = {'search_topic': 'all', 
              'search_kingdom':'every', 
              'search_span':'containing', 
              'search_value': name.decode(), 
              'categories':'All', 
              'source':'html', 
              'search_credRating': 'All'}
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req, timeout=TIMEOUT)
    html = response.read()
    response.close()

    # parse results to pull out unique species
    results = [s.tail for s in p(html)('td.body a')]
    results = sum([re.findall('Species: [A-Z][a-z ]*', result) for result in results], [])
    results = [s.split(':')[1].strip() for s in results]
    
    if results:
        genus = set()
        all_species = []
        for this_species in results:
            genus.add(this_species.split()[0])
            if len(genus) > 1: return False
            all_species.append(' '.join(this_species.split()[1:]))
        species = list(genus)[0] + ' ' + '/'.join(sorted(list(set(all_species))))
        itis_cache[name] = species
        print "==> itis",
    else:
        itis_cache[name] = False

    #print 'itis_cache = %s' % itis_cache
    pickle.dump(itis_cache, open(os.path.join(DATA_DIR, 'itis.cache'), 'w'), protocol=-1)

    return itis_cache[name]
Example #9
    def parse(self, response):

        url = 'http://www.ccgp-hunan.gov.cn/mvc/viewNoticeContent.do?noticeId=%s&area_id='
        data = json.loads(response.text)
        rows = data['rows']
        for i in rows:
            id = i['NOTICE_ID']
            title = i['NOTICE_TITLE']

            resp = requests.get(url % (id))

            doc = p(resp.text)
            content = doc.find('table:eq(3)').html()

            yield {
                'id': 'hunan_%s' % (id),
                'title': title,
                'content': content,
                'province': '湖南',
                'source_url': url % (id),
                'publish_time': self.today,
            }
        self.log(data)
Example #10
    def parse_(self, response):

        url = response.url
        id = url.split('newsId=')
        if id and len(id) == 2:
            id = id[1]

            result = response.text
            doc = p(result)

            title = doc.find('.frameNews h1').html()
            publish_time = doc.find('.source span').html().replace('发布日期:', '').split(' ')[0]

            content = doc.find('.frameNews').html()

            yield {
                'id': 'anhui_' + id,
                'title': title,
                'content': content,
                'source_url': url,
                'province': '安徽',
                'publish_time': publish_time
            }
Example #11
def pathparse(html, url):
    # rewrite relative <img> src values into absolute URLs
    for img in [e.attr.src for e in p(html)("img").items()]:
        newimg = urljoin(url, img)
        html = html.replace(img, newimg)
    return html
Example #12
def short(self):
    return p(self.content).text()[:40] if self.content else ''
Example #13
    def parse_item(self, response):
        url_2 = 'http://www.zfcg.sh.gov.cn/emeb_bulletin.do?method=showbulletin&bulletin_id='
        result = response.text
        links = p(result).find('#bulletininfotable_table_body a')
        if len(links):
            for a in links:
                href = a.attrib['value']
                title = a.text
                # yield scrapy.Request(url_2+href,callback=self.parse_tender,headers=self.headers)

                resp = requests.get(url_2+href, headers=self.headers)
                content = resp.text
                if content:
                    c = p(resp.text).find('#templateContext')
                    e = p(resp.text).find('.newinfotr1')
                    drop = '<script(.*?)</script>|<textarea(.*?)>|</textarea>|<input(.*?)type="hidden"(.*?)>'

                    if c:
                        content = re.sub(drop,'',c.html())
                        
                    elif e:
                        content = '<table><tbody>'+''.join([_p(_).outerHtml() for _ in e])+'</tbody></table>'[:50]
                      
                    else:
                        content = ''
                    
                    yield{
                        'id':'shanghai_' + href,
                        'title':title,
                        'content':content,
                        'source_url':url_2+href,
                        'province':'上海',
                        'publish_time':datetime.date.today().strftime('%Y-%m-%d')
                    }

                # yield response.follow(url_2+href,callback=self.parse_tender,headers=self.headers)
        

    # def parse_tender(self,response):

    #     c = p(response.text).find('#templateContext')
    #     e = p(response.text).find('.newinfotr1')
    #     drop = '<script(.*?)</script>|<textarea(.*?)>|</textarea>|<input(.*?)type="hidden"(.*?)>'

    #     if c:
    #         content = re.sub(drop,'',c.html())
            
    #     elif e:
    #         content = '<table><tbody>'+''.join([_p(_).outerHtml() for _ in e])+'</tbody></table>'[:50]
          
    #     else:
    #         content = ''
        
    #     yield{
    #         'id':'shanghai_' + 
    #         'title':content,
    #         'content':content,
    #         'source_url':'',
    #         'area':'上海',
    #         'publish_time':datetime.date.today().strftime('%Y%m%d')
    #     }