def OilPrice(request):
    driver = webdriver.PhantomJS()
    driver.get('http://www.bitauto.com/youjia/')
    html_source = driver.page_source
    # print html_source
    soup = BeautifulSoup(html_source)
    sichuan = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > th:nth-of-type(2) > a")
    # No. 90 gasoline
    type1 = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > td:nth-of-type(5)")
    # No. 93 gasoline
    type2 = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > td:nth-of-type(6)")
    # No. 97 gasoline
    type3 = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > td:nth-of-type(7)")
    # No. 0 diesel (yuan/litre)
    type4 = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > td:nth-of-type(8)")
    sichuan = str(sichuan[0].get_text())
    type1 = str(type1[0].get_text())
    type2 = str(type2[0].get_text())
    type3 = str(type3[0].get_text())
    type4 = str(type4[0].get_text())
    print sichuan.encode('utf8')
    oilprice = Oil(city_name=sichuan.encode('utf8'), typeone_price=type1,
                   typetwo_price=type2, typethree_price=type3,
                   typefour_price=type4, date=datetime.datetime.now().date())
    oilprice.save()
    print driver.current_url
    driver.quit()  # was `driver.quit`: without the call the browser process is never closed
    responseHtml = '<html><body>Scraping Oil Price Successfully!</body></html>'
    return HttpResponse(responseHtml)
def getWeibos(self, keyword, page=1, count=None):
    url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (
        json.dumps(keyword).replace('\\', '%').replace('"', ''), page)
    result = WeiboCrawler.request(self, url, self.headers)
    if 'result' in result and result['result']:
        infos = result['info'].decode('gb2312')
        soup = BeautifulSoup(infos)
        total_soup = soup.select('.headerR1')[0]
        total_num = total_soup.get_text().split('共')[-1].split('条')[0].strip()
        return_val = {'total_count': int(total_num), 'msgs': []}
        allmsgs = []
        msgs_soup = soup.select('.nr_con')
        for msg_soup in msgs_soup:
            avatar = 'http://t.hexun.com%s' % msg_soup.select('.nr_conLa > a')[0].get('href')
            nickandtext = msg_soup.select('.nr_shuo')[0].get_text().split(':')
            nickname = nickandtext[0]
            text = nickandtext[1]
            ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text()
            allmsgs.append({
                'avatar': avatar,
                'nickname': nickname,
                'text': text,
                'datetime': ts,
            })
        return_val['msgs'] = allmsgs
        return return_val
def get_links_from(channel, pages):
    # http://bj.ganji.com/jiaju/a3o11/
    # http://bj.ganji.com/wupinjiaohuan/o3/  -- two different URL patterns
    if channel in ['http://bj.ganji.com/xuniwupin/', 'http://bj.ganji.com/qitawupin/',
                   'http://bj.ganji.com/ershoufree/', 'http://bj.ganji.com/wupinjiaohuan/']:
        list_view = '{}o{}/'.format(channel, str(pages))
        wb_data = requests.get(list_view, headers=headers)
        # time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('ul', 'pageLink clearfix'):
            for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dt > div > a'):
                item_link = link.get('href')
                url_list.insert_one({'url': item_link})
                print(item_link)
        else:
            # pass
            print('duplicate page')
    else:
        list_view = '{}a3o{}/'.format(channel, str(pages))
        wb_data = requests.get(list_view, headers=headers)
        # time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('ul', 'pageLink clearfix'):
            for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'):
                item_link = link.get('href')
                url_list.insert_one({'url': item_link})
                print(item_link)
        else:
            # pass
            print('duplicate page')
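# A minimal sketch of the MongoDB setup the snippet above assumes: `url_list`
# (and `headers`) are not defined in the excerpt, so every name below is an
# assumption, not the original definition.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)  # assumed local MongoDB instance
ganji = client['ganji']                   # hypothetical database name
url_list = ganji['url_list']              # collection receiving {'url': ...} documents
headers = {'User-Agent': 'Mozilla/5.0'}   # placeholder request headers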
def get_page_info_from(url, data=None):
    web_data = requests.get(url)
    if web_data.status_code == 404:
        pass
    else:
        web_data.encoding = "utf-8"
        soup_page = BeautifulSoup(web_data.text, 'lxml')
        page_tips = soup_page.select('div.newstop span')
        page_title = soup_page.select('div.newstop h2')[0].text
        page_contents = soup_page.select('div.lhnewcon p')
        page_imgs = soup_page.select('div.lhnewcon img')
        # content = ''
        tip = ''
        content_list = []
        imgs = []
        for page_content in page_contents:
            # content += page_content.get_text() + ' '
            content_list.append(page_content.get_text())
        for page_tip in page_tips:
            tip += page_tip.get_text() + ' '
        for page_img in page_imgs:
            templink = page_img.get('src')
            templink = templink.replace('../../..', 'http://ilonghua.sznews.com')
            imgs.append(templink)
        data = {
            'title': page_title.strip(),
            # 'content': content,
            'tip': tip,
            'contents': content_list,
            'imgs': imgs,
            # 'url': url
        }
        return data
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2006:  # The oldest year for audit reports
            continue
        url = AUDIT_REPORTS_URL.format(year=year)
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("div#content li")
        for result in results:
            report = audit_report_from(result, url, year, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.select("div#content li")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the Peer Review
    doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
    result = doc.find("div", id='content').find("a", text=True)
    report = peer_review_from(result, year_range)
    inspector.save_report(report)
def get_items_info(sellerType):
    item_urls = get_link_list(sellerType)
    counter = 1  # item counter
    for item_url in item_urls:
        wb_data = requests.get(item_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        title = soup.title.text
        price = soup.select('#content span.price')
        area = soup.select('span.c_25d')
        date = soup.select('li.time')
        totalView = get_views_num(item_url)
        # print(title, price, area, date, totalView, sellerType, sep='\n------------\n')
        data = {
            '序号': counter,
            '标题': title,
            '价格': price[0].text,
            '地区': None if area == [] else list(area[0].stripped_strings),  # guard against items with no area info
            '日期': date[0].text,
            '浏览量': totalView,
            '卖家类型': '个人' if sellerType == 0 else '商家',
        }
        counter += 1
        print(data)
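# `get_link_list` and `get_views_num` are called above but not shown; a sketch
# of what they might look like, assuming a 58.com-style listing page -- the URL
# and selectors here are illustrative guesses, not the original implementation.
import requests
from bs4 import BeautifulSoup

def get_link_list(sellerType):
    # hypothetical list URL; sellerType 0 = personal seller, 1 = merchant
    url = 'http://bj.58.com/pbdn/{}/'.format(sellerType)
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    return [a.get('href') for a in soup.select('td.t a.t') if a.get('href')]

def get_views_num(item_url):
    # placeholder: the real function likely queries a separate view-counter
    # endpoint; this only looks for an on-page counter element
    soup = BeautifulSoup(requests.get(item_url).text, 'lxml')
    tag = soup.select_one('#totalcount')
    return tag.text if tag else '0'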
def getPageItems():
    for href in content_list.find({}, {'url': 1}):
        if href in url_list.find({}, {'url': 1}):
            print 'already crawled'
        else:
            url = href.get('url')
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            pageCode = response.read().decode('utf-8')
            soup = BeautifulSoup(pageCode, 'lxml')
            if not pageCode:
                print "page failed to load...."
                return None
            title = soup.select('#main > div.col.detailPrimary.mb15 > div.col_sub.mainTitle > h1')
            date = soup.select('li.time')
            price = soup.select('span.price.c_f50')
            pattern = re.compile(r'\d+')
            result = re.findall(pattern, str(title))
            data = {
                'title': result[1],
                'date': date[0].text,
                'price': price[0].text,
                'url': url,
            }
            content_list.insert_one(data)
            print data
def scrape():
    hold = []
    hold.append(['playername', 'points'])
    for page in build_fp_pages():
        r = requests.get(page)
        soup = BS(r.text, 'html.parser')
        if 'espn' in page:
            for row in soup.select('.playerTableTable tr'):
                try:
                    p_check = row.findAll(class_="playertablePlayerName")
                    if len(p_check) == 0:
                        continue
                    defense_name = p_check[0].text
                    defense_points = row.find_all('td')[-1].text
                    defense = unicode_normalize(defense_name, defense_points)
                    hold.append(defense)
                except Exception, e:
                    print 'Error scraping ESPN data: ' + str(e)
        else:
            for row in soup.select('tr.mpb-available'):
                try:
                    hold.append([str(row.find_all('td')[0].text),
                                 str(row.find_all('td')[-1].text)])
                except Exception, e:
                    print 'Error scraping FanPros data: ' + str(e)
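# The helpers `build_fp_pages` and `unicode_normalize` are referenced above
# but not defined in the excerpt; a plausible Python 2 sketch under those
# assumptions (the URLs are illustrative, not the originals):
import unicodedata

def build_fp_pages():
    # hypothetical list of ranking pages to scrape
    return [
        'http://games.espn.com/ffl/freeagency',
        'https://www.fantasypros.com/nfl/rankings/dst.php',
    ]

def unicode_normalize(name, points):
    # fold accented characters to plain ASCII so rows compare cleanly
    clean = lambda s: unicodedata.normalize('NFKD', unicode(s)).encode('ascii', 'ignore')
    return [clean(name), clean(points)]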
def _find_image(self, detail_url):
    """ Find URL of image from detail page. """
    detail_page = urllib2.urlopen(detail_url)
    soup = BeautifulSoup(detail_page.read())
    file = open(path, "a")
    file.write(detail_url)
    file.write(",")
    for img in soup.find_all("img"):
        if img.get("alt") == word.decode("shift-jis"):
            print "extract image url : " + img.get("src")
            file.write(img.get("src"))
            file.write(",")
    if len(soup.select("td.t-left a")) == 0:
        file.write("none.")
        file.write("\n")
    else:
        origin = str(soup.select("td.t-left a")[0].get("href"))
        file.write(origin)
        file.write("\n")
    file.close()
def get_question_info(question_link):
    print('processing: {}'.format(question_link.text))
    res = requests.get(requests.compat.urljoin('http://www.mypythonquiz.com/',
                                               question_link.attrs['href']))
    soup = BeautifulSoup(res.text, 'lxml')
    title = question_link.text
    question_id = question_link.attrs['href'].split('qid=')
    question = soup.select('.myspan')[0]
    question = question.getText().split(':')[1].strip()
    try:
        code = soup.select('.codesample code')[0]
        code = code.getText()
    except IndexError:
        code = None
    answer_values = [i.attrs['value'] for i in soup.select('input[name="answer"]')]
    answer_list = [i.getText() for i in soup.select('.content .myspan')[1:]]
    answers = dict(zip(answer_values, answer_list))
    choices, description = get_correct_answer(question_id, answers)
    print('done')
    return {'title': title, 'question': question, 'code': code,
            'choices': choices, 'description': description}
def get_tweet_details(tweet_id, user='', retry=0):
    print tweet_id
    try:
        headers_custom = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        text_selector = ".TweetDetail-text.u-textLarge"
        stat_selector = '.TweetDetail-statCount, .TweetAction-count'
        tweet_template = Template('https://mobile.twitter.com/$user/status/$tweet_id')
        uri = tweet_template.substitute({'user': user, 'tweet_id': tweet_id})
        response = requests.get(uri, headers=headers_custom)
        if response.status_code != 200:
            if retry % 10 == 0:
                print 'Too many requests in tweet download!'
            time.sleep(1)
            return get_tweet_details(tweet_id, user=user, retry=retry + 1) if retry < max_retry else {}
        soup = BeautifulSoup(response.content, 'html.parser')
        text = safe_list_get(map(lambda x: x.get_text(), soup.select(text_selector)), 0, '')
        stats = soup.select(stat_selector)
        pases = get_stat(stats, 0)
        likes = get_stat(stats, 1)
        date = soup.select('.TweetDetail-timeAndGeo')
        date = map(lambda d: d.get_text(), date)
        responses = soup.select('.Timeline-base .Tweet-body')

        def response_to_data(r):
            user_name = r.select('.UserNames-screenName')[0].get_text()
            text = r.select('.Tweet-text')[0].get_text()
            date = r.select('.Tweet-timestamp time')[0].attrs['datetime']
            return {'text': text, 'user': user_name, 'date': date}

        responses = map(response_to_data, responses)
        return {'text': text, 'pases': pases, 'likes': likes,
                'date': safe_list_get(date, 0, ''), 'responses': responses}
    except:
        traceback.print_exc()
        return {}
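# `safe_list_get` and `get_stat` are assumed helpers (not shown above); a
# minimal sketch consistent with how they are called:
def safe_list_get(lst, idx, default):
    # return lst[idx], or `default` when the index is out of range
    try:
        return lst[idx]
    except IndexError:
        return default

def get_stat(stats, idx):
    # pull the text of the idx-th stat node, defaulting to '0'
    return safe_list_get([s.get_text().strip() for s in stats], idx, '0')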
def parser_video_info_html(self, html_doc):
    video_info = {}
    soup = BeautifulSoup(html_doc)
    # Get Title
    data = soup.select('#content > h1')
    if len(data) > 0:
        video_info['Title'] = unicode(data[0].contents[0])
    # Get Date
    data = soup.select('#content > .head-list > li')
    if len(data) == 2:
        date_str = str(unicode(data[0].contents[1])).strip()
        video_info['Date'] = datetime.strptime(date_str, '%b %d, %Y').date()
    # Get Models
    data = soup.select('#content > .head-list > li > a')
    if len(data):
        video_info['Models'] = []
        for model in data:
            video_info['Models'].append(unicode(model.contents[0]))
    # Get Rating and VoteCount
    data = soup.select('.star-holder > p')
    if len(data):
        votes_list = data[0].contents[0]
        votes_list = re.split('/| |\(|\)', votes_list)
        votes_list.remove('')
        video_info['Rating'] = float(unicode(votes_list[0]))
        video_info['VoteCount'] = int(unicode(votes_list[2]))
    return video_info
def get_pages_info(url):
    if "zhuanzhuan" in url:
        ganji_url.delete_one({"url": url})
        return
    try:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('h1.title-name')
        times = soup.select('i.pr-5')
        types = soup.select('ul.det-infor > li:nth-of-type(1) > span')
        prices = soup.select('i.f22.fc-orange.f-type')
        places = soup.select("ul.det-infor > li > a")
        # singular loop names avoid shadowing the result lists above
        for title, time_, type_, price, place in zip(titles, times, types, prices, places):
            data = {
                '标题': title.get_text(),
                '发布时间': time_.get_text().strip().split(' ')[0] if len(time_) > 0 else "",
                '类型': type_.get_text(),
                '价格': price.get_text(),
                '交易地点': place.get_text(),
                'url': url
            }
            ganji_data2.insert_one(data)
    except Exception as e:
        print(e)
        time.sleep(3)
def test_admin_add(self):
    '''Admin can be added via add member page'''
    app = self._get_test_app()
    owner = factories.User(fullname='My Owner')
    factories.User(fullname="My Fullname", name='my-user')
    group = self._create_group(owner['name'])

    env, response = self._get_group_add_member_page(app, owner, group['name'])
    add_form = response.forms['add-member-form']
    add_form['username'] = '******'
    add_form['role'] = 'admin'
    add_response = submit_and_follow(app, add_form, env, 'save')

    assert_true('2 members' in add_response)

    add_response_html = BeautifulSoup(add_response.body)
    user_names = [u.string for u in
                  add_response_html.select('#member-table td.media a')]
    roles = [r.next_sibling.next_sibling.string
             for r in add_response_html.select('#member-table td.media')]
    user_roles = dict(zip(user_names, roles))

    assert_equal(user_roles['My Owner'], 'Admin')
    assert_equal(user_roles['My Fullname'], 'Admin')
def test_remove_member(self):
    '''Member can be removed from group'''
    app = self._get_test_app()
    user_one = factories.User(fullname='User One', name='user-one')
    user_two = factories.User(fullname='User Two')

    other_users = [
        {'name': user_two['id'], 'capacity': 'member'}
    ]
    group = self._create_group(user_one['name'], other_users)

    remove_url = url_for(controller='group', action='member_delete',
                         user=user_two['id'], id=group['id'])
    env = {'REMOTE_USER': user_one['name'].encode('ascii')}
    remove_response = app.post(remove_url, extra_environ=env, status=302)
    # redirected to member list after removal
    remove_response = remove_response.follow(extra_environ=env)

    assert_true('Group member has been deleted.' in remove_response)
    assert_true('1 members' in remove_response)

    remove_response_html = BeautifulSoup(remove_response.body)
    user_names = [u.string for u in
                  remove_response_html.select('#member-table td.media a')]
    roles = [r.next_sibling.next_sibling.string
             for r in remove_response_html.select('#member-table td.media')]
    user_roles = dict(zip(user_names, roles))

    assert_equal(len(user_roles.keys()), 1)
    assert_equal(user_roles['User One'], 'Admin')
def test_attendee_name_required(self):
    self.event.settings.set('attendee_names_asked', True)
    self.event.settings.set('attendee_names_required', True)
    cr1 = CartPosition.objects.create(
        event=self.event, session=self.session_key, item=self.ticket,
        price=23, expires=now() + timedelta(minutes=10)
    )
    response = self.client.get('/%s/%s/checkout/questions/' % (self.orga.slug, self.event.slug),
                               follow=True)
    doc = BeautifulSoup(response.rendered_content)
    self.assertEqual(len(doc.select('input[name=%s-attendee_name]' % cr1.identity)), 1)

    # Not all required fields filled out, expect failure
    response = self.client.post('/%s/%s/checkout/questions/' % (self.orga.slug, self.event.slug), {
        '%s-attendee_name' % cr1.identity: '',
        'email': 'admin@localhost'
    }, follow=True)
    doc = BeautifulSoup(response.rendered_content)
    self.assertGreaterEqual(len(doc.select('.has-error')), 1)

    # Corrected request
    response = self.client.post('/%s/%s/checkout/questions/' % (self.orga.slug, self.event.slug), {
        '%s-attendee_name' % cr1.identity: 'Peter',
        'email': 'admin@localhost'
    }, follow=True)
    self.assertRedirects(response, '/%s/%s/checkout/payment/' % (self.orga.slug, self.event.slug),
                         target_status_code=200)
    cr1 = CartPosition.objects.current.get(identity=cr1.identity)
    self.assertEqual(cr1.attendee_name, 'Peter')
def test_membership_list(self):
    '''List group admins and members'''
    app = self._get_test_app()
    user_one = factories.User(fullname='User One', name='user-one')
    user_two = factories.User(fullname='User Two')

    other_users = [
        {'name': user_two['id'], 'capacity': 'member'}
    ]
    group = self._create_group(user_one['name'], other_users)

    member_list_url = url_for(controller='group', action='members', id=group['id'])
    env = {'REMOTE_USER': user_one['name'].encode('ascii')}
    member_list_response = app.get(member_list_url, extra_environ=env)

    assert_true('2 members' in member_list_response)

    member_response_html = BeautifulSoup(member_list_response.body)
    user_names = [u.string for u in
                  member_response_html.select('#member-table td.media a')]
    roles = [r.next_sibling.next_sibling.string
             for r in member_response_html.select('#member-table td.media')]
    user_roles = dict(zip(user_names, roles))

    assert_equal(user_roles['User One'], 'Admin')
    assert_equal(user_roles['User Two'], 'Member')
def generate_with_kw(self, id3, kw, update):
    url = self.url.replace('KEYWORD', urllib.quote_plus(kw.encode('utf8')))
    logger.debug('Crawling %(url)s' % locals())
    soup = BeautifulSoup(requests.get(url).content, 'lxml')
    results = soup.select('table.mp3Tracks tr td.songTitle a')
    if not results:
        logger.info('Amazon, no results for %(kw)s' % locals())
        return False
    result = results[0]
    url = result.get("href")
    logger.debug("Found specific url: %(url)s" % locals())
    soup = BeautifulSoup(requests.get(url).content, 'lxml')
    id3.add(WXXX(encoding=3, desc=u"Amazon url", url=url))
    album = soup.select("#fromAlbum a")
    if album and (update or "TALB" not in id3):
        id3.add(TALB(encoding=3, text=album[0].find(text=True).strip(" \n")))
    images = soup.select("div#coverArt_feature_div img") + soup.select('#prodImageContainer img')
    if images and (update or 'APIC:Cover' not in id3):
        data = requests.get(images[0].get("src")).content
        id3.add(APIC(encoding=3, mime="image/jpeg", type=3, desc=u"Cover", data=data))
    details = [filter(lambda x: x not in ("\n", "", " "), detail.find_all(text=True))
               for detail in soup.select("div.content li") if detail.find("strong")]
    for detail in details:
        if detail and detail[0] == "Genres:" and (update or "TCON" not in id3) and len(detail) >= 2:
            id3.add(TCON(encoding=3, text=detail[1]))
    return True
def extract_reports_for_subtopic(subtopic_url, year_range, topic, subtopic=None):
    if subtopic_url.startswith("http://httphttp://"):
        # See notes to IG's web team
        subtopic_url = subtopic_url.replace("http://http", "")
    body = utils.download(subtopic_url)
    doc = BeautifulSoup(body)
    results = doc.select("#body-row02-col02andcol03 a")
    if not results:
        results = doc.select("#body-row02-col01andcol02andcol03 a")
    if not results and "There are currently no reports in this category" not in doc.text:
        raise AssertionError("No report links found for %s" % subtopic_url)

    topic_name = TOPIC_NAMES[topic]
    # Broadcasting Board of Governors is a fully independent agency
    if topic == 'BBG' or subtopic == 'Broadcasting Board of Governors':
        agency = 'bbg'
    else:
        agency = 'state'

    for result in results:
        report = report_from(result, year_range, agency, topic_name, subtopic)
        if report:
            inspector.save_report(report)
def expand_links(self):
    """Expand any links referenced in the message."""
    if '<blockquote>' in self.html:
        # links have been already expanded
        return False
    changed = False
    for link in BeautifulSoup(self.html, 'html5lib').select('a'):
        url = link.get('href', '')
        try:
            rv = requests.get(url)
        except requests.exceptions.ConnectionError:
            continue
        if rv.status_code == 200:
            soup = BeautifulSoup(rv.text, 'html5lib')
            title_tags = soup.select('title')
            if len(title_tags) > 0:
                title = title_tags[0].string.strip()
            else:
                title = url
            description = 'No description found.'
            for meta in soup.select('meta'):
                if meta.get('name', '').lower() == 'description':
                    description = meta.get('content', description).strip()
                    break
            # add the detail of the link to the rendered message
            tpl = ('<blockquote><p><a href="{url}">{title}</a></p>'
                   '<p>{desc}</p></blockquote>')
            self.html += tpl.format(url=url, title=title, desc=description)
            changed = True
    return changed
def selectDomain(self, html):
    soup = BeautifulSoup(html)
    subdomainList = []
    for i in xrange(len(soup.select('[class=domain]'))):
        subDomain = soup.select('[name=domain' + str(i + 1) + ']')[0].attrs['value']
        subdomainList.append(subDomain)
    return subdomainList
def ZhilianFirmPage(firmUrl=''):
    '''
    # Function: fetch the company profile and job listings from a Zhilian
    #           recruitment company-detail page.
    # Params : firmUrl = page URL
    # Steps  : check the sub-domain first; a "standard page" is parsed directly,
    #          while a "special page" is only parsed after its standard page is found.
    # Notes  : company pages are complicated -- normal and VIP pages have
    #          different URLs and different markup.
    '''
    # === Parse the URL ===
    # Both special and standard pages have to be parsed.
    webTarget = webPageSourceCode(firmUrl)
    soup = BeautifulSoup(webTarget['html'], 'html5lib')
    # === Decide from the sub-domain whether this is a standard or a special page ===
    subDomain = urlAnalyse(firmUrl)['subloc'][0]
    if subDomain == 'special':
        # For a special page, find the URL of its standard page and re-enter this function.
        # Content hidden inside `<!-- -->` comments can only be pulled out with a regex.
        finder = re.findall(re.compile(r' href="(.+?)"'), str(soup.select('td[align=right]')))
        standardUrl = finder[0] if finder else ''
        if len(standardUrl):
            print 'Redirecting from a special company page to a standard page...'
            ZhilianFirmPage(standardUrl)  # re-enter with the standard page
        return ''
    # === Collect the links to every job this company is advertising on the standard page ===
    # ===>>> Caveat: the page only shows jobs for one city; the other cities are
    # loaded via Javascript, so searching by company name on the main search page
    # actually works better.
    resu = soup.select('[class=positionListContent1] [class*=jobName] a[href]')
    data = [t['href'] for t in resu]
    print 'Done of retrieving %d job links of this company.' % len(resu)
    return data  # return the links of all open positions
def get_item_info(link):
    time.sleep(random.uniform(0, 3))  # random pause between requests
    try:
        webdata = requests.get(link, headers=headers)
        if webdata.status_code == 200:
            soup = BeautifulSoup(webdata.text, 'html.parser')
            title = soup.title.text
            post_time = soup.select('i.pr-5')
            views = get_view(link)
            type = soup.select('ul.det-infor > li > span > a ')
            price = soup.select(' i.f22.fc-orange.f-type ')
            address = soup.select('ul.det-infor > li')
            address_str = '' if address[2] == 0 else address[2].get_text().replace(' ', '').replace('\r', '').replace('\n', '').replace('\xa0', '')[5:]
            use = soup.select('div.second-dt-bewrite > ul > li ')
            data = {
                'link': link,
                'title': title,
                'post_time': get_text(post_time)[:-3],
                'views': views,
                'price': get_text(price),
                'type': get_text(type),
                'address': address_str,
                'use': get_use(use)[:get_use(use).find('新') + 1]
            }
            item_info_lb.insert_one(data)
        else:
            print('{} has some problems,please try again later'.format(link))
            pass
    except Exception as e:
        print(Exception, ':', e)
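# `get_text`, `get_view` and `get_use` are not part of the excerpt; sketches of
# what they plausibly do, based only on how they are called above:
def get_text(tags):
    # flatten a select() result into one stripped string ('' when empty)
    return tags[0].get_text().strip() if tags else ''

def get_use(tags):
    # join the "condition" bullet list into a single string
    return ''.join(tag.get_text().strip() for tag in tags)

def get_view(link):
    # placeholder: the real view count is usually served by a separate
    # counter endpoint; returning '0' keeps the sketch self-contained
    return '0'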
def urls_for(self):
    only = self.options.get('topics')
    if only:  # if only...
        only = set(only.split(','))
        only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
                for o in only]
        yield from self.urls_for_topics(only)
        # If there are topics selected, ONLY yield URLs for those.
        return

    # First yield the URLs for the topics that are tangential to the main
    # Calendar Year reports.
    yield from self.urls_for_topics(ADDITIONAL_TOPICS)

    # Not getting reports from specific topics, iterate over all Calendar Year
    # reports.
    page = BeautifulSoup(utils.download(BASE_URL))

    # Iterate over each "Calendar Year XXXX" link
    for li in page.select('.field-items li'):
        md = RE_CALENDAR_YEAR.search(li.text)
        if md:
            cur_year = int(md.group(1))
            if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
                href = li.select('a')[0]['href']
                next_url = urljoin(BASE_URL, href)
                # The first page of reports is yielded.
                yield next_url

                # Next, read all the pagination links for the page and yield those. So
                # far, I haven't seen a page that doesn't have all of the following
                # pages enumerated.
                next_page = BeautifulSoup(utils.download(next_url))
                for link in next_page.select('li.pager-item a'):
                    yield urljoin(BASE_URL, link['href'])
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        url = AUDITS_REPORTS_URL.format(str(year)[2:4])
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("tr")
        if not results:
            raise inspector.NoReportsFoundError("NASA (%d)" % year)
        for index, result in enumerate(results):
            if not index or not result.text.strip():
                # Skip the header row and any empty rows
                continue
            report = audit_report_from(result, url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the other reports
    doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
    results = doc.select("#subContainer ul li")
    if not results:
        raise inspector.NoReportsFoundError("NASA (other)")
    for result in results:
        report = other_report_from(result, year_range)
        if report:
            inspector.save_report(report)
def send_to_server(api_base, stub_base, path):
    """ Send the file at the given `path` to the given `api_base`. Path
        components will be appended to the `api_base` and are presumed to
        match. """
    relative_path = os.path.relpath(path, stub_base)
    url = urlparse.urljoin(api_base, relative_path)
    logger.info('sending {} to {}'.format(path, url))
    data = json.dumps(json.load(open(path, 'r')))
    r = requests.post(url, data=data, headers={'content-type': 'application/json'})
    # regulations-core returns 204 on a successful POST
    if r.status_code != 204:
        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            exception = soup.select("#summary h1")[0].text
            exception_value = soup.select("#summary .exception_value")[0].text
            logger.error("error sending {}: {}, {}".format(
                r.status_code, exception, exception_value))
        except:
            logger.error("error sending {}: {}".format(r.status_code, r.reason))
def fetch_from_landing_page(self, landing_url):
    """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
    unreleased = False
    page = BeautifulSoup(utils.download(landing_url))

    summary = None
    field_items = page.select('.field-items')
    if field_items:
        text = [node.strip() for node in field_items[0].findAll(text=True)]
        summary = '\n\n'.join(text).strip()
    if not summary:
        logging.info('\tno summary text found')

    if (summary and (RE_NOT_AVAILABLE.search(summary)
                     or RE_NOT_AVAILABLE_2.search(summary)
                     or RE_NOT_AVAILABLE_3.search(summary)
                     or RE_NOT_AVAILABLE_4.search(summary)
                     or RE_CLASSIFIED.search(summary))):
        unreleased = True

    report_url = None
    pdf_link = page.select('.file a')
    if not pdf_link:
        logging.warn('No pdf link found on page: {0}'.format(landing_url))
    else:
        report_url = pdf_link[0]['href']

    return report_url, summary, unreleased
def run(options):
    year_range = inspector.year_range(options, archive)
    doc = BeautifulSoup(utils.download(REPORTS_URL))

    # Pull the semiannual reports
    semiannual_results = doc.select("#AnnualManagementReports select")[0]
    for result in semiannual_results.select("option"):
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the special reports
    special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
    for index, result in enumerate(special_report_table.select("tr")):
        if not index:
            # Skip the header row
            continue
        report = report_from(result, REPORTS_URL, report_type='other', year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the audit reports
    for year in year_range:
        if year < 2001:  # The oldest fiscal year page available
            continue
        year_url = AUDIT_REPORTS_URL.format(year=year)
        doc = BeautifulSoup(utils.download(year_url))
        for index, result in enumerate(doc.select("#main table tr")):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, year_url, report_type='audit', year_range=year_range)
            if report:
                inspector.save_report(report)
def test_custom_css(self):
    '''Add some custom css to the head element'''
    app = self._get_test_app()

    # no custom css yet
    intro_response_html = BeautifulSoup(app.get('/').body)
    style_tag = intro_response_html.select('head style')
    assert_equal(len(style_tag), 0)

    # set new custom css
    env, config_response = _get_admin_config_page(app)
    config_form = config_response.forms['admin-config-form']
    config_form['ckan.site_custom_css'] = 'body {background-color:red}'
    webtest_submit(config_form, 'save', status=302, extra_environ=env)

    # new css now appears in the head element
    new_intro_response_html = BeautifulSoup(app.get('/').body)
    style_tag = new_intro_response_html.select('head style')
    assert_equal(len(style_tag), 1)
    assert_equal(style_tag[0].text.strip(), 'body {background-color:red}')

    # reset config value
    _reset_config(app)
    reset_intro_response_html = BeautifulSoup(app.get('/').body)
    style_tag = reset_intro_response_html.select('head style')
    assert_equal(len(style_tag), 0)
def test_voucher_double(self):
    self.quota_tickets.size = 2
    self.quota_tickets.save()
    v = Voucher.objects.create(item=self.ticket, event=self.event,
                               valid_until=now() + timedelta(days=2),
                               block_quota=True)
    CartPosition.objects.create(
        event=self.event, cart_id=self.session_key, item=self.ticket,
        price=23, expires=now() + timedelta(minutes=10), voucher=v
    )
    CartPosition.objects.create(
        event=self.event, cart_id=self.session_key, item=self.ticket,
        price=23, expires=now() + timedelta(minutes=10), voucher=v
    )
    self._set_session('payment', 'banktransfer')

    response = self.client.post('/%s/%s/checkout/confirm/' % (self.orga.slug, self.event.slug),
                                follow=True)
    doc = BeautifulSoup(response.rendered_content, "lxml")
    self.assertEqual(CartPosition.objects.filter(cart_id=self.session_key, voucher=v).count(), 1)
    self.assertEqual(len(doc.select(".alert-danger")), 1)
    self.assertFalse(Order.objects.exists())

    response = self.client.post('/%s/%s/checkout/confirm/' % (self.orga.slug, self.event.slug),
                                follow=True)
    doc = BeautifulSoup(response.rendered_content, "lxml")
    self.assertFalse(CartPosition.objects.filter(cart_id=self.session_key, voucher=v).exists())
    self.assertEqual(len(doc.select(".thank-you")), 1)
    self.assertEqual(Order.objects.count(), 1)
    self.assertEqual(OrderPosition.objects.count(), 1)
def crawler(query):
    # set up state before crawling
    current_searching_page = 1
    have_more_page_to_search = True
    today_yy_mm_dd = datetime.datetime.now().strftime("%Y.%m.%d")
    # today_yy_mm_dd = '2020.01.22'  # test value
    print('initial values before crawling\ncurrent_searching_page: ', current_searching_page,
          '\nhave_more_page_to_search: ', have_more_page_to_search,
          '\ntoday_yy_mm_dd: ', today_yy_mm_dd)

    # title of the most recent article already stored for this keyword
    latest_news_title_in_database = db.select_latest_news(query)

    # start crawling
    while have_more_page_to_search:
        url = ("https://search.naver.com/search.naver?&where=news&query=" + query +
               "&sm=tab_pge&sort=1&photo=0&field=0&reporter_article=&pd=3&ds=" + today_yy_mm_dd +
               "&de=" + today_yy_mm_dd + "&mynews=0&start=" + str(current_searching_page) +
               "&refresh_start=0")
        print('crawling started! url: ', url)
        req = requests.get(url)
        cont = req.content
        soup = BeautifulSoup(cont, 'html.parser')

        # handle empty search results (no articles posted yet today)
        noresult = soup.select('.noresult_tab')
        if noresult:
            print('no result')
            break

        # extract title and link from the <a> tags
        atags = soup.select('._sp_each_title')

        # remember the title of the first article
        if current_searching_page == 1:
            print('first article title: ', atags[0].text.replace("'", ""))
            first_searched_title = atags[0].text.replace("'", "")

        for atag in atags:
            # no new articles -> stop crawling
            if atag.text.replace("'", "") == latest_news_title_in_database:
                have_more_page_to_search = False
                print('no new articles -> stopping the crawl')
                break
            else:
                subKeywords = db.select_sub_keyword(query)
                print('sub key word: ', subKeywords)
                # store only the articles whose title matches a registered sub-keyword
                for sub in subKeywords:
                    if sub in atag.text:
                        db.insert_scrapped_news(atag.text, atag['href'], query)

        # articles after one matching the stored first title are treated as duplicates
        if db.is_latest_news(first_searched_title) == 0:
            db.insert_latest_news(query, first_searched_title)

        # article summaries
        contents_lists = soup.select('ul.type01 dl')
        for contents_list in contents_lists:
            contents_cleansing(contents_list)  # clean up the summary text

        # paging: decide whether to keep crawling
        for page in soup.select(".paging"):
            if "다음페이지" in page.text:
                current_searching_page = current_searching_page + 10
            else:
                have_more_page_to_search = False

    print('finish')
# if the problem the user entered doesn't exist at the level of the contest
# the user entered, stop here
print("Problem '{prob}' doesn't exist in {level}".format(level=level, prob=prob))
exit()

# log in
login.login()

# ------------------------------test part start----------------------------------
# build the URL of the "Tasks" page
tasks_url = "https://atcoder.jp/contests/{level}{round}/tasks".format(level=level, round=round)
# fetch the "Tasks" page
html = config.SESSION.get(tasks_url)
soup = BeautifulSoup(html.text, 'lxml')
a = soup.select('a')
plob_path_map = {}
for ai in a:
    try:
        # read the text of the <a> tag
        text = ai.get_text()
        if text in prob_list:
            # the text is in the problem list, so take the tag's href attribute
            link = ai.attrs['href']
            # map the problem to its link
            plob_path_map[text] = link
    except:
        pass

if prob not in plob_path_map:
    # no link could be found for the problem the user entered
import requests
from bs4 import BeautifulSoup

req = requests.get('https://www.naver.com')
html = req.text
# print(html)
soup = BeautifulSoup(html, 'html.parser')
issues = soup.select(
    '#PM_ID_ct > div.header > div.section_navbar > div.area_hotkeyword.PM_CL_realtimeKeyword_base > div.ah_roll.PM_CL_realtimeKeyword_rolling_base > div > ul > li > a'
)
# print(issues)
for issue in issues:
    print("[" + issue.select_one('span[class="ah_r"]').text + "] "
          + issue.select_one('span[class="ah_k"]').text)
def check_jiekou(self):
    req = requests.get("http://jiekou.xiaomil.com/", headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')
    url_list = soup.select("xiaomil_ul form div lib_3 a")
    for url in url_list:
        print(url.get('href'))
def w_url(href):
    driver = webdriver.Firefox()  # launch Firefox
    driver.set_page_load_timeout(30)
    driver.get(href)  # open the page
    time.sleep(3)
    driver.add_cookie({'name': 'gldjc_sessionid', 'value': '39e0c310-83f6-4fd4-a10e-983392b87cc6',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': True})
    driver.add_cookie({'name': 'location_name', 'value': '%25E5%25B1%25B1%25E4%25B8%259C',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1535531192, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'location_code', 'value': '370000',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1535531192, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': '_gat_gtag_UA_110560299_1', 'value': '1',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1532939793, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'loginUuid', 'value': '39e0c310-83f6-4fd4-a10e-983392b87cc6',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'nTalk_CACHE_DATA', 'value': '{uid:kf_9318_ISME9754_6349427345656906564,tid:1532939192431315}',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'NTKF_T2D_CLIENTID', 'value': 'guest6DE8EBA3-F3AF-F1D9-ECD9-EA4BC870E82E',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1596011199, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': '_ga', 'value': 'GA1.2.1004482802.1532939194',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1596011199, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': '_gid', 'value': 'GA1.2.1482668967.1532939200',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1533025599, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'INFO_PRICE_LOCATION', 'value': '1_1',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1540715203, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'Hm_lvt_727d5904b141f326c9cb1ede703d1162', 'value': '1532939192',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1564475203, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'Hm_lpvt_727d5904b141f326c9cb1ede703d1162', 'value': '1532939203',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'Hm_lvt_82698a74ed862e6a03fc9e4cbac594a6', 'value': '1532939192',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1564475203, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'Hm_lpvt_82698a74ed862e6a03fc9e4cbac594a6', 'value': '1532939203',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': False})
    time.sleep(3)
    driver.refresh()
    # scroll to the bottom step by step so lazy-loaded content renders
    driver.execute_script("""
    (function () {
        var y = document.body.scrollTop;
        var step = 100;
        window.scroll(0, y);
        function f() {
            if (y < document.body.scrollHeight) {
                y += step;
                window.scroll(0, y);
                setTimeout(f, 50);
            } else {
                window.scroll(0, y);
                document.title += "scroll-done";
            }
        }
        setTimeout(f, 1000);
    })();
    """)
    time.sleep(2)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'lxml')
    title = soup.find(attrs={'class': 'highcharts-title'}).text
    print(title)
    # com_names = soup.find_all(class_='data_table')
    wb = workbook.Workbook()  # create the Excel workbook
    ws = wb.active  # the sheet currently being written
    # write the header row into the sheet, as a list
    ws.append(['序号', '名称', '规格型号', '单位', '税率', '除税价(元)', '含税价(元)', '日期', '备注'])
    trs = soup.select("#infoprice_table tr")
    ulist = []
    for tr in range(1, len(trs)):
        ui = []
        for td in trs[tr]:
            ui.append(td)
        ulist.append(ui)
    for i in range(len(ulist)):
        xh = ulist[i][0].text
        mc = ulist[i][1].text
        ggxh = ulist[i][2].text
        dw = ulist[i][3].text
        sl = ulist[i][4].text
        csj = ulist[i][5].text
        result = ulist[i][6].img['src']
        urllib.request.urlretrieve(result, 'D:/YZM/1.png')
        image = Image.open('D:/YZM/1.png')
        hsj = tesserocr.image_to_text(image)  # OCR the tax-inclusive price image
        print(hsj)
        rq = ulist[i][7].text
        bz = ulist[i][8].text
        ws.append([xh, mc, ggxh, dw, sl, csj, hsj, rq, bz])
    print(title)
    wb.save('1.xlsx')
    driver.close()
'''
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(2) > td:nth-child(1) > a
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(4) > td:nth-child(1) > a
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(6) > td:nth-child(1) > a
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(8) > td:nth-child(1) > a
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(42) > td:nth-child(1) > a
/html/body/div[4]/div[2]/div/div[2]/table[3]/tbody/tr[6]/td[1]/a
'''
# href = soup.select('body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-of-type(65) > tbody > tr:nth-child(2) > td:nth-of-type(1) > a')
href = soup.select('body > div.body-wrapper > div.content-wrapper > div > div.main-content > table')

railways_condition = '''
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%811%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁1号线</a>
<a target="_blank" href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%812%E5%8F%B7%E7%BA%BF">北京地铁2号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%814%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁4号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%815%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁5号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%816%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁6号线</a>
<a target="_blank" href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%817%E5%8F%B7%E7%BA%BF">北京地铁7号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%818%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁8号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%819%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁9号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%8110%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁10号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%8113%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁13号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%8115%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁15号线</a>
<a target="_blank" href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%8116%E5%8F%B7%E7%BA%BF">北京地铁16号线</a>
from bs4 import BeautifulSoup
import requests
import csv

url = 'http://www2.copasa.com.br/servicos/qualidadeagua/pesqtel.asp?letra=D&cidade=443&periodoInicial=01%2F2019&periodoFinal=04%2F2020'
html = requests.get(url)
soup = BeautifulSoup(html.text)

tables = soup.select("#mesames table")
for table in tables:
    headers = [th.text.encode("utf-8") for th in table.select("tr th")]
    with open("out.csv", "a") as f:
        wr = csv.writer(f)
        wr.writerow(headers)
        wr.writerows(
            [[td.text.encode("utf-8") for td in row.find_all("td")]
             for row in table.select("tr + tr")]
        )
def D_parse_Set(cls, html):
    soup = BeautifulSoup(html, 'html.parser')
    temp = soup.select(".cy_cosList li div")
    kv_info = [cls.clean1(item) for item in temp]
    return kv_info
# import chardet  # speed up

# Initialize
index_url = 'http://coolshell.cn/page/68'
count = 0
data = pd.DataFrame(columns=('title', 'link', 'reads'))

# Get links
while index_url != 'End':
    index_res = requests.get(index_url)
    index_res.encoding = 'utf-8'
    index_soup = BeautifulSoup(index_res.text, 'html.parser')
    # Get url from index page
    index = index_soup.select('header h2 a')
    for i in index:
        count = count + 1
        data.loc[count] = [i.text, i['href'], '']
    # Go to next link page
    try:
        index_url = index_soup.select('nav .wp-pagenavi .nextpostslink')[0]['href']
    except:
        index_url = 'End'

# Get contents
for data_id in list(range(1, count + 1)):
    page_url = data['link'].loc[data_id]
    page_res = requests.get(page_url)
def parse_Img(cls, html):
    soup = BeautifulSoup(html, 'html.parser')
    temp = soup.select(".tc p")
    info = [cls.clean2(item) for item in temp if cls.clean2(item)]
    return info
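# `clean1` (used in D_parse_Set above) and `clean2` (used in parse_Img) are
# assumed class helpers; a minimal sketch of text-cleaning methods consistent
# with how they are called:
class _CleanHelpers(object):
    @classmethod
    def clean1(cls, item):
        # collapse a node's text into a single whitespace-normalized string
        return ' '.join(item.get_text().split())

    @classmethod
    def clean2(cls, item):
        # like clean1, but return None for empty nodes so the caller's
        # `if cls.clean2(item)` filter can drop them
        text = ' '.join(item.get_text().split())
        return text or None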
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup

url = "https://www.humblebundle.com/books/linux-unix-oreilly-books"
tierDict = {}

resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')

# Bundle Tiers
tiers = soup.select(".dd-game-row")
for tier in tiers:
    # only for headline
    if tier.select(".dd-header-headline"):
        # grab tier name and price
        tiername = tier.select(".dd-header-headline")[0].text.strip()
        # grab tier product names
        productNames = tier.select(".dd-image-box-caption")
        productNames = [prodName.text.strip() for prodName in productNames]
        # add one product tier to our datastructure
        tierDict[tiername] = {"products": productNames}

# old tiers
tierHeadlines = soup.select(".dd-header-headline")
strippedTiernames = [tier.text.strip() for tier in tierHeadlines]

# product Names
""" Scrape the TOC Extract its links and put them in the Pile O' Links """ expand_toc_js = config['toc_js'] print(f"Scraping table of contents: {config['toc_url']}") toc_scrape_result = scraper.scrape( config['toc_url'], wait_for_selector=config['toc_selector'], js=expand_toc_js) # Record the scrape results in included_scraped_urls and redirects mark_url_included(toc_scrape_result['final_url']) redirects[config['toc_url']] = toc_scrape_result['final_url'] soup = BeautifulSoup(toc_scrape_result['html'], 'html.parser') toc_element = soup.select(config['toc_selector'])[0] remove_blacklisted_selectors(toc_element) if config['rewrite_toc']: toc_element = config['rewrite_toc'](toc_element) def is_post_link(tag, post_url_pattern=None): if tag.attrs['href'] is None: return False if tag.attrs['href'].startswith('javascript:'): return False if post_url_pattern is None: # Not filtering TOC links at all return True return re.match(post_url_pattern, tag.attrs['href']) is not None
for i in range(19):
    if i == 2 or i == 8:
        continue
    # open the ranking page
    driver.get(url + str(i + 1))
    for j in range(10):  # repeat for ranks 1-10 in each genre
        # use an xpath to click the movie at rank j+1 (clicking is the only way
        # to reach the page that holds the movie details)
        driver.find_element_by_xpath("//*[@id='old_content']/table/tbody/tr[" + str(j + 2) + "]/td[2]/div/a").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # outermost selector for the movie details that must be extracted
        Long_movie_infos = soup.select('#content > div.article')
        # set the output format (does this for-loop actually matter?)
        for Long_movie_info in Long_movie_infos:
            title = '<영화제목>' + '\n' + str(
                Long_movie_info.select_one('div.mv_info_area > div.mv_info > h3 > a:nth-child(1)').text) + ' (' + str(
                Long_movie_info.select_one('div.mv_info_area > div.mv_info > strong').text) + ')'
            print(title + '\n')
            poster = '<영화포스터>' + '\n' + str(Long_movie_info.select_one('img').attrs['src']).replace('//', '')
            print(poster + '\n')
            director = '<감독>' + '\n' + str(
                Long_movie_info.select_one('div.mv_info_area > div.mv_info > dl > dd:nth-child(4) > p > a').text)
            print(director + '\n')
def get_item_info(item_url_queue, item_info_queue, header):
    while True:
        while item_url_queue.empty():
            time.sleep(0.01)
        tmp_links = item_url_queue.get()
        if tmp_links == "#END#":
            # hit the end marker, leave the worker
            print("get_item_info Quit {}".format(item_url_queue.qsize()))
            print("items left in queue: " + str(item_info_queue.qsize()))
            break
        else:
            print("start fetching " + str(tmp_links))
            r = requests.get(tmp_links, headers=header)
            while r.status_code != 200:
                time.sleep(10)
                print(r.status_code)
                print("refetching " + str(tmp_links))
                r = requests.get(tmp_links, headers=header)
            r.encoding = 'utf-8'
            html = r.text
            soup = BeautifulSoup(html, "lxml")
            try:
                item_list = soup.select("li.ws-g.DetailVariant")
                title = soup.find('h1').string.strip()
                times = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
                for i in item_list:
                    cigar_name = i.find('div', attrs={'class': "ws-u-1 DetailVariant-variantName"}).find(text=True).strip()
                    pricelist = i.select('div.ws-u-1-3.ws-u-lg-1-4.DetailVariant-formPrice > span.preis')
                    numslist = i.find_all('span', attrs={'class': re.compile(r'einheitlabel')})
                    tmp_itemurl = i.find('a', attrs={'class': 'ws-u-1 ws-u-lg-4-24 DetailVariant-col DetailVariant-image'})['href']
                    itemurl = 'https://www.cigarworld.de' + tmp_itemurl
                    if len(pricelist) == len(numslist):
                        for j in range(len(pricelist)):  # renamed from `i` to stop shadowing the outer loop variable
                            tmp_name = str(cigar_name)
                            price = pricelist[j].text.replace("€", "").strip()
                            tmp_nums = numslist[j].text
                            tmp_stock = numslist[j].get('title').strip()
                            if tmp_stock:
                                stock = tmp_stock
                            else:
                                stock = "in stock"
                            # nums = re.sub(r'\D', "", tmp_nums)
                            nums = tmp_nums
                            name = title + " " + tmp_name + ' ' + str(nums)
                            details = '0'
                            detailed = price
                            cigarinfo = {
                                'title': title,
                                'cigar_name': name,
                                'detailed': detailed,
                                'stock': stock,
                                'details': details,
                                'cigar_price': price,
                                'itemurl': itemurl,
                                'times': times}
                            item_info_queue.put(cigarinfo)
                    else:
                        print("price/unit count mismatch " + tmp_links)
            except Exception as err:
                print(str(tmp_links) + " item fetch error")
                print(err)
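# The worker above expects producer/consumer queues; a sketch of how it might
# be wired up with multiprocessing (queue names, URLs, and the single-worker
# setup are assumptions, not the original driver code):
from multiprocessing import Process, Queue

if __name__ == '__main__':
    item_url_queue = Queue()
    item_info_queue = Queue()
    header = {'User-Agent': 'Mozilla/5.0'}  # placeholder headers
    for link in ['https://www.cigarworld.de/example-item']:  # hypothetical URLs
        item_url_queue.put(link)
    item_url_queue.put("#END#")  # sentinel the worker quits on
    worker = Process(target=get_item_info,
                     args=(item_url_queue, item_info_queue, header))
    worker.start()
    worker.join()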
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
</tbody>
</table>
"""

soup = BeautifulSoup(html, 'lxml')

# 1. get all tr tags
# trs = soup.select('tr')
# print(trs)

# 2. get the 2nd tr tag
# tr = soup.select('tr')[1]
# print(tr)

# 3. get all tr tags whose class equals "even"
# tr = soup.select('.even')
# print(tr)
# tr = soup.select('tr[class="even"]')
# print(tr)

# 4. get the href attribute of every a tag
# alist = soup.select('a')
# for a in alist:
#     href = a['href']
#     print(href)

# 5. get all the job information (plain text)
trs = soup.select('tr')
for tr in trs:
    info = list(tr.stripped_strings)
    print(info)
def get_artical_detail(self, artical_href, dict_artcical):
    # time.sleep(0.1)
    # the URL of the article detail page depends on three parameters in the href
    parameters = {
        'DbCode': '',
        'DbName': '',
        'FileName': '',
    }
    pattern_DbCode = re.compile(r'.*?[dD]b[cC]ode=\s?(.*?)&')
    pattern_DbName = re.compile(r'.*?[dD]b[nN]ame=\s?(.*?)&')
    pattern_FileName = re.compile(r'.*?[fF]ile[nN]ame=\s?(.*?)&')
    parameters['DbCode'] = re.search(pattern_DbCode, artical_href).group(1)
    parameters['DbName'] = re.search(pattern_DbName, artical_href).group(1)
    parameters['FileName'] = re.search(pattern_FileName, artical_href).group(1)
    print('FileName=' + parameters['FileName'])
    req = requests.get(GET_ARTICAL_DETAIL_URL, params=parameters, headers=my_parameters.headers_kns)
    # once the detail page loads, collect the article keywords
    soup = BeautifulSoup(req.text, 'lxml')
    keyword = []
    try:
        keyword_list = soup.find('label', attrs={'id': 'catalog_KEYWORD'}).parent.find_all('a')
        for item in keyword_list:
            keyword.append(item.text.strip(';\r\n\t '))
    except:
        pass
    # store the keywords as a list in the per-article record
    dict_artcical['关键词'] = keyword
    # abstract
    try:
        summary = soup.find('span', attrs={'id': 'ChDivSummary'}).text
    except:
        summary = "kong"
    dict_artcical['摘要'] = summary
    # similar literature
    parameters.update({
        'curdbcode': 'CJFQ',
        'reftype': '604',
        'catalogId': 'lcatalog_func604',
        'catalogName': '相似文献',
    })
    ajax_url = 'https://kns.cnki.net/kcms/detail/frame/asynlist.aspx?'
    dict_artcical['相似文献'] = self.find_ajax(ajax_url, parameters)
    # reader recommendations
    parameters.update({
        'curdbcode': 'CJFQ',
        'reftype': '605',
        'catalogId': 'lcatalog_func605',
        'catalogName': '读者推荐',
    })
    dict_artcical['读者推荐'] = self.find_ajax(ajax_url, parameters)
    # composite and comprehensive impact factors
    parameters_fators = {
        'pcode': '',
        'pykm': '',
    }
    infomation = soup.select('.sourinfo .title a')
    pattern = re.compile(r'.*?\(\'(.*?)\',\'(.*?)\',\'(.*?)\',\'(.*?)\'\);')
    parameters_fators['pcode'] = pattern.search(str(infomation)).group(2)
    parameters_fators['pykm'] = pattern.search(str(infomation)).group(4)
    if parameters_fators['pykm'] in Journal_Point.keys():
        dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = Journal_Point[parameters_fators['pykm']][0:2]
    else:
        try:
            dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = self.get_Impact_Factor2(parameters_fators)
        except:
            dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = 0, 0
            print("not found")
    # cache the journal's impact factors under its code; the original condition
    # built a bare tuple, which is always truthy
    if (dict_artcical['复合影响因子'], dict_artcical['综合影响因子']) != (0, 0):
        Journal_Point[parameters_fators['pykm']] = [
            dict_artcical['复合影响因子'], dict_artcical['综合影响因子']
        ]
set desktop picture to POSIX file "%s"
end tell
END"""

dt = datetime.datetime.now()
cd = str(dt.year) + '0' + str(dt.month) + str(dt.day)
while True:
    dt = datetime.datetime.now()
    if (dt.hour == 0 and dt.minute == 2 and dt.second == 0) or (dt.hour == 15 and dt.minute == 0 and dt.second == 0):
        os.makedirs('Bing', exist_ok=True)
        url = "http://bingwallpaper.com/"
        sc = requests.get(url)
        soup = BeautifulSoup(sc.text, 'lxml')  # check lxml?
        print(sc.text)
        image = soup.select('.cursor_zoom img')
        image_url = image[0].get('src')
        response = requests.get(image_url)
        with open(os.path.join('Bing', cd + '.jpg'), 'wb') as file:
            file.write(response.content)
        # change desktop background
        # os.system('gsettings set org.gnome.desktop.background picture-uri file:///home/radioactive/Bing/' + cd + '.jpg')
        file_path = '/Users/asmita.mitra/PythonScripts/crawlers/Bing/' + cd + '.jpg'
        subprocess.Popen(SCRIPT % file_path, shell=True)
        print('Wallpaper set to ' + file_path)
        break
sys.exit()
def _get_confirmation_trade_offer_id(confirmation_details_page: str) -> str:
    soup = BeautifulSoup(confirmation_details_page, 'html.parser')
    full_offer_id = soup.select('.tradeoffer')[0]['id']
    return full_offer_id.split('_')[1]
# url = "http://www.ebay.com/sch/" + word.rstrip('\r\n')
url = "http://www.ebay.com/sch/" + "Actnovate"

# Check the response for URL connectivity and read the content of the URL
page = requests.get(url)
response = page.status_code
content = page.content
# print response  # 200 = success response
# print content

soup = BeautifulSoup(content, 'html.parser')
# print soup
# soup1 = soup.find_all('a', class_='vip')[0].get_text()
# soup2 = soup.find_all('a', class_='vip')
hrefs = [d["href"] for d in soup.select(".lvtitle a")]
# print hrefs
for link in hrefs:
    # Check the response for URL connectivity and read the content of the URL
    sub_page = requests.get(link)
    sub_response = sub_page.status_code
    sub_content = sub_page.content
    # print sub_content
    sub_soup = BeautifulSoup(sub_content, 'html.parser')
    # print sub_soup
    title = sub_soup.find_all('span', id='vi-lkhdr-itmTitl')[0].get_text()
    print title
    price = sub_soup.find_all('span', id='prcIsum')[0].get_text()
def start(company_code, start_date, end_date):
    print(f"company_code: {company_code} news crawling started")
    mkdir(company_code)
    unique_news_titles = set()
    page = 1
    processing_date = end_date
    while True:
        url = ('https://finance.naver.com/item/news_news.nhn?code=' + str(company_code) +
               '&page=' + str(page))
        source_code = requests.get(url).text
        html = BeautifulSoup(source_code, "lxml")
        dates = [datetime.datetime.strptime(date.get_text(), ' %Y.%m.%d %H:%M').date()
                 for date in html.select('.date')]
        titles = [re.sub('\n', '', str(title.get_text())) for title in html.select('.title')]
        links = ['https://finance.naver.com' + link.find('a')['href']
                 for link in html.select('.title')]
        flag = True
        result_date = []
        result_title = []
        result_contents = []
        for row in list(zip(dates, titles, links)):
            date = row[0]
            title = row[1]
            link = row[2]
            if date > end_date:
                continue
            if title in unique_news_titles:
                continue
            unique_news_titles.add(title)
            source_code = requests.get(link).text
            html = BeautifulSoup(source_code, "lxml")
            contents = str(html.select("div#news_read"))
            contents.find("<span")
            a = contents.find("<a")
            contents = remove_filename(contents[0:a])
            if processing_date != date:
                # the date changed while reading articles row by row
                result = {"날짜": result_date, "기사제목": result_title, "본문내용": result_contents}
                df_result = pd.DataFrame(result)
                df_result.to_csv(
                    f"./{NEWS_DIR}/{company_code}/{company_code}_{str(processing_date)[:10]}.csv",
                    mode='w', encoding='utf-8-sig')
                processing_date = date
                result_date.clear()
                result_title.clear()
                result_contents.clear()
            if start_date > date:
                # the article being read is older than the requested date range
                flag = False
                break
            result_date.append(date)
            result_title.append(title)
            result_contents.append(contents)
        if not flag:
            break
        # print(f"company_code: {company_code}, processing_date: {processing_date}, finished page: {page}")
        page += 1
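# `mkdir` and `remove_filename` are called above but defined elsewhere; a
# sketch of plausible implementations (NEWS_DIR is assumed to be a
# module-level constant):
import os
import re

def mkdir(company_code):
    # create the per-company output directory if it is missing
    os.makedirs(f"./{NEWS_DIR}/{company_code}", exist_ok=True)

def remove_filename(contents):
    # strip markup noise from the raw article HTML; the original cleanup
    # rules are unknown, so this sketch only drops tags
    return re.sub('<[^>]+>', '', contents).strip()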
def GetEachContents(driver, EachUrl):
    EmptyFolder = 0
    url = baseUrl + str(EachUrl)
    driver.get(url)
    driver.find_element(By.CSS_SELECTOR, "._97aPb > div:nth-child(1)").click()
    sleep(2)
    # folder path to save into
    f_url = ProjectFolder + '/MobileTest_img/' + EachUrl[3:]
    # skip if the folder already exists
    if os.path.isdir(f_url):
        print('this post has already been downloaded.')
        return EmptyFolder
    # otherwise create the folder
    else:
        os.mkdir(f_url)
    print(f_url + ' is being processed.')
    image = list()
    # FirstOne = True
    if (driver.find_elements_by_css_selector(".coreSpriteRightChevron")
            or driver.find_elements(By.CLASS_NAME, "vi798")):
        while True:
            pageString = driver.page_source
            soup = BeautifulSoup(pageString, "lxml")
            LiTagList = soup.select(".FFVAD")
            LiTagList += soup.select(".tWeCl")
            if len(LiTagList) == 0:
                driver.get(url)
                continue
            try:
                for LiTag in LiTagList:
                    image.append(LiTag.attrs['src'])
            except KeyError as keyerr:
                print(keyerr)
                print(LiTag)
                print("!!!!!!!KEYERROR!!!!!!!!!")
                driver.get(url)
                continue
            if driver.find_elements_by_css_selector(".coreSpriteRightChevron"):
                driver.find_element_by_css_selector(".coreSpriteRightChevron").click()
                sleep(1)
                print("click")
            else:
                break
    else:
        while True:
            pageString = driver.page_source
            soup = BeautifulSoup(pageString, "lxml")
            EachContent = soup.select(".FFVAD") + soup.select(".tWeCl")
            if len(EachContent) == 0:
                driver.get(url)
                continue
            else:
                image.append(EachContent[0].attrs['src'])
                break
    cnt = 0
    image = list(set(image))
    for img in image:
        cnt += 1
        if len(image) > 1:
            if ".mp4" in img:
                urllib.request.urlretrieve(img, f_url + str(cnt) + ".mp4")
            else:
                urllib.request.urlretrieve(img, f_url + str(cnt) + ".jpg")
        else:
            if ".mp4" in img:
                urllib.request.urlretrieve(img, f_url + str(cnt) + ".mp4")
            else:
                urllib.request.urlretrieve(img, f_url + str(cnt) + ".jpg")
    print(str(cnt) + " content files were saved to the folder.")
    if image == []:
        EmptyFolder += 1
    print("------------------------------")
    return EmptyFolder
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.dbsparta

naver_movie = 'https://movie.naver.com/movie/running/current.nhn'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
data = requests.get(naver_movie, headers=headers)

# hand the HTML to BeautifulSoup so it is easy to query
soup = BeautifulSoup(data.text, 'html.parser')

movies = soup.select('.lst_detail_t1 > li')
# print(movies)

for movie in movies:
    if movie.select('dt.tit'):  # select() returns a list, never None, so test truthiness
        for title_info in movie.select('dt.tit'):
            title = title_info.a.text
            link = naver_movie + title_info.a.attrs['href']
            # print(title)
        director = ', '.join(
            [d.text for d in movie.select('span.link_txt')[1].select('a')])
        if len(movie.select('span.num')) > 1:
            rate = movie.select('span.num')[1].text
            # print(rate)
        img = movie.select('div.thumb > a > img')[0].attrs['src'].split('?')[0]
from bs4 import BeautifulSoup
import requests

driver = webdriver.Chrome("<webdriver>")
driver.implicitly_wait(3)
driver.get("<url>")
driver.find_element_by_name('authUser').send_keys('<id>')
driver.find_element_by_name('authPass').send_keys('<password>')
driver.find_element_by_css_selector('.uxd-btn').click()  # click the button
driver.get("<review url>")
driver.find_element_by_css_selector('#regNothanksLink').click()  # click the button

text_file = open("Output.csv", "w")
for i in range(0, 100000):  # posts
    k = "<review url>" + str(i)
    driver.get(k)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # notices = soup.select('p.vproductlist')
    notices = soup.select('span.vproductListItem')
    if "reviews/review/view" in requests.request("GET", k).url:
        for j in range(len(notices)):
            text_file.write(notices[j].text.strip())
            text_file.write("{0}\n".format(i))
    else:
        pass
text_file.close()
# assumes module-level imports: sys, os, re, json, pprint, requests,
# BeautifulSoup (bs4), and datetime (from datetime import datetime)
def crawlData(self, MAX_PAGE=int(sys.argv[1])):
    # initialization (note: the sys.argv default is evaluated once, at definition time)
    END_NUMBER = MAX_PAGE * 20 + 1
    QUERY_SET = {
        "title": "div.goods_info > div.goods_name > a",
    }
    PP = pprint.PrettyPrinter(indent=4, sort_dicts=False)
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    result_data = {}
    for info_number in range(1, END_NUMBER):
        bookinfo_key = "bookinfo" + str(info_number)
        result_data[bookinfo_key] = {}

    # crawling
    for page in range(1, MAX_PAGE + 1):
        req = requests.get(
            "http://www.yes24.com/24/Category/More/001001044?ElemNo=104&ElemSeq=1&PageNumber="
            + str(page)
        )
        html = req.text
        soup = BeautifulSoup(html, "lxml")
        suffix_info_number = (page - 1) * 20 + 1
        result = soup.select(QUERY_SET["title"])

        # title, URL
        for item in result:
            if len(item.text) > 0:
                bookinfo_key = "bookinfo" + str(suffix_info_number)
                result_data[bookinfo_key]["title"] = item.text
                result_data[bookinfo_key]["url"] = (
                    "https://www.yes24.com" + item.attrs["href"]
                )
                result_data[bookinfo_key]["rank"] = suffix_info_number
                suffix_info_number = suffix_info_number + 1
        suffix_info_number = (page - 1) * 20 + 1

        # author, publisher, publish_date, right_price, sales_price, isbn, page
        for number in range(suffix_info_number, suffix_info_number + 20):
            bookinfo_key = "bookinfo" + str(number)
            try:
                print("Processing " + result_data[bookinfo_key]["title"] + " ...")
                req = requests.get(result_data[bookinfo_key]["url"])
            except KeyError:  # no title collected for this slot: the last page ended early
                break
            html = req.text
            soup = BeautifulSoup(html, "lxml")
            result_data[bookinfo_key]["publisher"] = soup.select(
                "#yDetailTopWrap > div.topColRgt > div.gd_infoTop > span.gd_pubArea > span.gd_pub > a"
            )[0].text
            result_data[bookinfo_key]["publish_date"] = soup.select(
                "#yDetailTopWrap > div.topColRgt > div.gd_infoTop > span.gd_pubArea > span.gd_date"
            )[0].text
            result_data[bookinfo_key]["right_price"] = make_integer_from_string(
                soup.select(
                    "#yDetailTopWrap > div.topColRgt > div.gd_infoBot > div.gd_infoTbArea > div > table > tbody > tr > td > span > em"
                )[0].text.replace(",", "")
            )
            result_data[bookinfo_key]["sales_price"] = int(
                result_data[bookinfo_key]["right_price"] * 0.9
            )
            result_data[bookinfo_key]["isbn"] = make_integer_from_string(
                soup.select(
                    "#infoset_specific > div.infoSetCont_wrap > div > table > tbody > tr:nth-of-type(3) > td"
                )[0].text
                # //*[@id="infoset_specific"]/div[2]/div/table/tbody/tr[3]/td
            )
            # try...except handles books whose page count still shows as "being verified"
            try:
                result_data[bookinfo_key]["page"] = make_integer_from_string(
                    re.findall(
                        r"\d+쪽",  # "<n> pages"
                        soup.select("#infoset_specific > div.infoSetCont_wrap")[0].text,
                    )[0]
                )
            except IndexError:
                result_data[bookinfo_key]["page"] = -1
            result_data[bookinfo_key]["sales_point"] = (
                make_integer_from_string(
                    soup.select("span.gd_ratingArea > span.gd_sellNum")[0].text.replace(",", "")
                )
                or "none"
            )

            # author: fall back to <span> for names rendered without links
            # (select() returns an empty list rather than raising, so an if/else
            # is the right guard here, not try/except)
            result_data[bookinfo_key]["author"] = []
            authors = soup.select(
                "#yDetailTopWrap > div.topColRgt > div.gd_infoTop > span.gd_pubArea > span.gd_auth > a"
            )
            if not authors:
                authors = soup.select(
                    "#yDetailTopWrap > div.topColRgt > div.gd_infoTop > span.gd_pubArea > span.gd_auth > span"
                )
            for author in authors:
                result_data[bookinfo_key]["author"].append(author.text)

            # tags (an empty select() result is simply a no-op to iterate)
            result_data[bookinfo_key]["tags"] = []
            tags = soup.select(
                "#infoset_goodsCate > div.infoSetCont_wrap > dl > dd > ul > li > a"
            )
            tags2 = soup.select("span.tag > a")
            for tag in tags:
                result_data[bookinfo_key]["tags"].append(tag.text)
            for tag in tags2:
                result_data[bookinfo_key]["tags"].append(tag.text)

    with open(
        "yes24_" + datetime.today().strftime("%Y_%m%d_%H%M_%S") + ".json",
        "w",
        encoding="UTF-8",
    ) as outfile:
        json.dump(result_data, outfile, ensure_ascii=False)
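The method calls make_integer_from_string throughout, but the helper is not defined in this excerpt. A plausible sketch, assuming it just strips non-digit characters (the '쪽' suffix, stray commas) before converting, with -1 as the original's fallback sentinel:

import re

def make_integer_from_string(s):
    # keep only the digits, e.g. "456쪽" -> 456, "9,788" -> 9788 (assumed behavior)
    digits = re.sub(r"\D", "", s)
    return int(digits) if digits else -1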
import requests
from bs4 import BeautifulSoup

target_url = 'https://www.google.com/search?q=english&hl=en&lr=lang_en'
r = requests.get(target_url)
soup = BeautifulSoup(r.content, 'html.parser')

# .vvjwJb / .UPmit are Google's obfuscated result classes; they rotate over time
titles = [i.string for i in soup.select('.vvjwJb')]
# the visible URL is rendered as "example.com › path", so rebuild it with slashes
urls = [
    ''.join(i.string.split(' ')).replace('›', '/')
    for i in soup.select('.UPmit')
]
print(titles)
print(urls)
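Google frequently serves different, stripped-down markup to clients without a browser-like User-Agent, in which case the selectors above match nothing. A hedged variant that sends one (the header value is illustrative, and the class names may still have rotated since this was written):

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
r = requests.get(target_url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
print([el.get_text() for el in soup.select('.vvjwJb')])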
# (fragment: the head of fma_Crawling is missing from this excerpt; the tail
# below builds one numbered entry per track and returns the page's dict)
        genre = div.find('div', {'class': 'playtxt'}).find('span', {'class': 'ptxt-genre'}).text
        temp_dict[str(i + 1)] = {'artist': str(artist),
                                 'track': str(track),
                                 'album': str(album),
                                 'genre': str(genre)}
    return temp_dict

def toJson(fma_dict):
    with open('{}_chart.json'.format(genre), 'w', encoding='utf-8') as file:
        json.dump(fma_dict, file, ensure_ascii=False, indent='\t')

fma_dict = {}
req1 = requests.get('https://freemusicarchive.org/genre/{}/?sort=track_date_published&d=1&page=1&per_page=200/'.format(genre))
source1 = req1.text
html2 = BeautifulSoup(source1, 'lxml')
# the 7th pagination link holds the last page number
final_page2 = html2.select('a[href^="https://freemusicarchive.org/genre/{}/?sort=track_date_published&d=1&page="]'.format(genre))
final_page = int(final_page2[6].text)
# the total track count sits in the third <b> of the pagination summary
final_song2 = html2.find('div', {'class': 'pagination-full'}).find_all("b")
final_song = int(final_song2[2].text)

for page in range(1, final_page + 1):
    req = requests.get('https://freemusicarchive.org/genre/{}/?sort=track_date_published&d=1&page={}&per_page=200'.format(genre, page))
    source = req.text
    html = BeautifulSoup(source, 'lxml')
    fma_dict = dict(fma_dict, **fma_Crawling(html, page))

toJson(fma_dict)
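A hypothetical reconstruction of the missing head of fma_Crawling, inferred from the 'playtxt' / 'ptxt-genre' pattern in the surviving tail; every class name other than those two is a guess and would need checking against the actual page:

def fma_Crawling(html, page):
    temp_dict = {}
    # 'play-item', 'ptxt-artist', 'ptxt-track', 'ptxt-album' are guessed classes
    for i, div in enumerate(html.find_all('div', {'class': 'play-item'})):
        artist = div.find('span', {'class': 'ptxt-artist'}).text.strip()
        track = div.find('span', {'class': 'ptxt-track'}).text.strip()
        album = div.find('span', {'class': 'ptxt-album'}).text.strip()
        genre = div.find('div', {'class': 'playtxt'}).find('span', {'class': 'ptxt-genre'}).text
        # offsetting the key by page avoids later pages overwriting earlier ones
        # in the dict(fma_dict, **...) merge (the original keys by str(i + 1)
        # alone, which repeats on every page)
        temp_dict[str((page - 1) * 200 + i + 1)] = {'artist': artist,
                                                    'track': track,
                                                    'album': album,
                                                    'genre': genre}
    return temp_dict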
import requests as rq
from bs4 import BeautifulSoup

base_url = 'https://pjt3591oo.github.io'
page_path = '/page%d'
page = 2

# first page lives at the bare base URL
res = rq.get(base_url)
soup = BeautifulSoup(res.content, 'lxml')
posts = soup.select('body main.page-content div.wrapper div.home div.p')
for post in posts:
    title = post.find('h3').text.strip()
    descript = post.find('h4').text.strip()
    author = post.find('span').text.strip()
    print(title, descript, author)

# subsequent pages live at /page2, /page3, ... until a non-200 response
while True:
    sub_path = page_path % (page)
    page += 1
    res = rq.get(base_url + sub_path)
    if (res.status_code != 200):
        break
    soup = BeautifulSoup(res.content, 'lxml')
    posts = soup.select('body main.page-content div.wrapper div.home div.p')
    for post in posts:
        title = post.find('h3').text.strip()
        descript = post.find('h4').text.strip()
        author = post.find('span').text.strip()
        print(title, descript, author)
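The two identical parse-and-print blocks could be folded into one helper. A minimal sketch of the same crawl as a generator, using the same selectors and the same non-200 stop condition:

def iter_posts():
    page = 1
    while True:
        path = '' if page == 1 else page_path % page
        res = rq.get(base_url + path)
        if res.status_code != 200:
            break
        soup = BeautifulSoup(res.content, 'lxml')
        for post in soup.select('body main.page-content div.wrapper div.home div.p'):
            yield (post.find('h3').text.strip(),
                   post.find('h4').text.strip(),
                   post.find('span').text.strip())
        page += 1

for title, descript, author in iter_posts():
    print(title, descript, author)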
def _11_shopping(lst):
    for i in gender:
        url = ("http://www.11st.co.kr/browsing/BestSeller.tmall?method=getBestSellerMain&cornerNo=2&dispCtgrNo="
               + switch_site(i))
        custom_header = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
        }
        req = requests.get(url, headers=custom_header)
        html = BeautifulSoup(req.text, "html.parser")
        items = html.select("p")

        allword = [item.text.strip() for item in items]
        del allword[:10]  # the first 10 <p> texts are page chrome, not product names

        # split product names into words, treating brackets/slashes as separators
        keyword = []
        for j in allword:
            r_j = (j.replace('[', ' ').replace(']', ' ').replace('/', ' ')
                    .replace('(', ' ').replace(')', ' '))
            keyword.extend(r_j.split(" "))
        keyword = ' '.join(keyword).split()  # drops the empty strings

        word_count = counter(keyword)  # presumably collections.Counter, imported elsewhere
        word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)

        # drop the gender words themselves ('male', 'man', 'female', 'woman');
        # filtering into a new list avoids the index drift that deleting
        # while iterating caused in the original
        banlist = ['남성', '남자', '여성', '여자']
        word_count = [j for j in word_count if j[0] not in banlist]

        # keep only the top 20
        keyword2 = word_count[:20]

        k = 1
        for j in keyword2:
            values1 = (str(k), j[0], str(swithch_gender(i)), str(j[1]))
            query1 = ("insert into _11_shopping (rank,keyword,date_,gender,score) "
                      "values(%s,%s,cast(now() as char),%s,%s)")
            curs.execute(query1, values1)
            k = k + 1
    return 0
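switch_site and swithch_gender (spelled as the caller spells it) are not defined in this excerpt. A plausible sketch, assuming the global gender iterable holds keys like 'man'/'woman'; the dispCtgrNo values below are placeholders, not the real 11st category numbers:

def switch_site(g):
    # hypothetical mapping: gender key -> 11st dispCtgrNo (placeholder values)
    return {'man': '1001295', 'woman': '1001296'}.get(g, '')

def swithch_gender(g):
    # hypothetical mapping: gender key -> label stored in the DB
    return {'man': 'male', 'woman': 'female'}.get(g, 'unknown')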
import re

import requests
from bs4 import BeautifulSoup

from akshare.obor.get_countries_from_invest_web import get_countries_url

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'
}

web_site = get_countries_url()
for item_1 in web_site.iloc[:, 0]:
    # item_1 = web_site.iloc[:, 0][0]
    url = 'https://cn.investing.com' + item_1
    res = requests.post(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    title = soup.select('title')[0].get_text().split('-')[0].strip().split('_')[0]
    if title == "科威特股市指数":  # skip the Kuwait stock index
        continue
    # the first row of table #cr1 links to the country's main index
    index_link = soup.find_all(attrs={'id': 'cr1'})[0].find_all(
        attrs={'class': 'bold left noWrap elp plusIconTd'})[0].select('a')[0]
    useful_web = index_link['href']
    useful_title = index_link['title']
    url = 'https://cn.investing.com' + useful_web + '-historical-data'
    res = requests.post(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    # the page embeds the numeric ids needed for historical-data requests
    data = soup.find_all(text=re.compile('window.histDataExcessInfo'))[0].strip()
    para_data = re.findall(r'\d+', data)
    start_date = '2000/01/01'
    end_date = '2019/10/17'
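A short note on what para_data holds: the embedded script is typically of the form window.histDataExcessInfo = {pairId: ..., smlId: ...}, so the digit scan yields those two ids in order. A sketch of unpacking them into named variables (the ordering assumption follows akshare's usage of this page and is worth verifying against the raw script text):

# assumed: para_data[0] is the pair id, para_data[1] the sml id
pair_id, sml_id = para_data[0], para_data[1]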
from __future__ import print_function

import sys
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

browser = webdriver.PhantomJS(executable_path='/usr/bin/phantomjs')
# browser = webdriver.Firefox()
browser.get(
    "https://www.agoda.com/pages/agoda/default/DestinationSearchResult.aspx?asq=zWuVSTFwAmUZtJhrjzSYy5ufa9Vwpz6XltTHq4n%2B9gMYSfr7u1CU1i2lx00TDWH67lxWsQ6v%2FrbtGwzAUB%2FtOU%2FdDeCkxleINu%2BSBVhHZM%2BIpGI3GSP9dWr%2F8u9MCc9T2OGPRUf%2FnqWVFuWaH2y7CrS7mFrDxsW1r6%2BWtQtj5qO6pb0fC98X0j%2F7ua2%2FHygyWaTGybgLZnzu83SuX64zYXSk%2FM8eVuQYqDHVLhv%2F6oNjjoTmpFlSkVcSfnu9ryzz4KE%2FoYnM%2Fefy83sE%2FJDBPA%3D%3D&city=4951&cid=1732641&tag=41460a09-3e65-d173-1233-629e2428d88e&gclid=Cj0KEQjwxbDIBRCL99Wls-nLicoBEiQAWroh6uLlQWnHWRlc9Euu6Pg_XC1NRtBzj5Yb8HkVs-MjQLMaAigh8P8HAQ&tick=636295974842&txtuuid=c48ab805-f9ed-45d4-bb4a-2377625889d9&pagetypeid=103&origin=TW&aid=81837&userId=5fcd3f05-8c16-4426-acdf-5ee6bb07f69f&languageId=20&sessionId=xhjywu5gunsz0c5oexhquovf&storefrontId=3&currencyCode=TWD&htmlLanguage=zh-tw&trafficType=User&cultureInfoName=zh-TW&textToSearch=%E5%8F%B0%E5%8C%97%E5%B8%82&guid=c48ab805-f9ed-45d4-bb4a-2377625889d9&isHotelLandSearch=true&checkIn=2017-05-14&checkOut=2017-05-15&los=1&rooms=1&adults=2&children=0&ckuid=5fcd3f05-8c16-4426-acdf-5ee6bb07f69f&priceCur=TWD&hotelReviewScore=5"
)
soup = BeautifulSoup(browser.page_source, "html.parser")
# keep paging while a "next" button is present on the page
while len(soup.select('.btn.btn-right')) > 0:
    for ele in soup.select('.hotel-info h3'):
        print(ele.text)
        # print(ele.text.encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding))
    browser.find_element_by_id("paginationNext").click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, "html.parser")
browser.close()
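Rather than the fixed time.sleep(3) after each click, Selenium's explicit waits poll until the next page's content is actually present. A minimal sketch of the same click-and-reparse step (reusing the .hotel-info selector from the loop above; 10 seconds is an arbitrary timeout):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

browser.find_element_by_id("paginationNext").click()
# wait up to 10s for hotel cards to appear instead of sleeping blindly
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".hotel-info h3"))
)
soup = BeautifulSoup(browser.page_source, "html.parser")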