Example #1
    def extract_packt_free_book(content, encoding='utf-8'):
        if hasattr(content, 'read'):    # file-like object
            content = content.read()
        if isinstance(content, bytes):  # convert to str
            content = content.decode(encoding)

        # Extracting information with html parser
        page = BeautifulSoup(content, 'html.parser')
        dealoftheday = page.select_one(
            '#deal-of-the-day div div div:nth-of-type(2)')

        if not dealoftheday:
            return None

        book = util.AttributeDict()
        try:
            book['name'] = dealoftheday.select_one(
                'div:nth-of-type(2) h2').text.strip()
            book['summary'] = dealoftheday.select_one(
                'div:nth-of-type(3)').text.strip()
            book['expires'] = int(dealoftheday.select_one(
                'span.packt-js-countdown').attrs['data-countdown-to']
            )
            image_source = page.select_one(
                '#deal-of-the-day > div > div > '
                'div.dotd-main-book-image.float-left > a > img'
            ).attrs.get('data-original', None)
            if image_source and image_source.startswith('//'):
                image_source = 'https:{0}'.format(image_source)
            book['cover'] = image_source
            return book
        except Exception:
            return None
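A minimal, self-contained sketch of the same defensive pattern (illustrative toy markup, not the real Packt page): select_one returns None when nothing matches, which is why the code above checks dealoftheday and wraps the detail lookups in try/except.

from bs4 import BeautifulSoup

html = ('<div id="deal"><div><h2> Deep Learning </h2></div>'
        '<span class="packt-js-countdown" data-countdown-to="1700000000"></span></div>')
page = BeautifulSoup(html, 'html.parser')

deal = page.select_one('#deal')
if deal:
    name = deal.select_one('h2').text.strip()                                        # 'Deep Learning'
    expires = int(deal.select_one('span.packt-js-countdown')['data-countdown-to'])   # 1700000000
    print(name, expires)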
Example #2
def article_crawler(url):
    """
    Crawls article url, and extract fields

    args:
        url <str>: article url

    return:
        article_dict <dict>: article dict with fields
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html5lib')

    article = {}
    article['title']   = soup.select_one('#h1').text.replace('\u3000',' ').strip()
    article['dt']      = soup.select_one('div.gggs > time').text.strip()
    article['content'] = soup.select_one('#summary').text.strip()

    if soup.select_one('div.urcc > a.function_icon.clicked'):
        article['view_count'] = int(re.findall(r'\d+', soup.select_one('div.urcc > a.function_icon.clicked').text)[0])
    else:
        article['view_count'] = 0
    writer.writerow(article)
    f.flush() # Flush here so we don't lose data on exception
    return article
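The module-level f and writer used above are defined elsewhere in the original script; a hypothetical setup (an assumption for illustration, not the author's code) that makes the function runnable:

import csv

f = open('articles.csv', 'w', newline='', encoding='utf-8')
writer = csv.DictWriter(f, fieldnames=['title', 'dt', 'content', 'view_count'])
writer.writeheader()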
Example #3
def parse_object_content(content):
    obj = {}
    soup = BeautifulSoup(content, "html.parser")
    obj["div"] = _strip(soup.select_one("div").text)
    obj["select"] = _split(soup.select_one("select > option").text)
    print(obj)
    return obj
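_strip and _split are project helpers that are not shown here; hypothetical stand-ins (assumptions for illustration only, the originals may differ) plus a quick run:

from bs4 import BeautifulSoup

def _strip(value):   # hypothetical: trim surrounding whitespace
    return value.strip()

def _split(value):   # hypothetical: split on whitespace
    return value.split()

content = "<div> hello world </div><select><option> a b </option></select>"
obj = parse_object_content(content)   # {'div': 'hello world', 'select': ['a', 'b']}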
Example #4
    def GetDetail(self,html):

        soup = BeautifulSoup(html,'html5lib')

        dt = dict()

        timelimit = soup.select_one('#conditions > div:nth-of-type(1)').text
        memorylimit = soup.select_one('#conditions > div:nth-of-type(2)').text

        L = soup.find_all(class_='content-wrapper')
        Title = ['description','input','output','sampleinput','sampleoutput','hint']

        for i in range(len(L)):
            try: dt[Title[i]] = L[i]
            except Exception as e:
                print(e)

        dt['specialjudge'] = soup.select_one('#spjinfo')
        if dt['specialjudge'] is None : dt['specialjudge'] = 0
        else: dt['specialjudge'] = 1

        s1 = 'Time Limit: '
        s2 = 'Memory Limit: '
        dt['timelimit'] = timelimit[timelimit.find(s1)+len(s1):]
        dt['memorylimit'] = memorylimit[memorylimit.find(s2)+len(s2):]

        return dt
Example #5
    def get_course_classes(self, kcdm):
        """
        Get the selectable teaching classes for a course in the course-selection system.

        @structure {'可选班级': [{'起止周': str, '考核类型': str, '教学班附加信息': str, '课程容量': int, '选中人数': int,
         '教学班号': str, '禁选专业': str, '教师': [str], '校区': str, '优选范围': [str], '开课时间,开课地点': [str]}],
        '课程代码': str, '课程名称': str}

        :param kcdm: course code
        """
        params = {'kcdm': kcdm.upper()}
        method = 'get'
        # url = 'student/asp/select_topRight.asp'
        url = 'student/asp/select_topRight_f3.asp'
        response = self.api_request(method, url, params)

        page = response.text
        ss = SoupStrainer('body')
        bs = BeautifulSoup(page, self.html_parser, parse_only=ss)
        class_table = bs.select_one('#JXBTable')
        if class_table.get_text(strip=True) == '对不起!该课程没有可被选的教学班。':
            return APIResult(None, response)

        result = dict()
        _, result['课程代码'], result['课程名称'] = bs.select_one('#KcdmTable').stripped_strings
        result['课程代码'] = result['课程代码'].upper()
        trs = class_table.find_all('tr')

        course_classes = []
        for tr in trs:
            tds = tr.find_all('td')
            assert len(tds) == 5

            # Parse the info hidden in the alt attribute
            class_info_table = BeautifulSoup(tds[1]['alt'], self.html_parser)
            info_trs = class_info_table.select('tr')
            # 校区 起止周 考核类型 禁选专业
            cls_info = dict(safe_zip(info_trs[0].stripped_strings, parse_tr_strs([info_trs[1]])[0]))
            # 选中人数 课程容量
            for s in info_trs[2].stripped_strings:
                kv = [v.strip() for v in s.split(':', 1)]
                cls_info[kv[0]] = int(kv[1]) if kv[1] else None
            # 教学班附加信息
            # 教学班附加信息:屯溪路校区 上课地点:体育部办公楼2楼
            cls_info.update([(v.strip() or None for v in s.split(':', 1)) for s in info_trs[5].stripped_strings])
            # 开课时间,开课地点
            p = re.compile(r'周[一二三四五六日]:\(\d+-\d+节\) \(\d+-\d+周\).+?\d+')
            cls_info[info_trs[3].get_text(strip=True)] = p.findall(info_trs[4].get_text(strip=True))

            cls_info['课程容量'] = int(cls_info['课程容量'])
            cls_info['选中人数'] = int(cls_info['选中人数'])
            cls_info['教学班号'] = tds[1].string.strip()
            cls_info['教师'] = [s.strip() for s in tds[2].text.split(',')]
            cls_info['优选范围'] = [s.strip() for s in tds[3].text.split(',')]

            course_classes.append(cls_info)

        result['可选班级'] = course_classes
        return APIResult(result, response)
Example #6
 def extract_details(self):
     for item in self.get_detail_workers():
         soup = BeautifulSoup(item.worker.result(), 'lxml')
         if soup.select_one('div.article__heading__title') is None:
             continue
         self.detail_candidates.append({'title': soup.select_one('div.article__heading__title').text,
                                        'content': soup.select_one('div.node-article-content').text})
     return self.detail_candidates
Example #7
    def __init__(self, url):
        html = urlopen(url)
        bs_html = BeautifulSoup(html.read(), "html.parser")
        self.title = bs_html.select_one("#title > div.left > h1").get_text(strip=True)
        self.publisher = bs_html.select_one("#title > div.left > h2").get_text(strip=True)
        self.category = bs_html.select_one("#left-stack > div.lockup.product.course.itunes-u > ul > li.genre > a > span").get_text(strip=True)
        self.rating = bs_html.select_one("#left-stack > div.extra-list.customer-ratings > div")["aria-label"]

        bs_video_trs = bs_html.find_all("tr",attrs={"class":"podcast-episode video"})
        if bs_video_trs is not None:
            self.video_urls = [bs_video_tr["video-preview-url"] for bs_video_tr in bs_video_trs]
Example #8
def parseBook(url,book):
	'''解析当当商品页面内的标题(书名)和价格'''
	response=requests.get(url).text
	soup = BeautifulSoup(response, "lxml")

	title=soup.select_one('.name_info > h1').get('title')

	price_str=soup.select_one('.price_qiang .price_d').getText()
	pattern=re.compile("\d+\.?\d*")
	price=pattern.search(price_str).group(0)

	book.append({'title':title,'price':price})
Example #9
	def login(self):
		#Get sessionid
		r=self.session.get('https://cas.xjtu.edu.cn/login')
		br=BeautifulSoup(r.content, "html.parser")
		lt=br.select_one('input[name="lt"]')['value']
		exe=br.select_one('input[name="execution"]')['value']
		#Auth
		headers = {
			'Content-Type': 'application/x-www-form-urlencoded'
		}
		data = 'username='******'&password='******'&code=&lt='+lt+'&execution='+exe+'&_eventId=submit&submit=%E7%99%BB%E5%BD%95'
		self.session.post('https://cas.xjtu.edu.cn/login', headers=headers, data=data)
Example #10
    def parse_listing_page(self, response):
        """Extract basic info from listing page, returns an Item."""
        body = BeautifulSoup(response.body)

        meta = response.meta

        item = DubizzleItem()

        item['title'] = body.select_one(LISTING_SELECTORS['title']).get_text(strip=True)
        item['date'] = body.select_one(LISTING_SELECTORS['date']).get_text(strip=True)
        item['make'] = meta['make'].title()
        item['model'] = meta['model'].title()

        yield item
Example #11
 def extract_details(self):
     for item in self.get_detail_workers():
         passage_id = item.url[-14:-5]
         soup = BeautifulSoup(item.worker.result(), 'lxml')
         comments = self.submit_job(self.comment_url.format(passage_id[-3:], passage_id))\
             .worker.result().decode('utf-8')
         comments = json.loads(comments[comments.index('{'):comments.rindex('}') + 1])
         user_comments = []
         for comment in comments['list']:
             user_comments.append(comment['content'])
         self.detail_candidates.append({'title': soup.select_one('h1').text,
                                        'content': soup.select_one('div.textbox').text,
                                        'comments': user_comments})
     return self.detail_candidates
Example #12
    def get_character_data(self, lodestone_id, achievements=False):
        data = request.urlopen('{0}/character/{1}'.format(self.lodestoneUrl, lodestone_id))
        page = BeautifulSoup(data, LodestoneClient.HTML_PARSER)
        # noinspection PyDictCreation
        character = {}
        character['lodestone_id'] = lodestone_id
        character['name'] = page.select_one(self.config['selector.character.name']).text
        character['server'] = page.select_one(self.config['selector.character.server']).text.strip() \
            .replace('(', '').replace(')', '')
        character['free_company'] = \
        page.select(self.config['selector.character.free_company'])[0].attrs['href'].split('/')[3]
        character['lodestone_profile'] = page.select_one(self.config['selector.character.profile']).text.strip()
        tmp = page.select_one(self.config['selector.character.race']).text.strip().split(' / ')
        character['race'] = tmp[0]

        character['gender'] = 'Male' if tmp[2] == '♂' else 'Female'
        tmp = page.select(self.config['selector.character.main'])
        character['nameday'] = tmp[0].text
        character['guardian'] = tmp[1].text
        character['city_state'] = tmp[2].text
        try:
            character['grand_company'] = tmp[3].text.split('/')[0]
            character['grand_company_rank'] = tmp[3].text.split('/')[1]
        except IndexError:  # User has no grand company
            character['grand_company'] = None
            character['grand_company_rank'] = None
        character['mounts'] = [node.attrs['title'] for node in
                               page.select(self.config['selector.character.mounts'])[0].findAll('a')
                               if 'title' in node.attrs]
        character['minions'] = [node.attrs['title'] for node in
                                page.select(self.config['selector.character.mounts'])[1].findAll('a')
                                if 'title' in node.attrs]

        character['classes'] = []
        tmp = []
        for d in page.select(self.config['selector.character.classes']):
            if not d.text:
                continue
            tmp.append(d.text)
        for i in range(0, len(tmp), 3):
            if '-' in tmp[i + 1]:
                continue
            exp = tmp[i + 2].split(' / ')
            character['classes'].append({
                'name': tmp[i], 'level': int(tmp[i + 1]), 'current_exp': int(exp[0]), 'next_exp': int(exp[1])
            })
        if achievements:
            character['recent_achievements'] = self.get_character_achievements(lodestone_id)
        return character
Example #13
def process_file(path):
    f = open(path, 'r')
    html_doc1 = f.read()
    soup = BeautifulSoup(html_doc1, 'xml')

    # only handles single epsilon schedule
    data = {"epsilon_schedule": element_to_array(soup.select_one("epsilon").select_one("e1")),
            "particles": element_to_array(soup.select_one("particles")),
            "times": element_to_array(soup.select_one("times")),
            "measurements": process_measurements(soup),
            "models": process_models(soup)}

    #print data
    #print "\n\n\n"
    print(json.dumps(data))
Example #14
    def parse_full_item(self, html, parsed_data=None):
        """
        Parse page with full details.

        :param html: HTML source to be parsed.
        :return: A dictionary with the details extracted.
        """

        soup = BeautifulSoup(html, 'lxml')
        parsed_data = parsed_data or {}
        data = {
            'sold': False,
            'ad_last_modified': self.get_last_modified(soup),
            'registration_expiry': self.get_registration_expiry(soup),
            'registration_plate': self._get_detail(
                soup, 'Registration Plate'),
            'doors': self.get_doors(soup),
            'body_type': self._get_key_feature(soup, 'BODY TYPE'),
            'transmission_type': self._get_key_feature(soup, 'TRANSMISSION'),
        }

        if not parsed_data.get('ad_title', None):
            data['ad_title'] = soup.select_one('title').get_text()

        if not parsed_data.get('odometer', None):
            data['odometer'] = self.extract_number(
                self._get_detail(soup, 'Kilometres')
            )

        data.update(self.extract_model(soup))

        data['series'] = self.get_series(soup, parsed_data.get('year'))

        return self.reset_optional(data)
Example #15
    def Analyse(self, html):

        soup = BeautifulSoup(html, 'html5lib')

        L = list()

        for i in range(2, 30):

            td = soup.select_one('body > table.a > tbody > tr:nth-of-type({})'.format(i))

            if td is None: break

            dt = dict()
            dt['originOJ'] = 'PKU'

            titles = ['realrunid', 'nickname', 'originProb', 'status', 'runmemory',
                      'runtime', 'language', 'codelenth', 'realsubmittime']

            for con in td.contents:

                dt[titles[0]] = con.text
                if titles[0] == 'codelenth':
                    dt[titles[0]] = dt[titles[0]][:-1]
                titles = titles[1:]

            L.append(dt)

        return L
Example #16
def main():
    resp = requests.get('https://github.com/login/')
    if resp.status_code != 200:
        return
    cookies = resp.cookies.get_dict()  # grab the cookies
    soup = BeautifulSoup(resp.text, 'lxml')
    utf8_value = soup.select_one('form input[name="utf8"]').attrs['value'] # hidden form field value
    authenticity_token_value = soup.select_one('form input[name="authenticity_token"]').attrs['value'] # CSRF token
    data = {
        'utf8': utf8_value,
        'authenticity_token': authenticity_token_value,
        'login':'******',
        'password': '******'
    }
    resp = requests.post('https://github.com/session/', data=data, cookies=cookies) # submit the form via a POST request
    print(resp.text)
Example #17
    def get(self, response, page):

        channel = {
            'page': page,
            'page_patten': None,
            'movies': []
        }

        soup = BeautifulSoup(response, "html.parser")
        # get total page
        last_page = soup.select_one('div.ah-pagenavi > ul.pagination > li.last')
        print("*********************** Get pages ")
        if last_page is not None:
            page = last_page.text.strip()
            channel['page'] = int(page)

        for movie in soup.select('div.ah-row-film > div.ah-col-film > div.ah-pad-film > a'):

            title = movie.select_one('span.name-film').find(text=True, recursive=False).strip()
            type = movie.select_one('span.number-ep-film').text.strip()
            label = "[%s] %s" % (type, title)
            thumb = movie.select_one('img').get('src')

            channel['movies'].append({
                'id': movie.get('href'),
                'label': label.encode("utf-8"),
                'title': title.encode("utf-8"),
                'realtitle': title.encode("utf-8"),
                'thumb': thumb,
                'type': type.encode("utf-8"),
            })

        return channel
Example #18
def fetch(link_url):
    tree = {}
    content_list = []
    resp = requests.get(link_url, headers=headers)
    data = resp.content
    print('content', data)
    soup = BeautifulSoup(data, "html5lib")
    elements = list(soup.select_one('#container').select_one('#content_left').children)
    f = open('search.txt', 'wb+')
    i = 0
    for ele in elements:
        try:
            print('ele bs4 type', type(ele))
            if isinstance(ele, Tag):
                i += 1
                title = ele.select_one('h3').text
                print('!!! title', title)
                contents = ele.select('div')
                c = ''
                for content in set(contents[1:]):
                    if content.text:
                        if content.text not in content_list:
                            c += content.text + '||'
                        content_list.append(content.text)
                f.write(str(i) + '、标题: ' + title + '\n')
                f.write('内容: ' + c + '\n')
                tree[i] = (title, c)
        except Exception as e:
            print('error:', e)
    f.close()
    print('tree: ', len(tree), tree)
Example #19
def _getBinsForZip( zipCode ):
	# We get a 403 if we don't use the User-Agent
	r = requests.get( 'https://satruck.org/DropOff/Index', params = {
		'zip': zipCode
	} )
	if r.status_code != 200:
		raise RuntimeError( '%s' % r )

	print(r)

	soup = BeautifulSoup(r.text, 'html.parser')

	divs = soup.select_one( '#drop-off-location-list' )#.select( '.has-website' )
	print(divs)

	def cleanSelectText( root, selector, default='' ):
		selection = root.select(selector)
		if len( selection ):
			return re.sub( '\\s+', ' ', selection[0].getText() ).strip()
		else:
			return default

	locations = []
	for d in divs:
		locations.append( Location(
			title = cleanSelectText(d,'.drop-off-location-title'),
			address = cleanSelectText(d,'.drop-off-location-address')+cleanSelectText(d,'.drop-off-location-city-state-zip'),
			phone = cleanSelectText(d,'.drop-off-location-phone'),
			website = cleanSelectText(d,'.drop-off-location-website'),
			hours = cleanSelectText(d,'.drop-off-location-hours')
		) )

	return locations
Example #20
def get_JD(jd_url):
    print('jd_url=' + jd_url)
    jd_data = requests.get(jd_url, verify=False, timeout=1)  # SSL连接错误,需要设置verify=False
    soup = BeautifulSoup(jd_data.text)
    jd_one = soup.select_one("div.lf-border-box")
    if jd_one is None or len(jd_one) < 1:
        return None
    else:
        jd_title = jd_one.select_one("h3.bg-title").get_text().replace("  <span> </span>", "")
        # print jd_title
        jd_detail_box = jd_one.select_one("div.detail-box")
        jd_detail_table_td = jd_detail_box.select("table.detail-table.box-border td")
        jd_publish_date = jd_detail_table_td[1].get_text()
        jd_hc = jd_detail_table_td[11].get_text().strip()
        jd_detail_content = jd_detail_box.select("p.detail-content")
        jd_desc = jd_detail_content[0].get_text().strip()
        jd_request = jd_detail_content[1].get_text().strip()
        # jd_content = jd_desc+ jd_request
        jd_content_data = {
            'jd_url': jd_url,
            'jd_title': jd_title,
            'jd_publish_date': jd_publish_date,
            'jd_hc': jd_hc,
            'jd_desc': jd_desc,
            'jd_request': jd_request,
        }

        # print jd_content_data
        return jd_content_data
Example #21
    def _parse(self, page: BeautifulSoup, url):
        seasons = OrderedDict()
        eqg = OrderedSet()

        child = page.select_one("#WikiaArticle h2")
        season = child.text

        while child.next_sibling:
            child = child.next_sibling

            if child.name == "table":
                for a in child.find_all("a", string="Transcript"):
                    if not a.has_attr("class") or "new" not in a["class"]:
                        episode_url, fragment = urldefrag(a["href"])
                        episode_url = urljoin(url, episode_url)
                        if "Equestria Girls" not in season:
                            if season not in seasons:
                                seasons[season] = OrderedSet()
                            seasons[season].append(episode_url)
                        else:
                            eqg.append(episode_url)
                continue

            if child.name == "h2":
                season = child.text
                continue

        seasons["Equestria Girls"] = eqg
        return seasons
Example #22
def handleFURLText(text):
    bs = BeautifulSoup(text, "html.parser")
    divs = bs.findAll('div', {"class" : "user-feed-wrapBox clearfix"})
    for div in divs :
        pdivs = div.select('div.user-feed-wrap')
        for pdiv in pdivs :
            titleDivs = pdiv.findAll('div', attrs = {"class" : "title clearfix"})
            for titleDiv in titleDivs :
                print(titleDiv.select_one('div.title_info span').text)
                print(titleDiv.select_one('div.title_desc p').text)
            videoNameTag = pdiv.select_one('div.mod-piclist_info p.mod-piclist_info_title a')
            print(videoNameTag.text)
            print(videoNameTag['data-videoid'])
            print(videoNameTag['href'])
            videoPlayTag = pdiv.select_one('div.mod-piclist_info p.mod-piclist_info_times span.playTimes a')
            if videoPlayTag is not None :
                videoPlayNum = videoPlayTag.text
                print(videoPlayNum)
            videoCommentTag = pdiv.select_one('div.mod-piclist_info p.mod-piclist_info_times span.commentTimes a')
            if videoCommentTag is not None :
                videoCommentNum = videoCommentTag.text
                print(videoCommentNum)
            emTag = pdiv.find('em', attrs = {"class" : "con fs14"})
            if emTag is not None :
                print(emTag.text)
    tipsDiv = bs.select_one('div.tips-loading')
    if tipsDiv is not None :
        fURL = tipsDiv['data-loading-src']
        handleFURLPageText(fURL)
Example #23
def get_list_price(isbn):
    """Return the list price of a book.

    Parameters
    ----------
    isbn : string
        ISBN-10 or ISBN-13 of a book.

    Returns
    -------
    list_price : float
        List price of the book in US dollars, or None if not found.

    Examples
    --------
    >>> get_list_price("9780262029445")
    74.0
    """
    AMAZON_URL = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="  # noqa
    search_result = requests.get(AMAZON_URL+isbn)
    book_soup = BeautifulSoup(search_result.content, 'lxml')

    price = book_soup.select_one("span.a-text-strike").get_text()
    if price[0] == '$':
        return float(price[1:])
    else:
        return None
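If Amazon serves a page without the strike-through price, select_one returns None and the get_text() call above raises AttributeError; a guarded variant (a sketch, not the original function):

from bs4 import BeautifulSoup

def get_list_price_safe(html):
    # Like get_list_price above, but tolerates a missing price node.
    soup = BeautifulSoup(html, 'lxml')
    node = soup.select_one("span.a-text-strike")
    if node is None:
        return None
    price = node.get_text()
    return float(price[1:]) if price.startswith('$') else None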
Example #24
def fetch_srouce(url=''):
    """"""
    next = url
    while next:
        print(next)
        r = fetcher.get(url=next)
        if 200 != r.status_code:
            return
        page = BeautifulSoup(r.content, 'html.parser')
        _duris = page.select('tbody[id^="normalthread"] .xst')
        
        # print(_duris)
        uris = [_du.get('href') for _du in _duris]
        # print(uris)
        for u in uris:
            try:
                fetch_detail(u)
            except:
                with open('fetch_fail.log', 'a') as fd:
                    fd.write(u)
                    fd.write('\n')

        _next = page.select_one('.nxt')
        if _next:
            next = _next.get('href')
        else:
            next = None
    return
Example #25
    def Analyse(self, html):

        soup = BeautifulSoup(html, 'html5lib')
        L = list()

        for i in range(2, 30):

            tr = soup.select_one('#SubmissionSearchForm > table > tbody > tr:nth-of-type({})'.format(i))
            if tr is None: break

            dt = dict()
            dt['originOJ'] = 'ZOJ'

            titles = ['realrunid', 'realsubmittime', 'status', 'originProb', 'language'
                , 'runtime', 'runmemory', 'nickname']

            for con in tr.contents:
                try:
                    dt[titles[0]] = con.text.strip()
                    titles = titles[1:]
                except Exception:
                    pass

            L.append(dt)

        return L
Example #26
    def parse_product(self, response):
        soup = BeautifulSoup(response.body, 'lxml')

        p = Product()

        for element, path in self.selectors.items():
            node = soup.select_one(path)

            if not node:
                continue
            if element == 'image':
                p[element] = url_fix(urljoin(response.url, node['src']))
            else:
                p[element] = text(node)

        if 'name' in p and 'number' in p:
            p['url'] = response.url
            p['pricing'], p['discountcode'] = get_prices(soup)
            soup.decompose()
            yield p
        else:
            # Only follow links on non-product pages
            soup.decompose()
            for link in self.link_extractor.extract_links(response):
                yield Request(url=link.url)
Example #27
def get_url(args):
  term = args.term
  if not args.search:
    if term.startswith('myanimelist.net'):
      term = "http://" + term
    if re.fullmatch(r"\d+", term):
      term = "http://myanimelist.net/anime/{}".format(term)

    if not re.fullmatch(r"http://myanimelist.net/anime/\d+/?(/.*)?", term):
      print("invalid url {}".format(term))
      sys.exit(2)

    return term

  # search all
  # search_url = "http://myanimelist.net/search/all"
  # page = requests.get(search_url, params={"q": term})
  # soup = BeautifulSoup(page.content, "html.parser")
  # anime_url = soup.select_one('article > div').select_one('a.hoverinfo_trigger')['href']

  search_url = "http://myanimelist.net/anime.php"
  page = requests.get(search_url, params={"q": term})
  soup = BeautifulSoup(page.content, "html.parser")
  anime_url = soup.select_one('a.hoverinfo_trigger')['href']
  print("    ", anime_url)
  return anime_url
Example #28
def getLocations():
	r = requests.get('http://www.housingworks.org/donate/drop-off-donations/')
	if r.status_code != 200:
		raise RuntimeError( '%s' % r )
	
	soup = BeautifulSoup(r.text, 'html.parser')
	
	results = []
	
	locationsBlock = soup( text=re.compile(r'Drop off locations',re.IGNORECASE) )[0].parent.find_next_sibling('div')
	nextLocation = locationsBlock.find_next('a')
	while nextLocation:
		# Load the next location
		link = nextLocation.attrs['href']
		page = requests.get(link)
		if page.status_code != 200:
			raise RuntimeError( '%s' % page )

		# Parse the returned data and store it
		pageSoup = BeautifulSoup(page.text, 'html.parser')
		content = pageSoup.select_one('#primary')
		paragraphs = content.findAll('p')
		results.append( HousingWorksLocation(
			name = content.find('h2').getText(),
			address = content.find('h4').getText(),
			hours = paragraphs[0].getText(),
			telephone = paragraphs[1].getText().lower().replace('phone:','').strip(),
			offerings = paragraphs[2].getText().lower().replace('offerings:','').strip(),
			link = link
		) )
		
		# Find the next location
		nextLocation = nextLocation.find_next_sibling('a')

	return results
Example #29
File: app.py Project: itmard/uh
def tasnimgallery(url):
    try:
        if url.find('http://tasnimnews.com/') != 0:
            raise Exception('Not supported link')

        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        article = soup.select_one('body.photos article.media')
        images = map(lambda x: {
            'link': x['href'],
            'thumb': x.find('img')['src']
        }, article.select('.row a'))
        result = {
            'title': article.select_one('h1.title').text.strip(),
            'reporter': article.select_one('h4.reporter').text.strip(),
            'time': parsedate(article.select_one('time').text.strip()),
            'lead': article.select_one('h3.lead').text.strip(),
            'images': list(images),
            'url': url
        }
    except Exception as e:
        result = {"error": str(e)}

    response = Response(
        json.dumps(result, indent=1, ensure_ascii=False),
        content_type='application/json;charset=utf8')

    # TODO: This should be limited
    response.headers['Access-Control-Allow-Origin'] = "*"

    return response
Example #30
 def get_audio_by_id(self, owner_id, audio_id):
     response = self._vk.http.get(
         'https://m.vk.com/audio{}_{}'.format(owner_id, audio_id),
         allow_redirects=False
     )
     bs = BeautifulSoup(response.text, 'html.parser')
     link = bs.select_one('.ai_body input[type=hidden]').attrs['value']
     return decode_audio_url(link, self.user_id)
Example #31
	temp_dict['Snippet'] = message['snippet'] # fetching message snippet


	try:
		
		# Fetching message body
		mssg_parts = payld['parts'] # fetching the message parts
		part_one  = mssg_parts[0] # fetching first element of the part 
		part_body = part_one['body'] # fetching body of the message
		part_data = part_body['data'] # fetching data from the body
		clean_one = part_data.replace("-","+") # convert URL-safe Base64 to standard Base64
		clean_one = clean_one.replace("_","/") # convert URL-safe Base64 to standard Base64
		clean_two = base64.b64decode (bytes(clean_one, 'UTF-8')) # decode from Base64
		html = clean_two # decoded HTML payload
		soup = BeautifulSoup(html)
		table = soup.select_one("table.data2_s")
		headers = [th.text.encode("utf-8") for th in table.select("tr th")]
		# mssg_body is a readible form of message body
		# depending on the end user's requirements, it can be further cleaned 
		# using regex, beautiful soup, or any other method
		temp_dict['Message_body'] = table

	except :
		pass

	print (temp_dict)
	final_list.append(temp_dict) # This will create a dictionary item in the final list
	
	# This will mark the message as read
	GMAIL.users().messages().modify(userId=user_id, id=m_id,body={ 'removeLabelIds': ['UNREAD']}).execute() 
	
Example #32
    while halaman < limitHalaman:
        raws=f'{linkRaw}{hari.strftime("%d-%m-%Y")}'
        url=BeautifulSoup(requests.get(raws).text.encode("utf-8"),"html.parser")

        #grab the article body from each sublink and strip the HTML tags
        for i in url.select(".simple-post"):
            linkSemuanya=i.find ("a")['href']
            file2.write(linkSemuanya+'\n')
            sublink=BeautifulSoup(requests.get(linkSemuanya).text.encode("utf-8"),"html.parser")

            #drop the googletagpush scripts and styles
            for isiScript in sublink(['script','style']):
                isiScript.decompose()
            try:
                isiBerita=sublink.select_one(".post-content").getText().strip().translate(str.maketrans('','',string.punctuation))
                title = sublink.select_one('.post-title').getText().strip().translate(str.maketrans('','',string.punctuation))
            except AttributeError:
                pass
            #write the article into the antaraNews data folder
            file = open(f'../data/crawling/berita{halaman}.txt','w')
            file.write(f'{title}\n{isiBerita}')
            if halaman == limitHalaman:
                break
            halaman+=1
        else:
            hari+=timedelta(days=-1) 
        print(f'selesai {halaman} berita')


Example #33
 def get_total_cases(cls, response):
     soup = BeautifulSoup(response.content, 'html.parser')
     return soup.select_one(cls._lawsuits_total_count_selector).get_text().replace(',', '')
Example #34
from selenium import webdriver
from bs4 import BeautifulSoup
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.dbproject
driver = webdriver.Chrome(
    'C:/Users/NTRION/Downloads/chromedriver_win32/chromedriver.exe')

spots = list(db.jeonbuklink.find({}))

for spot in spots:
    driver.get(spot['link'])
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    image = soup.select_one('meta[property="og:image"]')['content']
    location = driver.find_element_by_xpath(
        '//*[@id="contents"]/div[2]/div[3]/div[2]/div/div/ul/li[3]/span').text
    description = driver.find_element_by_xpath(
        '//*[@id="contents"]/div[2]/div[3]/div[1]/div/div/p').text
    if location is not None:
        doc = {
            'name': spot['name'],
            'image': image,
            'location': location,
            'description': description
        }
        db.jeonbuk.insert_one(doc)

driver.quit()
Example #35
def build_search_indices(pages, version):
    page_views_statistic = {}  #get_page_views_statistic()
    index_objects = []
    wh_index_objects = []

    print("Start building index")
    for url, endpoint in pages:
        if url.endswith('/'): url += 'index.html'
        if not url.endswith('.html'): continue

        title = ''
        content = ''
        page_type = 'Page'
        page_path = get_page_path_from_url(url)
        page_views = 0

        if url in page_views_statistic:
            page_views = page_views_statistic[url]

        if page_path.startswith('community'):
            page_type = 'Community'
        elif page_path.startswith('docs/reference'):
            page_type = 'Reference'
        elif page_path.startswith('docs/tutorials'):
            page_type = 'Tutorial'

        if page_path.startswith("api/latest/"):
            page_info = get_api_page(True, page_path[4:], dist_path)

            for table in page_info['content']('table'):
                table.extract()

            for overload_group in page_info['content'].findAll(
                    "div", {"class": "signature"}):
                overload_group.extract()

            breadcrumbs = page_info['content'].find(
                "div", {"class": "api-docs-breadcrumbs"})

            title = page_info['title']

            if breadcrumbs is not None:
                full_name_parts = list(
                    map(lambda link: link.text, breadcrumbs.findAll("a")))

                if "kotlin-stdlib" in full_name_parts:
                    full_name_parts.remove("kotlin-stdlib")
                else:
                    full_name_parts.remove("kotlin.test")

                title = " › ".join(full_name_parts).replace('<',
                                                            '&lt;').replace(
                                                                '>', '&gt;')
                breadcrumbs.extract()

            page_type = "Standard Library" if "jvm/stdlib" in url else "Kotlin Test"
            content = page_info['content'].find('article', {"role": "main"})
        else:
            html_content = get_page_content(url)
            parsed = BeautifulSoup(html_content, "html.parser")

            if parsed.find("meta", {"http-equiv": "refresh"}):
                continue

            body_title = parsed.select_one("body[data-search-title]")

            if body_title:
                title = body_title.attrs["data-search-title"]

            if not title:
                title_node = parsed.find("title")
                if title_node:
                    title = title_node.text

            # Our default pages
            content = parsed.find("div", {"class": "page-content"})

            # Our modern pages
            if content is None:
                content = parsed.find("article", {"class": "page-content"})

            # WebHelp pages
            if content is None:
                content = parsed.find("article", {"class": "article"})

        if title and content:
            page_indexer = get_page_index_objects

            if parsed.select_one("body[data-article-props]"):
                page_type = "Documentation"
                page_indexer = get_webhelp_page_index_objects
            elif page_type == "Page":
                page_indexer = get_markdown_page_index_objects

            print("processing " + url + ' - ' + page_type)

            page_indices = page_indexer(content, url, page_path, title,
                                        page_type, page_views)

            index_objects += page_indices

            def wh(*args):
                return to_wh_index(version, *args)

            wh_index_objects += list(map(wh, page_indices.copy()))
        else:
            print('skip: ' + url + ' unknown page content with title: ' +
                  title)

    wh_index = get_wh_index()

    if wh_index:
        print("Submitting WH index objects to " + wh_index.index_name +
              " index")
        wh_index.add_objects(wh_index_objects)

    print("Index objects successfully built")

    index = get_index()
    print("Submitting index objects to " + index.index_name + " index")
    index.add_objects(index_objects)
Example #36
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter
from gensim.models import word2vec

fp = codecs.open('BEXX0003.txt', 'r', encoding='utf-16')
soup = BeautifulSoup(fp, 'html.parser')
body = soup.select_one('body > text')
text = body.getText()

twiter = Twitter()
results = []
lines = text.split('\n')
for line in lines:
    malist = twiter.pos(line, norm=True, stem=True)
    r = []
    for word in malist:
        if word[1] not in ['Josa', 'Eomi', 'Punctuation']:
            r.append(word[0])
    rl = (' '.join(r)).strip()
    results.append(rl)
    print(rl)

wakati_file = 'toji.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(results))

data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
model.save('toji.model')
print('ok')
Example #37
    def __init__(self):
        url_list = [
            'http://www.diabetes.or.kr/general/class/index.php?idx=1',
            'http://www.diabetes.or.kr/general/class/index.php?idx=2',
            'http://www.diabetes.or.kr/general/class/index.php?idx=3',
            'http://www.diabetes.or.kr/general/class/index.php?idx=4',
            'http://www.diabetes.or.kr/general/class/index.php?idx=5',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=322&idx=6',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=325&idx=1',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=324&idx=1',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=323&idx=1',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=327&idx=2',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=326&idx=2',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=30&idx=4',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=7&idx=5',
            'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=6&idx=5',
            'http://www.diabetes.or.kr/general/class/complications.php?code=complication&number=337&mode=view&idx=1',
            'http://www.diabetes.or.kr/general/class/complications.php?code=complication&number=336&mode=view&idx=2',
            'http://www.diabetes.or.kr/general/class/type.php',
            'http://www.diabetes.or.kr/general/class/gestational.php'
        ]

        file_count = len(os.walk('./json').__next__()[2]) + 1
        BASE_DIR = './json/'
        FILE_NAME = f'result{file_count}.json'

        json_arch = OrderedDict()
        json_arch["category"] = "당뇨병"

        title_list = []
        sub_title_list = []
        contents_list = []

        for url in tqdm(url_list):
            req = requests.get(url)
            html = req.text
            soup = BeautifulSoup(html, 'html.parser')

            # question (title)
            title = soup.select_one('div.cTop > span:nth-of-type(2)')
            if title:
                title_list.append(title.text)
            else:
                title_list.append("")

            # question (sub-titles)
            all_sub_title = []
            content_all = soup.select_one('body')
            sub_title = soup.select('div.food')
            if sub_title:
                for sub_tit in content_all.find_all('div', 'rnd_center'):
                    all_sub_title.append(sub_tit.text)
                sub_title_list.append(all_sub_title)
            else:
                sub_title_list.append([])
            # answer text
            if len(sub_title) > 0:
                tmp_contents_list = []
                for idx in range(len(all_sub_title)):
                    tmp_contents = []
                    next_content = sub_title[idx].find_next_sibling('div')
                    while True:
                        next_content = next_content.find_next_sibling()
                        tmp_contents.append(next_content.text)
                        if next_content.find_next_sibling(
                        ) == next_content.find_next_sibling(
                                'div') or next_content.find_next_sibling(
                                ) == next_content.find_next_sibling('table'):
                            break
                    tmp_contents_list.append(tmp_contents)
                contents_list.append(tmp_contents_list)
            else:
                tmp_contents = []
                contents = soup.select('p.0')
                for content in contents:
                    tmp_contents.append(content.text)
                contents_list.append([tmp_contents])

        json_arch["title"] = title_list
        json_arch["sub_title"] = sub_title_list
        json_arch["content"] = contents_list

        with open(os.path.join(BASE_DIR, FILE_NAME), 'w',
                  encoding="utf-8") as json_file:
            json.dump(json_arch, json_file, ensure_ascii=False, indent="\t")
        print("json 저장완료", "저장 경로", BASE_DIR, FILE_NAME)
Example #38
from bs4 import BeautifulSoup

# HTML to analyze
html = """
<html><body>
<div id="meigen">
<h1>위키북스 도서</h1>
<ul class="items">
<li>유니티 게임 이펙트 입문</li>
<li>스위프트로 시작하는 아이폰 앱 개발 교과서</li>
<li>모던 웹사이트 디자인의 정석</li>
</ul>
</div>
</body></html>
"""

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Extract the parts we need with CSS selectors
# Extract the title
h1 = soup.select_one("div#meigen > h1").string
print("h1 = ", h1)

# Extract the list items
li_list = soup.select("div#meigen > ul.items > li")
for li in li_list:
    print("li = ", li.string)
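For contrast, a small standalone sketch (not part of the original example) of what the two selector calls return when nothing matches: select_one yields None, while select yields an empty list.

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div id='meigen'><h1>위키북스 도서</h1></div>", 'html.parser')
print(soup.select_one("div#meigen > h2"))  # None -> guard before using .string
print(soup.select("div#meigen > h2"))      # []   -> safe to iterate; the loop body simply never runs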
Example #39
def get_hottest_article():
    # Financial News, YTN, SBS
    target_media = ["014", "052", "055"]

    # Pull the most-viewed article URLs from each target outlet
    for target_num in target_media:
        # print(target_num)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        data = requests.get(
            f'https://news.naver.com/main/ranking/office.nhn?officeId={target_num}',
            headers=headers)

        soup = BeautifulSoup(data.text, 'html.parser')

        articles = soup.select(
            '#wrap > div.rankingnews > div.rankingnews_box._officeResult > div:nth-child(2) > ul > li'
        )

        for article in articles:
            url = article.select_one('div > a')['href']
            # print(url)

            target_url = f'https://news.naver.com{url}'
            # print(target_url)

            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
            }
            data = requests.get(target_url, headers=headers)

            soup2 = BeautifulSoup(data.text, 'html.parser')

            contents = soup2.select('#main_content')
            og_desc = soup2.select_one(
                'meta[property="og:description"]')['content']

            for content in contents:
                title = soup2.select_one('#articleTitle').text
                date = soup2.select_one(
                    'div.article_header > div.article_info > div > span.t11'
                ).text
                media = soup2.select_one(
                    'div.article_header > div.press_logo > a > img')['alt']

                # Take only the year/month/day from date and convert it to a datetime.
                target = date[:10]
                date_time_obj = datetime.datetime.strptime(target, '%Y.%m.%d')

                # Replace the Korean AM/PM marker with AM/PM and zero-pad the hour
                if date[12:14] == '오전':
                    date1 = date[:12] + 'AM' + date[14:]
                else:
                    date1 = date[:12] + 'PM' + date[14:]

                if date[16] == ':':
                    date1 = date1[:15] + '0' + date1[15:]

                # Convert the date string into a datetime object.
                date_time_obj2 = datetime.datetime.strptime(
                    date1, '%Y.%m.%d. %p %I:%M')

                # Build a unique key so the same article is not stored twice.
                parts = urlparse(target_url)

                query_string = parse_qs(parts.query)

                sid1 = query_string["sid1"][0]
                oid = query_string["oid"][0]
                aid = query_string["aid"][0]

                unique_key = f"{sid1}-{oid}-{aid}"

                # If the URL points to the Naver news home, save it to the DB together with the korea icon.
                if target_url[:27] == "https://news.naver.com/main":
                    doc = {
                        'icon': "../static/south-korea.png",
                        'unique_key': unique_key,
                        'url': url,
                        'title': title,
                        'desc': og_desc,
                        'date': date,
                        'datetime': date_time_obj2,
                        'datetime_server': date_time_obj,
                        'media': media,
                    }

                    # db.hottestNews.drop()

                    # Insert into the DB only when unique_key is not already present.
                    # Tutor: check whether the document already exists
                    document = db.hottestNews.find_one(
                        {"unique_key": unique_key})
                    #
                    # # Tutor: add it if the document does not exist
                    if document is None:
                        db.hottestNews.insert_one(doc)
Example #40
'''
Created on 2018. 5. 12.

@author: Administrator
'''
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter
from gensim.models import word2vec
# Open the file and read its contents --- (*1)
fp = codecs.open(filename='BEXX0003.txt', mode='r', encoding='utf-8')
soup = BeautifulSoup(fp, 'html.parser')
body = soup.select_one('body > text')  # select only the first of the matching tags
text = body.getText()
# print( text )
# Process the text line by line --- (*2)
twitter = Twitter()
results = []
lines = text.split('\r\n')
# print( lines )
for line in lines:  # morphological analysis --- (*3), using the base form of each word
    malist = twitter.pos(line, norm=True, stem=True)
    # print( malist )
    r = []
    for word in malist:  # exclude endings/particles/punctuation
        if not word[1] in ["Josa", "Eomi", "Punctuation"]:
            r.append(word[0])
    rl = (" ".join(r)).strip()
    results.append(rl)
# print(rl)
# Write the results to a file --- (*4)
Example #41
import requests
from bs4 import BeautifulSoup

# 1. 원하는 주소로 요청을 보내 응답을 저장한다.
html = requests.get('https://finance.naver.com/sise/').text 

# 2. 정보를 조작하기 편하게 바꾸고(정제)
print(html)
soup = BeautifulSoup(html, 'html.parser')   
# 3. 바꾼 정보 중 원하는 것만 뽑아서
print(soup)
kospi = soup.select_one('#KOSPI_now').text

# 4. 출력한다.
print(kospi)


Example #42
File: naver.py Project: QT-HH/TIL
import time

from bs4 import BeautifulSoup
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches',
                                ['enable-logging'])  # suppress selenium log messages
browser = webdriver.Chrome(executable_path="./chromedriver.exe",
                           options=options)
browser.get('https://datalab.naver.com/keyword/realtimeList.naver?where=main')
time.sleep(1)

html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')

ranking = soup.select(
    '#content > div > div.selection_area > div.selection_content > div.field_list > div > div > ul > li > div > span.item_title_wrap > span.item_title'
)
day = soup.select_one(
    '#content > div > div.selection_area > div.selection_header > div:nth-child(1) > div > div > div > div.date_indo > a.date_box._date_trigger > span.date_txt._title_ymd'
)
tiktok = soup.select_one(
    '#content > div > div.selection_area > div.selection_header > div:nth-child(1) > div > div > div > div.time_indo > a.time_box._time_trigger > span.time_txt._title_hms'
)

print(day.text, tiktok.text, '기준 실시간 급상승 검색어')

n = 0
for i in ranking:
    n += 1
    print(n, i.text)

browser.quit()
Example #43
            # go back to initial page
            driver.execute_script("window.history.go(-1)")

            #start new line for new rider profile
            f.write("\n")
        except:
            print('FAIL: 404 go back')
            # go back to initial page
            driver.execute_script("window.history.go(-1)")

            table_soup = BeautifulSoup(driver.page_source,
                                       'html.parser',
                                       from_encoding='utf8')
            # find url in table
            url = rider_link.replace('http://www.worldsnowboarding.org/', '')
            find_link = table_soup.select_one("a[href*='" + url + "']")
            # find parent of url - this is the row that has all the rider info
            parent = find_link.find_parent('tr', attrs={'class': 'ranking'})
            stat_array = parent.find_all('td')

            profile[1] = int(stat_array[0].span.text.strip('.'))  #position
            name = stat_array[3].a.text.split(',')
            first_name = name[1]
            last_name = name[0]
            profile[0] = str(first_name + last_name)  #name
            profile[5] = stat_array[4].span.text  #nationality
            if stat_array[5] is not None or len(stat_array[5]) > 0:
                profile[4] = stat_array[5].text  #age
            profile[2] = float(stat_array[8].text)  #points

            profile_str = ', '.join(str(x) for x in profile)
Example #44
def calcSUMS(params):
    global sum_distance, sum_elevation_gain, sum_descent, sum_refreshment_points, sum_time_limit
    url = "https://itra.run/calend.php?mode=getEvt&id={}&annee={}&idc={}&idx={}".format(
        *params)
    session = requests.Session()
    session.max_redirects = 9999999
    dct = {}
    # url = url.strip('"')
    page = session.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")
    All_info = soup.select("div#calevt_fich tr")  #get_text(strip=True)
    lst_1 = []
    dct_2 = {}

    for info in All_info:
        tds = info.select("td")
        if len(tds) == 2:
            dct_2[tds[0].get_text().strip()] = tds[1].get_text().strip()
        elif len(tds) == 1:
            if tds[0].get_text().strip() != "":
                if tds[0].select_one("a", href=True) is not None:
                    registr_url = tds[0].select_one("a", href=True)["href"]
                else:
                    lst_1.append(tds[0].get_text().strip())

    try:
        distance = re.sub('[(){}<>]', '', dct_2["Distance"]).split()[0].strip()
    except:
        distance = ""

    try:
        elevation_gain = dct_2["Ascent"].strip()
    except:
        elevation_gain = ""

    try:
        descent = dct_2["Descent"].strip()
    except:
        descent = ""

    try:
        refreshment_points = dct_2["Refreshment points"].strip()
    except:
        refreshment_points = ""

    try:
        time_limit = dct_2["Maximum time"].strip()
    except:
        time_limit = ""

    sum_distance += reprDist(distance)[0]
    sum_elevation_gain += reprDist(elevation_gain)[0]
    sum_descent += reprDist(descent)[0]
    sum_refreshment_points += reprDist(refreshment_points)[0]
    sum_time_limit += get_sec(time_limit)

    th = soup.select_one("div#calevt_fich tr th", onclick=True)

    if th is not None:
        try:
            stage = th.select_one("a.rightarr", onclick=True)
            params = eval(stage['onclick'].split(";")[0])
            calcSUMS(params)
        except:
            return 1
Example #45
 def extract_review(self, parsed_claim_review_page: BeautifulSoup) -> str:
     return parsed_claim_review_page.select_one(
         "section.l-section.wpb_row.height_small div[itemprop=\"text\"]"
     ).text
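The one-liner above raises AttributeError when the selector matches nothing; a guarded variant (a sketch under the assumption that an empty string is acceptable to the caller):

from bs4 import BeautifulSoup

def extract_review_safe(parsed_claim_review_page: BeautifulSoup) -> str:
    node = parsed_claim_review_page.select_one(
        "section.l-section.wpb_row.height_small div[itemprop=\"text\"]")
    return node.text if node else ""  # assumption: '' when the review block is missing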
Example #46
def job_page(url, _id):
    res = retry(url)
    soup = BeautifulSoup(res.content, 'lxml')
    #=== Extract Static JobBKK ===#
    try:
        static_detail_list = [_.text.strip()
                              for _ in soup.select("div.statis-detail")]
    except Exception as e:
        print("Step job_static", e)

    #=== Extract Interesting ===#
    try:
        applicants = soup.select_one("#loadnumapply").text.strip()
    except Exception as e:
        print("Step job_interesting", e)

    #=== Extract Info ===#
    try:
        info = soup.select_one("div.row-left")
        detail_list = [_.text.strip() for _ in info.select_one(
            "div.job-detail.border-b").select("span")]
        skill_list = [_.text.strip() for _ in info.select_one(
            "div[itemprop=skills]").select("span")]
        incentives_detail_list = [_.text.strip() for _ in info.select_one(
            "div[itemprop=incentives]").select("li")]
        incentives_additional = info.select_one(
            "div[itemprop=incentives] div").text.strip()
    except Exception as e:
        print("Step job_info", e)

    #=== Extract Transport ===#
    try:
        jobLocation = info.select_one("div[itemprop=jobLocation]")
        transport_detail_list = [_.text.strip().replace(
            'ไม่มี', '') for _ in jobLocation.select("div.transport-detail")]
        # transport_additional = jobLocation.select_one("div.transport-additional span").text.strip()
    except Exception as e:
        print("Step job_transport", e)

    #=== Extract Json ===#
    try:
        data_dict = json.loads(soup.find_all(
            'script', {"type": "application/ld+json"})[1].text, strict=False)
        job_title = data_dict['title']
        description = data_dict['description']
        company = data_dict['hiringOrganization']['name']
        job_com_id = data_dict['hiringOrganization']['sameAs']
        job_com_id = re.search(r'\d+/\d+', job_com_id).group(0).split("/")
        date_post = data_dict['datePosted']
    except Exception as e:
        print("Step job_json_data", e)

    #=== Extract Main Info ===#
    if re.search('-', skill_list[2]) != None:
        edu_clean = skill_list[2].replace(' ', '').split('-')
        edu_clean = edu_dict[edu_clean[0]] + \
            '-' + edu_dict[edu_clean[1]]
    else:
        try:
            edu_clean = edu_dict[skill_list[2].replace(' ', '')]
        except KeyError:
            print("Step KeyError: ", skill_list[2])
            edu_clean = ""
    try:
        job_dict = OrderedDict({
            'occupation_id': _id,
            'job_id': int(job_com_id[1]),
            'job_title': job_title,
            'job_description': description.replace('\n', '|'),
            'num_position': int(detail_list[0].replace('ตำแหน่ง', '').replace('ไม่ระบุ', '').replace('ไม่จำกัด', 'Inf').strip()),
            'job_type': detail_list[1],
            'company_id': int(job_com_id[0]),
            'company_name': company,
            'company_location': {
                #=== Location Company ===#
                'street_address': data_dict['jobLocation']['address']['streetAddress'],
                'local_address': data_dict['jobLocation']['address']['addressLocality'],
                'region_address': data_dict['jobLocation']['address']['addressRegion'],
                'postal_code': data_dict['jobLocation']['address']['postalCode'],
                'country_address': data_dict['jobLocation']['address']['addressCountry']
            },
            'work_location': detail_list[2].split(','),
            'salary': detail_list[3].replace(',', '').replace(' ', ''),
            'vacation': detail_list[5].replace('ไม่ระบุ', ''),
            'work_time': detail_list[4].replace('ไม่ระบุ', ''),
            'gender': skill_list[0].replace(' ', '').replace('ชาย', 'M').replace('หญิง', 'F').replace(',', ''),
            'age': skill_list[1].replace('ปีขึ้นไป', '+').replace('ทุกช่วงอายุ', '').replace(' ', ''),
            'edu': edu_clean.strip(),
            'exp': skill_list[3].replace('ปีขึ้นไป', '+').replace(' ', ''),
            'other': skill_list[4].replace('ไม่ระบุ', ''),
            'incentives': incentives_detail_list,
            'incentives_add': incentives_additional,
            'transport': {
                'bus': transport_detail_list[0],
                'bts': transport_detail_list[1],
                'mrt': transport_detail_list[2],
                'arl': transport_detail_list[3]
            },
            'applicants': int(applicants),
            'job_active': static_detail_list[1],
            'job_view': int(static_detail_list[0].replace(',', '')),
            'job_date_post': date_post,
        })
    except Exception as e:
        print("Step job_dict", e)
    # try:
    #     col_bkk_job.insert_one(job_dict)
    # except Exception as e:
    #     print('db', e)
    return job_dict
class Plugin():

    # Registers the plugin and the URL pattern for which it should be called.
    # The first parameter is the file name,
    # the second is the URL pattern as a regex.
    def register_plugin(self, PluginManager):
        PluginManager.register_plugin('jornaldebrasilia_com_br', r"^https?://www.jornaldebrasilia.com.br/"
                            "(cidades|brasil|futebol|mundo|economia|politica-poder|politica-e-poder)/")

    # Returns a dictionary with three keys:
    # subtitle
    # date_published as a datetime object
    # content
    # content is the only required key
    def extract_metadata(self, url):
        self.url = url
        self.page = None
        self.bs = None

        r = requests.get(self.url)
        if r.status_code == 200:
            self.page = r.text
            self.bs = BeautifulSoup(self.page, 'html.parser')

            subtitle = self._get_subtitle()
            date_published = self._get_published_date()

            self._remove_elements()

            content = self._get_content()

            metadata = dict(subtitle=subtitle,
                            date_published=date_published,
                            content=content)
            return metadata
        else:
            return None

    # Removes unwanted elements from the page, such as headings
    # in the middle of the article, if needed.
    def _remove_elements(self):
        for div in self.bs.find_all(['style']):
            div.decompose()


    # Locates the subtitle via CSS selector
    def _get_subtitle(self):
        subtitle = self.bs.select_one('h1.entry-title')
        return subtitle.get_text()

    # Locates the date and parses it into a datetime object
    def _get_published_date(self):
        state = str(self.bs.select('.entry-date'))
        match_date = re.search(r'([0-9])+([/])+([0-9])+([/])+([0-9])+', state).group(0)
        date_published = datetime.strptime(match_date, "%d/%m/%Y").date()
        return date_published

    # Locates the article paragraphs
    def _get_content(self):
        paragraphs_list = []
        paragraphs = self.bs.select('.td-post-content > p')
        for paragraph in paragraphs:
            if len(paragraph.text) > 20:
                paragraphs_list.append(paragraph.text.strip())
        return ' '.join(paragraphs_list)
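
# A minimal usage sketch for the Plugin class above, assuming its imports
# (requests, BeautifulSoup, re, datetime) are in scope; the article URL below is
# hypothetical and only needs to match the registered URL pattern.
if __name__ == '__main__':
    plugin = Plugin()
    metadata = plugin.extract_metadata(
        'https://www.jornaldebrasilia.com.br/cidades/exemplo-de-materia/')
    if metadata:
        print(metadata['subtitle'])
        print(metadata['date_published'])
        print(metadata['content'][:200])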
Ejemplo n.º 48
0
ds = []
count = 0

for info in match_info:
    date, countries, ground, href, parent_href = info
    
    count += 1
    print(count)
    print(parent_href)
    print(href)
    
    page = requests.get(href)
    
    soup = BeautifulSoup(page.content, 'html.parser')
    
    table = soup.select_one('body > table:nth-child(2) > tr > td > table:nth-child(3)')
    
    if is_aborted(soup):
        print('Detect match aborted')
        continue
    elif is_conceded(soup):
        print('Detect match conceded')
        continue
    elif table is None:
        print('Possibly match aborted')
        break
    
    tr_all = table.select('tr')
    
    dt = []
    
Ejemplo n.º 49
0
def scrap_guy_hoquet(data_folder='data', replace_strategy='abort'):
    """
    Web scraping function for www.guy-hoquet.com, meant to retrieve relevant info from property ads
    in Ile-de-France.

    Parameters
    ----------
    data_folder: str, default 'data'
        path of the folder where the data will be written, created when needed
    replace_strategy: str, any from ['abort', 'replace'], default 'abort'
        strategy to follow if a file with the same name as the data file already exists

    Returns
    -------
    None

    """

    url = 'https://www.guy-hoquet.com/biens/result#1&p=1&f10=2&f20=75_c2,77_c2,78_c2,91_c2,92_c2,93_c2,94_c2,95_c2&f30=appartement,maison'

    links = []

    driver = webdriver.Firefox()
    driver.implicitly_wait(5)  # seconds
    driver.get(url)

    driver.find_element_by_css_selector('div#accept-all-cookies').click()
    links.extend([
        a.get_attribute('href')
        for a in driver.find_elements_by_css_selector('a.property_link_block')
    ])

    while True:
        try:
            driver.find_element_by_css_selector('li.page-item.next a').click()
        except NoSuchElementException:
            break
        links.extend([
            a.get_attribute('href') for a in
            driver.find_elements_by_css_selector('a.property_link_block')
        ])

    driver.close()

    data = []

    for link in tqdm(links):

        soup = BeautifulSoup(requests.get(link).content)

        try:
            prop_type = soup.select_one('h1.name.property-name').text
            city = soup.select_one('div.add').text
            price = soup.select_one('div.price').text.replace('\n', '').strip()
            descr = soup.select_one('span.description-more').text.replace(
                '\n', '').replace('Voir moins', '').strip()
            feats = [tag.text for tag in soup.select('div.ttl')]
            feats2 = [
                re.sub(r'\s+', ' ', re.sub(r'\n+', '', tag.text)).strip()
                for tag in soup.select('div.horaires-item')
            ]
            neighborhood = re.sub(
                r'\s+', ' ',
                re.sub(
                    r'\n+', '',
                    soup.select_one('div.quartier-info.mt-4').text)).strip()
        except AttributeError:
            continue

        data.append(
            [prop_type, city, price, descr, feats, feats2, neighborhood])

    df = pd.DataFrame(data,
                      columns=[
                          'prop_type', 'city', 'price', 'descr', 'feats',
                          'feats2', 'neighborhood'
                      ])

    # Check if data file name already exists : if so follow replace_strategy, if not then create it
    filename = f'guy_hoquet_{dt.now().year}_{dt.now().month}_{dt.now().day}.csv'
    if not os.path.isfile(os.path.join(data_folder, filename)):
        df.to_csv(os.path.join(data_folder, filename), sep='|', index=False)
    else:
        if replace_strategy == 'abort':
            raise FileExistsError(
                f"File {os.path.join(data_folder, file_name)} already exists. Scraping aborted. To replace the existing file, change replace_strategy to 'replace'."
            )
        elif replace_strategy == 'replace':
            df.to_csv(os.path.join(data_folder, filename),
                      sep='|',
                      index=False)
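
# A minimal usage sketch for scrap_guy_hoquet, assuming geckodriver is on PATH and
# the function's imports (os, webdriver, BeautifulSoup, requests, pandas, tqdm) are
# in scope; 'replace' overwrites a same-day CSV instead of raising FileExistsError.
if __name__ == '__main__':
    os.makedirs('data', exist_ok=True)
    scrap_guy_hoquet(data_folder='data', replace_strategy='replace')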
Ejemplo n.º 50
0
def scrape():
    browser = init_browser()

    #    NASA Mars fact data dictionary
    mars_fact_data = {}

    #    NASA Mars News
    # URL of page to be scraped
    nasa_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    # Retrieve page with the requests module
    response = requests.get(nasa_url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all('div', class_="slide")
    results[0]
    news_title = soup.find('div', class_="content_title").text.strip()
    news_p = soup.find('div', class_="rollover_description_inner").text.strip()
    # Store in mars_fact_data dictionary
    mars_fact_data['news_title'] = news_title
    mars_fact_data['news_paragraph'] = news_p

    #    JPL Mars Space Images - Feature Image
    # URL of page to be scraped
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    # Retrieve page with the requests module
    response = requests.get(jpl_url)
    browser.visit(jpl_url)
    time.sleep(3)
    browser.find_by_id("full_image").click()
    time.sleep(3)
    browser.find_link_by_partial_text('more info').click()
    time.sleep(3)
    html = browser.html
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html, 'html.parser')
    # Relative image url for featured image
    img_url_rel = soup.select_one('figure.lede a img').get("src")
    # Featured image url
    featured_image_url = f'https://www.jpl.nasa.gov{img_url_rel}'
    # Store in mars_fact_data dictionary
    mars_fact_data["featured_image"] = featured_image_url

    #     Web Scraping - Mars Weather twitter website
    # URL of page to be scraped
    twitter_url = "https://www.twitter.com/marswxreport?lang=en"
    # Retrieve page with the requests module
    response = requests.get(twitter_url)
    browser.visit(twitter_url)
    time.sleep(3)
    html = browser.html
    weather_soup = BeautifulSoup(html, 'html.parser')
    # find tweet with weather info
    results = weather_soup.find_all(
        "div",
        class_=
        "css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0"
    )
    # loop through the results to look for InSight and Sol text in the tweet
    # for tweet in results:
    #     if 'InSight' in tweet.text and 'Sol' in tweet.text:
    #         mars_weather = tweet.text.strip()
    #         break
    mars_weather = 'InSight sol 457 (2020-03-10) low -95.7ºC (-140.3ºF) high -9.1ºC (15.6ºF) winds from the SSE at 6.5 m/s (14.5 mph) gusting to 21.0 m/s (46.9 mph) pressure at 6.30 hPa'
    # Store in mars_fact_data dictionary
    mars_fact_data["mars_weather"] = mars_weather

    #    Mars Facts - Table
    # URL of page to be scraped
    Marsfacts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(Marsfacts_url)
    df = tables[0]
    df.columns = ["Description", "values"]
    df.set_index("Description", inplace=True)
    # Convert dataframe to html
    html_table = df.to_html(classes="table table-striped")
    html_table = html_table.replace("\n", "")
    # Store in mars_fact_data dictionary
    mars_fact_data["mars_facts_table"] = html_table

    #    Mars Hemispheres
    # Base url
    hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    # Retrieve page with the requests module
    response = requests.get(hemispheres_url)
    browser.visit(hemispheres_url)
    time.sleep(3)
    # Get a List of All the Hemispheres
    image_hemisphere_urls = []
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}

        # Find element on each Loop
        browser.find_by_css("a.product-item h3")[item].click()
        time.sleep(3)

        # Identify sample image anchor tag and extract <href>
        sample_element = browser.find_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]

        # Get title for each hemisphere
        hemisphere["title"] = browser.find_by_css("h2.title").text

        # Append the hemisphere object to the list
        image_hemisphere_urls.append(hemisphere)

        # Navigate backwards
        browser.back()

    # Store in mars_fact_data dictionary
    mars_fact_data["image_mars_hemispheres"] = image_hemisphere_urls

    return mars_fact_data
Ejemplo n.º 51
0
from bs4 import BeautifulSoup
import sys, io

sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')

fn = open('cars.html', encoding='utf-8')
soup = BeautifulSoup(fn, 'html.parser')


def car_func(select):
    print('car_func', soup.select_one(select).string)


car_lambda = lambda q: print('car_lambda', soup.select_one(q).string)

car_func('#gr')
car_func('li#gr')
car_func('ul > li#gr')
car_func("#cars > #gr")
car_func("li[id='gr']")

car_lambda('#gr')
car_lambda('li#gr')
car_lambda('ul > li#gr')
car_lambda("#cars > #gr")
car_lambda('#gr')
car_lambda("li[id='gr']")

print('select', soup.select('li')[3].string)
print('find_all', soup.find_all('li')[3].string)
Ejemplo n.º 52
0
    for i in range(0, 50):

        # Open a ChromeDriver session (allow time for resources to load)
        driver = webdriver.Chrome('./chromedriver')

        url = main_url + url_path[i]['url_path']
        driver.get(url)  # open the web page at this URL in the driver
        sleep(1)  # wait 1 second while the page loads

        req = driver.page_source  # grab the page's HTML source
        driver.quit()  # the HTML has been retrieved, so close the driver

        soup = BeautifulSoup(req, 'html.parser')

        header = soup.select_one('#original_header')

        poster_path = header.select_one('div.poster img.poster')['src']
        poster_url = main_url + poster_path

        info = header.select_one('section.header')

        title = info.select_one('h2 > a').text

        runtime = info.select_one('div.facts > span.runtime').text.strip()

        genres = info.select('div.facts > span.genres > a')
        genre = ""

        for g in genres:
            genre = genre + " " + g.text
Ejemplo n.º 53
0
def scrap_orpi(data_folder='data', replace_strategy='abort'):
    """
    Web scraping function for www.orpi.com, meant to retrieve relevant info from property ads
    in Ile-de-France.

    Parameters
    ----------
    data_folder: str, default 'data'
        path of the folder where the data will be written, created when needed
    replace_strategy: str, any from ['abort', 'replace'], default 'abort'
        strategy to follow if a file with the same name as the data file already exists

    Returns
    -------
    None

    """

    BASE_URL = 'https://www.orpi.com/recherche/rent?'
    depts = [
        'paris', 'seine-et-marne', 'yvelines', 'essonne', 'hauts-de-seine',
        'seine-saint-denis', 'val-de-marne', 'val-d-oise'
    ]

    links = {dept: [] for dept in depts}

    print('Getting links to property ads for each département ...')

    for dept in tqdm(depts):
        url = f'{BASE_URL}transaction=rent&resultUrl=&realEstateTypes[0]=maison&realEstateTypes[1]=appartement&locations[0][value]={dept}&agency=&minSurface=&maxSurface=&newBuild=&oldBuild=&minPrice=&maxPrice=&sort=date-down&layoutType=mixte&nbBedrooms=&page=&minLotSurface=&maxLotSurface=&minStoryLocation=&maxStoryLocation='

        driver = webdriver.Firefox()

        driver.get(url)

        # accept cookies
        driver.find_element_by_css_selector('button.c-btn.c-btn--lg').click()

        # append property ads links
        soup = BeautifulSoup(driver.page_source)
        links[dept].extend([
            a.get('href')
            for a in soup.select('a.u-link-unstyled.c-overlay__link')
        ])

        # repeat for every page
        next_page = driver.find_elements_by_css_selector('a.c-pagination__link')[-1] \
                          .find_element_by_css_selector('span') \
                          .text == 'Suivant'
        while next_page:
            driver.find_elements_by_css_selector(
                'a.c-pagination__link')[-1].click()
            next_page = driver.find_elements_by_css_selector('a.c-pagination__link')[-1] \
                              .find_element_by_css_selector('span') \
                              .text == 'Suivant'
            soup = BeautifulSoup(driver.page_source)
            links[dept].extend([
                a.get('href')
                for a in soup.select('a.u-link-unstyled.c-overlay__link')
            ])

        driver.close()

    print('\n')

    word2num = {
        'paris': 75,
        'seine-et-marne': 77,
        'yvelines': 78,
        'essonne': 91,
        'hauts-de-seine': 92,
        'seine-saint-denis': 93,
        'val-de-marne': 94,
        'val-d-oise': 95
    }

    data = []

    for dept in links.keys():
        print(f'Scraping {dept} ...')

        for link in tqdm(links[dept]):

            url = 'https://www.orpi.com' + link
            soup = BeautifulSoup(requests.get(url).content)

            try:
                ref = soup.select_one('span.u-text-xs').text
            except AttributeError:
                continue
            prop_type = soup.select_one('span.u-text-xl').text.replace(
                '\n', '').strip()
            rooms, surface = soup.select_one(
                'span.u-h3.u-color-primary').text.split(' • ')
            city = soup.select_one('span.u-text-lg').text
            price = soup.select_one('span.u-h1').text.replace('\xa0', '')
            descr = soup.select_one(
                'div.c-section__inner div.o-container p').text.replace(
                    '\n', '').strip()
            feats = [span.text for span in soup.select('span.c-badge__text')]
            try:
                conso = soup.select_one(
                    'abbr.c-dpe__index.c-dpe__index--5').text
            except AttributeError:
                conso = ''

            try:
                emiss = soup.select_one(
                    'abbr.c-dpe__index.c-dpe__index--3').text
            except AttributeError:
                emiss = ''

            data.append([
                ref, prop_type, city, word2num[dept], rooms, surface, price,
                descr, conso, emiss, feats
            ])

        print('\n')

    df = pd.DataFrame(data,
                      columns=[
                          'ref', 'prop_type', 'city', 'dept', 'rooms',
                          'surface', 'price', 'descr', 'conso', 'emiss',
                          'feats'
                      ])

    # Check if data file name already exists : if so follow replace_strategy, if not then create it
    filename = f'orpi_{dt.now().year}_{dt.now().month}_{dt.now().day}.csv'
    if not os.path.isfile(os.path.join(data_folder, filename)):
        df.to_csv(os.path.join(data_folder, filename), sep='|', index=False)
    else:
        if replace_strategy == 'abort':
            raise FileExistsError(
                f"File {os.path.join(data_folder, file_name)} already exists. Scraping aborted. To replace the existing file, change replace_strategy to 'replace'."
            )
        elif replace_strategy == 'replace':
            df.to_csv(os.path.join(data_folder, filename),
                      sep='|',
                      index=False)
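
# A minimal usage sketch for scrap_orpi under the same assumptions as above
# (geckodriver on PATH, module-level imports in scope); the default 'abort'
# strategy raises FileExistsError if today's CSV already exists.
if __name__ == '__main__':
    os.makedirs('data', exist_ok=True)
    scrap_orpi(data_folder='data', replace_strategy='abort')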
Ejemplo n.º 54
0
    * Crawling can harm the target site (e.g. by overloading the server), so look into the legal issues carefully.
    * Be especially careful when crawling or scraping for commercial rather than learning purposes.
    * Check robots.txt to see whether crawling is allowed (a robots.txt check sketch follows this example).
        . Search-engine crawler bots consult this file and then decide whether to collect pages.
        . Reference: https://searchadvisor.naver.com/guide/seo-basic-robots
'''

import requests    # access web pages
from bs4 import BeautifulSoup   # parse HTML documents (syntax analysis)

res = requests.get('https://finance.naver.com')
print(res.status_code)
# print(res.text)

bs = BeautifulSoup(res.text, 'lxml')    # parsers: 'lxml', 'html.parser', ...
title = bs.select_one('#container > div.aside > div.group_aside > div.aside_area.aside_popular > h3')
print(title.get_text()) # get_text(): extracts the text of the selected HTML element

# company_name = bs.select_one('#container > div.aside > div.group_aside > div.aside_area.aside_popular > table > tbody > tr.down > th > a')
# print(company_name.get_text())

# company_name = bs.select_one('#container > div.aside > div.group_aside > div.aside_area.aside_popular > table > tbody > tr:nth-of-type(2) > th > a')    # quiz: fetch the HMM row
# print(company_name.get_text())

# Find the parent element
tbody = bs.select_one('#container > div.aside > div.group_aside > div.aside_area.aside_popular > table > tbody')

# Find the child elements
trs = tbody.select('tr')
# print(trs)
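
# A minimal robots.txt check sketch for the guidance above, using only the
# standard library; it asks whether a generic crawler ('*') may fetch the
# site crawled in this example.
from urllib.robotparser import RobotFileParser

rp = RobotFileParser('https://finance.naver.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://finance.naver.com/'))  # True if crawling this path is allowed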
Ejemplo n.º 55
0
def parse_inner2(url):
    session = requests.Session()
    session.max_redirects = 9999999
    dct = {}
    # url = url.strip('"')
    page = session.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")
    Event = soup.select_one("div#calevt_titre").contents[-1].strip()
    Race = soup.select_one(
        "div#race-container h2").contents[-1].strip()  #get_text(strip=True)
    WEBSITE = soup.select_one("div#calevt_titre > div > a.web", href=True)
    img_url = soup.select_one("div#im-container a", href=True)
    All_info = soup.select("div#calevt_fich tr")  #get_text(strip=True)
    lst_1 = []
    dct_2 = {}
    registr_url = ""

    for info in All_info:
        tds = info.select("td")
        if len(tds) == 2:
            dct_2[tds[0].get_text().strip()] = tds[1].get_text().strip()
        elif len(tds) == 1:
            if tds[0].get_text().strip() != "":
                if tds[0].select_one("a", href=True) is not None:
                    registr_url = tds[0].select_one("a", href=True)["href"]
                else:
                    lst_1.append(tds[0].get_text().strip())

    try:
        registr_fee = lst_1[lst_1.index("Registration fees") + 1]
    except ValueError:
        registr_fee = ""

    try:
        registr_open = dct_2["Opening of registration"]
    except:
        registr_open = ""
    try:
        registr_close = dct_2["Closure of registration"]
    except:
        registr_close = ""

    date_time = dct_2["Date and time of start"].split()
    starting_time = date_time.pop().strip()
    location_start = dct_2["Location of start"].split()
    date = " ".join(date_time)

    try:
        starting_point = location_start[0]
    except:
        starting_point = ""

    try:
        country = re.search(r'\((.*?)\)',
                            dct_2["Location of start"].strip()).group(1)
    except:
        country = ""

    try:
        distance = re.sub('[(){}<>]', '', dct_2["Distance"]).split()[0].strip()
    except:
        distance = ""

    try:
        elevation_gain = dct_2["Ascent"].strip()
    except:
        elevation_gain = ""

    try:
        descent = dct_2["Descent"].strip()
    except:
        descent = ""

    try:
        refreshment_points = dct_2["Refreshment points"].strip()
    except:
        refreshment_points = ""

    try:
        time_limit = dct_2["Maximum time"].strip()
    except:
        time_limit = ""

    source_url = url
    try:
        course_url = soup.select_one("div#calevt_fich iframe", src=True)["src"]
    except:
        course_url = ""
    try:
        participants = dct_2["Number of participants"].strip()
    except:
        participants = ""

    description = soup.select("div.content p")
    Description = ""
    if len(description) != 0:
        for c, desc in enumerate(description):
            if c == 1:
                Description += "\nDescription in English\n"
            Description += desc.get_text()
    else:
        Description = ""

    if img_url is not None:
        img_url = domain + img_url["href"]
    else:
        img_url = ""

    if WEBSITE is not None:
        WEBSITE = WEBSITE["href"]
    else:
        WEBSITE = ""

    logo = soup.select_one("div#calevt_lst img", src=True)

    if logo is not None:
        logo_url = logo["src"]
    else:
        logo_url = ""

    th = soup.select_one("div#calevt_fich tr th", onclick=True)

    if th is not None:
        stage = th.select_one("a.rightarr", onclick=True)
        params = eval(stage['onclick'].split(";")[0])
        global sum_distance, sum_elevation_gain, sum_descent, sum_refreshment_points, sum_time_limit

        sum_distance = sum_elevation_gain = sum_descent = sum_refreshment_points = sum_time_limit = 0
        try:

            sum_distance = reprDist(distance)[0]
        except Exception as ex:
            pass

        try:

            sum_elevation_gain = reprDist(elevation_gain)[0]
        except Exception as ex:
            pass

        try:

            sum_descent = reprDist(descent)[0]
        except Exception as ex:
            pass

        try:

            sum_refreshment_points = reprDist(refreshment_points)[0]
        except Exception as ex:
            pass

        try:
            sum_time_limit = get_sec(time_limit)
        except Exception as ex:
            sum_time_limit = 0
            pass

        calcSUMS(params)
        sum_time_limit = get_str(sum_time_limit)

    else:
        sum_distance = distance
        sum_elevation_gain = elevation_gain
        sum_descent = descent
        sum_refreshment_points = refreshment_points
        sum_time_limit = time_limit

    dct["Event"] = Event
    dct["Race"] = Race
    dct["Description"] = Description
    dct["Participants"] = participants
    dct["Registration Opens"] = registr_open
    dct["Registration Closes"] = registr_close
    dct["Entry Fee"] = registr_fee
    dct["Sign Up"] = registr_url
    dct["Date"] = date
    dct["Starting Time"] = starting_time
    dct["Starting Point"] = starting_point
    dct["Country"] = country
    dct["SumDistance"] = sum_distance
    dct["SumElevation Gain"] = sum_elevation_gain
    dct["SumDescent"] = sum_descent
    dct["SumRefreshment Points"] = sum_refreshment_points
    dct["SumTimeLimit"] = sum_time_limit  #
    dct["Website"] = WEBSITE
    dct["CourseUrl"] = course_url
    # dct["CourseFileName"] = "" #
    dct["LogoPicURL"] = logo_url
    dct["ProfilePicURL"] = img_url
    # dct["ProfilePicFile Name"] = "" #
    dct["SourceUrl"] = source_url

    return dct
    "id":ID,
    "pass":PASS,
    "mode":"login_entry_end"
}

res = session.post(url_login, data=login_info)
res.raise_for_status() # for error

print("--------------------------------------------------------")
###############################################################
#                        Print User Name                      #
###############################################################

soup_myage = BeautifulSoup(res.text, "html.parser")

account_href = soup_myage.select_one(".spotlight li a").attrs["href"]
url_account = urljoin(url_login, account_href)

res_account = session.get(url_account)
res_account.raise_for_status()

soup_account = BeautifulSoup(res_account.text, "html.parser")
user_name = str((soup_account.select(".section3 h3"))[0])[4:-5].split("/")[0]

print("Hello "+ user_name + "!")
print("--------------------------------------------------------")
###############################################################
#                        Page Transition                      #
###############################################################
a_list = soup_myage.select(".section.pickup a")
favo_a = ""
Ejemplo n.º 57
0
    def scrap(self,
              paper_views_range=150,
              paper_downloads_range=20,
              pages_min=1,
              pages_max=10,
              counter=1):
        papersList = []
        for disc in self.disciplineLinks:
            for page in range(pages_min, pages_max):
                print("page #" + str(page))
                html = urlopen(disc + str(page))
                print(disc + str(page))
                soup = BS(html, features="lxml")
                elems = [
                    self.baseLink + x['href'] + '/pdf'
                    for x in soup.findAll('a')
                    if x['href'].find("article/n/") != -1
                ]

                for elem in elems:
                    html2 = urlopen(elem[:-4])
                    soup2 = BS(html2, features="lxml")
                    if soup2.select_one(
                            '#body > div.content > div > span > div:nth-child(2) > h1 > i'
                    ) is None:
                        print("Can't collect papers, captcha on CyberLeninka")
                        if papersList:
                            print("Create rawdata.json and stop")
                            with open('rawdata.json', 'w',
                                      encoding='utf-8') as f:
                                json.dump(papersList, f, ensure_ascii=False)
                            return True
                        else:
                            print("No papers was found, try again later")
                            return False
                    paperTitle = soup2.select_one(
                        '#body > div.content > div > span > div:nth-child(2) > h1 > i'
                    ).text
                    paperViews = soup2.select_one(
                        '#body > div.content > div > span > div:nth-child(2) > div.infoblock.authors.visible > div.top-cc > div.statitem.views'
                    ).text
                    paperDownloads = soup2.select_one(
                        '#body > div.content > div > span > div:nth-child(2) > div.infoblock.authors.visible > div.top-cc > div.statitem.downloads'
                    ).text
                    print(paperTitle)
                    journal = [
                        self.baseLink + y['href'] for y in soup2.findAll('a')
                        if y['href'].find("journal/n/") != -1
                    ]
                    html3 = urlopen(journal[0])
                    soup3 = BS(html3, features="lxml")
                    title = soup3.findAll('h1')[0].text
                    statItems = [
                        x.text
                        for x in soup3.findAll("div", {"class": "statitem"})
                    ]

                    if int(paperViews) > paper_views_range and int(
                            paperDownloads) > paper_downloads_range:
                        isGood = 1
                    else:
                        isGood = 0
                    print(isGood)
                    paperObj = {
                        'journalName':
                        title,
                        'journalViews':
                        int(statItems[0]),
                        'journalDownloads':
                        int(statItems[1]),
                        'journalHirch':
                        int(statItems[2]),
                        'paperPath':
                        self.baseFolder + self.baseName + str(counter) +
                        ".pdf",
                        'paperUrl':
                        elem[:-4],
                        'paperTitle':
                        paperTitle,
                        'isGood':
                        isGood
                    }
                    papersList.append(paperObj)
                    fileInfo = urlretrieve(
                        elem, self.baseFolder + self.baseName + str(counter) +
                        ".pdf")
                    counter += 1

        with open('rawdata.json', 'w', encoding='utf-8') as f:
            json.dump(papersList, f, ensure_ascii=False)
        return True
        '.player-progress')
    progress__text_el = player_progress_el.find_element_by_css_selector(
        '.progress__bar.progress__text')

    while True:
        try:
            track_playing_el = driver.find_element_by_css_selector(
                '.d-track_playing')
            track = get_track(track_playing_el)
        except (NoSuchElementException, StaleElementReferenceException):
            continue

        el_html = progress__text_el.get_attribute('outerHTML')
        root = BeautifulSoup(el_html, 'html.parser')

        progress_left = to_seconds(root.select_one('.progress__left').text)
        progress_right = to_seconds(root.select_one('.progress__right').text)
        if progress_right == 0:
            continue

        progress_left_str = seconds_to_str(progress_left)
        progress_right_str = seconds_to_str(progress_right)

        print(
            f'{track.title}. {progress_left_str} / {progress_right_str} ({progress_left / progress_right:.1%})'
        )

        time.sleep(1)

finally:
    if driver:
Ejemplo n.º 59
0
def resume_page(url):
    #=== Check Key ===#
    resume_id = int(re.search('\/\d+\/', url).group(0).replace('/', ''))
    res = retry(url)
    soup = BeautifulSoup(res.content, 'lxml')
    try:
        #=== Check Not Found ===#
        info_sum = soup.select_one(
            "div#box_right div.span11.marL10.padB10.padL20")
        if info_sum == None:
            return 0
        #=== Extract Info Summary ===#
        prog = info_sum.select_one("div#progress_resume div")[
            'style'].strip()[-3:].replace('%', '')
        [div.decompose() for div in info_sum.select("div")]
        [b.decompose() for b in info_sum.select("b")]
        info_sum_list = re.sub(
            '[\s+]', '|', info_sum.text.strip().replace(" ", "")).split("|")
        info_sum_list = list(filter(lambda _: _ != "", info_sum_list))

        #=== Extract Info Resume ===#
        info_main = soup.select_one("div#resumeDT")
        resume_update = info_main.select_one(
            "div.taR.marR10").text.strip().split(":")

        resume_want_dict = {
            'job_type': '',
            'asked_job': [],
            'asked_location': '',
            'asked_salary': '',
            'start_in': '',
            'work_aboard': ''
        }
        resume_want_mapper = {
            'รูปแบบงาน:': 'job_type',
            'สาขาอาชีพ:': 'asked_job',
            'ตำแหน่ง:': 'asked_job',
            'พื้นที่ที่ต้องการทำงาน:': 'asked_location',
            'เงินเดือนที่ต้องการ:': 'asked_salary',
            'ระยะเวลาเริ่มงาน:': 'start_in',
            'ยินดีทำงานต่างประเทศ:': 'work_aboard'
        }
        resume_want = info_main.select_one(
            "div#resume_want").select("div.span11.offset1")
        if len(resume_want) > 0:
            for row in resume_want:
                # pair {occupation, position}
                op_dict = {}
                for col in row.select('.span6'):
                    key = re.sub('[\d.\s+]', '', col.b.text)
                    val = re.sub('[\s+]', '', col.span.text)
                    key = resume_want_mapper[key]
                    if key == 'asked_job':
                        if len(op_dict) == 0:
                            op_dict['occupation'] = val
                        else:
                            op_dict['position'] = val
                            resume_want_dict[key].append(op_dict)
                            op_dict = {}
                    else:
                        resume_want_dict[key] = val
    except Exception as e:
        print("Step info_resume", e)

    #=== Extract Education ===#
    try:
        resume_edu = []
        for dl in info_main.select("dl"):
            key = dl.select_one("dt").text.strip().replace(" :", "")
            val = dl.select_one("dd")
            for span in val.select("span"):
                span.decompose()
            val = re.sub('[\s+]', '|', val.text.strip().replace(" ", ""))
            val_list = list(filter(lambda _: _ != "", val.split("|")))
            resume_edu.append(
                (key, val_list[1:3] + val_list[4:])
            )
    except Exception as e:
        print("Step resume_edu", e)

    #=== Extract Experience ===#
    try:
        resume_exp = info_main.select("div.row-fluid.jsXp_row.padB5")
        if resume_exp != None:
            resume_exp_list = []
            for exp in resume_exp:
                work_info_list = [re.sub('[\s+]', ' ', _.text.strip()) for _ in exp.select_one(
                    "div.o.col000.span6.padV10H20.cor4.bg_lightyellow").select("span.padL10")]
                work_detail = exp.select_one("div.padB10.bb-code").text.strip()
                resume_exp_list.append((work_info_list, work_detail))
    except Exception as e:
        print("Step resume_exp", e)

    #=== Extract Skill[Nosql] ===#
    try:
        resume_skill_dict = {
            'own_vehicle': '',
            'skill_vehicle': '',
            'drive_license': '',
            'skill_lang': [],
            'skill_typing': {'th_wm': '', 'en_wm': ''},
            'skill_other': ''
        }
        resume_skill_mapper = {
            'ยานพาหนะ': 'own_vehicle',
            'ความสามารถในการขับขี่': 'skill_vehicle',
            'ใบอนุญาติขับขี่': 'drive_license',
            'ทักษะทางภาษา': 'skill_lang',
            'ทักษะการพิมพ์ดีด': 'skill_typing',
            'ทักษะอื่นๆ': 'skill_other'
        }
        resume_skill = info_main.select_one("div#resume_skill")
        lang_list = []
        if resume_skill != None:
            for skill_soup in resume_skill.select("div.padV10H20 > div.span11.offset1"):
                # skill_lang
                try:
                    skill_soup['style']
                except KeyError:
                    skill_soup['style'] = None
                if skill_soup['style'] == "float:left":
                    for row in skill_soup.select('div.span11.offset1'):
                        key = row.select_one(
                            '.span2.bg_lightyellow.taCen.o').text.strip()
                        val_list = [lang_mapper[re.sub(
                            '[\s+]', '', _.text).split(':')[1]] for _ in row.select('.pull-left')]
                        resume_skill_dict['skill_lang'].append(
                            {
                                'name': key,
                                'skill': {
                                    "listen": val_list[0],
                                    "speak": val_list[1],
                                    "read": val_list[2],
                                    "write": val_list[3]
                                }
                            }
                        )
                else:
                    # pair {skill_key, skill_val}
                    try:
                        key, val = re.sub(
                            '[\s+]', '', skill_soup.text).split(':')
                    except:
                        # skill_other too many values
                        continue
                    key = resume_skill_mapper[key]
                    if key == 'skill_typing':
                        val_list = re.findall('\d+', val)
                        try:
                            resume_skill_dict[key]['th_wm'] = val_list[0]
                        except:
                            pass
                        try:
                            resume_skill_dict[key]['en_wm'] = val_list[1]
                        except:
                            pass
                    else:
                        resume_skill_dict[key] = val
    except Exception as e:
        print("Step resume_skill", e)

    #=== Extract Main Info ===#
    try:
        # Handle cast String into Integer
        try:
            age = int(info_sum_list[1])
        except:
            age = ''
        try:
            asked_salary = int(
                resume_want_dict['asked_salary'].replace(',', ''))
        except:
            asked_salary = ''
        resume_csv_dict = OrderedDict({
            # Primary key
            '_id': resume_id,
            'resume_modified': resume_update[1].strip(),
            'resume_progress': int(prog),
            'gender': info_sum_list[0].replace('ชาย', 'M').replace('หญิง', 'F'),
            'age': age,
            'exprience': int(info_sum_list[2]),
            'job_type': resume_want_dict['job_type'],
            'asked_job': resume_want_dict['asked_job'],
            'asked_location': resume_want_dict['asked_location'],
            'asked_salary': asked_salary,
            'start_in': resume_want_dict['start_in'],
            'work_aboard': resume_want_dict['work_aboard'],
            'edu_hist': [],
            'exp_hist': [],
            'own_vehicle': resume_skill_dict['own_vehicle'],
            'skill_vehicle': resume_skill_dict['skill_vehicle'],
            'drive_license': resume_skill_dict['drive_license'],
            'skill_lang': resume_skill_dict['skill_lang'],
            'skill_typing': resume_skill_dict['skill_typing'],
            'skill_other': resume_skill_dict['skill_other'],
            'training_hist': []
        })
    except Exception as e:
        print("Step resume_csv", e)

    #=== Extract exp ===#
    try:
        if len(resume_exp_list) > 0:
            for rel in resume_exp_list:
                result_dict = {}
                for exp, _ in zip(exp_mapper, rel[0]):
                    result_dict[exp] = _
                resume_csv_dict['exp_hist'].append(
                    {'exp_info': result_dict, 'exp_detail': rel[1]})
    except Exception as e:
        print("Step exp_csv", e)

    #=== Extract education ===#
    try:
        if len(resume_edu) > 0:
            for redu in resume_edu:
                result_dict = {'edu_year': redu[0].replace(
                    'กำลังศึกษาอยู่', 'Studying')}
                for edu, r in zip(edu_mapper, redu[1]):
                    if edu == "edu_level":
                        try:
                            r = edu_dict[r]
                        except KeyError:
                            # print("KeyError: ", r)
                            r = edu_dict[r.replace(
                                '(กำลังศึกษาอยู่)', '')]
                            r = r + ' (Studying)'
                    result_dict[edu] = r
                resume_csv_dict['edu_hist'].append(result_dict)
    except Exception as e:
        print("Step edu_csv", e)

    #=== Extract training ====#
    try:
        training_list = info_main.select_one(
            "#resume_skill + div").select("div.row-fluid")
        if training_list != None:
            for train in training_list:
                result_dict = {}
                tl = [re.sub('[\s+]', ' ', _.text.strip())
                      for _ in train.select("span")]
                for tm, td in zip(train_mapper, tl):
                    result_dict[tm] = td
                if len(result_dict) == 0:
                    break
                else:
                    resume_csv_dict['training_hist'].append(result_dict)
    except Exception as e:
        print("Step training_csv", e)
    return resume_csv_dict
Ejemplo n.º 60
0
import requests
from bs4 import BeautifulSoup
nickname = input('Enter summoner name: ')
url = f'https://www.op.gg/summoner/userName={nickname}'

response = requests.get(url).text
data = BeautifulSoup(response, "html.parser")
tier = data.select_one("#SummonerLayoutContent > div.tabItem.Content.SummonerLayoutContent.summonerLayout-summary > div.SideContent > div.TierBox.Box > div > div.TierRankInfo > div.TierRank")
# select_one must be called on the parsed document (data), not on the BeautifulSoup class itself
print(tier.text)