def spide(self):
    self.subject = ''
    self.content = ''
    try:
        request = urllib2.Request(self.spiderUrl)
        response = urllib2.urlopen(request)
        soup = BeautifulSoup(response.read().decode('utf-8'))
        self.subject = soup.title.string.output_ready()
        self.subject = self.subject.encode('utf-8')
        self.content = '[b]' + soup.find("div", class_="article-experpt explain").string.output_ready() + '[/b]' + '\n'
        main_body = soup.find("div", class_="js-article-body")
        for child in main_body.children:
            if child.string:
                self.content = self.content + '\t' + child.string.output_ready() + '\n'
            elif child.contents:
                # if child.contents[0].name == 'strong':
                #     self.content = self.content + '[color=Sienna]' + child.string.output_ready() + '[/color]' + '\n'
                if child.contents[0].name == 'img':
                    self.content = self.content + '[align=center][img=660,440]' + child.contents[0]['src'] + '[/img][/align]' + '\n'
        self.content = self.content.encode('utf-8')
        self.content = ('[font=微软雅黑]' + self.content + '[/font]' + '\n\n\n\n ' +
                        '本文转自' + self.spiderUrl + '\n' +
                        '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t-----自动转贴')
        print (self.content)
        return True
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
        return False
def post_von_wagner():
    ''' check if pickle file is already there, else get it '''
    today = datetime.date.today().isoformat()
    if os.path.exists(today + '.p'):
        temp_file = open(today + '.p', 'rb')
        trash = pickle.load(temp_file)
    else:
        temp_file = open(today + '.p', 'wb+')
        ''' get the link to the newest post '''
        URL = "http://www.bild.de/themen/personen/franz-josef-wagner/kolumne-17304844.bild.html"
        r = requests.get(URL)
        soup = BeautifulSoup(r.text)
        URL = 'http://www.bild.de' + soup.find('div', 'tr').find('a').get('href')
        ''' get the text out of the article '''
        r = requests.get(URL)
        soup = BeautifulSoup(r.text)
        trash = soup.find('div', 'txt clearfix').text
        ''' clean everything up and create a list '''
        trash = re.sub(r'[\;\,\(\).\"\@\:\?]', ' ', trash)
        trash = trash.split()
        ''' save trash to pickle file '''
        pickle.dump(trash, temp_file)
    temp_file.close()
    return trash[:-24]
def recognise_eHentai(link, path):
    url = str(link)
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page)
    name = soup.findAll('title')
    name = name[0].get_text().encode('utf-8')
    name = str(name)
    path = path + '\\' + name
    download_eHentai(link, path)
    pages = soup.find_all('span')
    pages = pages[1].get_text()
    pages = int(pages)
    z = 0
    while (pages > z):
        z = z + 1
        sopa = soup.find('div', 'sn')
        sopa = sopa.find_all('a')
        sopa = sopa[2].get('href')
        url = str(sopa)
        download_eHentai(url, path)
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page)
    sopa = soup.find('div', 'sn')
    sopa = sopa.find_all('a')
    sopa = sopa[2].get('href')
    download_eHentai(sopa, path)
def getSCLeg(partyDict):
    houseSoup = BeautifulSoup(urllib2.urlopen('http://www.scstatehouse.gov/member.php?chamber=H&order=D').read())
    senateSoup = BeautifulSoup(urllib2.urlopen('http://www.scstatehouse.gov/member.php?chamber=S&order=D').read())
    houseTable = houseSoup.find('div', {'class': 'mainwidepanel'}).find_all('div', {'style': 'width: 325px; height: 135px; margin: 0 0 0 20px; text-align: left; float: left;'})
    senateTable = senateSoup.find('div', {'class': 'mainwidepanel'}).find_all('div', {'style': 'width: 325px; height: 135px; margin: 0 0 0 20px; text-align: left; float: left;'})
    dictList = []
    for item in houseTable:
        repInfo = {}
        link = item.find('a')
        if link is not None:
            repInfo['Website'] = 'http://www.scstatehouse.gov' + link.get('href')
            repInfo['Name'] = re.sub(r'\[.*$', '', link.string.strip()).strip().replace('  ', ' ').replace('  ', ' ')
            repInfo['Party'] = partyDict[str(re.sub(r'^.*\[(.*)\].*$', r'\1', link.string.strip()))]
        else:
            repInfo['Name'] = 'VACANT'
        repInfo['District'] = 'SC State House ' + re.sub(r'^.*(District [0-9]*).*$', r'\1', item.get_text())
        dictList.append(repInfo)
    for item in senateTable:
        repInfo = {}
        link = item.find('a')
        if link is not None:
            repInfo['Website'] = 'http://www.scstatehouse.gov' + link.get('href')
            repInfo['Name'] = re.sub(r'\[.*$', '', link.string.strip()).strip().replace('  ', ' ').replace('  ', ' ')
            repInfo['Party'] = partyDict[str(re.sub(r'^.*\[(.*)\].*$', r'\1', link.string.strip()))]
        else:
            repInfo['Name'] = 'VACANT'
        repInfo['District'] = 'SC State Senate ' + re.sub(r'^.*(District [0-9]*).*$', r'\1', item.get_text())
        dictList.append(repInfo)
    return dictList
def write(self, caption_set):
    caption_set = deepcopy(caption_set)
    sami = BeautifulSoup(SAMI_BASE_MARKUP, u"lxml-xml")
    caption_set.layout_info = self._relativize_and_fit_to_screen(caption_set.layout_info)
    primary = None
    for lang in caption_set.get_languages():
        self.last_time = None
        if primary is None:
            primary = lang
        caption_set.set_layout_info(lang, self._relativize_and_fit_to_screen(caption_set.get_layout_info(lang)))
        for caption in caption_set.get_captions(lang):
            # Loop through all captions/nodes and apply transformations to
            # layout in function of the provided or default settings
            caption.layout_info = self._relativize_and_fit_to_screen(caption.layout_info)
            for node in caption.nodes:
                node.layout_info = self._relativize_and_fit_to_screen(node.layout_info)
            sami = self._recreate_p_tag(caption, sami, lang, primary, caption_set)
    stylesheet = self._recreate_stylesheet(caption_set)
    sami.find(u"style").append(stylesheet)
    a = sami.prettify(formatter=None).split(u"\n")
    caption_content = u"\n".join(a[1:])
    return caption_content
def get_li(doc):
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    name = []       # movie titles
    star_con = []   # number of ratings
    score = []      # scores
    info_list = []  # short reviews
    for i in ol.find_all('li'):
        detail = i.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()  # movie title
        level_star = i.find('span', attrs={'class': 'rating_num'}).get_text()  # score
        star = i.find('div', attrs={'class': 'star'})
        star_num = star.find(text=re.compile('评价'))  # number of ratings
        info = i.find('span', attrs={'class': 'inq'})  # short review
        if info:  # check whether the movie has a short review
            info_list.append(info.get_text())
        else:
            info_list.append('无')
        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)
    page = soup.find('span', attrs={'class': 'next'}).find('a')  # get the next page
    if page:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url == None:
            return sources
        year = url['year']
        h = {'User-Agent': client.randomagent()}
        title = cleantitle.geturl(url['title']).replace('-', '+')
        url = urlparse.urljoin(self.base_link, self.search_link % title)
        r = requests.get(url, headers=h)
        r = BeautifulSoup(r.text, 'html.parser').find('div', {'class': 'item'})
        r = r.find('a')['href']
        r = requests.get(r, headers=h)
        r = BeautifulSoup(r.content, 'html.parser')
        quality = r.find('span', {'class': 'calidad2'}).text
        url = r.find('div', {'class': 'movieplay'}).find('iframe')['src']
        if not quality in ['1080p', '720p']:
            quality = 'SD'
        valid, host = source_utils.is_host_valid(url, hostDict)
        sources.append({'source': host, 'quality': quality, 'language': 'en',
                        'url': url, 'direct': False, 'debridonly': False})
        return sources
    except:
        print("Unexpected error in Furk Script: check_api", sys.exc_info()[0])
        exc_type, exc_obj, exc_tb = sys.exc_info()
        print(exc_type, exc_tb.tb_lineno)
        return sources
def get_links_from(channel, pages):
    # http://bj.ganji.com/jiaju/a3o11/
    # http://bj.ganji.com/wupinjiaohuan/o3/   # two different URL patterns
    if channel in ['http://bj.ganji.com/xuniwupin/', 'http://bj.ganji.com/qitawupin/',
                   'http://bj.ganji.com/ershoufree/', 'http://bj.ganji.com/wupinjiaohuan/']:
        list_view = '{}o{}/'.format(channel, str(pages))
        wb_data = requests.get(list_view, headers=headers)
        # time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('ul', 'pageLink clearfix'):
            for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dt > div > a'):
                item_link = link.get('href')
                url_list.insert_one({'url': item_link})
                print(item_link)
        else:
            # pass
            print('重复页面')  # duplicate page
    else:
        list_view = '{}a3o{}/'.format(channel, str(pages))
        wb_data = requests.get(list_view, headers=headers)
        # time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('ul', 'pageLink clearfix'):
            for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'):
                item_link = link.get('href')
                url_list.insert_one({'url': item_link})
                print(item_link)
        else:
            # pass
            print('重复页面')  # duplicate page
def parse_round(bsoup, rnd, gid, airdate):
    """Parses and inserts the list of clues from a whole round"""
    round_id = "jeopardy_round" if rnd == 1 else "double_jeopardy_round"
    r = bsoup.find(id=round_id)
    # the game may not have all the rounds
    if not r:
        return False
    # the list of categories for this round
    categories = [c.get_text() for c in r.find_all("td", class_="category_name")]
    # the x_coord determines which category a clue is in
    # because the categories come before the clues, we will
    # have to match them up with the clues later on
    x = 0
    for a in r.find_all("td", class_="clue"):
        if not a.get_text().strip():
            continue
        value = a.find("td", class_=re.compile("clue_value")).get_text().lstrip("D: $")
        value = re.sub('[:$,]', '', value)
        text = a.find("td", class_="clue_text").get_text()
        answerDiv = BeautifulSoup(a.find("div", onmouseover=True).get("onmouseover"), "lxml")
        answer = answerDiv.find("em", class_="correct_response").get_text()
        right = answerDiv.find("td", class_="right")
        if right == None:
            right = "Triple Stumper"
        else:
            right = right.get_text()
        insert([gid, airdate, rnd, categories[x], value, text, answer, right])
        x = 0 if x == 5 else x + 1
    return True
def scrape_song_metadata(soup, verbose=False):
    result = {}
    first_soup = soup.find("div", {"class": "song_header-primary_info"})
    first_soup = BeautifulSoup(soup.prettify(), "html.parser")
    artist = first_soup.find("a", {"class": "song_header-primary_info-primary_artist"})
    artist = clean_text(artist.string)
    if verbose:
        print "Artist : " + artist.encode('utf-8')
    result["artist"] = artist
    song = first_soup.find("h1", {"class": "song_header-primary_info-title"})
    song = clean_text(song.string)
    if verbose:
        print "Song : " + song.encode('utf-8')
    result["song"] = song
    labels = first_soup.findAll("span", {"class": "song_info-label"})
    labels = [clean_text(l.string) for l in labels]
    contents = first_soup.findAll("span", {"class": "song_info-info"})
    contents = [BeautifulSoup(c.prettify(), "html.parser") for c in contents]
    contents = [c.a for c in contents]
    for i in range(len(labels)):
        if contents[i]:
            if verbose:
                print labels[i] + " :"
                print " " + clean_text(contents[i].string).encode('utf-8')
                print " " + contents[i]['href'].encode('utf-8')
            result[labels[i]] = {"name": clean_text(contents[i].string),
                                 "link": geniusify(contents[i]['href'])}
    return result
def process_html(self, url, html):
    soup = BeautifulSoup(html)
    details_url = re.sub(r'#.+', '', url, re.I | re.S) + '?utm_source=Muzei&utm_campaign=Muzei'
    title = soup.find(itemprop='name').get_text()
    author = soup.find(itemprop='author').get_text()
    completion_year_el = soup.find(itemprop='dateCreated')
    byline = author + ((', ' + completion_year_el.get_text()) if completion_year_el else '')
    image_url = soup.find(id='paintingImage')['href']
    if not title or not author or not image_url:
        self.response.out.write('Could not parse HTML')
        self.response.set_status(500)
        return
    publish_date = (datetime.datetime
                    .utcfromtimestamp(int(self.request.get('publishDate')) / 1000)
                    .date())
    image_url, thumb_url = maybe_process_image(
        image_url, NO_CROP_TUPLE,
        publish_date.strftime('%Y%m%d') + ' ' + title + ' ' + byline)
    # create the artwork entry
    new_artwork = FeaturedArtwork(
        title=title,
        byline=byline,
        image_url=image_url,
        thumb_url=thumb_url,
        details_url=details_url,
        publish_date=publish_date)
    new_artwork.save()
    self.response.set_status(200)
def getLinksFromWooyun(html):
    soup = BeautifulSoup(html)
    soup = soup.find('div', class_="content")
    soup = soup.find('table', class_="listTable")
    html = soup.find('tbody')
    if not html:
        now = time.strftime('%H:%M:%S', time.localtime(time.time()))
        print "[" + str(now) + "] [WARNING] failed to crawl"
    else:
        html_doc = html.find_all('tr')
        if not html_doc:
            now = time.strftime('%H:%M:%S', time.localtime(time.time()))
            print "[" + str(now) + "] [WARNING] failed to crawl"
        else:
            for doc in html_doc:
                try:
                    td = doc.find_all('td')[2]
                    atag = td.find('a')
                    link = atag.get('href').strip()
                    if not isExisted(link, 'wooyun.txt'):
                        logfile(link, 'wooyun.txt')
                        now = time.strftime('%H:%M:%S', time.localtime(time.time()))
                        print "[" + str(now) + "] [INFO] " + link
                    else:
                        now = time.strftime('%H:%M:%S', time.localtime(time.time()))
                        print "[" + str(now) + "] [WARNING] url is duplicate [" + link + "]"
                except Exception:
                    pass
def get_parteredmenyek(content):
    results = list()
    soup = BeautifulSoup(content)
    registry_table = soup.find('p', text='a) A választók nyilvántartása').find_next('table')
    row = registry_table.find_all('tr')[3]
    total = row.find_all('td')[4].text.replace(' ', '')
    voter_table = registry_table.find_next('table')
    row = voter_table.find_all('tr')[3]
    # voters = row.find_all('td')[4].text.replace(' ', '')
    # non_voters = int(total) - int(voters[0].replace(' ', ''))
    non_voters = 0
    nonvoter = dict()
    nonvoter['statistics_code'] = 'non-voters'
    nonvoter['statistics_name'] = 'Non voters'
    nonvoter['value'] = non_voters
    results.append(nonvoter)
    lista_table = soup.find('p', text='Érvényes szavazatok száma:').find_next('table')
    rows = lista_table.find_all('tr')
    for row in rows[1:]:
        result = {}
        cells = row.find_all('td')
        result['statistics_code'] = slugify(cells[1].text)
        result['statistics_name'] = cells[1].text
        result['value'] = cells[2].text.replace(' ', '')
        # result['percent'] = cells[3].text.replace('.', '')
        results.append(result)
    return results
def attachTranslateAndAppreciation(dataSession, postId, shangxiUrls, fanyiUrls):
    # First, add the translation ("fanyi") records
    if len(fanyiUrls) is not 0:
        for i, translateUrl in enumerate(fanyiUrls):
            html = requests.get(translateUrl).content
            soup = BeautifulSoup(html, "lxml")
            contentList = soup.find('div', attrs={'class': 'shangxicont'}).find_all('p')[1:-1]
            translateText = ''
            for contentElement in contentList:
                translateText += contentElement.get_text()
            if len(translateText) is not 0:
                translatePost = translatePosts(postId=postId, translateUrl=translateUrl,
                                               translateText=translateText, rate=i + 1)
                dataSession.add(translatePost)
                dataSession.commit()
    # Then, add the appreciation ("shangxi") records
    if len(shangxiUrls) is not 0:
        for i, appreciationUrl in enumerate(shangxiUrls):
            html = requests.get(appreciationUrl).content
            soup = BeautifulSoup(html, "lxml")
            contentList = soup.find('div', attrs={'class': 'shangxicont'}).find_all('p')[1:-1]
            appreciationText = ''
            for contentElement in contentList:
                appreciationText += contentElement.get_text()
            if len(appreciationText) is not 0:
                appreciationPost = appreciationPosts(postId=postId, appreciationUrl=appreciationUrl,
                                                     appreciationText=appreciationText, rate=i + 1)
                dataSession.add(appreciationPost)
                dataSession.commit()
def get_article(article_id, abs_file_path):
    '''Fetch an article; return True on success, or False if it is not within the last week'''
    article_url = 'http://www.tuicool.com/articles/{}'.format(article_id)
    try:
        print article_url
        common.rand_sleep(5, 10)
        res = l.session.get(article_url)
        logging.info('return url {} success'.format(res.url))
        soup = BeautifulSoup(res.text, 'html.parser')
        title = str(soup.find('div', class_='article_detail_bg').find('h1')
                    .get_text())
        print title
        pub_time = re.sub(re.compile('时间[\s\S]{2}'), '',
                          str(soup.find('span', class_='timestamp').get_text()).strip())
        keywords = [str(item.get_text())
                    for item in soup.find_all('span', class_='new-label')]
        content = str(soup.find('div', class_='article_body'))
        # only keep articles published within the last week
        timedelta = datetime.date.today() - datetime.datetime\
            .strptime(pub_time, '%Y-%m-%d %H:%M:%S').date()
        if timedelta.days > 7:
            return False
        with open(abs_file_path, 'w') as f:
            f.write('标题:' + title + '\n')
            f.write('发布时间:' + pub_time + '\n')
            f.write('关键字:' + ', '.join(keywords) + '\n')
            f.write('内容:' + content + '\n')
        return True
    except Exception, e:
        print Exception, e
        logging.error('run error', exc_info=True)
        return False
def sort_articles(path="../elife-articles/"):
    # Get all xml files from path
    path_files = [f for f in listdir(path) if isfile(join(path, f))] or []
    xml_files = filter(lambda x: '.xml' in x, path_files)
    if len(xml_files) > 0:
        print "Total number of articles: %d" % len(xml_files)
        counter, errors = 0, 0
        for article in xml_files:
            # For each xml file add info to the ARTICLES dictionary
            try:
                soup = BeautifulSoup(open(path + article), ["lxml", "xml"])
                journal_id = soup.find("journal-id").string
                pub_id = soup.find("article-id").string
                subjects = soup.find_all("subj-group")
                auth_sn = soup.find("surname").string
                auth_fn = soup.find("given-names").string
                if len(subjects) == 2:
                    counter += 1
                    unique_subj = subjects[1].string
                    if not unique_subj in ARTICLES.keys():
                        ARTICLES[unique_subj] = [{"file": article,
                                                  "journal_id": journal_id,
                                                  "pub_id": pub_id,
                                                  "first_auth": "%s, %s" % (auth_sn, auth_fn)}]
                    else:
                        ARTICLES[unique_subj].append({"file": article,
                                                      "journal_id": journal_id,
                                                      "pub_id": pub_id,
                                                      "first_auth": "%s, %s" % (auth_sn, auth_fn)})
            except:
                errors += 1
                pass
        print "There are %d articles out of %s with a single subject." % (counter, len(xml_files))
        print "There is/are %d exception(s). \n" % errors
def saveNovelInfo(novelInfo):
    html = urlopen(novelInfo.website + novelInfo.novelId)
    bsObj = BeautifulSoup(html, "lxml")
    novelInfo.name = bsObj.h2.get_text()
    cover = bsObj.find("div", {"class": "cover"})
    novelInfo.avatar = cover.find("img").attrs['src']
    small = bsObj.find("div", {"class": "small"})
    for child in small.children:
        label = child.get_text().split(":")[0]
        value = child.get_text().split(":")[1]
        if label == "作者":
            novelInfo.author = value
        elif label == "分类":
            novelInfo.category = value
        elif label == "状态":
            novelInfo.state = value
        elif label == "字数":
            novelInfo.words = value
        elif label == "更新时间":
            novelInfo.latest_updatetime = value
        elif label == "最新章节":
            novelInfo.latest_chapter = value
    intro = bsObj.find("div", {"class": "intro"})
    introArray = intro.get_text().split(":")
    novelInfo.intro = introArray[1].replace("\u3000", "")
def manga_parse(html, source):
    if not html:
        Logger.info('html is empty!')
        return None
    soup = BeautifulSoup(html, from_encoding='gbk')
    # Logger.debug(soup.prettify())
    intro = soup.find(id='intro_l')
    title = intro.find('h1').string.decode('utf-8').encode('utf-8')
    search = soup.find_all('p', attrs={'class': 'w260'})
    up_time = search[0].find('span').string
    tmp = search[1].contents
    if (len(tmp) > 1):
        author = tmp[1]
    else:
        author = ''
    added_time = search[2].contents[1]
    cover = soup.find('div', attrs={'class': 'info_cover'}).p.img['src']
    intro = soup.find('div', id='intro1').p.string
    if (intro):
        intro = intro.strip()
    up_time = datetime.strptime(up_time, '%Y-%m-%d').date()
    added_time = datetime.strptime(added_time, '%Y-%m-%d').date()
    plot = plot_parse(soup, source)
    return Manga(
        added_at=added_time, update_at=up_time, name=title, author=author,
        introduction=intro, poster=cover, source=source, plot=plot)
def write(self, captions, force=''):
    dfxp = BeautifulSoup(dfxp_base, 'xml')
    dfxp.find('tt')['xml:lang'] = "en"
    for style, content in captions['styles'].items():
        if content != {}:
            dfxp = self._recreate_styling_tag(style, content, dfxp)
    body = dfxp.find('body')
    if force:
        captions['captions'] = self._force_language(force, captions['captions'])
    for lang in captions['captions']:
        div = dfxp.new_tag('div')
        div['xml:lang'] = '%s' % lang
        for sub in captions['captions'][lang]:
            p = self._recreate_p_tag(sub, dfxp)
            div.append(p)
        body.append(div)
    return unicode(dfxp.prettify(formatter=None))
def create_post_data(self, username, password, flag):
    if flag == "no_captcha":
        try:
            r = session.get(BASE_URL)
        except requests.exceptions.ConnectionError as err:
            print("无网络连接,程序退出")  # no network connection, exiting
            sys.exit()
        base = r.content.decode('utf-8')
        base_soup = BeautifulSoup(base, 'lxml')
        lt = base_soup.find('input', attrs={'type': 'hidden', 'name': 'lt'})['value']  # timestamp token
        self.__POST_DATA['lt'] = lt
        self.__POST_DATA['username'] = username
        self.__POST_DATA['password'] = password
    elif flag == "captcha":
        r = session.get(BASE_URL)
        base = r.content.decode('utf-8')
        base_soup = BeautifulSoup(base, 'lxml')
        lt = base_soup.find('input', attrs={'type': 'hidden', 'name': 'lt'})['value']
        payload = collections.OrderedDict()
        payload['username'] = username
        payload['_'] = lt
        self.get_captcha(payload)
        captcha = get_input("请输入验证码: ", "captcha")  # prompt: enter the captcha
        self.__POST_DATA_WITH_CAPTCHA['captcha'] = captcha
        self.__POST_DATA_WITH_CAPTCHA['lt'] = lt
        self.__POST_DATA_WITH_CAPTCHA['username'] = username
        self.__POST_DATA_WITH_CAPTCHA['password'] = password
def get_bible_hub_verse(self, verse):
    """ Retrieves the text for a user-supplied verse selection that can be found on Bible Hub. """
    url = ("http://biblehub.com/%s/%s/%d.htm"
           % (verse.translation.lower(), verse.book.lower().replace(" ", "_"), verse.chapter))
    page = urlopen(url)
    soup = BeautifulSoup(page.read())
    verses = soup.find("div", {"class": "chap"})
    if len(verses) < 1:
        return None, None, None
    for cur_verse in verses.findAll("b"):
        cur_verse.decompose()
    text = verses.get_text()
    trans_title = soup.find("div", {"class": "vheading"}).get_text()
    verse_list = text.splitlines()
    contents = ""
    for i, val in enumerate(verse_list):
        verse_num = i + 1
        if verse.start_verse == 0:
            contents += ("[**%d**] %s " % (verse_num, val))
        else:
            if (verse_num >= verse.start_verse
                    and (verse.end_verse == 0 or verse_num <= verse.end_verse)):
                contents += ("[**%d**] %s " % (verse_num, val))
    return contents, trans_title, url
def add_post(self, pid):
    """ View a post (extract its date and tags) """
    if (self.get_post(pid)):
        return
    print('-' * 10, 'http://habrahabr.ru/post/' + str(pid), '-' * 10)
    cur = self.con.execute("select pid from %s where %s=%d" % ('post_tags', 'pid', pid))
    res = cur.fetchone()
    if res == None:
        try:
            soup = BeautifulSoup(urllib.request.urlopen('http://habrahabr.ru/post/' + str(pid)).read())
        except (urllib.request.HTTPError):
            self.add_tag(pid, "parse_error_404", "")
            print("error 404")
        else:
            published = soup.find("div", {"class": "published"})
            tags = soup.find("ul", {"class": "tags"})
            if tags:
                for tag in tags.findAll("a"):
                    self.add_tag(pid, tag.string, get_date(published.string))
            else:
                self.add_tag(pid, "parse_access_denied", "")
                print("access denied")
    else:
        print("post has already been added")
def review_info_from_url_review(url):
    # output review as a dict: review_id, review_title, review_body, review_rate, review_restaurantID
    # input is url of one review: g*+d*+r*
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html)
    review_info = {'review_body': '', 'review_title': '', 'review_id': '',
                   'review_restaurantID': '', 'review_rate': ''}
    url_review_split = url.split('-')
    review_attID_part = url_review_split[2]
    review_attrID = review_attID_part.replace('d', '')
    review_info['review_restaurantID'] = int(review_attrID.encode('utf8'))
    review_id_part = url_review_split[3]
    review_id = review_id_part.replace('r', '')
    review_info['review_id'] = int(review_id.encode('utf8'))
    title_node = soup.find('title')
    review_info['review_title'] = ((((title_node.text).encode('utf8')).split('-'))[0]).strip()
    # rate_node = soup.find_all('img', {'class': 'sprite-rating_no_fill rating_no_fill no50'})
    # review_info['review_rate'] = float((rate_node[0]['alt']).encode('utf8'))
    try:
        rate_node = soup.find('img', {'property': 'v:rating'})
        review_rate = float(rate_node['content'])
        review_info['review_rate'] = review_rate
    except:
        review_info['review_rate'] = 0
    try:
        review_node = soup.find('p', id=True, property=True)
        review_info['review_body'] = (review_node.text).encode('utf8')
    except AttributeError:
        review_info['review_body'] = 'NA'
    return review_info
def get_image_info(client, my_massage):
    """ Parse HTML page and extract featured image name and link """
    # Get Javascript updated HTML page
    response = client.commands.getPageText()
    # print 'type(response): ', type(response)
    # print 'response: ', response
    ## fff = open('te.txt', 'w+')
    ## fff.write(str(response))
    ## fff.close()
    ## assert response['status']
    ## assert response['result']
    # Create soup from HTML page and get desired information
    # soup = BeautifulSoup(response['result'], markupMassage=my_massage)
    # soup = BeautifulSoup(response['result'], markup="")
    soup = BeautifulSoup(response['result'])
    name = soup.find(id='caption_region').h3.string
    link = urlparse.urljoin('http://www.nasa.gov',
                            soup.find(attrs='Full_Size')['href'])
    print name
    print link
    image_info = {'name': name, 'link': link}
    return image_info
def parse_module_detail_page(self, url):
    """ Parse a module detail page and return its extracted properties. """
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.content)

        def get_course(row):
            return re.match('(.*)\(', row.find('strong').text).group(1).strip()

        table = soup.find('tbody', id=re.compile('^modul'))
        course_table = soup.find('tbody', id=re.compile('^kategorieZuordnungen'))
        course_rows = course_table.find_all('div', {'class': 'katZuordnung'})
        courses = {get_course(row) for row in course_rows}
        ects_points_row = table.find('tr', id=re.compile('^Kreditpunkte'))
        objectives_row = table.find('tr', id=re.compile('^Lernziele'))
        lecturer_row = table.find('tr', id=re.compile('^dozent'))
        return {
            'ects_points': int(ects_points_row.find_all('td')[1].text),
            'objectives': objectives_row.find_all('td')[1].string,
            'lecturer': lecturer_row.find_all('td')[1].text,
            # Skip all courses that are only specialisations and not real courses
            'courses': {c for c in courses if c not in course_specialisations},
        }
    except KeyboardInterrupt:
        self.stderr.write('Abort.')
        sys.exit(1)
    except:
        self.stderr.write("Could not parse {0}: {1}".format(url, sys.exc_info()[0]))
def main():
    venues_file_name = 'venues%s.csv' % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    venue_team_file_name = 'venue_team%s.csv' % datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(venues_file_name, 'wb') as f, open(venue_team_file_name, 'wb') as f2:
        writer = csv.writer(f)
        writer.writerow(['venue_id', 'name', 'country', 'address', 'zipcode', 'city', 'fax', 'email',
                         'website', 'phone', 'openend', 'architect', 'capacity', 'surface', 'facts'])
        writer2 = csv.writer(f2)
        writer2.writerow(['venue_id', 'team_id'])
        for x in range(0, 20000):
            print x
            if requests.head(BASE_URL % x).status_code == requests.codes.ok:
                r = requests.get(BASE_URL % x)
                soup = BS(r.text, 'html.parser')
                venue_data = []
                venue_data.append(x)  # venue_id
                name_node = soup.select('#subheading > h1')
                venue_data.append(name_node[0].text.encode('utf-8') if name_node else '')
                venue_data.append(get_detail('Founded', soup))
                venue_data.append(get_detail('Address', soup))
                venue_data.append(get_detail('Country', soup))
                venue_data.append(get_detail('Phone', soup))
                venue_data.append(get_detail('Fax', soup))
                venue_data.append(get_detail('E-mail', soup))
                website_node = soup.find('a', text='Official website')
                venue_data.append(website_node['href'].encode('utf-8') if website_node else '')
                venue_data.append(soup.find('div', 'logo').img['src'].encode('utf-8'))
                venue_data.append(get_venue_id(x))
                print name_node[0].text
                writer.writerow(venue_data)
async def _timein_country(self, country_code):
    """Get time using country code

    country_code is a 2 letter country code from this list
    https://timezonedb.com/country-codes or a custom shortcut code

    Preset shortcuts:
    UK  - United Kingdom (converts to GB)
    USE - United States East (New York)
    USW - United States West (Los Angeles)
    """
    apiKey = self.settings['api_key']
    if ".com" in apiKey:
        await self.bot.say("You have to set your API key, see data/timein/settings.json for details")
        return
    url = 'http://api.timezonedb.com/v2/list-time-zone?key=' + apiKey + '&format=xml'
    flag = ':flag_'
    if country_code.lower() == 'use':
        url += '&country=US&zone=*New_York*'
        flag += 'us: EAST '
    elif country_code.lower() == 'usw':
        url += '&country=US&zone=*Los_Angeles*'
        flag += 'us: WEST '
    elif country_code.lower() == 'test':
        url += '&zone=*auckland*'
        flag += 'nz: '
    elif len(country_code) != 2 or ' ' in country_code:
        await self.bot.say("Country code must be 2 letters and from this list https://timezonedb.com/country-codes")
        return
    else:
        if country_code == 'UK' or country_code == 'uk':
            country_code = 'GB'
        url += '&country=' + country_code
        flag += country_code.lower() + ': '
    async with aiohttp.get(url) as response:
        soupObject = BeautifulSoup(await response.text(), "html.parser")
    message = ''
    status = soupObject.find('status').get_text()
    if status != 'OK':
        message += 'Request failed. Details:\n```'
        message += status + '\n'
        message += soupObject.find('message').get_text()
        message += '```\nMake sure country code is from the list at https://timezonedb.com/country-codes'
    else:
        zones = soupObject.find_all('zone')
        for zone in zones:
            newmessage = ''
            newmessage += flag
            newmessage += zone.find('countryname').get_text() + '\n'
            newmessage += zone.find('zonename').get_text() + '\n'
            unixtime = zone.find('timestamp').get_text()
            prettyTime = datetime.datetime.fromtimestamp(int(unixtime)).strftime('%Y-%m-%d %H:%M:%S')
            newmessage += prettyTime + '\n'
            message += newmessage + '\n'
    await self.bot.say(message)
def http(url):
    html = requests.get(url).text
    soup_main = BeautifulSoup(html)
    # the daily quote text from "ONE"
    div = soup_main.find_all("div", {"class": "fp-one-cita"})
    text = div[0].a.text
    # print(text)
    # the image URL from "ONE"
    img_list = soup_main.find_all("img", {"class": "fp-one-imagen"})
    imgUrl = img_list[0].get('src')
    # print(imgUrl)
    # the issue title from "ONE"
    title_list = soup_main.find_all("p", {"class": "titulo"})
    title = str(title_list[0].text)
    print(title)
    # the article page of this "ONE" issue, e.g. vol.1132#articulo
    url_stroy = 'http://wufazhuce.com/one/' + title + '#articulo'
    soup_stroy = BeautifulSoup(requests.get(url_stroy).text)
    stroy_content = str(soup_stroy.find("div", {"class": "articulo-contenido"}))
    stroy_title = str(soup_stroy.find("h2", {"class": "articulo-titulo"}))
    stroy = stroy_title + stroy_content
    for addr in to_addr:
        sendEmail(text, imgUrl, title, stroy, addr)
def submission(csv_filename, compress):
    global competition_name
    filename = csv_filename
    if compress == True:
        # zip the csv before uploading
        filename = csv_filename + ".zip"
        with ZipFile(filename, 'w') as myzip:
            myzip.write(csv_filename)
    r_pre = session.get('https://www.kaggle.com/c/%s/submissions/attach' % competition_name)
    soup = BeautifulSoup(r_pre.content, 'html.parser')
    token = soup.find('input', {'name': '__RequestVerificationToken'})['value']
    competition_id = soup.find('input', {'name': 'CompetitionId'})['value']
    payload = {
        'CompetitionId': competition_id,
        '__RequestVerificationToken': token,
        'IsScriptVersionSubmission': 'False',
        'SubmissionDescription': 'This-is-description!'
    }
    files = {
        'SubmissionUpload': open(filename, 'rb')
    }
    r = session.post('https://www.kaggle.com/competitions/submissions/accept',
                     data=payload, files=files)
    if r.status_code == 200:
        print("Submission Succeed")
        return True
    print("Submission Failed")
    return False
def getallview():
    nums = 27  # number of people I follow
    followees_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Referer': 'https://www.zhihu.com/people/GitSmile/followees',
        'Origin': 'https://www.zhihu.com',
        'Accept-Encoding': 'gzip, deflate, br',
        'CG - Sid': '57226ad5 - 793b - 4a9d - 9791 - 2a9a17e682ef',
        'Accept': '* / *'
    }
    count = 0
    for index in range(0, nums):
        fo_url = 'https://www.zhihu.com/node/ProfileFolloweesListV2'
        m_data = {
            'method': 'next',
            'params': '{"offset":' + str(index) + ',"order_by":"created","hash_id":"de2cb64bc1afe59cf8a6e456ee5eaebc"}',
            '_xsrf': str(getxsrf())
        }
        result = session.post(fo_url, data=m_data, headers=followees_headers)
        dic = json.loads(result.content.decode('utf-8'))
        li = dic['msg'][0]
        mysoup = BeautifulSoup(li, 'html.parser')
        for result in mysoup.findAll('a', attrs={'class': 'zm-item-link-avatar'}):
            print(index + 1)
            print(result.get('title'))
            href = str(result.get('href'))
            print(mysoup.find('a', attrs={'href': href + '/followers'}).text)
            print(mysoup.find('a', attrs={'href': href + '/asks'}).text)
            print(mysoup.find('a', attrs={'href': href + '/answers'}).text)
            print(mysoup.find('a', attrs={'href': href, 'class': 'zg-link-gray-normal'}).text + '\n')
            count += 1
    print('一共关注了 %d人' % count)  # total followee count
roic = x.get_ROIC()
mcap = x.get_marketCap_B()
cashflow = (x.get_totalCashFromOperatingActivities())
priceSales = x.get_pricetoSales()
s = {ticker: {'market cap': mcap, 'value': price[0], 'price': stickerPrice, 'current EPS': cEPS,
              'book value anual': bv, 'book value growth': growth, 'book value quater': bvq,
              'book value quater growth': qgrowth[1], 'roic': roic, 'cashflow': cashflow,
              'priceSales': priceSales}}
# https://finance.yahoo.com/quote/DDD/profile?p=DDD
url = 'https://finance.yahoo.com/quote/' + ticker + '/profile?p=' + ticker
data = urllib2.urlopen(url)
soup = BeautifulSoup(data, features='lxml')
summarys = soup.find('p', class_={'Mt(15px) Lh(1.6)'})
summary = summarys.get_text()
s[ticker]['business summary'] = summary
url = 'https://finance.yahoo.com/quote/' + ticker + '?p=' + ticker + '.tsrc=fin-srch'
data = urllib2.urlopen(url)
time.sleep(0.5)
soup = BeautifulSoup(data, features='lxml')
divs = soup.find('div', attrs={'id': 'quoteNewsStream-0-Stream-Proxy'})
div = divs.find('div', attrs={'id': 'quoteNewsStream-0-Stream'})
ul = div.find('ul')
lis = ul.findAll('li')
hls = []
count = 0
news = {'news': {}}
for li in lis:
# for i in range(1, 10):
#     url = f'http://www.oaontc.ru/services/registers/lnk/?&page={i}'
#     print(url)

url = "http://www.oaontc.ru/services/registers/lnk/"
req = requests.get(url, headers=headers)
src = req.text
print(src)

with open('index.html', 'w') as file:  # save the page to a file
    file.write(src)

with open('index.html') as file:  # read the saved page back
    src = file.read()

soup = BeautifulSoup(src, 'lxml')
all_hrefs = soup.find(class_="textpage docs").find_all('a')
print(all_hrefs)

all_dict = {}
for item in all_hrefs:
    item_text = item.text
    item_href = 'http://www.oaontc.ru' + item.get('href')
    all_dict[item_text] = item_href

with open('all_dict.json', 'w') as file:
    json.dump(all_dict, file, indent=4, ensure_ascii=False)

with open('all_dict.json') as file:
    all_categories = json.load(file)
print(all_categories)
from typing import List

from bs4 import BeautifulSoup


def get_links(soup: BeautifulSoup) -> List[str]:
    list_href = []
    articles = soup.find('div', class_='view-content').find_all('a')
    for article in articles:
        list_href.append('https://www.ffa-assurance.fr' + article.get('href'))
    return list_href
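# A minimal usage sketch for get_links (assumptions: `requests` is installed and the
# listing URL below is hypothetical, not taken from the original snippet; the page must
# expose a div.view-content block of <a> links as the function above expects).
import requests

if __name__ == '__main__':
    response = requests.get('https://www.ffa-assurance.fr/actualites')  # hypothetical listing URL
    listing_soup = BeautifulSoup(response.text, 'html.parser')
    for href in get_links(listing_soup):
        print(href)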
from youtube_dl import YoutubeDL
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pyexcel

url = "https://www.apple.com/itunes/charts/songs/"
conn = urlopen(url)
raw_data = conn.read()
html_page = raw_data.decode('utf-8')

# cache the raw page locally
f_conn = open('itunecharts.html', 'wb')
f_conn.write(raw_data)
f_conn.close()

soup = BeautifulSoup(html_page, "html.parser")
section = soup.find('section', 'section chart-grid')
li_list = section.find_all('li')

rank_list = []
for li in li_list:
    rank = li.strong.string
    song = li.h3.a.string
    artist = li.h4.a.string
    ranking = {
        "Rank": rank,
        "Song": song,
        "Artist": artist,
    }
    rank_list.append(ranking)

print(rank_list)
pyexcel.save_as(records=rank_list, dest_file_name='itune_rank.xlsx')
soup2 = BeautifulSoup(page2, "html.parser")
for table in soup2.findAll("table")[0:10]:  # 2014-2005
    tbody = table.find("tbody")
    # object that tells me which team, what year,
    # how many players were picked that year
    i = 0  # which column in the row is selected
    playerList = []
    for row in tbody.findAll("td"):
        i += 1
        if (i % 5 == 1):
            player = {}
            title = table.find("tr", {'class': 'thd1'})
            player['year'] = title.find("td").contents[0][:4]
            player['team'] = soup2.find('option', selected=True).getText()
            player['teamAbbr'] = teamExtraInfos[player['team']]["abbr"]
            player['division'] = teamExtraInfos[player['team']]["division"]
            player['status'] = 'N/A'
            draftsJson.append(player)
        switcher = {
            1: roundNum,
            2: selNum,
            3: name,
            4: position,
            0: school
        }
        switcher[i % 5](player, row)

for urlR in urlListRoster:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html, "html.parser")
# for link in bsObj.findAll("a"):
for link in bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])
from bs4 import BeautifulSoup
import urllib2
import urllib
import os
import getxml

# create connection with main page
url = 'http://www.heart.org/HEARTORG/General/State-by-State-NIH-Allocations_UCM_440585_Article.jsp'
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), 'lxml')

# get all the links to states
div = soup.find('div', class_='content')
table = div.find('table', width=400)
for row in table.findChildren('tr'):
    for cell in row.findChildren('td'):
        link = cell.find('a').get('href')
        url = 'http://www.heart.org/' + link
        state = cell.text
        urllib.urlretrieve(url, 'pdf/' + state + '.pdf')
        getxml.getxml('pdf/' + state)
        print state + ' done'
scraping = True
# We use a while loop because switching pages might be necessary to collect all the
# data on one craigslist search.
while scraping:
    # Requesting the server to let us retrieve data from the site
    src = requests.get(url)
    # Getting html of the webpage
    results = src.content
    # Converting html to more readable format with BeautifulSoup
    soup = BeautifulSoup(results, 'lxml')
    # Finding all classes that match 'result-info'; this contains all the basic item info
    front_page_info = soup.find_all(class_='result-info')
    # Gets the number of total results. 120 results max per page.
    num_of_total_results = soup.find(class_='totalcount').get_text()
    # Gets the number of results on the page. Fewer than 120 results indicates the last page.
    range_to = soup.find(class_='rangeTo').get_text()
    range_from = soup.find(class_='rangeFrom').get_text()
    num_of_items = int(range_to) - int(range_from) + 1
    print('Total number of listings: ' + str(num_of_total_results))
    print('number of items on page: ' + str(num_of_items))
    url = 'https://seattle.craigslist.org/search/sss?query={}&sort=rel'.format(input_1 + '&s=' + str(range_to))
    # Using the find method to get title, price, location, product page, and date for all items
    for item in range(num_of_items):
        title = front_page_info[item].find(class_='result-title hdrlnk').get_text()
        title_list.append(title)
        price = front_page_info[item].find(class_='result-price').get_text()
        price_list.append(price)
        date = front_page_info[item].find(class_='result-date').get_text()
class BRefMatch:
    """ Generates a match information from basketball reference """

    def __init__(self, country, league, season, code, match_type):
        self.country = country
        self.league = league
        self.season = season
        self.code = code
        self.type = match_type

    def is_crawled(self):
        """ returns whether match is already crawled """
        return '{0}.json'.format(self.code) in os.listdir(
            './matches/{0}/{1}/{2}'.format(self.country, self.league, self.season))

    @timeout
    def crawl(self):
        """ generate all stats for a nba match """
        match_url = self.uri_base.format(self.code)
        headers = {'User-agent': random.choice(USER_AGENTS)}
        rv = requests.get(match_url, headers=headers)
        self.soup_ = BeautifulSoup(rv.text)
        self.match_ = defaultdict(dict)
        self._gen_teams_stats()
        self._gen_match_basic_info()
        self._gen_teams_basic_info()
        self._gen_scoring()
        self._gen_extra_info()
        self._write_match()

    def _gen_teams_stats(self):
        """ generate and add statistics related to teams and players to match dict """
        for team in ['home', 'away']:
            self.match_[team]['players'] = defaultdict(dict)
            self.match_[team]['totals'] = defaultdict(dict)
        stats_tables = self.soup_.find_all('table', {'class': 'stats_table'})
        bas_stats_tables = stats_tables[0], stats_tables[2]
        adv_stats_tables = stats_tables[1], stats_tables[3]
        self._read_table(bas_stats_tables, last_col=False)
        self._read_table(adv_stats_tables, last_col=True)
        self._gen_derived_stats()
        self.match_['home']['totals']['+/-'] = self.match_['home']['totals']['PTS'] - self.match_['away']['totals']['PTS']
        self.match_['away']['totals']['+/-'] = self.match_['away']['totals']['PTS'] - self.match_['home']['totals']['PTS']

    def _gen_match_basic_info(self):
        """ generate and add basic information related to match to match dict """
        self.match_['code'] = self.code
        self.match_['type'] = self.type
        self.match_['league'] = self.league
        self.match_['season'] = self.season
        self.match_['country'] = " ".join(
            map(lambda x: x.capitalize(), self.country.split('_')))
        loc_time = [
            el.text for el in self.soup_.find('div', {'class': 'scorebox_meta'}).find_all('div')
        ]
        if len(loc_time) >= 1:
            date = loc_time[0]
            if 'AM' in date or 'PM' in date:
                date, time = gen_date_with_mins(date)
                self.match_['date'] = str(date)
                self.match_['time'] = str(time)
            else:
                self.match_['date'] = str(gen_date(date))
        if len(loc_time) == 2:
            self.match_['stadium'] = " ".join(
                map(lambda x: x.capitalize(), loc_time[1].split(',')[0].split(' ')))

    def _gen_teams_basic_info(self):
        """ generates teams (and their players) basic information """
        teams = [
            team.find_all('a')[-1] for team in self.soup_.find(
                'div', {'scorebox'}).find_all('div', {'itemprop': 'performer'})
        ]
        away, home = [team.text for team in teams]
        away_page, home_page = [team['href'] for team in teams]
        for team, team_name, team_page in zip(['away', 'home'], [away, home], [away_page, home_page]):
            self.match_[team]['name'] = team_name
            self._team_pls_basic_info(team, team_name, team_page)

    def _team_pls_basic_info(self, team_cond, team_name, team_page):
        """ generate and add basic information related to players to match dict """
        team_info = BRefTeam(team_name, team_page)
        team_info.gen_players_info()
        pls = self.match_[team_cond]['players']
        for pl, info in pls.items():
            pl_basic_info = PlayerBasicInfo(pl, team_info)
            info.update(pl_basic_info.get())

    def _gen_scoring(self):
        """ generate and add scoring information to match dict """
        raise NotImplementedError

    def _gen_extra_info(self):
        """ generate and add attendance, duration and officials info to match dict """
        raise NotImplementedError

    def _read_table(self, table, last_col):
        """ reads given table and updates relevant stats in match dict """
        raise NotImplementedError

    def _gen_derived_stats(self):
        for team in ['home', 'away']:
            team_stats = self.match_[team]['totals']

            def add_derivated_stats_to_dict(d, type_):
                d['FG%'] = gen_derived_var(d['FG'], d['FGA'])
                d['FT%'] = gen_derived_var(d['FT'], d['FTA'])
                d['3P%'] = gen_derived_var(d['3P'], d['3PA'])
                d['eFG%'] = gen_derived_var((d['FG'] + 0.5 * d['3P']), d['FGA'])
                d['TSA'] = d['FGA'] + 0.44 * d['FTA']
                d['TS%'] = gen_derived_var(d['PTS'], 2 * d['TSA'])
                d['3PAr'] = gen_derived_var(d['3PA'], d['FGA'])
                d['FTAr'] = gen_derived_var(d['FTA'], d['FGA'])
                d['2P'] = d['FG'] - d['3P']
                d['2PA'] = d['FGA'] - d['3PA']
                d['2P%'] = gen_derived_var(d['2P'], d['2PA'])
                d['2PAr'] = gen_derived_var(d['2PA'], d['FGA'])
                d['DRB'] = d['TRB'] - d['ORB']
                d['ORBr'] = gen_derived_var(d['ORB'], d['TRB'])
                d['DRBr'] = gen_derived_var(d['DRB'], d['TRB'])
                d['AST/TOV'] = gen_derived_var(d['AST'], d['TOV'])
                d['STL/TOV'] = gen_derived_var(d['STL'], d['TOV'])
                d['FIC'] = (d['PTS'] + d['ORB'] + 0.75 * d['DRB'] + d['AST'] + d['STL'] + d['BLK']
                            - 0.75 * d['FGA'] - 0.375 * d['FTA'] - d['TOV'] - 0.5 * d['PF'])
                d['FT/FGA'] = gen_derived_var(d['FT'], d['FGA'])
                d['HOB'] = gen_derived_var(d['FG'] + d['AST'], team_stats['FG'])

            # derive players and teams stats
            for player_stats in self.match_[team]['players'].values():
                if player_stats['MP']:
                    add_derivated_stats_to_dict(player_stats, 'player')
            add_derivated_stats_to_dict(team_stats, 'team')

    def _write_match(self):
        filename = './matches/{0}/{1}/{2}/{3}.json'.format(
            self.country, self.league, self.season, self.code)
        with open(filename, 'w') as f:
            f.write(json.dumps(self.match_))
class foxExtractor(object):
    """docstring for foxExtractor"""

    def __init__(self, url, testMode):
        print("Detected FOX NOW\nProcessing....\n")
        self.loginRequired = False
        self.urlName = url
        self.debug = True
        self.testMode = testMode
        self.requestsFileName = "iDoNotExistDefinitelyOnThisComputerFolder.html"
        self.showId = ""
        self.showName = ""
        self.videoGuid = ""
        self.subtitleServer = "http://static-media.fox.com/cc/"
        self.fileExtension = [".srt", ".dfxp"]
        pass

    def getSubtitles(self):
        """
        The main function which uses helper functions to get the subtitles
        """
        self.createSoupObject()
        self.getTitle()
        if self.debug:
            print(self.title)
        self.contentID = self.getContentID1(self.urlName)  # Method-1
        try:
            self.contentID = int(self.contentID)
        except:
            print("Trying an alternative method to fetch Content ID")
            self.contentID = self.getContentID2()  # Method-2
            try:
                self.contentID = int(self.contentID)
            except:
                print("Unable to fetch the contentID.")
                self.deleteUnnecessaryfiles()
                return 0
        if self.debug:
            print(self.contentID)
        jsonString = self.getShowJson()
        if self.debug:
            print(jsonString)
        if not self.standardCheck(jsonString):
            return 0
        self.getShowDetails(jsonString)
        if self.debug:
            print(self.showId)
            print(self.showName)
            print(self.videoGuid)
        if not self.standardCheck(self.showId, self.showName, self.videoGuid):
            return 0
        CaptionList = self.getSubtitleUrl()
        if self.debug:
            print(CaptionList)
        for link in CaptionList:
            returnValue = self.downloadTranscript(link)
            if returnValue:
                break
        self.deleteUnnecessaryfiles()
        return returnValue

    def createSoupObject(self):
        requestObject = requests.get(self.urlName)
        # fileHandler = open("requests.txt", "w")
        # fileHandler.write(requestObject.text)
        # fileHandler.close()
        self.soupObject = BeautifulSoup(requestObject.text, "lxml", from_encoding="utf8")
        # soupObject1 = BeautifulSoup(requestObject.text, "lxml")
        # print(self.soupObject.original_encoding)
        fh = open(self.requestsFileName, "w")
        fh.write(str(self.soupObject))
        fh.close()
        pass

    def getContentID1(self, url):
        """This is one of the methodologies to get the content ID.
        If this fails the alternative method will be called.

        The URL follows a specific standard throughout.
        http://www.fox.com/watch/684171331973/7684520448
        We need to split and return "684171331973"
        """
        contentId = ''
        try:
            searchStringList = ["watch/"]
            junkData, episodeName, IDContainer = url.partition(searchStringList[0])
            contentId, Slash, Junk = IDContainer.partition("/")
        except:
            pass
        return contentId

    def getContentID2(self):
        """
        This is the alternative method to obtain the contentID.
        <meta content="http://www.fox.com/watch/681382467805/7683748608" property="og:url"/>
        Obtained from the SOUP.
        """
        contentId = ''
        try:
            UrlObj = self.soupObject.find("meta", attrs={"property": "og:url"})
            Url = UrlObj['content']
            contentId = self.getContentID1(Url)
        except:
            pass
        return contentId
        pass

    def getShowJson(self):
        """
        The required script content looks like this-
        jQuery.extend(Drupal.settings, {"":...............});
        1) We add everything to a new string after encountering the first "{".
        2) Remove the last parentheses and the semi-colon to create a valid JSON. ---- ');'
        """
        scripts = self.soupObject.findAll("script")
        rawScript = ""
        for strs in scripts:
            if strs.string is not None:
                if "showid" in strs.string:
                    rawScript = strs.string
        addState = False
        jsonString = ''
        for i in rawScript:
            if i == "{" and addState is False:
                addState = True
            if addState is True:
                jsonString += i
        jsonString = jsonString.replace(");", "")
        return jsonString
        pass

    def getShowDetails(self, jsonString):
        """
        The json content looks like this -
        {"foxProfileContinueWatching":{"showid":"empire","showname":"Empire"},..............
        "foxAdobePassProvider": {......,"videoGUID":"2AYB18"}}
        """
        try:
            IndexingParameters = [
                ["foxProfileContinueWatching", "showid", "showname"],
                ["foxAdobePassProvider", "videoGUID"],
            ]
            parsedJsonObject = json.loads(jsonString)
            self.showId = parsedJsonObject[IndexingParameters[0][0]][IndexingParameters[0][1]]
            self.showName = parsedJsonObject[IndexingParameters[0][0]][IndexingParameters[0][2]]
            self.videoGuid = parsedJsonObject[IndexingParameters[1][0]][IndexingParameters[1][1]]
        except:
            print("Unable to parse Json. Please report.")
            pass
        pass

    def getSubtitleUrl(self):
        """
        Sample Subtitle Link -
        http://static-media.fox.com/cc/sleepy-hollow/SleepyHollow_3AWL18_660599363942.srt
        http://static-media.fox.com/cc/sleepy-hollow/SleepyHollow_3AWL18_660599363942.dfxp

        The standard followed is -
        http://static-media.fox.com/cc/[showid]/showname_videoGUID_contentID.srt
        OR
        http://static-media.fox.com/cc/[showid]/showname_videoGUID_contentID.dfxp

        Some Subtitle URL's follow this standard -
        http://static-media.fox.com/cc/[showid]/showname_videoGUID.dfxp
        http://static-media.fox.com/cc/[showid]/showname_videoGUID.srt

        So we store both URL's and check both
        """
        SubsUrl = self.subtitleServer
        SecondarySubsUrl = ''
        self.showName = self.processShowName(self.showName)
        SubsUrl += str(self.showId)
        SubsUrl += "/"
        SubsUrl += str(self.showName)
        SubsUrl += "_"
        SubsUrl += str(self.videoGuid)
        SecondarySubsUrl = SubsUrl
        SubsUrl += "_"
        SubsUrl += str(self.contentID)
        SubsUrl += self.fileExtension[0]
        SecondarySubsUrl += self.fileExtension[0]
        return [SubsUrl, SecondarySubsUrl]

    def processShowName(self, name):
        """
        Removes white spaces
        """
        name = name.replace(" ", "")
        return name

    def downloadTranscript(self, SubsLink):
        """
        This function fetches the captions and writes them into a file in VTT format
        """
        try:
            subRequestObject = requests.get(SubsLink)
            subRequestObject.encoding = 'utf-8'
            # print(subRequestObject.text)
            if subRequestObject.status_code >= 400:
                # Deliberate error to exit.
                s = int("deliberateError")
            subsFileHandler = open(self.title + self.fileExtension[0], "w")
            print("Creating ~ '%s%s' ..." % (self.title, self.fileExtension[0]))
            subsFileHandler.write(subRequestObject.text)
            subsFileHandler.close()
            return 1
        except:
            return 0
        pass

    def getTitle(self):
        """
        This function returns the title of the video. This is also used for naming the file.
        <title>Watch New Girl Online: Episode 21, Season 5 on FOX</title> --> Extracting the value from here
        """
        # print(self.soupObject.title.string)
        try:
            self.title = self.soupObject.title.string.strip()
            if not self.title:
                s = int("deliberateError")
        except:
            self.title = "DownloadedFOXNowSubtitles"
        pass

    def deleteUnnecessaryfiles(self):
        if not self.debug:
            try:
                os.remove(self.requestsFileName)
            except:
                pass

    def standardCheck(self, *variablesToCheck):
        for variables in variablesToCheck:
            if not variables:
                print("Unable to fetch the subtitles.")
                self.deleteUnnecessaryfiles()
                return 0
        return 1
rows = []
for page in range(1, 12):
    page_url = f'https://www.udemy.com/courses/free/?lang=en&p={page}&sort=highest-rated'
    driver.get(page_url)
    time.sleep(5)
    try:
        WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'course-list--container--3zXPS')))
    except TimeoutException:
        print('Loading exceeds delay time')
        # break
    else:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        course_list = soup.find('div', {'class': 'course-list--container--3zXPS'})
        courses = course_list.find_all('a', {'class': "udlite-custom-focus-visible browse-course-card--link--3KIkQ"})
        # total_res = driver.find_element(By.CLASS_NAME, "udlite-heading-md filter-panel--item-count--2JGx3")
        # print("----------------- FOUND :", total_res, " RESULTS !-----------------")
        for course in courses:
            course_url = '{}{}'.format("https://www.udemy.com", course['href'])
            course_title = course.select('div[class*="course-card--course-title"]')[0].text
            course_details = course.find_all('span', {'class': 'course-card--row--1OMjg'})
            course_len = course_details[0].text
            number_of_lectures = course_details[1].text
            difficulty = course_details[2].text
            course_rating = extract_text(course, "span", 'data-purpose', 'rating-number')
H1 = '<h1>{0}</h1>'

# Load and parse the RSS feed
d = feedparser.parse(RSS_FEED)
print d.feed.title
print d.feed.description

for idx, item in enumerate(d.entries):
    print 'Processing...' + item.title + ' : ' + item.link
    try:
        page = urllib2.urlopen(item.link)
        soup = BeautifulSoup(page)
        final = ''
        c = soup.find('td', width="455")
        if c is None:
            c = soup.find('td', width="375")
        cc = soup.findAll(cellspacing="0", width="100%")
        [comment.extract() for comment in cc]
        if c is not None:
            s = c.prettify()
            s = re.sub('<br>\s*\s<br>', '</p><p>', s)
            s = re.sub('<font(\s*|.*|\s)>|</font>|<br>|</br>|<br/>|<td(\s*|.*|\s)>|</td>|</img>', '', s)
            s = re.sub('<img(\s*|.*|\s)>\s*</p>', str.format(H1, item.title), s)
            s = re.sub('<p>\s*\s<b>', '<h3>', s)
            s = re.sub('</b>\s*\s</p>', '</h3>', s)
            s = TEMPLATE.format(item.title, s, item.link)
import requests
from bs4 import BeautifulSoup

URL = 'https://www.amazon.in/BATA-Jorah-Formal-Shoes-8-8216017/dp/B079R8JWH8/ref=sr_1_1_sspa?crid=1SMS0M4ZJAB8K&keywords=formal+shoes+for+men&qid=1573122191&sprefix=formal+sho%2Caps%2C371&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzVVNOWjg2S0xRQTNGJmVuY3J5cHRlZElkPUEwNTE0NjkwUVo0SThUV0MxMVZNJmVuY3J5cHRlZEFkSWQ9QTAwMjMyODMzVVBGVEw3Qko2Qk1KJndpZGdldE5hbWU9c3BfYXRmJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=='

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}

page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")
title = soup.find(id="productTitle")
print(title)
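# Hedged follow-up (not in the original snippet): soup.find returns None when the element
# is missing or Amazon serves a bot-check page, so a guarded print avoids an AttributeError
# when extracting the title text.
if title is not None:
    print(title.get_text().strip())
else:
    print("productTitle not found - the page may be a captcha/robot check")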
soup = BeautifulSoup(open(filename), 'html.parser')
n = 0
c = 0
for b in soup.table():
    if (str(b.get('id')) != "None"):
        n = n + 1
        x = str(b.get('id'))
for b in soup.table():
    if (str(b.get('id')) != "None"):
        c = c + 1
        if (c == n - 1):
            x = str(b.get('id'))
id_selector = x[3:5]
print(id_selector)
rollnumber = str(soup.find(id='lblRollNo').text)
name = str(soup.find(id='lblFullName').text)
fathername = str(soup.find(id='lblFatherName').text)
marks = str(soup.find(id='ctl' + id_selector + '_ctl01_lblSemesterTotalMarksObtained').text)
cp = str(soup.find(id='ctl' + id_selector + '_ctl01_lblResultStatus').text)
cop = str(soup.find(id='ctl' + id_selector + '_lblCOP').text)
i = soup.find(id='ctl' + id_selector + '_ctl01_ctl00_grdViewSubjectMarksheet')
print(rollnumber + " \n" + name + " \n" + fathername + "\n" + marks + "\n" + cp + "\n" + cop)

subjects = [
def parseTime(time):
    return time


from toi import toi

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
}
listSite = "https://timesofindia.indiatimes.com"
baseURL = "https://timesofindia.indiatimes.com"

page = requests.get(listSite, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
listN = soup.find(class_="list9").find_all('li')

topNews = []
for site in listN:
    try:
        heading = site.find('a').text
        heading = heading.replace('\xa0', ' ').encode('utf-8')
        link = baseURL + site.a["href"]
        # news = {
        #     "href": link,
        #     "headline": heading,
        #     # "time": time
        # }
        topNews.append(toi(link))
    except:
        print("Non list item ")
def parse_list(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    tr_list = soup.find('table', attrs={'class': 'table table-bordered table-striped'}).find_all('tr')
    for i in tr_list:
        yield scrapy.Request(url=response.url.split('?')[0] + i.td.next_sibling.a['href'],
                             callback=self.parse_notice)
def scrape_info():
    browser = init_browser()

    # Visit https://mars.nasa.gov/news/
    url1 = 'https://mars.nasa.gov/news/'
    browser.visit(url1)
    time.sleep(3)

    # Scrape page into Soup
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news_titles = soup.find('div', class_="content_title")
    news_title = news_titles.text
    print(news_title)
    time.sleep(3)
    news_ps = soup.find('div', class_="article_teaser_body")
    news_p = news_ps.text
    print(news_p)

    # Find the src for the featured image
    url2 = 'http://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url2)
    time.sleep(2)
    html2 = browser.html
    soup = BeautifulSoup(html2, 'html.parser')
    img = soup.find_all('a', class_="button fancybox")
    for a in img:
        print(a["data-fancybox-href"])
        url9 = "http://www.jpl.nasa.gov/"
        featured_image_url = url9 + a["data-fancybox-href"]

    url3 = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url3)
    time.sleep(3)
    soup = BeautifulSoup(browser.html, 'html.parser')
    mars_weather = soup.find(class_='tweet-text').text

    url4 = 'https://space-facts.com/mars/'
    browser.visit(url4)
    time.sleep(10)
    html4 = browser.html
    soup = BeautifulSoup(html4, 'html.parser')
    marsfacts = soup.find_all('table', class_="tablepress tablepress-id-p-mars")
    marsfacts

    url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url5)
    time.sleep(5)
    html5 = browser.html
    soup = BeautifulSoup(html5, 'html.parser')
    hemis_search = soup.find_all('a', class_="itemLink product-item")
    url10 = "https://astrogeology.usgs.gov"
    img_url = []
    for a in hemis_search:
        print(a['href'])
        img_url.append(a['href'])
    url11 = url10 + img_url[0]
    url12 = url10 + img_url[2]
    url13 = url10 + img_url[4]
    url14 = url10 + img_url[6]

    browser.visit(url11)
    html11 = browser.html
    time.sleep(5)
    soup = BeautifulSoup(html11, 'html.parser')
    hemis_search2 = soup.find_all('img', class_="wide-image")
    for a in hemis_search2:
        print(a['src'])
        url15 = url10 + (a['src'])
        print(url15)

    browser.visit(url12)
    html12 = browser.html
    time.sleep(5)
    soup = BeautifulSoup(html12, 'html.parser')
    hemis_search3 = soup.find_all('img', class_="wide-image")
    for a in hemis_search3:
        print(a['src'])
        url16 = url10 + (a['src'])
        print(url16)

    browser.visit(url13)
    html13 = browser.html
    time.sleep(5)
    soup = BeautifulSoup(html13, 'html.parser')
    hemis_search4 = soup.find_all('img', class_="wide-image")
    for a in hemis_search4:
        print(a['src'])
        url17 = url10 + (a['src'])
        print(url17)

    browser.visit(url14)
    html14 = browser.html
    time.sleep(5)
    soup = BeautifulSoup(html14, 'html.parser')
    hemis_search4 = soup.find_all('img', class_="wide-image")
    for a in hemis_search4:
        print(a['src'])
        url18 = url10 + (a['src'])
        print(url18)

    hemisphere_image_url = [
        {"title": "Cerberus Hemisphere", "img_url": url15},
        {"title": "Schiaparelli Hemisphere", "img_url": url16},
        {"title": "Syrtis Major Hemisphere", "img_url": url17},
        {"title": "Valles Marineris Hemisphere", "img_url": url18}
    ]

    # Store data in a dictionary
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "url15": url15,
        "url16": url16,
        "url17": url17,
        "url18": url18
    }

    # Close the browser after scraping
    browser.quit()

    # Return results
    return mars_data
import urllib.request
from bs4 import BeautifulSoup
import json

url = "https://old.reddit.com/r/ProgrammerHumor/"
request = urllib.request.Request(
    url,
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
    })
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')

# First let's get the HTML of the table called siteTable where all the links are displayed
main_table = soup.find("div", attrs={'id': 'siteTable'})

# Now we go into main_table and get every a element in it which has a class "title"
links = main_table.find_all("a", class_="title")

# List to store a dict of the data we extracted
extracted_records = []
for link in links:
    title = link.text
    url = link['href']
    # There are better ways to check if a URL is absolute in Python. For the sake of
    # simplicity we'll just stick to the .startswith method of a string.
    # https://stackoverflow.com/questions/8357098/how-can-i-check-if-a-url-is-absolute-using-python
    if not url.startswith('http'):
        url = "https://old.reddit.com/" + url
        # You can join URLs better using the urllib.parse library of Python.
        # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
    # Let's just print it
    print("%s - %s" % (title, url))
    record = {'title': title, 'url': url}
    extracted_records.append(record)
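# The comment above points at urllib.parse.urljoin as the cleaner way to make
# relative reddit links absolute. A minimal sketch of that approach, using the
# same base URL as above; the helper name resolve_url is illustrative, not from
# the original.
from urllib.parse import urljoin


def resolve_url(base, href):
    # urljoin leaves absolute hrefs untouched and resolves relative ones against
    # the base, so no startswith('http') check is needed.
    return urljoin(base, href)


# e.g. resolve_url("https://old.reddit.com/", "/r/ProgrammerHumor/comments/abc/")
# -> "https://old.reddit.com/r/ProgrammerHumor/comments/abc/"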
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
""" import bs4 from bs4 import BeautifulSoup import requests import csv url = requests.get('https://ziare.com/economie/analiza-economica/cum-ne-pregatim-pentru-economia-post-covid-1606027').text soup = BeautifulSoup(url, 'html.parser') #csv_file = open('articole.csv', 'w', encoding ='utf-16') #csv_writer = csv.writer(csv_file) #csv_writer.writerow(['Titlu', 'Articol']) print(soup.prettify()) titlu = soup.find('h1').text print(titlu) articolul = soup.find('div', class_='descriere_main').text print(articolul) #csv_writer.writerow([titlu, articolul]) #csv_file.close() #csv.field_size_limit() with open('articole1.csv', 'a',newline='', encoding="utf-16") as csvfile: fieldnames = ['Titlu', 'Articol'] writer = csv.DictWriter(csvfile, fieldnames = fieldnames, delimiter = '\t') writer.writerow({"Titlu": titlu.replace("\n", " ").strip(), "Articol": articolul.replace("\n", " ").strip()})
import urllib.parse
import urllib.request
import pymysql
import db
from bs4 import BeautifulSoup

params = urllib.parse.urlencode({'page': 1})
url = 'https://movie.naver.com/movie/point/af/list.nhn?&%s' % params
print(url)

response = urllib.request.urlopen(url)
navigator = BeautifulSoup(response, 'html.parser')
table = navigator.find('table', class_='list_netizen')
print(table)

list_records = []
for i, r in enumerate(table.find_all('tr')):
    for j, c in enumerate(r.find_all('td')):
        if j == 0:
            record = int(c.text.strip())
        elif j == 2:
            record1 = int(c.text.strip())
        elif j == 3:
            record2 = str(c.find('a', class_='movie').text.strip())
            record3 = str(c.text).split('\n')[2]
        elif j == 4:
            record4 = str(c.find('a', class_='author').text.strip())
            record5 = str(c.text).split('****')[1]
    try:
        record_t = tuple([record, record1, record2, record3, record4, record5])
        list_records.append(record_t)
    except NameError:
        # rows with no <td> cells (e.g. the header row) leave the record
        # variables unset, so skip them
        continue
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page3.html')
soup = BeautifulSoup(html, 'html.parser')

gifts = soup.find('table', {'id': 'giftList'}).children
# print(len(gifts))
for gift in gifts:
    print(gift)

siblings = soup.find('table', {'id': 'giftList'}).tr.next_siblings
for sibling in siblings:
    print(sibling)
def _parse(self, html):
    soup = BeautifulSoup(html, from_encoding='utf-8')
    self.meta = soup.find_all('meta')
    try:
        seo_title = soup.find('meta', property='og:title').get('content')
    except AttributeError:
        # find returned None, None has no get method
        self.real_article = False
        return
    try:
        self.title = soup.find('div', 'art_header_title').get_text()
    except AttributeError:
        self.title = seo_title
    try:
        byline_and_date = (soup.find('span', 'art_header_footer_author')
                           .get_text())
    except AttributeError:
        self.real_article = False
        return
    byline, _, date = re.split(u'(פורסם|עדכון אחרון):', byline_and_date)
    self.byline = byline.strip()
    self.date = date.strip()

    sub_title_elt = soup.find('div', 'art_header_sub_title')
    sub_title = u'' if sub_title_elt is None else sub_title_elt.get_text()

    body_elt = soup.find('div', 'art_body')
    if body_elt is None:
        self.real_article = False
        return

    def replace_with_para(tag, para_text, dividers=True):
        para = soup.new_tag('p')
        para.string = para_text
        tag.replace_with(para)
        if dividers:
            pre_divider = soup.new_tag('p')
            pre_divider.string = NBSP
            para.insert_before(pre_divider)
            post_divider = soup.new_tag('p')
            post_divider.string = NBSP
            para.insert_after(post_divider)

    # TODO: handle these better, maybe mention the caption
    for video in body_elt.find_all('div', 'art_video'):
        replace_with_para(video, u'(סרטון)')
    for img in body_elt('div', 'citv_image'):
        replace_with_para(img, u'(תמונה)')
    for sidething in body_elt('div', 'arttvgenlink'):
        sidething.decompose()
    for ad in body_elt('div', 'CAATVcompAdvertiseTv'):
        # these are floated left
        replace_with_para(ad, u' (פרסומת) ', dividers=False)

    def body_part_to_text(part):
        if part.name == 'p':
            t = part.get_text()
            if t == NBSP:
                # p contains just nbsp => this is the paragraph division
                return u'\n\n'
            else:
                return re.sub(r'\s+', ' ', t.strip())
        if part.name.startswith('h') and part.name[1:].isdigit():
            return part.get_text().strip() + u'\n\n'
        if part.name == 'ul':
            return u'\n' + u'\n'.join(
                li.get_text().strip() for li in part('li'))

    # join with ' ' so that adjacent p tags get a space between them.
    # we'll later remove extra spaces.
    self.body = u' '.join(itertools.chain(
        [sub_title, u'\n\n'],
        itertools.ifilter(None, itertools.imap(body_part_to_text,
                                               body_elt(['p', 'ul', 'h3'])))))
    # remove double spaces created by joining paragraphs with ' '
    self.body = re.sub(r' +', ' ', self.body)
    # remove spaces adjacent to dividers, created by joining paragraphs
    # with ' '
    self.body = re.sub(r'\n\n ', '\n\n', self.body)
    self.body = re.sub(r' \n\n', '\n\n', self.body)
    # also remove double dividers (note that this is done
    # after the adjacent spaces are removed)
    self.body = re.sub(r'\n{3,}', r'\n\n', self.body)
#!/usr/bin/env python2.7
from bs4 import BeautifulSoup
import requests
import smtplib

server = smtplib.SMTP('smtp.gmail.com', 587)
server.starttls()
server.login("*****@*****.**", "D0llareur0")

finsurl = 'http://fins.az/bank'
response = requests.get(finsurl)
respData = BeautifulSoup(response.content, "html.parser")
usd = respData.find('div', {'class': 'value'}).get_text()
eur = respData.find('div', {'class': 'value'}).find_next('div', {'class': 'value'}).get_text()


def printer_func(site, dollar, euro):
    return ("%s saytindan istinad edilmishdir: " % str(site),
            "1 USD = %s AZN" % str(dollar),
            "1 EURO: %s AZN" % str(euro))


cbarurl = 'http://www.cbar.az/'
respcbar = requests.get(cbarurl)
cbrespData = BeautifulSoup(respcbar.content, "html.parser")
cbusd = cbrespData.find('span', {'class': 'item item_4'}).get_text()
cbeur = cbrespData.find('span', {'class': 'item item_4'}).find_next('span', {'class': 'item item_4'}).get_text()

faz = printer_func(str(finsurl), str(usd), str(eur))
caz = printer_func(str(cbarurl), str(cbusd), str(cbeur))

message = """From: Euro Dollar <*****@*****.**>
To: Email Author <*****@*****.**>
"""
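# The script above logs in and starts building the message headers but is cut
# off before anything is sent. A minimal sketch of finishing the send with the
# smtplib API, keeping the masked addresses above; the subject line and the use
# of faz/caz as the body are illustrative assumptions, not from the original.
sender = "*****@*****.**"
recipient = "*****@*****.**"
body = "\n".join(list(faz) + list(caz))  # printer_func returns a tuple of lines
full_message = message + "Subject: AZN exchange rates\n\n" + body
server.sendmail(sender, recipient, full_message)
server.quit()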
import requests
from bs4 import BeautifulSoup
from urllib.error import HTTPError
from urllib.error import URLError

try:
    site_url = requests.get(
        'https://www.ibge.gov.br/estatisticas/economicas/industria/9294-pesquisa-industrial-mensal-producao-fisica-brasil.html?=&t=resultados'
    ).text
except HTTPError as e:
    print(e)
except URLError as e:
    print("o Servidor não foi encontrado!")

soup = BeautifulSoup(site_url, 'lxml')
print(soup.prettify())

minha_tabela = soup.find('table', {'class': 'pvtTable'})
print(minha_tabela)

# indices_mensais = []
# links = minha

minha_tabela.find_all('div')
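# One caveat with the snippet above: requests raises its own exception types,
# so the urllib.error handlers never fire. A minimal sketch of the same fetch
# using the requests exception hierarchy; IBGE_URL is just an illustrative name
# for the same address used above.
import requests
from requests.exceptions import HTTPError, ConnectionError

IBGE_URL = ('https://www.ibge.gov.br/estatisticas/economicas/industria/'
            '9294-pesquisa-industrial-mensal-producao-fisica-brasil.html?=&t=resultados')
try:
    response = requests.get(IBGE_URL)
    response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
    site_url = response.text
except HTTPError as e:
    print(e)
except ConnectionError:
    print("Server not found / connection failed")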
def getFormMethod(text, eid):
    soup = BeautifulSoup(text, "html.parser")
    return soup.find("form", id=eid).get("method")
def main():
    merchantFilePath = os.path.dirname(
        os.path.abspath(__file__)) + "/merchants.json"
    if os.path.exists(merchantFilePath):
        json_open = open(merchantFilePath, "r", encoding="utf8")
        merchants = json.load(json_open)
    else:
        merchants = {"data": [], "names": []}

    findMerchants = []
    page = 0
    while True:
        page += 1
        print("----- Page {page} -----".format(page=page))
        html = requests.get(
            "https://www.gotoeat-tochigi.jp/merchant/index.php?word=&sort=2&page={page}"
            .format(page=page))
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.content, "html.parser")

        lists = soup.find("ul", {
            "class": "serch_result"
        }).findChildren("li", recursive=False)
        if (len(lists) == 0):
            break

        for merchant in lists:
            merchant_name = merchant.find("p", {"class": "name"}).text
            merchant_type = merchant.find("p", {
                "class": "name"
            }).find("span").text
            merchant_name = re.sub(
                r"{merchant_type}$".format(merchant_type=merchant_type), "",
                merchant_name)

            _merchant_address = merchant.find("div", {
                "class": "add"
            }).findAll("p")[0].text
            merchant_postal_code = re.sub(r"所在地〒([0-9\-]+) (.+)", r"\1",
                                          _merchant_address)
            merchant_address = re.sub(r"所在地〒([0-9\-]+) (.+)", r"\2",
                                      _merchant_address)

            if len(merchant.find("div", {"class": "add"}).findAll("p")) >= 2:
                merchant_tel = merchant.find("div", {
                    "class": "add"
                }).findAll("p")[1].text
                merchant_tel = re.sub(r"TEL(.+)", r"\1", merchant_tel)

            print(merchant_name + " - " + merchant_address)
            findMerchants.append(merchant_name)

            if merchant_name in merchants["names"]:
                continue

            lat, lng = getLatLng(merchant_address)
            print(str(lat) + " " + str(lng))

            merchants["data"].append({
                "name": merchant_name,
                "type": merchant_type,
                "address": merchant_address,
                "postal_code": merchant_postal_code,
                "tel": merchant_tel,
                "lat": lat,
                "lng": lng
            })
            merchants["names"].append(merchant_name)

            with open(merchantFilePath, mode="w", encoding="utf8") as f:
                f.write(json.dumps(merchants, indent=4, ensure_ascii=False))

        if (soup.find("li", {"class": "next"}) == None):
            break
        else:
            time.sleep(1)

    merchants = checkRemovedMerchant(merchants, findMerchants)
    with open(merchantFilePath, mode="w", encoding="utf8") as f:
        f.write(json.dumps(merchants, indent=4, ensure_ascii=False))
from bs4 import BeautifulSoup
import requests, pprint, random, time, string

url = requests.get(
    "https://www.imdb.com/india/top-rated-indian-movies/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=8a7876cd-2844-4017-846a-2c0876945b7b&pf_rd_r=C6ZKX5N78115F6BM14Y3&pf_rd_s=right-5&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_india_tr_rhs_1"
)
soup = BeautifulSoup(url.text, 'lxml')

table = soup.find('tbody', class_='lister-list')
body = table.find_all('tr')

random_var = random.randint(1, 5)
time.sleep(random_var)

_list = []
for i in body:
    _dict = {}
    data = i.find('td', class_="titleColumn")
    no = ''
    for j in data.text:
        no += j
        if j == '.':
            break
    _dict['No'] = no.strip()
    _dict['Movie'] = data.find('a').text
    _dict['Year'] = int(data.find('span').text.strip('(').strip(')'))
    _dict['Rating'] = i.find('strong').text
    _dict['Link'] = "https://www.imdb.com" + i.find('a')['href']
    _list.append(_dict)

pprint.pprint(_list)


def scrapped_movie(mov_link):
    new_url = requests.get(mov_link).text
    soup = BeautifulSoup(new_url, 'lxml')
def ret_header(site):
    response = requests.get(site)
    soup = BS(response.content, 'html.parser')
    titles = soup.find('title')
    return titles.text
def getElementById(text, eid):
    soup = BeautifulSoup(text, "html.parser")
    result = soup.find(id=eid)
    return result