def main(app):
    """
    CommandLineApp which opens a supplied text file, and then crawls
    Wikipedia until relevant citation formats are found. The citation
    formats and links are then printed to a markdown file.
    """
    base_url = 'https://en.wikipedia.org'
    # Store hyperlink citations as key-value pairs with the link
    # being the key.
    hyperlink_citations = {}
    citation_formats = app.params.citation_formats.split(' ')
    file_name = app.params.file_name
    # Get hyperlinks from file and store in a list
    hyperlinks = [x.strip() for x in open_and_read(file_name)]
    # Iterate through hyperlinks and retrieve citations
    for hyperlink in hyperlinks:
        main_page_soup = bSoup(request_html_for(hyperlink), 'html.parser')
        # Retrieve citation page hyperlink
        citation_page_hyperlink = base_url + main_page_soup.find(
            'a', title='Information on how to cite this page')['href'].strip()
        # Retrieve citation page soup
        citation_page_soup = bSoup(request_html_for(citation_page_hyperlink),
                                   'html.parser')
        for citation_format in citation_formats:
            key = main_page_soup.title.text
            citation = get_citation_format_from(citation_page_soup,
                                                citation_format)
            if key not in hyperlink_citations:
                hyperlink_citations[key] = {
                    'hyperlink': hyperlink,
                    'citations': [citation]
                }
            else:
                hyperlink_citations[key]['citations'].append(citation)
    with open('citedLinks.md', 'w') as output_file:
        output_file.write('# Citations For Links')
        output_file.write('\n')
        output_file.write('\n')
        for key in hyperlink_citations:
            page_data = hyperlink_citations[key]
            output_file.write(
                format_citation_for(key, page_data, citation_formats))
async def _last_page(self, path):  # private only
    async with aiohttp.ClientSession() as ses:
        async with ses.get(f"{self.base}/{path}/") as r:
            s = bSoup(await r.text(), 'lxml')
            return int(
                s.find('span', class_='pages').text.strip().split()[-1])
def skipLinkZippyshare(url):
    website = uReq.get(url)
    data = bSoup(website.content, "lxml")
    for listUrl in data.findAll("script", {"type": "text/javascript"})[5]:
        getLink = re.search(r"https://(\w+)\.zippyshare\.com/v/(\w+)/file.html",
                            listUrl).group()
        return getLink
def JurnalOtakuPost(url):
    with uReq.session() as web:
        web.headers["user-agent"] = "Mozilla/5.0"
        url = web.get(url)
        data = bSoup(url.content, "html5lib")
        datapost = []
        for content in data.findAll(
                "div", {"class": "section-wrapper section-article-content"}):
            for contentData in content.findAll("div", {"class": "meta-cover"}):
                for getData in contentData.findAll("img"):
                    title = getData["alt"]
                    thumb = getData["src"]
            for contentPost in content.findAll("div", {"class": "meta-content"}):
                for getData in contentPost.findAll("p"):
                    text = getData.text
                    datapost.append(text)
        berita = ""
        for detailsPost in datapost:
            berita += detailsPost
        result = {
            "code": 200,
            "result": {
                "title": title,
                "thumb": thumb,
                "berita": berita
            }
        }
        print(json.dumps(result, indent=4, sort_keys=False))
async def apink(self, ctx, *, msg):
    members = ['eunji', 'bomi', 'hayoung', 'naeun', 'namjoo', 'chorong']
    for mem in members:
        if str(msg.lower()) == mem:
            link = f'https://kprofiles.com/{msg}-profile-facts/'
            emb_title = 'Member Profile'
            source = requests.get(link).text
            soup = bSoup(source, 'lxml')
            # print these facts
            kp_f = soup.find('div', class_='entry-content').p.text
            kp_jpg = soup.find('div', class_='entry-content').img
            kp_src = kp_jpg['src']
            # print this image
            embed_kp = discord.Embed(title=emb_title, color=0x29FFCE)
            embed_kp.add_field(name='Info', value=kp_f)
            embed_kp.set_image(url=kp_src)
            embed_kp.add_field(name='Profile Link', value=link, inline=False)
            await ctx.send(embed=embed_kp)
            break
def info_video(self, buscar, options):
    i = 0
    info_title = []
    info_uploader = []
    info_video = []
    for y in buscar:
        req = requests.get(buscar[i])
        pagesoup = bSoup(req.text, 'html.parser')
        html2 = pagesoup.find_all('meta', {'name': 'title'})
        html3 = pagesoup.find_all('link', {'itemprop': 'name'})
        title = html2[0]['content']
        uploader = html3[0]['content']
        info_title.insert(i, title)
        info_uploader.insert(i, uploader)
        i += 1
    i = 0
    for y in buscar:
        info_video.append(info_title[i] + ' no canal ' + info_uploader[i])
        i += 1
    if 'título' in options:
        return info_title
    elif 'canal' in options:
        return info_uploader
    else:
        return info_video
def page_parser_from_file(self, file_name: str) -> 'BeautifulSoup':
    """Return the page processed by the HTML parser.

    :param file_name: the name of the file to parse.
    :return:
    """
    with open(str(file_name), encoding='utf-8') as page_file:
        return bSoup(page_file, 'html.parser')
def get_item_containers_ng(url):
    """Creates a list of item containers from the specified site."""
    request = requests.get(url).text
    soup = bSoup(request, "html.parser")
    itemContainers = soup.find_all("div", {"class": "item-container"})
    return itemContainers, soup
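# Usage sketch for get_item_containers_ng (illustrative, not from the original
# source): the listing URL below is a placeholder, and the imports mirror what
# the function above assumes (requests plus BeautifulSoup aliased as bSoup).
import requests
from bs4 import BeautifulSoup as bSoup

containers, listing_soup = get_item_containers_ng("https://example.com/listing")
print(len(containers), "item containers found")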
def skipLinkNekopoi(url, quality):
    with session as web:
        web.headers["user-agent"] = "Mozilla/5.0"
        url = web.get(url)
        data = bSoup(url.content, "lxml")
        for listUrl in data.findAll("div", {"class": "col-sm-6"}):
            result[quality].append(listUrl.a['href'])
def getSpreadsheet(url):
    soup = bSoup(requests.get(url).text, "html.parser")
    rows = soup.findAll("tr")
    table = []
    completedRounds = 0
    for i in range(4, 24):  # P1 to P20
        columns = str(rows[i]).split("<td")
        tempCompletedRounds = 0
        for column in columns[5:18]:  # columns[round 1 : 30 - num rounds]
            if (column.split("\">")[1].split("</td>")[0] != ""):
                tempCompletedRounds += 1
        completedRounds = tempCompletedRounds if tempCompletedRounds > completedRounds else completedRounds
        j = len(columns) - 1
        while j > 31:
            del columns[j]
            j -= 1
        del columns[0:2]
        for j in range(len(columns)):
            columns[j] = columns[j].split("</td")[0].split(".png")[0].split(
                "\">")[-1]
        table.append(columns)
    roundFlag = str(rows[3]).split("<td")[completedRounds].split(
        "flags/")[1].split(".png")[0]
    return table, roundFlag
async def main():
    async with aiohttp.ClientSession() as session:
        url = input()
        html = await fetch(session, url)
        soup = bSoup(html, 'html.parser')
        print(soup.title.get_text())
        print(soup.title.name)
        print(soup.p)
        words = soup.get_text()
        delimits = string.punctuation
        words = words.strip(delimits).lower()
        wordsList = words.split()
        wordsList = [x.strip(delimits) for x in wordsList]
        notCount = ["a", "the", "and", "but", "or", "of", " "]
        # Drop common stop words before counting
        wordsList = [w for w in wordsList if w not in notCount]
        wordCount = {}
        for word in wordsList:
            if word not in wordCount:
                wordCount[word] = 1
            else:
                wordCount[word] += 1
        filestring = ""
        for key, item in wordCount.items():
            filestring += key + " " + str(item) + "\n"
        with open('wordcount.txt', 'w') as fp:
            fp.write(filestring)
def PrintTrackData( xmlData, MediaType = "file" ):
    xmlInform = bSoup( xmlData, "xml" )
    Keys = ['subtitle', 'audio', 'video']
    Values = ['Text', 'Audio', 'Video']
    TrackTypeDict = dict( zip( Keys, Values ) )
    for Type, SearchKey in TrackTypeDict.items():
        if ( Type == "video" ):
            print( "\t===== Video Tracks ===========================================================================================================" )
            for TrackSoup in xmlInform.find_all( "track", type=SearchKey ):
                Track = GetTracks( TrackSoup, Type, MediaType )
                PrintTrack( Track )
        elif ( Type == "audio" ):
            print( "\t===== Audio Tracks ===========================================================================================================" )
            for TrackSoup in xmlInform.find_all( "track", type=SearchKey ):
                Track = GetTracks( TrackSoup, Type, MediaType )
                PrintTrack( Track )
        elif ( Type == "subtitle" ):
            print( "\t===== Subtitle Tracks ========================================================================================================" )
            for TrackSoup in xmlInform.find_all( "track", type=SearchKey ):
                Track = GetTracks( TrackSoup, Type, MediaType )
                PrintTrack( Track )
    print( "\t===== Chapters ===============================================================================================================" )
    Chapters = GetChapters( xmlData )
    for ChapterID, Chapter in Chapters.items():
        print( "\t Chapter %s:\t%s" % ( ChapterID, Chapter ) )
def conn(my_url):
    # open connection and read from page
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    pageSoup = bSoup(page_html, "html.parser")
    return pageSoup
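# Usage sketch for conn (illustrative, not from the original source): fetch a
# page and print its title. The URL is a placeholder, and treating uReq as
# urllib.request.urlopen is an assumption that matches how conn calls it.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as bSoup

page_soup = conn("https://example.com")
print(page_soup.title)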
def get_company_data(list_of_companies):
    list_of_companies_data = []
    for item in list_of_companies:
        new_request = requests.get(item['url'])
        if new_request.ok:
            company_soup = bSoup(new_request.text, 'lxml')
            name_of_company = company_soup.find('h1').text
            website_of_company = company_soup.find('span', class_='website-company')
            if website_of_company:
                website = website_of_company.find('a').get('href')
            else:
                website = None
            glyphicon_phone = company_soup.find('span', class_='glyphicon-phone')
            if glyphicon_phone:
                tel_contact = glyphicon_phone.find_parent('p')
            else:
                tel_contact = None
            if tel_contact:
                tel_a = tel_contact.find('a')
            else:
                tel_a = None
            if tel_a:
                tel = tel_a.text
            else:
                tel = None
            data_company = {
                'name': name_of_company,
                'website': website,
                'phone': tel
            }
            list_of_companies_data.append(data_company)
    return list_of_companies_data
async def opgg(self, ctx, *, summoner):
    msgn = str(summoner)
    op_gg = msgn.split()
    if len(op_gg) > 1:
        username = msgn.replace(' ', '+')
    elif len(op_gg) == 1:
        username = msgn
    else:
        pass
    url = f'https://na.op.gg/summoner/userName={username}'
    try:
        source = requests.get(url).text
        soup = bSoup(source, 'lxml')
        meow = soup.find('h2', class_='Title').text
    except:
        meow = 'nothing'
    if meow == 'This summoner is not registered at OP.GG. Please check spelling.':
        await ctx.send('Please enter a valid summoner name')
    else:
        message = await ctx.send(url)
        await message.add_reaction('\N{White Heavy Check Mark}')
        await message.add_reaction('\N{Cross Mark}')
def __linkPreAnalysis(self):
    # initialize self.hrefMap and self.anchorTextMap
    for file_ in self.allFilesMap:
        self.hrefMap[file_] = []
        self.anchorTextMap[file_] = []
        # use the URL itself as an anchorText
        self.anchorTextMap[file_].append(file_)
    for file_ in self.allFilesMap:
        with open(os.path.join(self.dataDir, file_), 'r') as fopen_:
            # check if actually parsable
            try:
                soup = bSoup(fopen_, 'lxml')
            except:
                continue
            for working in soup.findAll('a', href=True):
                fullUrl = urljoin(file_, working['href'])
                # check if the url actually exists
                if fullUrl in self.allFilesMap:
                    # put link in
                    self.hrefMap[file_].append(fullUrl)
                    # put anchor text in
                    self.anchorTextMap[fullUrl].append(working.getText())
def convertFile(dir, fName, outputStream):
    with open(dir + fName, 'r', encoding='utf-8') as src:
        text = bSoup(src, 'html.parser').get_text()
    text = revLine.sub(insertNewline, text)
    text = tabLine.sub(insertNewline, text)
    text = imbededLemDetect.sub(unEmbed1, text)
    text = imbededTxtDetect.sub(unEmbed2, text)
    text = imbededTxtDetect2.sub(unEmbed2, text)
    lines = text.splitlines()
    revData = []
    revStart = -1
    revLemCount = 0
    for i in range(len(lines)):
        line = lines[i]
        # print(i, ":", line)
        if revLine.match(line):
            if revStart >= 0:
                revData.append((revStart, i, revLemCount))
            revStart = i
            revLemCount = 0
        elif lemLine.match(line):
            revLemCount += 1
    if revStart >= 0:
        revData.append((revStart, len(lines), revLemCount))
    revData.sort(key=itemgetter(2), reverse=True)
    # print(revData)
    if len(revData) > 0:
        selectedRev = revData[0]
        for line in lines[selectedRev[0] + 1:selectedRev[1]]:
            print(line, file=outputStream)
        return selectedRev[2] > 0
    return False
def top_manga(self, genre=''):
    if genre != '':
        top = []
        r = requests.get(f"{self.base_url}/top-30-manga-{genre.lower()}")
        s = bSoup(r.text, 'lxml')
        mangas = s.find_all('div', class_='flexbox2-item')
        for s in mangas:
            data = {
                'thumbnail': s.find('div', class_='flexbox2-thumb').img['src'],
                'title': {
                    'japanese': s.find('span', class_='title').text
                },
                'genres': [a.text for a in s.find_all('a', rel='tag')],
                'synopsis': s.find('div', class_='synops').text,
                'chapters': re.compile("Ch. ([0-9]+)").search(
                    s.find('div', class_='season').text).group(1).strip()
                if s.find('div', class_='season') is not None else "",
                'author': s.find('span', class_='studio').text,
                'rating': s.find('div', class_='score').text
            }
            top.append(data)
        return top
    else:
        raise Exception(
            'Top genre not found, available top genre: Romance, Comedy, Harem.'
        )
def search(self, query=''):
    all_search = []
    r = requests.get(f"{self.base_url}/?s={quote(query)}")
    s = bSoup(r.text, 'lxml')
    mangas = s.find_all('div', class_='flexbox2-item')
    for s in mangas:
        data = {
            'thumbnail': s.find('div', class_='flexbox2-thumb').img['src'],
            'title': {
                'japanese': s.find('span', class_='title').text
            },
            'genres': [a.text for a in s.find_all('a', rel='tag')],
            'synopsis': s.find('div', class_='synops').text,
            'chapters': re.compile("Ch. ([0-9]+)").search(
                s.find('div', class_='season').text).group(1).strip()
            if s.find('div', class_='season') is not None else "",
            'author': s.find('span', class_='studio').text,
            'rating': s.find('div', class_='score').text
        }
        all_search.append(data)
    return all_search
def get_etd_uris(uri_in='https://surface.syr.edu/etd/',
                 start_page=1,
                 end_page=1):
    """
    A recursive function for collecting ETD URIs from SURFACE
    (http://surface.syr.edu/etd/).

    :param uri_in: the URI where the ETDs are located (page)
    :param start_page: the starting page from which we should start collecting ETD URIs
    :param end_page: the ending page of pages we wish to collect
    :return: a list of ETD URIs (as strings) from SURFACE
    """
    if start_page != 1 and "index" not in uri_in:
        uri_in = uri_in + 'index.{}.html'.format(str(start_page))
    base_uri = 'https://surface.syr.edu/etd/'
    resp = requests.get(uri_in)
    soup = bSoup(resp.content, 'html.parser')
    anchors = soup.find_all('a')
    etd_links = [
        a['href'] for a in anchors
        if re.match(r'https://surface.syr.edu/' + r'[A-Za-z_]+/\d{1,4}/?$',
                    a['href']) is not None
    ]
    if start_page == end_page:
        return etd_links
    return etd_links + get_etd_uris(
        uri_in=base_uri + 'index.{}.html'.format(str(start_page + 1)),
        start_page=start_page + 1,
        end_page=end_page)
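# Usage sketch for get_etd_uris (illustrative, not from the original source):
# collect ETD URIs from the first two listing pages on SURFACE. The page range
# is only an example; the imports are the ones the function above assumes.
import re
import requests
from bs4 import BeautifulSoup as bSoup

etd_links = get_etd_uris(start_page=1, end_page=2)
print(len(etd_links), "ETD URIs collected")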
async def krupdate(self, ctx):
    db = cluster['minjubot']
    krbot = db['hyewonfragrant']
    url = 'https://www.koreanclass101.com/korean-phrases/'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko)Version/12.1.1 Safari/605.1.15'
    }
    source = requests.get(url, headers=headers).text
    soup = bSoup(source, 'lxml')
    wode = soup.find('div', class_='r101-wotd-widget__english').text
    wtd = []
    ix = krbot.find({'index': 'qfind'})
    for item in ix:
        wtd.append(item['krword'])
    twod = ''.join(wtd)
    if twod == wode:
        await (await ctx.send("Already updated")).delete(delay=3)
    else:
        today = date.today()
        await ctx.send(
            f"```css\n{today} - Korean words of the day with examples```")
        wodxa = soup.find_all('div', class_='r101-wotd-widget__word')
        wodexa = soup.find_all('div', class_='r101-wotd-widget__english')
        ewords = []
        for eng in wodexa:
            ewords.append(eng.get_text())
        ewords = ["||" + item + "||" for item in ewords]
        kwords = []
        for kor in wodxa:
            kwords.append(kor.get_text())
        xlst = list(reduce(operator.add, zip(ewords, kwords)))
        carrot = ''
        for xls in range(len(xlst)):
            if xls % 2 == 0:
                nls = xls - 2
                blist = ' - '.join(xlst[nls:xls])
                if blist == '':
                    pass
                else:
                    carrot += f'{blist}\n'
        await ctx.send(carrot)
        if twod == '':
            newvalues = {'index': 'qfind', 'krword': wode}
            krbot.insert_one(newvalues)
        else:
            query = {'index': 'qfind'}
            krbot.update_one(query, {'$set': {'krword': wode}})
        await ctx.message.delete()
async def _images(self, url):  # private only
    async with aiohttp.ClientSession() as ses:
        async with ses.get(url) as r:
            s = bSoup(await r.text(), 'lxml')
            return [
                a['data-src'].lstrip()
                for a in s.find_all("img", class_='wp-manga-chapter-img')
            ]
def page_parser(self, source_url: str) -> 'BeautifulSoup':
    """Return the page processed by the HTML parser.

    :param source_url: the URL of the page to fetch and parse.
    :return:
    """
    result = self.__session_object.get(str(source_url), verify=False)
    page = result.text
    return bSoup(page, 'html.parser')
def search_terms_raw(self):
    """Switches to looking for a term by scraping the first web page of search results"""
    self.LOGGER.debug("Web scraping page 1 of web results for {}...".format(self.term))
    search_uri = self.__raw_uri_start + quote(self.term) + self.__raw_uri_end
    response = requests.get(search_uri)
    parser = bSoup(response.text, 'html.parser')
    pattern = re.compile("<td><a href=\"/authorities" + self.term_type + ".+</a></td>")
    search_results = re.findall(pattern, str(parser))
    return self.__process_results_raw(search_results)
def MusixmatchLyric():
    with uReq.session() as web:
        web.headers["user-agent"] = "Mozilla/5.0"
        url = web.get(
            "https://www.musixmatch.com/lyrics/Avicii/The-Days".format(
                urllib.parse.quote))
        data = bSoup(url.content, "html5lib")
        for lyricContent in data.findAll("p", {"class": "mxm-lyrics__content "}):
            print(lyricContent)
def swc_ukmetoffice():
    '''Retrieves surface weather chart links from the UK Met Office website'''
    print('Retrieving surface weather charts...')
    url = 'https://www.metoffice.gov.uk/weather/maps-and-charts/surface-pressure'
    source = bSoup(get(url).text, 'lxml')
    # Scrape list of charts from page
    charts_list = source.find(id='colourCharts').find_all('li')
    # Extract links from list
    surface_links_uk = [chart.img['src'] for chart in charts_list][1:]
    return surface_links_uk
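# Usage sketch for swc_ukmetoffice (illustrative, not from the original
# source): print each chart link on its own line. Assuming `get` is
# requests.get, which matches how the function above calls it.
from requests import get
from bs4 import BeautifulSoup as bSoup

for chart_link in swc_ukmetoffice():
    print(chart_link)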
def get_page_data(html):
    company_list = []
    soup = bSoup(html, 'lxml')
    companies = soup.find('ul', class_='logotypes-squares').find_all('li')
    for company in companies:
        name = company.find('a').find('h5').text
        company_url = 'https://www.work.ua' + company.find('a').get('href')
        data = {'name': name, 'url': company_url}
        company_list.append(data)
    return company_list
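# Usage sketch for get_page_data (illustrative, not from the original source):
# download a listing page and print the extracted company names and URLs. The
# listing URL is a placeholder guess at a page that contains the
# 'logotypes-squares' list the selector above expects.
import requests

listing_html = requests.get('https://www.work.ua/').text
for company in get_page_data(listing_html):
    print(company['name'], company['url'])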
def GetChapters( xmlData ):
    Chapter = {}
    xmlInform = bSoup( xmlData, "xml" )
    ChapCount = 0
    for ChapterData in xmlInform.find_all( "track", type="Menu" ):
        for child in ChapterData.children:
            if ( len( child.string.strip() ) > 0 ):
                ChapCount += 1
                Chapter[ChapCount] = child.string.strip()
    return Chapter
def search(self, kata):
    params = {'q': kata}
    r = self.req.post(self.url, data=params)
    data = bSoup(r.content, 'html5lib')
    temp = data.find('div', {'class': 'thesaurus_group'})
    result = []
    for i in temp.findAll("a"):
        teks = i.text
        result.append(teks)
    return result
def grab100():
    result = []  # For end result
    for pageCounter in range(1, 2):
        # Create url address
        url = 'https://www.amazon.com/Best-Sellers-Books-Biographies/zgbs/books/2' + str(pageCounter)
        # Connect to page
        connect = uRequest(url)
        response = connect.read()
        connect.close()
        # Parse response and grab data
        pRespone = bSoup(response, 'html.parser')
        bookContainer = pRespone.findAll('li', {'class': 'book'})
        booksContent = []
        # Grab data
        for book in bookContainer:
            bookTitle = book.findAll('a', {'class': 'bookTitle'})[0].text
            bookAuthor = book.findAll('a', {'itemprop': 'name'})[0].text
            bookRank = book.findAll("div", {"class": "sprite"})[0].text
            bookStatsBox = book.findAll("div", {"class": "book-stats"})[0].findAll(
                "span", {"class": "font-szary-4a"})
            bookReaders = bookStatsBox[0].text
            bookOpinions = bookStatsBox[1].text
            bookRate = bookStatsBox[2].text
            # Delete reserved characters
            reserved_chars = ('★', '⬈', '⬊', '⬌', '\'', '\"')
            reserved_list = [bookTitle, bookAuthor, bookRank]
            free_list = []
            for element in reserved_list:
                for rChar in reserved_chars:
                    if rChar in element:
                        element = element.replace(rChar, '')
                free_list.append(element)
            # Add to end result
            result.append((free_list[0], free_list[1], free_list[2],
                           bookReaders, bookOpinions, bookRate))
    print('Successful download data from website\n\n')
    return result
def get_company_data(list_of_companies):
    list_of_companies_data = []
    for item in list_of_companies:
        new_request = requests.get(item['url'])
        if new_request.ok:
            company_soup = bSoup(new_request.text, 'lxml')
            name_of_company = company_soup.find('h1').text
            website_of_company = company_soup.find('span', class_='website-company')
            if website_of_company:
                website = website_of_company.find('a').get('href')
            else:
                website = None
            glyphicon_phone = company_soup.find('span', class_='glyphicon-phone')
            if glyphicon_phone:
                tel_contact = glyphicon_phone.find_parent('p')
            else:
                tel_contact = None
            if tel_contact:
                tel_a = tel_contact.find('a')
            else:
                tel_a = None
            if tel_a:
                tel = tel_a.text
            else:
                tel = None
            vacancies_link = []
            job_links = company_soup.find_all('div', class_='job-link')
            for new_item in job_links:
                a_name = 'https://www.work.ua' + new_item.find('h2').find(
                    'a').get('href')
                vacancies_link.append(a_name)
            tels = []
            for vacancy in vacancies_link:
                driver = webdriver.Firefox()
                driver.get(vacancy)
                try:
                    driver.find_element(By.CSS_SELECTOR,
                                        ".link-phone > span").click()
                    tel_a = driver.find_element(By.CSS_SELECTOR,
                                                "#contact-phone").text
                except Exception:
                    tel_a = None
                driver.quit()
                tels.append(tel_a)
            data_company = {
                'name': name_of_company,
                'website': website,
                'phone': tel,
                'tels': tels
            }
            list_of_companies_data.append(data_company)
    return list_of_companies_data
def Musixmatch():
    with uReq.session() as web:
        web.headers["user-agent"] = "Mozilla/5.0"
        url = web.get("https://www.musixmatch.com/search/avici".format(
            urllib.parse.quote))
        data = bSoup(url.content, "html5lib")
        for trackList in data.findAll("ul", {"class": "tracks list"}):
            for urlList in trackList.findAll("a"):
                title = urlList.text
                url = urlList["href"]
                print(title, url)
def GetTrackData( xmlData, MediaType = None ):
    Tracks = []
    xmlInform = bSoup( xmlData, "xml" )
    Keys = ['subtitle', 'audio', 'video']
    Values = ['Text', 'Audio', 'Video']
    TrackTypeDict = dict( zip( Keys, Values ) )
    for Type, SearchKey in TrackTypeDict.items():
        for TrackSoup in xmlInform.find_all( "track", type=SearchKey ):
            Tracks.append( GetTracks( TrackSoup, Type ) )
    Chapters = GetChapters( xmlData )
    for ChapterID, ChapterText in Chapters.items():
        Keys = ['type', 'chapterid', 'chaptertext']
        Values = ['chapter', ChapterID, ChapterText]
        ChapterDict = dict( zip( Keys, Values ) )
        Tracks.append( ChapterDict )
    HandBrakeCLI = HandBrake( ProgArgs, Tracks, MediaType = MediaType, Logger = Logger )
    if ( HandBrakeCLI.ChapterList != False ):
        if ( not ProgArgs.dryrun ):
            WriteChapterFile( HandBrakeCLI.ChapterList )
def PrettyPrint( xmlData ):
    xmlInform = bSoup( xmlData, "xml" )
    print( xmlInform.prettify() )
def PrintXMLData( xmlData ):
    xmlInform = bSoup( xmlData, "xml" )
    InformData = xmlInform.prettify()
    print(InformData)
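# Usage sketch for PrettyPrint / PrintXMLData (illustrative, not from the
# original source): feed them XML text produced by the MediaInfo CLI. The file
# name is a placeholder, and invoking `mediainfo --Output=XML` is an assumption
# about how xmlData was generated in the original project.
import subprocess

xml_data = subprocess.run(
    ["mediainfo", "--Output=XML", "movie.mkv"],
    capture_output=True, text=True, check=True).stdout
PrettyPrint(xml_data)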