def main(app):
    """
    CommandLineApp which opens a supplied text file, and then crawls
    Wikipedia until relevant citation formats are found. The citation
    formats and links are then printed to a markdown file.
    """
    base_url = 'https://en.wikipedia.org'
    # Store hyperlink citations as key-value pairs with the link
    # being the key.
    hyperlink_citations = {}
    citation_formats = app.params.citation_formats.split(' ')
    file_name = app.params.file_name
    # Get hyperlinks from file and store in a list
    hyperlinks = [x.strip() for x in open_and_read(file_name)]
    # Iterate through hyperlinks and retrieve citations
    for hyperlink in hyperlinks:
        main_page_soup = bSoup(request_html_for(hyperlink), 'html.parser')
        # Retrieve citation page hyperlink
        citation_page_hyperlink = base_url + main_page_soup.find(
            'a', title='Information on how to cite this page')['href'].strip()
        # Retrieve citation page soup
        citation_page_soup = bSoup(request_html_for(citation_page_hyperlink),
                                   'html.parser')
        for citation_format in citation_formats:
            key = main_page_soup.title.text
            citation = get_citation_format_from(citation_page_soup,
                                                citation_format)
            if key not in hyperlink_citations:
                hyperlink_citations[key] = {
                    'hyperlink': hyperlink,
                    'citations': [citation]
                }
            else:
                hyperlink_citations[key]['citations'].append(citation)
    with open('citedLinks.md', 'w') as output_file:
        output_file.write('# Citations For Links')
        output_file.write('\n')
        output_file.write('\n')
        for key in hyperlink_citations:
            page_data = hyperlink_citations[key]
            output_file.write(
                format_citation_for(key, page_data, citation_formats))
async def _last_page(self, path):  # private only
    async with aiohttp.ClientSession() as ses:
        async with ses.get(f"{self.base}/{path}/") as r:
            s = bSoup(await r.text(), 'lxml')
            return int(
                s.find('span', class_='pages').text.strip().split()[-1])
def skipLinkZippyshare(url):
    website = uReq.get(url)
    data = bSoup(website.content, "lxml")
    for listUrl in data.findAll("script", {"type": "text/javascript"})[5]:
        getLink = re.search(r"https://(\w+)\.zippyshare\.com/v/(\w+)/file.html",
                            listUrl).group()
        return getLink
def JurnalOtakuPost(url):
    with uReq.session() as web:
        web.headers["user-agent"] = "Mozilla/5.0"
        url = web.get(url)
        data = bSoup(url.content, "html5lib")
        datapost = []
        for content in data.findAll(
                "div", {"class": "section-wrapper section-article-content"}):
            for contentData in content.findAll("div", {"class": "meta-cover"}):
                for getData in contentData.findAll("img"):
                    title = getData["alt"]
                    thumb = getData["src"]
            for contentPost in content.findAll("div", {"class": "meta-content"}):
                for getData in contentPost.findAll("p"):
                    text = getData.text
                    datapost.append(text)
        berita = ""
        for detailsPost in datapost:
            berita += detailsPost
        result = {
            "code": 200,
            "result": {
                "title": title,
                "thumb": thumb,
                "berita": berita
            }
        }
        print(json.dumps(result, indent=4, sort_keys=False))
async def apink(self, ctx, *, msg):
    members = ['eunji', 'bomi', 'hayoung', 'naeun', 'namjoo', 'chorong']
    for mem in members:
        if str(msg.lower()) == mem:
            link = f'https://kprofiles.com/{msg}-profile-facts/'
            emb_title = 'Member Profile'
            source = requests.get(link).text
            soup = bSoup(source, 'lxml')
            # print these facts
            kp_f = soup.find('div', class_='entry-content').p.text
            kp_jpg = soup.find('div', class_='entry-content').img
            kp_src = kp_jpg['src']
            # print this image
            embed_kp = discord.Embed(title=emb_title, color=0x29FFCE)
            embed_kp.add_field(name='Info', value=kp_f)
            embed_kp.set_image(url=kp_src)
            embed_kp.add_field(name='Profile Link', value=link, inline=False)
            await ctx.send(embed=embed_kp)
            break
def info_video(self, buscar, options):
    i = 0
    info_title = []
    info_uploader = []
    info_video = []
    for y in buscar:
        req = requests.get(buscar[i])
        pagesoup = bSoup(req.text, 'html.parser')
        html2 = pagesoup.find_all('meta', {'name': 'title'})
        html3 = pagesoup.find_all('link', {'itemprop': 'name'})
        title = html2[0]['content']
        uploader = html3[0]['content']
        info_title.insert(i, title)
        info_uploader.insert(i, uploader)
        i += 1
    i = 0
    for y in buscar:
        info_video.append(info_title[i] + ' no canal ' + info_uploader[i])
        i += 1
    if 'título' in options:
        return info_title
    elif 'canal' in options:
        return info_uploader
    else:
        return info_video
def page_parser_from_file(self, file_name: str) -> 'BeautifulSoup':
    """Return the page processed by the HTML parser.

    :param file_name: the name of the file to parse.
    :return:
    """
    with open(str(file_name), encoding='utf-8') as page_file:
        return bSoup(page_file, 'html.parser')
def get_item_containers_ng(url):
    """Creates a list of item containers from the specified site."""
    request = requests.get(url).text
    soup = bSoup(request, "html.parser")
    itemContainers = soup.find_all("div", {"class": "item-container"})
    return itemContainers, soup
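# Usage sketch for get_item_containers_ng (illustrative, not from the original
# source): the listing URL below is a placeholder, and the imports mirror what
# the function above assumes (requests plus BeautifulSoup aliased as bSoup).
import requests
from bs4 import BeautifulSoup as bSoup

containers, listing_soup = get_item_containers_ng("https://example.com/listing")
print(len(containers), "item containers found")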
def skipLinkNekopoi(url, quality):
    with session as web:
        web.headers["user-agent"] = "Mozilla/5.0"
        url = web.get(url)
        data = bSoup(url.content, "lxml")
        for listUrl in data.findAll("div", {"class": "col-sm-6"}):
            result[quality].append(listUrl.a['href'])
def getSpreadsheet(url):
    soup = bSoup(requests.get(url).text, "html.parser")
    rows = soup.findAll("tr")
    table = []
    completedRounds = 0
    for i in range(4, 24):  # P1 to P20
        columns = str(rows[i]).split("<td")
        tempCompletedRounds = 0
        for column in columns[5:18]:  # columns[round 1 : 30 - num rounds]
            if (column.split("\">")[1].split("</td>")[0] != ""):
                tempCompletedRounds += 1
        completedRounds = tempCompletedRounds if tempCompletedRounds > completedRounds else completedRounds
        j = len(columns) - 1
        while j > 31:
            del columns[j]
            j -= 1
        del columns[0:2]
        for j in range(len(columns)):
            columns[j] = columns[j].split("</td")[0].split(".png")[0].split(
                "\">")[-1]
        table.append(columns)
    roundFlag = str(rows[3]).split("<td")[completedRounds].split(
        "flags/")[1].split(".png")[0]
    return table, roundFlag
async def main():
    async with aiohttp.ClientSession() as session:
        url = input()
        html = await fetch(session, url)
        soup = bSoup(html, 'html.parser')
        print(soup.title.get_text())
        print(soup.title.name)
        print(soup.p)
        words = soup.get_text()
        delimits = string.punctuation
        words = words.strip(delimits).lower()
        wordsList = words.split()
        wordsList = [x.strip(delimits) for x in wordsList]
        notCount = ["a", "the", "and", "but", "or", "of", " "]
        # Drop common stop words before counting
        wordsList = [w for w in wordsList if w not in notCount]
        wordCount = {}
        for word in wordsList:
            if word not in wordCount:
                wordCount[word] = 1
            else:
                wordCount[word] += 1
        filestring = ""
        for key, item in wordCount.items():
            filestring += key + " " + str(item) + "\n"
        with open('wordcount.txt', 'w') as fp:
            fp.write(filestring)
def PrintTrackData( xmlData, MediaType = "file" ):
    xmlInform = bSoup( xmlData, "xml" )
    Keys = ['subtitle', 'audio', 'video']
    Values = ['Text', 'Audio', 'Video']
    TrackTypeDict = dict( zip( Keys, Values ) )
    for Type, SearchKey in TrackTypeDict.items():
        if ( Type == "video" ):
            print( "\t===== Video Tracks ===========================================================================================================" )
            for TrackSoup in xmlInform.find_all( "track", type=SearchKey ):
                Track = GetTracks( TrackSoup, Type, MediaType )
                PrintTrack( Track )
        elif ( Type == "audio" ):
            print( "\t===== Audio Tracks ===========================================================================================================" )
            for TrackSoup in xmlInform.find_all( "track", type=SearchKey ):
                Track = GetTracks( TrackSoup, Type, MediaType )
                PrintTrack( Track )
        elif ( Type == "subtitle" ):
            print( "\t===== Subtitle Tracks ========================================================================================================" )
            for TrackSoup in xmlInform.find_all( "track", type=SearchKey ):
                Track = GetTracks( TrackSoup, Type, MediaType )
                PrintTrack( Track )
    print( "\t===== Chapters ===============================================================================================================" )
    Chapters = GetChapters( xmlData )
    for ChapterID, Chapter in Chapters.items():
        print( "\t Chapter %s:\t%s" % ( ChapterID, Chapter ) )
def conn(my_url):
    # open connection and read from page
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    pageSoup = bSoup(page_html, "html.parser")
    return pageSoup
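# Usage sketch for conn (illustrative, not from the original source): fetch a
# page and print its title. The URL is a placeholder, and treating uReq as
# urllib.request.urlopen is an assumption that matches how conn calls it.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as bSoup

page_soup = conn("https://example.com")
print(page_soup.title)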
def get_company_data(list_of_companies):
    list_of_companies_data = []
    for item in list_of_companies:
        new_request = requests.get(item['url'])
        if new_request.ok:
            company_soup = bSoup(new_request.text, 'lxml')
            name_of_company = company_soup.find('h1').text
            website_of_company = company_soup.find('span', class_='website-company')
            if website_of_company:
                website = website_of_company.find('a').get('href')
            else:
                website = None
            glyphicon_phone = company_soup.find('span', class_='glyphicon-phone')
            if glyphicon_phone:
                tel_contact = glyphicon_phone.find_parent('p')
            else:
                tel_contact = None
            if tel_contact:
                tel_a = tel_contact.find('a')
            else:
                tel_a = None
            if tel_a:
                tel = tel_a.text
            else:
                tel = None
            data_company = {
                'name': name_of_company,
                'website': website,
                'phone': tel
            }
            list_of_companies_data.append(data_company)
    return list_of_companies_data
async def opgg(self, ctx, *, summoner):
    msgn = str(summoner)
    op_gg = msgn.split()
    if len(op_gg) > 1:
        username = msgn.replace(' ', '+')
    elif len(op_gg) == 1:
        username = msgn
    else:
        pass
    url = f'https://na.op.gg/summoner/userName={username}'
    try:
        source = requests.get(url).text
        soup = bSoup(source, 'lxml')
        meow = soup.find('h2', class_='Title').text
    except:
        meow = 'nothing'
    if meow == 'This summoner is not registered at OP.GG. Please check spelling.':
        await ctx.send('Please enter a valid summoner name')
    else:
        message = await ctx.send(url)
        await message.add_reaction('\N{White Heavy Check Mark}')
        await message.add_reaction('\N{Cross Mark}')
def __linkPreAnalysis(self):
    # initialize self.hrefMap and self.anchorTextMap
    for file_ in self.allFilesMap:
        self.hrefMap[file_] = []
        self.anchorTextMap[file_] = []
        # use the URL itself as an anchorText
        self.anchorTextMap[file_].append(file_)
    for file_ in self.allFilesMap:
        with open(os.path.join(self.dataDir, file_), 'r') as fopen_:
            # check if actually parsable
            try:
                soup = bSoup(fopen_, 'lxml')
            except:
                continue
            for working in soup.findAll('a', href=True):
                fullUrl = urljoin(file_, working['href'])
                # check if the url actually exists
                if fullUrl in self.allFilesMap:
                    # put link in
                    self.hrefMap[file_].append(fullUrl)
                    # put anchor text in
                    self.anchorTextMap[fullUrl].append(working.getText())
def convertFile(dir, fName, outputStream):
    with open(dir + fName, 'r', encoding='utf-8') as src:
        text = bSoup(src, 'html.parser').get_text()
    text = revLine.sub(insertNewline, text)
    text = tabLine.sub(insertNewline, text)
    text = imbededLemDetect.sub(unEmbed1, text)
    text = imbededTxtDetect.sub(unEmbed2, text)
    text = imbededTxtDetect2.sub(unEmbed2, text)
    lines = text.splitlines()
    revData = []
    revStart = -1
    revLemCount = 0
    for i in range(len(lines)):
        line = lines[i]
        # print(i, ":", line)
        if revLine.match(line):
            if revStart >= 0:
                revData.append((revStart, i, revLemCount))
            revStart = i
            revLemCount = 0
        elif lemLine.match(line):
            revLemCount += 1
    if revStart >= 0:
        revData.append((revStart, len(lines), revLemCount))
    revData.sort(key=itemgetter(2), reverse=True)
    # print(revData)
    if len(revData) > 0:
        selectedRev = revData[0]
        for line in lines[selectedRev[0] + 1:selectedRev[1]]:
            print(line, file=outputStream)
        return selectedRev[2] > 0
    return False
def top_manga(self, genre=''):
    if genre != '':
        top = []
        r = requests.get(f"{self.base_url}/top-30-manga-{genre.lower()}")
        s = bSoup(r.text, 'lxml')
        mangas = s.find_all('div', class_='flexbox2-item')
        for s in mangas:
            data = {
                'thumbnail': s.find('div', class_='flexbox2-thumb').img['src'],
                'title': {
                    'japanese': s.find('span', class_='title').text
                },
                'genres': [a.text for a in s.find_all('a', rel='tag')],
                'synopsis': s.find('div', class_='synops').text,
                'chapters': re.compile("Ch. ([0-9]+)").search(
                    s.find('div', class_='season').text).group(1).strip()
                if s.find('div', class_='season') is not None else "",
                'author': s.find('span', class_='studio').text,
                'rating': s.find('div', class_='score').text
            }
            top.append(data)
        return top
    else:
        raise Exception(
            'Top genre not found, available top genre: Romance, Comedy, Harem.'
        )
def search(self, query=''):
    all_search = []
    r = requests.get(f"{self.base_url}/?s={quote(query)}")
    s = bSoup(r.text, 'lxml')
    mangas = s.find_all('div', class_='flexbox2-item')
    for s in mangas:
        data = {
            'thumbnail': s.find('div', class_='flexbox2-thumb').img['src'],
            'title': {
                'japanese': s.find('span', class_='title').text
            },
            'genres': [a.text for a in s.find_all('a', rel='tag')],
            'synopsis': s.find('div', class_='synops').text,
            'chapters': re.compile("Ch. ([0-9]+)").search(
                s.find('div', class_='season').text).group(1).strip()
            if s.find('div', class_='season') is not None else "",
            'author': s.find('span', class_='studio').text,
            'rating': s.find('div', class_='score').text
        }
        all_search.append(data)
    return all_search
def get_etd_uris(uri_in='https://surface.syr.edu/etd/',
                 start_page=1,
                 end_page=1):
    """
    A recursive function for collecting ETD URIs from SURFACE
    (http://surface.syr.edu/etd/).

    :param uri_in: the URI where the ETDs are located (page)
    :param start_page: the starting page from which we should start collecting ETD URIs
    :param end_page: the ending page of pages we wish to collect
    :return: a list of ETD URIs (as strings) from SURFACE
    """
    if start_page != 1 and "index" not in uri_in:
        uri_in = uri_in + 'index.{}.html'.format(str(start_page))
    base_uri = 'https://surface.syr.edu/etd/'
    resp = requests.get(uri_in)
    soup = bSoup(resp.content, 'html.parser')
    anchors = soup.find_all('a')
    etd_links = [
        a['href'] for a in anchors
        if re.match(r'https://surface.syr.edu/' + r'[A-Za-z_]+/\d{1,4}/?$',
                    a['href']) is not None
    ]
    if start_page == end_page:
        return etd_links
    return etd_links + get_etd_uris(
        uri_in=base_uri + 'index.{}.html'.format(str(start_page + 1)),
        start_page=start_page + 1,
        end_page=end_page)
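# Usage sketch for get_etd_uris (illustrative, not from the original source):
# collect ETD URIs from the first two listing pages on SURFACE. The page range
# is only an example; the imports are the ones the function above assumes.
import re
import requests
from bs4 import BeautifulSoup as bSoup

etd_links = get_etd_uris(start_page=1, end_page=2)
print(len(etd_links), "ETD URIs collected")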
async def krupdate(self, ctx):
    db = cluster['minjubot']
    krbot = db['hyewonfragrant']
    url = 'https://www.koreanclass101.com/korean-phrases/'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko)Version/12.1.1 Safari/605.1.15'
    }
    source = requests.get(url, headers=headers).text
    soup = bSoup(source, 'lxml')
    wode = soup.find('div', class_='r101-wotd-widget__english').text
    wtd = []
    ix = krbot.find({'index': 'qfind'})
    for item in ix:
        wtd.append(item['krword'])
    twod = ''.join(wtd)
    if twod == wode:
        await (await ctx.send("Already updated")).delete(delay=3)
    else:
        today = date.today()
        await ctx.send(
            f"```css\n{today} - Korean words of the day with examples```")
        wodxa = soup.find_all('div', class_='r101-wotd-widget__word')
        wodexa = soup.find_all('div', class_='r101-wotd-widget__english')
        ewords = []
        for eng in wodexa:
            ewords.append(eng.get_text())
        ewords = ["||" + item + "||" for item in ewords]
        kwords = []
        for kor in wodxa:
            kwords.append(kor.get_text())
        xlst = list(reduce(operator.add, zip(ewords, kwords)))
        carrot = ''
        for xls in range(len(xlst)):
            if xls % 2 == 0:
                nls = xls - 2
                blist = ' - '.join(xlst[nls:xls])
                if blist == '':
                    pass
                else:
                    carrot += f'{blist}\n'
        await ctx.send(carrot)
        if twod == '':
            newvalues = {'index': 'qfind', 'krword': wode}
            krbot.insert_one(newvalues)
        else:
            query = {'index': 'qfind'}
            krbot.update_one(query, {'$set': {'krword': wode}})
        await ctx.message.delete()
async def _images(self, url):  # private only
    async with aiohttp.ClientSession() as ses:
        async with ses.get(url) as r:
            s = bSoup(await r.text(), 'lxml')
            return [
                a['data-src'].lstrip()
                for a in s.find_all("img", class_='wp-manga-chapter-img')
            ]
def page_parser(self, source_url: str) -> 'BeautifulSoup':
    """Return the page processed by the HTML parser.

    :param source_url: the URL of the page to fetch and parse.
    :return:
    """
    result = self.__session_object.get(str(source_url), verify=False)
    page = result.text
    return bSoup(page, 'html.parser')
def search_terms_raw(self):
    """Switches to looking for a term by scraping the first web page of search results"""
    self.LOGGER.debug("Web scraping page 1 of web results for {}...".format(self.term))
    search_uri = self.__raw_uri_start + quote(self.term) + self.__raw_uri_end
    response = requests.get(search_uri)
    parser = bSoup(response.text, 'html.parser')
    pattern = re.compile("<td><a href=\"/authorities" + self.term_type + ".+</a></td>")
    search_results = re.findall(pattern, str(parser))
    return self.__process_results_raw(search_results)
def MusixmatchLyric():
    with uReq.session() as web:
        web.headers["user-agent"] = "Mozilla/5.0"
        url = web.get(
            "https://www.musixmatch.com/lyrics/Avicii/The-Days".format(
                urllib.parse.quote))
        data = bSoup(url.content, "html5lib")
        for lyricContent in data.findAll("p", {"class": "mxm-lyrics__content "}):
            print(lyricContent)
def swc_ukmetoffice():
    '''Retrieves surface weather chart links from the UK Met Office website'''
    print('Retrieving surface weather charts...')
    url = 'https://www.metoffice.gov.uk/weather/maps-and-charts/surface-pressure'
    source = bSoup(get(url).text, 'lxml')
    # Scrape list of charts from page
    charts_list = source.find(id='colourCharts').find_all('li')
    # Extract links from list
    surface_links_uk = [chart.img['src'] for chart in charts_list][1:]
    return surface_links_uk
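# Usage sketch for swc_ukmetoffice (illustrative, not from the original
# source): print each chart link on its own line. Assuming `get` is
# requests.get, which matches how the function above calls it.
from requests import get
from bs4 import BeautifulSoup as bSoup

for chart_link in swc_ukmetoffice():
    print(chart_link)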
def get_page_data(html):
    company_list = []
    soup = bSoup(html, 'lxml')
    companies = soup.find('ul', class_='logotypes-squares').find_all('li')
    for company in companies:
        name = company.find('a').find('h5').text
        company_url = 'https://www.work.ua' + company.find('a').get('href')
        data = {'name': name, 'url': company_url}
        company_list.append(data)
    return company_list
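# Usage sketch for get_page_data (illustrative, not from the original source):
# download a listing page and print the extracted company names and URLs. The
# listing URL is a placeholder guess at a page that contains the
# 'logotypes-squares' list the selector above expects.
import requests

listing_html = requests.get('https://www.work.ua/').text
for company in get_page_data(listing_html):
    print(company['name'], company['url'])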
def GetChapters( xmlData ):
    Chapter = {}
    xmlInform = bSoup( xmlData, "xml" )
    ChapCount = 0
    for ChapterData in xmlInform.find_all( "track", type="Menu" ):
        for child in ChapterData.children:
            if ( len( child.string.strip() ) > 0 ):
                ChapCount += 1
                Chapter[ChapCount] = child.string.strip()
    return Chapter
def search(self, kata):
    params = {'q': kata}
    r = self.req.post(self.url, data=params)
    data = bSoup(r.content, 'html5lib')
    temp = data.find('div', {'class': 'thesaurus_group'})
    result = []
    for i in temp.findAll("a"):
        teks = i.text
        result.append(teks)
    return result
def grab100():
    result = []  # For end result
    for pageCounter in range(1, 2):
        # Create url address
        url = 'https://www.amazon.com/Best-Sellers-Books-Biographies/zgbs/books/2' + str(pageCounter)
        # Connect to page
        connect = uRequest(url)
        response = connect.read()
        connect.close()
        # Parse response and grab data
        pRespone = bSoup(response, 'html.parser')
        bookContainer = pRespone.findAll('li', {'class': 'book'})
        booksContent = []
        # Grab data
        for book in bookContainer:
            bookTitle = book.findAll('a', {'class': 'bookTitle'})[0].text
            bookAuthor = book.findAll('a', {'itemprop': 'name'})[0].text
            bookRank = book.findAll("div", {"class": "sprite"})[0].text
            bookStatsBox = book.findAll("div", {"class": "book-stats"})[0].findAll(
                "span", {"class": "font-szary-4a"})
            bookReaders = bookStatsBox[0].text
            bookOpinions = bookStatsBox[1].text
            bookRate = bookStatsBox[2].text
            # Delete reserved characters
            reserved_chars = ('★', '⬈', '⬊', '⬌', '\'', '\"')
            reserved_list = [bookTitle, bookAuthor, bookRank]
            free_list = []
            for element in reserved_list:
                for rChar in reserved_chars:
                    if rChar in element:
                        element = element.replace(rChar, '')
                free_list.append(element)
            # Add to end result
            result.append((free_list[0], free_list[1], free_list[2],
                           bookReaders, bookOpinions, bookRate))
    print('Successful download data from website\n\n')
    return result
def get_company_data(list_of_companies):
    list_of_companies_data = []
    for item in list_of_companies:
        new_request = requests.get(item['url'])
        if new_request.ok:
            company_soup = bSoup(new_request.text, 'lxml')
            name_of_company = company_soup.find('h1').text
            website_of_company = company_soup.find('span', class_='website-company')
            if website_of_company:
                website = website_of_company.find('a').get('href')
            else:
                website = None
            glyphicon_phone = company_soup.find('span', class_='glyphicon-phone')
            if glyphicon_phone:
                tel_contact = glyphicon_phone.find_parent('p')
            else:
                tel_contact = None
            if tel_contact:
                tel_a = tel_contact.find('a')
            else:
                tel_a = None
            if tel_a:
                tel = tel_a.text
            else:
                tel = None
            vacancies_link = []
            job_links = company_soup.find_all('div', class_='job-link')
            for new_item in job_links:
                a_name = 'https://www.work.ua' + new_item.find('h2').find(
                    'a').get('href')
                vacancies_link.append(a_name)
            tels = []
            for vacancy in vacancies_link:
                driver = webdriver.Firefox()
                driver.get(vacancy)
                try:
                    driver.find_element(By.CSS_SELECTOR,
                                        ".link-phone > span").click()
                    tel_a = driver.find_element(By.CSS_SELECTOR,
                                                "#contact-phone").text
                except Exception:
                    tel_a = None
                driver.quit()
                tels.append(tel_a)
            data_company = {
                'name': name_of_company,
                'website': website,
                'phone': tel,
                'tels': tels
            }
            list_of_companies_data.append(data_company)
    return list_of_companies_data
def Musixmatch():
    with uReq.session() as web:
        web.headers["user-agent"] = "Mozilla/5.0"
        url = web.get("https://www.musixmatch.com/search/avici".format(
            urllib.parse.quote))
        data = bSoup(url.content, "html5lib")
        for trackList in data.findAll("ul", {"class": "tracks list"}):
            for urlList in trackList.findAll("a"):
                title = urlList.text
                url = urlList["href"]
                print(title, url)
def GetTrackData( xmlData, MediaType = None ):
    Tracks = []
    xmlInform = bSoup( xmlData, "xml" )
    Keys = ['subtitle', 'audio', 'video']
    Values = ['Text', 'Audio', 'Video']
    TrackTypeDict = dict( zip( Keys, Values ) )
    for Type, SearchKey in TrackTypeDict.items():
        for TrackSoup in xmlInform.find_all( "track", type=SearchKey ):
            Tracks.append( GetTracks( TrackSoup, Type ) )
    Chapters = GetChapters( xmlData )
    for ChapterID, ChapterText in Chapters.items():
        Keys = ['type', 'chapterid', 'chaptertext']
        Values = ['chapter', ChapterID, ChapterText]
        ChapterDict = dict( zip( Keys, Values ) )
        Tracks.append( ChapterDict )
    HandBrakeCLI = HandBrake( ProgArgs, Tracks, MediaType = MediaType, Logger = Logger )
    if ( HandBrakeCLI.ChapterList != False ):
        if ( not ProgArgs.dryrun ):
            WriteChapterFile( HandBrakeCLI.ChapterList )
def PrettyPrint( xmlData ):
    xmlInform = bSoup( xmlData, "xml" )
    print( xmlInform.prettify() )
def PrintXMLData( xmlData ):
    xmlInform = bSoup( xmlData, "xml" )
    InformData = xmlInform.prettify()
    print(InformData)
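# Usage sketch for PrettyPrint / PrintXMLData (illustrative, not from the
# original source): feed them XML text produced by the MediaInfo CLI. The file
# name is a placeholder, and invoking `mediainfo --Output=XML` is an assumption
# about how xmlData was generated in the original project.
import subprocess

xml_data = subprocess.run(
    ["mediainfo", "--Output=XML", "movie.mkv"],
    capture_output=True, text=True, check=True).stdout
PrettyPrint(xml_data)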