def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.ynet.co.il/home/0,7340,L-8,00.html"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "str3s str3s_small str3s_type_small"})
    Titles = soup.find_all('div', {"class": "title"})
    TitlesText = []
    for title in Titles:
        t = title.text
        TitlesText.append(t)
    i = 0
    new_headline_links = []
    for article in Headline.objects.all():
        new_headline_links.append(article.title)
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['src']).split(" ")[0]
        # Stop once we reach a headline that is already stored.
        if TitlesText[i] in new_headline_links:
            break
        if link.find("https") != -1:
            link2 = link
        else:
            link2 = "https://www.ynet.co.il/" + link
        link2 = link2.replace('#autoplay', '')
        articleContent = session.get(link2, verify=False).content
        print(link2)
        soup = BSoup(articleContent, "html.parser")
        new_headline = Headline()
        ok = "פורסם:"  # "Published:" label on the article page
        # header = soup.find_all('div', {"class": "element B3 ghcite noBottomPadding"})[0]
        dates = soup.find_all('span', string=ok)
        print(dates)
        new_headline.date = dates[1].text
        new_headline.title = TitlesText[i]
        new_headline.url = link2
        new_headline.image = image_src
        # if new_headline.date != 'error#':
        #     new_headline.save()
        new_headline.save()
        i = i + 1
    return redirect("../")
def get_query_results(url):
    results = get_page(url)['cargoquery'][0]['title']
    # Convert <br> tags to newlines, strip the remaining markup, then collapse blank lines.
    return {
        k: re.sub(
            '\n+', '\n',
            BSoup(re.sub('<br[^>]*>', '\n', BSoup(results[k], 'lxml').text), 'lxml').text)
        for k in results
    }
def main(postcodes):
    uklogin_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "DeliveryStatus/ukmail_login.json")
    with open(uklogin_path) as ukmail_login_file:
        ukmail_login = json.load(ukmail_login_file)
    login_url = 'https://iconsign.ukmail.com/iconsignv5/Login.aspx'
    with requests.Session() as s:
        # Fetch the login page first to pick up the ASP.NET viewstate token.
        get_viewstate = s.get(login_url)
        get_soup = BSoup(get_viewstate.content, 'html.parser')
        viewstate_login = get_soup.find('input', id='_VIEWSTATE_Login').get('value')
        login_payload = {
            '_VIEWSTATE_Login': viewstate_login,
            '__VIEWSTATE': '',
            'btnLogin': '******',
            'txtUsername': ukmail_login['Username'],
            'txtPassword': ukmail_login['Password']
        }
        r = s.post(login_url, data=login_payload)
        consignment_data = []
        for i in range(30):
            consignments_url = 'https://iconsign.ukmail.com/iconsignv5/FindConsignments.aspx?pn=%d' % (i + 1)
            resp = s.get(consignments_url)
            soup = BSoup(resp.content, 'html.parser')
            consignment_list = soup.find('table', id='ctl00_mainContent_consignmentGridView')
            for consignment in consignment_list.find_all('tr'):
                data = []
                for consignment_td in consignment.find_all('td'):
                    data.append(consignment_td.text)
                for postcode in postcodes:
                    try:
                        if postcode in data:
                            consignment_data.append(data)
                    except Exception:
                        # Skip the non-order <td> element at the start of each table.
                        pass
            if len(consignment_data) == len(postcodes):
                break
        if consignment_data:
            for consignment in consignment_data:
                status = get_status(consignment[6])
                print_html(consignment, status)
def change_img_2(pPokemon):
    urlCmp = 'https://www.pokepedia.fr/' + str(pPokemon)
    requete = rq.get(urlCmp)
    page = requete.content
    soup = BSoup(page, "lxml")
    recup = soup.find("a", {"class": "image"}).contents[0]
    imgPage = 'https://www.pokepedia.fr/Fichier:' + str(recup.attrs['alt'])
    requete = rq.get(imgPage)
    page = requete.content
    soup = BSoup(page, "lxml")
    recup = soup.find("div", {"id": "file"}).contents[0]
    imgURL = 'https://www.pokepedia.fr' + str(recup.attrs['href'])
    return imgURL
def get_definition(word, lang):
    if lang.lower() == 'en':
        definition_url = 'https://www.wordreference.com/definition/' + str(word)
    elif lang.lower() == 'es':
        definition_url = 'https://www.wordreference.com/definicion/' + str(word)
    else:
        return ['Please provide a valid language']
    content = requests.get(definition_url)
    soup = BSoup(content.text, 'html.parser')
    def_result = soup.find_all('ol')
    if def_result:
        results = []
        no = 1
        for search_result in def_result[:2]:
            results.append(str(no))
            listed_elements = search_result.find_all('li')
            for element in listed_elements[:3]:
                def_string = str(element)
                def_string = re.sub(r'<.*?>|\[.*?\]|:.*', '', def_string)
                results.append(def_string.replace('.', '. \n'))
            results.append('\n')
            no += 1
        results.append('From: ' + str(definition_url))
        return results
    else:
        return ['Word not found']
def lookup_cik_ticker(ticker):
    import requests
    import sys
    from bs4 import BeautifulSoup as BSoup
    req = requests.get(
        "https://www.sec.gov/cgi-bin/browse-edgar?CIK={:s}&owner=exclude&action=getcompany&Find=Search"
        .format(ticker.lower()))
    ## Check for errors encountered in trying to get that url.
    try:
        req.raise_for_status()
    except requests.HTTPError:
        print(" -- {}:\n\t\t{}".format(sys.exc_info()[0], req.url))
        return None
    soup = BSoup(req.content, "lxml")
    ## Search for the tag that contains the company name.
    conmTag = soup.find("span", {"class": "companyName"})
    if not conmTag:
        print("Unable to find the company name for ticker {:s}.".format(ticker))
        return None
    ## Search for the a-ref tag that links to "all company filings". Its text contains the CIK.
    atags = soup.findAll("a")
    atagCik = None
    for t in atags:
        if "see all company filings" in t.text:
            atagCik = t
    if not atagCik:
        print("Unable to find the a-ref tag with the CIK for ticker {:s}.".format(ticker))
        return None
    cik = atagCik.text.split(" ")[0]
    conm = conmTag.text.split("CIK")[0].strip()
    return (str(cik), ticker, str(conm))
def post_request(query, session):
    seq_data = {
        'tabtype': 'animalTabPane',
        'historicalDB': '',
        'searchdb': 'COX1',
        'sequence': query
    }
    ## send search request
    r = session.post(
        'https://boldsystems.org/index.php/IDS_IdentificationRequest',
        data=seq_data,
        timeout=300)
    ## extract Top20 table links from the BOLD Result page
    soup = BSoup(r.text, 'html5lib')
    data = soup.find_all('span', style='text-decoration: none')
    data = [
        'http://boldsystems.org' + data[i].get('result')
        for i in range(len(data))
    ]
    ## return the data
    return data
def login(username, password, certificate, remember=False):
    ## start a new html session
    session = requests_html.HTMLSession(verify=certificate)
    ## data to push into the post request
    data = {
        'name': username,
        'password': password,
        'destination': 'MAS_Management_UserConsole',
        'loginType': ''
    }
    ## send a post request to log into boldsystems.org
    session.post('https://boldsystems.org/index.php/Login', data=data)
    ## test if the login was successful
    url = session.get('https://boldsystems.org/')
    soup = BSoup(url.text, 'html.parser')
    content = soup.find(class_='site-navigation nav navbar-nav')
    tags = content.find_all('a')
    if tags[5].text != 'Log out':
        sg.popup('Unable to login.\nPlease check your userdata.')
    else:
        sg.popup('Login successful.')
        ## save userdata only if login is successful and mark is set
        if remember:
            userdata = {"username": username, "password": password}
            abs_path = os.path.dirname(__file__)
            rel_path = os.path.join(abs_path, 'data/userdata')
            json.dump(userdata, open(rel_path, 'w'))
    ## return the session, not necessary for this check but
    ## useful if you want to do other things with the login
    return session
def req_pkg_details(url, info, proxy_url):
    try:
        headers['User-Agent'] = random.choice(user_agents)
        time.sleep(3)
        r = requests.get(url, headers=headers, proxies=proxy_url)
        soup = BSoup(r.text, 'html.parser')
        p_tags = soup.find_all(
            'p', attrs={'class': compile('fw6 mb3 mt2 truncate black-80 f4')})
        ul_tag = soup.find('ul', attrs={'class': 'list pl0 cf'})
        repo_link = p_tags[3].a['href']
        repo_api_link = urljoin(github_api, urlparse(repo_link).path)
        pkg_version = p_tags[0].text
        pkg_license = p_tags[1].text
        pkg_homepage = p_tags[2].a['href']
        pkg_repo = {'main': repo_link, 'api': repo_api_link}
        pkg_collaborator = [
            handle_author_info(a['href']) for a in ul_tag.find_all('a')
        ]
        pkg_last_update = soup.find('time').text
        pkg_name = info['pkg_name']
        logger.info('StatusCode:(' + str(r.status_code) + ') ' + 'Package: ' +
                    pkg_name + ' --- ' + url)
        # Persist the scraped package details to MySQL.
        save_to_mysql(pkg_name, url, pkg_version, dumps(info['pkg_author']),
                      pkg_license, pkg_homepage, pkg_last_update,
                      dumps(info['pkg_judge']), dumps(pkg_collaborator),
                      dumps(pkg_repo))
    except BaseException as e:
        logger.error('Error(150):' + str(e))
        return
    return
def new_mark():
    form = MarkForm()
    if form.validate_on_submit():
        if g.user.murl(form.url.data):
            flash('Mark with this url ({}) already exists'.format(form.url.data),
                  category='error')
            return redirect(url_for('marks'))
        m = Mark()
        form.populate_obj(m)
        m.owner_id = g.user.id
        m.created = datetime.utcnow()
        if form.tags.data:
            m.tags = ' '.join(
                [t.strip() for t in form.tags.data.strip().split(',')]).lower()
        m.clicks = 0
        if not form.title.data:
            # Fall back to the page <title> when no title was supplied.
            soup = BSoup(urlopen(form.url.data), 'html.parser')
            m.title = soup.title.string
        db.session.add(m)
        db.session.commit()
        flash('New mark {} added'.format(m.title), category='info')
        return redirect(url_for('marks'))
    if request.args.get('url'):
        form.url.data = request.args.get('url')
    if request.args.get('title'):
        form.title.data = request.args.get('title')
    if request.args.get('type') == 'feed':
        form.type.data = 'feed'
    return render_template('mark/new.html', title='New mark', form=form)
def scrape_article(request):
    if request.method == 'POST':
        url = request.POST['URL']
        session = requests.Session()
        session.headers = {
            "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
        }
        content = session.get(url, verify=True).content
        soup = BSoup(content, "html.parser")
        article = soup.find('meta', {"name": "description"})['content']
        title = soup.title.string
        images = soup.find('img')
        print("IMAGES ", images)
        if images is not None:
            if images.has_attr('data-srcset'):
                txt = str(soup.find('img')['data-srcset'])
                img_list = re.findall("(?<=https).*?(?=jpg)", txt)
                img_list = 'https' + img_list[1] + 'jpg'
                print('image link: ', img_list)
        print('article', article)
        print('title', title)
    return redirect("news")
def get_ip_lists_from_66():
    r = requests.get(initial_url)
    soup = BSoup(r.text, 'html.parser')
    c = soup.find_all('a', href=re.compile('areaindex'))
    for alink in c:
        c_link = urljoin(initial_url, alink.get('href'))
        get_details(c_link)
def __get_redemption_form(self, code, platform):
    """Get form data for code redemption."""
    the_url = "{}/code_redemptions/new".format(base_url)
    status_code, token = self.__get_token(the_url)
    if not token:
        _L.debug("no token")
        return False, status_code, "Could not retrieve Token"
    r = self.client.get("{base_url}/entitlement_offer_codes?code={code}"
                        .format(base_url=base_url, **locals()),
                        headers=json_headers(token))
    _L.debug("{} {} {}".format(r.request.method, r.url, r.status_code))
    soup = BSoup(r.text, "html.parser")
    if not soup.find("form", class_="new_archway_code_redemption"):
        return False, r.status_code, r.text.strip()
    inp = soup.find_all("input", attrs=dict(name="authenticity_token"))
    form_code = soup.find_all(id="archway_code_redemption_code")
    check = soup.find_all(id="archway_code_redemption_check")
    service = soup.find_all(id="archway_code_redemption_service")
    ind = None
    for i, s in enumerate(service):
        if platform in s["value"]:
            ind = i
            break
    if ind is None:
        return False, r.status_code, "This code is not available for your platform"
    form_data = {"authenticity_token": inp[ind]["value"],
                 "archway_code_redemption[code]": form_code[ind]["value"],
                 "archway_code_redemption[check]": check[ind]["value"],
                 "archway_code_redemption[service]": service[ind]["value"]}
    return True, r.status_code, form_data
def get_retried_jobs(self, workflow):
    retry_workflow_search_url = f"https://app.cloudsnap.com/workflow_instances?utf8=%E2%9C%93&utf8=%E2%9C%93&workflow_id={workflow.workflow_option}&q%5Bc%5D%5B0%5D%5Ba%5D%5B0%5D%5Bname%5D=relaunched&q%5Bc%5D%5B0%5D%5Bp%5D=true&q%5Bc%5D%5B0%5D%5Bv%5D%5B0%5D%5Bvalue%5D=true"
    self.get_by_url(retry_workflow_search_url)
    bs_obj = BSoup(self.driver.page_source, "html.parser")
    try:
        rows = bs_obj.find_all("table")[0].find("tbody").find_all("tr")
        for row in rows:
            cells = row.find_all("td")
            if len(cells) == 0:
                continue
            job_name = cells[0].get_text()
            entity_name = job_name[job_name.find("(") + 1:job_name.find(")")]
            string_datetime = time.strptime(cells[1].get_text(), "%m/%d/%Y %H:%M:%S %Z")
            job_date = datetime.datetime.fromtimestamp(time.mktime(string_datetime))
            week = datetime.datetime.now() - datetime.timedelta(days=7)
            if job_date < week:
                continue
            job_state = "Retried"
            workflow_url = ("https://app.cloudsnap.com" +
                            cells[0].find_all("a", href=True)[0]["href"])
            retry = RetryJobs()
            retry.timestamp = job_date
            retry.url = workflow_url
            retry.state = job_state
            retry.name = job_name
            retry.entity = entity_name
            db.session.add(retry)
            db.session.commit()
        return StepFailure.query.all()
    except Exception as e:
        print(e)
        return None
def _parse_personal_info(self, personal_info, dic):
    soup = BSoup(personal_info, "lxml")
    # Map table labels to the target keys in the result dict.
    test_labels = {
        'IELTS:': 'IELTS', 'TOEFL:': 'TOEFL', 'GRE:': 'GRE', 'SAT:': 'SAT',
        'GMAT:': 'GMAT', 'ACT:': 'ACT', 'LSAT:': 'LSAT', 'MCAT:': 'MCAT',
        'sub:': 'sub'
    }
    other_labels = {
        '本科专业:': 'current_major',
        '其他说明:': 'notes',
        '本科学校档次:': 'current_school',
        '本科成绩和算法、排名:': 'gpa'
    }
    for th, td in zip(soup.find_all('th'), soup.find_all('td')):
        label = th.get_text()
        value = td.get_text().strip()
        if label in test_labels:
            dic['test_score'][test_labels[label]] = value
        elif label in other_labels:
            dic[other_labels[label]] = value
    return dic
def get_sensors_status(config, driver):
    sensors = list()
    sensor_link = driver.find_element_by_css_selector('a#nav-link-sensors')
    logger.info('Visiting sensors page link...')
    driver.get(sensor_link.get_attribute('href'))
    logger.info('Waiting for sensors page...')
    res = wait_for_element(driver, '#table-sensors-list', timeout=15)
    if res and res.get('exit'):
        message = 'Giving up after 3rd retry of waiting for element.'
        logger.info(message)
        return sensors
    logger.info('Finding disconnected sensors from page...')
    soup = BSoup(driver.page_source, 'html.parser')
    for row in soup.select('#table-sensors-list tr.result-row'):
        _id = row.get('id').replace('result-row-', '')
        text = row.select('.result-column-sensor-status')[0] \
            .get_text().strip('\n').split('\n')[0].strip()
        ip = row.select('.result-column-sensor-ip')[0] \
            .get_text().strip('\n').split('\n')[0].strip()
        name = row.select('.result-column-sensor-name')[0] \
            .get_text().strip('\n').split('\n')[0]
        sensors.append({'id': _id, 'name': name, 'text': text, 'ip': ip})
    return sensors
def fetch_depts():
    """
    Fetch the list of departments from the registrar site.

    :return: list of (abbreviation, name) tuples for departments at UT Austin
    :rtype: list[tuple[str, str]]
    """
    c_html = fetch_html('https://registrar.utexas.edu/staff/fos')
    if c_html is None:
        return []
    c_soup = BSoup(c_html, "html.parser")
    dept_dl_group = c_soup.find("div", {"class": "field body"}).findAll("dl")
    dept_abrs = [dt.text.strip() for dl in dept_dl_group for dt in dl.findAll("dt")]
    dept_names = [
        dd.text.strip().replace('-', ' ')
        for dl in dept_dl_group
        for dd in dl.findAll("dd")
    ]
    dept_names = [titlecase(name) for name in dept_names]
    if len(dept_abrs) != len(dept_names):
        # Number of abbreviations does not match number of names; abort the fetch.
        return None
    depts = [(dept_abrs[i], dept_names[i]) for i in range(len(dept_abrs))]
    return depts
def fravega_check(url):
    bs_obj = BSoup(requests.get(url).content, "lxml")
    quantity = int(
        bs_obj.find("h4", {"class": "even"}).text.split(" ")[1].strip(string.punctuation))
    return quantity
def fetch_prof_info(depts, sem="spring", year=2020):
    f_profs = []
    # fetching courses for each department
    for dept in depts:
        c_html = fetch_html(get_course_url(sem=sem, year=year, dept=dept))
        if c_html is not None:
            c_soup = BSoup(c_html, "html.parser")
            courses = c_soup.findAll("tr", {"class": ["tboff", "tbon"]})
            # fetching information for each course in the department
            for course in courses:
                info = course.findAll("td")
                my_info = collapse_prof_info(info, profs=f_profs)
                if my_info is not None:
                    f_profs.append(my_info)
    return f_profs
def get_apt_info(htmlpage):
    apt = {}
    soup = BSoup(htmlpage.text, 'html.parser')
    res = soup.find('div', {'class': 'col-md-4 detalhes-imovel'})
    infos = res.find_all('p')
    apt_code = None
    for i in infos:
        text = i.text.strip()
        stext = text.split()
        first_part = stext[0]
        sec_part = ' '.join(stext[1:])
        if 'Código' in text:
            apt_code = sec_part
        elif 'R$' in text:
            apt['preco'] = float(sec_part.replace(',', '.'))
        elif 'Finalidade' in text:
            apt['finalidade'] = sec_part
        elif 'Tipo' in text:
            apt['tipo'] = sec_part
        elif 'Bairro' in text:
            apt['bairro'] = sec_part
        elif 'Dorm' in text:
            apt['Dormitorios'] = find_value(first_part)
        elif 'Cozinha' in text:
            apt['Cozinha'] = find_value(first_part)
        elif 'Lavanderia' in text:
            apt['Lavanderia'] = find_value(first_part)
    return apt_code, apt
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=True).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "sc-1pw4fyi-7 gDJTEP js_post_item"})
    for article in News:
        main = article.find_all('a')[0]
        title = article.find_all('h4')[0]
        link = main['href']
        images = main.find('img')
        if images is not None:
            if images.has_attr('srcset'):
                # print(images)
                image_src = str(main.find('img')['srcset']).split(".jpg")[0]
                print('title: ', title.text)
                print('link: ', link)
                titlet = str(title.text)
                image_src = image_src + '.jpg'
                print('image_src: ', image_src)
                if link is not None and image_src is not None and title is not None:
                    new_headline = Headline(title=titlet, image=image_src, url=link)
                    new_headline.save()
    return redirect('news')
def get_proxy(self, ind=True):
    if not ind:
        return None
    exe_path = r"/usr/local/bin/chromedriver"
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(executable_path=exe_path, chrome_options=chrome_options)
    driver.get('https://free-proxy-list.net/')
    # driver.find_element_by_xpath("//*[@class='ui-state-default']//*[text()='US']").click()
    driver.find_element_by_xpath("//*[@class='ui-state-default']//*[text()='anonymous']").click()
    driver.find_element_by_xpath("//*[@class='hx ui-state-default']//*[text()='yes']").click()
    html = driver.page_source
    bs_obj = BSoup(html, 'html.parser')
    rows = bs_obj.find_all('table')[0].find('tbody').find_all('tr')
    trans = []
    for row in rows:
        t = []
        cells = row.find_all('td')
        for i in range(5):
            t.append(cells[i].get_text())
        trans.append(t)
    df_trans = pd.DataFrame(trans)
    PROXIES = {}
    PROXIES['http'] = PROXIES['https'] = 'http://%s:%s' % (df_trans.iloc[0, 0], df_trans.iloc[0, 1])
    driver.close()
    print(PROXIES)
    return PROXIES
def garbarino_check(url):
    bs_obj = BSoup(requests.get(url).content, "lxml")
    quantity = int(
        bs_obj.find("li", {"class": "breadcrumb-item--active"})
        .span.text.strip(string.punctuation).split(" ")[0])
    return quantity
def get_curr_trans(self, exchange_add):
    logger.info('get exchange transfers for %s' % exchange_add)
    bs_obj = BSoup(requests.get(exchange_add, proxies=self.PROXIES).content, 'html.parser')
    rows = bs_obj.find_all('table')[0].find('tbody').find_all('tr')
    trans = []
    for row in rows:
        cells = row.find_all('td')
        block = cells[1].get_text()
        age = cells[2].get_text()
        fromadd = cells[3].get_text()
        toadd = cells[5].get_text()
        val = cells[6].get_text()
        trans.append([block, age, fromadd, toadd, val])
    df_trans = pd.DataFrame(trans)
    df_trans.columns = ['block', 'age', 'from', 'to', 'value']
    df_trans['time'] = df_trans['age'].apply(
        lambda x: datetime.today() - self.get_time(x, timedelay=[0, 0, 0]))
    df_trans['value'] = df_trans['value'].apply(lambda x: x.split(' Ether')[0])
    df_trans['value'] = df_trans['value'].apply(pd.to_numeric, errors='coerce')
    return df_trans
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "js_post_item"})
    for article in News:
        title = article.find_all('a', {"class": "js_link"})[-1].text
        link = article.find("a", {"class": "js_link"}).attrs["href"]
        image_src = article.find("a", {"class": "js_link"}).find("img")
        if image_src:
            try:
                image_src = image_src.attrs["srcset"]
                image_src = image_src.split(" ")[-4]
            except (KeyError, IndexError):
                try:
                    image_src = image_src.attrs["data-expanded-srcset"]
                    image_src = image_src.split(" ")[-4]
                except (KeyError, IndexError):
                    continue
        else:
            continue
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://metrowatch.com.pk/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all(
        'div', {
            "class": "pb-5 mb-10 block-post-smallrow col-lg-7 col-md-7 col-sm-16 col-xs-16 pl-0 pr-0"
        })
    # News = soup.find_all('li')
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = article.find_all('a')[1]
        # Only store headlines that are not already in the database;
        # .filter().exists() avoids the DoesNotExist raised by .get() on a miss.
        if not Headline.objects.filter(url=link).exists():
            new_headline = Headline()
            new_headline.title = title.text
            new_headline.url = link
            new_headline.image = image_src
            new_headline.save()
    messages.info(request, "Your records were saved successfully")
    return redirect("../")
def return_reading(hexagram_number, hex_cast_number):
    global hex1_lines
    global hex2_lines
    global hex1_url
    global hex2_url
    res = get(f'https://divination.com/iching/lookup/{hexagram_number}-2/')
    soup = BSoup(res.text, 'html.parser')
    hex_name = soup.select('.entry-header > h1')[0].text
    text = soup.select('.entry-content > p')
    moving_lines_head = soup.select('.movinglines > h4')
    moving_lines_body = soup.select('.movinglines > p')
    for i in hexagrams[hexagram_number]:
        hex1_lines.append(print_line(line=i))
    if hex_cast_number == 1:
        hex1_url = res.url
        print(f'{hex_name} \n')
        for i in text:
            print(f'{i.text} \n\n')
        for head, body in list(zip(moving_lines_head, moving_lines_body))[::-1]:
            print(f'\t{head.text}')
            print(f'{body.text}')
            print('\n')
    else:
        print(f'{hex_name} \n')
        for i in text:
            print(f'{i.text} \n')
        hex2_url = res.url
        for i in hexagrams[hexagram_number]:
            hex2_lines.append(print_line(line=i))
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "sc-1pw4fyi-5 RkwFH"})
    # Fallback image: reuse the previous article's image when one is missing.
    # Initialized to '' so the first article without an image does not raise NameError.
    temp = ''
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        title = article.find('h4', {"class": "sc-1qoge05-0 eoIfRA"}).text
        News3 = article.find('img', {"class": "dv4r5q-2 iaqrWM"})
        if News3 is None:
            image_src = temp
        else:
            image_src = News3['srcset'].split(' ')[0]
        temp = image_src
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
def __init__(self, url, encoding='utf-8', ip=None, timeout=8):
    print(url)
    self.url = url
    self.page = requests.get(url)
    self.page.encoding = encoding
    self.html = self.page.text
    self.bsObj = BSoup(self.html, 'html.parser')
def extract_kqed_info(self, **kwargs):
    request = requests.get(
        "https://projects-api.kqed.org/posts/news?&page[size]=100&page[from]=0")
    content = json.loads(request.text)
    info = {}
    for post in content["data"]:
        info[post["attributes"]["disqusUrl"]] = {}
        cur_dict = info[post["attributes"]["disqusUrl"]]
        cur_dict["date"] = datetime.date.today()  # need to convert from epoch
        cur_dict["source"] = "KQED"
        cur_dict["source_url"] = "https://www.kqed.org/"
        cur_dict["status_code"] = request.status_code
        cur_dict["title"] = post["attributes"]["title"]
        # cur_dict["author"] = "unknown"
        result = self.browser.open(post["attributes"]["disqusUrl"])
        page = self.browser.get_current_page()
        # author = page.find("span", {"class": "src-routes-Site-routes-Post-components-Post-___Post__post_Author___3vn-d"})
        # if author is not None:
        #     cur_dict["author"] = author.text
        cur_dict["media"] = post["attributes"].get("nprAudio", None)
        text = post["attributes"].get("content", None)
        soup = BSoup(text, "lxml")
        text = soup.get_text()
        cur_dict["text"] = text.encode('ascii', 'ignore').decode("utf-8")
        cur_dict["labels"] = get_labels(cur_dict["text"], cur_dict["title"])
    return info