def scrape_online_delv(pageNo):
    # requests.get takes the url and returns the response containing the HTML
    r = requests.get(
        "https://www.zomato.com/ncr/west-delhi-order-online?page=%d" % pageNo,
        cookies=cookie_jar,
        headers=headersUA)
    soup = BS(r.text, "html.parser")
    my_divs = soup.find_all("div", {"class": "search-o2-card"})
    for div in my_divs:
        # name of the restaurant
        rstName = div.findChildren(
            "a", {"class": "result-order-flow-title"})[0].text.strip()
        # link to the restaurant's page on Zomato
        # re.compile helps in pattern matching
        link = div.findChildren("a", attrs={'href': re.compile("^https://")})
        rstLink = link[0].get('href')
        # rating of the restaurant
        if div.findChildren("span", {"class": "rating-value"}):
            rstRating = div.findChildren(
                "span", {"class": "rating-value"})[0].text.strip()
            rstRating = float(rstRating)
        else:
            rstRating = 0.0
        # category of the restaurant
        rstCatg = div.findChildren("div", {"class": "grey-text"})[0].text.strip()
        # find the offers available
        rstOffer = "No Offer"
        rstOfferValue = 0
        if div.findChildren("span", {"class": "offer-text"}):
            rstOffer = div.findChildren(
                "span", {"class": "offer-text"})[0].text.strip()
            # if u"\u20b9" in rstOffer:
            #     rstOfferValue = rstOffer[rstOffer.index(u"\u20b9") + 1:rstOffer.index(" ")]
            if "%" in rstOffer:
                rstOfferValue = int((rstOffer[0:rstOffer.index("%")]).strip())
        # call the helper to calculate the restaurant score
        rstScore = scorecal(rstRating, rstOfferValue)
        # print(rstScore)
        rstInfo = dict()
        rstInfo['rstName'] = rstName
        rstInfo['rstRating'] = rstRating
        rstInfo['rstCatg'] = rstCatg
        rstInfo['rstOffer'] = rstOffer
        rstInfo['rstScore'] = rstScore
        rstInfo['rstLink'] = rstLink
        allRest.append(rstInfo)
    sortedAllRest = sorted(allRest, key=lambda i: i['rstScore'], reverse=True)
    return sortedAllRest
print("-------------") with open('grayscale_scraping.csv', mode='w', newline='') as output_file: file_writer = csv.writer(output_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) file_writer.writerow([ 'asset_name', 'aum', 'shares', 'asset_per_share', 'holdings_per_share', 'market_per_share' ]) for asset in urls: print(asset[0]) response = scraper.get(asset[1]).text table = BS(response, "html.parser") overviewdata = table.find("table", {"class": "overview-data"}) if (asset[0] == "ETC"): #AUM aum = overviewdata.findAll("tr")[9] aum = aum.findAll("td")[1].text aum = aum.replace("*", "") aum = aum.replace("‡", "") print("AUM: " + aum) #sharesoutstanding shares = overviewdata.findAll("tr")[10] shares = shares.findAll("td")[1].text shares = shares.replace("*", "") shares = shares.replace("‡", "")
reload(sys)
sys.setdefaultencoding("utf-8")


def change_code(sentence):
    s_list = sentence.split(" ")
    s_list = [unicode(x) for x in s_list]
    return " ".join(s_list)


data_file = sys.argv[1]
# data_file = "IndianHistory"
os.system('java -cp "../stanford-corenlp-full-2018-02-27/*" -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -file ../dataset/' + data_file + '.txt')
# exit(0)
xml_path = "./" + data_file + ".txt.xml"
xml_fd = open(xml_path, "r")
xml_file = xml_fd.read()
xml_soup = BS(xml_file, 'lxml')
data_fd = io.open("../dataset/" + data_file + ".txt", encoding='utf-8')
data_lines = data_fd.readlines()
# new_data
# print data_lines[0]
data_fd.close()
for x in xml_soup.find_all('coreference'):
    structure_list = list(x.children)
    for s_items in structure_list:
        if s_items == "\n":
            continue
        else:
            mention_list = []
            s_items_list = list(s_items.children)
            # print len(s_items_list)
def spider():
    browser = Browser()
    browser.visit('http://www.baidu.com')
    browser.execute_script(
        "window.location.href = 'http://bf.310v.com/3.html'")
    time.sleep(10)
    while True:
        import config
        reload(config)
        soup = BS(browser.html, 'html5lib')
        table = soup.select('table#idt')[0]
        a3_trs = table.find_all('tr', class_='a3')
        a4_trs = table.find_all('tr', class_='a4')
        a3_trs.extend(a4_trs)
        for tr in a3_trs:
            # skip hidden rows (style='display: none') and ad rows
            if (not tr.has_attr('style')) and tr['id'].find('ad') == -1:
                time_td_text = tr.find_all('td')[3].get_text()  # td holding the match time
                match_id = tr['id']
                end_score = tr.find_all('td')[5].get_text()
                middle_score = tr.find_all('td')[7].get_text()
                match_news = News.objects.filter(match_id=match_id)
                if match_news:
                    if time_td_text.find(u'完') > -1:
                        for match_new in match_news:
                            match_new.end_score = end_score
                            match_new.middle_score = middle_score
                            match_new.save()
                    if time_td_text.find(u'中') > -1:
                        for match_new in match_news:
                            match_new.middle_score = middle_score
                            match_new.save()
                if re.match(r'\d+', time_td_text) and int(time_td_text) < config.STATUS_TIME:
                    num1_td = tr.find_all('td')[9]
                    num2_td = tr.find_all('td')[11]
                    yapan1 = num1_td.find_all('div')[0].get_text()
                    yapan2 = num2_td.find_all('div')[0].get_text()
                    daxiaopan1 = num1_td.find_all('div')[1].get_text()
                    daxiaopan2 = num2_td.find_all('div')[1].get_text()
                    tds = tr.find_all('td')
                    ftype = tds[1].find('font').get_text()  # match type
                    gamestarttime = tds[2].get_text()
                    gamestatus = time_td_text
                    team1 = tds[4].find_all('font')[2].get_text()
                    score = tds[5].get_text()
                    team2 = tds[6].find_all('font')[0].get_text()
                    halfscore = tds[7].get_text()
                    yapanSB = re.sub(r'\s', '', tds[10].find_all('div')[0].text)
                    daxiaopanSB = tds[10].find_all('div')[1].text
                    same_match_sep = datetime.datetime.now() - datetime.timedelta(
                        seconds=config.SAME_MATCH_SEP_TIME)
                    matchs = News.objects.filter(score=score).filter(
                        team1=team1).filter(team2=team2).filter(
                            create_time__gte=same_match_sep)
                    # print team1, team2, score, halfscore
                    for each in config.YAPAN:
                        if yapan1 == each.split('-')[0] and yapan2 == each.split('-')[1]:
                            # print each, yapan1, yapan2
                            if score != '0-0' and halfscore != '0-0' and len(
                                    matchs.filter(findex=each)) == 0:
                                try:
                                    winsound.PlaySound('nokia.wav',
                                                       winsound.SND_PURGE)
                                except:
                                    pass
                                news = News.objects.create(
                                    match_type=ftype,
                                    game_start_time=gamestarttime,
                                    status=gamestatus,
                                    team1=team1,
                                    team2=team2,
                                    half_score=halfscore,
                                    score=score,
                                    yapan=yapan1 + '-' + yapan2,
                                    daxiaopan=daxiaopan1 + '-' + daxiaopan2,
                                    findex=each,
                                    match_id=match_id,
                                    yapanSB=yapanSB,
                                    daxiaopanSB=daxiaopanSB)
                                news.save()
                    for each in config.DAXIAOPAN:
                        if daxiaopan1 == each.split('-')[0] and daxiaopan2 == each.split('-')[1]:
                            # print each, daxiaopan1, daxiaopan2
                            if score != '0-0' and halfscore != '0-0' and len(
                                    matchs.filter(findex=each)) == 0:
                                try:
                                    winsound.PlaySound('nokia.wav',
                                                       winsound.SND_PURGE)
                                except:
                                    pass
                                news = News.objects.create(
                                    match_type=ftype,
                                    game_start_time=gamestarttime,
                                    status=gamestatus,
                                    team1=team1,
                                    team2=team2,
                                    half_score=halfscore,
                                    score=score,
                                    yapan=yapan1 + '-' + yapan2,
                                    daxiaopan=daxiaopan1 + '-' + daxiaopan2,
                                    findex=each,
                                    match_id=match_id,
                                    yapanSB=yapanSB,
                                    daxiaopanSB=daxiaopanSB)
                                news.save()
        time.sleep(config.SPIDER_SEP_TIME)
utterance will begin with NO NON-DOM. If there is non-dominant hand gloss in the
utterance there will be **NON-DOM** followed by the non-dominant hand gloss."""

from bs4 import BeautifulSoup as BS
import re

# Write the path location where ncslgr-xml is saved on your local machine
partial_path = """<write the path to "ncslgr-xml">"""

dominant_only_gloss = ()
dominant_and_non_dominant_gloss = ()

with open(partial_path + r'\football.xml', 'r') as f_IN:
    # Write the path to the file you want to save the output to
    with open("""Path name to file output""", 'a') as f_OUT_utts:
        soup = BS(f_IN.read(), 'xml')
        for utterance_tag in soup.find_all('UTTERANCES'):
            for utterance_tags in utterance_tag.find_all('UTTERANCE'):
                if utterance_tags.find_all('TRACK', {'FID': '10001'}):
                    for dominant_track_tags in utterance_tags.find_all(
                            'TRACK', {'FID': '10000'}):
                        for dominant_a_tags in dominant_track_tags.find_all('A'):
                            if dominant_a_tags.has_attr('VID'):
                                dominant_a_tags.decompose()
                    for non_dominant_track_tags in utterance_tags.find_all(
                            'TRACK', {'FID': '10001'}):
                        for non_dominant_a_tags in non_dominant_track_tags.find_all('A'):
                            if non_dominant_a_tags.has_attr('VID'):
if index != 0:
    wfundid, name = nameStr.split(',')
    if '-' in wfundid:
        wfundDict[wfundid] = name

print('MMA境外基金數:{}'.format(len(wfundDict)))  # number of MMA offshore funds
print('==================' * 2)

## Domestic funds ##
fundidsList = list(fundDict.keys())

## Fetch domestic fund basic info / manager history / holdings (individual stocks / categories) ##
url_domestic_base = 'http://mmafund.sinopac.com/w/wr/'
for no, fundid in enumerate(fundidsList):
    html_domestic_info = requests.get(url_domestic_base + 'wr01.djhtm?a=' + fundid).text
    soup_domestic_info = BS(html_domestic_info, "lxml")
    html_domestic_stock = requests.get(url_domestic_base + 'wr04.djhtm?a=' + fundid).text
    soup_domestic_stock = BS(html_domestic_stock, "lxml")

    fundInfo_domestic = getFundBasicInfo(soup_domestic_info)
    fundManager_domestic = getFundManager(soup_domestic_info)
    fundStock_domestic = getDomesticStockHolding(soup_domestic_stock)
    fundShare_domestic = getDomesticShareHolding(html_domestic_stock)

    dictToDb(fundInfo_domestic, '[MMA國內基金基本資料]', con)
    dictToDb(fundManager_domestic, '[MMA國內基金歷任經理人]', con)
    dictToDb(fundStock_domestic, '[MMA國內基金持股狀況_個股]', con)
def getForeignShareHolding(html_text_wb):
    """Get offshore share-holding data (pie chart).

    The MMA page embeds the data inside JavaScript in the HTML, so it is
    extracted from the raw text rather than from parsed tags.
    params html_text_wb : raw text (str)
    return : list of defaultdict
    """

    def getShareHoldingTable(stockGroupList):
        """Convert the offshore pie-chart data (getForeignShareHolding) into a
        dict layout that pandas.DataFrame can consume directly."""
        stockGroup = defaultdict(list)
        for index, (k, v) in enumerate(stockGroupList):
            if index > 1:
                stockGroup['項目'].append(k)
                stockGroup['投資金額(美元:萬)'].append(v)
            else:
                stockGroup[k] = v
        return stockGroup

    soup = BS(html_text_wb, "lxml")
    date_temp = soup.select('.wfb1ar')
    if date_temp:
        # data update date
        update_date = '/'.join(
            re.findall(r"\d+", soup.select('.wfb1ar')[0].text))
    ### fundid ###
    fundid = re.findall(
        r"(?:a=)(.+)",
        soup.select('#itemTab')[0].find('a').get('href'))[0]
    fundid = fundid.strip()
    ###############
    string1 = 'DJGraphObj1'
    # slice out the target string
    target_text = html_text_wb[html_text_wb.index(string1):]
    pat1 = r"(?:\'Title\':)(.+\')(?:])"
    investTitle = re.findall(pat1, target_text)
    # fetch and split the tables
    pat2 = r"(?:\')(.*?)(?:\')"  # strings enclosed in single quotes
    pat3 = r"(?:\'PieV\':)(.+)"  # string following 'PieV'
    # investTitleByStock = re.findall(pat2, investTitle[0])  # titles by industry (list)
    table = defaultdict(list)
    tableAns = []
    # pdb.set_trace()
    for index, titleText in enumerate(investTitle):
        titleList = re.findall(pat2, titleText)
        if len(titleList) == 1:
            continue
        colname = titleList[0]
        titleList = titleList[1:]
        titleList.insert(0, 'fundid')
        titleList.insert(1, '資料日期')
        valueList = re.findall(pat2, re.findall(pat3, target_text)[index])
        valueList.insert(0, fundid)
        valueList.insert(1, update_date)
        table[colname] = list(zip(titleList, valueList))
        # typeName = ['持有類股','區域','產業']  # no longer used; had a bug before
        share_Holding_Dict = getShareHoldingTable(table[colname])
        share_Holding_Dict['分類'] = re.findall(r"產業|持有類股|區域", colname)[0]
        tableAns.append(share_Holding_Dict)
    return tableAns
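# Note: the docstring above says each returned defaultdict is laid out so that
# pandas.DataFrame can consume it directly. A minimal sketch of that step,
# assuming pandas is installed; `tableAns_example` is a hypothetical return
# value, not data taken from the original script.
import pandas as pd

tableAns_example = [
    {'fundid': 'ABC123', '資料日期': '2020/06/30', '分類': '區域',
     '項目': ['北美', '歐洲'], '投資金額(美元:萬)': ['1200', '800']},
]
frames = [pd.DataFrame(d) for d in tableAns_example]  # one frame per pie chart
combined = pd.concat(frames, ignore_index=True)  # scalar fields are broadcast across rows
print(combined)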
import requests
from bs4 import BeautifulSoup as BS

search = input().split()
name = ''
for i in search:
    name = name + '+' + i
name = name[1:]

html_code = requests.get('https://www.citilink.ru/search/?text=' + name).text
soup = BS(html_code, 'lxml')
page = soup.find('div', {'class': "main_content_wrapper search"})
items = page.find('div', {'class': 'main_content_inner'})
items = items.find(
    'div', {
        'class':
        'block_data__gtm-js block_data__pageevents-js listing_block_data__pageevents-js'
    })
for item in items.findAll('div', {'class': 'subcategory-product-item__body'}):
    title = item.find('span', {'class': 'h3'})
    print(title.a.get('title'))
    print(title.a.get('href'))
    Text = item.find('p', {'class': 'short_description'}).text
    print(Text)
import requests
from bs4 import BeautifulSoup as BS

result = requests.get("http://midas.iiitd.com/")
src = result.content
soup = BS(src, 'lxml')
print(soup.find_all('img'))
def search(sess, load_time, pay_load, cookie, root_path, resourses_path,
           local_path, chrome=None, page=1):
    page_dir = local_path + "/page_" + str(page)
    if not os.path.exists(page_dir):
        os.mkdir(page_dir)
    try_times = 5
    flag = False
    try_num = 0
    while try_times != 0:
        load_page_flag = True
        page_error_count = 5
        while load_page_flag:
            try:
                indexes = sess.post(resourses_path, data=pay_load, cookies=cookie)
                load_page_flag = False
                page_error_count -= 1
            except Exception as e:
                load_page_flag = True
                if page_error_count == 0:
                    return None
                print("Error:", e)
        connect_error = "connect()连接127.0.0.1:6600失败,错误号:10061."
        html_contents = indexes.text
        # print(html_contents)
        if html_contents.find(connect_error) != -1:
            # the page reports a failed back-end connection, wait and retry
            try_num += 1
            try_times -= 1
            print("Can't connect server, try ", try_num, "time(s).")
            time.sleep(30)
        else:
            flag = True
            break
    if not flag:
        print("Wait some minutes, try again.")
        return None
    time.sleep(load_page_time)
    soup = BS(indexes.text, "html.parser")
    soup.prettify()
    numbers = None
    if page == 1:
        pdf_numbers_info = soup.find("div", class_="search_gs")
        pdf_numbers_pattern = "([0-9]*)篇;"
        pdf_numbers = re.findall(pdf_numbers_pattern, str(pdf_numbers_info))
        pdf_numbers = pdf_numbers[0]
        if pdf_numbers != '':
            numbers = int(pdf_numbers)
            print("Total found ", numbers, "files")
        else:
            return None
    # 3. parse the current page
    pattern = r"downpaper\('(.*)'\);return false"
    results = soup.find_all("span", class_="down")
    print("Download page ", page, ".")
    for i, raw_link in enumerate(results):
        a = raw_link.find_all("a")
        if len(a) == 1:
            print("============================================")
            print("Current file doesn't have a download link.")
            print("")
            print("")
            continue
        link = re.findall(pattern, str(a[1]))
        download_url_website = link[0].replace("&amp;", "&")
        url = root_path + download_url_website
        pattern_title = "(&T)=(.*?)$"
        pattern_title_obj = re.compile(pattern_title)
        raw_title = re.findall(pattern_title_obj, url)[0][1]
        filename = parse.unquote(raw_title)
        filename = filename.replace("/", "")
        print("============================================")
        print("Preparing downloading: ", filename)
        file_path = page_dir + "/" + filename + ".pdf"
        if os.path.exists(file_path):
            print(filename, "already in dir : ", page_dir)
            print("")
            print("")
            continue
        # time.sleep(5)
        download_not_finish_flag = True
        error_count = 5
        while download_not_finish_flag:
            try:
                chrome.get(url)
                error_count -= 1
                download_not_finish_flag = False
            except Exception as e:
                print("Error:", e)
                if error_count == 0:
                    continue
        download_not_finish_flag = True
        error_count = 5
        while download_not_finish_flag:
            try:
                chrome.get(url)
                error_count -= 1
                download_not_finish_flag = False
            except Exception as e:
                print("Error:", e)
                if error_count == 0:
                    continue
        time.sleep(load_time)
        hrefs = chrome.find_elements_by_xpath("//*[@href]")
        find_url_flag = False
        for href in hrefs:
            if href.text == '下载地址' or href.text == '镜像站-高速下载-1':
                find_url_flag = True
                download_url = href.get_attribute('href')
                print("Download url : ", download_url)
                download(download_url, sess, cookies={},
                         filename=page_dir + "/" + filename)
        if not find_url_flag:
            print("not find")
    if page == 1:
        return numbers
from bs4 import BeautifulSoup as BS
from time import time
import psycopg2
import requests
import config
import sys
import re

init_ts = time()
links = list(
    filter(lambda x: bool(x), [
        a.get('href', False)
        for a in BS(requests.get(f'http://{sys.argv[1]}').text,
                    features='html.parser').find_all('a')
    ]))

with open('links.txt', 'w+') as fh:
    fh.write('\n'.join(links))

connection = psycopg2.connect(config.CONNECTION_STRING)
cursor = connection.cursor()
for link in links:
    cursor.execute("INSERT INTO LINKS(HREF, DOMAIN) VALUES(%s, %s)", (
        link,
        sys.argv[1],
    ))
    print(f'INSERTED {link}')
connection.commit()
# check the status code of our request
print("Status code = {}".format(response.status_code))

# Save the data to the local cache, but only if the status code is good
if response.status_code == 200:
    with open(cache_file, 'w') as file:
        file.write(response.text)
    # Store the data retrieved in memory as well
    cache_data = response.text
    print("Cache data saved to disk and memory")

# Now check the data is not empty before we carry on
if len(cache_data) > 0:
    print("cache data OK")
    data = BS(cache_data, 'html.parser')
    print("Filtering per rulesets")
    products = data.find('div', attrs={
        'class': 'products-list'
    }).find_all('div', attrs={'class': 'product-card'})
    print("{} matches found".format(len(products)))
    print("Extracting relevant data")
    for product in products:
        title = product.find('a', attrs={'class': 'product-card__title'})
        price = product.find('div', attrs={'class': 'product-card__price-value'})
        # Add the product to the list of items found
        items.append(prod(title.text, str(price.text).strip()))
def parse(self, response):
    return {'title': BS(response.text).title.text}
def find_login_param(onyma, login=None, account_name=None):
    url_ip = 'https://10.144.196.37'
    url_main = 'https://10.144.196.37/onyma/main/'
    try:
        if (account_name is None) and (login is not None):
            payload = {
                'prpoper1': 'Like',
                'prpv1': login,
                'prpc': '0',
                'search': 'Поиск'
            }
            html = onyma.post(
                'https://10.144.196.37/onyma/main/dogsearch_ok.htms',
                data=payload,
                verify=False).text
            if '<title>Результаты поиска</title>' in html:
                url = BS(html, 'lxml').find('a', title=re.compile('-.+руб.')).get('href')
                html = onyma.get(url_main + url).text
                url = BS(html, 'lxml').find('a', title=re.compile('Договор')).get('href')
                html = onyma.get(url_ip + url).text
                # look up the account name
                links = BS(html, 'lxml').find_all('a')
                for link in links:
                    url = link.get('href')
                    if 'clsrv.htms' in url:
                        html = onyma.get(url_main + url).text
                        if login in html:
                            account_name = re.search(
                                r'\]\. (\S+)',
                                BS(html, 'lxml').find('title').text).group(1).strip()
                            url = BS(html, 'lxml').find('a', id='menu4185').get('href')
                            html = onyma.get(url_ip + url).text
                            url = url_main + BS(html, 'lxml').find(
                                'td', class_='td1').find('a').get('href')
                            html = onyma.get(url).text
        elif (login is None) and (account_name is not None):
            html = onyma.post(
                'https://10.144.196.37/onyma/main/dogsearch_ok.htms', {
                    'sitename': account_name,
                    'search': 'Поиск'
                },
                verify=False).text
            url = BS(html, 'lxml').find('a', title=re.compile('Договор')).get('href')
            html = onyma.get(url_ip + url).text
            url = BS(html, 'lxml').find('a', id='menu4185').get('href')
            html = onyma.get(url_ip + url).text
            url = url_main + BS(html, 'lxml').find(
                'td', class_='td1').find('a').get('href')
            html = onyma.get(url).text
        else:
            return False
        urls = []
        links = BS(html, 'lxml').find_all('a')
        for link in links:
            url = link.get('href')
            if ('service=201' in url) or ('service=4610' in url) and (link.text == account_name):
                urls.append(url_main + url)
    except:
        return False
    result_url = ''
    result_date = 1
    for url in urls:
        try:
            html = onyma.get(url).text
            current_date = int(
                BS(html, 'lxml').find('td', class_='td1').find('a').text.split('.')[0])
        except:
            continue
        if current_date >= result_date:
            result_date = current_date
            result_url = url
    if result_url != '':
        bill = re.search(r'bill=(\d+)', result_url).group(1)
        dmid = re.search(r'dmid=(\d+)', result_url).group(1)
        tmid = re.search(r'tmid=(\d+)', result_url).group(1)
        return {
            'account_name': account_name,
            'bill': bill,
            'dmid': dmid,
            'tmid': tmid
        }
    elif account_name is not None:
        return {
            'account_name': account_name,
            'bill': None,
            'dmid': None,
            'tmid': None
        }
    else:
        return False
def geteid(html):
    soup = BS(html, 'html.parser')
    return soup.find_all('option')
def recipe_finder(self, foodname, num_recipe=3):
    # get food name as keyword
    keyword = foodname
    # percent-encode the keyword (UTF-8 bytes) for the query string
    keyword = str(
        keyword.encode('utf-8')).lstrip("b'").rstrip("'").replace("\\x", "%")
    # build the search url
    url = 'http://www.10000recipe.com/recipe/list.html?q=' + keyword
    # connect to the site
    response = requests.get(url)
    html = response.text
    # parse using BS
    soup = BS(html, 'lxml')
    # get cookbook links
    if soup.find("a", "thumbnail") == None:
        recipe = ''
        recipe_list = []
    else:
        cook_link_list = soup.find_all('a', "thumbnail")
        cook_urls = []
        for cook_links in cook_link_list:
            cook_link = cook_links['href']
            cook_urls.append('http://www.10000recipe.com' + cook_link)
        cook_urls = cook_urls[0:num_recipe]
        # connect to each recipe page
        recipe_list = []
        for cook_url in cook_urls:
            response = requests.get(cook_url)
            html = response.text
            # parse using BS
            soup2 = BS(html, 'lxml')
            contents = soup2.find_all('meta', {'name': "keywords"})
            # get recipe
            recipe_content = contents
            recipe = ''
            for rec in recipe_content:
                recipe = recipe + str(rec['content'])
            # replace useless tokens
            # recipe = recipe.replace('text/html; charset=euc-kr', '')
            # recipe = recipe.replace('\r\n', '')
            # recipe = re.sub('http.+', '', recipe)
            recipe_list.append(recipe)
    return recipe_list
def get_page_data(html):
    # parse the games on the page
    soup = BS(html, 'lxml')
    items = soup.find_all('div', class_='item')
    for item in items:
        url = 'https://www.playground.ru' + item.find(
            'div', class_='media-heading title').find('a').get('href')
        soup = BS(get_html(url), 'lxml')
        gameCard = soup.find('div', class_='gp-game-card-top')
        try:
            name = gameCard.find('h1', class_='gp-game-title').text.strip()
            sp = name.split(" ")
            name = sp[0].strip()
        except:
            name = ''
        try:
            genres = gameCard.find('div', class_='genres').text.strip()
            genres = 'Жанры: ' + ", ".join(genres.split('\n\n'))
        except:
            genres = ''
        try:
            releaseList = gameCard.find('div', class_='releases').find_all(
                'div', class_='release-item')
        except:
            releaseList = ''
        release = 'Дата выхода:' + '\n'
        for i in releaseList:
            release += ' '.join(i.text.split()) + '\n'
        release = release.strip()
        try:
            info = soup.find('div', class_='description-wrapper').text.strip()
        except:
            info = ''
        try:
            info += '\nРазработчик: ' + gameCard.find(
                'div', class_='game-card-info js-redirect').find(
                    'span', itemprop="name").text.strip()
        except:
            pass
        try:
            info += '\nИздатель: ' + gameCard.find(
                'div', class_='game-card-info js-redirect').find(
                    'span', itemprop="publisher").text.strip()
        except:
            pass
        try:
            photo = soup.find('div', class_='gp-game-cover').find('a').get('href')
        except:
            photo = ''
        data = {
            'name': name,
            'genres': genres,
            'release': release,
            'info': info,
            'photo': photo,
            'url': url
        }
        write_csv(data)
def go(self, url):
    self.phantom.get(url)
    self.soup = BS(self.phantom.page_source, 'lxml')
def getDomesticShareHolding(html_text):
    """Get domestic share-holding data (pie chart).

    The MMA page embeds the data inside JavaScript in the HTML, so it is
    extracted from the raw text rather than from parsed tags.
    params html_text : raw text (str)
    return : list of defaultdict
    """

    def getShareHoldingTable(stockGroupList):
        """Convert the domestic pie-chart data (getDomesticShareHolding) into a
        dict layout that pandas.DataFrame can consume directly."""
        stockGroup = defaultdict(list)
        for index, (k, v) in enumerate(stockGroupList):
            if index > 1:
                stockGroup['項目'].append(k)
                stockGroup['投資金額(萬元)'].append(v)
            else:
                stockGroup[k] = v
        return stockGroup

    ### get fundid ###
    soup = BS(html_text, "lxml")
    fundid = re.findall(r"(?:a=)(.+)",
                        soup.select('#itemTab')[0].find('a').get('href'))[0]
    # print('fundid:{}'.format(fundid))

    ### get the data date ###
    date_temp = soup.select('.wfb1ar')
    if date_temp:
        try:
            update_dateStr = re.findall(
                r"\d+\/\d+\/\d+",
                soup.select('.wfb1ar')[-1].text)[0]  # distribution data date
        except IndexError:
            # fall back to year/month only
            update_dateStr = re.findall(r'\d+/\d+', date_temp[-1].text)[0]

    string1 = 'DJGraphObj1'
    # slice out the target string
    target_text = html_text[html_text.index(string1):]
    pat1 = r"(?:\'Title\':)(.+)"
    investTitle = re.findall(pat1, target_text)
    # fetch and split the tables
    pat2 = r"(?:\')(.*?)(?:\')"  # strings enclosed in single quotes
    pat3 = r"(?:\'PieV\':)(.+)"  # string following 'PieV'
    table = defaultdict(list)
    tableAns = []
    for index, titleText in enumerate(investTitle):
        titleList = re.findall(pat2, titleText)
        if len(titleList) == 1:
            continue
        colname = titleList[1]
        titleList = titleList[2:]
        titleList.insert(0, 'fundid')
        titleList.insert(1, '資料日期')
        valueList = re.findall(pat2, re.findall(pat3, target_text)[index])
        valueList.insert(0, fundid)
        valueList.insert(1, update_dateStr)
        # print(titleList, valueList)
        table[colname] = list(zip(titleList, valueList))
        share_Holding_dict = getShareHoldingTable(table[colname])
        # typeName = ['持有類股','區域','產業']  # no longer used; had a bug before
        share_Holding_dict['分類'] = re.findall(r"產業|持有類股|區域", colname)[0]
        tableAns.append(share_Holding_dict)
    return tableAns
def search(self, css_select, key_text, parser='lxml'):
    b = BS(self.html(), parser)
    for module in b.find_all(text=re.compile(key_text)):
        path = self._get_absolute_path(module.parent)
        if css_select in path:
            return path
home_page_url = 'http://portal.chd.edu.cn/index.portal?.pn=p167'
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36"
}
# this function ultimately returns a cookies object
cookies = login(login_url, headers, home_page_url)
# print(type(cookies))
# print(cookies['JSESSIONID'])
# print(cookies)

session = requests.session()  # a session keeps the cookies sent back by the server
url = 'http://bkjw.chd.edu.cn/eams/teach/grade/course/person!historyCourseGrade.action?projectType=MAJOR'
session.headers = headers
session.cookies = cookies  # manually attach the returned cookies
res = session.get(url)
# lxml could not be installed on Android, so html.parser is used instead
soup = BS(res.text, 'html.parser')
text1 = soup.findAll(class_=re.compile('griddata'))
st0 = ['学年度', '学期', '门数', '总学分', '平均绩点']
for i in text1:
    if len(i.contents) == 3:
        print(i.get_text())
        continue
    elif len(i.contents) == 9:
        st = text1[4].get_text().split('\n')[1:5]
        print(st[0] + ":")
        print(st0[2] + ": " + st[1])
        print(st0[3] + ": " + st[2])
        print(st0[4] + ": " + st[3])
        continue
    st = i.get_text().split('\n')[1:6]
    for j in range(5):
from bs4 import BeautifulSoup as BS
import time, re, os, pickle, sys

tags = ["shiny", "shinyapps", "shiny-server", "shinydashboard"]
wd = "/home/tian/shinyExpert/StackOverflow/"
topics = []
for tag in tags:
    path = wd + tag + "/"
    print path
    HTML = os.listdir(path)
    HTML.sort()
    for i in range(0, len(HTML)):
        f = HTML[i]
        bs = BS(open(path + f).read())
        threads = [f]
        try:
            Q = bs.find("div", {"id": "question"})
            Q_text = Q.find("div", {"class": "post-text"})
            q_text = Q_text.get_text()
            Q_time = Q.findAll("div", {"class": "user-action-time"})
            q_time = Q_time[0].find("span")["title"]
            Q_name = Q.findAll("a", {"href": re.compile("/users/.*")})
            q_name = Q_name[len(Q_name) - 1].get_text()
            Q_cmnt = Q.findAll("tr", {"class": "comment"})
            cmnts = []
            if (len(Q_cmnt) > 0):
                for c in range(0, len(Q_cmnt)):
                    c_text = Q_cmnt[c].find("span", {
                        "class": "comment-copy"
                    }).get_text()
base_url = 'https://djinni.co/jobs/?lang=uk&location=%D0%9A%D0%B8%D0%B5%D0%B2&' \
           'page=1&primary_keyword=Python'
domain = 'https://djinni.co'
jobs = []
urls = []
urls.append(base_url)
urls.append(base_url + '&page=2')
urls.append(base_url + '&page=3')

for url in urls:
    time.sleep(1)
    req = session.get(url, headers=headers)
    if req.status_code == 200:
        bsObj = BS(req.content, "html.parser")
        li_list = bsObj.find_all('li', attrs={'class': 'list-jobs__item'})
        for li in li_list:
            div = li.find('div', attrs={'class': 'list-jobs__title'})
            title = div.a.text
            href = div.a['href']
            short = "No Description"
            # company = 'No name'
            descr = li.find('div', attrs={'class': 'list-jobs__description'})
            if descr:
                short = descr.p.text
            jobs.append({
                'href': domain + href,
                'title': title,
                'descript': short,
                'company': "No name"
def get_metadata(headers, departments):
    s = VideoClient(headers)
    logger.info("Getting metadata")
    vid_list = s.get(VIDEO_LIST_URL)
    vid_list_bs = BS(vid_list, features="lxml")

    logger.info("Parsing department list")
    dept_select = vid_list_bs.find("select", id="dep_id").findAll("option")
    dept_names = {el["value"]: el.text
                  for el in dept_select if el["value"]}  # get rid of the empty dept

    logger.info("Parsing course list JSON")
    metadata = vid_list_bs.findAll("script", type="text/javascript", src=None)
    metadata = '\n'.join(str(i) for i in metadata)  # resistant to extra <script> tags
    metadata = [i for i in metadata.splitlines() if "JSON.decode" in i][0]
    metadata = metadata[metadata.index("{"):metadata.rindex("}") + 1]  # bounds of the actual JSON
    metadata = json.loads(metadata)

    logger.info("Scraping Video List Request Format")
    video_data = video_post_data(s)

    # the metadata format is:
    #   dept num: {course num: {course name, course num}}
    # the desired format is:
    #   dept num: {
    #       text: dept name,
    #       thumbnail: dep thumb,
    #       courses: {course num: {
    #           text: course name,
    #           thumbnail: course thumb,
    #           videos: { id : {url, data} }
    #       }}}
    # because this will be yamled easily
    def get_department(dep, client):
        if dep not in dept_names:
            dept_names[dep] = f"{dep} - Uncategorized"
        courses = metadata[dep]
        # thumbs = {}
        course_metadata = {}
        for c in courses:
            videos = get_videos(client, video_data, dep, c)
            if videos == None:
                continue
            course_metadata[c] = {
                "text": html.unescape(courses[c]["text"]),
                "videos": videos,
                # "thumbnail": thumbnail,
            }
            # thumbs[thumb_date] = thumbnail
        data = {
            "text": html.unescape(dept_names[dep]),
            "courses": course_metadata,
            # "thumbnail": thumbs[max(thumbs)],
        }
        return dep, data

    logging.info("Departments: %s", sorted(metadata.keys()))
    if departments == []:
        departments = metadata
    departments = [i for i in departments if i in metadata]
    clients = [VideoClient(headers) for i in range(len(departments))]
    sane_data = {}
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=len(departments) + 4) as executor:
        futures = executor.map(get_department, departments, clients)
        for dep, data in futures:
            logging.info("Done scraping %s", dep)
            sane_data[dep] = data
    return sane_data
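# Note: the comment block in get_metadata says the nested dict is shaped so it
# "will be yamled easily". A minimal sketch of that final step, assuming PyYAML
# is installed; the function name and output path here are illustrative, not
# taken from the original.
import yaml

def dump_metadata(sane_data, path="metadata.yaml"):
    # allow_unicode keeps non-ASCII department/course names readable in the file
    with open(path, "w", encoding="utf-8") as fh:
        yaml.safe_dump(sane_data, fh, allow_unicode=True, sort_keys=True)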
    def handle_data(self, data):
        if self._recording == "State2":
            self.code += data

    def handle_endtag(self, tag):
        if tag == "pre" and self._recording == "State2":
            self._recording = "State1"
        elif tag == "div" and self._recording == "State1":
            self._recording = "State0"


if __name__ == "__main__":
    team_addr = f"http://{sys.argv[1]}:5000"
    search_resp = requests.get(team_addr + "/search",
                               params={"query": "get_flag"})
    soup = BS(search_resp.text, "html.parser")
    for article in soup.find_all(name="article",
                                 attrs={"class": "media content-section"}):
        title: str = article.find(name="a", attrs={
            "class": "article-title"
        }).text
        lang = title.split()[0]
        code_tag: Tag = article.find(name="div", attrs={"class": "highlight"})
        code = CodeGetter(str(code_tag)).code
        if lang == "Python":
            flag_repr = re.search(r"\[[\d, ]+\]", code)
            if flag_repr:
                flag_repr = literal_eval(flag_repr.group())
            key = re.search(r"chr\(x \^ (\d+)\)", code)
            if key:
                key = int(key.group(1))
import os
from bs4 import BeautifulSoup as BS

docpath = r'E:\Users\yuyun\Desktop\workspace\TempJob\650'
files = os.listdir(docpath)
# namedict = {}
names = []
for i in files:
    filepath = os.path.join(docpath, i)
    print(filepath)
    with open(filepath, 'r', encoding='gbk') as fp:
        text = fp.read()
    bsobj = BS(text, 'html.parser')
    find = bsobj.find_all('tr')
    for j in find:
        tr = j.find_all('td')
        if len(tr) != 10:
            continue
        elif tr[-2].text == chr(8730):  # chr(8730) is the check-mark character '√'
            names.append(tr[-4].text)
        else:
            continue
    # namedict[filepath] = names
names = set(names)
print(names)
print(len(names))
import os
import urllib.request
from bs4 import BeautifulSoup as BS
import requests

print("Your current path: " + os.getcwd())
new_path = input("Enter Your New Path")
os.chdir(new_path)
l = int(input('first comic index( >= 39)'))
h = int(input('last comic index( <= 4537)'))
for i in range(l, h + 1):
    url = 'http://explosm.net/comics/' + str(i)
    src_code = requests.get(url)
    code = src_code.text
    soup = BS(code, 'html.parser')
    for img in soup.find_all('img', {'id': 'main-comic'}):
        img_url = 'http:' + img.get('src')
        print(img_url)
        img_url = img_url.strip()
        try:
            urllib.request.urlretrieve(img_url, 'Comic' + str(i) + '.jpeg')
        except:
            print("Can't download this... skipping")
def getfilename(html):
    soup = BS(html, 'html.parser')
    headers = soup.find('select', class_='').text
    content = remove_all(headers.split('\n'))
    return content
htm = '''<html><head><title>國立臺灣大學系統</title></head>
<body>
<p class="title"><b>三校聯盟 NTU SYSTEM</b></p>
<p class="ntu_system">
<a href="http://www.ntu.edu.tw" class="union" id="link1">臺灣大學</a>
<a href="http://www.ntnu.edu.tw" class="union" id="link2">臺灣師範大學</a>
<a href="http://www.ntust.edu.tw" class="union" id="link3">臺灣科技大學</a>
</p></body></html>
'''
from bs4 import BeautifulSoup as BS

soup = BS(htm, "html.parser")
A1 = soup.title
A2 = soup.find("a")  # first <a> tag
A3 = soup.find("b")  # first <b> tag
A4 = soup.find_all("a", {"class": "union"})
web = soup.find("a", {"id": "link1"})
data = soup.select(".union")  # returns a list
B = soup.select("#link3")  # returns a list
print(A1)
print("*" * 50)
print(A2)
print("*" * 50)
print(A3)
print("*" * 50)
print(A4)
print("*" * 50)
print(web.get("href"))  # get() is the usual way to read the URL in href
print("*" * 50)
for i in data:
    print(i)  # data[0..n]
def scrape_dine_out(pageNo):
    global dineOffer
    global dineOfferValue
    r = requests.get(
        "https://www.zomato.com/ncr/west-delhi-restaurants?table_booking=1&page=%d" % pageNo,
        cookies=cookie_jar,
        headers=headersUA)
    # print(r.text)
    soup = BS(r.text, "html.parser")
    my_divs = soup.find_all("div", {"class": "search-snippet-card"})
    for div in my_divs:
        # name of the restaurant
        dineName = div.findChildren("a", {"class": "result-title"})[0].text.strip()
        # direct link to the restaurant's page on Zomato
        # re.compile helps in pattern matching
        link = div.findChildren("a", attrs={'href': re.compile("^https://")})
        dineLink = link[0].get('href')
        # rating of the restaurant
        if div.findChildren("span", {"class": "rating-value"}):
            dineRating = div.findChildren(
                "span", {"class": "rating-value"})[0].text.strip()
            dineRating = float(dineRating)
        else:
            dineRating = 0.0
        # category of the restaurant
        dineCatg = div.findChildren("span", {"class": "col-m-12"})[0].text.strip()
        # find the offers available
        if div.findChildren("a", {"class": "zgreen"}):
            dineOffer = div.findChildren("a", {"class": "zgreen"})[0].text.strip()
            if "%" in dineOffer:
                dineOfferValue = int((dineOffer[0:dineOffer.index("%")]).strip())
        # opening hours
        dineTime = div.findChildren("div", {"class": "col-s-11"})[0].text.strip()
        # call the helper to calculate the restaurant score
        dineScore = scorecal(dineRating, dineOfferValue)
        # print(dineScore)
        dineInfo = dict()
        dineInfo['dineName'] = dineName
        dineInfo['dineRating'] = dineRating
        dineInfo['dineCatg'] = dineCatg
        dineInfo['dineOffer'] = dineOffer
        dineInfo['dineTime'] = dineTime
        dineInfo['dineScore'] = dineScore
        dineInfo['dineLink'] = dineLink
        allDine.append(dineInfo)
    sortedAllDine = sorted(allDine, key=lambda i: i['dineScore'], reverse=True)
    return sortedAllDine