def get_results(moss_url):
    resp = r.get(moss_url)
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'html5lib')
    name = None
    row = soup.table('tr')[1]  # first data row (skip the header row)
    first, second, lines = map(lambda x: x.text, row('td'))
    line, per = first.split()
    # Regex could be optimized, but this works...
    m = re.match(r".*/([a-z- ]*)/[A-Za-z_öë ]*\.[a-z]+", line)
    if m:
        if m.groups():
            name = '_'.join(m.groups())
    if not name:
        name = 'moss_%s' % moss_url[33:]
    matches = []
    for row in soup.table('tr')[1:]:
        first, second, lines = map(lambda x: x.text, row('td'))
        first = parse_col(first)
        second = parse_col(second)
        lines = int(lines)
        url = row.a['href']
        matches.append(Match(first, second, lines, url))
    fil = Filter()
    matches = list(filter(fil.include, matches))
    return Results(name, matches)
def userinfo(self, desc):
    if desc.find('?reset=1') != -1:
        desc = self.user.get(Gamer.url + '?reset=1')
    desc = BeautifulSoup(desc, 'html.parser')
    self.depth = int(desc.h2.text.split()[-1])
    info = desc.table('tr')[2]('td')
    self.level = int(info[0].text)
    self.hp = int(info[1].text)
    self.xp = int(info[2].text)
    self.weapon = Weapon(info[3].text)
    self.inventory = self.get_inventory(desc.table('tr')[3].text)
    return desc
def page_of_data(i):
    page_no = str(i)
    url_base = 'http://mobile311.sfgov.org/'
    url_ext = '?page=' + page_no + '&' + category
    url = url_base + url_ext + '&status=open'
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'lxml')
    # get report numbers
    reports = soup.table('span', "activity-timestamp")
    # get details from the second page
    # should modify code to also get location information
    for line in reports:
        line = str(line)
        x = line.find("#") + 1
        y = x + 7
        z = line[x:y]  # the seven-digit report number
        url_goal = url_base + "reports/" + z
        print url_goal  # debugging; comment out if not needed
        page2 = urllib2.urlopen(url_goal)
        real_soup = BeautifulSoup(page2.read(), 'lxml')
        blockquote = real_soup('blockquote')
        for lne in blockquote:
            request_type = lne.find_next_sibling('p')
            if 'Fire hydrant' in str(request_type):
                print url_goal
                print blockquote
                pane = real_soup("div", "tab-pane active")
                for ln in pane:
                    print ln
                    print " kids**********************************"
def page_of_data(i):
    page_no = str(i)
    url_base = 'http://mobile311.sfgov.org/'
    url_ext = '?page=' + page_no + '&service_id=518d5892601827e3880000c5'  # street and sidewalk cleaning
    url = url_base + url_ext + '&status=open'
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'lxml')
    # get report numbers
    reports = soup.table('span', "activity-timestamp")
    # get details from the second page
    # should modify code to also get location information
    for line in reports:
        line = str(line)
        x = line.find("#") + 1
        y = x + 7
        z = line[x:y]  # the seven-digit report number
        url_goal = url_base + "reports/" + z
        page2 = urllib2.urlopen(url_goal)
        real_soup = BeautifulSoup(page2.read(), 'lxml')
        blockquote = real_soup('blockquote')
        for lne in blockquote:
            request_type = lne.find_next_sibling('p')
            if 'Human / Animal Waste' in str(request_type):
                print url_goal
                print blockquote
                pane = real_soup("div", "tab-pane active")
                for ln in pane:
                    print ln
                    print " kids**********************************"
def trade_spider():
    list_of_contracts = []
    page = 1
    last_page = max_page()
    while page <= last_page:
        url = "http://fcpir.ru/participation_in_program/contracts/?PAGEN_1=" + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, features="html.parser")
        for link in soup.table("a"):
            if link.get("href").startswith("#"):
                continue
            else:
                href = "http://fcpir.ru" + link.get("href")  # link of the project
                etap = get_single_item_data(href)  # dictionary keyed by project number
                for key, value in etap.items():  # value - list of etaps
                    if len(value) < 1:
                        continue
                    else:
                        for v in value:
                            list_of_contracts.append([key, v])
        page += 1
    return list_of_contracts
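# trade_spider() relies on a max_page() helper that is not shown above. A
# minimal sketch, assuming the listing page exposes numeric pager links such
# as ?PAGEN_1=12 (the selector here is a guess, not the site's confirmed markup):
def max_page():
    url = "http://fcpir.ru/participation_in_program/contracts/"
    soup = BeautifulSoup(requests.get(url).text, features="html.parser")
    # collect pager links whose text is a page number and take the largest
    pages = [int(a.text) for a in soup.find_all("a") if a.text.strip().isdigit()]
    return max(pages) if pages else 1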
def getTable(handler):
    tagStart = '<table'
    tagStop = '</table>'
    indexStart = handler.find(tagStart)
    newString = handler[indexStart:]
    indexStop = newString.find(tagStop)
    newString = newString[:indexStop + len(tagStop)]
    soup = BeautifulSoup(newString, 'html.parser')
    rows = soup.find_all('tr')
    table = []
    for i in range(0, len(rows) - 1):
        table.append([td.text for td in soup.table('tr')[i]('td')])
    for j in range(len(table)):
        for i in range(len(table[0])):
            # columns 5-7 hold numeric values; skip the header row (j == 0)
            if j > 0 and i in (5, 6, 7):
                regex = re.sub(r'\d\d.\d\d.\d\d$', '', table[j][i])
                try:
                    table[j][i] = float(regex)
                except ValueError:
                    try:
                        regex = regex.replace(',', '.')  # decimal comma
                        table[j][i] = float(regex)
                    except ValueError:
                        table[j][i] = regex
    return table
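# A quick usage sketch for getTable() with made-up HTML (the column layout is
# invented for illustration), showing how comma-decimal cells come back as floats:
html = ('<html><table>'
        '<tr><td>a</td><td>b</td><td>c</td><td>d</td><td>e</td>'
        '<td>n1</td><td>n2</td><td>n3</td><td>x</td></tr>'
        '<tr><td>a</td><td>b</td><td>c</td><td>d</td><td>e</td>'
        '<td>1,5</td><td>2.25</td><td>oops</td><td>x</td></tr>'
        '<tr><td>footer row, skipped</td></tr>'
        '</table></html>')
print(getTable(html))  # second row yields [..., 1.5, 2.25, 'oops', 'x']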
def get_results(moss_url, name=None):
    if args.verbose >= 1:
        print(f"Getting {moss_url}")
    resp = r.get(moss_url)
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'html5lib')
    if name is None:
        ps = soup('p')
        if len(ps) > 2:  # ps[2] needs at least three <p> tags
            name = ps[2].text.strip()
    if not name:
        name = 'moss_%s' % date_str()
    matches = []
    for row in soup.table('tr')[1:]:
        first, second, lines = map(lambda x: x.text, row('td'))
        first = parse_col(first)
        second = parse_col(second)
        lines = int(lines)
        url = row.a['href']
        matches.append(Match(name, first, second, lines, url))
    fil = Filter()
    matches = list(filter(fil.include, matches))
    return Results(name, matches)
def get_results(moss_url):
    resp = r.get(moss_url)
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'html5lib')
    ps = soup('p')
    name = None
    if len(ps) > 2:  # ps[2] needs at least three <p> tags
        name = ps[2].text.strip()
    if not name:
        name = 'moss_%s' % date_str()
    matches = []
    for row in soup.table('tr')[1:]:
        first, second, lines = map(lambda x: x.text, row('td'))
        first = parse_col(first)
        second = parse_col(second)
        lines = int(lines)
        url = row.a['href']
        matches.append(Match(first, second, lines, url))
    fil = Filter()
    matches = list(filter(fil.include, matches))
    return Results(name, matches)
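# The get_results() variants above rely on helpers that are not shown here.
# A minimal sketch under assumptions: parse_col() splits a MOSS result cell
# such as "dir/file.py (87%)" into its parts, and Filter.include() is a hook
# for dropping uninteresting matches. Match and Results are assumed to be
# plain record types (note their arity differs between the variants above).
def parse_col(text):
    # hypothetical cell format "path (87%)" -> ("path", 87)
    path, _, percent = text.strip().rpartition(' ')
    return path, int(percent.strip('()%'))

class Filter:
    def include(self, match):
        return True  # placeholder policy: keep every match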
def compare_files(request, testcaseresultid):
    '''Function to visually compare expected output and actual output'''
    testcaseresult = get_object_or_404(TestcaseResult, pk=testcaseresultid)
    expected_output = os.path.join(settings.MEDIA_ROOT,
                                   testcaseresult.test_case.output_files.path)
    test_input = os.path.join(settings.MEDIA_ROOT,
                              testcaseresult.test_case.input_files.path)
    actual_output_tar = os.path.join(settings.MEDIA_ROOT,
                                     testcaseresult.output_files.path)
    input_lines = "\n".join(
        read_file(name=test_input,
                  readthis=testcaseresult.test_case.std_in_file_name))
    soup = BeautifulSoup(difflib.HtmlDiff().make_file(
        read_file(name=expected_output,
                  readthis=testcaseresult.test_case.std_out_file_name),
        read_file(name=actual_output_tar)))
    # drop the nowrap attribute difflib puts on every cell
    for row in soup.find('table').findAll('tr'):
        for col in row.find_all('td'):
            if col.has_attr('nowrap'):
                del col['nowrap']
    # build a <thead> with readable column headings
    soup.table.tbody.insert_before(soup.new_tag("thead"))
    soup.table.thead.append(soup.new_tag("th"))
    for s in ['Line Number', 'Expected Output', None, 'Line Number', 'Actual Output']:
        new_tag = soup.new_tag("th")
        if s:
            new_tag.string = s
        soup.table.thead.append(new_tag)
    soup.table = soup.find("table", {"rules": "groups"})
    soup.table['width'] = "100%"
    soup.table.insert_after(soup.new_tag('br'))
    new_tag = soup.new_tag("style", type='text/css')
    soup.style.insert_after(new_tag)
    new_tag.append(
        " table {border-collapse:collapse; table-layout:fixed;}table td {border:solid 1px; "
        "width:100px; word-wrap:break-word;} table th{border:solid 1px;text-align:center;}")
    new_tag_style = soup.new_tag("style", type='text/css')
    new_tag.insert_after(new_tag_style)
    new_tag_style.append("td.diff_header {text-align:center}")
    # replace difflib's colgroups with fixed column widths
    for new_tag in soup.find_all('colgroup'):
        new_tag.extract()
    colgroup_tag = soup.new_tag('colgroup')
    soup.thead.insert_before(colgroup_tag)
    for w in ['2%', '8%', '40%', '2%', '8%', '40%']:
        colgroup_tag.append(soup.new_tag('col', width=w))
    assignment = testcaseresult.test_case.program.assignment
    return render_to_response("evaluate/fileComparison.html", {
        'course': assignment.course,
        'assignment': assignment,
        'tst': testcaseresult,
        'inp': input_lines,
        'table': str(soup),
        'error_msg': error_msg  # assumed to be defined in the enclosing module
    }, context_instance=RequestContext(request))
def get_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.table('tr', recursive=False)
    away_rows = rows[2].table('tr', recursive=False)[2:]
    home_rows = rows[4].table('tr', recursive=False)[2:]
    print "Home Team"
    process_rows(home_rows)
    print "Away Team"
    process_rows(away_rows)
def down_table(url):
    driver.get(url)
    source = driver.page_source
    soup = Soup(source, 'lxml')
    table = Soup(str(soup.table()), 'lxml')
    table_list = list(table.find_all('td'))
    final_list = list(
        map(
            lambda x: str(x).split(r'<')[int(
                (len(str(x).split(r'<')) + 1) / 2) - 1].split(r'>')[-1],
            table_list))
    col_even1 = soup.find_all('tr', class_="even")[0]
    judge_a = Soup(str(col_even1), 'lxml')
    judge_list = list(judge_a.find_all('td'))
    judge_list = list(
        map(
            lambda x: str(x).split(r'<')[int(
                (len(str(x).split(r'<')) + 1) / 2) - 1].split(r'>')[-1],
            judge_list))
    if len(judge_list) != 19:
        # pad the first data row out to 19 columns
        pad = ['--']
        dif = 19 - len(judge_list)
        final_list[19 + len(judge_list):19 + len(judge_list)] = pad * dif
    name = re.findall(r'<h2>.+?</h2>', source)[0].split(r'<')[-2].split(r'>')[1]
    book = Workbook()
    sheet1 = book.active
    sheet1.title = "电影信息"
    sheet1.merge_cells('A1:S1')
    sheet1.cell(row=1, column=1, value=name)
    head = ['时间', '网票', '哈票', '万达', '金逸', '淘电影', '星美']
    for i in range(len(head)):
        if i == 0:
            sheet1.cell(row=2, column=i + 1, value=head[i])
        else:
            sheet1.cell(row=2, column=3 * i - 1, value=head[i])
    for cells in ['B2:D2', 'E2:G2', 'H2:J2', 'K2:M2', 'N2:P2', 'Q2:S2']:
        sheet1.merge_cells(cells)
    for i in range(int(len(final_list) / 19)):
        for j in range(19):
            sheet1.cell(row=i + 3, column=j + 1, value=final_list[19 * i + j])
    book.save(name + '.xlsx')
    splitcri = u'每日票房数据统计'
    return name.split(splitcri)[0]
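# The str-splitting lambdas in down_table() are just recovering each cell's
# inner text. For simple <td>text</td> cells BeautifulSoup can do that
# directly; a minimal sketch of the equivalent, less brittle extraction:
def cell_texts(td_tags):
    return [td.get_text(strip=True) for td in td_tags]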
def get_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    data_table = soup.table('tr', recursive=False)[7].table
    # .tag is equal to .find('tag'). ('tag') is equal to .find_all('tag').
    # The commented line below is therefore equivalent to the line above:
    # data_table = soup.find('table').find_all('tr', recursive=False)[7].find('table')
    # start with away team
    away = True
    # display a header to make output more readable
    print 'T P# P Player Name G A P +- PN PIM TOT SHF AVG PP SH EV S AB MS TH GV TK BS FW FL F%'
    # first two rows just contain headings
    for tr in data_table('tr', recursive=False)[2:]:
        try:
            # rows starting with integers (player numbers) have relevant data
            int(tr.td.text)
        except ValueError:
            # Once we hit rows that don't start with numbers, we are at the
            # summary lines between teams, and further player data is for
            # the home team.
            away = False
            # since this line is invalid, go to the next line instead
            continue
        (num, pos, name, g, a, p, pm, pn, pim, tot, shf, avg, pp, sh, ev, s,
         ab, ms, th, gv, tk, bs, fw, fl, fp) = [td.text.strip() for td in tr('td')]
        # name must be encoded in utf8 to ensure display of accented E
        name = name.encode('utf8')
        # could convert +/- values into numbers; when empty this throws an
        # error, so overwrite to preserve being empty (or just save as 0)
        try:
            pm = int(pm)
        except ValueError:
            pm = ''
        print '{:1} {:>2} {:1} {:22.22} {:1} {:1} {:1} {:>2} {:>2} {:>3} {:>5} {:>3} {:>5} {:>5} {:>5} {:>5} {:>2} {:>2} {:>2} {:>2} {:>2} {:>2} {:>2} {:>2} {:>2} {:>3}'.format(
            'A' if away else 'H', num, pos, name.encode('ascii', 'replace'),
            g, a, p, pm, pn, pim, tot, shf, avg, pp, sh, ev, s, ab, ms, th,
            gv, tk, bs, fw, fl, fp)
def predej_dluzne(evidence, db, vypis, sumplus, summinus, pocet, csv_nejpozdeji):
    #jirkovo = nacti_jirkovo_ze_souboru('jirkovo.html')
    br = sa_login("Mirek Zv.", "miiirek1+1")
    sleep(2)
    jirkovo = br.open(url_zakaznici).read()
    vfp.strtofile(jirkovo, os.path.join(os.getcwd(), 'applications', 'platby',
                                        'downloads', 'zakaznici.html'))
    # slight duplication with controllers/platby.py, where this file is
    # parsed to determine the current deposit
    soup = BeautifulSoup(jirkovo)
    for zakaznik in soup.table('tr'):
        sloupce = zakaznik('td')
        if len(sloupce):  # the first row (the header) has no <td>
            planovano = unformat_castka(sloupce[-1].string)
            neuhrazeno = unformat_castka(sloupce[-2].string)
            zaloha = unformat_castka(sloupce[-4].string)
            chybi = planovano + neuhrazeno - zaloha
            if chybi > 0:
                symbol = str(sloupce[0].a.string).strip().lstrip('0')
                wk_zakaznik = db(db.auth_user.ss == symbol).select().first()
                if wk_zakaznik and wk_zakaznik.zaloha > 0:
                    # minus what we just paid out in predej_planovane()
                    jeste_chybi = chybi - evidence.get(wk_zakaznik.id, 0)
                    if jeste_chybi:
                        fl_zaloha = float(wk_zakaznik.zaloha)
                        popis = (u'z sa.cz poptával %s Kč' % jeste_chybi) if (jeste_chybi > fl_zaloha) else ''
                        posleme_mu = min(jeste_chybi, fl_zaloha)
                        id_pohybu = db.pohyb.insert(
                            idauth_user=wk_zakaznik.id,
                            idma_dati=Uc_sa.oz,
                            iddal=Uc_sa.oz_sa,
                            datum=datetime.now(),
                            castka=posleme_mu,
                            ss=symbol,
                            popis=popis)
                        wk_zakaznik.update_record(zaloha=fl_zaloha - posleme_mu)
                        pohyb = db(db.pohyb.id == id_pohybu).select().first()
                        vypis1, sumplus1, summinus1 = __add_csv(pohyb, csv_nejpozdeji)
                        vypis += vypis1
                        sumplus += sumplus1
                        summinus += summinus1
                        # db.commit() happens in the csv.py controller
                        pocet += 1
    return pocet, vypis, sumplus, summinus
def get_faculties(self):
    '''
    Retrieves the faculties from eclass.teilar.gr
    The output is a dictionary with the following structure:
    faculties_from_eclass = {'url': ['name', 'code']}
    '''
    faculties_from_eclass = {}
    output = teilar_anon_login('http://openclass.teilar.gr/modules/auth/listfaculte.php')
    soup = BeautifulSoup(output)
    all_faculties = soup.table('td')
    for faculty in all_faculties:
        url = 'http://openclass.teilar.gr/modules/auth/' + faculty.a.get('href')
        name = faculty.a.contents[0].strip()
        code = faculty.small.contents[0].split(')')[0].replace('(', '').strip()
        faculties_from_eclass[url] = [name, code]
    return faculties_from_eclass
def get_MBB_tds(content_url):
    # now get the actual content
    r = s.post(content_url)
    # r.content spits out the content
    soup = BeautifulSoup(r.content)
    # make array of all "rows" (i.e. <tr> tags)
    alltrs = soup.table("tr")
    # get just the first <td>...</td> of each row in an array
    alltds = []
    for row in alltrs:
        alltds.append(row.td)
    return alltds
def __get_zaloha(ss):
    '''Determine the customer's recent deposit by parsing the saved zakaznici.html'''
    # duplicates export_csv.predej_dluzne, from which this was lifted
    zaloha = fdate = None
    try:
        fname = os.path.join(request.folder, 'downloads', 'zakaznici.html')
        fdate = datetime.fromtimestamp(os.stat(fname).st_ctime)
        jirkovo = vfp.filetostr(fname)
        soup = BeautifulSoup(jirkovo)
        for zakaznik in soup.table('tr'):
            if str(zakaznik.td.a.string).strip().lstrip('0') == ss:
                zaloha = unformat_castka(zakaznik('td')[-4].string)
                break
    except:
        pass
    return fdate, zaloha
def ajax():
    today = datetime.date.today()
    initial_day = datetime.date(2000, 1, 1)
    payload = {
        "__EVENTTARGET": "date_cal",
        "__EVENTARGUMENT": (today - initial_day).days,
        "classlist_ddl": request.args.get('building'),
        "__VIEWSTATE": "dDw1NTk0MzU4NjE7dDw7bDxpPDE+Oz47bDx0PDtsPGk8Mz47aTw0Pjs+O2w8dDxAMDw7Ozs7Ozs7Ozs7Pjs7Pjt0PDtsPGk8NT47PjtsPHQ8QDA8cDxwPGw8U0Q7PjtsPGw8U3lzdGVtLkRhdGVUaW1lLCBtc2NvcmxpYiwgVmVyc2lvbj0xLjAuNTAwMC4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPWI3N2E1YzU2MTkzNGUwODk8MjAxNy0xMi0wNT47Pjs+Pjs+Ozs7Ozs7Ozs7Oz47Oz47Pj47Pj47Pj47Pu5S1476NkYk5hmd81mL76xisA4B",
        "__VIEWSTATEGENERATOR": "D2C5BC33"
    }
    r = requests.post(query_url, data=payload)
    html_data = BeautifulSoup(r.text, "html5lib")
    table_data = {
        row("td", nowrap="nowrap")[0].text.strip():
            [cell.text.strip() for cell in row("td", nowrap="nowrap")][1:]
        for row in html_data.table("tr", nowrap="nowrap")
    }
    return jsonify(table_data)
def page_of_data(i):
    # fetch the web page that normally has ten reports on it
    page_no = str(i)
    url_base = 'http://mobile311.sfgov.org/'
    #url_ext = '?page='+page_no+'&service_id=518d5892601827e3880000c5'  # street and sidewalk cleaning
    # change this line for other types of service reports:
    url_ext = '?page=' + page_no + '&service_id=55e8409a45ff461f92000006'  # homeless concerns
    url = url_base + url_ext + '&status=open'  # status=closed is also possible
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'lxml')
    # get report numbers from the first page
    reports = soup.table('span', "activity-timestamp")  # same as find_all; gives a list
    # get details from the second page
    # should modify code to also get location information
    for line in reports:
        line = str(line)
        x = line.find("#") + 1
        y = x + 7
        z = line[x:y]  # z is the active report number
        url_goal = url_base + "reports/" + z
        page2 = urllib2.urlopen(url_goal)
        real_soup = BeautifulSoup(page2.read(), 'lxml')
        blockquote = real_soup('blockquote')
        for lne in blockquote:
            request_type = lne.find_next_sibling('p')
            if 'Encampment' in str(request_type):
                print url_goal
                with open("url_list.txt", "a") as thefile:
                    thefile.write("%s\n" % url_goal)
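# The page_of_data() variants above pull the report number out of the tag's
# string form with find("#") plus a fixed seven-character slice, which breaks
# silently if the number length ever changes. A hedged sketch of a sturdier
# extraction, assuming the timestamp text contains "#<digits>":
import re

def report_number(span_text):
    m = re.search(r'#(\d+)', span_text)
    return m.group(1) if m else None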
def get_single_item_data(item_url):
    global contract_count
    l_1 = []
    l_2 = []
    result = {}  # renamed from `dict` to avoid shadowing the builtin
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, features="html.parser")
    for item1 in soup.tbody.findAll('p'):
        l_1.append(item1.string)  # list of project numbers
    for item2 in soup.tbody('tr', class_='tr-hr-dashed'):
        i_1 = item2.find_all('td')[0].string
        i_2 = item2.find('span').string[3:]
        i_3 = item2.find_all('div')[1].contents
        if (i_2 == "Этап принят") and (len(i_3) == 1):
            l_2.append(i_1)  # list of etap numbers
    for item_name in soup.table("a", {'class': 'panel-some-doc preview'}):
        contract_count += 1  # additional task: count project files
    result[l_1[1]] = l_2
    return result
url_base = 'http://mobile311.sfgov.org/'
url_ext = '?external=false&service_id=518d5892601827e3880000c5'  # street and sidewalk cleaning
url = 'http://mobile311.sfgov.org/?external=true&service_id=55e8409a45ff461f92000006&status=open'
url = url_base + url_ext + '&status=open'  # overrides the hard-coded url above
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), 'lxml')
print ("------------")
#reports = soup.table('span', class_="activity-timestamp")
reports = soup.table('span', "activity-timestamp")
for line in reports:
    line = str(line)
    x = line.find("#") + 1
    y = x + 7
    z = line[x:y]
    print z
    url_goal = url_base + "reports/" + z
    print url_goal
    page2 = urllib2.urlopen(url_goal)
    real_soup = BeautifulSoup(page2.read())
    blockquote = real_soup('blockquote')
    for line in blockquote:
        request_type = line.find_next_sibling('p')
from bs4 import BeautifulSoup
import os
import glob
import sys
from xlrd import open_workbook
from xlwt import Workbook
import xlsxwriter

workbook = xlsxwriter.Workbook('IT_2nd_sem_2nd.xlsx')  # name of generated file
worksheet = workbook.add_worksheet()
row = 1
for filename in glob.glob('*.html'):
    soup = BeautifulSoup(open(filename), 'html.parser')
    # count the tags inside the table that carry an id
    n = 0
    c = 0
    for b in soup.table():
        if str(b.get('id')) != "None":
            n = n + 1
            x = str(b.get('id'))
    # walk again and keep the second-to-last id
    for b in soup.table():
        if str(b.get('id')) != "None":
            c = c + 1
            if c == n - 1:
                x = str(b.get('id'))
    id_selector = x[3:5]
    print(id_selector)
    rollnumber = str(soup.find(id='lblRollNo').text)
    name = str(soup.find(id='lblFullName').text)
    fathername = str(soup.find(id='lblFatherName').text)
def scrape():
    # Scrape the NASA Mars News Site for the latest news title and paragraph
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    result = soup.find('div', class_='features')
    news_title = result.find('div', class_="content_title").a.text
    news_p = result.find('div', class_='rollover_description_inner').text.strip()
    news_url = result.find('div', class_="content_title").a['href']
    news_url = 'https://mars.nasa.gov' + news_url
    # Scrape JPL's Featured Space Image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    result = soup.find('a', class_='button fancybox')
    featured_image_url = 'https://www.jpl.nasa.gov' + result['data-fancybox-href']
    # Scrape the Mars Weather twitter account
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    result = soup.find(
        'p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')
    mars_weather = result.text
    # Scrape the Mars Facts webpage
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    facts = [td.text for td in soup.table('td')]
    mars_facts = []
    for i in range(0, len(facts), 2):
        mars_facts.append({facts[i]: facts[i + 1]})
    # Scrape the USGS Astrogeology site for high resolution images
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all('a', class_='itemLink product-item')
    hemispher_image_urls = []
    for i in range(0, 8, 2):
        new_url = 'https://astrogeology.usgs.gov' + results[i]['href']
        browser.visit(new_url)
        html2 = browser.html
        soup2 = BeautifulSoup(html2, 'html.parser')
        hemisphere_image = soup2.find_all('a')
        hemisphere_image_url = hemisphere_image[41]['href']
        hemisphere_title = soup2.find('h2').contents[0]
        hemispher_image_urls.append({
            'title': hemisphere_title,
            'img_url': hemisphere_image_url
        })
    # Aggregate all the data above into a single dictionary for the caller
    mars_data = {
        'news_title': news_title,
        'news_p': news_p,
        'news_url': news_url,
        'featured_image': featured_image_url,
        'weather': mars_weather,
        'facts': mars_facts,
        'image1': hemispher_image_urls[0],
        'image2': hemispher_image_urls[1],
        'image3': hemispher_image_urls[2],
        'image4': hemispher_image_urls[3]
    }
    return mars_data
def unit_table_to_dict(
        data: str) -> Dict[str, Dict[str, Union[Dict[str, int], str, int]]]:
    """
    Parse HTML unit table from prismata.gamepedia.com into dict format.

    Parameters
    ----------
    data : str
        HTML table for unit list from prismata.gamepedia.com.

    Returns
    -------
    dict

    Example
    -------
    output: {
        "Unit Name": {
            "name": "Unit Name",
            "costs": {
                "gold": 1,
                "energy": 0,
                "green": 1,
                "blue": 0,
                "red": 1,
            },
            "stats": {
                "attack": 1,
                "health": 1,
            },
            "attributes": {
                "supply": 1,
                "frontline": True,
                "fragile": False,
                "blocker": True,
                "prompt": False,
                "stamina": 0,
                "lifespan": 0,
                "build_time": 0,
                "exhaust_turn": 0,
                "exhaust_ability": 0,
            },
            "links": {
                "path": "/Unit_Name",
            },
            "type": 1,
            "unit_spell": "Unit|Spell",
        },
        ...
    }
    """
    soup = BeautifulSoup(data, "html.parser")
    table = soup.table("tr") if soup.table else []
    return {
        clean(unit[0]): {  # unit name
            "name": clean(unit[0]),
            "costs": {
                "gold": clean(unit[3], int),
                "energy": clean(unit[4], int),
                "green": clean(unit[5], int),
                "blue": clean(unit[6], int),
                "red": clean(unit[7], int),
            },
            "stats": {
                "attack": int(clean(unit[15]) or 0),
                "health": clean(unit[10], int),
            },
            "attributes": {
                "supply": clean(unit[8], int),
                "frontline": clean(unit[11], bool),
                "fragile": clean(unit[12], bool),
                "blocker": clean(unit[13], bool),
                "prompt": clean(unit[14], bool),
                "stamina": clean(unit[16], int),
                "lifespan": clean(unit[19], int),
                "build_time": clean(unit[9], int),
                "exhaust_turn": clean(unit[17], int),
                "exhaust_ability": clean(unit[18], int),
            },
            "links": {
                "path": unit[0].a.get("href"),
            },
            "type": clean(unit[1], int),
            "unit_spell": clean(unit[2]),
        }
        for unit in map(lambda row: row("td"), table)  # type: ignore
        if unit
        # Ignoring typing in map: all uses of the clean function return Any,
        # which can't match the return types for this function.
    }
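# unit_table_to_dict() depends on a clean() helper that is not shown. A
# minimal sketch under assumptions: it extracts a cell's text and coerces it
# to the requested type, treating any marker text as a boolean "yes" and
# blank numeric cells as zero.
from typing import Any

def clean(cell, astype=str) -> Any:
    text = cell.get_text(strip=True)
    if astype is bool:
        return bool(text)       # assumption: any marker means "yes"
    if astype is int:
        return int(text or 0)   # assumption: blank cells mean zero
    return text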
def scrape():
    browser = init_browser()
    # create mars_data dict that we can insert into mongo
    mars_data = {}
    # visit nasa for news of mars
    browser = Browser('chrome', headless=False)
    url_news = 'https://mars.nasa.gov/news/'
    browser.visit(url_news)
    # create a soup object from the html
    html_news = browser.html
    soup_news = BeautifulSoup(html_news, 'html.parser')
    div1 = soup_news.find('div', class_='content_title')
    news_title = div1.find('a').text
    news_p = soup_news.find('div', class_='article_teaser_body').text
    # add them into the mars_data dict
    mars_data['news_title'] = news_title
    mars_data['news_p'] = news_p
    # visit JPL Mars space images to get a big image
    browser = Browser('chrome', headless=False)
    url_img = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_img)
    browser.click_link_by_partial_text('FULL IMAGE')
    # create a soup object from the html
    html_img = browser.html
    soup_img = BeautifulSoup(html_img, 'html.parser')
    home = soup_img.find('article', class_="carousel_item")
    link = home.a['data-fancybox-href']
    featured_image_url = 'https://www.jpl.nasa.gov' + link
    # add it into the mars_data dict
    mars_data['featured_image_url'] = featured_image_url
    # visit twitter to get Mars Weather
    url_weather = 'https://twitter.com/marswxreport?lang=en'
    html_weather = requests.get(url_weather)
    soup_weather = BeautifulSoup(html_weather.text, 'html.parser')
    tweet = soup_weather.find('div', class_='stream')
    mars_weather = tweet.find(text="Mars Weather").findNext('p').text
    # add it into the mars_data dict
    mars_data['mars_weather'] = mars_weather
    # visit Mars facts and create a table with pandas
    url_facts = 'https://space-facts.com/mars/'
    facts_table = pd.read_html(url_facts)
    df = facts_table[0]
    df.columns = ['Description', 'Value']
    df.set_index(['Description'], inplace=True)
    df.to_html('mars_df.html')  # generate an html table file from the dataframe
    html_table = df.to_html()
    html_table = html_table.replace('\n', '')  # str.replace returns a new string
    soup_table = BeautifulSoup(open('mars_df.html'), 'html.parser')
    # create dictionaries for all cells to build a table in html
    mars_facts = {}
    mars_list = []
    ths = [x.text.strip(':') for x in soup_table.table('th') if x.text != '']
    column_list = ths[0:2]
    column_list.reverse()
    th = ths[2:]
    td = [y.text for y in soup_table.table('td')]
    mars_facts = dict([(i, j) for i, j in zip(th, td)])
    mars_list.append(mars_facts)
    # add them into the mars_data dict
    mars_data['columns'] = column_list
    mars_data['mars_list'] = mars_list
    # get the hemispheres imgs
    url_hemisperes = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_hemisperes)
    html_hemisperes = browser.html
    soup_hem = BeautifulSoup(html_hemisperes, 'html.parser')
    jpg_links = soup_hem.find_all('div', class_='description')
    Mars_Hemisperes = []
    for link in jpg_links:
        info = {}
        h3 = link.find('h3').text
        info['title'] = h3
        browser.click_link_by_partial_text(h3)
        html2 = browser.html
        soup2 = BeautifulSoup(html2, 'html.parser')
        url = soup2.find('img', class_='wide-image')['src']
        info['img_url'] = 'https://astrogeology.usgs.gov' + url
        Mars_Hemisperes.append(info)
        browser.click_link_by_partial_text('Back')
    # add it into the mars_data dict
    mars_data['Mars_Hemisperes'] = Mars_Hemisperes
    return mars_data
'''
https://kaijento.github.io/2017/03/30/beautifulboup-removing-tags/
'''
import csv, json, requests, sys
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_sector_composition'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

writer = csv.writer(sys.stdout)
for tr in soup.table('tr')[2:]:
    # remove nested <span>/<sup> tags so they don't pollute the cell text
    for tag in tr(['span', 'sup']):
        tag.decompose()
    writer.writerow([td.text for td in tr('td')])
def obtainWindSpeed(self, model, cities=False):
    """! Fetch wind-speed data from the app.deta.sh API.

    @param cities True or False value to also resolve city names
    @return JSON with the following structure, for example:
        [
            {
                "city": "London, United Kingdom",
                "wind_speed": "13 kph",
                "coordinates": {
                    "w": "0.1278",
                    "n": "51.5074"
                }
            },
            ...
        ]
    """
    response = requests.get(URL_API_WIND, verify=True)
    soup = BeautifulSoup(response.content, 'html.parser')
    rows = soup.table('tr')
    data = []
    for row in rows[1:]:
        column = row('td')
        td_elements = [c.text.replace('°', '') for c in column]
        coordinates = td_elements[0].split(',')
        coordinates = [
            "=".join(coordinate.strip().split(" ")[::-1]).lower()
            for coordinate in coordinates
        ]
        wind_speed = td_elements[1]
        dict_coordinates = self.coordinatesToJson(coordinates)
        if cities:
            response_cities = self.getCityName(coordinates)
            city = response_cities['result'] if response_cities.get('result', None) else None
        else:
            city = None
        object_model = model.objects.filter(city=city, wind_speed=wind_speed).last()
        if object_model:
            # only store a fresh row if the last one is at least an hour old
            date_now = timezone.now()
            diff = relativedelta(date_now, object_model.date_register)
            if diff.hours >= 1:
                object_model = model.objects.create(
                    city=city, coordinates=dict_coordinates, wind_speed=wind_speed)
        else:
            object_model = model.objects.create(
                city=city, coordinates=dict_coordinates, wind_speed=wind_speed)
        comments = WeatherJournal.objects.filter(fk_weather=object_model.pk)
        data.append({
            'id': object_model.pk,
            'coordinates': dict_coordinates,
            'wind_speed': wind_speed,
            'city': city,
            'comments': WeatherJournalSerializer(comments, many=True).data
        })
        dict_coordinates = {}
    return json.dumps(data)
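# obtainWindSpeed() calls a coordinatesToJson() helper that is not shown.
# A minimal sketch under assumptions: after the join above each entry looks
# like "n=51.5074", so the helper just splits key=value pairs into a dict.
def coordinatesToJson(self, coordinates):
    # ["n=51.5074", "w=0.1278"] -> {"n": "51.5074", "w": "0.1278"}
    return dict(item.split("=", 1) for item in coordinates)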