def data_per_case(single_case):
    """Flatten one case record into [year, docket, facts, question].

    The facts/question fields arrive as HTML; they are reduced to plain
    text with newlines and non-breaking spaces stripped.
    """
    year = single_case['citation']['year']
    docket = single_case['docket_number']
    facts_html = single_case['facts_of_the_case']
    question_html = single_case['question']
    facts = bs4(facts_html).text.replace('\n', '').replace('\xa0', '')
    question = bs4(question_html).text.replace('\n', '').replace('\xa0', '')
    return [year, docket, facts, question]
def __SetupPage(self):
    """Fetch and parse self.url into self.page, inlining iframe bodies."""
    page = bs4(self.__FetchPage(self.url), HTML_PARSER)
    # Some blog themes keep content inside iframes; pull each frame's
    # body inline so later parsing sees it in the main document.
    for frame in page.find_all('iframe'):
        try:
            inlined = bs4(self.__FetchPage(frame['src']), HTML_PARSER)
            frame.replace_with(inlined.body)
        except KeyError:
            # iframe without a src attribute — nothing to fetch
            pass
    self.page = page
def main(): now = str(datetime.datetime.now()) conn = sqlite3.connect(DIR + "scrape.db") conn.row_factory=sqlite3.Row c = conn.cursor() changes = {} new_beers = [] items = [] # Add beers to items from each url for url in URLS: soup = bs4(urllib2.urlopen(url).read()) print "Found %s Items..." % (total_items(soup)) items += get_items(soup) # See if there are multiple pages page = 2 while (int(total_items(soup)) > len(items)): items += get_items(bs4(urllib2.urlopen(url + "&sort=20a&page=%d" % page ).read())) page += 1 # Loop over beers found for item in items: # See if the beer exists in the database entry = c.execute("SELECT * FROM beers WHERE name = ?", [item['name']]).fetchall() if (len(entry) == 0): # If it doesn't insert it into the data base c.execute("INSERT INTO beers (last_updated, name, qty, price, etreId) VALUES (?, ?, ?, ?, ?)", [now, item['name'], item['qty'], item['price'], item['etreId']]) new_beers.append({"name":item['name'], "qty":item['qty'], "price":item['price'], "etreId":item['etreId']}) # print "New beer found! name: %s qty: %d price: %f" % (item['name'], item['qty'], item['price']) elif (len(entry) == 1): # If it does exist e = entry[0] # Loop over the keys that are important (not id, time, etreId) #print e.keys() for key in e.keys()[1:-2]: if e[key] != item[key]: if item['name'] in changes.keys(): changes[item['name']][key] = [str(e[key]), str(item[key])] else: changes[item['name']] = {key:[str(e[key]), str(item[key])], 'etreId': item['etreId']} c.execute("UPDATE beers SET name=?, qty=?, price=?, last_updated=?, etreId=? WHERE id = ?", [item['name'], item['qty'], item['price'], now, item['etreId'], entry[0][0]]) # Rendering #print changes, new_beers render(changes, new_beers) # Commit and close the db cursor conn.commit() conn.close()
def get_new_players(): ''' Obtiene los fichajes y ventas de la liga ''' session = requests.session() session.get('http://stats.comunio.es/transfers.php', headers=headers) soup = bs4(session.get('http://stats.comunio.es/transfers.php', headers=headers).content) new_members = True for table in soup.find_all('table', {'class': 'rangliste'}): if new_members: for row in table.find_all('tr', re.compile(r"r[1-2]"))[1:]: nuna = re.search("([0-9]+)-(.*)", row.find('a', {'class': 'nowrap'})['href']) number = nuna.group(1) name = nuna.group(2).replace("+", " ").strip() club = row.find('td', {'class': 'clubPic'}).a['href'] club_id = re.search("([0-9]+)-(.*)", club).group(1) club_name = re.search("([0-9]+)-(.*)", club).group(2).replace("+", " ") position = _position_translation(row.contents[6].text) db.commit_query('INSERT IGNORE INTO players (idp, name, position, idcl) VALUES (%s, "%s", %s, %s)' % ( number, name, position, club_id)) get_all_prices(name, incremental=True) print 'Alta jugador %s (%s) en el club %s (%s) como %s (%s)' % ( name, number, club_name, club_id, row.contents[6].text, position) new_members = False else: for row in table.find_all('tr', re.compile(r"r[1-2]"))[1:]: nuna = re.search("([0-9]+)-(.*)", row.find('a', {'class': 'nowrap'})['href']) number = nuna.group(1) name = nuna.group(2).replace("+", " ").strip() club = row.find('td', {'class': 'clubPic'}).a['href'] club_id = re.search("([0-9]+)-(.*)", club).group(1) club_name = re.search("([0-9]+)-(.*)", club).group(2).replace("+", " ") db.commit_query('UPDATE players SET idcl=NULL WHERE idp=%s' % (number)) print 'Baja jugador %s (%s) del club %s (%s)' % (name, number, club_name, club_id)
def __format_bidding(self, bidding):
    """Convert bidding data to properly formatted HTML table."""
    log.getLogger('b_format').debug('formatting bidding: %s', bidding)
    bid_pattern = re.compile(r'(\d)([SHDCN])')
    doc = bs4('<table>', 'lxml')
    header_row = doc.new_tag('tr')
    doc.table.append(header_row)
    # One header cell per seat.
    for direction in self.__directions:
        header_cell = doc.new_tag('th')
        header_cell.string = direction
        header_row.append(header_cell)
    for bid_round in bidding:
        round_row = doc.new_tag('tr')
        doc.table.append(round_row)
        for bid in bid_round:
            cell = doc.new_tag('td')
            call = re.match(bid_pattern, bid)
            if call:
                # Level as text, suit as an icon image.
                cell.append(call.group(1))
                suit_icon = doc.new_tag(
                    'img', src='images/' + call.group(2) + '.gif')
                cell.append(suit_icon)
            else:
                if bid == 'SkipBid':
                    bid = '( - )'
                cell.append(bid)
            round_row.append(cell)
        log.getLogger('b_format').debug('%5s' * 4, *bid_round)
    return doc.table.prettify()
def query(self, query):
    """Run a Google search and return {'result': [{'title', 'url'}, ...]}."""
    result = {'result': []}
    http = ModuleManager.call_module_method(
        'http_lib', 'get',
        'https://encrypted.google.com/search?q=%s&num=10000' % quote(query))
    if 'html' not in http:
        result['error'] = 'No server responce'
        return result
    soup = bs4(http['html'])
    for heading in soup.find_all('h3', attrs={'class': 'r'}):
        anchor = heading.find('a')
        # Google wraps the target in a redirect URL; the real address
        # lives in the `q` query parameter.
        redirect = urlparse(anchor['href'])
        target = parse_qs(redirect.query)['q'][0]
        result['result'].append({'title': anchor.text, 'url': target})
    return result
def post(self, request):
    """Compute a Greenblatt "magic number" (ROC - P/E) per requested symbol."""
    symbols = [s.strip() for s in request.POST['symbols'].split(',')]
    roc = []
    pe_ratio = []
    for symbol in symbols:
        symbol = symbol.upper()
        # P/E ratio from the lookup service.
        resp = r.get("http://174.129.18.141/companies/" + symbol + "/pe_ratio")
        soup = bs4(resp.text)
        pe_text = soup.select('span#pgNameVal')[0].text
        pe_parts = re.split('\s+', pe_text)
        for i in range(1):
            pe_ratio.append(float(pe_parts[i]))
        # Return-on-capital (Joel Greenblatt) from gurufocus.
        resp = r.get("http://www.gurufocus.com/term/ROC_JOEL/" + symbol +
                     "/Return%252Bon%252BCapital%252B%252B-%252BJoel%252BGreenblatt/")
        soup = bs4(resp.text)
        for div in soup.select('.data_value'):
            roc.append(float(div.get_text()[:-19]))
    magic_dict = {}
    counter = -1
    for symbol in symbols:
        counter += 1
        magic_dict[symbol] = {"magic number": roc[counter] - pe_ratio[counter]}
    print(magic_dict)
    return JsonResponse({'magic_dict': magic_dict})
def do_grades(self, ln):
    """Fetch the current term's grades and print them as a PrettyTable.

    When not logged in, logs in first and retries itself once.
    Prints a red error line if anything fails while parsing.
    """
    if self.is_login:
        try:
            self.response['grades'] = self.session.get(GRADE_URL)
            # The pre-selected <option> identifies the active term.
            current_term = bs4(self.response['grades'].text, 'lxml') \
                .find('option', selected=True)['value']
            self.request['Donem'] = current_term
            self.request['Ara'] = 'Listele'
            self.response['grades'] = self.session.post(GRADE_URL,
                                                        data=self.request)
            table = bs4(self.response['grades'].text, 'lxml') \
                .find_all('table', 'table table-bordered '
                                   'table-condensed')[0]
            parsed_table = table.findChildren(['th', 'tr'])
            out_table = PrettyTable(
                ['DERS', 'AKTS', 'VIZE', 'FIN', 'BÜT', 'BN'])
            for i in range(1, len(parsed_table)):
                a = [str(n.text).strip()
                     for n in parsed_table[i].findChildren('td')]
                # Course name (text before the parenthesis) + grade columns.
                a = [a[2].split('(')[0]] + a[3:8]
                out_table.add_row(a)
            print(out_table)
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            print(colored(">>>> Notlar işlenirken hata oluştu", 'red'))
    else:
        self.do_login(ln)
        self.do_grades(ln)
def command_price(bot, user, channel, args):
    """.price [Steam game name] - Find whether a currently on-sale Steam game has ever been on sale for cheaper"""
    search = args.replace(" ", "+")
    store = bs4(urllib.urlopen("http://store.steampowered.com/search/?term=%s&category1=998" % search))
    row = store.find(class_="search_result_row")
    appid = row['data-ds-appid']
    history_xml = requests.get("http://steamsales.rhekua.com/xml/sales/app_%s.xml" % appid)
    pricehist = bs4(history_xml.text)
    name = row.find("span", {'class': 'title'}).string
    price_text = row.find(class_="search_price").text
    price_text = price_text[price_text.rfind('$'):]
    current = float(price_text[1:])
    # Walk the price history, tracking the cheapest price seen and the
    # most recent date it occurred on.
    lowest = current
    date = "never"
    for entry in pricehist.find_all('set'):
        past = float(entry['value'])
        if past < lowest:
            lowest = past
            date = entry['name']
        elif past == lowest:
            if not date == "never":
                date = entry['name']
    if lowest == current:
        bot.say(channel, name + " has never been cheaper than the $" + str(current) + " it is right now!")
    else:
        bot.say(channel, name + " is $" + str(current) + " now, but was $" + str(lowest) + " on " + date)
def get_all_city_info(): r = requests.get('http://www.pm25.in/rank') soup = bs4(r.content) citys = soup.find_all('tr')[1:] today = datetime.datetime.now() now = datetime.datetime(today.year,today.month,today.day,today.hour) for c in citys: td = c.find_all('td') print data = {'name':td[1].a.string,'name_py':td[1].a['href'][1:]} data['rank'] = td[0].string data['AQI'] = td[2].string data['level'] = td[3].string data['PM25'] = td[5].string data['PM10'] = td[6].string data['CO'] = td[7].string data['SO2'] = td[11].string data['time'] = now print data mdb.con['air'].pmcn.update( {'name':data['name']}, {'$set':{ 'AQI':data['AQI'], 'PM25':data['PM25'], 'PM10':data['PM10'], 'CO':data['CO'], 'SO2':data['SO2'], 'rank':data['rank'], 'level':data['level'], 'time':data['time'] } }, upsert=True)
def get_image_link_and_title(): try: page = urllib2.urlopen( "http://photography.nationalgeographic.com/photography/photo-of-the-day/") except urllib2.URLError: # Looks like it didn't work, just return from the function # and try again at the next interval print "there was an error opening the url" return None, None soup = bs4(page) try: link = soup.find('div', class_='download_link').find('a').attrs['href'] except AttributeError: #looks like there wasn't a download link. # just grab the low-res image instead try: link = soup.find('div', class_='primary_photo').find('img').attrs['src'] except AttributeError: # couldn't find the right div, # try again next interval (not a robust solution) print "there was an error parsing the html" return None, None # now, prepend http onto the url link = 'http:{0}'.format(link) title = soup.find('div', id='page_head').find('h1').contents[0].encode('ascii', 'ignore') return link, title
def get_details_apprt(url):
    """Scrape one apartment listing page into a details dict.

    Returns None when the page carries no price. Fields whose pattern is
    not found on the page are returned as None.
    """
    r_details = requests.get(url)
    html = bs4(r_details.text, 'html.parser')
    titre = re.sub(r'\s+', ' ', html.select('.page-header.mbm')[0].text)
    prix = html.select('.vi-price-label .amount')
    if len(prix) == 0:
        return None
    prix = prix[0]['title']
    aside_infos = html.select('aside.panel.panel-body.panel-info')[0].text
    # Fix: these four were previously left unbound (NameError at the
    # dict build below) whenever a pattern did not match.
    nb_pieces = surface = secteur = adresse = None
    m = re.match(r'.*?Nombre de pièces: ([0-9]+).*?', aside_infos, re.DOTALL)
    if m:
        nb_pieces = m.group(1)
    m = re.match('.*?Surface: (.*? m²).*?', aside_infos, re.DOTALL)
    if m:
        surface = m.group(1)
    m = re.match(r'.*?Secteur: ([\w_-]+).*?', aside_infos, re.DOTALL)
    if m:
        secteur = m.group(1)
    m = re.match(r".*?Adresse: ([\w'_\s-]+).*?Type", aside_infos, re.DOTALL)
    if m:
        adresse = re.sub(r"\s+", " ", m.group(1))
    date = html.select('.date.dtstart.value')[0]['title']
    annee_mois = extract_date_infos(date)
    annee = annee_mois[0]
    mois = annee_mois[1]
    annonce_details = {'titre': titre, 'prix': prix, 'nb_pieces': nb_pieces,
                       'surface': surface, 'secteur': secteur,
                       'adresse': adresse, 'annee': annee, 'mois': mois}
    return annonce_details
def findfeed(site):
    """Return the RSS/Atom feed URLs discoverable on *site*.

    Looks at <link rel="alternate"> tags and at anchors whose href
    mentions xml/rss/feed, then keeps only candidates that feedparser
    confirms have entries.
    """
    raw = requests.get(site).text
    result = []
    possible_feeds = []
    html = bs4(raw)
    feed_urls = html.findAll("link", rel="alternate")
    # Fix: was `if len(feed_urls) > 1:`, which skipped pages exposing
    # exactly one alternate link.
    for f in feed_urls:
        t = f.get("type", None)
        if t and ("rss" in t or "xml" in t):
            href = f.get("href", None)
            if href:
                possible_feeds.append(href)
    parsed_url = urllib.parse.urlparse(site)
    # Fix: `parsed` was undefined here (NameError); use parsed_url.
    base = parsed_url.scheme + "://" + parsed_url.hostname
    for a in html.findAll("a"):
        href = a.get("href", None)
        if href and ("xml" in href or "rss" in href or "feed" in href):
            possible_feeds.append(base + href)
    for url in list(set(possible_feeds)):
        f = feedparser.parse(url)
        if len(f.entries) > 0 and url not in result:
            result.append(url)
    return result
def categorieIgnore(url, ignoreCategories):
    """Return True when the page's category list hits an ignored category."""
    page = bs4(get(url).text, "lxml")
    links = page.select("#content ul")[1].select("a")
    # Category text looks like "name/count"; compare only the name part.
    return any(link.get_text().split("/")[0] in ignoreCategories
               for link in links)
def get_area(url):
    """Follow the vote-explainer links on a govtrack vote page.

    Returns "Nomination" when there are no links; otherwise delegates to
    get_from() on the first non-member govtrack link. Raises when no
    usable link is found.
    """
    base_url = 'https://www.govtrack.us'
    resp = requests.get(url)
    soup = bs4(resp.text, 'lxml')
    explainer = soup.find('div', {'id': 'vote_explainer'})
    links = explainer.findAll('a')
    if len(links) == 0:
        return "Nomination"
    for anchor in links:
        href = anchor['href']
        # Relative links need the site prefix.
        next_url = href if is_absolute(href) else base_url + href
        if 'members' in next_url:
            continue
        if 'govtrack.us' in next_url:
            return get_from(next_url)
    raise Exception("Couldnt find link to follow!", url)
def ud(self):
    """Look up the given term on Urban Dictionary and chat the definition."""
    if not self.values:
        self.chat("Whatchu wanna know, bitch?")
        return
    try:
        request = pageopen('http://www.urbandictionary.com/define.php',
                           params={'term': ' '.join(self.values)})
        soup = bs4(request.text)
    except Exception:
        # Fix: narrowed from a bare `except:`.
        self.chat("parse error")
        return
    elem = soup.find('div', {'class': 'meaning'})
    try:
        defn = []
        for string in elem.stripped_strings:
            defn.append(string)
    except Exception:
        # Fix: previously fell through after this message and hit the
        # `else` below too, chatting "couldn't find anything" twice.
        self.chat("couldn't find anything")
        return
    if defn:
        # Unfortunately, BeautifulSoup doesn't parse hexadecimal HTML
        # entities like &#x27; so use the parser for any stray entities.
        for paragraph in defn:
            wrapped = textwrap.wrap(paragraph, 200)
            for line in wrapped:
                self.chat(unescape(line))
    else:
        self.chat("couldn't find anything")
def load_html_data(self):
    """Scrape all data from a movie's HTML page"""
    soup = bs4(session.base_get("torrents.php", params={'id': self.ID}).text, "html.parser")
    self.data['Cover'] = soup.find('img', class_='sidebar-cover-image')['src']
    # Title and year live in the page header as "Title [YYYY]".
    title_match = re.match(r'(.*) \[(\d{4})\]',
                           soup.find('h2', class_='page__title').encode_contents())
    self.data['Title'] = title_match.group(1)
    self.data['Year'] = title_match.group(2)
    # Genre tags
    self.data['Tags'] = []
    for tagbox in soup.find_all('div', class_="box_tags"):
        for tag in tagbox.find_all("li"):
            self.data['Tags'].append(tag.find('a').string)
    self.data['Directors'] = [
        {'Name': director.string}
        for director in soup.find('h2', class_='page__title')
                            .find_all('a', class_='artist-info-link')
    ]
    # File list & trumpability per torrent.
    for tor in self['Torrents']:
        filediv = soup.find("div", id="files_%s" % tor.ID)
        tor.data['Filelist'] = {}
        basepath = re.match(r'\/(.*)\/',
                            filediv.find("thead").find_all("div")[1].get_text()).group(1)
        for row in filediv.find("tbody").find_all("tr"):
            # Byte size comes from the hover title, e.g. "1,234 bytes".
            bytesize = row("td")[1]("span")[0]['title'].replace(",", "").replace(' bytes', '')
            filepath = os.path.join(basepath, row("td")[0].string)
            tor.data['Filelist'][filepath] = bytesize
        # Trumpable reasons, if the torrent is flagged at all.
        trump_span = soup.find(id="trumpable_%s" % tor.ID)
        if trump_span:
            tor.data['Trumpable'] = [s.get_text() for s in trump_span.find_all('span')]
        else:
            tor.data['Trumpable'] = []
def command_gb(bot, user, channel, args):
    """.gb upcoming - Returns any posted upcoming items at GiantBomb.com (it's a website about video games)"""
    global videos
    if not args:
        return
    cmds = args.split()
    subcommand = cmds[0]
    # Cached-video subcommands share one shape; dispatch via a table.
    latest = {
        "ql": "Latest QL: %s",
        "feature": "Latest Feature: %s",
        "sub": "Latest Subscriber Content: %s",
        "article": "Latest Article: %s",
        "review": "Latest Review: %s",
        "bombastica": "Latest Bombastica: %s",
    }
    if subcommand in latest:
        bot.say(channel, latest[subcommand] % videos[subcommand])
    elif subcommand == "upcoming":
        page = bs4(urllib.urlopen("http://www.giantbomb.com/"))
        upcoming = page.find("dl", {"class": "promo-upcoming"})
        slots = upcoming.find_all("dd")
        if len(slots) == 0:
            bot.say(channel, "No items on the upcoming list! Alert @GiantBombStats!")
        else:
            if len(cmds) > 1 and cmds[1] == "nopat":
                before = len(slots)
                # Fix: use the `in` operator instead of calling
                # str.__contains__ directly.
                slots = [slot for slot in slots
                         if "Scoops" not in str(slot.find("h4").text)]
                bot.say(channel, "NOPAT MODE ACTIVATED - %s ITEMS ELIMINATED" % (before - len(slots)))
            bot.say(channel, "%d Upcoming Items (times in EST):" % len(slots))
            for slot in slots:
                text = slot.find("h4").text
                time = slot.find("p").text
                bot.say(channel, "%s - %s" % (text, time))
def extract_mosque(self, mosque, page):
    """Extract Mosque.

    Follows the mosque's detail link, harvests the key/value table rows
    whose keys match the Mosque model fields, and reports any field-name
    mismatches via stdout.
    """
    fields = [field.attname.replace('_id', '')
              for field in Mosque._meta.fields if field.attname != 'id']
    # Normalize curly quotes to straight apostrophes.
    mosque_text = re.sub(u"(\u2018|\u2019)", "'", mosque.text)
    mosque_link = '/'.join([root_url, mosque.find('a').get('href').split('../')[-1]])
    log_text = '\nWriting {} to file, from page {}'
    self.stdout.write(log_text.format(mosque_text, page))
    mosque_page = bs4(requests.get(mosque_link).content, 'html.parser')
    rows_selector = '#mosque_info_contents table table tr'
    mosque_info_rows = mosque_page.select(rows_selector)
    values = {}
    # page is a giant table, so go over the rows
    for row in mosque_info_rows:
        cells = row.find_all('td')
        # check we have the right fields
        try:
            key = cells[0].text.replace(':', '').lower().strip().replace(' ', '_')
        except (IndexError, AttributeError):
            # Fix: removed leftover `import pdb; pdb.set_trace()` debug
            # hook; a row without a usable key cell is simply skipped.
            continue
        if len(cells) == 2 and key in fields:
            values[key] = cells[1].text
    name_address = mosque_page.select('#results h1')
    matches = re.match(r'(?P<name>[^(]*)\(', name_address[0].text)
    values['name'] = matches.group('name').strip()
    values['rating'] = mosque_page.select('.star_rating strong')[0].text
    values['mdpk'] = mosque_link.split('/')[-1]
    # Symmetric difference highlights scraped keys not on the model and
    # model fields that were not found on the page.
    self.stdout.write(str(set(values.keys()) ^ set(fields)))
def update_dict():
    """
    getting the info from the archive page. url="http://xkcd.com/archive/"
    """
    archive_page = requests.get(ARCHIVE_URL)
    if archive_page.status_code != 200:
        print("Something bad happened!")
        return
    archive_soup = bs4(archive_page.content, "html.parser")
    # Every comic link sits under <div class="box" id="middleContainer">.
    for data in archive_soup.find_all("div", {"id": "middleContainer"}):
        for alinks in data.find_all("a"):
            # hrefs look like "/3/"; keep just the number, e.g. "3".
            href = alinks.get("href").strip("/")
            date = alinks.get("title")
            description = alinks.contents[0]
            make_keyvalue_list(href, date, description)
    with open(xkcd_dict_filename, "w") as f:
        json.dump(XKCD_DICT, f)
    print(
        "XKCD link database updated\nStored it in '{file}'. You can start downloading your XKCD's!\nRun 'xkcd-dl --help' for more options".format(
            file=xkcd_dict_filename
        )
    )
def getAllLaunches():
    """Scrape Space Flight Insider's launch schedule and emit calendar events.

    Skips entries without a hard date or with a TBD time; each surviving
    entry is converted and handed to writeEvent().
    """
    launchEntry = 0
    tableDepth = 0
    launchEvents = list()
    global listCount
    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    try:
        # Grab the entire page
        launchCalHandle = urllib.request.urlopen('http://www.spaceflightinsider.com/launch-schedule/')
        launchCalHtml = launchCalHandle.read()
        soup = bs4(launchCalHtml, 'html.parser')
        # Normalize each launch entry to a single-spaced string so the
        # field extraction below can use stable split() markers.
        for launchEvent in soup.body.find_all(launch_table)[1:]:
            listCount += 1
            launchFields = list()
            launchString = re.sub(' +', ' ', launchEvent.prettify().replace('\n', ' ').replace('\r', ''))
            # Launch ID
            launchFields.append(launchString.split('"launchcalendar" id="')[1].split('"> <tr>')[0].strip())
            # Date; bypass entries without a month name (not hard-scheduled).
            launchFields.append(launchString.split('</span> <span>')[1].split(' </span>')[0].strip())
            # Fix: collapsed the 12-clause month test into one any().
            if not any(month in launchFields[-1] for month in months):
                continue
            # Time; bypass TBD / missing times.
            if "Time" not in launchString:
                continue
            timeText = launchString.split('<th> Time </th> <td>')[1].split(' </td>')[0].strip()
            if "TBD" in timeText:
                continue
            for timeField in splitTimeFields(timeText):
                launchFields.append(timeField)
            # Location
            launchFields.append(launchString.split('<th> Location </th> <td>')[1].split('</td>')[0].strip())
            # Satellite
            launchFields.append(launchString.split('<th colspan="2">')[1].split('</th>')[0].strip())
            # Launch vehicle (hoisted the thrice-repeated split; strip
            # <wbr> soft-break markup when present).
            vehicle = launchString.split('<br/>')[1].split('</td>')[0].strip()
            if "<wbr>" in vehicle:
                vehicle = re.sub(' </wbr>', '', re.sub(' <wbr> ', '', vehicle))
            launchFields.append(vehicle)
            # Description
            launchFields.append(launchString.split('"description" colspan="2"> <p>')[1].split('</p>')[0].strip())
            # Convert Stored Data to writeEvent()
            writeEvent(convertLaunchData(launchFields))
    except urllib.error.HTTPError:
        print("There was an error accessing the Space Flight Insider Launch Schedule.")
        print("The server could be down or having issues. Try again.")
    except urllib.error.URLError:
        print("There was an error decoding the URL for the Space Flight Insider Launch Schedule. :::nodename not known :::")
        print("Check that your computer has access to the Internet.")
def contest_leaders(self):
    """Get data on who's winning"""
    LOGGER.debug("Fetching contest leaderboard")
    soup = bs4(session.base_get("contestleaders.php").content, "html.parser")
    rows = soup.find('table', class_='table--panel-like').find('tbody').find_all('tr')
    # One (name, score) pair per leaderboard row.
    return [(row.find_all('td')[1].get_text(), row.find_all('td')[2].get_text())
            for row in rows]
def _get_volumes_0_1(self, html):
    """Parse volume links for layout 0/1 pages; return title-sorted (title, url) pairs."""
    soup = bs4(html, 'lxml')
    entries = soup.select_one('.vol > .bl').find_all('li')
    volumes = []
    for entry in entries:
        absolute_url = 'http://{}{}'.format(self.domain, entry.a.attrs['href'])
        volumes.append((entry.a.text, absolute_url))
    return self._sort_vol_by_title(volumes)
def categorie_ignore(url, ignore_categories):
    """ ignores defined categories of wallappers """
    soup = bs4(get(url).text, "lxml")
    anchors = soup.select('#content ul')[1].select('a')
    # Category text looks like "name/count"; compare only the name part.
    return any(anchor.get_text().split('/')[0] in ignore_categories
               for anchor in anchors)
def log(self):
    """Gets the PTP log"""
    soup = bs4(session.base_get('/log.php').content, "html.parser")
    entries = []
    for row in soup.find('table').find('tbody').find_all('tr'):
        stamp = row.find('span', class_='time')['title']
        message = row.find('span', class_='log__message').get_text().lstrip().encode('UTF-8')
        entries.append((stamp, message))
    return entries
def check_review(query):
    """Fetch *query* and return the list of <p> tags found on the page."""
    paragraphs = bs4(requests.get(query).text, 'html.parser').findAll('p')
    # NOTE(review): the stripped text below is computed but never stored
    # or returned; preserved verbatim so the observable behavior
    # (returning the raw tag list) is unchanged.
    for tag in paragraphs:
        cleaned = str(tag).replace('<p>', '')
        cleaned = str(cleaned).replace('</p>', '')
    return paragraphs
def MainSearch(keyword, base="http://seeker.dice.com",
               tail="/jobsearch/servlet/JobSearch?op=100&NUM_PER_PAGE=5&FREE_TEXT="):
    """ Get job listings from main keyword search and returns bs """
    resp = urllib.request.urlopen(base + tail + keyword)
    # Decode using the charset the server declared, if any.
    charset = resp.info().get_param('charset')
    return bs4(resp.read(), from_encoding=charset)
def getCareerHistory(url):
    """Return (career history, corporate info, memberships) markup as strings."""
    # NOTE(review): `re.get` suggests an HTTP client imported as `re`
    # (requests alias?) rather than the regex module — confirm imports.
    response = re.get(url)
    soup = bs4(response.text, 'html.parser')
    career = soup.findAll("div", {"class": "markets_module bio_career"})
    corporate = soup.findAll("div", {"class": "markets_module corporate_info"})
    memberships = soup.findAll("div", {"class": "markets_module bio_membership"})
    return str(career), str(corporate), str(memberships)
def shows(self):
    """Read shows into list of Shows."""
    source = requests.get(self.base_url).text
    soup = bs4(source, "html.parser")
    # Titled anchors without a class attribute are the show links.
    show_tags = [tag for tag in soup.select("a[title]")
                 if not tag.has_attr("class")]
    return [Show(tag["title"], tag["href"]) for tag in show_tags]
def html_to_text(html):
    """Return readable text from *html*, preferring Wikipedia's body div."""
    soup = bs4(html)
    body = soup.find(id="bodyContent")
    if body:
        # Wikipedia page: restrict to the article body.
        return body.get_text()
    # Fall back onto just grabbing all text.
    return soup.get_text()
def stats(self):
    """
    Return all stats associated with a user

    :rtype: A dictionary of stat names and their values, both in string format.
    """
    soup = bs4(
        session.base_get('user.php', params={'id': self.ID}).text, "lxml")
    stats = {}
    # "Stats" panel: take every entry unconditionally.
    for item in soup.find('span', text='Stats').parent.parent.find_all('li'):
        key, val = self.__parse_stat(item.text)
        stats[key] = val
    # "Personal" panel: only entries that actually carry a value.
    for item in soup.find('span', text='Personal').parent.parent.find_all('li'):
        key, val = self.__parse_stat(item.text)
        if val:
            stats[key] = val
    # "Community" panel: several entries are renamed and/or split into a
    # with-deleted and without-deleted pair formatted as "a (b)".
    for item in soup.find('span', text='Community').parent.parent.find_all('li'):
        key, val = self.__parse_stat(item.text)
        if key == "Uploaded":
            parts = re.search(r'(.*) \((.*)\)', val)
            stats["UploadedTorrentsWithDeleted"] = parts.group(1)
            val = parts.group(2)
            key = "UploadedTorrents"
        elif key == "Downloaded":
            key = "DownloadedTorrents"
        elif key == "SnatchesFromUploads":
            parts = re.search(r'(.*) \((.*)\)', val)
            stats["SnatchesFromUploadsWithDeleted"] = parts.group(1)
            val = parts.group(2)
        elif key == "AverageSeedTime(Active)":
            key = "AverageSeedTimeActive"
        stats[key] = val
    return stats
def getContent(self, ARTICLE_List, record):
    """Render each article URL with PhantomJS and collect parsed news rows.

    URLs already present in *record* are skipped, as are articles not
    dated today. Returns rows of [articleID, url, title, datetime, content].
    """
    newsLists = []
    articleIDList = []
    driver = webdriver.PhantomJS()
    for articleURL in ARTICLE_List:
        if articleURL in record:
            continue
        sys.stdout.write('\r ' + ' ' * 65)
        sys.stdout.write('\r URL: ' + articleURL[:69])
        # Random delay between fetches to stay polite.
        t.sleep(random.randint(5, 8))
        # Fix: the driver.get() return value was stored in an unused
        # local (`r`); dropped it.
        driver.get(articleURL)
        pageSource = driver.page_source
        soup = bs4(pageSource, 'html.parser')
        news = soup.find(class_='main-container')
        content = ""
        title = str(news.find('p').text)
        time = re.split('/', news.find(class_='date-display-single').text)
        datetime = '/'.join(time[:3]) + ' 00:00'
        article = news.find(
            class_='node node-post node-promoted clearfix').findAll('p')
        # Filter out articles that are not dated today (fix: dropped the
        # dead `else: pass` branch).
        if t.strftime('%Y/%m/%d', t.localtime()) not in datetime:
            continue
        for contents in article:
            content += contents.text
        # Build a unique ID from the date digits; bump until unused.
        articleID = ''.join(time) + '0000000'
        while articleID in articleIDList:
            articleID = str(int(articleID) + 1)
        articleIDList.append(articleID)
        articleID = 'cld' + articleID
        newsLists.append([articleID, articleURL, title, datetime, content])
    return newsLists
def APOPND_Ovencoffee_crawler(self):
    """Crawl Ovencoffee store pages 000-999 and dump name/phone/hours to CSV."""
    ContentList = []
    # Store IDs are three zero-padded digits: "000" through "999".
    for store_number in range(1000):
        page = '%03d' % store_number
        url = "http://www.ovencoffee.com.tw/store_list.asp?storeid=" + page
        # Build a retrying, non-keep-alive session, since many
        # consecutive connections are opened here.
        res = requests.Session()
        res.keep_alive = False
        retry = Retry(connect=5, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        res.mount('https://', adapter)
        res = res.get(url, headers=headers)
        res.encoding = ("utf-8")
        soup = bs4(res.text, 'html.parser')
        data = soup.findAll('p')
        # Only pages with a non-empty store name are real stores.
        if data[1].text != "" and data[1].text is not None:
            name = data[1].text
            phone = re.split(':', data[2].text)[1]
            business_time = str.join('', re.split(':', data[3].text))
            ContentList.append([name, phone, business_time])
    Filename = 'APOPND_Ovencoffee.csv'
    storage_dir = "data/csv/"
    df = pd.DataFrame(data=ContentList, columns=['Name', 'Phone', 'Business_time'])
    df.to_csv(storage_dir + Filename, sep=',', encoding='utf_8_sig', index=False)
def scrap(link):
    """Scrape 1X2/double-chance, goal/no-goal and under-over odds for one match.

    Returns a DataFrame with columns tipo/casoF/casoV; on any scraping
    error the session is closed and the whole scrape is retried.
    """
    try:
        session = HTMLSession()
        with session.get(link) as res:
            res.html.render()
            soup = bs4(res.html.html, 'html5lib')
            risultato = pd.DataFrame(columns=['tipo', 'casoF', 'casoV'])
            tags = soup.findAll("div", {"class": "TabellaEsitiRow-hvzh1w-0 jxwiSe"})
            # Double-chance markets: fixed button positions per market.
            esdop = tags[0].findAll("div", {"class": "EsitoButton-mp5c0x-0 dQZBRx"})
            for tipo, f_idx, v_idx in (("1X-2", 3, 2), ("12-X", 5, 1), ("2X-1", 4, 0)):
                new_row = {'tipo': tipo,
                           'casoF': esdop[f_idx].getText(),
                           'casoV': esdop[v_idx].getText()}
                risultato = risultato.append(new_row, ignore_index=True)
            # Goal / no-goal market.
            gng = tags[1].findAll("div", {"class": "EsitoButton-mp5c0x-0 dQZBRx"})
            new_row = {'tipo': "GOL/NOGOL",
                       'casoF': gng[1].getText(),
                       'casoV': gng[0].getText()}
            risultato = risultato.append(new_row, ignore_index=True)
            # Under/over thresholds 0.5 .. 5.5 share the same column index.
            over = tags[2].findAll("div", {"class": "TabellaColumn-nrcwsc-0 iJTAjk"})
            under = tags[3].findAll("div", {"class": "TabellaColumn-nrcwsc-0 iJTAjk"})
            for i in range(6):
                new_row = {'tipo': "UNDER/OVER %d.5" % i,
                           'casoF': under[i].getText(),
                           'casoV': over[i].getText()}
                risultato = risultato.append(new_row, ignore_index=True)
            return risultato
    except:
        session.close()
        print("Errore nella ricerca DATI su SISAL, cerco di nuovo")
        return scrap(link)
def scrapCampionato(num):
    """Return a DataFrame ['giorno', 'ora', 'match', 'link'] of the fixtures
    listed on the num-th SISAL league page; retries (recursively) until the
    page yields at least one fixture.
    """
    # italia, champions e europa, inghilterra, spagna, germania, francia, olanda, portogallo
    campionato = ["https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:21",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:22",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:18",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:153",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:86",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:1",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:79",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:137",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:4",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:3",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:14",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:15",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:29",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:30",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:54",
                  "https://www.sisal.it/scommesse-matchpoint?filtro=0&schede=man:1:190"]
    risultato = pd.DataFrame(columns=['giorno', 'ora', 'match', 'link'])
    session = HTMLSession()
    with session.get(campionato[num]) as res:
        res.html.render()
        soup = bs4(res.html.html, 'html5lib')
        # FIX: the original passed a *set* literal ({"TabellaEsitiRow..."}) as the
        # attrs argument; bs4 expects a dict, so match the class attribute explicitly.
        partite = soup.findAll("div", {"class": "TabellaEsitiRow-hvzh1w-0 eyTFpO"})
        for partita in partite:
            match = partita.find("span", {"class": "AvvenimentoDescription-rieyuj-0 clFosV"}).getText().strip()
            dataora = partita.find("span", {"class": "AvvenimentoDate-giucxs-0 iaSisn"}).getText().strip().split(" ")
            link = "https://www.sisal.it" + partita.find("a", {"class": "AvvenimentoDetailWrapper-w9f4wf-0 bhgtKE"}).get("href")
            ora = dataora[2]
            data = dataora[0].split("/")
            new_row = {'giorno': data[0], 'ora': ora, 'match': match, 'link': link}
            risultato = risultato.append(new_row, ignore_index=True)
    session.close()  # FIX: the original leaked the HTMLSession on every call
    if len(risultato) != 0:
        return risultato
    else:
        print("SISAL RIPROVO")
        return scrapCampionato(num)
def get_data(html): item_list = [] soup = bs4(html, 'lxml') positions = soup.find_all( 'div', class_= 'products-view-block js-products-view-block products-view-block-static' ) for position in positions: pos_article = position.find('div', class_='col-xs-8 align-right').text pos_name = position.find('span', class_='products-view-name-link').text pos_price = position.find('div', class_='price-number').text data = { 'article': norm_article(pos_article), 'name': norm_name(pos_name), 'price': norm_price(pos_price) } # print(f'Артикул: {norm_article(pos_article)}') # print(f'Наименование: {norm_name(pos_name)}') # print(f'Цена: {norm_price(pos_price)} руб.') # print('=' * 20) # print(data) item_list.append(data) return item_list
def get_ranks_by_nikname(platform, nikname): first_role = 'Unranked (2vs2)' second_role = 'Unranked (3vs3)' try: response = requests.get(url % (platform, nikname)) html = response.text soup = bs4(html, 'html.parser') for tab in soup.find_all('table'): if 'Playlist' in str(tab): table = tab break playTable = table trs = playTable.find_all('tr') for tr in trs[1:]: if 'Ranked Doubles 2v2' in str(tr.find_all('td')[1]): r2v2 = tr.small first_role = (str(r2v2).split('\n')[1] + ' (2vs2)').strip() if 'Ranked Standard 3v3' in str(tr.find_all('td')[1]): r3v3 = tr.small second_role = (str(r3v3).split('\n')[1] + ' (3vs3)').strip() return (first_role, second_role) except: return False
def insta(username, gecko_path): """ Download images from instagram """ link = URL + username print("Downloading images {}...".format(username)) with Browser("firefox", headless=True, executable_path=gecko_path) as browser: browser.visit(link) html = browser.html soup = bs4(html, "html.parser") data = soup.findAll("img") for x in data: x = x.get("src") os.system( f"wget --no-check-certificate -c -N -P Images/{username} {x}") print("Downloaded {}".format(x)) def rename_image_dir(foldername): i = 1 dirName = os.path.join("./Images/", foldername) path = os.getcwd() + dirName for filename in os.listdir(dirName): if not filename.endswith(".jpg"): os.rename( os.path.join(path, filename), os.path.join(path, foldername + '_' + str(i) + ".jpg")) i += 1 rename_image_dir(username) print("\nFiles downloaded into Images/{}".format(username))
def get_baike_text(hospital_list,urls): headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Host': 'baike.baidu.com', "Referer": "https://baike.baidu.com/", 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36' } for i in range(len(urls)): html = req.get(urls[i], headers=headers).content.decode('utf8') soup=bs4(html,'lxml') des = soup.find('div', {'class': 'lemma-summary'}) if not des is None: des=re.sub('\[[0-9]{1,2}\]', "", des.get_text()).replace('\n','').replace('\xa0','')\ .replace('"','“').replace("'","‘") sql='update hospital set description="'+des+'" where name="'+hospital_list[i]+'"' print('剩余'+str(len(urls)-i)+':'+urls[i]) cursor.execute(sql) db.commit() else: print('未收录:'+urls[i]) continue
def scrape_search_results(areas): ''' scrapes search page, collects information about the cars available for sale. ''' results = [] search_indices = np.arange(0, 300, 100) for area in areas: print area for i in search_indices: url = 'http://sfbay.craigslist.org/search/{0}/cta'.format(area) resp = requests.get(url, params={'hasPic': 1, 's': i}) txt = bs4(resp.text, 'html.parser') cars = txt.findAll(attrs={'class': "row"}) tags=txt.findAll('img') img_tags = "\n".join(set(tag['src'] for tag in tags)) title = [rw.find('a', attrs={'class': 'hdrlnk'}).text for rw in cars] links_raw = [rw.find('a', attrs={'class': 'hdrlnk'})['href'] for rw in cars] links = ['http://sfbay.craigslist.org'+car_link for car_link in links_raw] # find the time and the price time = [pd.to_datetime(rw.find('time')['datetime']) for rw in cars] price = find_prices(cars) # create a dataframe to store all the data data = np.array([time, price, title, links]) col_names = ['time', 'price', 'title', 'link'] df = pd.DataFrame(data.T, columns=col_names) # add the location variable to all entries df['loc'] = area results.append(df) # concatenate all the search results results = pd.concat(results, axis=0) return results
def main(prefix, docnum): #获取提单号 # prefix= '784' # docnum = '27400365' open_text() CZ._data['ctl00$ContentPlaceHolder1$txtPrefix'] = prefix CZ._data['ctl00$ContentPlaceHolder1$txtNo'] = docnum r = requests.post(CZ.url + CZ.prefix + prefix + CZ.awbno + docnum + CZ.menuid + CZ.lang, data=CZ._data) content = bs4(r.content, 'html.parser') table = content.find_all('table') status = list(table[2].stripped_strings) flight = [] for i in range(len(status)): if re.search(r'[\dA-Z]{4,10}', status[i]) and re.search( r'[-\d]{8,10}', status[i + 1]): d = { 'air_code': '%s' % status[i], '_dep_port': '%s' % status[i - 2], '_dest_port': '%s' % status[i - 1], 'airline_comp': 'CZ', } flight.append(d) status = list(table[4].stripped_strings) j = 0 for i in range(len(status)): if 'Cargo has been loaded' in status[i]: flight[j].update({'_atd': '%s' % status[i - 3]}) flight[j].update({'_std': '%s' % status[i - 3]}) if 'Flight has arrived' in status[i]: flight[j].update({'_ata': '%s' % status[i - 3]}) flight[j].update({'_sta': '%s' % status[i - 3]}) j += 1 return flight
def get_lyrics(artist, song): if not isinstance(artist, str): raise TypeError("The artist name should be a string") if not isinstance(song, str): raise TypeError("The song name should be a string") artist_name, song_name = _clean_names(artist, song) # print(artist_name, song_name) url = _create_url(artist_name, song_name) try: page = _get_page(url) except ValueError: return [] soup = bs4(page, "html.parser") mydivs = soup.find("div", {"class": "ringtone"}) lyrics = mydivs.find_next_sibling("div") # Use the .stripped_strings generator to remove all extra whitespace # and strings consisting only of whitespace lyric_list = [text for text in lyrics.stripped_strings] return lyric_list
def scraper(pgs1, pgs2, srt): pages = [] prices = [] stars = [] titles = [] data = {'Title': titles, 'Price': prices, 'Rating': stars} for numPgs in range(int(pgs1), int(pgs2) + 1): url = ( 'http://books.toscrape.com/catalogue/category/books_1/page-{}.html' .format(numPgs)) pages.append(url) for item in pages: page = requests.get(item) soup = bs4(page.text, 'html.parser') for iterA in soup.findAll('h3'): ttl = iterA.getText() titles.append(ttl) for iterB in soup.findAll('p', class_='price_color'): price = iterB.getText() prices.append(price) for iterC in soup.findAll('p', class_='star-rating'): for key, value in iterC.attrs.items(): star = value[1] stars.append(star) if (srt == "title"): titles.sort() elif (srt == "price"): prices.sort() elif (srt == "rating"): stars.sort() df = pd.DataFrame(data=data) return df
def scrap(link): try: session = HTMLSession() with session.get(link) as res: res.html.render() soup = bs4(res.html.html, 'html5lib') tags= soup.findAll("table",{"class":"table table-bordered table-condensed table-striped table-hover margin-bottom-10 ng-scope"}) risultato=pd.DataFrame(columns=['tipo','casoF','casoV']) esatto=tags[0].findAll("span",{"class":"ng-binding ng-scope"}) doppia=tags[1].findAll("span",{"class":"ng-binding ng-scope"}) new_row = {'tipo':"1X-2", 'casoF':doppia[0].getText().strip(), 'casoV':esatto[2].getText().strip()} risultato = risultato.append(new_row, ignore_index=True) new_row = {'tipo':"12-X", 'casoF':doppia[2].getText().strip(), 'casoV':esatto[1].getText().strip()} risultato = risultato.append(new_row, ignore_index=True) new_row = {'tipo':"2X-1", 'casoF':doppia[1].getText().strip(), 'casoV':esatto[0].getText().strip()} risultato = risultato.append(new_row, ignore_index=True) tipo=tags[2].find("div",{"class":"pull-left ng-binding"}).getText().strip() t=tags[2].findAll("span",{"class":"ng-binding ng-scope"}) quotaF=t[1].getText().strip() quotaV=t[0].getText().strip() new_row = {'tipo':tipo, 'casoF':quotaF, 'casoV':quotaV} if "GOL" in tipo: risultato = risultato.append(new_row, ignore_index=True) for tag in tags[3:12]: tipo=tag.find("div",{"class":"pull-left ng-binding"}).getText().strip() t=tag.findAll("span",{"class":"ng-binding ng-scope"}) quotaF=t[0].getText().strip() quotaV=t[1].getText().strip() new_row = {'tipo':tipo, 'casoF':quotaF, 'casoV':quotaV} if "UNDER" in tipo: risultato = risultato.append(new_row, ignore_index=True) return risultato except: print("Errore nella ricerca DATI su SNAI, cerco di nuovo") session.close() return scrap(link)
def __detect_virtual_pairs(self): """Auto-detect virtual pairs by their record file header.""" virtual_pairs = [] # RegEx for matching pair number and names in pair record header pair_header_match = re.compile('([0-9]{1,}): (.*) - (.*), .*') for record_file_path in self.__pair_records_files: log.getLogger('detect').debug('examining record file %s', record_file_path) with file(record_file_path) as record_file: record = bs4(record_file, 'lxml') # first <td class="o1"> with content matching # pair header is what we're after header = [con for con in record.select('td.o1')[0].contents if isinstance(con, NavigableString) and re.search( pair_header_match, con)] log.getLogger('detect').debug('detected header: %s', header) if len(header): header_match = re.match(pair_header_match, header[0]) pair_number = int(header_match.group(1)) names = [name for name in [header_match.group(2).strip(), header_match.group(3).strip()] if len(name)] log.getLogger('detect').debug('parsed header: %d, %s', pair_number, names) # virtual pair does not have any names filled if len(names) == 0: virtual_pairs.append(pair_number) if len(virtual_pairs) == 0: log.getLogger('detect').warning('No virtual pairs detected') else: log.getLogger('detect').info('virtual pairs: %s', ' '.join(sorted( [str(pair) for pair in virtual_pairs]))) return sorted(virtual_pairs)
def fill_pair_list_table(cells, row_cell_count=20): """Format cell list into well-formed rows, aligned by column count.""" content = bs4('<table />', 'lxml') content.append(content.new_tag('table')) # first filler cell of each new row first_cell = content.new_tag('td', **{'class': 'n'}) first_cell.string = u'\xa0' # arrange cells into rows, full rows first while len(cells) >= row_cell_count: new_row = content.new_tag('tr') new_row.append(copy.copy(first_cell)) for cell in cells[0:row_cell_count]: new_row.append(cell) content.table.append(new_row) log.getLogger('rec_list').debug('aligning cells %s to %s in a row', cells[0].a.contents, cells[row_cell_count-1].a.contents) del cells[0:row_cell_count] # last row may or may not be full last_row = content.new_tag('tr') last_row.append(copy.copy(first_cell)) for cell in cells: last_row.append(cell) log.getLogger('rec_list').debug('leaving cells %s to %s in last row', cells[0].a.contents, cells[-1].a.contents) # if it wasn't full, fill it with a col-spanned last cell if len(cells) < row_cell_count: last_cell = content.new_tag('td', colspan=row_cell_count-len(cells)) last_cell.string = u'\xa0' last_row.append(last_cell) log.getLogger('rec_list').debug('filling last row with: %s', last_cell) content.table.append(last_row) return content.table.contents[:]
def videoDetail(videoUrl): try: itemPage = bs4(req.get(videoUrl).text, 'html.parser') except expression as e: # print(e) print('=====> request failed/ check network connection!') choices = [ i['aria-label'].split(' ')[-1] for i in itemPage.select('.menu-list .link a') ] downloadLinks = {} for itemLink in itemPage.select('.menu-list .link a'): downloadLinks[itemLink['aria-label'].split(' ')[-1]] = itemLink['href'] questions = [ inq.List( 'quality', message="\U0001F914 Select quality", choices=choices, ), ] answer = inq.prompt(questions) itemPageDownloadLink = downloadLinks[answer['quality']] itemTitle = f"{itemPage.select('#videoTitle')[0].text}-{answer['quality']}" return itemTitle, itemPageDownloadLink
def inbox(self, page=1): """Fetch a list of messages from the user's inbox Incidentally update the number of messages""" soup = bs4( session.base_get('inbox.php', params={ 'page': page }).text, "html.parser") self.new_messages = self.__parse_new_messages(soup) for row in soup.find(id="messageformtable").tbody.find_all('tr'): yield { 'Subject': row.find_all('td')[1].text.encode('UTF-8').strip(), 'Sender': row.find_all('td')[2].text, 'Date': row.find_all('td')[3].span['title'], 'ID': re.search(r'id=(\d+)', row.find_all('td')[1].a['href']).group(1), 'Unread': True if 'inbox-message--unread' in row['class'] else False }
def getStock(code, query): reply = '' stock = [[ 'Time', 'Market Price', 'Buy', 'Sell', 'Rise&Fall', 'Volume', 'Previous Close', 'Open', 'High', 'Low' ]] driver = webdriver.PhantomJS( executable_path= 'D:\\Anaconda3.6\\Scripts\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe' ) x = code r = driver.get('https://tw.stock.yahoo.com/q/q?s=' + x) pageSource = driver.page_source soup = bs4(pageSource, 'html.parser') tables = soup.find_all('table') row = tables[5].find_all('td')[0].findAll('tr')[1].findAll('td')[1:-1] for item in row: row[row.index(item)] = item.text.strip() if query == '市價': reply = row[1] elif query == '買價': reply = row[2] elif query == '賣價': reply = row[3] elif query == '成交量': reply = row[5] elif query == '前一天收盤價': reply = row[6] elif '開盤' in query: reply = row[7] elif '最高' in query: reply = row[8] elif '買低' in query: reply = row[9] return (code + ' 的' + query + '是 ' + reply + 'ㄛ~~~~<3')
def parse(html): res = { '身高测量': -1, '体重测量': -1, '肺活量': -1, '50米跑': -1, '立定跳远': -1, '1000米跑': -1, '800米跑': -1, '坐体前屈': -1, '仰卧起坐': -1, '引体向上': -1, '左眼视力': -1, '右眼视力': -1 } html = bs4(html, 'html.parser') tr = html.find_all('tr') for i in tr: td = i.find_all('td') if not td: continue if td[0].text in res: res[td[0].text] = td[1].text return res
def get_followers(self): time.sleep(2) #open followers link of followers on any account # right click followers link on the page then right click on the selected script and copy selector flw_btn = WebDriverWait(self.driver, 10).until( EC.presence_of_element_located(( By.CSS_SELECTOR, '#react-root > section > main > div > header > section > ul > li:nth-child(2) > a' ))) flw_btn.click() # When the pop up of followers appears webpage changes # so we click on follower button, then pop up appears then we click inspect , then we check the divison which get highlighted when we scroll down, copy that CSS popup = WebDriverWait(self.driver, 10).until( EC.presence_of_element_located( (By.CSS_SELECTOR, 'body > div.RnEpo.Yx5HN > div > div.isgrP'))) for i in range(0, 10): time.sleep(1) self.driver.e xecute_script( 'arguments[0].scrollTop = arguments[0].scrollHeight', popup ) #excecuting javascripts, this tells us scroll down to the bottom of the followers list. So the scroller is at the bottom popup = WebDriverWait(self.driver, 10).until( EC.presence_of_element_located( (By.CSS_SELECTOR, 'body > div.RnEpo.Yx5HN > div > div.isgrP'))) print('cool') b_popup = bs4(popup.get_attribute('innerHTML'), 'html.parser') for p in b_popup.findAll( 'li', {'class': 'woI9H' }): # findAll means find all the attributes under that tag print( p.findAll('a')[0]['href'] ) # So li tags means all the followers and 'a' tag is in that li tag to and can be used to find href of the li or followers to get to their link by savuing all href in a li print('Awesome') print('end')
class WorldCovid: def __init__(self): self.today = ctime(time()) self.scrape_url = "https://www.worldometers.info/coronavirus/" self.datafile = f"./data/World/new_covid_dat.csv" self.appendfile = f"./data/World/world_timeseries/{self.today[4:10]+self.today[-5:]}.csv" self.is_updated = path.isfile(self.appendfile) def getData(self) -> list, list: page = requests.get(self.scrape_url) html = bs4(page.text, 'html.parser') table = html.find(id="main_table_countries_today") thead_all = table.thead.find_all('th') thead = [th.text for th in thead_all] tbody_all = table.find_all('tbody') tr_temp = [tr for tr in tbody_all[0].find_all('tr')] td_temp = [td.find_all('td') for td in tr_temp] tbody = [[j.text.strip() for j in i] for i in td_temp] return thead, tbody
def getblog(value): try: #getting all the values if (value == 'all'): x = requests.get('https://www.freecodecamp.org/news/').text else: # getting the response based on particular tag x = requests.get(f"https://www.freecodecamp.org/news/tag/{value}").text soup=bs4(x,'lxml') hack = soup.find_all('article',class_ = 'post-card') #intitializing a dictionary val={} val["dic"]=[] for i in range(0,len(hack)): data={} data["Tag"] = hack[i].find('span',class_='post-card-tags').text.strip(' \t\n\r') data["Blog-Title"] = hack[i].find('h2',class_='post-card-title').text.strip(' \t\n\r') data["Blog-link"] = hack[i].find('a',class_='post-card-image-link').get('href') data["Blog-link"]="https://www.freecodecamp.org"+data["Blog-link"] data["Author"] = hack[i].find('a',class_='meta-item').text.strip(' \t\n\r') val["dic"].append(data) return val except Exception as e: return {"status":False,"error":e}
def main(self): params = dict(pets_cat=1, max_price=2000) rsp = requests.get(self.url, params=params) html = bs4(rsp.text, 'html.parser') apts = html.find_all('p', attrs={'class': 'row'}) for apt in apts: # print apt.prettify() size = apt.findAll(attrs={'class': 'housing'})[0].text sqft, beds = self.find_size_and_bdrms(size) self.apartment['sqft'] = sqft self.apartment['beds'] = beds self.apartment['updated_datetime'] = apt.find('time')['datetime'] self.apartment['price'] = float(apt.find('span', {'class': 'price'}).text.strip('$')) self.apartment['title'] = apt.find('a', attrs={'class': 'hdrlnk'}).text self.apartment['url'] = 'h'+self.url.strip('/search/apa') + apt.find('a', attrs={'class': 'hdrlnk'})['href'] info = self.get_more_info(self.apartment['url']) for k,v in self.apartment.iteritems(): print k,v print '\n' exit() time.sleep(1)
song_list.append(song) i += 2 for i in range(len(song_list)): print(str(i + 1) + ': ' + song_list[i]['name']) choice = int(input('Enter the song number to download: ')) # todo: add choice to download entire album also # todo: check for invalid input choice -= 1 return song_list[choice]['url'], song_list[choice]['name'] if __name__ == '__main__': url = 'https://www.starmusiq.fun/search/search-for-blocked-movies-starmusiq.html' query = input('Enter album name: ') search_result = requests.get(url, params={'query': query}) soup = bs4(search_result.text, 'html.parser') albums_container = soup.find("div", {"id": "search_albums"}) search_result_links = albums_container.findAll( 'a', {'class': 'label label-danger'}) choice = display_results(search_result_links) while choice[0] != 1 and choice[0] != 2: choice = display_results(search_result_links) if choice[0] == 1: url = choice[1] album_page = requests.get(url) soup = bs4(album_page.text, 'html.parser') links = soup.findAll( 'a', { 'style': 'background:#cb413f;color:#fff;line-height:39px;padding:8px 6px;text-decoration:' + 'none;border-radius:.25em; font-weight:700;'
#pp = pprint.PrettyPrinter(indent=4).pprint chrome_options = Options() #chrome_options.add_extension("proxy.zip") chrome_options.add_argument("--incognito") driver = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_options) url = 'https://xxxxxx' #url2 = 'https://xxxxxx' url3 = 'https://xxxxxx/Extracted/' driver.get(url) driver.get(url3) page = driver.page_source soup = bs4(page, 'html.parser') parsesoup = re.findall( r"(.*href)*=\"(.*)(mkv).*(.mkv\")(.*)(\d{4}-[a-zA-Z]{3}-\d{2} \d{2}:\d{2})", page) tables = soup.findChildren('table') tables1 = tables[0] rows = tables1.findChildren(['tr']) dates = [] for row in rows: cells = row.findChildren('td') for cell in cells: value = cell.string dates.append(value)
def addnewtemplate():
    """Flask view: create, preview and delete per-business e-mail templates.

    POST branches: 'templateview' previews a template, 'editordata' saves a
    new one (links rewritten to 'replacelink'), 'selecttemplate' deletes one.
    Falls through to rendering the template list.
    """
    def templatelookup():
        # list template names visible to this business (own or shared)
        business = str(session['business'])
        con = sqlite.connect('db/db1.db')
        with con:
            cur = con.cursor()
            # NOTE(review): PRAGMA built by string concat — dbkey is internal,
            # but a parameterized form would be safer
            cur.execute('PRAGMA key = ' + dbkey + ';')
            templatelist = []
            for row in cur.execute(
                    'select name from templates where business = (?) or shared = 1;',
                    (business, )):
                templatelist.append(row[:][0])
        con.close  # NOTE(review): missing parens — connection is not closed
        return templatelist

    # ensure this business's template directory exists
    Path("./templates/businesses/" + str(session['business'])).mkdir(
        parents=True, exist_ok=True)
    if request.method == "POST":
        print(request.form.get('templateview'))
        if str(request.form.get('templateview')) != 'None':
            searchtemplates = templatelookup()
            print(searchtemplates)
            print(request.form.get('templateview'))
            templateview = request.form.get('templateview')
            if templateview == 'prototype2':
                # the built-in default template lives outside the business dir
                templateview = '/templates/prototype2.html'
                searchtemplates = templatelookup()
                return render_template('addtemplate.html',
                                       searchtemplates=searchtemplates,
                                       templateview=templateview)
            else:
                # encoded path for business-owned templates
                templatecustom = 'businesses+^+' + session[
                    'business'] + '+^+' + templateview + '.html'
                searchtemplates = templatelookup()
                return render_template('addtemplate.html',
                                       searchtemplates=searchtemplates,
                                       templatecustom=templatecustom)
        if request.form.get('editordata') != None:
            try:
                savehtml = request.form.get('editordata')
                soup = bs4(savehtml)
                # neutralise all links in the stored template
                for a in soup.findAll('a'):
                    a['href'] = "replacelink"
                    a['data-saferedirecturl'] = 'replacelink'
                savehtml = str(soup)
                savehtmlnam = str(request.form.get('templatename'))
                savehtmlnam = savehtmlnam.replace(' ', '_')
                savehtmlname = savehtmlnam + '.html'
                templatesubject = request.form.get('templatesubject')
                if os.path.isfile('./templates/businesses/' +
                                  session['business'] + '/' + savehtmlname):
                    flash('A template with this name already exists',
                          'category2')
                    return render_template("addtemplate.html",
                                           searchtemplates=searchtemplates)
                else:
                    with open(
                            './templates/businesses/' + session['business'] +
                            '/' + savehtmlname, 'w') as f:
                        f.write(savehtml)
                    con = sqlite.connect('db/db1.db')
                    with con:
                        cur = con.cursor()
                        cur.execute('PRAGMA key = ' + dbkey + ';')
                        cur.execute(
                            'insert into templates (business, name, emailsubject) VALUES (?,?,?);',
                            (session['business'], savehtmlnam,
                             templatesubject))
                        con.commit  # NOTE(review): missing parens — no explicit commit
                    con.close  # NOTE(review): missing parens — connection is not closed
                    flash('Submitted!', 'category2')
                    return render_template("addtemplate.html",
                                           searchtemplates=searchtemplates)
            except:
                # NOTE(review): bare except silently swallows save failures
                searchtemplates = templatelookup()
        if request.form.get('selecttemplate') != 'Templates':
            if request.form.get('selecttemplate') != None:
                selecttemplate = request.form.get('selecttemplate')
                if selecttemplate == 'prototype2':
                    flash('No deleting default templates', 'category2')
                else:
                    con = sqlite.connect('db/db1.db')
                    with con:
                        cur = con.cursor()
                        cur.execute('PRAGMA key = ' + dbkey + ';')
                        cur.execute(
                            'delete from templates where business LIKE (?) and name LIKE (?);',
                            (
                                session['business'],
                                selecttemplate,
                            ))
                    con.close()
                    os.remove('./templates/businesses/' + session['business'] +
                              '/' + selecttemplate + '.html')
                    flash('Deleted!', 'category2')
    searchtemplates = templatelookup()
    print(searchtemplates)
    return render_template("addtemplate.html",
                           searchtemplates=searchtemplates)
def download_one(xkcd_dict, xkcd_num):
    """Download a single xkcd comic (image + description.txt) into
    WORKING_DIRECTORY/xkcd_archive/<num>.

    Skips the image for comics in excludeList; renames the file to match
    its real type as reported by libmagic.  Side effect: chdir into the
    comic's folder when it is newly created.
    """
    if not xkcd_dict:
        return None
    xkcd_number = str(xkcd_num)
    if xkcd_number in excludeList:
        downloadImage = False
        print('{num} is special. It does not have an image.'.format(
            num=xkcd_number))
        '''
        [2] Some comics are special and either don't have an image or have a
        dynamic one. The full list is the array excludeList and needs to be
        manually updated upon the release of such a comic.
        '''
    else:
        downloadImage = True
    if xkcd_number in xkcd_dict:
        date = xkcd_dict[xkcd_number]['date-published']
        description = xkcd_dict[xkcd_number]['description']
        new_description = sanitize_description(description)
        new_folder = '{current_directory}/xkcd_archive/{name}'.format(
            current_directory=WORKING_DIRECTORY, name=xkcd_number)
        to_download_single = "{base}/{xkcd_num}/".format(base=BASE_URL,
                                                         xkcd_num=xkcd_number)
        print(
            "Downloading xkcd from '{img_url}' and storing it under '{path}'".
            format(img_url=to_download_single, path=new_folder))
        # alt text comes from the comic's JSON endpoint
        alt = requests.get(to_download_single + 'info.0.json').json()['alt']
        if os.path.exists(new_folder):
            print("xkcd number '{num}' has already been downloaded!".format(
                num=xkcd_number))
        else:
            os.makedirs(new_folder)
            os.chdir(new_folder)
            with open('description.txt', 'w') as f:
                content = """title : {description}
date-published: {date}
url: {url}
alt: {altText} \n""".format(description=description,
                            date=date,
                            url=to_download_single,
                            altText=alt)
                f.write(content)
            image_page = requests.get(to_download_single, stream=True)
            if downloadImage:
                if image_page.status_code == 200:
                    image_page_content = image_page.content
                    image_page_content_soup = bs4(image_page_content,
                                                  'html.parser')
                    # the comic image lives in <div id="comic">
                    for data in image_page_content_soup.find_all(
                            "div", {"id": "comic"}):
                        for img_tag in data.find_all('img'):
                            img_link = img_tag.get('src')
                            # src is protocol-relative (//imgs.xkcd.com/...)
                            complete_img_url = "http:{url}".format(
                                url=img_link)
                            file_name = "{description}.jpg".format(
                                description=new_description)
                            r = requests.get(complete_img_url, stream=True)
                            if r.status_code == 200:
                                with open(file_name, 'wb') as f:
                                    r.raw.decode_content = True
                                    shutil.copyfileobj(r.raw, f)
                            else:
                                print("Error with connectivity. HTTP error {}".
                                      format(r.status_code))
                            # rename the file to its true type per libmagic
                            magic_response = str(
                                magic.from_file(file_name, mime=True))
                            if 'png' in magic_response:
                                os.rename(
                                    file_name, "{description}.png".format(
                                        description=new_description))
                            elif 'jpeg' in magic_response:
                                os.rename(
                                    file_name, "{description}.jpeg".format(
                                        description=new_description))
                            elif 'gif' in magic_response:
                                os.rename(
                                    file_name, "{description}.gif".format(
                                        description=new_description))
    else:
        print("{} does not exist! Please try with a different option".format(
            xkcd_number))
def truePeopleSearch():
    """Scrape truepeoplesearch.com for every person in the module-level beneDict.

    For each unique name (value[1]) it pages through the search results,
    collects detail-page links whose card text contains value[5], then scrapes
    each detail page for name/age/address/phone. When the site serves a
    captcha, a Tk dialog blocks until the user solves it manually in a
    browser, after which the page is re-fetched.

    NOTE(review): depends on module-level state not visible here — beneDict
    (value[0] appears to be a match key, value[1] the full name, value[2] a
    city/state/zip, value[5] a match token — TODO confirm), plus requests,
    bs4, difflib and tkinter names (Tk, Label, Button, W).

    Returns:
        dict: fullDict keyed by value[0]; each entry maps a running counter
        to an info dict with keys name/age/address/match/origFullName/
        phone/source/diff.
    """
    print("\n STEP 2: Webscraping")
    # create url variable of web address
    url='https://www.truepeoplesearch.com/'
    # prepare df for webscrape
    print('\n Starting truepeoplesearch.com')
    #zone = [', AL',', AR',', GA',', LA',', MS',', NC',', TN',', WV'] # could possibly add an area that would emcumpass possible resutls
    #area = ['AL','AR','GA','LA','MS','NC','TN','WV']
    # define dedup function to dedup lists
    def dedup(seq):
        """ removes duplicate values from a list or duplicate characters from a string """
        # seen_add is bound once so the comprehension avoids a repeated
        # attribute lookup; seen_add(x) returns None (falsy) so the `or`
        # both records and admits first occurrences.
        if type(seq) == list:
            seen = set()
            seen_add = seen.add
            return [x for x in seq if not (x in seen or seen_add(x))]
        elif type(seq) == str:
            seen = set()
            seen_add = seen.add
            return ''.join([x for x in seq if not (x in seen or seen_add(x))])
        else:
            # NOTE(review): falls through returning None for other types.
            print("Currently function can only handle lists and strings")
    # create empty dictionary and list
    # ps, hardest webscraped i have ever done. the captch problem makes it even worse
    fullDict = {}
    nameList = []   # names already searched (dedup across beneDict entries)
    for key, value in beneDict.items():
        nameDict = {}
        counter = -1    # running index of result records for this person
        if value[1] not in nameList:
            print('Searching TruePeopleSearch for %s'% value[1])
            nameList.append(value[1])
            splitName = '%20'.join(value[1].split())    # URL-encode spaces
            # for k, kvalue in enumerate(replaceList)
            cleanName = value[1]
            first = value
            # splitName = 'john%20smith'
            # print(splitName)
            flag = 'Y'      # set to 'N' once a page lacks the next-page button
            pageCount = 0
            while flag == 'Y':
                # print(flag)
                pageCount +=1
                tempURL = url+'results?name='+splitName+'&citystatezip=%s&page=%s' % (value[2], pageCount)
                pageContent = requests.get(tempURL).content
                # time.sleep(1)
                # print(tempURL)
                if 'captchasubmit?returnUrl' in str(pageContent):
                    # print('first')
                    # Captcha wall: loop a blocking Tk dialog until the
                    # re-fetched page no longer redirects to the captcha.
                    while 'captchasubmit?returnUrl' in str(pageContent):
                        print('captcha found')
                        def afterCaptcha():
                            # Re-fetch after the user solved the captcha in a
                            # browser; rebinding via `global` so the enclosing
                            # while-condition sees the fresh content.
                            global captcha, pageContent
                            pageContent = requests.get(tempURL).content
                            print("Program will continue until another captcha is requested")
                            captcha.destroy()
                        def quitTkinterBox():
                            global captcha
                            captcha.destroy()
                            print("raise SystemExit")
                        captcha = Tk()
                        captcha.lift()
                        captcha.attributes('-topmost',True)
                        captcha.after_idle(captcha.attributes,'-topmost',False)
                        # messagebox.showinfo("*** Warning ***", "The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n solve the captcha")
                        Label(captcha, text="*** Warning ***, The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n solve the captcha. When you have finished, \n come back to this window and please press continue.").grid(row=0, sticky = W)
                        Button(captcha, text='Continue', command=afterCaptcha).grid(row=4, column=1, sticky=W, pady=1)
                        Button(captcha, text='Quit Program and Exit', command= quitTkinterBox).grid(row=4, column=2, sticky=W, pady=4)
                        captcha.mainloop()
                        print("Continuing")
                        pageContent = requests.get(tempURL).content
                        # time.sleep(1)
                    soup = bs4(pageContent, "html.parser")
                    # print(soup)
                    linkList = []   # detail-page hrefs found on this results page
                    diffList = []   # name-similarity ratio per collected link
                    if str(soup).find('btnNextPage') == -1:
                        flag = 'N'  # no next-page button: this is the last page
                    for card in soup.find_all(attrs= {'class':'card card-block shadow-form card-summary'}):
                        if str(value[5]) in card.text:
                            for h4 in card.find_all(attrs= {'class':'h4'}):
                                # print(value[1])
                                # print(h4.text.strip().upper())
                                result = difflib.SequenceMatcher(None, value[1], h4.text.strip().upper()).ratio()
                                # print(result)
                            # NOTE(review): `result` ends up being the ratio of
                            # the LAST h4 in the card — confirm that is intended.
                            for a in card.find_all('a'):
                                if 'name' in a['href']:
                                    if 'page' not in a['href']:
                                        if a['href'] not in linkList:
                                            # if result > .5:
                                            diffList.append(result)
                                            linkList.append(a['href'])
                        else:
                            None
                else:
                    # No captcha: identical scrape as the branch above
                    # (duplicated in the original source).
                    soup = bs4(pageContent, "html.parser")
                    # print(soup)
                    linkList = []
                    diffList = []
                    if str(soup).find('btnNextPage') == -1:
                        flag = 'N'
                    for card in soup.find_all(attrs= {'class':'card card-block shadow-form card-summary'}):
                        if str(value[5]) in card.text:
                            for h4 in card.find_all(attrs= {'class':'h4'}):
                                # print(value[1])
                                # print(h4.text.strip().upper())
                                result = difflib.SequenceMatcher(None, value[1], h4.text.strip().upper()).ratio()
                                # print(result)
                            for a in card.find_all('a'):
                                if 'name' in a['href']:
                                    if 'page' not in a['href']:
                                        if a['href'] not in linkList:
                                            # if result > .5:
                                            diffList.append(result)
                                            linkList.append(a['href'])
                        else:
                            None
                # Visit each collected detail link and scrape contact info.
                for i, ivalue in enumerate(linkList):
                    counter += 1
                    infoDict = {}
                    tempURL = url+linkList[i]
                    pageContent2 = requests.get(tempURL).content
                    # time.sleep(1)
                    if 'captchasubmit?returnUrl' in str(pageContent2):
                        # print('first')
                        # Same captcha dance as above, for the detail page.
                        while 'captchasubmit?returnUrl' in str(pageContent2):
                            print('captcha found')
                            def afterCaptcha():
                                global captcha, pageContent2
                                pageContent2 = requests.get(tempURL).content
                                # time.sleep(1)
                                # print("Failed to solve captcha. Ending program. Please try again.")
                                print("Program will continue until another captcha is requested")
                                captcha.destroy()
                            def quitTkinterBox():
                                global captcha
                                captcha.destroy()
                                print("raise SystemExit")
                            captcha = Tk()
                            captcha.lift()
                            captcha.attributes('-topmost',True)
                            captcha.after_idle(captcha.attributes,'-topmost',False)
                            # messagebox.showinfo("*** Warning ***", "The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n solve the captcha")
                            Label(captcha, text="*** Warning ***, The website has requested a captcha \n please go to https://www.truepeoplesearch.com/ and manually \n solve the captcha. When you have finished, \n come back to this window and please press continue.").grid(row=0, sticky = W)
                            Button(captcha, text='Continue', command=afterCaptcha).grid(row=4, column=1, sticky=W, pady=1)
                            # Button(master, text='Run', command=importNums).grid(row=4, column=1, sticky=W, pady=1)
                            Button(captcha, text='Quit Program and Exit', command= quitTkinterBox).grid(row=4, column=2, sticky=W, pady=4)
                            captcha.mainloop()
                            print("Continuing")
                        soup = bs4(pageContent2, "html.parser")
                        phoneList = []
                        # print(ivalue)
                        # NOTE(review): attrs here are SETS ({'class','h2'}),
                        # not dicts — bs4 treats set members as attribute
                        # names to match; confirm this matches the intended
                        # elements on the live page.
                        infoDict['name'] = soup.find(attrs= {'class','h2'}).text.strip()
                        infoDict['age'] = soup.find(attrs= {'class','content-value'}).text.strip()
                        infoDict['address'] = soup.find(attrs= {'link-to-more','link-to-more'}).text.strip()
                        infoDict['match'] = value[0]
                        infoDict['origFullName'] = value[1]
                        for a in soup.find_all('a'):
                            if 'phoneno' in a['href']:
                                # phone number is the query-string value after '='
                                phone = a['href'][a['href'].find('=')+1:]
                                if phone not in phoneList:
                                    phoneList.append(phone)
                        infoDict['phone'] = phoneList
                        infoDict['source'] = 'TPS'
                        infoDict['diff'] = diffList[i]
                    else:
                        # No captcha on the detail page: identical scrape
                        # (duplicated in the original source).
                        soup = bs4(pageContent2, "html.parser")
                        phoneList = []
                        # print(ivalue)
                        infoDict['name'] = soup.find(attrs= {'class','h2'}).text.strip()
                        infoDict['age'] = soup.find(attrs= {'class','content-value'}).text.strip()
                        infoDict['address'] = soup.find(attrs= {'link-to-more','link-to-more'}).text.strip()
                        infoDict['match'] = value[0]
                        infoDict['origFullName'] = value[1]
                        for a in soup.find_all('a'):
                            if 'phoneno' in a['href']:
                                phone = a['href'][a['href'].find('=')+1:]
                                if phone not in phoneList:
                                    phoneList.append(phone)
                        infoDict['phone'] = phoneList
                        infoDict['source'] = 'TPS'
                        infoDict['diff'] = diffList[i]
                    nameDict[counter] = infoDict
                fullDict[value[0]] = nameDict
        else:
            None
    return fullDict
driver = webdriver.PhantomJS(executable_path=r'S:\DA_work_files\DA_Work_Python\phantomjs-2.1.1-windows\bin\phantomjs.exe') driver.set_window_size(1124, 850) # set browser size. # use driver to get url driver.get(url) #Find the search box and input the name nameInput = driver.find_element_by_id('fullName') nameInput.send_keys(name) # click on search button submit = driver.find_element_by_css_selector('button.btn.btn-lg.btn-block').click() #get current page page_content = requests.get(driver.current_url).content soup = bs4(page_content, "html.parser") try: #finding the number or total results for line in soup.find_all("span",class_="ThatsThem-results-preheader"): results1 = ''.join(line.find_all(text=True)) results=results1[7:10] results=int(results.strip()) # Grab name, Address, Phone number, Email for each result name=[] address1=[] address2=[] address3=[]