def get_professor_info(professor_id):
    url = FINAL_RATE_MY_PROFESSOR_URL + str(professor_id)
    # Optional guard: bail out early if the professor page has no rating breakdown.
    # if len(scrap_info_from_url(url, "div", "rating-breakdown")) == 0:
    #     return None
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    grades = soup.find_all("div", {"class": "grade"})
    first_name = soup.find_all("span", {"class": "pfname"})[0].text.strip()
    last_name = soup.find_all("span", {"class": "plname"})[0].text.strip()
    department = soup.find_all("div", {"class": "result-title"})[0].text.strip()
    professor_info = {
        'first_name': first_name,
        'last_name': last_name,
        'department': department.split('department')[0].split('fessor in the')[1].strip(),
        'overall_quality': grades[0].text.strip(),
        'would_take_again': grades[1].text.strip(),
        'level_of_difficulty': grades[2].text.strip(),
        'url': url,
    }
    return professor_info
def execute(self):
    result = {'scraped_datetime': str(datetime.datetime.now())}
    try:
        if self.exec_type == 'GET':
            site = requests.get(self.url, verify=False)
            for key, selector in self.selectors.__dict__.items():
                if selector is None:
                    continue
                field = key.replace('_selector', '')
                if selector.parser_type == 'bs4':
                    try:
                        page = BeautifulSoup(site.text, features="lxml")
                        node = page.select_one(selector.selector)
                        if selector.attribute is None:
                            result[field] = node.get_text()
                        else:
                            result[field] = node.get(selector.attribute)
                        if selector.processor is not None:
                            result[field] = selector.processor(node.get_text())
                    except Exception as e:
                        result[field] = str(e)
                else:
                    # lxml path: Element.findall() is the ElementPath API here.
                    try:
                        page = html.fromstring(site.text)
                        node = page.findall(selector.selector)[0]
                        result[field] = node.text
                        if selector.processor is not None:
                            result[field] = selector.processor(node.text)
                    except Exception as e:
                        result[field] = str(e)
        else:
            result = self.call()
    except Exception as e:
        print(e)
    if self.result_postprocessor is not None:
        result = self.result_postprocessor(result)
    result['region'] = self.region
    result['source_type'] = 'individual_website'
    print("{} : {}".format(self.region, result))
    return result
def Xampp():
    # Check whether the XAMPP install path already exists.
    xampp_dir = Path("/opt/lampp/")
    if not xampp_dir.exists():
        choice = input("Do you have the xampp installer on this computer (YES/NO): ")
        if choice in ['yes', 'YES', 'y', 'Y']:
            print("Please move the file to %s \n" % os.getcwd())
            input("PRESS ANY KEY WHEN THE COPY IS DONE: ")
            os.system("mv *.run xampp.run")
        else:
            url = 'https://www.apachefriends.org'
            dst = 'xampp.run'
            r = requests.get(url)
            soup = BeautifulSoup(r.text, 'html.parser')
            for x in soup.find_all('a'):
                link = x.get('href')  # extract href links from results
                if link and re.search("xampp-linux", link):  # look for the Linux installer
                    url = link  # use the installer link that was found
            print("\033[31m.............DOWNLOADING XAMPP PLEASE WAIT...............")
            urlretrieve(url, dst, MyProgressBar())
        # Instructions to execute after the decision above.
        # pwd is the sudo password, defined elsewhere in the module.
        os.system("echo %s | sudo -S chmod +x xampp.run" % pwd)
        os.system("echo %s | sudo -S ./xampp.run" % pwd)
        os.system("echo %s | sudo -S ln -s /opt/lampp/bin/php /usr/local/bin/php" % pwd)
        os.system("echo %s | sudo -S /opt/lampp/xampp start" % pwd)
    else:
        print("\n xampp install already exists. installing other requirements ..... \n \n")
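# The download above passes a MyProgressBar() instance as urlretrieve's
# reporthook, but that class is not defined in this snippet. A minimal sketch
# of what it could look like -- the reporthook signature is fixed by
# urlretrieve; the body is an illustrative assumption:
class MyProgressBar:
    def __call__(self, block_num, block_size, total_size):
        # urlretrieve calls hook(block_count, block_size, total_size)
        if total_size > 0:
            percent = min(100, block_num * block_size * 100 // total_size)
            print("\rDownloading: %d%%" % percent, end="", flush=True)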
def scrape():
    # Initialize browser
    browser = init_browser()

    # Create an empty list to store scraped dog-friendly restaurant data
    dog_scraped_data = []

    # BringFido dog-friendly restaurants
    # (https://www.bringfido.com/restaurant/city/san_francisco_ca_us/)
    dog_url = "https://www.bringfido.com/restaurant/city/san_francisco_ca_us/"
    browser.visit(dog_url)
    time.sleep(3)

    dog_html = browser.html
    dog_soup = BeautifulSoup(dog_html, 'html.parser')

    # Each result lives in a div.info block; the name is in its first span.
    for info in dog_soup.find_all("div", class_="info"):
        dog_scraped_data.append(info.find("span").text)

    browser.quit()
    return dog_scraped_data
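# scrape() relies on an init_browser() helper that is not defined in this
# snippet. A minimal sketch, assuming Splinter with a Chrome driver (the
# browser choice and the headless option are illustrative assumptions):
def init_browser():
    from splinter import Browser
    # Headless Chrome keeps the scrape from opening a visible window.
    return Browser("chrome", headless=True)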
class WikiParse(object):
    def __init__(self, **kwargs):
        self.keyword = kwargs.get('keyword')
        self.obj = wikipedia.page(self.keyword)
        self.soup = BeautifulSoup(self.obj.html(), 'html.parser')

    # WARNING: assumes every band page has an infobox and that all relevant
    # information appears under the "Background Information" subheader.
    def find_background_info(self):
        infoboxes = self.soup.find_all(attrs={'class': 'infobox'})
        background_found = False
        data = {}
        for infobox in infoboxes:
            for tr in infobox.find_all('tr'):
                th = tr.find_all('th')
                if not th:
                    continue
                th_str = th[0].string
                if not background_found:
                    if th_str and th_str == 'Background Information':
                        background_found = True
                else:
                    td = tr.find_all('td')[0]
                    td_str = td.string
                    if td_str:
                        data[th_str] = td_str
                    else:
                        alinks = td.find_all('a')
                        data[th_str] = ','.join(a.get_text() for a in alinks)
        return data

    def find_inspiration(self):
        sentences = self.obj.content.split('.')
        syn_list = SynWord.objects.get(name='inspiration').synonym
        key_sentences = []
        for sentence in sentences:
            for syn_word in syn_list:
                if syn_word.name in sentence:
                    key_sentences.append(sentence)
        return key_sentences

    def connected_nodes(self):
        sentences = self.find_inspiration()
        parents = self.find_connections(' '.join(sentences))
        print 'PARENTS: %s' % parents

    # Initial assumption: Wikipedia articles rarely mention whom the band
    # influenced, which sidesteps the harder problem of determining the
    # influencer within a sentence.
    def find_connections(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        proper_nouns = [x for x, y in tagged if y in KEY_NOUNS]
        return proper_nouns
def spider(maximum_pages):
    page = 1
    while page <= maximum_pages:
        # Append the page number to the address of the website to be crawled.
        url = 'paste your web address here' + str(page)
        # requests.get() fetches the page; the response body is HTML text.
        source_code = requests.get(url)
        plain_text = source_code.text
        # BeautifulSoup parses the HTML string into a navigable tree.
        soup = BeautifulSoup(plain_text, 'html.parser')
        # Collect every href on the page. Links use the 'a' tag; you can
        # narrow the search by passing class or other attributes.
        for link in soup.find_all('a'):
            # Join the site address with the href to form a complete link.
            href = "paste your web address here" + link.get('href')
            title = link.string  # the link's title text
            print(href)
            print(title)
        page += 1  # move on to the next page
def get_city_urls():
    # The base urls for scraping - the school list is paginated.
    with open("spot_states.txt") as f:
        base_urls = f.read().splitlines()

    school_district_urls = []

    # Create a list of individual school district urls.
    for url in base_urls:
        with open("spot_area_urls.txt") as f:
            states = f.read().splitlines()
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        main_table = soup.find_all(
            class_="table table-condensed table-striped table-hover text-left")
        # Pull every link out of the results table.
        all_links = main_table[0].find_all('a')
        for cur_link in all_links:
            cur_url = cur_link.get('href')
            for i in range(0, 50):
                print(states[i])
                length = len(states[i])
                if cur_url[0:length] == states[i] and len(cur_url) > length:
                    # url_pre and url_post are module-level constants.
                    school_district_urls.append(url_pre + cur_url + url_post)
                    break
    write_to_csv(school_district_urls, "school_district_urls2.csv")
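# write_to_csv() is not defined in this snippet. A minimal sketch matching
# the call above (one URL per row); the implementation is an assumption:
import csv

def write_to_csv(rows, filename):
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f)
        for row in rows:
            writer.writerow([row])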
def bsoup():
    r = requests.get(
        "https://answers.yahoo.com/question/index?qid=20080613085817AAqvcNW")
    soup = BeautifulSoup(r.content, "html.parser")
    string = soup.find_all("div", {"class": "group"})
    print soup.find_all(re.compile("^[A-Z]"))
    print soup.prettify()
    print string
def get_new_yorker_reports(end_year=2010):
    base_url = 'https://www.newyorker.com/magazine/reporting/page/'
    page_num = 1
    conn = sqlite3.connect('./magazine_features.db')
    cur = conn.cursor()
    done = False
    while not done:
        url = f'{base_url}{page_num}'
        r = requests.get(url)
        page = BeautifulSoup(r.text, features='html.parser')
        articles = page.find_all('li', class_='River__riverItem___3huWr')
        for article in articles:
            issue_date = article.find('div', class_='River__issueDate___2DPuc')
            issue_link = issue_date.find('a', class_='Link__link___3dWao')['href']
            # Stop completely once issues are older than end_year.
            if int(issue_link.split('/')[2]) < end_year:
                done = True
                break
            date = '-'.join(issue_link.split('/')[2:])
            byline_div = article.find('div', class_='Byline__by___37lv8')
            authors = byline_div.find_all('a')
            for author in authors:
                row = ('new_yorker', date, author.text)
                cur.execute('INSERT INTO authors VALUES (?,?,?)', row)
            conn.commit()
            print(f'Inserted for issue {date}')
        page_num += 1
    conn.close()
    return 'success!'
def main():
    # url = 'https://imgur.com/a/LpH5UiD'
    # tid = 1
    c = False
    tid = int(sys.argv[1])
    url = urllib.parse.unquote(sys.argv[2])
    r = requests.get(url, proxies=config.PROXIES, allow_redirects=True)
    if r.headers['Content-Type'].split('/')[0] == 'image':
        # ext = r.headers['Content-Type'].split('/')[1]
        create_thumbnail(r, tid)
        c = True
    else:
        soup = BeautifulSoup(r.text, 'html.parser')
        image = soup.find('meta', property='og:image')
        try:
            # Prefer the page's Open Graph image.
            iurl = image.get('content', None)
            r = requests.get(iurl, proxies=config.PROXIES, allow_redirects=True)
            if r.headers['Content-Type'].split('/')[0] == 'image':
                # ext = r.headers['Content-Type'].split('/')[1]
                create_thumbnail(r, tid)
        except Exception:
            try:
                # Fall back to the favicon.
                icon_link = soup.find("link", rel="shortcut icon")
                r = requests.get(icon_link['href'], proxies=config.PROXIES,
                                 allow_redirects=True)
                create_thumbnail(r, tid)
            except Exception:
                # Last resort: guess the largest <img> on the page.
                imgs = soup.find_all('img')
                guess = 0
                src = ''
                limit = 0
                for im in imgs:
                    try:
                        limit += 1
                        if limit > 15:
                            break
                        try:
                            height = int(im.attrs.get('height', None))
                            width = int(im.attrs.get('width', None))
                        except Exception:
                            height = 1
                            width = 1
                        isrc = im.attrs.get('src', None)
                        if (height * width) > guess:
                            src = isrc
                            guess = height * width
                    except Exception:
                        pass
                if src != '':
                    r = requests.get(src, proxies=config.PROXIES)
                    create_thumbnail(r, tid)
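# create_thumbnail() is not defined in this snippet. A minimal sketch,
# assuming Pillow and that tid names the output file -- both are
# illustrative assumptions, not the original implementation:
from io import BytesIO
from PIL import Image

def create_thumbnail(response, tid, size=(128, 128)):
    img = Image.open(BytesIO(response.content))
    img.thumbnail(size)  # resizes in place, preserving aspect ratio
    img.convert("RGB").save("thumb_{}.jpg".format(tid), "JPEG")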
def xtest_we_have_the_right_page(self):
    r = self.client.get(self.url)
    soup = BeautifulSoup(r.content, 'html.parser')
    req_text = "Add Fire Behaviour Calculation document"
    h1_tags = soup.find_all('h1')
    print(h1_tags)
    assert any(tag.string == req_text for tag in h1_tags)
def print_to_text(base_url):
    import requests
    from bs4 import BeautifulSoup
    r = requests.get(base_url)
    soup = BeautifulSoup(r.text, features="html.parser")
    with open("less.txt", "w") as f:
        for paragraph in soup.find_all(dir='ltr'):
            # .text already strips tags, so no further cleanup is needed.
            f.write(paragraph.text)
def AnalizeReserveList():
    topurl = "https://www.lib.city.kobe.jp/opac/opacs/reservation_cancel_confirmation?reservation_order_confirmation=%e9%a0%86%e4%bd%8d%e7%a2%ba%e8%aa%8d"
    html = open("book.htm", "r").read()
    sp = BeautifulSoup(html, "html.parser")
    table = sp.find_all("table")
    rows = table[0].find_all("tr")
    for row in rows:
        cell = row.find_all("td")[2]  # the reservation number is in the third column
        print(cell)
def parse(self, response):
    # Use the xml parser so RSS tags such as pubDate keep their case.
    rss = BeautifulSoup(response.body, "xml")
    for item in rss.find_all("item"):
        feed_item = ChinanewsCrawlItem()  # build an item object
        feed_item['title'] = item.title.text  # the fields below come from the page source
        feed_item['link'] = item.link.text
        feed_item['desc'] = item.description.text
        feed_item['pub_date'] = item.pubDate.text
        yield feed_item  # yield each item to the pipeline
def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for item_name in soup.find_all('div', {'class': 'notranslate'}):
        print(item_name.string)
    for link in soup.find_all('a'):
        href = "https://www.ebay.com" + link.get('href')
        print(href)
def get_single_item_date(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for item_name in soup.find_all('div', {'class': 'i-name'}):
        print(item_name.string)
    for link in soup.find_all('a'):
        href = "http://www.imdb.com" + link.get('href')
        print(href)
def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            # extend, not append: re.findall returns a list of matched codes
            lst.extend(re.findall(r'[s][hz]\d{6}', href))
        except:
            continue
def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = ('http://www.imdb.com/chart/toptv/?ref_=nv_tvv_250_4/page+'
               + str(page))
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.find_all('a', {'class': 'item.name'}):
            href = 'http://www.imdb.com' + link.get('href')
            title = link.string
            print(href)
            print(title)
            get_single_item_date(href)
        page += 1
def getStockList(lst, stockURL):
    html = getHTMLText(stockURL, 'GB2312')
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    # Not every <a> tag matches the regular expression, and parsing can raise
    # all kinds of errors along the way; if an exception occurs the tag is
    # outside what we want, so simply continue with the next one.
    for i in a:
        try:
            href = i.attrs['href']
            lst.extend(re.findall(r'[s][hz]\d{6}', href))
        except:
            continue
    return ""
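# Both getStockList variants above depend on a getHTMLText helper that is
# not defined here. A minimal sketch matching the two call sites (a URL plus
# an optional page encoding); the implementation is an assumption:
def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code  # decode the page with the caller-supplied encoding
        return r.text
    except Exception:
        return ""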
def print_zillow_price(address, city, state):
    address = address.replace(', Cary', '')
    query_url = 'http://www.zillow.com/webservice/GetSearchResults.htm?zws-id={0}&address={1}&citystatezip={2}'
    query_url = query_url.format(ZILLOW_KEY, address, city + ',' + state)
    r = urllib.request.urlopen(query_url)
    doc = BeautifulSoup(r, 'xml')
    for item in doc.find_all('message'):
        print(item)
def scrape(self):
    r = urllib.request.urlopen(self.site)
    html = r.read()
    parser = 'html.parser'
    sp = BeautifulSoup(html, parser)
    for tag in sp.find_all('a'):
        url = tag.get('href')
        if url is None:
            continue
        if 'html' in url:
            print('\n' + url)
def TLD_specific_search(document, is_raw_content):
    TLD = top_level_domain_pattern(document, is_raw_content)
    raw_content = document["raw_content"]
    # raw_content = document
    if TLD and raw_content:
        soup = BeautifulSoup(raw_content, 'html.parser')
        content = ""
        if TLD == "escortcafe.com":
            content = soup.find_all("div", class_="details")
            # print type(content)
            # print re.findall('Blonde', content)
        elif TLD == "classifriedads.com":
            content = soup.find_all(id="contentcell")
        elif TLD == "slixa.com":
            content = (soup.find_all("div", class_="span9 profile-content")
                       + soup.find_all("aside", class_="profile-sidebar span3"))
        # elif TLD == "allsexyescort.com":
        elif TLD == "escort-ads.com":
            content = soup.find_all("div", class_="container main-content vip-content")
        # elif TLD == "liveescortreviews.com":
        # elif TLD == "escort-europe.com":
        elif TLD == "find-escorts.com":
            content = soup.find_all(id="contentcell")
        elif TLD == "escortserv.com":
            content = soup.find_all(id="index")
        elif TLD == "slixa.ca":
            content = (soup.find_all("div", class_="span9 profile-content")
                       + soup.find_all("aside", class_="profile-sidebar span3"))
        elif TLD == "escortpost.com":
            content = soup.find_all(id="content")
        elif TLD == "privateescorts.ro":
            content = soup.find_all("tbody")
        elif TLD == "adultsearch.com":
            content = soup.find_all(id="ad")
        return str(content)
    else:
        return ""
def TLD_specific_search(document):
    TLD = extraction.top_level_domain_pattern(document)
    raw_content = extraction.get_raw_content(document)
    if TLD and raw_content:
        soup = BeautifulSoup(raw_content, 'html.parser')
        content = ""
        if TLD == "escortcafe.com":
            content = soup.find_all("div", class_="details")
        elif TLD == "classifriedads.com":
            content = soup.find_all(id="contentcell")
        elif TLD == "slixa.com":
            content = (soup.find_all("div", class_="span9 profile-content")
                       + soup.find_all("aside", class_="profile-sidebar span3"))
        # elif TLD == "allsexyescort.com":
        elif TLD == "escort-ads.com":
            content = soup.find_all("div", class_="container main-content vip-content")
        # elif TLD == "liveescortreviews.com":
        # elif TLD == "escort-europe.com":
        elif TLD == "find-escorts.com":
            content = soup.find_all(id="contentcell")
        elif TLD == "escortserv.com":
            content = soup.find_all(id="index")
        elif TLD == "slixa.ca":
            content = (soup.find_all("div", class_="span9 profile-content")
                       + soup.find_all("aside", class_="profile-sidebar span3"))
        elif TLD == "escortpost.com":
            content = soup.find_all(id="content")
        elif TLD == "privateescorts.ro":
            content = soup.find_all("tbody")
        elif TLD == "adultsearch.com":
            content = soup.find_all(id="ad")
        return str(content)
    else:
        return ""
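# Both TLD_specific_search variants rely on a top_level_domain_pattern helper
# defined elsewhere (in an extraction module in the second variant). A rough
# sketch of the idea, assuming the document carries its source URL -- the
# field name and body are illustrative assumptions:
from urllib.parse import urlparse

def top_level_domain_pattern(document, is_raw_content=False):
    netloc = urlparse(document.get("url", "")).netloc
    # Strip a leading "www." so the value matches keys like "escortcafe.com".
    return netloc[4:] if netloc.startswith("www.") else netloc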
def get_recipe_info(recipe_link):
    recipe_dict = dict()
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(recipe_link)
        if not response.status_code == 200:
            return recipe_dict
        results_page = BeautifulSoup(response.content, 'lxml')
        ingredient_list = list()
        prep_steps = list()
        for ingredients in results_page.find_all('li', class_='ingredient'):
            ingredient_list.append(ingredients.get_text())
        for steps in results_page.find_all('li', class_='preparation-step'):
            prep_steps.append(steps.get_text().strip())
        recipe_dict['ingredients'] = ingredient_list
        recipe_dict['preparation'] = prep_steps
        return recipe_dict
    except:
        return recipe_dict
def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://buckysroom.org/trade/search.php?page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.find_all('a', {'class': 'item-name'}):
            href = 'https://buckysroom.org' + link.get('href')
            title = link.string
            print(href)
            print(title)
        page += 1
def main():
    c = False
    tid = int(sys.argv[1])
    url = urllib.parse.unquote(sys.argv[2])
    r = requests.get(url, proxies=config.PROXIES, allow_redirects=True)
    if r.headers['Content-Type'].split('/')[0] == 'image':
        create_thumbnail(r, tid)
        add_remote_image(url, tid)
        c = True
    else:
        soup = BeautifulSoup(r.text, 'html.parser')
        image = soup.find('meta', property='og:image')
        try:
            # Prefer the page's Open Graph image.
            iurl = image.get('content', None)
            r = requests.get(iurl, proxies=config.PROXIES, allow_redirects=True)
            if r.headers['Content-Type'].split('/')[0] == 'image':
                create_thumbnail(r, tid)
                add_remote_image(iurl, tid)
        except Exception:
            # Fall back to guessing the largest <img> on the page.
            imgs = soup.find_all('img')
            guess = 0
            src = ''
            limit = 0
            for im in imgs:
                try:
                    limit += 1
                    if limit > 15:
                        break
                    try:
                        height = int(im.attrs.get('height', None))
                        width = int(im.attrs.get('width', None))
                    except Exception:
                        height = 1
                        width = 1
                    isrc = im.attrs.get('src', None)
                    if (height * width) > guess:
                        src = isrc
                        guess = height * width
                except Exception:
                    pass
            if src != '':
                r = requests.get(src, proxies=config.PROXIES)
                create_thumbnail(r, tid)
def trade_spider(max_pages):
    page = 1
    while page < max_pages:
        url = 'https....' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.find_all('a', {'class': 'item-name'}):
            href = "https...." + link.get('href')
            title = link.string
            # print(href)
            # print(title)
            get_single_item_data(href)
        page += 1
def getpic(search):
    try:
        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [('User-agent', 'Mozilla')]
        htmltext = browser.open("http://www.google.com/?t=lm")
        img_urls = []
        soup = BeautifulSoup(htmltext, 'html.parser')
        results = soup.find_all("a")
        print results
    except:
        print "error"
def mirrorImages(url, dir):
    ab = anonBrowser()
    ab.anonymize()
    html = ab.open(url)
    soup = BeautifulSoup(html)
    image_tags = soup.find_all('img')
    for image in image_tags:
        # lstrip('https://') strips characters, not a prefix; remove the
        # scheme explicitly instead.
        filename = re.sub(r'^https?://', '', image['src'])
        filename = os.path.join(dir, filename.replace('/', '_'))
        print('[+] Saving ' + str(filename))
        data = ab.open(image['src']).read()
        ab.back()
        save = open(filename, 'wb')
        save.write(data)
        save.close()
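# anonBrowser is an external helper, not defined in this snippet. A rough
# sketch of the interface used above -- a mechanize.Browser subclass whose
# anonymize() clears cookies and rotates the User-Agent; the body here is an
# illustrative assumption, not the original class:
import random
import mechanize

class anonBrowser(mechanize.Browser):
    def __init__(self, user_agents=None):
        mechanize.Browser.__init__(self)
        self.set_handle_robots(False)
        self.user_agents = user_agents or [('User-agent', 'Mozilla/5.0')]
        self.cookie_jar = mechanize.LWPCookieJar()
        self.set_cookiejar(self.cookie_jar)

    def anonymize(self):
        self.cookie_jar.clear()  # drop identifying cookies
        self.addheaders = [random.choice(self.user_agents)]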
def summarize_best_books(filepath):
    """
    Write a function to get a list of categories, book title and URLs from the
    "BEST BOOKS OF 2020" page in "best_books_2020.htm". This function should
    create a BeautifulSoup object from a filepath and return a list of
    (category, book title, URL) tuples.

    For example, if the best book in category "Fiction" is "The Testaments
    (The Handmaid's Tale, #2)", with URL
    https://www.goodreads.com/choiceawards/best-fiction-books-2020, then you
    should append ("Fiction", "The Testaments (The Handmaid's Tale, #2)",
    "https://www.goodreads.com/choiceawards/best-fiction-books-2020") to your
    list of tuples.
    """
    with open(filepath) as fp:
        soup = BeautifulSoup(fp, "html5lib")
    categories = soup.find_all('h4', class_="category__copy")
    titles = soup.find_all('a', class_="readable")
    urls = soup.find_all('a', class_="readable")

    l_titles = [t.text for t in titles]
    l_categories = [c.text for c in categories]
    # The URL lives in the anchor's href attribute, not its text.
    l_urls = [u['href'] for u in urls]

    l_tups = []
    for i in range(len(l_titles)):
        l_tups.append((l_categories[i], l_titles[i], l_urls[i]))
    return l_tups
def parse_page(url_page, path, headers):
    # Download the 48 images on one page.
    global n
    req = getHtml(url_page, headers)  # fetch the HTML source
    html = req.text
    bf = BeautifulSoup(html, 'lxml')  # parse the HTML into a BeautifulSoup object
    page_url = bf.find_all('div', class_='page')
    # Extract the detail-page links for all 48 images on this page.
    targets_url = bf.find_all('div', class_='photo_card__grid')
    targets_url1 = targets_url[0].find_all('a')  # collect all the links into one list
    for each in targets_url1:
        # Extract and download each image.
        url_photo = each.get('href')
        parses_picturePage(url_photo, path)
        n = n + 1
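# getHtml() is not defined in this snippet. A minimal sketch matching the
# call above (URL plus request headers), assuming a plain requests fetch;
# the implementation is an assumption:
def getHtml(url, headers):
    req = requests.get(url, headers=headers, timeout=30)
    req.raise_for_status()
    return req  # callers read .text from the response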
def build_interface(prime_prod):
    spec_table = []
    # Look at the first ten products (or fewer, if fewer were found).
    for i in range(min(10, len(prime_prod))):
        # Note: the parser name belongs to BeautifulSoup, not requests.get.
        spec_page = requests.get(prime_prod[i].product_href).text
        spec_soup = BeautifulSoup(spec_page, "html5lib")
        st = spec_soup.find_all("div", {'class': 'pad-top-sm'})
        for item in st:
            spec_table.append(item.text)
    return spec_table
def crawl(url):
    headers = {
        'User-Agent': 'Mozilla/5.0(Window;U;Windows NT 6.1;en-US;rv:1.9.1.6) Gecko/20091201 Frefox/3.5.6'
    }
    req = urllib.request.Request(url, headers=headers)
    page = urllib.request.urlopen(req, timeout=20)
    contents = page.read()
    soup = BeautifulSoup(contents, 'html.parser')
    my_girl = soup.find_all('img')
    for girl in my_girl:
        link = girl.get('src')
        print(link)
        contents = urllib.request.urlopen(link).read()
        with open(u'D:/meizi' + '/' + link[-11:], 'wb') as code:
            code.write(contents)
def pull_oddshark_baseball(url, naming_standard):
    '''
    Gets the betting offers listed at an oddshark webpage.
    This function should be used instead of pull_oddshark for baseball matches.
    Returns a dictionary of events.
    '''
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']
    # Use this for converting string months into the conventional integer for
    # that month, e.g. February = 2.
    month_str_to_number = {months[i]: i + 1 for i in range(len(months))}

    '''
    HTML Tags

    Team 1 win odds
        class = op-item op-spread border-bottom op-<BOOKIE_NAME>
    Team 2 win odds
        class = op-item op-spread op-<BOOKIE_NAME>
    Date
        class = op-separator-bar op-left no-group-name
        data-op-date = {"full_date":"Tuesday August 21","short_date":"Tue Aug 21","group_name":""}
    Team 1
        class = op-matchup-team op-matchup-text op-team-top
        data-op-name = {"full_name":"Atlanta","short_name":"ATL"}
    Team 2
        class = op-matchup-team op-matchup-text op-team-bottom
        data-op-name = {"full_name":"Pittsburgh","short_name":"PIT"}
    '''
    bookie_names = ['opening', 'bovada.lv', 'mybookie', 'intertops',
                    'betonline', 'caesars', '5dimes', 'westgate', 'topbet',
                    'sportsbetting', 'gtbets', 'betnow', 'skybook', 'sportbet',
                    'station', 'mirage', 'wynn']
    team_1_win_class = 'op-item op-spread border-bottom op-{}'
    team_2_win_class = 'op-item op-spread op-{}'

    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    datetime_valid = datetime.datetime.now(datetime.timezone.utc)
    team_1_tags = soup.find_all("div", class_="op-matchup-team op-matchup-text op-team-top")
    for team_1_tag in team_1_tags:
        pass  # parsing of each matchup is not implemented yet
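# For reference, the data-op-name attribute documented above holds JSON, so
# each matchup's team name could be recovered like this. This is a sketch of
# one step the unfinished loop might take, not the original code:
import json

def team_name(tag):
    # e.g. {"full_name":"Atlanta","short_name":"ATL"} -> "Atlanta"
    return json.loads(tag['data-op-name'])['full_name']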
def fileToArticle(aid):
    html = open(tempArticleDir + "/" + str(aid)[0:2] + "/" + str(aid)).read()
    soup = BeautifulSoup(html, from_encoding="utf8")
    # aid
    atype = None
    if html.find(u"btn_Addpost") > -1:
        atype = 1
    elif html.find(u"op_btn") > -1:
        atype = 2
    title = unicode(soup.find("div", "post_title").h1.string)
    author_div = soup.find("div", "fl").a
    author = unicode(author_div.string)
    author_link = u"http://www.mafengwo.cn" + unicode(author_div["href"])
    wdate = datetime.strptime(soup.find("span", "date").string, "%Y-%m-%d %H:%M:%S")
    # replyCount
    r_div = soup.find_all("div", "post_item")
    replyCount = len(r_div) - 1
    reply_div = soup.find("div", "turn_page").find("div", "paginator")
    if reply_div:
        a = reply_div.find_all("a")
        reply_page = int(a[-2].string)
        replyCount = replyCount + (reply_page - 1) * 50
    location = None
    dl_location = soup.find("dl", "related_mdd")
    if dl_location:
        location = unicode(dl_location.p.a.string)
    imgs, txt = getImgTexts(html)
    imgCount = len(imgs)
    txtCount = 0
    for t in txt:
        txtCount = txtCount + len(t)
import time
import urllib.request

from bs4 import BeautifulSoup

from dbconnect import connection

req = urllib.request.urlopen('http://www.nationaljournal.com/politics?rss=1')
xml = BeautifulSoup(req, 'xml')

c, conn = connection()

for item in xml.find_all('link')[3:]:
    url = item.text
    c.execute("INSERT INTO links (time, link) VALUES (%s, %s)",
              (time.time(), url))
    conn.commit()

conn.close()
def test_detail_response_html(dbtransaction, authenticated_app, new_entry):
    new_entry_id = new_entry.id
    response = authenticated_app.get('/entries/{}'.format(new_entry_id))
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.find_all('a')
    # The detail page should link to this entry's edit view.
    edit_href = '/entries/{}/edit'.format(new_entry_id)
    assert any(a.get('href') == edit_href for a in anchors)
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin

URL = 'http://philadelphia.craigslist.org/search/sss?sort=date&query=firefly%20tickets'
BASE = 'http://philadelphia.craigslist.org/cpg/'

response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')

for listing in soup.find_all('p', {'class': 'row'}):
    if listing.find('span', {'class': 'price'}) != None:
        price = int(listing.text[2:6])
        if price <= 250 and price > 100:
            print listing.text
            link_end = listing.a['href']
            url = urljoin(BASE, link_end)
            print url
            print "\n"
source = 'http://mp.weixin.qq.com/s?__biz=MjM5MDkyOTI1OQ==&mid=209217907&idx=6&sn=9d975e5aca7a10bc70bec1e8b3dec3db&scene=5#rd'

# import requests
from urllib2 import urlopen

urlcontent = urlopen(source).read()
print urlcontent

from bs4 import BeautifulSoup

soup = BeautifulSoup(urlcontent)
# print dir(soup)
for p in soup.find_all('p'):
    print p
from bs4 import BeautifulSoup
import urllib.request

urls = open("testurl.txt", "r")

# For each page: fetch the URL, then parse its HTML.
for page in urls:
    html = urllib.request.urlopen(page.strip())
    soup = BeautifulSoup(html, "html.parser")
    woah = soup.find("div", class_="biography").children
    print(list(woah))
import urllib2
import datetime
import re
import MySQLdb
import csv
from bs4 import BeautifulSoup as Soup

today = datetime.date.today()
html = urllib2.urlopen("http://www.99acres.com/property-in-velachery-chennai-south-ffid").read()
soup = Soup(html)

print "INSERT INTO Property (URL,Rooms, Place, Phonenumber1,Phonenumber2,Phonenumber3,Typeofperson, Name)"
print "VALUES ("

f = open('out.txt', 'w')
re_digit = re.compile('(\d+)')
pdate = soup.find_all('i', {'class': 'pdate'})
properties = soup.find_all('a', title=re.compile('Bedroom'))

for eachproperty in properties:
    for eachdate in pdate:
        pdates = re.sub('(\s{2,})', ' ', eachdate.text)
        for div in soup.find_all('div', {'class': 'sT_disc grey'}):
            try:
                project = div.find('span').find('b').text.strip()
            except:
                project = 'No project'
            area = re.findall(re_digit, div.find('i', {'class': 'blk'}).text.strip())
        print today, "," + "http:/" + eachproperty['href'] + ",", eachproperty.string + "," + ",".join(re.findall("'([a-zA-Z0-9,\s]*)'", eachproperty['onclick'])) + "," + ", ".join([project] + area), "," + pdates

print "),"
import webbrowser
from bs4 import BeautifulSoup
# from lxml import html

f = open('amazon_results.txt', 'r')
x = f.read()
soup = BeautifulSoup(x)
LMT = 10
cnt = 0
val = 0
arr = []
for val in range(1, LMT + 1):
    v = 'result_' + str(val)
    print v
    for x in soup.find_all('div', str(v)):
        for a in x.find_all('a'):
            f = 1
            for b in a.find_all('span', 'lrg bold'):
                c = b.get_text()
                if c:
                    p1 = "Product Name:" + c
                    print p1
            for b in a.find_all('span', 'med reg'):
                c = b.get_text()
                if c:
                    p2 = c
                    print c
            for b in a.find_all('span', 'price bld'):
                c = b.get_text()
                if c:
                    print c
def parseTimeline(html, username):
    global peopleIDList, likesCountList, reportFileName
    soup = BeautifulSoup(html)
    tlTime = soup.findAll("abbr")
    temp123 = soup.findAll("div", {"role": "article"})
    placesCheckin = []
    timeOfPostList = []
    counter = 0
    for y in temp123:
        soup1 = BeautifulSoup(str(y))
        tlDateTimeLoc = soup1.findAll("a", {"class": "uiLinkSubtle"})
        # Universal time
        try:
            soup2 = BeautifulSoup(str(tlDateTimeLoc[0]))
            tlDateTime = soup2.find("abbr")
            # Facebook post link
            tlLink = tlDateTimeLoc[0]['href']
            try:
                tz = get_localzone()
                unixTime = str(tlDateTime['data-utime'])
                localTime = (datetime.datetime.fromtimestamp(int(unixTime))
                             .strftime('%Y-%m-%d %H:%M:%S'))
                timeOfPost = localTime
                timeOfPostList.append(localTime)
                print "[*] Time of Post: " + localTime
            except TypeError:
                continue
            if "posts" in tlLink:
                # print tlLink.strip()
                pageID = tlLink.split("/")
                parsePost(pageID[3], username)
                peopleIDLikes = parseLikesPosts(pageID[3])
                try:
                    for id1 in peopleIDLikes:
                        if id1 in peopleIDList:
                            position = peopleIDList.index(id1)
                            likesCountList[position] += 1
                        else:
                            peopleIDList.append(id1)
                            likesCountList.append(1)
                except TypeError:
                    continue
            if len(tlDateTimeLoc) > 2:
                try:
                    # Device / location
                    if len(tlDateTimeLoc[1].text) > 0:
                        print "[*] Location of Post: " + unicode(tlDateTimeLoc[1].text)
                    if len(tlDateTimeLoc[2].text) > 0:
                        print "[*] Device: " + str(tlDateTimeLoc[2].text)
                except IndexError:
                    continue
            else:
                try:
                    # Device / location
                    if len(tlDateTimeLoc[1].text) > 0:
                        if "mobile" in tlDateTimeLoc[1].text:
                            print "[*] Device: " + str(tlDateTimeLoc[1].text)
                        else:
                            print "[*] Location of Post: " + unicode(tlDateTimeLoc[1].text)
                except IndexError:
                    continue
            # Facebook posts
            tlPosts = soup1.find("span", {"class": "userContent"})
            try:
                tlPostSec = soup1.find_all("span", {"class": "userContentSecondary fcg"})
                tlPostMsg = ""
                # Places checked in
            except TypeError:
                continue
            soup3 = BeautifulSoup(str(tlPostSec))
            hrefLink = soup3.find("a")
            """
            if len(str(tlPostSec)) > 0:
                tlPostMsg = str(tlPostSec)
                # if " at " in str(tlPostMsg) and " with " not in str(tlPostMsg):
                if " at " in str(tlPostMsg):
                    print str(tlPostSec)
                    print tlPostMsg
                    # print hrefLink
                    # placeUrl = hrefLink['href'].encode('utf8').split('?')[0]
                    # print "[*] Place: " + placeUrl
                    # placesCheckin.append([timeOfPost, placeUrl])
            """
            try:
                if len(tlPosts) > 0:
                    tlPostStr = re.sub('<[^>]*>', '', str(tlPosts))
                    if tlPostStr != None:
                        print "[*] Message: " + str(tlPostStr)
            except TypeError:
                continue
            tlPosts = soup1.find("div", {"class": "translationEligibleUserMessage userContent"})
            try:
                if len(tlPosts) > 0:
                    tlPostStr = re.sub('<[^>]*>', '', str(tlPosts))
                    print "[*] Message: " + str(tlPostStr)
            except TypeError:
                continue
        except IndexError:
            continue
        counter += 1

    tlDeviceLoc = soup.findAll("a", {"class": "uiLinkSubtle"})
    print '\n'
    if len(reportFileName) < 1:
        reportFileName = username + "_report.txt"
    reportFile = open(reportFileName, "w")

    reportFile.write("\n********** Places Visited By " + str(username) + " **********\n")
    filename = username + '_placesVisited.htm'
    if not os.path.lexists(filename):
        html = downloadPlacesVisited(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    dataList = parsePlacesVisited(html)
    count = 1
    for i in dataList:
        reportFile.write(normalize(i[2]) + '\t' + normalize(i[1]) + '\t' + normalize(i[3]) + '\n')
        count += 1

    reportFile.write("\n********** Places Liked By " + str(username) + " **********\n")
    filename = username + '_placesLiked.htm'
    if not os.path.lexists(filename):
        html = downloadPlacesLiked(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    dataList = parsePlacesLiked(html)
    count = 1
    for i in dataList:
        reportFile.write(normalize(i[2]) + '\t' + normalize(i[1]) + '\t' + normalize(i[3]) + '\n')
        count += 1

    reportFile.write("\n********** Places checked in **********\n")
    for places in placesVisitedList:
        unixTime = places[0]
        localTime = (datetime.datetime.fromtimestamp(int(unixTime))
                     .strftime('%Y-%m-%d %H:%M:%S'))
        reportFile.write(localTime + '\t' + normalize(places[1]) + '\t' + normalize(places[2]) + '\n')

    reportFile.write("\n********** Apps used By " + str(username) + " **********\n")
    filename = username + '_apps.htm'
    if not os.path.lexists(filename):
        html = downloadAppsUsed(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    data1 = parseAppsUsed(html)
    result = ""
    for x in data1:
        reportFile.write(normalize(x) + '\n')
        x = x.lower()
        if "blackberry" in x:
            result += "[*] User is using a Blackberry device\n"
        if "android" in x:
            result += "[*] User is using an Android device\n"
        if "ios" in x or "ipad" in x or "iphone" in x:
            result += "[*] User is using an iOS Apple device\n"
        if "samsung" in x:
            result += "[*] User is using a Samsung Android device\n"
    reportFile.write(result)

    reportFile.write("\n********** Videos Posted By " + str(username) + " **********\n")
    filename = username + '_videosBy.htm'
    if not os.path.lexists(filename):
        html = downloadVideosBy(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    dataList = parseVideosBy(html)
    count = 1
    for i in dataList:
        reportFile.write(normalize(i[2]) + '\t' + normalize(i[1]) + '\n')
        count += 1

    reportFile.write("\n********** Pages Liked By " + str(username) + " **********\n")
    filename = username + '_pages.htm'
    if not os.path.lexists(filename):
        print "[*] Caching Pages Liked: " + username
        html = downloadPagesLiked(driver, uid)
        text_file = open(filename, "w")
        text_file.write(html.encode('utf8'))
        text_file.close()
    else:
        html = open(filename, 'r').read()
    dataList = parsePagesLiked(html)
    for i in dataList:
        pageName = normalize(i[0])
        tmpStr = normalize(i[3]) + '\t' + normalize(i[2]) + '\t' + normalize(i[1]) + '\n'
        reportFile.write(tmpStr)
    print "\n"

    c = conn.cursor()
    reportFile.write("\n********** Friendship History of " + str(username) + " **********\n")
    c.execute('select * from friends where sourceUID=?', (uid,))
    dataList = c.fetchall()
    try:
        if len(str(dataList[0][4])) > 0:
            for i in dataList:
                # Date first, followed by username
                reportFile.write(normalize(i[4]) + '\t' + normalize(i[3]) + '\t' + normalize(i[2]) + '\t' + normalize(i[1]) + '\n')
            print '\n'
    except IndexError:
        pass

    reportFile.write("\n********** Friends of " + str(username) + " **********\n")
    reportFile.write("*** Backtracing from Facebook Likes/Comments/Tags ***\n\n")
    c = conn.cursor()
    c.execute('select userName from friends where sourceUID=?', (uid,))
    dataList = c.fetchall()
    for i in dataList:
        reportFile.write(str(i[0]) + '\n')
    print '\n'

    tempList = []
    totalLen = len(timeOfPostList)
    timeSlot1 = timeSlot2 = timeSlot3 = timeSlot4 = 0
    timeSlot5 = timeSlot6 = timeSlot7 = timeSlot8 = 0
    count = 0
    if len(peopleIDList) > 0:
        likesCountList, peopleIDList = zip(*sorted(zip(likesCountList, peopleIDList), reverse=True))
        reportFile.write("\n********** Analysis of Facebook Post Likes **********\n")
        while count < len(peopleIDList):
            testStr = str(likesCountList[count]).encode('utf8') + '\t' + str(peopleIDList[count]).encode('utf8')
            reportFile.write(testStr + "\n")
            count += 1

    reportFile.write("\n********** Analysis of Interactions between " + str(username) + " and Friends **********\n")
    c = conn.cursor()
    c.execute('select userName from friends where sourceUID=?', (uid,))
    dataList = c.fetchall()
    photosLikedUser = []
    photosLikedCount = []
    photosCommentedUser = []
    photosCommentedCount = []
    for i in dataList:
        c.execute('select * from photosLiked where sourceUID=? and username=?', (uid, i[0],))
        dataList1 = c.fetchall()
        if len(dataList1) > 0:
            photosLikedUser.append(normalize(i[0]))
            photosLikedCount.append(len(dataList1))
    for i in dataList:
        c.execute('select * from photosCommented where sourceUID=? and username=?', (uid, i[0],))
        dataList1 = c.fetchall()
        if len(dataList1) > 0:
            photosCommentedUser.append(normalize(i[0]))
            photosCommentedCount.append(len(dataList1))
    if len(photosLikedCount) > 1:
        reportFile.write("Photo Likes: " + str(username) + " and Friends\n")
        photosLikedCount, photosLikedUser = zip(*sorted(zip(photosLikedCount, photosLikedUser), reverse=True))
        count = 0
        while count < len(photosLikedCount):
            tmpStr = str(photosLikedCount[count]) + '\t' + normalize(photosLikedUser[count]) + '\n'
            count += 1
            reportFile.write(tmpStr)
    if len(photosCommentedCount) > 1:
        reportFile.write("\n********** Comments on " + str(username) + "'s Photos **********\n")
        photosCommentedCount, photosCommentedUser = zip(*sorted(zip(photosCommentedCount, photosCommentedUser), reverse=True))
        count = 0
        while count < len(photosCommentedCount):
            tmpStr = str(photosCommentedCount[count]) + '\t' + normalize(photosCommentedUser[count]) + '\n'
            count += 1
            reportFile.write(tmpStr)

    reportFile.write("\n********** Analysis of Time in Facebook **********\n")
    for timePost in timeOfPostList:
        tempList.append(timePost.split(" ")[1])
        tempTime = int((timePost.split(" ")[1]).split(":")[0])
        if tempTime >= 21:
            timeSlot8 += 1
        if tempTime >= 18 and tempTime < 21:
            timeSlot7 += 1
        if tempTime >= 15 and tempTime < 18:
            timeSlot6 += 1
        if tempTime >= 12 and tempTime < 15:
            timeSlot5 += 1
        if tempTime >= 9 and tempTime < 12:
            timeSlot4 += 1
        if tempTime >= 6 and tempTime < 9:
            timeSlot3 += 1
        if tempTime >= 3 and tempTime < 6:
            timeSlot2 += 1
        if tempTime >= 0 and tempTime < 3:
            timeSlot1 += 1
    # float() avoids Python 2 integer division truncating every percentage to 0.
    totalLen = float(totalLen)
    reportFile.write("Total % (00:00 to 03:00) " + str((timeSlot1 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (03:00 to 06:00) " + str((timeSlot2 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (06:00 to 09:00) " + str((timeSlot3 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (09:00 to 12:00) " + str((timeSlot4 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (12:00 to 15:00) " + str((timeSlot5 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (15:00 to 18:00) " + str((timeSlot6 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (18:00 to 21:00) " + str((timeSlot7 / totalLen) * 100) + " %\n")
    reportFile.write("Total % (21:00 to 24:00) " + str((timeSlot8 / totalLen) * 100) + " %\n")
    """
    reportFile.write("\nDate/Time of Facebook Posts\n")
    for timePost in timeOfPostList:
        reportFile.write(timePost + '\n')
    """
    reportFile.close()
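# normalize() is used throughout the report-writing code above but defined
# elsewhere. A plausible sketch for this Python 2 context -- coercing tags,
# ints, and unicode to utf-8 byte strings safe for file writes (an
# assumption, not the original helper):
def normalize(value):
    return unicode(value).encode('utf8')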