def table():
    r = requests.get('https://www.worldometers.info/coronavirus/#countries')
    print(r.status_code)
    soup = b(r.text, 'lxml')

    # find our table
    table = soup.find('table', id='main_table_countries_today')
    tbody = table.find('tbody')
    table_row = tbody.find_all('tr')

    all_rows = []
    for tr in table_row:
        td = tr.find_all('td')
        row = [i.text.replace('\n', ' ').strip() for i in td]
        all_rows.append(row)

    df = pd.DataFrame(all_rows, columns=[
        'country', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
        'total_recovered', 'active', 'serirous', '1', '2', '3', '4', '5'
    ])

    # Drop the continent summary rows and the unused trailing columns.
    df.drop(index=[0, 1, 2, 3, 4, 5, 6, 7], inplace=True)
    df.drop(columns=['1', '2', '3', '4'], inplace=True)
    copy_df = df.copy()

    # Clean the raw strings before converting them to numbers.
    copy_df['total_recovered'] = copy_df['total_recovered'].str.replace('N/A', '0')
    copy_df['new_cases'] = copy_df['new_cases'].str.replace('+', '', regex=False)
    copy_df['new_deaths'] = copy_df['new_deaths'].str.replace('+', '', regex=False)

    for col in ['total_cases', 'new_cases', 'total_deaths', 'total_recovered',
                'active', 'serirous']:
        copy_df[col] = copy_df[col].str.replace(',', '')

    for col in ['total_cases', 'new_cases', 'total_deaths', 'new_deaths',
                'total_recovered', 'active', 'serirous']:
        copy_df[col] = pd.to_numeric(copy_df[col])

    copy_df.fillna(0, inplace=True)
    return copy_df
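# --- Hypothetical setup sketch (not part of the original snippets) ---
# Many functions in this collection call requests, pandas (pd) and BeautifulSoup
# under the alias `b` without showing their imports. A minimal preamble they
# appear to assume, plus an example call to table() above, might look like this:
import requests
import pandas as pd
from bs4 import BeautifulSoup as b

if __name__ == '__main__':
    df = table()
    print(df.head())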
def get_num_flw(self, flag, user):
    if flag == "followers":
        # Followers count link
        flw = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, f"//a[contains(@href, '/{user}/followers/')]")))
        sflw = b(flw.get_attribute('innerHTML'), 'html.parser')
        followers = sflw.findAll('span', {'class': 'g47SY'})
        f = followers[0].getText().replace(',', '')
    elif flag == "following":
        # Following count link
        flw = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, f"//a[contains(@href, '/{user}/following/')]")))
        sflw = b(flw.get_attribute('innerHTML'), 'html.parser')
        followers = sflw.findAll('span', {'class': 'g47SY'})
        f = followers[0].getText().replace(',', '')

    # Normalise shorthand counts such as "12.3k" or "1.2m" to floats.
    if 'k' in f:
        return float(f[:-1]) * 10**3
    elif 'm' in f:
        return float(f[:-1]) * 10**6
    else:
        return float(f)
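# --- Hypothetical setup sketch (not part of the original snippets) ---
# The Selenium-based helpers (get_num_flw, get_followers, liked_post) reference
# WebDriverWait, EC and By without showing their imports; the standard Selenium
# imports they appear to assume are:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as b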
def MCQScraper(link):
    page_title = link.split("/")[3]
    heading = ("<!DOCTYPE html>\n<html>\n<head>\n<title>" + page_title +
               "</title>\n</head>\n"
               "<link rel='stylesheet' type='text/css' href='./style.css'>\n<body>")
    file = open("../cse/" + page_title + ".html", "w")
    file.write(heading)

    p = r.get(link).text
    soup = b(p, features="html.parser")
    links = []
    table_data = soup.find_all("td")
    for data in table_data:
        # Skip table cells that do not contain a link.
        if data.a is None:
            continue
        href = data.a["href"]
        if href:
            links.append(href)

    for i in links:
        p = r.get(i).text
        soup = b(p, features="html.parser")
        questions = soup.find("div", {"class": "entry-content", "itemprop": "text"})
        file.write(str(questions))

    ending = "</body>\n<script src='./script.js'></script>\n</html>"
    file.write(ending)
    file.close()
def queued_data_loop(session):
    """
    Main data loop once logged in: takes ids from the queue and gathers
    data for each ticket.

    :param session: authenticated user session
    """
    while not id_queue.empty():
        try:
            # Broad exception handling so a single bad request does not kill the loop.
            print('Ticket Number: ', id_queue.qsize())
            quoteID = id_queue.get()
            print('getting id: ', str(quoteID))
            r = session.get(ENGINEERING_DETAILS + str(quoteID), verify=False)
            soup = b(r.text, 'html.parser')
            table = soup.find_all('a')
            dateTable = soup.find_all('caption')

            soupList = []
            for link in table:
                soupList.append(link)
            quotes = findQuotes(soupList)
            orderMatch = r'(^[a-zA-Z])(\w+)'

            # Determine the quarter of the ticket.
            dateTableList = []
            for item in dateTable:
                dateTableList.append(item)
            date = findDate(dateTableList)

            if not quotes:
                print("No quotes...Next ticket!")
                id_queue.task_done()
            else:
                for quote in quotes:
                    # Get the rack from each quote.
                    # TODO test this area
                    if re.search(orderMatch, str(quote)):
                        r = session.get(
                            ENGINEERING_ORDERS +
                            str(re.search(orderMatch, str(quote)).group(2)),
                            verify=False)
                    else:
                        r = session.get(ENGINEERING_QUOTES + str(quote), verify=False)
                    quotehtml = b(r.text, 'html.parser')
                    quotelinks = quotehtml.find_all('tr')
                    quoteDetails = []
                    for link in quotelinks:
                        quoteDetails.append(link)
                    # List of racks from each quote.
                    findRack(quoteDetails, date, session, quoteID)
                id_queue.task_done()
        except Exception:
            # TODO: create more informative error handling.
            import logging
            logging.exception('Something Happened...')
            id_queue.task_done()
    print('Ended at: ', id_queue.qsize())
def shopSearch(request, page):
    b_url = "https://www.n11.com/arama?q="
    url_sep = "&pg="
    url = b_url + request + url_sep + page
    html = urllib2.urlopen(url).read()
    soup = b(html, "html.parser")
    for post in soup.findAll("li", {"class": "column"}):
        try:
            item = post.findAll("a", {"class": "plink"})[0]
            title = item['title']
            price = post.findAll("ins")[0].text.replace(" ", "").replace("\n", "")
            link = item['href']
            sellerName = post.findAll("span", {"class": "sallerName"})[0].text.replace(
                " ", "").replace("\n", "")
            sellerPoint = post.findAll("span", {"class": "point"})[0].text.replace(
                " ", "").replace("\n", "")
            print(title)
            print(price + "\t Seller Name: " + sellerName + "\t Rating: " + sellerPoint)
            print(link + "\n")
        except:
            pass
def cheapest(request, page):
    x = []
    b_url = "https://www.cimri.com/arama?"
    url_sep = "page="
    seps2 = "&q="
    url = b_url + url_sep + page + seps2 + request
    html = urllib2.urlopen(url).read()
    soup = b(html, "html.parser")
    for post in soup.findAll("div", {"id": "cimri-product"}):
        try:
            item = post.findAll("h2", {"class": "product-title"})[0].text
            link = post.findAll("a")[0]['href']
            for markets in post.findAll("div", {"class": "tag"}):
                market = markets.text
            for prices in post.findAll("a", {"class": "s14oa9nh-0 gwkxYt"}):
                x.append(prices.text.replace("com", "com : ").replace(".tr", ""))
            print(item)
            print(x[0])
            print(x[1])
            print("https://www.cimri.com/" + link + "\n")
            x = []
        except:
            pass
def news(grabyear, grabcountry):
    import requests
    from bs4 import BeautifulSoup as b
    import pandas as pd
    import webbrowser

    Country = grabcountry
    Before = grabyear
    url = (f"https://www.google.co.in/search?q=+{Country}+co2+emissions"
           f"+scholarly+articles+before:+{Before}")
    print(url)
    response = requests.get(url)
    soup = b(response.text, "lxml")

    articles = []
    r = soup.find_all('div', attrs={'class': 'BNeawe vvjwJb AP7Wnd'})
    for i in range(len(r)):
        articles.append(r[i].text)

    urls = soup.find_all('div', attrs={'class': 'kCrYT'})
    Links = []
    for link in urls:
        href = link.find('a')
        try:
            raw_website = href.get('href')
            clean_web = raw_website[7:]  # drop the leading "/url?q=" prefix
            Links.append(clean_web)
        except:
            continue

    newsdata = [{"articles": articles, "links": Links}]
    return jsonify(newsdata)
def extract_relationships(cache, url, person_url):
    """
    Extract all the relationships of the mentioned celebrity.
    """
    relationships = []
    new_url = "https://www.whosdatedwho.com/dating/" + url
    filename = get_url_content(new_url, cache)
    soup = b(open(filename, 'r'), 'html.parser')

    # Grab the h4 status tag (always that tag type), then its next sibling,
    # which holds the current partner links.
    status_h4 = soup.find('h4', 'ff-auto-status')
    key_div = status_h4.next_sibling
    candidate = key_div.find_all('a')
    # We need all the links that start with "dating".
    relationships.extend(find_candidate(candidate, person_url))

    # Get all prior relationships.
    prev_h4 = soup.find('h4', 'ff-auto-relationships')
    div_past_relationships = prev_h4.next_sibling
    while div_past_relationships is not None and div_past_relationships.name == 'p':
        candidate = div_past_relationships.find_all('a')
        relationships.extend(find_candidate(candidate, person_url))
        div_past_relationships = div_past_relationships.next_sibling

    return relationships
def getDeckCards(backUrl):
    frontUrl = "https://www.mtggoldfish.com"
    url = frontUrl + backUrl
    html = urllib.request.urlopen(url).read()
    soup = b(html, 'html.parser')

    cards = soup.find_all('td', {'class': 'deck-col-card'})
    cardList = []
    for card in cards:
        cardList.append(card.text[1:-1])

    cards = soup.find_all('td', {'class': 'deck-col-qty'})
    cardQty = []
    for card in cards:
        cardQty.append(card.text[1:-1])

    cardDic = {}
    for x in range(len(cardList)):
        cardDic[cardList[x]] = int(cardQty[x])

    for key, val in cardDic.items():
        if key not in addCards:
            addCards[key] = val
        else:
            addCards[key] = addCards[key] + val
def get_content(url):
    '''Parse the page content.'''
    # List of dictionaries storing the post information.
    comments = []
    html = get_url_html(url)
    soup = b(html, 'lxml')
    li_tags = soup.find_all('li', class_=" j_thread_list clearfix")
    for li in li_tags:
        # Initialise a dictionary to store this post's information.
        comment = {}
        # Use try/except so one bad post does not stop the crawler.
        try:
            comment['title'] = li.find('a', attrs={'class': 'j_th_tit '}).text.strip()
            comment['link'] = URL + li.find('a', class_="j_th_tit ")['href']
            comment['author'] = li.find('span', attrs={
                'class': 'frs-author-name-wrap'
            }).text.strip()
            comment['reply'] = li.find('div', attrs={
                'class': 'col2_left j_threadlist_li_left'
            }).text.strip()
            comments.append(comment)
        except:
            print("get_content, error")
    return comments
def getConceptDerivedTerms(word):
    searchTerm = word
    link = 'http://conceptnet.io/c/en/' + searchTerm
    http_pool = url.connection_from_url(link)
    r = http_pool.urlopen('GET', link)
    http_pool.close()
    html = r.data.decode('utf-8')
    soup = b(html, features="html5lib")

    # Find the block of links under the "Derived terms" heading.
    divs = soup.findAll("a")
    div = []
    candies = []
    for d in divs:
        if d.contents[0] == 'Derived terms':
            div = d.find_parent().find_parent()
    if len(div) > 0:
        links = div.findAll("a")
        for k in links:
            candies.append(n.word_tokenize(k.contents[0]))
        del candies[0]

    # Keep multi-word terms only when every token is longer than two characters.
    c = []
    for k in candies:
        if len(k) > 1:
            counter = 0
            s = ''
            for j in k:
                if len(j) > 2:
                    counter += 1
                    s = s + ' ' + j
            if counter == len(k):
                c.append(s)
        elif len(k[0]) > 2:
            c.append(k[0])
    candies = c

    # Drop the search term itself.
    c = []
    for k in candies:
        if not k == searchTerm:
            c.append(k)
    candies = c

    # Re-join tokenised terms into plain strings.
    for k in range(len(candies)):
        temp = n.word_tokenize(candies[k])
        if len(temp) > 1:
            s = ''
            for j in temp:
                s = s + j + ' '
            candies[k] = s
        else:
            candies[k] = temp[0]
    return candies
def grabProxiesHttp():
    site = 'https://free-proxy-list.net/'
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.Request(site, headers=hdr)  # send the request with headers
    url = urllib.urlopen(req).read()         # open and read the source code
    html = b(url, "lxml")                    # structure the source code properly
    rows = html.findAll("tr")                # find all rows in the table, if any
    proxies = []
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text for ele in cols]
        try:
            ipaddr = cols[0]    # IP address is the first element of the cols list
            portNum = cols[1]   # port number is the second element of the cols list
            proxy = ipaddr + ":" + portNum  # concatenate IP and port
            portName = cols[6]  # HTTPS column; its value is "yes" or "no"
            if portName == "no":
                proxies.append(str(proxy))
        except:
            pass
    return proxies
def main(line):
    try:
        line = line.split(",")[1].strip()
        json = soup(url % line)
        domain_soup = b(requests.get(domain).content, "lxml").find("body").text
        put(line[i].split(",")[1].strip() + " at line " + str([i + 1]), "INFO")
        json_data = json.loads(domain_soup.split("process(")[1].split(")")[0])
        try:
            json_data = json_data[json_data.keys()[0]]['categories'].keys()
            if ('401' in json_data) or ('402' in json_data):
                put("ADULT!!!!!!!", "WARNING")
                print "---------------------------------------------------------------------"
            else:
                put("SAFE FOR WORK", "SUCCESS")
                file.write(line[i].split(",")[1].strip() + "\n")
                print "------------------------------------------------------------------------"
        except Exception as e:
            put("DOES NOT CONTAIN INFO", "FAIL")
            print "------------------------------------------------------------------------------------"
        i += 1
def __init__(self, day_url, season_id, current_day,
             scraping_parameters_path='scraping/scraping_parameters.yml'):
    self.season_id = season_id
    self.current_day = current_day
    self.scraping_parameters_path = scraping_parameters_path
    self.scraping_parameters = ScrapingParameters(self.scraping_parameters_path)
    self.day_results_list = []

    # Access the current day data, using the current day url.
    day_http_request = requests.get(day_url)
    day_page = day_http_request.content
    day_soup = b(day_page, 'html.parser')
    day_container = day_soup.select('div.day-results-table')

    for html_match_info in day_container[0].select('tr.info-line.after'):
        current_match = Match(html_match_info, self.season_id, self.current_day)
        self.day_results_list.append(current_match.match_results)

    self.day_results = pd.DataFrame(
        self.day_results_list,
        columns=self.scraping_parameters.saison_cols)
def __init__(self, url, scraping_parameters_path='scraping/scraping_parameters.yml'):
    self.scraping_parameters_path = scraping_parameters_path
    self.scraping_parameters = ScrapingParameters(self.scraping_parameters_path)
    self.season_results = pd.DataFrame(columns=self.scraping_parameters.saison_cols)
    self.days = {}
    self.season_id = None

    # Access the current season data, using the current season url.
    season_http_request = requests.get(url)
    season_page = season_http_request.content
    self.season_soup = b(season_page, 'html.parser')

    # Get the urls of the different days of the current season (stored in a
    # dictionary) and the identification of the current season.
    self.season_id, self.days = self.get_days_url()

    for day in self.days:
        current_day = Day(
            self.scraping_parameters.fixe + self.days[day],
            self.season_id,
            day,
            self.scraping_parameters_path,
        )
        self.season_results = self.season_results.append(current_day.day_results)
def houseofbots(s):
    # Clear the display.
    s.text.delete("1.0", tkinter.END)
    s.enter_webno.delete(0, tkinter.END)
    try:
        s.p = requests.get("https://www.houseofbots.com").text
        s.soup = b(s.p, features="html.parser")
    except:
        messagebox.showinfo("ERROR",
                            "Please make sure you have an internet connection")
    length = 1
    s.heading1 = []
    s.link2 = []
    s.text.insert(tkinter.INSERT,
                  "\t\tThe contents in house of bots are \n \n".upper())
    for i in s.soup.find_all("li"):
        if i.find("h4") is not None:
            s.heading1.append(i.find("h4").text)
            s.text.insert(tkinter.INSERT,
                          str(length) + "." + i.find("h4").text + "\n\n")
            s.link2.append(i.find("a").get("href"))
            length += 1
    s.sumbit_button = tkinter.Button(s.root,
                                     text="Open",
                                     command=lambda: s.dispaly(s.link2),
                                     fg="red",
                                     relief="sunken")
    s.sumbit_button.place(x=300, y=620, height=20, width=70)
def get_content_from_images(html):
    content = ''
    soup = b(html, 'html.parser')
    text_class = soup.find_all('div', {'class': 'rc'})
    for text in text_class:
        content = content + text.text
    return content
def get_content(u, response):
    if 'https://twitter.com' in u:
        # A twitter status url (not an image): extract the tweet text.
        html = response.text
        soup = b(html, 'html.parser')
        if 'i/moments' in u:
            text = ''
            moments_text = soup.find('div', {'class': 'MomentCapsuleCover-details'})
            if moments_text:
                text = moments_text.text
        else:
            tweet_status_text = soup.find_all('p', {
                'class': 'TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text'})
            tweet_text = ''
            for txt in tweet_status_text:
                if txt:
                    tweet_text = tweet_text + txt.text
            text = tweet_text
    else:
        # Any other content: fall back to the generic extractor.
        html = response.text
        text = get_content_from_urls(html)
    return text
def get_followers(self):
    time.sleep(2)
    flw_btn = WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((
            By.CSS_SELECTOR,
            "#react-root > section > main > div > header > section > ul > li:nth-child(2) > a")))
    flw_btn.click()
    time.sleep(3)
    self.popup = WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div/div/div[2]")))

    # Scroll the followers popup in increasing fractions of its height.
    for h in range(11):
        time.sleep(1)
        print('scrolling')
        print(h)
        print('arguments[0].scrollTop = arguments[0].scrollHeight/{}'.format(str(11 - h)))
        self.driver.execute_script(
            'arguments[0].scrollTop = arguments[0].scrollHeight/{}'.format(str(11 - h)),
            self.popup)
        if h == 5:
            break

    # Then keep scrolling to the bottom to load more followers.
    for i in range(40):
        time.sleep(2)
        self.driver.execute_script(
            'arguments[0].scrollTop = arguments[0].scrollHeight', self.popup)
        self.popup = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div/div/div[2]")))

    b_popup = b(self.popup.get_attribute('innerHTML'), 'html.parser')
    for p in b_popup.findAll('li', {'class': 'wo9IH'}):
        try:
            hlink = p.find_all('a')[0]['href']
            print(hlink)
            if 'div' in hlink:
                print('div found not adding to list')
            else:
                self.hrefs.append(hlink)
        except:
            pass
    return self.hrefs
def cnt(self):
    with open(self.path) as f2:
        sp = b(f2, 'html.parser')
        text = sp.get_text().lower()
    print 'Enter "no" to exit'
    while True:
        w = raw_input('ENTER THE WORD WANT TO SEARCH IN THE FILE\n')
        if w != 'no':
            cnt = text.count(w)
            print '"%s occured %d times in the file"' % (w, cnt)
            self.lst.append(w)
            continue
        else:
            print 'want to see your searches'
            res = raw_input()
            if res != 'no':
                c = Counter(self.lst)
                print 'Top 5 words searched by you are '
                for i, j in c.most_common(5):
                    print '"%s"::"%d"times' % (i, j)
            with open('data.csv', 'ab') as d:
                self.header = ['words,date,time']
                dt = csv.writer(d, delimiter=' ')
                if self.empty == 0:
                    dt.writerow(self.header)
                for i in self.lst:
                    dt.writerow([i + ',', self.d, self.t])
            break
def __init__(self, fixed_url, season_id, current_day, season_url, day_url):
    self.season_url = season_url
    self.day_url = day_url
    self.complete_day_url = fixed_url + day_url
    self.season_id = season_id
    self.current_day = current_day
    self.day_results = pd.DataFrame(columns=MATCHES_RESULTS_COLS)

    # Access the current day raw data, using the current day url.
    day_http_request = requests.get(self.complete_day_url)
    day_page = day_http_request.content
    day_soup = b(day_page, 'html.parser')
    day_container = day_soup.select('div.day-results-table')

    for html_match_info in day_container[0].select('tr.info-line.after'):
        current_match = Match(html_match_info, self.season_id, self.current_day,
                              self.season_url, self.day_url)
        self.day_results = self.day_results.append(current_match.match_results)
        del current_match

    self.day_results.drop_duplicates(
        inplace=True,
        subset=['season_id', 'season_day', 'team_dom'],
        keep='last')
def liked_post(self):
    # If this returns True then it is most likely an account with 0 posts.
    try:
        p_text = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                '#react-root > section > main > div > header > section > ul > li:nth-child(1) > span'
            )))
        if p_text == '0':
            return False
        else:
            return True
    except:
        return True

    post = self.driver.find_element_by_css_selector(
        '#react-root > section > main > div > div._2z6nI > article > div > div > div:nth-child(1) > div:nth-child(1)'
    )
    html = post.get_attribute('innerHTML')
    h = b(html, 'html.parser')
    href = h.a['href']
    self.driver.get('https://www.instagram.com' + href)
    like_btn = WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((
            By.CSS_SELECTOR,
            '#react-root > section > main > div > div.ltEKP > article > div.eo2As > section.ltpMr.Slqrh > span.fr66n > button > div'
        )))
    like_btn.click()
def waterdebt(request):
    c = {}
    l = []
    abkodu = request.POST.get('abkodu')
    if abkodu:
        api = "https://opendata.e-gov.az/api/v1/json/azersu/DebtInfo/{}".format(abkodu)
        data = requests.get(api)
        d_d = json.loads(data.text)
        b_d = d_d['Response']['HtmlField']
        soup = b(b_d, 'html.parser')
        if len(soup.find_all('b')) == 0:
            c['error'] = """Abonent kodu ya yanlışdır ya da boş buraxılıb.
            Zəhmət olmasa kodunuzu yoxlayın və yenidən yazın"""
        else:
            for a in soup.find_all('b'):
                l.append(re.sub(r"[<b>,</b>]", "", str(a)))
            c['result'] = True
            c['code'] = "Abonent kodu: " + l[1]
            c['name'] = "Ad: " + l[3]
            c['debt'] = "Borc: " + l[5] + " AZN"
    return render(request, test, c)
def get_content(url):
    comments = []
    html = get_html(url)
    soup = b(html, 'lxml')
    liTags = soup.find_all('li', attrs={'class': 'j_thread_list clearfix'})
    for li in liTags:
        comment = {}
        try:
            comment['title'] = li.find('a', attrs={'class': 'j_th_tit'}).text.strip()
            comment['link'] = li.find('a', attrs={'class': 'j_th_tit'})['href']
            comment['name'] = li.find('span', {'class': 'tb_icon_author'})['title']
            comment['time'] = li.find('span', {
                'class': "pull-right is_show_create_time"
            }).text.strip()
            comment['replyNum'] = li.find('span', {
                'class': "threadlist_rep_num center_text"
            }).text.strip()
            comments.append(comment)
            print('complete ' + comment["link"])
        except:
            print("Error")
    return comments
def get_top_machts():
    html = ur('http://football.kulichki.net/')
    bs = b(html.read())
    div = bs.find('div', {"class": 'col2 inl vtop'}).center.table
    tr_list = div.find_all('tr')
    result = ''
    for item in tr_list[1:]:
        if item.find('span') is not None:
            flag = plus_flag(item.find('span').text)
            plus_flag(flag)
            result = result + flag + '\n'
        else:
            a = item.find('p', {"align": "left"}).text
            a = a.replace('\n', '')
            a = a.replace(' ', ' ')
            matchtime = a[1:a.index('.')]
            timeplus = (int(matchtime[:2]) + 2) % 24
            timeplus = str(timeplus)
            if len(timeplus) == 1:
                timeplus = '0' + timeplus
            matchname = a[a.index('.') + 2:a.rindex('-')]
            result = result + '*' + timeplus + matchtime[2:] + '* _' + matchname + '_\n'
    return result
def __init__(self, fixed_url, season_id, season_url, day_url, day):
    self.day_url = day_url
    self.complete_day_url = fixed_url + day_url
    self.season_id = season_id
    self.season_url = season_url
    self.current_day = day
    self.day_ranking = pd.DataFrame(columns=RANKING_COLS)

    # Access the current day raw data, using the current day url.
    day_http_request = requests.get(self.complete_day_url)
    day_page = day_http_request.content
    self.day_soup = b(day_page, features="lxml")

    # There are 14 teams.
    for team in range(14):
        current_team = Team(self.season_id,
                            self.current_day,
                            self.season_url,
                            self.day_url,
                            self.day_soup,
                            team=team)
        self.day_ranking = self.day_ranking.append(current_team.team_attributes_list)
        del current_team

    self.day_ranking.drop_duplicates(inplace=True,
                                     subset=['season', 'day', 'equipe'],
                                     keep='last')
def news(grabyear, grabcountry):
    Country = grabcountry
    Before = grabyear
    url = f"https://www.worldbank.org/en/search?q=global+warming+{Country}+{grabyear}&currentTab=1"
    print(url)
    response = requests.get(url)
    soup = b(response.text, "lxml")

    titles = []
    links = []
    descriptions = []
    titles_html = soup.find_all(
        'h4', attrs={'class': 'list-group-item-heading result-header'})
    links_html = soup.find_all(
        'p', attrs={'class': 'list-group-item-text result-link'})
    descriptions_html = soup.find_all(
        'p', attrs={'class': 'list-group-item-text result-description'})

    for i in range(len(titles_html)):
        titles.append(titles_html[i].text)
        links.append(links_html[i].text)
        descriptions.append(descriptions_html[i].text)

    newsdata = [{
        "articles": titles,
        "links": links,
        "descriptions": descriptions
    }]
    return jsonify(newsdata)
def clg_details():
    if request.method == 'POST':
        text = request.form['mail']
        text = text.split("@")
        query = text[1].split(".")[0] + " college"
        print(query)
        url = []
        if query == "sona college":
            print("SCT|SONA COLLEGE OF TECHNOLOGY IN SALEM,TAMILNADU")
        else:
            for j in search(query, tld="co.in", num=2, stop=2, pause=2):
                url.append(j)
            for i in url:
                html = requests.get(i)
                soup = b(html.content, "html.parser")
                r = soup.find("title")
                print(r.text)
                for q in ['Technology', 'University', 'Institutions',
                          'College', 'Engineering']:
                    if q in r.text:
                        return render_template("index.html", data=r.text)
    return render_template("index.html", data="No such college was found")
def cnt(path, p):
    with open(path) as f2:
        sp = b(f2, 'html.parser')
        text = sp.get_text().lower()
    print 'Enter "no" to exit'
    while True:
        w = raw_input('ENTER THE WORD WANT TO SEARCH IN THE FILE\n')
        if w != 'no':
            cnt = text.count(w)
            print '"%s occured %d times in the file"' % (w, cnt)
            lst.append(w)
            arr.append([w, d, t])
            continue
        else:
            print 'want to see your searches'
            res = raw_input()
            if res != 'no':
                c = Counter(lst)
                print 'Top 5 words searched by you are '
                for i, j in c.most_common(5):
                    print '"%s"::"%d"times' % (i, j)
            hdr = ['words', 'date', 'time']
            sve(p, hdr, arr)
            break
def hackernews(s):
    # Clear the display.
    s.text.delete("1.0", tkinter.END)
    s.enter_webno.delete(0, tkinter.END)
    try:
        s.p = requests.get("https://thehackernews.com").text
        s.soup = b(s.p, features="html.parser")
    except:
        messagebox.showinfo("ERROR",
                            "Please make sure you have an internet connection")
    length = 1
    s.heading1 = []
    s.link4 = []
    s.text.insert(tkinter.INSERT,
                  "\t\tThe contents in hacker news are\n \n".upper())
    for i in s.soup.find_all("h2"):
        s.text.insert(tkinter.INSERT, str(length) + "." + i.text + "\n\n")
        s.heading1.append(i.text)
        length += 1
    length = 0
    for i in s.soup.find_all("a", class_='story-link'):
        s.link4.append(i.get("href"))
        length += 1
    s.sumbit_button = tkinter.Button(s.root,
                                     text="Open",
                                     command=lambda: s.dispaly(s.link4),
                                     fg="red",
                                     relief="sunken")
    s.sumbit_button.place(x=300, y=620, height=20, width=70)
def main(line):
    try:
        tags = soup(b(requests.get((url + line).strip()).content, "lxml"), line)
        dic = {}
        dic[line.strip()] = tags
        put(dic, "SUCCESS")
        keyword.insert(dic, check_keys=False)
        put(line.strip() + " added to MongoClient", "ENDC")
    except Exception as e:
        put(e, "FAIL")
import requests
import re
from bs4 import BeautifulSoup as b

file = open("shutterstock.txt", 'a')
url = "http://www.shutterstock.com/cat.mhtml?autocomplete_id=&language=en&lang=en&search_source=&safesearch=1&version=llv1&searchterm=&media_type=images"
soup = b(requests.get(url).content)
soup = soup.find("div", {"class": "secondary_links clearfix"})
ul = soup.findAll("ul")
for per_ul in ul:
    li = per_ul.findAll('li')
    for per_li in li:
        category = per_li.find('a').text
        category_link = "http://www.shutterstock.com" + per_li.find('a').get('href')
        file.write(str(category) + '\n')
        file.write(str(category_link) + '\n')
        category_soup = b(requests.get(category_link).content)
        category_soup = str(category_soup.find("div", {"class": "grid_pager"}).findAll("span")[1])
        page = 1
        total = int(re.sub(r'[^\d+]', "", category_soup))
        while (page != total):
            print category + (" page ") + str(page) + " of " + str(total)
            category_page = category_link + "?page=%d&thumb_size=mosaic" % page
            category_page = b(requests.get(category_page).content)
            image = category_page.findAll("span", {"class": "gc_clip"})
            if (type(image) is list):
                break
            for per_image in image:
                link = per_image.find('img').get('src')
                # print link
import requests
from bs4 import BeautifulSoup as b

i = 1
dic = {}
while True:
    soup = b(requests.get("http://www.national-football-teams.com/club/" + str(i) +
                          "/2015_1/Real_Madrid.html").content, "lxml")
    club = soup.findAll("div", {"class": "span6"})[2].find('h1')
    if club:
        if club.find('small'):
            club.find('small').replaceWith('')
        dic.update({club.text.encode('ascii', 'ignore'): i})
        print dic
    i += 1
import MySQLdb

db = MySQLdb.connect("localhost", "root", "root", "TESTDB")
cursor = db.cursor()

import re
import requests
from bs4 import BeautifulSoup as b

url = "http://www.shutterstock.com/cat.mhtml?autocomplete_id=&language=en&lang=en&search_source=&safesearch=1&version=llv1&searchterm=&media_type=images"
page_categories = []
soup = b(requests.get(url).content)
soup = soup.find('div', {"class": "secondary_links clearfix"}).findAll("ul")
for per_ul in soup:
    li = per_ul.findAll("li")
    for per_li in li:
        page_categories.append(str(per_li.text))

file = open('shutterstock.txt', 'r')
i = 0
x = file.readlines()
while (i < len(x)):
    # category
    if x[i].replace("\n", "") in page_categories:
        a = str(x[i].replace("\n", "").replace("\"", ""))
        i += 1
    # category link
    elif re.search(r"http.*.html", x[i]):
        b = str(x[i].replace("\n", "").replace("\"", ""))
        i += 1
    # link
    elif re.search(r"http.*.jpg", x[i]):
from bs4 import BeautifulSoup as b
import requests
import json

file = open('nation_data.txt', 'a')
i = 1
dic = {}
DIC = {}
l = ["country", 'id']
while True:
    country = b(requests.get("http://www.national-football-teams.com/country/" + str(i) +
                             "/2015/Italy.html").content,
                "lxml").findAll("div", {"class": "span6"})[2].find('h1')
    if country:
        if country.find('small'):
            country.find('small').replaceWith('')
        dic.update({country.text.encode('utf-8'): i})
        print dic
        i += 1
    else:
        print 546666666666666666666666666666666666666665
        break
json.dump(dic, file)
# file=open("dances_test.txt","a") #todo import requests from bs4 import BeautifulSoup as b import re dic = {} url = "https://en.wikipedia.org/wiki/List_of_dances" uls = b(requests.get(url).content).find("div", {"id": "mw-content-text"}).findAll("ul") for i in range(2, 47): # print uls[46] #(2-47) lis = uls[i].findAll("li") for per_li in lis: try: display_order = ["Dance_name", "Dance_desc", "Dance_link"] dance_name = per_li.find("a").get("title").encode("utf-8") if "(page does not exist)" in str(dance_name): continue else: dic["Dance_name"] = " " + dance_name + " " # print dance_name # else: # file.write(dance_name.encode('utf-8')) # try: dance_link = "https://en.wikipedia.org" + per_li.find("a").get("href") dic["Dance_link"] = " " + dance_link + " " # file.write(dance_link.encode('utf-8')) # print dance_link
import requests
from bs4 import BeautifulSoup as b

main_page = 1
file = open("archive.txt", "a")
while True:
    web = "https://archive.org/details/image?&sort=-downloads&page=%d" % main_page
    soup = b(requests.get(web).content)
    soup = soup.findAll("div", {"class": "collection-title C C2"})
    main_page += 1
    for category in soup:
        category_link = category.find('a').get('href')
        category_name = (category.find('a').find('div').text).encode('utf-8')
        file.write(str(category_name) + '\n')
        file.write(str("https://archive.org" + category_link) + '\n')
        print category_name
        category_page = 1
        while True:
            cat = "https://archive.org" + category_link + "?&sort=-downloads&page=%d" % category_page
            cat_soup = b(requests.get(cat).content)
            cat_soup = cat_soup.findAll('div', {"class": "ttl C C2"})
            category_page += 1
            cat_soup = cat_soup[1:]
            if len(cat_soup) == 0:
                break
            for img in cat_soup:
                link = img.find('a').get('href')
                desc = (img.find('a').text).encode('utf-8')
                file.write(str(desc) + '\n')
                file.write(str("https://archive.org" + link) + '\n')
            print category_page - 1
import requests
from bs4 import BeautifulSoup as b
import MySQLdb

db = MySQLdb.connect("localhost", "root", "root", "TESTDB")
cursor = db.cursor()

url = "http://www.bigstockphoto.com/"

# Get the landing page of the website.
soup = b(requests.get(url).content, "lxml")

# Find all category links.
categories = soup.findAll("div", {"class": "row-fluid"})[4].findAll("a")

# Pick one category at a time.
for per_category in categories:
    cat = "http://www.bigstockphoto.com" + str(per_category.get('href'))
    category_name = per_category.text
    print category_name

    # Rewrite the category url into its query-string form.
    cat = cat.split("category/")[0] + "?category=" + cat.split("category/")[1][:-1]
    cat = cat.split("?")[0] + "?start=0&" + cat.split("?")[1]
    print cat  # category link

    # Initialise the page counter from 0.
    page = 0
    # Iterate over the pages of the category
def soup(website):
    soup = b(requests.get(website).content, "lxml").find("pre")
    return soup
import requests
import re
# from pymongo import MongoClient
# cl = MongoClient()
# coll = cl["local"]["test2"]
from bs4 import BeautifulSoup as b

url = "http://www.differencebetween.net/"
display_order = ["Title1", "Title2", "TitleLink", "Description"]
dic = {}

# Get all categories.
cats = b(requests.get(url).content).find("div", {"id": "featured-cats"}).findAll("h5")
for per_cat in cats:
    # Get the link of each category.
    cat_link = per_cat.find('a').get('href')

    # Get the number of pages, if pagination exists.
    try:
        pages = b(requests.get(cat_link).content).find("span", {"class": "pages"}).text
        pages = re.findall(r'.[\d]', pages)
        pages = int(pages.pop().strip())
    except Exception as e:
        print e

    for curr_page in range(1, pages + 1):
        cat_soup = cat_link + "page/%d/" % curr_page
        print "*************************************************************************************************************"
        print cat_soup
import json

file = '/home/varun/Desktop/lets-football/crawler/nation_data.txt'
dic = json.load(open(file))
print "Enter country"
url = ("http://www.national-football-teams.com/country/" + str(dic[raw_input()]) +
       "/2015/Italy.html")

import requests
from bs4 import BeautifulSoup as b

soup = b(requests.get(url).content, "lxml")
club = []
table = soup.find("table", {"class": "sortable"}).find('tbody').findAll('tr')
for per_tr in table:
    td = per_tr.findAll('td')
    for i in range(5):
        if i == 3:
            continue
        print td[i].text.strip()
    club.append(td[4].text.strip().encode('ascii', 'ignore'))
    print '---------------------------'

total_clubs = len(club)
dic = {x: club.count(x) for x in club}
print "Below are the team club stats in percent"
for i in dic:
    print i + " \t\t\t| %.3f" % ((100.000 * dic[i]) / total_clubs) + "\t\t|" + str(dic[i])