def main():
    """Main entry point for the script."""
    url = 'https://blog.docker.com/category/engineering/'
    response = requests.get(url)

    blog_list = []
    response_data = BeautifulSoup(response.text, "html.parser")
    # An empty parse tree means the request returned no usable HTML.
    if len(response_data) == 0:
        print("no data returned")
        exit(1)

    # 'entry-title' is a CSS class on the post headings, not a tag name,
    # so match it with class_ instead of find_all('entry-title').
    for each_title in response_data.find_all(class_='entry-title'):
        title = each_title.text
        blog_list.append(title)
        print(title)

    print('End main')
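# How the script above might be invoked -- a minimal sketch, assuming only the
# requests and bs4 packages that main() uses; in a real file these imports
# would sit above the function definition.
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    main()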
def get_content(self):
    # Strip the HTML tags and return at most 60 characters of plain text.
    content = self.content
    text = BeautifulSoup(content, 'html.parser').text
    if text and len(text) > 60:
        return text[:60] + '...'
    return text
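# The method above reads like a list-view preview hook (self.content suggests a
# model field). A standalone sketch of the same strip-and-truncate idea; the
# helper name and default limit here are illustrative, not from the original.
from bs4 import BeautifulSoup

def preview_text(html, limit=60):
    """Return the tag-free text of html, truncated to limit characters."""
    text = BeautifulSoup(html, 'html.parser').text
    return text[:limit] + '...' if len(text) > limit else text

# Example: preview_text('<p>Hello <b>world</b>, this is a long post body...</p>')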
def get_article_content():
    print("Fetching the article index...")
    content_list = []
    for page in range(1, 26):
        # Fetch the listing page for this page number.
        htmltext = get_conn(url_jichupian + "list_1_" + str(page) + ".html").text
        soup = BeautifulSoup(htmltext, 'html.parser')
        # The second <ul> holds the article list; each title sits in an <h3>.
        headings = soup.find_all("ul")[1].find_all("h3")
        for heading in headings:
            content_list.append(heading.get_text())
    save_list(content_list)
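# save_list() is called above but not shown in the snippet. A minimal sketch of
# what it might do, assuming the titles are simply written one per line; the
# file name is a placeholder, not taken from the original code.
def save_list(items, path="article_titles.txt"):
    with open(path, "w", encoding="utf-8") as f:
        for item in items:
            f.write(item + "\n")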
def load_page(self):
    conn = self.connect_database()
    cursor = conn.cursor()
    try:
        browser = AcademyRankInfo.browser
        # Leave time for the operator to click "load more" on the page by hand;
        # there is no better option here.
        print("Please click 'load more' on the page now; finish within 90 seconds")
        time.sleep(90)
        print("Scraping has started; please do not touch the browser")
        html = browser.page_source
        doc = pq(html)
        soup = BeautifulSoup(str(doc), 'html.parser')
        # Every ranking row is a <tr> inside the .tbody-container element.
        trs = soup.select('.tbody-container tr')
        print('number of rows:', len(trs))
        # SELECT aca_no FROM academy_info WHERE aca_name = '北京大学'
        for tr in trs:
            try:
                aca_ranking = int(tr.select('.t1')[0].get_text())
                aca_name = tr.select('.t2')[0].get_text()
                query = Query()
                aca_id = str(query.query_acaIdByacaName(aca_name))
                cur_sql_academyRank = (
                    "insert into " + AcademyRankInfo.insert_db +
                    " (aca_id, aca_name, aca_ranking) values ('" +
                    aca_id + "','" + aca_name + "','" + str(aca_ranking) + "')"
                )
                print(cur_sql_academyRank)
                cursor.execute(cur_sql_academyRank)
                conn.commit()
            except Exception as e:
                print(repr(e))
    except TimeoutException:
        print("Failed to scrape the academy rankings")
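# The INSERT above is built by string concatenation, which breaks on names
# containing quotes. Assuming the driver behind connect_database() follows the
# common DB-API "%s" paramstyle (the driver is not shown in the snippet), the
# per-row insert could be parameterized roughly like this; table and column
# names come from the original code, everything else is illustrative.
def insert_ranking(cursor, conn, table, aca_id, aca_name, aca_ranking):
    sql = ("insert into " + table +
           " (aca_id, aca_name, aca_ranking) values (%s, %s, %s)")
    cursor.execute(sql, (aca_id, aca_name, aca_ranking))
    conn.commit()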
def query(self, query):
    page = 1
    thumbs = []
    print("Fetching results")
    while True:
        # Walk the paginated search results until a page comes back empty.
        r = requests.get("https://alpha.wallhaven.cc/search?q=" +
                         query.replace(" ", "+") + "&page=" + str(page))
        results = BeautifulSoup(r.content, "html.parser").find_all(
            "a", {"class": "preview"})
        r.close()
        if not results:
            break
        thumbs.extend(results)
        page += 1
    print("Found {} results".format(len(thumbs)))
    self.results = thumbs
    self.handle()
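# The search URL above is assembled by hand. requests can build and encode the
# query string itself, which also copes with characters other than spaces. A
# minimal sketch of the same request done that way; the parameter names q and
# page mirror the URL used above.
import requests

def fetch_search_page(query, page):
    r = requests.get("https://alpha.wallhaven.cc/search",
                     params={"q": query, "page": page})
    return r.content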
# Ported from the original Python 2 urllib2 code to urllib.request.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def lyricsMint(song, album, save):
    print(song, "Searching...")
    # Search for the song on lyricsmint.
    url = "http://www.lyricsmint.com/search?q=" + "+".join(song.split())
    html = urlopen(Request(url=url))
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.select('.blog-posts.hfeed .date-outer')

    # No lyrics found at all.
    if len(soup) == 0:
        print("No Lyrics found")
        return

    lyric = None
    href = []
    # Keep only letters, digits and spaces so the matching below is forgiving.
    song = ''.join(ch for ch in song if ch.isalnum() or ch.isspace())
    album = ''.join(ch for ch in album if ch.isalnum() or ch.isspace())

    # Collect every search hit whose title shares a word with the song name.
    for s in soup:
        temp = s.select('.post-title a')[0]
        for name in song.split():
            if name.lower() in temp.text.lower():
                href.append((temp.text, temp['href']))
                break

    if len(href) == 1:
        lyric = href[0]
    else:
        # Nothing matched the song name: fall back to every result.
        if len(href) == 0:
            for entry in soup:
                title = entry.select('.post-title a')[0]
                href.append((title.text, title['href']))
        # Narrow down by album name, and ask the user if it is still ambiguous.
        temp = []
        for text, link in href:
            for name in album.split():
                if name.lower() in text.lower():
                    temp.append((text, link))
                    break
        if len(temp) == 1:
            lyric = temp[0]
        else:
            for i, (text, link) in enumerate(temp):
                print(i, text)
            choice = int(input("Enter the choice: "))
            lyric = temp[choice]

    # Fetch the chosen page and save the lyrics paragraph.
    html = urlopen(Request(url=lyric[1]))
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.select('#lyric')
    with open(save, 'w') as file:
        file.write(soup[0].select('p')[0].text)
    print('Downloaded!')
def get_equip(self, link):
    # The list we will return.
    equip = []
    # Request the page.
    response = requests.get(link)
    # Everything sits in a try/except because the page may have no staff
    # section at all; given how the markup is built there is no cheaper check.
    try:
        # Grab the text block that contains the staff list.
        prima_parte = \
            BeautifulSoup(response.content, "html.parser").find(
                id="accordion-content-equipe-%c2%bb").contents[1].getText()
        # If it contains ':' the structure is the simple one.
        if ":" in prima_parte:
            # Two pages have a different layout; rather than writing much more
            # code for just those two cases, they are special-cased below.
            if len(prima_parte) < 3000 and "Antonio RAMPONI" not in prima_parte:
                # Normal case: take the part after the first ':', split it on
                # newlines, strip each entry, then drop empty cells, section
                # labels and single-word leftovers.
                parts = [part.strip()
                         for part in prima_parte.split(":")[1].split("\n")]
                equip = [part for part in parts
                         if part
                         and part not in ("Struttura semplice",
                                          "Strutture semplici",
                                          "Coordinatore Infermieristico")
                         and len(part.split()) != 1]
            # The two odd pages are filled in by hand.
            elif "Antonio RAMPONI" in prima_parte:
                equip = [
                    "Cristiana BOZZOLA", "Francesca FOTI", "Angela GIACALONE",
                    "Monica LEUTNER", "Emanuela UGLIETTI", "Guido VALENTE"
                ]
            else:
                equip = [
                    "Patrizia NOTARI", "Matteo VIDALI", "Vessellina KRUOMOVA",
                    "Giuseppina ANTONINI", "Ilaria CRESPI", "Luisa DI TRAPANI",
                    "Lucia FRANCHINI", "Roberta Rolla", "Marco Bagnati",
                    "Patrizia PERGOLONI"
                ]
        else:
            # Without ':' a plain comma split is enough.
            equip = prima_parte.strip().split(",")
    except AttributeError:
        pass
    # Work around pages where the whole list ends up in a single string.
    if len(equip) == 1 and len(equip[0]) > 20:
        equip = equip[0].split(',')
    # Close the connection.
    response.close()
    # Return the list.
    return equip
import csv
from itertools import zip_longest

import requests
from bs4 import BeautifulSoup
import boto3
from botocore.config import Config
from botocore.exceptions import NoCredentialsError

registrationNumber = []
result = []
names = []
marks = []
missingNumbers = []

# The response body is a small '~'-separated record, so the fields are pulled
# out by splitting on '~' rather than by walking the parse tree.
for x in range(1400000001, 1499999999):
    try:
        url = "https://www.vidyavision.com/results/ssc2014.aspx?h=" + str(x)
        page = requests.get(url).text
        soup = BeautifulSoup(page, "html.parser")
        if len(soup) != 0:
            fields = str(soup).split('~')
            registrationNumber.append(fields[0])
            names.append(fields[1])
            result.append(fields[-1])
            marks.append(fields[-3])
            print(str(x) + " OK")
        else:
            missingNumbers.append(x)
            print(str(x) + " Not OK")
    except Exception as e:
        print("Error " + str(x) + ": " + repr(e))

d = [registrationNumber, names, marks, result]
export_data = zip_longest(*d, fillvalue='')
with open('output.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(("registrationNumber", "names", "marks", "result"))
    # Write the collected rows under the header.
    wr.writerows(export_data)
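# The snippet imports boto3 and NoCredentialsError but never uses them, which
# suggests the CSV was meant to be pushed to S3 afterwards. A minimal sketch of
# such an upload; the bucket and key names are placeholders, not taken from the
# original code.
def upload_csv_to_s3(path="output.csv", bucket="my-results-bucket", key="output.csv"):
    s3 = boto3.client("s3")
    try:
        s3.upload_file(path, bucket, key)
        print("Uploaded " + path + " to s3://" + bucket + "/" + key)
    except NoCredentialsError:
        print("AWS credentials not found; skipping upload")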