def html_to_match_objects(html_page):
    soup = BeautifulSoup(html_page, 'html.parser')
    title_div = soup.div(class_="sidebar-body-title ng-binding")
    if title_div:  # find_all returns an empty ResultSet (never ""), so test truthiness
        title = title_div[0].text
        print(title)
    match_data = soup.div(class_="sidebar-body-item ng-scope")
    if not match_data:
        return
    string_list = []
    # get all table content
    for match in match_data[0].find_all('tr'):
        if match.text:
            string_list.append(match.text)
    data_list = string_to_list(string_list)
    return generate_objects(data_list)
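# string_to_list and generate_objects are called above but not defined in this
# collection. A minimal sketch of what they might look like; the Match class
# and the whitespace-splitting are assumptions, not the original implementation:
class Match(object):
    def __init__(self, fields):
        self.fields = fields

def string_to_list(string_list):
    # split each row's text into whitespace-separated fields
    return [s.split() for s in string_list if s.strip()]

def generate_objects(data_list):
    return [Match(fields) for fields in data_list]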
def function3(link4):
    '''Scrape all the phone attributes from a page, e.g.
    http://www.gsmarena.com/apple_ipad_air-5797.php, and add them to the database.'''
    try:
        dict_y_update()
        data3 = requests.get(link4)
        soup5 = BeautifulSoup(data3.text)
        soup6 = BeautifulSoup(str(soup5.div(id="specs-list")))

        def empty_tbl_dt1():
            # normalise the label cell's text into a column-name-safe key
            p = "_" + tbl_dt1.get_text().replace(u'\xa0', 'h').replace(u'\xc2', 'h') \
                .replace(u' ', '_').replace(u'.', '_').replace(u'-', '_')
            if p == "_h" or p == '_hh':
                return "_" + table.th.get_text() + "_Extra"
            else:
                return p

        Comments = ''
        for para in soup6.find_all('p'):
            Comments = Comments + para.get_text() + ". "
        Extra_comments = str(Comments.encode('utf-8').replace('[. ', ''))
        model_name = soup5.h1.get_text()
        img_url = BeautifulSoup(str(soup5.div(id="specs-cp-pic"))).img["src"]
        print model_name, img_url, Extra_comments
        company = model_name.split()[0]
        try:
            connect_to_db()
            cur.execute("SELECT * from models where model_name = (%s)", [model_name])
            cur.fetchone()[1]
            print "--------------Already Present In Database-------------------"
        except:
            for table in soup6.find_all("table"):
                soup7 = BeautifulSoup(str(table))
                for tbl_dt1, tbl_dt2 in zip(soup7.find_all("td", class_="ttl"),
                                            soup7.find_all("td", class_="nfo")):
                    #print empty_tbl_dt1(), "---------", tbl_dt2.get_text().encode("utf-8")
                    y.update({"_model_name": model_name, "_gsm_link": link4,
                              "img_url": img_url, "Extra_comments": Extra_comments,
                              "company": company})
                    y.update({empty_tbl_dt1(): tbl_dt2.get_text().encode("utf-8")})
            print "--------------------------------------------------------"
            for key, value in dict.items(y):
                print key, "--", value
            print "--------------------------------------------------------"
            print "--------------------------------------------------------"
            connect_to_db()
            cur.execute(
                "INSERT INTO models (model_name, Company, Gsm_link, Extra_comments, "
                "Image_url, _2G_Network, _3G_Network, _4G_Network, Sim, Announced, "
                "Status, General_Extra, Dimensions, Weights, Keyboard, Body_Extra, "
                "Type, Size, Multitouch, Protection, Display_Extra, Alert_Types, "
                "Loudspeaker, _3_5mm_jack, Sound_extra, Card_Slot, Internal, "
                "Phonebook, Call_Records, Memory_Extra, GPRS, EDGE, Speed, WLAN, "
                "Bluetooth, Infrared_Port, USB, NFC, DATA_Extra, _Primary, Features, "
                "Video, Secondary, Camera_Extra, OS, Chipset, CPU, GPU, Sensors, "
                "Messaging, Browser, Radio, GPS, Java, Colours, Games, Clock, Alarm, "
                "Languages, Features_Extra, Battery_Extra, Stand_By, Talk_Time, "
                "Music_Play, Price_Group, SAR_US, SAR_EU, MISC_Extra) VALUES "
                "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                [y['_model_name'], y['company'], y['_gsm_link'], y['Extra_comments'],
                 y['img_url'], y['_2G_Network'], y['_3G_Network'], y['_4G_Network'],
                 y['_SIM'], y['_Announced'], y['_Status'], y['_General_Extra'],
                 y['_Dimensions'], y['_Weight'], y['_Keyboard'], y['_Body_Extra'],
                 y['_Type'], y['_Size'], y['_Multitouch'], y['_Protection'],
                 y['_Display_Extra'], y['_Alert_types'], y['_Loudspeaker_'],
                 y['_3_5mm_jack_'], y['_Sound_Extra'], y['_Card_slot'], y['_Internal'],
                 y['_Phonebook'], y['_Call_records'], y['_Memory_Extra'], y['_GPRS'],
                 y['_EDGE'], y['_Speed'], y['_WLAN'], y['_Bluetooth'],
                 y['_Infrared_port'], y['_USB'], y['_NFC'], y['_DATA_Extra'],
                 y['_Primary'], y['_Features'], y['_Video'], y['_Secondary'],
                 y['_Camera_Extra'], y['_OS'], y['_Chipset'], y['_CPU'], y['_GPU'],
                 y['_Sensors'], y['_Messaging'], y['_Browser'], y['_Radio'],
                 y['_GPS'], y['_Java'], y['_Colors'], y['_Games'], y['_Clock'],
                 y['_Alarm'], y['_Languages'], y['_Features_Extra'],
                 y['_Battery_Extra'], y['_Stand_by'], y['_Talk_time'],
                 y['_Music_play'], y['_Price_Group'], y['_SAR_US'], y['_SAR_EU'],
                 y['_MISC_Extra']])
            disconnect_to_db()
            y.clear()
            dict_y_update()
    except:
        connect_to_db()
        cur.execute("INSERT INTO errorlogs (error_link) VALUES (%s)", [link4])
        disconnect_to_db()
        y.clear()
        dict_y_update()
def gettingOldData():
    # open the saved history file; passing the bare filename string to
    # BeautifulSoup would parse the string "historyUser.html" itself
    with open("historyUser.html", "r") as fp:
        droup = BeautifulSoup(fp, "html.parser")
    oldDiv = droup.div("target")
    newCal = oldDiv
    return newCal
def parse_commit(self, branch):
    N = test_last_page(baseURL + branch.commit_url)
    print 'Branch: %s' % branch.branch_name
    print 'Total pages:%s' % N
    for i in range(N, 0, -1):
        try:
            req = urllib2.urlopen(baseURL + branch.commit_url + '?page=' + str(i))
            result = req.read()
            soup = BeautifulSoup(result)
            commit_list = []
            for d in soup.div():
                if d.has_attr('class') and 'js-navigation-container' in d.attrs['class']:
                    h3_list = d.findAll('h3')
                    ol_list = d.findAll('ol')
                    if len(h3_list) == len(ol_list):
                        for index in range(len(h3_list)):
                            h3_date = datetime.datetime.strptime(
                                h3_list[index].string, '%b %d, %Y').date()
                            for li in ol_list[index].findAll('li'):
                                commit = Commit(li.p.a['href'], h3_date)
                                commit.parse_parent_info()
                                sys.stderr.write('Parent info %s\n'
                                                 % '\t'.join(commit.parent_sha_list))
                                commit_list.append(commit)
                    else:
                        print 'Error! h3 and ol do not match!'
            commit_list.reverse()
            for commit in commit_list:
                # self.branch_commit_fp.write('%s %s %s %s\n' % (branch.branch_name, commit.commit_sha, commit.commit_date.strftime('%m/%d/%Y'), '\t'.join(commit.parent_sha_list)))
                self.logger.info('Commit:%s (%s) in Branch:%s Parent:%s'
                                 % (commit.commit_sha,
                                    commit.commit_date.strftime('%m/%d/%Y'),
                                    branch.branch_name,
                                    '\t'.join(commit.parent_sha_list)))
                if commit not in self.visited_commit:
                    # self.retrieve_commit(commit)
                    self.visited_commit.add(commit)
        except urllib2.HTTPError, e:
            print e
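# The Commit class used above is not included in this collection. A minimal
# sketch of the constructor this code appears to assume; the sha extraction
# mirrors the split('/')[-1] pattern in parse_parent_info below, the rest is
# an assumption:
class Commit(object):
    def __init__(self, href, commit_date):
        self.href = href
        # the commit sha is the last path segment of the commit URL
        self.commit_sha = href.strip().split('/')[-1]
        self.commit_date = commit_date
        self.parent_sha_list = []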
def scrape(serial_number):
    scraper = PyScraper()
    #scraper.get('http://www.cpic-cipc.ca/English/searchformbikes.cfm')
    url = 'http://app.cpic-cipc.ca/English/searchFormResultsbikes.cfm'
    raw_params = {
        'ser': serial_number,
        #'sType': 'Bicycles',
        'Submit': 'Begin Search',
    }
    params = urllib.urlencode(raw_params)
    data = scraper.post(url, params)
    soup = BeautifulSoup(data)
    entries = []
    main = soup.div(id='wb-main-in')
    hrs = soup.findAll('hr', title="")
    for hr in hrs:
        p = hr.find_next_sibling("p")
        # the labelled values follow the "Status:" label as alternating text nodes
        texts = p.find("strong", text="Status:").find_all_next(text=True)
        entry = {
            'Status': texts[1],
            'Serial': texts[4],
            'Make': texts[7],
            'Model': texts[10],
            'Colour': texts[13],
            'Speeds': texts[16],
        }
        #print entry
        entries.append(entry)
    return entries
def parse_company_urls(html):
    soup = BeautifulSoup(html, 'lxml')
    pages = [{
        'company': div.a.text,
        'url': div.a.attrs['href']
    } for div in soup.div(class_='mp_cassette_title')]
    return pages
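# A quick self-contained check of parse_company_urls against a minimal HTML
# fragment; the markup below is made up, only the class name comes from above:
sample = ('<div><div class="mp_cassette_title"><a href="/c/1">Acme</a></div>'
          '<div class="mp_cassette_title"><a href="/c/2">Globex</a></div></div>')
print(parse_company_urls(sample))
# [{'company': 'Acme', 'url': '/c/1'}, {'company': 'Globex', 'url': '/c/2'}]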
def function1(link3):
    '''Returns all the pages to be scraped for a particular company, e.g.
    http://www.gsmarena.com/samsung-phones-9.php. It extracts all the
    navigation links present at the bottom of the page.'''
    data2 = requests.get(link3)
    soup3 = BeautifulSoup(data2.text)
    soup4 = BeautifulSoup(str(soup3.div(class_="nav-pages")))
    if soup4.get_text() == '[]':  # some pages have no navigation pages
        print link3
        function2(link3)
        print "----------------------------------------------"
    else:
        print link3
        function2(link3)
        link1 = "http://www.gsmarena.com/" + soup4.a['href']
        print link1
        function2(link1)
        for links in soup4.find_all('a'):
            link2 = "http://www.gsmarena.com/" + links['href']
            if link2 != link1:
                print link2
                function2(link2)
        print "-----------------------------------------------------------------"
def main():
    target = "https://www.biqubao.com/book/17570/"
    save_path = 'E:/'
    index_path = 'https://www.biqubao.com'
    req = requests.get(url=target)
    req.encoding = 'gbk'  # gbk is the site's encoding
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    print(type(list_tag))
    # find the title
    title = list_tag[0].dl.dt.string
    path = save_path + '/' + title
    if not os.path.exists(path):
        os.mkdir(path)
    print(1)
    for tag in list_tag[0].dl.find_all('dd'):
        chapter_name = tag.string
        print(2)
        chapter_url = index_path + tag.a.get("href")
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = "gbk"
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        text = chapter_soup.div.find(id="content")
        print(type(text))
        content_text = str(text.text.replace('\xa0', '\n'))
        with open(path + '/' + chapter_name + '.txt', 'w', encoding='utf-8') as f:
            f.write('Source URL: ' + chapter_url)
            f.write(content_text)
def function1(link3):
    '''Returns all the pages to be scraped for a particular company, e.g.
    http://www.gsmarena.com/samsung-phones-9.php. It extracts all the
    navigation links present at the bottom of the page.'''
    data2 = requests.get(link3)
    soup3 = BeautifulSoup(data2.text, "lxml")
    soup4 = BeautifulSoup(str(soup3.div(class_="nav-pages")), "lxml")
    if soup4.get_text() == '[]':  # some pages have no navigation pages
        #print("NoNav")
        #print(link3)
        function2(link3)
        #print("-------------------No1--------------------------")
    else:
        #print("Nav")
        #print(link3)
        function2(link3)
        link1 = "http://www.gsmarena.com/" + soup4.a['href']
        #print(link1)
        function2(link1)
        for links in soup4.find_all('a'):
            link2 = "http://www.gsmarena.com/" + links['href']
            if link2 != link1:
                print(link2)
                function2(link2)
def get_url(url):
    print('GET URL')
    try:
        contents = urllib.request.urlopen(url).read()
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, "html5lib")
        print('CONTENTS:', contents)
        print('PAGE:', page)
        # Title
        titulo = soup.title.string
        print('TITLE:', titulo)
        # print('PARSE:', soup.div(id='contenedor_central'))
        # print('PARSE:', soup.div(id='principal'))
        # print('PARSE:', soup.div(id='fecha_creditos'))
        # Date
        fecha = soup.div(id='fecha_actividad')
        print('DATE:', fecha)
        # Type
        tipo = soup.div(id='online')
        print('TYPE:', tipo)
        # Centre
        centro = soup.div(id='centro')
        print('CENTRE:', centro)
        # Speakers
        ponentes = soup.find_all(['a'], href=re.compile('idponente'))
        print('SPEAKERS:', ponentes)
        exit(0)  # debugging stop: the code below never runs
        print('PARSE:', soup.div(id='actividad'))
        print('PARSE0:', soup.find_all(['div'], attrs={"class": "contenedor_actividad"}))
        print('PARSE0:', soup.find_all(['div'], attrs={"class": 'cabeceraDetalleActividad'}))
        print('PARSE0:', soup.find_all(['div'], attrs={"class": 'cajasActividad'}))
        #print('CLEAN PARSE:', soup.prettify())
        return soup.title.string
    except Exception as e:
        return e
def function2(phn_links):
    '''Returns all the phone links on a page, e.g.
    http://www.gsmarena.com/amazon-phones-76.php'''
    #phn_links = "http://www.gsmarena.com/amazon-phones-76.php"
    phn_links_page = requests.get(phn_links)
    phn_soup = BeautifulSoup(phn_links_page.text)
    phn_soup2 = BeautifulSoup(str(phn_soup.div(class_="makers")))
    for link in phn_soup2.find_all('a'):
        link = "http://www.gsmarena.com/" + link['href']
        function3(link)
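# How the three GSMArena functions fit together: function1 walks a maker's
# navigation pages, function2 collects the phone links on each page, and
# function3 scrapes one phone into the database. A hypothetical entry point
# (the maker URL is just the example cited in the docstrings above):
if __name__ == '__main__':
    function1("http://www.gsmarena.com/samsung-phones-9.php")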
def fetch_data():
    """
    Fetch a novel directly from the novel site.
    :return:
    """
    # homepage of the novel we want to crawl
    target = "https://www.biqubao.com/book/17570/"
    # local save path
    save_path = "E:/爬虫练习/spider_data/small_story"
    # root path of the site being crawled
    index_path = "https://www.biqubao.com"
    req = requests.get(target)
    # requests' default encoding does not match the site's response,
    # so switch to the gbk encoding the site actually uses
    print(req.headers)
    print(req.encoding)
    req.encoding = 'gbk'
    # parse the HTML
    soup_object = BeautifulSoup(req.text, "html.parser")
    list_tag = soup_object.div(id="list")
    # print the link elements that hold each chapter's path
    print('list_tag:', list_tag)
    # get the novel's title
    story_title = list_tag[0].dl.dt.string
    # create a folder named after the novel if it does not exist yet
    dir_path = save_path + '/' + story_title
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    i = 0
    # loop over every chapter, getting its name and its URL
    for ddTag in list_tag[0].dl.find_all('dd'):
        i += 1
        # chapter name
        chapter_name = ddTag.string
        # chapter URL
        chapter_url = index_path + ddTag.a.get('href')
        # visit the chapter's page and crawl its body text
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = 'gbk'
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        # the tag that holds the body text
        content_tag = chapter_soup.div.find(id="content")
        # get the body text, replacing non-breaking spaces with newlines
        content_text = str(content_tag.text.replace('\xa0', '\n'))
        # write the current chapter to a txt file named after the chapter
        with open(dir_path + '/' + chapter_name + '.txt', 'w', encoding='utf-8') as f:
            f.write('Source URL: ' + chapter_url)
            f.write(content_text)
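# Why req.encoding = 'gbk' matters above: requests guesses the charset from
# the response headers, and when the guess is wrong, .text is mojibake. A
# minimal illustration with hand-made bytes (no network needed):
raw = '第一章'.encode('gbk')
print(raw.decode('ISO-8859-1'))  # what a wrong charset guess yields: garbage
print(raw.decode('gbk'))         # decoded with the site's real encoding: 第一章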
def topaqu(target, index_path, type):
    global q, all_novel
    try:
        req = requests.get(url=target)
        # requests' default encoding does not match the site's response,
        # so switch to the gbk encoding the site actually uses
        print(req.encoding)
        req.encoding = 'gbk'
        # parse the HTML
        soup = BeautifulSoup(req.text, "html.parser")
        list_tag = soup.div(id="list")
        if len(list_tag) < 1:
            list_tag = soup.findAll(name="div", attrs={"class": "listmain"})
        print('list_tag:', list_tag)
        # get the novel's title
        story_title = list_tag[0].dl.dt.string
        # create a folder named after the novel if it does not exist yet
        dir_path = save_path + '/' + story_title
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        # loop over every chapter, getting its name and its URL
        q = collections.deque()
        num = 0
        for dd_tag in list_tag[0].dl.find_all('dd'):
            num = num + 1
            # chapter name
            chapter_name = dd_tag.string
            # chapter URL
            chapter_url = index_path + dd_tag.a.get('href')
            # novelList[str(dd_tag.a.get('href')).split("/")[-1].split(".")[0]] = chapter_name + ';' + chapter_url
            print(str(dd_tag.a.get('href')).split("/")[-1].split(".")[0])
            print(chapter_name + ';' + chapter_url)
            q.append(str(dd_tag.a.get('href')).split("/")[-1].split(".")[0]
                     + ';' + chapter_name + ';' + chapter_url)
        print("Total chapters: " + str(num))
        threads = []
        all_novel = {}
        for i in range(0, 6):
            t = threading.Thread(target=get_zj, args=(i, ))
            threads.append(t)
            t.start()
        for j in threads:
            j.join()
        print("******** All threads finished ************")
        novel = sorted(all_novel.items(), key=lambda x: x[0])
        txt = open(dir_path + '/' + story_title + '.txt', 'a', encoding="utf-8")
        for a in novel:
            # write each chapter into the single txt file, in order
            txt.write(a[1])
            txt.write('\n')
        txt.close()
    except Exception as e:
        print("Exception occurred: " + str(e))
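# get_zj, the worker each thread above runs, is not included in this
# collection. A minimal sketch of what topaqu appears to assume: queue items
# are "number;chapter_name;chapter_url" strings (so chapter names must not
# contain ';'), and all_novel keys must sort into chapter order. The gbk
# decoding and #content parsing mirror the other biqubao snippets here;
# everything else is guesswork:
def get_zj(thread_id):
    global q, all_novel
    while True:
        try:
            num, chapter_name, chapter_url = q.popleft().split(';')
        except IndexError:
            # the deque is drained; this worker is done
            return
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = 'gbk'
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        content_tag = chapter_soup.div.find(id="content")
        all_novel[int(num)] = chapter_name + '\n' + str(
            content_tag.text.replace('\xa0', '\n'))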
def add_column_timestamp(db_conn, alter_table=False):
    """
    Adds a timestamp column to the table, based on the consumption date
    parsed from the user's url_review.
    """
    c = db_conn.cursor()
    table_name = 'user_reviews'
    col_timestamp = 'timestamp'
    reviews_path = "/mnt/f90f82f4-c2c7-4e53-b6af-7acc6eb85058/crawling_data/goodreads_crawl/user_reviews/"
    if alter_table:
        c.execute("ALTER TABLE {0} ADD COLUMN {1} {2}".format(
            table_name, col_timestamp, 'INTEGER'))
    c.execute("SELECT * FROM {0}".format(table_name))
    all_rows = c.fetchall()
    i = 0
    for tupl in all_rows:
        logging.info(
            "-> Looking at tuple {0} of {1}. User: {2}, Review: {3}".format(
                i, len(all_rows), tupl[0], tupl[1]))
        i += 1
        try:
            with open(reviews_path + tupl[1] + '.html', 'r') as fp:
                soup = BeautifulSoup(fp, 'html.parser')
        except Exception as e:
            logging.info("Could not open HTML {0}. Error: {1}".format(tupl[1], e))
            continue
        try:
            date = int(
                soup.div(class_='dtreviewed')[0].find_all(
                    'span', class_='value-title')[0]['title'].replace('-', ''))
        except Exception as e:
            logging.info("Could not parse the date")
            continue
        try:
            c.execute(
                "UPDATE {0} SET {1} = '{2}' WHERE user_id = {3} AND url_review = '{4}'"\
                .format(table_name, col_timestamp, date, tupl[0], tupl[1]))
        except sqlite3.IntegrityError:
            logging.info('ERROR UPDATING VALUES')
            continue
    db_conn.commit()
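# What the date-parsing line above does, against a minimal hand-made fragment
# (the markup is made up; the class names come from the code above). The
# "2014-07-18" title becomes the integer timestamp 20140718:
from bs4 import BeautifulSoup

html = ('<div><div class="dtreviewed">'
        '<span class="value-title" title="2014-07-18"></span></div></div>')
soup = BeautifulSoup(html, 'html.parser')
date = int(soup.div(class_='dtreviewed')[0].find_all(
    'span', class_='value-title')[0]['title'].replace('-', ''))
print(date)  # 20140718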
def crawler_trending(self, href):
    t = ''.join([baseURL, '/trending?', href])
    print t
    try:
        req = urllib2.urlopen(t)
        result = req.read()
        soup = BeautifulSoup(result)
        for d in soup.div():
            if d.attrs.has_key('class') and 'leaderboard-list-content' in d.attrs['class']:
                repos = Repository(d.a['href'])
                self.userQueue.put(User('/' + repos.user))
    except urllib2.URLError as e:
        print e.reason
def fetch_intern_dates(pageDict):
    interns = []
    try:
        html = urllib.request.urlopen(url=pageDict['url'])
    except urllib.error.HTTPError as e:
        print(e)
        print('this company has no internship pages')
        # empty list
        return interns
    soup = BeautifulSoup(html, 'lxml')
    # list of internship divs
    internDivs = soup.div(class_='ts-p-_internshipList-item-info')
    prefix = 'ts-p-_internshipList-item-info-row-'
    titleClassName = prefix + 'title'
    daysClassName = ' '.join(
        [prefix + 'detail-text', prefix + 'detail-text_day'])
    dateClassName = ' '.join(
        [prefix + 'detail-text', prefix + 'detail-text_place'])
    # list of deadline divs
    deadlineDivs = soup.div(
        class_='ts-p-_internshipList-item-entry js-p-entryItem-empty')
    deadlineClassName = 'ts-p-_internshipList-item-entry-deadline'
    for iDiv, dDiv in zip(internDivs, deadlineDivs):
        intern = {
            'company': pageDict['company'],
            'title': iDiv.div()[0].text,
            'days': iDiv.find_all('div', class_=daysClassName)[0].text,
            'date': iDiv.find_all('div', class_=dateClassName)[0].text
        }
        # strip the "エントリー締切:" ("entry deadline:") label from the text
        intern['deadline'] = re.sub(
            'エントリー締切:', '',
            dDiv.find_all('div', class_=deadlineClassName)[0].text)
        interns.append(intern)
    return interns
def scrape_for_vine(query1, query2=""):
    url = ("https://twitter.com/search/realtime?q=vine.co%2Fv%2F+%2B+"
           + query1 + query2 + "&src=typd")
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html)
    vine_url_array = []
    vine_dict = {}
    for instance in soup.find_all('span', {'class': 'js-display-url'}):
        vine_url = instance.get_text()
        vine_url_array.append(vine_url)
    #print vine_url_array
    for i in vine_url_array:
        i = 'http://' + i
        soupe = BeautifulSoup(urllib2.urlopen(i).read())
        link = soupe.source['src']
        title = soupe.p.get_text()
        vine_dict[title] = link
        print soupe.find('div', {'class': 'user'}).img['src']
def crawling_repos_contributors(self, repos, item):
    failure = True
    while failure:
        try:
            print baseURL + repos.href + item
            req = urllib2.urlopen(baseURL + repos.href + item)
            result = req.read()
            soup = BeautifulSoup(result)
            for d in soup.div():
                if d.attrs.has_key('id') and d.attrs['id'] == 'contributors':
                    print d
            failure = False
        except urllib2.URLError as e:
            sys.stderr.write('%s when crawling %s' % (e, repos.href + item))
def parse_branch_name(self):
    self.branches = []
    try:
        # print baseURL+self.target_repos.href
        req = urllib2.urlopen(baseURL + self.target_repos.href)
        result = req.read()
        soup = BeautifulSoup(result)
        for d in soup.div():
            if d.has_attr('class') and 'select-menu-list' in d.attrs['class'] \
                    and d.has_attr('data-tab-filter') and d['data-tab-filter'] == 'branches':
                for item in d.div():
                    if item.has_attr('class') and 'select-menu-item' in item.attrs['class']:
                        branch = Branch(item.a['href'])
                        self.branches.append(branch)
                        self.logger.info('Branch %s' % branch.branch_name)
    except urllib2.HTTPError, e:
        print e
def crawling_repos_followers(self, repos, item):
    failure = True
    while failure:
        try:
            print baseURL + repos.href + item
            req = urllib2.urlopen(baseURL + repos.href + item)
            result = req.read()
            soup = BeautifulSoup(result)
            for d in soup.div():
                if d.attrs.has_key('class') and 'follow-list-container' in d.attrs['class']:
                    user = User(d.a['href'])
                    #self.crawler_user(user)
                    self.userQueue.put(user)
                    self.logger.info('Repository:%s Lang:%s %s:%s'
                                     % (repos.href, repos.lang,
                                        item.split('/')[1], user.user))
            failure = False
        except urllib2.URLError as e:
            sys.stderr.write('%s when crawling %s' % (e, repos.href + item))
def add_column_book_url(db_conn, alter_table=False):
    db_conn.row_factory = lambda cursor, row: row[0]
    c = db_conn.cursor()
    table_name = 'user_reviews'
    col_book = 'url_book'
    reviews_path = "/mnt/f90f82f4-c2c7-4e53-b6af-7acc6eb85058/crawling_data/goodreads_crawl/user_reviews/"
    # Create the column that holds the books' URLs in the consumption table
    if alter_table:
        c.execute("ALTER TABLE {0} ADD COLUMN {1} {2}".format(
            table_name, col_book, 'TEXT'))
    c.execute("SELECT url_review FROM {0}".format(table_name))
    all_rows = c.fetchall()
    i = 0
    for url_review in all_rows:
        logging.info("Looking at row {0} of {1}".format(i, len(all_rows)))
        i += 1
        with open(reviews_path + url_review + '.html', 'r') as fp:
            soup = BeautifulSoup(fp, 'html.parser')
        try:
            url_book = soup.div(class_='bookTitle')[0].get('href')
        except Exception as e:
            logging.info("BOOK URL NOT FOUND: {}".format(e))
            logging.info("Found a conflicting HTML: {}".format(url_review))
            with open("non_user_reviews_htmls.txt", 'a+') as f:
                f.write("{0}\n".format(url_review))
            continue
        try:
            c.execute(
                "UPDATE {0} SET {1} = '{2}' WHERE url_review = '{3}'"\
                .format(table_name, col_book, url_book, url_review))
        except sqlite3.IntegrityError:
            logging.info('ERROR UPDATING VALUES')
    db_conn.commit()
def parse_parent_info(self):
    # crawl the parent commit(s) of the current commit
    self.parent_sha_list = []
    failure = True
    while failure:
        try:
            req = urllib2.urlopen(baseURL + self.href)
            result = req.read()
            soup = BeautifulSoup(result)
            for d in soup.div():
                if d.has_attr('class') and 'commit-meta' in d['class'] and 'clearfix' in d['class']:
                    for s in d.findAll('span'):
                        for a in s.findAll('a'):
                            if a.has_attr('data-hotkey'):
                                self.parent_sha_list.append(a['href'].strip().split('/')[-1])
                                # self.parent_sha = a['href'].strip().split('/')[-1]
            failure = False
        except urllib2.HTTPError, e:
            sys.stderr.write('%s when crawling %s\n' % (e, self.href))
def load_to_db(item_links, month, year):
    for url in item_links:
        print('[FETCH] Loading an item from \'{0}\'...'.format(url))
        try:
            page = BeautifulSoup(urlopen(url).read(), 'html5lib')
            page.div(class_='StockCodeSrp')[0].strong.extract()
            try:
                dollars = float(page.div(class_='StockCodeSrp')[0]
                                .text.strip().replace('$', ''))
            except:
                continue
            if dollars > 8.00 or not page.div(class_='StockCodeDescription'):
                continue
            publisher = page.div(class_='StockCodePublisher')[0].text.strip()
            item = [
                page.div(class_='StockCodeDescription')[0].text,
                str(dollars),
                publisher[publisher.index('\xa0') + 1:],
                url,
                ('http://previewsworld.com' + page.div(class_='StockCodeImage')[0].a.get('href')
                 if page.div(class_='StockCodeImage')[0].a.get('href') else ''),
                month,
                year,
            ]
            try:
                Preview.objects.create(
                    name=item[0],
                    dollars=item[1],
                    rubles=usd_to_rub(float(item[1])),
                    publisher=item[2],
                    src_url=item[3],
                    cover_url=item[4],
                    month=item[5],
                    year=item[6],
                )
            except IntegrityError:
                continue
        except IndexError:
            print('[FETCH] Error loading from \'{0}\''.format(url))
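# usd_to_rub is used above but not defined in this collection. A minimal
# sketch with a hypothetical fixed exchange rate; a real implementation would
# presumably look the current rate up somewhere:
USD_RUB_RATE = 90.0  # hypothetical rate, not from the original code

def usd_to_rub(dollars):
    return round(dollars * USD_RUB_RATE, 2)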
def cnt_story():
    global _dir_path
    req = requests.get(url=target)
    # requests' default encoding does not match the site's response,
    # so switch to the gbk encoding the site actually uses
    print(req.encoding)
    req.encoding = 'gbk'
    # parse the HTML
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    #print('list_tag:', list_tag)
    # get the novel's title
    story_title = list_tag[0].dl.dt.string
    # create a folder named after the novel if it does not exist yet
    _dir_path = save_path + '/' + story_title
    if not os.path.exists(_dir_path):
        os.mkdir(_dir_path)
    cnt = len(list_tag[0].dl.find_all('dd'))
    story_content = list_tag[0].dl.find_all('dd')
    print("Number of chapters: " + str(cnt))
    print(story_content)
    return story_content
def DownloadBook(target):
    # local root path for the crawled text
    save_path = 'Book'
    # root path of the biqubao site
    global index_path
    req = requests.get(url=target)
    # requests' default encoding does not match the site's response,
    # so switch to the gbk encoding the site actually uses
    print(req.encoding)
    req.encoding = 'gbk'
    # parse the HTML
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    print('list_tag:', list_tag)
    # get the novel's title
    story_title = list_tag[0].dl.dt.string
    # create a folder named after the novel if it does not exist yet
    dir_path = save_path + '/' + story_title
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    # loop over every chapter, getting its name and its URL
    for dd_tag in list_tag[0].dl.find_all('dd'):
        # chapter name
        chapter_name = dd_tag.string
        # chapter URL
        chapter_url = index_path + dd_tag.a.get('href')
        # visit the chapter's page and crawl its body text
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = 'gbk'
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        # the tag that holds the body text
        content_tag = chapter_soup.div.find(id="content")
        # get the body text, replacing non-breaking spaces with newlines
        content_text = str(content_tag.text.replace('\xa0', '\n'))
        # write the current chapter to a txt file named after the chapter
        with open(dir_path + '/' + chapter_name + '.txt', 'w', encoding='utf-8') as f:
            f.write('Source URL: ' + chapter_url)
            f.write(content_text)
def detail_page(response, pattern):
    """Get the job requirements and job duties."""
    soup = BeautifulSoup(response, 'lxml')  # build the soup
    target_tag = soup.div(class_="pos-ul")  # grab the div with class "pos-ul"
    if target_tag:
        tag_str = target_tag[0].get_text()
        tag_str = pattern.sub('-', tag_str)
        job_describe = tag_str
        #print(job_describe)
        job_demand = ''
        # get the salary range: lower bound - upper bound
        salary_str = soup.strong.text
        if "面议" in salary_str:    # "negotiable"
            salary_down = 0
            salary_up = 0
        elif "以上" in salary_str:  # "or above"
            salary_down = re.sub("\D", "", salary_str)
            salary_up = 0
        elif "以下" in salary_str:  # "or below"
            salary_down = 0
            salary_up = re.sub("\D", "", salary_str)
        else:
            salary_num = salary_str.find('-')
            salary_down = int(re.sub("\D", "", salary_str[:salary_num]))
            salary_up = int(re.sub("\D", "", salary_str[salary_num:]))
        job_detail = [job_describe, job_demand, salary_down, salary_up]
    else:
        job_detail = ['', '', 0, 0]
    try:
        industry = soup.select('ul.promulgator-ul a')[0].text
        job_detail.append(industry)
    except Exception as e:
        job_detail.append('未知')  # "unknown"
        print(e)
    return job_detail
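# A quick check of the salary-parsing branches above with some sample strings;
# the strings are made up, only the "面议"/"以上"/"以下" markers come from the code:
import re

for salary_str in ['8000元-12000元/月', '20000元以上/月', '3000元以下/月', '面议']:
    if "面议" in salary_str:
        down, up = 0, 0
    elif "以上" in salary_str:
        down, up = re.sub(r"\D", "", salary_str), 0
    elif "以下" in salary_str:
        down, up = 0, re.sub(r"\D", "", salary_str)
    else:
        pos = salary_str.find('-')
        down = int(re.sub(r"\D", "", salary_str[:pos]))
        up = int(re.sub(r"\D", "", salary_str[pos:]))
    print(salary_str, '->', down, up)
# 8000元-12000元/月 -> 8000 12000
# 20000元以上/月 -> 20000 0
# 3000元以下/月 -> 0 3000
# 面议 -> 0 0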
def extract(dirname):
    processed_content = []
    with open("/home/liki/old/warehouse/complete_info") as f:
        lines = f.readlines()
        for i in range(0, len(lines)):
            processed_content.append(lines[i].split('\t')[0])
    os.chdir(dirname)
    for filename in os.listdir("."):
        print filename
        if filename in processed_content:
            continue
        else:
            html = BeautifulSoup(open(filename))
            target = open("/home/liki/old/warehouse/complete_info", 'a+')
            if len(html.find_all(attrs={"class": "dv-meta-info size-small"})) != 0:
                # tabular html page
                table = html.find("table")
                row = table.findAll('tr')
                # movie name
                movie_name = html.h1.contents[0].strip()
                # Starring info (earlier approach, kept for reference):
                # if html.find_all(attrs={"class": "dv-meta-info size-small"})[0].dt.string == "Starring:":
                #     starring = html.find_all(attrs={"class": "dv-meta-info size-small"})[0].dd.string.strip()
                # else:
                #     starring = "NULL_STARRING"
                # include format/genre/releasetime
                header = []
                content = []
                for chld in row:
                    header.extend(chld.findAll('th'))
                    content.extend(chld.findAll('td'))
                for i in range(0, len(header)):
                    header[i] = (''.join(x for x in (header[i].findAll(text=True)))).split(',')
                    content[i] = (''.join(x for x in (content[i].findAll(text=True)))).split(',')
                    for k in range(len(header[i])):
                        header[i][k] = (header[i][k].strip('\n')).strip()
                    for k in range(len(content[i])):
                        content[i][k] = (re.sub(r'\n', '', content[i][k].strip('\n'))).strip()
                tmp = []
                for elem in header:
                    tmp.append(elem[0])
                header = tmp
                if "Genres" in header:
                    i = header.index("Genres")
                    genres = ','.join(content[i])
                else:
                    genres = "NULL_GENRE"
                if "Director" in header:
                    i = header.index("Director")
                    director = ','.join(content[i])
                else:
                    director = "NULL_DIRECTOR"
                if "Starring" in header:
                    i = header.index("Starring")
                    starring = ','.join(content[i])
                else:
                    starring = "NULL_STARRING"
                if "Supporting actors" in header:
                    i = header.index("Supporting actors")
                    actor = ','.join(content[i])
                else:
                    actor = "NULL_actor"
                if "Format" in header:
                    i = header.index("Format")
                    movie_format = ','.join(content[i])
                else:
                    movie_format = "NULL_FORMAT"
                if "time" in header:
                    i = header.index("time")
                    time = ','.join(content[i])
                else:
                    time = "NULL_SHOWTIME"
            else:
                # normal kind of page
                # movie name
                if len(html.div(id="titleSection")) > 0:
                    movie_name = html.div(id="titleSection")[0].find(id="productTitle").string
                    # format info
                    if len(html.div(id="byline")) > 0:
                        index = len(html.div(id="byline")[0].find_all("span")) - 1
                        if index > 0:
                            movie_format = html.div(id="byline")[0].find_all("span")[index].string or u""
                            # lead actor
                            if len(html.div(id="byline")[0].find_all("span")[0]("span")) > 0:
                                if "Actor" in html.div(id="byline")[0].find_all("span")[0]("span")[1].string:
                                    starring = html.div(id="byline")[0].find_all("span")[0].a.string or u"NULL_STARRING"
                                else:
                                    starring = "NULL_STARRING"
                            else:
                                starring = "NULL_STARRING"
                        else:
                            movie_format = "NULL_FORMAT"
                            starring = "NULL_STARRING"
                    else:
                        movie_format = "NULL_FORMAT"
                        starring = "NULL_STARRING"
                else:
                    # other info
                    movie_name = "NULL_MOVIENAME"
                    movie_format = "NULL_FORMAT"
                    starring = "NULL_STARRING"
                if html.find("div", {"id": "detail-bullets"}) != None:
                    tag = html.find("div", {"id": "detail-bullets"}).findAll('li')
                    header = []
                    for elem in tag:
                        header.append(elem.b.string)
                    if "Actors:" in header:
                        i = header.index("Actors:")
                        tmp_1 = tag[i].find_all("a")
                        tmp = [elem.string for elem in tmp_1]
                        actor = ",".join(tmp) or u"NULL_actor"
                    else:
                        actor = "NULL_actor"
                    if "Directors:" in header:
                        i = header.index("Directors:")
                        tmp_1 = tag[i].find_all("a")
                        tmp = [elem.string for elem in tmp_1]
                        if tmp != [None]:
                            director = ",".join(tmp)
                        else:
                            director = "NULL_DIRECTOR"
                    else:
                        director = "NULL_DIRECTOR"
                    if ("DVD Release Date:" in header) or ("VHS Release Date:" in header):
                        if "DVD Release Date:" in header:
                            i = header.index("DVD Release Date:")
                        else:
                            i = header.index("VHS Release Date:")
                        tmp_1 = str(tag[i])
                        mm = re.compile('>(.*?)<', re.S)
                        tmp = mm.findall(tmp_1)
                        time = (tmp[len(tmp) - 1]).strip() or u"NULL_SHOWTIME"
                    else:
                        time = "NULL_SHOWTIME"
                else:
                    actor = "NULL_actor"
                    director = "NULL_DIRECTOR"
                    time = "NULL_SHOWTIME"
                genres = "NULL_GENRE"
            target.write(filename + '\t' + movie_name + '\t' + genres + '\t' + director
                         + '\t' + starring + '\t' + actor + '\t' + movie_format + '\t' + time)
            target.write('\n')
            target.close()
def get_url_actividades(url):
    print('GET URL ACTIVIDADES')
    print('###################')
    try:
        contents = urllib.request.urlopen(url).read()
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, "html5lib")
        print('CONTENTS:', contents)
        print('PAGE:', page)
        # Title
        titulo = soup.title.string
        print('TITLE:', titulo)
        # print('PARSE:', soup.div(id='contenedor_central'))
        # print('PARSE:', soup.div(id='principal'))
        # print('PARSE:', soup.div(id='fecha_creditos'))
        # Activities
        actividades = soup.find_all(['a'], href=re.compile('actividad/idactividad'))
        #print('ACTIVITIES:', actividades)
        for actividad in actividades:
            # activity ID
            link = actividad.get('href')
            idregistro = actividad.get('href').split('/')[3]
            # activity
            titulo = actividad.get_text()
            print('ACTIVITY:', idregistro, titulo, link)
            # activity DIV
            actividad_completa = soup.find(['div'], idregistro=re.compile(idregistro))
            # Date
            fecha = "'Only available for the first three courses in the list'"
            print('DATE:', fecha)
            # Type
            tipo = "'Only available for the first three courses in the list'"
            print('TYPE:', tipo)
            # Centre
            centro = actividad_completa.find(
                ['a'], href=re.compile('indice/idcentro')
            )  # actividad.find_all(['a'], href=re.compile('indice/idcentro'))
            centro = centro.get_text()
            print('CENTRE:', centro)
            # course website
            get_url_curso(web_extension + link)
        print('PARSE0:', soup.find_all(['div'], attrs={"class": "lista_mas_actividades"}))
        exit(0)  # debugging stop: the code below never runs
        print('PARSE:', soup.div(id='actividad'))
        print('PARSE0:', soup.find_all(['div'], attrs={"class": "contenedor_actividad"}))
        print('PARSE0:', soup.find_all(['div'], attrs={"class": 'cabeceraDetalleActividad'}))
        print('PARSE0:', soup.find_all(['div'], attrs={"class": 'cajasActividad'}))
        #print('CLEAN PARSE:', soup.prettify())
        return soup.title.string
    except Exception as e:
        return e
if __name__ == '__main__':
    # homepage of the novel to crawl; to reuse the script, change this URL and
    # make sure the local save root exists
    target = "https://www.biqubao.com/book/17570/"
    # local root path for the crawled text
    save_path = 'd:/'
    # root path of the biqubao site
    index_path = 'https://www.biqubao.com'
    req = requests.get(url=target)
    # requests' default encoding does not match the site's response,
    # so switch to the gbk encoding the site actually uses
    print(req.encoding)
    req.encoding = 'gbk'
    # parse the HTML
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    print('list_tag:', list_tag)
    # get the novel's title
    story_title = list_tag[0].dl.dt.string
    # create a folder named after the novel if it does not exist yet
    dir_path = save_path + '/' + story_title
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    # loop over every chapter, getting its name and its URL
    for dd_tag in list_tag[0].dl.find_all('dd'):
        # chapter name
        chapter_name = dd_tag.string
        # chapter URL
        chapter_url = index_path + dd_tag.a.get('href')
        # visit the chapter's page and crawl its body text
        chapter_req = requests.get(url=chapter_url)
        chapter_req.encoding = 'gbk'
        chapter_soup = BeautifulSoup(chapter_req.text, "html.parser")
        content_tag = chapter_soup.div.find(id="content")
        content_text = str(content_tag.text.replace('\xa0', '\n'))
        # write the current chapter to a txt file named after the chapter
        with open(dir_path + '/' + chapter_name + '.txt', 'w', encoding='utf-8') as f:
            f.write('Source URL: ' + chapter_url)
            f.write(content_text)
def appmain(request):
    youtubeUrl = random.choice(youtube_list_selectOne)
    zd = a = tdColor = character = luckyNum = zdName = ''
    t3 = 'https://www.youtube.com/'
    bgColor = bgColorList[bgColorNum[0]]
    totalNum = 0
    # initialised up front so render() below never sees an undefined name
    zdTotal = zdLove = zdMoney = zdWork = ''
    # (sign name, nifty slug) per zodiac key; the twelve original branches
    # were identical except for these two strings
    zodiacTable = {
        'zodiac1': ('aries', 'ohitsuji'),
        'zodiac2': ('taurus', 'oushi'),
        'zodiac3': ('gemini', 'hutago'),
        'zodiac4': ('cancer', 'kani'),
        'zodiac5': ('leo', 'shishi'),
        'zodiac6': ('virgo', 'otome'),
        'zodiac7': ('libra', 'tenbin'),
        'zodiac8': ('scorpio', 'sasori'),
        'zodiac9': ('sagittarius', 'ite'),
        'zodiac10': ('capricorn', 'yagi'),
        'zodiac11': ('aquarius', 'mizugame'),
        'zodiac12': ('pisces', 'uo'),
    }
    if request.method == "GET":
        form = PostForm(request.GET)
        if form.is_valid():
            zd = form.cleaned_data["zodiac"]
            t = youtubeUrl.split('watch?v=')
            t1 = t[0] + 'embed/'
            t2 = t[1].split('&')
            t3 = t1 + t2[0]
            if zd in zodiacTable:
                sign, nifty_slug = zodiacTable[zd]
                character = ('https://www.vogue.co.jp/assets/commons/img/'
                             'horoscope/daily/' + sign + '_banner.jpg')
                zdName = zdNameDic[zd]
                # scrape the four fortune gauges from the Yahoo! fortune page
                url1 = 'https://fortune.yahoo.co.jp/12astro/' + sign
                req1 = urllib.request.Request(url1)
                response1 = urllib.request.urlopen(req1)
                html1 = response1.read()
                soup1 = BeautifulSoup(html1, "lxml")
                gauge_prefix = 'https://s.yimg.jp/images/fortune/images/common/'
                gauges = {'yftn_param_tot': '', 'yftn_param_lov': '',
                          'yftn_param_mny': '', 'yftn_param_wrk': ''}
                for img in soup1.find_all('img'):
                    for marker in gauges:
                        if img['src'].startswith(gauge_prefix + marker):
                            num = int(re.search(r'\d+', img['src']).group()) / 10
                            gauges[marker] = fortuneList[int(num)]
                zdTotal = gauges['yftn_param_tot']
                zdLove = gauges['yftn_param_lov']
                zdMoney = gauges['yftn_param_mny']
                zdWork = gauges['yftn_param_wrk']
                # get today's score from the nifty fortune page
                url2 = 'https://uranai.nifty.com/f12seiza/' + nifty_slug + '/'
                req2 = urllib.request.Request(url2)
                response2 = urllib.request.urlopen(req2)
                html2 = response2.read()
                soup2 = BeautifulSoup(html2, "lxml")
                a = str(soup2.div(class_='hako'))
                totalNum = int(re.search(r'\d+', a).group())  # extract the score
                bgColor = bgColorList[bgColorNum[totalNum % 6]]
                tdColor = bgColorNum[totalNum % 6]
                luckyNum = totalNum % 9 + 1
                if totalNum < 50:
                    youtubeUrl = youtube_list_selectOne[
                        (luckyNum + 123) % len(youtube_list_selectOne)]
                else:
                    youtubeUrl = youtube_list_selectTwo[
                        (luckyNum + 123) % len(youtube_list_selectTwo)]
                t = youtubeUrl.split('watch?v=')
                t1 = t[0] + 'embed/'
                t2 = t[1].split('&')
                t3 = t1 + t2[0]
    return render(
        request, 'demo/appname.html', {
            'url': t3,
            'zdTotal': zdTotal,
            'zdLove': zdLove,
            'zdMoney': zdMoney,
            'zdWork': zdWork,
            'bgColor': bgColor,
            'tdColor': tdColor,
            'character': character,
            'luckyNum': luckyNum,
            'zdName': zdName
        })
def create_user_reviews_table(path_jsons, db_conn): """ Recibe direccion de directorio de documentos JSON y el objeto de la conexión de la BD """ c = db_conn.cursor() # Creacion de la tabla en la BD: user_reviews(user_id, url_review, rating) table_name = 'user_reviews' col_user_id = 'user_id' col_url = 'url_review' col_rating = 'rating' c.execute( 'CREATE TABLE IF NOT EXISTS {0} ({1} {2}, {3} {4} PRIMARY KEY, {5} {6})'\ .format(table_name, \ col_user_id, 'INTEGER', \ col_url, 'TEXT', \ col_rating, 'INTEGER') ) # Listando el contenido del directorio <path_jsons>/ json_titles = [ f for f in listdir(path_jsons) if isfile(join(path_jsons, f)) ] for i in range(0, len(json_titles)): with open(path_jsons + json_titles[i], 'r') as f: # Recuperando toda la info del documento data_json = json.load(f) for j in range(0, len(data_json)): # Guardando texto del tweet tweet = data_json[j]['text'] # Guardando URL de la opinion del usuario en GR try: url_review = data_json[j]['entities']['urls'][-1][ 'expanded_url'] url_review = unshorten_url(url_review) except Exception as e: logging.info("¡Tweet con contenido NO predefinido!") continue # Guardando username del usuario en Twitter screen_name = data_json[j]['user']['screen_name'] # Guardando ID del usuario en Twitter user_id = data_json[j]['user']['id'] logging.info( "Obteniendo HTML del Tweet {1}/{2}. Usuario: {0}, {3}/{4}.". format(screen_name, j, len(data_json), i, len(json_titles))) # Guardando en disco el HTML crawleado de url_review file_name = url_review.split('/')[ -1] # Cortamos después del último '/' de la URL file_name = file_name.split('?')[ 0] # Cortamos después del primer '?' de la URI save_path = "/mnt/f90f82f4-c2c7-4e53-b6af-7acc6eb85058/crawling_data/goodreads_crawl/user_reviews/" + file_name + ".html" # Intentando ingresar a la URL # Si no es accesible o si no corresponde a ruta de GR, # sigue con el próximo tweet if "goodreads.com/review" in url_review: try: urllib.request.urlretrieve(url_review, save_path) except Exception as e: logging.info("No se pudo ingresar al sitio!") continue else: logging.info("Enlace no es ruta de review de GR") continue # Abriedo HTML recién guardado para capturar el rating with open(save_path) as fp: soup = BeautifulSoup(fp, 'html.parser') # Guardamos el rating # A veces en GR no se renderiza el HTML que incluye el rating (why? dunno), # pero sí está el rating puesto en el Tweet ("1 out of 5 stars to [...]").. # ..en esos casos se usa un regex para capturar el rating desde el texto del tweet. # Si todo falla guardamos el rating como 0, sólo indicando que el usuario # consumió aquel item (presuponiendo de que si aparece la URL del review en el tweet # es porque el item fue consumido) try: rating = int( soup.div(class_='rating')[0].find_all( 'span', class_='value-title')[0]['title']) # En caso que no encuentre rating en la ruta del review (porque no puede encontrar # la rewview o porque no hay estrellitas donde debiera estar el rating)... 
            except Exception as e:
                try:
                    # ..capture it with a regex from the tweet
                    match = re.search(r"(\d+) of (\d+) stars", tweet.lower())
                    rating = int(match.group(1))
                    if rating > 5 or rating < 0:
                        rating = 0
                except Exception as er:
                    rating = 0
            # Insert the tuple (user_id, url_review, rating) into the DB
            try:
                c.execute(
                    "INSERT INTO {0} ({1}, {2}, {3}) VALUES (?, ?, ?)"
                    .format(table_name, col_user_id, col_url, col_rating),
                    (user_id, file_name, rating))
            except sqlite3.IntegrityError:
                logging.info('ERROR: review URI already exists: {}'.format(file_name))
        # Commit the changes after going through all of each user's tweets
        db_conn.commit()
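# A minimal usage sketch for the function above, assuming a local SQLite file
# and a directory of tweet-dump JSONs (both paths are hypothetical):
import sqlite3

if __name__ == '__main__':
    conn = sqlite3.connect('user_reviews.db')
    create_user_reviews_table('tweet_dumps/', conn)
    conn.close()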
from bs4 import BeautifulSoup
import requests

html = requests.get('https://ip.cn/').content
print(html)
soup = BeautifulSoup(html, 'lxml', from_encoding='utf-8')
result = soup.div(id='result')[0].p.code.get_text()
print(result)
if __name__ == '__main__':
    # Home page of the novel to crawl; edit this URL for each run and make
    # sure the local save root path exists
    target = "https://www.biqubao.com/book/9062/"
    # Local root path for the crawled text
    save_path = 'F:/P'
    # Root path of the biqubao site
    index_path = 'https://www.biqubao.com'
    req = requests.get(url=target)
    # requests' default encoding does not match the site's response, so switch
    # to the gbk encoding the site actually uses
    print(req.encoding)
    req.encoding = 'gbk'
    # Parse the HTML
    soup = BeautifulSoup(req.text, "html.parser")
    list_tag = soup.div(id="list")
    print('list_tag:', list_tag)
    # Get the novel's title
    story_title = list_tag[0].dl.dt.string
    # Create a folder named after the novel if it does not exist yet
    dir_path = os.path.join(save_path, story_title)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    # Loop over every chapter, getting its name and its URL
    for dd_tag in list_tag[0].dl.find_all('dd'):
        # Chapter name
        chapter_name = dd_tag.string
        # Chapter URL (relative href joined to the site root, as in the
        # companion snippet below)
        chapter_url = index_path + dd_tag.a.get('href')
def crawling_branch(branch, baseURL, local_repos_dir):
    sys.stderr.write('%s %s %s\n' % (branch.branch_name, baseURL, local_repos_dir))
    logger = logging.getLogger('-'.join(['Branch', branch.branch_name]))
    branch_dir = os.path.join(local_repos_dir, 'branches', branch.branch_name.replace('/', '~'))
    # Start from a clean local checkout of the branch
    if os.path.isdir(branch_dir):
        os.system(' '.join(['rm', '-rf', branch_dir]))
    sys.stderr.write('Start parsing %s\n' % branch.branch_name)
    os.mkdir(branch_dir)
    os.system(' '.join(['git', 'clone', '-b', branch.branch_name,
                        baseURL + branch.repos.href, branch_dir]))
    N = test_last_page(baseURL + branch.commit_url)
    fp = open(os.path.join(local_repos_dir, 'logs', branch.branch_name.replace('/', '~')), 'w')
    logger.info('Total pages:%s' % N)
    visit_commit_set = set()
    # Walk the commit listing from the last page back to the first
    for i in range(N, 0, -1):
        sys.stderr.write('Branch:%s\tPage:%s\n' % (branch, i))
        failure = True
        while failure:
            try:
                req = urllib2.urlopen(baseURL + branch.commit_url + '?page=' + str(i))
                result = req.read()
                soup = BeautifulSoup(result)
                commit_list = []
                for d in soup.div():
                    if d.has_attr('class') and 'js-navigation-container' in d.attrs['class']:
                        h3_list = d.findAll('h3')
                        ol_list = d.findAll('ol')
                        if len(h3_list) == len(ol_list):
                            # Each <h3> holds a date; the matching <ol> lists that day's commits
                            for index in range(len(h3_list)):
                                h3_date = datetime.datetime.strptime(h3_list[index].string, '%b %d, %Y').date()
                                for li in ol_list[index].findAll('li'):
                                    for c_a in li.p.findAll('a', {'class': 'message'}):
                                        commit = Commit(c_a['href'], h3_date)
                                        if commit.commit_sha not in visit_commit_set:
                                            commit_list.append(commit)
                                            visit_commit_set.add(commit.commit_sha)
                        else:
                            print 'Error! h3 and ol do not match!'
                commit_list.reverse()
                for commit in commit_list:
                    commit.parse_parent_info()
                    fp.write('%s %s %s %s\n' % (branch.branch_name, commit.commit_sha,
                                                commit.commit_date.strftime('%m/%d/%Y'),
                                                '\t'.join(commit.parent_sha_list)))
                    logger.info('Commit:%s (%s) in Branch:%s Parent:%s' %
                                (commit.commit_sha, commit.commit_date.strftime('%m/%d/%Y'),
                                 branch.branch_name, '\t'.join(commit.parent_sha_list)))
                failure = False
            except urllib2.HTTPError, e:
                print e, baseURL + branch.commit_url + '?page=' + str(i)
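# The while/except loop above retries a failed page fetch immediately and
# forever. A hedged sketch of the same pattern with a bounded retry count and
# a fixed delay (the helper name and limits are my own, not from the snippet):
import time
import urllib2

def fetch_with_retry(url, max_tries=5, delay=2):
    """Fetch a URL, retrying with a fixed delay on HTTP errors."""
    for attempt in range(max_tries):
        try:
            return urllib2.urlopen(url).read()
        except urllib2.HTTPError, e:
            print e, url
            time.sleep(delay)
    raise RuntimeError('Giving up on %s after %d tries' % (url, max_tries))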
import requests
from bs4 import BeautifulSoup
import os

# Novel chapter-list page and the site's root URL
target = 'https://www.biqubao.com/book/13991/'
server = 'https://www.biqubao.com'
# Mind the encoding conversion
req = requests.get(url=target)
req.encoding = 'gbk'
html = req.text
# Find the chapter list
div = BeautifulSoup(html, "html.parser")
list_tag = div.div(id='list')
# Novel title
title = list_tag[0].dl.dt.string
# Target folder
save_path = 'F:/Python/novel/new'
dir_path = os.path.join(save_path, title)
if not os.path.exists(dir_path):
    os.mkdir(dir_path)
for dd_tag in list_tag[0].dl.find_all('dd'):
    # Chapter name
    chapter_name = dd_tag.string
    # Chapter URL
    chapter_url = server + dd_tag.a.get('href')
    c_req = requests.get(url=chapter_url)
    c_req.encoding = 'gbk'
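    # Both biqubao snippets stop right after fetching a chapter page. A hedged
    # sketch of one way the loop body could continue; the id="content"
    # selector is an assumption about the chapter page's markup, not taken
    # from the snippets:
    c_soup = BeautifulSoup(c_req.text, 'html.parser')
    content_tag = c_soup.div(id='content')  # assumed container of the chapter text
    if content_tag and chapter_name:
        chapter_text = content_tag[0].get_text()
        out_file = os.path.join(dir_path, chapter_name + '.txt')
        with open(out_file, 'w', encoding='utf-8') as out:
            out.write(chapter_text)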
def parse(county_page_html):
    soup = BeautifulSoup(county_page_html, 'html5lib')
    block = soup.div(class_='zsg-lg-1-2 zsg-sm-1-1')[0]
    lis = block.find_all("a")
    return [k.text for k in lis]
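# A minimal usage sketch for parse() above; the URL is a placeholder of my
# own, not one taken from the snippet, and assumes the page carries the same
# zsg-* class names:
import requests

county_page_html = requests.get('https://www.zillow.com/browse/homes/').text
links = parse(county_page_html)
print(links[:10])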
# Import modules
import urllib2
from bs4 import BeautifulSoup
import sqlite3

# Parse box office results
html = urllib2.urlopen("http://www.boxofficemojo.com/weekend/chart/").read()
rawpage = BeautifulSoup(html)
rawbody = rawpage.div(id="body")[0].findAll('tr')[4]
rawtable = rawbody.findAll('tr')[1:44]  # Range will change every weekend
rawdata = [x.findAll('td') for x in rawtable]
tempdata = [str(i.string) for x in rawdata for i in x]
findata = [tempdata[x: x+12] for x in range(0, len(tempdata), 12)]

# Create SQL table and enter data
conn = sqlite3.connect("test.db")
c = conn.cursor()
# All data saved as text due to unpredictable use of '-' symbols
c.execute('''create table boxoffice
             (tw text, lw text, title text, studio text, weekend_gross text,
              gross_change text, headcount text, headcount_change text,
              average text, total_gross text, budget text, week text)''')
for i in range(len(findata)):
    c.execute('''insert into boxoffice values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
              findata[i])
conn.commit()

# Query results and print first row
c.execute('select * from boxoffice')
print c.fetchone()
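# The hard-coded slice [1:44] above has to be edited every weekend. A hedged
# sketch of selecting rows dynamically instead: keep only <tr> rows whose
# first cell parses as a rank number (my own heuristic, not from the snippet):
def data_rows(table_rows):
    rows = []
    for tr in table_rows:
        cells = tr.findAll('td')
        if cells and cells[0].string and cells[0].string.strip().isdigit():
            rows.append(tr)
    return rows

# rawtable = data_rows(rawbody.findAll('tr'))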
# (excerpt: tail of function1, which walks a maker's paging links; soup4 and
#  link1 are defined in the part cut off above)
    function2(link1)
    for links in soup4.find_all('a'):
        link2 = "http://www.gsmarena.com/" + links['href']
        if link2 != link1:
            link5 = "http://www.gsmarena.com/" + links['href']
            print link5
            function2(link5)
    print "-----------------------------------------------------------------"


def function2(phn_links):
    '''return all the phone links of the page
    ie http://www.gsmarena.com/amazon-phones-76.php'''
    # phn_links = "http://www.gsmarena.com/amazon-phones-76.php"
    phn_links_page = requests.get(phn_links)
    phn_soup = BeautifulSoup(phn_links_page.text)
    phn_soup2 = BeautifulSoup(str(phn_soup.div(class_="makers")))
    for link in phn_soup2.find_all('a'):
        link = "http://www.gsmarena.com/" + link['href']
        function3(link)


link = "http://www.gsmarena.com/makers.php3"  # start link to scrape gsm; lists all the phone maker companies
data = requests.get(link)
soup = BeautifulSoup(data.text)
soup2 = BeautifulSoup(str(soup.div(id="main")))
lis = soup2.find_all('a')
for i in range(0, len(lis), 2):
    link2 = "http://www.gsmarena.com/" + lis[i]['href']
    function1(link2)