def post(request, post_id):
    post_id = post_id[-14:]
    post = Post.objects.get(id=post_id)
    manifest_path = os.path.join(RAW_DIR, post.dir_name, 'manifest.json')
    with open(manifest_path, 'r') as mf:
        manifest = json.load(mf)
    # Default to None so a missing key cannot leave page_type undefined.
    page_type = manifest.get('page_type')
    if page_type == 'Typora':
        with open(os.path.join(RAW_DIR, post.dir_name, 'templates',
                               'index.html'), 'r') as html_file:
            html = BeautifulSoup(html_file.read(), 'lxml')
        html = str(html.body)
        # Turn the outer <body> element into a <div> so it can be embedded in the template.
        html = html.replace('<body', '<div', 1).replace('</body>', '</div>')
        return render(request, 'post/index.html', {'html': html, 'post': post})
    else:
        context = {}
        scripts = manifest['data']['script']
        if scripts:
            f_script = os.path.join(RAW_DIR, post.dir_name, 'static', scripts[0])
            f_content = os.path.join(RAW_DIR, post.dir_name, 'static', 'content.xml')
            import importlib.util
            spec = importlib.util.spec_from_file_location('script', f_script)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            context = module.get_context(f_content)
        context['post'] = post
        return render(request,
                      os.path.join(post.dir_name, 'templates', 'index.html'),
                      context)
def scanPage(br: Browser, uri_materials: str, saveto: str,
             fileext_whitelist: List[str], pythomat: Pythomat, section: str,
             overwrite: int):
    soup = br.open(uri_materials)
    soup = BeautifulSoup(soup.read(), "html.parser")
    os.chdir(saveto)
    icons = soup.select(".activityinstance .activityicon")
    for icon in icons:
        ressourceClassification = classifyRessource(icon.get("src"),
                                                    fileext_whitelist)
        if ressourceClassification is None:
            print("[Ignored] Since its icon could not be classified: {}".format(
                icon.get("src")))
        elif ressourceClassification[1] == 1:
            # Download
            filelink_dom = icon.parent
            downloadpath = filelink_dom.get("href")
            downloadFromRawUrl(downloadpath, pythomat, section, br,
                               fileext_whitelist, overwrite, saveto)
        elif ressourceClassification[1] == 2:
            # Folder
            filelink_dom = icon.parent
            downloadpath = filelink_dom.get("href")
            scanSubPage(br, downloadpath, saveto, fileext_whitelist, pythomat,
                        section, overwrite)
        elif ressourceClassification[1] == 3:
            # Assignment page
            filelink_dom = icon.parent
            downloadpath = filelink_dom.get("href")
            scanAssignmentPage(br, downloadpath, saveto, fileext_whitelist,
                               pythomat, section, overwrite)
        else:
            # Don't download (ressourceClassification[1] == 0)
            print("[Ignored] Since its icon is not whitelisted: {}".format(
                icon.get("src")))
def fourchan(s, request, channel):
    try:
        global fCounter
        board = "/" + request + "/"
        page = urllib2.Request("http://4chan.org" + board,
                               headers={'User-agent': 'Mozilla/5.0'})
        request = urllib2.urlopen(page)
        request = BeautifulSoup(request.read())
        details = request.find_all("div", {"class": "thread"})
        global fourchan_topics
        fourchan_topics = details
        name = details[0].find_all("span", {"class": "name"})
        send(s, "PRIVMSG %s :%s" % (channel, name[0].text))
        date = details[0].find_all("span", {"class": "dateTime"})
        send(s, "PRIVMSG %s :%s" % (channel, date[0].text))
        url = details[0].a['href']
        send(s, "PRIVMSG %s :%s" % (channel,
                                    "http://boards.4chan.org" + board + url))
        problem_text = details[0].find_all("blockquote", {"class": "postMessage"})
        send(s, "PRIVMSG %s :%s" % (channel, problem_text[0].text))
        fCounter = 0
        global current_board
        current_board = board
    except urllib2.HTTPError:
        send(s, "PRIVMSG %s :Throttled by tor." % channel)
def printPage(soup, name, t=False):
    if not t:
        soup = BeautifulSoup(soup.read(), 'lxml')
    if not os.path.exists('LogFiles/ErrorFiles/'):
        os.makedirs('LogFiles/ErrorFiles')
    misc = open("LogFiles/ErrorFiles/" + name + ".html", "w")
    print(soup.prettify(), file=misc)
    misc.close()
def attrs_p(html):
    '''
    Expects markup such as:
    <tr class>
        <td class="count"></td>
        <td>111</td>
        <td>222</td>
    </tr>
    <tr class="odd"></tr>
    '''
    soup = BeautifulSoup(html, 'html.parser')
    # Match <tr> tags that carry a class attribute.
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        # Under each <tr>, find all of its <td> tags.
        tdlist = trtag.find_all('td')
        print(tdlist[1].string)  # extract the IP value here
        print(tdlist[2].string)  # extract the port value here
def openUrl(url):
    # Fetch the page at url and return it parsed, or None on failure.
    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        return None
    try:
        html = BeautifulSoup(html.read(), "html.parser")
    except AttributeError as e:
        return None
    return html
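# A minimal usage sketch for openUrl above; the target URL is only an
# illustrative assumption (it also appears in the error-handling example
# further down), not something this helper requires.
page = openUrl("http://pythonscraping.com/pages/page1.html")
if page is not None and page.title is not None:
    print(page.title.get_text())
else:
    print("Page could not be fetched or has no <title>.")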
def faviconUrl(self, url):
    try:
        with urlopen(url) as html:
            html = BeautifulSoup(html.read(), "html.parser")
        # Prefer rel="shortcut icon", fall back to rel="icon".
        icon = html.find(rel="shortcut icon") or html.find(rel="icon")
        if icon is None:
            return None
        return urljoin(url, icon["href"])
    except:
        return None
def getPage(self, pageNum):
    try:
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        request = Request(url)
        html = urlopen(request)
        bsobj = BeautifulSoup(html.read(), "html.parser")
        # BeautifulSoup objects have no read(); return the parsed tree itself.
        return bsobj
    except URLError as e:
        if hasattr(e, "reason"):
            print("Failed to reach Baidu Tieba, reason:", e.reason)
        return None
def SearchLinks(url_list):
    url_beginning = 'http://' + craigslist_region + '.craigslist.org/'
    for item in url_list:
        post_url = url_beginning + item
        post_html = urllib2.urlopen(post_url)
        post_html = BeautifulSoup(post_html.read())
        meta_tags = post_html.find('meta')
        post_parent = meta_tags.parent
        post_parent = post_parent.contents
        post_body = post_parent[7]
        keywords = ['mario', 'ps3', 'ps4', 'xbox', 'gameboy', 'linux', 'sega',
                    'brewing']  # insert your own!
        for keyword in keywords:
            if str(post_body).find(keyword) != -1:
                send_text(post_url)
def get_data(data, name, pattern, iscomments=False, istags=False):
    if istags:
        result = ", ".join(
            [tag.text.strip() for tag in data.find_all(**pattern)])
    elif iscomments:
        if not data.find(**pattern["counter"]):
            return ""
        else:
            postlink = data.find(**pattern["postlink"])["href"]
            with req.urlopen(postlink) as comments_data:
                comments_data = BeautifulSoup(comments_data.read(), "lxml")
                return get_data(comments_data, name, pattern["comment"])
    else:
        data = data.find(**pattern)
        if data:
            result = data.text.strip()
        else:
            return None
    return "{}: {}".format(name, result)
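# Hypothetical usage of get_data above. The selector dictionaries and the
# post_soup variable are illustrative assumptions, not the selectors the
# original project used; pattern entries are simply keyword arguments for
# find()/find_all().
title_pattern = {"name": "h1", "attrs": {"class": "post-title"}}
tags_pattern = {"name": "a", "attrs": {"class": "tag"}}
print(get_data(post_soup, "Title", title_pattern))              # single node -> "Title: ..."
print(get_data(post_soup, "Tags", tags_pattern, istags=True))   # joined list of tag texts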
def scanSubPage(br: Browser, url: str, saveto: str,
                fileext_whitelist: List[str], pythomat: Pythomat, section: str,
                overwrite: int):
    soup = br.open(url)
    soup = BeautifulSoup(soup.read(), "html.parser")
    icons = soup.select(".fp-filename-icon .icon")
    for icon in icons:
        ressourceClassification = classifyRessource(icon.get("src"),
                                                    fileext_whitelist)
        if ressourceClassification is None:
            print("[Ignored] Since its icon could not be classified: {}".format(
                icon.get("src")))
        elif ressourceClassification[1] == 1:
            # Download
            filelink_dom = icon.parent.parent
            downloadpath = filelink_dom.get("href")
            downloadFromRawUrl(downloadpath, pythomat, section, br,
                               fileext_whitelist, overwrite, saveto)
        else:
            # Don't download (ressourceClassification[1] == 0)
            print("[Ignored] Since its icon is not whitelisted: {}".format(
                icon.get("src")))
my_writer.writeheader()
faculty_1 = soup.find_all('div', {'class': "views-row"})
for i in range(0, len(faculty_1)):
    faculty = faculty_1[i]
    member = faculty.find('a', {'class': "person-view-primary-field"})
    member_name = member.get_text()
    member_name = member_name.replace('\n', '')
    member_page = "http://polisci.wustl.edu" + member['href']
    member_title = faculty.get_text()
    member_title = member_title.split("\n")
    member_title = member_title[2]
    member_title = member_title.replace(' ', '')
    #member_page = member_page.replace('\n', ' ')
    member_page_proper = urllib2.urlopen(member_page)
    member_page_proper = BeautifulSoup(member_page_proper.read())
    member_specialization_spots = member_page_proper.find_all(
        'a', {'property': "rdfs:label skos:prefLabel"})
    for spot in member_specialization_spots:
        fields = ["Political Theory", "American", "Methodology", "Comparative",
                  "International Political Economy", "Formal Theory"]
        if spot.get_text() in fields:
            member_specialization = spot.get_text()
    page_email_spot = member_page_proper.find(
        "div", {'class': "field field-name-field-person-email field-type-email "
                         "field-label-inline clearfix"})
    page_email_spot_2 = page_email_spot.find_all("div", {"class": "field-item even"})
    for cell in page_email_spot_2:
        if cell.find("a", href=True):
            page_email = cell.get_text()
    page_website_spot = member_page_proper.find(
        'div', {'class': "field field-name-field-person-website "
                         "field-type-link-field field-label-inline clearfix"})
    if page_website_spot:
        page_website = page_website_spot.find("a", href=True)['href']
    else:
        page_website = member_page
try:
    # html = urlopen("https://site_nao_existe.com.br")  # Force a URLError
    # html = urlopen("http://pythonscraping.com/pages/page1.html")  # Force success
    html = urlopen("http://pythonscraping.com/pages/page1.html")  # Force success
except HTTPError as e:
    print(e)
except URLError as e:
    print("Server not found")
else:
    print("Server found!")

'''
There are situations where the site is found and responds normally, but its
content is not what you expected. In that scenario it is essential to
validate the content with BS4.
'''
html = BeautifulSoup(html.read(), "html.parser")

'''
If a tag does not exist, bs4 returns None by default.
'''
print(html.tag_inexistente)

'''
Trying to access a tag inside a tag that does not exist (i.e. None)
raises an AttributeError.
'''
try:
    print(html.tag_inexistente.qualquer_outra_tag)
except AttributeError as e:
    print("Tag does not exist")

'''
Recommended handling, shown by capturing the target site's H1 tag:
'''
try:
    conteudo = html.h1
except AttributeError as e:
    print("Tag does not exist")
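# A minimal sketch that folds the checks above into one helper. The function
# name and the None fallback are assumptions for illustration only; it reuses
# the same urlopen/BeautifulSoup imports as the example above.
def get_title(url):
    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        return None
    try:
        soup = BeautifulSoup(html.read(), "html.parser")
        title = soup.body.h1
    except AttributeError:
        return None
    return title

title = get_title("http://pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title.get_text())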
prepareFile.close()
f = open('document.txt')
f.read()

# In[16]:

f = open('document.txt', 'r')
for line in f:
    print(line.strip())

# In[17]:

# Finding nltk data
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path, 'r')
raw.read()

# ### Capturing User Input

# In[18]:

s = input("Enter some text")

# In[19]:

print("You typed {0} words.".format(len(word_tokenize(s))))

# ### The NLP Pipeline

# In[20]:
def Summary():
    k = 0
    Summary = pd.DataFrame()

    ###### WTI ####
    url1 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=1'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    url2 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=2'
    html2 = urlopen(url2)
    src2 = BeautifulSoup(html2.read(), "html.parser")
    url3 = 'https://finance.naver.com//marketindex/exchangeDailyQuote.naver?marketindexCd=FX_USDKRW'
    html3 = urlopen(url3)
    src3 = BeautifulSoup(html3.read(), "html.parser")
    date_USD = src3.find_all(class_="date")
    USD = src3.find_all(class_="num")
    date1 = src1.find_all(class_="date")
    date2 = src2.find_all(class_="date")
    date = date1 + date2
    WTI1 = src1.find_all(class_="num")
    WTI2 = src2.find_all(class_="num")
    WTI = WTI1 + WTI2
    for k in range(len(date)):
        if date_USD[0].text.strip() == date[0].text.strip():
            # "하락" in the image alt text marks a decline.
            if str(WTI[k * 2 + 1])[str(WTI[k * 2 + 1]).find("alt=") + 5:
                                   str(WTI[k * 2 + 1]).find("alt=") + 7] == "하락":
                Summary.loc[k + 1, ['일자']] = date[k].text.strip()
                Summary.loc[k + 1, ['WTI']] = WTI[k * 3].text.strip() + '/-' + WTI[k * 3 + 2].text.strip()
            else:
                Summary.loc[k + 1, ['일자']] = date[k].text.strip()
                Summary.loc[k + 1, ['WTI']] = WTI[k * 3].text.strip() + '/' + WTI[k * 3 + 2].text.strip()
        else:
            if str(WTI[k * 2 + 1])[str(WTI[k * 3 + 2]).find("alt=") + 5:
                                   str(WTI[k * 2 + 1]).find("alt=") + 7] == "하락":
                Summary.loc[0, ['일자']] = date_USD[0].text.strip()
                Summary.loc[k + 1, ['일자']] = date[k].text.strip()
                Summary.loc[k + 1, ['WTI']] = WTI[k * 3].text.strip() + '/-' + WTI[k * 3 + 2].text.strip()
            else:
                Summary.loc[0, ['일자']] = date_USD[0].text.strip()
                Summary.loc[k + 1, ['일자']] = date[k].text.strip()
                Summary.loc[k + 1, ['WTI']] = WTI[k * 3].text.strip() + '/' + WTI[k * 3 + 2].text.strip()

    ###### Exchange rate (USD) ####
    for k in range(len(date)):
        for m in range(len(date_USD)):
            if date_USD[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                if str(USD[m * 2 + 1])[str(USD[m * 2 + 1]).find("alt=") + 5:
                                       str(USD[m * 2 + 1]).find("alt=") + 7] == "하락":
                    Summary.loc[k, ['환율(USD)']] = USD[m * 2].text + '/-' + USD[m * 2 + 1].text.strip()
                else:
                    Summary.loc[k, ['환율(USD)']] = USD[m * 2].text + '/' + USD[m * 2 + 1].text.strip()
            else:
                pass

    ###### US Treasury (10-year) ####
    url1 = 'https://kr.investing.com/rates-bonds/u.s.-10-year-bond-yield-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_US_INTER10Y_list = src2.find_all(class_="first left bold noWrap")
    US_INTER10Y = src2.find_all("tbody")[0]
    for k in range(len(date)):
        for m in range(len(date_US_INTER10Y_list)):
            date_US_INTER10Y = (date_US_INTER10Y_list[m].text[0:4] + "."
                                + date_US_INTER10Y_list[m].text[6:8] + "."
                                + date_US_INTER10Y_list[m].text[10:12])
            if date_US_INTER10Y == list(Summary.loc[k, ['일자']])[0]:
                Summary.loc[k, ['미국채(10year)']] = (
                    US_INTER10Y.find_all("tr")[m].find_all("td")[1].text + '/'
                    + US_INTER10Y.find_all("tr")[m].find_all("td")[5].text)
            else:
                pass

    ###### Dow ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=DJI@DJI&fdtc=0'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_DJI = src1.find_all(class_="tb_td")
    DJI = src1.find_all(class_="tb_td2")
    DJI_delta = src1.find_all(class_="tb_td3")
    DJI_UPDW = src1.find_all('tr')
    for k in range(len(date)):
        for m in range(len(date_DJI)):
            if date_DJI[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                # "point_dn" in the row markup marks a decline.
                if str(DJI_UPDW[m + 3])[str(DJI_UPDW[m + 3]).find("=") + 2:
                                        str(DJI_UPDW[m + 3]).find("=") + 10] == "point_dn":
                    Summary.loc[k, ['다우']] = DJI[m].text + '/-' + DJI_delta[m].text.strip()
                else:
                    Summary.loc[k, ['다우']] = DJI[m].text + '/' + DJI_delta[m].text.strip()
            else:
                pass

    ###### Nasdaq ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=NAS@IXIC'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_NAS = src1.find_all(class_="tb_td")
    NAS = src1.find_all(class_="tb_td2")
    NAS_delta = src1.find_all(class_="tb_td3")
    NAS_UPDW = src1.find_all('tr')
    for k in range(len(date)):
        for m in range(len(date_NAS)):
            if date_NAS[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                if str(NAS_UPDW[m + 3])[str(NAS_UPDW[m + 3]).find("=") + 2:
                                        str(NAS_UPDW[m + 3]).find("=") + 10] == "point_dn":
                    Summary.loc[k, ['나스닥']] = NAS[m].text + '/-' + NAS_delta[m].text.strip()
                else:
                    Summary.loc[k, ['나스닥']] = NAS[m].text + '/' + NAS_delta[m].text.strip()
            else:
                pass

    ###### S&P ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=SPI@SPX'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_SNP = src1.find_all(class_="tb_td")
    SNP = src1.find_all(class_="tb_td2")
    SNP_delta = src1.find_all(class_="tb_td3")
    SNP_UPDW = src1.find_all('tr')
    for k in range(len(date)):
        for m in range(len(date_SNP)):
            if date_SNP[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                if str(SNP_UPDW[m + 3])[str(SNP_UPDW[m + 3]).find("=") + 2:
                                        str(SNP_UPDW[m + 3]).find("=") + 10] == "point_dn":
                    Summary.loc[k, ['S&P']] = SNP[m].text + '/-' + SNP_delta[m].text.strip()
                else:
                    Summary.loc[k, ['S&P']] = SNP[m].text + '/' + SNP_delta[m].text.strip()
            else:
                pass

    ###### PDM ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=NAS@SOX'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_PDM = src1.find_all(class_="tb_td")
    PDM = src1.find_all(class_="tb_td2")
    PDM_delta = src1.find_all(class_="tb_td3")
    PDM_UPDW = src1.find_all('tr')
    for k in range(len(date)):
        for m in range(len(date_PDM)):
            if date_PDM[m].text.strip() == list(Summary.loc[k, ['일자']])[0]:
                if str(PDM_UPDW[m + 3])[str(PDM_UPDW[m + 3]).find("=") + 2:
                                        str(PDM_UPDW[m + 3]).find("=") + 10] == "point_dn":
                    Summary.loc[k, ['PDM']] = PDM[m].text + '/-' + PDM_delta[m].text.strip()
                else:
                    Summary.loc[k, ['PDM']] = PDM[m].text + '/' + PDM_delta[m].text.strip()
            else:
                pass

    print(Summary)
    Summary_html = Summary.to_html(index=False, justify='center')
    s = smtplib.SMTP('smtp.gmail.com', 587)
    s.starttls()
    s.login('*****@*****.**', 'thdfcvhemyjyxfik')
    msg = MIMEText(Summary_html, 'html')
    msg['Subject'] = '주요시장지표'  # "Key market indicators"
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.quit()
    return Summary
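# A hedged refactoring sketch for the repeated direction checks in Summary()
# above; the helper names are illustrative assumptions, not part of the
# original script. "하락" (alt text) and "point_dn" (row markup) both mark a
# decline in the scraped pages, so the sign handling can live in one place.
def is_down(tag, marker):
    # True when the decline marker appears anywhere in the tag's markup.
    return marker in str(tag)

def signed_change(value_text, delta_text, down):
    # Format "value/delta", prefixing the delta with '-' on a decline.
    sign = '-' if down else ''
    return value_text.strip() + '/' + sign + delta_text.strip()

# e.g. Summary.loc[k, ['WTI']] = signed_change(
#          WTI[k * 3].text, WTI[k * 3 + 2].text, is_down(WTI[k * 2 + 1], "하락"))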
def getHTMLsoup(url):
    HTMLsoup = urllib2.urlopen(url)
    HTMLsoup = BeautifulSoup(HTMLsoup.read())
    return HTMLsoup
class File(Archive):
    def __init__(self, _path, _logger=None):
        Archive.__init__(self, _path, _logger)
        self.type = Types.FILE
        self.content = None

    @property
    def title(self):
        name = self.name
        title = os.path.splitext(name)[0]
        return title

    @property
    def extension(self):
        name = self.name
        extension = os.path.splitext(name)[1]
        return extension.upper()

    def exist(self):
        exist = super().exist(Types.FILE)
        return exist

    def delete(self):
        self.logger.info(f"Deleting {self.name} in {self.parent_path}.")
        os.remove(self.path)

    def read(self):
        if self.extension.upper() == '.XML':
            try:
                file_obj = open(self.path, 'r')
                self.content = BeautifulSoup(
                    file_obj.read(),
                    features=self.extension.lower().replace('.', ''))
            except Exception as e:
                raise NameError(str(e))
        elif self.extension.upper() in ['.PNG', '.JPG', '.JPEG']:
            try:
                self.content = open(self.path, 'rb')
            except Exception as e:
                raise NameError(str(e))
        else:
            msg_error = f'Extension {self.extension} not supported yet'
            self.logger.error(msg_error)
            raise NameError(msg_error)

    def save(self):
        if self.extension.upper() in ['.XML', '.HTML']:
            file_obj = open(self.path, mode="w", encoding="utf-8")
            file_obj.write(self.content)
            file_obj.close()
        elif self.extension.upper() in ['.PNG', '.JPG', '.JPEG']:
            file_obj = open(self.path, mode="wb")
            file_obj.write(self.content.read())
            file_obj.close()
        else:
            msg_error = f'Extension {self.extension} not supported yet'
            self.logger.error(msg_error)
            raise NameError(msg_error)
def Summary():
    k = 0
    Summary = pd.DataFrame()

    ###### WTI ####
    url1 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=1'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    url2 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=2'
    html2 = urlopen(url2)
    src2 = BeautifulSoup(html2.read(), "html.parser")
    date1 = src1.find_all(class_="date")
    date2 = src2.find_all(class_="date")
    date_WTI = date1 + date2
    WTI1 = src1.find_all(class_="num")
    WTI2 = src2.find_all(class_="num")
    WTI = WTI1 + WTI2
    date = datetime.date.today()
    for k in range(20):
        if k == 0:
            Summary.loc[k, ['일자']] = datetime.date.strftime(date, '%Y.%m.%d')
        else:
            Summary.loc[k, ['일자']] = datetime.date.strftime(
                date + pd.DateOffset(days=-k), '%Y.%m.%d')
    m = 0
    for m in range(10):
        for k in range(len(Summary['일자'])):
            if date_WTI[m].text.strip() == Summary['일자'].iloc[k]:
                Summary.loc[k, ['WTI']] = WTI[m * 3].text.strip() + '/' + WTI[m * 3 + 2].text.strip()
            else:
                pass

    ###### VIX ####
    url1 = 'https://kr.investing.com/indices/volatility-s-p-500-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_vix = src2.find_all(class_="first left bold noWrap")
    vix = src2.find_all("tbody")[1]
    for k in range(len(Summary['일자'])):
        for m in range(len(date_vix)):
            print("date", date_vix, date_vix[m], m)
            date_vix_list = (date_vix[m].text[0:4] + "." + date_vix[m].text[6:8]
                             + "." + date_vix[m].text[10:12])
            if date_vix_list == Summary['일자'].iloc[k]:
                Summary.loc[k, ['VIX']] = (
                    vix.find_all("tr")[m].find_all("td")[1].text + '/'
                    + vix.find_all("tr")[m].find_all("td")[6].text)
            else:
                pass
    print(Summary)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-09-23 17:11:12
# @Author  : Your Name ([email protected])
# @Link    : http://example.org
# @Version : $Id$

from urllib.request import urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlencode
from bs4 import BeautifulSoup

# GET
values = {}
values['username'] = "******"
values['password'] = "******"
# Encode the form fields into a query string.
data = urlencode(values)
url = "http://passport.csdn.net/account/login"
geturl = url + "?" + data
print(geturl)
html = urlopen(geturl)
html = BeautifulSoup(html.read(), 'lxml')
print(html)
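# A hedged sketch of the same request sent as a POST body instead of a GET
# query string; it reuses values and url from above, and whether the endpoint
# accepts such a POST is an assumption. urlopen() expects the body as bytes.
post_data = urlencode(values).encode('utf-8')
response = urlopen(url, data=post_data)  # passing data= switches the request to POST
print(response.status)
print(response.read()[:200])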
dataFile = open("news20081001-20081031.txt", "w", encoding="utf-8")
from urllib import request

mediaList = []
hrefSet = set()  # deduplicate links
# Extracting this page's body directly causes a MemoryError on my machine,
# so it is pulled out and handled separately at lines 63-71.
errorHref = "http://tech.sina.com.cn/mobile/n/2008-10-21/0027842330.shtml"
from bs4 import BeautifulSoup

for i in range(1, 32):  # produce the 31 dates of the month
    if i < 10:
        date = "0" + str(i)
    else:
        date = str(i)
    url = request.urlopen("http://news.sina.com.cn/hotnews/200810" + date + ".shtml")
    url = BeautifulSoup(url.read().decode("gb18030"), "html.parser")
    charNum = 0   # character count
    imgList = []  # image count
    url = str(url.find_all("a")).split(",")
    hrefs = ""
    for link in url:
        if "comment" not in link:
            hrefs += link + "\n"
    hrefs = hrefs[hrefs.find(""" <a name="2"></a>"""):hrefs.rfind(
        """<a href="/guest.html" target="_blank">新闻中心意见反馈留言板""")]
    hrefList = hrefs.split("\n")
    hrefList = hrefList[2:]
def Summary():
    k = 0
    Summary = pd.DataFrame()

    ###### WTI ####
    url1 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=1'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    url2 = 'https://finance.naver.com/marketindex/worldDailyQuote.naver?marketindexCd=OIL_CL&fdtc=2&page=2'
    html2 = urlopen(url2)
    src2 = BeautifulSoup(html2.read(), "html.parser")
    date1 = src1.find_all(class_="date")
    date2 = src2.find_all(class_="date")
    date_WTI = date1 + date2
    WTI1 = src1.find_all(class_="num")
    WTI2 = src2.find_all(class_="num")
    WTI = WTI1 + WTI2
    date = datetime.date.today()
    for k in range(20):
        if k == 0:
            Summary.loc[k, ['일자']] = datetime.date.strftime(date, '%Y.%m.%d')
        else:
            Summary.loc[k, ['일자']] = datetime.date.strftime(
                date + pd.DateOffset(days=-k), '%Y.%m.%d')
    m = 0
    for m in range(10):
        for k in range(len(Summary['일자'])):
            if date_WTI[m].text.strip() == Summary['일자'].iloc[k]:
                Summary.loc[k, ['WTI']] = WTI[m * 3].text.strip() + '/' + WTI[m * 3 + 2].text.strip()
            else:
                pass

    ###### Exchange rate (USD) ####
    url1 = 'https://finance.naver.com//marketindex/exchangeDailyQuote.naver?marketindexCd=FX_USDKRW'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_USD = src1.find_all(class_="date")
    USD = src1.find_all(class_="num")
    m = 0
    for m in range(10):
        for k in range(len(Summary['일자'])):
            if date_USD[m].text.strip() == Summary['일자'].iloc[k]:
                # "하락" in the image alt text marks a decline.
                if str(USD[m * 2 + 1])[str(USD[m * 2 + 1]).find("alt=") + 5:
                                       str(USD[m * 2 + 1]).find("alt=") + 7] == "하락":
                    Summary.loc[k, ['USD']] = USD[m * 2].text.strip() + '/-' + USD[m * 2 + 1].text.strip()
                else:
                    Summary.loc[k, ['USD']] = USD[m * 2].text.strip() + '/' + USD[m * 2 + 1].text.strip()
            else:
                pass

    ###### US Treasury (10-year) ####
    url1 = 'https://kr.investing.com/rates-bonds/u.s.-10-year-bond-yield-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_US_INTER10Y_list = src2.find_all(class_="first left bold noWrap")
    US_INTER10Y = src2.find_all("tbody")[0]
    for k in range(len(Summary['일자'])):
        for m in range(len(date_US_INTER10Y_list)):
            date_US_INTER10Y = (date_US_INTER10Y_list[m].text[0:4] + "."
                                + date_US_INTER10Y_list[m].text[6:8] + "."
                                + date_US_INTER10Y_list[m].text[10:12])
            if date_US_INTER10Y == Summary['일자'].iloc[k]:
                Summary.loc[k, ['미국채(10year)']] = (
                    US_INTER10Y.find_all("tr")[m].find_all("td")[1].text + '/'
                    + US_INTER10Y.find_all("tr")[m].find_all("td")[5].text)
            else:
                pass

    ###### Dow ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=DJI@DJI'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_DJI = src1.find_all(class_="tb_td")
    DJI = src1.find_all(class_="tb_td2")
    DJI_delta = src1.find_all(class_="tb_td3")
    DJI_UPDW = src1.find_all('tr')
    m = 0
    for k in range(len(Summary['일자'])):
        if m + 1 != 11:
            if date_DJI[m + 1].text.strip() == Summary['일자'].iloc[k]:
                # "point_dn" in the row markup marks a decline.
                if str(DJI_UPDW[m + 3])[str(DJI_UPDW[m + 3]).find("=") + 2:
                                        str(DJI_UPDW[m + 3]).find("=") + 10] == "point_dn":
                    Summary.loc[k - 1, ['다우']] = DJI[m + 1].text + '/-' + DJI_delta[m + 1].text.strip()
                    m = m + 1
                else:
                    Summary.loc[k - 1, ['다우']] = DJI[m + 1].text + '/' + DJI_delta[m + 1].text.strip()
                    m = m + 1
            else:
                pass
        else:
            break

    ###### Nasdaq ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=NAS@IXIC'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_NAS = src1.find_all(class_="tb_td")
    NAS = src1.find_all(class_="tb_td2")
    NAS_delta = src1.find_all(class_="tb_td3")
    NAS_UPDW = src1.find_all('tr')
    m = 0
    for k in range(len(Summary['일자'])):
        if m + 1 != 11:
            if date_NAS[m + 1].text.strip() == Summary['일자'].iloc[k]:
                if str(NAS_UPDW[m + 3])[str(NAS_UPDW[m + 3]).find("=") + 2:
                                        str(NAS_UPDW[m + 3]).find("=") + 10] == "point_dn":
                    Summary.loc[k - 1, ['나스닥']] = NAS[m + 1].text + '/-' + NAS_delta[m + 1].text.strip()
                    m = m + 1
                else:
                    Summary.loc[k - 1, ['나스닥']] = NAS[m + 1].text + '/' + NAS_delta[m + 1].text.strip()
                    m = m + 1
            else:
                pass
        else:
            break

    ###### S&P ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=SPI@SPX'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_SNP = src1.find_all(class_="tb_td")
    SNP = src1.find_all(class_="tb_td2")
    SNP_delta = src1.find_all(class_="tb_td3")
    SNP_UPDW = src1.find_all('tr')
    m = 0
    for k in range(len(Summary['일자'])):
        if m + 1 != 11:
            if date_SNP[m + 1].text.strip() == Summary['일자'].iloc[k]:
                if str(SNP_UPDW[m + 3])[str(SNP_UPDW[m + 3]).find("=") + 2:
                                        str(SNP_UPDW[m + 3]).find("=") + 10] == "point_dn":
                    Summary.loc[k - 1, ['S&P']] = SNP[m + 1].text + '/-' + SNP_delta[m + 1].text.strip()
                    m = m + 1
                else:
                    Summary.loc[k - 1, ['S&P']] = SNP[m + 1].text + '/' + SNP_delta[m + 1].text.strip()
                    m = m + 1
            else:
                pass
        else:
            break

    ###### PDM ####
    url1 = 'https://finance.naver.com/world/sise.naver?symbol=NAS@SOX'
    html1 = urlopen(url1)
    src1 = BeautifulSoup(html1.read(), "html.parser")
    date_PDM = src1.find_all(class_="tb_td")
    PDM = src1.find_all(class_="tb_td2")
    PDM_delta = src1.find_all(class_="tb_td3")
    PDM_UPDW = src1.find_all('tr')
    m = 0
    for k in range(len(Summary['일자'])):
        if m + 1 != 11:
            if date_PDM[m + 1].text.strip() == Summary['일자'].iloc[k]:
                if str(PDM_UPDW[m + 3])[str(PDM_UPDW[m + 3]).find("=") + 2:
                                        str(PDM_UPDW[m + 3]).find("=") + 10] == "point_dn":
                    Summary.loc[k - 1, ['PDM']] = PDM[m + 1].text + '/-' + PDM_delta[m + 1].text.strip()
                    m = m + 1
                else:
                    Summary.loc[k - 1, ['PDM']] = PDM[m + 1].text + '/' + PDM_delta[m + 1].text.strip()
                    m = m + 1
            else:
                pass
        else:
            break

    ###### VIX (S&P 500) ####
    url1 = 'https://kr.investing.com/indices/volatility-s-p-500-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_vix = src2.find_all(class_="first left bold noWrap")
    vix = src2.find_all("tbody")[1]
    for k in range(len(Summary['일자'])):
        for m in range(len(date_vix)):
            date_vix_list = (date_vix[m].text[0:4] + "." + date_vix[m].text[6:8]
                             + "." + date_vix[m].text[10:12])
            if date_vix_list == Summary['일자'].iloc[k]:
                Summary.loc[k, ['VIX(S&P)']] = (
                    vix.find_all("tr")[m].find_all("td")[1].text + '/'
                    + vix.find_all("tr")[m].find_all("td")[6].text)
            else:
                pass

    ###### VIX (KOSPI) ####
    url1 = 'https://kr.investing.com/indices/kospi-volatility-historical-data'
    req = Request(url1, headers={'User-Agent': 'Mozila/5.0'})
    src1 = urlopen(req)
    src2 = BeautifulSoup(src1.read(), "html.parser")
    date_vix = src2.find_all(class_="first left bold noWrap")
    vix = src2.find_all("tbody")[0]
    for k in range(len(Summary['일자'])):
        for m in range(len(date_vix)):
            date_vix_list = (date_vix[m].text[0:4] + "." + date_vix[m].text[6:8]
                             + "." + date_vix[m].text[10:12])
            if date_vix_list == Summary['일자'].iloc[k]:
                Summary.loc[k, ['VIX(KOSPI)']] = (
                    vix.find_all("tr")[m].find_all("td")[1].text + '/'
                    + vix.find_all("tr")[m].find_all("td")[6].text)
            else:
                pass

    Summary = Summary.fillna('-')
    Summary_html = Summary.to_html(index=False, justify='center')
    s = smtplib.SMTP('smtp.gmail.com', 587)
    s.starttls()
    s.login('*****@*****.**', 'thdfcvhemyjyxfik')
    msg = MIMEText(Summary_html, 'html')
    msg['Subject'] = '주요시장지표'  # "Key market indicators"
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.sendmail("*****@*****.**", "*****@*****.**", msg.as_string())
    s.quit()
    return Summary
def scraper(self):
    try:
        self.text(92, "\n\n BOT : Starting Wikipedia Crawl...\n")
        self.text(92, "\n BOT : Crawling...\n")
        self.log.info("BOT : Scraper Function Started")
        r = self.r_session.get(self.host, headers=self.browser_headers)
        data = r.content
        # LXML ERROR MAY RISE UNCAUGHT, CHECK WHEN OS CHANGES
        parse = BeautifulSoup(str(data), 'lxml', from_encoding="utf-8")
        div = parse.find("div", {"id": "mp-otd"})
        hot_link = div.findAll('li')
        for elem, links in enumerate(hot_link):
            if elem == 0 or elem == 1 or elem == 2 or elem == 3 or elem == 4:
                year = re.findall(r'[0-9]{3,4}', links.text)
                # Without year content:
                # content = re.sub(r'[0-9]{4}','',links.text)
                # With year content:
                content = str(links.text.encode('ascii', 'ignore'))
                # Bulk link list, initialised empty
                bulk_link = []
                inner_link = links.findAll('a')
                inner_link.pop(0)
                for links in inner_link:
                    temp = BeautifulSoup(str(links), 'lxml')
                    title = temp.a['title']
                    link = temp.a['href']
                    bulk_link.append({
                        'title': title,
                        'url': link,
                        'img_w': False,
                        'img_h': False,
                        'img_size': False,
                        'ext_type': False,
                        'img': False
                    })
                # Certain days Wikipedia shows less data; fail-safe check, break and continue.
                try:
                    self.scrape_set.update({
                        elem: {
                            'year': year[0],
                            'content': str(content),
                            'links': bulk_link
                        }
                    })
                except:
                    break
        with open('scraped_data/temp_scraped_data.json', 'wb+') as content:
            content.write(json.dumps(self.scrape_set))
        data = open('scraped_data/temp_scraped_data.json', 'r')
        content = json.loads(data.read())
        data.close()
        for key, val in sorted(content.iteritems()):
            for link in val['links']:
                url = link['url']
                title = str(link['title'].encode('ascii', 'ignore'))
                # Fail safe: encode '#' => %23, as the requests library does not do that automatically.
                encoded_url = re.sub('#', '%23', url.replace("/wiki/", ''))
                r = self.r_session.get(self.img_url % (encoded_url, self.img_size_approx),
                                       headers=self.browser_headers)
                data = json.loads(r.content)
                # Fail safe for a 'pages' key error from the Wiki API:
                # break the loop and move forward.
                if 'pages' not in data['query']:
                    self.text(93, "\n BOT : Page Key Error For - %s\n" % (title))
                    self.log.error("BOT : Page Key Error For - %s" % (title))
                    break
                for k, v in data['query']['pages'].iteritems():
                    if 'thumbnail' in v:
                        thumb_url = str(v['thumbnail']['source'].encode('ascii', 'ignore'))
                        img_w = v['thumbnail']['width']
                        img_h = v['thumbnail']['height']
                        img_data = self.save_img(thumb_url, title)
                        self.temp_link.append({
                            'content_key': key,
                            'url': url,
                            'title': title,
                            'img_w': img_w,
                            'img_h': img_h,
                            'img_size': img_data[1],
                            'ext_type': img_data[2],
                            'img': img_data[0]
                        })
                    else:
                        self.text(93, "\n BOT : Image Not Available For - %s (Skipping)\n" % (title))
                        self.log.warn("BOT : Image Not Available For - %s (Skipping)" % (title))
        temp_link_list = open('scraped_data/temp_link.json', 'wb+')
        temp_link_list.write(json.dumps({'links': self.temp_link}))
        temp_link_list.close()
        # Read saved JSON for further processing
        save_json = open('scraped_data/temp_scraped_data.json', 'r')
        json_content_main = json.loads(save_json.read())
        save_json.close()
        for key, val in sorted(json_content_main.iteritems()):
            temp = open('scraped_data/temp_link.json', 'r')
            content_link = json.loads(temp.read())
            temp.close()
            for link in val['links']:
                for link_img in content_link['links']:
                    if link_img['title'] == link['title']:
                        # Replace and re-prepare JSON with the fetched image
                        link['img'] = link_img['img']
                        link['img_w'] = link_img['img_w']
                        link['img_h'] = link_img['img_h']
                        link['ext_type'] = link_img['ext_type']
                        link['img_size'] = link_img['img_size']
        # Write JSON (base)
        save_json_clean = open('scraped_data/temp_scraped_data.json', 'wb')
        save_json_clean.write(json.dumps(json_content_main))
        save_json_clean.close()
        os.remove('scraped_data/temp_link.json')

        # -----------------------------
        # Highlighted "On This Day"
        # -----------------------------
        temp_highlight_json = {}
        r = self.r_session.get(self.host, headers=self.browser_headers)
        data = r.content
        parse = BeautifulSoup(str(data), 'lxml', from_encoding='utf-8')
        div = parse.find("div", {"id": "mp-otd"})
        hot_link = BeautifulSoup(str(div.find('p')), 'lxml', from_encoding='utf-8')
        # Grab the highlight day and its contents here.
        highlight_day = str(hot_link.getText()).split(':')[0]
        highlight_content = str(hot_link.getText())
        # Main separator (change here if only one anchor tag from the first <p>
        # highlight element is required):
        # fetch only the 1st elem -> str(hot_link.html.body.p.b.a.contents[0])
        # Highlight div <p> element filter logic: grab all anchors in the first
        # highlight tag, pop element 0 as it is the year, and ignore year links.
        highlight_data = list(hot_link.html.body.p.findAll('a', href=True))
        highlight_data.pop(0)
        # Temporary highlight holder
        temp_highlight_link = []
        # Cleaned highlight holder
        highlight_link = []
        for link in highlight_data:
            temp = BeautifulSoup(str(link), 'lxml', from_encoding='utf-8')
            title = temp.a['title']
            link = temp.a['href']
            year = re.search(r"/wiki/[0-9]{3,4}", str(link))
            # Ignore year links in today's highlight <p> first tag.
            if not year:
                temp_highlight_link.append({'title': title, 'url': link})
        for link in temp_highlight_link:
            title = link['title']
            url = link['url']
            # Fail safe: encode '#' => %23, as the requests library does not do that automatically.
            encoded_url = re.sub('#', '%23', url.replace("/wiki/", ''))
            r = self.r_session.get(self.img_url % (encoded_url, self.img_size_approx),
                                   headers=self.browser_headers)
            data = json.loads(r.content)
            # Fail safe for a 'pages' key error from the Wiki API:
            # break the loop and move forward.
            if 'pages' not in data['query']:
                self.text(93, "\n BOT : Page Key Error For - %s\n" % (title))
                self.log.error("BOT : Page Key Error For - %s" % (title))
                break
            for k, v in data['query']['pages'].iteritems():
                if 'thumbnail' in v:
                    thumb_url = str(v['thumbnail']['source'].encode('ascii', 'ignore'))
                    img_w = v['thumbnail']['width']
                    img_h = v['thumbnail']['height']
                    img_data = self.save_img(thumb_url, title)
                    highlight_link.append({
                        'url': url,
                        'title': title,
                        'img_w': img_w,
                        'img_h': img_h,
                        'img_size': img_data[1],
                        'ext_type': img_data[2],
                        'img': img_data[0]
                    })
                else:
                    self.text(93, "\n BOT : Image Not Available For - %s (Skipping)\n" % (title))
                    self.log.warn("BOT : Image Not Available For - %s (Skipping)" % (title))
        self.text(92, "\n BOT : Scraping Data And Preparing Cleaned JSON...\n")

        # ----------------------------
        # Prepare final cleaned JSON
        # ----------------------------
        json_open_clean = open('scraped_data/temp_scraped_data.json', 'r')
        temp_link_clean_hold = json.loads(json_open_clean.read())
        json_open_clean.close()
        save_json = open('%s/%s.json' % (self.todays_dir, self.todays_dir_dt_fmt), 'wb')
        temp_l = []
        for k, link in sorted(temp_link_clean_hold.iteritems()):
            temp_l.append(link)
        clean_data = {
            'timestamp': self.current_day,
            'article_day': self.article_day,
            'todays_highlight': {
                'highlight_day': highlight_day,
                'highlight_content': highlight_content,
                'links': highlight_link
            },
            'year_highlight': temp_l
        }
        save_json.write(json.dumps(clean_data))
        os.remove('scraped_data/temp_scraped_data.json')
        self.log.info(str(time.strftime("BOT : Scraper Function Completed")))
        self.text(92, "\n BOT : Completed...\n\n")
        return None
    except Exception, e:
        self.log.exception(e)
        self.text(91, "\nError : %s\n\n" % (e))
        print '-' * 60
        traceback.print_exc()
        print '-' * 60
        sys.exit(1)
import urllib2
from bs4 import BeautifulSoup
import re
import sys

durl = "http://www.imdb.com/list/export?list_id=jwTh5Uwt2JU&author_id=ur24339561"
page = urllib2.urlopen(durl)
soup = BeautifulSoup(page)

rst = open("mvdb_const.txt", "a")
# Search the parsed document text; BeautifulSoup objects have no read() method.
for i in re.finditer(r"\"tt\d{7}\"", str(soup)):
    print >> rst, i.group(0)
rst.close()
from pathlib import Path
from bs4 import BeautifulSoup

data_folder = Path("input-extraction/")

overstock1 = data_folder / "jewelry01.html"
overstock2 = data_folder / "jewelry02.html"
rtvslo1 = data_folder / "Audi.html"
rtvslo2 = data_folder / "Volvo.html"
ceneje1 = data_folder / "PC-Ceneje.si.html"
ceneje2 = data_folder / "Kavci-Ceneje.si.html"
RacNovice1 = data_folder / "RacNovice1.html"
RacNovice2 = data_folder / "RacNovice2.html"

overstock1 = open(overstock1, "r", encoding='utf-8', errors='ignore')
overstock2 = open(overstock2, "r", encoding='utf-8', errors='ignore')
rtvslo1 = open(rtvslo1, "r", encoding='utf-8', errors='ignore')
rtvslo2 = open(rtvslo2, "r", encoding='utf-8', errors='ignore')
ceneje1 = open(ceneje1, "r", encoding='utf-8', errors='ignore')
ceneje2 = open(ceneje2, "r", encoding='utf-8', errors='ignore')
RacNovice1 = open(RacNovice1, "r", encoding='utf-8', errors='ignore')
RacNovice2 = open(RacNovice2, "r", encoding='utf-8', errors='ignore')

overstock1 = BeautifulSoup(overstock1.read(), 'html.parser')
overstock2 = BeautifulSoup(overstock2.read(), 'html.parser')
rtvslo1 = BeautifulSoup(rtvslo1.read(), 'html.parser')
rtvslo2 = BeautifulSoup(rtvslo2.read(), 'html.parser')
ceneje1 = BeautifulSoup(ceneje1.read(), 'html.parser')
ceneje2 = BeautifulSoup(ceneje2.read(), 'html.parser')
RacNovice1 = BeautifulSoup(RacNovice1.read(), 'html.parser')
RacNovice2 = BeautifulSoup(RacNovice2.read(), 'html.parser')
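# A hedged alternative to the block above: load the same files in one loop so
# each handle is closed and the soups end up in a dictionary keyed by filename.
# The helper name and the dict layout are illustrative choices, not the
# original project's API.
def load_soups(folder, filenames):
    soups = {}
    for filename in filenames:
        with open(folder / filename, "r", encoding='utf-8', errors='ignore') as fh:
            soups[filename] = BeautifulSoup(fh.read(), 'html.parser')
    return soups

soups = load_soups(data_folder, [
    "jewelry01.html", "jewelry02.html", "Audi.html", "Volvo.html",
    "PC-Ceneje.si.html", "Kavci-Ceneje.si.html",
    "RacNovice1.html", "RacNovice2.html",
])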
result = opener.open(loginurl, postdata)
cookie.save(ignore_discard=True, ignore_expires=True)
# Create the csv file object
csv_file = open('yxdatas.csv', 'wb')
csv_writer = csv.writer(csv_file, delimiter=',')
# Build the request URLs from the configured ids
Collegeurl = 'http://www.233.mistong.com/clpnew/index?cengci=b&cid={cid}&mid=#anchor'
cid = eval(config.ConfigIni.get_Cid())
for i in xrange(len(cid)):
    urlcollege = Collegeurl.format(cid=cid[i])
    response = opener.open(urlcollege)
    response = BeautifulSoup(response.read())
    try:
        colleges = response.select(".anaMes")[0].string
        print colleges
    except IndexError as e:
        print e
    data = response.select(".anaBg > ul > li ")
    for zklist in data:
        zkl = zklist.select("b")[0].get_text()
        print zkl
        # for i in zkl:
        #     zk = i
        #     print zk
        per = zklist.select("strong")[0].string
        print per
        csv_writer.writerow([colleges, zkl, per, urlcollege])