from bs4 import BeautifulSoup
import urllib2
import re

url = urllib2.urlopen("http://www.python.org")
content = url.read()
soup = BeautifulSoup(content)
for a in soup.findAll('a', href=True):
    if re.findall('group', a['href']):
        print "Found the URL:", a['href']
import re                                    # regular-expression patterns used for scraping
import time                                  # lets the program sleep between requests
from urllib.request import Request, urlopen  # fetches the HTML, acting like a headless browser
from bs4 import BeautifulSoup

print("ENTER THE URL TO FIND MOBILE NUMBER AND EMAIL url eg:https://www.homersbrandcare.com")
# URL of the website from which we need the email and phone number
url = str(input())
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
html = urlopen(req).read()
html = html.decode()
bsObj = BeautifulSoup(html, features="lxml")

links = []
for link in bsObj.find_all('a'):
    links.append(str(link.get('href')))
    # print(url + str(link.get('href')))


def Email(url):
    try:
        # make the request look like it comes from a desktop browser
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
        hh = urlopen(req).read()
        time.sleep(0.5)
        # time.sleep(1)
        # raw_html = t.text
        # dd = html2text.html2text(hh)
        dd = hh.decode()
        # e-mail pattern: only strings matching this format are collected
        email = re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', dd)
        print(email)
        # phone number pattern
        phone = re.findall(r'(tel\s?:?[0-9]+)|(\+91-?[0-9]+)', dd)
import requests
from bs4 import BeautifulSoup

url = "https://news.naver.com"
req = requests.get(url)
html = req.text
soup = BeautifulSoup(html, 'html.parser')

rankStr = "#ranking_10"
print(soup)
for i in range(0, 6):
    rankList = soup.select(rankStr + str(i))
    print(rankList)
    aList = []
    for rank in rankList:
        aList = rank.find_all('a')
        with open("news.txt", "a") as f:
            f.write("#####" + str(i) + "####\n")
            for article in aList:
                f.write(article.text + "\n" + url + article['href'] + "\n\n")
            f.write("###################\n")
retstr = StringIO()
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
process_pdf(rsrcmgr, device, pdfFile)
device.close()
content = retstr.getvalue()
retstr.close()
return content


html = urlopen(
    "http://www.cbf.com.br/competicoes/brasileiro-serie-a/tabela/2016#.WQTfO9orLIV"
)
bsObj = BeautifulSoup(html.read(), 'html.parser')
nameList = bsObj.findAll('div', class_="full-game-links")
sumulas = []
for name in nameList:
    res = name.findAll('a')
    if 'retificacoes' in str(res[0]):
        res = str(res[2])
    else:
        res = str(res[0])
    j = res.find('url[]=')
    k = res.find('.pdf')
    sumula = res[j + 6:k + 4]
    if len(sumula) > 0:
        sumulas.append(sumula)
from urllib import request

import requests
from bs4 import BeautifulSoup

url = 'https://www.ptt.cc/bbs/movie/index.html'
headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
res = requests.get(url, headers=headers)
# print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')
title = soup.select('div[class="title"]')
# print(title)
for t in title:
    print('________')
    try:
        # print(t)
        # select('a') pulls every <a> tag out of the block; .text extracts the string
        article_title = t.select('a')[0].text
        # the href is relative, so the site prefix must be prepended to get a usable URL
        article_url = 'https://www.ptt.cc/' + t.select('a')[0]['href']
        # could also be written as article_url = t.a
        print(article_title)
        print(article_url)
    except:
        print(t)
def __init__(self, name, page):
    self.name = name
    self.page = page
    rv = requests.get(
        'http://www.basketball-reference.com{0}'.format(page))
    self.soup = BeautifulSoup(rv.text, features="html.parser")
async def devices_specifications(request): if request.fwd_from: return """ Mobile devices specifications """ textx = await request.get_reply_message() brand = request.pattern_match.group(1).lower() device = request.pattern_match.group(2).lower() if brand and device: pass elif textx: brand = textx.text.split(" ")[0] device = " ".join(textx.text.split(" ")[1:]) else: await sunday.edit_or_reply(request, "`Usage: .specs <brand> <device>`") return all_brands = ( BeautifulSoup( get("https://www.devicespecifications.com/en/brand-more").content, "lxml" ) .find("div", {"class": "brand-listing-container-news"}) .findAll("a") ) brand_page_url = None try: brand_page_url = [ i["href"] for i in all_brands if brand == i.text.strip().lower() ][0] except IndexError: await sunday.edit_or_reply(request, f"`{brand} is unknown brand!`") return devices = BeautifulSoup(get(brand_page_url).content, "lxml").findAll( "div", {"class": "model-listing-container-80"} ) device_page_url = None try: device_page_url = [ i.a["href"] for i in BeautifulSoup(str(devices), "lxml").findAll("h3") if device in i.text.strip().lower() ] except IndexError: await sunday.edit_or_reply(request, f"`can't find {device}!`") return if len(device_page_url) > 2: device_page_url = device_page_url[:2] reply = "" for url in device_page_url: info = BeautifulSoup(get(url).content, "lxml") reply = "\n" + info.title.text.split("-")[0].strip() + "\n" info = info.find("div", {"id": "model-brief-specifications"}) specifications = re.findall(r"<b>.*?<br/>", str(info)) for item in specifications: title = re.findall(r"<b>(.*?)</b>", item)[0].strip() data = ( re.findall(r"</b>: (.*?)<br/>", item)[0] .replace("<b>", "") .replace("</b>", "") .strip() ) reply += f"**{title}**: {data}\n" await sunday.edit_or_reply(request, reply)
def insertCode(): # Con la libreria soup se abre el fichero index de templates soup = BeautifulSoup(open(url_index), 'html.parser') # Eliminar los link y script que vamos a insertar después soup.find("script", {"src": "resources/jquery.js"}).extract() soup.find("script", {"src": "resources/marked.min.js"}).extract() soup.find("link", {"href": "resources/primer.css"}).extract() soup.find("link", {"href": "resources/rec.css"}).extract() soup.find("link", {"href": "resources/extra.css"}).extract() soup.find("link", {"href": "resources/owl.css"}).extract() # Se introduce primer link a un css con sus atributos en head con append que lo mete al final new_link = soup.new_tag('link') new_link.attrs['rel'] = 'stylesheet' new_link.attrs['href'] = '{% static \'extra.css\' %}' # static soup.head.append(new_link) new_link = soup.new_tag('link') new_link.attrs['rel'] = 'stylesheet' new_link.attrs['href'] = '{% static \'owl.css\' %}' soup.head.append(new_link) new_link = soup.new_tag('link') new_link.attrs['rel'] = 'stylesheet' new_link.attrs['href'] = '{% static \'primer.css\' %}' soup.head.append(new_link) new_link = soup.new_tag('link') new_link.attrs['rel'] = 'stylesheet' new_link.attrs['href'] = '{% static \'rec.css\' %}' soup.head.append(new_link) new_link = soup.new_tag('link') new_link.attrs['rel'] = 'stylesheet' new_link.attrs['href'] = '{% static \'validate.css\' %}' soup.head.append(new_link) # Se introduce el script que modifica tabla de contenidos y se mete con insert al principio, es esencial que se ejecute antes new_script = soup.new_tag('script') new_script.attrs['type'] = 'text/javascript' new_script.attrs['src'] = '{% static \'jquery.js\' %}' soup.head.insert(1, new_script) # Se introduce script para el css new_script = soup.new_tag('script') new_script.attrs['type'] = 'text/javascript' new_script.attrs['src'] = '{% static \'marked.min.js\' %}' soup.head.insert(3, new_script) # Se introduce nuestro script para añadir la funcion de validacion new_script = soup.new_tag('script') new_script.attrs['type'] = 'text/javascript' new_script.attrs['src'] = '{% static \'validate.js\' %}' soup.head.insert(4, new_script) # Se introduce script con la libreria ajax para utilizar en nuestro script de validacion... esencial ponerlo al principio porque si no no carga bien el script new_script = soup.new_tag('script') new_script.attrs['src'] = 'https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js' soup.head.insert(2, new_script) # Aqui se empieza a meter la parte para descargar las shapes, desde dentro hacia fuera new_img = soup.new_tag('img') new_img.attrs['src'] = 'https://img.shields.io/badge/Format-TTL-blue.svg' new_img.attrs['<'] = '' new_img.attrs['img'] = '' new_a = soup.new_tag('a') new_a.attrs['href'] = 'shapes.ttl' new_a.attrs['target'] = '_blank' new_a.append(new_img) new_span = soup.new_tag('span') new_span.append(new_a) new_dd = soup.new_tag('dd') new_dd.append(new_span) new_dt = soup.new_tag('dt') new_dt.string = "Download shapes" new_dl = soup.new_tag('dl') new_dl.insert(0, new_dt) new_dl.insert(1, new_dd) new_br = soup.new_tag('br') soup.find("dt", text="Download serialization:").parent.insert_after(new_dl) # Aqui se empieza a meter la parte de validacion, otra vez desde dentro hacia fuera new_text = soup.new_tag('textarea') new_text.attrs['id'] = 'textrules' new_text.attrs['rows'] = '14' new_text.attrs['cols'] = '60' new_text.attrs['placeholder'] = 'This is a placeholder where the rules should be explained in natural language.' 
new_h4 = soup.new_tag('h4') new_h4.string = 'Validation rules' new_button = soup.new_tag('button') new_button.attrs['class'] = 'buttonS' new_button.attrs['onclick'] = "validate('{% url 'validate'%}', '{{ csrf_token }}')" new_button.string = 'Validate' new_text1 = soup.new_tag('textarea') new_text1.attrs['id'] = 'textdata' new_text1.attrs['rows'] = '14' new_text1.attrs['cols'] = '100' new_text1.attrs['placeholder'] = 'Write data for the validation. Only Turtle format (ttl) is supported.' new_input = soup.new_tag('input') new_input.attrs['class'] = 'inp' new_input.attrs['type'] = 'checkbox' new_label = soup.new_tag('label') new_label.attrs['id'] = 'labelValid' new_label.string = 'Coverage' new_label.append(new_input) new_br = soup.new_tag('br') new_br1 = soup.new_tag('br') new_br2 = soup.new_tag('br') new_br3 = soup.new_tag('br') new_span1 = soup.new_tag('span') new_span1.attrs['class'] = 'spanExp' new_span1.attrs['style'] = 'color:#87CEFA' new_span1.string = '*' new_span2 = soup.new_tag('span') new_span1.attrs['class'] = 'spanExp' new_span2.string = 'Coverage parameter specifies, if true, which types within the data are covered by the provided shape (consider that those not covered by the shape are always correctly validated).' new_divValid = soup.new_tag('div') new_divValid.attrs['id'] = 'divValid' new_divValid.append(new_label) new_divValid.append(new_br1) new_divValid.append(new_span1) new_divValid.append(new_span2) new_divValid.append(new_br) new_divValid.append(new_br2) new_divValid.append(new_text1) new_divValid.append(new_br3) new_divValid.append(new_button) new_a = soup.new_tag('a') new_a.attrs['href'] = '#toc' new_a.string = 'ToC' new_span = soup.new_tag('span') new_span.attrs['class'] = 'backlink' new_span.string = 'back to' new_span.append(new_a) new_h2 = soup.new_tag('h2') #new_h2.attrs['id'] = 'valid' new_h2.attrs['class'] = 'list' new_h2.string = 'Validation' new_h2.append(new_span) # Se mete el div con el resto incluido, importante incluirselo en orden. Al final se añade al body con soup new_div = soup.new_tag('div') new_div.attrs['id'] = 'validation' new_div.append(new_h2) new_div.append(new_divValid) new_div.append(new_h4) new_div.append(new_text) soup.find("div", {"id": "references"}).insert_before(new_div) # Se escribe el codigo añadido en el index de templates para actualizarlo with open(url_index, "w") as file: file.write(str(soup))
def eurorub():
    full_page = requests.get(euro_rub, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    # note: a dict literal with duplicate "class" keys keeps only the last one,
    # so this filter effectively matches class "SwHCTb" with data-precision=2
    convert = soup.findAll("span", {"class": "DFlfde", "class": "SwHCTb", "data-precision": 2})
    return convert[0].text
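# Possible usage sketch (assumptions: `euro_rub` points at a Google currency-converter
# result page and `headers` carries a desktop User-Agent; both are defined elsewhere in
# the original module, so the values below are illustrative only):
# euro_rub = 'https://www.google.com/search?q=eur+to+rub'
# headers = {'User-Agent': 'Mozilla/5.0'}
# print(eurorub())  # prints the converted rate text from the first matching <span>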
palabra.append(letra)
if ' ' in palabra:
    palabra.remove(' ')
nueva_palabra = ''.join(palabra)
os.rename(n, nueva_palabra)
palabra.clear()

for item in os.listdir():
    os.path.splitext(item)
    name_changed = os.path.splitext(item)[0]
    print(name_changed)
    if name_changed.isdigit():
        req = requests.get('/{}/'.format(name_changed))
        soup = BeautifulSoup(req.text, "lxml")
        new_name = soup.h1.string + '.rar'
        palabra_dos = []
        not_allowed = ['*', '"', '/', '\\', '<', '>', ':', '|', '?']
        for letra in new_name:
            palabra_dos.append(letra)
        for character in not_allowed:
            for n in palabra_dos:
                if character == n:
                    palabra_dos.remove(character)
        new_name = ''.join(palabra_dos)
        print(new_name)
        os.rename(item, new_name)
def get_soup(file):
    with codecs.open(file, encoding='utf-8') as f:
        handler = f.read()
    return BeautifulSoup(handler, features="lxml")
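# Possible usage sketch (assumption: 'saved_page.html' is an illustrative local
# UTF-8 HTML file, not a file referenced by the original code):
# soup = get_soup('saved_page.html')
# print(soup.title.string if soup.title else 'no <title> found')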
def main(): # connect to server try: cnx = mysql.connector.connect( host=dbhostname, user=dbusername, passwd=dbpassword, db=dbname, charset='utf8', use_unicode=True ) cursor = cnx.cursor() except mysql.connector.Error as err: if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: print('Something is wrong with your user name or password') else: print(err) print('Script is shutting down.') os._exit(1) # load DB and initialize table try: cursor.execute(f'USE {dbname}') except mysql.connector.Error as err: print(f'Database {dbname} does not exist.') if err.errno == errorcode.ER_BAD_DB_ERROR: create_database(cursor) print(f'Database {dbname} created successfully.') cnx.database = dbname else: print(err) os._exit(1) cursor.execute(TABLE_create_query) # scrape main page URL = 'https://celestrak.com/NORAD/elements/' page = requests.get(URL) soup = BeautifulSoup(page.content, 'html.parser') tables = soup.find_all('table', class_='striped-odd') tot_files = 0 for table in tables: # get main category header = table.find('tr', class_='header') main_cat = header.next.next # find all links within main category links = header.find_next_siblings() for link in links: _tmp_link = link.next.next if type(_tmp_link) != element.Tag: continue if 'href' in _tmp_link.attrs: name = _tmp_link['href'] if name[-4:] == '.txt': # start processing file in new thread sub_cat = _tmp_link.get_text() _url = URL + name Thread(target=process_file, args=(_url, name, sub_cat, main_cat)).start() tot_files += 1 # Skip the supplemental for now if (False): # scrape supplemental page URL_SUP = 'https://celestrak.com/NORAD/elements/supplemental/' page = requests.get(URL_SUP) soup = BeautifulSoup(page.content, 'html.parser') table = soup.find('table', class_='center outline') # get main category header = table.find('tr', class_='header') main_cat = header.next.next.next # find all links within main category links = header.find_next_siblings() for link in links: _tmp_link = link.next.next.next name = _tmp_link['href'] if name[-4:] == '.txt': # start processing file in new thread _url = URL_SUP + name Thread(target=process_file, args=(_url, name, sub_cat, main_cat)).start() tot_files += 1 # wait for all threads to finish while displaying progress while True: if tot_proc == tot_files: break print(f'Processed {tot_proc}/{tot_files} files', end='\r') time.sleep(0.25) print(f'{tot_proc} categories loaded successfully, with {len(buffer)} ' 'entries in total.\nSaving to database...', end='') # clear current records clear_table = ("TRUNCATE TABLE categories") cursor.execute(clear_table) # save to DB add_entry_query = """INSERT INTO categories (obj_no, name, sub_category, description) VALUES (%s, %s, %s, %s)""" i = 0 entry_list = [] for _x in buffer: if (i<1000): entry_list.append(_x) i+=1 else: cursor.executemany(add_entry_query, entry_list) entry_list = [] i = 0 # Commit the remaining batch < 1000 if (len(entry_list) > 0): cursor.executemany(add_entry_query, entry_list) cnx.commit() print('done') cursor.close() cnx.close() print('All satellites successfully saved to database!')
        f.write(i + '\n')
        print(i)
    f.close()


if __name__ == '__main__':
    try:
        start_date = '20200101'
        end_date = '20200111'
        last_one_day = timedelta(days=1)
        today = datetime.strptime(start_date, '%Y%m%d').date()
        end_date = datetime.strptime(end_date, '%Y%m%d').date()
        current_url = 'http://data.eastmoney.com/hsgt/top10.html'
        while today <= end_date:
            if is_workday(today):
                driver.get(current_url[:-5] + "/" + str(today) + ".html")
                page_load_complete = WAIT.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".titbar > .tit")))
                # print("page finished loading")
                html = driver.page_source
                soup = BeautifulSoup(html, 'lxml')
                date = soup.select_one(".sitebody > .maincont > .contentBox > .content > .tab1")
                # print(today)
                data_list.append(today.strftime('%Y%m%d'))
                # print(date is not None)
            today = today + last_one_day
        path = get_stock_data_path()
        save_file(path + "/北向买卖A股时间")
    finally:
        driver.quit()
def soup_the_response(self):
    """Creates a soup object."""
    self.soup = BeautifulSoup(self.content, 'lxml')
    logger.info('The response was souped, and a soup object was created.')
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://www.wikipedia.org/')
# parse with html.parser
bso = BeautifulSoup(html, "html.parser")
a_list = bso.find_all("div", {'class': ''})
for item in a_list:
    print(item.get_text())
def createText(newEpub, textPath, basePath): #生成Cover.html htmlContent = [] htmlHead1 = '<?xml version="1.0" encoding="utf-8" standalone="no"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<link href="../Styles/style.css" rel="stylesheet" type="text/css" />\n<title>封面</title>\n</head>\n<body>' htmlContent.append(htmlHead1) htmlContent.append( '<div class="cover"><img alt=""src="../Images/' + newEpub.coverUrl.split('/')[-1] + '" /></div>') htmlContent.append('<div class="entry">\n<span class="title">简介</span>\n<div class="entry-content">') htmlContent.append('<p>' + newEpub.introduction + '</p>') htmlContent.append('</div></div></body></html>') tempContent = '' for line in htmlContent: tempContent += line with codecs.open(os.path.join(textPath, 'Cover.html'), 'w', 'utf-8') as f: f.write(BeautifulSoup(tempContent).prettify()) #生成单章节html for i in sorted(newEpub.chapter, key=lambda chapter: chapter[0]): htmlContent = [] print('正在生成', i[1]) if hasQT: sender.sigChangeStatus.emit('正在生成' + i[1]) htmlHead1 = '<?xml version="1.0" encoding="utf-8" standalone="no"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">\n<head>\n<link href="../Styles/style.css" rel="stylesheet" type="text/css" />\n<title>' htmlHead2 = '</title>\n</head>\n<body>\n<div>' htmlContent.append(htmlHead1 + i[1] + htmlHead2) htmlContent.append('<h4>' + i[1] + '</h4>') for line in i[2]: if line.startswith('<div class="lk-view-img">'): findImagesUrl = re.compile(r'data-cover="(.*)" src="') imageUrl = findImagesUrl.search(line).group(1) if not imageUrl.startswith('http://'): imageUrl = 'http://lknovel.lightnovel.cn' + imageUrl downloadQueue.put((imageUrl, basePath)) imageP = '<div class="illust"><img alt="" src="../Images/' + imageUrl.split('/')[ -1] + '" /></div>\n<br/>' htmlContent.append(imageP) elif line.startswith('<a class="inline"'): pass else: htmlContent.append('<p>' + line + '</p>') htmlHead3 = '</div>\n</body>\n</html>' htmlContent.append(htmlHead3) tempContent = '' for line in htmlContent: tempContent += line with codecs.open(os.path.join(textPath, str(i[0]) + '.html'), 'w', 'utf-8') as f: f.write(BeautifulSoup(tempContent).prettify()) #生成Title.html htmlContent = [] htmlHead1 = '<?xml version="1.0" encoding="utf-8" standalone="no"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">\n<head>\n<link href="../Styles/style.css" rel="stylesheet" type="text/css" />\n<title>' htmlHead2 = '</title>\n</head>\n<body>\n<div class="entry">' htmlContent.append(htmlHead1 + newEpub.volumeName + htmlHead2) htmlContent.append('<span class="title">' + newEpub.volumeName + '</span>') htmlContent.append('<div class="entry-content introduction">\n<h4>' + newEpub.volumeNumber + '</h4>') htmlContent.append('<div>\n<br />\n</div>') htmlContent.append('<p>作者:' + newEpub.authorName + '</p>') if newEpub.illusterName: htmlContent.append('<p>插画:' + newEpub.illusterName + '</p>') htmlContent.append('<p>制作:<a target="_blank" href="http://www.github.com/bebou/lknovel">lknovel</a></p>') htmlContent.append('</div>\n</div>\n</body>\n</html>') tempContent = '' for line in htmlContent: tempContent += line with codecs.open(os.path.join(textPath, 'Title.html'), 'w', 'utf-8') as f: f.write(BeautifulSoup(tempContent).prettify()) 
#生成Contents.html htmlContent = [] htmlContent.append( '<?xml version="1.0" encoding="utf-8" standalone="no"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<link href="../Styles/style.css" rel="stylesheet" type="text/css" />\n<title>目录</title>\n</head>') htmlContent.append( '<body>\n<div class="entry">\n<span class="title">目录</span>\n<div class="entry-content">\n<ul class="contents">\n') for i in sorted(newEpub.chapter, key=lambda chapter: chapter[0]): htmlContent.append('<li class="c-rules"><a href="../Text/' + str(i[0]) + '.html">' + i[1] + '</a></li>') htmlContent.append('</ul>\n</div>\n</div>\n</body>\n</html>') tempContent = '' for line in htmlContent: tempContent += line with codecs.open(os.path.join(textPath, 'Contents.html'), 'w', 'utf-8') as f: f.write(BeautifulSoup(tempContent).prettify()) #下载相关图片 th = [] for i in range(5): t = threading.Thread(target=download) t.start() th.append(t) for i in th: i.join() #生成content.opf htmlContent = [] htmlContent.append( '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\n<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId" version="2.0">\n<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">') htmlContent.append( '<dc:identifier id="BookId" opf:scheme="UUID">urn:uuid:' + str(uuid.uuid1()) + '</dc:identifier>') htmlContent.append('<dc:title>' + newEpub.bookName + '</dc:title>') htmlContent.append( '<dc:creator opf:file-as="' + newEpub.authorName + '" opf:role="aut">' + newEpub.authorName + '</dc:creator>') htmlContent.append('<dc:language>zh</dc:language>') htmlContent.append('<dc:source>http://www.lightnovel.cn</dc:source>') htmlContent.append('<dc:description>由https://github.com/bebound/lknovel/生成</dc:description>') htmlContent.append('<meta content="' + newEpub.coverUrl.split('/')[-1] + '" name="cover" />') htmlContent.append('</metadata>') htmlContent.append('<manifest>\n<item href="toc.ncx" id="ncx" media-type="application/x-dtbncx+xml" />') for dirPath, dirNames, fileNames in os.walk(os.path.join(basePath, 'Text')): for file in fileNames: htmlContent.append('<item href="Text/' + file + '" id="' + file + '" media-type="application/xhtml+xml" />') htmlContent.append('<item href="Styles/style.css" id="style.css" media-type="text/css" />') for dirPath, dirNames, fileNames in os.walk(os.path.join(basePath, 'Images')): for file in fileNames: if file.split('.')[-1] == 'jpg': htmlContent.append('<item href="Images/' + file + '" id="' + file + '" media-type="image/jpeg" />') else: htmlContent.append('<item href="Images/' + file + '" id="' + file + '" media-type="image/png" />') htmlContent.append('</manifest>') htmlContent.append('<spine toc="ncx">') htmlContent.append( '<itemref idref="Cover.html" />\n<itemref idref="Title.html" />\n<itemref idref="Contents.html" />\n') for dirPath, dirNames, fileNames in os.walk(os.path.join(basePath, 'Text')): for file in sorted(fileNames, key=sortItemref): if file not in ('Cover.html', 'Title.html', 'Contents.html'): htmlContent.append('<itemref idref="' + file + '" />') htmlContent.append('</spine>') htmlContent.append( '<guide>\n<reference href="Text/Contents.html" title="Table Of Contents" type="toc" />') htmlContent.append( '<reference href="Text/Cover.html" title="Cover" type="cover"/>\n</guide>') htmlContent.append('</package>') with codecs.open(os.path.join(basePath, 'content.opf'), 'w', 'utf-8') as f: for line in htmlContent: 
f.write(line + '\n') #生成toc.ncx htmlContent = [] htmlContent.append( '<?xml version="1.0" encoding="UTF-8" standalone="no" ?>\n<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"\n"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">\n<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">\n<head>\n<meta content="0" name="dtb:depth"/>\n<meta content="0" name="dtb:totalPageCount"/>\n<meta content="0" name="dtb:maxPageNumber"/>\n</head>\n<docTitle>\n<text>' + newEpub.bookName + '</text>\n</docTitle>') htmlContent.append('<docAuthor>\n<text>' + newEpub.authorName + '</text>\n</docAuthor>\n<navMap>') htmlContent.append( '<navPoint id="Contents" playOrder="1">\n<navLabel>\n<text>封面</text>\n</navLabel>\n<content src="Text/Cover.html"/>\n</navPoint>') htmlContent.append( '<navPoint id="Contents" playOrder="2">\n<navLabel>\n<text>标题</text>\n</navLabel>\n<content src="Text/Title.html"/>\n</navPoint>') htmlContent.append( '<navPoint id="Contents" playOrder="3">\n<navLabel>\n<text>目录</text>\n</navLabel>\n<content src="Text/Contents.html"/>\n</navPoint>') playorder = 4 for i in sorted(newEpub.chapter, key=lambda chapter: chapter[0]): htmlContent.append( '<navPoint id="' + str(i[0]) + '" playOrder="' + str(playorder) + '">\n<navLabel>\n<text>' + i[ 1] + '</text>\n</navLabel>\n<content src="Text/' + str(i[0]) + '.html"/>\n</navPoint>') playorder += 1 htmlContent.append('</navMap>\n</ncx>') with codecs.open(os.path.join(basePath, 'toc.ncx'), 'w', 'utf-8') as f: for line in htmlContent: f.write(line + '\n')
def aktuelBim(cikti='gorsel_veri'):
    """
    BİM "Aktüel" (weekly deals) data

    Usage:
        aktuelBim("json_veri")
        aktuelBim("json_gorsel")
        aktuelBim("gorsel_veri")
        aktuelBim("basliklar")
    """
    url = "https://www.bim.com.tr/default.aspx"
    kimlik = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    istek = requests.get(url, headers=kimlik, allow_redirects=True)
    corba = BeautifulSoup(istek.text, "lxml")

    sozluk = {}
    tarih = corba.find('a', class_='active subButton').text.strip()
    urun_alani = corba.find('div', class_='productArea')
    urun_rerero = []
    for urun in urun_alani.findAll('div', class_='inner'):
        host = 'https://www.bim.com.tr'
        try:
            urun_basligi = urun.find('h2', class_='title').text.strip()
            urun_linki = host + urun.a['href']
            urun_gorseli = host + urun.img['src'].replace(' ', '%20')
            urun_fiyati = urun.find('a', class_='gButton triangle').text.strip()
            urun_rerero.append({
                "urun_baslik": urun_basligi,
                "urun_link": urun_linki,
                "urun_gorsel": urun_gorseli,
                "urun_fiyat": urun_fiyati
            })
        except:
            pass
    sozluk.update({'tarih': tarih})
    sozluk.update({'urunler': urun_rerero})
    basliklar = [anahtar for anahtar in sozluk['urunler'][0].keys()]

    if cikti == 'json_veri':
        return sozluk
    elif cikti == 'json_gorsel':
        return json.dumps(sozluk, indent=2, sort_keys=False, ensure_ascii=False)
    elif cikti == 'gorsel_veri':
        return tabulate(sozluk['urunler'], headers='keys', tablefmt='psql')
    elif cikti == 'basliklar':
        return basliklar
    else:
        return kullanim

# print(aktuelBim("json_veri"))
# print(aktuelBim("json_gorsel"))
# print(aktuelBim("gorsel_veri"))
# print(aktuelBim("basliklar"))
# print(aktuelBim("alakasız bişi"))
    return errorcheck


def wait_review():
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CLASS_NAME, "score_result")))
    time.sleep(0.6)


while n <= 174747:
    try:
        time.sleep(0.1)
        driver.get('https://movie.naver.com/movie/bi/mi/basic.nhn?code=' + str(n))
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
    except UnexpectedAlertPresentException as e:
        Alert(driver).accept()
        n = n + 1
        continue
    except AttributeError as e:
        n = n + 1
        continue
    except Exception as ex:
        continue

    # if the server has blocked us, sleep for 15 minutes
    if (len(soup.get_text())) == 0:
import requests
import pandas as pd
import csv
import datetime

url = 'https://www.nigeriapropertycentre.com/for-sale/houses?q=for-sale+houses'
page = requests.get(url)

from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
items = soup.find_all('div', {'class': 'col-md-12'})

end_page_num = 23
filename = "nigeriaprop_houses.csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Listing_type", 'Location', "Price", "Bedroom", 'Bathroom', 'Toilet', 'Parking'])
    i = 1
    while i <= end_page_num:
        r = requests.get("https://www.nigeriapropertycentre.com/for-sale/houses?q=for-sale+houses?page={}".format(i))
        soup = BeautifulSoup(r.text, "html.parser")
        items = soup.find_all('div', {'class': 'col-md-12'})
        x = items[2:]
        for item in x:
            try:
                Listing_type = item.find('span').get_text()
            except:
                Listing_type = 'N/A'
            try:
r"or Choose a user", # This is a parsing bug in the tool r"argument ", # I can't find this one r"text", ] # Sort regexes in descending order of their lengths. As a result, the # longer phrases will be ignored first. IGNORED_PHRASES.sort(key=lambda regex: len(regex), reverse=True) # Compile regexes to improve performance. This also extracts the # text using BeautifulSoup and then removes extra whitespaces from # it. This step enables us to add HTML in our regexes directly. COMPILED_IGNORED_PHRASES = [ re.compile(' '.join(BeautifulSoup(regex, 'lxml').text.split())) for regex in IGNORED_PHRASES ] SPLIT_BOUNDARY = '?.!' # Used to split string into sentences. SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY)) # Regexes which check capitalization in sentences. DISALLOWED_REGEXES = [ re.compile(regex) for regex in [ r'^[a-z]', # Checks if the sentence starts with a lower case character. r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]', # Checks if an upper case character exists # after a lower case character when the first character is in upper case. ] ]
def get_total_num(content):
    soup = BeautifulSoup(content, 'html5lib')
    page_num = re.findall(re.compile('共[0-9]+条'), str(soup))
    page_num = int(page_num[0][1:-1])
    print(page_num)
    return page_num
correct_list = []
aggregatePhotos = []
for i in range(len(input_list)):
    url = "https://www.everydayhealth.com/drugs/" + input_list[i]
    r = requests.get(url)
    if r.status_code != 200:
        print("Could not connect to " + url)
        print("Response : " + str(r.status_code))
        continue
    correct_list.append(input_list[i])
    bs = BeautifulSoup(requests.get(url).text, 'html.parser')
    json_object = {"name": input_list[i]}
    photos = []
    b2 = bs.findAll("div", {"class": "drug-image"})
    for b4 in b2:
        if b4.findChildren("img")[0].attrs["src"] != "":
            src = b4.findChildren("img")[0].attrs["src"]
            photos.append(get_image("https:" + src, input_list[i]))
        elif b4.findChildren("img")[0].attrs["data-src"] != "":
            src = b4.findChildren("img")[0].attrs["data-src"]
            photos.append(get_image("https:" + src, input_list[i]))
    json_object["photos"] = photos
    aggregatePhotos.append(json_object)
driver.get(
    'https://flights.makemytrip.com/makemytrip/search/O/O/E/1/0/0/S/V0/DEL_BOM_06-03-2018?contains=false&remove='
)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

html_page = driver.page_source
driver.quit()
soup = BeautifulSoup(html_page, "html.parser")
tags = soup('span')
c = soup.find_all(
    "span", {
        "class": "block logo_name hidden-xs visible-stb light_gray flt_number_less600 ng-binding ng-scope"
    })  # flNumber
d = soup.find_all("span", {"class": "num ng-binding"})  # price

handle = open('random.txt', 'w')
for i in c:
    handle.write("%s\n" % i.contents)
for i in d:
    handle.write("%s\n" % i.contents)
def main():
    with open(item_list, 'r', encoding='utf-8') as spell_file, open(csv_file, 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter='\t', quotechar='"')
        raw_html = (spell_file.read()
                    .replace('−', '-')
                    .replace('&mdash', '--')
                    .replace('–', '-')
                    .replace('×', 'x')
                    .replace('—', '--')
                    .replace('–', '–'))
        soup = BeautifulSoup(raw_html, 'html.parser')
        extract_items(soup, writer)
import re from nltk.tokenize import RegexpTokenizer from nltk.stem import WordNetLemmatizer count = 0 tokenizer = RegexpTokenizer(r'\w+') lemmatizer = WordNetLemmatizer() stopwords = ['nimh', 'nih', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'monday', 'tuesday', 'friday', 'saturday','sunday','wednesday','thursday', "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "should've", "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would", "able", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "afterwards", "ah", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "announce", "another", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "arent", "arise", "around", "aside", "ask", "asking", "auth", "available", "away", "awfully", "b", "back", "became", "become", "becomes", "becoming", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "believe", "beside", "besides", "beyond", "biol", "brief", "briefly", "c", "ca", "came", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "couldnt", "date", "different", "done", "downwards", "due", "e", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "former", "formerly", "forth", "found", "four", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "happens", "hardly", 
"hed", "hence", "hereafter", "hereby", "herein", "heres", "hereupon", "hes", "hi", "hid", "hither", "home", "howbeit", "however", "hundred", "id", "ie", "im", "immediate", "immediately", "importance", "important", "inc", "indeed", "index", "information", "instead", "invention", "inward", "itd", "it'll", "j", "k", "keep", "keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "made", "mainly", "make", "makes", "many", "may", "maybe", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "moreover", "mostly", "mr", "mrs", "much", "mug", "must", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "nobody", "non", "none", "nonetheless", "noone", "normally", "nos", "noted", "nothing", "nowhere", "obtain", "obtained", "obviously", "often", "oh", "ok", "okay", "old", "omitted", "one", "ones", "onto", "ord", "others", "otherwise", "outside", "overall", "owing", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "said", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "shed", "shes", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "sufficiently", "suggest", "sup", "sure", "take", "taken", "taking", "tell", "tends", "th", "thank", "thanks", "thanx", "thats", "that've", "thence", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "theyd", "theyre", "think", "thou", "though", "thoughh", "thousand", "throug", "throughout", "thru", "thus", "til", "tip", "together", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un", "unfortunately", "unless", "unlike", "unlikely", "unto", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "wasnt", "way", "wed", "welcome", "went", "werent", "whatever", "what'll", "whats", "whence", "whenever", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "whim", "whither", "whod", "whoever", "whole", "who'll", "whomever", "whos", "whose", "widely", "willing", "wish", "within", "without", "wont", "words", "world", "wouldnt", "www", "x", "yes", "yet", "youd", "youre", "z", "zero", "a's", "ain't", 
"allow", "allows", "apart", "appear", "appreciate", "appropriate", "associated", "best", "better", "c'mon", "c's", "cant", "changes", "clearly", "concerning", "consequently", "consider", "considering", "corresponding", "course", "currently", "definitely", "described", "despite", "entirely", "exactly", "example", "going", "greetings", "hello", "help", "hopefully", "ignored", "inasmuch", "indicate", "indicated", "indicates", "inner", "insofar", "it'd", "keep", "keeps", "novel", "presumably", "reasonably", "second", "secondly", "sensible", "serious", "seriously", "sure", "t's", "third", "thorough", "thoroughly", "three", "well", "wonder", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the", "a", "b", "c", "d", "e", "f", "g", 
"h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "co", "op", "research-articl", "pagecount", "cit", "ibid", "les", "le", "au", "que", "est", "pas", "vol", "el", "los", "pp", "u201d", "well-b", "http", "volumtype", "par", "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a1", "a2", "a3", "a4", "ab", "ac", "ad", "ae", "af", "ag", "aj", "al", "an", "ao", "ap", "ar", "av", "aw", "ax", "ay", "az", "b1", "b2", "b3", "ba", "bc", "bd", "be", "bi", "bj", "bk", "bl", "bn", "bp", "br", "bs", "bt", "bu", "bx", "c1", "c2", "c3", "cc", "cd", "ce", "cf", "cg", "ch", "ci", "cj", "cl", "cm", "cn", "cp", "cq", "cr", "cs", "ct", "cu", "cv", "cx", "cy", "cz", "d2", "da", "dc", "dd", "de", "df", "di", "dj", "dk", "dl", "do", "dp", "dr", "ds", "dt", "du", "dx", "dy", "e2", "e3", "ea", "ec", "ed", "ee", "ef", "ei", "ej", "el", "em", "en", "eo", "ep", "eq", "er", "es", "et", "eu", "ev", "ex", "ey", "f2", "fa", "fc", "ff", "fi", "fj", "fl", "fn", "fo", "fr", "fs", "ft", "fu", "fy", "ga", "ge", "gi", "gj", "gl", "go", "gr", "gs", "gy", "h2", "h3", "hh", "hi", "hj", "ho", "hr", "hs", "hu", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ic", "ie", "ig", "ih", "ii", "ij", "il", "in", "io", "ip", "iq", "ir", "iv", "ix", "iy", "iz", "jj", "jr", "js", "jt", "ju", "ke", "kg", "kj", "km", "ko", "l2", "la", "lb", "lc", "lf", "lj", "ln", "lo", "lr", "ls", "lt", "m2", "ml", "mn", "mo", "ms", "mt", "mu", "n2", "nc", "nd", "ne", "ng", "ni", "nj", "nl", "nn", "nr", "ns", "nt", "ny", "oa", "ob", "oc", "od", "of", "og", "oi", "oj", "ol", "om", "on", "oo", "oq", "or", "os", "ot", "ou", "ow", "ox", "oz", "p1", "p2", "p3", "pc", "pd", "pe", "pf", "ph", "pi", "pj", "pk", "pl", "pm", "pn", "po", "pq", "pr", "ps", "pt", "pu", "py", "qj", "qu", "r2", "ra", "rc", "rd", "rf", "rh", "ri", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "rv", "ry", "s2", "sa", "sc", "sd", "se", "sf", "si", "sj", "sl", "sm", "sn", "sp", "sq", "sr", "ss", "st", "sy", "sz", "t1", "t2", "t3", "tb", "tc", "td", "te", "tf", "th", "ti", "tj", "tl", "tm", "tn", "tp", "tq", "tr", "ts", "tt", "tv", "tx", "ue", "ui", "uj", "uk", "um", "un", "uo", "ur", "ut", "va", "wa", "vd", "wi", "vj", "vo", "wo", "vq", "vt", "vu", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y2", "yj", "yl", "yr", "ys", "yt", "zi", "zz"] urls = [] Finals = set() with open("urlfiles.txt", "r") as file: urls = file.readlines() for each in urls: if len(Finals) < 800: if "shtml" in each: page = urllib.urlopen(each) soup = BeautifulSoup(page, 'html.parser') paragraphs = soup.findAll('p') data = "" for single in paragraphs: data += single.text.strip() if "Page Not Found" not in data or 'possible that the page is temporarily unavailable' not in data: with open('ccdumpbig.txt', 'a') as f: f.write((data+"\n").encode("utf8")) data = re.sub(r"http\S+", "", data) # Remove emoticons data = data.encode('ascii', 'ignore').decode('ascii') data = tokenizer.tokenize(data) final_data = "" for w in data: if w.isalpha() and w.lower() not in stopwords: # Stem it
import time
import os
import csv
import unicodedata
import urllib.request
from bs4 import BeautifulSoup

with open('gov_newspapers.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=('name', 'state', 'newspaper', 'article headline', 'date',
                                           'url', 'body text', 'negative', 'positive', 'neutral', 'composite'))
    writer.writeheader()
    news = {}
    for i in range(6):
        mo_url = "https://www.stltoday.com/search/?f=html&q=mike+parson&d1=2018-04-01&d2=2019-07-01&s=start_time&sd=desc&l=100&t=article&nsa=eedition&app%5B0%5D=editorial&o={}00".format(i)
        mo_search = urllib.request.urlopen(mo_url)
        parsed_mo = BeautifulSoup(mo_search.read())
        h3_tag = parsed_mo.find_all('h3', {'class': 'tnt-headline'})
        for headline in range(len(h3_tag)):
            # time.sleep(2)
            try:
                news['name'] = "Mike Parsons"
                news['state'] = 'MO'
                news['newspaper'] = 'St. Louis Post Dispatch'
                news['article headline'] = unicodedata.normalize(
                    'NFKD', h3_tag[headline].get_text()).strip()
                news['url'] = 'https://www.stltoday.com' + h3_tag[headline].find('a')['href']
                article = urllib.request.urlopen(news['url'])
                parsed_article = BeautifulSoup(article.read())
                news['date'] = parsed_article.find('time').get_text()
                news['body text'] = parsed_article.find(
from bs4 import BeautifulSoup
import requests
import re

r = requests.get('https://www.jimsmowing.net')
content = r.text
soup = BeautifulSoup(content, 'html.parser')
# print(soup.find_all('p')[4].get_text())
import os
import time

from var import List

driver = webdriver.Chrome("./driver/chromedriver")
feel = List.img_list

for item in feel:
    time.sleep(1)
    driver.get("https://www.google.co.jp/imghp?hl=ja&tab=wi&ogbl")
    driver.find_element_by_name("q").send_keys(item, Keys.ENTER)

    # store the URL of the current results page
    current_url = driver.current_url
    html = requests.get(current_url)
    bs = BeautifulSoup(html.text, "lxml")
    images = bs.find_all("img", limit=10 + 1)

    # create the img folder if it does not exist yet
    if not os.path.isdir("img/ans"):
        os.makedirs("img/ans")

    # loop over the collected images and save each one
    for i, img in enumerate(images, start=1):
        src = img.get("src")
        try:
            responce = requests.get(src)
            with open("img/ans/" + item + "{}.jpg".format(i - 1), "wb") as f:
                f.write(responce.content)
        except requests.exceptions.MissingSchema:
            pass

driver.quit()
def collect_data(process_num, course_url_list, course_name_list, course_dur_list): # This will get the keywords from faculty file and put it into a dictionary with open('C:/Users/veye/Dropbox/Scrapping/Others/faculty.csv', 'rt', encoding='utf-8' ) as List: # Can use the faculty file from the Dropbox also. reader = csv.reader(List) mydict = {rows[0]: rows[1] for rows in reader} # This part will scrape the page with open('C:/Scrape/your_uni_folder/ExtractedData_' + '_all' + '.csv', 'at', newline='', encoding='utf-8-sig') as website: writer = csv.writer(website) while True: num_loop = 0 while num_loop < len(course_url_list): req = requests.get(course_url_list[num_loop]) soup = BeautifulSoup(req.content, 'lxml') # details['Course Name', 'Level', 'Faculty', 'Duration', 'Duration Type', 'URL', 'Description', 'Keywords', 'ScrapeAll'] details = ['', '', '', '', '', '', '', '', '', ''] ## # Course name if 'null' in course_name_list[num_loop]: "" else: details[0] = course_name_list[num_loop] #------------------------------- change code here --------------------------------------------- # Duration Text durationText = soup.find('div', { 'class': 'someclassnameyouhavetofind' }).text # Description descText = soup.find('div', { 'class': 'someclassnameyouhavetofind' }).text details[6] = clean(descText) #--------------- you don't have to change anything past here unless you really need to -------------- # Duration and Duration Type # this returns a pair duration (int), durationtype (string) durationPair = convertDuration(durationText) details[3] = durationPair[0] details[4] = durationPair[1] # Both the code for levels and faculty can be changed to suit the website that you are doing. # Levels word = details[0] lock = 0 for level, key in level_key.items(): for each in key: for wd in word.split(): if each.lower() == wd.lower( ): # Testing the equal, might change back to in details[1] = level lock = 1 break if lock == 1: break if lock == 1: break # Faculty loop_must_break = False for a in details[0].split(): for fac, key in mydict.items(): for each in key.split(','): if each.replace("'", '').title() in a: print("\t\t\t" + each + ' in ' + details[0] + ' from ' + course_url_list[num_loop]) details[2] = fac loop_must_break = True break if loop_must_break: break if loop_must_break: break # URL details[5] = req.url # Scrape All [ s.extract() for s in soup( ['style', 'script', '[document]', 'head', 'title']) ] visible_text = repr(soup.get_text().replace( r'\\n', ' ').replace('\n', '').replace('\\', '').replace(', ', '')) visible_text = re.sub(r'[^\x00-\x7f]', r' ', visible_text) visible_text = ' '.join(visible_text.split()) details[7] = str(repr(visible_text)) writer.writerow(details) print(details) print("Page " + str(num_loop) + '/' + str(len(course_url_list)) + " from " + '_all') time.sleep(3) num_loop += 1 print("\n" + str(len(course_url_list)) + " in the queue of " + '_all') print("\n" + '_all' + " has exited the loop") break
def get_soup(url):
    response = requests.request('get', url=url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup
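# Possible usage sketch (assumption: the target URL below is illustrative only,
# not one used by the original code):
# soup = get_soup('https://example.com')
# for a in soup.find_all('a', href=True):
#     print(a['href'])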