Example #1
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# fetch the page and report every link whose href mentions "group"
content = urlopen("http://www.python.org").read()
soup = BeautifulSoup(content, "html.parser")
for a in soup.findAll('a', href=True):
    if re.findall('group', a['href']):
        print("Found the URL:", a['href'])
Example #2
import re              # regular expressions for the email and phone patterns
import time            # lets the program sleep for a specific time between requests

from urllib.request import Request, urlopen  # fetches the HTML, acting like a headless browser
from bs4 import BeautifulSoup

print("ENTER THE URL TO FIND MOBILE NUMBER AND EMAIL, e.g. https://www.homersbrandcare.com")     # website from which we need the email and phone number

url = str(input())
req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
html = urlopen(req).read()
html = html.decode()
bsObj = BeautifulSoup(html, features="lxml")
links = []
for link in bsObj.find_all('a'):
    links.append(str(link.get('href')))


def Email(url):
    try:
        # make the request look like it comes from a regular browser
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
        hh = urlopen(req).read()
        time.sleep(0.5)
        dd = hh.decode()
        # only text matching these patterns is extracted
        email = re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', dd)   # email format
        print(email)
        phone = re.findall(r'(tel\s?:?[0-9]+)|(\+91-?[0-9]+)', dd)                  # phone number format
        print(phone)
        return email, phone
    except Exception as err:
        # skip pages that cannot be fetched or decoded
        print(err)
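The example is cut off before Email() is ever called; a hedged sketch of how the collected links might be fed through it (the urljoin loop below is an assumption, not part of the original script):

# assumption: walk the links gathered above, resolve relative hrefs, and scan each page
from urllib.parse import urljoin

for link in links:
    target = urljoin(url, link)   # handles absolute and relative hrefs alike
    if target.startswith('http'):
        Email(target)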
Example #3
import requests
from bs4 import BeautifulSoup
url = "https://news.naver.com"

req = requests.get(url)
html = req.text
soup = BeautifulSoup(html, 'html.parser')
rankStr = "#ranking_10"
print(soup)
for i in range(0, 6):
    rankList = soup.select(rankStr + str(i))
    print(rankList)
    aList = []
    for rank in rankList:
        aList = rank.find_all('a')

        with open("news.txt", "a") as f:
            f.write("#####" + str(i) + "####\n")
            for article in aList:
                f.write(article.text + "\n" + url + article['href'] + "\n\n")
            f.write("###################\n")
Example #4
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)

    process_pdf(rsrcmgr, device, pdfFile)
    device.close()

    content = retstr.getvalue()
    retstr.close()
    return content


html = urlopen(
    "http://www.cbf.com.br/competicoes/brasileiro-serie-a/tabela/2016#.WQTfO9orLIV"
)
bsObj = BeautifulSoup(html.read(), 'html.parser')

nameList = bsObj.findAll('div', class_="full-game-links")

sumulas = []
for name in nameList:
    res = name.findAll('a')
    if 'retificacoes' in str(res[0]):
        res = str(res[2])
    else:
        res = str(res[0])
    j = res.find('url[]=')
    k = res.find('.pdf')
    sumula = res[j + 6:k + 4]
    if len(sumula) > 0:
        sumulas.append(sumula)
Example #5
import requests
from bs4 import BeautifulSoup

url = 'https://www.ptt.cc/bbs/movie/index.html'

headers = {
    'User-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}

res = requests.get(url, headers=headers)
#print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')

title = soup.select('div[class="title"]')
#print(title)

for t in title:
    print('________')
    try:
        article_title = t.select('a')[0].text  # 'a' selects every <a> tag inside the block; .text pulls out the string
        article_url = 'https://www.ptt.cc' + t.select('a')[0]['href']  # the href is relative, so prepend the site root to get a full URL
        # could also be written as: article_url = t.a
        print(article_title)
        print(article_url)
    except IndexError:
        # deleted posts have no <a> tag, so just print the raw block
        print(t)
Example #6
 def __init__(self, name, page):
     self.name = name
     self.page = page
     rv = requests.get(
         'http://www.basketball-reference.com{0}'.format(page))
     self.soup = BeautifulSoup(rv.text, features="html.parser")
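The fragment above is only an __init__; a hedged sketch of a small, complete wrapper in the same spirit, where the class name Page and the title() helper are illustrative assumptions:

import requests
from bs4 import BeautifulSoup


class Page:
    """Fetch one basketball-reference.com page and keep its parsed soup around."""

    def __init__(self, name, page):
        self.name = name
        self.page = page
        rv = requests.get(
            'http://www.basketball-reference.com{0}'.format(page))
        self.soup = BeautifulSoup(rv.text, features="html.parser")

    def title(self):
        # the <title> of the fetched page, handy as a quick sanity check
        return self.soup.title.text if self.soup.title else ''


# usage: print(Page('leagues', '/leagues/').title())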
Example #7
async def devices_specifications(request):
    """ Mobile devices specifications """
    if request.fwd_from:
        return
    textx = await request.get_reply_message()
    brand = request.pattern_match.group(1).lower()
    device = request.pattern_match.group(2).lower()
    if brand and device:
        pass
    elif textx:
        brand = textx.text.split(" ")[0]
        device = " ".join(textx.text.split(" ")[1:])
    else:
        await sunday.edit_or_reply(request, "`Usage: .specs <brand> <device>`")
        return
    all_brands = (
        BeautifulSoup(
            get("https://www.devicespecifications.com/en/brand-more").content, "lxml"
        )
        .find("div", {"class": "brand-listing-container-news"})
        .findAll("a")
    )
    brand_page_url = None
    try:
        brand_page_url = [
            i["href"] for i in all_brands if brand == i.text.strip().lower()
        ][0]
    except IndexError:
        await sunday.edit_or_reply(request, f"`{brand} is an unknown brand!`")
        return
    devices = BeautifulSoup(get(brand_page_url).content, "lxml").findAll(
        "div", {"class": "model-listing-container-80"}
    )
    device_page_url = None
    try:
        device_page_url = [
            i.a["href"]
            for i in BeautifulSoup(str(devices), "lxml").findAll("h3")
            if device in i.text.strip().lower()
        ]
    except IndexError:
        await sunday.edit_or_reply(request, f"`can't find {device}!`")
        return
    if len(device_page_url) > 2:
        device_page_url = device_page_url[:2]
    reply = ""
    for url in device_page_url:
        info = BeautifulSoup(get(url).content, "lxml")
        reply += "\n" + info.title.text.split("-")[0].strip() + "\n"  # accumulate so every fetched device stays in the reply
        info = info.find("div", {"id": "model-brief-specifications"})
        specifications = re.findall(r"<b>.*?<br/>", str(info))
        for item in specifications:
            title = re.findall(r"<b>(.*?)</b>", item)[0].strip()
            data = (
                re.findall(r"</b>: (.*?)<br/>", item)[0]
                .replace("<b>", "")
                .replace("</b>", "")
                .strip()
            )
            reply += f"**{title}**: {data}\n"
    await sunday.edit_or_reply(request, reply)
Example #8
def insertCode():

    # Open the templates index file with BeautifulSoup
    soup = BeautifulSoup(open(url_index), 'html.parser')

    # Remove the link and script tags that we are going to insert again later
    soup.find("script", {"src": "resources/jquery.js"}).extract()
    soup.find("script", {"src": "resources/marked.min.js"}).extract()

    soup.find("link", {"href": "resources/primer.css"}).extract()
    soup.find("link", {"href": "resources/rec.css"}).extract()
    soup.find("link", {"href": "resources/extra.css"}).extract()
    soup.find("link", {"href": "resources/owl.css"}).extract()

    # Insert the first link to a CSS file with its attributes into head; append() places it at the end
    new_link = soup.new_tag('link')
    new_link.attrs['rel'] = 'stylesheet'
    new_link.attrs['href'] = '{% static \'extra.css\' %}'  # static
    soup.head.append(new_link)

    new_link = soup.new_tag('link')
    new_link.attrs['rel'] = 'stylesheet'
    new_link.attrs['href'] = '{% static \'owl.css\' %}'
    soup.head.append(new_link)

    new_link = soup.new_tag('link')
    new_link.attrs['rel'] = 'stylesheet'
    new_link.attrs['href'] = '{% static \'primer.css\' %}'
    soup.head.append(new_link)

    new_link = soup.new_tag('link')
    new_link.attrs['rel'] = 'stylesheet'
    new_link.attrs['href'] = '{% static \'rec.css\' %}'
    soup.head.append(new_link)

    new_link = soup.new_tag('link')
    new_link.attrs['rel'] = 'stylesheet'
    new_link.attrs['href'] = '{% static \'validate.css\' %}'
    soup.head.append(new_link)

    # Insert the script that modifies the table of contents; insert() places it at the beginning, since it must run before the rest
    new_script = soup.new_tag('script')
    new_script.attrs['type'] = 'text/javascript'
    new_script.attrs['src'] = '{% static \'jquery.js\' %}'
    soup.head.insert(1, new_script)

    # Insert the script for the CSS
    new_script = soup.new_tag('script')
    new_script.attrs['type'] = 'text/javascript'
    new_script.attrs['src'] = '{% static \'marked.min.js\' %}'
    soup.head.insert(3, new_script)

    # Insert our script that adds the validation function
    new_script = soup.new_tag('script')
    new_script.attrs['type'] = 'text/javascript'
    new_script.attrs['src'] = '{% static \'validate.js\' %}'
    soup.head.insert(4, new_script)

    # Insert the script with the AJAX library used by our validation script; it must go near the top, otherwise the script does not load properly
    new_script = soup.new_tag('script')
    new_script.attrs['src'] = 'https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js'
    soup.head.insert(2, new_script)

    # From here on, build the section for downloading the shapes, from the innermost element outwards
    new_img = soup.new_tag('img')
    new_img.attrs['src'] = 'https://img.shields.io/badge/Format-TTL-blue.svg'
    new_img.attrs['<'] = ''
    new_img.attrs['img'] = ''

    new_a = soup.new_tag('a')
    new_a.attrs['href'] = 'shapes.ttl'
    new_a.attrs['target'] = '_blank'
    new_a.append(new_img)

    new_span = soup.new_tag('span')
    new_span.append(new_a)

    new_dd = soup.new_tag('dd')
    new_dd.append(new_span)

    new_dt = soup.new_tag('dt')
    new_dt.string = "Download shapes"

    new_dl = soup.new_tag('dl')
    new_dl.insert(0, new_dt)
    new_dl.insert(1, new_dd)

    new_br = soup.new_tag('br')

    soup.find("dt", text="Download serialization:").parent.insert_after(new_dl)

    # From here on, build the validation section, again from the innermost element outwards
    new_text = soup.new_tag('textarea')
    new_text.attrs['id'] = 'textrules'
    new_text.attrs['rows'] = '14'
    new_text.attrs['cols'] = '60'
    new_text.attrs['placeholder'] = 'This is a placeholder where the rules should be explained in natural language.'

    new_h4 = soup.new_tag('h4')
    new_h4.string = 'Validation rules'

    new_button = soup.new_tag('button')
    new_button.attrs['class'] = 'buttonS'
    new_button.attrs['onclick'] = "validate('{% url 'validate'%}', '{{ csrf_token }}')"
    new_button.string = 'Validate'

    new_text1 = soup.new_tag('textarea')
    new_text1.attrs['id'] = 'textdata'
    new_text1.attrs['rows'] = '14'
    new_text1.attrs['cols'] = '100'
    new_text1.attrs['placeholder'] = 'Write data for the validation. Only Turtle format (ttl) is supported.'

    new_input = soup.new_tag('input')
    new_input.attrs['class'] = 'inp'
    new_input.attrs['type'] = 'checkbox'

    new_label = soup.new_tag('label')
    new_label.attrs['id'] = 'labelValid'
    new_label.string = 'Coverage'
    new_label.append(new_input)

    new_br = soup.new_tag('br')
    new_br1 = soup.new_tag('br')
    new_br2 = soup.new_tag('br')
    new_br3 = soup.new_tag('br')

    new_span1 = soup.new_tag('span')
    new_span1.attrs['class'] = 'spanExp'
    new_span1.attrs['style'] = 'color:#87CEFA'
    new_span1.string = '*'

    new_span2 = soup.new_tag('span')
    new_span2.attrs['class'] = 'spanExp'
    new_span2.string = 'Coverage parameter specifies, if true, which types within the data are covered by the provided shape (consider that those not covered by the shape are always correctly validated).'

    new_divValid = soup.new_tag('div')
    new_divValid.attrs['id'] = 'divValid'
    new_divValid.append(new_label)
    new_divValid.append(new_br1)
    new_divValid.append(new_span1)
    new_divValid.append(new_span2)
    new_divValid.append(new_br)
    new_divValid.append(new_br2)
    new_divValid.append(new_text1)
    new_divValid.append(new_br3)
    new_divValid.append(new_button)


    new_a = soup.new_tag('a')
    new_a.attrs['href'] = '#toc'
    new_a.string = 'ToC'

    new_span = soup.new_tag('span')
    new_span.attrs['class'] = 'backlink'
    new_span.string = 'back to'
    new_span.append(new_a)

    new_h2 = soup.new_tag('h2')
    #new_h2.attrs['id'] = 'valid'
    new_h2.attrs['class'] = 'list'
    new_h2.string = 'Validation'
    new_h2.append(new_span)

    # Put everything inside the div, appending the pieces in order; at the end it is added to the body
    new_div = soup.new_tag('div')
    new_div.attrs['id'] = 'validation'
    new_div.append(new_h2)
    new_div.append(new_divValid)
    new_div.append(new_h4)
    new_div.append(new_text)
    soup.find("div", {"id": "references"}).insert_before(new_div)

    # Write the modified markup back to the templates index file to update it
    with open(url_index, "w") as file:
        file.write(str(soup))
Example #9
def eurorub():
    full_page = requests.get(euro_rub, headers=headers)
    soup = BeautifulSoup(full_page.content, 'html.parser')
    # duplicate "class" keys in a dict literal collapse, so the lookup effectively matches class "SwHCTb"
    convert = soup.findAll("span", {"class": "SwHCTb", "data-precision": 2})
    return convert[0].text
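eurorub() depends on a module-level euro_rub URL and headers dict that the snippet does not show; a hedged guess at what they might look like (the exact URL and User-Agent string are assumptions):

# assumed module-level configuration for eurorub()
euro_rub = 'https://www.google.com/search?q=eur+to+rub'   # page whose result <span> carries the SwHCTb class
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

# print(eurorub())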
Example #10
		palabra.append(letra)
	if ' ' in palabra:
		palabra.remove(' ')
		nueva_palabra = ''.join(palabra)
		os.rename(n, nueva_palabra)

	palabra.clear()

for item in os.listdir():
	os.path.splitext(item)
	name_changed = os.path.splitext(item)[0]
	print(name_changed)

	if name_changed.isdigit():
		req = requests.get('/{}/'.format(name_changed))
		soup = BeautifulSoup(req.text, "lxml")

		new_name = soup.h1.string + '.rar'
	
		palabra_dos = []
		not_allowed = ['*', '"', '/', '\\', '<', '>', ':', '|', '?']

		for letra in new_name:
			palabra_dos.append(letra)
		# strip every character that is not allowed in a file name
		for character in not_allowed:
			while character in palabra_dos:
				palabra_dos.remove(character)
		new_name = ''.join(palabra_dos)
		print(new_name)
		os.rename(item, new_name)
Example #11
import codecs
from bs4 import BeautifulSoup


def get_soup(file):
    with codecs.open(file, encoding='utf-8') as f:
        handler = f.read()
    return BeautifulSoup(handler, features="lxml")
Example #12
def main():
    # connect to server
    try:
        cnx = mysql.connector.connect(
            host=dbhostname,
            user=dbusername,
            passwd=dbpassword,
            db=dbname,
            charset='utf8',
            use_unicode=True
        )
        cursor = cnx.cursor()
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print('Something is wrong with your user name or password')
        else:
            print(err)
        print('Script is shutting down.')
        os._exit(1)

    # load DB and initialize table
    try:
        cursor.execute(f'USE {dbname}')
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_BAD_DB_ERROR:
            print(f'Database {dbname} does not exist.')
            create_database(cursor)
            print(f'Database {dbname} created successfully.')
            cnx.database = dbname
        else:
            print(err)
            os._exit(1)
    cursor.execute(TABLE_create_query)

    # scrape main page
    URL = 'https://celestrak.com/NORAD/elements/'
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.find_all('table', class_='striped-odd')

    tot_files = 0

    for table in tables:
        # get main category
        header = table.find('tr', class_='header')
        main_cat = header.next.next
        # find all links within main category
        links = header.find_next_siblings()
        for link in links:
            _tmp_link = link.next.next
            if type(_tmp_link) != element.Tag:
                continue
            if 'href' in _tmp_link.attrs:
                name = _tmp_link['href']
                if name[-4:] == '.txt':
                    # start processing file in new thread
                    sub_cat = _tmp_link.get_text()
                    _url = URL + name
                    Thread(target=process_file, args=(_url, name, sub_cat, main_cat)).start()
                    tot_files += 1

    # Skip the supplemental for now
    if (False):
        # scrape supplemental page
        URL_SUP = 'https://celestrak.com/NORAD/elements/supplemental/'
        page = requests.get(URL_SUP)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.find('table', class_='center outline')

        # get main category
        header = table.find('tr', class_='header')
        main_cat = header.next.next.next
        # find all links within main category
        links = header.find_next_siblings()
        for link in links:
            _tmp_link = link.next.next.next
            name = _tmp_link['href']
            if name[-4:] == '.txt':
                # start processing file in new thread
                _url = URL_SUP + name
                Thread(target=process_file, args=(_url, name, sub_cat, main_cat)).start()
                tot_files += 1

    # wait for all threads to finish while displaying progress
    while True:
        if tot_proc == tot_files: break
        print(f'Processed {tot_proc}/{tot_files} files', end='\r')
        time.sleep(0.25)

    print(f'{tot_proc} categories loaded successfully, with {len(buffer)} '
            'entries in total.\nSaving to database...', end='')
    
    # clear current records
    clear_table = ("TRUNCATE TABLE categories")
    cursor.execute(clear_table)

    # save to DB
    add_entry_query = """INSERT INTO categories 
                (obj_no, name, sub_category, description) 
                VALUES (%s, %s, %s, %s)"""

    i = 0
    entry_list = []
    for _x in buffer:
        entry_list.append(_x)
        i += 1
        if i >= 1000:
            # flush a full batch of 1000 rows so no entry is dropped between batches
            cursor.executemany(add_entry_query, entry_list)
            entry_list = []
            i = 0
    # Commit the remaining batch < 1000
    if (len(entry_list) > 0):
        cursor.executemany(add_entry_query, entry_list)
    cnx.commit()
    print('done')

    cursor.close()
    cnx.close()
    print('All satellites successfully saved to database!')
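main() waits on a shared buffer list and a tot_proc counter that the process_file() workers are expected to fill, but that worker is not shown; a hedged sketch of what it might look like (the TLE parsing and the lock are assumptions, not the original implementation):

# assumption: one possible process_file() worker for the threads started in main()
import threading
import requests

buffer = []      # shared rows waiting to be written to the database
tot_proc = 0     # number of files processed so far, polled by main()
_lock = threading.Lock()


def process_file(url, name, sub_cat, main_cat):
    global tot_proc
    lines = requests.get(url).text.splitlines()
    entries = []
    # classic element-set text files hold three lines per object: name, TLE line 1, TLE line 2
    for i in range(0, len(lines) - 2, 3):
        sat_name = lines[i].strip()
        line1 = lines[i + 1]
        obj_no = line1[2:7].strip()   # the NORAD catalogue number sits in columns 3-7 of line 1
        entries.append((obj_no, sat_name, sub_cat, str(main_cat)))
    with _lock:
        buffer.extend(entries)
        tot_proc += 1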
Example #13
            f.write(i + '\n')
            print(i)
        f.close()


if __name__ == '__main__':
    try:
        start_date = '20200101'
        end_date = '20200111'
        last_one_day = timedelta(days=1)
        today = datetime.strptime(start_date,'%Y%m%d').date()
        end_date = datetime.strptime(end_date,'%Y%m%d').date()
        current_url = 'http://data.eastmoney.com/hsgt/top10.html'
        while today <= end_date:
            if is_workday(today):
                driver.get(current_url[:-5] + "/" + str(today) + ".html")
                page_load_complete = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".titbar > .tit")))
                # print("page load complete")
                html = driver.page_source
                soup = BeautifulSoup(html, 'lxml')
                date = soup.select_one(".sitebody > .maincont > .contentBox > .content > .tab1")
                # print(today)
                data_list.append(today.strftime('%Y%m%d'))
                # print(date is not None)
            today = today+last_one_day
        path = get_stock_data_path()
        save_file(path+"/北向买卖A股时间")
    finally:
        driver.quit()

Example #14
 def soup_the_response(self):
     """Creates a soup object."""
     self.soup = BeautifulSoup(self.content, 'lxml')
     logger.info('The response was souped, and a soup object was created.')
Example #15
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://www.wikipedia.org/')
# parse the page with html.parser
bso = BeautifulSoup(html, "html.parser")

# collect every <div> and print its text
a_list = bso.find_all("div")

for item in a_list:
    print(item.get_text())
Example #16
def createText(newEpub, textPath, basePath):
    # Generate Cover.html
    htmlContent = []
    htmlHead1 = '<?xml version="1.0" encoding="utf-8" standalone="no"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<link href="../Styles/style.css" rel="stylesheet" type="text/css" />\n<title>封面</title>\n</head>\n<body>'
    htmlContent.append(htmlHead1)
    htmlContent.append(
        '<div class="cover"><img alt=""src="../Images/' + newEpub.coverUrl.split('/')[-1] + '" /></div>')
    htmlContent.append('<div class="entry">\n<span class="title">简介</span>\n<div class="entry-content">')
    htmlContent.append('<p>' + newEpub.introduction + '</p>')
    htmlContent.append('</div></div></body></html>')
    tempContent = ''
    for line in htmlContent:
        tempContent += line
    with codecs.open(os.path.join(textPath, 'Cover.html'), 'w', 'utf-8') as f:
        f.write(BeautifulSoup(tempContent).prettify())


    # Generate one html file per chapter
    for i in sorted(newEpub.chapter, key=lambda chapter: chapter[0]):
        htmlContent = []
        print('正在生成', i[1])
        if hasQT:
            sender.sigChangeStatus.emit('正在生成' + i[1])
        htmlHead1 = '<?xml version="1.0" encoding="utf-8" standalone="no"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">\n<head>\n<link href="../Styles/style.css" rel="stylesheet" type="text/css" />\n<title>'
        htmlHead2 = '</title>\n</head>\n<body>\n<div>'
        htmlContent.append(htmlHead1 + i[1] + htmlHead2)
        htmlContent.append('<h4>' + i[1] + '</h4>')
        for line in i[2]:
            if line.startswith('<div class="lk-view-img">'):
                findImagesUrl = re.compile(r'data-cover="(.*)" src="')
                imageUrl = findImagesUrl.search(line).group(1)
                if not imageUrl.startswith('http://'):
                    imageUrl = 'http://lknovel.lightnovel.cn' + imageUrl
                downloadQueue.put((imageUrl, basePath))
                imageP = '<div class="illust"><img alt="" src="../Images/' + imageUrl.split('/')[
                    -1] + '" /></div>\n<br/>'
                htmlContent.append(imageP)
            elif line.startswith('<a class="inline"'):
                pass
            else:
                htmlContent.append('<p>' + line + '</p>')
        htmlHead3 = '</div>\n</body>\n</html>'
        htmlContent.append(htmlHead3)
        tempContent = ''
        for line in htmlContent:
            tempContent += line
        with codecs.open(os.path.join(textPath, str(i[0]) + '.html'), 'w', 'utf-8') as f:
            f.write(BeautifulSoup(tempContent).prettify())

    # Generate Title.html
    htmlContent = []
    htmlHead1 = '<?xml version="1.0" encoding="utf-8" standalone="no"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN">\n<head>\n<link href="../Styles/style.css" rel="stylesheet" type="text/css" />\n<title>'
    htmlHead2 = '</title>\n</head>\n<body>\n<div class="entry">'
    htmlContent.append(htmlHead1 + newEpub.volumeName + htmlHead2)
    htmlContent.append('<span class="title">' + newEpub.volumeName + '</span>')
    htmlContent.append('<div class="entry-content introduction">\n<h4>' + newEpub.volumeNumber + '</h4>')
    htmlContent.append('<div>\n<br />\n</div>')
    htmlContent.append('<p>作者:' + newEpub.authorName + '</p>')
    if newEpub.illusterName:
        htmlContent.append('<p>插画:' + newEpub.illusterName + '</p>')
    htmlContent.append('<p>制作:<a target="_blank" href="http://www.github.com/bebou/lknovel">lknovel</a></p>')
    htmlContent.append('</div>\n</div>\n</body>\n</html>')
    tempContent = ''
    for line in htmlContent:
        tempContent += line
    with codecs.open(os.path.join(textPath, 'Title.html'), 'w', 'utf-8') as f:
        f.write(BeautifulSoup(tempContent).prettify())

    # Generate Contents.html
    htmlContent = []
    htmlContent.append(
        '<?xml version="1.0" encoding="utf-8" standalone="no"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<link href="../Styles/style.css" rel="stylesheet" type="text/css" />\n<title>目录</title>\n</head>')
    htmlContent.append(
        '<body>\n<div class="entry">\n<span class="title">目录</span>\n<div class="entry-content">\n<ul class="contents">\n')
    for i in sorted(newEpub.chapter, key=lambda chapter: chapter[0]):
        htmlContent.append('<li class="c-rules"><a href="../Text/' + str(i[0]) + '.html">' + i[1] + '</a></li>')
    htmlContent.append('</ul>\n</div>\n</div>\n</body>\n</html>')
    tempContent = ''
    for line in htmlContent:
        tempContent += line
    with codecs.open(os.path.join(textPath, 'Contents.html'), 'w', 'utf-8') as f:
        f.write(BeautifulSoup(tempContent).prettify())


    # Download the related images
    th = []
    for i in range(5):
        t = threading.Thread(target=download)
        t.start()
        th.append(t)
    for i in th:
        i.join()

    # Generate content.opf
    htmlContent = []
    htmlContent.append(
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\n<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId" version="2.0">\n<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">')
    htmlContent.append(
        '<dc:identifier id="BookId" opf:scheme="UUID">urn:uuid:' + str(uuid.uuid1()) + '</dc:identifier>')
    htmlContent.append('<dc:title>' + newEpub.bookName + '</dc:title>')
    htmlContent.append(
        '<dc:creator opf:file-as="' + newEpub.authorName + '" opf:role="aut">' + newEpub.authorName + '</dc:creator>')
    htmlContent.append('<dc:language>zh</dc:language>')
    htmlContent.append('<dc:source>http://www.lightnovel.cn</dc:source>')
    htmlContent.append('<dc:description>由https://github.com/bebound/lknovel/生成</dc:description>')
    htmlContent.append('<meta content="' + newEpub.coverUrl.split('/')[-1] + '" name="cover" />')
    htmlContent.append('</metadata>')
    htmlContent.append('<manifest>\n<item href="toc.ncx" id="ncx" media-type="application/x-dtbncx+xml" />')
    for dirPath, dirNames, fileNames in os.walk(os.path.join(basePath, 'Text')):
        for file in fileNames:
            htmlContent.append('<item href="Text/' + file + '" id="' + file + '" media-type="application/xhtml+xml" />')
    htmlContent.append('<item href="Styles/style.css" id="style.css" media-type="text/css" />')
    for dirPath, dirNames, fileNames in os.walk(os.path.join(basePath, 'Images')):
        for file in fileNames:
            if file.split('.')[-1] == 'jpg':
                htmlContent.append('<item href="Images/' + file + '" id="' + file + '" media-type="image/jpeg" />')
            else:
                htmlContent.append('<item href="Images/' + file + '" id="' + file + '" media-type="image/png" />')
    htmlContent.append('</manifest>')
    htmlContent.append('<spine toc="ncx">')
    htmlContent.append(
        '<itemref idref="Cover.html" />\n<itemref idref="Title.html" />\n<itemref idref="Contents.html" />\n')
    for dirPath, dirNames, fileNames in os.walk(os.path.join(basePath, 'Text')):
        for file in sorted(fileNames, key=sortItemref):
            if file not in ('Cover.html', 'Title.html', 'Contents.html'):
                htmlContent.append('<itemref idref="' + file + '" />')
    htmlContent.append('</spine>')
    htmlContent.append(
        '<guide>\n<reference href="Text/Contents.html" title="Table Of Contents" type="toc" />')
    htmlContent.append(
        '<reference href="Text/Cover.html" title="Cover" type="cover"/>\n</guide>')
    htmlContent.append('</package>')
    with codecs.open(os.path.join(basePath, 'content.opf'), 'w', 'utf-8') as f:
        for line in htmlContent:
            f.write(line + '\n')

    # Generate toc.ncx
    htmlContent = []
    htmlContent.append(
        '<?xml version="1.0" encoding="UTF-8" standalone="no" ?>\n<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"\n"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">\n<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">\n<head>\n<meta content="0" name="dtb:depth"/>\n<meta content="0" name="dtb:totalPageCount"/>\n<meta content="0" name="dtb:maxPageNumber"/>\n</head>\n<docTitle>\n<text>' + newEpub.bookName + '</text>\n</docTitle>')
    htmlContent.append('<docAuthor>\n<text>' + newEpub.authorName + '</text>\n</docAuthor>\n<navMap>')
    htmlContent.append(
        '<navPoint id="Contents" playOrder="1">\n<navLabel>\n<text>封面</text>\n</navLabel>\n<content src="Text/Cover.html"/>\n</navPoint>')
    htmlContent.append(
        '<navPoint id="Contents" playOrder="2">\n<navLabel>\n<text>标题</text>\n</navLabel>\n<content src="Text/Title.html"/>\n</navPoint>')
    htmlContent.append(
        '<navPoint id="Contents" playOrder="3">\n<navLabel>\n<text>目录</text>\n</navLabel>\n<content src="Text/Contents.html"/>\n</navPoint>')
    playorder = 4
    for i in sorted(newEpub.chapter, key=lambda chapter: chapter[0]):
        htmlContent.append(
            '<navPoint id="' + str(i[0]) + '" playOrder="' + str(playorder) + '">\n<navLabel>\n<text>' + i[
                1] + '</text>\n</navLabel>\n<content src="Text/' + str(i[0]) + '.html"/>\n</navPoint>')
        playorder += 1
    htmlContent.append('</navMap>\n</ncx>')

    with codecs.open(os.path.join(basePath, 'toc.ncx'), 'w', 'utf-8') as f:
        for line in htmlContent:
            f.write(line + '\n')
Example #17
def aktuelBim(cikti='gorsel_veri'):
    """
    BİM weekly specials (aktüel) data

        Usage;

                aktuelBim("json_veri")
                aktuelBim("json_gorsel")
                aktuelBim("gorsel_veri")
                aktuelBim("basliklar")
    """

    url = f"https://www.bim.com.tr/default.aspx"
    kimlik = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    istek = requests.get(url, headers=kimlik, allow_redirects=True)
    corba = BeautifulSoup(istek.text, "lxml")

    sozluk = {}

    tarih = corba.find('a', class_='active subButton').text.strip()
    urun_alani = corba.find('div', class_='productArea')

    urun_rerero = []
    for urun in urun_alani.findAll('div', class_='inner'):
        host = 'https://www.bim.com.tr'
        try:
            urun_basligi = urun.find('h2', class_='title').text.strip()
            urun_linki = host + urun.a['href']
            urun_gorseli = host + urun.img['src'].replace(' ', '%20')
            urun_fiyati = urun.find('a',
                                    class_='gButton triangle').text.strip()

            urun_rerero.append({
                "urun_baslik": urun_basligi,
                "urun_link": urun_linki,
                "urun_gorsel": urun_gorseli,
                "urun_fiyat": urun_fiyati
            })
        except (AttributeError, TypeError):
            # skip product cards that are missing one of the fields
            pass

    sozluk.update({'tarih': tarih})
    sozluk.update({'urunler': urun_rerero})

    basliklar = [anahtar for anahtar in sozluk['urunler'][0].keys()]

    if cikti == 'json_veri':
        return sozluk

    elif cikti == 'json_gorsel':
        return json.dumps(sozluk,
                          indent=2,
                          sort_keys=False,
                          ensure_ascii=False)

    elif cikti == 'gorsel_veri':
        return tabulate(sozluk['urunler'], headers='keys', tablefmt='psql')

    elif cikti == 'basliklar':
        return basliklar

    else:
        return kullanim  # "kullanim" (usage text) is assumed to be defined elsewhere in the module


# print(aktuelBim("json_veri"))

# print(aktuelBim("json_gorsel"))

# print(aktuelBim("gorsel_veri"))

# print(aktuelBim("basliklar"))

# print(aktuelBim("alakasız bişi"))
Example #18
    return errorcheck


def wait_review():
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CLASS_NAME, "score_result")))
    time.sleep(0.6)


while n <= 174747:
    try:
        time.sleep(0.1)
        driver.get('https://movie.naver.com/movie/bi/mi/basic.nhn?code=' +
                   str(n))
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

    except UnexpectedAlertPresentException as e:
        Alert(driver).accept()
        n = n + 1
        continue

    except AttributeError as e:
        n = n + 1
        continue

    except Exception as ex:
        continue

    # when the server blocks requests, sleep for 15 minutes
    if (len(soup.get_text())) == 0:
Example #19
import requests
import pandas as pd
import csv   
import datetime
url = 'https://www.nigeriapropertycentre.com/for-sale/houses?q=for-sale+houses'
page = requests.get(url)
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
items = soup.find_all('div', {'class' : 'col-md-12'})
end_page_num = 23

filename = "nigeriaprop_houses.csv"
with open(filename, "w+") as f:

    writer = csv.writer(f)
    writer.writerow(["Listing_type", 'Location', "Price","Bedroom", 'Bathroom', 'Toilet', 'Parking'])
    i = 1
    while i <= end_page_num:

        r = requests.get("https://www.nigeriapropertycentre.com/for-sale/houses?q=for-sale+houses?page={}".format(i))

        soup = BeautifulSoup(r.text, "html.parser")
        items = soup.find_all('div', {'class' : 'col-md-12'})
        x = items[2:]
        
        for item in x:
            try:
                Listing_type = item.find('span').get_text()
            except:
                Listing_type = 'N/A'
            try:
Example #20
    r"or Choose a user",
    # This is a parsing bug in the tool
    r"argument ",
    # I can't find this one
    r"text",
]

# Sort regexes in descending order of their lengths. As a result, the
# longer phrases will be ignored first.
IGNORED_PHRASES.sort(key=lambda regex: len(regex), reverse=True)

# Compile regexes to improve performance. This also extracts the
# text using BeautifulSoup and then removes extra whitespaces from
# it. This step enables us to add HTML in our regexes directly.
COMPILED_IGNORED_PHRASES = [
    re.compile(' '.join(BeautifulSoup(regex, 'lxml').text.split()))
    for regex in IGNORED_PHRASES
]

SPLIT_BOUNDARY = '?.!'  # Used to split string into sentences.
SPLIT_BOUNDARY_REGEX = re.compile(r'[{}]'.format(SPLIT_BOUNDARY))

# Regexes which check capitalization in sentences.
DISALLOWED_REGEXES = [
    re.compile(regex) for regex in [
        r'^[a-z]',  # Checks if the sentence starts with a lower case character.
        r'^[A-Z][a-z]+[\sa-z0-9]+[A-Z]',  # Checks if an upper case character exists
        # after a lower case character when the first character is in upper case.
    ]
]
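A hedged sketch (not part of the original tool) of how the pieces defined above could fit together, reusing the names declared in the fragment: strip the ignored phrases from a rendered string, split it into sentences, and flag any sentence that matches a disallowed capitalization pattern:

# assumption: one way to combine the compiled phrase list and the sentence regexes above
def check_capitalization(text):
    # render the HTML to plain text and normalize whitespace, as the compiled phrases expect
    stripped = ' '.join(BeautifulSoup(text, 'lxml').text.split())
    for phrase_re in COMPILED_IGNORED_PHRASES:
        stripped = phrase_re.sub('', stripped)
    errors = []
    for sentence in SPLIT_BOUNDARY_REGEX.split(stripped):
        sentence = sentence.strip()
        if sentence and any(r.search(sentence) for r in DISALLOWED_REGEXES):
            errors.append(sentence)
    return errors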
Example #21
def get_total_num(content):
    soup = BeautifulSoup(content, 'html5lib')
    # the pattern matches "共N条", i.e. "N items in total"; strip the surrounding characters and keep N
    page_num = re.findall(re.compile('共[0-9]+条'), str(soup))
    page_num = int(page_num[0][1:-1])
    print(page_num)
    return page_num
Example #22
correct_list = []

aggregatePhotos = []

for i in range(len(input_list)):
    
    url = "https://www.everydayhealth.com/drugs/"+input_list[i]
    r = requests.get(url)
    
    if r.status_code != 200:
        print("Could not connect to " + url)
        print("Response : " + str(r.status_code))
        continue

    correct_list.append(input_list[i])
    # reuse the response already fetched above instead of requesting the same page twice
    bs = BeautifulSoup(r.text, 'html.parser')
    
    
    json_object = {"name": input_list[i] }
    photos = []
    b2 = bs.findAll("div", {"class" : "drug-image"})
    for b4 in b2:
        img_tag = b4.findChildren("img")[0]
        # prefer the real src and fall back to the lazy-loaded data-src; .get avoids a KeyError when an attribute is absent
        src = img_tag.attrs.get("src") or img_tag.attrs.get("data-src")
        if src:
            photos.append(get_image("https:" + src, input_list[i]))
    json_object["photos"] = photos
    aggregatePhotos.append(json_object)
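The loop above relies on a get_image() helper that is not shown; a hedged guess at what it might do, namely download each image and return the saved filename (the drug_images folder and the naming scheme are assumptions):

# assumption: a minimal get_image() that saves the picture locally and returns its path
import os
import requests


def get_image(src, drug_name):
    os.makedirs("drug_images", exist_ok=True)
    filename = os.path.join("drug_images", drug_name + "_" + os.path.basename(src).split("?")[0])
    resp = requests.get(src)
    if resp.status_code == 200:
        with open(filename, "wb") as handle:
            handle.write(resp.content)
    return filename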
Example #23
driver.get(
    'https://flights.makemytrip.com/makemytrip/search/O/O/E/1/0/0/S/V0/DEL_BOM_06-03-2018?contains=false&remove='
)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

html_page = driver.page_source
driver.quit()

soup = BeautifulSoup(html_page, "html.parser")
tags = soup('span')
c = soup.find_all(
    "span", {
        "class":
        "block logo_name hidden-xs visible-stb light_gray flt_number_less600 ng-binding ng-scope"
    })  #flNumber
d = soup.find_all("span", {"class": "num ng-binding"})  #price

handle = open('random.txt', 'w')

for i in c:
    handle.write("%s\n" % i.contents)
for i in d:
    handle.write("%s\n" % i.contents)
Example #24
def main():
    with open(item_list, 'r', encoding='utf-8') as spell_file, open(csv_file, 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter='\t', quotechar='"')
        # normalise the dash and multiplication entities to plain ASCII before parsing
        raw_html = spell_file.read().replace('&minus;', '-').replace('&mdash;', '--').replace('&ndash;', '-').replace('&times;', 'x').replace('—', '--').replace('–', '-')
        soup = BeautifulSoup(raw_html, 'html.parser')
        extract_items(soup, writer)
Example #25
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
count = 0
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stopwords = ['nimh', 'nih', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'monday', 'tuesday', 'friday', 'saturday','sunday','wednesday','thursday', "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "should've", "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would", "able", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "afterwards", "ah", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "announce", "another", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "arent", "arise", "around", "aside", "ask", "asking", "auth", "available", "away", "awfully", "b", "back", "became", "become", "becomes", "becoming", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "believe", "beside", "besides", "beyond", "biol", "brief", "briefly", "c", "ca", "came", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "couldnt", "date", "different", "done", "downwards", "due", "e", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "former", "formerly", "forth", "found", "four", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "happens", "hardly", "hed", "hence", "hereafter", "hereby", "herein", "heres", "hereupon", "hes", "hi", "hid", "hither", "home", "howbeit", "however", "hundred", "id", "ie", "im", 
"immediate", "immediately", "importance", "important", "inc", "indeed", "index", "information", "instead", "invention", "inward", "itd", "it'll", "j", "k", "keep", "keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "made", "mainly", "make", "makes", "many", "may", "maybe", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "moreover", "mostly", "mr", "mrs", "much", "mug", "must", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "nobody", "non", "none", "nonetheless", "noone", "normally", "nos", "noted", "nothing", "nowhere", "obtain", "obtained", "obviously", "often", "oh", "ok", "okay", "old", "omitted", "one", "ones", "onto", "ord", "others", "otherwise", "outside", "overall", "owing", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "said", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "shed", "shes", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "sufficiently", "suggest", "sup", "sure", "take", "taken", "taking", "tell", "tends", "th", "thank", "thanks", "thanx", "thats", "that've", "thence", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "theyd", "theyre", "think", "thou", "though", "thoughh", "thousand", "throug", "throughout", "thru", "thus", "til", "tip", "together", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un", "unfortunately", "unless", "unlike", "unlikely", "unto", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "via", "viz", "vol", "vols", "vs", "w", "want", "wants", "wasnt", "way", "wed", "welcome", "went", "werent", "whatever", "what'll", "whats", "whence", "whenever", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "whim", "whither", "whod", "whoever", "whole", "who'll", "whomever", "whos", "whose", "widely", "willing", "wish", "within", "without", "wont", "words", "world", "wouldnt", "www", "x", "yes", "yet", "youd", "youre", "z", "zero", "a's", "ain't", "allow", "allows", "apart", "appear", "appreciate", "appropriate", "associated", "best", "better", "c'mon", "c's", "cant", "changes", "clearly", "concerning", 
"consequently", "consider", "considering", "corresponding", "course", "currently", "definitely", "described", "despite", "entirely", "exactly", "example", "going", "greetings", "hello", "help", "hopefully", "ignored", "inasmuch", "indicate", "indicated", "indicates", "inner", "insofar", "it'd", "keep", "keeps", "novel", "presumably", "reasonably", "second", "secondly", "sensible", "serious", "seriously", "sure", "t's", "third", "thorough", "thoroughly", "three", "well", "wonder", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", 
"N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "co", "op", "research-articl", "pagecount", "cit", "ibid", "les", "le", "au", "que", "est", "pas", "vol", "el", "los", "pp", "u201d", "well-b", "http", "volumtype", "par", "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a1", "a2", "a3", "a4", "ab", "ac", "ad", "ae", "af", "ag", "aj", "al", "an", "ao", "ap", "ar", "av", "aw", "ax", "ay", "az", "b1", "b2", "b3", "ba", "bc", "bd", "be", "bi", "bj", "bk", "bl", "bn", "bp", "br", "bs", "bt", "bu", "bx", "c1", "c2", "c3", "cc", "cd", "ce", "cf", "cg", "ch", "ci", "cj", "cl", "cm", "cn", "cp", "cq", "cr", "cs", "ct", "cu", "cv", "cx", "cy", "cz", "d2", "da", "dc", "dd", "de", "df", "di", "dj", "dk", "dl", "do", "dp", "dr", "ds", "dt", "du", "dx", "dy", "e2", "e3", "ea", "ec", "ed", "ee", "ef", "ei", "ej", "el", "em", "en", "eo", "ep", "eq", "er", "es", "et", "eu", "ev", "ex", "ey", "f2", "fa", "fc", "ff", "fi", "fj", "fl", "fn", "fo", "fr", "fs", "ft", "fu", "fy", "ga", "ge", "gi", "gj", "gl", "go", "gr", "gs", "gy", "h2", "h3", "hh", "hi", "hj", "ho", "hr", "hs", "hu", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ic", "ie", "ig", "ih", "ii", "ij", "il", "in", "io", "ip", "iq", "ir", "iv", "ix", "iy", "iz", "jj", "jr", "js", "jt", "ju", "ke", "kg", "kj", "km", "ko", "l2", "la", "lb", "lc", "lf", "lj", "ln", "lo", "lr", "ls", "lt", "m2", "ml", "mn", "mo", "ms", "mt", "mu", "n2", "nc", "nd", "ne", "ng", "ni", "nj", "nl", "nn", "nr", "ns", "nt", "ny", "oa", "ob", "oc", "od", "of", "og", "oi", "oj", "ol", "om", "on", "oo", "oq", "or", "os", "ot", "ou", "ow", "ox", "oz", "p1", "p2", "p3", "pc", "pd", "pe", "pf", "ph", "pi", "pj", "pk", "pl", "pm", "pn", "po", "pq", "pr", "ps", "pt", "pu", "py", "qj", "qu", "r2", "ra", "rc", "rd", "rf", "rh", "ri", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "rv", "ry", "s2", "sa", "sc", "sd", "se", "sf", "si", "sj", "sl", "sm", "sn", "sp", "sq", "sr", "ss", "st", "sy", "sz", "t1", "t2", "t3", "tb", "tc", "td", "te", "tf", "th", "ti", "tj", "tl", "tm", "tn", "tp", "tq", "tr", "ts", "tt", "tv", "tx", "ue", "ui", "uj", "uk", "um", "un", "uo", "ur", "ut", "va", "wa", "vd", "wi", "vj", "vo", "wo", "vq", "vt", "vu", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y2", "yj", "yl", "yr", "ys", "yt", "zi", "zz"]
urls = []
Finals = set()
with open("urlfiles.txt", "r") as file:
    urls = file.readlines()
for each in urls:
    if len(Finals) < 800:
        if "shtml" in each:
            page = urlopen(each.strip())
            soup = BeautifulSoup(page, 'html.parser')
            paragraphs = soup.findAll('p')
            data = ""
            for single in paragraphs:
                data += single.text.strip()
            # keep the page only if neither error message appears in it
            if "Page Not Found" not in data and 'possible that the page is temporarily unavailable' not in data:
                with open('ccdumpbig.txt', 'a', encoding='utf8') as f:
                    f.write(data + "\n")
                data = re.sub(r"http\S+", "", data)
                # Remove emoticons
                data = data.encode('ascii', 'ignore').decode('ascii')
                data = tokenizer.tokenize(data)
                final_data = ""
                for w in data:
                    if w.isalpha() and w.lower() not in stopwords:
                        # Stem it
Example #26
import csv
import time
import os
import unicodedata
import urllib.request
from bs4 import BeautifulSoup

with open('gov_newspapers.csv', 'w') as f:
    writer = csv.DictWriter(f,
                            fieldnames=('name', 'state', 'newspaper',
                                        'article headline', 'date', 'url',
                                        'body text', 'negative', 'positive',
                                        'neutral', 'composite'))
    writer.writeheader()
    news = {}
    for i in range(6):
        mo_url = "https://www.stltoday.com/search/?f=html&q=mike+parson&d1=2018-04-01&d2=2019-07-01&s=start_time&sd=desc&l=100&t=article&nsa=eedition&app%5B0%5D=editorial&o={}00".format(
            i)
        mo_search = urllib.request.urlopen(mo_url)
        parsed_mo = BeautifulSoup(mo_search.read())
        h3_tag = parsed_mo.find_all('h3', {'class': 'tnt-headline'})
        for headline in range(len(h3_tag)):
            #time.sleep(2)
            try:
                news['name'] = "Mike Parsons"
                news['state'] = 'MO'
                news['newspaper'] = 'St. Louis Post Dispatch'
                news['article headline'] = unicodedata.normalize(
                    'NFKD', h3_tag[headline].get_text()).strip()
                news['url'] = 'https://www.stltoday.com' + h3_tag[
                    headline].find('a')['href']
                article = urllib.request.urlopen(news['url'])
                parsed_article = BeautifulSoup(article.read())
                news['date'] = parsed_article.find('time').get_text()
                news['body text'] = parsed_article.find(
Example #27
from bs4 import BeautifulSoup
import requests
import re


r = requests.get('https://www.jimsmowing.net')

content = r.text

soup = BeautifulSoup(content, 'html.parser')
#print(soup.find_all('p')[4].get_text())
Example #28
import os
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

from var import List

driver = webdriver.Chrome("./driver/chromedriver")

feel = List.img_list

for item in feel:
    time.sleep(1)
    driver.get("https://www.google.co.jp/imghp?hl=ja&tab=wi&ogbl")
    driver.find_element_by_name("q").send_keys(item, Keys.ENTER)
    # store the current page URL in a variable
    current_url = driver.current_url
    html = requests.get(current_url)
    bs = BeautifulSoup(html.text, "lxml")
    images = bs.find_all("img", limit=10 + 1)
    # create the img folder
    if not os.path.isdir("img/ans"):
        os.makedirs("img/ans")
    # loop over the fetched images and save them
    for i, img in enumerate(images, start=1):
        src = img.get("src")
        try:
            response = requests.get(src)
            with open("img/ans/" + item + "{}.jpg".format(i - 1), "wb") as f:
                f.write(response.content)
        except requests.exceptions.MissingSchema:
            pass

driver.quit()
Example #29
def collect_data(process_num, course_url_list, course_name_list,
                 course_dur_list):
    # This will get the keywords from faculty file and put it into a dictionary
    with open('C:/Users/veye/Dropbox/Scrapping/Others/faculty.csv',
              'rt',
              encoding='utf-8'
              ) as List:  # Can use the faculty file from the Dropbox also.
        reader = csv.reader(List)
        mydict = {rows[0]: rows[1] for rows in reader}

    # This part will scrape the page
    with open('C:/Scrape/your_uni_folder/ExtractedData_' + '_all' + '.csv',
              'at',
              newline='',
              encoding='utf-8-sig') as website:
        writer = csv.writer(website)

        while True:
            num_loop = 0
            while num_loop < len(course_url_list):

                req = requests.get(course_url_list[num_loop])
                soup = BeautifulSoup(req.content, 'lxml')

                # details['Course Name', 'Level', 'Faculty', 'Duration', 'Duration Type', 'URL', 'Description', 'Keywords', 'ScrapeAll']
                details = ['', '', '', '', '', '', '', '', '', '']

                ##

                # Course name
                if 'null' in course_name_list[num_loop]:
                    ""
                else:
                    details[0] = course_name_list[num_loop]

                #-------------------------------  change code here ---------------------------------------------

                # Duration Text
                durationText = soup.find('div', {
                    'class': 'someclassnameyouhavetofind'
                }).text

                # Description
                descText = soup.find('div', {
                    'class': 'someclassnameyouhavetofind'
                }).text
                details[6] = clean(descText)

                #--------------- you don't have to change anything past here unless you really need to --------------

                # Duration and Duration Type
                # this returns a pair duration (int), durationtype (string)
                durationPair = convertDuration(durationText)
                details[3] = durationPair[0]
                details[4] = durationPair[1]

                # Both the code for levels and faculty can be changed to suit the website that you are doing.
                # Levels
                word = details[0]
                lock = 0
                for level, key in level_key.items():
                    for each in key:
                        for wd in word.split():
                            if each.lower() == wd.lower(
                            ):  # Testing the equal, might change back to in
                                details[1] = level
                                lock = 1
                                break
                        if lock == 1:
                            break
                    if lock == 1:
                        break

                # Faculty
                loop_must_break = False
                for a in details[0].split():
                    for fac, key in mydict.items():
                        for each in key.split(','):
                            if each.replace("'", '').title() in a:
                                print("\t\t\t" + each + '  in  ' + details[0] +
                                      ' from ' + course_url_list[num_loop])
                                details[2] = fac
                                loop_must_break = True
                                break
                        if loop_must_break:
                            break
                    if loop_must_break:
                        break

                # URL
                details[5] = req.url

                # Scrape All
                [
                    s.extract() for s in soup(
                        ['style', 'script', '[document]', 'head', 'title'])
                ]
                visible_text = repr(soup.get_text().replace(
                    r'\\n',
                    ' ').replace('\n', '').replace('\\', '').replace(', ', ''))
                visible_text = re.sub(r'[^\x00-\x7f]', r' ', visible_text)
                visible_text = ' '.join(visible_text.split())

                details[7] = str(repr(visible_text))

                writer.writerow(details)
                print(details)
                print("Page " + str(num_loop) + '/' +
                      str(len(course_url_list)) + " from " + '_all')
                time.sleep(3)
                num_loop += 1
            print("\n" + str(len(course_url_list)) + " in the queue of " +
                  '_all')

            print("\n" + '_all' + " has exited the loop")
            break
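collect_data() also calls convertDuration() and clean(), which are not shown; a hedged sketch of one way convertDuration() might turn text such as "3 years full-time" into the (duration, duration type) pair the caller expects (an assumption, not the original helper):

# assumption: parse the first "<number> <unit>" pair out of the scraped duration text
import re


def convertDuration(duration_text):
    match = re.search(r'(\d+)\s*(year|month|week|day|semester)s?', duration_text, re.IGNORECASE)
    if not match:
        return '', ''
    return int(match.group(1)), match.group(2).lower() + 's'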
Example #30
def get_soup(url):
    response = requests.request('get', url=url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup
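A quick usage sketch (example.com is just a placeholder URL):

# fetch a page and print its <title>
soup = get_soup('https://example.com')
print(soup.title.text)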