async def active_name_generator():
    a = True
    i = 1
    links = []
    while a:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                    f"https://www.edsm.net/en/expeditions/p/{i}") as response:
                async with session.get(
                        f"https://www.edsm.net/en/expeditions/p/{i - 1}"
                ) as pre_response:
                    current = await response.text()
                    previous = await pre_response.text()
                    if current == previous or i == 1:
                        html = Bs4(current, "html.parser")
                        for link in html.find_all("a"):
                            href = link.get("href")
                            if href and "/en/expeditions/summary/" in href and href not in links:
                                links.append(href)
                        i += 1
                    else:
                        a = False

    ongoing = []
    for link in links:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"https://www.edsm.net/{link}") as response:
                raw_html = await response.text()
        html = Bs4(raw_html, "html.parser")
        if "This expedition is finished." not in html.get_text():
            ongoing.append(link)

    names = {}
    for link in ongoing:
        expedition = link[link.find("name") + 5:]
        expedition_clean = unquote(expedition).replace("+", " ")
        link = link.replace("summary", "participants")
        async with aiohttp.ClientSession() as session:
            async with session.get(f"https://www.edsm.net/{link}") as response:
                raw_html = await response.text()
        html = Bs4(raw_html, "html.parser")
        for text in html.find_all("a"):
            href = text.get("href")
            if href and "/en/user/profile/" in href:
                # The commander name follows "cmdr/" in the profile URL and is URL-encoded
                spot = href.find("cmdr")
                name = href[spot + 5:]
                name_clean = unquote(name).replace("+", " ")
                if name_clean not in names:
                    names[name_clean] = []
                names[name_clean].append(expedition_clean)

    return names
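A minimal driver sketch (not part of the original example; it assumes the module defining active_name_generator already imports aiohttp, BeautifulSoup as Bs4, and urllib.parse.unquote):

import asyncio

if __name__ == "__main__":
    expedition_names = asyncio.run(active_name_generator())
    for cmdr, expeditions in expedition_names.items():
        print(cmdr, "->", ", ".join(expeditions))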
Example No. 2
def get_next_url(urllist):
    url_list = []
    for url in urllist:
        response = requests.get(url, headers=headers)
        soup = Bs4(response.text, "html.parser")
        for link in soup.find_all("a"):
            href = link.get("href")
            if not href:
                continue
            if href.startswith("/"):
                href = head_url + href
                url_list.append(href)
                # Portal pages get appended a second time; duplicates collapse in the set below
                if href.startswith("http://192.168.6.27:6030/portals/hd"):
                    url_list.append(href)
    url_list2 = set(url_list)
    for url_ in url_list2:
        res = requests.get(url_)
        if res.status_code == 200:
            print(url_)
    print(len(url_list2))
    get_next_url(url_list2)
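A hedged sketch of the module-level names the crawler above relies on (headers and head_url are not defined in this excerpt; the values below are placeholders) and of a seed call. Note the recursion carries no visited set or depth limit, so on any site with link cycles it will keep re-crawling until Python's recursion limit is reached:

import requests
from bs4 import BeautifulSoup as Bs4

headers = {"User-Agent": "Mozilla/5.0"}   # placeholder request headers
head_url = "http://192.168.6.27:6030"     # placeholder base matching the prefix check above

get_next_url([head_url])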
Example No. 3
def get_next_url(urllist):
    url_list = []
    for url in urllist:
        response = requests.get(url, headers=headers)
        soup = Bs4(response.text, "lxml")
        for link in soup.find_all("a"):
            href = link.get("href")
            if not href:
                continue
            if href.startswith("/"):
                href = head_url + href
                url_list.append(href)
                # Site-internal pages get appended a second time; duplicates collapse in the set below
                if href.startswith("http://www.xxx.com.cn"):
                    url_list.append(href)
    url_list2 = set(url_list)
    for url_ in url_list2:
        res = requests.get(url_)
        if res.status_code == 200:
            print(url_)
    print(len(url_list2))
    get_next_url(url_list2)
Example No. 4
def get_title_link(item_url):
    """将数据页面的url记录"""
    links_file = open('links.txt', 'a')
    # data_list = []
    html = requests.get(item_url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    # Grab the last-page pager link
    aa_list = soup.select('a')
    useable_a = []
    for aa in aa_list:
        if aa.get('href') and aa.get('href').startswith('/news/?page='):
            useable_a.append(aa.get('href'))
    if len(useable_a) == 0:
        return
    i_need = useable_a[-1]
    pages = int(i_need[12:i_need.find('&')])
    params = i_need[i_need.find('&'):]
    # Data from the first page
    links_file.write(item_url + '\n')
    # get_item(data_list, item_url)
    # Data from the remaining pages
    for i in range(2, pages + 1):
        uri = 'https://grain.sci99.com/news/?page=' + str(i) + params
        print(uri)
        links_file.write(uri + '\n')
        # get_item(data_list, uri)
    links_file.close()
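A hedged invocation sketch (the head headers dict is defined elsewhere in the original script; the value and the listing URL below are placeholders):

import requests
from bs4 import BeautifulSoup as Bs4

head = {"User-Agent": "Mozilla/5.0"}                    # placeholder headers
get_title_link("https://grain.sci99.com/news/?page=1")  # a listing page whose pager links start with /news/?page=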
Example No. 5
def main():
    global response
    photo_dir_name = keyword.replace(' ', '-') + '-photos'
    chrome.get(f'https://www.google.com/imghp?hl=en&q={keyword}')
    chrome.find_element_by_css_selector('input[name=q]').send_keys(Keys.ENTER)
    scroll_pause_time = 0.10
    scroll_length = 200
    scroll_position = 0
    for _ in range(scroll_to_bottom_number):
        time.sleep(1.5)
        page_height = int(
            chrome.execute_script('return document.body.scrollHeight'))
        while scroll_position < page_height:
            scroll_position = scroll_position + scroll_length
            chrome.execute_script('window.scrollTo(0, ' +
                                  str(scroll_position) + ');')
            time.sleep(scroll_pause_time)
        time.sleep(1.5)
    source = chrome.page_source
    chrome.close()
    soup = Bs4(source, 'html.parser')
    photos = [
        photo for photo in soup.find(attrs={
            'id': 'islmp'
        }).find_all('img')
    ]
    if not os.path.exists(photo_dir_name):
        os.mkdir(photo_dir_name)
    os.chdir(photo_dir_name)
    with open('sources.txt', 'a') as sources:
        for x, photo in enumerate(photos):
            try:
                key = 'src' if 'src' in photo.attrs else 'data-src'
                if photo[key].startswith('data'):
                    mime_type = re.search(':(.+);',
                                          photo[key]).group(1).split('/')[1]
                else:
                    sources.write(photo[key] + '\n')
                    response = requests.get(photo[key], stream=True)
                    if not response.ok:
                        continue
                    mime_type = response.headers['content-type'].split('/')[1]
                with open(f'{keyword.replace(" ", "-")}-{x}.{mime_type}',
                          'wb') as handle:
                    if photo[key].startswith('data'):
                        handle.write(
                            base64.decodebytes(photo[key].split('base64,')
                                               [1].encode('unicode_escape')))
                    else:
                        for block in response.iter_content(1024):
                            if not block:
                                break
                            handle.write(block)
            except Exception as e2:
                print(f'There was an error: {e2}')
                continue
    os.chdir('..')
    shutil.make_archive(photo_dir_name, 'zip', photo_dir_name)
    shutil.rmtree(photo_dir_name)
    print('Done!')
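A hedged sketch of the globals the downloader above expects (names are inferred from the function body; the values are placeholders). find_element_by_css_selector is the pre-Selenium-4 API, so this assumes Selenium 3.x:

import base64
import os
import re
import shutil
import time

import requests
from bs4 import BeautifulSoup as Bs4
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

keyword = "mountain lake"        # placeholder search term
scroll_to_bottom_number = 3      # placeholder number of scroll passes
chrome = webdriver.Chrome()      # assumes chromedriver is on PATH

main()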
Example No. 6
def get_banner_page_link():
    file = open('banner_link.txt', 'a')
    for i in range(1, 82):
        html = requests.get(head_url + '/page/' + str(i), headers=head)
        html.encoding = 'utf-8'
        soup = Bs4(html.text, 'lxml')
        a_list = soup('a', attrs={'class': 'item-href'})
        for a in a_list:
            print(a.get('href'))
            file.write(a.get('href') + '\n')
    file.close()
Example No. 7
    def parser_notice(self, html):
        soup = Bs4(html, 'html.parser')
        notices = soup.select(
            '#content  > div:nth-of-type(1) > div:nth-of-type(1) > div > table > tbody > tr > td.board-title > span')
        pks = soup.select('#content > div:nth-of-type(1) > div:nth-of-type(1) > div > table > tbody > tr')

        for index, pk in enumerate(pks):
            title = notices[index].text
            url = 'https://cafe.bithumb.com/view/board-contents/' + re.findall(r"'([a-zA-Z0-9,\s]*)'", pk['onclick'])[0]
            self.current_notice_list.append('<a href="{}">{}</a>'.format(url, self.to_html(title)))
Example No. 8
def get_first_url():
    list_href = []
    response = requests.get(head_url, headers=headers, cookies=cookies)
    soup = Bs4(response.text, "html.parser")
    urls_li = soup.select("#mainmenu_top > div > div > ul > li")
    for url_li in urls_li:
        urls = url_li.select("a")
        for url in urls:
            url_href = url.get("href")
            list_href.append(head_url + url_href)
    # Deduplicate once, outside the loops, so an empty page still returns a list
    out_url = list(set(list_href))
    return out_url
Example No. 9
    def parser_notice(self, html):
        soup = Bs4(html, 'html.parser')
        pre = soup.select_one('body > pre')
        html = pre.text
        json_val = json.loads(html)
        notices = json_val.get('results')

        for notice in notices:
            title = notice.get('title')
            url = 'https://coinone.co.kr/talk/notice/detail/{}'.format(
                notice.get('id'))
            self.current_notice_list.append('<a href="{}">{}</a>'.format(
                url, self.to_html(title)))
Example No. 10
def html3_handle(html,courseid,coursename):

    soup = bs3.BeautifulSoup(html)
    soup1 = Bs4(html,'lxml')
    if len(soup1.findAll("img")):
        with open('未爬取科目.txt','w+') as f:
            str1 = courseid + "|" + coursename + "\r\n"
            f.write(str1)
        time.sleep(1)
        raise Exception("Page contains images and cannot be parsed")

    for tag in soup.findAll('status'):
        tag.string = "1*"
    for tag in soup.findAll('count'):
        tag.string = "1*"
    for tag in soup.findAll('score'):
        tag.string = "1*"
    ali = soup.findAll(text=True)
    bli = []
    for x in ali:
        if x.isspace():
            pass
        else:
            x += '||'
            bli.append(x)

    astr = ' '.join(bli)
    bstr = p1.sub('', astr)

    cli = bstr.split("1*||")

    array1 = []
    p = re.compile(r"<.*?>")
    p2 = re.compile(r"参见教材.*?。|参见教材P\d*|参考教材.*?。|参考教材P\d*")
    for x in cli:

        if x.isspace():
            pass
        else:
            xli = x.split("||")
            dli = []
            for x in xli:
                if len(x):
                    a = p2.sub('', x)
                    str1 = p.sub('', a).strip()
                    if not str1.isspace():
                        dli.append(str1)
            if len(dli) > 3:
                array1.append(dli)
    print(array1)
    return array1
Example No. 11
def get_more_urls(m_head_url, back_url):
    """获取所有更多的链接"""
    html = requests.get(m_head_url + back_url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    a_list = soup.select('a')
    all_url_list = []
    for a_tag in a_list:
        if str(a_tag).find("更多") != -1:  # anchor text "更多" means "More"
            url_href = a_tag.get('href')
            if url_href and url_href.startswith('/news'):
                print(m_head_url + url_href)
                all_url_list.append(m_head_url + url_href)
    return all_url_list
Example No. 12
def get_first_url():
    list_href = []
    response = requests.get(head_url, headers=headers)
    soup = Bs4(response.text, "lxml")
    urls_li = soup.select("#__next > div > div > header > div > div > div")
    for url_li in urls_li:
        urls = url_li.select("a")
        for url in urls:
            url_href = url.get("href")
            list_href.append(head_url + url_href)
    # Deduplicate once, outside the loops
    out_url = list(set(list_href))
    print(out_url)
    return out_url
Example No. 13
def get_item(url):
    data_con = []
    html = requests.get(url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    ul_list = soup.select('ul')
    for ul in ul_list:
        ul_id = ul.get('id')
        if ul_id and ul_id == 'list':
            a_list = ul.select('a')
            for a_item in a_list:
                href = a_item.get('href')
                title = list(a_item.select('h2')[0].stripped_strings)[0]
                data_con.append([title, href])
    return data_con
Example No. 14
def get_li_a_link():
    html = requests.get(head_url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    ul_con = soup.select('ul')
    a_list = []
    for ul in ul_con:
        if not ul.get('class'):
            continue
        if ul.get('class')[0] in ('menu_ul_left', 'menu_ul_right'):
            a_s = ul.select('a')
            for a in a_s:
                a_list.append(a.get('href'))
    return a_list
Example No. 15
def get_item(uri):
    """通过url读取数据页面的数据"""
    data_con = []
    html = requests.get(uri, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    ul_list = soup.select('ul')
    for ul in ul_list:
        class_name = ul.get('class')
        if class_name and class_name[0].startswith('ul_w488'):
            a_list = ul.select('a')
            for a_item in a_list:
                href = head_url + '/news/' + a_item.get('href')
                title = a_item.string.strip()
                data_con.append([title, href])
    return data_con
Example No. 16
def get_other_page(sheet, page_url):
    my_file = open('data.txt', 'a')
    html = requests.get(page_url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    # Grab every div
    my_div_list = soup.select('div')
    for div in my_div_list:
        if div.get('class') and div.get('class')[0] == 'side_con_left':
            title = div.select('a')[0].get('title')
            href = head_url + div.select('a')[0].get('href')
            data = [title, href]
            global row
            row = row + 1
            excel_write.write_excel(sheet, row, data)
            my_file.write(str(data) + '\n')
    my_file.close()
Example No. 17
def main(options):
    # Handle DEBUG
    logger.info("Parsing Eng Phys Grad Students")

    if options.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Debug Enabled")

        pkl_file = os.path.join(os.path.dirname(__file__), PKL_FILE)
        if not os.path.exists(pkl_file):
            logger.debug("Debug HTML file does not exist, parsing web site")
            r = requests.get(GRAD_LIST_URL)
            html_text = r.text
            with open(pkl_file, 'w') as f:
                f.write(html_text)
        else:
            logger.debug("Reading Debug HTML file")
            with open(pkl_file, 'r') as f:
                html_text = f.read()
    else:
        logger.info("Parsing Web Site")
        r = requests.get(GRAD_LIST_URL)
        html_text = r.text

    bs = Bs4(html_text, "html.parser")  # pin a parser to avoid the "no parser specified" warning
    # Student table
    students = bs.find(id="bottom_content").table

    if options.file is not None:
        filename = options.file
    else:
        filename = DEFAULT_DATA_CSV

    logger.info("Saving student data to {}".format(filename))
    with open(filename, 'w') as f:
        writer = csv.DictWriter(
            f,
            ["first", "last", "level", "email", "supervisor", "room", "ext"])
        writer.writeheader()
        for row in students.find_all("tr")[1:]:
            data = strip_row(row)
            writer.writerow(data)

    logger.info("Finished parsing student data")
Example No. 18
def get_first_url():
    list_href = []
    response = requests.get(head_url)
    soup = Bs4(response.text, "lxml")
    urls_li = soup.find_all(["a", "br", "tr"])
    for url_li in urls_li:
        urls = url_li.select("a")
        for url in urls:
            url_href = url.get("href")
            list_href.append(head_url + url_href)
    # Deduplicate once, outside the loops, so an empty page still returns a list
    out_url = list(set(list_href))
    return out_url
Example No. 19
            print("图片已存在")
    except:
        print("图片获取失败")


def get_banner_page_link():
    file = open('banner_link.txt', 'a')
    for i in range(1, 82):
        html = requests.get(head_url + '/page/' + str(i), headers=head)
        html.encoding = 'utf-8'
        soup = Bs4(html.text, 'lxml')
        a_list = soup('a', attrs={'class': 'item-href'})
        for a in a_list:
            print(a.get('href'))
            file.write(a.get('href') + '\n')
    file.close()


if __name__ == '__main__':
    banner = open('banner.txt', 'a')
    banner_file = open('banner_link.txt')
    link_con = banner_file.readlines()
    for link in link_con:
        html = requests.get(link[0:-1], headers=head)
        html.encoding = 'utf-8'
        soup = Bs4(html.text, 'lxml')
        div = soup('div', attrs={'class': 'inspiration-images'})[0]
        image_links = div.select('img')
        for image_link in image_links:
            print(image_link.get('src'))
            save_image(image_link.get('src'))
Example No. 20
    os.mkdir(os.path.join(os.getcwd(), 'images'))

except:
    pass

os.chdir(os.path.join(os.getcwd(), 'images'))

profile_url = 'https://www.instagram.com/pic.the.nature/'
driver.get(profile_url)

driver.implicitly_wait(10)

scroll(driver, 4)

source = driver.page_source
html_soup = Bs4(source, 'lxml')
html_soup.prettify()

images = html_soup.find_all('img', {'class': 'FFVAD'})

for image in images:
    name = image['alt'][:10]
    link = image['src']
    try:
        with open(name.replace('.', '-').replace('_', '-').replace(' ', '-').replace('/', '') + '.png', 'wb') as f:
            img = requests.get(link)
            f.write(img.content)

    except:
        print('fail')
Example No. 21
base_google_url = "https://www.google.ca/search?q="
base_duck_duck_go = "https://duckduckgo.com/html?q="

terme = "pizza toute garnie"
terme_converti_en_html = parse.quote_plus(terme)

url_de_recherche_google = base_google_url + terme_converti_en_html
url_de_recherche_duck_duck_go = base_duck_duck_go + terme_converti_en_html

request_google = url.Request(url_de_recherche_google, None, headers)
request_duck_duck_go = url.Request(url_de_recherche_duck_duck_go, None, headers)

with url.urlopen(request_google) as response:
    html = response.read().decode('utf8')
    soup = Bs4(html, 'html.parser')

    g = soup.find_all("div", {"class": "g"})
    h3_results = []
    links_results = []
    print(g)
    g_list = list(g)
    descriptions_list = []
    for div_g in g:
        h3s = div_g.find_all("h3", {"class": "LC20lb"})
        for h3 in h3s:
            a = h3.text
            h3_results.append(a)
        hrefs = div_g.find_all("a")
        c = []
        for href in hrefs:
Example No. 22
import requests
from bs4 import BeautifulSoup as Bs4
from selenium import webdriver

head = {
    'User-Agent':
    'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
head_url = 'https://www.lawtime.cn/dongchengqu/lawfirm/'

if __name__ == '__main__':
    driver = webdriver.Chrome()
    file = open('dongcheng.txt', 'a')
    # 79
    for i in range(1, 23):
        driver.get(head_url + 'p' + str(i) + '?order=1')
        soup = Bs4(driver.page_source, 'lxml')
        lay_info_div_con = soup('div', attrs={'class': 'law-info'})
        for lay_info_div in lay_info_div_con:
            lay_name = lay_info_div.select('a')[0].text
            lay_phone = lay_info_div.select('a')[1].text
            people = lay_info_div.select('span')[0].text
            if 10 <= int(people) <= 50:
                file.write(lay_name + ' ' + lay_phone + ' ' + people + '人\n')  # '人' means "people" (staff count)
            print(lay_name, lay_phone, people)
    file.close()
Example No. 23
def main():
    url = 'http://ketqua.net/'
    r = requests.get(url)
    soup = Bs4(r.content, 'html.parser')
    target = soup.find('td', attrs={'class': 'bor f2 db'})
    return target.get_text()
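Usage is a single call (assuming requests and Bs4 are imported as in the surrounding examples); the function returns the text of the first <td class="bor f2 db"> cell on the page:

if __name__ == "__main__":
    print(main())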
Example No. 24
async def db_builder(host: str,
                     database: str,
                     table: str = "Articles",
                     create_table=True,
                     user: str = "postgres",
                     passfile=None,
                     password: str = None,
                     ssl=False,
                     port: int = None):
    """Builds an article database, with all articles to date."""
    # Establishing DB Connection
    connection = await asyncpg.connect(host=host,
                                       port=port,
                                       user=user,
                                       password=password,
                                       passfile=passfile,
                                       database=database,
                                       ssl=ssl)

    # Make table if one is not provided
    if create_table:
        table = table.strip()
        await connection.execute(f"""
        CREATE TABLE "{table}" (
        "ID" serial NOT NULL, 
        "Title" text, 
        "UID" text, 
        "dateReleased" date, 
        "dateAdded" date, 
        "Text" text,
        PRIMARY KEY ("ID"));
        ALTER TABLE "{table}" OWNER to "{user}";
        """)

    # Collecting Links and articles
    links = []
    date_now = datetime.datetime.now().strftime("%Y-%m-%d")

    async with aiohttp.ClientSession() as session:
        async with session.get(
                "https://community.elitedangerous.com/#") as response:
            bs4 = Bs4(await response.text(), "html.parser")

    for entry in bs4.find_all(
            id="block-frontier-galnet-frontier-galnet-block-filter"):
        for link in entry.find_all("a"):
            links.append(link.get("href"))
    links.reverse()

    for result in links:
        date_article = datetime.datetime.strptime(
            result.replace("#", "")[re.search("^/galnet/", result).end():],
            "%d-%b-%Y")
        if date_article.year >= 3300:
            date_article = date_article.replace(
                year=(date_article.year - articlesearch.GAME_YEAR_OFFSET))
        date_article = date_article.strftime("%Y-%m-%d")

        async with aiohttp.ClientSession() as session:
            async with session.get(
                    f"https://community.elitedangerous.com{result}"
            ) as response:
                bs4 = Bs4(await response.text(), "html.parser")

        for entry in bs4.find_all("h3",
                                  {"class": "hiLite galnetNewsArticleTitle"}):
            entry_title = entry.get_text().strip().replace("'", "''")
            if entry_title == "" or entry_title is None:
                entry_title = "No Title Available"

            entry_uid = entry.find("a").get(
                "href")[re.search("^/galnet/uid/",
                                  entry.find("a").get("href")).end():]

            async with aiohttp.ClientSession() as session:
                async with session.get(
                        f"https://community.elitedangerous.com/galnet/uid/{entry_uid}/"
                ) as response:
                    bs4 = Bs4(await response.text(), "html.parser")
            text = unquote(bs4.find_all("p")[1].get_text().replace("'", "''"))

            await connection.execute(
                f"""
            INSERT INTO "{table}"("Title", "UID", "dateReleased", "dateAdded", "Text")
            VALUES($1, $2, $3, $4, $5);""", entry_title, entry_uid,
                date_article, date_now, text)

    await connection.close()

    # Dumping Settings For Future Use
    if os.path.exists("Settings.json"):
        os.remove("Settings.json")

    settings = await articlesearch.fetch_settings()
    settings["previous version"] = settings["version"]

    settings["host"] = host
    settings["database"] = database
    settings["table"] = table
    settings["user"] = user
    settings["passfile"] = passfile
    settings["password"] = password
    settings["ssl"] = ssl
    settings["port"] = port

    with open("Settings.json", "w+") as settings_file:
        json.dump(settings, settings_file, indent=2)
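A hedged invocation sketch (the connection values are placeholders; the surrounding module is assumed to import asyncpg, aiohttp, Bs4, re, datetime, json, os, unquote, and the articlesearch helpers used above):

import asyncio

asyncio.run(db_builder(host="localhost",
                       database="galnet",
                       table="Articles",
                       user="postgres",
                       password="example"))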
Example No. 25
async def update():
    """Looks for new articles."""
    # Load Settings
    settings = await fetch_settings()

    table = settings["table"]
    async with aiohttp.ClientSession() as session:
        async with session.get(
                "https://community.elitedangerous.com/") as response:
            html = Bs4(await response.text(), "html.parser")

    connection = await connect()

    uids = []
    new_articles = set()

    uid_records = await connection.fetch(f"""
                SELECT "UID" FROM "{table}" ORDER BY "dateReleased" DESC LIMIT 50;
    """)

    for record in uid_records:
        uids.append(record["UID"])

    for entry in html.find_all("h3",
                               {"class": "hiLite galnetNewsArticleTitle"}):
        entry = entry.find("a").get(
            "href")[re.search("^/galnet/uid/",
                              entry.find("a").get("href")).end():]
        if entry not in uids:
            new_articles.add(entry)

    added = []
    for article in new_articles:
        date_today = datetime.datetime.now()

        async with aiohttp.ClientSession() as session:
            async with session.get(
                    f"https://community.elitedangerous.com/galnet/uid/{article}"
            ) as response:
                bs4 = Bs4(await response.text(), "html.parser")
                entry = bs4.find("h3",
                                 {"class": "hiLite galnetNewsArticleTitle"})

        # Article Content
        entry_title = entry.get_text().strip().replace("'", "''")
        if entry_title == "" or entry_title is None:
            entry_title = "No Title Available"

        text = unquote(bs4.find_all("p")[1].get_text().replace("'", "''"))

        # Date info
        date_article = bs4.find("p").get_text()
        date_article = datetime.datetime.strptime(date_article, "%d %b %Y")
        if date_article.year >= 3300:
            date_article = date_article.replace(year=(date_article.year -
                                                      GAME_YEAR_OFFSET))

        added.append(article)
        await connection.execute(
            f"""
            INSERT INTO "{table}"("Title", "UID", "dateReleased", "dateAdded", "Text") VALUES (
            $1, $2, $3, $4, $5);
            """, entry_title, article, date_article, date_today, text)

    await connection.close()
    if len(new_articles) > 0:
        return len(added), added
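A hedged sketch of polling for new articles with the coroutine above (fetch_settings and connect are assumed to be defined in the same module, as the body implies). update() returns None when nothing new was found, otherwise a (count, uids) pair:

import asyncio

result = asyncio.run(update())
if result is not None:
    count, uids = result
    print(f"Added {count} new articles: {uids}")

Example No. 26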
find_password_element.send_keys(Keys.ENTER)

sleep(6)

group_url = 'https://www.facebook.com/groups/playstoreappofficial/members'
driver.get(group_url)

driver.implicitly_wait(10)

scroll(driver, 2)

names = []
final_names = []

src = driver.page_source
html_soup = Bs4(src, 'lxml')
html_soup.prettify()

for name in html_soup.find_all(
        'a',
    {
        'class':
        "oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p"
    }):
    text = name.get_text()
    names.append(text)

for final_name in names[1:]:
    final_names.append(final_name)

df = pd.DataFrame(final_names)