def pagination():
    result_csv = pd.read_csv('pagewise_results.csv')
    # get the parameters
    parameters = pd.read_csv('parameters.csv')
    location = parameters['Location'][0]
    searched_tweet = parameters['Searched'][0]
    last_pg = parameters['Pages'][0]
    if pd.isnull(location):
        location = ""
    # Get the current page as the argument in URL
    pg = request.args.get('page', default="1", type=str)
    either_arrows = False
    # Parse the searched .html file for updating the new table
    if (os.path.exists('templates/searched_pg_left_arrow_pagination.html') and
            os.path.exists('templates/searched_pg_right_arrow_pagination.html')):
        time_left = os.path.getmtime(
            'templates/searched_pg_left_arrow_pagination.html')
        time_right = os.path.getmtime(
            'templates/searched_pg_right_arrow_pagination.html')
        either_arrows = True
        if time_left > time_right:
            print('Both exist - left')
            f = open('templates/searched_pg_left_arrow_pagination.html',
                     encoding='utf-8').read()
        else:
            print('Both exist - right')
            f = open('templates/searched_pg_right_arrow_pagination.html',
                     encoding='utf-8').read()
        soup = Soup(f, features="html.parser")
    elif os.path.exists('templates/searched_pg_left_arrow_pagination.html'):
        print('left exists')
        either_arrows = True
        f = open('templates/searched_pg_left_arrow_pagination.html',
                 encoding='utf-8').read()
        soup = Soup(f, features="html.parser")
    elif os.path.exists('templates/searched_pg_right_arrow_pagination.html'):
        print('right exists')
        either_arrows = True
        f = open('templates/searched_pg_right_arrow_pagination.html',
                 encoding='utf-8').read()
        soup = Soup(f, features="html.parser")
    else:
        print('none exist')
        f = open('templates/searched_' + searched_tweet + '_' + location + '.html',
                 encoding='utf-8').read()
        soup = Soup(f, features="html.parser")
    print('current_pg: ', pg)
    p = soup.find("p", {"class": "searched_for"})
    arrow_clicked = ""
    if 'left' in pg or 'right' in pg:
        if 'right' in pg:
            arrow_clicked = "right_arrow_pagination"
            a = soup.find("a", {"id": "left_arrow_pagination"})
            current_pg = soup.find("a", {"id": pg})
            current_pg = current_pg.previous_sibling.previous_sibling
            pg_no = int(current_pg.text)
            for s in soup.find_all("a", {"class": "inactive"}):
                s.decompose()
            for s in soup.find_all("a", {"class": "active"}):
                s.decompose()
            if pg_no != last_pg:
                for i in range(pg_no + 1, pg_no + 21):
                    pages = soup.new_tag("a")
                    if i == pg_no + 1:
                        pages['class'] = 'active'
                    else:
                        pages['class'] = 'inactive'
                    pages['id'] = i
                    pages['onclick'] = "redirectPage(this.id)"
                    pages.string = str(i)
                    a.insert_after(pages)
                    a = pages
                pg = pg_no + 1
        elif 'left' in pg:
            if not either_arrows:
                return render_template(
                    'searched_' + searched_tweet + '_' + location + '.html')
            arrow_clicked = "left_arrow_pagination"
            a = soup.find("a", {"id": "left_arrow_pagination"})
            current_pg = soup.find("a", {"id": pg})
            current_pg = current_pg.next_sibling
            pg_no = int(current_pg.text)
            for s in soup.find_all("a", {"class": "inactive"}):
                s.decompose()
            for s in soup.find_all("a", {"class": "active"}):
                s.decompose()
            if pg_no != 1:
                for i in range(pg_no - 20, pg_no):
                    pages = soup.new_tag("a")
                    if i == pg_no - 20:
                        pages['class'] = 'active'
                    else:
                        pages['class'] = 'inactive'
                    pages['id'] = i
                    pages['onclick'] = "redirectPage(this.id)"
                    pages.string = str(i)
                    a.insert_after(pages)
                    a = pages
                pg = pg_no - 20
            else:
                for i in range(1, 21):
                    pages = soup.new_tag("a")
                    if i == 1:
                        pages['class'] = 'active'
                    else:
                        pages['class'] = 'inactive'
                    pages['id'] = i
                    pages['onclick'] = "redirectPage(this.id)"
                    pages.string = str(i)
                    a.insert_after(pages)
                    a = pages
                pg = 1
    # file = open('templates/searched_pg_' + str(pg) + '.html', "w", encoding="utf-8")
    # file.write(str(soup))
    # file.close()
    # return render_template('searched_pg_' + str(pg) + '.html')
    # If location is empty, then delete previous table results
    if location == "":
        # Remove the table tag for previous page results from the html file.
        for s in soup.select('table'):
            s.extract()
        tweets_per_pg = 30
    # Otherwise, delete the other-results table and display data for the next page
    else:
        for s in soup.find_all("table", {"class": "other_results"}):
            s.decompose()
        tweets_per_pg = 15
    if arrow_clicked == "":
        # Make the previous page class inactive
        a = soup.find("a", {"class": "active"})
        a["class"] = "inactive"
        # Make the current page (pg) class active
        a = soup.find("a", {"id": pg})
        a['class'] = "active"
    pg = int(pg)
    if pg == last_pg:
        # Get remaining results from result_csv
        show_results = result_csv.loc[(pg - 1) * tweets_per_pg + 1:]
    else:
        # Get only tweets_per_pg results from result_csv, depending on the page number
        show_results = result_csv.loc[
            (pg - 1) * tweets_per_pg + 1:pg * tweets_per_pg]
    result = Soup(show_results.to_html(), features="html.parser")
    result.find("tr")['style'] = 'text-align:center;'
    # Make URLs into hyperlinks
    count = 0
    insert = 3
    for td in result.find_all("td"):
        count += 1
        if count == insert:
            if td.text != "Not Available":
                a = soup.new_tag("a")
                a["href"] = td.text
                a.string = td.text
                td.string = ""
                td.append(a)
            insert += 4
        if count == insert - 2:
            td['style'] = "width:12%;"
    table = result.find("table")
    table['border'] = '0'
    if location != "":
        table["class"] = "other_results"
        table['style'] = ('position:absolute;top:800px;padding-left:35px;'
                          'padding-right:35px;text-align:center;')
        p = soup.find("p", {"class": "other_results_para"})
    else:
        table['style'] = ('position:absolute;top:180px;padding-left:35px;'
                          'padding-right:35px;text-align:center;')
    p.insert_after(table)
    if arrow_clicked == "":
        file = open('templates/searched_pg_' + str(pg) + '.html', "w",
                    encoding="utf-8")
        file.write(str(soup))
        file.close()
        return render_template('searched_pg_' + str(pg) + '.html')
    else:
        file = open('templates/searched_pg_' + arrow_clicked + '.html', "w",
                    encoding="utf-8")
        file.write(str(soup))
        file.close()
        return render_template('searched_pg_' + arrow_clicked + '.html')
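# Hedged sketch: pagination() above reads request.args and calls
# render_template, so it is presumably registered as a Flask view; the actual
# decorator and route path are not part of this snippet.
#
# @app.route('/pagination')
# def pagination():
#     ...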
def soup(self) -> Soup:
    """Converts string data from File into a BeautifulSoup object.

    Returns:
        Soup -- BeautifulSoup object created from the File.
    """
    return Soup(self.data)
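# Hedged usage sketch for the soup() method above. The enclosing File class is
# not part of this snippet, so the stub below only assumes a `data` attribute
# holding the raw markup string.
from bs4 import BeautifulSoup as Soup


class _FileStub:
    def __init__(self, data: str) -> None:
        self.data = data  # assumed attribute read by soup()

    def soup(self) -> Soup:
        return Soup(self.data, "html.parser")  # parser named to avoid a warning


print(_FileStub("<p>hello</p>").soup().find("p").text)  # -> hello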
import os

import mechanize
from bs4 import BeautifulSoup as Soup

url = input("enter url:")
browser = mechanize.Browser()
browser.set_handle_equiv(True)
browser.set_handle_redirect(True)
browser.set_handle_referer(True)
browser.set_handle_robots(False)
browser.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
)]
html = browser.open(url)
browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
print(browser.geturl())
soup = Soup(html, "html.parser")
image_tags = soup.findAll('img')
i = 0
for image in image_tags:
    i = i + 1
    filename = image['src']
    print(filename)
    filename = os.path.join(os.getcwd(), str(i))
    data = browser.open(image['src']).read()
    savename = str(i) + '.jpg'
    save = open(savename, 'wb')
    save.write(data)
    save.close()
import datetime
import re
import urllib.request

from bs4 import BeautifulSoup as Soup

today = datetime.date.today()
html = urllib.request.urlopen(
    "http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html"
).read()
soup = Soup(html, "html.parser")
p = re.compile(r'<.*?>')
aslink = soup.findAll('span', attrs={'class': 'ac'})
for li in soup.findAll('li', attrs={'class': 'g'}):
    sLink = li.find('a')
    sSpan = li.find('span', attrs={'class': 'st'})
    print(sLink['href'][7:], "," + p.sub('', str(sSpan)).replace('.', ''))
for adli in soup.findAll('div', attrs={'id': 'rhs_block'}):
    adlink = adli.find('a')
    print(adlink['href'])
print(p.sub('', str(aslink)[1:-1]).replace('.', '\n'))
# for ads in soup.findAll('div', {'id': 'tads'}):
# IDE : PyCharm
# description :
# When working on a computer, we often need to manipulate files and folders.
# Now let's learn how to batch-process files with Python.
# We'll store the content scraped in the previous lesson into a file on our computer.
from bs4 import BeautifulSoup as Soup
import requests

# Everyone loves reading, so today let's scrape the Douban Top 250 books together.
url = 'https://book.douban.com/top250?start=0'
r = requests.get(url)
html_code = r.text
soup = Soup(html_code, "html.parser")
all_book = soup.find_all('table')
count = 0
# First, record the content scraped in the previous lesson into a dictionary.
books = {}
for book in all_book[1:]:
    count = count + 1
    # print('{:*^30}'.format('book', count))
    # print(book.div.a['title'])
    # print(book.p.string)
    # print(book.find(name='span', attrs={'class': 'inq'}).string)
    title = book.div.a['title']
    content = book.p.string
    intro = book.find(name='span', attrs={'class': 'inq'}).string
    books[title] = {'content': content, 'intro': intro}  # record it
from bs4 import BeautifulSoup as Soup

html = "output1.html"
soup = Soup(open(html), "html.parser")
mydivs = soup.find("div", {"id": "legend"})
print(mydivs)
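# Design note: Soup(open(html), ...) above relies on garbage collection to
# close the file handle. An equivalent sketch that closes it explicitly:
from bs4 import BeautifulSoup as Soup

with open("output1.html") as fh:
    legend_div = Soup(fh, "html.parser").find("div", {"id": "legend"})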
def __init__(self, url, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.post = Soup(self.scraper.get(url).content, 'html5lib')
import pickle

from bs4 import BeautifulSoup as Soup

fileName = "senticon.es.xml"
pol = dict()
with open(fileName, "r") as f:
    soup = Soup(f.read(), "xml")
    for lemma in soup.find_all("lemma"):
        pol[(lemma.get_text().strip(), lemma["pos"])] = float(lemma["pol"])
print(pol)
with open("polarity.pkl", "wb") as f:
    pickle.dump(pol, f, pickle.HIGHEST_PROTOCOL)
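# For reference, a minimal sketch of the XML shape the loop above expects:
# <lemma> elements carrying `pos` and `pol` attributes. The values here are
# illustrative, not taken from the real senticon.es.xml.
from bs4 import BeautifulSoup as Soup

sample = '<senticon><lemma pos="adj" pol="0.75"> bueno </lemma></senticon>'
lemma = Soup(sample, "xml").find("lemma")
print(lemma.get_text().strip(), lemma["pos"], float(lemma["pol"]))  # bueno adj 0.75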
def test_metadata_json_html(app_client):
    response = app_client.get("/-/metadata")
    assert response.status == 200
    pre = Soup(response.body, "html.parser").find("pre")
    assert METADATA == json.loads(pre.text)
        return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file
    return None


############################################################
# Globals
############################################################
# WEBSITE = 'http://shinsekai.cadet-nine.org/'
ROOT = pjoin(os.path.split(os.path.abspath(__file__))[0], '..')
soup = Soup('', 'lxml')
TEMPLATE_FOLDER = pjoin(ROOT, "book_templates/epub")


############################################################
# EBook Class
############################################################
class EBook:
    TEMPLATE_FOLDER = pjoin(ROOT, "book_templates/epub")
    """An ebook basically consists of a bunch of html files, usually one per
    chapter, and a table of contents that describes the relationship between
    the chapters"""

    def __init__(self, url, out_file='book.epub', title=None, workers=5,
async def search_task(self, site, page, search):
    """Search tasks."""
    async with site["semaphore"]:
        logger.debug("%s %i - in search_task: search '%s'", self.name,
                     self._episode, search.data)
        content = None
        try:
            page = await page.browser.newPage()
            if self.canceled:
                raise asyncio.CancelledError
            await page.setViewport({"width": 1920, "height": 1080})
            if self.canceled:
                raise asyncio.CancelledError
            await self._going_to(page, site["going_search"])
            logger.debug("%s %i - in search_task: search.data = '%s'",
                         self.name, self._episode, search.data)
            if self.canceled:
                raise asyncio.CancelledError
            await search.run(page)
            if self.canceled:
                raise asyncio.CancelledError
            await self._screenshot(page)
            if self.canceled:
                raise asyncio.CancelledError
            await site["search_button_selector"].run(page)
            if self.canceled:
                raise asyncio.CancelledError
            # sleep(10)
            if site["search_response_selector"] is not None:
                await site["search_response_selector"].run(page)
            if self.canceled:
                raise asyncio.CancelledError
            content = await page.content()  # return HTML document
            # print(content)
        except asyncio.CancelledError:
            raise
        except errors.TimeoutError as error:
            logger.error("%s %i - in search_task : TimeoutError : %s",
                         self.name, self._episode, error)
            await self._close_page(page)
            raise asyncio.CancelledError("TimeoutError : {}".format(error))
        except errors.NetworkError as error:
            logger.error("%s %i - in search_task : NetworkError : %s",
                         self.name, self._episode, error)
            await self._close_page(page)
            raise asyncio.CancelledError("NetworkError : {}".format(error))
        except errors.PageError as error:
            logger.error("%s %i - in search_task : PageError : %s",
                         self.name, self._episode, error)
            await self._close_page(page)
            raise asyncio.CancelledError("PageError : {}".format(error))
        except Exception as error:
            logger.error("%s %i - in search_task : Exception : %s",
                         self.name, self._episode, error)
            await self._close_page(page)
            raise asyncio.CancelledError("Exception : {}".format(error))

        soup = Soup(content, features="lxml")
        ahref = soup.find_all("a", href=True)
        logger.debug("%s %i - in search_task: ahref = '%s'", self.name,
                     self._episode, clean_str(str(ahref)))
        logger.info("%s %i - in search_task: search episode %s of %s",
                    self.name, self._episode, self.episode,
                    " ".join(self.filters_and))
        urls = list()
        for data in ahref:
            if self.as_all_ellements(self.filters_and, data.get_text()):
                if self.as_one_ellement(self.filters_or, data.get_text()):
                    url = full_url(page, data["href"])
                    logger.debug("%s %i - in search_task: append url = '%s'",
                                 self.name, self._episode, url)
                    urls.append((site, url))
        await self._close_page(page)
        return urls
def game_soup(gameId):
    match_url = 'http://www.espn.com/nfl/game?gameId=' + str(gameId)
    u_client = ureq(match_url)
    page_html = u_client.read()
    u_client.close()
    return Soup(page_html, 'html.parser')
    ])

    # ----------- copy images over:
    print(destImagesPath)
    if os.path.exists(sourceImagesPath):
        copytree(sourceImagesPath, destImagesPath)

    chapterDict = {}
    chapterDict['path'] = chapter
    chapterDict['href'] = chapter + ".html"

    # ----------- now let's alter the HTML that's produced:
    if os.path.exists(destChapterPath):
        soup = Soup(open(destChapterPath).read())

        # --- grab the title from h1
        h1s = soup.find_all("h1")
        if len(h1s) > 0:
            chapterDict['title'] = h1s[0].getText()
        else:
            chapterDict['title'] = "needs h1"
        chapterDict['chapterListName'] = chapter
        chapterDict['sections'] = []
        chapterDict['destChapterPath'] = destChapterPath

        # --- Grab all the h2 (we call them sections)
        h2s = soup.find_all("h2")
def parse_subway_status(data):
    """
    Returns a nested dictionary of MTA subway line statuses given
    an XML response.
    """
    # Set all line statuses to base status.
    line_status = {
        line: {
            "state": None,
            "direction_0_state": None,
            "direction_1_state": None,
            "delays_description": None,
            "service_change_description": None,
            "planned_work_description": None
        }
        for line in SUBWAY_LINES
    }

    # Parse MTA lines from XML
    soup = Soup(data.text, "xml")

    # Iterate over line lookup and parse status.
    for line in SUBWAY_LINES:
        # Rename the MTA alias for Shuttle (S).
        line_alias = "H" if line == "S" else str(line)

        # Search for line name in affected lines XML.
        line_re = re.compile("NYCT_" + line_alias + "$")
        hits = [
            _ for _ in soup.find_all("Affects")
            if _.findChildren("LineRef", text=line_re)
        ]

        # Set line status to Good Service if no status.
        if not hits:
            line_status[line].update({
                "state": "Good Service",
                "direction_0_state": "Good Service",
                "direction_1_state": "Good Service",
            })
            continue

        # Parse all subway line situations that contain
        # affected line.
        situations = [_.find_parent("PtSituationElement") for _ in hits]

        # Parse subway line state.
        statuses = [_.ReasonName.text for _ in situations]

        # Look for overlap of statuses with known states
        # in STATE_PRIORITY dictionary
        matches = set(STATE_PRIORITY.keys()).intersection(set(statuses))

        # Set the current state using the minimum of the
        # ordinal STATE_PRIORITY dictionary, or unknown if
        # state does not exist in dictionary.
        if len(matches) > 0:
            line_status[line]["state"] = min(
                {_: STATE_PRIORITY[_] for _ in matches},
                key=STATE_PRIORITY.get)
        else:
            line_status[line]["state"] = "Unknown"

        # Determine state for each direction on the line.
        dir_states = {
            "0": ["Good Service"],
            "1": ["Good Service"],
        }
        for sit in situations:
            # Find affected line directions.
            directions = [
                _.DirectionRef.text
                for _ in sit.find_all("AffectedVehicleJourney")
                if _.findChildren("LineRef", text=line_re)
            ]
            # Add states to line direction.
            for dct in directions:
                dir_states[dct].append(sit.ReasonName.text)

        # Set the direction states using STATE_PRIORITY.
        for dct in dir_states:
            matches = set(STATE_PRIORITY.keys()).intersection(
                set(dir_states[dct]))
            direction = "direction_{}_state".format(dct)
            if len(matches) > 0:
                line_status[line][direction] = min(
                    {_: STATE_PRIORITY[_] for _ in matches},
                    key=STATE_PRIORITY.get)
            else:
                line_status[line][direction] = "Unknown"

        # Set line status descriptions.
        for status in STATE_PRIORITY:
            desc_key = status.lower().replace(" ", "_") + "_description"
            descs = [
                _.find("Description").text for _ in situations
                if _.find("ReasonName").text == status
            ]
            if descs:
                line_status[line][desc_key] = (descs
                                               if len(descs) > 1 else descs[0])

    return line_status
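# Illustrative shape of the dictionary returned above, for one line (values
# are examples, not real MTA data):
example_line_status = {
    "A": {
        "state": "Delays",
        "direction_0_state": "Delays",
        "direction_1_state": "Good Service",
        "delays_description": "Northbound A trains are running with delays.",
        "service_change_description": None,
        "planned_work_description": None,
    },
}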
def soupify_xml(filename: str) -> Soup:
    with open(filename, "r") as f:
        soup = Soup(f, "xml")
    return soup
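# Hedged usage sketch for soupify_xml() (hypothetical filename):
# soup = soupify_xml("records.xml")
# soup.find("TITLE")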
def test_zero_results(app_client, path):
    response = app_client.get(path)
    soup = Soup(response.text, "html.parser")
    assert 0 == len(soup.select("table"))
    assert 1 == len(soup.select("p.zero-results"))
    'non_food_allergies', 'specialized_diet', 'vioscreen_activity_level',
    'vioscreen_age', 'vioscreen_bcodeid', 'vioscreen_bmi', 'vioscreen_dob',
    'vioscreen_eer', 'vioscreen_email', 'vioscreen_finished',
    'vioscreen_gender', 'vioscreen_height',
    'vioscreen_nutrient_recommendation', 'vioscreen_procdate',
    'vioscreen_protocol', 'vioscreen_recno', 'vioscreen_scf',
    'vioscreen_scfv', 'vioscreen_srvid', 'vioscreen_started',
    'vioscreen_subject_id', 'vioscreen_time', 'vioscreen_user_id',
    'vioscreen_visit', 'vioscreen_weight'
]

for filename in os.listdir(folders[3]):
    fullname = os.path.join(folders[3], filename)
    infile = open(fullname, "r")
    contents = infile.read()
    soup = Soup(contents, 'xml')
    title = soup.find('TITLE')
    if title is not None:
        primary_ID = soup.find('PRIMARY_ID')
        external_ID = soup.find('EXTERNAL_ID')
        sample_ID = title
        taxon_ID = soup.find('TAXON_ID')
        science_name = soup.find('SCIENTIFIC_NAME')
        tags = soup.findAll('TAG')
        values = soup.findAll('VALUE')
        infile.close()
        tags = [i.get_text() for i in tags]
        tags = [x for x in tags if x not in notoktags]
        values = [i.get_text() for i in values]
        values_corrected = []
        for i in values:
def test_definition_sql(path, expected_definition_sql, app_client):
    response = app_client.get(path)
    pre = Soup(response.body, "html.parser").select_one("pre.wrapped-sql")
    assert expected_definition_sql == pre.string
def processThesis(thesis, fileObject):
    global URL_WITH_ADDITIONAL_INFO, globalRequest
    thesisInfo = thesis.find_all("i")
    if len(thesisInfo) > 7:
        possibleName = thesisInfo[0]
        nameConvert = ''
        for letter in possibleName:
            nameConvert += str(letter)
        if "roz." in nameConvert:
            print("Roz. problem")
            del thesisInfo[0]
    if len(thesisInfo) != 7:
        print("Bad number of data len: ", len(thesisInfo))
    year = extractYearFromList(thesisInfo)
    successRate = extractSuccessRateFromList(thesisInfo)
    extraLink = extractLink(thesis)
    # print("url: " + URL_WITH_ADDITIONAL_INFO + extraLink)
    requestForPage = handleRequestGET(URL_WITH_ADDITIONAL_INFO + extraLink, 3)
    if requestForPage.status_code != 200:
        print("Cannot handle request with status code: ",
              requestForPage.status_code)
        time.sleep(10)
        fileObject.close()
        return
    soupWebPage = Soup(requestForPage.content, "html5lib")
    authorName = soupWebPage.h3.b.string
    nameOfThesis = soupWebPage.find_all('h2')[1].string
    subjectOfStudy = soupWebPage.find('div', {
        'class': 'oddil'
    }).em.string.split('/')[1]
    typeOfThesis = globalRequest['TIT']
    nameOfSupervisor = extractSupervisor(soupWebPage)
    nameOfOponent = extractOponent(soupWebPage)
    authorName = authorName.replace(',', ' ')
    nameOfThesis = nameOfThesis.replace(',', ' ')
    subjectOfStudy = subjectOfStudy.replace(',', ' ')
    nameOfSupervisor = nameOfSupervisor.replace(',', ' ')
    nameOfOponent = nameOfOponent.replace(',', ' ')
    '''
    print("author: " + authorName)
    print("year: " + year)
    print("type of thesis:" + typeOfThesis)
    print("subjectOfStudy: " + subjectOfStudy)
    print("nameOfThesis: " + nameOfThesis)
    print("success: " + successRate)
    print("supervisor: " + nameOfSupervisor)
    print("oponent: " + nameOfOponent)
    '''
    csvRow = (authorName + ',' + nameOfThesis + ',' + subjectOfStudy + ',' +
              typeOfThesis + ',' + year + ',' + successRate + ',' +
              nameOfSupervisor + ',' + nameOfOponent + '\n')
    '''
    print("author: " + authorName + "; year: " + year + "; supervisor: " +
          nameOfSupervisor + "; oponent: " + nameOfOponent)
    print("nameOfThesis: " + nameOfThesis + "; subjectOfStudy: " + subjectOfStudy)
    print("Success: " + successRate)
    '''
    if len(csvRow.split(',')) == 8:
        fileObject.write(csvRow)
        print("Successful write to CSV file")
    else:
        print("Bad CSV format!!!")
def test_sort_links(app_client):
    response = app_client.get("/fixtures/sortable?_sort=sortable")
    assert response.status == 200
    ths = Soup(response.body, "html.parser").findAll("th")
    attrs_and_link_attrs = [{
        "attrs": th.attrs,
        "a_href": (th.find("a")["href"].split("/")[-1]
                   if th.find("a") else None),
    } for th in ths]
    assert [
        {"attrs": {"class": ["col-Link"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-pk1"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-pk2"], "scope": "col"}, "a_href": None},
        {"attrs": {"class": ["col-content"], "scope": "col"}, "a_href": None},
        {
            "attrs": {"class": ["col-sortable"], "scope": "col"},
            "a_href": "sortable?_sort_desc=sortable",
        },
        {
            "attrs": {"class": ["col-sortable_with_nulls"], "scope": "col"},
            "a_href": "sortable?_sort=sortable_with_nulls",
        },
        {
            "attrs": {"class": ["col-sortable_with_nulls_2"], "scope": "col"},
            "a_href": "sortable?_sort=sortable_with_nulls_2",
        },
        {
            "attrs": {"class": ["col-text"], "scope": "col"},
            "a_href": "sortable?_sort=text",
        },
    ] == attrs_and_link_attrs
from requests import get
from bs4 import BeautifulSoup as Soup

url = get(
    "https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating"
)
request = url.text
soup_data = Soup(request, 'html.parser')
# print(soup_data.title.text)
movies = soup_data.findAll('div', {"class": 'lister-item mode-advanced'})
first_movie = movies[0]
Name = []
Position = []
Year = []
Rating = []
Ure = []
# x = (first_movie.find('div', {"class": "lister-item-image float-left"}).find('a').get("href"))
# div = first_movie.find('div', {"class": "lister-item-image float-left"})
# moovie_link = div.find("a").get("href")
for i in movies:
    Name.append(i.h3.a.text)
    Position.append(
        i.find('span', {
            "class": "lister-item-index unbold text-primary"
        }).text[:1])
    Year.append(
        i.find('span', {
            "class": "lister-item-year text-muted unbold"
        }).text[1:5])
    Rating.append(
        i.find('div', {"class": "inline-block ratings-imdb-rating"})['data-value'])
def test_facet_display(app_client):
    response = app_client.get(
        "/fixtures/facetable?_facet=planet_int&_facet=city_id&_facet=on_earth")
    assert response.status == 200
    soup = Soup(response.body, "html.parser")
    divs = soup.find("div", {"class": "facet-results"}).findAll("div")
    actual = []
    for div in divs:
        actual.append({
            "name": div.find("strong").text,
            "items": [{
                "name": a.text,
                "qs": a["href"].split("?")[-1],
                "count": int(str(a.parent).split("</a>")[1].split("<")[0]),
            } for a in div.find("ul").findAll("a")],
        })
    assert [
        {
            "name": "city_id",
            "items": [
                {
                    "name": "San Francisco",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=1",
                    "count": 6,
                },
                {
                    "name": "Los Angeles",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=2",
                    "count": 4,
                },
                {
                    "name": "Detroit",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=3",
                    "count": 4,
                },
                {
                    "name": "Memnonia",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&city_id=4",
                    "count": 1,
                },
            ],
        },
        {
            "name": "planet_int",
            "items": [
                {
                    "name": "1",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&planet_int=1",
                    "count": 14,
                },
                {
                    "name": "2",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&planet_int=2",
                    "count": 1,
                },
            ],
        },
        {
            "name": "on_earth",
            "items": [
                {
                    "name": "1",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&on_earth=1",
                    "count": 14,
                },
                {
                    "name": "0",
                    "qs": "_facet=planet_int&_facet=city_id&_facet=on_earth&on_earth=0",
                    "count": 1,
                },
            ],
        },
    ] == actual
def __init__(self, conf_file=""): self.db_conn_dict = {} self.queries_dict = {} self.users_dict = {} self.chartboards = []; self.appName="" handler = open(conf_file).read() soup = Soup(handler,'xml') ''' load env variable from config file ''' env = soup.find('env') orahome = env.find('ora_home').string ldlibpath = env.find('ld_library_path').string self.appName= env.find('appName').string os.putenv('ORACLE_HOME', orahome) os.putenv('LD_LIBRARY_PATH',ldlibpath) dss = soup.find('datasources') ''' load datasources from config file ''' for ds in dss.findAll('datasource'): dbtype = ds.find("type").string name= ds.find("name").string host= ds.find("host").string if ds.find("host")!=None else ""; driver=ds.find("driver").string if ds.find("driver")!=None else ""; port= ds.find("port").string if ds.find("port")!=None else ""; user= ds.find("user").string if ds.find("user")!=None else ""; password=ds.find("password").string if ds.find("password")!=None else ""; service= ds.find("service").string if ds.find("service")!=None else ""; sid= ds.find("sid").string if ds.find("sid")!=None else ""; self.db_conn_dict[name] = {'type':dbtype,'host':host,'driver':driver,'port':port,'user':user,'password':password,'service':service,'sid':sid }; ''' load queries from xml file ''' queries = soup.find('queries') for q in queries.findAll('query'): #add Query to queries dict query_ = q.find('sql').string params=() name_ = q.attrs['name'] target_ = q.attrs['ds'] paramNum = q.attrs['params'] if 'params' in q.attrs else None _desc = q.attrs['description'] if 'description' in q.attrs else "" select_number = int(q.attrs['selects']) if 'selects' in q.attrs else 1 if paramNum: k=1; parameters = q.find('params') for p in parameters.findAll('param'): _p_name=p.attrs['name'] _p_type=p.attrs['paramtype'] _p_desc=p.attrs['description'] _combo_vals =p.attrs['vals'] if 'vals' in p.attrs else '' toAdd=Param(k,_p_name,_p_type,_combo_vals.split(','),_p_desc ) params=params+(toAdd,) k=k+1 #print toAdd,'from xml: ',_combo_vals qry = Query(query = query_,parmap=params,target = target_,parnum=paramNum,name = name_,selectNumber=select_number,description=_desc) #print qry self.queries_dict[name_] = qry ''' load users from xml file ''' users = soup.find('users') for u in users.findAll('user'): username = u.find("username").string pswd = u.find("password").string dslist = (); datasourceslist = u.findAll("ds") for d in datasourceslist: dslist = dslist+(d.string,) user = User(username,pswd,dslist) self.users_dict[username] = user ''' load chartboards from xml file ''' chartsb=soup.find('chartboards') for c in chartsb.findAll('chartboard'): toadd={} _user = c.find("user").string _type=c.find("type").string _querydata=c.find("querydata").string _title = c.find("title").string toadd['user']=_user toadd['type']=_type toadd['querydata']=_querydata toadd['title']=_title toadd['inverted']=True if 'inverted' in c.attrs else False self.chartboards.append(toadd)
def test_database_download_disallowed_for_mutable(app_client):
    response = app_client.get("/fixtures")
    soup = Soup(response.body, "html.parser")
    assert 0 == len(soup.findAll("a", {"href": re.compile(r"\.db$")}))
    assert 403 == app_client.get("/fixtures.db").status
import scraperwiki
from urllib.request import urlopen

from bs4 import BeautifulSoup as Soup

# step 1: build the urls
base_url = "http://evenementen.uitslagen.nl/2013/marathonrotterdam/details.php?s="
end_url = "&o=1&t=nl"
for num in range(1, 10):
    baseplusnr = base_url + str(num)
    url = baseplusnr + end_url
    # step 2: open the urls
    soup = Soup(urlopen(url), "html.parser")
    # step 3: skip the 'not found' pages
    split = soup.find("b", style="color:red")
    if split is None:
        col = soup.findAll('td')
        # step 4: pick out which table data we want
        startnr = col[0].string.replace("Startnummer", "")
        naam = col[4].string
        woonplaats = col[6].string
        afstand = col[8].string
        cat = col[10].string
        totplaats = col[12].string
def test_allow_sql_on(app_client):
    response = app_client.get("/fixtures")
    soup = Soup(response.body, "html.parser")
    assert len(soup.findAll("textarea", {"name": "sql"}))
    response = app_client.get("/fixtures/sortable")
    assert b"View and edit SQL" in response.body
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as Soup

my_url = "https://www.eventbrite.com/d/india--bengaluru/events/"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = Soup(page_html, "html.parser")
containers = page_soup.findAll(
    "div", {"class": "eds-media-card-content__content__principal"})

filename = "events.csv"
f = open(filename, "w")
headers = "event, day, date, time, location, price\n"
f.write(headers)

for container in containers:
    title_container = container.a.div.div
    event = title_container.text
    another_container = container.findAll(
        "div", {"class": "eds-media-card-content__sub-content"})
    date = another_container[0].div.text
    place_container = container.findAll(
        "div", {"class": "eds-media-card-content__sub-content-cropped"})
def _get_soup(html=PYCON_HTML):
    return Soup(html.read_text(encoding="utf-8"), "html.parser")
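# The read_text() call above implies PYCON_HTML is a pathlib.Path; its
# definition is not part of this snippet, so this is only an assumed sketch:
# from pathlib import Path
# PYCON_HTML = Path("pycon_events.html")  # hypothetical filename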
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Greenwood%20Village%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Lakewood%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Castle%20Rock%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Littleton%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Louisville%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Loveland%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22N%252E%20Lakewood%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Northglenn%2C%20CO%22}]',
    'https://jobs.lowes.com/search-jobs/Colorado?orgIds=1627&alp=6252001-5417618&alt=3&ascf=[{%22key%22:%22campaign%22,%22value%22:%22Westminster%2C%20CO%22}]'
]

pd.set_option('max_colwidth', 500)  # remove column limits or info will be lost
df = pd.DataFrame()  # Create a new data frame

for a in base_url:
    target = Soup(urllib.urlopen(a), "lxml")

    # This part of the code returns specific job info desired
    targetElements = target.findAll('li')
    for elem in targetElements:
        comp_name = "Lowes"
        try:
            job_title = elem.find('h2').getText()
        except AttributeError:
            job_title = "null"
        home_url = "https://jobs.lowes.com/"
        try:
            job_link = "%s%s" % (home_url, elem.find('a').get('href'))
        except AttributeError:
            job_link = "null"
def get_data():
    if request.method == 'POST':
        for filename in glob.glob("templates/searched*"):
            os.remove(filename)
        searched_tweet = request.form['search']
        location = request.form['location']
        result_csv = requestResults(searched_tweet)
        f = open('templates/trying_local.html').read()
        soup = Soup(f, features="html.parser")
        p = soup.find("p", {"class": "searched_for"})
        paginate = soup.find("div", {"class": "pagination"})
        if result_csv.empty:
            p.append("You searched for: " + searched_tweet +
                     ". This is a Non-Donation request.")
        else:
            # Check for tweets at the given location. If no location is given,
            # show results for the whole world (only 30 tweets per page).
            if location != "":
                show_results = result_csv[
                    result_csv['Location'].str.contains(location.upper()) |
                    result_csv['Location'].str.contains(location.lower())]
                location_results = show_results
                # If no tweets are present at the searched location
                if len(show_results) == 0:
                    show_results = result_csv[:30]
                    p.append("You searched for: " + searched_tweet + " at " +
                             location + ". Found 0 results. Displaying " +
                             str(len(result_csv)) +
                             " results for other locations.")
                    location = ""
                else:
                    result_csv = result_csv[
                        ~result_csv['Location'].str.contains(location.upper()) &
                        ~result_csv['Location'].str.contains(location.lower())]
                    # Show only the top tweets of the searched location
                    if len(show_results) > 15:
                        show_results = show_results[:13]
                    p.append("You searched for: " + searched_tweet + " at " +
                             location + ". Found " + str(len(show_results)) +
                             " results.")
            else:
                show_results = result_csv[:30]
                p.append("You searched for: " + searched_tweet + ". Found " +
                         str(len(result_csv)) + " results.")
            if location == "":
                n = len(result_csv) // 30
                # round up for a partial last page (the original checked n % 30)
                if len(result_csv) % 30 != 0:
                    n += 1
            else:
                n = len(result_csv) // 15
                if len(result_csv) % 15 != 0:
                    n += 1
            parameters = pd.DataFrame({
                'Location': [location],
                'Searched': [searched_tweet],
                'Pages': n
            })
            # result_csv['location_searched'] = location
            # result_csv['searched'] = searched_tweet
            parameters.to_csv('parameters.csv', index=False)
            result_csv.to_csv('pagewise_results.csv', index=False)
            # result_csv = result_csv.drop(['location_searched', 'searched'], axis=1)
            show_results.reset_index(drop=True, inplace=True)
            show_results.index += 1
            result = Soup(show_results.to_html(), features="html.parser")
            result.find("tr")['style'] = 'text-align:center;'
            # Make URLs into hyperlinks
            count = 0
            insert = 3
            for td in result.find_all("td"):
                count += 1
                if count == insert:
                    if td.text != "Not Available":
                        a = soup.new_tag("a")
                        a["href"] = td.text
                        a.string = td.text
                        td.string = ""
                        td.append(a)
                    insert += 4
                if count == insert - 2:
                    td['style'] = "width:12%;"
            table = result.find("table")
            table['border'] = '0'
            table['style'] = ('position:absolute;top:180px;padding-left:35px;'
                              'padding-right:35px;text-align:center;')
            p.insert_after(result)
            a = soup.find("a", {"id": 1})
            if location == "":
                # n = len(result_csv)//30
                if n > 20:
                    paginate["style"] = (
                        "position:absolute;left:50%;top:190%;width:71%;"
                        "transform: translate(-50%, -50%); "
                        "background-color: #525252;background-size: cover;")
                else:
                    paginate["style"] = (
                        "position:absolute;left:50%;top:190%;"
                        "transform: translate(-50%, -50%); "
                        "background-color: #525252;background-size: cover;")
            else:
                # n = len(result_csv)//15
                if n > 20:
                    paginate["style"] = (
                        "position:absolute;left:50%;top:195%;width:71%;"
                        "transform: translate(-50%, -50%); "
                        "background-color: #525252;background-size: cover;")
                else:
                    paginate["style"] = (
                        "position:absolute;left:50%;top:195%;"
                        "transform: translate(-50%, -50%); "
                        "background-color: #525252;background-size: cover;")
            if n > 20:
                n = 20
            # if (n % 30 != 0):
            #     n += 1
            for i in range(n - 1):
                pages = soup.new_tag("a")
                pages['class'] = 'inactive'
                pages['id'] = i + 2
                pages['onclick'] = "redirectPage(this.id)"
                pages.string = str(i + 2)
                a.insert_after(pages)
                a = pages
            if location != "":
                # Other Location Results
                p = soup.new_tag("p")
                p['class'] = "other_results_para"
                p['style'] = "position: absolute;top:750px;font-weight: bold;"
                p.string = ("Other Location Tweets (Found " +
                            str(len(result_csv)) + " results)")
                table = soup.find("table", {"class": "dataframe"})
                table.insert_after(p)
                other_results = result_csv[:15]
                other_results.reset_index(drop=True, inplace=True)
                other_results.index += 1
                result = Soup(other_results.to_html(), features="html.parser")
                result.find("tr")['style'] = 'text-align:center;'
                # Make URLs into hyperlinks
                count = 0
                insert = 3
                for td in result.find_all("td"):
                    count += 1
                    if count == insert:
                        if td.text != "Not Available":
                            a = soup.new_tag("a")
                            a["href"] = td.text
                            a.string = td.text
                            td.string = ""
                            td.append(a)
                        insert += 4
                    if count == insert - 2:
                        td['style'] = "width:12%;"
                table = result.find("table")
                table['class'] = 'other_results'
                table['border'] = '0'
                table['style'] = ('position:absolute;top:800px;'
                                  'padding-left:35px;padding-right:35px;'
                                  'text-align:center;')
                p = soup.find("p", {"class": "other_results_para"})
                p.insert_after(table)
        file = open('templates/searched_' + searched_tweet + '_' + location +
                    '.html', "w", encoding="utf-8")
        file.write(str(soup))
        file.close()
        return render_template('searched_' + searched_tweet + '_' + location +
                               '.html')