Example #1
# Imports and the dict initialisation below are added so the snippet can run;
# `html` (a list of page URLs) and the `utilities` helper are assumed to be
# defined elsewhere in the original module.
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
import re

month = {}
month['10'] = 'Oct'
month['11'] = 'Nov'
month['12'] = 'Dec'
month['1'] = 'Jan'
month['2'] = 'Feb'
month['3'] = 'Mar'
month['4'] = 'Apr'
month['5'] = 'May'
month['6'] = 'Jun'
month['7'] = 'Jul'
month['8'] = 'Aug'
month['9'] = 'Sep'

for j in range(len(html)):
    out = urlopen(html[j]).read()
    soup = BeautifulSoup(out)
    res = soup.findAll('a')
    fr = []
    for k in range(len(res)):
        if res[k].has_key('href'):
            ab = res[k]['href']
            ba = re.findall('p.cfm\?i', str(ab))
            if len(ba) > 0:
                fr.append(ab.encode('UTF-8'))
    date = []
    spans = soup.findAll('span')
    for k in range(len(spans)):
        if spans[k].has_key('class'):
            if spans[k]['class'] == 'smaller':
                abc = utilities.clean_html(str(spans[k]))
                abc = abc.split('/')
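# The example above is truncated mid-scrape. Below is a minimal, self-contained
# sketch of the same idea (collecting anchor hrefs that match a pattern with
# BeautifulSoup 3); the page URL is a placeholder, not from the original code.
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
import re

page_url = 'http://example.com/listing.html'  # hypothetical URL
soup = BeautifulSoup(urlopen(page_url).read())
matching = [a['href'] for a in soup.findAll('a', href=True)
            if re.search(r'p\.cfm\?i', a['href'])]
print(matching)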
Example #2
#raymond zhu
import csv
import requests
from BeautifulSoup import BeautifulSoup

url = 'http://www.showmeboone.com/sheriff/JailResidents/JailResidents.asp'
response = requests.get(url)
html = response.content

soup = BeautifulSoup(html)
table = soup.find('tbody', attrs={'class': 'stripe'})

list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll('td'):
        text = cell.text.replace('&nbsp;', '')
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

outfile = open("./inmates.csv", "wb")
writer = csv.writer(outfile)
writer.writerow(
    ["Last", "First", "Middle", "Gender", "Race", "Age", "City", "State"])
writer.writerows(list_of_rows)
outfile.close()
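# Equivalent using a context manager, so the CSV file is always flushed and closed
# (Python 2's csv module wants a binary-mode file object):
with open("./inmates.csv", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(
        ["Last", "First", "Middle", "Gender", "Race", "Age", "City", "State"])
    writer.writerows(list_of_rows)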
Example #3
def make_soup(response):
    """
    Create a ``BeautifulSoup.BeautifulSoup`` instance out of a `response`.
    """
    return BeautifulSoup(response.content)
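# A quick usage sketch for make_soup(); the URL is illustrative only and
# requests is assumed to be available.
import requests

response = requests.get('http://example.com/')  # hypothetical URL
soup = make_soup(response)
print(soup.title)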
Example #4
    def __init__(self, params):
        import re
        from addon import Addon
        from addondict import AddonDict as XBMCDict
        from BeautifulSoup import BeautifulSoup, SoupStrainer, Comment

        a = Addon()
        site = self.__module__
        mode = params['mode']

        home_url = 'http://playporn.to/'
        search_url = home_url + '?submit=Search&s='
        movies_url = home_url + 'category/xxx-movie-stream/'
        scenes_url = home_url + 'category/xxx-clips-scenes-stream/'
        false_positives = ['http://playporn.to/deutsche-milfs-anonym-sex/']

        if mode == 'main':
            item_list = [{
                'site': site,
                'mode': 'list',
                'title': a.language(30003),
                'content': '',
                'url': home_url,
                'cover_url': a.image('recent.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }, {
                'site': site,
                'mode': 'sub',
                'title': a.language(30001),
                'content': '',
                'url': movies_url,
                'cover_url': a.image('movies.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }, {
                'site': site,
                'mode': 'sub',
                'title': a.language(30002),
                'content': '',
                'url': scenes_url,
                'cover_url': a.image('scenes.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }, {
                'site': site,
                'mode': 'list',
                'title': a.language(30004),
                'content': 'search',
                'url': search_url,
                'cover_url': a.image('search.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }]
            item_list.extend(a.favs_hist_menu(site))
            item_list.extend(a.extended_menu())
            a.add_items(item_list)
            a.end_of_directory()

        elif mode == 'sub':
            item_list = [{
                'site': site,
                'mode': 'list',
                'title': a.language(30006),
                'content': '',
                'url': params['url'],
                'cover_url': a.image('all.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }, {
                'site': site,
                'mode': 'category',
                'title': a.language(30005),
                'content': '',
                'url': home_url,
                'cover_url': a.image('categories.png', image),
                'backdrop_url': a.art(),
                'type': 3
            }]
            a.add_items(item_list)
            a.end_of_directory()

        elif mode == 'category':
            index = 1
            if 'scenes' in params['url'].lower(): index = 2
            html = a.get_page(home_url)
            soup = BeautifulSoup(html,
                                 parseOnlyThese=SoupStrainer('ul', 'nav fl'))
            item_list = []
            for item in soup.findAll('ul')[index].findAll({'a': True}):
                item_list.extend([{
                    'site': 'playporn',
                    'mode': 'list',
                    'url': item.get('href'),
                    'content': '',
                    'title': item.contents[0].encode('UTF-8'),
                    'cover_url': a.image(image, image),
                    'backdrop_url': a.art(),
                    'type': 3
                }])
            if item_list:
                a.add_items(item_list)
                a.end_of_directory()

        elif mode == 'list':
            if params.get('content', '') == 'search':
                item = a.search_input()
                if item:
                    params['url'] = search_url + item
                else:
                    exit(1)
            elif params.get('content', '') == 'goto':
                last_item = re.search('/page/([0-9]+)/', params['url'])
                if last_item:
                    last_item = int(last_item.group(1))
                else:
                    last_item = 10000
                item = a.page_input(last_item)
                if item:
                    params['url'] = re.sub('/page/[0-9]+/',
                                           '/page/' + str(item) + '/',
                                           params['url'])
                else:
                    exit(1)
            html = a.get_page(params['url'])
            soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('body'))
            item_list = []
            params['mode'] = 'play'
            params['content'] = 'movies'
            params['type'] = 0
            params['context'] = 0
            params['duration'] = '7200'
            xbmcdict = XBMCDict(0).update(params)
            for item in soup.findAll('div', 'photo-thumb-image'):
                if not item.a.get('href') in false_positives:
                    _dict = xbmcdict.copy()
                    if 'scenes' in params['url']:
                        _dict['duration'] = '2700'
                        _dict['content'] = 'episodes'
                    _dict['url'] = item.a.get('href')
                    _dict['title'] = item.a.get('title').encode('UTF-8')
                    _dict['tvshowtitle'] = _dict['title']
                    _dict['originaltitle'] = _dict['title']
                    _dict['cover_url'] = a.image(item.img.get('src'))
                    _dict['thumb_url'] = _dict['cover_url']
                    _dict['poster'] = _dict['cover_url']
                    _dict['sub_site'] = site

                    item_list.extend([_dict])
            soup = BeautifulSoup(html,
                                 parseOnlyThese=SoupStrainer(
                                     'div', 'more_entries'))
            if soup:
                item = soup.find('a', 'previouspostslink')
                if item:
                    item_list.extend([{
                        'site': site,
                        'mode': 'list',
                        'url': item.get('href'),
                        'content': params['content'],
                        'title': a.language(30017, True),
                        'cover_url': a.image('previous.png', image),
                        'backdrop_url': a.art(),
                        'type': 3
                    }])
                item = soup.find('a', 'nextpostslink')
                if item:
                    item_list.extend([{
                        'site': site,
                        'mode': 'list',
                        'url': item.get('href'),
                        'content': params['content'],
                        'title': a.language(30018, True),
                        'cover_url': a.image('next.png', image),
                        'backdrop_url': a.art(),
                        'type': 3
                    }])
                item = soup.find('a', 'last')
                if item:
                    item_list.extend([{
                        'site': site,
                        'mode': 'list',
                        'url': item.get('href'),
                        'content': 'goto',
                        'title': a.language(30019, True),
                        'cover_url': a.image('goto.png', image),
                        'backdrop_url': a.art(),
                        'type': 3
                    }])
            if item_list:
                a.add_items(item_list)
                a.end_of_directory()

        elif mode == 'play':
            html = a.get_page(params['url'])
            soup = BeautifulSoup(html,
                                 parseOnlyThese=SoupStrainer(
                                     'div', {'id': 'loopedSlider'}))
            soup = soup.find(text=lambda text: isinstance(text, Comment))
            if soup:
                soup = re.sub('&lt;', '<', soup.encode('utf-8'))
                soup = re.sub('&gt;', '>', soup)
                soup = BeautifulSoup(soup,
                                     parseOnlyThese=SoupStrainer(
                                         'div', 'video'))
                if soup:
                    item_list = []
                    xbmcdict = XBMCDict(0).update(params)
                    for item in soup.findAll('iframe'):
                        _dict = xbmcdict.copy()
                        _dict['url'] = item.get('src').replace(
                            'http://playporn.to/stream/all/?file=',
                            '').encode('UTF-8')
                        if 'flashx.tv' in _dict['url'].lower():
                            item = re.search('hash=(.+?)&', _dict['url'])
                            if item:
                                _dict['url'] = 'http://flashx.tv/video/' + item.group(1) + '/'
                        elif 'played.to' in _dict['url'].lower():
                            item = re.search('embed-([a-zA-Z0-9]+?)-.+?html',
                                             _dict['url'])
                            if item:
                                _dict['url'] = 'http://played.to/' + item.group(1)
                        item_list.extend([_dict])
                    if item_list:
                        from playback import Playback
                        Playback().choose_sources(item_list)
                    else:
                        a.alert(a.language(30904, True), sound=False)
                else:
                    a.alert(a.language(30904, True), sound=False)
            else:
                a.alert(a.language(30904, True), sound=False)
Example #5
 def __init__(self, html):
     self.soup = BeautifulSoup(html)
Example #6
def irclog(date):
    logfile = '%s/#shogun.%s.log.html' % (SHOGUN_IRCLOGS, date)
    html = open(logfile).read()
    soup = BeautifulSoup(html)
    log = str(soup.body.table)
    return render_template('irclogs.html', log=log)
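# A minimal sketch of how irclog() might be wired into a Flask app; the route,
# log directory and imports below are assumptions, not part of the original module.
from flask import Flask, render_template
from BeautifulSoup import BeautifulSoup

SHOGUN_IRCLOGS = '/var/log/irclogs'  # assumed location of the HTML log files
app = Flask(__name__)
app.add_url_rule('/irclogs/<date>', 'irclog', irclog)

if __name__ == '__main__':
    app.run()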
Example #7
    def parse_flights(self, html, directonly):
        """ Parse the BA HTML page into a structured dict of flight options. """
        if self.debug:
            out = open("results.html", "w")
            out.write(html)
            out.close()

        results = []

        # parse with BeautifulSoup
        soup = BeautifulSoup(html)
        #for table in soup.findAll("table[class=tblLyOut]"):
        route = []
        for table in soup.findAll("table"):
            cls = table.get("class")
            if cls is None:
                continue

            if not ("flightListTable" in cls):
                continue

            # ignore the outer tables
            if table.get("id") is not None:
                continue

            # get the header in case the route is listed in it (does this for some direct routes)
            if len(route) == 0:
                for thead in table.findAll("thead"):
                    for a in thead.findAll("a", {"class": "airportCodeLink"}):
                        if len(route) < 2:
                            route.append(a.string)

            result = {}

            for tbody in table.findAll("tbody"):
                for tr in tbody.findAll("tr"):
                    if tr.get("id") and tr.get("id")[0:4] == "smry":
                        continue  # this is a summary row, hidden to users that contains > 1 flight information, we ignore it

                    flight = {}
                    cabincode = "0"
                    for span in tr.findAll("span"):
                        cls = span.get("class")
                        if cls is None:
                            continue
                        if "departtime" in cls:
                            flight['departs'] = span.string
                        if "arrivaltime" in cls:
                            flight['arrives'] = span.string
                        if "journeyTime" in cls:
                            # journey time is total per result not per flight
                            result['duration'] = span.string.replace(
                                u'\xa0', u' ')
                    for a in tr.findAll("a"):
                        cls = a.get("class")
                        if cls is None:
                            continue
                        if "flightPopUp" in cls:
                            flight['flight'] = a.string

                        # these are if the route is in the first column in side the cell with the time and date
                        # sometimes they are in the column headers, see below
                        if "airportCodeLink" in cls:
                            if "route" not in flight:
                                flight['route'] = []

                            # it will repeat after 2 because the page repeats them
                            if len(flight['route']) < 2:
                                flight['route'].append(a.string)

                        # capturing the route

                    for inpu in tr.findAll("input", {"type": "radio"}):
                        cabincode = "0"
                        codes = {
                            "CabinCodeF": "F",
                            "CabinCodeC": "C",
                            "CabinCodeW": "W",
                            "CabinCodeM": "M"
                        }
                        for code_find, code in codes.items():
                            if code_find in inpu.get("id"):
                                cabincode = code
                                continue

                        # since each query returns more than one class now
                        if 'class' in flight and flight['class'] != '':
                            flight['class'] += "/"
                        else:
                            flight['class'] = ""
                        flight['class'] += self.classes[cabincode]

    #                for td in tr.findAll("td"):
    #                    cls = td.get("class")
    #                    if cls is None:
    #                        continue
    #                    if "classoftravel" in cls:
    #                        if td.string == "" or td.string is None:
    #                            flight['class'] = td.a.string # BA flights have a link to the class
    #                        else:
    #                            flight['class'] = td.string # AA etc flights do not

                    if flight != {}:
                        if "flights" not in result:
                            result['flights'] = []

                        # add the route from the header column if it wasn't included in the individual rows
                        if flight.get("route") is None:
                            flight['route'] = route

                        if flight.get("class") is None:
                            flight['class'] = self.classes[cabincode]

                        result['flights'].append(flight)

                if result != {}:  # some rows have no flights / data at all now
                    if directonly and len(result['flights']) > 1:
                        self.logger.info(
                            "Skipping non-direct flight with {0} segments.".
                            format(len(result['flights'])))
                    else:
                        results.append(result)

        return results
Example #8
def SOUPIFY(link):
    return BeautifulSoup(requests.get(link).content)
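# Usage sketch for SOUPIFY(); the requests and BeautifulSoup 3 imports are
# implied by the one-liner above, and the URL is a placeholder.
import requests
from BeautifulSoup import BeautifulSoup

soup = SOUPIFY('http://example.com/')  # hypothetical URL
print(soup.title)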
Example #9
        cruises['ports'] = c_link['ports_of_call']

        # Select relevant controls and set the criteria then submit - This is step 1 in their booking process
        br.open(c_link['link'])
        br.select_form("departureInfoForm")
        br["totalStaterooms"] = ['1']
        br["adultCount"] = ['3']
        br["childCount"] = ['1']
        #br["hasAir"] =
        #br["gateway"] =
        br.submit()

        # Start step 2 - choose category and get the price per guest per category
        step2 = br.geturl()
        response = br.open(step2)
        soup = BeautifulSoup(response.read())
        #print soup.prettify()
        for title in soup.findAll('div', {'class': 'yourCruiseModuleSection'}):
            cruise_name = title('li')[1].text
            sail_date = title('li')[2].text
        cruises['name'] = cruise_name[7:len(cruise_name)]
        cruises['date'] = "".join(sail_date.split())
        for each in soup.findAll('script', {'type': 'text/javascript'}):
            if (each is not None and "categories.push" in each.text):
                some_script = each.text
                packages['cat'] = "".join(
                    some_script[(some_script.find('categoryDisplayName') +
                                 23):(some_script.find('hasUpsell') -
                                      2)].split())
                packages['room_code'] = "".join(
                    some_script[(some_script.find('code') +
Example #10
def scrape_acm_categories(json_objects_file, json_assignments_file):
    '''
    Input: json_objects_file, json_assignments_file -- output paths for the two JSON dumps
    Output: None (the results are written to those files)
    This function scrapes the categories/sub-categories/keywords of each sub-category from the official ACM classification website
    '''
    id_to_category_name_dict = {}
    id_category_hierarch_dict = {}
    id_category_hierarch_list = {}
    id_category_hierarch_assign_list = {}

    id_category_hierarch_list['categories'] = []
    id_category_hierarch_assign_list['categories'] = []

    log.info("URL TO REQUEST: %s \n" % base_url)
    headers = {'User-Agent': user_agent}
    response = requests.get(base_url, headers=headers)
    html = response.text.encode('utf-8')
    soup = BeautifulSoup(html)

    # extracting main categories with corresponding ids
    main_categories = soup.findAll("a",
                                   attrs={
                                       "class": "boxedlinkh",
                                       "title": "Assign This CCS Concept"
                                   })
    print('Number of main categories:', len(main_categories))
    for i, cat in enumerate(main_categories):
        num_id = cat["href"].split('"')[1][2:]
        main_cat_name = cat.text
        id_to_category_name_dict[num_id] = main_cat_name

    # extracting sub-categories with corresponding ids
    sub_categories = soup.findAll("a",
                                  attrs={
                                      "class": "boxedlink",
                                      "title": "Assign This CCS Concept"
                                  })
    print('Number of sub-categories:', len(sub_categories))
    for i, cat in enumerate(sub_categories):
        num_id_string = cat["href"].split('"')[1]
        num_ids = num_id_string.split(".")[1:]
        sub_cat_name = cat.text
        id_to_category_name_dict[num_ids[-1]] = sub_cat_name
        id_category_hierarch_dict = {}
        id_category_hierarch_assign_dict = {}

        if len(num_ids) == 2:
            main_cat = num_ids[0]
            sub_cat = num_ids[1]
            id_category_hierarch_dict['main-category-id'] = main_cat
            id_category_hierarch_dict['sub-category-id'] = sub_cat
            id_category_hierarch_dict[
                'main-category-name'] = id_to_category_name_dict[
                    main_cat].lower()
            id_category_hierarch_dict[
                'sub-category-name'] = id_to_category_name_dict[sub_cat].lower(
                )

            id_category_hierarch_assign_dict[
                id_to_category_name_dict[main_cat].lower()] = 1
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_cat].lower()] = 2

        if len(num_ids) == 3:
            main_cat = num_ids[0]
            sub_cat = num_ids[1]
            sub_sub_cat = num_ids[2]
            id_category_hierarch_dict['main-category-id'] = main_cat
            id_category_hierarch_dict['sub-category-id'] = sub_cat
            id_category_hierarch_dict['sub-sub-category-id'] = sub_sub_cat
            id_category_hierarch_dict[
                'main-category-name'] = id_to_category_name_dict[
                    main_cat].lower()
            id_category_hierarch_dict[
                'sub-category-name'] = id_to_category_name_dict[sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-category-name'] = id_to_category_name_dict[sub_sub_cat].lower()

            id_category_hierarch_assign_dict[
                id_to_category_name_dict[main_cat].lower()] = 1
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_cat].lower()] = 2
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_cat].lower()] = 3

        if len(num_ids) == 4:
            main_cat = num_ids[0]
            sub_cat = num_ids[1]
            sub_sub_cat = num_ids[2]
            sub_sub_sub_cat = num_ids[3]
            id_category_hierarch_dict['main-category-id'] = main_cat
            id_category_hierarch_dict['sub-category-id'] = sub_cat
            id_category_hierarch_dict['sub-sub-category-id'] = sub_sub_cat
            id_category_hierarch_dict['sub-sub-sub-category-id'] = sub_sub_sub_cat
            id_category_hierarch_dict[
                'main-category-name'] = id_to_category_name_dict[
                    main_cat].lower()
            id_category_hierarch_dict[
                'sub-category-name'] = id_to_category_name_dict[sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-category-name'] = id_to_category_name_dict[sub_sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-sub-category-name'] = id_to_category_name_dict[
                    sub_sub_sub_cat].lower()

            id_category_hierarch_assign_dict[
                id_to_category_name_dict[main_cat].lower()] = 1
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_cat].lower()] = 2
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_cat].lower()] = 3
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_sub_cat].lower()] = 4

        if len(num_ids) == 5:
            main_cat = num_ids[0]
            sub_cat = num_ids[1]
            sub_sub_cat = num_ids[2]
            sub_sub_sub_cat = num_ids[3]
            sub_sub_sub_sub_cat = num_ids[4]
            id_category_hierarch_dict['main-category-id'] = main_cat
            id_category_hierarch_dict['sub-category-id'] = sub_cat
            id_category_hierarch_dict['sub-sub-category-id'] = sub_sub_cat
            id_category_hierarch_dict['sub-sub-sub-category-id'] = sub_sub_sub_cat
            id_category_hierarch_dict[
                'sub-sub-sub-sub-category-id'] = sub_sub_sub_sub_cat
            id_category_hierarch_dict[
                'main-category-name'] = id_to_category_name_dict[
                    main_cat].lower()
            id_category_hierarch_dict[
                'sub-category-name'] = id_to_category_name_dict[sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-category-name'] = id_to_category_name_dict[sub_sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-sub-category-name'] = id_to_category_name_dict[
                    sub_sub_sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-sub-sub-category-name'] = id_to_category_name_dict[
                    sub_sub_sub_sub_cat].lower()

            id_category_hierarch_assign_dict[
                id_to_category_name_dict[main_cat].lower()] = 1
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_cat].lower()] = 2
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_cat].lower()] = 3
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_sub_cat].lower()] = 4
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_sub_sub_cat].lower()] = 5

        if len(num_ids) == 6:
            main_cat = num_ids[0]
            sub_cat = num_ids[1]
            sub_sub_cat = num_ids[2]
            sub_sub_sub_cat = num_ids[3]
            sub_sub_sub_sub_cat = num_ids[4]
            sub_sub_sub_sub_sub_cat = num_ids[5]
            id_category_hierarch_dict['main-category-id'] = main_cat
            id_category_hierarch_dict['sub-category-id'] = sub_cat
            id_category_hierarch_dict['sub-sub-category-id'] = sub_sub_cat
            id_category_hierarch_dict['sub-sub-sub-category-id'] = sub_sub_sub_cat
            id_category_hierarch_dict[
                'sub-sub-sub-sub-category-id'] = sub_sub_sub_sub_cat
            id_category_hierarch_dict[
                'sub-sub-sub-sub-sub-category-id'] = sub_sub_sub_sub_sub_cat
            id_category_hierarch_dict[
                'main-category-name'] = id_to_category_name_dict[
                    main_cat].lower()
            id_category_hierarch_dict[
                'sub-category-name'] = id_to_category_name_dict[sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-category-name'] = id_to_category_name_dict[sub_sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-sub-category-name'] = id_to_category_name_dict[
                    sub_sub_sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-sub-sub-category-name'] = id_to_category_name_dict[
                    sub_sub_sub_sub_cat].lower()
            id_category_hierarch_dict[
                'sub-sub-sub-sub-sub-category-name'] = id_to_category_name_dict[
                    sub_sub_sub_sub_sub_cat].lower()

            id_category_hierarch_assign_dict[
                id_to_category_name_dict[main_cat].lower()] = 1
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_cat].lower()] = 2
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_cat].lower()] = 3
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_sub_cat].lower()] = 4
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_sub_sub_cat].lower()] = 5
            id_category_hierarch_assign_dict[
                id_to_category_name_dict[sub_sub_sub_sub_sub_cat].lower()] = 6

        id_category_hierarch_list['categories'].append(
            id_category_hierarch_dict)
        id_category_hierarch_assign_list['categories'].append(
            id_category_hierarch_assign_dict)

    store_list_json(id_category_hierarch_list, json_objects_file)
    store_list_json(id_category_hierarch_assign_list, json_assignments_file)
Example #11
def videos(url=common.args.url):
    data = common.getURL(url)
    tree=BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    cats=tree.findAll(attrs={'class' : re.compile('(.+?)videoCollectionModule(.+?)')})
    for cat in cats:
        catname = cat.find('div',attrs={'class' : 'twocolumnheader'}).find('h3').string.title()
        if catname == common.args.name:
            episodes = cat.findAll(attrs={'class' : 'fullgallery'})
            if len(episodes) > 0:
                for video in episodes:
                    url = video.find('a')['href']
                    thumb = video.find('img')['src']
                    name = video.find('img')['alt']
                    description = video.contents[8].strip() 
                    seasonepisode = video.contents[6].strip().split('|')
                    season = int(seasonepisode[0].replace('Season','').strip())
                    episode = int(seasonepisode[1].replace('Episode','').strip())
                    displayname = '%sx%s - %s' % (str(season),str(episode),name)
                    u = sys.argv[0]
                    u += '?url="'+urllib.quote_plus(url)+'"'
                    u += '&mode="abcfamily"'
                    u += '&sitemode="play"'
                    infoLabels={ "Title":name,
                                 "Season":season,
                                 "Episode":episode,
                                 "Plot":description
                                 #"premiered":airDate,
                                 #"Duration":duration,
                                 #"TVShowTitle":tvshow
                                 }
                    common.addVideo(u,displayname,thumb,infoLabels=infoLabels)
            else:
                videos = cat.findAll(attrs={'class' : 'shortgallery'})
                for video in videos:
                    url = BASE + video.find('a')['href']
                    try:
                        thumb = video.find('img')['src']
                    except:
                        try:
                            thumb = video.find('image')['src']
                        except:
                            print 'no thumb'
                            thumb = ''
                    name = video.find(attrs={'class' : 'shortvideoTitle'}).find('a').string
                    if name == None:
                        name = video.find(attrs={'class' : 'shortvideoTitle'}).find('abbr')['title']
                    description = video.find(attrs={'class' : 'shortvideoDesc'}).string.strip()
                    try:
                        seasonepisode = video.find(attrs={'class' : 'videoSeasonInfo'}).string.strip().split('|')
                        season = int(seasonepisode[0].replace('Season','').strip())
                        episode = int(seasonepisode[1].replace('Episode','').strip())
                        displayname = '%sx%s - %s' % (str(season),str(episode),name)
                    except:
                        season = 0
                        episode = 0
                        displayname = name
                    u = sys.argv[0]
                    u += '?url="'+urllib.quote_plus(url)+'"'
                    u += '&mode="abcfamily"'
                    u += '&sitemode="play"'
                    infoLabels={ "Title":name,
                                 "Season":season,
                                 "Episode":episode,
                                 "Plot":description
                                 #"premiered":airDate,
                                 #"Duration":duration,
                                 #"TVShowTitle":tvshow
                                 }
                    common.addVideo(u,displayname,thumb,infoLabels=infoLabels)
    common.setView('episodes')
Example #12
 def stripHTML(self, text):
     return ''.join(BeautifulSoup(text).findAll(text=True))
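# The same idea as the method above, as a standalone helper for quick testing
# (the BeautifulSoup 3 import is shown explicitly):
from BeautifulSoup import BeautifulSoup

def strip_html(text):
    return ''.join(BeautifulSoup(text).findAll(text=True))

print(strip_html('<p>Hello <b>world</b></p>'))  # -> Hello world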
Example #13
def import_with_tags2(username,
                      bookmark_file,
                      default_vote=0,
                      common_tag=None,
                      common_ws=None):
    """This not only gets all the urls, but also turns the folders in the file into tags"""

    T = getT(username)
    W = getW(username)

    urls = [(tag['href'], tag.string, tag.get('add_date'),
             tag.get('last_modified'))
            for tag in BeautifulSoup(bookmark_file).findAll('a')]
    count_urls_in_file = len(urls)
    #print count_urls_in_file, ' urls found in the bookmark file.'

    count_tag_created = 0

    w = W.objects.get(name="bookmarkbook")

    count_note_created = 0
    duplicate = []
    #move the pointer back to the beginning of the file
    bookmark_file.seek(0)
    b = None
    folder_list = []
    for line in bookmark_file:
        if line.find('<DT><H3') != -1:
            tname = [tag.string
                     for tag in BeautifulSoup(line).findAll('h3')][0]
            folder_list.append({tname: []})
            #print 'one folder with folder name ',tname,' pushed to stack.'
        if line.find('</DL><P>') != -1 or line.find(
                '</DL><p>') != -1:  #FF and Chrome use <p> while Opera uses <P>

            #there is one extra '</DL><P>' at the end of the file for <H1>Bookmarks</H1>. So when it comes to the
            #it, just skip
            if len(folder_list) == 0:
                continue
            folder_of_urls = folder_list.pop()
            #print 'one folder ',folder_of_urls,' popped out of stack.'
            folder = folder_of_urls.keys()[0]
            urls = folder_of_urls.get(folder)
            folderstr = unicode(folder)
            if folderstr not in [
                    u'Unsorted Bookmarks', u'[Folder Name]',
                    u'Bookmarks Toolbar'
            ]:
                t, created = T.objects.get_or_create(name=folderstr)
                if created:
                    count_tag_created += 1
                w.tags.add(t)
                w.save()
                for url in urls:
                    #print 'url in the popped out stack is: ', url
                    url.tags.add(t)
                    num_of_tags_created = url.add_tags(common_tag,
                                                       'bookmarkbook')
                    count_tag_created = count_tag_created + num_of_tags_created
                    url.save()
        if line.find('<DT><A') != -1:
            u = [(tag['href'], tag.string, tag.get('add_date'),
                  tag.get('last_modified'))
                 for tag in BeautifulSoup(line).findAll('a')][0]
            b, created = build_one_bookmark(username, u, default_vote)
            if not created:
                duplicate.append((u[0], u[1]))
            else:
                count_note_created += 1
            #for url that is at the top, simply create the bookmark without adding it to any tag
            if len(folder_list) == 0:
                pass
            else:
                # add this url to every folder on the stack
                for i in range(len(folder_list)):
                    f_of_bs = folder_list[i]
                    f = f_of_bs.keys()[0]
                    bs = f_of_bs.get(f)
                    bs.append(b)
                    f_of_bs.update({f: bs})
                    folder_list[i] = f_of_bs
                    #print 'one url ', b, 'is added to a folder on stack ', f
        if line.find('<DD>') != -1:
            if b:
                desc = line.strip('<DD>').strip('</DD>')
                b.desc = desc
                b.save()
                print 'b.desc:', b.desc

    #print  count_note_created, ' bookmarks created'
    #print len(duplicate), ' duplicated bookmarks.'
    #print  'duplicate is:', duplicate
    duplicate.sort()
    return count_urls_in_file, count_note_created, duplicate, count_tag_created
Example #14
def import_with_tags(username,
                     bookmark_file,
                     default_vote=0,
                     common_tag=None,
                     common_ws=None):
    """This not only gets all the urls, but also turns the folders in the file into tags"""

    T = getT(username)
    W = getW(username)

    urls = [(tag['href'], tag.string, tag.get('add_date'),
             tag.get('last_modified'))
            for tag in BeautifulSoup(bookmark_file).findAll('a')]
    count_urls_in_file = len(urls)
    print count_urls_in_file, ' urls found in the bookmark file.'

    bookmark_file.seek(0)
    folders = [
        tag.string for tag in BeautifulSoup(bookmark_file).findAll('h3')
    ]
    print 'folders:', folders
    print len(folders), ' folders found in the bookmark file.'

    count_tag_created = 0

    w = W.objects.get(name="bookmarks")

    #make each of them into a tag
    for folder in folders:
        print 'type(folder):', type(folder)
        print 'folder:', folder

        #some bug with BeautifulSoup's custom unicode. So upcast back to unicode itself. See http://code.djangoproject.com/ticket/11932
        folderstr = unicode(folder)
        print 'type(folderstr):', type(folderstr)
        print 'folderstr:', folderstr

        if folderstr not in [
                u'Unsorted Bookmarks', u'[Folder Name]', u'Bookmarks Toolbar'
        ]:

            t, created = T.objects.get_or_create(name=folderstr)
            print 'tag ', t, ' created ', created
            print 't.name:', t.name
            if created:
                #print 'tag:', t, ' is created.'
                count_tag_created += 1
            w.tags.add(t)
            w.save()

    print count_tag_created, 'tags are created.'

    count_note_created = 0
    duplicate = []
    #move the pointer back to the beginning of the file
    bookmark_file.seek(0)
    t = None
    for line in bookmark_file:
        if line.find('<DT><A') != -1:
            url = [(tag['href'], tag.string, tag.get('add_date'),
                    tag.get('last_modified'))
                   for tag in BeautifulSoup(line).findAll('a')][0]
            n, created = build_one_bookmark(username, url, default_vote)
            if not created:
                duplicate.append((url[0], url[1]))
            else:
                count_note_created += 1
            if t:
                n.tags.add(t)
                n.save()
        elif line.find('<DT><H3') != -1:
            tname = [tag.string
                     for tag in BeautifulSoup(line).findAll('h3')][0]
            if unicode(tname) not in [
                    u'Unsorted Bookmarks', u'[Folder Name]',
                    u'Bookmarks Toolbar'
            ]:
                print 'unicode(tname) is:', unicode(tname)
                t = T.objects.get(name__exact=unicode(tname))
        else:
            continue

    print count_note_created, ' bookmarks created'
    print len(duplicate), ' duplicated bookmarks.'
    #print  'duplicate is:', duplicate
    duplicate.sort()
    return count_urls_in_file, count_note_created, duplicate, count_tag_created
Example #15
 def testParseLoginForm(self):
     p = open('./test/test-login-form.html', 'r')
     page = BeautifulSoup(p.read())
     r = page.findAll('form', attrs={'action': '/login.php'})
     # print(r)
     self.assertTrue(len(r) > 0)
Example #16
import scraperwiki
html = scraperwiki.scrape('http://scraperwiki.com/hello_world.html')
print html

# -----------------------------------------------------------------------------
# 1. Parse the raw HTML to get the interesting bits - the part inside <td> tags.
# -- UNCOMMENT THE 6 LINES BELOW (i.e. delete the # at the start of the lines)
# -- CLICK THE 'RUN' BUTTON BELOW
# Check the 'Console' tab again, and you'll see how we're extracting
# the HTML that was inside <td></td> tags.
# We use BeautifulSoup, which is a Python library especially for scraping.
# -----------------------------------------------------------------------------

from BeautifulSoup import BeautifulSoup
soup = BeautifulSoup(html)  # turn our HTML into a BeautifulSoup object
tds = soup.findAll('td')  # get all the <td> tags
for td in tds:
    print td  # the full HTML tag
    print td.text  # just the text inside the HTML tag

# -----------------------------------------------------------------------------
# 2. Save the data in the ScraperWiki datastore.
# -- UNCOMMENT THE THREE LINES BELOW
# -- CLICK THE 'RUN' BUTTON BELOW
# Check the 'Data' tab - here you'll see the data saved in the ScraperWiki store.
# -----------------------------------------------------------------------------

for td in tds:
    record = {"td": td.text}  # column name and value
    scraperwiki.datastore.save(["td"], record)  # save the records one by one
Example #17
from BeautifulSoup import BeautifulSoup
import urllib2
import re

html_page = urllib2.urlopen("http://imgur.com")
soup = BeautifulSoup(html_page)
images = []
for img in soup.findAll('img'):
    images.append(img.get('src'))

print(images)

Example #18
 def test_any_question_target_any_user(self):
     content = BeautifulSoup(self.view())
     self.assertCommonPageElements(content)
Example #19
def clean_markdown(doc):
    doc = markdown.markdown(doc)
    doc = ''.join(BeautifulSoup(doc).findAll(text=True))
    return doc
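# A quick usage sketch (the markdown and BeautifulSoup 3 imports are implied by
# the function body above); output shown approximately.
import markdown
from BeautifulSoup import BeautifulSoup

print(clean_markdown('# Title\n\nSome *emphasised* text'))
# -> roughly u'Title\nSome emphasised text' once the rendered tags are stripped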
Example #20
 def test_product_with_packaging_elements(self):
     self.linkPackage(self.product, 'cow')
     content = BeautifulSoup(self.view())
     self.assertCommonPageElements(content)
     self.assertTrue(content.find(True, id='ubuntu-support') is not None)
Example #21
        ur = urllib.urlopen(index_page_url)
        fp = open(output_filename, 'w')
        fp.write(ur.read())
        fp.close()
        ur.close()

for year in range(options.year, currentyear+1):

    year_index_filename = output_directory  + str(year) + ".html"
    if not os.path.exists(year_index_filename):
        raise Exception, "Missing the year index: '%s'" % year_index_filename
    fp = open(year_index_filename)
    html = fp.read()
    fp.close()

    soup = BeautifulSoup( html )
    link_tags = soup.findAll( 'a' )

    for t in link_tags:
        # old format link - /or-10/sor1223-01.htm
        # new format link - http://www.scottish.parliament.uk/Apps2/Business/ORSearch/ReportView.aspx?r=6132&amp;mode=html
        if t.has_key('href') and (re.match('^or-',t['href']) or re.search('ORSearch/ReportView.aspx.*?mode=html', t['href'])):
            # print t
            s = ""
            for c in t.contents:
                if type(c) == NavigableString:
                    s = s + str(c)
            s = re.sub(',','',s)
            # print year_index_filename + "==> " + s
            d = None
            m = re.match( '^(Official Report)?\s*(\d+)\s+(\w+)', s )
Example #22
def imagesDownloader(images_url, totalImages=20):

    images_root = '/var/www/clouds/img/'

    file_url = images_url + 'latest.info'
    pattern = re.compile(r'http://goes.gsfc.nasa.gov/')
    match = pattern.search(images_url)
    prelen = len(match.group(0))
    subpath = images_url[prelen:]

    print "Esperando a la NASA"

    response = urlopen(images_url)
    try:
        latest_remote_file = urlopen(file_url)
    except:
        print "No existe latest.info"
        return
    if not os.path.exists(images_root + subpath):
        os.makedirs(images_root + subpath)
    remote_latest = open(images_root + subpath + 'remote_latest.info', 'wb')
    remote_latest.write(latest_remote_file.read())
    remote_latest.close()
    print "La NASA nos muestra su pagina"
    html = response.read()
    soup = BeautifulSoup(html)
    i = 0
    if not os.path.exists(images_root + subpath):
        os.makedirs(images_root + subpath)
    try:
        files_cmp = filecmp.cmp(images_root + subpath + 'latest.info',
                                images_root + subpath + 'remote_latest.info')
    except:
        new_latest = open(images_root + subpath + 'latest.info', 'wb')
        new_latest.write("Initial file")
        new_latest.close()
        files_cmp = filecmp.cmp(images_root + subpath + 'latest.info',
                                images_root + subpath + 'remote_latest.info')
    if (not files_cmp):
        os.remove(images_root + subpath + 'latest.info')
        os.rename(images_root + subpath + 'remote_latest.info',
                  images_root + subpath + 'latest.info')
        for trs in reversed(soup.findAll('a')[-totalImages - 1:]):
            print soup.findAll('a')[-totalImages - 1:]
            match = re.search(r'>(.*).tif', str(trs))
            if match is not None:
                try:
                    filenametif = match.group(1) + '.tif'
                    print filenametif
                    filenamejpg = match.group(1) + '.jpg'
                    print "Pongo a bajar" + images_url + filenametif
                    fi = urlopen(images_url + filenametif)
                    print "Ahi se bajo y la voy a nombrar" + str(i) + '.jpg'
                    fi_read = fi.read()
                    print "Y la lei"
                    file_tif = StringIO(fi_read)
                    image_jpg = Image.open(file_tif)
                    if not os.path.exists(images_root + subpath):
                        os.makedirs(images_root + subpath)

                    try:
                        image_jpg.save(images_root + subpath + str(i) + '.jpg',
                                       quality=90)
                        #logger.info(images_root + subpath + str(i)+'.jpg' + "Downloaded")
                    except:
                        try:
                            image_jpg.convert('RGB').save(
                                images_root + subpath + str(i) + '.jpg',
                                quality=90)
                        except:
                            logger.error(
                                images_root + subpath + str(i) +
                                '.jpg couldnt save after converting to RGB')

                    i += 1
                except:
                    print "Something went wrong with image handling"
            else:
                print "no matchea"
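# A hedged call sketch; the sub-path below is illustrative only and must sit
# under http://goes.gsfc.nasa.gov/ for the prefix check above to match.
imagesDownloader('http://goes.gsfc.nasa.gov/goeseast/argentina/vis/', totalImages=20)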
Example #23
def mareviews(inp, conn=None, bot=None, nick=None, chan=None):
    """marating [band] -- Displays band rating
     from metal archives."""

    if not inp:
        return "You must specify a band"

    comms = inp.split(",")

    album = None
    if (len(comms) == 1):
        inp = comms[0].strip()
    else:
        inp = comms[0].strip()
        album = comms[1]

    album = str(album).strip()

    response = http.get_json(baseurl + "search/ajax-advanced/searching/bands",
                             bandName=inp,
                             exactBandMatch=0,
                             sEcho=1,
                             iColumns=3,
                             sColumns='',
                             iDisplayStart=0,
                             iDisplayLength=200,
                             sNames=',,')

    if response["error"] != "":
        return "Error: {}.".format(response["error"])

    if response["iTotalRecords"] == 0:
        return u"No bands were found named {}".format(inp)

    bands = response["aaData"]

    band = BeautifulSoup(bands[0][0]).findAll("a")[0].contents[0]
    href = BeautifulSoup(bands[0][0]).findAll("a")[0]["href"]

    regex1 = re.compile("(?<=bands/).*\/")
    rawBand = regex1.findall(href)[0]

    regex2 = re.compile("(?<={}/).*".format(rawBand.replace("/", "")))
    bandId = regex2.findall(href)[0]

    reviews = http.get_json(
        baseurl + "review/ajax-list-band/id/{}/json/1".format(bandId),
        sEcho=1,
        iColumns=4,
        sColums='',
        iDisplayStart=0,
        iDisplayLength=200,
        mDataProp_0=0,
        mDataProp_1=1,
        mDataProp_2=2,
        mDataProp_3=3,
        iSortingCols=1,
        iSortCol_0=3,
        sSortDir_0="desc",
        bSortable_0="true",
        bSortable_1="true",
        bSortable_2="true",
        bSortable_3="true")

    percentages = []
    if not album:
        if type(reviews["aaData"]) == list and len(reviews["aaData"]) > 0:
            for review in reviews["aaData"]:
                percentages.append(int(review[1].replace("%", "")))

            average = reduce(lambda x, y: x + y,
                             percentages) / len(percentages)

            return u'\x02{}\x0f has an average review of \x02{}\x0f% based on their album reviews. Use "," to separate artist, album.'.format(
                band, average)
        else:
            return u'Could not calculate average review for {} or too many bands with the same name. Use "," to separate artist, album.'.format(
                band)
    else:
        if type(reviews["aaData"]) == list and len(reviews["aaData"]) > 0:
            fullAlbum = ""
            if type(reviews["aaData"]) == list:
                for review in reviews["aaData"]:
                    ulink = review[0]
                    alink = BeautifulSoup(ulink).findAll("a")
                    text = alink[0].contents[0].lower()
                    if text == album.lower() or text.find(album) != -1:
                        percentages.append(int(review[1].replace("%", "")))
                        fullAlbum = alink[0].contents[0]

                if len(percentages) > 0:
                    average = reduce(lambda x, y: x + y,
                                     percentages) / len(percentages)

                    return u'The album \x02{}\x0f by \x02{}\x0f has an average review of \x02{}\x0f%'.format(
                        fullAlbum, band, average)
                else:
                    return u'Could not find the album {} for the band {}'.format(
                        album, band)
            else:
                return u'Could not find reviews for album like "{}" by {}.'.format(
                    album, band)
        else:
            return u'Could not calculate average review for {} or too many bands with the same name'.format(
                band)
Example #24
from os import listdir
from os.path import isfile, join
from BeautifulSoup import BeautifulSoup
import re
hexentityMassage = [(re.compile('&#x([^;]+);'),
                     lambda m: '&#%d;' % int(m.group(1), 16))]

mypath = '/home/vvasuki/Downloads/HTML/'
onlyfiles = [
    f for f in listdir(mypath) if isfile(join(mypath, f)) and f.startswith('v')
]
for file_name in onlyfiles:
    with open(join(mypath, file_name), 'r') as content_file:
        content = content_file.read()
        soup = BeautifulSoup(content,
                             convertEntities=BeautifulSoup.HTML_ENTITIES,
                             markupMassage=hexentityMassage)
        h2_items = soup.findAll('h2')
        li_items = soup.findAll('li')
        output = re.sub(r'<.+?>', r'', str(h2_items[0]))
        for item in li_items:
            output = output + ',' + re.sub(r'<.+?>', r'', str(item))
        print output
        # print ul_items
Example #25
    def playVideo(self):
        #
        # Init
        #
        no_url_found = False
        unplayable_media_file = False
        have_valid_url = False

        #
        # Get current list item details...
        #
        # title = unicode(xbmc.getInfoLabel("listitem.Title"), "utf-8")
        thumbnail_url = xbmc.getInfoImage("list_item.Thumb")
        # studio = unicode(xbmc.getInfoLabel("list_item.Studio"), "utf-8")
        plot = unicode(xbmc.getInfoLabel("list_item.Plot"), "utf-8")
        genre = unicode(xbmc.getInfoLabel("list_item.Genre"), "utf-8")

        #
        # Show wait dialog while parsing data...
        #
        dialog_wait = xbmcgui.DialogProgress()
        dialog_wait.create(LANGUAGE(30504), self.title)
        # wait 1 second
        xbmc.sleep(1000)

        # Set cookies for cookie-firewall and nsfw-switch
        if SETTINGS.getSetting('nsfw') == 'true':
            cookies = {"Cookie": "cpc=10", "nsfw": "1"}
        else:
            cookies = {"Cookie": "cpc=10"}

        # Make a session
        sess = requests.session()

        # Determine if cloudflare protection is active or not
        html_source = sess.get(self.video_page_url, cookies=cookies).text
        if str(html_source).find("cloudflare") >= 0:
            cloudflare_active = True
        else:
            cloudflare_active = False

        # Get the page
        if cloudflare_active == True:
            try:
                import cfscrape
            except:
                xbmcgui.Dialog().ok(LANGUAGE(30000), LANGUAGE(30513))
                sys.exit(1)
            try:
                # returns a CloudflareScraper instance
                scraper = cfscrape.create_scraper(sess)
            except:
                xbmcgui.Dialog().ok(LANGUAGE(30000), LANGUAGE(30514))
                sys.exit(1)
            try:
                html_source = scraper.get(self.video_page_url).content
            except:
                xbmcgui.Dialog().ok(LANGUAGE(30000), LANGUAGE(30515))
                sys.exit(1)

        soup = BeautifulSoup(html_source)

        video_url = ''
        # <div class="videoplayer" id="video1" data-files="eyJmbHYiOiJodHRwOlwvXC9tZWRpYS5kdW1wZXJ0Lm5sXC9mbHZcLzI4OTE2NWRhXzEwMjU1NzUyXzYzODMxODA4OTU1NDc2MV84MTk0MzU3MDVfbi5tcDQuZmx2IiwidGFibGV0IjoiaHR0cDpcL1wvbWVkaWEuZHVtcGVydC5ubFwvdGFibGV0XC8yODkxNjVkYV8xMDI1NTc1Ml82MzgzMTgwODk1NTQ3NjFfODE5NDM1NzA1X24ubXA0Lm1wNCIsIm1vYmlsZSI6Imh0dHA6XC9cL21lZGlhLmR1bXBlcnQubmxcL21vYmlsZVwvMjg5MTY1ZGFfMTAyNTU3NTJfNjM4MzE4MDg5NTU0NzYxXzgxOTQzNTcwNV9uLm1wNC5tcDQiLCJzdGlsbCI6Imh0dHA6XC9cL3N0YXRpYy5kdW1wZXJ0Lm5sXC9zdGlsbHNcLzY1OTM1MjRfMjg5MTY1ZGEuanBnIn0="></div></div>
        video_urls = soup.findAll('div', attrs={'class': re.compile("video")}, limit=1)
        if len(video_urls) == 0:
            no_url_found = True
        else:
            video_url_enc = video_urls[0]['data-files']
            # base64 decode
            video_url_dec = str(base64.b64decode(video_url_enc))
            # {"flv":"http:\/\/media.dumpert.nl\/flv\/5770e490_Jumbo_KOOP_DAN__Remix.avi.flv","tablet":"http:\/\/media.dumpert.nl\/tablet\/5770e490_Jumbo_KOOP_DAN__Remix.avi.mp4","mobile":"http:\/\/media.dumpert.nl\/mobile\/5770e490_Jumbo_KOOP_DAN__Remix.avi.mp4","720p":"http:\/\/media.dumpert.nl\/720p\/5770e490_Jumbo_KOOP_DAN__Remix.avi.mp4","still":"http:\/\/static.dumpert.nl\/stills\/6593503_5770e490.jpg"}
            # or
            # {"embed":"youtube:U89fl5fZETE","still":"http:\/\/static.dumpert.nl\/stills\/6650228_24eed546.jpg"}

            xbmc.log("[ADDON] %s v%s (%s) debug mode, %s = %s" % (
                    ADDON, VERSION, DATE, "video_url_dec", str(video_url_dec)), xbmc.LOGDEBUG)

            # convert string to dictionary
            video_url_dec_dict = ast.literal_eval(video_url_dec)

            video_url_embed = ''
            try:
                video_url_embed = str(video_url_dec_dict['embed'])
                embed_found = True
            except KeyError:
                embed_found = False

            video_url = ''
            if embed_found:
                # make youtube plugin url
                youtube_id = video_url_embed.replace("youtube:", "")
                youtube_url = 'plugin://plugin.video.youtube/play/?video_id=%s' % youtube_id
                video_url = youtube_url
                have_valid_url = True
                xbmc.log("[ADDON] %s v%s (%s) debug mode, %s = %s" % (
                        ADDON, VERSION, DATE, "video_url1", str(video_url)), xbmc.LOGDEBUG)
            else:
                # matching the desired and available quality
                if self.VIDEO == '0':
                    try:
                        video_url = str(video_url_dec_dict['mobile'])
                    except KeyError:
                        no_url_found = True
                elif self.VIDEO == '1':
                    try:
                        video_url = str(video_url_dec_dict['tablet'])
                    except KeyError:
                        try:
                            video_url = str(video_url_dec_dict['mobile'])
                        except KeyError:
                            no_url_found = True
                elif self.VIDEO == '2':
                    try:
                        video_url = str(video_url_dec_dict['720p'])
                    except KeyError:
                        try:
                            video_url = str(video_url_dec_dict['tablet'])
                        except KeyError:
                            try:
                                video_url = str(video_url_dec_dict['mobile'])
                            except KeyError:
                                no_url_found = True

                if no_url_found:
                    pass
                else:
                    video_url = video_url.replace('\/', '/')
                    xbmc.log("[ADDON] %s v%s (%s) debug mode, %s = %s" % (
                            ADDON, VERSION, DATE, "video_url2", str(video_url)), xbmc.LOGDEBUG)

                    # The need for speed: let's guess that the video-url exists
                    have_valid_url = True

        # Play video...
        if have_valid_url:
            list_item = xbmcgui.ListItem(path=video_url)
            xbmcplugin.setResolvedUrl(self.plugin_handle, True, list_item)
        #
        # Alert user
        #
        elif no_url_found:
            xbmcgui.Dialog().ok(LANGUAGE(30000), LANGUAGE(30505))
        elif unplayable_media_file:
            xbmcgui.Dialog().ok(LANGUAGE(30000), LANGUAGE(30506))
Example #26
import scraperwiki
from BeautifulSoup import BeautifulSoup

SERVER = "http://www.senado.gov.ar"
INDEX_PATH = "/web/senadores/senadores.php"
# This page lists all the "Senadores"
page = BeautifulSoup(scraperwiki.scrape(SERVER+INDEX_PATH))
# Find the tabel with all the senators
# Search by border color because of all the nested tables without id's or classes.
table = page.find("table", {"bordercolor":"#ece8e1"})
# Each row contains a senator
for row in table.findAll("tr")[1:]: # The first row is the header. Skip it.
    td = row.findAll("td")
    scraperwiki.datastore.save(["name",], {
        "name": td[1].text,
        # Replace height=50 with height=100 to get larger imgs      vvvvvvvvvvv
        "picture": SERVER+"/web/senadores/"+td[0].find("img")["src"][:-2]+"100",
        "district": td[2].text,
        "party": td[3].text,
    })
Example #27
# Imports assumed by this snippet (bs4, pymystem3 and nltk in particular are
# inferred from the names used below; they are not shown in the original).
import re
from os import listdir
from os.path import isfile, join
from string import punctuation
from collections import Counter
from bs4 import BeautifulSoup
from pymystem3 import Mystem
from nltk.corpus import stopwords

sites_path = 'sites'
words_path = 'words.txt'  # hypothetical output path; not given in the original snippet

sites = [f for f in listdir(sites_path) if isfile(join(sites_path, f))]
mystem = Mystem()
russian_stopwords = stopwords.words("russian")
words = []

print('Parsing sites......................')
files_len = len(sites)
for i in range(files_len):
    file = sites[i]
    print('Processing site ' + str(i) + '/' + str(files_len) + '. ' + file)
    html_file = open(sites_path + '/' + file, "r", encoding="utf-8")
    html = html_file.read().replace("<br>", " ")
    html_file.close()
    parsed_html = BeautifulSoup(html, features="html.parser")
    sentence = re.sub(r"[\n\s.,:–\\?—\-!()/»><;'*+©\"]+",
                      " ",
                      parsed_html.text,
                      flags=re.UNICODE).lower()  # makes normalization faster
    tokens = [token for token in sentence.split(" ") if token not in russian_stopwords \
              and token != " " \
              and token.strip() not in punctuation and len(token) > 1]
    words.extend(tokens)

words_file = open(words_path, "a", encoding="utf-8")
words_dict = Counter(words)

print('Dumping words to file......................')
for key, value in words_dict.items():
    words_file.write(key + " " + str(value) + "\n")
words_file.close()
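# If only the most frequent terms matter, Counter can produce them directly:
top_n = 50
for word, count in words_dict.most_common(top_n):
    print(word + " " + str(count))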
Example #28
 def testParseLoginError(self):
     p = open('./test/test-login-error.htm', 'r')
     page = BeautifulSoup(p.read())
     r = page.findAll('span', attrs={'class': 'error'})
     self.assertTrue(len(r) > 0)
     self.assertEqual(u'Please ensure your pixiv ID, email address and password is entered correctly.', r[0].string)
Example #29
    message = '''
PRICE: {0}
Started selling nintendo switch!!! at {1}
        '''.format(price, store_url)
    print 'came here'
    response = sns.publish(TopicArn=TOPIC_ARN,
                           Subject=subject,
                           Message=message)
    return response


amazon = bottlenose.Amazon(ADV_API_AWS_ACCESS_KEY_ID,
                           ADV_API_AWS_SECRET_ACCESS_KEY,
                           AWS_ASSOCIATE_TAG,
                           Region="JP",
                           ErrorHandler=error_handler)
response = amazon.ItemLookup(ItemId=ITEM_ID,
                             ResponseGroup="OfferSummary",
                             ErrorHandler=error_handler)
soup = BeautifulSoup(response)
item = soup.find("item")

price = min([
    int(item.offersummary.lowestnewprice.amount.contents[0]),
    int(item.offersummary.lowestusedprice.amount.contents[0]),
    int(item.offersummary.lowestcollectibleprice.amount.contents[0])
])
if price < TARGET_PRICE:
    #send sms
    send_sms_message('https://amazon.co.jp/dp/' + ITEM_ID, price)
Example #30
 def getFirstImageSrc(self):
     soup = BeautifulSoup(self._htmlContent)
     try:
         return soup.find('img').get('src')
     except (AttributeError, IndexError, KeyError, TypeError):
         return None
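# A standalone sketch of the same lookup for quick testing outside the class,
# also guarding against pages with no <img> tag; names here are illustrative.
from BeautifulSoup import BeautifulSoup

def first_image_src(html_content):
    img = BeautifulSoup(html_content).find('img')
    return img.get('src') if img is not None else None

print(first_image_src('<p><img src="/a.png"/> text</p>'))  # -> /a.png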