Example #1
def parse_url(url, today=False):
    canteen = LazyBuilder()

    content = urlopen(url).read()
    document = parse(content, 'lxml')

    available_weeks = parse_available_weeks(document)

    # handle the case where the start date is not auto-set by the page, e.g. on weekends
    noskip = find_start_date(document) is None

    employees_fee, guests_fee = parse_fees(document)
    groups = parse_ingredients(document)

    for idx, week in enumerate(available_weeks):
        if idx > 0 or noskip:
            content = urlopen("{}?selWeek={}".format(url, week)).read()
            document = parse(content, 'lxml')

        parse_meals_for_canteen(document, canteen, employees_fee, guests_fee,
                                groups, today)
        if today:
            break

    return canteen.toXMLFeed()
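The examples in this collection share a preamble that is not reproduced here: parse is evidently BeautifulSoup, urlopen comes from urllib, and LazyBuilder / OpenMensaCanteen are feed builders from the pyopenmensa package. A minimal sketch of that presumed setup (an assumption, not part of any example):

# Presumed common setup for the canteen parsers in this collection (assumption):
from urllib.request import urlopen                    # the raw_input scripts below are Python 2 and use requests/urllib2 instead
from bs4 import BeautifulSoup as parse                # matches calls like parse(content, 'lxml')
from pyopenmensa.feed import LazyBuilder, OpenMensaCanteen   # feed builders used below
# buildLegend and extractDate are also assumed to come from pyopenmensa.feed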
Example #2
def create_opinions(user_id):
    """
    retrieve the recipes the user visited but didn't comment on,
    format them, and return them as forms intended for the left-hand panel
    @param user_id the id of the user
    @return string containing all the opinion forms
    """
    search_rows = db_execute_out("""
        SELECT DISTINCT recipe_id
        FROM search
        WHERE user_id LIKE {0}
        AND recipe_id NOT NULL
        AND recipe_id NOT IN (
            SELECT DISTINCT recipe_id
            FROM opinions
            WHERE author LIKE {0}
        );
    """.format(user_id))

    if search_rows == [] or search_rows is None:
        return parse(
            """
            <h4>How did you find these recipes?</h4><p>No recipe to comment</p>
        """, 'lxml').prettify(formatter='html')
    opinion_list = format_recipes([x[0] for x in search_rows])
    # constructing the web page part
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    with open(config.get('html', 'opinion_form_path')) as _fd:
        search_panel = _fd.read()
    soup = parse('<h4>How did you find these recipes?</h4><div></div>',
                 'lxml')
    form_group = soup.div
    form_group['class'] = 'container-fluid'
    # creating a form for each recipe
    for recipe in opinion_list:
        form = parse(search_panel, 'lxml')
        # hidden info
        r_id = form.select('input#$recipe_info')[0]
        r_id['id'] = 'recipe_info_{}'.format(str(recipe['id']))
        r_id['value'] = str(recipe['id'])

        u_id = form.select('input#$user_info')[0]
        u_id['id'] = 'user_info_{}'.format(str(recipe['id']))
        u_id['value'] = str(user_id)

        # the form
        head = form.select('form#$id_form')[0]
        head['id'] = '{}_{}_form_head'.format(str(user_id), str(recipe['id']))
        # the button
        button = form.select('button#$id_button')[0]
        button['id'] = '{}_{}_form'.format(str(user_id), str(recipe['id']))
        # the img
        img = form.select('img')[0]
        img['src'] = recipe['img']
        # the fav button
        fav_button = form.select('button#$fav_id')[0]
        fav_button['id'] = 'fav_{}_{}'.format(str(user_id), str(recipe['id']))
        form_group.append(form)
    return soup.prettify(formatter='html')
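The query above interpolates user_id directly into the SQL string via str.format(). If the backing store is SQLite (an assumption, since db_execute_out is not shown), the same lookup can be written with bound parameters; fetch_uncommented_recipe_ids is a hypothetical helper name:

import sqlite3

def fetch_uncommented_recipe_ids(conn, user_id):
    # Same SELECT as above, but with placeholders instead of str.format()
    cur = conn.execute(
        """
        SELECT DISTINCT recipe_id
        FROM search
        WHERE user_id LIKE ?
          AND recipe_id IS NOT NULL
          AND recipe_id NOT IN (
              SELECT DISTINCT recipe_id FROM opinions WHERE author LIKE ?
          )
        """,
        (user_id, user_id),
    )
    return [row[0] for row in cur.fetchall()]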
Example #3
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {'f': 'fleischloses Gericht', 'v': 'veganes Gericht'}
    document = parse(urlopen(base + '/speiseplan/zusatzstoffe-de.html').read())
    for td in document.find_all('td', 'beschreibung'):
        legend[td.previous_sibling.previous_sibling.text] = td.text
    document = parse(urlopen(base + '/unsere-preise/').read())
    prices = {}
    for tr in document.find('table', 'essenspreise').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')
    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read())
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.date.resolution
                continue
            else:
                raise e
        else:
            errorCount = 0
        for tr in document.find('table', 'zusatzstoffe').find_all('tr'):
            identifier = tr.find_all('td')[0].text \
                           .replace('(', '').replace(')', '')
            legend[identifier] = tr.find_all('td')[1].text.strip()
        canteen.setLegendData(legend)
        mensa_data = document.find('table', 'menu')
        category = None
        for menu_tr in mensa_data.find_all('tr'):
            if menu_tr.find('td', 'headline'):
                continue
            if menu_tr.find('td', 'gericht').text:
                category = menu_tr.find('td', 'gericht').text
            data = menu_tr.find('td', 'beschreibung')
            name = data.find('span').text.strip()
            notes = [span['title'] for span in data.find_all('span', title=True)]
            canteen.addMeal(
                date, category, name, notes,
                prices.get(category.replace('Aktionsessen', 'Bio-/Aktionsgericht'), {})
            )
        date += datetime.date.resolution
        if today:
            break
    return canteen.toXMLFeed()
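This parser relies on module-level names that are not shown, notably base and price_regex. Plausible definitions, given purely as assumptions, would be the site root and a pattern exposing the named group used above:

import re

base = 'https://www.studentenwerk.example'   # hypothetical site root
# matches amounts like "2,60" and exposes them via the 'price' group used above
price_regex = re.compile(r'(?P<price>\d+,\d{2})')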
Example #4
def create_opinions(user_id):
    """
    retrieve the recipes the user visited but didn't comment on,
    format them, and return them as forms intended for the left-hand panel
    @param user_id the id of the user
    @return string containing all the opinion forms
    """
    search_rows = db_execute_out("""
        SELECT DISTINCT recipe_id
        FROM search
        WHERE user_id LIKE {0}
        AND recipe_id NOT NULL
        AND recipe_id NOT IN (
            SELECT DISTINCT recipe_id
            FROM opinions
            WHERE author LIKE {0}
        );
    """.format(user_id))

    if search_rows == [] or search_rows is None:
        return parse("""
            <h4>How did you find these recipes?</h4><p>No recipe to comment</p>
        """, 'lxml').prettify(formatter='html')
    opinion_list = format_recipes([x[0] for x in search_rows])
    # constructing the web page part
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    with open(config.get('html', 'opinion_form_path')) as _fd:
        search_panel = _fd.read()
    soup = parse('<h4>How did you find these recipes?</h4><div></div>', 'lxml')
    form_group = soup.div
    form_group['class'] = 'container-fluid'
    # creating a form for each recipe
    for recipe in opinion_list:
        form = parse(search_panel, 'lxml')
        # hidden info
        r_id = form.select('input#$recipe_info')[0]
        r_id['id'] = 'recipe_info_{}'.format(str(recipe['id']))
        r_id['value'] = str(recipe['id'])

        u_id = form.select('input#$user_info')[0]
        u_id['id'] = 'user_info_{}'.format(str(recipe['id']))
        u_id['value'] = str(user_id)

        # the form
        head = form.select('form#$id_form')[0]
        head['id'] = '{}_{}_form_head'.format(str(user_id), str(recipe['id']))
        # the button
        button = form.select('button#$id_button')[0]
        button['id'] = '{}_{}_form'.format(str(user_id), str(recipe['id']))
        # the img
        img = form.select('img')[0]
        img['src'] = recipe['img']
        # the fav button
        fav_button = form.select('button#$fav_id')[0]
        fav_button['id'] = 'fav_{}_{}'.format(str(user_id), str(recipe['id']))
        form_group.append(form)
    return soup.prettify(formatter='html')
Example #5
def parse_week(url, canteen):
    document = parse(urlopen(url).read())
    for day_table in document.find_all("table", "speiseplan"):
        try:
            date = extractDate(day_table.thead.tr.th.text)
        except ValueError:
            # There was no valid date in the table header, which happens eg
            # for special "Aktionswoche" tables.
            # TODO: check if this table contains any meals, which was not the
            #       case when it was used for the first time.
            continue
        if day_table.find("td", "keinangebot"):
            canteen.setDayClosed(date)
            continue
        for meal_tr in day_table.tbody.children:
            if len(meal_tr.find_all("a") or []) < 1:
                continue
            name = meal_tr.td.text
            if ": " in name:
                category, name = name.split(": ", 1)
            else:
                category = "Angebote"
            if len(name) > 200:
                name = name[:200] + " ..."
            notes = []
            for img in meal_tr.contents[1].find_all("img"):
                notes.append(img["title"])
            canteen.addMeal(date, category, name, notes, price_regex.findall(meal_tr.contents[2].text), roles)
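As in the previous parser, price_regex and roles are assumed module-level helpers; a minimal sketch consistent with the addMeal() call above:

import re

price_regex = re.compile(r'\d+,\d{2}')     # every price found in the third column
roles = ('student', 'employee', 'other')   # order must match the extracted prices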
Example #6
def instagram():
	url = raw_input('\nURL : ')
	xxx = raw_input('\nDownload? (y/n) ')
	if 'y' in xxx:
		bra = raw_input('Output : ')
		print('{}\n[!] Loading...'.format(R))
		save = r.get(url).text
		soup = parse(save, 'html.parser')
		love = soup.findAll('script', type='text/javascript')
		for heart in love:
			if 'window._sharedData = ' in heart.text:
				pakboi = heart.text.replace('window._sharedData = ','').replace(';','')
		pakboi = json.loads(pakboi)
		pakboi = pakboi["entry_data"]['PostPage'][0]["graphql"]['shortcode_media']["video_url"]
#		print('{}[!] Sedang Mendownload...'.format(R))
		time.sleep(7)
		pakgerl = r.get(pakboi)
		pants = open(bra, 'wb')
		pants.write(pakgerl.content)
		pants.close()
		print('{}[!] Download Berhasil'.format(GL))
		time.sleep(3)
		print('{}\n[!] Salin File ke internal'.format(R))
		time.sleep(3)
		print('{}[!] Berhasil\n\n\n{}Periksa Pada Internal!!!'.format(GL,BL))
		time.sleep(2)
		os.system('cp '+bra+' /sdcard && rm -rf '+bra)
Example #7
def facebook():
    try:
        url = raw_input('\nURL : ')
        xxx = raw_input('\nDownload? (y/n) ')
        if 'y' in xxx:
            bra = raw_input('Output : ')
            print('{}\n[!] Loading...'.format(R))
            save = r.get(url).text
            # print save
            sop = parse(save, "html.parser")
            res = sop.find("script", type="application/ld+json")
            a = json.loads(res.text)
            b = a['contentUrl']
            time.sleep(7)
            c = r.get(b)
            d = open(bra, 'wb')
            d.write(c.content)
            d.close()
            print('{}[!] Download Berhasil'.format(GL))
            time.sleep(3)
            print('{}\n[!] Salin File ke Internal'.format(R))
            time.sleep(3)
            print('{}[!] Berhasil\n\n\n{}PERIKSA PADA INTERNAL!!!'.format(GL, BL))
            time.sleep(2)
            os.system('cp ' + bra + ' /sdcard && rm -rf ' + bra)
    except KeyboardInterrupt:
        exit()
    except:
        print('URL TIDAK VALID')
        time.sleep(int("2"))
        os.system('python2 main.py')
Example #8
def facebook():
    try:
        url = raw_input('\n[?] URL : ')
        ct = raw_input('[?] Download? (y/n): ')
        if 'y' in ct:
            file = raw_input('[?] File Name : ')
            print('[!] Loading...')
            save = r.get(url).text

            sop = parse(save, "html.parser")
            res = sop.find("script", type="application/ld+json")
            a = json.loads(res.text)
            b = a['contentUrl']
            time.sleep(5)
            c = r.get(b)
            d = open(file, 'wb')
            d.write(c.content)
            d.close()
            print('[!] Download Success')
            time.sleep(1)
            print('[!] Copy File to Internal')
            time.sleep(1)
            print('[!] \x1b[32;1mSuccessfully\x1b[37;1m')
            time.sleep(2)
            os.system('cp ' + file + ' /sdcard && rm -rf ' + file)
    except KeyboardInterrupt:
        exit()
    except:
        print('[!]\x1b[31;1m URL FAILED\x1b[37;1m')
        time.sleep(int("2"))
        os.system('python2 dl.py')
Example #9
def parse_week(url, date, canteen):
    url += '/{0}/{1:0>2}/'.format(*date.isocalendar())
    document = parse(urlopen(url).read())
    week_data = document.find('table', id='week-menu')
    if week_data is None:
        print('week not found')
        return
    weekDays = extractWeekDates(week_data.thead.find_all('th')[0].text)
    for category_tr in week_data.find_all('tr'):
        category = category_tr.find('th').text
        i = 0
        for day_td in category_tr.find_all('td'):
            for meal_data in day_td.find_all('p', 'dish'):
                if not meal_data.find('strong'):
                    continue
                name = extra_regex.sub('', meal_data.find('strong').text)
                name = strip_regex.sub(' ', name).strip()
                if len(name) > 250:
                    name = name[:245] + '...'
                notes = [
                    span['title']
                    for span in meal_data.find_all('span', 'tooltip')
                ]
                notes += [img['title'] for img in meal_data.find_all('img')]
                prices = price_regex.findall(
                    meal_data.find('span', 'price').text)
                canteen.addMeal(weekDays[i], category, name, list(set(notes)),
                                prices, ('student', 'employee', 'other'))
            i += 1
Example #10
def instagram():
    url = input('\n[?] URL : ')
    ct = input('[?] Download? (y/n): ')
    if 'y' in ct:
        bra = input('[?] File Name: ')
        print('[!] Loading...')
        save = r.get(url).text
        soup = parse(save, 'html.parser')
        love = soup.findAll('script', type='text/javascript')
        for heart in love:
            if 'window._sharedData = ' in heart.text:
                jonson = heart.text.replace('window._sharedData = ',
                                            '').replace(';', '')
        jonson = json.loads(jonson)
        jonson = jonson["entry_data"]['PostPage'][0]["graphql"][
            'shortcode_media']["video_url"]
        time.sleep(5)
        alukar = r.get(jonson)
        pants = open(bra, 'wb')
        pants.write(alukar.content)
        pants.close()
        print('[!] \x1b[32;1mDownload Successful\x1b[37;1m')
        time.sleep(2)
        os.system('cp ' + bra + ' /sdcard && rm -rf ' + bra)
    exit()
Example #11
def parse_week(url, canteen, mensa):
    document = parse(urlopen(url).read())
    # extra legends information
    canteen.setLegendData(text=document.find(text='Kennzeichnung: ').parent.next_sibling.get_text().replace('&nbsp;', ' '))
    # additional charges
    prices = {}
    for p in document.find_all('p'):
        match = employeePrice.search(p.text)
        if match:
            prices['employee'] = match.group('price')
        match = otherPrice.search(p.text)
        if match:
            prices['other'] = match.group('price')
    if len(prices) != 2:
        print('Could not extract additional charges for employee and others')
    canteen.setAdditionalCharges('student', prices)
    # find
    mensa_data = document.find('h1', text=re.compile(mensa)).parent
    while type(mensa_data) != Tag or mensa_data.name != 'div'\
            or 'tx-cagcafeteria-pi1' not in mensa_data.get('class', []):
        mensa_data = mensa_data.next_sibling
    weekDays = extractWeekDates(mensa_data.find('h2').text)
    for day_headline in mensa_data.find_all('h3'):
        date = weekDays[day_headline.text]
        day_table = day_headline.next_sibling.next_sibling
        for tr_menu in day_table.tbody.find_all('tr'):
            category = tr_menu.find_all('td')[0].text.strip()
            name = tr_menu.find_all('td')[1].text.replace('\r\n', ' ').strip()
            canteen.addMeal(date, category, name, [], tr_menu.find_all('td')[2].text)
Example #12
def parse_week(url, date, canteen):
    url += '/{0}/{1:0>2}/'.format(*date.isocalendar())
    document = parse(urlopen(url).read())
    week_data = document.find('table', id='week-menu')
    if week_data is None:
        print('week not found')
        return
    weekDays = extractWeekDates(week_data.thead.find_all('th')[0].text)
    for category_tr in week_data.find_all('tr'):
        category = category_tr.find('th').text
        i = 0
        for day_td in category_tr.find_all('td'):
            for meal_data in day_td.find_all('p', 'dish'):
                if not meal_data.find('strong'):
                    continue
                name = extra_regex.sub('', meal_data.find('strong').text)
                name = strip_regex.sub(' ', name).strip()
                if len(name) > 250:
                    name = name[:245] + '...'
                notes = [span['title'] for span in meal_data.find_all('span', 'tooltip')]
                notes += [img['title'] for img in meal_data.find_all('img')]
                prices = price_regex.findall(meal_data.find('span', 'price').text)
                canteen.addMeal(weekDays[i], category, name,
                                list(set(notes)),
                                prices, ('student', 'employee', 'other')
                                )
            i += 1
Example #13
def add_options_to_form(table_name, form, tag_id):
    """
    Add the first two columns of each row of the given table_name (typically
    id and name) as options to the element matching tag_id in the form
    @param table_name the name of the table
    @param form       an option in the config file containing the path to an html file
    @param tag_id     the tag id in the form (example: select#type)
    """
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    # adding types to the search form
    types = db_execute_out("SELECT * FROM " + table_name + " ORDER BY name;")
    form_path = config.get('html', form)
    _fd = open(form_path)
    soup = parse(_fd.read(), "lxml")
    _fd.close()

    soup.select(tag_id)[0].string = ''
    for row in types:
        opt = soup.new_tag('option')
        opt.string = row[1]
        opt['value'] = row[0]
        soup.select(tag_id)[0].append(opt)

    # writing the html file
    html = soup.prettify(formatter='html')
    with open(form_path, "wb") as _fd:
        _fd.write(html)
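A usage sketch for add_options_to_form(); the table name, config option and selector below are hypothetical, and CONFIG_FILE is assumed to map option names to template paths in its [html] section:

# CONFIG_FILE is assumed to contain something like:
#   [html]
#   search_form = templates/search_form.html
# Fill the <select id="type"> element of that template from a "types" table:
add_options_to_form('types', 'search_form', 'select#type')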
Example #14
 def madeinfo(self, link):
     infos = []
     desc = []
     r = requests.get(link)
     s = parse(r.content, 'lxml')
     data1 = s.find("div", {"class": "fl-l score"}).attrs
     data1_rate = s.find("div", {"class": "fl-l score"}).get_text()
     data1_rate = data1_rate.replace(' ', '')
     data1_rate = data1_rate.replace('\n', '')
     data2_rank = s.find("span", {"class": "numbers ranked"}).get_text()
     data3_info = s.find("span", {"itemprop": "description"}).get_text()
     desc.append(data3_info)
     data4_episodes = s.find_all("div", {"class": "spaceit"})[0].get_text()
     data4_episodes = data4_episodes.replace('\n', '')
     data4_status = s.find_all("div", {"class": "spaceit"})[1].get_text()
     data4_status = data4_status.replace('\n', '')
     data4_air = s.find_all("div", {"class": "spaceit"})[2].get_text()
     data4_air = data4_air.replace('\n', '')
     data5_image = s.find("img", {"class": "lazyloaded"}, src=True)
     infos.append(data4_episodes + "\n" + data4_status + "\n" + data4_air)
     return {
         "users": data1["data-user"],
         "rating": data1_rate,
         "rank": data2_rank,
         "inf": desc,
         "add": infos,
         "image": data5_image["src"]
     }
Example #15
def parse_url(url, today=False, canteentype="Mittagsmensa", this_week="", next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[: url.find("essen/") + 6] + "wissenswertes/lebensmittelkennzeichnung"
    legend_doc = parse(urlopen(legend_url)).find(id="artikel")
    allergene = buildLegend(
        text=legend_doc.text.replace("\xa0", " "), regex=r"(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)"
    )
    allergene["EI"] = "Ei"
    zusatzstoffe = buildLegend(
        text=legend_doc.text.replace("\xa0", " "), regex=r"(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)"
    )
    for tr in legend_doc.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) != 2:
            continue
        title = tds[0].find("strong")
        if title is None:
            continue
        else:
            title = title.text
        text = tds[1].text.replace("enthält", "").strip()
        if title.isdigit():
            zusatzstoffe[title] = text
        else:
            allergene[title] = text
    parse_week(url + this_week, canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and next_week is True:
        parse_week(url + "-kommende-woche", canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and type(next_week) is str:
        parse_week(url + next_week, canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe)
    print(canteen.toXMLFeed())
    return canteen.toXMLFeed()
Example #16
def add_options_to_form(table_name, form, tag_id):
    """
    Add the first two columns of each row of the given table_name (typically
    id and name) as options to the element matching tag_id in the form
    @param table_name the name of the table
    @param form       an option in the config file containing the path to an html file
    @param tag_id     the tag id in the form (example: select#type)
    """
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    # adding types to the search form
    types = db_execute_out("SELECT * FROM "+ table_name +" ORDER BY name;")
    form_path = config.get('html', form)
    _fd = open(form_path)
    soup = parse(_fd.read(), "lxml")
    _fd.close()

    soup.select(tag_id)[0].string = ''
    for row in types:
        opt = soup.new_tag('option')
        opt.string = row[1]
        opt['value'] = row[0]
        soup.select(tag_id)[0].append(opt)

    # writing the html file
    html = soup.prettify(formatter='html')
    with open(form_path, "wb") as _fd:
        _fd.write(html)
Example #17
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges("student", {"other": 1.5})

    document = parse(urlopen(url).read())

    global legend
    regex = r"\((?P<name>[\dA-Z]+)\)\s*(?P<value>[\w\s]+)"
    legend = buildLegend(legend, document.find(id="additives").text, regex=regex)

    days = (
        "montag",
        "dienstag",
        "mittwoch",
        "donnerstag",
        "freitag",
        "montagNaechste",
        "dienstagNaechste",
        "mittwochNaechste",
        "donnerstagNaechste",
        "freitagNaechste",
    )
    for day in days:
        data = document.find("div", id=day)
        headline = document.find("a", attrs={"data-anchor": "#" + day})
        parse_day(canteen, headline.text, data)
    return canteen.toXMLFeed()
Example #18
    def animeSearch(self, query: str = None):
        '''Getting the Result List'''

        try:
            if query == None:
                print("Missing Anime Name!")
                return

            anime_names = []
            text = query.lower()
            text = text.replace(' ', '+')
            link_anime = "https://myanimelist.net/anime.php?q=" + text + "&type=0&score=0&status=0&p=0&r=0&sm=0&sd=0&sy=0&em=0&ed=0&ey=0&c[]=a&c[]=b&c[]=c&c[]=f&gx=0"
            r = requests.get(link_anime)
            s = parse(r.content, 'lxml')
            data_names = s.find_all("a",
                                    {"class": "hoverinfo_trigger fw-b fl-l"})
            refined = "None"
            for x in data_names:
                names = x.text
                anime_names.append(names)
            refined = '\n'.join(anime_names[:7])
            return refined

        except Exception as e:
            print(e)
Example #19
def parse_week(url, canteen):
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'speiseplan'):
        try:
            date = extractDate(day_table.thead.tr.th.text)
        except ValueError:
            # There was no valid date in the table header, which happens eg
            # for special "Aktionswoche" tables.
            # TODO: check if this table contains any meals, which was not the
            #       case when it was used for the first time.
            continue
        if day_table.find('td', 'keinangebot'):
            canteen.setDayClosed(date)
            continue
        for meal_tr in day_table.tbody.children:
            if len(meal_tr.find_all('a') or []) < 1:
                continue
            name = meal_tr.td.text
            if ': ' in name:
                category, name = name.split(': ', 1)
            else:
                category = 'Angebote'
            if len(name) > 200:
                name = name[:200] + ' ...'
            notes = []
            for img in meal_tr.contents[1].find_all('img'):
                notes.append(img['title'])
            canteen.addMeal(date, category, name, notes,
                            price_regex.findall(meal_tr.contents[2].text),
                            roles)
Example #20
    def animeData(self, query: str = None):
        '''Getting the links to the selected result'''

        try:
            if query == None:
                print("Missing Name!")
                return

            anime_links = []
            anime_names = []
            query = query.lower()
            text = query.replace(' ', '+')
            link_anime = "https://myanimelist.net/anime.php?q=" + text + "&type=0&score=0&status=0&p=0&r=0&sm=0&sd=0&sy=0&em=0&ed=0&ey=0&c[]=a&c[]=b&c[]=c&c[]=f&gx=0"
            r = requests.get(link_anime)
            s = parse(r.content, 'lxml')
            data_links = s.find_all("a",
                                    {"class": "hoverinfo_trigger fw-b fl-l"})
            for x in data_links:
                names = x.text.lower()
                links = x["href"]
                anime_links.append(links)
                anime_names.append(names)
            anime_names = anime_names[:7]
            anime_links = anime_links[:7]
            link_found = "None"

            if query in anime_names:
                n = anime_names.index("{}".format(query))
                link_found = anime_links[n]

            datas = self.madeinfo(link_found)
            return datas

        except Exception as e:
            print(e)
Example #21
def parse_url(url,
              today=False,
              canteentype='Mittagsmensa',
              this_week='',
              next_week=True,
              legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[:url.find('essen/') +
                         6] + 'wissenswertes/lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url), 'lxml').find(id='artikel')
    allergene = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)')
    allergene['EI'] = 'Ei'
    zusatzstoffe = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)')
    suballergene = re.compile(
        r'(?P<name>[0-9A-Z]+)[^a-zA-Z]*enthält (?P<value>\w+( |\t|\w)*)')
    for tr in legend_doc.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 2:
            continue
        title = tds[0].find('strong')
        if title is None:
            continue
        else:
            title = title.text
        lines = tds[1].text.split('\n')
        for line in lines[1:]:
            try_allergine = suballergene.match(line)
            if try_allergine:
                allergene[try_allergine.group('name')] = try_allergine.group(
                    'value')
        text = lines[0].replace('enthält', '').strip()
        if title.isdigit():
            zusatzstoffe[title] = text
        else:
            allergene[title] = text
    parse_week(url + this_week,
               canteen,
               canteentype,
               allergene=allergene,
               zusatzstoffe=zusatzstoffe)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche',
                   canteen,
                   canteentype,
                   allergene=allergene,
                   zusatzstoffe=zusatzstoffe)
    if not today and type(next_week) is str:
        parse_week(url + next_week,
                   canteen,
                   canteentype,
                   allergene=allergene,
                   zusatzstoffe=zusatzstoffe)
    return canteen.toXMLFeed()
Example #22
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + document.find_all(
            'article', attrs={'data-day': True}):
        # parse date; note: the year is not in the string and must be calculated
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = "{}-{}-{}".format(
                year,
                date_test.group('month'),
                date_test.group('day'),
            )
        if 'nodata' in day_div.attrs.get('class',
                                         []) or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue
        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                closed_candidate = True
                continue
            category = meal_article.find('div')['title']
            notes = [
                v['title'] for v in meal_article.find_all('div', 'theicon')
                if v['title']
            ]
            if meal_article.find('div', 'additive'):
                notes += [
                    v[0] for v in extra_regex.findall(
                        meal_article.find('div', 'additive').text)
                ]
            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue
            prices = {}
            for v, r in (('default', 'student'), ('bed', 'employee'),
                         ('guest', 'other')):
                price = price_regex.search(price_div['data-' + v])
                if price:
                    prices[r] = price.group('price')
                elif v == 'default':
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
Example #23
def parse_week(url, canteen):
    data = urlopen(url).read().decode('utf-8')
    document = parse(data, 'lxml')

    # The day plans are in a div with no special class or id. Thus
    # we try to find a div with a heading "Speiseplan "
    for week_heading in document(class_='swdd-ueberschrift',
                                 text=speiseplan_regex):
        week_div = week_heading.parent

        # The meals for each day are in a card. Again there is no class or id
        # to select the meal cards, so we look for all cards with a
        # card-header, which stores the date
        for card_header in week_div.find_all(class_='card-header'):
            day_card = card_header.parent

            try:
                date = extractDate(card_header.text)
            except ValueError:
                # There was no valid date in the table header, which happens eg
                # for special "Aktionswoche" cards.
                # TODO: check if this card contains any meals, which was not the
                #       case when it was used for the first time.
                continue

            # Check if there is a "kein Angebot" item
            if day_card.find(class_='list-group-item', text=kein_angebot_regex):
                canteen.setDayClosed(date)
                continue

            # Iterate over the list-group-items within the card, which are
            # used for individual meals
            for meal in day_card.find_all(class_='list-group-item'):

                name = meal.find(name='span')
                if name is not None:
                    name = name.text
                else:
                    continue

                if ': ' in name:
                    category, name = name.split(': ', 1)
                else:
                    category = 'Angebote'

                notes = [img['alt'] for img in meal.find_all(class_='swdd-spl-symbol')]

                if '* ' in name:
                    name, note = name.split('* ', 1)
                    notes.append(note)

                if meal.strong is not None:
                    prices = price_regex.findall(meal.strong.text)
                else:
                    prices = []

                canteen.addMeal(date, category, name, notes,
                                prices, roles)
Example #24
def GetData():
    # http://www.cbr.ru/ - official site of the Central Bank of Russia (CBR)
    cbr = req.urlopen("http://www.cbr.ru/").read().decode("utf-8")
    Data = parse(cbr, 'html.parser')
    CurUSDnEUR = Data.find_all('td', {"class": "weak"})
    CurUSD, CurEUR = CurUSDnEUR[0].get_text(), CurUSDnEUR[1].get_text()
    Date = Data.find_all('a', {"href": re.compile(r"/currency_base/daily\.aspx\?date_req=\d{2}\.\d{2}\.\d{4}")})
    CurDate, NextDate = Date[0].get_text(), Date[1].get_text()
    return (CurDate, CurUSD, CurEUR)
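A usage sketch for GetData(); it assumes the imports implied by the function body (urllib.request as req, re, and BeautifulSoup as parse):

if __name__ == '__main__':
    cur_date, usd, eur = GetData()
    print('CBR rates for {}: USD {} / EUR {}'.format(cur_date, usd, eur))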
Example #25
def get_recipe(url, base):
    """
    Retrieve a web page and extract information from it
    @param url the url of the web page to analyze
    @param base the base url of the web site
    @return a dictionary containing all the information about a recipe, or just
            the urls found on the page and the url of the page:
            {url, name, img, type, ingredients, add_urls} or
            {url, add_urls}
    """
    web_page = urllib2.urlopen(url)
    html = web_page.read()

    soup = parse(html.decode('utf8', 'replace'), "lxml")

    # urls on marmiton
    _urls = []
    for i in soup.find_all('a'):
        curr_url = i.get('href')
        if curr_url is not None:
            if base + 'recettes/' in curr_url:
                _urls.append(curr_url)

    # ingredients on marmiton
    ingr_list = []
    for i in soup.find_all('div'):
        if i.get('class') is not None:
            if 'm_content_recette_ingredients' in i.get('class'):
                ingr_list = str(i).split('<br/>')
                ingr_list = clean_ingredients(ingr_list)

    # image on marmiton
    _img = ''
    for i in soup.find_all('a'):
        if i.get('class') == ['m_content_recette_illu']:
            _img = i.findChildren()[0].get('src')

    if len(ingr_list) == 0 or _img == '':
        return {'url': url, 'add_urls': _urls}

    # title on marmiton
    title = soup.title.string
    title = re.sub(r'[\r|\n|\t]*', '', title)
    title = re.sub(r'\"', '', title)
    title = unicodedata.normalize('NFD', title).encode('utf8', 'ignore')

    # type
    _type = determine_type(title)

    return {
        'url': url,
        'name': title,
        'img': _img,
        'type': _type,
        'ingredients': ingr_list,
        'add_urls': _urls
    }
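A hypothetical crawl step built on get_recipe(); the seed URL below is made up for illustration:

base = 'http://www.marmiton.org/'
seed = base + 'recettes/recette_exemple.aspx'   # hypothetical recipe page
result = get_recipe(seed, base)
if 'name' in result:
    print('{} - {} ingredients'.format(result['name'], len(result['ingredients'])))
for link in result['add_urls'][:5]:
    print('queued: {}'.format(link))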
Example #26
def create_favs(user_id):
    """
    retrieve the user's favorite recipes, format them, and return them
    @param user_id the id of the user
    @return favorites recipes formatted in html
    """
    fav_rows = db_execute_out("""
        SELECT idRecipe
        FROM user_has_favorite_recipes
        WHERE idUser LIKE \"{}\";
    """.format(user_id))
    if fav_rows == []:
        return parse(
            """
            <h4>Favorite List :</h4><p>No favorite</p>
        """, 'lxml').prettify(formatter='html')
    favorite_list = format_recipes([x[0] for x in fav_rows])
    # constructing the web page part
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    _fd = open(config.get('html', 'fav_panel'))
    fav_panel = _fd.read()
    _fd.close()
    soup = parse('<h4>Favorite List :</h4><div></div>', 'lxml')
    panel_group = soup.div
    panel_group['class'] = 'container-fluid'
    # creating a panel for each recipe
    for recipe in favorite_list:
        panel = parse(fav_panel, 'lxml')
        # the well
        well = panel.select('div#$id_fav')[0]
        well['id'] = 'well_unfav_{}_{}'.format(str(user_id), str(recipe['id']))
        unfav = panel.select('button#$unfav_id')[0]
        unfav['id'] = 'unfav_{}_{}'.format(str(user_id), str(recipe['id']))
        # the img
        img = panel.select('img#$fav_img')[0]
        img['id'] = str(recipe['id']) + '_favimg'
        img['src'] = recipe['img']
        # the url
        url = panel.select('a#$fav_url')[0]
        url['id'] = str(recipe['id']) + '_favurl'
        url['href'] = recipe['url']
        panel_group.append(panel)
    return soup.prettify(formatter='html')
Example #27
def create_favs(user_id):
    """
    retrieve the user's favorite recipes, format them, and return them
    @param user_id the id of the user
    @return favorites recipes formatted in html
    """
    fav_rows = db_execute_out("""
        SELECT idRecipe
        FROM user_has_favorite_recipes
        WHERE idUser LIKE \"{}\";
    """.format(user_id))
    if fav_rows == []:
        return parse("""
            <h4>Favorite List :</h4><p>No favorite</p>
        """, 'lxml').prettify(formatter='html')
    favorite_list = format_recipes([x[0] for x in fav_rows])
    # constructing the web page part
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    _fd = open(config.get('html', 'fav_panel'))
    fav_panel = _fd.read()
    _fd.close()
    soup = parse('<h4>Favorite List :</h4><div></div>', 'lxml')
    panel_group = soup.div
    panel_group['class'] = 'container-fluid'
    # creating a panel for each recipe
    for recipe in favorite_list:
        panel = parse(fav_panel, 'lxml')
        # the well
        well = panel.select('div#$id_fav')[0]
        well['id'] = 'well_unfav_{}_{}'.format(str(user_id), str(recipe['id']))
        unfav = panel.select('button#$unfav_id')[0]
        unfav['id'] = 'unfav_{}_{}'.format(str(user_id), str(recipe['id']))
        # the img
        img = panel.select('img#$fav_img')[0]
        img['id'] = str(recipe['id'])+'_favimg'
        img['src'] = recipe['img']
        # the url
        url = panel.select('a#$fav_url')[0]
        url['id'] = str(recipe['id'])+'_favurl'
        url['href'] = recipe['url']
        panel_group.append(panel)
    return soup.prettify(formatter='html')
Example #28
def parse_week(url, data, canteen):
    document = parse(urlopen(url, data).read())
    # parse extra/notes legend
    legends = {}
    legendsData = document.find('table', 'zusatz_std')
    if legendsData:
        legends = {int(v[0]): v[1] for v in legend_regex.findall(legendsData.text.replace('\xa0', ' '))}
    data = document.find('table', 'wo_std')
    if not data:
        message = document.find('div', 'Meldung_std')
        if message:
            m = day_range_regex.search(message.text)
            if m:
                fromDate = datetime.datetime.strptime(m.group('from') + '.' + m.group('year'), '%d.%m.%Y')
                toDate = datetime.datetime.strptime(m.group('to'), '%d.%m.%Y')
                while fromDate <= toDate:
                    canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                    fromDate += datetime.date.resolution
        return
    # iterate over all rows of the table
    rowIter = iter(document.find('table', 'wo_std').find_all('tr'))
    # extract category names from the th's of the first row
    headRow = next(rowIter)
    for br in headRow.find_all('br'):
        br.replace_with(document.new_string(' - '))
    categories = list(map(lambda v: (v.text.strip() + '#').replace(' -#', '#')[:-1], headRow.find_all('th')))[1:]
    try:
        while True:
            tr = next(rowIter)  # meal row
            # extract date from first column:
            date = day_regex.search(tr.contents[0].text).group('date')
            if tr.contents[0].get('rowspan') is None:
                canteen.setDayClosed(date)
                continue
            extratr = next(rowIter)  # additional meal component row, ToDo
            # build iterators for lists:
            categoriesIterator = iter(categories)
            colIter = iter(tr.find_all('td'))
            extraIter = iter(extratr.find_all('td'))
            # skip the first column (the date):
            next(colIter)
            next(extraIter)
            try:
                while True:
                    name = next(colIter).text
                    # extract notes from name
                    notes = [legends[int(v)] for v in set(','.join(extra_regex.findall(name)).split(',')) if v and int(v) in legends]
                    # remove the note markers from the name
                    name = extra_regex.sub('', name).replace('\xa0', ' ').replace('  ', ' ').strip()
                    # extract price
                    canteen.addMeal(date, next(categoriesIterator), name, notes, next(colIter).text)
            except StopIteration:
                pass
    except StopIteration:
        pass
Example #29
def get_content(_file):
    """
    Return the content of the web page inside the body tags
    @param _file an option in the config file containing the path to an html file
    @return the content of the body tags in the html file
    """
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    _fd = open(config.get('html', _file), 'r')
    soup = parse(_fd.read(), "lxml")
    return soup.find('body').prettify(formatter='html')
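A usage sketch for get_content(); the option name is hypothetical and must exist in the [html] section of CONFIG_FILE:

body_html = get_content('index_page')   # hypothetical option pointing at an HTML file
print(body_html)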
Example #30
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content, 'lxml')
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {
            int(v[0]): v[1]
            for v in reversed(legend_regex.findall(legends[0].text))
        }
    else:
        extraLegend = {}
    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue
        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    fromDate = datetime.datetime.strptime(
                        match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(
                        match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.date.resolution
                continue
            if len(tr) != 2:
                continue  # no meal
            strings = list(tr.contents[0].strings)
            name = strings[0]
            # prices:
            prices = strings[-1].split('|')
            if '-' in map(lambda v: v.strip(), prices):
                prices = {}
            # notes:
            notes = []
            for img in tr.contents[1].find_all('img'):
                notes.append(img['alt'].replace('Symbol', '').strip())
            for extra in list(
                    set(map(lambda v: int(v), extra_regex.findall(tr.text)))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])
            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices,
                            roles if prices else None)
    return canteen.toXMLFeed()
Example #31
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    document = parse(urlopen(url).read(), 'lxml')

    days = ('Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag')
    for day in days:
        data = document.find('div', {'data-day': day})
        date = data.attrs['data-date']
        parse_day(canteen, date, data)

    return canteen.toXMLFeed()
Example #32
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    canteen.setAdditionalCharges('student', {'other': 1.5})
    document = parse(urlopen(url).read())
    for submit in document.find_all('input'):
        if submit['type'] != 'submit':
            continue
        parse_week(url, urlencode({submit['name']: submit['value']}).encode('utf8'), canteen)
        if today:
            break
    return canteen.toXMLFeed()
Example #33
def get_content(_file):
    """
    Return the content of the web page inside the body tags
    @param _file an option in the config file containing the path to an html file
    @return the content of the body tags in the html file
    """
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    _fd = open(config.get('html', _file), 'r')
    soup = parse(_fd.read(), "lxml")
    return soup.find('body').prettify(formatter='html')
Example #34
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    document = parse(urlopen(url).read(), 'lxml')

    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})
    # unwanted automatic notes extraction would be done in `OpenMensaCanteen.addMeal()`
    # if we used `LazyBuilder.setLegendData()`, so we bypass it using a custom attribute
    canteen.legend = parse_legend(document)

    parse_all_days(canteen, document)

    return canteen.toXMLFeed()
Example #35
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    document = parse(urlopen(url).read(), 'lxml')

    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})
    # unwanted automatic notes extraction would be done in `OpenMensaCanteen.addMeal()`
    # if we used `LazyBuilder.setLegendData()`, so we bypass it using a custom attribute
    canteen.legend = parse_legend(document)

    parse_all_days(canteen, document)

    return canteen.toXMLFeed()
Example #36
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    document = parse(urlopen(url).read(), 'lxml')

    days = ('Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag')
    for day in days:
        data = document.find('div', {'data-day': day})
        if data is None:
            continue
        date = data.attrs['data-date']
        parse_day(canteen, date, data)

    return canteen.toXMLFeed()
Example #37
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read(), 'lxml')

    for day_div in document.find_all('div', attrs={'data-day': True}):
        # parse date; note: the year is not in the string and must be calculated
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date "{}"'.format(
                day_div['data-day']))
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = '{}-{}-{}'.format(year, date_test.group('month'),
                                     date_test.group('day'))

        closed_candidate = day_div.find('div', 'holiday') is not None

        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue

            category = meal_article.find('div', 'icon')['title']
            notes = []
            prices = {}

            additives = meal_article.find('div', 'additnr')
            if additives:
                notes += [
                    additive.text for additive in additives.find_all('li')
                ]
            notes += [
                v['title'] for v in meal_article.find_all('div', 'theicon')
                if v['title'] and v['title'] not in notes
            ]

            price_div = meal_article.find('div', 'price')
            if price_div:
                for k, v in price_map.items():
                    price = price_div['data-' + k]
                    if price:
                        prices[v] = price
            canteen.addMeal(date, category, name, notes, prices)

        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)

    return canteen.toXMLFeed()
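This variant reads data-default, data-bed and data-guest price attributes through a price_map, and parses data-day with a day_regex; neither is shown, so the following is only a sketch of what they might look like:

import re

# assumption: maps the data-* attribute suffix to the price role passed to addMeal()
price_map = {'default': 'student', 'bed': 'employee', 'guest': 'other'}
# assumption: exposes 'month' and 'day' groups when matched against the data-day attribute
day_regex = re.compile(r'(?P<month>\d{2})-(?P<day>\d{2})')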
Example #38
def parse_week(url, canteen, type, allergene={}, zusatzstoffe={}):
    document = parse(urlopen(url).read(), 'lxml')
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        meals = day_table.find_all('tr')
        pos = 0
        while pos < len(meals):
            meal_tr = meals[pos]
            if not meal_tr.find('td'):  # z.B Headline
                pos += 1
                continue
            tds = meal_tr.find_all('td')
            category = re.sub(r' \(\d\)', '', tds[0].text.strip())
            name = tds[1].text.strip()
            if tds[1].find('a', href='http://www.stw-on.de/mensavital'):
                notes = ['MensaVital']
            else:
                notes = []
            for img in tds[2].find_all('img'):
                title = img['title']
                if ':' in title:
                    kind, value = title.split(':')
                    if kind == 'Allergene':
                        for allergen in value.split(','):
                            notes.append(
                                allergene.get(allergen.strip())
                                or allergene[allergen.strip()[:-1]])
                    elif kind == 'Zusatzstoffe':
                        for zusatzstoff in value.split(','):
                            notes.append(zusatzstoffe[zusatzstoff.strip()])
                    else:
                        print('Unknown image type "{}"'.format(kind))
                else:
                    notes.append(title.replace('enthält ', ''))
            prices = {
                'student': tds[3].text.strip(),
                'employee': tds[4].text.strip(),
                'other': tds[5].text.strip()
            }
            if pos < len(meals) - 1:
                nextTds = meals[pos + 1].find_all('td')
                if nextTds[0].text.strip() == '':
                    pos += 1
                    for img in nextTds[1].find_all('img'):
                        notes.append(img['title'])
            pos += 1
            canteen.addMeal(date, category or 'Sonstiges', name, notes, prices)
Example #39
def parse_week(url, canteen, type, allergene={}, zusatzstoffe={}):
    document = parse(urlopen(url).read(), 'lxml')
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        meals = day_table.find_all('tr')
        pos = 0
        while pos < len(meals):
            meal_tr = meals[pos]
            if not meal_tr.find('td'):  # z.B Headline
                pos += 1
                continue
            tds = meal_tr.find_all('td')
            category = re.sub(r' \(\d\)', '', tds[0].text.strip())
            name = tds[1].text.strip()
            if tds[1].find('a', href='http://www.stw-on.de/mensavital'):
                notes = ['MensaVital']
            else:
                notes = []
            for img in tds[2].find_all('img'):
                title = img['title']
                if ':' in title:
                    kind, value = title.split(':')
                    if kind == 'Allergene':
                        for allergen in value.split(','):
                            notes.append(allergene.get(allergen.strip()) or allergene[allergen.strip()[:-1]])
                    elif kind == 'Zusatzstoffe':
                        for zusatzstoff in value.split(','):
                            notes.append(zusatzstoffe[zusatzstoff.strip()])
                    else:
                        print('Unknown image type "{}"'.format(kind))
                else:
                    notes.append(title.replace('enthält ', ''))
            prices = {
                'student':  tds[3].text.strip(),
                'employee': tds[4].text.strip(),
                'other':    tds[5].text.strip()
            }
            if pos < len(meals) - 1:
                nextTds = meals[pos+1].find_all('td')
                if nextTds[0].text.strip() == '':
                    pos += 1
                    for img in nextTds[1].find_all('img'):
                        notes.append(img['title'])
            pos += 1
            canteen.addMeal(date, category or 'Sonstiges', name, notes, prices)
Example #40
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content)
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1] for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue
        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    fromDate = datetime.datetime.strptime(match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.date.resolution
                continue
            if len(tr) != 3:
                continue  # no meal
            strings = list(tr.contents[0].strings)
            name = strings[0]
            # prices:
            prices = strings[-1].split('|')
            print(prices)
            if '-' in map(lambda v: v.strip(), prices):
                prices = {}
            # notes:
            notes = []
            for img in tr.contents[1].find_all('img'):
                notes.append(img['alt'].replace('Symbol', '').strip())
            for extra in list(set(map(lambda v: int(v), extra_regex.findall(tr.text)))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])
            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices, roles if prices else None)
    return canteen.toXMLFeed()
Example #41
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='', next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url))
    canteen.setLegendData(
        text=legend_doc.find(id='artikel').text,
        regex=r'(?P<name>(\d+|[A-Z]+))\s+=\s+(?P<value>\w+( |\t|\w)*)'
    )
    parse_week(url + this_week, canteen, canteentype)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype)
    if not today and type(next_week) is str:
        parse_week(url + next_week, canteen, canteentype)
    return canteen.toXMLFeed()
Example #42
0
def getPostings():
    url = 'https://news.ycombinator.com/jobs'
    response = requests.get(url)
    page = parse(response.content, 'lxml')
    headlines = page.select('a.storylink')
    timestamps = page.select('span.age')
    company = re.compile(r'^[A-Z].+ \(YC .\d+\)|^[A-Z]\w+ [a-z]')

    titles = [title.text for title in headlines]
    times = [time.text for time in timestamps]
    urls = [title['href'] for title in headlines]
    locations = [GeoText(title).cities for title in titles]
    # companies = [re.findall(company,str(titles))]

    details = zip(titles, times, urls, locations)
    return details
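A hedged usage sketch for the snippet above; it assumes the module-level imports the function relies on (requests, re, GeoText from geotext, and parse as a BeautifulSoup alias):

for title, age, href, cities in getPostings():
    print('{} | {} | {} | {}'.format(age, title, ', '.join(cities) or '-', href))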
Example #43
0
def parse_week(canteen, url, place_class=None):
    content = urlopen(url).read()
    document = parse(content)
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {
            int(v[0]): v[1]
            for v in reversed(legend_regex.findall(legends[0].text))
        }
    else:
        extraLegend = {}

    if place_class:
        document = document.find(id=place_class)

    for day_a in document.find_all('a', rel=day_regex):
        day_data = document.find(id=day_a['href'].replace('#', ''))
        if not day_data:
            continue
        date = day_a['rel'][0]
        day_table = day_data.table
        if not day_table:
            continue
        if day_table.tbody:
            day_table = day_table.tbody
        canteen.clearDay(date)  # remove old data about this day
        for category_tr in day_table.children:
            if category_tr.name != 'tr':
                continue
            if len(category_tr) < 2:
                continue  # no meal
            category = category_tr.contents[0].text
            meal_table = category_tr.contents[1].table
            if meal_table.tbody:
                meal_table = meal_table.tbody
            for meal_tr in meal_table.children:
                if meal_tr.name != 'tr':
                    continue
                if len(list(meal_tr.children)) != 3:
                    #print('skipping category, unable to parse meal_table: {} tds'.format(len(list(meal_tr.children))))
                    continue
                name = meal_tr.contents[1].text
                # notes, to do
                canteen.addMeal(date, category, name, [],
                                price_regex.findall(meal_tr.contents[2].text),
                                roles)
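The helper above relies on module-level names that are not part of the excerpt; a hypothetical sketch of how price_regex and roles could look (the patterns in the original module may differ):

import re

# assumed format: up to three prices per cell, e.g. "2,60 € / 3,80 € / 4,95 €"
price_regex = re.compile(r'\d+,\d{2}')
roles = ('student', 'employee', 'other')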
Example #44
0
 def download(self):
     # Validating input
     songID = self.input.text()
     if not songID:
         self.notification = NotificationDialog(
             switch['empinp'][config["lang"]])
         self.notification.exec()
         return
     elif not songID.isdigit():
         self.notification = NotificationDialog(
             switch['typerr'][config["lang"]])
         self.notification.exec()
         return
     page = parse(
         load('https://www.newgrounds.com/audio/listen/'
              f'{songID}').text, 'html.parser')
     if page.find(id='pageerror') is not None:
         self.notification = NotificationDialog(
             switch['404'][config["lang"]])
         self.notification.exec()
         return
     self.songTitle = page.find('title').text
     # Getting download link
     link = 'http://audio.ngfiles.com/'
     page = str(page)
     i = page.find('audio.ngfiles.com') + len('audio.ngfiles.com/')
     while not link.endswith('.mp3'):
         if page[i] != '\\':
             link += page[i]
         i += 1
     # Locating file
     self.dist = (QFileDialog.getSaveFileName(
         self, switch['savefile'][config["lang"]],
         link.split('/')[-1], 'MP3 Audio File (*.mp3)')[0])
     if not self.dist:
         return
     # Downloading
     self.file = load(link, stream=True)
     self.progress = ProgressDialog()
     self.progress.label.setText(switch['downloading'][config["lang"]](
         self.songTitle))
     self.progress.setWindowTitle(switch['downloading'][config["lang"]](
         self.songTitle))
     self.progress.bar.setValue(0)
     self.progress.exec()
Example #45
0
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + document.find_all('article', attrs={'data-day': True}):
        # parse date; warning: the year is missing and must be calculated
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = "{}-{}-{}".format(year, date_test.group('month'), date_test.group('day'), )
        if 'nodata' in day_div.attrs.get('class', []) or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue
        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                closed_candidate = True
                continue
            category = meal_article.find('div', 'desc').text
            notes = [v['title'] for v in meal_article.find_all('div', 'theicon') if v['title']]
            if meal_article.find('div', 'additive'):
                notes += [v[0] for v in extra_regex.findall(meal_article.find('div', 'additive').text)]
            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue
            prices = {}
            for v, r in (('default', 'student'), ('bed', 'employee'), ('guest', 'other')):
                price = price_regex.search(price_div['data-' + v])
                if price:
                    prices[r] = price.group('price')
                elif v == 'default':
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
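A short sketch of the year heuristic used above: the page exposes only day and month, so a parsed month that lies before the current month is assumed to belong to the next year.

import datetime

def guess_year(month, day, now=None):
    # hypothetical helper mirroring the inline logic above
    now = now or datetime.datetime.now()
    year = now.year
    if now.month > month:
        year += 1  # date from next year
    return '{}-{:02d}-{:02d}'.format(year, month, day)

# e.g. when run in December 2024: guess_year(1, 7) -> '2025-01-07'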
Example #46
0
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})

    document = parse(urlopen(url).read())

    global legend
    regex = r'\((?P<name>[\dA-Z]+)\)\s*(?P<value>[\w\s]+)'
    legend = buildLegend(legend, document.find(id='additives').text, regex=regex)

    days = ('montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag',
            'montagNaechste', 'dienstagNaechste', 'mittwochNaechste', 'donnerstagNaechste', 'freitagNaechste')
    for day in days:
        data = document.find('div', id=day)
        headline = document.find('a', attrs={'data-anchor': '#' + day})
        parse_day(canteen, headline.text, data)
    return canteen.toXMLFeed()
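A quick check of what the additives regex above is intended to extract, run against a made-up legend line:

import re

regex = r'\((?P<name>[\dA-Z]+)\)\s*(?P<value>[\w\s]+)'
sample = '(1) Farbstoff (A) Gluten (3) Antioxidationsmittel'
print({m.group('name'): m.group('value').strip() for m in re.finditer(regex, sample)})
# -> {'1': 'Farbstoff', 'A': 'Gluten', '3': 'Antioxidationsmittel'}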
Example #47
0
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read(), 'lxml')

    for day_div in document.find_all('div', attrs={'data-day': True}):
        # parse date; warning: the year is missing and must be calculated
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date "{}"'.format(day_div['data-day']))
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = '{}-{}-{}'.format(year, date_test.group('month'), date_test.group('day'))

        closed_candidate = day_div.find('div', 'holiday') is not None

        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue

            category = meal_article.find('div', 'icon')['title']
            notes = []
            prices = {}

            additives = meal_article.find('div', 'additnr')
            if additives:
                notes += [additive.text for additive in additives.find_all('li')]
            notes += [v['title'] for v in meal_article.find_all('div', 'theicon') if v['title'] and v['title'] not in notes]

            price_div = meal_article.find('div', 'price')
            if price_div:
                for k, v in price_map.items():
                    price = price_div['data-' + k]
                    if price:
                        prices[v] = price
            canteen.addMeal(date, category, name, notes, prices)

        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)

    return canteen.toXMLFeed()
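price_map is a module-level mapping that is not shown in the excerpt; a plausible sketch, assuming the page exposes data-default, data-bed and data-guest attributes as in the similar parser above:

price_map = {'default': 'student', 'bed': 'employee', 'guest': 'other'}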
Example #48
0
def parse_week(url, canteen, type, allergene={}, zusatzstoffe={}):
    document = parse(urlopen(url).read())
    for day_table in document.find_all("table", "swbs_speiseplan"):
        caption = day_table.find("th", "swbs_speiseplan_head").text
        if type not in caption:
            continue
        date = extractDate(caption)
        meals = day_table.find_all("tr")
        pos = 0
        while pos < len(meals):
            meal_tr = meals[pos]
            if not meal_tr.find("td"):  # z.B Headline
                pos += 1
                continue
            tds = meal_tr.find_all("td")
            category = re.sub(r" \(\d\)", "", tds[0].text.strip())
            name = tds[1].text.strip()
            if tds[1].find("a", href="http://www.stw-on.de/mensavital"):
                notes = ["MensaVital"]
            else:
                notes = []
            for img in tds[2].find_all("img"):
                title = img["title"]
                if ":" in title:
                    kind, value = title.split(":")
                    if kind == "Allergene":
                        for allergen in value.split(","):
                            notes.append(allergene.get(allergen.strip()) or allergene[allergen.strip()[:-1]])
                    elif kind == "Zusatzstoffe":
                        for zusatzstoff in value.split(","):
                            notes.append(zusatzstoffe[zusatzstoff.strip()])
                    else:
                        print('Unknown image type "{}"'.format(kind))
                else:
                    notes.append(title.replace("enthält ", ""))
            prices = {"student": tds[3].text.strip(), "employee": tds[4].text.strip(), "other": tds[5].text.strip()}
            if pos < len(meals) - 1:
                nextTds = meals[pos + 1].find_all("td")
                if nextTds[0].text.strip() == "":
                    pos += 1
                    for img in nextTds[1].find_all("img"):
                        notes.append(img["title"])
            pos += 1
            canteen.addMeal(date, category, name, notes, prices)
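A small sketch of the icon-title convention the loop above assumes: titles such as "Allergene: GL, EI" or "Zusatzstoffe: 2, 9" are split on the colon, anything else is taken over as a plain note.

allergene = {'GL': 'Gluten', 'EI': 'Ei'}  # hypothetical legend entries
title = 'Allergene: GL, EI'
kind, value = title.split(':')
print([allergene[v.strip()] for v in value.split(',')])  # ['Gluten', 'Ei']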
Example #49
0
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='', next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'wissenswertes/lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url), 'lxml').find(id='artikel')
    allergene = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)'
    )
    allergene['EI'] = 'Ei'
    zusatzstoffe = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)'
    )
    suballergene = re.compile(r'(?P<name>[0-9A-Z]+)[^a-zA-Z]*enthält (?P<value>\w+( |\t|\w)*)')
    for tr in legend_doc.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 2:
            continue
        title = tds[0].find('strong')
        if title is None:
            continue
        else:
            title = title.text
        lines = tds[1].text.split('\n')
        for line in lines[1:]:
            try_allergine = suballergene.match(line)
            if try_allergine:
                allergene[try_allergine.group('name')] = try_allergine.group('value')
        text = lines[0].replace('enthält', '').strip()
        if title.isdigit():
            zusatzstoffe[title] = text
        else:
            allergene[title] = text
    parse_week(url + this_week, canteen, canteentype,
               allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and type(next_week) is str:
        parse_week(url + next_week, canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    return canteen.toXMLFeed()
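The suballergene pattern above targets legend lines such as "GLW   enthält Weizengluten"; a quick check against a made-up line:

import re

suballergene = re.compile(r'(?P<name>[0-9A-Z]+)[^a-zA-Z]*enthält (?P<value>\w+( |\t|\w)*)')
m = suballergene.match('GLW   enthält Weizengluten')
print(m.group('name'), '->', m.group('value'))  # GLW -> Weizengluten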
Example #50
0
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})

    document = parse(urlopen(url).read())

    global legend
    regex = r'(?P<name>(\d|[A-Z])+)\)\s*' + \
            r'(?P<value>\w+((\s+\w+)*[^0-9)]))'
    legend = buildLegend(legend, document.find(id='additives').text, regex=regex)

    days = ('montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag',
            'montagNaechste', 'dienstagNaechste', 'mittwochNaechste', 'donnerstagNaechste', 'freitagNaechste')
    for day in days:
        data = document.find('div', id=day)
        headline = document.find('a', attrs={'data-anchor': '#' + day})
        parse_day(canteen, headline.text, data)
    return canteen.toXMLFeed()
Example #51
0
def parse_week(canteen, url, place_class=None):
    content = urlopen(url).read()
    document = parse(content)
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1] for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}

    if place_class:
        document = document.find(id=place_class)

    for day_a in document.find_all('a', rel=day_regex):
        day_data = document.find(id=day_a['href'].replace('#', ''))
        if not day_data:
            continue
        date = day_a['rel'][0]
        day_table = day_data.table
        if not day_table:
            continue
        if day_table.tbody:
            day_table = day_table.tbody
        canteen.clearDay(date)  # remove old data about this day
        for category_tr in day_table.children:
            if category_tr.name != 'tr':
                continue
            if len(category_tr) < 2:
                continue  # no meal
            category = category_tr.contents[0].text
            meal_table = category_tr.contents[1].table
            if meal_table.tbody:
                meal_table = meal_table.tbody
            for meal_tr in meal_table.children:
                if meal_tr.name != 'tr':
                    continue
                if len(list(meal_tr.children)) != 3:
                    #print('skipping category, unable to parse meal_table: {} tds'.format(len(list(meal_tr.children))))
                    continue
                name = meal_tr.contents[1].text
                # notes, to do
                canteen.addMeal(date, category, name, [],
                                price_regex.findall(meal_tr.contents[2].text), roles)
Example #52
0
def parse_url(url, data_canteen, today=False):
    canteen = LazyBuilder()

    data = urlopen(url).read().decode('utf-8')
    document = parse(data, 'lxml')

    dish = document.find(class_='neo-menu-single-dishes')
    if dish is not None:
        dishes = dish.find_all(name='tr', attrs={"data-canteen": data_canteen})
    else:
        dishes = []

    side = document.find(class_='neo-menu-single-modals')
    if side is not None:
        dishes = dishes + side.find_all(name='tr', attrs={"data-canteen": data_canteen})

    for dish in dishes:
        parse_dish(dish, canteen)

    return canteen.toXMLFeed()
Example #53
0
def parse_url(url, data_canteen, today=False):
    canteen = LazyBuilder()

    data = urlopen(url).read().decode('utf-8')
    document = parse(data, 'lxml')

    dish = document.find(class_='neo-menu-single-dishes')
    if dish is not None:
        dishes = dish.find_all(name='tr', attrs={"data-canteen": data_canteen})
    else:
        dishes = []

    side = document.find(class_='neo-menu-single-modals')
    if side is not None:
        dishes = dishes + side.find_all(name='tr',
                                        attrs={"data-canteen": data_canteen})

    for dish in dishes:
        parse_dish(dish, canteen)

    return canteen.toXMLFeed()
Example #54
0
def parse_week(url, canteen):
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'speiseplan'):
        date = extractDate(day_table.thead.tr.th.text)
        if day_table.find('td', 'keinangebot'):
            canteen.setDayClosed(date)
            continue
        for meal_tr in day_table.tbody.children:
            if len(meal_tr.find_all('a') or []) < 1:
                continue
            name = meal_tr.td.text
            if ': ' in name:
                category, name = name.split(': ', 1)
            else:
                category = 'Angebote'
            if len(name) > 200:
                name = name[:200] + ' ...'
            notes = []
            for img in meal_tr.contents[1].find_all('img'):
                notes.append(img['title'])
            canteen.addMeal(date, category, name, notes,
                            price_regex.findall(meal_tr.contents[2].text), roles)
Example #55
0
def parse_week(url, canteen):
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'speiseplan'):
        date = extractDate(day_table.thead.tr.th.text)
        if day_table.find('td', 'keinangebot'):
            canteen.setDayClosed(date)
            continue
        for meal_tr in day_table.tbody.children:
            if len(meal_tr.find_all('a') or []) < 1:
                continue
            name = meal_tr.td.text
            if ': ' in name:
                category, name = name.split(': ', 1)
            else:
                category = 'Angebote'
            if len(name) > 200:
                name = name[:200] + ' ...'
            notes = []
            for img in meal_tr.contents[1].find_all('img'):
                notes.append(img['title'])
            canteen.addMeal(date, category, name, notes,
                            price_regex.findall(meal_tr.contents[2].text), roles)
Example #56
0
def parse_week(url, canteen, type):
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        for meal_tr in day_table.find_all('tr'):
            if not meal_tr.find('td'):  # e.g. a headline row
                continue
            tds = meal_tr.find_all('td')
            category = tds[0].text.strip()
            name = tds[1].text
            if tds[1].find('a', href='http://www.stw-on.de/mensavital'):
                notes = ['MensaVital']
            else:
                notes = []
            prices = {
                'student':  tds[2].text,
                'employee': tds[3].text,
                'other':    tds[4].text
            }
            canteen.addMeal(date, category, name, notes, prices)
Example #57
0
def parse_week(canteen, url, place_class=None):
    content = urlopen(url).read().decode('utf-8', errors='ignore')
    document = parse(content, features='lxml')
    legend = document.find('div', {'id': 'leg'})
    if legend and legend.find('br'):
        # Update the legend; extraLegend is assumed to be a module-level dict shared across helpers
        legend_content = legend.find('br').parent
        current_img = None
        for child in legend_content.children:
            if isinstance(child, str):
                if current_img is not None:
                    # Last child was an icon, so this text must be its label
                    s = child.strip()
                    if s.startswith('- '):
                        s = s[2:].strip()
                    extraLegend[current_img] = s
                    current_img = None
                else:
                    # Text notes
                    for n, text in legend_number_regex.findall(child):
                        extraLegend[n] = text
                    for tag, text in legend_letters_regex.findall(child):
                        extraLegend[tag] = text
            elif hasattr(child, 'name') and child.name == 'img':
                # Icon
                current_img = icon(child['src'])

    if place_class:
        document = document.find(id=place_class)

    for day_a in document.find_all('a', rel=day_regex):
        day_data = document.find(id=day_a['href'].replace('#', ''))
        if not day_data:
            continue
        date = day_a['rel'][0]
        day_table = day_data.table
        if not day_table:
            continue
        if day_table.tbody:
            day_table = day_table.tbody
        canteen.clearDay(date)  # remove old data about this day
        found_meals = False
        closed_date_match = None
        for category_tr in day_table.children:
            if category_tr.name != 'tr':
                continue
            if len(category_tr) < 2:
                continue  # no meal
            category = category_tr.contents[0].text
            meal_table = category_tr.contents[1].table
            if meal_table.tbody:
                meal_table = meal_table.tbody
            for meal_tr in meal_table.children:
                if meal_tr.name != 'tr':
                    continue
                if len(list(meal_tr.children)) != 3:
                    #print('skipping category, unable to parse meal_table: {} tds'.format(len(list(meal_tr.children))))
                    if len(list(meal_tr.contents)) > 1 and closed_regex.search(meal_tr.contents[1].text):
                        # Remember closed "meal"
                        closed_date_match = closed_regex.search(meal_tr.contents[1].text)
                    continue
                found_meals = True
                td1 = meal_tr.contents[1]
                span = td1.find('span')
                if span:
                    name = span.text  # Name without notes in <sup>
                else:
                    name = td1.text  # Fallback value: whole line

                # Add notes from <sup>[Ab,Cd,Ef]</sup>
                sup = meal_tr.find('sup')
                if sup:
                    keys = sup.text.strip("[] ") if "[" in sup.text else ''
                    keys_list = [key.strip() for key in keys.split(',')]
                    notes = [extraLegend[key] if key in extraLegend else key for key in keys_list if key]
                else:
                    notes = []

                # Find and convert icons to notes
                img = meal_tr.find('img')
                if img:
                    key = icon(img['src'])
                    if key in extraLegend:
                        notes.append(extraLegend[key])

                canteen.addMeal(date, category, name, notes,
                                price_regex.findall(meal_tr.contents[2].text), roles)

        if not found_meals and closed_date_match:
            # If there were no meals and there's a "geschlossen von .. bis .." message,
            # let's assume the whole canteen is closed on the mentioned dates
            match_from = closed_date_match.group("from")
            match_to = closed_date_match.group("to")

            now = datetime.datetime.now()
            year_from = year_to = now.year
            month_from = int(match_from.split(".")[1])
            month_to = int(match_to.split(".")[1])
            if now.month > 9:
                if now.month > month_to:
                    year_to += 1
                    if now.month > month_from:
                        year_from += 1

            fromdate = datetime.datetime.strptime('%s%d' % (match_from, year_from), '%d.%m.%Y')
            todate = datetime.datetime.strptime('%s%d' % (match_to, year_to), '%d.%m.%Y')
            if fromdate < now:
                fromdate = now

            while fromdate <= todate:
                canteen.setDayClosed(fromdate.strftime('%d.%m.%Y'))
                fromdate += datetime.timedelta(1)
Example #58
0
def parse_url(url, today=False):
    canteen = LazyBuilder()

    # prices are stored on a separate page
    document = parse(urlopen(base + '/mensa-preise/').read(), 'lxml')
    prices = {}
    for tr in document.find('div', 'ce-bodytext').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')

    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read(), 'lxml')
            errorCount = 0
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.timedelta(days=1)
                continue
            else:
                raise e

        # extract legend
        legend = {}
        legends = document.find('div', 'tx-stwm-speiseplan')
        additions = legends.find('div', 'c-schedule__filter-body')
        for table in additions.find_all('div', 'c-schedule__filter-item'):
            for ingredient in table.find('ul').find_all('li'):
                name = ingredient.find('dt').text.strip()
                description = ingredient.find('dd').text.strip()
                legend[name] = description
        for label in legends.find('ul', 'c-schedule__type-list').find_all('li'):
            name = label.find('dt').text.replace('(', '').replace(')', '').strip()
            description = label.find('dd').text.strip()
            legend[name] = description

        # extract meals
        mensa_data = document.find('ul', 'c-schedule__list')
        category = None
        for meal in mensa_data.find_all('li'):
            # update category or use previous one if not specified
            category_text = meal.find('dt', 'c-schedule__term').text.strip()
            if category_text:
                category = category_text

            data = meal.find('dd').find('p', 'js-schedule-dish-description')
            name = data.contents[0].strip() # name is the first text node
            if not name:
                continue

            # notes are contained in 3 boxes (type, additional, allergen) and
            # are comma-separated lists enclosed in brackets or parentheses
            notes = []
            for note in meal.find_all('span', 'c-schedule__marker'):
                note_text = note.find('span', 'u-text-sup').text \
                    .replace('(', '').replace(')', '') \
                    .replace('[', '').replace(']', '')
                notes += [n for n in note_text.split(',') if n]

            # some meals contain the GQB label in their name (instead of in notes)
            if '(GQB)' in name:
                name = name.replace('(GQB)', '').strip()
                notes.append('GQB')

            # Biogericht and Aktionsessen share one price entry, listed as Bio-/Aktionsgericht
            price_category = category \
                .replace('Aktionsessen', 'Bio-/Aktionsgericht') \
                .replace('Biogericht', 'Bio-/Aktionsgericht') \
                .strip()

            canteen.addMeal(date, category, name,
                [legend.get(n, n) for n in notes],
                prices.get(price_category, {})
            )

        date += datetime.timedelta(days=1)
        if today:
            break

    return canteen.toXMLFeed()
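The outer loop above scans forward one day at a time and tolerates up to seven consecutive 404 responses before giving up; a stripped-down sketch of that pattern (crawl and handle_day are hypothetical names):

import datetime
from urllib.error import HTTPError
from urllib.request import urlopen

def crawl(url_template, handle_day, max_misses=7):
    date = datetime.date.today()
    misses = 0
    while misses < max_misses:
        try:
            handle_day(urlopen(url_template.format(date)).read(), date)
            misses = 0  # reset on success
        except HTTPError as e:
            if e.code != 404:
                raise
            misses += 1
        date += datetime.timedelta(days=1)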