def mangastream(url, name, dest, delim, digits, number):
    print "Downloading images from [mangastream]...\n"

    links = [tag.get('href') for tag in get_html(url).findAll(
        "ul", {"class": "dropdown-menu"})[-1].select('li > a')]
    match = re.search(r"(.*\/)(\d*)$", links[-1])
    base_url, num_pages = match.group(1), int(match.group(2))

    for i in range(1, num_pages + 1):
        try:
            image_url = get_html(
                base_url + str(i)).select("#manga-page")[0].get("src")
            new_name = set_name("", ".jpg", "", i, digits)
            download_file(image_url, new_name, dest, i)
        except Exception:
            print "exception"
def parse(self):
    '''
    Retrieves the page and parses the contents into the following fields:

        self.name       (may include brewery/brand and/or beer)
        self.price      (USD)
        self.volume     (gallons)
        self.num_avail  (kegs)
        self.desc       (keg description)
    '''
    if self.parsed:
        return

    self.parsed = True
    html = get_html(self.url)

    # Attempt to get name and volume
    try:
        self.name = html.xpath('//h1/text()')[0].strip()
        if '(' in self.name and ')' in self.name:
            split_name = self.name.split('(')
            self.name = split_name[0].strip()
            volume = filter(lambda x: is_num(x) if '.' not in x else x,
                            split_name[1].strip(')').strip())
            if is_num(volume):
                self.volume = float(volume)
            else:
                self.volume = 0.0
        else:
            self.volume = 0.0
    except Exception:
        self.name = ''
        self.volume = 0.0

    # Attempt to get price
    try:
        self.price = float(html.xpath(
            '//span[@class="ProductDetailItemPrice"]/text()')[0].strip().strip('$'))
    except Exception:
        self.price = 0.0

    # Attempt to get number of available kegs
    try:
        self.num_avail = int(html.xpath(
            '//em/text()')[0].strip().split()[0])
    except Exception:
        self.num_avail = 0

    # Attempt to get description
    try:
        self.desc = html.xpath(
            '//td[@class="ProductDetailCell"]/p/text()')[0].strip()
    except Exception:
        self.desc = ''
def source_vst():
    urllist = []
    try:
        js = utils.get_html()
        info_dict = utils.get_json(js)
        for item in info_dict["live"]:
            url_list = item["urllist"].split("#")
            for url in url_list:
                urllist.append(url)
    except Exception, e:
        print api_error, e
        exit(1)
    return urllist
def parse_map(self):
    map_node = self.get_node(self.root, "//div[@class='routeMapInner']/a")
    if map_node is None:
        return None

    url = map_node.get('href')
    map_html = utils.get_html(utils.ptv_url + url)
    map_tree = etree.HTML(map_html)
    map_node = self.get_node(map_tree, "//div[@class='routeMapInner']/img")
    if map_node is None:
        return None

    # http://ptv.vic.gov.au/
    map_link = map_node.get('src')
    db.update_table('map', 'link', map_link)
    map_id = db.query("SELECT id FROM map WHERE link=?", (map_link,))
    return map_id
def source_7po():
    urllist = []
    try:
        resp = utils.get_html(api_url)
        dom = xml.dom.minidom.parseString(resp)
        root = dom.documentElement
        channels = root.getElementsByTagName("channel")
        for channel in channels:
            for urlNode in channel.childNodes:
                # URL of this source
                url = urlNode.firstChild.wholeText
                urllist.append(url)
    except Exception, e:
        print api_error, e
        exit(1)
    return urllist
def shopping(query, pages=1):
    results = []
    for i in range(pages):
        url = _get_shopping_url(query, i)
        html = get_html(url)
        if html:
            j = 0
            soup = BeautifulSoup(html)

            products = soup.findAll("div", "g")
            for prod in products:
                res = ShoppingResult()

                divs = prod.findAll("div")
                for div in divs:
                    match = re.search(
                        "from (?P<count>[0-9]+) stores", div.text.strip())
                    if match:
                        res.store_count = match.group("count")
                        break

                h3 = prod.find("h3", "r")
                if h3:
                    a = h3.find("a")
                    if a:
                        res.compare_url = a["href"]
                    res.name = h3.text.strip()

                psliimg = prod.find("div", "psliimg")
                if psliimg:
                    img = psliimg.find("img")
                    if img:
                        res.thumb = img["src"]

                f = prod.find("div", "f")
                if f:
                    res.subtext = f.text.strip()

                price = prod.find("div", "psliprice")
                if price:
                    res.min_price = price.text.strip()

                results.append(res)
                j = j + 1
    return results
def get_aver_num(query, lang='en'):
    """Returns average number of search results.

    Args:
        query: String to search in google.

    Returns:
        int number
    """
    av_num = 0
    url = _get_search_url(query, 0, lang=lang)
    html = get_html(url)

    if html:
        soup = BeautifulSoup(html, "html.parser")
        av_num = soup.find("div", {"id": "resultStats"})
        av_num = _get_num(av_num)

    return av_num
def convert(amount, from_currency, to_currency):
    """Method to convert currency.

    Args:
        amount: numeric amount to convert
        from_currency: currency denomination of the amount to convert
        to_currency: target currency denomination to convert to
    """
    # same currency, no conversion
    if from_currency == to_currency:
        return amount * 1.0

    req_url = _get_currency_req_url(amount, from_currency, to_currency)
    response = get_html(req_url)
    rate = _parse_currency_response(response, to_currency)

    return rate
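# A minimal usage sketch for convert(), assuming the helpers it relies on
# (_get_currency_req_url, get_html, _parse_currency_response) are defined in
# this module as shown above; the currency codes below are only illustrative.
if __name__ == '__main__':
    amount = 100
    converted = convert(amount, 'USD', 'EUR')  # performs one HTTP request
    print('100 USD is approximately {0} EUR'.format(converted))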
def search(query, pages=1, lang='en', void=True):
    """Returns a list of GoogleResult.

    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.

    Returns:
        A list of GoogleResult objects.
    """
    results = []
    for i in range(pages):
        url = _get_search_url(query, i, lang=lang)
        html = get_html(url)

        if html:
            soup = BeautifulSoup(html, "html.parser")
            lis = soup.findAll("div", attrs={"class": "g"})

            j = 0
            for li in lis:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)

                if void is True:
                    if res.description is None:
                        continue
                results.append(res)
                j += 1

    return results
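# A hedged usage sketch for search(); it assumes _get_search_url, get_html,
# the _get_* extractors and GoogleResult live in this module as above, and
# that Google's result markup still matches the "div.g" selector.
if __name__ == '__main__':
    for result in search('web scraping with python', pages=2, lang='en'):
        print('[page {0}] {1} -> {2}'.format(result.page, result.name, result.link))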
def cntv_crawler(api):
    print "start running cntv crawler...."
    src_list = []
    for channel in channel_list:
        print "getting url from %s" % channel
        try:
            jsn = utils.get_html(api % channel)
            data = utils.get_json(jsn)
            src = {}
            src["code"] = channel
            src['auth'] = data['hls_url']['hls2']
            if "cctv" in channel:
                src["hls"] = data['hls_url']['hls1']
                src['flv'] = data['hds_url']['hds2']
            src_list.append(src)
        except Exception, e:
            print e
        else:
            pass
        finally:
            pass
def get_channels(area, url):
    try:
        # URL listing all channels for this area
        c_url = url % (area[0], area[0])
        resp = utils.get_html(c_url)
        c_match = utils.get_json(resp, json_pattern)
        # c_match = re.search(json_pattern, r.text).group(1)
        c_json = json.loads(c_match)
        # HTML fragment containing the channel list
        c_html = c_json["html"]
        # list of channel URLs
        urllist = re.findall(url_pattern, c_html)
        # list of channel names
        namelist = re.findall(c_name_pattern, c_html)
        # channel dict mapping channel name to URL
        c_dict = {}
        for i in xrange(len(namelist)):
            c_dict[namelist[i]] = urllist[i]
        return c_dict
    except Exception, e:
        print e
        return None
def hotflick(url, name, dest, delim, digits, number):
    print "Downloading images from [hotflick]...\n"

    # get all page links if the gallery has more than one page
    div = get_html(url).find('div', {"class": "box-paging"})
    gallery_page_links = [str(tag['href'])
                          for tag in div.findAll('a', href=True)]

    # get image links
    if gallery_page_links != []:
        links = []
        for page in gallery_page_links:
            links.extend([link for link in get_page_links(
                "http://hotflick.net/" + page) if "/v/?q=" in link])
    else:
        links = [link for link in get_page_links(url) if "/v/?q=" in link]

    regex = re.compile(r'\.net/\w/v/\?q\=(\d+)\.(.*)(\.\w*)$', re.IGNORECASE)
    for link in links:
        try:
            # image name and filetype
            match = regex.search(link)
            ext = match.group(3)

            # image URL and output filename
            new_name = set_name(name, ext, delim, number, digits)
            image_url = "http://www.hotflick.net/u/n/{0}/{1}{2}".format(
                match.group(1), match.group(2), ext)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception:
            print "exception"
def upix(url, name, dest, delim, digits, number):
    print "Downloading images from [upix]...\n"

    links = [str(tag['href'])
             for tag in get_html(url).findAll('a', {"class": "thumb"})]

    base_url = url
    if str.endswith(url, "/#none"):
        base_url = url[:-5]

    regex = re.compile(r'(\.[a-zA-Z]*)$', re.IGNORECASE)
    for link in links:
        try:
            # image URL and output filename
            image_url = base_url + link
            ext = regex.search(image_url).group(1)
            new_name = set_name(name, ext, delim, number, digits)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception:
            pass
def imgur(url, name, dest, delim, digits, number):
    print "Downloading images from [imgur]...\n"

    if not str.endswith(url, "/layout/blog"):
        url += "/layout/blog"

    links = get_html(url).findAll('meta', {'property': 'og:image'})
    links = [link['content'] for link in links[1:]]

    regex = re.compile(r'\.com/\w*(\.[a-zA-Z]*)$', re.IGNORECASE)
    for image_url in links:
        try:
            # filetype
            ext = regex.search(image_url).group(1)

            # output filename
            new_name = set_name(name, ext, delim, number, digits)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception:
            pass
page_number = 1
# max_num = data['{}'.format(district)]
os.chdir(os.path.join(path, 'CDATA'))
with open(sfile, 'a', encoding='utf-8', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(fields)
os.chdir(path)

while True:
    if page_number > max_num:
        break
    log_file.write('page{} process started at: '.format(page_number)
                   + time.ctime() + '\r\n')
    url = "https://www.justdial.com/%s/Lawyers/nct-10296083/page-%s" % (
        district, page_number)
    ut.get_html(url, page_number)
    time.sleep(5)
    page = open('temp{}.htm'.format(page_number), 'r', encoding='utf-8')
    # page = urllib.request.urlopen(req, proxy, timeout=5)
    # time.ctime(1)
    # page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), "html.parser")
    services = soup.find_all('li', {'class': 'cntanr'})

    # Iterate through the 10 results in the page
    for service_html in services:
        # Parse HTML to fetch data
        name = ut.get_name(service_html)
        phone = ut.get_phone_number(service_html)
        # rating = get_rating(service_html)
def get_optimal_kegs(args):
    ''' Gets kegs from bevmo.com, finds the kegs with the optimal gallons
        of alcohol per USD '''
    num_kegs = args['top']
    beer_limit = args['limit']
    num_attempts = args['attempts']
    max_price = args['price']
    desc_filter = args['filter']
    desc_unfilter = args['unfilter']

    # The first url to crawl and its base url
    seed_url = ('http://www.bevmo.com/Shop/ProductList.aspx/'
                'Beer/Kegs/_/N-15Z1z141vn?DNID=Beer')
    base_url = '{url.scheme}://{url.netloc}'.format(url=urlparse(seed_url))

    # Get initial unique page links from the seed url, append base_url to them
    # For info on XPaths, see: http://www.w3schools.com/xpath/xpath_syntax.asp
    init_page_links = []
    init_page_links[:] = unique(get_html(seed_url).xpath(
        '//div[@class="ProductListPaging"]/a/@href'))
    if not init_page_links:
        print('Failed to retrieve the initial keg page links!')
        return None

    # Lists for holding links to pages of beer kegs
    page_links = [seed_url] + map(lambda x: base_url + x, init_page_links)
    new_page_links = []

    # Lists for holding links to individual beer kegs
    beer_links = []
    new_beer_links = []

    # To keep track of already crawled beer kegs
    crawled_beers = set()

    # List for matching --filter and --unfilter keyword arguments
    # to keg descriptions
    matched = []

    # List to hold top beer kegs, the size of optimal_kegs is limited
    # by the num_kegs argument
    optimal_kegs = []

    keg = None
    while len(page_links) > 0 and len(crawled_beers) < beer_limit:
        # Links are removed as they are crawled
        page_link = page_links.pop(0)

        # Beer keg links
        new_beer_links[:] = unique(get_html(page_link).xpath(
            '//a[@class="ProductListItemLink"]/@href'))
        beer_links += [base_url + x for x in new_beer_links]

        # Crawl the beer keg links, get the gallons of alcohol/USD ratio
        for link in beer_links:
            # Break if the number of crawled beers exceeds the limit
            if len(crawled_beers) >= beer_limit:
                break

            # Cache the BevMo beer id's to prevent duplicates
            beer_id = link.split('/')[-1]
            if beer_id not in crawled_beers:
                # Create BeerKeg object
                keg = BeerKeg(link, num_attempts, verbose=True)

                # Call keg.parse() then filter kegs by their descriptions
                # Calling keg.parse() produces fields keg.desc, keg.price, etc
                # keg.parse() will only parse once per keg object

                # Check if price is within range if one was given
                if max_price:
                    keg.parse()
                    if keg.price > max_price:
                        # Move onto the next keg and ignore this one
                        continue

                # desc_filter has words that must be in the description
                if desc_filter:
                    keg.parse()
                    matched = [word in keg.desc for word in desc_filter]
                    # All keywords must be present for a match
                    if not all(matched):
                        # Move onto the next keg and ignore this one
                        continue

                # desc_unfilter has words that can't be in the description
                if desc_unfilter:
                    keg.parse()
                    matched = [word in keg.desc for word in desc_unfilter]
                    # Any keyword must be present to nullify a match
                    if any(matched):
                        # Move onto the next keg and ignore this one
                        continue

                # Add current beer to crawled beers
                crawled_beers.add(beer_id)

                # Print how many kegs have been crawled
                print('Keg {}'.format(len(crawled_beers)))

                # Gets the gallons of alcohol per USD for the keg
                ratio = keg.get_ratio()
                print('')

                # Maintain a sorted list of the current top kegs using heapq
                # (heap queue algorithm); optimal_kegs holds a tuple containing
                # the ratio and the keg associated with it
                if optimal_kegs:
                    for opt_tuple in optimal_kegs:
                        # If ratio is greater than any keg ratio currently
                        # in optimal_kegs, then add it
                        if ratio > opt_tuple[0]:
                            if len(optimal_kegs) >= num_kegs:
                                # Adds new item to list, removes the smallest
                                # to maintain size
                                heapq.heappushpop(optimal_kegs, (ratio, keg))
                            else:
                                heapq.heappush(optimal_kegs, (ratio, keg))
                            break
                else:
                    # Will only occur for the very first keg crawled
                    heapq.heappush(optimal_kegs, (ratio, keg))

        # Typical link: Shop/ProductList.aspx/_/N-15Z1z141vn/No-100?DNID=Beer
        # If No- is evenly divisible by 100, it leads to more pages to add
        if 'No-' in page_link:
            if int(page_link.split('No-')[1].split('?')[0]) % 100 == 0:
                # Unique new page links with their base url appended
                new_page_links[:] = unique(get_html(page_link).xpath(
                    '//div[@class="ProductListPaging"]/a/@href'))
                page_links += [base_url + x for x in new_page_links]

    # Sort the list in descending order by ratio (index 0 in the keg tuple)
    return sorted(optimal_kegs, key=lambda x: x[0], reverse=True)
def __init__(self, player_name, player_link, Team=None, position=None):
    self.min_snap_perc = .10
    self.name = player_name
    self.player_link = player_link
    config = get_config(os.getcwd())
    self.base_url = config['base_url']
    self.full_player_url = self.base_url + player_link
    self.standardized_position_dict = {
        "OL": {
            "eligible_positions": [
                "G", "T", "C", "LS", "OT", "OG", "OL", "G/C", "G-C",
                "T-G", "G-T", "C-G", "G,C", "C,G", "G,T", "T,G"
            ],
            "class": OffLineman,
            "side": "offense"
        },
        "QB": {
            "eligible_positions": ["QB"],
            "class": Quarterback,
            "side": "offense"
        },
        "WR": {
            "eligible_positions": ["WR", "PR-WR", "WR/RB"],
            "class": WideReceiver,
            "side": "offense"
        },
        "TE": {
            "eligible_positions": ["TE", "LS,TE", "TE-C"],
            "class": TightEnd,
            "side": "offense"
        },
        "RB": {
            "eligible_positions": ["RB", "FB", "FB-LB", "HB"],
            "class": RunningBack,
            "side": "offense"
        },
        "DB": {
            "eligible_positions": ["SS", "FS", "CB", "DB", "S"],
            "class": DefBack,
            "side": "defense"
        },
        "LB": {
            "eligible_positions": ["LB", "OLB", "ILB", "MLB", "LB-DE"],
            "class": Linebacker,
            "side": "defense"
        },
        "DL": {
            "eligible_positions": [
                "DT", "DL", "NT", "DE", "NT-DT", "DT-NT", "DE-LB",
                "DT/LB", "DE-C", "DE-DT", "DT-DE"
            ],
            "class": DefLineman,
            "side": "defense"
        },
        "K": {
            "eligible_positions": ["K"],
            "class": Kicker,
            "side": "special_teams"
        },
        "P": {
            "eligible_positions": ["P"],
            "class": Punter,
            "side": "special_teams"
        }
    }
    if Team is not None:
        self.game_html_page = Team.game_html_page
        self.season = Team.season
        self.week = Team.week
        self.team = Team.team
        self.team_abbrev = Team.team_abbrev
        self.base_url = Team.base_url
    if position is None:
        self.player_page = get_html(self.full_player_url)
        self.meta_div = self.player_page.find(
            "div", {"itemtype": "https://schema.org/Person"})
        position = self.get_position_from_player_page(self.meta_div)
    self.standardized_pos = [k for k, v in self.standardized_position_dict.items()
                             if position in v["eligible_positions"]][0]
    self.player_class = self.standardized_position_dict[
        self.standardized_pos]["class"]
    self.side = self.standardized_position_dict[
        self.standardized_pos]['side']
def generate_script_dicts(self):
    for s in self.scripts:
        html = get_html('{0}{1}'.format(self.script_url, s))
        for d in self.get_relevant_dict(html):
            yield d
def get_game_page(self):
    game_soup = get_html(self.game_full_url)
    return game_soup
def html_to_json(url):
    category, uid = tokenize(url)
    schema_name = 'schema/{}.json'.format(category)
    with open(schema_name, 'rb') as fp:
        template = json.load(fp)

    html_doc = get_html(url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    table_title = None
    result = {}
    ignore_image = True
    for tr in soup.find_all('tr'):
        # keep only the most bottom level tr
        if tr.find_all('tr'):
            continue

        is_title_row = False
        row_content = []
        for td in tr.find_all('td'):
            if ignore_image and td.find_all('img'):
                continue
            text = clean_up(td.text)
            if text in template:
                table_title = text
                is_title_row = True
                row_titles = template[table_title]
                ignore_image = row_titles['ignore image']
                result[table_title] = {}
                break
            link = ''
            for a in td.find_all('a'):
                link = a.get('href')
            row_content.append({'text': text, 'link': link})

        if is_title_row:
            continue
        if not row_content or not table_title:
            continue

        column_index = row_titles['column index']
        strict_match = row_titles['strict match']
        regex_match = row_titles['regex match']
        terminate_on_mismatch = row_titles['terminate on mismatch']

        matched = False
        if len(row_content) > column_index + 1:
            candidate_row_title = row_content[column_index]['text']
            for s in strict_match:
                if s == candidate_row_title and s not in result[table_title]:
                    matched = True
                    result[table_title][s] = row_content[column_index + 1:]
                    break
            if not matched:
                for s in regex_match:
                    if s in candidate_row_title:
                        matched = True
                        result[table_title][u'Certified Votes'] = row_content[column_index + 1:]
                        break
                    if re.match(s, candidate_row_title):
                        matched = True
                        category, race_id = tokenize(row_content[column_index + 1]['link'])
                        result[table_title][race_id] = row_content[column_index:]
                        break
        if terminate_on_mismatch and not matched:
            table_title = None
            ignore_image = True

    return result
def main():
    html = get_html(url)
    table = get_macrolang_table(html)
    langs = parse_macrolang_table(table)
    detailed_list = get_detailed_list(html)
    parse_detailed_list(detailed_list)
def get_areas(area_url):
    try:
        resp = utils.get_html(area_url)
    except Exception, e:
        print area_error, e
        exit(1)
def __init__(self, profile_id):
    self.profile_id = profile_id
    profile_html = utils.get_html("profile", self.profile_id)
    self.name = utils.get_name_from_html(profile_html)
    self.solved = 0
    self.problems = []
def run(self):
    big_json_name = big_json_path + '/%s_%s_%s.big_json' % (
        now_time, os.getpid(), get_ident())
    while True:
        if not message_que.empty():
            rows = message_que.get()
            for url in rows:
                utils.printf(url)
                key = random.choice(RKEY_PROXY)
                proxy_ = connRedis.srandmember(key)
                proxy = {
                    'http': proxy_,
                    'https': proxy_,
                }
                feature = "highwire-cite-metadata"
                feature_2 = "pane-title"
                # res = utils.get_html(url, feature=feature, proxies=proxy, timeout=200)
                res = utils.get_html(url, feature=feature, timeout=200)
                if res:
                    html = res.text.strip()
                    HEADER = {
                        "Accept": "*/*",
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
                    }
                    try:
                        sumDict = dict()
                        info_url = url + "/tab-article-info"
                        # res_info = requests.get(info_url, headers=HEADER, proxies=proxy, timeout=20)
                        res_info = requests.get(info_url, headers=HEADER, timeout=200)
                        if res_info.status_code == 200:
                            if res_info.text.find(feature_2) > 0:
                                info_html = res_info.text.strip()
                                sumDict['provider_url'] = url
                                sumDict['down_date'] = now_time
                                sumDict['htmlText'] = html
                                sumDict['info_htmlText'] = info_html
                                with open(big_json_name, mode='a', encoding='utf-8') as f:
                                    line = json.dumps(
                                        sumDict, ensure_ascii=False).strip() + '\n'
                                    f.write(line)
                                utils.printf(url, 'write to big_json')
                                sql_queue.put(url)
                            else:
                                utils.printf("not find feee_info")
                                message_que.put(rows)
                        elif res_info.status_code == 404:
                            sumDict['provider_url'] = url
                            sumDict['down_date'] = now_time
                            sumDict['htmlText'] = html
                            sumDict['info_htmlText'] = ""
                            with open(big_json_name, mode='a', encoding='utf-8') as f:
                                line = json.dumps(
                                    sumDict, ensure_ascii=False).strip() + '\n'
                                f.write(line)
                            utils.printf(url, 'write to big_json')
                            sql_queue.put(url)
                        else:
                            message_que.put(rows)
                    except Exception as e:
                        utils.printf(e)
                        message_que.put(rows)
                else:
                    message_que.put(rows)
def get_interview_text(interview_url):
    """
    Fetch a single piece of interview text and meta-data from a source webpage

    Parameters
    ----------
    interview_url : String
        The url to the webpage

    Returns
    -------
    interview_name : String
        Name of this interview
    interview_time : String
        When this interview happened
    interview_players : List[String]
        Interviewees
    interview_text : String
        An unprocessed String of raw interview text (including Questions and
        interviewee responses)
    """
    # example url: http://www.asapsports.com/show_conference.php?id=144725

    # fetch HTML
    soup = get_html(interview_url)

    assert len(soup.find_all('h1')) == 1
    if soup.find_all('h1')[0].a is not None:
        interview_name = str(soup.find_all('h1')[0].a.contents[0])
    else:
        interview_name = str(soup.find_all('h1')[0].contents[0])

    assert len(soup.find_all('h2')) == 1
    interview_time = str(soup.find_all('h2')[0].contents[0])

    # find all players attending this interview
    interview_players = []
    for link in soup.find_all('a'):
        if 'show_player.php' in link.get('href'):
            interview_players.append(str(link.contents[0]))

    # find interview text
    for td in soup.find_all('td'):
        if td.get('valign') == 'top' and td.get('style') == 'padding: 10px;':
            raw_interview_text = td.contents
            interview_text = ''
            for item in raw_interview_text:
                # all actual text is either directly below the td Tag
                # or inside a Tag with name 'b'
                if type(item) is NavigableString:
                    interview_text += str(item)
                elif type(item) is Tag and item.name == 'b':
                    # cope with empty tags: <b></b>
                    if len(item.contents) > 0:
                        interview_text += str(item.contents[0])

    # remove non-breaking spaces and stray 'Â' characters from the text
    interview_text = interview_text.replace('\xa0', ' ')
    interview_text = interview_text.replace('Â', ' ')

    return interview_name, interview_time, interview_players, interview_text
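# A small usage sketch for get_interview_text(); the URL reuses the example
# id noted in the comment above and may no longer resolve, so treat this as
# illustrative only.
if __name__ == '__main__':
    name, when, players, text = get_interview_text(
        'http://www.asapsports.com/show_conference.php?id=144725')
    print(name)
    print(when)
    print(', '.join(players))
    print(text[:500])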
from bs4 import BeautifulSoup

from utils import get_html


def process_scripts(html):
    soup = BeautifulSoup(html, "lxml")
    scripts = soup.find_all("script", {"type": "text/javascript"})
    return scripts


def process_shared_data(data):
    shared_data = [s.string for s in data if "window._sharedData = " in str(s)]
    post_links = str(shared_data).split(
        "edge_sidecar_to_children")[-1].replace(r"\\u0026", "&")
    post_links = post_links.split(",")
    links = [str(link.split('":"')[-1].rstrip('"'))
             for link in post_links
             if 'display_url' in link or 'video_url' in link]
    return links


if __name__ == "__main__":
    html = get_html("https://www.instagram.com/p/CCnsE2PJktq/").text
    data = process_scripts(html)
    print(process_shared_data(data))
def get_family_urls(self, url1):
    html = get_html(url1)
    self.family_urls = set([])
    for relevant_dict in self.get_relevant_dict(html):
        self.family_urls.add()
def run(self):
    big_json_name = big_json_path + '/%s_%s_%s.big_json' % (
        now_time, os.getpid(), get_ident())
    HEADER = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    # url_zhuye = "https://www.pnas.org"
    # sn = requests.session()
    # res_zhuye = sn.get(url_zhuye, headers=HEADER, timeout=2)
    while True:
        if not message_que.empty():
            rows = message_que.get()
            for url in rows:
                utils.printf(url)
                key = random.choice(RKEY_PROXY)
                proxy_ = connRedis.srandmember(key)
                proxy = {
                    'http': proxy_,
                    'https': proxy_,
                }
                feature = "highwire-cite-metadata"
                feature_2 = "pane-title"
                res = utils.get_html(url, feature=feature, proxies=proxy, timeout=15)
                if res:
                    html = res.text.strip()
                    h = Selector(text=html)
                    node_id = h.xpath("//div[@class='pane-content']/div[@class='highwire-article-citation highwire-citation-type-highwire-article']/@data-node-nid").extract_first()
                    info_url = "https://www.pnas.org/panels_ajax_tab/jnl_pnas_tab_info/node:%s/1" % node_id
                    utils.printf(info_url)
                    try:
                        sumDict = dict()
                        res_info = requests.get(info_url, headers=HEADER, proxies=proxy, timeout=200)
                        if res_info.status_code == 200:
                            if res_info.text.find(feature_2) > 0:
                                info_html = res_info.text.strip()
                                info_html = json.loads(info_html)['markup']
                                sumDict['provider_url'] = url
                                sumDict['down_date'] = now_time
                                sumDict['htmlText'] = html
                                sumDict['info_htmlText'] = info_html
                                with open(big_json_name, mode='a', encoding='utf-8') as f:
                                    line = json.dumps(sumDict, ensure_ascii=False).strip() + '\n'
                                    f.write(line)
                                utils.printf(url, 'write to big_json')
                                sql_queue.put(url)
                            else:
                                utils.printf("not find feee_info")
                                message_que.put(rows)
                        elif res_info.status_code == 404:
                            sumDict['provider_url'] = url
                            sumDict['down_date'] = now_time
                            sumDict['htmlText'] = html
                            sumDict['info_htmlText'] = ""
                            with open(big_json_name, mode='a', encoding='utf-8') as f:
                                line = json.dumps(sumDict, ensure_ascii=False).strip() + '\n'
                                f.write(line)
                            utils.printf(url, 'write to big_json')
                            sql_queue.put(url)
                        else:
                            utils.printf(res_info.status_code)
                            message_que.put(rows)
                    except Exception as e:
                        utils.printf(e)
                        message_que.put(rows)
                else:
                    print("1111")
                    message_que.put(rows)
def get_abv(self):
    ''' Attempts to find percentage of alcohol by volume using Bing '''
    abv = ''
    found_abv = ''

    # A ceiling for ABV content for validation
    # We can assume BevMo does not offer kegs with this high of an ABV
    max_abv = 20.0

    if not self.parsed:
        self.parse()

    search_url = 'https://www.bing.com/search?q={0}+alcohol+content'.format(
        '+'.join(self.name.split()))

    search_links = get_html(search_url).xpath('//a/@href')
    new_search_links = search_links[search_links.index('javascript:'):][1:]
    results = [x for x in new_search_links if x != '#' and 'site:' not in x]

    # Max number of links to search for alcohol by volume (ABV)
    num_attempts = self.num_attempts

    # Filter links with same domain to improve chances of matching
    searched_domains = set()

    # Add the top page results that are unique, r_it is an iterator
    top_results = []
    r_it = 0
    result_link = ''

    while len(top_results) < num_attempts and r_it < len(results):
        result_link = results[r_it]
        domain = '{url.netloc}'.format(url=urlparse(result_link))
        if '.' in domain:
            if domain.count('.') > 1:
                domain = domain.split('.')[1]
            else:
                domain = domain.split('.')[0]

        # Avoid already searched domains
        if domain in searched_domains:
            r_it += 1
        else:
            top_results.append(result_link)
            r_it += 1
            searched_domains.add(domain)

    for i in xrange(min(num_attempts, len(top_results))):
        if self.verbose:
            print('Searching {}'.format(top_results[i]))

        try:
            search_text = ''.join(get_text(get_html(top_results[i])))
        except Exception:
            continue

        # Retrieves partial string containing the words ABV and a %
        abv = re.search('(?<=[Aa][Bb][Vv])[^\d]*(\d+[.]?\d*)(?=%)|(?<=%)[^\d]*(\d+[.]?\d*)[^\d]*(?=[Aa][Bb][Cc])', search_text)

        if abv:
            abv = abv.group()

            # Filters for a number with or without a decimal pt
            abv = float(re.search('(\d+[.]?\d*)', abv).group())

            # If new ABV is 0.0, return previously found ABV if any,
            # otherwise move onto the next link
            if abv == 0.0:
                if found_abv:
                    if self.verbose:
                        print('ABV for {} is {}'.format(self.name, abv))
                else:
                    continue

            if abv < max_abv:
                if abv < max_abv / 2:
                    if self.verbose:
                        print('ABV for {} is {}'.format(self.name, abv))
                    return abv

                # Replace the new ABV only if the next is lower
                if found_abv:
                    if abv < found_abv:
                        if self.verbose:
                            print('ABV for {} is {}'.format(self.name, abv))
                        return abv
                    else:
                        if self.verbose:
                            print('ABV for {} is {}'.format(self.name, found_abv))
                        return found_abv

                # Sets the new ABV to the found ABV
                found_abv = abv
        else:
            if found_abv:
                if self.verbose:
                    print('ABV for {} is {}'.format(self.name, found_abv))
                return found_abv

    # No ABV was found by this point
    if self.verbose:
        print('ABV not found for {}'.format(self.name))
    return None
def search(query, pages=1, lang='en', area='com', ncr=False, void=True,
           time_period=False, sort_by_date=False, first_page=0):
    """Returns a list of GoogleResult.

    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.
        area: Area of google homepages.
        first_page: First page.

    TODO: add support to get the google results.

    Returns:
        A list of GoogleResult objects.
    """
    start = time.time()
    results = []
    for i in range(first_page, first_page + pages):
        url = _get_search_url(query, i, lang=lang, area=area, ncr=ncr,
                              time_period=time_period,
                              sort_by_date=sort_by_date)
        html = get_html(url)
        urls_time = time.time()
        print('got html in ' + str(urls_time - start) + 's')

        if html:
            soup = BeautifulSoup(html, "html.parser")
            divs = soup.findAll("div", attrs={"class": "g"})

            results_div = soup.find("div", attrs={"id": "resultStats"})
            number_of_results = _get_number_of_results(results_div)

            parse_time = time.time()
            print('parsed html in ' + str(parse_time - urls_time) + 's')

            j = 0
            for li in divs:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)
                res.number_of_results = number_of_results

                if void is True:
                    if res.description is None:
                        continue
                results.append(res)
                j += 1

    return results
def parsel_detail():
    now_time = time.strftime('%Y%m%d')
    conn_1 = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    conn_2 = sqlite3.connect('zt_template.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type = '1'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_up = "update detail set stat = 1 where url = %s"
    sql_in = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result_1 = []
    result_2 = []
    while True:
        sql = "select provider_subject,title,url from detail where stat=0 and failcount < 20 limit 1000"
        cur = conn_1.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for provider_subject, title, url in rows:
                utils.printf(url)
                feature = "tdbg_rightall"
                if "Soft_Showja.asp" in url:
                    SoftID = re.findall("SoftID=(.*)", url)[0]
                    rawid = "ja%s" % SoftID
                else:
                    SoftID = re.findall("SoftID=(.*)", url)[0]
                    rawid = SoftID
                fdir = '%s/%s' % (detail_path, now_time)
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                filename = '%s/%s.html' % (fdir, rawid)
                if os.path.exists(filename):
                    continue
                res = utils.get_html(url, feature=feature, proxies=proxy)
                time.sleep(2)
                if res:
                    with open(filename, 'w', encoding='gb18030') as f:
                        f.write(res.content.decode("gb18030"))
                    utils.printf(filename)
                    # html = Selector(res.content.decode("gb18030"), "html")
                    # creator = html.xpath("//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract()[0].replace("作者:", "")
                    # if creator == "unknow":
                    #     creator = ""
                    # if "Soft_Showja.asp" in url:
                    #     language = "JA"
                    #     country = "JP"
                    #     SoftID = re.findall("SoftID=(.*?)", url)[0]
                    #     rawid = "ja%s" % SoftID
                    #     Lngid = utils.GetLngid(sub_db_id, rawid)
                    # else:
                    #     language = "EN"
                    #     country = "US"
                    #     SoftID = re.findall("SoftID=(.*)", url)[0]
                    #     rawid = SoftID
                    #     Lngid = utils.GetLngid(sub_db_id, rawid)
                    # provider_url = provider + '@' + url
                    # provider_id = provider + '@' + rawid
                    # batch = str(now_time) + '00'
                    result_1.append((url))
                    # result_2.append(
                    #     (Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium)
                    # )
                    if utils.parse_results_to_sql(conn_1, sql_up, result_1, 50):
                        utils.printf("updated %s rows" % len(result_1))
                        result_1.clear()
                    # if utils.parse_results_to_sql(conn_2, sql_in, result_2, 50):
                    #     utils.printf("inserted %s rows" % len(result_2))
                    #     result_2.clear()
    utils.parse_results_to_sql(conn_1, sql_up, result_1)
    utils.printf("updated the remaining %s rows" % len(result_1))
    result_1.clear()
def parse(self):
    html = get_html(self.url)
    for lang, code in self.generate_pairs(html):
        yield {'name': lang, 'sil': code, 'on_bible_org': True}
def _crawl(self):
    print "Getting info for %s" % self.seed
    html = get_html(self.seed)
    self.index = extract_links(html)
def get_scripts(self, url1):
    html = get_html(url1)
    self.scripts = set([])
    for relevant_dict in self.get_relevant_dict(html):
        self.scripts.add(relevant_dict['script'].lower().strip('\n'))
def get_soup(self):
    html = utils.get_html(self.url)
    result = regex.findall(html)
    print result
    soup = BeautifulSoup(html)
    return soup
        if d.find('span', class_='icon3'):
            member['email'] = d.p.string
        if d.find('span', class_='icon4'):
            member['website'] = d.p.string

        # add record to the list
        membership.append(member)

    return membership


if __name__ == "__main__":
    # page specific setup
    page_url = ""
    output_filename = 'Data/aero.csv'
    fields = []

    # hack for dealing with accented text
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # read single page and extract data
    html = get_html(page_url)
    if html:
        record_list = parse_aero(html)
        print record_list

    # write records to csv
    write_to_csv(output_filename, fields, record_list)
import utils

###########################################################################

if __name__ == "__main__":
    db.init()
    progress = int(db.get_param('route_update_progress', 0))
    db.cur.execute("SELECT id FROM route WHERE id > ? ORDER BY id",
                   (progress,))
    rows = db.cur.fetchall()
    tobedone = map(lambda x: x[0], rows)

    rp = RouteParser()
    for id in tobedone:
        print "\n\nupdate route id", id
        url = utils.get_route_url(id)
        html = utils.get_html(url, data=id)
        try:
            rp.parse(id, html)
            db.cache_del(url)
        except Exception as e:
            print " ", id, "failed, error:", e
            db.update_table('route', 'id', id, parsed='F')
            raise
        db.set_param('route_update_progress', id)

    db.close()
# coding:utf-8
import utils

'''
Crawl the images from a Baidu Tieba post
'''

url = 'http://tieba.baidu.com/p/1753935195'
html = utils.get_html(url)
print(html)

# open pageCode.txt for writing
pageFile = open('pageCode.txt', 'wb+')
# write the page source
pageFile.write(html)
# remember to close the file
pageFile.close()
def download_gallery(site):
    start = time.time()

    # for offensive warning
    need_cookies = False
    cookies = None

    html = utils.get_html(site)
    if not html:
        print('Failed to retrieve gallery page, process will be aborted!')
        return

    if utils.is_warning_page(html):
        print('Page has offensive content, setting cookies to get around it')
        need_cookies = True
        cookies = utils.get_cookies(site)
        html = utils.get_html_with_cookies(site, cookies)

    metadata = get_gallery_metadata(html)
    urls = get_page_urls(html)
    sections = metadata["Length"].split()
    total_images = int(sections[0]) if sections else 0
    title = metadata["Title"]

    print('Below is the information of the gallery...')
    print_metadata(metadata)
    print('Start downloading...')

    title = title.replace('/', ' of ')
    if not utils.create_dir(title):
        return

    if total_images:
        utils.print_progress(0, total_images)
    else:
        print("Failed to get total number of images, progress bar is disabled!")

    i = 0
    img_fails = []
    gallery_page_fails = []
    img_page_fails = []

    # download images in each gallery page
    for url in urls:
        page_html = utils.get_html_with_cookies(
            url, cookies) if need_cookies else utils.get_html(url)
        if not page_html:
            gallery_page_fails.append(url)
            continue
        image_urls = get_image_urls(page_html)
        for image_url in image_urls:
            image_page_html = utils.get_html(image_url)
            if not image_page_html:
                img_page_fails.append(image_url)
                continue
            image_src = get_image_src(image_page_html)
            parts = image_src.split('.')
            extension = ('.' + parts[-1] if parts[-1] else '.jpg') if parts else '.jpg'
            file_name = get_file_name(total_images, i + 1) + extension
            file_path = title + '/' + file_name
            if not os.path.exists(file_path):
                if not utils.get_image(image_src, file_path):
                    img_fails.append(file_name)
            i += 1
            if total_images:
                utils.print_progress(i, total_images)

    # downloading result
    succeed = True
    if gallery_page_fails or img_page_fails:
        succeed = False
        print('Failed to load following pages:')
        for url in gallery_page_fails:
            print(url)
        for url in img_page_fails:
            print(url)

    if img_fails:
        succeed = False
        print('Failed to download following %s files...' % len(img_fails))
        for img in img_fails:
            print(img)

    if succeed:
        print('All files are downloaded successfully!')

    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Total time elapsed {:0>2}m:{:02.0f}s".format(
        int(hours) * 60 + int(minutes), seconds))
def get_page_nums(self, main_url):
    html = get_html(main_url)
    soup = BeautifulSoup(html, "lxml")
    # select() returns a list, so take the first matching element
    page = soup.select(".p-skip em b")[0]
    print "page number:" + page.get_text()
    return int(page.get_text())
if __name__ == "__main__":
    # hack for dealing with accented text
    reload(sys)
    sys.setdefaultencoding('utf-8')

    companies = get_mesi_urls()
    # sys.exit()

    # page specific setup
    """
    companies = ["http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/905426df69f342a985257ec0002146d8?OpenDocument",
                 "http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/097aeedcac7ea4de85257b3200715bc0?OpenDocument",
                 "http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/c06a60a8d08182f885257b32007164cc?OpenDocument",
                 "http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/11323676470757f285257b320071648b?OpenDocument"]
    """

    record_list = []

    # iterate through a list of company pages
    for company in companies:
        html = get_html(company)
        if html:
            record_list.append(parse_mesi_company_page(html))

    # write records to csv
    output_filename = 'Data/mesi.csv'
    fields = ['name', 'contact1', 'title1', 'contact2', 'title2', 'phone',
              'fax', 'email', 'website', 'revenues', 'description']
    write_to_csv(output_filename, fields, record_list)
def get_html(self, sil):
    url = '{0}/{1}'.format(self.base_url, sil)
    return get_html(url)