Example #1
def mangastream(url, name, dest, delim, digits, number):
    print "Downloading images from [mangastream]...\n"

    links = [tag.get('href') for tag in get_html(url).findAll(
        "ul", {"class": "dropdown-menu"})[-1].select('li > a')]
    match = re.search(r"(.*\/)(\d*)$", links[-1])
    base_url, num_pages = match.group(1), int(match.group(2))

    for i in range(1, num_pages + 1):
        try:
            image_url = get_html(
                base_url + str(i)).select("#manga-page")[0].get("src")
            new_name = set_name("", ".jpg", "", i, digits)
            download_file(image_url, new_name, dest, i)
        except:
            print "exception"
            pass
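
Almost every example in this collection calls a get_html helper that is not shown. The helpers differ between projects (some return a BeautifulSoup tree, others an lxml tree, raw text, or a response object), so the following is only a minimal sketch of the BeautifulSoup flavor this first example appears to assume:

import requests
from bs4 import BeautifulSoup


def get_html(url, timeout=10):
    # Hypothetical helper: fetch a page and return a parsed BeautifulSoup
    # tree, or None if the request fails. The original projects' versions
    # may behave differently.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return BeautifulSoup(response.text, "html.parser")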
Example #2
    def parse(self):
        ''' retrieves the page and parses the contents into the following fields

                self.name    (May include brewery/brand and/or beer)
                self.price   (USD)
                self.volume  (Gallons)
                self.num_avail  (Kegs)
                self.desc    (Keg description)
        '''
        if self.parsed:
            return

        self.parsed = True

        html = get_html(self.url)

        ''' Attempt to get name and volume '''
        try:
            self.name = html.xpath('//h1/text()')[0].strip()
            if '(' in self.name and ')' in self.name:
                split_name = self.name.split('(')
                self.name = split_name[0].strip()

                volume = filter(lambda x: is_num(x) if '.' not in x \
                                else x, split_name[1].strip(')').strip())
                if is_num(volume):
                    self.volume = float(volume)
                else:
                    self.volume = 0.0
            else:
                self.volume = 0.0
        except Exception:
            self.name = ''
            self.volume = 0.0

        ''' Attempt to get price '''
        try:
            self.price = float(html.xpath(
                '//span[@class="ProductDetailItemPrice"]/text()')[0].strip().strip('$'))
        except Exception:
            self.price = 0.0

        ''' Attempt to get number of available kegs '''
        try:
            self.num_avail = int(html.xpath(
                '//em/text()')[0].strip().split()[0])
        except Exception:
            self.num_avail = 0

        ''' Attempt to get description '''
        try:
            self.desc = html.xpath(
                '//td[@class="ProductDetailCell"]/p/text()')[0].strip()
        except Exception:
            self.desc = ''
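
parse() also relies on an is_num helper that is not included in the snippet. A plausible minimal version, assuming it only checks whether a string can be read as a number:

def is_num(value):
    # Hypothetical helper assumed by parse(); the original may differ.
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False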
Example #3
def source_vst():
    urllist = []
    try:
        js = utils.get_html()
        info_dict = utils.get_json(js)

        for item in info_dict["live"]:
            url_list = item["urllist"].split("#")
            for url in url_list:
                urllist.append(url)

    except Exception, e:
        print api_error, e
        exit(1)
    def parse_map(self):
        map_node = self.get_node(self.root, "//div[@class='routeMapInner']/a")
        if map_node is None: return None

        url = map_node.get('href')
        map_html = utils.get_html(utils.ptv_url + url)
        map_tree = etree.HTML(map_html)
        map_node = self.get_node(map_tree, "//div[@class='routeMapInner']/img")
        if map_node is None: return None

        # http://ptv.vic.gov.au/
        map_link = map_node.get('src')
        db.update_table('map', 'link', map_link)
        map_id = db.query("SELECT id FROM map WHERE link=?", (map_link,))

        return map_id
Example #5
def source_7po():
    urllist = []
    try:
        resp = utils.get_html(api_url)
        dom = xml.dom.minidom.parseString(resp)
        root = dom.documentElement
        channels = root.getElementsByTagName("channel")

        for channel in channels:
            for urlNode in channel.childNodes:
                # URL of the source
                url = urlNode.firstChild.wholeText
                urllist.append(url)

    except Exception, e:
        print api_error, e
        exit(1)
def shopping(query, pages=1):
    results = []
    for i in range(pages):
        url = _get_shopping_url(query, i)
        html = get_html(url)
        if html:
            j = 0
            soup = BeautifulSoup(html)

            products = soup.findAll("div", "g")
            print "yoooo", products
            for prod in products:
                res = ShoppingResult()

                divs = prod.findAll("div")
                for div in divs:
                    match = re.search(
                        "from (?P<count>[0-9]+) stores", div.text.strip())
                    if match:
                        res.store_count = match.group("count")
                        break

                h3 = prod.find("h3", "r")
                if h3:
                    a = h3.find("a")
                    if a:
                        res.compare_url = a["href"]
                    res.name = h3.text.strip()

                psliimg = prod.find("div", "psliimg")
                if psliimg:
                    img = psliimg.find("img")
                    if img:
                        res.thumb = img["src"]

                f = prod.find("div", "f")
                if f:
                    res.subtext = f.text.strip()

                price = prod.find("div", "psliprice")
                if price:
                    res.min_price = price.text.strip()

                results.append(res)
                j = j + 1
    return results
def get_aver_num(query, lang='en'):
    """Returns average number of search results.

    Args:
        query: String to search in google.

    Returns:
        int number"""
    av_num = 0
    url = _get_search_url(query, 0, lang=lang)
    html = get_html(url)

    if html:
        soup = BeautifulSoup(html, "html.parser")            
        av_num = soup.find("div", {"id": "resultStats"})
        av_num = _get_num(av_num)

    return av_num
def convert(amount, from_currency, to_currency):
    """Method to convert currency.

    Args:
        amount: numeric amount to convert
        from_currency: currency denomination of the amount to convert
        to_currency: target currency denomination to convert to
    """

    # same currency, no conversion
    if from_currency == to_currency:
        return amount * 1.0

    req_url = _get_currency_req_url(amount,
                                    from_currency, to_currency)
    response = get_html(req_url)
    rate = _parse_currency_response(response, to_currency)

    return rate
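
A brief usage sketch for convert(); the helpers _get_currency_req_url and _parse_currency_response and the remote rate service are not shown, so the result here is illustrative only:

if __name__ == "__main__":
    # Same currency short-circuits without a request.
    print(convert(100, 'USD', 'USD'))  # -> 100.0
    # Cross-currency conversion goes through the remote rate service.
    print(convert(100, 'USD', 'EUR'))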
def search(query, pages=1, lang='en', void=True):
    """Returns a list of GoogleResult.

    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.

    Returns:
        A GoogleResult object."""

    results = []
    for i in range(pages):
        url = _get_search_url(query, i, lang=lang)
        html = get_html(url)

        if html:
            soup = BeautifulSoup(html, "html.parser")
            lis = soup.findAll("div", attrs={"class": "g"})
            
            j = 0
            for li in lis:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)
                if void is True:
                    if res.description is None:
                        continue
                results.append(res)
                j += 1

    return results
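
A short usage sketch for search(); the fields printed are the ones populated on each GoogleResult above, and the query string is only an illustration:

if __name__ == "__main__":
    for result in search("python web scraping", pages=1):
        print(result.name)
        print(result.link)
        print(result.description)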
Example #10
def cntv_crawler(api):
    print "start running cntv crawler...."
    src_list = []
    for channel in channel_list:
        print "getting url from %s" % channel

        try:
            jsn = utils.get_html(api % channel)
            data = utils.get_json(jsn)
            src = {}
            src["code"] = channel
            src['auth'] = data['hls_url']['hls2']
            if "cctv" in channel:
                src["hls"] = data['hls_url']['hls1']
                src['flv'] = data['hds_url']['hds2']

            src_list.append(src)

        except Exception, e:
            print e
        else:
            pass
        finally:
            pass
Example #11
def get_channels(area, url):
    try:
        # get the URLs of all channels in this area
        c_url = url % (area[0], area[0])
        resp = utils.get_html(c_url)
        c_match = utils.get_json(resp, json_pattern)
        # c_match = re.search(json_pattern, r.text).group(1)
        c_json = json.loads(c_match)
        # get the HTML that contains the channel list
        c_html = c_json["html"]
        # list of channel URLs
        urllist = re.findall(url_pattern, c_html)
        # list of channel names
        namelist = re.findall(c_name_pattern, c_html)

        # channel dictionary mapping each channel name to its URL
        c_dict = {}
        for i in xrange(len(namelist)):
            c_dict[namelist[i]] = urllist[i]

    except Exception, e:
        print e
        return None
Example #12
def hotflick(url, name, dest, delim, digits, number):
    print "Downloading images from [hotflick]...\n"
    
    # get all page links if the gallery has more than one page
    div = get_html(url).find('div', {"class": "box-paging"})
    gallery_page_links = [str(tag['href'])
                          for tag in div.findAll('a', href=True)]

    # get image links
    if gallery_page_links != []:
        links = []
        for page in gallery_page_links:
            links.extend([link for link in get_page_links(
                "http://hotflick.net/" + page) if "/v/?q=" in link])
    else:
        links = [link for link in get_page_links(url) if "/v/?q=" in link]

    regex = re.compile(r'\.net/\w/v/\?q\=(\d+)\.(.*)(\.\w*)$', re.IGNORECASE)

    for link in links:
        try:
            # image name and filetype
            match = regex.search(link)
            ext = match.group(3)

            # image URL and output filename
            new_name = set_name(name, ext, delim, number, digits)
            image_url = "http://www.hotflick.net/u/n/{0}/{1}{2}".format(
                match.group(1), match.group(2), ext)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except:
            print "exception"
            pass
Example #13
def upix(url, name, dest, delim, digits, number):
    print "Downloading images from [upix]...\n"

    links = [str(tag['href'])
             for tag in get_html(url).findAll('a', {"class": "thumb"})]

    base_url = url
    if str.endswith(url, "/#none"):
        base_url = url[:-5]

    regex = re.compile(r'(\.[a-zA-Z]*)$', re.IGNORECASE)

    for link in links:
        try:
            # image URL and output filename
            image_url = base_url + link
            ext = regex.search(image_url).group(1)
            new_name = set_name(name, ext, delim, number, digits)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except:
            pass
Example #14
def imgur(url, name, dest, delim, digits, number):
    print "Downloading images from [imgur]...\n"

    if not str.endswith(url, "/layout/blog"):
        url += "/layout/blog"

    links = get_html(url).findAll('meta', {'property': 'og:image'})
    links = [link['content'] for link in links[1:]]

    regex = re.compile(r'\.com/\w*(\.[a-zA-Z]*)$', re.IGNORECASE)

    for image_url in links:
        try:
            # filetype
            ext = regex.search(image_url).group(1)

            # output filename
            new_name = set_name(name, ext, delim, number, digits)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except:
            pass
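
The mangastream, hotflick, upix and imgur downloaders above all depend on set_name and download_file helpers that are not shown. A minimal sketch, under the assumption that set_name builds a zero-padded filename and download_file streams the image to disk; the originals may add progress output or retries:

import os

import requests


def set_name(name, ext, delim, number, digits):
    # Hypothetical helper: build e.g. 'name-007.jpg' from a base name,
    # delimiter, counter and zero-padding width.
    padded = str(number).zfill(digits)
    if name:
        return "{0}{1}{2}{3}".format(name, delim, padded, ext)
    return "{0}{1}".format(padded, ext)


def download_file(image_url, new_name, dest, number):
    # Hypothetical helper: stream image_url into dest/new_name.
    response = requests.get(image_url, stream=True, timeout=10)
    response.raise_for_status()
    with open(os.path.join(dest, new_name), 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)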
Example #15
 page_number = 1
 #max_num = data['{}'.format(district)]
 os.chdir(os.path.join(path, 'CDATA'))
 with open(sfile, 'a', encoding='utf-8', newline='') as csvfile:
     csvwriter = csv.writer(csvfile)
     # writing the fields
     csvwriter.writerow(fields)
     os.chdir(path)
     while True:
         if page_number > max_num:
             break
         log_file.write('page{} process started at: '.format(page_number) +
                        time.ctime() + '\r\n')
         url = "https://www.justdial.com/%s/Lawyers/nct-10296083/page-%s" % (
             district, page_number)
         ut.get_html(url, page_number)
         time.sleep(5)
         page = open('temp{}.htm'.format(page_number),
                     'r',
                     encoding='utf-8')
         #page = urllib.request.urlopen(req , proxy , timeout=5)
         #time.ctime(1)
         # page=urllib2.urlopen(url)
         soup = BeautifulSoup(page.read(), "html.parser")
         services = soup.find_all('li', {'class': 'cntanr'})
         # Iterate through the 10 results in the page
         for service_html in services:
             # Parse HTML to fetch data
             name = ut.get_name(service_html)
             phone = ut.get_phone_number(service_html)
             #rating = get_rating(service_html)
Example #16
def get_optimal_kegs(args):
    ''' Gets kegs from bevmo.com
        finds the kegs with the optimal gallons of alcohol per USD
    '''
    num_kegs = args['top']
    beer_limit = args['limit']
    num_attempts = args['attempts']
    max_price = args['price']
    desc_filter = args['filter']
    desc_unfilter = args['unfilter']

    ''' The first url to crawl and its base url '''
    seed_url = ('http://www.bevmo.com/Shop/ProductList.aspx/'
                'Beer/Kegs/_/N-15Z1z141vn?DNID=Beer')
    base_url = '{url.scheme}://{url.netloc}'.format(url=urlparse(seed_url))

    ''' Get initial unique page links from the seed url
        append base_url to them
    '''

    '''     For info on XPaths, see:
            http://www.w3schools.com/xpath/xpath_syntax.asp
    '''
    init_page_links = []
    init_page_links[:] = unique(get_html(seed_url).xpath(
        '//div[@class="ProductListPaging"]/a/@href'))

    if not init_page_links:
        print('Failed to retrieve the initial keg page links!')
        return None

    ''' Lists for holding links to pages of beer kegs '''
    page_links = [seed_url] + map(lambda x: base_url + x, init_page_links)
    new_page_links = []

    ''' Lists for holding links to individual beer kegs '''
    beer_links = []
    new_beer_links = []

    ''' To keep track of already crawled beer kegs '''
    crawled_beers = set()

    ''' List for matching --filter and --unfilter keyword arguments to
        keg descriptions
    '''
    matched = []

    ''' List to hold top beer kegs, the size of optimal_kegs is limited by the
        num_kegs argument
    '''
    optimal_kegs = []

    keg = None
    while len(page_links) > 0 and len(crawled_beers) < beer_limit:
        ''' Links are removed as they are crawled '''
        page_link = page_links.pop(0)

        ''' Beer keg links '''
        new_beer_links[:] = unique(get_html(page_link).xpath(
            '//a[@class="ProductListItemLink"]/@href'))
        beer_links += [base_url + x for x in new_beer_links]

        ''' Crawl the beer keg links
            get the gallons of alcohol/USD ratio
        '''
        for link in beer_links:
            ''' Break if the number of crawled beers exceeds the limit '''
            if len(crawled_beers) >= beer_limit:
                break

            ''' Cache the BevMo beer id's to prevent duplicates '''
            beer_id = link.split('/')[-1]

            if beer_id not in crawled_beers:
                ''' Create BeerKeg object '''
                keg = BeerKeg(link, num_attempts, verbose=True)

                ''' Call keg.parse() then filter kegs by their descriptions
                    Calling keg.parse() produces fields keg.desc, keg.price, etc
                    keg.parse() will only parse once per keg object
                '''

                ''' Check if price is within range if one was given '''
                if max_price:
                    keg.parse()

                    if keg.price > max_price:
                        ''' Move onto the next keg and ignore this one '''
                        continue

                ''' args['filter'] has words that must be in the description '''
                ''' desc_filter has words that must be in the description '''
                if desc_filter:
                    keg.parse()

                    matched = [word in keg.desc for word in desc_filter]

                    ''' All keywords must be present for a match '''
                    if not all(matched):
                        ''' Move onto the next keg and ignore this one '''
                        continue

                ''' desc_unfilter has words that can't be in the description '''
                if desc_unfilter:
                    keg.parse()

                    matched = [word in keg.desc for word in desc_unfilter]

                    ''' Any keyword must be present to nullify a match '''
                    if any(matched):
                        ''' Move onto the next keg and ignore this one '''
                        continue

                ''' Add current beer to crawled beers '''
                crawled_beers.add(beer_id)

                ''' Print how many kegs have been crawled '''
                print('Keg {}'.format(len(crawled_beers)))

                ''' Gets the gallons of alcohol per USD for the keg '''
                ratio = keg.get_ratio()

                print('')

                ''' Maintain a sorted list of the current top 3 kegs using
                    heapq (heap queue algorithm)

                    optimal_kegs holds a tuple containing the ratio and keg
                    associated with it
                '''
                if optimal_kegs:
                    for opt_tuple in optimal_kegs:
                        ''' If ratio is greater than any keg ratio currently
                            in optimal_kegs, then add it
                        '''
                        if ratio > opt_tuple[0]:
                            if len(optimal_kegs) >= num_kegs:
                                ''' Adds new item to list
                                    removes the smallest to maintain size
                                '''
                                heapq.heappushpop(optimal_kegs, (ratio, keg))
                            else:
                                heapq.heappush(optimal_kegs, (ratio, keg))
                            break
                else:
                    ''' Will only occur for the very first keg crawled '''
                    heapq.heappush(optimal_kegs, (ratio, keg))

        ''' Typical link: Shop/ProductList.aspx/_/N-15Z1z141vn/No-100?DNID=Beer

            If No- is evenly divisible by 100, it leads to more pages to add
        '''
        if 'No-' in page_link:
            if int(page_link.split('No-')[1].split('?')[0]) % 100 == 0:
                ''' Unique new page links with their base url appended '''
                new_page_links[:] = unique(get_html(page_link).xpath(
                    '//div[@class="ProductListPaging"]/a/@href'))
                page_links += [base_url + x for x in new_page_links]

    ''' Sort the list in descending order by ratio
        (index 0 in the keg tuple)
    '''
    return sorted(optimal_kegs, key=lambda x: x[0], reverse=True)
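
A hedged invocation sketch for get_optimal_kegs(); the dictionary keys mirror the args read at the top of the function, and the values are placeholders:

if __name__ == "__main__":
    kegs = get_optimal_kegs({
        'top': 3,          # number of kegs to keep
        'limit': 20,       # maximum number of kegs to crawl
        'attempts': 2,     # links to try when looking up ABV
        'price': 150.0,    # optional maximum price, or None
        'filter': [],      # words that must appear in the description
        'unfilter': [],    # words that must not appear in the description
    })
    if kegs:
        for ratio, keg in kegs:
            print('{0}: {1}'.format(ratio, keg.name))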
Example #17
    def __init__(self, player_name, player_link, Team=None, position=None):

        self.min_snap_perc = .10
        self.name = player_name
        self.player_link = player_link

        config = get_config(os.getcwd())
        self.base_url = config['base_url']
        self.full_player_url = self.base_url + player_link
        self.standardized_position_dict = {
            "OL": {
                "eligible_positions": [
                    "G", "T", "C", "LS", "OT", "OG", "OL", "G/C", "G-C", "T-G",
                    "G-T", "C-G", "G,C", "C,G", "G,T", "T,G"
                ],
                "class":
                OffLineman,
                "side":
                "offense"
            },
            "QB": {
                "eligible_positions": ["QB"],
                "class": Quarterback,
                "side": "offense"
            },
            "WR": {
                "eligible_positions": ["WR", "PR-WR", "WR/RB"],
                "class": WideReceiver,
                "side": "offense"
            },
            "TE": {
                "eligible_positions": ["TE", "LS,TE", "TE-C"],
                "class": TightEnd,
                "side": "offense"
            },
            "RB": {
                "eligible_positions": ["RB", "FB", "FB-LB", "HB"],
                "class": RunningBack,
                "side": "offense"
            },
            "DB": {
                "eligible_positions": ["SS", "FS", "CB", "DB", "S"],
                "class": DefBack,
                "side": "defense"
            },
            "LB": {
                "eligible_positions": ["LB", "OLB", "ILB", "MLB", "LB-DE"],
                "class": Linebacker,
                "side": "defense"
            },
            "DL": {
                "eligible_positions": [
                    "DT", "DL", "NT", "DE", "NT-DT", "DT-NT", "DE-LB", "DT/LB",
                    "DE-C", "DE-DT", "DT-DE"
                ],
                "class":
                DefLineman,
                "side":
                "defense"
            },
            "K": {
                "eligible_positions": ["K"],
                "class": Kicker,
                "side": "special_teams"
            },
            "P": {
                "eligible_positions": ["P"],
                "class": Punter,
                "side": "special_teams"
            }
        }

        if Team is not None:
            self.game_html_page = Team.game_html_page
            self.season = Team.season
            self.week = Team.week
            self.team = Team.team
            self.team_abbrev = Team.team_abbrev
            self.base_url = Team.base_url

        if position is None:
            self.player_page = get_html(self.full_player_url)
            self.meta_div = self.player_page.find(
                "div", {"itemtype": "https://schema.org/Person"})
            position = self.get_position_from_player_page(self.meta_div)

        self.standardized_pos = [k for k, v in self.standardized_position_dict.items() \
                                 if position in v["eligible_positions"]][0]
        self.player_class = self.standardized_position_dict[
            self.standardized_pos]["class"]
        self.side = self.standardized_position_dict[
            self.standardized_pos]['side']
Example #18
 def generate_script_dicts(self):
     for s in self.scripts:
         html = get_html('{0}{1}'.format(self.script_url, s))
         for d in self.get_relevant_dict(html):
             yield d
Example #19
    def get_game_page(self):

        game_soup = get_html(self.game_full_url)
        return game_soup
Example #20
def html_to_json(url):
    category, uid = tokenize(url)
    schema_name = 'schema/{}.json'.format(category)
    with open(schema_name, 'rb') as fp:
        template = json.load(fp)
    html_doc = get_html(url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    table_title = None
    result = {}
    ignore_image = True
    for tr in soup.find_all('tr'):
        # keep only the most bottom level tr
        if tr.find_all('tr'):
            continue
        is_title_row = False
        row_content = []
        for td in tr.find_all('td'):
            if ignore_image and td.find_all('img'):
                continue
            text = clean_up(td.text)
            if text in template:
                table_title = text
                is_title_row = True
                row_titles = template[table_title]
                ignore_image = row_titles['ignore image']
                result[table_title] = {}
                break
            link = ''
            for a in td.find_all('a'):
                link = a.get('href')
            row_content.append({'text': text, 'link': link})

        if is_title_row:
            continue

        if not row_content or not table_title:
            continue

        column_index = row_titles['column index']
        strict_match = row_titles['strict match']
        regex_match = row_titles['regex match']
        terminate_on_mismatch = row_titles['terminate on mismatch']

        matched = False
        if len(row_content) > column_index + 1:
            candidate_row_title = row_content[column_index]['text']
            for s in strict_match:
                if s == candidate_row_title and s not in result[table_title]:
                    matched = True
                    result[table_title][s] = row_content[column_index + 1:]
                    break
            if not matched:
                for s in regex_match:
                    if s in candidate_row_title:
                        matched = True
                        result[table_title][u'Certified Votes'] = row_content[column_index + 1:]
                        break
                    if re.match(s, candidate_row_title):
                        matched = True
                        category, race_id = tokenize(row_content[column_index + 1]['link'])
                        result[table_title][race_id] = row_content[column_index:]
                        break
        if terminate_on_mismatch and not matched:
            table_title = None
            ignore_image = True
    return result
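
html_to_json() loads a per-category template from schema/<category>.json. Those files are not included here, but from the keys read above a single entry plausibly looks like this (table and row titles are placeholders):

# Hypothetical shape of one schema/<category>.json file, shown as the Python
# structure json.load() would return; only the key names are taken from the code.
example_template = {
    "Results": {                       # table title matched against cell text
        "ignore image": True,          # skip <td> cells containing images
        "column index": 0,             # column that holds the row title
        "strict match": ["Total"],     # exact row titles to capture
        "regex match": ["County"],     # row-title patterns to capture
        "terminate on mismatch": False,
    }
}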
Example #21
def main():
    html = get_html(url)
    table = get_macrolang_table(html)
    langs = parse_macrolang_table(table)
    detailed_list = get_detailed_list(html)
    parse_detailed_list(detailed_list)
Example #22
def get_areas(area_url):
    try:
        resp = utils.get_html(area_url)
    except Exception, e:
        print area_error, e
        exit(1)
Example #23
 def __init__(self, profile_id):
     self.profile_id = profile_id
     profile_html = utils.get_html("profile", self.profile_id)
     self.name = utils.get_name_from_html(profile_html)
     self.solved = 0
     self.problems = []
 def run(self):
     big_json_name = big_json_path + '/%s_%s_%s.big_json' % (
         now_time, os.getpid(), get_ident())
     while True:
         if not message_que.empty():
             rows = message_que.get()
             for url in rows:
                 utils.printf(url)
                 key = random.choice(RKEY_PROXY)
                 proxy_ = connRedis.srandmember(key)
                 proxy = {
                     'http': proxy_,
                     'https': proxy_,
                 }
                 feature = "highwire-cite-metadata"
                 feature_2 = "pane-title"
                 # res = utils.get_html(url,feature=feature,proxies=proxy,timeout=200)
                 res = utils.get_html(url, feature=feature, timeout=200)
                 if res:
                     html = res.text.strip()
                     HEADER = {
                         "Accept":
                         "*/*",
                         "User-Agent":
                         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
                     }
                     try:
                         sumDict = dict()
                         info_url = url + "/tab-article-info"
                         # res_info = requests.get(info_url,headers=HEADER,proxies=proxy,timeout=20)
                         res_info = requests.get(info_url,
                                                 headers=HEADER,
                                                 timeout=200)
                         if res_info.status_code == 200:
                             if res_info.text.find(feature_2) > 0:
                                 info_html = res_info.text.strip()
                                 sumDict['provider_url'] = url
                                 sumDict['down_date'] = now_time
                                 sumDict['htmlText'] = html
                                 sumDict['info_htmlText'] = info_html
                                 with open(big_json_name,
                                           mode='a',
                                           encoding='utf-8') as f:
                                     line = json.dumps(
                                         sumDict,
                                         ensure_ascii=False).strip() + '\n'
                                     f.write(line)
                                 utils.printf(url, 'write to big_json')
                                 sql_queue.put(url)
                             else:
                                 utils.printf("not find feee_info")
                                 message_que.put(rows)
                         elif res_info.status_code == 404:
                             sumDict['provider_url'] = url
                             sumDict['down_date'] = now_time
                             sumDict['htmlText'] = html
                             sumDict['info_htmlText'] = ""
                             with open(big_json_name,
                                       mode='a',
                                       encoding='utf-8') as f:
                                 line = json.dumps(
                                     sumDict,
                                     ensure_ascii=False).strip() + '\n'
                                 f.write(line)
                             utils.printf(url, 'write to big_json')
                             sql_queue.put(url)
                         else:
                             message_que.put(rows)
                     except Exception as e:
                         utils.printf(e)
                         message_que.put(rows)
                 else:
                     message_que.put(rows)
def get_interview_text(interview_url):
    """
    Fetch a single piece of interview text and meta-data from a source webpage

    Parameters
    ----------
    interview_url : String
        The url to the webpage

    Returns
    ------
    interview_name : String
        Name of this interview

    interview_time : String
        When this interview happened

    interview_players: List[String]
        Interviewees

    interview_text : String
        An unprocessed String of raw interview text (including Questions and interviewee responses)
    """
    # example url: http://www.asapsports.com/show_conference.php?id=144725

    # fetch HTML
    soup = get_html(interview_url)

    assert len(soup.find_all('h1')) == 1
    if soup.find_all('h1')[0].a is not None:
        interview_name = str(soup.find_all('h1')[0].a.contents[0])
    else:
        interview_name = str(soup.find_all('h1')[0].contents[0])
    assert len(soup.find_all('h2')) == 1
    interview_time = str(soup.find_all('h2')[0].contents[0])

    # find all players attending this interview
    interview_players = []
    for link in soup.find_all('a'):
        if 'show_player.php' in link.get('href'):
            interview_players.append(str(link.contents[0]))

    # find interview text
    for td in soup.find_all('td'):
        if td.get('valign') == 'top' and td.get('style') == 'padding: 10px;':
            raw_interview_text = td.contents
            interview_text = ''
            for item in raw_interview_text:
                # all actual texts are either directly below the td Tag or is a Tag with name 'b'
                if type(item) is NavigableString:
                    interview_text += str(item)
                elif type(item) is Tag and item.name == 'b':
                    # cope with empty tags: <b></b>
                    if len(item.contents) > 0:
                        interview_text += str(item.contents[0])

    # remove &nbsp; (\xa0) and stray 'Â' characters from the text
    interview_text = interview_text.replace('\xa0', ' ')
    interview_text = interview_text.replace('Â', ' ')

    return interview_name, interview_time, interview_players, interview_text
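
A brief usage sketch for get_interview_text(), using the example URL already cited in the function body; the output naturally depends on the live page:

if __name__ == "__main__":
    name, when, players, text = get_interview_text(
        "http://www.asapsports.com/show_conference.php?id=144725")
    print(name)
    print(when)
    print(players)
    print(text[:200])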
Example #26
from bs4 import BeautifulSoup

from utils import get_html


def process_scripts(html):
    soup = BeautifulSoup(html, "lxml")
    scripts = soup.find_all("script", {"type": "text/javascript"})
    return scripts


def process_shared_data(data):
    shared_data = [s.string for s in data if "window._sharedData = " in str(s)]
    post_links = str(shared_data).split(
        "edge_sidecar_to_children")[-1].replace(r"\\u0026", "&")
    post_links = post_links.split(",")
    links = [str(link.split('":"')[-1].rstrip('"'))
             for link in post_links if 'display_url' in link or 'video_url' in link]

    return links


if __name__ == "__main__":
    html = get_html("https://www.instagram.com/p/CCnsE2PJktq/").text
    data = process_scripts(html)
    print(process_shared_data(data))
Example #27
 def get_family_urls(self, url1):
     html = get_html(url1)
     self.family_urls = set([])
     for relevant_dict in self.get_relevant_dict(html):
         self.family_urls.add()
 def run(self):
     big_json_name =big_json_path + '/%s_%s_%s.big_json' % (now_time,os.getpid(),get_ident())
     HEADER = {
     'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
     }
     # url_zhuye = "https://www.pnas.org"
     # sn = requests.session()
     # res_zhuye = sn.get(url_zhuye,headers=HEADER,timeout=2)
     while True:
         if not message_que.empty():
             rows = message_que.get()
             for url in rows:
                 utils.printf(url)
                 key = random.choice(RKEY_PROXY)
                 proxy_ = connRedis.srandmember(key)
                 proxy = {
                     'http': proxy_,
                     'https': proxy_,
                 }
                 feature = "highwire-cite-metadata"
                 feature_2 = "pane-title"
                 res = utils.get_html(url,feature=feature,proxies=proxy,timeout=15)
                 if res:
                     html = res.text.strip()
                     h = Selector(text=html)
                     node_id = h.xpath("//div[@class='pane-content']/div[@class='highwire-article-citation highwire-citation-type-highwire-article']/@data-node-nid").extract_first()
                     info_url = "https://www.pnas.org/panels_ajax_tab/jnl_pnas_tab_info/node:%s/1" % node_id
                     utils.printf(info_url)
                     try:
                         sumDict = dict()
                         res_info = requests.get(info_url,headers=HEADER,proxies=proxy,timeout=200)
                         if res_info.status_code == 200:
                             if res_info.text.find(feature_2) > 0:
                                 info_html = res_info.text.strip()
                                 info_html = json.loads(info_html)['markup']
                                 sumDict['provider_url'] = url
                                 sumDict['down_date'] = now_time
                                 sumDict['htmlText'] = html
                                 sumDict['info_htmlText'] = info_html
                                 with open(big_json_name, mode='a', encoding='utf-8') as f:
                                     line = json.dumps(sumDict, ensure_ascii=False).strip() + '\n'
                                     f.write(line)
                                 utils.printf(url,'write to big_json')
                                 sql_queue.put(url)
                             else:
                                 utils.printf("not find feee_info")
                                 message_que.put(rows)
                         elif res_info.status_code == 404:
                             sumDict['provider_url'] = url
                             sumDict['down_date'] = now_time
                             sumDict['htmlText'] = html
                             sumDict['info_htmlText'] = ""
                             with open(big_json_name, mode='a', encoding='utf-8') as f:
                                 line = json.dumps(sumDict, ensure_ascii=False).strip() + '\n'
                                 f.write(line)
                             utils.printf(url,'write to big_json')
                             sql_queue.put(url)
                         else:
                             utils.printf(res_info.status_code)
                             message_que.put(rows)
                     except Exception as e:
                         utils.printf(e)
                         message_que.put(rows)
                 else:
                     print("1111")
                     message_que.put(rows)
Example #29
    def get_abv(self):
        ''' Attempts to find percentage of alcohol by volume using Bing '''
        abv = ''
        found_abv = ''

        ''' A ceiling for ABV content for validation

            We can assume BevMo does not offer kegs with this high of an ABV
        '''
        max_abv = 20.0

        if not self.parsed:
            self.parse()

        search_url = 'https://www.bing.com/search?q={0}+alcohol+content'.format(
            '+'.join(self.name.split()))
        search_links = get_html(search_url).xpath('//a/@href')
        new_search_links = search_links[search_links.index('javascript:'):][1:]

        results = [x for x in new_search_links if x != '#' and 'site:' not in x]

        ''' Max number of links to search for alcohol by volume (ABV) '''
        num_attempts = self.num_attempts

        ''' Filter links with same domain to improve chances of matching '''
        searched_domains = set()

        ''' Add the top page results that are unique, r_it is an iterator '''
        top_results = []
        r_it = 0
        result_link = ''

        while len(top_results) < num_attempts and r_it < len(results):
            result_link = results[r_it]
            domain = '{url.netloc}'.format(url=urlparse(result_link))
            if '.' in domain:
                if domain.count('.') > 1:
                    domain = domain.split('.')[1]
                else:
                    domain = domain.split('.')[0]

            ''' Avoid already searched domains '''
            if domain in searched_domains:
                r_it += 1
            else:
                top_results.append(result_link)
                r_it += 1
                searched_domains.add(domain)

        for i in xrange(min(num_attempts, len(top_results))):
            if self.verbose:
                print('Searching {}'.format(top_results[i]))

            try:
                search_text = ''.join(get_text(get_html(top_results[i])))
            except Exception:
                continue

            ''' Retrieves partial string containing the words ABV and a % '''
            abv = re.search(r'(?<=[Aa][Bb][Vv])[^\d]*(\d+[.]?\d*)(?=%)'
                            r'|(?<=%)[^\d]*(\d+[.]?\d*)[^\d]*(?=[Aa][Bb][Cc])',
                            search_text)
            if abv:
                abv = abv.group()

                ''' Filters for a number with or without a decimal pt '''
                abv = float(re.search('(\d+[.]?\d*)', abv).group())

                ''' If new ABV is 0.0, return previously found ABV if any
                    otherwise, move onto the next link
                '''
                if abv == 0.0:
                    if found_abv:
                        if self.verbose:
                            print('ABV for {} is {}'.format(self.name, abv))
                    else:
                        continue

                if abv < max_abv:
                    if abv < max_abv / 2:
                        if self.verbose:
                            print('ABV for {} is {}'.format(self.name, abv))

                        return abv

                    ''' Replace the new ABV only if the next is lower '''
                    if found_abv:
                        if abv < found_abv:
                            if self.verbose:
                                print('ABV for {} is {}'.format(self.name, abv))

                            return abv
                        else:
                            if self.verbose:
                                print('ABV for {} is {}'.format(
                                    self.name, found_abv))

                            return found_abv

                    ''' Sets the new ABV to the found ABV '''
                    found_abv = abv
            else:
                if found_abv:
                    if self.verbose:
                        print('ABV for {} is {}'.format(self.name, found_abv))
                    return found_abv

        ''' No ABV was found by this point '''
        if self.verbose:
            print('ABV not found for {}'.format(self.name))

        return None
Example #30
def search(query,
           pages=1,
           lang='en',
           area='com',
           ncr=False,
           void=True,
           time_period=False,
           sort_by_date=False,
           first_page=0):
    """Returns a list of GoogleResult.
    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.
        area : Area of google homepages.
        first_page : First page.
    TODO: add support to get the google results.
    Returns:
        A GoogleResult object."""

    start = time.time()
    results = []
    for i in range(first_page, first_page + pages):
        url = _get_search_url(query,
                              i,
                              lang=lang,
                              area=area,
                              ncr=ncr,
                              time_period=time_period,
                              sort_by_date=sort_by_date)
        html = get_html(url)

        urls_time = time.time()
        print('got html in ' + str(urls_time - start) + 's')
        if html:
            soup = BeautifulSoup(html, "html.parser")
            divs = soup.findAll("div", attrs={"class": "g"})

            results_div = soup.find("div", attrs={"id": "resultStats"})
            number_of_results = _get_number_of_results(results_div)

            parse_time = time.time()
            print('parsed html in ' + str(parse_time - urls_time) + 's')

            j = 0
            for li in divs:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)
                res.number_of_results = number_of_results

                if void is True:
                    if res.description is None:
                        continue
                results.append(res)
                j += 1
    return results
def parsel_detail():
    now_time = time.strftime('%Y%m%d')
    conn_1 = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    conn_2 = sqlite3.connect('zt_template.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type = '1'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_up = "update detail set stat = 1 where url = %s"
    sql_in = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result_1 = []
    result_2 = []
    while True:
        sql = "select provider_subject,title,url from detail where stat=0 and failcount < 20 limit 1000"
        cur = conn_1.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for provider_subject, title, url in rows:
                utils.printf(url)
                feature = "tdbg_rightall"
                if "Soft_Showja.asp" in url:
                    SoftID = re.findall("SoftID=(.*)", url)[0]
                    rawid = "ja%s" % SoftID
                else:
                    SoftID = re.findall("SoftID=(.*)", url)[0]
                    rawid = SoftID
                fdir = '%s/%s' % (detail_path, now_time)
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                filename = '%s/%s.html' % (fdir, rawid)
                if os.path.exists(filename):
                    continue
                res = utils.get_html(url, feature=feature, proxies=proxy)
                time.sleep(2)
                if res:
                    with open(filename, 'w', encoding='gb18030') as f:
                        f.write(res.content.decode("gb18030"))
                    utils.printf(filename)
                    # html = Selector(res.content.decode("gb18030"),"html")
                    # creator = html.xpath("//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract()[0].replace("作者:","")
                    # if creator == "unknow":
                    #     creator = ""
                    # if "Soft_Showja.asp" in url:
                    #     language = "JA"
                    #     country = "JP"
                    #     SoftID = re.findall("SoftID=(.*?)",url)[0]
                    #     rawid = "ja%s" % SoftID
                    #     Lngid = utils.GetLngid(sub_db_id,rawid)
                    # else:
                    #     language = "EN"
                    #     country = "US"
                    #     SoftID = re.findall("SoftID=(.*)",url)[0]
                    #     rawid = SoftID
                    #     Lngid = utils.GetLngid(sub_db_id,rawid)
                    # provider_url = provider + '@' + url
                    # provider_id = provider + '@' + rawid
                    # batch = str(now_time) + '00'
                    result_1.append((url))
                    # result_2.append(
                    #     (Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium)
                    # )
                if utils.parse_results_to_sql(conn_1, sql_up, result_1, 50):
                    utils.printf("updated %s rows" % len(result_1))
                    result_1.clear()
                # if utils.parse_results_to_sql(conn_2, sql_in, result_2, 50):
                #     utils.printf("inserted %s rows" % len(result_2))
                #     result_2.clear()
            utils.parse_results_to_sql(conn_1, sql_up, result_1)
            utils.printf("updated the remaining %s rows" % len(result_1))
            result_1.clear()
Example #32
 def parse(self):
     html = get_html(self.url)
     for lang, code in self.generate_pairs(html):
         yield {'name': lang, 'sil': code, 'on_bible_org': True}
Example #33
 def _crawl(self):
     print "Getting info for %s" % self.seed
     html = get_html(self.seed)
     self.index = extract_links(html)
Example #35
 def get_scripts(self, url1):
     html = get_html(url1)
     self.scripts = set([])
     for relevant_dict in self.get_relevant_dict(html):
         self.scripts.add(relevant_dict['script'].lower().strip('\n'))
Example #36
 def get_soup(self):
     html = utils.get_html(self.url)
     result = regex.findall(html)
     print result
     soup = BeautifulSoup(html)
     return soup
Example #37
            if d.find('span', class_='icon3'):
                member['email'] = d.p.string
            if d.find('span', class_='icon4'):
                member['website'] = d.p.string

        # add record to the list
        membership.append(member)

    return membership


if __name__ == "__main__":

    # page specific setup
    page_url = ""
    output_filename = 'Data/aero.csv'
    fields = []

    # hack for dealing with accented text
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # read single page and extract data
    html = get_html(page_url)
    if html:
        record_list = parse_aero(html)
        print record_list

    # write records to csv
    write_to_csv(output_filename, fields, record_list)
Example #38
import utils

###########################################################################

if __name__ == "__main__":
    db.init()

    progress = int(db.get_param('route_update_progress', 0))
    db.cur.execute("SELECT id FROM route WHERE id > ? ORDER BY id", (progress,))
    rows = db.cur.fetchall()
    tobedone = map(lambda x: x[0], rows)

    rp = RouteParser()

    for id in tobedone:
        print "\n\nupdate route id", id
        url = utils.get_route_url(id)
        html = utils.get_html(url, data=id)
        try:
            rp.parse(id, html)
            db.cache_del(url)
        except Exception as e:
            print "  ", id, "failed, error:", e
            db.update_table('route', 'id', id, parsed='F')
            raise

        db.set_param('route_update_progress', id)


    db.close()
Example #40
# coding:utf-8
import utils

'''
Crawl images from a Baidu Tieba post
'''

url = 'http://tieba.baidu.com/p/1753935195'

html = utils.get_html(url)
print(html)
# open pageCode.txt for writing
pageFile = open('pageCode.txt', 'wb+')
# write the page source
pageFile.write(html)
# remember to close the file once it has been opened
pageFile.close()
Example #41
def download_gallery(site):
    start = time.time()
    # for offensive warning
    need_cookies = False
    cookies = None
    html = utils.get_html(site)
    if not html:
        print('Failed to retrieve gallery page, process will be aborted!')
        return
    if utils.is_warning_page(html):
        print('Page has offensive content, setting cookies to get around it')
        need_cookies = True
        cookies = utils.get_cookies(site)
        html = utils.get_html_with_cookies(site, cookies)
    metadata = get_gallery_metadata(html)
    urls = get_page_urls(html)
    sections = metadata["Length"].split()
    total_images = int(sections[0]) if sections else 0
    title = metadata["Title"]
    print('Below is the information of the gallery...')
    print_metadata(metadata)
    print('Start downloading...')
    title = title.replace('/', ' of ')
    if not utils.create_dir(title):
        return
    if total_images:
        utils.print_progress(0, total_images)
    else:
        print(
            "Failed to get total number of images, progress bar is disabled!")
    i = 0
    img_fails = []
    gallery_page_fails = []
    img_page_fails = []

    #download images in each gallery page
    for url in urls:
        page_html = utils.get_html_with_cookies(
            url, cookies) if need_cookies else utils.get_html(url)
        if not page_html:
            gallery_page_fails.append(url)
            continue
        image_urls = get_image_urls(page_html)
        for image_url in image_urls:
            image_page_html = utils.get_html(image_url)
            if not image_page_html:
                img_page_fails.append(image_url)
                continue
            image_src = get_image_src(image_page_html)
            parts = image_src.split('.')
            extension = (
                '.' + parts[-1] if parts[-1] else '.jpg') if parts else '.jpg'
            file_name = get_file_name(total_images, i + 1) + extension
            file_path = title + '/' + file_name
            if not os.path.exists(file_path):
                if not utils.get_image(image_src, file_path):
                    img_fails.append(file_name)
            i += 1
            if total_images:
                utils.print_progress(i, total_images)

    #downloading result
    succeed = True
    if gallery_page_fails or img_page_fails:
        succeed = False
        print('Failed to load following pages:')
        for url in gallery_page_fails:
            print(url)
        for url in img_page_fails:
            print(url)
    if img_fails:
        succeed = False
        print('Failed to download following %s files...' % len(img_fails))
        for img in img_fails:
            print(img)
    if succeed:
        print('All files are downloaded successfully!')
    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Total time elapsed {:0>2}m:{:02.0f}s".format(
        int(hours) * 60 + int(minutes), seconds))
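
download_gallery() also calls a get_file_name helper that is not shown. A minimal sketch, assuming it zero-pads the image index to the width of the total image count:

def get_file_name(total_images, index):
    # Hypothetical helper assumed above; the original may pad differently.
    width = len(str(total_images)) if total_images else 3
    return str(index).zfill(width)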
Example #42
 def get_page_nums(self, main_url):
     html = get_html(main_url)
     soup = BeautifulSoup(html, "lxml")
     page = soup.select(".p-skip em b")[0]  # select() returns a list; take the first match
     print "page number:" + page.get_text()
     return int(page.get_text())
Example #43

if __name__ == "__main__":
	# hack for dealing with accented text
	reload(sys)
	sys.setdefaultencoding('utf-8')
	
	companies = get_mesi_urls()
	#sys.exit()
	
	
	# page specific setup
	"""
	companies = ["http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/905426df69f342a985257ec0002146d8?OpenDocument",
				"http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/097aeedcac7ea4de85257b3200715bc0?OpenDocument",
				"http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/c06a60a8d08182f885257b32007164cc?OpenDocument",
				"http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/11323676470757f285257b320071648b?OpenDocument"]
	"""
	record_list =[]
	
	# iterate through a list of company pages
	for company in companies:
		html = get_html(company)
		if html:
			record_list.append(parse_mesi_company_page(html))
			
	# write records to csv
	output_filename = 'Data/mesi.csv'
	fields = ['name', 'contact1', 'title1', 'contact2', 'title2','phone', 'fax', 'email', 'website', 'revenues', 'description']
	write_to_csv(output_filename, fields, record_list)
 def get_html(self, sil):
     url = '{0}/{1}'.format(self.base_url, sil)
     return get_html(url)