def mangastream(url, name, dest, delim, digits, number):
    print "Downloading images from [mangastream]...\n"

    links = [tag.get('href') for tag in get_html(url).findAll(
        "ul", {"class": "dropdown-menu"})[-1].select('li > a')]
    match = re.search(r"(.*\/)(\d*)$", links[-1])
    base_url, num_pages = match.group(1), int(match.group(2))

    for i in range(1, num_pages + 1):
        try:
            image_url = get_html(
                base_url + str(i)).select("#manga-page")[0].get("src")
            new_name = set_name("", ".jpg", "", i, digits)
            download_file(image_url, new_name, dest, i)
        except Exception:
            print "exception"
def parse(self):
    '''
    Retrieves the page and parses the contents into the following fields:

        self.name       (may include brewery/brand and/or beer)
        self.price      (USD)
        self.volume     (gallons)
        self.num_avail  (kegs)
        self.desc       (keg description)
    '''
    if self.parsed:
        return

    self.parsed = True
    html = get_html(self.url)

    # Attempt to get name and volume
    try:
        self.name = html.xpath('//h1/text()')[0].strip()
        if '(' in self.name and ')' in self.name:
            split_name = self.name.split('(')
            self.name = split_name[0].strip()
            volume = filter(lambda x: is_num(x) if '.' not in x else x,
                            split_name[1].strip(')').strip())
            if is_num(volume):
                self.volume = float(volume)
            else:
                self.volume = 0.0
        else:
            self.volume = 0.0
    except Exception:
        self.name = ''
        self.volume = 0.0

    # Attempt to get price
    try:
        self.price = float(html.xpath(
            '//span[@class="ProductDetailItemPrice"]/text()')[0].strip().strip('$'))
    except Exception:
        self.price = 0.0

    # Attempt to get number of available kegs
    try:
        self.num_avail = int(html.xpath(
            '//em/text()')[0].strip().split()[0])
    except Exception:
        self.num_avail = 0

    # Attempt to get description
    try:
        self.desc = html.xpath(
            '//td[@class="ProductDetailCell"]/p/text()')[0].strip()
    except Exception:
        self.desc = ''
def source_vst():
    urllist = []
    try:
        js = utils.get_html()
        info_dict = utils.get_json(js)
        for item in info_dict["live"]:
            url_list = item["urllist"].split("#")
            for url in url_list:
                urllist.append(url)
    except Exception, e:
        print api_error, e
        exit(1)
    return urllist
def parse_map(self):
    map_node = self.get_node(self.root, "//div[@class='routeMapInner']/a")
    if map_node is None:
        return None

    url = map_node.get('href')
    map_html = utils.get_html(utils.ptv_url + url)
    map_tree = etree.HTML(map_html)
    map_node = self.get_node(map_tree, "//div[@class='routeMapInner']/img")
    if map_node is None:
        return None

    # http://ptv.vic.gov.au/
    map_link = map_node.get('src')
    db.update_table('map', 'link', map_link)
    map_id = db.query("SELECT id FROM map WHERE link=?", (map_link,))
    return map_id
def source_7po():
    urllist = []
    try:
        resp = utils.get_html(api_url)
        dom = xml.dom.minidom.parseString(resp)
        root = dom.documentElement
        channels = root.getElementsByTagName("channel")
        for channel in channels:
            for urlNode in channel.childNodes:
                # URL of this source
                url = urlNode.firstChild.wholeText
                urllist.append(url)
    except Exception, e:
        print api_error, e
        exit(1)
    return urllist
def shopping(query, pages=1):
    results = []
    for i in range(pages):
        url = _get_shopping_url(query, i)
        html = get_html(url)
        if html:
            j = 0
            soup = BeautifulSoup(html)

            products = soup.findAll("div", "g")
            for prod in products:
                res = ShoppingResult()

                divs = prod.findAll("div")
                for div in divs:
                    match = re.search(
                        "from (?P<count>[0-9]+) stores", div.text.strip())
                    if match:
                        res.store_count = match.group("count")
                        break

                h3 = prod.find("h3", "r")
                if h3:
                    a = h3.find("a")
                    if a:
                        res.compare_url = a["href"]
                    res.name = h3.text.strip()

                psliimg = prod.find("div", "psliimg")
                if psliimg:
                    img = psliimg.find("img")
                    if img:
                        res.thumb = img["src"]

                f = prod.find("div", "f")
                if f:
                    res.subtext = f.text.strip()

                price = prod.find("div", "psliprice")
                if price:
                    res.min_price = price.text.strip()

                results.append(res)
                j = j + 1
    return results
def get_aver_num(query, lang='en'):
    """Returns average number of search results.

    Args:
        query: String to search in google.

    Returns:
        int number
    """
    av_num = 0
    url = _get_search_url(query, 0, lang=lang)
    html = get_html(url)

    if html:
        soup = BeautifulSoup(html, "html.parser")
        av_num = soup.find("div", {"id": "resultStats"})
        av_num = _get_num(av_num)

    return av_num
def convert(amount, from_currency, to_currency):
    """Method to convert currency.

    Args:
        amount: numeric amount to convert
        from_currency: currency denomination of the amount to convert
        to_currency: target currency denomination to convert to
    """
    # same currency, no conversion
    if from_currency == to_currency:
        return amount * 1.0

    req_url = _get_currency_req_url(amount, from_currency, to_currency)
    response = get_html(req_url)
    rate = _parse_currency_response(response, to_currency)

    return rate
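# A minimal usage sketch for convert(), assuming the helpers it relies on
# (_get_currency_req_url, get_html, _parse_currency_response) are defined in
# this module as shown above; the currency codes below are only illustrative.
if __name__ == '__main__':
    amount = 100
    converted = convert(amount, 'USD', 'EUR')  # performs one HTTP request
    print('100 USD is approximately {0} EUR'.format(converted))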
def search(query, pages=1, lang='en', void=True):
    """Returns a list of GoogleResult.

    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.

    Returns:
        A list of GoogleResult objects.
    """
    results = []
    for i in range(pages):
        url = _get_search_url(query, i, lang=lang)
        html = get_html(url)

        if html:
            soup = BeautifulSoup(html, "html.parser")
            lis = soup.findAll("div", attrs={"class": "g"})

            j = 0
            for li in lis:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)

                if void is True:
                    if res.description is None:
                        continue
                results.append(res)
                j += 1

    return results
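# A hedged usage sketch for search(); it assumes _get_search_url, get_html,
# the _get_* extractors and GoogleResult live in this module as above, and
# that Google's result markup still matches the "div.g" selector.
if __name__ == '__main__':
    for result in search('web scraping with python', pages=2, lang='en'):
        print('[page {0}] {1} -> {2}'.format(result.page, result.name, result.link))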
def cntv_crawler(api):
    print "start running cntv crawler...."
    src_list = []
    for channel in channel_list:
        print "getting url from %s" % channel
        try:
            jsn = utils.get_html(api % channel)
            data = utils.get_json(jsn)
            src = {}
            src["code"] = channel
            src['auth'] = data['hls_url']['hls2']
            if "cctv" in channel:
                src["hls"] = data['hls_url']['hls1']
                src['flv'] = data['hds_url']['hds2']
            src_list.append(src)
        except Exception, e:
            print e
        else:
            pass
        finally:
            pass
def get_channels(area, url):
    try:
        # URL listing all channels for this area
        c_url = url % (area[0], area[0])
        resp = utils.get_html(c_url)
        c_match = utils.get_json(resp, json_pattern)
        # c_match = re.search(json_pattern, r.text).group(1)
        c_json = json.loads(c_match)
        # HTML fragment containing the channel list
        c_html = c_json["html"]
        # list of channel URLs
        urllist = re.findall(url_pattern, c_html)
        # list of channel names
        namelist = re.findall(c_name_pattern, c_html)
        # channel dict mapping channel name to URL
        c_dict = {}
        for i in xrange(len(namelist)):
            c_dict[namelist[i]] = urllist[i]
        return c_dict
    except Exception, e:
        print e
        return None
def hotflick(url, name, dest, delim, digits, number):
    print "Downloading images from [hotflick]...\n"

    # get all page links if the gallery has more than one page
    div = get_html(url).find('div', {"class": "box-paging"})
    gallery_page_links = [str(tag['href'])
                          for tag in div.findAll('a', href=True)]

    # get image links
    if gallery_page_links != []:
        links = []
        for page in gallery_page_links:
            links.extend([link for link in get_page_links(
                "http://hotflick.net/" + page) if "/v/?q=" in link])
    else:
        links = [link for link in get_page_links(url) if "/v/?q=" in link]

    regex = re.compile(r'\.net/\w/v/\?q\=(\d+)\.(.*)(\.\w*)$', re.IGNORECASE)
    for link in links:
        try:
            # image name and filetype
            match = regex.search(link)
            ext = match.group(3)

            # image URL and output filename
            new_name = set_name(name, ext, delim, number, digits)
            image_url = "http://www.hotflick.net/u/n/{0}/{1}{2}".format(
                match.group(1), match.group(2), ext)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception:
            print "exception"
def upix(url, name, dest, delim, digits, number):
    print "Downloading images from [upix]...\n"

    links = [str(tag['href'])
             for tag in get_html(url).findAll('a', {"class": "thumb"})]

    base_url = url
    if str.endswith(url, "/#none"):
        base_url = url[:-5]

    regex = re.compile(r'(\.[a-zA-Z]*)$', re.IGNORECASE)
    for link in links:
        try:
            # image URL and output filename
            image_url = base_url + link
            ext = regex.search(image_url).group(1)
            new_name = set_name(name, ext, delim, number, digits)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception:
            pass
def imgur(url, name, dest, delim, digits, number):
    print "Downloading images from [imgur]...\n"

    if not str.endswith(url, "/layout/blog"):
        url += "/layout/blog"

    links = get_html(url).findAll('meta', {'property': 'og:image'})
    links = [link['content'] for link in links[1:]]

    regex = re.compile(r'\.com/\w*(\.[a-zA-Z]*)$', re.IGNORECASE)
    for image_url in links:
        try:
            # filetype
            ext = regex.search(image_url).group(1)

            # output filename
            new_name = set_name(name, ext, delim, number, digits)

            # download
            download_file(image_url, new_name, dest, number)
            number += 1
        except Exception:
            pass
page_number = 1
# max_num = data['{}'.format(district)]
os.chdir(os.path.join(path, 'CDATA'))
with open(sfile, 'a', encoding='utf-8', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(fields)
os.chdir(path)

while True:
    if page_number > max_num:
        break
    log_file.write('page{} process started at: '.format(page_number)
                   + time.ctime() + '\r\n')
    url = "https://www.justdial.com/%s/Lawyers/nct-10296083/page-%s" % (
        district, page_number)
    ut.get_html(url, page_number)
    time.sleep(5)
    page = open('temp{}.htm'.format(page_number), 'r', encoding='utf-8')
    # page = urllib.request.urlopen(req, proxy, timeout=5)
    # time.ctime(1)
    # page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), "html.parser")
    services = soup.find_all('li', {'class': 'cntanr'})

    # Iterate through the 10 results in the page
    for service_html in services:
        # Parse HTML to fetch data
        name = ut.get_name(service_html)
        phone = ut.get_phone_number(service_html)
        # rating = get_rating(service_html)
def get_optimal_kegs(args):
    ''' Gets kegs from bevmo.com, finds the kegs with the optimal gallons
        of alcohol per USD '''
    num_kegs = args['top']
    beer_limit = args['limit']
    num_attempts = args['attempts']
    max_price = args['price']
    desc_filter = args['filter']
    desc_unfilter = args['unfilter']

    # The first url to crawl and its base url
    seed_url = ('http://www.bevmo.com/Shop/ProductList.aspx/'
                'Beer/Kegs/_/N-15Z1z141vn?DNID=Beer')
    base_url = '{url.scheme}://{url.netloc}'.format(url=urlparse(seed_url))

    # Get initial unique page links from the seed url, append base_url to them
    # For info on XPaths, see: http://www.w3schools.com/xpath/xpath_syntax.asp
    init_page_links = []
    init_page_links[:] = unique(get_html(seed_url).xpath(
        '//div[@class="ProductListPaging"]/a/@href'))
    if not init_page_links:
        print('Failed to retrieve the initial keg page links!')
        return None

    # Lists for holding links to pages of beer kegs
    page_links = [seed_url] + map(lambda x: base_url + x, init_page_links)
    new_page_links = []

    # Lists for holding links to individual beer kegs
    beer_links = []
    new_beer_links = []

    # To keep track of already crawled beer kegs
    crawled_beers = set()

    # List for matching --filter and --unfilter keyword arguments
    # to keg descriptions
    matched = []

    # List to hold top beer kegs, the size of optimal_kegs is limited
    # by the num_kegs argument
    optimal_kegs = []

    keg = None
    while len(page_links) > 0 and len(crawled_beers) < beer_limit:
        # Links are removed as they are crawled
        page_link = page_links.pop(0)

        # Beer keg links
        new_beer_links[:] = unique(get_html(page_link).xpath(
            '//a[@class="ProductListItemLink"]/@href'))
        beer_links += [base_url + x for x in new_beer_links]

        # Crawl the beer keg links, get the gallons of alcohol/USD ratio
        for link in beer_links:
            # Break if the number of crawled beers exceeds the limit
            if len(crawled_beers) >= beer_limit:
                break

            # Cache the BevMo beer id's to prevent duplicates
            beer_id = link.split('/')[-1]
            if beer_id not in crawled_beers:
                # Create BeerKeg object
                keg = BeerKeg(link, num_attempts, verbose=True)

                # Call keg.parse() then filter kegs by their descriptions
                # Calling keg.parse() produces fields keg.desc, keg.price, etc
                # keg.parse() will only parse once per keg object

                # Check if price is within range if one was given
                if max_price:
                    keg.parse()
                    if keg.price > max_price:
                        # Move onto the next keg and ignore this one
                        continue

                # desc_filter has words that must be in the description
                if desc_filter:
                    keg.parse()
                    matched = [word in keg.desc for word in desc_filter]
                    # All keywords must be present for a match
                    if not all(matched):
                        # Move onto the next keg and ignore this one
                        continue

                # desc_unfilter has words that can't be in the description
                if desc_unfilter:
                    keg.parse()
                    matched = [word in keg.desc for word in desc_unfilter]
                    # Any keyword must be present to nullify a match
                    if any(matched):
                        # Move onto the next keg and ignore this one
                        continue

                # Add current beer to crawled beers
                crawled_beers.add(beer_id)

                # Print how many kegs have been crawled
                print('Keg {}'.format(len(crawled_beers)))

                # Gets the gallons of alcohol per USD for the keg
                ratio = keg.get_ratio()
                print('')

                # Maintain a sorted list of the current top kegs using heapq
                # (heap queue algorithm); optimal_kegs holds a tuple containing
                # the ratio and the keg associated with it
                if optimal_kegs:
                    for opt_tuple in optimal_kegs:
                        # If ratio is greater than any keg ratio currently
                        # in optimal_kegs, then add it
                        if ratio > opt_tuple[0]:
                            if len(optimal_kegs) >= num_kegs:
                                # Adds new item to list, removes the smallest
                                # to maintain size
                                heapq.heappushpop(optimal_kegs, (ratio, keg))
                            else:
                                heapq.heappush(optimal_kegs, (ratio, keg))
                            break
                else:
                    # Will only occur for the very first keg crawled
                    heapq.heappush(optimal_kegs, (ratio, keg))

        # Typical link: Shop/ProductList.aspx/_/N-15Z1z141vn/No-100?DNID=Beer
        # If No- is evenly divisible by 100, it leads to more pages to add
        if 'No-' in page_link:
            if int(page_link.split('No-')[1].split('?')[0]) % 100 == 0:
                # Unique new page links with their base url appended
                new_page_links[:] = unique(get_html(page_link).xpath(
                    '//div[@class="ProductListPaging"]/a/@href'))
                page_links += [base_url + x for x in new_page_links]

    # Sort the list in descending order by ratio (index 0 in the keg tuple)
    return sorted(optimal_kegs, key=lambda x: x[0], reverse=True)
def __init__(self, player_name, player_link, Team=None, position=None):
    self.min_snap_perc = .10
    self.name = player_name
    self.player_link = player_link
    config = get_config(os.getcwd())
    self.base_url = config['base_url']
    self.full_player_url = self.base_url + player_link
    self.standardized_position_dict = {
        "OL": {
            "eligible_positions": [
                "G", "T", "C", "LS", "OT", "OG", "OL", "G/C", "G-C",
                "T-G", "G-T", "C-G", "G,C", "C,G", "G,T", "T,G"
            ],
            "class": OffLineman,
            "side": "offense"
        },
        "QB": {
            "eligible_positions": ["QB"],
            "class": Quarterback,
            "side": "offense"
        },
        "WR": {
            "eligible_positions": ["WR", "PR-WR", "WR/RB"],
            "class": WideReceiver,
            "side": "offense"
        },
        "TE": {
            "eligible_positions": ["TE", "LS,TE", "TE-C"],
            "class": TightEnd,
            "side": "offense"
        },
        "RB": {
            "eligible_positions": ["RB", "FB", "FB-LB", "HB"],
            "class": RunningBack,
            "side": "offense"
        },
        "DB": {
            "eligible_positions": ["SS", "FS", "CB", "DB", "S"],
            "class": DefBack,
            "side": "defense"
        },
        "LB": {
            "eligible_positions": ["LB", "OLB", "ILB", "MLB", "LB-DE"],
            "class": Linebacker,
            "side": "defense"
        },
        "DL": {
            "eligible_positions": [
                "DT", "DL", "NT", "DE", "NT-DT", "DT-NT", "DE-LB",
                "DT/LB", "DE-C", "DE-DT", "DT-DE"
            ],
            "class": DefLineman,
            "side": "defense"
        },
        "K": {
            "eligible_positions": ["K"],
            "class": Kicker,
            "side": "special_teams"
        },
        "P": {
            "eligible_positions": ["P"],
            "class": Punter,
            "side": "special_teams"
        }
    }
    if Team is not None:
        self.game_html_page = Team.game_html_page
        self.season = Team.season
        self.week = Team.week
        self.team = Team.team
        self.team_abbrev = Team.team_abbrev
        self.base_url = Team.base_url
    if position is None:
        self.player_page = get_html(self.full_player_url)
        self.meta_div = self.player_page.find(
            "div", {"itemtype": "https://schema.org/Person"})
        position = self.get_position_from_player_page(self.meta_div)
    self.standardized_pos = [k for k, v in self.standardized_position_dict.items()
                             if position in v["eligible_positions"]][0]
    self.player_class = self.standardized_position_dict[
        self.standardized_pos]["class"]
    self.side = self.standardized_position_dict[
        self.standardized_pos]['side']
def generate_script_dicts(self):
    for s in self.scripts:
        html = get_html('{0}{1}'.format(self.script_url, s))
        for d in self.get_relevant_dict(html):
            yield d
def get_game_page(self):
    game_soup = get_html(self.game_full_url)
    return game_soup
def html_to_json(url):
    category, uid = tokenize(url)
    schema_name = 'schema/{}.json'.format(category)
    with open(schema_name, 'rb') as fp:
        template = json.load(fp)

    html_doc = get_html(url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    table_title = None
    result = {}
    ignore_image = True
    for tr in soup.find_all('tr'):
        # keep only the most bottom level tr
        if tr.find_all('tr'):
            continue

        is_title_row = False
        row_content = []
        for td in tr.find_all('td'):
            if ignore_image and td.find_all('img'):
                continue
            text = clean_up(td.text)
            if text in template:
                table_title = text
                is_title_row = True
                row_titles = template[table_title]
                ignore_image = row_titles['ignore image']
                result[table_title] = {}
                break
            link = ''
            for a in td.find_all('a'):
                link = a.get('href')
            row_content.append({'text': text, 'link': link})

        if is_title_row:
            continue
        if not row_content or not table_title:
            continue

        column_index = row_titles['column index']
        strict_match = row_titles['strict match']
        regex_match = row_titles['regex match']
        terminate_on_mismatch = row_titles['terminate on mismatch']

        matched = False
        if len(row_content) > column_index + 1:
            candidate_row_title = row_content[column_index]['text']
            for s in strict_match:
                if s == candidate_row_title and s not in result[table_title]:
                    matched = True
                    result[table_title][s] = row_content[column_index + 1:]
                    break
            if not matched:
                for s in regex_match:
                    if s in candidate_row_title:
                        matched = True
                        result[table_title][u'Certified Votes'] = row_content[column_index + 1:]
                        break
                    if re.match(s, candidate_row_title):
                        matched = True
                        category, race_id = tokenize(row_content[column_index + 1]['link'])
                        result[table_title][race_id] = row_content[column_index:]
                        break
        if terminate_on_mismatch and not matched:
            table_title = None
            ignore_image = True

    return result
def main():
    html = get_html(url)
    table = get_macrolang_table(html)
    langs = parse_macrolang_table(table)
    detailed_list = get_detailed_list(html)
    parse_detailed_list(detailed_list)
def get_areas(area_url):
    try:
        resp = utils.get_html(area_url)
    except Exception, e:
        print area_error, e
        exit(1)
def __init__(self, profile_id):
    self.profile_id = profile_id
    profile_html = utils.get_html("profile", self.profile_id)
    self.name = utils.get_name_from_html(profile_html)
    self.solved = 0
    self.problems = []
def run(self):
    big_json_name = big_json_path + '/%s_%s_%s.big_json' % (
        now_time, os.getpid(), get_ident())
    while True:
        if not message_que.empty():
            rows = message_que.get()
            for url in rows:
                utils.printf(url)
                key = random.choice(RKEY_PROXY)
                proxy_ = connRedis.srandmember(key)
                proxy = {
                    'http': proxy_,
                    'https': proxy_,
                }
                feature = "highwire-cite-metadata"
                feature_2 = "pane-title"
                # res = utils.get_html(url, feature=feature, proxies=proxy, timeout=200)
                res = utils.get_html(url, feature=feature, timeout=200)
                if res:
                    html = res.text.strip()
                    HEADER = {
                        "Accept": "*/*",
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
                    }
                    try:
                        sumDict = dict()
                        info_url = url + "/tab-article-info"
                        # res_info = requests.get(info_url, headers=HEADER, proxies=proxy, timeout=20)
                        res_info = requests.get(info_url, headers=HEADER, timeout=200)
                        if res_info.status_code == 200:
                            if res_info.text.find(feature_2) > 0:
                                info_html = res_info.text.strip()
                                sumDict['provider_url'] = url
                                sumDict['down_date'] = now_time
                                sumDict['htmlText'] = html
                                sumDict['info_htmlText'] = info_html
                                with open(big_json_name, mode='a', encoding='utf-8') as f:
                                    line = json.dumps(
                                        sumDict, ensure_ascii=False).strip() + '\n'
                                    f.write(line)
                                utils.printf(url, 'write to big_json')
                                sql_queue.put(url)
                            else:
                                utils.printf("not find feee_info")
                                message_que.put(rows)
                        elif res_info.status_code == 404:
                            sumDict['provider_url'] = url
                            sumDict['down_date'] = now_time
                            sumDict['htmlText'] = html
                            sumDict['info_htmlText'] = ""
                            with open(big_json_name, mode='a', encoding='utf-8') as f:
                                line = json.dumps(
                                    sumDict, ensure_ascii=False).strip() + '\n'
                                f.write(line)
                            utils.printf(url, 'write to big_json')
                            sql_queue.put(url)
                        else:
                            message_que.put(rows)
                    except Exception as e:
                        utils.printf(e)
                        message_que.put(rows)
                else:
                    message_que.put(rows)
def get_interview_text(interview_url):
    """
    Fetch a single piece of interview text and meta-data from a source webpage

    Parameters
    ----------
    interview_url : String
        The url to the webpage

    Returns
    -------
    interview_name : String
        Name of this interview
    interview_time : String
        When this interview happened
    interview_players : List[String]
        Interviewees
    interview_text : String
        An unprocessed String of raw interview text (including Questions and
        interviewee responses)
    """
    # example url: http://www.asapsports.com/show_conference.php?id=144725

    # fetch HTML
    soup = get_html(interview_url)

    assert len(soup.find_all('h1')) == 1
    if soup.find_all('h1')[0].a is not None:
        interview_name = str(soup.find_all('h1')[0].a.contents[0])
    else:
        interview_name = str(soup.find_all('h1')[0].contents[0])

    assert len(soup.find_all('h2')) == 1
    interview_time = str(soup.find_all('h2')[0].contents[0])

    # find all players attending this interview
    interview_players = []
    for link in soup.find_all('a'):
        if 'show_player.php' in link.get('href'):
            interview_players.append(str(link.contents[0]))

    # find interview text
    for td in soup.find_all('td'):
        if td.get('valign') == 'top' and td.get('style') == 'padding: 10px;':
            raw_interview_text = td.contents
            interview_text = ''
            for item in raw_interview_text:
                # all actual text is either directly below the td Tag
                # or inside a Tag with name 'b'
                if type(item) is NavigableString:
                    interview_text += str(item)
                elif type(item) is Tag and item.name == 'b':
                    # cope with empty tags: <b></b>
                    if len(item.contents) > 0:
                        interview_text += str(item.contents[0])

    # remove non-breaking spaces and stray 'Â' characters from the text
    interview_text = interview_text.replace('\xa0', ' ')
    interview_text = interview_text.replace('Â', ' ')

    return interview_name, interview_time, interview_players, interview_text
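# A small usage sketch for get_interview_text(); the URL reuses the example
# id noted in the comment above and may no longer resolve, so treat this as
# illustrative only.
if __name__ == '__main__':
    name, when, players, text = get_interview_text(
        'http://www.asapsports.com/show_conference.php?id=144725')
    print(name)
    print(when)
    print(', '.join(players))
    print(text[:500])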
from bs4 import BeautifulSoup

from utils import get_html


def process_scripts(html):
    soup = BeautifulSoup(html, "lxml")
    scripts = soup.find_all("script", {"type": "text/javascript"})
    return scripts


def process_shared_data(data):
    shared_data = [s.string for s in data if "window._sharedData = " in str(s)]
    post_links = str(shared_data).split(
        "edge_sidecar_to_children")[-1].replace(r"\\u0026", "&")
    post_links = post_links.split(",")
    links = [str(link.split('":"')[-1].rstrip('"'))
             for link in post_links
             if 'display_url' in link or 'video_url' in link]
    return links


if __name__ == "__main__":
    html = get_html("https://www.instagram.com/p/CCnsE2PJktq/").text
    data = process_scripts(html)
    print(process_shared_data(data))
def get_family_urls(self, url1):
    html = get_html(url1)
    self.family_urls = set([])
    for relevant_dict in self.get_relevant_dict(html):
        self.family_urls.add()
def run(self):
    big_json_name = big_json_path + '/%s_%s_%s.big_json' % (
        now_time, os.getpid(), get_ident())
    HEADER = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    # url_zhuye = "https://www.pnas.org"
    # sn = requests.session()
    # res_zhuye = sn.get(url_zhuye, headers=HEADER, timeout=2)
    while True:
        if not message_que.empty():
            rows = message_que.get()
            for url in rows:
                utils.printf(url)
                key = random.choice(RKEY_PROXY)
                proxy_ = connRedis.srandmember(key)
                proxy = {
                    'http': proxy_,
                    'https': proxy_,
                }
                feature = "highwire-cite-metadata"
                feature_2 = "pane-title"
                res = utils.get_html(url, feature=feature, proxies=proxy, timeout=15)
                if res:
                    html = res.text.strip()
                    h = Selector(text=html)
                    node_id = h.xpath("//div[@class='pane-content']/div[@class='highwire-article-citation highwire-citation-type-highwire-article']/@data-node-nid").extract_first()
                    info_url = "https://www.pnas.org/panels_ajax_tab/jnl_pnas_tab_info/node:%s/1" % node_id
                    utils.printf(info_url)
                    try:
                        sumDict = dict()
                        res_info = requests.get(info_url, headers=HEADER, proxies=proxy, timeout=200)
                        if res_info.status_code == 200:
                            if res_info.text.find(feature_2) > 0:
                                info_html = res_info.text.strip()
                                info_html = json.loads(info_html)['markup']
                                sumDict['provider_url'] = url
                                sumDict['down_date'] = now_time
                                sumDict['htmlText'] = html
                                sumDict['info_htmlText'] = info_html
                                with open(big_json_name, mode='a', encoding='utf-8') as f:
                                    line = json.dumps(sumDict, ensure_ascii=False).strip() + '\n'
                                    f.write(line)
                                utils.printf(url, 'write to big_json')
                                sql_queue.put(url)
                            else:
                                utils.printf("not find feee_info")
                                message_que.put(rows)
                        elif res_info.status_code == 404:
                            sumDict['provider_url'] = url
                            sumDict['down_date'] = now_time
                            sumDict['htmlText'] = html
                            sumDict['info_htmlText'] = ""
                            with open(big_json_name, mode='a', encoding='utf-8') as f:
                                line = json.dumps(sumDict, ensure_ascii=False).strip() + '\n'
                                f.write(line)
                            utils.printf(url, 'write to big_json')
                            sql_queue.put(url)
                        else:
                            utils.printf(res_info.status_code)
                            message_que.put(rows)
                    except Exception as e:
                        utils.printf(e)
                        message_que.put(rows)
                else:
                    print("1111")
                    message_que.put(rows)
def get_abv(self):
    ''' Attempts to find percentage of alcohol by volume using Bing '''
    abv = ''
    found_abv = ''

    # A ceiling for ABV content for validation
    # We can assume BevMo does not offer kegs with this high of an ABV
    max_abv = 20.0

    if not self.parsed:
        self.parse()

    search_url = 'https://www.bing.com/search?q={0}+alcohol+content'.format(
        '+'.join(self.name.split()))

    search_links = get_html(search_url).xpath('//a/@href')
    new_search_links = search_links[search_links.index('javascript:'):][1:]
    results = [x for x in new_search_links if x != '#' and 'site:' not in x]

    # Max number of links to search for alcohol by volume (ABV)
    num_attempts = self.num_attempts

    # Filter links with same domain to improve chances of matching
    searched_domains = set()

    # Add the top page results that are unique, r_it is an iterator
    top_results = []
    r_it = 0
    result_link = ''

    while len(top_results) < num_attempts and r_it < len(results):
        result_link = results[r_it]
        domain = '{url.netloc}'.format(url=urlparse(result_link))
        if '.' in domain:
            if domain.count('.') > 1:
                domain = domain.split('.')[1]
            else:
                domain = domain.split('.')[0]

        # Avoid already searched domains
        if domain in searched_domains:
            r_it += 1
        else:
            top_results.append(result_link)
            r_it += 1
            searched_domains.add(domain)

    for i in xrange(min(num_attempts, len(top_results))):
        if self.verbose:
            print('Searching {}'.format(top_results[i]))

        try:
            search_text = ''.join(get_text(get_html(top_results[i])))
        except Exception:
            continue

        # Retrieves partial string containing the words ABV and a %
        abv = re.search('(?<=[Aa][Bb][Vv])[^\d]*(\d+[.]?\d*)(?=%)|(?<=%)[^\d]*(\d+[.]?\d*)[^\d]*(?=[Aa][Bb][Cc])', search_text)

        if abv:
            abv = abv.group()

            # Filters for a number with or without a decimal pt
            abv = float(re.search('(\d+[.]?\d*)', abv).group())

            # If new ABV is 0.0, return previously found ABV if any,
            # otherwise move onto the next link
            if abv == 0.0:
                if found_abv:
                    if self.verbose:
                        print('ABV for {} is {}'.format(self.name, abv))
                else:
                    continue

            if abv < max_abv:
                if abv < max_abv / 2:
                    if self.verbose:
                        print('ABV for {} is {}'.format(self.name, abv))
                    return abv

                # Replace the new ABV only if the next is lower
                if found_abv:
                    if abv < found_abv:
                        if self.verbose:
                            print('ABV for {} is {}'.format(self.name, abv))
                        return abv
                    else:
                        if self.verbose:
                            print('ABV for {} is {}'.format(self.name, found_abv))
                        return found_abv

                # Sets the new ABV to the found ABV
                found_abv = abv
        else:
            if found_abv:
                if self.verbose:
                    print('ABV for {} is {}'.format(self.name, found_abv))
                return found_abv

    # No ABV was found by this point
    if self.verbose:
        print('ABV not found for {}'.format(self.name))
    return None
def search(query, pages=1, lang='en', area='com', ncr=False, void=True,
           time_period=False, sort_by_date=False, first_page=0):
    """Returns a list of GoogleResult.

    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.
        area: Area of google homepages.
        first_page: First page.

    TODO: add support to get the google results.

    Returns:
        A list of GoogleResult objects.
    """
    start = time.time()
    results = []
    for i in range(first_page, first_page + pages):
        url = _get_search_url(query, i, lang=lang, area=area, ncr=ncr,
                              time_period=time_period,
                              sort_by_date=sort_by_date)
        html = get_html(url)
        urls_time = time.time()
        print('got html in ' + str(urls_time - start) + 's')

        if html:
            soup = BeautifulSoup(html, "html.parser")
            divs = soup.findAll("div", attrs={"class": "g"})

            results_div = soup.find("div", attrs={"id": "resultStats"})
            number_of_results = _get_number_of_results(results_div)

            parse_time = time.time()
            print('parsed html in ' + str(parse_time - urls_time) + 's')

            j = 0
            for li in divs:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)
                res.number_of_results = number_of_results

                if void is True:
                    if res.description is None:
                        continue
                results.append(res)
                j += 1

    return results
def parsel_detail():
    now_time = time.strftime('%Y%m%d')
    conn_1 = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    conn_2 = sqlite3.connect('zt_template.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type = '1'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_up = "update detail set stat = 1 where url = %s"
    sql_in = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result_1 = []
    result_2 = []
    while True:
        sql = "select provider_subject,title,url from detail where stat=0 and failcount < 20 limit 1000"
        cur = conn_1.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for provider_subject, title, url in rows:
                utils.printf(url)
                feature = "tdbg_rightall"
                if "Soft_Showja.asp" in url:
                    SoftID = re.findall("SoftID=(.*)", url)[0]
                    rawid = "ja%s" % SoftID
                else:
                    SoftID = re.findall("SoftID=(.*)", url)[0]
                    rawid = SoftID
                fdir = '%s/%s' % (detail_path, now_time)
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                filename = '%s/%s.html' % (fdir, rawid)
                if os.path.exists(filename):
                    continue
                res = utils.get_html(url, feature=feature, proxies=proxy)
                time.sleep(2)
                if res:
                    with open(filename, 'w', encoding='gb18030') as f:
                        f.write(res.content.decode("gb18030"))
                    utils.printf(filename)
                    # html = Selector(res.content.decode("gb18030"), "html")
                    # creator = html.xpath("//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract()[0].replace("作者:", "")
                    # if creator == "unknow":
                    #     creator = ""
                    # if "Soft_Showja.asp" in url:
                    #     language = "JA"
                    #     country = "JP"
                    #     SoftID = re.findall("SoftID=(.*?)", url)[0]
                    #     rawid = "ja%s" % SoftID
                    #     Lngid = utils.GetLngid(sub_db_id, rawid)
                    # else:
                    #     language = "EN"
                    #     country = "US"
                    #     SoftID = re.findall("SoftID=(.*)", url)[0]
                    #     rawid = SoftID
                    #     Lngid = utils.GetLngid(sub_db_id, rawid)
                    # provider_url = provider + '@' + url
                    # provider_id = provider + '@' + rawid
                    # batch = str(now_time) + '00'
                    result_1.append((url))
                    # result_2.append(
                    #     (Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium)
                    # )
                    if utils.parse_results_to_sql(conn_1, sql_up, result_1, 50):
                        utils.printf("updated %s rows" % len(result_1))
                        result_1.clear()
                    # if utils.parse_results_to_sql(conn_2, sql_in, result_2, 50):
                    #     utils.printf("inserted %s rows" % len(result_2))
                    #     result_2.clear()
    utils.parse_results_to_sql(conn_1, sql_up, result_1)
    utils.printf("updated the remaining %s rows" % len(result_1))
    result_1.clear()
def parse(self):
    html = get_html(self.url)
    for lang, code in self.generate_pairs(html):
        yield {'name': lang, 'sil': code, 'on_bible_org': True}
def _crawl(self):
    print "Getting info for %s" % self.seed
    html = get_html(self.seed)
    self.index = extract_links(html)
def get_scripts(self, url1):
    html = get_html(url1)
    self.scripts = set([])
    for relevant_dict in self.get_relevant_dict(html):
        self.scripts.add(relevant_dict['script'].lower().strip('\n'))
def get_soup(self):
    html = utils.get_html(self.url)
    result = regex.findall(html)
    print result
    soup = BeautifulSoup(html)
    return soup
        if d.find('span', class_='icon3'):
            member['email'] = d.p.string
        if d.find('span', class_='icon4'):
            member['website'] = d.p.string

        # add record to the list
        membership.append(member)

    return membership


if __name__ == "__main__":
    # page specific setup
    page_url = ""
    output_filename = 'Data/aero.csv'
    fields = []

    # hack for dealing with accented text
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # read single page and extract data
    html = get_html(page_url)
    if html:
        record_list = parse_aero(html)
        print record_list

    # write records to csv
    write_to_csv(output_filename, fields, record_list)
import utils

###########################################################################

if __name__ == "__main__":
    db.init()
    progress = int(db.get_param('route_update_progress', 0))
    db.cur.execute("SELECT id FROM route WHERE id > ? ORDER BY id",
                   (progress,))
    rows = db.cur.fetchall()
    tobedone = map(lambda x: x[0], rows)

    rp = RouteParser()
    for id in tobedone:
        print "\n\nupdate route id", id
        url = utils.get_route_url(id)
        html = utils.get_html(url, data=id)
        try:
            rp.parse(id, html)
            db.cache_del(url)
        except Exception as e:
            print " ", id, "failed, error:", e
            db.update_table('route', 'id', id, parsed='F')
            raise
        db.set_param('route_update_progress', id)

    db.close()
# coding:utf-8
import utils

'''
Crawl the images from a Baidu Tieba post
'''

url = 'http://tieba.baidu.com/p/1753935195'
html = utils.get_html(url)
print(html)

# open pageCode.txt for writing
pageFile = open('pageCode.txt', 'wb+')
# write the page source
pageFile.write(html)
# remember to close the file
pageFile.close()
def download_gallery(site):
    start = time.time()

    # for offensive warning
    need_cookies = False
    cookies = None

    html = utils.get_html(site)
    if not html:
        print('Failed to retrieve gallery page, process will be aborted!')
        return

    if utils.is_warning_page(html):
        print('Page has offensive content, setting cookies to get around it')
        need_cookies = True
        cookies = utils.get_cookies(site)
        html = utils.get_html_with_cookies(site, cookies)

    metadata = get_gallery_metadata(html)
    urls = get_page_urls(html)
    sections = metadata["Length"].split()
    total_images = int(sections[0]) if sections else 0
    title = metadata["Title"]

    print('Below is the information of the gallery...')
    print_metadata(metadata)
    print('Start downloading...')

    title = title.replace('/', ' of ')
    if not utils.create_dir(title):
        return

    if total_images:
        utils.print_progress(0, total_images)
    else:
        print("Failed to get total number of images, progress bar is disabled!")

    i = 0
    img_fails = []
    gallery_page_fails = []
    img_page_fails = []

    # download images in each gallery page
    for url in urls:
        page_html = utils.get_html_with_cookies(
            url, cookies) if need_cookies else utils.get_html(url)
        if not page_html:
            gallery_page_fails.append(url)
            continue
        image_urls = get_image_urls(page_html)
        for image_url in image_urls:
            image_page_html = utils.get_html(image_url)
            if not image_page_html:
                img_page_fails.append(image_url)
                continue
            image_src = get_image_src(image_page_html)
            parts = image_src.split('.')
            extension = ('.' + parts[-1] if parts[-1] else '.jpg') if parts else '.jpg'
            file_name = get_file_name(total_images, i + 1) + extension
            file_path = title + '/' + file_name
            if not os.path.exists(file_path):
                if not utils.get_image(image_src, file_path):
                    img_fails.append(file_name)
            i += 1
            if total_images:
                utils.print_progress(i, total_images)

    # downloading result
    succeed = True
    if gallery_page_fails or img_page_fails:
        succeed = False
        print('Failed to load following pages:')
        for url in gallery_page_fails:
            print(url)
        for url in img_page_fails:
            print(url)

    if img_fails:
        succeed = False
        print('Failed to download following %s files...' % len(img_fails))
        for img in img_fails:
            print(img)

    if succeed:
        print('All files are downloaded successfully!')

    end = time.time()
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Total time elapsed {:0>2}m:{:02.0f}s".format(
        int(hours) * 60 + int(minutes), seconds))
def get_page_nums(self, main_url):
    html = get_html(main_url)
    soup = BeautifulSoup(html, "lxml")
    # select() returns a list, so take the first matching element
    page = soup.select(".p-skip em b")[0]
    print "page number:" + page.get_text()
    return int(page.get_text())
if __name__ == "__main__":
    # hack for dealing with accented text
    reload(sys)
    sys.setdefaultencoding('utf-8')

    companies = get_mesi_urls()
    # sys.exit()

    # page specific setup
    """
    companies = ["http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/905426df69f342a985257ec0002146d8?OpenDocument",
                 "http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/097aeedcac7ea4de85257b3200715bc0?OpenDocument",
                 "http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/c06a60a8d08182f885257b32007164cc?OpenDocument",
                 "http://internet2.economie.gouv.qc.ca/Internet/aerospatiale/reperaero.nsf/bd4b8ac1bdeea6ee0525694b007576fd/11323676470757f285257b320071648b?OpenDocument"]
    """

    record_list = []

    # iterate through a list of company pages
    for company in companies:
        html = get_html(company)
        if html:
            record_list.append(parse_mesi_company_page(html))

    # write records to csv
    output_filename = 'Data/mesi.csv'
    fields = ['name', 'contact1', 'title1', 'contact2', 'title2', 'phone',
              'fax', 'email', 'website', 'revenues', 'description']
    write_to_csv(output_filename, fields, record_list)
def get_html(self, sil):
    url = '{0}/{1}'.format(self.base_url, sil)
    return get_html(url)