def check_proxy(p):
    try:
        fetch('http://baidu.com', proxy=p['address'])
    except RequestException:
        p.delete()
        return False
    return True
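The proxy-checking snippets here call a shared `fetch(url, proxy=...)` helper that is not included in this collection. Below is a minimal sketch of what such a helper might look like, assuming it wraps `requests` and returns the response object; the helper name matches the calls above, but the defaults and implementation are assumptions, not the original code.

# hypothetical requests-based fetch helper (sketch, not the original implementation)
import requests
from requests.exceptions import RequestException  # re-raised to callers such as check_proxy

def fetch(url, proxy=None, timeout=10, headers=None):
    """GET the URL, optionally through an HTTP proxy, and return the Response.
    Raises RequestException (including Timeout/ConnectionError) on failure,
    which is what the surrounding callers expect."""
    proxies = {'http': proxy, 'https': proxy} if proxy else None
    return requests.get(url, proxies=proxies, timeout=timeout, headers=headers)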
def home():
    header = "Summarize WordPress Blog"
    form = NameForm()
    site_url = request.args.get('url')
    base_url = request.base_url

    if request.method == 'GET' and site_url is not None:
        number_of_pages = request.args.get('pages')
        if number_of_pages is not None:
            try:
                number_of_pages = int(number_of_pages)
            except ValueError:
                number_of_pages = 1
        form.number_of_pages.data = number_of_pages
        form.name.data = site_url
        lines = fetch(site_url, number_of_pages)
        query_url = get_query_url(base_url, site_url, number_of_pages)
        return render_template('search.html', pairs=lines, the_title=header,
                               form=form, query_url=query_url)
    elif request.method == 'POST' and form.validate_on_submit():
        site_url = form.name.data
        number_of_pages = form.number_of_pages.data
        if number_of_pages is None:
            number_of_pages = DEFAULT_MAX_PAGE
        lines = fetch(site_url, number_of_pages)
        query_url = get_query_url(base_url, site_url, number_of_pages)
        return render_template('search.html', pairs=lines, the_title=header,
                               form=form, query_url=query_url)
    return render_template('search.html', the_title=header, form=form)
async def main(future):
    async with aiohttp.ClientSession() as session:
        # get location data from api
        fetch_locations = [fetch(session, url) for url in location_queries]
        location_data = await asyncio.gather(*fetch_locations)

        # get weather data based on woeids
        weather_queries = []
        for row in location_data:
            woeid = str(row[0]['woeid'])
            weather_query = f'https://www.metaweather.com/api/location/{woeid}/'
            weather_queries.append(weather_query)
        fetch_weathers = [fetch(session, url) for url in weather_queries]
        weather_data = await asyncio.gather(*fetch_weathers)

        # create json object from locations and weather data
        results = {}
        for location, city in zip(locations, weather_data):
            results[location] = []
            weather = city['consolidated_weather']
            for day in weather:
                date = day['applicable_date']
                temp = day['the_temp']
                description = day['weather_state_name']
                forecast = Forecast(date, temp, description)
                results[location].append(forecast)
        return results
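The aiohttp-based snippets (this one and `get_repositories` further down) assume an async `fetch(session, url, ...)` coroutine. A plausible sketch follows, assuming it returns the parsed JSON body; the optional `headers` parameter is an assumption made to cover the GitHub example as well.

# hypothetical async fetch helper (sketch; signature inferred from usage)
import aiohttp

async def fetch(session: aiohttp.ClientSession, url: str, headers: dict = None):
    """GET the URL with the given session and return the decoded JSON payload."""
    async with session.get(url, headers=headers) as response:
        response.raise_for_status()
        return await response.json()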
def parse_source(source, idx, header):
    """
    Import data from a single source based on the data type.
    """
    path = '{}/{}'.format(config.workspace_dir, idx)
    if not os.path.exists(path):
        os.makedirs(path)

    cache_url = source[header.index('cache')]
    cache_filename = re.search('/[^/]*$', cache_url).group()
    fetch(cache_url, path + cache_filename)

    files = rlistdir(path)
    for f in files:
        if re.match(r'.*\.(zip|obj|exe)$', f):  # some files had mislabelled ext
            unzip(f, path)

    shapes = []
    files = rlistdir(path)
    for f in files:
        if re.match(r'.*\.({})$'.format('|'.join(config.fiona_extensions)), f):
            objs = import_with_fiona(f, source[0])
            for obj in objs:
                shapes.append(obj)
        elif re.match(r'.*\.csv$', f):
            objs = import_csv(f, source[0])
            for obj in objs:
                shapes.append(obj)

    shutil.rmtree(path)
    if not shapes:
        _L.warning('failed to parse source. did not find shapes. files in archive: {}'.format(files))
    return shapes
def main():
    argvs = sys.argv
    if len(argvs) != 4:
        print(
            'usage:\n delete_all_your_posts_in_direct_message.py <Slack Web API token> <Your Slack name> <Target user name>\n'
        )
        exit()

    end_point = 'https://slack.com/api/'
    token, your_name, target_user_name = argvs[1:]
    token = '?token=' + token

    # fetch users.list
    users_list = utils.fetch(end_point + 'users.list' + token)
    your_id = [
        member['id'] for member in users_list['members']
        if member.get('name') == your_name
    ][0]
    target_user_id = [
        member['id'] for member in users_list['members']
        if member.get('name') == target_user_name
    ][0]
    print('your_id: ' + your_id)
    print('target_user_id: ' + target_user_id)

    # fetch im.list
    im_list = utils.fetch(end_point + 'im.list' + token)
    target_im_id = [
        im['id'] for im in im_list['ims'] if im.get('user') == target_user_id
    ][0]
    print('target_im_id: ' + target_im_id)

    # fetch im.history
    im_history = utils.fetch_all_history(end_point + 'im.history' + token +
                                         '&channel=' + target_im_id +
                                         '&count=1000')
    your_posts_list = [
        message for message in im_history
        if message.get('user') == your_id and message.get('subtype', '') == ''
    ]

    # show your posts
    for message in your_posts_list:
        print(message['text'].replace('\n', ''), message['ts'])

    # chat.delete
    print('------------------------------')
    print('Deleting {0} item(s). Are you sure?'.format(len(your_posts_list)))
    ans = utils.prompt()
    if ans == 'y' or ans == 'Y':
        for message in your_posts_list:
            print(message['text'].replace('\n', ''), message['ts'])
            delete_status = utils.fetch(end_point + 'chat.delete' + token +
                                        '&ts=' + message['ts'] +
                                        '&channel=' + target_im_id)
            print(delete_status)
        print('complete!!')
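The Slack cleanup scripts in this collection rely on a small `utils` module (`utils.fetch`, `utils.fetch_all_history`, `utils.fetch_all_files`, `utils.prompt`) that is not shown. A rough sketch of the two simplest helpers, assuming `utils.fetch` returns the decoded JSON of a Slack Web API call; the pagination helpers are omitted and everything here is an assumption rather than the original module.

# utils.py (hypothetical sketch of the helpers the Slack scripts assume)
import json
from urllib.request import urlopen

def fetch(url):
    """GET a Slack Web API URL and return the parsed JSON response."""
    with urlopen(url) as response:
        return json.loads(response.read().decode('utf-8'))

def prompt():
    """Read a single confirmation answer from stdin."""
    return input('(y/N): ').strip()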
def install():
    fetch("http://ftp.gnome.org/pub/gnome/sources/json-glib/0.16/json-glib-%(json-glib)s.tar.xz")
    extract("json-glib-%(json-glib)s.tar.xz")
    configure("json-glib-%(json-glib)s",
              ["--prefix=%s" % env.prefix,
               "--disable-gcov",
               "--disable-introspection",
               "CC=clang"])
    make("json-glib-%(json-glib)s")
    make("json-glib-%(json-glib)s", "install")
def main():
    argvs = sys.argv
    if len(argvs) != 4:
        print(
            'usage:\n delete_all_your_files_in_direct_message.py <Slack Web API token> <Your Slack name> <Target user name>\n'
        )
        exit()

    end_point = 'https://slack.com/api/'
    token, your_name, target_user_name = argvs[1:]
    token = '?token=' + token

    # fetch users.list
    users_list = utils.fetch(end_point + 'users.list' + token)
    your_id = [
        member['id'] for member in users_list['members']
        if member.get('name') == your_name
    ][0]
    target_user_id = [
        member['id'] for member in users_list['members']
        if member.get('name') == target_user_name
    ][0]
    print('your_id: ' + your_id)
    print('target_user_id: ' + target_user_id)

    # fetch im.list
    im_list = utils.fetch(end_point + 'im.list' + token)
    target_im_id = [
        im['id'] for im in im_list['ims'] if im.get('user') == target_user_id
    ][0]
    print('target_im_id: ' + target_im_id)

    # fetch files.list
    your_files_list = utils.fetch_all_files(end_point + 'files.list' + token + '&user='******'ims')
    ]

    # show your files
    for f in target_ims_your_files_list:
        print(f['id'], f['url_private'])

    # files.delete
    print('------------------------------')
    print('Deleting {0} item(s). Are you sure?'.format(len(target_ims_your_files_list)))
    ans = utils.prompt()
    if ans == 'y' or ans == 'Y':
        for f in target_ims_your_files_list:
            print(f['id'], f['url_private'])
            delete_status = utils.fetch(end_point + 'files.delete' + token +
                                        '&file=' + f['id'])
            print(delete_status)
        print('complete!!')
def crawl(url: str = base_url):
    '''
    Fetch the product list.
    '''
    data = utils.fetch(url)
    data_list = [
        data,
    ]
    soup = bs(data, 'lxml')

    # read each page of the listing in order
    page_list = list(
        set([
            page['href']
            for page in soup.find('p', class_='pagelink').find_all('a')
        ]))
    for page in page_list:
        d = utils.fetch(page)
        if d:
            data_list.append(d)

    # item list
    output = []
    for d in data_list:
        soup = bs(d, 'lxml')
        for item in soup.find('div', class_='item_list').ul.find_all('li'):
            item = item.div.span
            # for JSON output
            o = {'manufacture': 'Anker'}
            # product name and image
            o['name'] = item.a.img['alt'].replace('Anker ', '')
            o['image'] = item.a.img['src']
            o['url'] = item.a['href']
            o['price'] = int(
                re.sub(r'\D', '', item.find('p', class_='price').string))
            # guess the capacity from the product name (e.g. 13400)
            m = re.search(r'[1-9][0-9]*00', o['name'])
            if m:
                o['capacity'] = int(m.group(0))
            # guess the maximum USB PD output
            m = re.search(r'([1-9][0-9]+)W', o['name'])
            if m:
                o['pd_w'] = int(m.group(1))
            # details
            o['detail'] = crawl_detail(o['url'])
            output.append(o)
    return output
def crawl(url: str = base_url):
    '''
    Fetch the product list.
    '''
    data = utils.fetch(url)
    data_list = [
        data,
    ]
    soup = bs(data, 'lxml')

    # read each page of the listing in order
    page_list = list(
        set([
            a['href']
            for a in soup.find('nav', class_='pagination').ul.find_all('a')
        ]))
    for page in page_list:
        d = utils.fetch(page)
        if d:
            data_list.append(d)

    # item list
    output = []
    for d in data_list:
        soup = bs(d, 'lxml')
        for item in soup.find_all('section'):
            if not item.find('div', class_='_entry-inner') or not item.find(
                    'div', class_='_entry-image'):
                continue
            o = {'manufacture': 'cheero'}
            o['name'] = item.find('h2').string.strip()
            o['url'] = item.a['href']
            o['image'] = item.find('img')['data-src']
            o['price'] = int(
                re.sub(r'\D', '', item.find('p', class_='price').span.string))
            o['detail'] = crawl_detail(item.a['href'])
            # guess the capacity from the product name
            m = re.search(r'([1-9][0-9]*00)mAh', o['name'])
            if m:
                o['capacity'] = int(m.group(1))
            # guess the maximum USB PD output
            m = re.search(r'([1-9][0-9]+)W$', o['name'])
            if m:
                o['pd_w'] = int(m.group(1))
            output.append(o)
    return output
def install():
    fetch('http://www.pell.portland.or.us/~orc/Code/discount/discount-%(discount)s.tar.bz2')
    extract('discount-%(discount)s.tar.bz2')
    configure('discount-%(discount)s',
              ['--prefix=%s' % env.prefix,
               '--libdir=%s/lib' % env.prefix,
               '--mandir=%s/man' % env.prefix,
               '--shared',
               '--enable-all-features'],
              'configure.sh')
    run('sed -i .bkp -e "/ldconfig/d" %s/%s/librarian.sh' %
        (env.build, 'discount-%(discount)s' % env.versions))
    make('discount-%(discount)s')
    make('discount-%(discount)s', 'install')
def fetch_dict(url, dest):
    assert url.endswith(".tar.bz2"), url
    filename = os.path.basename(url)
    utils.fetch(url, os.path.join(DICT_DIR, filename))
    utils.run(["tar", "xjvf", filename], cwd=DICT_DIR)
    name = filename[:-len(".tar.bz2")]
    path = os.path.join(DICT_DIR, name)
    utils.run(["./configure", "--vars", "DESTDIR=tmp"], cwd=path)
    utils.run(["make"], cwd=path)
    utils.run(["make", "install"], cwd=path)
    result_dir = os.path.join(path, "tmp/usr/lib/aspell")
    utils.ensure_path(dest)
    for dict_file in os.listdir(result_dir):
        shutil.copy2(os.path.join(result_dir, dict_file),
                     os.path.join(dest, dict_file))
def fetch_dict(url, dest):
    assert url.endswith('.tar.bz2'), url
    filename = os.path.basename(url)
    utils.fetch(url, os.path.join(DICT_DIR, filename))
    utils.run(["tar", "xjvf", filename], cwd=DICT_DIR)
    name = filename[:-len('.tar.bz2')]
    path = os.path.join(DICT_DIR, name)
    utils.run(["./configure", "--vars", "DESTDIR=tmp"], cwd=path)
    utils.run(["make"], cwd=path)
    utils.run(["make", "install"], cwd=path)
    result_dir = os.path.join(path, 'tmp/usr/lib/aspell')
    utils.ensure_path(dest)
    for dict_file in os.listdir(result_dir):
        shutil.copy2(os.path.join(result_dir, dict_file),
                     os.path.join(dest, dict_file))
def _run(self, isbn):
    keywords = '+'.join(['details', 'texts', 'authors', 'subjects'])
    url = 'http://isbndb.com/api/books.xml?access_key=%s&index1=isbn&value1=%s&results=%s' % (ISBNDB_ACCESS_KEY, isbn, keywords)
    soup = BeautifulSoup(fetch(url))
    try:
        result = dict()
        result['title'] = soup.find('titlelong').string or soup.find('title').string
        result['author'] = soup.find('authorstext').string
        bookdata = soup.find('bookdata').attrs
        result['isbn'] = bookdata[2][1]
        result['source'] = 'http://isbndb.com/d/book/%s.html' % bookdata[0][1]

        # publisher info
        pubs = soup.find('publishertext').string
        reg = rx_publisher.search(pubs)
        if reg:
            if len(reg.groups()) == 1:
                result['publisher'] = reg.group(1)
            elif len(reg.groups()) == 4:
                result['publisher'] = reg.group(2).strip()
                result['date'] = reg.group(4).strip()

        # date also can be found in details->edition_info->date
        if 'date' not in result:
            details = soup.find('details').attrs
            for e in details:
                if u'edition_info' == e[0]:
                    reg = rx_edition.search(e[1])
                    if reg:
                        result['date'] = reg.group(1)
        return result
    except:
        return None
def _run(self, isbn):
    url = 'http://books.iqbuy.ru/categories_offer/%s' % (isbn)
    data = rx_data.sub('', fetch(url).decode('cp1251'))
    soup = BeautifulSoup(data)
    try:
        result = dict()
        result['title'] = soup.find('h2', {'class': 'book-name'}).string
        authors = soup.find('p', {'class': 'book-author'})
        # author is optional
        if authors.strong.string:
            result['author'] = authors.strong.string.replace(u'\xa0', ' ')
        # series is optional
        series = authors.findNext('p')
        reg = rx_series.search(unicode(series))
        if reg:
            result['series'] = series.strong.string
            publisher = series.findNext('p')
        else:
            publisher = series
        # continue with publisher
        result['publisher'] = publisher.strong.string.replace(u'\xa0', ' ').strip()
        reg = rx_publisher.search(str(publisher))
        if reg:
            result['date'] = reg.group(1)
        result['source'] = url
        result['isbn'] = isbn
        result['photo'] = soup.find('td', {'class': 'book-image'}).p.img['src']
        return result
    except:
        return None
def fetch_issue(issue_key):
    '''There is a Python api for jira: pip install jira
    but we wanted to avoid dependencies. and it's simple.'''
    query = URL + 'rest/api/2/issue/' + issue_key
    query += '?fields=' + ','.join(f for f, _rep in FIELDS)
    raw_issue = utils.fetch(query)
    return json.loads(raw_issue)
def main():
    argvs = sys.argv
    if len(argvs) != 2:
        print('usage:\n delete_all_old_files.py <Slack Web API token>\n')
        exit()

    end_point = 'https://slack.com/api/'
    token = argvs[1]
    token = '?token=' + token

    # fetch files.list
    last_month_timestamp = (datetime.now() + timedelta(days=-30)).strftime('%s')
    files_list = utils.fetch_all_files(end_point + 'files.list' + token +
                                       '&ts_to=' + last_month_timestamp)

    # show your files
    for f in files_list:
        print(f['id'], f['url_private'])

    # files.delete
    print('------------------------------')
    print('Deleting {0} item(s). Are you sure?'.format(len(files_list)))
    ans = utils.prompt()
    if ans == 'y' or ans == 'Y':
        for f in files_list:
            print(f['id'], f['url_private'])
            delete_status = utils.fetch(end_point + 'files.delete' + token +
                                        '&file=' + f['id'])
            print(delete_status)
        print('complete!!')
def save_search_result(p, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, p)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        return save_search_result(url, queue, retry)

    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been banned; switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        return save_search_result(url, queue, retry)

    articles = results.find_all('div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)
def episode_menu():
    et_tz = pytz.timezone('US/Eastern')
    date_et = common.get_date() if vars.params.get('custom_date', False) else utils.tznow(et_tz).date()

    # Avoid possible caching by using query string
    epg_url = 'https://nlnbamdnyc-a.akamaihd.net/fs/nba/feeds/epg/%d/%d_%d.js?t=%d' % (
        date_et.year, date_et.month, date_et.day, time.time())
    response = utils.fetch(epg_url)
    g_epg = json.loads(response[response.find('['):])

    for epg_item in g_epg:
        entry = epg_item['entry']
        start_et_hours, start_et_minutes = map(int, entry['start'].split(':'))
        duration_hours, duration_minutes = map(int, entry['duration'].split(':'))

        dt_et = et_tz.localize(datetime.datetime(date_et.year, date_et.month, date_et.day,
                                                 start_et_hours, start_et_minutes))
        dt_utc = dt_et.astimezone(pytz.utc)

        start_timestamp = int((dt_utc - datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)).total_seconds()) * 1000  # in milliseconds
        duration = (duration_hours * 60 + duration_minutes) * 60 * 1000  # in milliseconds

        params = {
            'start_timestamp': start_timestamp,
            'duration': duration,
        }
        utils.log(params, xbmc.LOGDEBUG)

        name = '%s %s: %s' % (
            entry['start'], dt_et.tzname(),
            entry['showTitle'] if entry['showTitle'] else entry['title'])
        common.addListItem(name, '', 'nba_tv_play_episode', iconimage=entry['image'], customparams=params)
def fetchams(ref):
    try:
        dj = json.load(fetch("%s/dossier/%s?format=json" % (settings.PARLTRACK_URL, ref)))
    except:
        raise ValueError
    dossier, _ = Dossier.objects.get_or_create(id=ref,
                                               title=dj['procedure']['title'],
                                               _date=datetime.datetime.now())
    committees = {}
    for am in dj['amendments']:
        for committee in am['committee']:
            date = am['date'].split('T')[0]
            id = committee_map.get(committee, committee)
            if (id, date) not in committees:
                c, _ = Committee.objects.get_or_create(dossier=dossier,
                                                       title=committee,
                                                       cid=id,
                                                       src=am['src'],
                                                       date=date)
                committees[(id, date)] = c
            else:
                c = committees[(id, date)]
            a, _ = Amendment.objects.get_or_create(seq=am['seq'],
                                                   dossier=dossier,
                                                   committee=c,
                                                   lang=am['orig_lang'],
                                                   authors=am['authors'],
                                                   new='\n'.join(am.get('new', [])),
                                                   old='\n'.join(am.get('old', [])),
                                                   type=am['location'][0][0],
                                                   location=am['location'][0][1])
    return dossier
def save_search_result(p, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, p)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        return save_search_result(url, queue, retry)

    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been banned; switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        return save_search_result(url, queue, retry)

    articles = results.find_all('div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)
def crawl_detail(url):
    '''
    Fetch the product details.
    '''
    data = utils.fetch(url)
    soup = bs(data, 'lxml')
    details = {}

    # link to Amazon
    # cheero embeds it via JavaScript, so BeautifulSoup cannot pick it up
    pat = re.compile(r'https?://amzn\.to/[0-9a-zA-Z]+')
    amazon = list(set([s for s in pat.findall(data)]))
    for a in amazon:
        details['amazon'] = a

    # specifications
    for h3 in soup.find_all('h3'):
        if h3 and h3.string != 'SPEC':
            continue
        for row in h3.find_next_sibling().find_all('div', class_='table_elem'):
            cols = row.find_all('div')
            if len(cols) != 2:
                continue
            details[cols[0].string] = cols[1].text
    return details
def check_proxy(p):
    try:
        res = fetch('http://weixin.sogou.com/weixin?query=python&type=2&page=1',
                    proxy=p['address'])
        if len(res.text) < 10000:
            p.delete()
    except RequestException:
        p.delete()
def crawl(url: str = base_url):
    '''
    Fetch the product list.
    '''
    data = utils.fetch(url)
    data_list = [
        data,
    ]
    soup = bs(data, 'lxml')

    # read each page of the listing in order
    page_list = list(
        set([
            a['href']
            for a in soup.find('ul', class_='page-numbers').find_all(
                'a', class_='page-numbers')
        ]))
    for page in page_list:
        d = utils.fetch(page)
        if d:
            data_list.append(d)

    # item list
    output = []
    for d in data_list:
        soup = bs(d, 'lxml')
        for item in soup.find('ul', class_='products').find_all('li'):
            if not item.find('div', class_='product_details'):
                continue
            o = {'manufacture': 'RAVPower'}
            o['name'] = item.find('h3').string.strip()
            o['url'] = item.find('a', class_='product_item_link')['href']
            o['image'] = item.find('img')['src']
            o['detail'] = crawl_detail(item.a['href'])
            # guess the capacity from the product name
            m = re.search(r'([1-9][0-9]*00)mAh', o['name'])
            if m:
                o['capacity'] = int(m.group(1))
            output.append(o)
    return output
def main():
    argvs = sys.argv
    if len(argvs) != 4:
        print('usage:\n delete_all_your_files_in_direct_message.py <Slack Web API token> <Your Slack name> <Target user name>\n')
        exit()

    end_point = 'https://slack.com/api/'
    token, your_name, target_user_name = argvs[1:]
    token = '?token=' + token

    # fetch users.list
    users_list = utils.fetch(end_point + 'users.list' + token)
    your_id = [member['id'] for member in users_list['members'] if member.get('name') == your_name][0]
    target_user_id = [member['id'] for member in users_list['members'] if member.get('name') == target_user_name][0]
    print('your_id: ' + your_id)
    print('target_user_id: ' + target_user_id)

    # fetch im.list
    im_list = utils.fetch(end_point + 'im.list' + token)
    target_im_id = [im['id'] for im in im_list['ims'] if im.get('user') == target_user_id][0]
    print('target_im_id: ' + target_im_id)

    # fetch files.list
    your_files_list = utils.fetch_all_files(end_point + 'files.list' + token + '&user='******'ims')]

    # show your files
    for f in target_ims_your_files_list:
        print(f['id'], f['url_private'])

    # files.delete
    print('------------------------------')
    print('Deleting {0} item(s). Are you sure?'.format(len(target_ims_your_files_list)))
    ans = utils.prompt()
    if ans == 'y' or ans == 'Y':
        for f in target_ims_your_files_list:
            print(f['id'], f['url_private'])
            delete_status = utils.fetch(end_point + 'files.delete' + token + '&file=' + f['id'])
            print(delete_status)
        print('complete!!')
def main():
    argvs = sys.argv
    if len(argvs) != 4:
        print('usage:\n delete_all_your_posts_in_direct_message.py <Slack Web API token> <Your Slack name> <Target user name>\n')
        exit()

    end_point = 'https://slack.com/api/'
    token, your_name, target_user_name = argvs[1:]
    token = '?token=' + token

    # fetch users.list
    users_list = utils.fetch(end_point + 'users.list' + token)
    your_id = [member['id'] for member in users_list['members'] if member.get('name') == your_name][0]
    target_user_id = [member['id'] for member in users_list['members'] if member.get('name') == target_user_name][0]
    print('your_id: ' + your_id)
    print('target_user_id: ' + target_user_id)

    # fetch im.list
    im_list = utils.fetch(end_point + 'im.list' + token)
    target_im_id = [im['id'] for im in im_list['ims'] if im.get('user') == target_user_id][0]
    print('target_im_id: ' + target_im_id)

    # fetch im.history
    im_history = utils.fetch_all_history(end_point + 'im.history' + token + '&channel=' + target_im_id + '&count=1000')
    your_posts_list = [message for message in im_history if message.get('user') == your_id and message.get('subtype', '') == '']

    # show your posts
    for message in your_posts_list:
        print(message['text'].replace('\n', ''), message['ts'])

    # chat.delete
    print('------------------------------')
    print('Deleting {0} item(s). Are you sure?'.format(len(your_posts_list)))
    ans = utils.prompt()
    if ans == 'y' or ans == 'Y':
        for message in your_posts_list:
            print(message['text'].replace('\n', ''), message['ts'])
            delete_status = utils.fetch(end_point + 'chat.delete' + token + '&ts=' + message['ts'] + '&channel=' + target_im_id)
            print(delete_status)
        print('complete!!')
def __init__(self):
    pynotify.uninit()
    pynotify.init("sun")
    self.pkg_count = fetch()[0]
    self.message_added = ""
    self.summary = "{0}Software Updates".format(" " * 14)
    self.message = ("{0}{1} Software updates are available\n".format(
        " " * 3, self.pkg_count))
    self.icon = "{0}{1}.png".format(icon_path, __all__)
    self.n = pynotify.Notification(self.summary, self.message, self.icon)
    self.n.set_timeout(60000 * int(config()["STANDBY"]))
def visit(self, update=False):
    url = self.get_user_feed_url(update=update)
    print('visit %s' % url)
    js = fetch(url)
    d = json.loads(js)
    if d['status'] == 'ok':
        for item in d['items']:
            pk = int(item['pk'])
            if pk not in self.items:
                self.items[pk] = item
    return d
def create_youtube_object_by_id(self, youtube_id):
    try:
        obj = self.get(youtube_id=youtube_id)
        return obj
    except YoutubeVideo.DoesNotExist:
        pass
    youtube_obj = youtube_utils.fetch(youtube_id)
    youtube_obj.pop('thumburl')
    obj, created = self.get_or_create(youtube_id=youtube_obj['youtube_id'],
                                      defaults=youtube_obj)
    return obj
def __init__(self):
    notify2.uninit()
    notify2.init("sun")
    self.pkg_count = fetch()[0]
    self.message_added = ""
    self.summary = "{0}Software Updates".format(" " * 14)
    self.message = ("{0}{1} Software updates are available\n".format(
        " " * 3, self.pkg_count))
    self.icon = "{0}{1}.png".format(icon_path, __all__)
    self.n = notify2.Notification(self.summary, self.message, self.icon)
    self.n.set_timeout(60000 * int(config()["STANDBY"]))
def save_proxies(url):
    try:
        r = fetch(url)
    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, r.text)
    for address in addresses:
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass
def preprocess(image_path):
    if image_path == "random":
        image = np.random.normal(size=(256, 256, 3)).astype(np.float32)
        image -= image.min()
        image /= image.max()
    else:
        image = Image.open(fetch(image_path)).convert("RGB")
    rgb2bgr = T.Lambda(lambda x: x[th.LongTensor([2, 1, 0])])
    normalize = T.Normalize(mean=[103.939, 116.779, 123.68], std=[1, 1, 1])
    return normalize(rgb2bgr(T.ToTensor()(image) * 255)).unsqueeze(0)
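A quick usage note for `preprocess` above: it produces a 4-D tensor in BGR channel order, scaled to 0-255 and shifted by the Caffe/VGG channel means. A hypothetical call, assuming the snippet's `np`, `Image`, `T` (torchvision.transforms), `th` (torch) and `fetch` names are in scope:

# hypothetical usage; "random" skips fetch() and builds a 256x256 noise image
batch = preprocess("random")
print(batch.shape)  # torch.Size([1, 3, 256, 256])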
def unregister():
    fb_uid = request.forms.uid
    access_token = request.forms.access_token
    url = 'https://api.facebook.com/method/fql.query?query=SELECT+uid%2C+name%2C+pic_big%2C+sex+FROM+user+WHERE+uid+%3D'
    url += fb_uid
    url += '&access_token='
    url += access_token
    url += '&format=json'
    fb_user_data = json.loads(utils.fetch(url))
    if 'error_code' in fb_user_data:
        return 'error'
    else:
        db.users.remove({'fb_uid': fb_uid})
        return 'user removed'
def parse_source(source, idx, header):
    """
    Import data from a single source based on the data type.
    """
    path = '{}/{}'.format(config.workspace_dir, idx)
    if not os.path.exists(path):
        os.makedirs(path)

    cache_url = source[header.index('cache')]
    cache_filename = re.search('/[^/]*$', cache_url).group()
    fetch(cache_url, path + cache_filename)

    files = rlistdir(path)
    for f in files:
        if re.match(r'.*\.(zip|obj|exe)$', f):  # some files had mislabelled ext
            unzip(f, path)

    shapes = []
    files = rlistdir(path)
    for f in files:
        if re.match(r'.*\.({})$'.format('|'.join(config.fiona_extensions)), f):
            objs = import_with_fiona(f, source[0])
            for obj in objs:
                shapes.append(obj)
        elif re.match(r'.*\.csv$', f):
            objs = import_csv(f, source[0])
            for obj in objs:
                shapes.append(obj)

    shutil.rmtree(path)
    if not shapes:
        _L.warning(
            'failed to parse source. did not find shapes. files in archive: {}'
            .format(files))
    return shapes
def kuaidaili():
    """
    Kuaidaili free proxies: https://www.kuaidaili.com
    """
    url = "https://www.kuaidaili.com/free/{}"
    items = ["inha/{}/".format(_) for _ in range(1, 21)]
    for proxy_type in items:
        html = fetch(url.format(proxy_type))
        if html:
            doc = pyquery.PyQuery(html)
            for proxy in doc(".table-bordered tr").items():
                ip = proxy("[data-title=IP]").text()
                port = proxy("[data-title=PORT]").text()
                if ip and port:
                    yield "http://{}:{}".format(ip, port)
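The proxy-list scrapers in this group (kuaidaili above, and data5u, ip89, ip3366, iphai, xici, ip_66 below) treat `fetch` as returning the page HTML as a string, or a falsy value on failure. A minimal sketch under that assumption follows; `HEADERS` and the error handling are guesses, not the original helper.

# hypothetical text-returning fetch used by the proxy-list scrapers (sketch)
import requests

HEADERS = {"User-Agent": "Mozilla/5.0"}  # assumed default headers

def fetch(url, headers=None, timeout=10):
    """Return the response body on HTTP 200, or None on any failure,
    so the callers' `if html:` checks short-circuit cleanly."""
    try:
        response = requests.get(url, headers=headers or HEADERS, timeout=timeout)
        if response.status_code == 200:
            return response.text
    except requests.exceptions.RequestException:
        pass
    return None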
def data5u():
    """
    data5u proxies: http://www.data5u.com/
    """
    url = "http://www.data5u.com/"
    html = fetch(url)
    if html:
        doc = pyquery.PyQuery(html)
        for index, item in enumerate(doc("li ul").items()):
            if index > 0:
                ip = item("span:nth-child(1)").text()
                port = item("span:nth-child(2)").text()
                schema = item("span:nth-child(4)").text()
                if ip and port and schema:
                    yield "{}://{}:{}".format(schema, ip, port)
def ip89():
    """
    89 free proxies: http://www.89ip.cn
    """
    url = "http://www.89ip.cn/index_{}.html"
    items = [p for p in range(1, 8)]
    for proxy_type in items:
        html = fetch(url.format(proxy_type))
        if html:
            doc = pyquery.PyQuery(html)
            for item in doc(".layui-col-md8 tr").items():
                ip = item("td:nth-child(1)").text()
                port = item("td:nth-child(2)").text()
                if ip and port:
                    yield "http://{}:{}".format(ip, port)
                    yield "https://{}:{}".format(ip, port)
def ip3366():
    """
    ip3366 cloud proxies: http://www.ip3366.net
    """
    url = "http://www.ip3366.net/free/?stype=1&page={}"
    items = [p for p in range(1, 8)]
    for page in items:
        html = fetch(url.format(page))
        if html:
            doc = pyquery.PyQuery(html)
            for proxy in doc(".table-bordered tr").items():
                ip = proxy("td:nth-child(1)").text()
                port = proxy("td:nth-child(2)").text()
                schema = proxy("td:nth-child(4)").text()
                if ip and port and schema:
                    yield "{}://{}:{}".format(schema.lower(), ip, port)
def _run(self, isbn):
    url = 'http://openlibrary.org/api/books?bibkeys=ISBN:%s&jscmd=data&format=json' % isbn
    try:
        json = simplejson.loads(fetch(url))
        result = dict()
        json = json['ISBN:' + isbn]
        result['title'] = json['title']
        result['author'] = ', '.join([i['name'] for i in json['authors']])
        result['isbn'] = isbn
        result['publisher'] = ', '.join([i['name'] for i in json['publishers']])
        result['date'] = json['publish_date']
        result['source'] = json['url']
        if 'cover' in json:
            result['photo'] = json['cover']['medium']
        return result
    except:
        return None
def iphai():
    """
    IP Hai proxies: http://www.iphai.com
    """
    url = "http://www.iphai.com/free/{}"
    items = ["ng"]
    for proxy_type in items:
        html = fetch(url.format(proxy_type))
        if html:
            doc = pyquery.PyQuery(html)
            for item in doc(".table-bordered tr").items():
                ip = item("td:nth-child(1)").text()
                port = item("td:nth-child(2)").text()
                schema = item("td:nth-child(4)").text()
                if not schema:
                    schema = "HTTP"
                if ip and port and schema:
                    yield "{}://{}:{}".format(schema.lower(), ip, port)
def save_search_result(page, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, page)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError, IOError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            raise GreenletExit()
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        return save_search_result(page, queue, retry)

    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been banned; switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            print('retry too much!')
            raise GreenletExit()
        return save_search_result(page, queue, retry)

    articles = results.find_all('div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)

    page_container = soup.find(id='pagebar_container')
    if page_container and u'下一页' in page_container.text:
        last_page = int(page_container.find_all('a')[-2].text)
        current_page = int(page_container.find('span').text)
        for page in range(current_page + 1, last_page + 1):
            put_new_page(page, queue)
def calcAll():
    """
    Calculate the new macd-coefficients for all MACD-objects
    :return: List of serialized MACD-objects
    """
    global macd_objects
    global data

    for macd in macd_objects:
        try:
            if macd.pair not in data:
                data[macd.pair] = fetch(macd.pair)  # get data
                data[macd.pair] = parse_data(data[macd.pair])  # in each pair is stored sdf-data itself
        except Exception as err:
            pass
        sdf = macd.calculate_coefficient(data[macd.pair][macd.time_period])
    data = dict()  # empty data
async def get_repositories(user: dict, token: str) -> list:
    private_repos = int(user['owned_private_repos'])
    public_repos = int(user['public_repos'])
    total_repo_count = private_repos + public_repos

    async with aiohttp.ClientSession() as session:
        requests = [
            asyncio.ensure_future(
                fetch(
                    session,
                    f'{API_BASE}/user/repos?type=owner&sort=full_name&page={page}',
                    headers={
                        'Authorization': f'token {token}',
                    },
                )
            )
            for page in range(math.ceil(total_repo_count / PER_PAGE_COUNT))
        ]
        return await asyncio.gather(*requests)
def save_search_result(page, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, page)
    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError, IOError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            raise GreenletExit()
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        return save_search_result(page, queue, retry)

    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been banned; switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            print('retry too much!')
            raise GreenletExit()
        return save_search_result(page, queue, retry)

    articles = results.find_all('div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)

    page_container = soup.find(id='pagebar_container')
    if page_container and u'下一页' in page_container.text:
        last_page = int(page_container.find_all('a')[-2].text)
        current_page = int(page_container.find('span').text)
        for page in range(current_page + 1, last_page + 1):
            put_new_page(page, queue)
def xici():
    """
    Xici proxies: http://www.xicidaili.com
    """
    url = "http://www.xicidaili.com/{}"
    items = []
    for page in range(1, 8):
        items.append(("wt/{}".format(page), "http://{}:{}"))
        items.append(("wn/{}".format(page), "https://{}:{}"))
    for item in items:
        proxy_type, host = item
        html = fetch(url.format(proxy_type))
        if html:
            doc = pyquery.PyQuery(html)
            for proxy in doc("table tr").items():
                ip = proxy("td:nth-child(2)").text()
                port = proxy("td:nth-child(3)").text()
                if ip and port:
                    yield host.format(ip, port)
def ip_66():
    """
    66ip proxies: http://www.66ip.cn
    """
    from copy import deepcopy
    headers = deepcopy(HEADERS)
    cookie = loop.run_until_complete(get_cookie())
    headers.update({
        "Cookie": cookie,
        "Host": "www.66ip.cn",
        "Referer": "http://www.66ip.cn/nm.html"
    })
    url = 'http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype={}&api=66ip'
    pattern = r"\d+\.\d+\.\d+\.\d+:\d+"
    items = [(0, "http://{}"), (1, "https://{}")]
    for item in items:
        proxy_type, host = item
        html = fetch(url.format(proxy_type), headers=headers)
        if html:
            for proxy in re.findall(pattern, html):
                yield host.format(proxy)
def _run(self, isbn):
    url = 'http://www.livelib.ru/find/%s' % isbn
    soup = BeautifulSoup(fetch(url))
    try:
        result = dict()
        result['title'] = soup.find('div', {'class': 'title'}).a.string
        result['author'] = soup.find('a', {'class': 'author unnoticeable'}).string
        span_info = soup.find('span', {'class': 'info'})
        result['publisher'] = span_info.string.replace('«', '').replace('»', '')
        span_info = span_info.nextSibling
        span_info = span_info.nextSibling
        result['date'] = span_info.string.replace(u' г.', '')
        span_info = span_info.nextSibling
        span_info = span_info.nextSibling
        result['isbn'] = isbn  # span_info.string.replace(u'ISBN: ', '').replace('-', '')
        result['source'] = url
        result['photo'] = soup.find('div', {'class': 'thumbnail'}).a.img['src'].replace('/s/', '/l/')  # small size -> large
        return result
    except:
        return None
def register():
    fb_uid = request.forms.uid
    lon = request.forms.lon
    lat = request.forms.lat
    loc = [float(lon), float(lat)]
    access_token = request.forms.access_token
    url = 'https://api.facebook.com/method/fql.query?query=SELECT+uid%2C+name%2C+pic_big%2C+sex+FROM+user+WHERE+uid+%3D'
    url += fb_uid
    url += '&access_token='
    url += access_token
    url += '&format=json'
    fb_user_data = json.loads(utils.fetch(url))
    if 'error_code' in fb_user_data:
        return 'error'
    else:
        fb_user = fb_user_data[0]
        db.users.update({'fb_uid': fb_uid},
                        {'$set': {'name': fb_user['name'],
                                  'pic_big': fb_user['pic_big'],
                                  'loc': loc}},
                        True)
        distance = 0.250  # 250 meters
        radians_distance = distance / 6371.0  # the radius of the earth is 6371 km
        cursor = db.users.find(
            {"loc": {"$nearSphere": loc, "$maxDistance": radians_distance}},
            {'name': 1, 'fb_uid': 1, 'pic_big': 1}).limit(200)
        nearby = list((record) for record in cursor)
        return utils.jsondumps(nearby)
def crawl_detail(url):
    '''
    Fetch the product details.
    '''
    data = utils.fetch(url)
    soup = bs(data, 'lxml')
    details = {}

    # link to Amazon
    amazon = soup.find('a', class_='to_amazon_button')
    if amazon:
        details['amazon'] = amazon['href']

    # specifications
    for row in soup.find('div', class_='item_detail_product').find_all('tr'):
        cols = row.find_all('td')
        # skip rows that are not in "name | value" form (two columns)
        if len(cols) != 2:
            continue
        details[cols[0].string] = cols[1].string
    return details
def download(self):
    "Download the best torrent for the current episode"
    torrent = self.best_candidate()
    if not torrent:
        return
    log.info(" - Downloading %s" % torrent.url)
    try:
        req = fetch(torrent.url)
        with open(config.torrents_dir + '/' + torrent.name + '.torrent', 'wb') as f:
            f.write(req.read())
    except Exception as e:
        torrent.failed_download = torrent.get('failed_download', 0) + 1
        print(" - Failed for the %d time(s) [%s, %s]" % (torrent.failed_download, type(e), e))
        self.hits.save()
        return
    guessed_next = numbering_from_str(torrent.next_episode)
    next, _ = self.get_episodes_after(guessed_next)
    self.hits.current = next if next else guessed_next
    log.info(" - Current episode is now %s" % self.hits.current)
    torrent.downloaded = True
    self.hits.save()
def run(self, isbn):
    url = 'http://www.labirint.ru/search/?txt=%s' % (isbn)
    try:
        reg = rx_content.search(fetch(url).decode('cp1251'))
        soup = BeautifulSoup(reg.group(0))
        result = dict()
        result['title'] = soup.find('span', {'class': 'fn'}).next.next.attrs[1][1]
        result['photo'] = soup.find('span', {'class': 'photo'}).next.next.attrs[1][1]
        result['series'] = soup.find('span', {'class': 'category'}).next.next.attrs[1][1]
        result['publisher'] = soup.find('span', {'class': 'brand'}).next.next.attrs[1][1]
        div = soup.find('div', {'class': 'isbn smallbr'})
        result['isbn'] = isbn
        div = div.findNext('div')
        result['author'] = div.next.next.string
        div = div.findNext('div')
        reg = rx_date.search(div.next.next.next.next)
        if reg:
            result['date'] = reg.group(1)
        result['source'] = url
        return result
    except:
        return None
    return state, header


def filter_polygons(state, header):
    """
    Removes any non-polygon sources from the state file.

    We are only interested in parsing parcel data, which is
    marked as Polygon in the state file.
    """
    filtered_state = []
    for source in state:
        if 'Polygon' in source[header.index('geometry type')]:
            filtered_state.append(source)
    return filtered_state


if __name__ == '__main__':
    if not os.path.isfile(config.statefile_path):
        fetch(config.state_url, config.statefile_path)
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    raw_state, header = load_state()
    state = filter_polygons(raw_state, header)
    parse_statefile(state, header)
def install():
    fetch('http://piumarta.com/software/peg/peg-%(peg)s.tar.gz')
    extract('peg-%(peg)s.tar.gz')
    make('peg-%(peg)s', 'CC=clang')
    make('peg-%(peg)s', 'PREFIX=%s install' % env.prefix)
def install():
    fetch("http://www.fastcgi.com/dist/fcgi-%(fcgi)s.tar.gz")
    extract("fcgi-%(fcgi)s.tar.gz")
    configure("fcgi-%(fcgi)s", ["--prefix=%s" % env.prefix])
    make("fcgi-%(fcgi)s")
    make("fcgi-%(fcgi)s", "install")
     'gtkbin-1.7.3.zip', DRIVE_C),
    ('https://dl.dropboxusercontent.com/u/4780737/pywebkitgtk.zip',
     'pywebkitgtk.zip', SITE_PACKAGES),
]

FILES = [
    # Unneeded. Serves only as an example.
    #('https://dl.dropboxusercontent.com/u/4780737/gtkspell.pyd',
    # os.path.join(DRIVE_C_REAL, 'Python27', 'Lib', 'site-packages', 'gtkspell.pyd'))
]

print(HELP)

for url, filename in INSTALLERS:
    path = os.path.join(INSTALLERS_DIR, filename)
    fetch(url, path)
    install(path, use_wine=IS_LINUX)

for url, filename, dest in TARBALLS:
    path = os.path.join(INSTALLERS_DIR, filename)
    fetch(url, path)
    cmd = ['wine'] if IS_LINUX else []
    assert path.endswith('.zip'), path
    cmd.extend([SEVEN_ZIP, 'x', '-o' + dest, path])
    run(cmd)

for url, dest in FILES:
    fetch(url, dest)

# TODO: Remove once pygtkspellcheck lists its dependencies.
run(PYTHON + ['-m', 'pip', 'install', 'pyenchant==1.6.8'])