def login(request):
    if request.method == 'GET':
        header = get_headers()
        body = template('login.html')
        return header + '\r\n' + body
    elif request.method == 'POST':
        data = request.form()
        user = User.validate(data['username'], data['password'])
        if user:
            # Generate a 32-character session id made up of the digits 0-9
            session_id = ''.join(str(randint(0, 9)) for _ in range(32))
            # Store the session
            sessions[session_id] = user.id
            kwargs = {
                'Location': '/',
                'Set-Cookie': 'session_id:{}'.format(session_id),
            }
            # Build the response headers for a redirect
            header = get_headers(code=302, **kwargs)
            return header + '\r\n'
        else:
            header = get_headers()
            body = template('login.html', message='Login failed')
            return header + '\r\n' + body
    else:
        return error(request)
def index(request):
    if request.method == 'GET':
        user = current_user(request)
        todos = Todo.filter_by(user_id=user.id)
        body = template('index.html', username=user.username, todos=todos)
        return get_headers() + '\r\n' + body
    if request.method == 'POST':
        data = request.form()
        Todo.create_obj(user_id=current_user(request).id, **data)
        return get_headers(code=302, Location='/') + '\r\n'
def register(request):
    if request.method == 'GET':
        header = get_headers()
        body = template('register.html')
        return header + '\r\n' + body
    elif request.method == 'POST':
        data = request.form()
        User.create_obj(**data)
        body = template('register.html', message='Registration successful')
        header = get_headers()
        return header + '\r\n' + body
    else:
        return error(request)
def get_quotation():
    page_num = 1
    page_size = 10000
    fields = "f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f33,f11,f62,f128,f136,f115,f152"
    # k = ['f43', 'f44', 'f45', 'f46', 'f60', 'f71', 'f47', 'f48', 'f49', 'f161', 'f50', 'f55', 'f59', 'f84', 'f86',
    #      'f92', 'f116', 'f126', 'f152', 'f167', 'f164', 'f168', 'f169', 'f170', 'f171', 'f172']
    # fields = ",".join(k)
    start_url = "http://{h}.push2.eastmoney.com/api/qt/clist/get?" \
                "cb=jQuery1124012264592664044649_1565663112714&pn={pn}&pz={pz}&po=1&np=1" \
                "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:105,m:106,m:107" \
                "&fields={fields}" \
                "&_={t}"
    url = start_url.format(h=random.randint(1, 100),
                           pn=page_num,
                           pz=page_size,
                           t=str(time.time()).replace(".", "")[:13],
                           fields=fields)
    resp = s.get(url, headers=get_headers())
    data = json_loads(resp.text).get("data")
    quotations = handler_quotation(data.get("diff"))
    save_data(quotations, filename=filename)
    total = int(data.get("total"))
    logger.info("Fetched {} quotation records".format(total))
def get_follow_info_list_by_country(country, save_to=None):
    api_get_all_channels = api_prefix + '/api/contentQuery/channelsWithFollow'
    api_get_recommond_follows = api_prefix + '/api/contentQuery/recommendFollows?followType=&count=50'
    api_get_popular_follows = api_prefix + '/api/contentQuery/popularFollows'
    api_get_channel_follows = api_prefix + '/api/contentQuery/channelFollows?version=1&channelId={channel_id}'
    follow_info_list = []
    res_channels = requests.get(api_get_all_channels, headers=get_headers(country=country))
    if not log_res(res_channels, country['name']):
        return []
    channel_info_list = res_channels.json()['data']
    channel_list = [x['channelId'] for x in channel_info_list]
    api_get_channel_follows_list = [
        api_get_channel_follows.format(channel_id=x) for x in channel_list
    ]
    api_follows_list = api_get_channel_follows_list + [api_get_recommond_follows] + [api_get_popular_follows]
    for api in api_follows_list:
        follow_info_list.extend(get_follow_info_list_by_api(country, api, save_to))
    return follow_info_list
def main():
    args = get_args()
    config = utils.get_config(args.config)
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
    session = http_session.StorageSession(**config['session'],
                                          access_key=utils.get_access_token())
    root_dir = config['data']['root_dir']
    raw_path = utils.build_path(root_dir=root_dir, sub_dir=args.raw,
                                date=args.date, ext='json')
    data = download_data(session, path=raw_path)
    rows = parse_data(data)
    LOGGER.info("Retrieved %s rows", len(rows))
    headers = utils.get_headers(config['fields'])
    rows = transform.clean(rows, data_types=headers, date=args.date)
    output_path = utils.build_path(root_dir=root_dir, sub_dir=args.output,
                                   date=args.date, ext='csv')
    utils.write_csv(path=output_path, rows=rows, header=args.header)
def update(request):
    if request.method == 'GET':
        id = request.form().get('id', -1)
        todo = Todo.get_by(id=int(id))
        body = template('update.html', id=id, title=todo.title)
        return get_headers() + '\r\n' + body
    elif request.method == 'POST':
        data = request.form()
        obj_id = data.get('id', -1)
        # The id is an integer here
        todo = Todo.get_by(id=int(obj_id))
        todo.title = data['title']
        todo.save()
        return get_headers(code=302, Location='/') + '\r\n'
    else:
        return error(request)
def login():
    s = requests.Session()
    s.headers = utils.get_headers()
    sts_page = s.get("https://sts.platform.rmunify.com/Account/SignIn/kingstongrammar")
    html = bs(sts_page.text, "html.parser")
    form = html.find('form')
    rvt = form.find('input', {"name": "__RequestVerificationToken"})["value"]
    return_url = form.find('input', {"name": "returnUrl"})["value"]
    payload = {
        "__RequestVerificationToken": rvt,
        "UserName": c.email,
        "username2TxtGloLgn": c.email,
        "Password": c.password,
        "password2TxtGloLgn": c.password,
        "returnUrl": return_url
    }
    login_post = s.post("https://sts.platform.rmunify.com/Account/SignIn/kingstongrammar",
                        data=payload, allow_redirects=True)
    if "signInErrorMessage" in login_post.text:
        log.error("Failed to sign in.")
    else:
        log.success("Successfully signed in.")
    teams = s.get("https://teams.microsoft.com/", allow_redirects=True)
    print(teams.text)
async def google_search(email: str) -> dict:
    try:
        async with aiohttp.request(method='GET',
                                   url=f'{google_url}{email}',
                                   headers=get_headers()) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.text(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
def test_cookies(cookies):
    headers = utils.get_headers(cookies)
    soup = utils.get_url(chap_url, headers=headers)
    noveltext = ''
    isvalid = True
    try:
        noveltext = soup.select('div.noveltext')[0]
    except Exception:
        pass
    if noveltext == '':
        isvalid = False
    return isvalid
def structure_data(csv_path, output_path):
    infile = open(csv_path, 'r')
    reader = csv.reader(infile, delimiter=',', quotechar='|')
    lists = []
    for row in reader:
        row = str(row).replace('\n', '')
        if not row == '':
            lists.append(row)
    with open(output_path, 'w+') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        writer.writerow(get_headers())
        iteration = 1
        for lst in lists:
            print("Iteration: {}".format(iteration))
            try:
                row_split = lst.split('{')
                dict1 = ast.literal_eval('{' + row_split[1].replace('}', '')[:-4] + '}')  # batting and fielding dictionary
                dict2 = ast.literal_eval('{' + row_split[2].replace('}', '')[:-2] + '}')  # bowling dictionary
                name, country = getNameCountry(row_split[0])
                dict1, dict2 = fillingDict(dict1, dict2)
                # filter the bad out
                if len(dict1['Tests']) < 15:
                    continue
                if len(dict1['T20s']) < 15:
                    continue
                if len(dict1['ODIs']) < 15:
                    continue
                if len(dict2['Tests']) < 14:
                    continue
                if len(dict2['T20s']) < 14:
                    continue
                if len(dict2['ODIs']) < 14:
                    continue
                data = [name, country]
                data.extend(dict1['ODIs'])
                data.extend(dict1['Tests'])
                data.extend(dict1['T20s'])
                data.extend(dict2['ODIs'])
                data.extend(dict2['Tests'])
                data.extend(dict2['T20s'])
                writer.writerow(data)
            except Exception as E:
                print(E)
            iteration += 1
    print("impurity added: {}".format(impure))
async def cybernews(email: str) -> dict:
    data = {'lang': 'en_US', 'e': email}
    try:
        async with aiohttp.request(method='POST',
                                   url=cybernews_url,
                                   data=data,
                                   headers=get_headers()) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.json(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
async def haveibeenpwned(email: str) -> dict:
    try:
        async with aiohttp.request(method='GET',
                                   url=f'{haveibeenpwned_url}{email}',
                                   headers=get_headers()) as resp:
            if resp.status == 200:
                return result(email=email, service=__name__, is_leak=True)
            elif resp.status == 404:
                return result(email=email, service=__name__, is_leak=False)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
async def avast_hackcheck(email: str) -> dict:
    data = json.dumps({'emailAddresses': [email]})
    try:
        async with aiohttp.request(method='POST',
                                   url=avast_url,
                                   data=data,
                                   headers=get_headers(headers)) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.json(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
def __init__(self, novel_name, chapter_bgn, chapter_end, cookies):
    userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    # userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0'
    self.novel_name = novel_name
    self.chapter_bgn = chapter_bgn
    self.chapter_end = chapter_end
    self.cookies = cookies
    self.headers = utils.get_headers(cookies)
    self.blank = ' '
    self.space = ' '
    self.newline = '\n\n'
    self.split = self.space + '----'
def get_follow_info_list_by_api(country, api, save_to=None):
    res_follow_info_list = requests.get(api, headers=get_headers(country=country))
    if not log_res(res_follow_info_list, country['name']):
        return []
    follow_info_list = res_follow_info_list.json()['data']
    if save_to:
        with open(save_to, 'ab') as file:
            for follow_info in follow_info_list:
                file.write('{},{},{},{}\n'.format(country['id'], follow_info['id'],
                                                  follow_info['name'], api).encode('utf-8'))
    return follow_info_list
async def lifelock(email: str) -> dict:
    bemail = base64.b64encode(email.encode('UTF-8'))
    data = {'email': bemail.decode('UTF-8'), 'language': 'en', 'country': 'us'}
    try:
        async with aiohttp.request(method='POST',
                                   url=lifelock_url,
                                   data=data,
                                   headers=get_headers(headers)) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.json(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
def get_index_data():
    url = "http://58.push2.eastmoney.com/api/qt/clist/get?" \
          "cb=jQuery1124005752417505401741_1565678085560&pn=1&pz=20&po=1&np=1" \
          "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3" \
          "&fs=i:100.NDX,i:100.DJIA,i:100.SPX" \
          "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f33,f11,f62,f128,f136,f115,f152,f124,f107" \
          "&_=1565678085561"
    resp = s.get(url, headers=get_headers())
    data = json_loads(resp.text).get("data")
    index_data = handler_index(data.get("diff"))
    save_data(index_data, filename=filename)
    total = data.get("total")
    logger.info("Fetched {} index records".format(total))
def function_create():
    with utils.AtomicRequest() as atomic:
        function_id = uuid.uuid4().hex
        atomic.driver_endpoint = driver_endpoint
        user, tenant = utils.get_headers(request)
        zip_file = utils.get_zip(request)
        zip_url = utils.upload_zip(function_id, zip_file)
        if not zip_url:
            atomic.errors = True
            return critical_error('Not able to store zip.')
        atomic.zip_url = zip_url
        metadata = utils.get_metadata(request)
        if not utils.validate_json(utils.build_schema, metadata):
            atomic.errors = True
            return bad_request("Error validating json.")
        tag = "{0}_{1}_{2}".format(tenant, user, metadata.get('name'))
        payload = {
            "memory": metadata.get('memory'),
            "tags": [tag],
            "runtime": metadata.get('runtime'),
            "zip_location": zip_url,
            "name": metadata.get('name')
        }
        image_id = utils.create_image(driver_endpoint, payload)
        atomic.image_id = image_id
        function = utils.create_function(tenant, user, function_id, image_id,
                                         zip_url, tag, metadata)
        if not function:
            atomic.errors = True
            return critical_error('Error building the function.')
        return Response(function_id, status=201)
def get_page(self, url, proxies=None):
    logging.info(f'Start crawling {url}')
    retry = 1
    while True:
        try:
            r = requests.get(url, headers=get_headers(), proxies=proxies, timeout=8)
            # r.encoding = chardet.detect(r.content)['encoding']
            logging.info(f'{r.status_code} {url} {r.encoding}')
            if r.status_code == 200:
                return r.text
            else:
                raise ConnectionError
        except Exception as e:
            retry += 1
            print(e)
            logging.info(f'Request for {url} failed, waiting 3s before retry #{retry}')
            time.sleep(3)
            if retry == 4:
                logging.info(f'Already retried {retry} times, skipping')
                break
def get_follow_article_count(country, follow_info_list):
    api_get_follow_article = api_prefix + '/api/contentQuery/followArticles?followId={follow_id}&lastId=first&count=20'
    follow_article_count_list = []
    for follow_info in follow_info_list:
        time.sleep(0.2)
        follow_id = follow_info['id']
        follow_name = follow_info['name']
        api = api_get_follow_article.format(follow_id=follow_id)
        res = requests.get(api, headers=get_headers(country=country))
        if not log_res(res, country['name']):
            continue
        cnt = len(res.json()['data'])
        follow_article_count_list.append({
            'oper_id': country['id'],
            'country': country['name'],
            'follow_id': follow_id,
            'follow_name': follow_name,
            'article_cnt': cnt,
            'api': api
        })
    return follow_article_count_list
def plot(filename):
    # load the data
    freq = load_table_data(filename)
    rows, cols = get_headers(freq)
    # normalize_rows(freq)
    # normalize_cols(freq)

    # take a subset of the data
    # TODO: this should be done after sorting the arrays
    # but how to sort both of them?
    rows = rows[:100]
    cols = cols[:100]
    # print rows

    # calculate largest value to plot
    largest = max(freq.values())

    import svgfig as sf
    sf._canvas_defaults["viewBox"] = "0 0 5000 5000"
    sf._canvas_defaults["width"] = "5000px"
    sf._canvas_defaults["height"] = "5000px"

    # svg files are based a lot on 'groups'
    # groups are quite similar to illustrator groups
    # transformations (scale, translate, rotate) can be applied
    # only on groups, NOT on items directly, so we often need
    # to create a 'dumb' group with only one item in it..

    # here we create a group that contains everything else
    # properties of groups are inherited by all items contained in them
    # if an item defines its own properties, this overrides the group properties
    everything = sf.SVG("g", fill_opacity="100%")
    everything.attr["style"] = {"stroke": "none", "fill": "blue"}

    # title of the graph
    # see http://www.w3schools.com/svg/svg_text.asp the SVG function
    # is a shallow wrapper around svg
    title = sf.SVG(
        "text",
        filename,
        font_size="13",
        x=20,
        y=35,
        fill="#333333",
        stroke="none",
        style="text-anchor:start; font-family:verdana;",
    )
    line = sf.SVG("line", x1="20", y1="43", x2="620", y2="43",
                  stroke="#000000", style="stroke-width: 1;")
    subtitle = sf.SVG(
        "text",
        len(freq),
        font_size="12",
        x=20,
        y=60,
        fill="grey",
        stroke="none",
        style="text-anchor:start; font-family:verdana;",
    )
    title_group = sf.SVG("g", title, subtitle, line)
    everything.append(title_group)

    # the size of the main body of the plot (bubbles) needs to be scaled
    # based on how many things we are plotting --
    # here we calculate the scale factor
    # scale_factor = 340.0 / (len(cols) * 10.0)
    # scale_string = 'scale(%f)' % (scale_factor)
    # bubbles_group = sf.SVG("g",
    #                        fill_opacity="100%",
    #                        stroke="none",
    #                        #width=len(cols)*10, height=len(rows)*10
    #                        #transform=(translate_string + ' ' + scale_string)
    #                        )
    bubbles_group = sf.SVG("g", fill_opacity="100%", stroke="none",
                           transform="translate(120, 160), scale(2)")

    # draw a frame
    l = sf.SVG(
        "rect",
        x=0,
        y=-5,
        width=len(cols) * 10,
        height=len(rows) * 10,
        fill="#dddddd",
        stroke="#ffffff",
        style="stroke-width: 1;",
    )
    bubbles_group.append(l)

    # this for loop iterates over the column headers (cols)
    # and plots each of them as a string, rotating each individually
    for x, header in enumerate(cols):
        tx = 10 * (x + 1)
        ty = -8
        t = sf.SVG(
            "text", header, x=tx, y=ty, fill="black", font_size="5",
            style="text-anchor:start; font-family:verdana;"
        )
        tg = sf.SVG("g", t, transform="translate(-5,0)"
                    "rotate(%d, %d, %d)" % (-45, tx, ty))
        bubbles_group.append(tg)
        # draw vertical lines
        if x % 2 == 0 and len(rows) > 1:
            v = sf.SVG(
                "rect",
                x=(10 * x),
                y=-5,
                width=10,
                height=len(rows) * 10,
                fill="none",
                stroke="#ffffff",
                style="stroke-width: 1;",
            )
            bubbles_group.append(v)

    # this loop iterates over the actual data and plots it row by row
    # at the beginning of each row we also plot the row header
    for y, row_name in enumerate(rows):
        curr_y = 10 * y
        t = sf.SVG(
            "text",
            row_name,
            x=-5,
            y=curr_y + 2,
            fill="black",
            font_size="5",
            style="text-anchor:end; font-family:verdana;",
        )
        bubbles_group.append(t)
        # draw horizontal lines
        if y % 2 == 0 and len(cols) > 1:
            h = sf.SVG(
                "rect",
                x=0,
                y=curr_y - 5,
                width=len(cols) * 10,
                height=10,
                fill="none",
                stroke="#ffffff",
                style="stroke-width: 1;",
            )
            bubbles_group.append(h)
        # here we plot the actual data
        for x, col_name in enumerate(cols):
            val = freq[(row_name, col_name)]
            if [row_name] == [col_name]:
                r = sf.SVG(
                    "rect",
                    x=10 * x,
                    y=curr_y - 5,
                    width=10,
                    height=10,
                    fill="#999999",
                    stroke="#ffffff",
                    style="stroke-width: 1;",
                )
                bubbles_group.append(r)
            if val > 0 and [row_name] != [col_name]:
                val = float(val) * 2.0 / float(largest) * 2
                c = sf.SVG("circle", cx=10 * x + 5, cy=curr_y, r=val)
                c.attr["class"] = "bubble"
                bubbles_group.append(c)

    everything.append(bubbles_group)

    # save to file
    # name the file according to input filename..
    out_filename = filename.replace(".txt", ".svg")
    out_filename = out_filename.replace(".csv", ".svg")
    print out_filename
    everything.save(out_filename)
def main():
    # create dirs
    root_dir = Path(__file__).resolve().parents[0]
    if SECRET_KEY:
        data_dir = Path('/data/')
        dump_dir = Path('/data/dump/')
    else:
        data_dir = root_dir / 'data'
        dump_dir = root_dir / 'dump'
    mkdirs(data_dir, dump_dir)

    # load book_download_urls
    book_download_urls = read(data_dir / 'book_download_urls.txt').splitlines()

    # remove any books that have already been downloaded
    book_download_urls = [
        'https://www.smashwords.com' + url for url in book_download_urls
        if not (data_dir / f'{get_book_id(url)}.txt').exists()
    ]

    if book_download_urls:
        # keep only the first 500 (as smashwords blocks the IP-address after 500 requests)
        book_download_urls = book_download_urls  # [:500]

        # get headers (user-agents)
        headers = get_headers(root_dir / 'user-agents.txt')

        # initialize cache-controlled session
        session = CacheControl(Session())

        # get the books (concurrently)
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            for nb_retry in count(1):
                # break if all book_download_urls successful
                if not book_download_urls:
                    break

                # break if max number of retries exceeded
                if nb_retry > NB_RETRIES:
                    LOGGER.warning(
                        f'Could not download {len(book_download_urls)} books after {NB_RETRIES} retries.'
                    )
                    break

                # maintain a list of failed downloads (for future retries)
                failed_book_download_urls = []

                proxies = get_proxies()

                # get the book_responses
                book_responses = list(
                    tqdm(executor.map(get, book_download_urls, repeat(session),
                                      cycle(headers), cycle(proxies)),
                         total=len(book_download_urls),
                         desc='Getting books'))

                # dump the book_responses
                dump(book_responses, 'book_responses.pkl', dump_dir=dump_dir)

                for book_url, book_r in zip(book_download_urls, book_responses):
                    # print("Book content: {}".format(book_r.content))
                    if book_r is not None:
                        if book_r.status_code == 200:
                            book_r.encoding = 'utf-8'
                            # write the content to disk
                            write(book_r.content, data_dir / f'{get_book_id(book_url)}.txt')
                        else:
                            failed_book_download_urls.append(book_url)
                            LOGGER.warning(
                                f'Request failed for {book_url}: status code [{book_r.status_code}]'
                            )
                    else:
                        LOGGER.warning(
                            f"The request for the book_url '{book_url}' was None."
                        )

                book_download_urls = failed_book_download_urls
def delete(request):
    data = request.form()
    obj_id = data.get('id', -1)
    Todo.delete(int(obj_id))
    return get_headers(code=302, Location='/') + '\r\n'
                 trade_start_date, trade_final_date)}

    results[dataset_name + ':' + mode], jobs = explore_models(
        classifiers=classifiers, df=df, prices=prices,
        dataset_name=dataset_name, magic_number=magic_number,
        trading_params=trading_params, dates=dates)
    total_jobs += jobs

    exec_id = uuid4().hex[:8]
    result_file = open(os.path.join(save_path, 'results_%s.csv' % exec_id), 'w')

    # Log information about the execution
    print("Tasks launched: \n\t* Get data: %s\n\t* Training: %s\n\t* Total: %s"
          % (4 * len(datasets), total_jobs, 1 + 4 * len(datasets) + total_jobs))

    # Print the models performance as the tasks finish.
    result_file.write(get_headers(trading_params) + '\n')
    print(get_headers(trading_params))

    clean_results = wait_results(results, log=True, datasets=datasets, f=result_file)

    total_time = time()

    # Save the py object containing all Portfolios for each model.
    save_obj(clean_results,
             os.path.join(save_path,
                          'clean_results_%s_%s' % (symbols_list_name, exec_id)))

    # Print each portfolio per trading session for each model.
    print(clean_results)
    print("Total time: %.3f" % (total_time - start_time))
def main():
    # create dirs
    root_dir = Path(__file__).resolve().parents[1]
    data_dir = root_dir / 'data'
    dump_dir = root_dir / 'dump'
    mkdirs(data_dir, dump_dir)

    gold_proxies = [
        'https://51.158.186.242:8811',
    ]
    proxies = []
    print(proxies)
    proxy_idx = 0

    while True:
        # load book_download_urls
        book_download_urls = read(root_dir / 'book_download_urls.txt', 'r').splitlines()

        # remove any books that have already been downloaded
        book_download_urls = [
            url for url in book_download_urls
            if not (data_dir / f'{get_book_id(url)}.txt').exists()
        ]

        if book_download_urls:
            # keep only the first 500 (as smashwords blocks the IP-address after 500 requests)
            book_download_urls = book_download_urls[:48]

            # get headers (user-agents)
            headers = get_headers(root_dir / 'user-agents.txt')

            # initialize cache-controlled session
            session = CacheControl(Session())

            # get the books (concurrently)
            with ThreadPoolExecutor(max_workers=6) as executor:
                for nb_retry in count(1):
                    # break if all book_download_urls successful
                    if not book_download_urls:
                        break

                    # break if max number of retries exceeded
                    # if nb_retry > NB_RETRIES:
                    #     print(f'Could not download {len(book_download_urls)} books after {NB_RETRIES} retries.')
                    #     break

                    cur_proxy = proxies[proxy_idx]
                    print(f'current proxy: {cur_proxy} (#{proxy_idx})')

                    # maintain a list of failed downloads (for future retries)
                    failed_book_download_urls = []
                    nr_books = len(book_download_urls)

                    # get the book_responses
                    book_responses = list(
                        tqdm(executor.map(get, book_download_urls, repeat(session),
                                          cycle(headers), repeat(cur_proxy)),
                             total=len(book_download_urls),
                             desc='Getting books'))

                    # dump the book_responses
                    dump(book_responses, 'book_responses.pkl')

                    for book_url, book_r in zip(book_download_urls, book_responses):
                        if book_r is not None:
                            if book_r.status_code == 200:
                                book_r.encoding = 'utf-8'
                                # write the content to disk
                                write(book_r.content, data_dir / f'{get_book_id(book_url)}.txt')
                            else:
                                failed_book_download_urls.append(book_url)
                                print(f'Request failed for {book_url}: status code [{book_r.status_code}]')

                    nr_failure = len(failed_book_download_urls)
                    book_download_urls = failed_book_download_urls
                    if nr_failure == nr_books:
                        proxy_idx += 1
# Read the js file
with open('token.js', encoding='utf-8') as f:
    js = f.read()

# Compile it into a js object via execjs
tokenjs = execjs.compile(js)

tokenKey = "5ec029c599f7abec29ebf1c50fcc05a0"

options_header = utils.get_headers('''
Host: api.busyluo.org
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Access-Control-Request-Method: POST
Access-Control-Request-Headers: content-type,x-app,x-time,x-token
Referer: https://www.busyluo.org/
Origin: https://www.busyluo.org
Connection: keep-alive
TE: Trailers
''')
# print(options_header)
# res = requests.options("https://api.busyluo.org/4.0/main/signin", headers=options_header)
# xtime = str(hex(int(time.time())).replace('0x', ''))

lecture_header = '''
Host: api.busyluo.org
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0
def error(request):
    header = get_headers(code=404)
    body = template('404.html')
    return header + '\r\n' + body
def main():
    # create dirs
    root_dir = Path(__file__).resolve().parents[1]
    dump_dir = root_dir / 'dump'
    mkdirs(dump_dir)

    # determine search_urls (should be roughly 0.9B words in total)
    search_urls = [
        f'https://www.smashwords.com/books/category/1/downloads/0/free/medium/{i}'
        for i in range(0, 30000 + 1, 20)
    ]

    # get headers (user-agents)
    headers = get_headers(root_dir / 'user-agents.txt')

    # initialize cache-controlled session
    session = CacheControl(Session())

    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        # get/write book_page_urls
        book_page_urls = []
        with open(dump_dir / 'book_page_urls.txt', 'w') as f:
            for nb_retry in count(1):
                # break if all search_urls successful
                if not search_urls:
                    break

                # break if max number of retries exceeded
                if nb_retry > NB_RETRIES:
                    print(f'Could not get {len(search_urls)} search pages after {NB_RETRIES} retries.')
                    break

                # maintain a list of failed searches (for future retries)
                failed_search_urls = []

                # get the search_responses
                search_responses = list(
                    tqdm(executor.map(get, search_urls, repeat(session), cycle(headers)),
                         total=len(search_urls),
                         desc='Getting searches'))

                # dump the search_responses
                dump(search_responses, 'search_responses.pkl')

                for search_url, search_r in zip(search_urls, search_responses):
                    if search_r is not None:
                        if search_r.status_code == 200:
                            search_r.encoding = 'utf-8'
                            search_tree = html.fromstring(search_r.content)
                            search_tree.make_links_absolute(search_r.url)
                            try:
                                for book_page_url in search_tree.xpath('//a[@class="library-title"]/@href'):
                                    book_page_urls.append(book_page_url)
                                    f.write(book_page_url + '\n')
                            except IndexError:
                                failed_search_urls.append(search_url)
                                print(f'Request failed for {search_url}')
                        else:
                            failed_search_urls.append(search_url)
                            print(f'Request failed for {search_url}: status code [{search_r.status_code}]')

                search_urls = failed_search_urls

        # write book_download_urls.txt
        with open(root_dir / 'book_download_urls.txt', 'w') as f:
            for nb_retry in count(1):
                # break if all book_page_urls successful
                if not book_page_urls:
                    break

                # break if max number of retries exceeded
                if nb_retry > NB_RETRIES:
                    print(f'Could not get {len(book_page_urls)} book pages after {NB_RETRIES} retries.')
                    break

                # maintain a list of failed book_pages (for future retries)
                failed_book_page_urls = []

                # get the book_page_responses
                book_page_responses = list(
                    tqdm(executor.map(get, book_page_urls, repeat(session), cycle(headers)),
                         total=len(book_page_urls),
                         desc='Getting book pages'))

                # dump the book_page_responses
                dump(book_page_responses, 'book_page_responses.pkl')

                for book_page_url, book_page_r in zip(book_page_urls, book_page_responses):
                    if book_page_r is not None:
                        if book_page_r.status_code == 200:
                            book_page_r.encoding = 'utf-8'
                            book_page_tree = html.fromstring(book_page_r.content)
                            try:
                                # get relevant data
                                script_text = book_page_tree.xpath('//div[@id="contentArea"]/script/text()')[0]
                                _json = json.loads(
                                    script_text.split('window.angularData.book = ')[1].split('};')[0] + '}')
                                try:
                                    language = _json['language']['name']
                                    if language == 'English':
                                        formats = _json['formats']
                                        if 'TXT' in formats:
                                            f.write(book_page_tree.xpath(
                                                '//a[@title="Plain text; contains no formatting"]/@href')[0] + '\n')
                                        else:
                                            continue
                                except KeyError:
                                    continue
                            except IndexError:
                                failed_book_page_urls.append(book_page_url)
                                print(f'Request failed for {book_page_url}')
                        else:
                            failed_book_page_urls.append(book_page_url)
                            print(f'Request failed for {book_page_url}: status code [{book_page_r.status_code}]')

                book_page_urls = failed_book_page_urls
def main(settings):

    def _get_rdd(headers):
        return (
            # Read the data
            sc.textFile(settings['LOCAL_DATA_PATH'])
            # Remove the warning lines
            .filter(lambda x: not x.startswith('Warning'))
            # Map into a tuple
            .map(lambda x: x.split(settings['SEPARATOR']))
            # Replace 'NULL' values by None
            .map(lambda x: [v if v != 'NULL' else None for v in x])
            # Zip into a dictionary with headers
            .map(lambda x: dict(zip(headers, x)))
        )

    def _split_on_ground_thruth_field(data):
        unique_field_values = data.map(lambda x: x[settings['DEDUPER_GROUND_TRUTH_FIELD']]).distinct()
        train_values, test_values = unique_field_values.randomSplit(
            [1 - settings['TEST_RELATIVE_SIZE'], settings['TEST_RELATIVE_SIZE']],
            seed=settings['RANDOM_SEED'])
        train_data = (
            train_values.map(lambda x: (x, None)).leftOuterJoin(
                data.map(lambda x: (x[settings['DEDUPER_GROUND_TRUTH_FIELD']], x))
            )
            .map(lambda x: x[1][1])
        )
        test_data = (
            test_values.map(lambda x: (x, None)).leftOuterJoin(
                data.map(lambda x: (x[settings['DEDUPER_GROUND_TRUTH_FIELD']], x))
            )
            .map(lambda x: x[1][1])
        )
        return train_data, test_data

    def _get_precision(results):
        # results is an rdd of tuples of the form (true_value, predicted_value)
        # Precision: of all predicted matches, how many were real?
        # Count all predicted matches
        predicted_matches = results.filter(lambda x: x[1] == 1)
        denominator = predicted_matches.count()
        # Count how many of them are actually true
        numerator = predicted_matches.map(lambda x: x[0]).reduce(add)
        percentage = float(numerator) / denominator * 100
        return numerator, denominator, percentage

    def _get_recall(results):
        # results is an rdd of tuples of the form (true_value, predicted_value)
        # Recall: of all true matches, how many were retrieved?
        # Count all true matches
        true_matches = results.filter(lambda x: x[0] == 1)
        denominator = true_matches.count()
        # Count how many of them were retrieved
        numerator = true_matches.map(lambda x: x[1]).reduce(add)
        percentage = float(numerator) / denominator * 100
        return numerator, denominator, percentage

    def _predict_extra_block_pair(labeled_point, same_block_bool, logistic_regression):
        if not same_block_bool:
            return 0
        else:
            return logistic_regression.predict(labeled_point.features)

    # ********* MAIN ***************

    # Sanity check on settings
    utils.settings_sanity_check(settings)

    log_file = settings['LOG_FILE_PATH']

    # At some point, we will need the length of the distances vectors (features for ml)
    # later while constructing the labeled points..
    n_deduper_fields = len(settings['DEDUPER_FIELDS'])

    # Read the header line
    headers = utils.get_headers(settings['HEADER_LOCAL_DATA_PATH'], settings['SEPARATOR'])

    # Get the data in an RDD
    data = _get_rdd(headers)
    log_line("The whole dataset contains %d records" % data.count(), log_file)

    # Split labeled data and unlabeled data
    labeled_data = data.filter(lambda x: x[settings['DEDUPER_GROUND_TRUTH_FIELD']] is not None)
    unlabeled_data = data.filter(lambda x: x[settings['DEDUPER_GROUND_TRUTH_FIELD']] is None)
    log_line("%d records are labeled and %d records are unlabeled"
             % (labeled_data.count(), unlabeled_data.count()), log_file)

    # Split labeled data into a training and test datasets (on unique values of the
    # DEDUPER_GROUND_TRUTH_FIELD such that all true pairs are together in their dataset)
    train_data, test_data = _split_on_ground_thruth_field(labeled_data)
    log_line("Labeled data was split into %d records for training and %d records for testing"
             % (train_data.count(), test_data.count()), log_file)

    # Loop on all predicates that we want to try
    for predicate_function in settings['PREDICATE_FUNCTIONS']:
        log_line("\n***** Predicate function %s *************\n" % str(predicate_function), log_file)

        # Add the predicate key to training and test_data
        train_data = train_data.map(lambda x: utils.add_predicate_key(x, **predicate_function))
        test_data = test_data.map(lambda x: utils.add_predicate_key(x, **predicate_function))

        # Generate a new rdd with all intra-block pairs and the true value of whether or not
        # they are matches (based on the ground truth field)
        train_pairs = (
            # Transform into tuples of the form (<key>, <value>) where key is the predicate
            # and value is a list that will be extended with all elements of a block
            train_data.map(lambda x: (x['PredicateKey'], [x]))
            # Extend the list to get all dictionaries of a same block together
            .reduceByKey(lambda l1, l2: l1 + l2)
            # Generate all pairs of records from each block : (d1, d2)
            .flatMap(utils.generate_pairs)
            # Determine if the pair is a match and use this as a key -> (<match>, (d1, d2))
            .map(lambda x: (utils.records_are_matches(x[0], x[1], settings['DEDUPER_GROUND_TRUTH_FIELD']), (x[0], x[1])))
            # Convert dictionaries into a list of distance measures (one for each DEDUPER_FIELD) -> (<match>, [0.5, 1, ..])
            .map(lambda x: (x[0], utils.dict_pair_2_distance_list(x[1][0], x[1][1], settings['DEDUPER_FIELDS'])))
            # Convert list of distances into SparseVectors -> (<match>, SparseVector)
            .map(lambda x: (x[0], SparseVector(n_deduper_fields, dict([(i, v) for i, v in enumerate(x[1]) if v is not None]))))
            # Convert tuples into LabeledPoints (LabeledPoint)
            .map(lambda x: LabeledPoint(x[0], x[1]))
        )

        n_true_matches = train_pairs.filter(lambda x: x.label == 1).count()
        n_true_no_match = train_pairs.filter(lambda x: x.label == 0).count()
        log_line("When taking all intra-block pairs, we get %d true matches and %d true no-match"
                 % (n_true_matches, n_true_no_match), log_file)
        ratio = float(n_true_matches) / n_true_no_match

        # If the ratio is too unbalanced, balance it
        if ratio < 0.85 or ratio > 1.15:
            log_line("Intra-block pairs are too unbalanced, we will sample the biggest set "
                     "to get approximately the same number of each type", log_file)
            label_with_too_many = 0 if n_true_no_match > n_true_matches else 1
            keep_all_label = 0 if label_with_too_many == 1 else 1
            train_pairs = (
                # Keep all of the smaller set
                train_pairs.filter(lambda x: x.label == keep_all_label)
                .union(
                    # Add a sample of the bigger set
                    train_pairs.filter(lambda x: x.label == label_with_too_many)
                    .sample(False, ratio, seed=settings['RANDOM_SEED'])
                )
            )
            n_true_matches = train_pairs.filter(lambda x: x.label == 1).count()
            n_true_no_match = train_pairs.filter(lambda x: x.label == 0).count()
            log_line("After sampling intra-block pairs, we get %d true matches and %d true no-match"
                     % (n_true_matches, n_true_no_match), log_file)
        else:
            log_line("These intra-block pairs are balanced enough so we will keep all of them", log_file)

        # Train a logistic regression
        log_line("Training a logistic regression...", log_file)
        logistic_regression = LogisticRegressionWithSGD.train(train_pairs)

        # ******* Training results **************
        log_line("\nResults when comparing training intra-block pairs only:", log_file)

        # Build a rdd of tuples of the form: (true_label, predicted_label) for train and test data
        train_results = train_pairs.map(lambda x: (x.label, logistic_regression.predict(x.features)))

        # Precision and recall on training data
        numerator, denominator, percentage = _get_precision(train_results)
        log_line("Intra-block precision on training data: %d/%d = %.2f%%"
                 % (numerator, denominator, percentage), log_file)
        numerator, denominator, percentage = _get_recall(train_results)
        log_line("Intra-block recall on training data: %d/%d = %.2f%%"
                 % (numerator, denominator, percentage), log_file)

        # ******* Test results **************
        # Generate random pairs instead of intra-block pairs until the number of true matches is big enough
        n_true_matches_in_test_pairs = 0
        curr_n_pairs_in_test = 0
        fraction = 0
        while n_true_matches_in_test_pairs < settings['MIN_TRUE_MATCHES_FOR_EVALUATION'] and fraction < 0.5:
            log_line("\nGenerating a random set of pairs for testing the model...\n", log_file)
            # Taking 2 samples whose size is the square root of the number of pairs we want and then
            # excluding same-record pairs will give us a random sample of pairs of approximately the right size
            curr_n_pairs_in_test += N_PAIRS_TO_TEST
            fraction = float(sqrt(curr_n_pairs_in_test)) / test_data.count()
            random_test_pairs = (
                test_data.sample(False, fraction, seed=settings['RANDOM_SEED'])
                .map(lambda x: (True, x))
                .join(
                    test_data
                    .sample(False, fraction, seed=settings['RANDOM_SEED'])
                    .map(lambda x: (True, x))
                )
                .filter(lambda x: x[1][0] != x[1][1])
                # Only keep the tuple of 2 dictionaries
                .map(lambda x: x[1])
                # Determine if the pair is a match and use this as a key -> (<match>, (d1, d2))
                .map(lambda x: (utils.records_are_matches(x[0], x[1], settings['DEDUPER_GROUND_TRUTH_FIELD']), x))
                # Convert dictionaries into a list of distance measures (one for each DEDUPER_FIELD) -> (<match>, [0.5, 1, ..], (d1, d2))
                .map(lambda x: (x[0], utils.dict_pair_2_distance_list(x[1][0], x[1][1], settings['DEDUPER_FIELDS']), x[1]))
                # Convert list of distances into SparseVectors -> (<match>, SparseVector, (d1, d2))
                .map(lambda x: (x[0], SparseVector(n_deduper_fields, dict([(i, v) for i, v in enumerate(x[1]) if v is not None])), x[2]))
                # Convert tuples into LabeledPoints -> (LabeledPoint, (d1, d2))
                .map(lambda x: (LabeledPoint(x[0], x[1]), x[2]))
                # Determine if the pair is in the same block or not -> (LabeledPoint, <same_block>)
                .map(lambda x: (x[0], utils.records_in_same_block(x[1][0], x[1][1])))
            )
            n_true_matches_in_test_pairs = random_test_pairs.filter(lambda x: x[0].label == 1).count()

        # Matches in random pairs will be very rare, make sure there are at least some of them..
        log_line("Number of same block pairs in the test set: %d"
                 % random_test_pairs.filter(lambda x: x[1]).count(), log_file)
        log_line("Number of true matches in the test set: %d" % n_true_matches_in_test_pairs, log_file)
        if n_true_matches_in_test_pairs == 0:
            raise BaseException("Could not find enough true matches to test prediction and recall on labeled data.")

        # Get results (<true_label>, <predicted_label>) for random_test_pairs
        test_results = random_test_pairs.map(
            lambda x: (x[0].label, _predict_extra_block_pair(x[0], x[1], logistic_regression)))

        # Precision and recall on test data
        log_line("\nResults when comparing test pairs:", log_file)
        numerator, denominator, percentage = _get_precision(test_results)
        log_line("Precision on test data (intra and extra block pairs): %d/%d = %.2f%%"
                 % (numerator, denominator, percentage), log_file)
        numerator, denominator, percentage = _get_recall(test_results)
        log_line("Recall on test data (intra and extra block pairs): %d/%d = %.2f%%"
                 % (numerator, denominator, percentage), log_file)
        log_line("\n\n", log_file)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import requests
import json

import utils

url = utils.API_URL
token = utils.get_api_key()
headers = utils.get_headers(token)

# If you've already filed with a given agency, or otherwise want to exclude it, include its ID here.
EXISTING_PKS = []

# Agency ID 248 is a test agency under federal jurisdiction 10. This ID is subject to change,
# and will deduct requests from your account. Contact [email protected] and we'll add them back.
AGENCY_PKS = [248]

AGENCY_PKS = filter(lambda x: x not in EXISTING_PKS, AGENCY_PKS)

DOCS = """
A copy of your reports that are:
Annual
Monthly
Bimonthly
"""

TITLE = 'Records Request'  # Customize here for your project

for agency_pk in AGENCY_PKS:
    # get the jurisdiction
    r = requests.get(url + 'agency/{}/'.format(agency_pk), headers=headers)
    jurisdiction_pk = r.json()['jurisdiction']
    print 'Filing for {}...'.format(r.json()['name'])
async def haveibeensold(email: str) -> dict:
    data = {
        'email': email,
        'action': 'check'
    }
    try:
        async with aiohttp.request(method='POST',
                                   url=haveibeensold_url,
                                   data=data,
                                   headers=get_headers()) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.json(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)