Example #1
def login(request):
    if request.method == 'GET':
        header = get_headers()
        body = template('login.html')
        return header + '\r\n' + body
    elif request.method == 'POST':
        data = request.form()
        user = User.validate(data['username'], data['password'])
        if user:
            # Generate a 32-character string of digits 0-9
            session_id = ''.join(str(randint(0, 9)) for _ in range(32))
            # Store the session value
            sessions[session_id] = user.id
            kwargs = {
                'Location': '/',
                'Set-Cookie': 'session_id:{}'.format(session_id),
            }
            # Build the response headers to trigger a redirect
            header = get_headers(code=302, **kwargs)
            return header + '\r\n'
        else:
            header = get_headers()
            body = template('login.html', message='Login failed')
            return header + '\r\n' + body
    else:
        return error(request)
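Examples #1, #2, #3, #7, #24 and #28 all build raw HTTP responses by concatenating the result of get_headers() with '\r\n' and a body, but the helper itself is not shown in these snippets. Below is a minimal sketch of what such a get_headers(code=200, **kwargs) might look like, assuming it returns a status line plus header lines; the status-text table and the default Content-Type are assumptions, not the original implementation.

# Hypothetical sketch of the response-header builder used in Examples #1-#3, #7, #24 and #28.
# The real helper is not part of these snippets; names and defaults here are assumptions.
def get_headers(code=200, **kwargs):
    status_texts = {200: 'OK', 302: 'FOUND', 404: 'NOT FOUND'}  # assumed subset
    lines = ['HTTP/1.1 {} {}'.format(code, status_texts.get(code, 'OK'))]
    headers = {'Content-Type': 'text/html'}  # assumed default
    headers.update(kwargs)
    lines.extend('{}: {}'.format(k, v) for k, v in headers.items())
    # Callers append one more '\r\n' themselves to terminate the header block.
    return '\r\n'.join(lines) + '\r\n'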
Example #2
def index(request):
    if request.method == 'GET':
        user = current_user(request)
        todos = Todo.filter_by(user_id=user.id)
        body = template('index.html', username=user.username, todos=todos)
        return get_headers() + '\r\n' + body
    if request.method == 'POST':
        data = request.form()
        Todo.create_obj(user_id=current_user(request).id, **data)
        return get_headers(code=302, Location='/') + '\r\n'
Example #3
def register(request):
    if request.method == 'GET':
        header = get_headers()
        body = template('register.html')
        return header + '\r\n' + body
    elif request.method == 'POST':
        data = request.form()
        global next_id
        User.create_obj(**data)
        body = template('register.html', message='Registration successful')
        header = get_headers()
        return header + '\r\n' + body
    else:
        return error(request)
Example #4
def get_quotation():
    page_num = 1
    page_size = 10000
    fields = "f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f33,f11,f62,f128,f136,f115,f152"
    # k = ['f43', 'f44', 'f45', 'f46', 'f60', 'f71', 'f47', 'f48', 'f49', 'f161', 'f50', 'f55', 'f59', 'f84', 'f86',
    #  'f92', 'f116', 'f126', 'f152', 'f167', 'f164', 'f168', 'f169', 'f170', 'f171', 'f172']

    # fields = ",".join(keys)
    start_url = "http://{h}.push2.eastmoney.com/api/qt/clist/get?" \
                "cb=jQuery1124012264592664044649_1565663112714&pn={pn}&pz={pz}&po=1&np=1" \
                "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:105,m:106,m:107" \
                "&fields={fields}" \
                "&_={t}"

    url = start_url.format(h=random.randint(1, 100),
                           pn=page_num,
                           pz=page_size,
                           t=str(time.time()).replace(".", "")[:13],
                           fields=fields)
    resp = s.get(url, headers=get_headers())
    data = json_loads(resp.text).get("data")
    quotations = handler_quotation(data.get("diff"))
    save_data(quotations, filename=filename)
    total = int(data.get("total"))
    logger.info("获取行情数据 {} 条".format(total))
Example #5
def get_follow_info_list_by_country(country, save_to=None):
    api_get_all_channels = api_prefix + '/api/contentQuery/channelsWithFollow'
    api_get_recommond_follows = api_prefix + '/api/contentQuery/recommendFollows?followType=&count=50'
    api_get_popular_follows = api_prefix + '/api/contentQuery/popularFollows'
    api_get_channel_follows = api_prefix + '/api/contentQuery/channelFollows?version=1&channelId={channel_id}'

    follow_info_list = []

    res_channels = requests.get(api_get_all_channels,
                                headers=get_headers(country=country))
    if not log_res(res_channels, country['name']):
        return []
    channel_info_list = res_channels.json()['data']
    channel_list = [x['channelId'] for x in channel_info_list]

    api_get_channel_follows_list = [
        api_get_channel_follows.format(channel_id=x) for x in channel_list
    ]

    api_follows_list = api_get_channel_follows_list + [
        api_get_recommond_follows
    ] + [api_get_popular_follows]
    for api in api_follows_list:
        follow_info_list.extend(
            get_follow_info_list_by_api(country, api, save_to))

    return follow_info_list
Example #6
def main():
    args = get_args()
    config = utils.get_config(args.config)
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    session = http_session.StorageSession(**config['session'],
                                          access_key=utils.get_access_token())

    root_dir = config['data']['root_dir']

    raw_path = utils.build_path(root_dir=root_dir,
                                sub_dir=args.raw,
                                date=args.date,
                                ext='json')
    data = download_data(session, path=raw_path)

    rows = parse_data(data)

    LOGGER.info("Retrieved %s rows", len(rows))

    headers = utils.get_headers(config['fields'])
    rows = transform.clean(rows, data_types=headers, date=args.date)

    output_path = utils.build_path(root_dir=root_dir,
                                   sub_dir=args.output,
                                   date=args.date,
                                   ext='csv')
    utils.write_csv(path=output_path, rows=rows, header=args.header)
Example #7
def update(request):
    if request.method == 'GET':
        id = request.form().get('id', -1)
        todo = Todo.get_by(id=int(id))
        body = template('update.html', id=id, title=todo.title)
        return get_headers() + '\r\n' + body
    elif request.method == 'POST':
        data = request.form()
        obj_id = data.get('id', -1)
        # id is an integer here
        todo = Todo.get_by(id=int(obj_id))
        todo.title = data['title']
        todo.save()
        return get_headers(code=302, Location='/') + '\r\n'
    else:
        return error(request)
Example #8
def login():
    s = requests.Session()
    s.headers = utils.get_headers()

    sts_page = s.get(
        "https://sts.platform.rmunify.com/Account/SignIn/kingstongrammar")
    html = bs(sts_page.text, "html.parser")

    form = html.find('form')

    rvt = form.find('input', {"name": "__RequestVerificationToken"})["value"]
    return_url = form.find('input', {"name": "returnUrl"})["value"]

    payload = {
        "__RequestVerificationToken": rvt,
        "UserName": c.email,
        "username2TxtGloLgn": c.email,
        "Password": c.password,
        "password2TxtGloLgn": c.password,
        "returnUrl:": return_url
    }

    login_post = s.post(
        "https://sts.platform.rmunify.com/Account/SignIn/kingstongrammar",
        data=payload,
        allow_redirects=True)

    if "signInErrorMessage" in login_post.text:
        log.error("Failed to sign in.")
    else:
        log.success("Successfully signed in.")
        teams = s.get("https://teams.microsoft.com/", allow_redirects=True)
        print(teams.text)
Example #9
async def google_search(email: str) -> dict:
    try:
        async with aiohttp.request(method='GET',
                                   url=f'{google_url}{email}',
                                   headers=get_headers()) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.text(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
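The breach-check coroutines (Examples #9, #12, #13, #14, #17 and #32) pass get_headers(...) to aiohttp as request headers, so in that context the helper presumably returns a dict. A minimal sketch under that assumption; the default header values are guesses, not the original code.

# Hypothetical request-header helper for the aiohttp examples; the real one is not shown.
def get_headers(extra=None):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',  # assumed default UA
        'Accept': 'application/json, text/html;q=0.9',
    }
    if extra:  # Examples #14 and #17 pass an existing headers dict to merge in
        headers.update(extra)
    return headers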
Example #10
def test_cookies(cookies):
    headers = utils.get_headers(cookies)
    soup = utils.get_url(chap_url, headers=headers)
    noveltext = ''
    isvalid = True
    try:
        noveltext = soup.select('div.noveltext')[0]
    except IndexError:
        pass
    if noveltext == '':
        isvalid = False
    return isvalid
Example #11
def structure_data(csv_path, output_path):
    lists = []
    with open(csv_path, 'r') as infile:
        reader = csv.reader(infile, delimiter=',', quotechar='|')
        for row in reader:
            row = str(row).replace('\n', '')
            if row != '':
                lists.append(row)

    with open(output_path, 'w+') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        writer.writerow(get_headers())
        iteration = 1
        for lst in lists:
            print("Iteration: {}".format(iteration))
            try:
                row_split = lst.split('{')
                dict1 = ast.literal_eval('{' +
                                         row_split[1].replace('}', '')[:-4] +
                                         '}')  #batting and fielding dictionary
                dict2 = ast.literal_eval('{' +
                                         row_split[2].replace('}', '')[:-2] +
                                         '}')  #bowling dictionary
                name, country = getNameCountry(row_split[0])
                dict1, dict2 = fillingDict(dict1, dict2)
                #filter the bad out
                if len(dict1['Tests']) < 15:
                    continue
                if len(dict1['T20s']) < 15:
                    continue
                if len(dict1['ODIs']) < 15:
                    continue
                if len(dict2['Tests']) < 14:
                    continue
                if len(dict2['T20s']) < 14:
                    continue
                if len(dict2['ODIs']) < 14:
                    continue

                data = [name, country]
                data.extend(dict1['ODIs'])
                data.extend(dict1['Tests'])
                data.extend(dict1['T20s'])
                data.extend(dict2['ODIs'])
                data.extend(dict2['Tests'])
                data.extend(dict2['T20s'])
                writer.writerow(data)
            except Exception as E:
                print(E)
            iteration += 1

    print("impurity added: {}".format(impure))
Example #12
async def cybernews(email: str) -> dict:
    data = {'lang': 'en_US', 'e': email}
    try:
        async with aiohttp.request(method='POST',
                                   url=cybernews_url,
                                   data=data,
                                   headers=get_headers()) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.json(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
Example #13
async def haveibeenpwned(email: str) -> dict:
    try:
        async with aiohttp.request(method='GET',
                                   url=f'{haveibeenpwned_url}{email}',
                                   headers=get_headers()) as resp:
            if resp.status == 200:
                return result(email=email, service=__name__, is_leak=True)
            elif resp.status == 404:
                return result(email=email, service=__name__, is_leak=False)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
Example #14
async def avast_hackcheck(email: str) -> dict:
    data = json.dumps({'emailAddresses': [email]})
    try:
        async with aiohttp.request(method='POST',
                                   url=avast_url,
                                   data=data,
                                   headers=get_headers(headers)) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.json(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
Example #15
    def __init__(self, novel_name, chapter_bgn, chapter_end, cookies):
        userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
        # userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0'

        self.novel_name = novel_name
        self.chapter_bgn = chapter_bgn
        self.chapter_end = chapter_end
        self.cookies = cookies
        self.headers = utils.get_headers(cookies)
        self.blank = ' '
        self.space = '    '
        self.newline = '\n\n'
        self.split = self.space + '----'
Example #16
def get_follow_info_list_by_api(country, api, save_to=None):
    res_follow_info_list = requests.get(api,
                                        headers=get_headers(country=country))
    if not log_res(res_follow_info_list, country['name']):
        return []
    follow_info_list = res_follow_info_list.json()['data']
    if save_to:
        with open(save_to, 'ab') as file:
            for follow_info in follow_info_list:
                file.write('{},{},{},{}\n'.format(country['id'],
                                                  follow_info['id'],
                                                  follow_info['name'],
                                                  api).encode('utf-8'))
    return follow_info_list
Example #17
async def lifelock(email: str) -> dict:
    bemail = base64.b64encode(email.encode('UTF-8'))
    data = {'email': bemail.decode('UTF-8'), 'language': 'en', 'country': 'us'}
    try:
        async with aiohttp.request(method='POST',
                                   url=lifelock_url,
                                   data=data,
                                   headers=get_headers(headers)) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.json(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)
Example #18
def get_index_data():

    url = "http://58.push2.eastmoney.com/api/qt/clist/get?" \
          "cb=jQuery1124005752417505401741_1565678085560&pn=1&pz=20&po=1&np=1" \
          "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3" \
          "&fs=i:100.NDX,i:100.DJIA,i:100.SPX" \
          "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f33,f11,f62,f128,f136,f115,f152,f124,f107" \
          "&_=1565678085561"

    resp = s.get(url, headers=get_headers())

    data = json_loads(resp.text).get("data")
    index_data = handler_index(data.get("diff"))
    save_data(index_data, filename=filename)
    total = data.get("total")
    logger.info("获取数据 {} 条".format(total))
Example #19
def function_create():
    with utils.AtomicRequest() as atomic:

        function_id = uuid.uuid4().hex

        atomic.driver_endpoint = driver_endpoint

        user, tenant = utils.get_headers(request)

        zip_file = utils.get_zip(request)
        zip_url = utils.upload_zip(function_id, zip_file)

        if not zip_url:
            atomic.errors = True
            return critical_error('Not able to store zip.')

        atomic.zip_url = zip_url

        metadata = utils.get_metadata(request)

        if not utils.validate_json(utils.build_schema, metadata):
            atomic.errors = True
            return bad_request("Error validating json.")

        tag = "{0}_{1}_{2}".format(tenant, user, metadata.get('name'))
        payload = {
            "memory": metadata.get('memory'),
            "tags": [tag],
            "runtime": metadata.get('runtime'),
            "zip_location": zip_url,
            "name": metadata.get('name')
        }

        image_id = utils.create_image(driver_endpoint, payload)
        atomic.image_id = image_id

        function = utils.create_function(tenant, user, function_id, image_id,
                                         zip_url, tag, metadata)

        if not function:
            atomic.errors = True
            return critical_error('Error building the function.')

        return Response(function_id, status=201)
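In Example #19, utils.get_headers(request) unpacks into (user, tenant), which suggests this variant reads identity values from the incoming request's headers. A minimal sketch assuming a Flask-style request object; the header names are hypothetical, not the original utils implementation.

# Hypothetical: extract caller identity from the incoming request headers.
# The header names below are assumptions for illustration only.
def get_headers(request):
    user = request.headers.get('X-User-Id')
    tenant = request.headers.get('X-Tenant-Id')
    return user, tenant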
Example #20
    def get_page(self, url, proxies=None):
        logging.info(f'Start crawling {url}')
        retry = 1
        while True:
            try:
                r = requests.get(url, headers=get_headers(),
                                 proxies=proxies, timeout=8)
                # r.encoding = chardet.detect(r.content)['encoding']
                logging.info(f'{r.status_code} {url} {r.encoding}')
                if r.status_code == 200:
                    return r.text
                else:
                    raise ConnectionError
            except Exception as e:
                retry += 1
                print(e)
                logging.info(f'{url} request failed, waiting 3s before retry #{retry}')
                time.sleep(3)
            if retry == 4:
                logging.info(f'Retried {retry} times, skipping this URL')
                break
Example #21
def get_follow_article_count(country, follow_info_list):
    api_get_follow_article = api_prefix + '/api/contentQuery/followArticles?followId={follow_id}&lastId=first&count=20'
    follow_article_count_list = []
    for follow_info in follow_info_list:
        time.sleep(0.2)
        follow_id = follow_info['id']
        follow_name = follow_info['name']
        api = api_get_follow_article.format(follow_id=follow_id)
        res = requests.get(api, headers=get_headers(country=country))
        if not log_res(res, country['name']):
            continue
        cnt = len(res.json()['data'])
        follow_article_count_list.append({
            'oper_id': country['id'],
            'country': country['name'],
            'follow_id': follow_id,
            'follow_name': follow_name,
            'article_cnt': cnt,
            'api': api
        })

    return follow_article_count_list
Example #22
def plot(filename):
    # load the data
    freq = load_table_data(filename)
    rows, cols = get_headers(freq)
    # normalize_rows(freq)
    # normalize_cols(freq)

    # take a subset of the data
    # TODO: this should be done after sorting the arrays
    # but how to sort both of them?
    rows = rows[:100]
    cols = cols[:100]

    # print rows
    # calculate largest value to plot
    largest = max(freq.values())

    import svgfig as sf

    sf._canvas_defaults["viewBox"] = "0 0 5000 5000"
    sf._canvas_defaults["width"] = "5000px"
    sf._canvas_defaults["height"] = "5000px"

    # SVG files rely heavily on 'groups'
    # groups are quite similar to Illustrator groups
    # transformations (scale, translate, rotate) can be applied
    # only to groups, NOT to items directly, so we often need
    # to create a 'dumb' group with only one item in it

    # here we create a group that contains everything else
    # properties of groups are inherited by all items contained in them
    # if an item defines its own properties, this overrides the group properties
    everything = sf.SVG("g", fill_opacity="100%")
    everything.attr["style"] = {"stroke": "none", "fill": "blue"}

    # title of the graph
    # see http://www.w3schools.com/svg/svg_text.asp; the SVG function
    # is a shallow wrapper around raw SVG elements
    title = sf.SVG(
        "text",
        filename,
        font_size="13",
        x=20,
        y=35,
        fill="#333333",
        stroke="none",
        style="text-anchor:start; font-family:verdana;",
    )
    line = sf.SVG("line", x1="20", y1="43", x2="620", y2="43", stroke="#000000", style="stroke-width: 1;")
    subtitle = sf.SVG(
        "text",
        len(freq),
        font_size="12",
        x=20,
        y=60,
        fill="grey",
        stroke="none",
        style="text-anchor:start; font-family:verdana;",
    )
    title_group = sf.SVG("g", title, subtitle, line)
    everything.append(title_group)

    # the size of the main body of the plot (bubbles) needs to be scaled
    # based on how many things we are plotting --
    # here we calculate the scale factor
    #    scale_factor =  340.0 / (len(cols) * 10.0)
    #    scale_string = 'scale(%f)' % (scale_factor)
    #    bubbles_group = sf.SVG("g",
    #                           fill_opacity="100%",
    #                           stroke="none",
    #                           #width=len(cols)*10, height=len(rows)*10
    #                           #transform=(translate_string + ' ' + scale_string)
    #                           )

    bubbles_group = sf.SVG("g", fill_opacity="100%", stroke="none", transform="translate(120, 160), scale(2)")

    # draw a frame
    l = sf.SVG(
        "rect",
        x=0,
        y=-5,
        width=len(cols) * 10,
        height=len(rows) * 10,
        fill="#dddddd",
        stroke="#ffffff",
        style="stroke-width: 1;",
    )
    bubbles_group.append(l)

    # this for loop iterates over the column headers (cols)
    # and plots each of them as a string, rotating each individually
    for x, header in enumerate(cols):
        tx = 10 * (x + 1)
        ty = -8
        t = sf.SVG(
            "text", header, x=tx, y=ty, fill="black", font_size="5", style="text-anchor:start; font-family:verdana;"
        )
        tg = sf.SVG("g", t, transform="translate(-5,0)" "rotate(%d, %d, %d)" % (-45, tx, ty))
        bubbles_group.append(tg)

        # draw vertical lines
        if x % 2 == 0 and len(rows) > 1:
            v = sf.SVG(
                "rect",
                x=(10 * x),
                y=-5,
                width=10,
                height=len(rows) * 10,
                fill="none",
                stroke="#ffffff",
                style="stroke-width: 1;",
            )
            bubbles_group.append(v)

    #  this loop iterates over the actual data and plots it row by row
    # at the beginning of each row we also plot the row header
    for y, row_name in enumerate(rows):
        curr_y = 10 * y
        t = sf.SVG(
            "text",
            row_name,
            x=-5,
            y=curr_y + 2,
            fill="black",
            font_size="5",
            style="text-anchor:end; font-family:verdana;",
        )
        bubbles_group.append(t)

        # draw horizontal lines
        if y % 2 == 0 and len(cols) > 1:
            h = sf.SVG(
                "rect",
                x=0,
                y=curr_y - 5,
                width=len(cols) * 10,
                height=10,
                fill="none",
                stroke="#ffffff",
                style="stroke-width: 1;",
            )
            bubbles_group.append(h)

        # here we plot the actual data
        for x, col_name in enumerate(cols):
            val = freq[(row_name, col_name)]
            if row_name == col_name:
                r = sf.SVG(
                    "rect",
                    x=10 * x,
                    y=curr_y - 5,
                    width=10,
                    height=10,
                    fill="#999999",
                    stroke="#ffffff",
                    style="stroke-width: 1;",
                )
                bubbles_group.append(r)
            if val > 0 and row_name != col_name:
                val = float(val) * 2.0 / float(largest) * 2
                c = sf.SVG("circle", cx=10 * x + 5, cy=curr_y, r=val)
                c.attr["class"] = "bubble"
                bubbles_group.append(c)

    everything.append(bubbles_group)

    # save to file
    # name the file according to input filename..
    out_filename = filename.replace(".txt", ".svg")
    out_filename = out_filename.replace(".csv", ".svg")
    print(out_filename)
    everything.save(out_filename)
Example #23
def main():
    # create dirs
    root_dir = Path(__file__).resolve().parents[0]
    if SECRET_KEY:
        data_dir = Path('/data/')
        dump_dir = Path('/data/dump/')
    else:
        data_dir = root_dir / 'data'
        dump_dir = root_dir / 'dump'
    mkdirs(data_dir, dump_dir)

    # load book_download_urls
    book_download_urls = read(data_dir / 'book_download_urls.txt').splitlines()

    # remove any books that have already been downloaded
    book_download_urls = [
        'https://www.smashwords.com' + url for url in book_download_urls
        if not (data_dir / f'{get_book_id(url)}.txt').exists()
    ]

    if book_download_urls:
        # keep only the first 500 (as smashwords blocks the IP-address after 500 requests)
        book_download_urls = book_download_urls  #[:500]

        # get headers (user-agents)
        headers = get_headers(root_dir / 'user-agents.txt')

        # initialize cache-controlled session
        session = CacheControl(Session())

        # get the books (concurrently)
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            for nb_retry in count(1):
                # break if all book_download_urls successful
                if not book_download_urls:
                    break

                # break if max number of retries exceeded
                if nb_retry > NB_RETRIES:
                    LOGGER.warning(
                        f'Could not download {len(book_download_urls)} books after {NB_RETRIES} retries.'
                    )
                    break

                # maintain a list of failed downloads (for future retries)
                failed_book_download_urls = []

                proxies = get_proxies()

                # get the book_responses
                book_responses = list(
                    tqdm(executor.map(get, book_download_urls, repeat(session),
                                      cycle(headers), cycle(proxies)),
                         total=len(book_download_urls),
                         desc='Getting books'))

                # dump the book_responses
                dump(book_responses, 'book_responses.pkl', dump_dir=dump_dir)

                for book_url, book_r in zip(book_download_urls,
                                            book_responses):
                    #print("Book content: {}".format(book_r.content))
                    if book_r is not None:
                        if book_r.status_code == 200:
                            book_r.encoding = 'utf-8'

                            # write the content to disk
                            write(book_r.content,
                                  data_dir / f'{get_book_id(book_url)}.txt')
                        else:
                            failed_book_download_urls.append(book_url)
                            LOGGER.warning(
                                f'Request failed for {book_url}: status code [{book_r.status_code}]'
                            )
                    else:
                        LOGGER.warning(
                            f"The request for the book_url '{book_url}' was None."
                        )

                book_download_urls = failed_book_download_urls
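Examples #23, #26 and #29 call get_headers() with a user-agents.txt path and then cycle() over the result, implying it returns a list of header dicts, one per user-agent line. A sketch under that assumption; the file format and the extra Accept-Encoding header are guesses, not the original helper.

# Hypothetical: build one header dict per user-agent line so the downloader can rotate them.
from pathlib import Path

def get_headers(user_agents_path):
    user_agents = Path(user_agents_path).read_text(encoding='utf-8').splitlines()
    return [
        {'User-Agent': ua.strip(), 'Accept-Encoding': 'gzip, deflate'}
        for ua in user_agents if ua.strip()
    ]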
Example #24
def delete(request):
    data = request.form()
    obj_id = data.get('id', -1)
    Todo.delete(int(obj_id))
    return get_headers(code=302, Location='/') + '\r\n'
Example #25
                                      trade_start_date, trade_final_date)}

                results[dataset_name + ':' + mode], jobs = explore_models(
                    classifiers=classifiers, df=df, prices=prices,
                    dataset_name=dataset_name, magic_number=magic_number,
                    trading_params=trading_params, dates=dates)
                total_jobs += jobs


    exec_id = uuid4().hex[:8]
    result_file = open(os.path.join(save_path, 'results_%s.csv' % exec_id), 'w')

    # Log information about the execution
    print(
        "Tasks launched: \n\t* Get data: %s\n\t* Training: %s\n\t* Total: %s" %
        (4 * len(datasets), total_jobs, 1 + 4 * len(datasets) + total_jobs))

    # Print the models performance as the tasks finish.
    result_file.write(get_headers(trading_params) + '\n')
    print(get_headers(trading_params))
    clean_results = wait_results(results, log=True, datasets=datasets, f=result_file)
    total_time = time()

    # Save the py object containing all Portfolios for each model.
    save_obj(clean_results, os.path.join(save_path, 'clean_results_%s_%s' % (
        symbols_list_name, exec_id)))

    # Print each portfolio per trading session for each model.
    print(clean_results)
    print("Total time: %.3f" % (total_time - start_time))
Example #26
def main():
    # create dirs
    root_dir = Path(__file__).resolve().parents[1]
    data_dir = root_dir / 'data'
    dump_dir = root_dir / 'dump'
    mkdirs(data_dir, dump_dir)
    gold_proxies = [
        'https://51.158.186.242:8811',
    ]
    proxies = []
    print(proxies)

    proxy_idx = 0
    while True:
        # load book_download_urls
        book_download_urls = read(root_dir / 'book_download_urls.txt',
                                  'r').splitlines()

        # remove any books that have already been downloaded
        book_download_urls = [
            url for url in book_download_urls
            if not (data_dir / f'{get_book_id(url)}.txt').exists()
        ]

        if book_download_urls:
            # keep only the first 500 (as smashwords blocks the IP-address after 500 requests)
            book_download_urls = book_download_urls[:48]

            # get headers (user-agents)
            headers = get_headers(root_dir / 'user-agents.txt')

            # initialize cache-controlled session
            session = CacheControl(Session())

            # get the books (concurrently)
            with ThreadPoolExecutor(max_workers=6) as executor:
                for nb_retry in count(1):
                    # break if all book_download_urls successful
                    if not book_download_urls:
                        break

                    # break if max number of retries exceeded
                    # if nb_retry > NB_RETRIES:
                    # print(f'Could not download {len(book_download_urls)} books after {NB_RETRIES} retries.')
                    # break

                    cur_proxy = proxies[proxy_idx]
                    print(f'current proxy: {cur_proxy} (#{proxy_idx})')

                    # maintain a list of failed downloads (for future retries)
                    failed_book_download_urls = []
                    nr_books = len(book_download_urls)

                    # get the book_responses
                    book_responses = list(
                        tqdm(executor.map(get, book_download_urls,
                                          repeat(session), cycle(headers),
                                          repeat(cur_proxy)),
                             total=len(book_download_urls),
                             desc='Getting books'))

                    # dump the book_responses
                    dump(book_responses, 'book_responses.pkl')

                    for book_url, book_r in zip(book_download_urls,
                                                book_responses):
                        if book_r is not None:
                            if book_r.status_code == 200:
                                book_r.encoding = 'utf-8'

                                # write the content to disk
                                write(
                                    book_r.content,
                                    data_dir / f'{get_book_id(book_url)}.txt')
                            else:
                                failed_book_download_urls.append(book_url)
                                print(
                                    f'Request failed for {book_url}: status code [{book_r.status_code}]'
                                )

                    nr_failure = len(failed_book_download_urls)
                    book_download_urls = failed_book_download_urls

                    if nr_failure == nr_books:
                        proxy_idx += 1
Example #27
# Read the JS file
with open('token.js', encoding='utf-8') as f:
    js = f.read()
# Compile it into a JS object with execjs
tokenjs = execjs.compile(js)

tokenKey = "5ec029c599f7abec29ebf1c50fcc05a0"

options_header = utils.get_headers(
'''
Host: api.busyluo.org
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Access-Control-Request-Method: POST
Access-Control-Request-Headers: content-type,x-app,x-time,x-token
Referer: https://www.busyluo.org/
Origin: https://www.busyluo.org
Connection: keep-alive
TE: Trailers
'''
)
#print(options_header)
#res = requests.options("https://api.busyluo.org/4.0/main/signin", headers=options_header)

# xtime = str(hex(int(time.time())).replace('0x', ''))

lecture_header ='''
Host: api.busyluo.org
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0
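In Example #27, utils.get_headers() receives a raw header block copied from browser dev tools, so this variant presumably parses 'Name: value' lines into a dict. A minimal sketch of such a parser (an assumption for illustration, not the original utils implementation):

# Hypothetical parser for a pasted "Name: value" header block, as used for options_header above.
def get_headers(raw_headers):
    headers = {}
    for line in raw_headers.strip().splitlines():
        if ':' in line:
            name, _, value = line.partition(':')
            headers[name.strip()] = value.strip()
    return headers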
Example #28
def error(request):
    header = get_headers(code=404)
    body = template('404.html')
    return header + '\r\n' + body
Example #29
def main():
    # create dirs
    root_dir = Path(__file__).resolve().parents[1]
    dump_dir = root_dir / 'dump'
    mkdirs(dump_dir)

    # determine search_urls (should be roughly 0.9B words in total)
    search_urls = [
        f'https://www.smashwords.com/books/category/1/downloads/0/free/medium/{i}'
        for i in range(0, 30000 + 1, 20)
    ]

    # get headers (user-agents)
    headers = get_headers(root_dir / 'user-agents.txt')

    # initialize cache-controlled session
    session = CacheControl(Session())

    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        # get/write book_page_urls
        book_page_urls = []

        with open(dump_dir / 'book_page_urls.txt', 'w') as f:
            for nb_retry in count(1):
                # break if all search_urls successful
                if not search_urls:
                    break

                # break if max number of retries exceeded
                if nb_retry > NB_RETRIES:
                    print(
                        f'Could not get {len(search_urls)} search pages after {NB_RETRIES} retries.'
                    )
                    break

                # maintain a list of failed searches (for future retries)
                failed_search_urls = []

                # get the search_responses
                search_responses = list(
                    tqdm(executor.map(get, search_urls, repeat(session),
                                      cycle(headers)),
                         total=len(search_urls),
                         desc='Getting searches'))

                # dump the search_responses
                dump(search_responses, 'search_responses.pkl')

                for search_url, search_r in zip(search_urls, search_responses):
                    if search_r is not None:
                        if search_r.status_code == 200:
                            search_r.encoding = 'utf-8'
                            search_tree = html.fromstring(search_r.content)
                            search_tree.make_links_absolute(search_r.url)

                            try:
                                for book_page_url in search_tree.xpath(
                                        '//a[@class="library-title"]/@href'):
                                    book_page_urls.append(book_page_url)
                                    f.write(book_page_url + '\n')
                            except IndexError:
                                failed_search_urls.append(search_url)
                                print(f'Request failed for {search_url}')
                        else:
                            failed_search_urls.append(search_url)
                            print(
                                f'Request failed for {search_url}: status code [{search_r.status_code}]'
                            )

                search_urls = failed_search_urls

        # write book_download_urls.txt
        with open(root_dir / 'book_download_urls.txt', 'w') as f:
            for nb_retry in count(1):
                # break if all book_page_urls successful
                if not book_page_urls:
                    break

                # break if max number of retries exceeded
                if nb_retry > NB_RETRIES:
                    print(
                        f'Could not get {len(book_page_urls)} book pages after {NB_RETRIES} retries.'
                    )
                    break

                # maintain a list of failed book pages (for future retries)
                failed_book_page_urls = []

                # get the book_page_responses
                book_page_responses = list(
                    tqdm(executor.map(get, book_page_urls, repeat(session),
                                      cycle(headers)),
                         total=len(book_page_urls),
                         desc='Getting book pages'))

                # dump the book_page_responses
                dump(book_page_responses, 'book_page_responses.pkl')

                for book_page_url, book_page_r in zip(book_page_urls,
                                                      book_page_responses):
                    if book_page_r is not None:
                        if book_page_r.status_code == 200:
                            book_page_r.encoding = 'utf-8'
                            book_page_tree = html.fromstring(
                                book_page_r.content)

                            try:
                                # get relevant data
                                script_text = book_page_tree.xpath(
                                    '//div[@id="contentArea"]/script/text()'
                                )[0]
                                _json = json.loads(
                                    script_text.split(
                                        'window.angularData.book = ')[1].split(
                                            '};')[0] + '}')
                                try:
                                    language = _json['language']['name']

                                    if language == 'English':
                                        formats = _json['formats']

                                        if 'TXT' in formats:
                                            f.write(
                                                book_page_tree.xpath(
                                                    '//a[@title="Plain text; contains no formatting"]/@href'
                                                )[0] + '\n')
                                        else:
                                            continue
                                except KeyError:
                                    continue
                            except IndexError:
                                failed_book_page_urls.append(book_page_url)
                                print(f'Request failed for {book_page_url}')
                        else:
                            failed_book_page_urls.append(book_page_url)
                            print(
                                f'Request failed for {book_page_url}: status code [{book_page_r.status_code}]'
                            )

                book_page_urls = failed_book_page_urls
Example #30
def main(settings):
    def _get_rdd(headers):
        return (
            # Read the data
            sc.textFile(settings['LOCAL_DATA_PATH'])
                # Remove the warning lines
                .filter(lambda x: not x.startswith('Warning'))
                # Map into a tuple
                .map(lambda x: x.split(settings['SEPARATOR']))
                # Replace 'NULL' values by None
                .map(lambda x: [v if v != 'NULL' else None for v in x])
                # Zip into a dictionary with headers
                .map(lambda x: dict(zip(headers, x)))
        )

    def _split_on_ground_truth_field(data):
        unique_field_values = data.map(lambda x: x[settings['DEDUPER_GROUND_TRUTH_FIELD']]).distinct()
        train_values, test_values = unique_field_values.randomSplit([1 - settings['TEST_RELATIVE_SIZE'], settings['TEST_RELATIVE_SIZE']], seed=settings['RANDOM_SEED'])
        train_data = (
            train_values.map(lambda x: (x, None)).leftOuterJoin(
                data.map(lambda x: (x[settings['DEDUPER_GROUND_TRUTH_FIELD']], x))
            )
            .map(lambda x: x[1][1])
        )
        test_data = (
            test_values.map(lambda x: (x, None)).leftOuterJoin(
                data.map(lambda x: (x[settings['DEDUPER_GROUND_TRUTH_FIELD']], x))
            )
            .map(lambda x: x[1][1])
        )

        return train_data, test_data

    def _get_precision(results):
        # results is an rdd of tuples of the form (true_value, predicted_value)
        
        # Precision: of all predicted matches, how many were real?

        # Count all predicted matches
        predicted_matches = results.filter(lambda x: x[1] == 1)
        denominator = predicted_matches.count()

        # Count how many of them are actually true
        numerator = predicted_matches.map(lambda x: x[0]).reduce(add)

        percentage = float(numerator)/denominator*100

        return numerator, denominator, percentage

    def _get_recall(results):
        # results is an rdd of tuples of the form (true_value, predicted_value)

        # Recall : Of all true matches, how many were retrieved?

        # Count all true matches
        true_matches = results.filter(lambda x: x[0] == 1)
        denominator = true_matches.count()

        # Count how many of them were retrieved
        numerator = true_matches.map(lambda x: x[1]).reduce(add)

        percentage = float(numerator)/denominator*100

        return numerator, denominator, percentage

    def _predict_extra_block_pair(labeled_point, same_block_bool, logistic_regression):
        if not same_block_bool:
            return 0
        else:
            return logistic_regression.predict(labeled_point.features)


    # ********* MAIN ***************
    # Sanity check on settings
    utils.settings_sanity_check(settings)
    log_file = settings['LOG_FILE_PATH']

    # We will need the length of the distance vectors (the ML features) later when constructing the labeled points
    n_deduper_fields = len(settings['DEDUPER_FIELDS'])


    # Read the header line
    headers = utils.get_headers(settings['HEADER_LOCAL_DATA_PATH'], settings['SEPARATOR'])

    # Get the data in an RDD
    data = _get_rdd(headers)
    log_line("The whole dataset contains %d records" % data.count(), log_file)

    # Split labeled data and unlabeled data
    labeled_data = data.filter(lambda x: x[settings['DEDUPER_GROUND_TRUTH_FIELD']] is not None)
    unlabeled_data = data.filter(lambda x: x[settings['DEDUPER_GROUND_TRUTH_FIELD']] is None)
    log_line("%d records are labeled and %d records are unlabeld"% (labeled_data.count(), unlabeled_data.count()), log_file)

    # Split labeled data into a training and test datasets (on unique values of the DEDUPER_GROUND_TRUTH_FIELD such that all true pairs are together in their dataset)
    train_data, test_data = _split_on_ground_truth_field(labeled_data)
    log_line("Labeled data was split into %d records for training and %d records for testing" % (train_data.count(), test_data.count()), log_file)

    # Loop on all predicates that we want to try
    for predicate_function in settings['PREDICATE_FUNCTIONS']:

        log_line("\n***** Predicate function %s *************\n" % str(predicate_function), log_file)

        # Add the predicate key to training and test_data
        train_data = train_data.map(lambda x: utils.add_predicate_key(x, **predicate_function))
        test_data = test_data.map(lambda x: utils.add_predicate_key(x, **predicate_function))

        # Generate a new rdd with all intra-block pairs and the true value of whether or not they are matches (based on the ground truth field)
        train_pairs = (
            # Transform into tuples of the form (<key>, <value>) where key is the predicate and value is a list that will be extended with all elements of a block
            train_data.map(lambda x: (x['PredicateKey'], [x]))
                # Extend the list to get all dictionaries of a same block together
                .reduceByKey(lambda l1, l2 : l1 + l2)
                # Generate all pairs of records from each block : (d1, d2)
                .flatMap(utils.generate_pairs)
                # Determine if the pair is a match and use this as a key -> (<match>, (d1, d2))
                .map(lambda x: (utils.records_are_matches(x[0], x[1], settings['DEDUPER_GROUND_TRUTH_FIELD']), (x[0], x[1])))
                # Convert dictionaries into a list of distance measures (one for each DEDUPER_FIELD) -> (<match>, [0.5, 1, ...])
                .map(lambda x: (x[0], utils.dict_pair_2_distance_list(x[1][0], x[1][1], settings['DEDUPER_FIELDS'])))
                # Convert list of distances into SparseVectors -> (<match>, SparseVector)
                .map(lambda x: (x[0], SparseVector(n_deduper_fields, dict([(i, v) for i, v in enumerate(x[1]) if v is not None]))))
                # Convert tuples into LabeledPoints (LabeledPoint)
                .map(lambda x: LabeledPoint(x[0], x[1]))
        )
        n_true_matches = train_pairs.filter(lambda x: x.label == 1).count()
        n_true_no_match = train_pairs.filter(lambda x: x.label == 0).count()
        log_line("When taking all intra-block pairs, we get %d true matches and %d true no-match" % (n_true_matches, n_true_no_match), log_file)
        ratio = float(n_true_matches)/n_true_no_match
        # If the ratio is too unbalanced, balance it
        if ratio < 0.85 or ratio > 1.15:
            log_line("Intra-block pairs are too unbalanced, we will sample the biggest set to get approximately the same number of each type", log_file)
            label_with_too_many = 0 if n_true_no_match > n_true_matches else 1
            keep_all_label = 0 if label_with_too_many == 1 else 1
            train_pairs = (
                # Keep all of the smaller set
                train_pairs.filter(lambda x: x.label == keep_all_label)
                .union(
                    # Add a sample of the bigger set
                    train_pairs.filter(lambda x: x.label == label_with_too_many)
                    .sample(False, ratio, seed=settings['RANDOM_SEED'])
                )
            )
            n_true_matches = train_pairs.filter(lambda x: x.label == 1).count()
            n_true_no_match = train_pairs.filter(lambda x: x.label == 0).count()
            log_line("After sampling, intra-block pairs, we get %d true matches and %d true no-match" % (n_true_matches, n_true_no_match), log_file)
        else:
            log_line("These intra-block pairs are balanced enough so we will keep all of them", log_file)
        
        # Train a logistic regression
        log_line("Training a logistic regression...", log_file)
        logistic_regression = LogisticRegressionWithSGD.train(train_pairs)

        # ******* Training results **************

        log_line("\nResults when comparing training intra-block pairs only:", log_file)
        # Build a rdd or tuples of the form: (true_label, predicted_label) for train and test data
        train_results = train_pairs.map(lambda x: (x.label, logistic_regression.predict(x.features)))
        # Precision and recall on training data
        numerator, denominator, percentage = _get_precision(train_results)
        log_line("Intra-block precision on training data: %d/%d = %.2f%%" % (numerator, denominator, percentage), log_file)
        numerator, denominator, percentage = _get_recall(train_results)
        log_line("Intra-block recall on training data: %d/%d = %.2f%%" % (numerator, denominator, percentage), log_file)

        # ******* Test results **************

        # Generate random pairs instead of intra-block pairs until the number of true matches is big enough
        n_true_matches_in_test_pairs = 0
        curr_n_pairs_in_test = 0
        fraction = 0
        while n_true_matches_in_test_pairs < settings['MIN_TRUE_MATCHES_FOR_EVALUATION'] and fraction < 0.5:
            log_line("\nGenerating a random set of pairs for testing the model...\n", log_file)
            # Taking 2 samples whose size is the square root of the number of pairs we want and then excluding same-record pairs will give us a random sample of pairs of approximately the right size
            curr_n_pairs_in_test += N_PAIRS_TO_TEST
            fraction = float(sqrt(curr_n_pairs_in_test))/test_data.count()
            random_test_pairs = (
                test_data.sample(False, fraction, seed=settings['RANDOM_SEED'])
                    .map(lambda x: (True, x))
                    .join(
                        test_data
                        .sample(False, fraction, seed=settings['RANDOM_SEED'])
                        .map(lambda x: (True, x))
                    )
                .filter(lambda x: x[1][0] != x[1][1])
                # Only keep the tuple of 2 dictionaries
                .map(lambda x: x[1])
                # Determine if the pair is a match and use this as a key -> (<match>, (d1, d2))
                .map(lambda x: (utils.records_are_matches(x[0], x[1], settings['DEDUPER_GROUND_TRUTH_FIELD']), x))
                # Convert dictionaries into a list of distance measures (one for each DEDUPER_FIELD) -> (<match>, [0.5, 1, ...], (d1, d2))
                .map(lambda x: (x[0], utils.dict_pair_2_distance_list(x[1][0], x[1][1], settings['DEDUPER_FIELDS']), x[1]))
                # Convert list of distances into SparseVectors -> (<match>, SparseVector, (d1, d2))
                .map(lambda x: (x[0], SparseVector(n_deduper_fields, dict([(i, v) for i, v in enumerate(x[1]) if v is not None])), x[2]))
                # Convert tuples into LabeledPoints ->  (LabeledPoint, (d1, d2))
                .map(lambda x: (LabeledPoint(x[0], x[1]), x[2]))
                # Determine if the pair is in the same block or not -> (LabeledPoint, <same_block>)
                .map(lambda x: (x[0], utils.records_in_same_block(x[1][0], x[1][1])))
            )
            
            n_true_matches_in_test_pairs = random_test_pairs.filter(lambda x: x[0].label == 1).count()

        # Matches in random pairs will be very rare, make sure there are at least some of them..
        log_line("Number of same block pairs in the test set: %d" % random_test_pairs.filter(lambda x: x[1]).count(), log_file)
        log_line("Number of true matches in the test set: %d" % n_true_matches_in_test_pairs, log_file)
        if n_true_matches_in_test_pairs == 0:
            raise RuntimeError("Could not find enough true matches to test precision and recall on labeled data.")

        # Get results (<true_label>, <predicted_label>) for random_test_pairs
        test_results = random_test_pairs.map(lambda x: (x[0].label, _predict_extra_block_pair(x[0], x[1], logistic_regression)))
        
        # Precision and recall on test data
        log_line("\nResults when comparing test pairs:", log_file)
        numerator, denominator, percentage = _get_precision(test_results)
        log_line("Precision on test data (intra and extra block pairs): %d/%d = %.2f%%" % (numerator, denominator, percentage), log_file)
        numerator, denominator, percentage = _get_recall(test_results)
        log_line("Recall on test data (intra and extra block pairs): %d/%d = %.2f%%" % (numerator, denominator, percentage), log_file)
        log_line("\n\n", log_file)
Example #31
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import requests
import json
import utils

url = utils.API_URL
token = utils.get_api_key()
headers = utils.get_headers(token)

EXISTING_PKS = [] # If you've already filed with a given agency, or otherwise want to exclude it, include its ID here.


AGENCY_PKS = [248] # Agency ID 248 is a test agency under federal jurisdiction 10. This ID is subject to change, and will deduct requests from your account. Contact [email protected] and we'll add them back.

AGENCY_PKS = filter(lambda x: x not in EXISTING_PKS, AGENCY_PKS)
DOCS = """
A copy of your reports that are:
Annual
Monthly
Bimonthly
"""
TITLE = 'Records Request' # Customize here for your project

for agency_pk in AGENCY_PKS:
    # get the jurisdiction
    r = requests.get(url + 'agency/{}/'.format(agency_pk), headers=headers)
    jurisdiction_pk = r.json()['jurisdiction']

    print 'Filing for {}...'.format(r.json()['name'])
Example #32
async def haveibeensold(email: str) -> dict:
    data = {
        'email': email,
        'action': 'check'
    }
    try:
        async with aiohttp.request(method='POST', url=haveibeensold_url, data=data, headers=get_headers()) as resp:
            if resp.status == 200:
                return parse_resp(content=await resp.json(), email=email)
            else:
                await unexpected_status(resp=resp, service=__name__)
    except Exception as e:
        print_error(e, service=__name__)