Example n. 1
def crawl_by_keywords(keywords):
    connector = db.MongoConnector(config.DB_HOST, config.DB_USER_NAME, config.DB_PASSWORD, config.DB_NAME)
    backend = MongoQABackend(connector, config.QA_COLLECT_NAME)
    keywords = util.read_txt_lines(args.kw_file)
    keywords = util.expand_keywords(keywords, ['飲食'])
    kw_request = KeywordQueryRequest(util.get_browser_driver(config.DRIVER_PATH, config.ENV))
    asession = AsyncHTMLSession()
    for keyword in keywords:
        start_url = 'http://so.120ask.com/?kw=%s' % keyword
        current_url = start_url
        while True:
            page_src = kw_request.send(current_url)
            if page_src is None:
                break
            page = KeywordQueryPage(page_src)
            links = page.parse_question_links()
            qids = page.parse_question_ids()
            l = []
            for qid, link in zip(qids, links):
                cb = AsyncHealthPageCallback(qid, backend)
                arq = AsyncHealthQuestionRequest(asession, link, cb)
                l.append(arq)
            if len(l) > 0:
                asession.run(*[req.send for req in l])

            next_link = page.parse_next_page_link()
            if next_link is None:
                break
            current_url = urljoin(start_url,next_link)
Example n. 2
async def main():
    if not os.path.exists('./img'):
        os.mkdir('img')

    # sneaker_links_parser = Sneaker_Links("https://sneakerlinks.com", "test")
    # sneaker_links_parser.get_data()

    asession = AsyncHTMLSession()

    Solelinks_parser = Solelinks("https://vagu.space", "solelinks", asession)
    # await Solelinks_parser.get_data()
    # await task
    asession.run(Solelinks_parser.get_page)
Example n. 3
def find_department_of_keywords(keywords, filepath):
    kw_request = KeywordQueryRequest(
        util.get_browser_driver(config.DRIVER_PATH, config.ENV))
    asession = AsyncHTMLSession()
    f = open(filepath, 'w', encoding='utf-8')
    for keyword in keywords:
        start_url = 'http://so.120ask.com/?kw=%s' % (keyword)
        current_url = start_url

        page_src = kw_request.send(current_url)
        assert page_src is not None
        page = KeywordQueryPage(page_src)
        links = page.parse_question_links()
        qids = page.parse_question_ids()
        l = []
        for qid, link in zip(qids, links):
            cb = DepartmentOfKeywordCallback()
            arq = AsyncHealthQuestionRequest(asession, link, cb)
            l.append(arq)
        assert len(l) > 3
        res = asession.run(*[r.send for r in l])
        # the most common department among the questions is taken as the keyword's department
        c = Counter(res)
        department, url = c.most_common()[0][0]
        f.write('%s-->%s,%s\n' % (keyword, department, url))
Example n. 4
    def test_async_add_data(self):
        from requests_html import AsyncHTMLSession, HTMLSession
        session = HTMLSession()
        asession = AsyncHTMLSession()
        proxies = {}

        async def get_pythonorg():
            r = await asession.get('https://baidu.com/', proxies=proxies)
            print(r.text)

        async def get_reddit():
            r = await asession.get('https://sina.com/', proxies=proxies)
            print(r.text)

        async def get_google():
            r = await asession.get('https://163.com/', proxies=proxies)
            print(r.text)

        asession.run(get_pythonorg, get_reddit, get_google)
Example n. 5
async def search_rootme_user_challenges(username: str):
    url = f"https://www.root-me.org/{username}?inc=score"

    session = AsyncHTMLSession()

    async def get_profile():

        r = await session.get(url)
        data = {}

        data['score'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[1]/span/text()"
        )[0].split("\xa0")[0][1:]
        data['ranking'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[2]/span"
        )[0].text
        data['rank'] = r.html.xpath(
            "/html/body/div[1]/div/div[2]/main/div/div/div/div/div[2]/div[1]/div[3]/span"
        )[0].text

        categories_list = r.html.xpath(
            "/html/body/div/div/div[2]/main/div/div/div/div/div[2]")[0].find(
                "div")

        categories = {}

        for x in categories_list:
            category = x.find('div')[0]
            try:
                title = category.find('h4')[0].text.split('\n')[1]
                categories[title] = {
                    "percentage": category.find('h4')[0].text.split('\n')[0]
                }
                points, _, completion = category.find("span")[1].text.split(
                    '\xa0')
                categories[title]['points'] = points
                categories[title]['completion'] = completion
                categories[title]['challenges'] = {}
                challenges = category.find("ul")[0].find('li')
                for challenge in challenges:
                    categories[title]['challenges'][challenge.text[2:]] = {
                        # the leading character of the entry marks completion
                        'completed': challenge.text[0] == 'o'
                    }
                    categories[title]['challenges'][
                        challenge.text[2:]]['points'] = challenge.find(
                            'a')[0].attrs['title'].split(' ')[0]
            except Exception:
                # skip category blocks that don't match the expected layout
                pass
        data['challenges'] = categories
        return data

    return session.run(get_profile)[0]
Example n. 6
def test_async_run():
    asession = AsyncHTMLSession()

    async def test1():
        return await asession.get('https://xkcd.com/1957/')

    async def test2():
        return await asession.get('https://reddit.com/')

    async def test3():
        return await asession.get('https://smile.amazon.com/')

    r = asession.run(test1, test2, test3)

    assert len(r) == 3
    assert isinstance(r[0], HTMLResponse)
Example n. 7
def async_requests(RequestManagers):
    async def make_request(RequestManager, asession):
        rm_dict = RequestManager.dict()
        link = rm_dict['link']
        headers = rm_dict['headers']
        proxies = rm_dict['proxies']

        try:
            r = await asession.get(link, headers=headers, proxies=proxies)
            rm_dict['response'] = r
        except Exception:
            pass

        rm_dict['t1'] = datetime.now()
        RequestManager.set_dict(rm_dict)
        return RequestManager

    r1, r2, r3, r4, r5, r6, r7, r8 = LinkClient.getLinks()
    asession = AsyncHTMLSession()

    async def get_link1():
        return await make_request(r1, asession)

    async def get_link2():
        return await make_request(r2, asession)

    async def get_link3():
        return await make_request(r3, asession)

    async def get_link4():
        return await make_request(r4, asession)

    async def get_link5():
        return await make_request(r5, asession)

    async def get_link6():
        return await make_request(r6, asession)

    async def get_link7():
        return await make_request(r7, asession)

    async def get_link8():
        return await make_request(r8, asession)

    return asession.run(get_link1, get_link2, get_link3, get_link4, get_link5,
                        get_link6, get_link7, get_link8)
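The eight near-identical wrapper coroutines above could also be generated in a loop. A minimal sketch of that pattern follows; the names (fetch_all, make_task) are illustrative and it binds each link directly rather than going through a RequestManager:

from requests_html import AsyncHTMLSession


def fetch_all(links, headers=None, proxies=None):
    """Fetch every link concurrently and return the responses."""
    asession = AsyncHTMLSession()

    def make_task(link):
        # bind the current link into its own zero-argument coroutine function,
        # which is the shape AsyncHTMLSession.run() expects
        async def task():
            return await asession.get(link, headers=headers, proxies=proxies)
        return task

    return asession.run(*[make_task(link) for link in links])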
Example n. 8
def multi_request(urls, headers=None):
    if len(urls) == 0:
        return []
    if headers is None:
        headers = {}
    session = AsyncHTMLSession()

    scrape_fns= []
    for url in urls:
        async def get_site_content(url=url):
            return await session.get(url, headers=headers)

        scrape_fns.append(get_site_content)

    results = session.run(*scrape_fns)
    session.close()

    return results
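A short usage sketch for the helper above (the URLs and header are placeholders):

pages = multi_request(
    ["https://example.com", "https://example.org"],
    headers={"User-Agent": "Mozilla/5.0"},
)
for page in pages:
    print(page.url, page.status_code)

Note that, depending on the requests_html version, run() may return the responses in completion order rather than in the order the URLs were passed in.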
Example n. 9
def getnearbyweathers(url_list, weather_data):
    """
    Getting the weathers in multiple locations async - reducing delay time
    :param url_list: List of urls to get the weather data from
    :return: weather_data: List of dictionary objects containing the weather data
    """
    asession = AsyncHTMLSession()

    async def get_weather1():
        r = await asession.get(url_list[0])
        return r

    async def get_weather2():
        r = await asession.get(url_list[1])
        return r

    async def get_weather3():
        r = await asession.get(url_list[2])
        return r

    # run() expects the coroutine functions themselves, not already-created coroutines
    responses = asession.run(get_weather1, get_weather2, get_weather3)
    print(responses)
    for response in responses:
        weather_data.append(get_weather_data_from_url(response))

    return weather_data
Example n. 10
async def main():
    if not os.path.exists('./img'):
        os.mkdir('img')

    # sneaker_links_parser = Sneaker_Links("https://sneakerlinks.com", "test")
    # sneaker_links_parser.get_data()

    asession = AsyncHTMLSession()

    Solelinks_parser = Solelinks("https://vagu.space", "solelinks", asession)
    # await Solelinks_parser.get_data()
    # await task
    asession.run(Solelinks_parser.get_page)


asession.run(solelinks)

# asyncio.get_event_loop().run_until_complete(solelinks())

# asession = AsyncHTMLSession()

# async def get_qq():
#     r = await asession.get('https://vagu.space/')
#     await r.html.arender()
#     print(r.html.raw_html)

# async def get_toutiao():
#     r = await asession.get('https://www.toutiao.com/')
#     x = await r.html.arender()
#     print(x.html.raw_html)
Example n. 11
# Instanciate list for champion dictionaries
champlist = []
# Initialise dictionary list index
index = 0
# open async session
asession = AsyncHTMLSession()
while index < end:
    # Initialise secondary function parameters (Number of functions = chunk)
    get_resp1 = getfunc1(tierLinkList[index + 0]["url"])
    get_resp2 = getfunc2(tierLinkList[index + 1]["url"])
    get_resp3 = getfunc3(tierLinkList[index + 2]["url"])
    get_resp4 = getfunc4(tierLinkList[index + 3]["url"])
    get_resp5 = getfunc5(tierLinkList[index + 4]["url"])

    # Start async secondary functions
    resps = asession.run(get_resp1, get_resp2, get_resp3, get_resp4, get_resp5)

    # Process responses
    for i, r in enumerate(resps):
        cdir = get_champ(r, tierLinkList[index + i])
        champlist.append(cdir)
        log.info(f"{cdir.get('Name', 'Not Found')} ({d}) Done")

        # Temporary supplement information to create csv from largest dict items
        #   and smallest dict items to investigate any parsing errors
        maxtuple, mintuple = tmp_info(maxtuple, mintuple)
        d += 1
        # End of Temporary info

    index += chunk
rem = len(tierLinkList) % chunk
Example n. 12
# asynchronous requests
async def get_lagou():
    # await suspends here until the response arrives
    response = await asession.get(url=url_lagou)
    return response

async def get_boss():
    response = await asession.get(url_boss)
    return response

async def get_qcwy():
    response = await asession.get(url_qcwy)
    return response

# run the three coroutines concurrently
results = asession.run(get_lagou, get_boss, get_qcwy)
# print(results ,list(map(lambda x:x,results)))
for i in results:
    titles = i.html.xpath("/html/body/div[@class='dw_table']")
    for a in titles:
        title = a.xpath(".//a[@target='_blank']/@title")
        print(title)
    # also print the URL of the current response
    # print(i.html.url)

    # grab all of the URLs on the page
    # links = i.html.links
    # print(links)

    # absolute_links resolves relative URLs, much like response.urljoin()
    # absolute_links = i.html.absolute_links
Example n. 13
class Orunmila:
    def __init__(self, gitlabAddress: str) -> None:
        self._orunSession = AsyncHTMLSession(workers=20)
        self._gitlabAddress = gitlabAddress

        # do not stop the loop in self.getAllProjectsMetadata()
        self._dontStopLoop = True
        self._pagesCount = 0

        # raw data
        self.projectsMetadata = list()
        self.projectCommitsMetadata = list()

        # **start** Orunmila knows
        self._commitsByYear = dict()
        self._numberOfProjects = 0
        # **end** Orunmila knows

    #

    # secondary method
    async def _getProjectMetadata(self) -> None:
        gitlabPlatResponse = await self._orunSession.get(
            f"{self._gitlabAddress}/api/v4/projects?&per_page=100&page={self._pagesCount}"
        )

        if gitlabPlatResponse.json() == []:
            self._dontStopLoop = False
            return
        #

        for pMetadata in gitlabPlatResponse.json():
            self.projectsMetadata.append(dict(pMetadata))
        #
        self._pagesCount += 1

    #

    # secondary method
    async def _getCommitsMetadata(self) -> None:
        pageCount = 0
        gitlabPlatResponse = await self._orunSession.get(
            f"{self._gitlabAddress}/api/v4/projects/{self._projectCurrentTd}/repository/commits?&per_page=100&page={pageCount}"
        )
        while gitlabPlatResponse.json() != []:
            tmp_gitlabPlatResponse = gitlabPlatResponse.json()
            for pCommit in tmp_gitlabPlatResponse:
                try:
                    self.projectCommitsMetadata.append(dict(pCommit))
                except ValueError as error:
                    print(
                        f"Error getting commits: {error}, on repository id: {self._projectCurrentTd}"
                    )
                #
            #
            pageCount += 1
            gitlabPlatResponse = await self._orunSession.get(
                f"{self._gitlabAddress}/api/v4/projects/{self._projectCurrentTd}/repository/commits?&per_page=100&page={pageCount}"
            )
        #
        print(f"getting data of project {self._projectCurrentTd}")

    #

    def getAllProjectsMetadata(self) -> List:
        while self._dontStopLoop:
            self._orunSession.run(self._getProjectMetadata)
        #
        print(len(self.projectsMetadata))

        # cleanup
        self._pagesCount = 0
        self._dontStopLoop = True

        return self.projectsMetadata

    #

    def getAllCommitsMetadata(self) -> List:
        for project in self.projectsMetadata:
            self._projectCurrentTd = project["id"]
            self._orunSession.run(self._getCommitsMetadata)
        #

        # cleanup
        del self._projectCurrentTd

        return self.projectCommitsMetadata
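A minimal sketch of how the class above might be driven (the GitLab address is a placeholder):

orunmila = Orunmila("https://gitlab.example.com")
projects = orunmila.getAllProjectsMetadata()
commits = orunmila.getAllCommitsMetadata()
print(len(projects), len(commits))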
Example n. 14
    name, *courses_filtered = re.split("\n|,",
                                       teacher_section.find("p", first=True).text)

    courses = [course.strip() for course in courses_filtered if course]

    a_link = teacher_section.find("a", first=True)

    site = a_link.attrs.get("href") if a_link else None

    # return a tuple of the name, courses and site
    return (name, courses, site)


results = list(map(lambda url: get_page_function(url), urls))

results_html = asession.run(*results)

classes = {}

for result in results_html:

    response = result.html

    # splitting the thing up, getting rid of the start
    _, *response_td = response.find('td')

    for td in response_td:

        teacher_name, courses, site = process_teacher_courses(td)
        teacher = (teacher_name, site)
Example n. 15
            logo,
            elem["name"],
            elem["symbol"],
            # description,
            tags,
            status,
            elem["max_supply"],
            elem["circulating_supply"],
            elem["total_supply"],
            elem["quote"]["USD"]["market_cap"],
            elem["quote"]["USD"]["price"],
            elem["quote"]["USD"]["volume_24h"],
            elem["quote"]["USD"]["percent_change_1h"],
            elem["quote"]["USD"]["percent_change_24h"],
            elem["quote"]["USD"]["percent_change_7d"],
            elem["last_updated"],
        ]

        with open(f"crypto_asset_meta.csv", "a") as f1:
            csv_writer_1 = csv.writer(f1)
            csv_writer_1.writerow(asset_info)


if __name__ == '__main__':
    asession = AsyncHTMLSession()
    url_list = [
        "https://web-api.coinmarketcap.com/v1/cryptocurrency/listings/latest?convert=USD&cryptocurrency_type=all&limit=1000&sort=market_cap&sort_dir=desc&start=1",
    ]
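    # url=url captures each URL at lambda-definition time; without the default
    # argument every lambda would close over the same loop variable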
    asession.run(
        *[lambda url=url: get_asset_info(asession, url) for url in url_list])
Example n. 16
class LandChinaBot:
    info_all = []
    url = 'https://www.landchina.com/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        # 'Cookie': 'security_session_mid_verify=b92679e3c892fc921cb78030f1e86157',
        'Cookie':
        f'Hm_lvt_83853859c7247c5b03b527894622d3fa=1587194605,1587302414,1587386859,1587430790; ASP.NET_SessionId=uamiaoeymyazhuxclijuqxpm; security_session_verify=b85755a9c9b76a4d6195095ceaf8620d; security_session_high_verify=f06c25fa45c4419e26e53b51dff6f097; security_session_mid_verify=53623062e0ea9dd09f49fe550795e88a; Hm_lpvt_83853859c7247c5b03b527894622d3fa={int(time.time())}'
    }
    data = None

    def __init__(self, city_code, city_name):
        self.city_name = city_name
        self.getCityInfo(city_code, city_name)
        self.async_session = AsyncHTMLSession()

    def getCityInfo(self, city_code, city_name):
        # 894e12d9-6b0f-46a2-b053-73c49d2f706d: transfer announcements after 2011
        # city_info = unquote(f'894e12d9-6b0f-46a2-b053-73c49d2f706d:{city_code}' + u"▓~" + city_name)
        city_info = unquote(
            f'b4a43cbb-3c47-47ee-81cf-8b993d5bda89:{city_code}' + u"▓~" +
            city_name)
        city_info = city_info.encode("gb18030")
        self.data = {
            'TAB_QuerySubmitConditionData': city_info,
        }

    def to_csv(self, datas):
        """
        Logic for writing one record to the CSV file.
        :param datas: dict mapping column names to values
        :return:
        """
        import csv
        if not os.path.exists('./中国土地市场-出让公告2011前.csv'):
            names = [name for name in datas.keys()]
            with open(f'./中国土地市场-出让公告2011前.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                if isinstance(names, list):
                    # write a single row
                    if names:
                        writer.writerow(names)
                        f.close()
        # write the data values
        data = [i for i in datas.values()]
        try:
            with open(f'./中国土地市场-出让公告2011前.csv', 'a', newline='') as f:
                writer = csv.writer(f)
                if isinstance(data, list):
                    # write a single row
                    if data:
                        writer.writerow(data)
                        f.close()
                        return True
                    else:
                        return False
                else:
                    # print(type(data))
                    return False
        except Exception:
            raise

    @staticmethod
    def stringToHex():
        width = str(GetSystemMetrics(0))
        height = str(GetSystemMetrics(1))
        screendate = width + "," + height
        val = ""
        for i in range(len(screendate)):
            if val == "":
                val = binascii.b2a_hex(screendate[i].encode('utf-8'))
            else:
                val += binascii.b2a_hex(screendate[i].encode('utf-8'))
        return val.decode('utf-8')

    async def getCookie(self):
        response = await self.async_session.get(self.url, headers=self.headers)
        security_verify_data = self.stringToHex()
        link = f'{self.url}?security_verify_data={security_verify_data}'
        response = await self.async_session.get(link, headers=self.headers)

        # print(self.async_session.cookies)

    async def getInfo(self, session):
        # detail_link = []
        link = f'{self.url}default.aspx?tabid=324'
        for page in range(1, 2):
            self.data['TAB_QuerySubmitPagerData'] = str(page)
            # print(self.data)
            try:
                response = requests.post(link,
                                         data=self.data,
                                         headers=self.headers)
                # print(response.content.decode('gbk'))

                info = Selector(text=response.content.decode('gbk')).xpath(
                    '//*[@id="TAB_contentTable"]/tbody/tr')
                # info = response.html.xpath('//*[@id="TAB_contentTable"]/tbody/tr')
                for sub_raw in info[1:]:
                    info_basic = {}
                    basic_value = []
                    sub_list = sub_raw.xpath('td')
                    for i, info_sub in enumerate(sub_list):
                        if i == 0:
                            info_sub = info_sub.xpath(
                                'text()').extract()[0][:-1]
                            # print(info_sub, end=' ')
                            basic_value.append(info_sub)
                        elif i > 0 and i != 2:
                            info_sub = info_sub.xpath('text()').extract()[0]
                            # print(info_sub, end=' ')
                            basic_value.append(info_sub)
                        else:
                            link_sub = info_sub.xpath('a/@href').extract()[0]
                            # detail_link.append(link_sub)
                            try:
                                info_sub = info_sub.xpath(
                                    'a/text()').extract()[0]
                            except IndexError:
                                info_sub = info_sub.xpath(
                                    'a/span/@title').extract()[0]
                            # print(info_sub, end=' ')
                            basic_value.append(info_sub)
                    # print('\n')
                    details = await self.getDetail(link_sub,
                                                   self.async_session)

                    info_basic['城市'] = self.city_name
                    # info_basic['序号'] = basic_value[0]
                    info_basic['行政区'] = basic_value[1]

                    info_basic['供应公告标题'] = basic_value[2]
                    info_basic['公告编号'] = ''.join(
                        re.findall(r'[\s|\S]*(\([\s|\S]*\))',
                                   str(basic_value[2])))
                    info_basic['公告类型'] = basic_value[3]
                    info_basic['发布时间'] = basic_value[4]
                    info_basic['网上创建时间'] = basic_value[5]

                    # info_basic['地块公示信息'] = details
                    for det in details:
                        info_ba = {}
                        info_ba = {**info_basic, **det}
                        all_data = {
                            '城市': '',
                            '行政区': '',
                            '供应公告标题': '',
                            '公告编号': '',
                            '公告类型': '',
                            '发布时间': '',
                            '网上创建时间': '',
                            '宗地编号:': '',
                            '宗地面积:': '',
                            '宗地坐落:': '',
                            '出让年限:': '',
                            '容积率:': '',
                            '建筑密度(%):': '',
                            '绿化率(%):': '',
                            '建筑限高(米):': '',
                            '土地用途:': '',
                            '投资强度:': '',
                            '保证金:': '',
                            '起始价:': '',
                            '加价幅度:': '',
                            '挂牌开始时间:': '',
                            '挂牌截止时间:': '',
                            '提交书面申请地': '',
                            '缴纳竞买保证金截止时间': '',
                            '确认竞买资格时间': '',
                            '拍卖开始时间': '',
                            '拍卖挂牌进行地点': '',
                            '联系地址': '',
                            '联系人': '',
                            '联系电话': '',
                            '开户单位': '',
                            '开户银行': '',
                            '银行账号': ''
                        }
                        info_all = {**all_data, **info_ba}
                        print(info_all)
                        self.to_csv(info_all)
                        # self.info_all.append(info_all)
            except Exception as e:
                continue

        # return detail_link

    async def getDetail(self, link, session):
        link = f'{self.url}{link}'
        print(link)
        # link = 'https://www.landchina.com//DesktopModule/BizframeExtendMdl/workList/bulWorkView.aspx?wmguid=20aae8dc-4a0c-4af5-aedf-cc153eb6efdf&recorderguid=4eff5dbf-6bce-4cef-a3a8-61b51cd4dc21&sitePath='

        # response = await session.get(link, headers=self.headers)
        response = requests.get(link, headers=self.headers)
        # print(response.content.decode('gb18030'))

        ttf_url = re.findall(
            r"truetype[\s|\S]*styles/fonts/([\s|\S]*?)'[\s|\S]*woff'\)",
            response.content.decode('gb18030'))[0]
        # print(ttf_url)

        # ttf_content = await session.get(f'{self.url}/styles/fonts/{ttf_url}', headers=self.headers)
        ttf_content = requests.get(f'{self.url}/styles/fonts/{ttf_url}',
                                   headers=self.headers)
        new_font_name = f"{link.split('recorderguid=')[1]}.ttf"
        with open(new_font_name, 'wb') as f:
            f.write(ttf_content.content)

        info_text_all = Selector(text=response.content.decode(
            'gb18030')).xpath('//*[@id="tdContent"]//td/div')
        other_text = Selector(text=response.content.decode('gb18030')).xpath(
            '//*[@id="tdContent"]//td/p//text()').extract()
        # info = ''.join([str(ir).replace('\r\n','').replace(' ','') for ir in info_text])
        bg_all = []
        for info_t in info_text_all:
            info_text = info_t.xpath('table//text()').extract()
            info_temp = '#'.join(
                list(
                    filter(None, [
                        str(ir).replace(' ', '').replace('\t', '')
                        for ir in info_text
                    ])))
            bg_all.append(info_temp)
        info = '$$$$'.join(bg_all)
        other = list(
            filter(None, [
                str(ot).replace(' ', '').replace('\t', '') for ot in other_text
            ]))
        # info = [str(ir).replace(' ','').replace('\t','') for ir in info_text]
        # info = '#'.join(info)
        other = '&'.join(other)
        # replace the traditional Chinese characters
        info_all = replace_content(f'{info}****{other}',
                                   link.split('recorderguid=')[1])

        if not info_all:
            return False
        # print(info_all)
        names = [
            '宗地编号:', '宗地面积:', '宗地坐落:', '出让年限:', '容积率:', '建筑密度(%):', '绿化率(%):',
            '建筑限高(米):', '土地用途:', '投资强度:', '保证金:', '起始价:', '加价幅度:', '挂牌开始时间:',
            '挂牌截止时间:', '提交书面申请地', '缴纳竞买保证金截止时间', '确认竞买资格时间', '拍卖开始时间',
            '拍卖挂牌进行地点', '联系地址', '联系人', '联系电话', '开户单位', '开户银行', '银行账号'
        ]

        keys = [
            '宗地编号', '宗地面积', '宗地坐落', '出让年限', '容积率', '建筑密度', '绿化率', '建筑限高',
            '土地用途', '投资强度', '保证金', '起始价', '加价幅度', '挂牌开始时间', '挂牌截止时间',
            '提交书面申请地', '缴纳竞买保证金截止时间', '确认竞买资格时间', '拍卖开始时间', '拍卖挂牌进行地点', '联系地址',
            '联系人', '联系电话', '开户单位', '开户银行', '银行账号'
        ]

        info = info_all.split('****')[0]  # table section
        other = info_all.split('****')[1]  # other (non-table) content

        infos = list(filter(None, info.split('$$$$')))
        # infos = info.split('$$$$')
        # infos_all = []

        # logic for parsing multiple tables
        content_all = []
        for info in infos:
            result_info = info.split('#')
            # print(result_info)
            # infos_all.append(result_info)
            # append an empty element to avoid index errors during parsing
            result_info.append('')

            # table-parsing logic
            content_dict = dict()
            for i, inf in enumerate(result_info):
                if inf in names:
                    if result_info[i + 1] in names:
                        content_dict[inf] = ''
                    # elif inf in ['用途名称', '面积']:
                    #     if inf == '用途名称':
                    #         yt = re.findall(r'用途名称#面积#([\u4E00-\u9FA5]+).*?#投资强度', info)
                    #         if yt:
                    #             content_dict[inf] = yt[0]
                    #         else:
                    #             content_dict[inf] = ''
                    #     else:
                    #         mj = re.findall(r'用途名称#面积#.*?([\d|\.|#]+?)#投资强度', info)
                    #         if mj:
                    #             content_dict[inf] = mj[0][1:]
                    #         else:
                    #             content_dict[inf] = ''
                    else:
                        content_dict[inf] = str(result_info[i + 1])
                else:
                    pass

            # non-table information
            content_dict['提交书面申请地'] = ''.join(
                re.findall(r'五、申请人可于[\s|\S]*&到&([\s|\S]*?)&向我局提交书面申请', other))
            content_dict['缴纳竞买保证金截止时间'] = ''.join(
                re.findall(r'竞买保证金的截止时间为&([\s|\S]*?)&', other))
            content_dict['确认竞买资格时间'] = ''.join(
                re.findall(r'具备申请条件的,我局将在&([\s|\S]*?)&前确认其竞买资格', other))
            try:
                content_dict['拍卖开始时间'] = ''.join(
                    re.findall(r'拍卖活动定于&([\s|\S]*?)&在&', other))
                content_dict['拍卖挂牌进行地点'] = ''.join(
                    re.findall(r'&在&([\s|\S]*?)&进行', other))
            except:
                # &号地块:&2020年05月18土09时30分&至&
                # content_dict['拍卖开始时间'] = ''
                # content_dict['拍卖进行地点'] = ''
                # content_dict['挂牌开始时间'] = ''.join(re.findall(r'&号地块:&([\s|\S]*?)&至&', other))
                # content_dict['挂牌进行地点'] = ''.join(re.findall(r'&在&([\s|\S]*?)&进行', other))
                pass

            content_dict['联系地址'] = ''.join(
                re.findall(r'联系地址:([\s|\S]*?)&', other))
            content_dict['联系人'] = ''.join(
                re.findall(r'&联系人:([\s|\S]*?)&', other))
            content_dict['联系电话'] = ''.join(
                re.findall(r'&联系电话:([\s|\S]*?)&', other))
            content_dict['开户单位'] = ''.join(
                re.findall(r'&开户单位:([\s|\S]*?)&', other))
            content_dict['开户银行'] = ''.join(
                re.findall(r'&开户银行:([\s|\S]*?)&', other))
            content_dict['银行账号'] = ''.join(
                re.findall(r'&银行帐号:([\s|\S]*)', other))

            content_dict['内容链接'] = link
            content_dict['主键MD5'] = self.to_md5(str(link))
            content_dict['爬取时间'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
            content_all.append(content_dict)

            # each table yields one record

        return content_all

    async def run(self):
        await self.getCookie()
        await self.getInfo(self.async_session)

    def to_md5(self, txt):
        import hashlib
        m = hashlib.md5()
        m.update(txt.encode())
        return m.hexdigest()

    def main(self):
        self.async_session.run(self.run)
Example n. 17
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
__Author__ = "Alvin Liu"

'http://www-mipengine-org.mipcdn.com/i/p3.manhuapan.com/2020/12/11211014200587.jpg'
'http://www-mipengine-org.mipcdn.com/i/p3.manhuapan.com/2020/12/11211014200599.jpg'

from requests_html import AsyncHTMLSession
from requests_html import HTMLSession

async_session = AsyncHTMLSession()


async def fetchBaidu():
    """
    Scrape data from Baidu and check which fields the interface returns.
    """
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0'
    result = await async_session.get(
        'https://manhua.fzdm.com/2/998/index_1.html',
        headers={'user-agent': ua})
    for i in result.html.find('img'):
        # display the scraped HTML content
        print(i)


# fetchBaidu()
async_session.run(fetchBaidu)
Example n. 18
import time
from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()


async def delay_1():
    r = await asession.get('https://httpbin.org/delay/1')
    return r


async def delay_2():
    r = await asession.get('https://httpbin.org/delay/2')
    return r


async def delay_3():
    r = await asession.get('https://httpbin.org/delay/3')
    return r


t1 = time.perf_counter()

results = asession.run(delay_1, delay_2, delay_3)

for result in results:
    print(result.html.url)

t2 = time.perf_counter()

print(f'finished in {round(t2-t1, 2)}')
Example n. 19
	time1 = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
	return r_crude.html.xpath('//*[@id="quote-header-info"]/div[3]/div/div/span[1]')[0].text, "Crude", time1

async def get_gold():
	r_gold = await asession.get("https://finance.yahoo.com/quote/GC=F?p=GC=F")
	ts = time.time()
	time1 = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
	return r_gold.html.xpath('//*[@id="quote-header-info"]/div[3]/div/div/span[1]')[0].text, "Gold", time1

conn = sqlite3.connect("portfoliostocks.db")
print(type(conn))
cur = conn.cursor()
# cur.execute("""CREATE TABLE stocks_prices(
# 			stock_name text,
# 			price REAL,
# 			time1 text)""")


for i in range(100):

	results = asession.run(get_snp, get_dow, get_nasdaq, get_crude, get_gold)
	for result in results:
		conn.execute("INSERT INTO stocks_prices VALUES (?,?,?)", (result[1], result[0], result[2]))
		print(result[1], result[0], result[2])
	time.sleep(20)
conn.commit()
conn.close()



Example n. 20
class LandChinaBot:
    info_all = []
    url = 'https://www.landchina.com/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        # 'Cookie': 'security_session_mid_verify=b92679e3c892fc921cb78030f1e86157',
    }
    data = None

    def __init__(self, city_code, city_name):
        self.getCityInfo(city_code, city_name)
        self.async_session = AsyncHTMLSession()

    def getCityInfo(self, city_code, city_name):
        city_info = unquote(
            f'42ad98ae-c46a-40aa-aacc-c0884036eeaf:{city_code}' + u"▓~" +
            city_name)
        city_info = city_info.encode("gb18030")
        self.data = {
            'TAB_QuerySubmitConditionData': city_info,
        }

    @staticmethod
    def stringToHex():
        width = str(GetSystemMetrics(0))
        height = str(GetSystemMetrics(1))
        screendate = width + "," + height
        val = ""
        for i in range(len(screendate)):
            if val == "":
                val = binascii.b2a_hex(screendate[i].encode('utf-8'))
            else:
                val += binascii.b2a_hex(screendate[i].encode('utf-8'))
        return val.decode('utf-8')

    async def getCookie(self):
        response = await self.async_session.get(self.url, headers=self.headers)
        security_verify_data = self.stringToHex()
        link = f'{self.url}?security_verify_data={security_verify_data}'
        response = await self.async_session.get(link, headers=self.headers)
        # print(self.async_session.cookies)

    async def getInfo(self, session):
        # detail_link = []
        link = f'{self.url}default.aspx?tabid=263'
        response = await session.post(link,
                                      data=self.data,
                                      headers=self.headers)
        # print(response.text)
        info = response.html.xpath('//*[@id="TAB_contentTable"]/tbody/tr')
        for sub_raw in info[1:]:
            info_basic = {}
            basic_value = []
            sub_list = sub_raw.xpath('//td')
            for i, info_sub in enumerate(sub_list):
                if i != 2:
                    info_sub = info_sub.xpath('//text()')[0]
                    # print(info_sub, end=' ')
                    basic_value.append(info_sub)
                else:
                    link_sub = info_sub.xpath('//a/@href')[0]
                    # detail_link.append(link_sub)
                    try:
                        info_sub = info_sub.xpath('//a/text()')[0]
                    except IndexError:
                        info_sub = info_sub.xpath('//a/span/@title')[0]
                    # print(info_sub, end=' ')
                    basic_value.append(info_sub)
            # print('\n')
            details = await self.getDetail(link_sub, self.async_session)
            info_basic['序号'] = basic_value[0][:-1]
            info_basic['行政区'] = basic_value[1]
            info_basic['土地坐落'] = basic_value[2]
            info_basic['总面积'] = basic_value[3]
            info_basic['土地用途'] = basic_value[4]
            info_basic['供应方式'] = basic_value[5]
            info_basic['签订日期'] = basic_value[6]
            info_basic['供地结果信息'] = details
            self.info_all.append(info_basic)
        # return detail_link

    async def getDetail(self, link, session):
        link = f'{self.url}{link}'
        # print(link)
        response = await session.get(link, headers=self.headers)
        # print(response.text)
        info = response.html.xpath(
            '//*[contains(@id, "mainModuleContainer_1855_1856_ctl00_ctl00_p1_")]/text()'
        )
        # print(info)
        if not info:
            return False
        info_new = ''
        pay_i = right_i = None
        # work out the actual value that corresponds to '土地来源' (land source)
        # drop empty values and convert the data into '#'-separated key/value pairs
        for i, info_sub in enumerate(info):
            if '土地来源' in info_sub:
                if float(info[i + 1]) == float(info[i - 1]):
                    info[i + 1] = '现有建设用地'
                elif float(info[i + 1]) == 0:
                    info[i + 1] = '新增建设用地'
                else:
                    info[i + 1] = '新增建设用地(来自存量库)'
            elif '分期支付约定' in info_sub:
                pay_i = i
            elif '土地使用权人' in info_sub:
                right_i = i
            if info_sub != '\xa0':
                info_sub = f'"{info_sub}"'
                if ':' in info_sub or ':' in info_sub:
                    info_sub = f'#{info_sub[:-2]}":'
                # collect the actual value for '分期支付约定' (installment payment terms)
                if pay_i == i:
                    info_sub = info_sub + '['
                if not right_i and pay_i and i > pay_i:
                    info_sub = info_sub + ','
                if right_i == i:
                    info_sub = ']' + info_sub
                info_new += info_sub
        #
        info_new = info_new.split('#')
        # collect the actual value for '约定容积率' (agreed plot ratio)
        volume_i = info_new.index('"约定容积率":')
        if info_new[volume_i + 1][-1] == ':':
            info_new[volume_i + 1] += '""'
        info_new[volume_i + 1] = '{' + info_new[volume_i + 1] + ','
        if info_new[volume_i + 2][-1] == ':':
            info_new[volume_i + 2] += '""'
        info_new[volume_i + 2] = info_new[volume_i + 2] + '}'
        info_new[
            volume_i] = f'{info_new[volume_i]}{info_new[volume_i+1]}{info_new[volume_i+2]}'
        info_new.pop(volume_i + 1)
        info_new.pop(volume_i + 1)
        # fill in missing values and assemble the dict
        info = '{'
        for i, info_sub in enumerate(info_new[1:]):
            if len(info_sub) > 1 and info_sub[-1] == ':':
                info_sub += '""'
            info += f'{info_sub},'
        info += '}'
        info = eval(info)
        # collect the actual value for '分期支付约定' (installment payment terms)
        pay_info = info['分期支付约定'][4:]
        if pay_info:
            pay_info_new = '['
            for i, info_sub in enumerate(pay_info):
                info_sub = f"'{info_sub}',"
                if re.match(r"^\'\d+\',$", info_sub):
                    info_sub = '[' + info_sub
                    if i > 1:
                        pay_info_new = pay_info_new[:-1] + "],"
                pay_info_new += info_sub
            pay_info_new = pay_info_new[:-1] + "]]"
            pay_info_new = eval(pay_info_new)
            # remove duplicates
            info_index = None
            pay_info = []
            for info_sub in pay_info_new:
                if info_index != info_sub[0]:
                    # pad missing fields
                    info_pay = {}
                    for i in range(4 - len(info_sub)):
                        info_sub.append('')
                    info_pay['支付期号'] = info_sub[0]
                    info_pay['约定支付日期'] = info_sub[1]
                    info_pay['约定支付金额(万元)'] = info_sub[2]
                    info_pay['备注'] = info_sub[3]
                    pay_info.append(info_pay)
                    info_index = info_sub[0]
            info['分期支付约定'] = pay_info
        else:
            info['分期支付约定'] = []
        # print(info, '\n\n')
        return info

    async def run(self):
        await self.getCookie()
        await self.getInfo(self.async_session)
        for info_sub in self.info_all:
            print(info_sub, '\n\n')

    def main(self):
        self.async_session.run(self.run)
Example n. 21
from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()
url = 'http://1463-50e62051-2d46-4d20.nss.ctfer.vip:9080/shop?page={}'
find_str = 'lv6.png'
start, end = 0, 200


async def download(link, text):
    res = await asession.get(link)
    if text in res.text:
        print(link)


if __name__ == "__main__":
    lst = [url.format(i) for i in range(start, end)]
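    # x=x binds each URL at definition time; without the default argument every
    # lambda would see only the final value of x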
    funcs = [lambda x=x: download(x, find_str) for x in lst]
    asession.run(*funcs)
Example n. 22
    ascdep_date = ascdep_url[i]
    asession = AsyncHTMLSession()

    async def getasc():
        r = await asession.get(
            f"https://booking.maykenbel.com/?chain=19159&template=maykenbel&shell=MKNBL2018&start=availresults&brand=maykenbe&currency=GBP&lang=1&arrive={ascdate[5:7]}%2F{ascdate[8:10]}%2F{ascdate[:4]}&depart={ascdep_date[5:7]}%2F{ascdep_date[8:10]}%2F{ascdep_date[:4]}&hotel=70825&dpArrive={ascdate[8:10]}%2F{ascdate[5:7]}%2F{ascdate[:4]}&dpDepart={ascdep_date[8:10]}%2F{ascdep_date[5:7]}%2F{ascdep_date[:4]}&rooms=1&adult=1&promo="
        )
        return r

    async def getchc():
        r = await asession.get(
            f"https://secure.chevalcollection.com/convert/site/Cheval%20Harrington%20Court[wsJsZoGCLg62hr_WrMSMy9dIwRklPItcNUhU30wAXMo]/en/results.php?checkin={chcstart_url[i]}&nights={chc_nights[i]}&currency=GBP&resultViewType=sda&viewtype=rateroom&partya=0"
        )
        return r

    results = asession.run(getasc, getchc)

    for result in results:
        print(result)
        match = re.search("cheval", result.html.url)

        print("Date " + ascdate)

        if match:

            print("Cheval Harrington Court")
            try:
                discchc1bed = result.html.find("#mbprice_4932506_15069_123",
                                               first=True).text
                if discchc1bed:
                    chc1bed = chc_calc(discchc1bed, chc_nights[i])
                    "labels": labels,
                    "url": url,
                    "img_url": img_url
                }
                f.write(title + des + level + users + labels + url + img_url +
                        '\n')
    # if c_list:
    #     items = c_list.find('')
    #     for item in items:
    #         title = item.find(".course-card-name").text()  # find the title
    #         des = item.find(".course-card-desc").text()
    #         level = item.find(".course-card-info>span:eq(0)").text()
    #         users = item.find(".course-card-info>span:eq(1)").text()
    #         labels = item.find(".course-label").text().split(" ")
    #         url = item("https://www.imooc.com/learn/", item.find("a").attr("href")) # join the URL
    #         img_url = item("https://img3.mukewang.com/", item.find("img").attr("src"))  # join the URL
    #         dict = {
    #             "title":title,
    #             "des":des,
    #             "level":level,
    #             "users":users,
    #             "labels":labels,
    #             "url":url,
    #             "img_url":img_url
    #         }
    #         print(dict)


if __name__ == '__main__':
    result = asession.run(get_html)
Example n. 24
        for l in f:
            l = util.rlstrip(l)
            m = re.search(r'(.*)-->(.*),(.*)', l)
            kw, d, url = m.group(1), m.group(2), m.group(3)
            yield kw, d, url
  

if __name__ == '__main__':
    print('Start script to crawl departments')
    connector = db.MongoConnector(config.DB_HOST, config.DB_USER_NAME, config.DB_PASSWORD, config.DB_NAME)
    backend = MongoQABackend(connector, config.QA_COLLECT_NAME)
    r = DepartmentListRequest(util.get_browser_driver(config.DRIVER_PATH, config.ENV))
    asession = AsyncHTMLSession()
    for kw, dep, url in read_keyword_urls(args.url_file):
        print('crawl %s-->%s' % (kw, url))
        current_url = url
        while current_url is not None:
            page_src = r.send(current_url)
            page = PageUnderDepartment('呼吸內科', page_src)
            qurls = page.parse_questions()
            l = []
            for qid, link in qurls:
                cb = AsyncHealthPageCallback(qid, backend)
                arq = AsyncHealthQuestionRequest(asession, link, cb)
                l.append(arq)
            if len(l) > 0:
                asession.run(*[req.send for req in l])
            next_link = page.parse_next_link()
            current_url = next_link

Example n. 25
from json import loads
from base64 import b64decode
from requests_html import AsyncHTMLSession

a = AsyncHTMLSession()

url = 'https://www.proxyfish.com/proxylist/server_processing.php?type='


async def 一():
    return await a.get(url + 'HTTP')  # ~1.5k


async def 二():
    return await a.get(url + 'HTTPS')  # ~500


results = a.run(一, 二)

for result in results:

    req = result.json()['data']  # request json data

    data = b64decode(req)  # decode data

    d = loads(data)  # load data

    for p in d:  # only proxy ip and port
        print(p[1] + ':' + p[2])
Example n. 26
class asyncio_crawler(object):
    def __init__(self, url, depth=10, numworkers=None):
        # create an async requests_html session
        self.asession = AsyncHTMLSession(workers=numworkers)
        if numworkers:
            self.numworkers = numworkers
        else:
            self.numworkers = multiprocessing.cpu_count() * 5
        self.url = url
        self.host = urlparse.urlparse(url).netloc
        self.regex = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,6}\b",
                                re.IGNORECASE)
        self.depth = depth
        self.runtime = 0
        self.gen_link_counter = 0
        self.mails = set()
        self.links = set()
        self.links_done = set()
        LOGGER.info('-----> ASYNCIO_CRAWLER INITIALIZED <-----')

    async def Task(self):
        '''This function will be used to generate tasks for the asyncio call'''
        if self.links:
            #TO DO: Check if this is also possible with asyncio.Queue
            url = self.links.pop()
            #LOGGER.info(url)
            self.links_done.add(url)
            # use the async html_request framework to do requests, get html text
            # and to generate new links
            try:
                response = await self.asession.get(url)
                for link in response.html.absolute_links:
                    if self.host in link:
                        self.links.add(link)
                # use a regular expression to scan for mail addresses
                for mail_addr in re.findall(self.regex,
                                            response.html.full_text):
                    self.mails.add(mail_addr)

            except Exception as e:
                # our request has failed, but we don't care too much
                LOGGER.warning(e)

    def crawl(self):
        '''main loop'''
        #use the requests_html module to get links and mails
        # find all links on the start url
        self.links.add(self.url)
        while self.links and self.gen_link_counter < self.depth:
            # build one task per pending link for this generation
            Task_list = [self.Task for _ in self.links]
            # run the asyncio tasks with the run command of async requests_html
            self.asession.run(*Task_list)
            self.links.difference_update(self.links_done)
            self.gen_link_counter += 1

    def run(self):
        '''method to be called from executable'''
        LOGGER.info('-----> RUN ASYNCIO_CRAWLER <-----')
        starttime = time.time()
        try:
            self.crawl()
        except KeyboardInterrupt:  # ..abort crawler using CTRL+C
            pass
        except Exception:
            raise
        self.runtime = time.time() - starttime
        self.report()

    def report(self):
        #report all results to console
        LOGGER.info('-----> ASYNCIO_CRAWLER FINISHED <-----')
        LOGGER.info('-----> REPORT FOLLOWS <-----')
        LOGGER.info('-----> Links done: <-----')
        for link in self.links_done:
            LOGGER.info(link)
        LOGGER.info('-----> Got mails: <-----')
        for mail in self.mails:
            LOGGER.info(mail)
        LOGGER.info('-----> Mails found: <-----')
        LOGGER.info(len(self.mails))
        LOGGER.info('-----> Finished linklist: <-----')
        empty = not bool(self.links)
        LOGGER.info(empty)
        LOGGER.info('-----> Generation of links: <-----')
        LOGGER.info(self.gen_link_counter)
        LOGGER.info('-----> Links done: <-----')
        LOGGER.info(len(self.links_done))
        LOGGER.info('-----> Number of Workers: <-----')
        LOGGER.info(self.numworkers)
        LOGGER.info('-----> ASYNCIO_Crawler runtime [s]: <-----')
        LOGGER.info(self.runtime)
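A minimal sketch of how the crawler above might be driven (the start URL, depth and worker count are placeholders):

crawler = asyncio_crawler("https://example.com", depth=3, numworkers=20)
crawler.run()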
Example n. 27
from requests_html import AsyncHTMLSession

a = AsyncHTMLSession()  # session appears to be currently required for a single get request with this library?


async def 一():
    return await a.get('https://free-proxy-list.net')  # 300


async def 二():
    return await a.get('https://www.us-proxy.org')  # 200


async def 三():
    return await a.get('https://www.sslproxies.org')  # 100


results = a.run(一, 二, 三)

for result in results:

    cells = result.html.find('td')

    p = ''

    for cell in cells:

        c = cell.text

        if not c.lower().islower(
        ):  # lowercase all letters and then check if islower to determine if the cell contains letters (only ip and port cells will remain)

            if '.' in c:
Example n. 28
def main():
    session = AsyncHTMLSession()
    tasks = [ping_creator(session, url) for url in load(file)]
    session.run(*tasks)
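ping_creator, load and file are not shown in this snippet. One plausible shape for ping_creator, purely as an illustration, is a factory that returns the zero-argument coroutine function session.run() expects:

def ping_creator(session, url):
    # hypothetical sketch: wrap a single URL in its own coroutine function
    async def ping():
        response = await session.get(url)
        print(url, response.status_code)
        return response
    return ping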
Example n. 29
async def get_delay1():
    r = await async_session.get('https://httpbin.org/delay/1')
    return r


async def get_delay2():
    r = await async_session.get('https://httpbin.org/delay/2')
    return r


async def get_delay3():
    r = await async_session.get('https://httpbin.org/delay/3')
    return r


t1 = time.perf_counter()

results = async_session.run(get_delay1, get_delay2, get_delay3)

# Each item in the results list is a response object
# and can be interacted with as such

for result in results:
    response = result.html.url
    print(response)

t2 = time.perf_counter()

print(f'Asynchronous: {t2 - t1} seconds')
Example n. 30
    assert numbers, 'Something went wrong'

    functions = []
    for number in numbers:
        for region in REGIONS:

            async def fetch_page(region=region, number=number):
                response = await session.get(f'{URL}{region}/{number}')
                return response

            functions.append(fetch_page)
    return functions


session = AsyncHTMLSession()
results = session.run(*generate_requests(62))
for page in results:
    games = page.html.find('.bracket-game')
    for game in games:
        players = {}
        p_id = 0

        # need very specific selector to prevent bracket-popup elements from being included
        for player_cell in game.find(
                'div.bracket-game > div > div:first-child'):
            if 'bracket-popup' in player_cell.attrs['class']:
                continue

            # increment p_id after checking for invalid elements
            p_id += 1