Esempio n. 1
0
def main_handler():
    """Crawl both listing pages and persist the accumulated results.

    Loads the previously saved ``result`` mapping from ``ZK_TMP_FILE`` when
    that file exists, applies ``deal_post`` to the thread list of
    ``ZK_BASE_URL`` and ``deal_post_tuan`` to the list items of
    ``TUAN_BASE_URL``, and always writes ``result`` back to ``ZK_TMP_FILE``
    in the ``finally`` clause, even when crawling failed.

    Raises:
        Exception: any error raised while loading or crawling is logged with
            its traceback and re-raised after the results have been saved.
    """
    global result, last_result
    try:
        logging.info('临时文件:' + ZK_TMP_FILE)
        if os.path.exists(ZK_TMP_FILE):
            with open(ZK_TMP_FILE, 'r', encoding=UTF8_ENCODING) as f:
                result = json.load(f)
        else:
            logging.info('临时文件不存在')
        logging.info('当前存储数据量:' + str(len(result)))
        # Hot threads on the first listing page.
        d = py(ZK_BASE_URL, headers=REQUEST_HEADERS, encoding=GBK_ENCODING)
        d('#threadlisttableid tbody').each(deal_post)
        # Items on the second listing page.
        d = py(TUAN_BASE_URL,
               headers=REQUEST_HEADERS,
               encoding=GB2312_ENCODING)
        d('.list-group > .list-group-item').each(deal_post_tuan)
    except Exception as ex:
        logging.exception('主任务运行异常:' + str(ex))
        # A bare raise re-raises the active exception unchanged.
        raise
    finally:
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(BASE_DIR, exist_ok=True)
        # Persist the results.
        with open(ZK_TMP_FILE, 'w', encoding=UTF8_ENCODING) as f:
            # Once more than 500 entries are stored, move them to
            # last_result and start over with an empty mapping.
            if len(result) > 500:
                last_result.clear()
                last_result = result
                result = dict()
            logging.info('保存数据结果')
            json.dump(result, f)
Esempio n. 2
0
def get_post_info(post_id, title=None, time=None):
    """Fetch one post page and extract its title, time, content and images.

    Args:
        post_id: value interpolated into ``ZK_POST_URL`` to build the URL.
        title: fallback title used when the page has no ``#thread_subject``
            title attribute.
        time: fallback time used in that same case.

    Returns:
        dict with the keys ``url``, ``images``, ``title``, ``time`` and
        ``content``.
    """
    info = dict()
    info['url'] = ZK_POST_URL % post_id
    logging.info('爬取链接:' + info['url'])
    d = py(info['url'], headers=REQUEST_HEADERS, encoding=GBK_ENCODING, timeout=REQUEST_TIMEOUT)
    # Post title.
    post_title = d('#thread_subject').attr('title')
    # Post time.
    post_time = d('.pti:first>.authi:first').find('em:first').text().replace('发表于 ', '')
    # Post images.
    info['images'] = list()
    if post_title is None:
        # No thread subject on the page: fall back to the caller's values.
        info['title'] = title
        info['time'] = time
        info['content'] = d('#messagetext>p:first').text()
        return info

    info['title'] = post_title
    info['time'] = post_time
    ele = d('#postlist>div:first').find('tr:first').find('.t_f').clone()
    info['content'] = ele.remove('ignore_js_op').text()
    # Replace link texts in the content with their target URLs.
    mapping = get_url_mapping(ele)
    for href in mapping:
        info['content'] = info['content'].replace(href, mapping[href])
    for e in d('.t_fsz:first').find('ignore_js_op').find('img'):
        e = py(e)
        if e.attr('aid') is None:
            continue
        src = e.attr('file')
        # A missing "file" attribute would crash the regex, and the dots
        # must be escaped so that only real extensions match.
        if src is None or re.search(r'\.(?:jpe?g|png)', src, re.I) is None:
            continue
        info['images'].append(src)
    return info
Esempio n. 3
0
def get_one_page_book_list(url, **kwargs):
    """Parse one listing page into a list of book dictionaries.

    Args:
        url: address of the listing page.
        **kwargs: an optional ``session`` object; when it is truthy the page
            is fetched with ``session.get(url)``, otherwise with
            ``_get_response_content(url)``.

    Returns:
        list of dicts with the keys ``id``, ``url``, ``title``,
        ``cover_img_url``, ``intro``, ``tags`` and ``comment``.
    """
    session = kwargs.get('session')
    if session:
        pq = py(session.get(url).content)
    else:
        pq = py(_get_response_content(url))

    book_list = []
    for book in pq('.subject-item').items():
        book_title = book('.info h2').text().strip() + book(
            '.info h2 span').text().strip()
        book_url = book('.info h2 a').attr('href')
        # The id is the first all-digit path segment of the link; an entry
        # without a link keeps an empty id instead of raising.
        book_id = next(
            (segment for segment in (book_url or '').split('/')
             if re.match(r'^\d+$', segment)),
            '')
        tags_text = book('.tags').text()
        book_list.append({
            'id': book_id,
            'url': book_url,
            'title': book_title,
            'cover_img_url': book('.pic img').attr('src'),
            'intro': book('.pub').text(),
            # Keep only the part that follows the first colon, if any.
            'tags': tags_text.split(':')[1].strip() if ':' in tags_text else '',
            'comment': book('.comment').text()
        })
    return book_list
Esempio n. 4
0
 def parse(self, response):
     """Follow every real link found under ``.info a`` in the response.

     Yields one ``response.follow`` request per link, with
     ``self.parse_item`` as the callback.
     """
     result = response.text
     self.log(response.url)
     for anchor in py(result).find('.info a'):
         href = py(anchor).attr('href')
         # Skip anchors that have no href at all as well as the
         # JavaScript placeholder; neither is a followable target.
         if href is None or href == 'javascript:void(0)':
             continue
         yield response.follow(href, callback=self.parse_item)
def test_get_ip():
    """get_ip must return the expected address for both HTML fixtures."""
    ip_with_space = crawl_goubanjia.get_ip(py(with_space_html))
    ip_without_space = crawl_goubanjia.get_ip(py(without_space_html))

    assert ip_with_space == actual_ip
    assert ip_without_space == actual_ip
Esempio n. 6
0
def jdzuLogin(username, password, verifycode, file_format):
    """Log in, scrape the transcript table, export it and send a mail.

    Args:
        username: account name; also passed on to the export helpers.
        password: only used as input to ``getEncoded``; the form field
            ``userPassword`` itself is posted empty.
        verifycode: captcha text posted as ``RANDOMCODE``.
        file_format: ``"excel"`` or ``"pdf"``; any other value skips the
            export step.

    Returns:
        The text of ``#showMsg`` when the login response contains one;
        otherwise ``None`` after exporting and mailing.
    """
    form = {
        "userAccount": username,
        "userPassword": "",
        "RANDOMCODE": verifycode,
        "encoded": getEncoded(username, password)
    }
    login_page = py(
        sess.post('http://61.131.228.75:8080/jsxsd/xk/LoginToXk',
                  data=form,
                  headers=headers).text)
    # A non-empty message element means the login was rejected.
    message = login_page('#showMsg').text()
    if message:
        return message

    # The landing page carries the user's display name.
    home_page = py(
        sess.get('http://61.131.228.75:8080/jsxsd/framework/xsMain.jsp',
                 headers=headers).text)
    name = home_page('#btn_gotoGrzx .glyphicon-class').text() + "的成绩单"

    transcript_page = py(
        sess.get('http://61.131.228.75:8080/jsxsd/kscj/cjcx_list',
                 headers=headers).text)

    # One list per table row: header cells first, then data cells.
    scores = [
        [th.text() for th in row('th').items()] +
        [td.text() for td in row('td').items()]
        for row in transcript_page('tr').items()
    ]

    if file_format == "excel":
        save_excel(scores, name, username)
    elif file_format == "pdf":
        save_excel_for_pdf(scores, name, username)
        convert_to_pdf(username)
    # Notify by mail.
    send_email.mail(name)
Esempio n. 7
0
    def extract_attrs(self, ele):
        """Collect the recognised attributes of *ele* plus its tag name.

        Returns:
            dict holding every keyword from ``MTBuild.attrs()`` that has a
            truthy value on the element, and the element's tag under the
            key ``_TAG_``.
        """
        node = py(ele)
        attrs = {}
        # Iterating the mapping directly yields its keys on Python 2 and 3;
        # iterkeys() exists only on Python 2.
        for keyword in MTBuild.attrs():
            value = node.attr(keyword)
            if value:
                attrs[keyword] = value

        # Tag name of the element.
        attrs['_TAG_'] = node[0].tag
        # future: deal with internal property like <a><a.href></a.href></a>
        return attrs
Esempio n. 8
0
def get_url_mapping(ele):
    """Map the text of qualifying links inside *ele* to their decoded URLs.

    A link qualifies when it has an href and its text matches the link
    pattern below. For hrefs containing both ``0818tuan`` and ``?u=`` only
    the part after ``?u=`` is kept.
    """
    mapping = {}
    for anchor in ele.find('a'):
        node = py(anchor)
        href = node.attr['href']
        label = node.text()
        if href is None or re.match(r'https?.+\.\..+|.*链接.*', label) is None:
            continue
        is_redirect = '0818tuan' in href and '?u=' in href
        target = href.split('?u=')[1] if is_redirect else href
        mapping[label] = unquote(target)
    return mapping
Esempio n. 9
0
    def extract_attrs(self, ele):
        """Extract the known attributes and the tag name of *ele*.

        Returns:
            dict with one entry per keyword of ``MTBuild.attrs()`` whose
            value on the element is truthy, plus ``_TAG_`` for the tag name.
        """
        element = py(ele)
        # Plain iteration over the mapping replaces the Python 2-only
        # iterkeys() and visits the same keys.
        found = ((key, element.attr(key)) for key in MTBuild.attrs())
        attrs = {key: value for key, value in found if value}

        # Tag name of the element.
        attrs['_TAG_'] = element[0].tag
        # future: deal with internal property like <a><a.href></a.href></a>
        return attrs
Esempio n. 10
0
    def select(self, context):
        """Return the part of *context* matched by the ``select`` attribute.

        Without a ``select`` attribute the whole context is returned. When
        the lookup raises, an empty pyquery object is returned instead.
        """
        # `in` replaces dict.has_key(), which was removed in Python 3.
        if 'select' not in self.attrs:
            return py(context)

        selector = self.attrs['select']
        try:
            return py(context).find(selector)
        except Exception:  # `except X, e` is a syntax error on Python 3
            # Deliberate best effort: a failing selector selects nothing.
            return py([])
Esempio n. 11
0
 def select(self, context):
     """Select the sub-context named by the ``select`` attribute.

     Returns the whole context when no ``select`` attribute is set, and an
     empty pyquery object when the lookup raises.
     """
     # dict.has_key() was removed in Python 3; `in` works on both.
     if 'select' in self.attrs:
         selector = self.attrs['select']
     else:
         return py(context)

     try:
         return py(context).find(selector)
     except Exception:  # the `except X, e` form does not parse on Python 3
         # Deliberate best effort: a failing selector selects nothing.
         return py([])
 def _get_info(self, tds):
     """Build a model dict from alternating title/value cells.

     Cells at even positions are read as titles and cells at odd positions
     as values. A pair is kept only when its title, with colons removed, is
     non-empty and has an entry in ``self.info_dic``.
     """
     titles = tds.filter(lambda i: i % 2 == 0).map(
         lambda i, e: py(e).text())
     values = tds.filter(lambda i: i % 2 == 1).map(
         lambda i, e: py(e).text())
     model = {}
     for raw_title, value in zip(titles, values):
         title = raw_title.replace(u':', u'')
         if title == u'':
             continue
         key = self.info_dic.get(title, None)
         if key is not None:
             model[key] = value
     return model
Esempio n. 13
0
    def parse(self, response):
        """Walk the page markup and pull out links, lists and table cells.

        NOTE(review): every collection built here is local and the method
        neither yields nor returns, so the only visible output is the text
        printed for ordered-list items. Confirm whether items should be
        emitted.
        """
        for markup in response.xpath('/html').extract():
            soup = BeautifulSoup(markup, 'html5lib')

            # href of every anchor
            urls = [anchor.get('href') for anchor in soup.find_all('a')]

            # unordered-list entries; each entry overwrites the previous one
            ul_data = {}
            for li in py(markup)('div ul li').items():
                ul_data['img'] = li.find('img')
                ul_data['url'] = li.find('a').attr('href')
                ul_data['content'] = li.text()

            # ordered-list entries, echoed to stdout
            ol_data = {}
            for li in py(markup)('div ol li').items():
                ol_data['img'] = li.find('img')
                ol_data['url'] = li.find('a').attr('href')
                ol_data['content'] = li.text()
                print(li.text())
                print('——' * 80)

            # definition lists: term text plus the text of every description
            dl_data = {}
            for dl in py(markup)('dl').items():
                dl_data['dt'] = dl.find('dt').text()
                dl_data['dd'] = [dd.text() for dd in dl.find('dd').items()]

            # text of every table cell
            tables = [cell.get_text() for cell in soup.find_all('td')]
Esempio n. 14
0
    def __call__(self, doc, **environment):
        """Run every child call site on *doc* and combine the results.

        For the ``root`` tag the document is first decoded when an
        ``encoding`` attribute is set, and the results go through
        ``call_as`` when an ``as`` attribute is set. For any other tag the
        results are handed to ``call_action``.
        """
        tag = self.attrs['_TAG_']
        is_root = tag == "root"
        # Only the root node converts the encoding.
        if is_root and self.has_attr('encoding'):
            doc = doc.decode(self.attrs['encoding'])

        root_element = py(doc)
        results = [child['exp_callsite'](root_element)
                   for child in self.children]

        if not is_root:
            return self.call_action(results, action=tag, attrs=self.attrs, **environment)
        if self.has_attr('as'):
            return self.call_as(self.attrs['as'], results)
        return results
Esempio n. 15
0
def get_acticals(actical_url):
    """Yield the cleaned title and content of one article page.

    Args:
        actical_url: path appended to ``base_actical_url``.

    Yields:
        A single dict with the keys ``title`` and ``content`` when the
        response status is 200; nothing otherwise.
    """
    url = base_actical_url + actical_url
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    if response.status_code != 200:
        return

    doc = py(response.text)
    # Characters the original comment lists as not allowed:
    # / \ : * ? " < > |
    forbidden_chars = re.compile(r'[|"\/\\:<>?*]+', re.S)
    # NOTE(review): this strips every CJK character from the content;
    # confirm that this is intended.
    cjk_chars = re.compile(r'[\u4e00-\u9fa5]+', re.S)

    title = re.sub(forbidden_chars, '', doc('div').filter('#title').text())
    content = doc('p').text()
    # Some pages keep the text outside <p> elements, so fall back to
    # #content when the paragraphs are blank. The previous test,
    # re.match(r'[\s]*', content), matched every string and therefore
    # always discarded the paragraph text.
    if not content.strip():
        content = doc('div').filter('#content').text()
    content = re.sub(cjk_chars, '', content)

    yield {'title': title, 'content': content}
Esempio n. 16
0
    def __call__(self, doc, **environment):
        """Evaluate all child call sites against *doc*.

        A ``root`` node decodes the document when it has an ``encoding``
        attribute and returns the raw results, or ``call_as`` applied to
        them when it has an ``as`` attribute. Every other node returns
        ``call_action`` applied to the results.
        """
        tag = self.attrs['_TAG_']
        if tag == "root" and self.has_attr('encoding'):
            # Convert the encoding before parsing.
            doc = doc.decode(self.attrs['encoding'])

        root_element = py(doc)
        results = []
        for child in self.children:
            results.append(child['exp_callsite'](root_element))

        if tag != "root":
            return self.call_action(results,
                                    action=tag,
                                    attrs=self.attrs,
                                    **environment)
        if not self.has_attr('as'):
            return results
        return self.call_as(self.attrs['as'], results)
    def get_annual_out_guarantee_info(self, page_list):
        """Parse guarantee rows from every page into a list of dicts.

        Rows carrying the class ``partner_com_top`` and rows with fewer
        than two cells are skipped. Each remaining row becomes one dict
        keyed by the ``AnnualReports.OutGuaranteeInfo`` constants.
        """
        guarantees = []
        for page in page_list:
            fields = AnnualReports.OutGuaranteeInfo
            rows = py(page, parser='html').find('table').find('tr').not_(
                '.partner_com_top')

            for tr in rows.items():
                tds = tr.find('td')
                if len(tds) < 2:
                    continue

                def cell(index):
                    # Stripped text of the cell at the given position.
                    return tds.eq(index).text().strip()

                performance_period = self.trans_for(cell(5))
                guarantees.append({
                    fields.CREDITOR: cell(1),
                    fields.OBLIGOR: cell(2),
                    fields.DEBT_TYPE: cell(3),
                    fields.DEBT_AMOUNT: util.get_amount_with_unit(cell(4)),
                    fields.PERFORMANCE_PERIOD: performance_period,
                    fields.GUARANTEE_PERIOD: cell(6),  # guarantee period
                    fields.GUARANTEE_TYPE: cell(7),  # guarantee type
                })
        return guarantees
Esempio n. 18
0
def get_book(i):
    """Fetch download page *i* and return its title, link and code.

    Returns:
        ``[title, link, code]`` when the page contains a download link;
        ``None`` otherwise. The page number is printed in both cases.
    """
    page_url = "https://www.d4j.cn/download.php?id=" + str(i)
    # getheaders() supplies the request headers (a random user agent,
    # according to the original comment).
    response = requests.get(page_url, headers=user_agent.getheaders())
    html = py(response.text)

    title = html("body > div.wrap > div.content > h2").text()
    # Download link.
    link = html("body > div.wrap > div.content > div:nth-child(4) > div.panel-body > span:nth-child(4) > a").attr("href")
    # Extraction code.
    code = html("body > div.wrap > div.content > div.plus_box > div.plus_l > ul > li:nth-child(4) > font").text()

    print("第" + str(i) + "页面")
    if not link:
        return None

    print(title)
    print(link)
    print(code)
    return [title, link, code]
    def get_annual_share_hold_info(page_list):
        """Parse shareholder rows from every page into a list of dicts.

        Rows carrying the class ``partner_com_top`` and rows with fewer
        than two cells are skipped. Each remaining row becomes one dict
        keyed by the ``AnnualReports.ShareholderInformation`` constants.
        """
        shareholders = []
        for page in page_list:
            fields = AnnualReports.ShareholderInformation
            rows = py(page, parser='html').find('table').find('tr').not_(
                '.partner_com_top')
            for tr in rows.items():
                tds = tr.find('td')
                if len(tds) < 2:
                    continue

                def cell(index):
                    # Stripped text of the cell at the given position.
                    return tds.eq(index).text().strip()

                shareholders.append({
                    fields.SHAREHOLDER_NAME: cell(1),
                    fields.SUBSCRIPTION_AMOUNT:
                    util.get_amount_with_unit(cell(2)),
                    fields.SUBSCRIPTION_TIME: cell(3),  # subscription time
                    fields.SUBSCRIPTION_TYPE: cell(4),  # subscription type
                    fields.PAIED_AMOUNT:
                    util.get_amount_with_unit(cell(5)),  # paid-in amount
                    fields.PAIED_TIME: cell(6),  # paid-in time
                    fields.PAIED_TYPE: cell(7),  # paid-in type
                })
        return shareholders
Esempio n. 20
0
def main_handler(event, context):
    """Crawl the listing page and persist the results in a per-day file.

    The file path is ``ZK_TMP_FILE`` formatted with today's date
    (``%Y%m%d``). Previously stored data is loaded from it when it exists,
    ``deal_post`` is applied to every thread block of ``ZK_BASE_URL``, and
    ``result`` is always written back in the ``finally`` clause.

    Args:
        event: not used by this function.
        context: not used by this function.

    Returns:
        ``"Success"`` when crawling finished without an exception.

    Raises:
        Exception: any error is logged with its traceback and re-raised
            after the results have been saved.
    """
    global result
    tmp_path = ZK_TMP_FILE % datetime.now().strftime('%Y%m%d')
    try:
        logging.info('临时文件:' + tmp_path)
        if os.path.exists(tmp_path):
            with open(tmp_path, 'r', encoding=UTF8_ENCODING) as f:
                result = json.load(f)
        else:
            logging.info('临时文件不存在')
        logging.info('当前存储数据量:' + str(len(result)))

        # Hot threads on the front page; one callback per thread block.
        d = py(ZK_BASE_URL, headers=REQUEST_HEADERS, encoding=GBK_ENCODING)
        d('#threadlisttableid tbody').each(deal_post)
        return "Success"
    except Exception as ex:
        # logging.exception keeps the traceback, which logging.error drops.
        logging.exception('主任务运行异常:' + str(ex))
        raise
    finally:
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(BASE_DIR, exist_ok=True)
        # Persist the results.
        with open(tmp_path, 'w', encoding=UTF8_ENCODING) as f:
            logging.info('保存数据结果')
            json.dump(result, f)
    def get_key_person_info(self, key_person_info):
        """Parse the key-person list into name/position records.

        Returns:
            An empty dict when the crawled page is ``None`` or empty;
            otherwise a dict mapping ``GsModel.KEY_PERSON`` to the list of
            records. Each list item's text is split at its first space into
            name and position; without a space the position is empty.
        """
        page = self.get_crawl_page(key_person_info)
        if page is None or page == u'':
            return {}

        people = []
        for item in py(page, parser='html').find('.info_name').find('li').items():
            name, _, position = item.text().partition(' ')
            people.append({
                GsModel.KeyPerson.KEY_PERSON_NAME: name.strip(),
                GsModel.KeyPerson.KEY_PERSON_POSITION: position.strip()
            })

        return {GsModel.KEY_PERSON: people}
Esempio n. 22
0
    def get_change_info(self, page):
        """Parse the change-record table ``#table_bgxx`` of *page*.

        Returns:
            An empty dict when *page* is ``None`` or a dict; otherwise a
            dict mapping ``GsModel.CHANGERECORDS`` to the list of records,
            or to ``None`` when no row yielded a record. Rows with fewer
            than two cells are skipped.
        """
        if page is None or isinstance(page, dict):
            return {}

        records = []
        for tr in py(page, parser='html').find('#table_bgxx').find('tr').items():
            tds = tr.find('td')
            if tds is None or len(tds) < 2:
                continue

            records.append({
                GsModel.ChangeRecords.CHANGE_ITEM:
                tds.eq(1).text(),
                # Contents before and after the change, passed through
                # util.format_content.
                GsModel.ChangeRecords.BEFORE_CONTENT:
                util.format_content(tds.eq(2).text()),
                GsModel.ChangeRecords.AFTER_CONTENT:
                util.format_content(tds.eq(3).text()),
                GsModel.ChangeRecords.CHANGE_DATE:
                tds.eq(4).text()
            })

        return {GsModel.CHANGERECORDS: records if records else None}
Esempio n. 23
0
def test():
    """Fetch one fixed download page and print the parsed document."""
    response = requests.get("https://www.d4j.cn/download.php?id=17144",
                            headers=head,
                            proxies=proxies)
    # Parse the body into a pyquery document and print it.
    print(py(response.text))
Esempio n. 24
0
def _get_login_captcha_info(content):
    """Show the login captcha and ask the user to type its solution.

    Returns:
        Tuple ``(captcha_id, captcha_solution)``: the value of the
        ``captcha-id`` input in *content* and the text entered by the user.
    """
    page = py(content)
    captcha_id = page('input[name = "captcha-id"]').attr('value')
    # Download the captcha image and open it in the default viewer.
    image_bytes = requests.get(page('#captcha_image').attr('src')).content
    Image.open(BytesIO(image_bytes)).show()
    return captcha_id, input('请输入验证码:\n')
    def get_change_info(self, change_info):
        """Collect change records from every crawled page.

        Returns:
            An empty dict when ``get_crawl_page`` returns ``None``;
            otherwise a dict mapping ``GsModel.CHANGERECORDS`` to the list
            of records found in the ``.partner_com`` tables. Rows carrying
            the class ``partner_com_top`` are skipped.
        """
        pages = self.get_crawl_page(change_info, True)
        if pages is None:
            return {}

        records = []
        for page in pages:
            rows = py(page.get(u'text', u''), parser='html').find(
                '.partner_com').find('tr').not_('.partner_com_top')

            for tr in rows.items():
                tds = tr.find('td')
                records.append({
                    GsModel.ChangeRecords.CHANGE_ITEM:
                    tds.eq(1).text(),
                    # Contents before and after the change, passed through
                    # util.format_content.
                    GsModel.ChangeRecords.BEFORE_CONTENT:
                    util.format_content(tds.eq(2).text()),
                    GsModel.ChangeRecords.AFTER_CONTENT:
                    util.format_content(tds.eq(3).text()),
                    GsModel.ChangeRecords.CHANGE_DATE:
                    tds.eq(4).text()
                })

        return {GsModel.CHANGERECORDS: records}
    def get_con_detail(page):
        """Parse a contributor detail page.

        Args:
            page: HTML of the detail page; ``None`` or an empty string
                yields the empty result.

        Returns:
            Tuple ``(shareholder_name, sub_model)``. ``sub_model`` is keyed
            by the ``GsModel.ContributorInformation`` constants and is
            filled from the ``.partner_com`` tables according to the text
            of each table's ``.info_table_h3`` heading.
        """
        shareholder_name = ""
        sub_model = {}
        if page is None or page == u'':
            return shareholder_name, sub_model

        info = GsModel.ContributorInformation
        for table in py(page, parser='html').find('.partner_com').items():
            # Read the heading once; it decides which sections apply.
            heading = table.find('.info_table_h3').text()

            if u'发起人' in heading or u'股东' in heading:  # shareholder info
                tds = table.find('td')
                shareholder_name = tds.eq(1).text().strip()
                sub_model[info.SHAREHOLDER_NAME] = tds.eq(1).text()
                sub_model[info.SUBSCRIPTION_AMOUNT] = \
                    util.get_amount_with_unit(tds.eq(3).text())
                sub_model[info.PAIED_AMOUNT] = \
                    util.get_amount_with_unit(tds.eq(5).text())

            if u'认缴' in heading:  # subscription details
                trs = table.find('tr')
                lst_sub_detail = []
                # range() replaces the Python 2-only xrange(); the first
                # row is skipped.
                for tr_i in range(1, len(trs)):
                    tds = trs.eq(tr_i).find('td')
                    sub_model_detail = {
                        info.SubscriptionDetail.SUBSCRIPTION_TYPE:
                        tds.eq(0).text(),
                        info.SubscriptionDetail.SUBSCRIPTION_AMOUNT:
                        util.get_amount_with_unit(tds.eq(1).text()),
                        info.SubscriptionDetail.SUBSCRIPTION_TIME:
                        tds.eq(2).text()
                    }
                    lst_sub_detail.append(replace_none(sub_model_detail))
                sub_model[info.SUBSCRIPTION_DETAIL] = lst_sub_detail

            if u'实缴' in heading:  # paid-in details
                trs = table.find('tr')
                lst_paid_detail = []
                for tr_i in range(1, len(trs)):
                    tds = trs.eq(tr_i).find('td')
                    paid_model_detail = {
                        info.PaiedDetail.PAIED_TYPE:
                        tds.eq(0).text(),
                        info.PaiedDetail.PAIED_AMOUNT:
                        util.get_amount_with_unit(tds.eq(1).text()),
                        info.PaiedDetail.PAIED_TIME:
                        tds.eq(2).text()
                    }
                    lst_paid_detail.append(replace_none(paid_model_detail))
                sub_model[info.PAIED_DETAIL] = lst_paid_detail
                # NOTE(review): sub_model is only passed through
                # replace_none when a paid-in table is present; confirm
                # whether it should also run without one.
                sub_model = replace_none(sub_model)

        return shareholder_name, sub_model
Esempio n. 27
0
def auto_get_url(msg):
    """Send word statistics for the page behind ``msg.url``.

    The text of ``#js_content`` is passed to ``stats_word.stats_text_cn``
    with a count of 100; the statistics are sent through
    ``bot.file_helper`` and their string form is returned.
    """
    page = py(requests.get(msg.url).text)
    content = page('#js_content').text()
    stats = stats_word.stats_text_cn(content, 100)
    stats_text = str(stats)
    bot.file_helper.send(stats)
    return stats_text
Esempio n. 28
0
def getSouSuoInfo():
    """Print the title and link of every result on one fixed search page."""
    search_url = 'http://www.baidu.com/s?wd=联通'
    results = py(getInfo(search_url))('#content_left h3.t a')
    for anchor in results.items():
        print('标题:' + anchor.text() + ' 链接:' + anchor.attr('href'))
 def get_annual_base_info(page):
     """Parse the ``.info_table`` cells of *page* into a dict.

     Each cell's text is split at its first colon. The label, passed
     through ``AnnualReports.format_base_model``, becomes the key and the
     rest of the text becomes the value.
     """
     annual_base_info = {}
     cells = py(page, parser='html').find('.info_table').find('td').items()
     for td in cells:
         label, separator, value = td.text().partition(u':')
         # A cell with no separator has no value part; skip it instead of
         # failing with an IndexError.
         if not separator:
             continue
         annual_base_info[AnnualReports.format_base_model(label)] = value
     return annual_base_info
Esempio n. 30
0
    def get_parse_page(self):
        """
        Parse the browser's current page source.
        :return: pyquery document built from ``self.browser.page_source``
        """
        return py(self.browser.page_source)
Esempio n. 31
0
 def get_my_movie_page_list(self):
     """Append the URL of every paginator link to ``self.movie_page_list``.

     Links without an href are skipped; the others are prefixed with
     ``_douban_movie_host``.
     """
     for anchor in py(self.movie_collect)('.paginator > a').items():
         href = anchor.attr('href')
         if not href:
             continue
         self.movie_page_list.append(_douban_movie_host + href)
Esempio n. 32
0
 def get_keywords(self):
     """Store the sidebar heading words of the home page in ``self.keywords``.

     The combined heading text is split on single spaces into a set, and
     the session cookies are cleared afterwards.
     """
     home_page = self.handle_request(method="GET",
                                     url="https://www.lagou.com/")
     headings = py(home_page)("#sidebar > div > div:nth-child(1) a h3")
     self.keywords = set(headings.text().split(" "))
     self.lagou_session.cookies.clear()
Esempio n. 33
0
def get_score(html):
    """Return the whitespace-split text of every ``td`` cell in *html*.

    Returns:
        list with one list of words per cell, in document order.
    """
    return [str(cell.text()).split() for cell in py(html)('td').items()]
Esempio n. 34
0
 def build_script(self, ele, attrs):
     """Build the expression record for a script element.

     The element's inner HTML is stored as the children, and a
     ``ScriptCallSite`` built from the record is attached last.
     """
     x = {
         'exp_meta': 'script',
         'exp_attrs': attrs,
         'exp_node': ele,
         'exp_children': py(ele).html(),
         'exp_kind': 'code',
     }
     # The call site receives the record before its own key is added.
     x['exp_callsite'] = ScriptCallSite(x)
     return x
Esempio n. 35
0
 def build_array(self, ele, attrs):
     """Build the expression record for an array element.

     Every child element is converted with ``self.build_element``, and an
     ``ArrayCallSite`` built from the record is attached last.
     """
     children = [self.build_element(child) for child in py(ele).children()]
     x = {
         'exp_meta': 'array',
         'exp_attrs': attrs,
         'exp_node': ele,
         'exp_children': children,
         'exp_kind': 'data',
     }
     # The call site receives the record before its own key is added.
     x['exp_callsite'] = ArrayCallSite(x)
     return x
Esempio n. 36
0
 def eval(self, eval_value, context):
     """Evaluate *eval_value* through a cached ``Evaluater``.

     An evaluator is created on the first call for a given *eval_value*
     and stored in ``self.evaluator_cache``; the result of calling the
     cached evaluator is returned.

     NOTE(review): on a cache hit the locals built from the current
     *context* are not handed to the cached evaluator; confirm that
     ``Evaluater`` picks up the context some other way.
     """
     key_ctx = Evaluater.CONTEXT
     g = MTContext.globals()
     l = MTContext.locals(**{key_ctx: py(context)})
     cache = self.evaluator_cache
     # `in` replaces dict.has_key(), which was removed in Python 3.
     if eval_value not in cache:
         cache[eval_value] = Evaluater(eval_value, key_ctx, g, l)
     return cache[eval_value]()
Esempio n. 37
0
 def build_map(self, ele, attrs):
     """Build the expression record for a map element.

     Every child element is converted with ``self.build_element``, and a
     ``MapCallSite`` built from the record is attached last.
     """
     children = []
     for child in py(ele).children():
         children.append(self.build_element(child))
     x = dict(exp_meta='map',
              exp_attrs=attrs,
              exp_node=ele,
              exp_children=children,
              exp_kind='data')
     # The call site receives the record before its own key is added.
     x['exp_callsite'] = MapCallSite(x)
     return x
Esempio n. 38
0
def get_list(_url, mark):
    """Fetch a GBK-encoded page and list the text and link of its rows.

    Args:
        _url: address of the page to download.
        mark: label appended to the error line printed for every row whose
            link cannot be built.

    Returns:
        list of dicts with the keys ``text`` and ``href``, one per ``tr``
        whose link could be built.
    """
    from contextlib import closing
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3
    from pyquery import PyQuery as py

    # closing() releases the connection on both Python versions.
    with closing(urlopen(_url)) as req:
        html = req.read().decode("gbk")

    d = py(html)
    mylist = []
    for item in d("tr"):
        text = d(item).text()
        try:
            href = "http://news.hitsz.edu.cn/site/news/" + d(d(item).find("a")).attr("href")
        except Exception:
            # Deliberate best effort: report the row and carry on. A bare
            # except would also swallow KeyboardInterrupt and SystemExit.
            print("error:" + mark)
            continue
        mylist.append({
            "text": text,
            "href": href
        })
    return mylist
Esempio n. 39
0
 def build(self, env):
     """Parse ``env.template`` and hand the document to ``self.fromdoc``."""
     template_doc = py(env.template)
     self.fromdoc(template_doc)
Esempio n. 40
0
def then_there_should_be_the_same_number_of_bars(step):
    """Assert that the browser page contains exactly two ``h1`` elements."""
    headings = py(world.browser.contents)("h1")
    assert len(headings) == 2
Esempio n. 41
0
 def get(self, get_value, context):
     """Return the text or the inner HTML of *context*.

     ``"text"`` selects ``text()`` and ``"html"`` selects ``html()``; any
     other *get_value* returns ``None``.
     """
     if get_value not in ("text", "html"):
         return None
     node = py(context)
     return node.text() if get_value == "text" else node.html()