def get_contributive_info_page(text):
    """Return the total page count read from the '#pagescount' element.

    Falls back to 1 when the attribute is missing or blank.
    """
    doc = PyQuery(text, parser='html')
    raw_count = doc.find('#pagescount').attr('value')
    if raw_count and raw_count.strip():
        return int(raw_count)
    return 1
Example #2
0
    def __get_company_name(self, text):
        """Extract the company name from *text*, trying two selectors in order.

        Returns the stripped name, or None when neither selector yields a
        non-blank value or parsing raises (the exception is logged).
        """
        selectors = (
            lambda doc: doc.find('.overview').find('#entName').text(),
            lambda doc: doc.find('h1.fullName').text(),
        )
        try:
            for extract in selectors:
                candidate = extract(PyQuery(text, parser='html'))
                if candidate is not None and candidate.strip() != '':
                    return candidate.strip()
        except Exception as e:
            self.log.exception(e)
        return None
Example #3
0
 def __init__(self, elem, trims, should_cleanup):
     """Capture text/html views of *elem*, optionally trimmed and cleaned.

     :param elem: an element parseable by PyQuery
     :param trims: iterable of substrings to delete from the text (may be None)
     :param should_cleanup: when truthy, replace the raw html with
         the result of self.cleanup_html()
     """
     text = PyQuery(elem).text()
     # Remove every requested substring before any normalisation.
     for trim in (trims or []):
         text = text.replace(trim, '')
     # NOTE(review): self.rx is compiled but the substitutions below use the
     # module-level `non_trimmed` / `nonword` patterns instead — confirm
     # whether self.rx is still used elsewhere in the class.
     self.rx = re.compile(r'\W+')
     self.text = text.strip()
     # Whitespace-collapsed variant (whatever `non_trimmed` matches -> ' ').
     self.trimmed_text = non_trimmed.sub(' ', self.text)
     self.html = PyQuery(elem).html()
     if should_cleanup:
         self.html = self.cleanup_html()
     # Lower-cased text with `nonword` matches removed, for loose comparison.
     self.normalized_text = nonword.sub('', text.lower())
Example #4
0
def get(key):
    """Look up *key* on Wikipedia and return the first paragraph's text.

    The wiki language is chosen from the characters of *key*: Latin
    letters -> 'en', kana -> 'ja', otherwise guess_language(). For
    disambiguation pages ('refer' / '可指' in the text) the items of the
    first <ul> are appended, one per line. On any failure an error
    message string is returned instead of raising.
    """
    try:
        en_reg = re.compile('[a-zA-Z]')
        jp_reg = re.compile('[ぁ-んァ-ヶ]')

        language = ''
        if re.findall(en_reg, key):
            language = 'en'
        if re.findall(jp_reg, key):
            language = 'ja'
        if not language:
            language = guess_language(key)

        print('来自 ' + language + ' wiki')

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Referer': 'https://www.sanseido.biz/',
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Connection': 'close',
            'authority': f'{language}.wikipedia.org'
        }

        s = requests.session()
        s.keep_alive = False
        try:
            page = s.get(f'https://{language}.wikipedia.org/wiki/{key}',
                         headers=headers)
        finally:
            # Fix: the session was never closed (connection/socket leak).
            s.close()
        page.encoding = 'UTF-8'
        # Fix: removed stray debug `print(page.text)` that dumped the whole
        # page to stdout on every lookup.
        tree = html.fromstring(page.text)
        p1 = tree.xpath('//*[@id="mw-content-text"]/div/p[1]')
        if not p1:
            return f'wiki里查不到 {key}'
        p1_text = html.tostring(p1[0], encoding='UTF-8')
        de = PyQuery(p1_text.decode('UTF-8')).text()
        if ('refer' in de) or ('可指' in de):
            # Disambiguation page: append each candidate entry on its own line.
            refer = tree.xpath('//*[@id="mw-content-text"]/div/ul[1]/li')
            for each in refer:
                refer_text = html.tostring(each, encoding='UTF-8')
                refer_link = PyQuery(refer_text.decode('UTF-8')).text()
                de += '\n' + refer_link

        return de.strip()
    except Exception as e:
        return f'似乎有些问题:\n{e}'
def parse_text_page(url):
    """Parse a text-content page at *url*.

    Returns a list of stripped text strings, one per ``tr td.ctext``
    cell whose html contains a ``<div ... </p>`` run; other cells are
    logged as undesired and skipped.
    """
    response = session.get(url)
    document = PyQuery(response.content)
    log.info("parsing %s", url)
    collected = []
    for cell in document('tr td.ctext').items():
        markup = cell.html()
        if re.search(r'<div.*</p>', markup):
            collected.append(PyQuery(markup).text().strip())
        else:
            log.warning("undesired text: %s", markup[:20])
    return collected
Example #6
0
def parse_withdraw_tr(tr: PyQuery) -> dict:
    """Parse one withdrawal-request <tr> from the platform-2 site.

    :param tr: PyQuery-wrapped table row with at least 16 <td> cells
    :return: dict describing the request, or None when the row has fewer
        than 15 cells or the request is not in "审核中" (under-review)
        status.
    """
    tds = tr.find("td")
    if len(tds) < 15:
        return None

    init_dict = {'system': domain2}

    # td[0]: MT account number and MT group.
    texts_1 = PyQuery(tds[0]).text().split("\n")
    init_dict['account'] = int(re.search(r'\d{4,}', texts_1[0].lower()).group())
    init_dict['group'] = texts_1[-1].lower()[5:]

    # td[1]: account manager.
    init_dict['manager'] = PyQuery(tds[1]).text().strip()

    # td[2]: English nickname.
    # Fix: original used .strip("") which strips nothing; .strip() was
    # clearly intended (every sibling field strips whitespace).
    texts_3 = PyQuery(tds[2]).text().split("\n")
    init_dict['nick_name'] = texts_3[0][4:].strip()

    # td[3]: withdrawal amount in USD and CNY.
    texts_4 = PyQuery(tds[3]).text().split("\n")
    init_dict['amount_usd'] = float(texts_4[0].split("$")[-1].strip())
    init_dict['amount_cny'] = float(texts_4[-1].split("¥")[-1].strip())

    # td[4]: commission in USD and CNY.
    texts_5 = PyQuery(tds[4]).text().split("\n")
    init_dict['commission_usd'] = float(texts_5[0].split("$")[-1].strip())
    init_dict['commission_cny'] = float(texts_5[-1].split("¥")[-1].strip())

    # td[5]: transfer channel.
    init_dict['channel'] = PyQuery(tds[5]).text().strip()

    # td[6]: apply/close timestamps (same .strip("") -> .strip() fix).
    texts_7 = PyQuery(tds[6]).text().split("\n")
    init_dict['apply_time'] = get_datetime_from_str(texts_7[0][5:].strip())
    init_dict['close_time'] = get_datetime_from_str(texts_7[-1][5:].strip())

    # td[7..10]: bank name, bank code, bank id, status.
    init_dict['blank_name'] = PyQuery(tds[7]).text().strip()
    init_dict['blank_code'] = PyQuery(tds[8]).text().strip()
    init_dict['code_id'] = PyQuery(tds[9]).text().strip()
    init_dict['status'] = PyQuery(tds[10]).text().strip()

    # td[11..14]: balance / equity / open interest / free margin.
    # The leading currency symbol ([1:]) or trailing unit ([0:-1]) is
    # sliced off before converting to float.
    init_dict['account_balance'] = float(PyQuery(tds[11]).text().strip()[1:])
    init_dict['account_value'] = float(PyQuery(tds[12]).text().strip()[1:])
    init_dict['open_interest'] = float(PyQuery(tds[13]).text().strip()[0:-1])
    init_dict['account_margin'] = float(PyQuery(tds[14]).text().strip()[1:])

    # td[15]: ticket id taken from the link's href tail.
    # Fix: no longer reuses/shadows the earlier `sixth` variable; the
    # pointless `{k: v for k, v in init_dict.items()}` self-copy is removed.
    ticket_link = PyQuery(tds[15].find("a"))
    init_dict['ticket'] = int(ticket_link.attr("href").split("/")[-1])

    # Only rows still under review ("审核中") are recorded.
    if init_dict['status'] == "审核中":
        return init_dict
    return None
Example #7
0
    def contributive_info_list(con_table_list):
        """Build a {inv_id: contributor-info dict} map from crawl results.

        Each entry of *con_table_list* is expected to carry 'status' and
        'text' keys; processing stops at the first failed, empty, or
        unparseable entry. Items without a shareholder name or id are
        skipped.

        :param con_table_list: list of crawl-result dicts (may be None)
        :return: dict keyed by stripped inv_id
        """
        con_table_dict = {}
        if con_table_list is None or len(con_table_list) <= 0:
            return con_table_dict

        for con_item in con_table_list:
            status = con_item.get('status', 'fail')
            if status != 'success':
                break

            text = con_item.get('text')
            if text is None or text == '':
                break

            json_data = util.json_loads(text)
            if json_data is None:
                break

            data_array = json_data.get('data')
            if not isinstance(data_array, list):
                break

            for item in data_array:
                b_lic_no = item.get('bLicNo')
                b_lic_type_cn = item.get('blicType_CN')
                inv = item.get('inv')
                inv_type_cn = item.get('invType_CN')
                inv_id = item.get('invId')
                if inv is None or inv.strip() == '':
                    continue

                if inv_id is None or inv_id.strip() == '':
                    continue

                inv = inv.strip()
                inv_id = inv_id.strip()

                # Strip embedded div/span markup out of the licence number.
                if b_lic_no is not None and b_lic_no.strip() != '':
                    b_lic_no = PyQuery(b_lic_no, parser='html').remove('div').remove('span'). \
                        text().replace(' ', '').strip()
                else:
                    b_lic_no = ''

                # BUG FIX: the original tested `b_lic_no.strip()` here,
                # wrongly blanking the certificate type whenever the
                # licence number was empty; test b_lic_type_cn itself.
                if b_lic_type_cn is None or b_lic_type_cn.strip() == '':
                    b_lic_type_cn = ''
                else:
                    b_lic_type_cn = b_lic_type_cn.strip()

                if inv_type_cn is None or inv_type_cn.strip() == '':
                    inv_type_cn = ''
                else:
                    inv_type_cn = PyQuery(inv_type_cn, parser='html').remove('div').remove('span'). \
                        text().replace(' ', '').strip()

                sub_model = {
                    GsModel.ContributorInformation.SHAREHOLDER_NAME: inv,
                    GsModel.ContributorInformation.SHAREHOLDER_TYPE:
                    inv_type_cn,
                    GsModel.ContributorInformation.CERTIFICATE_TYPE:
                    b_lic_type_cn,
                    GsModel.ContributorInformation.CERTIFICATE_NO: b_lic_no
                }
                con_table_dict[inv_id] = sub_model

        return con_table_dict
Example #8
0
# Build the article-list URL for the blog and fetch it (Python 2: urllib2).
SinaBlogUrl = 'http://blog.sina.com.cn/s/articlelist_' + SinaBlogID + '_0_1.html'
print('  >> Read Url: ' + SinaBlogUrl)
BlogML = urllib2.urlopen(SinaBlogUrl).read()
# Read the blog category menu.
BlogMLHtml = PyQuery(BlogML)('div.menuList').html()
BlogMLHtml = PyQuery(BlogMLHtml)('a')
BlogMLList = {}
for li in BlogMLHtml.items():
    # Skip the "博文收藏" (blog favourites) category.
    if li.text() != u'\u535a\u6587\u6536\u85cf':
        BlogMLList[li.text()] = li.attr('href')
BlogMLList = sorted(BlogMLList.items(), key=lambda d: d[0])
BlogLB = BlogML
# Determine the page count from the pager widget ("共N页" -> N).
BlogLsHtml = PyQuery(BlogLB)('ul.SG_pages').html()
if BlogLsHtml.strip() != '':
    BlogPgHtml = int(
        PyQuery(BlogLsHtml)('span').text().replace(u'共', '').replace(u'页', ''))
else:
    BlogPgHtml = 1
BlogPgHtmlZ = BlogPgHtml
# Determine the total article count from the column header "(N)".
BlogLsHtml = PyQuery(BlogLB)('div.SG_colW73').html()
BlogLsHtml = PyQuery(BlogLsHtml)('div.SG_connHead').html()
BlogLsHtml = PyQuery(BlogLsHtml)('span.title').html()
BlogCtHtml = int(
    PyQuery(BlogLsHtml)('em').text().replace(u'(', '').replace(u')', ''))
BlogCtHtmlZ = BlogCtHtml
BlogMLList2 = {}
BlogCounts = 0
print('  >>  类别数: ' + str(len(BlogMLList)) + ', 总页数:' + str(BlogPgHtmlZ) +
Example #9
0
def parse_detail(title, date, url, content, filename):
    """Convert a crawled article page (html in *content*) into the
    site's abstract + method/steps JSON format.

    Returns (json_string, True) on success, or None when *content* is
    empty or yields no usable <p> paragraphs. *filename* is accepted but
    not used in this function. (Python 2 source: print statements.)
    """
    if not content:
        return None
    jq = PyQuery(content)
    # Fixed metadata attached to every emitted record.
    res_json = {
        'bread': [u'留学'],
        'title': title,
        'date': date,
        'source': u'liuxue86',
        'url': url,
        'class': 36,
        'subject': u'经验',
        'data_weight': 0,
    }
    methods = []
    # Keep only non-blank paragraphs.
    content = [each for each in jq('p').items() if each.text().strip() != '']
    #print PyQuery(content[0]).html()
    print len(content)
    if not content:
        return None
    if len(content) == 0:
        return None
    # State flags: collecting the abstract until the first heading-like
    # paragraph is seen, then collecting method steps.
    flag_abstract = True
    flag_method = False
    flag_first = True
    steps = []
    _list = []
    img = ''
    step_title = ''
    substeps = []
    for each_json in content:
        each = (PyQuery(each_json).text())
        # Stop at trailing boilerplate ("click to view", "source", etc.).
        if u'点击查看' in each or u'原文来源' in each or u'点击此处' in each or u'推荐阅读' in each or u'相关推荐' in each:
            break
        # Paragraphs carrying the site watermark 'ue86.com': keep only the
        # text after a '】' marker, or skip the paragraph when it ends with it.
        if 'ue86.com' in PyQuery(each_json).text():
            if u'】' != PyQuery(each_json).text().strip(
            )[-1] and u'】' in PyQuery(each_json).text():
                each = PyQuery(each_json).text().split('】')[1]
                #print each
            elif u'】' == PyQuery(each_json).text().strip()[-1]:
                #print each
                continue
        #print each[1].decode('utf8')
        #break
        if each == 'None' or each.strip() == '':
            continue
        # More trailing-boilerplate markers that end the article body.
        if u'相关阅读' in each or u'扫一扫' in each or u'相关链接' in each or u'天道提示' in each:
            break
        #print each
        # Heading heuristic: "N、...", bold markup, "XXX:" at index 3, or a
        # trailing '】'. NOTE(review): each.strip()[1] and [-1] are
        # unguarded — a paragraph shorter than 2 chars would raise
        # IndexError; confirm whether inputs guarantee longer text.
        if each.strip()[1] == u'、' or '<strong>' in PyQuery(each_json).html(
        ) or (each.strip()[3] == ':'
              if len(each.strip()) > 3 else False) or each.strip()[-1] == u'】':
            flag_abstract = False
            flag_method = True
        if flag_abstract:
            steps.append(each)
        else:
            # Same heading heuristic: start a new step on each heading,
            # flushing the previous step's accumulated substeps.
            if each.strip()[1] == u'、' or '<strong>' in PyQuery(
                    each_json).html() or (each.strip()[3] == ':'
                                          if len(each.strip()) > 3 else
                                          False) or each.strip()[-1] == u'】':
                #print each.strip()[1]
                if not flag_first:
                    _list.append({
                        'img': img,
                        'title': step_title,
                        'substeps': substeps,
                    })
                    img = ''
                    step_title = '<strong>' + each + '</strong>'
                    substeps = []
                if flag_first:
                    step_title = '<strong>' + each + '</strong>'
                    flag_first = False
            else:
                substeps.append(each)
    # Flush the final (possibly empty) step.
    _list.append({
        'img': img,
        'title': step_title,
        'substeps': substeps,
    })
    if flag_method:
        # Headings were found: steps collected before the first heading
        # become the abstract, _list becomes the method steps.
        methods.append({'title': u'方法/步骤', 'steps': _list})
        abstract = {
            'title': '',
            'steps': steps,
            'img': '',
        }
    else:
        # No headings at all: first paragraph is the abstract, the rest
        # become title-only steps.
        _list1 = []
        for v in steps[1:]:
            _list1.append({
                'img': '',
                'title': v,
                'substeps': '',
            })
        methods.append({'title': u'方法/步骤', 'steps': _list1})
        if len(steps) == 0:
            steps = ['']
        abstract = {
            'title': '',
            'steps': [steps[0]],
            'img': '',
        }
    res_json['methods'] = methods
    res_json['abstract'] = abstract
    #print res_json
    #print methods
    good = True
    print json.dumps(res_json)
    return json.dumps(res_json), good