def get_contributive_info_page(text):
    """Return the total page count read from the '#pagescount' element.

    Falls back to 1 when the attribute is missing or blank.
    """
    doc = PyQuery(text, parser='html')
    raw_count = doc.find('#pagescount').attr('value')
    if raw_count and raw_count.strip():
        return int(raw_count)
    return 1
Example #2
0
    def __get_company_name(self, text):
        """Extract the company name from *text*, trying two selectors in order.

        Returns the stripped name, or None when neither selector yields a
        non-blank value or parsing raises (the exception is logged).
        """
        selectors = (
            lambda doc: doc.find('.overview').find('#entName').text(),
            lambda doc: doc.find('h1.fullName').text(),
        )
        try:
            for extract in selectors:
                candidate = extract(PyQuery(text, parser='html'))
                if candidate is not None and candidate.strip() != '':
                    return candidate.strip()
        except Exception as e:
            self.log.exception(e)
        return None
Example #3
0
 def __init__(self, elem, trims, should_cleanup):
     """Capture text/html views of *elem*, optionally trimmed and cleaned.

     :param elem: an element parseable by PyQuery
     :param trims: iterable of substrings to delete from the text (may be None)
     :param should_cleanup: when truthy, replace the raw html with
         the result of self.cleanup_html()
     """
     text = PyQuery(elem).text()
     # Remove every requested substring before any normalisation.
     for trim in (trims or []):
         text = text.replace(trim, '')
     # NOTE(review): self.rx is compiled but the substitutions below use the
     # module-level `non_trimmed` / `nonword` patterns instead — confirm
     # whether self.rx is still used elsewhere in the class.
     self.rx = re.compile(r'\W+')
     self.text = text.strip()
     # Whitespace-collapsed variant (whatever `non_trimmed` matches -> ' ').
     self.trimmed_text = non_trimmed.sub(' ', self.text)
     self.html = PyQuery(elem).html()
     if should_cleanup:
         self.html = self.cleanup_html()
     # Lower-cased text with `nonword` matches removed, for loose comparison.
     self.normalized_text = nonword.sub('', text.lower())
Example #4
0
def get(key):
    """Look up *key* on Wikipedia and return the first paragraph's text.

    The wiki language is chosen from the characters of *key*: Latin
    letters -> 'en', kana -> 'ja', otherwise guess_language(). For
    disambiguation pages ('refer' / '可指' in the text) the items of the
    first <ul> are appended, one per line. On any failure an error
    message string is returned instead of raising.
    """
    try:
        en_reg = re.compile('[a-zA-Z]')
        jp_reg = re.compile('[ぁ-んァ-ヶ]')

        language = ''
        if re.findall(en_reg, key):
            language = 'en'
        if re.findall(jp_reg, key):
            language = 'ja'
        if not language:
            language = guess_language(key)

        print('来自 ' + language + ' wiki')

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Referer': 'https://www.sanseido.biz/',
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Connection': 'close',
            'authority': f'{language}.wikipedia.org'
        }

        s = requests.session()
        s.keep_alive = False
        try:
            page = s.get(f'https://{language}.wikipedia.org/wiki/{key}',
                         headers=headers)
        finally:
            # Fix: the session was never closed (connection/socket leak).
            s.close()
        page.encoding = 'UTF-8'
        # Fix: removed stray debug `print(page.text)` that dumped the whole
        # page to stdout on every lookup.
        tree = html.fromstring(page.text)
        p1 = tree.xpath('//*[@id="mw-content-text"]/div/p[1]')
        if not p1:
            return f'wiki里查不到 {key}'
        p1_text = html.tostring(p1[0], encoding='UTF-8')
        de = PyQuery(p1_text.decode('UTF-8')).text()
        if ('refer' in de) or ('可指' in de):
            # Disambiguation page: append each candidate entry on its own line.
            refer = tree.xpath('//*[@id="mw-content-text"]/div/ul[1]/li')
            for each in refer:
                refer_text = html.tostring(each, encoding='UTF-8')
                refer_link = PyQuery(refer_text.decode('UTF-8')).text()
                de += '\n' + refer_link

        return de.strip()
    except Exception as e:
        return f'似乎有些问题:\n{e}'
def parse_text_page(url):
    """Parse a text-content page at *url*.

    Returns a list of stripped text strings, one per ``tr td.ctext``
    cell whose html contains a ``<div ... </p>`` run; other cells are
    logged as undesired and skipped.
    """
    response = session.get(url)
    document = PyQuery(response.content)
    log.info("parsing %s", url)
    collected = []
    for cell in document('tr td.ctext').items():
        markup = cell.html()
        if re.search(r'<div.*</p>', markup):
            collected.append(PyQuery(markup).text().strip())
        else:
            log.warning("undesired text: %s", markup[:20])
    return collected
Example #6
0
def parse_withdraw_tr(tr: PyQuery) -> dict:
    """Parse one withdrawal-request <tr> from the platform-2 site.

    :param tr: PyQuery-wrapped table row with at least 16 <td> cells
    :return: dict describing the request, or None when the row has fewer
        than 15 cells or the request is not in "审核中" (under-review)
        status.
    """
    tds = tr.find("td")
    if len(tds) < 15:
        return None

    init_dict = {'system': domain2}

    # td[0]: MT account number and MT group.
    texts_1 = PyQuery(tds[0]).text().split("\n")
    init_dict['account'] = int(re.search(r'\d{4,}', texts_1[0].lower()).group())
    init_dict['group'] = texts_1[-1].lower()[5:]

    # td[1]: account manager.
    init_dict['manager'] = PyQuery(tds[1]).text().strip()

    # td[2]: English nickname.
    # Fix: original used .strip("") which strips nothing; .strip() was
    # clearly intended (every sibling field strips whitespace).
    texts_3 = PyQuery(tds[2]).text().split("\n")
    init_dict['nick_name'] = texts_3[0][4:].strip()

    # td[3]: withdrawal amount in USD and CNY.
    texts_4 = PyQuery(tds[3]).text().split("\n")
    init_dict['amount_usd'] = float(texts_4[0].split("$")[-1].strip())
    init_dict['amount_cny'] = float(texts_4[-1].split("¥")[-1].strip())

    # td[4]: commission in USD and CNY.
    texts_5 = PyQuery(tds[4]).text().split("\n")
    init_dict['commission_usd'] = float(texts_5[0].split("$")[-1].strip())
    init_dict['commission_cny'] = float(texts_5[-1].split("¥")[-1].strip())

    # td[5]: transfer channel.
    init_dict['channel'] = PyQuery(tds[5]).text().strip()

    # td[6]: apply/close timestamps (same .strip("") -> .strip() fix).
    texts_7 = PyQuery(tds[6]).text().split("\n")
    init_dict['apply_time'] = get_datetime_from_str(texts_7[0][5:].strip())
    init_dict['close_time'] = get_datetime_from_str(texts_7[-1][5:].strip())

    # td[7..10]: bank name, bank code, bank id, status.
    init_dict['blank_name'] = PyQuery(tds[7]).text().strip()
    init_dict['blank_code'] = PyQuery(tds[8]).text().strip()
    init_dict['code_id'] = PyQuery(tds[9]).text().strip()
    init_dict['status'] = PyQuery(tds[10]).text().strip()

    # td[11..14]: balance / equity / open interest / free margin.
    # The leading currency symbol ([1:]) or trailing unit ([0:-1]) is
    # sliced off before converting to float.
    init_dict['account_balance'] = float(PyQuery(tds[11]).text().strip()[1:])
    init_dict['account_value'] = float(PyQuery(tds[12]).text().strip()[1:])
    init_dict['open_interest'] = float(PyQuery(tds[13]).text().strip()[0:-1])
    init_dict['account_margin'] = float(PyQuery(tds[14]).text().strip()[1:])

    # td[15]: ticket id taken from the link's href tail.
    # Fix: no longer reuses/shadows the earlier `sixth` variable; the
    # pointless `{k: v for k, v in init_dict.items()}` self-copy is removed.
    ticket_link = PyQuery(tds[15].find("a"))
    init_dict['ticket'] = int(ticket_link.attr("href").split("/")[-1])

    # Only rows still under review ("审核中") are recorded.
    if init_dict['status'] == "审核中":
        return init_dict
    return None
Example #7
0
    def contributive_info_list(con_table_list):
        """Build a {inv_id: contributor-info dict} map from crawl results.

        Each entry of *con_table_list* is expected to carry 'status' and
        'text' keys; processing stops at the first failed, empty, or
        unparseable entry. Items without a shareholder name or id are
        skipped.

        :param con_table_list: list of crawl-result dicts (may be None)
        :return: dict keyed by stripped inv_id
        """
        con_table_dict = {}
        if con_table_list is None or len(con_table_list) <= 0:
            return con_table_dict

        for con_item in con_table_list:
            status = con_item.get('status', 'fail')
            if status != 'success':
                break

            text = con_item.get('text')
            if text is None or text == '':
                break

            json_data = util.json_loads(text)
            if json_data is None:
                break

            data_array = json_data.get('data')
            if not isinstance(data_array, list):
                break

            for item in data_array:
                b_lic_no = item.get('bLicNo')
                b_lic_type_cn = item.get('blicType_CN')
                inv = item.get('inv')
                inv_type_cn = item.get('invType_CN')
                inv_id = item.get('invId')
                if inv is None or inv.strip() == '':
                    continue

                if inv_id is None or inv_id.strip() == '':
                    continue

                inv = inv.strip()
                inv_id = inv_id.strip()

                # Strip embedded div/span markup out of the licence number.
                if b_lic_no is not None and b_lic_no.strip() != '':
                    b_lic_no = PyQuery(b_lic_no, parser='html').remove('div').remove('span'). \
                        text().replace(' ', '').strip()
                else:
                    b_lic_no = ''

                # BUG FIX: the original tested `b_lic_no.strip()` here,
                # wrongly blanking the certificate type whenever the
                # licence number was empty; test b_lic_type_cn itself.
                if b_lic_type_cn is None or b_lic_type_cn.strip() == '':
                    b_lic_type_cn = ''
                else:
                    b_lic_type_cn = b_lic_type_cn.strip()

                if inv_type_cn is None or inv_type_cn.strip() == '':
                    inv_type_cn = ''
                else:
                    inv_type_cn = PyQuery(inv_type_cn, parser='html').remove('div').remove('span'). \
                        text().replace(' ', '').strip()

                sub_model = {
                    GsModel.ContributorInformation.SHAREHOLDER_NAME: inv,
                    GsModel.ContributorInformation.SHAREHOLDER_TYPE:
                    inv_type_cn,
                    GsModel.ContributorInformation.CERTIFICATE_TYPE:
                    b_lic_type_cn,
                    GsModel.ContributorInformation.CERTIFICATE_NO: b_lic_no
                }
                con_table_dict[inv_id] = sub_model

        return con_table_dict
Example #8
0
# Build the article-list URL for the blog and fetch it (Python 2: urllib2).
SinaBlogUrl = 'http://blog.sina.com.cn/s/articlelist_' + SinaBlogID + '_0_1.html'
print('  >> Read Url: ' + SinaBlogUrl)
BlogML = urllib2.urlopen(SinaBlogUrl).read()
# Read the blog category menu.
BlogMLHtml = PyQuery(BlogML)('div.menuList').html()
BlogMLHtml = PyQuery(BlogMLHtml)('a')
BlogMLList = {}
for li in BlogMLHtml.items():
    # Skip the "博文收藏" (blog favourites) category.
    if li.text() != u'\u535a\u6587\u6536\u85cf':
        BlogMLList[li.text()] = li.attr('href')
BlogMLList = sorted(BlogMLList.items(), key=lambda d: d[0])
BlogLB = BlogML
# Determine the page count from the pager widget ("共N页" -> N).
BlogLsHtml = PyQuery(BlogLB)('ul.SG_pages').html()
if BlogLsHtml.strip() != '':
    BlogPgHtml = int(
        PyQuery(BlogLsHtml)('span').text().replace(u'共', '').replace(u'页', ''))
else:
    BlogPgHtml = 1
BlogPgHtmlZ = BlogPgHtml
# Determine the total article count from the column header "(N)".
BlogLsHtml = PyQuery(BlogLB)('div.SG_colW73').html()
BlogLsHtml = PyQuery(BlogLsHtml)('div.SG_connHead').html()
BlogLsHtml = PyQuery(BlogLsHtml)('span.title').html()
BlogCtHtml = int(
    PyQuery(BlogLsHtml)('em').text().replace(u'(', '').replace(u')', ''))
BlogCtHtmlZ = BlogCtHtml
BlogMLList2 = {}
BlogCounts = 0
print('  >>  类别数: ' + str(len(BlogMLList)) + ', 总页数:' + str(BlogPgHtmlZ) +
Example #9
0
def parse_detail(title, date, url, content, filename):
    """Convert a crawled article page (html in *content*) into the
    site's abstract + method/steps JSON format.

    Returns (json_string, True) on success, or None when *content* is
    empty or yields no usable <p> paragraphs. *filename* is accepted but
    not used in this function. (Python 2 source: print statements.)
    """
    if not content:
        return None
    jq = PyQuery(content)
    # Fixed metadata attached to every emitted record.
    res_json = {
        'bread': [u'留学'],
        'title': title,
        'date': date,
        'source': u'liuxue86',
        'url': url,
        'class': 36,
        'subject': u'经验',
        'data_weight': 0,
    }
    methods = []
    # Keep only non-blank paragraphs.
    content = [each for each in jq('p').items() if each.text().strip() != '']
    #print PyQuery(content[0]).html()
    print len(content)
    if not content:
        return None
    if len(content) == 0:
        return None
    # State flags: collecting the abstract until the first heading-like
    # paragraph is seen, then collecting method steps.
    flag_abstract = True
    flag_method = False
    flag_first = True
    steps = []
    _list = []
    img = ''
    step_title = ''
    substeps = []
    for each_json in content:
        each = (PyQuery(each_json).text())
        # Stop at trailing boilerplate ("click to view", "source", etc.).
        if u'点击查看' in each or u'原文来源' in each or u'点击此处' in each or u'推荐阅读' in each or u'相关推荐' in each:
            break
        # Paragraphs carrying the site watermark 'ue86.com': keep only the
        # text after a '】' marker, or skip the paragraph when it ends with it.
        if 'ue86.com' in PyQuery(each_json).text():
            if u'】' != PyQuery(each_json).text().strip(
            )[-1] and u'】' in PyQuery(each_json).text():
                each = PyQuery(each_json).text().split('】')[1]
                #print each
            elif u'】' == PyQuery(each_json).text().strip()[-1]:
                #print each
                continue
        #print each[1].decode('utf8')
        #break
        if each == 'None' or each.strip() == '':
            continue
        # More trailing-boilerplate markers that end the article body.
        if u'相关阅读' in each or u'扫一扫' in each or u'相关链接' in each or u'天道提示' in each:
            break
        #print each
        # Heading heuristic: "N、...", bold markup, "XXX:" at index 3, or a
        # trailing '】'. NOTE(review): each.strip()[1] and [-1] are
        # unguarded — a paragraph shorter than 2 chars would raise
        # IndexError; confirm whether inputs guarantee longer text.
        if each.strip()[1] == u'、' or '<strong>' in PyQuery(each_json).html(
        ) or (each.strip()[3] == ':'
              if len(each.strip()) > 3 else False) or each.strip()[-1] == u'】':
            flag_abstract = False
            flag_method = True
        if flag_abstract:
            steps.append(each)
        else:
            # Same heading heuristic: start a new step on each heading,
            # flushing the previous step's accumulated substeps.
            if each.strip()[1] == u'、' or '<strong>' in PyQuery(
                    each_json).html() or (each.strip()[3] == ':'
                                          if len(each.strip()) > 3 else
                                          False) or each.strip()[-1] == u'】':
                #print each.strip()[1]
                if not flag_first:
                    _list.append({
                        'img': img,
                        'title': step_title,
                        'substeps': substeps,
                    })
                    img = ''
                    step_title = '<strong>' + each + '</strong>'
                    substeps = []
                if flag_first:
                    step_title = '<strong>' + each + '</strong>'
                    flag_first = False
            else:
                substeps.append(each)
    # Flush the final (possibly empty) step.
    _list.append({
        'img': img,
        'title': step_title,
        'substeps': substeps,
    })
    if flag_method:
        # Headings were found: steps collected before the first heading
        # become the abstract, _list becomes the method steps.
        methods.append({'title': u'方法/步骤', 'steps': _list})
        abstract = {
            'title': '',
            'steps': steps,
            'img': '',
        }
    else:
        # No headings at all: first paragraph is the abstract, the rest
        # become title-only steps.
        _list1 = []
        for v in steps[1:]:
            _list1.append({
                'img': '',
                'title': v,
                'substeps': '',
            })
        methods.append({'title': u'方法/步骤', 'steps': _list1})
        if len(steps) == 0:
            steps = ['']
        abstract = {
            'title': '',
            'steps': [steps[0]],
            'img': '',
        }
    res_json['methods'] = methods
    res_json['abstract'] = abstract
    #print res_json
    #print methods
    good = True
    print json.dumps(res_json)
    return json.dumps(res_json), good