Example #1
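All of these examples lean on the same unseen project helpers (start(), lxml_to_string(), md5()) plus the json, requests, and traceback modules. The original helper implementations are not shown on this page; the following is a minimal sketch of what they are assumed to do, using lxml and requests, not the original code.

import hashlib
import json
import traceback

import requests
from lxml import html as lxml_html


def start(url):
    # Assumed fetcher: GET the page and parse it into an lxml tree.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return lxml_html.fromstring(response.text)


def lxml_to_string(node):
    # Assumed serializer: render an lxml node back to unicode HTML.
    return lxml_html.tostring(node, encoding='unicode')


def md5(text):
    # Assumed hasher: hex MD5 digest of a UTF-8 string.
    return hashlib.md5(text.encode('utf-8')).hexdigest()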
def is_page_false(html):
    # Returns True when the page rendered normally, False when the site
    # served its error page (a <div class="error"> inside <body>).
    try:
        page_status = True
        if len(html.xpath('//body//div[@class="error"]')) != 0:
            page_status = False
        return page_status
    except Exception:
        print(traceback.format_exc())
        return None
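Despite its name, is_page_false returns True when the page rendered normally and False when the error div is present; the get_url variants below all rely on that polarity.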
def parse_analysis(author_id, limit_ls, html, begin_year, end_year,
                   query_code):
    # Queries Wanfang's GetChartData endpoint once per trend keyword,
    # plus once with an empty keyword for the overall series.
    try:
        url = 'http://med.wanfangdata.com.cn/Author/GetChartData'
        keywords_ls = html.xpath('//ul[@class="trend-chart-word-content"]/li')
        keywords_ls.append('')
        ls = []
        for item in keywords_ls:
            if item == '':
                keyword = ''
            else:
                keyword = item.xpath('./label/@for')[0]

            payload = {
                'Id': author_id,
                'QueryCode': query_code,
                'BeginYear': begin_year,
                'EndYear': end_year,
                'Keywords': keyword
            }
            try:
                response = requests.post(url, payload).text
                keywords_info_ls = json.loads(response)
                label = keywords_info_ls[0]['label']
                data = keywords_info_ls[0]['data']
            except (requests.RequestException, ValueError, IndexError, KeyError):
                # Request failed or the response had an unexpected shape:
                # keep the keyword but record no data for it.
                label = keyword
                data = None
            ls.append({'label': label, 'data': data})

        return ls

    except Exception:
        print(traceback.format_exc())
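parse_analysis returns a list of {'label': ..., 'data': ...} dicts, one per trend keyword plus one for the empty keyword. A small, hypothetical consumer might look like this:

def print_trends(trend_ls):
    # Hypothetical consumer of parse_analysis()'s return value.
    for entry in trend_ls:
        if entry['data'] is None:
            print('%s: no chart data returned' % entry['label'])
        else:
            print('%s: %s' % (entry['label'], entry['data']))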
def get_url(author_id, url):
    # Fetches an author page and, when it rendered normally, extracts
    # journal, fund, related-keyword, and trend-analysis data from it.
    try:
        html = start(url)
        page_status = is_page_false(html)
        if page_status:
            source_text = lxml_to_string(html)
            periodical = parse_pub_perio(html)
            fund = parse_fund_relation(html)
            relative_keywords = parse_relative_keywords(html)
            s_ls = parse_analysis_url(author_id, html)
            return [
                periodical, fund, relative_keywords, s_ls[0], s_ls[1], s_ls[2],
                source_text
            ]
        else:
            return [None, None, None, None, None, None, None]
    except Exception:
        print(traceback.format_exc())
Example #4
def get_url(id, url, author_id, obj_type):
    # Fetches an author page and extracts affiliation info plus the
    # author- and organisation-cooperation relations.
    try:
        html = start(url)
        page_status = is_page_false(html)
        if page_status:
            source_text = lxml_to_string(html)
            author_org_ls = parse_author_info(html)
            cooperation_author = parse_author_cooperation(
                id, url, author_id, html, obj_type)
            cooperation_relation_org = parse_org_cooperation(html)
            cooperation_relation_author = parse_author_cooperation_relation(
                author_id)
            return [
                author_org_ls[0], author_org_ls[1], cooperation_relation_org,
                cooperation_relation_author, cooperation_author, source_text
            ]
        else:
            return ['', '', None, None, None, None]
    except Exception:
        print(traceback.format_exc())
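Note the mismatched failure sentinels: this variant returns ['', '', None, None, None, None] for an error page, while the Example #1 variant returns all Nones, so callers have to special-case each shape.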
Example #5
def get_url(id, author_id, url, obj_type):
    # Fetches an author page and extracts publication counts, click
    # counts, and the same-name author candidates.
    try:
        html = start(url)
        page_status = is_page_false(html)
        if page_status:
            source_text = lxml_to_string(html)
            pub_author_all = parse_all_pub_num(html)
            pub_author_first = parse_first_pub_num(html)
            click_author = parse_click_num(html)
            SameName_author = get_next_url(id, author_id, url, html, obj_type)
            return [
                SameName_author, pub_author_all, pub_author_first,
                click_author, source_text
            ]
        else:
            return [None, None, None, None, None]

    except Exception:
        print(traceback.format_exc())
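All three get_url variants return positional lists, so callers must remember what each index means. If you adapt this code, a namedtuple wrapper (a sketch for the Example #5 variant, with made-up field names) keeps call sites self-describing:

from collections import namedtuple

# Hypothetical wrapper; field names mirror the locals in get_url above.
AuthorStats = namedtuple(
    'AuthorStats',
    ['same_name_author', 'pub_all', 'pub_first', 'clicks', 'source_text'])


def get_author_stats(id, author_id, url, obj_type):
    result = get_url(id, author_id, url, obj_type)
    # get_url returns None when it swallowed an exception.
    return AuthorStats(*result) if result is not None else None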
def parse_literature_info(id, author_uuid, author_name, author_url, author_id,
                          source_url, html, page, path, last_index):
    try:
        literature_ls = html.xpath('//ul[@class="author-list"]/li')
        literature_index = 1
        for item in literature_ls:
            # Resume support: skip entries already saved on a previous
            # run; last_index is the position written by record_last().
            if literature_index <= last_index:
                literature_index += 1
                continue
            source_text = lxml_to_string(item)
            # Optional label shown beside some titles.
            label_ls = item.xpath(
                './div[@class="author-list-title"]/span[@class="title-only"]/text()'
            )
            label = label_ls[0] if len(label_ls) != 0 else ''
            num = item.xpath(
                './div[@class="author-list-title"]/span[@class="num"]/text()'
            )[0].strip('.')
            title = item.xpath('./div[@class="author-list-title"]/a/text()')[0]
            url = item.xpath('./div[@class="author-list-title"]/a/@href')[0]
            uuid = md5(url)
            periodical_type = item.xpath(
                './div[@class="author-list-type"]/b/text()')[0]
            author_ls = item.xpath('./div[@class="author-list-type"]/a')
            ls = []
            for author_item in author_ls:
                author_dic = {}
                a_url = author_item.xpath('./@href')[0]
                name = author_item.xpath('./text()')[0]
                # Use loop-local names so the literature-level `uuid` and
                # the `author_id` parameter (both passed to
                # save_author_relative() below) are not overwritten.
                a_uuid = md5(a_url)
                a_id = a_url.split('/')[-1]
                author_dic['uuid'] = a_uuid
                author_dic['name'] = name
                author_dic['author_id'] = a_id
                author_dic['url'] = a_url
                ls.append(author_dic)
            author_info = json.dumps(ls, ensure_ascii=False)
            # Journal containing the article
            periodical = item.xpath(
                './div[@class="author-list-type-info"]/a[1]/text()')[0]
            periodical_url = item.xpath(
                './div[@class="author-list-type-info"]/a[1]/@href')[0]
            periodical_uuid = md5(periodical_url)
            # Issue number and link to the issue page
            period = item.xpath(
                './div[@class="author-list-type-info"]/a[2]/text()')[0]
            period_url = item.xpath(
                './div[@class="author-list-type-info"]/a[2]/@href')[0]
            # Page range
            pagination = item.xpath(
                './div[@class="author-list-type-info"]/a[2]/following::text()'
            )[0].strip()
            # Citation count
            cite_num = item.xpath(
                './div[@class="author-list-type-info"]/span[1]/text()'
            )[0].strip()
            # Indexing information
            include_info_ls = item.xpath(
                './div[@class="author-list-type-info"]/span[@class="core-img"]'
            )
            ls = []
            if len(include_info_ls) != 0:
                for include_item in include_info_ls:
                    include_dic = {}
                    include_name = include_item.xpath('./text()')[0]
                    detail = include_item.xpath('./@title')[0]
                    include_dic['name'] = include_name
                    include_dic['detail'] = detail
                    ls.append(include_dic)
                include_info = json.dumps(ls, ensure_ascii=False)
            else:
                include_info = None
            # Abstract: serialize the summary div, drop carriage returns
            # and newlines, then peel off the exact wrapping tags (a
            # character-set strip() would eat into the text itself).
            intro = lxml_to_string(
                item.xpath('./div[@class="author-list-main"]')[0]).replace(
                    '&#13;', '').replace('\n', '').strip()
            if intro.startswith('<div class="author-list-main">'):
                intro = intro[len('<div class="author-list-main">'):]
            if intro.endswith('</div>'):
                intro = intro[:-len('</div>')]
            # Keywords
            keywords_ls = item.xpath('./div[@class="author-list-keyword"]/a')
            if len(keywords_ls) != 0:
                ls = []
                for keywords_item in keywords_ls:
                    keyword_dic = {}
                    k_url = keywords_item.xpath('./@href')[0]
                    try:
                        keyword = keywords_item.xpath('./text()')[0]
                    except IndexError:
                        # Some keyword links carry no text node; recover
                        # the term from the query string instead.
                        keyword = k_url.split('=')[-1].strip('()')
                    keyword_dic['keyword'] = keyword
                    keyword_dic['url'] = k_url
                    ls.append(keyword_dic)
                keywords = json.dumps(ls, ensure_ascii=False)
            else:
                keywords = None
            # Online-reading link (optional)
            read_url_ls = item.xpath(
                './/div[@class="author-list-operation"]/a[2]/@href')
            read_url = read_url_ls[0] if len(read_url_ls) != 0 else ''
            # Download link (optional)
            download_url_ls = item.xpath(
                './/div[@class="author-list-operation"]/a[1]/@href')
            download_url = download_url_ls[0] if len(download_url_ls) != 0 else ''

            # Persist the entry, then checkpoint the position so an
            # interrupted run can resume from it via last_index.
            save_author_relative(id, uuid, source_url, source_text,
                                 author_uuid, author_id, author_url,
                                 author_name, num, title, url, label,
                                 periodical_type, author_info, periodical,
                                 periodical_url, period_url, period,
                                 pagination, cite_num, include_info, intro,
                                 keywords, read_url, download_url)

            record_last(id, page, literature_index, path)
            literature_index += 1

    except Exception:
        print(traceback.format_exc())
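parse_literature_info pairs with record_last to make long scrapes restartable: after every saved item it checkpoints (page, index), and on the next run entries up to last_index are skipped. record_last and its reader are not shown on this page; the sketch below is one plausible file-based contract for them, purely an assumption, not the original code.

import json
import os


def record_last(id, page, literature_index, path):
    # Assumed checkpoint writer: remember the last position saved.
    with open(path, 'w') as f:
        json.dump({'id': id, 'page': page, 'index': literature_index}, f)


def read_last(path):
    # Assumed checkpoint reader: returns (page, last_index) to resume
    # from, or (1, 0) when no checkpoint exists yet.
    if not os.path.exists(path):
        return 1, 0
    with open(path) as f:
        state = json.load(f)
    return state['page'], state['index']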