import hashlib
import json
import traceback

import requests
from lxml import etree


def is_page_false(html):
    # Return True when the page loaded normally, False when the body
    # contains a div with class "error" (Wanfang's error page).
    try:
        page_status = True
        if len(html.xpath('//body//div[@class="error"]')) != 0:
            page_status = False
        return page_status
    except Exception:
        print(traceback.format_exc())
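# The helpers below are used throughout this section but defined elsewhere in
# the project. These are minimal sketches inferred from how they are called:
# `lxml_to_string` serializes an lxml node, `md5` hex-hashes a URL into the
# uuid used for deduplication, and `start` fetches a page into an lxml tree.
def lxml_to_string(element):
    return etree.tostring(element, encoding='unicode')


def md5(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def start(url):
    response = requests.get(url, timeout=10)
    response.encoding = response.apparent_encoding
    return etree.HTML(response.text)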
def parse_analysis(author_id, limit_ls, html, begin_year, end_year, query_code):
    # Fetch the publication-trend chart data for each keyword listed on the
    # author page; the appended empty keyword queries the overall trend.
    try:
        url = 'http://med.wanfangdata.com.cn/Author/GetChartData'
        keywords_ls = html.xpath('//ul[@class="trend-chart-word-content"]/li')
        keywords_ls.append('')
        ls = []
        for item in keywords_ls:
            keyword = '' if item == '' else item.xpath('./label/@for')[0]
            payload = {
                'Id': author_id,
                'QueryCode': query_code,
                'BeginYear': begin_year,
                'EndYear': end_year,
                'Keywords': keyword,
            }
            try:
                response = requests.post(url, payload).text
                keywords_info_ls = json.loads(response)
                label = keywords_info_ls[0]['label']
                data = keywords_info_ls[0]['data']
            except Exception:
                label = keyword
                data = None
            ls.append({'label': label, 'data': data})
        return ls
    except Exception:
        print(traceback.format_exc())
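# Usage sketch for parse_analysis (hedged: the author id, query code, and
# year range below are placeholders, not values taken from this project):
#
#   html = start('http://med.wanfangdata.com.cn/Author/Detail/A000000000')
#   trends = parse_analysis('A000000000', None, html, 2010, 2020, 'A000000000')
#   for trend in trends or []:
#       print(trend['label'], trend['data'])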
def get_url(author_id, url):
    # Crawl an author's analysis page: journals published in, funds,
    # related keywords, and the three trend-chart series.
    try:
        html = start(url)
        page_status = is_page_false(html)
        if page_status:
            source_text = lxml_to_string(html)
            periodical = parse_pub_perio(html)
            fund = parse_fund_relation(html)
            relative_keywords = parse_relative_keywords(html)
            s_ls = parse_analysis_url(author_id, html)
            return [
                periodical, fund, relative_keywords,
                s_ls[0], s_ls[1], s_ls[2], source_text
            ]
        return [None, None, None, None, None, None, None]
    except Exception:
        print(traceback.format_exc())
def get_url(id, url, author_id, obj_type):
    # Crawl an author's cooperation page: affiliation, co-authors, and the
    # organisation / author cooperation relations.
    try:
        html = start(url)
        page_status = is_page_false(html)
        if page_status:
            source_text = lxml_to_string(html)
            author_org_ls = parse_author_info(html)
            cooperation_author = parse_author_cooperation(
                id, url, author_id, html, obj_type)
            cooperation_relation_org = parse_org_cooperation(html)
            cooperation_relation_author = parse_author_cooperation_relation(
                author_id)
            return [
                author_org_ls[0], author_org_ls[1],
                cooperation_relation_org, cooperation_relation_author,
                cooperation_author, source_text
            ]
        return ['', '', None, None, None, None]
    except Exception:
        print(traceback.format_exc())
def get_url(id, author_id, url, obj_type):
    # Crawl an author's detail page: same-name authors, publication counts
    # (all / first-author), and click count.
    try:
        html = start(url)
        page_status = is_page_false(html)
        if page_status:
            source_text = lxml_to_string(html)
            pub_author_all = parse_all_pub_num(html)
            pub_author_first = parse_first_pub_num(html)
            click_author = parse_click_num(html)
            SameName_author = get_next_url(id, author_id, url, html, obj_type)
            return [
                SameName_author, pub_author_all, pub_author_first,
                click_author, source_text
            ]
        return [None, None, None, None, None]
    except Exception:
        print(traceback.format_exc())
def parse_literature_info(id, author_uuid, author_name, author_url, author_id,
                          source_url, html, page, path, last_index):
    # Parse every publication entry on one listing page and persist it,
    # checkpointing progress so an interrupted crawl can resume.
    try:
        literature_ls = html.xpath('//ul[@class="author-list"]/li')
        literature_index = 1
        for item in literature_ls:
            # Skip entries already handled in a previous run.
            if literature_index <= last_index:
                literature_index += 1
                continue
            source_text = lxml_to_string(item)
            if len(item.xpath('./div[@class="author-list-title"]'
                              '/span[@class="title-only"]/text()')) != 0:
                label = item.xpath('./div[@class="author-list-title"]'
                                   '/span[@class="title-only"]/text()')[0]
            else:
                label = ''
            num = item.xpath('./div[@class="author-list-title"]'
                             '/span[@class="num"]/text()')[0].strip('.')
            title = item.xpath('./div[@class="author-list-title"]/a/text()')[0]
            url = item.xpath('./div[@class="author-list-title"]/a/@href')[0]
            uuid = md5(url)
            periodical_type = item.xpath(
                './div[@class="author-list-type"]/b/text()')[0]
            # Co-authors of this publication. Loop-local names keep the
            # entry's uuid and the page author's author_id from being
            # clobbered before the save call below.
            author_ls = item.xpath('./div[@class="author-list-type"]/a')
            ls = []
            for author_item in author_ls:
                a_url = author_item.xpath('./@href')[0]
                ls.append({
                    'uuid': md5(a_url),
                    'name': author_item.xpath('./text()')[0],
                    'author_id': a_url.split('/')[-1],
                    'url': a_url,
                })
            author_info = json.dumps(ls, ensure_ascii=False)
            # Source journal
            periodical = item.xpath(
                './div[@class="author-list-type-info"]/a[1]/text()')[0]
            periodical_url = item.xpath(
                './div[@class="author-list-type-info"]/a[1]/@href')[0]
            periodical_uuid = md5(periodical_url)
            # Issue and issue link
            period = item.xpath(
                './div[@class="author-list-type-info"]/a[2]/text()')[0]
            period_url = item.xpath(
                './div[@class="author-list-type-info"]/a[2]/@href')[0]
            # Page range
            pagination = item.xpath(
                './div[@class="author-list-type-info"]/a[2]/following::text()'
            )[0].strip()
            # Citation count
            cite_num = item.xpath(
                './div[@class="author-list-type-info"]/span[1]/text()'
            )[0].strip()
            # Indexing information (core-journal badges)
            include_info_ls = item.xpath(
                './div[@class="author-list-type-info"]/span[@class="core-img"]')
            if len(include_info_ls) != 0:
                ls = []
                for include_item in include_info_ls:
                    ls.append({
                        'name': include_item.xpath('./text()')[0],
                        'detail': include_item.xpath('./@title')[0],
                    })
                include_info = json.dumps(ls, ensure_ascii=False)
            else:
                include_info = None
            # Abstract: str.strip() removes a character set rather than a
            # literal affix, so trim the wrapper tags explicitly.
            intro = lxml_to_string(
                item.xpath('./div[@class="author-list-main"]')[0]
            ).replace(' ', '').replace('\n', '')
            if intro.startswith('<div class="author-list-main">'):
                intro = intro[len('<div class="author-list-main">'):]
            if intro.endswith('</div>'):
                intro = intro[:-len('</div>')]
            # Keywords
            keywords_ls = item.xpath('./div[@class="author-list-keyword"]/a')
            if len(keywords_ls) != 0:
                ls = []
                for keywords_item in keywords_ls:
                    k_url = keywords_item.xpath('./@href')[0]
                    try:
                        keyword = keywords_item.xpath('./text()')[0]
                    except Exception:
                        keyword = k_url.split('=')[-1].strip('()')
                    ls.append({'keyword': keyword, 'url': k_url})
                keywords = json.dumps(ls, ensure_ascii=False)
            else:
                keywords = None
            # Online reading link
            read_url = ''
            if len(item.xpath(
                    './/div[@class="author-list-operation"]/a[2]/@href')) != 0:
                read_url = item.xpath(
                    './/div[@class="author-list-operation"]/a[2]/@href')[0]
            # Download link
            download_url = ''
            if len(item.xpath(
                    './/div[@class="author-list-operation"]/a[1]/@href')) != 0:
                download_url = item.xpath(
                    './/div[@class="author-list-operation"]/a[1]/@href')[0]
            save_author_relative(id, uuid, source_url, source_text,
                                 author_uuid, author_id, author_url,
                                 author_name, num, title, url, label,
                                 periodical_type, author_info, periodical,
                                 periodical_url, period_url, period,
                                 pagination, cite_num, include_info, intro,
                                 keywords, read_url, download_url)
            record_last(id, page, literature_index, path)
            literature_index += 1
    except Exception:
        print(traceback.format_exc())
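# `save_author_relative` and `record_last` belong to the project's persistence
# layer and are not defined in this section. A minimal sketch of the resume
# checkpoint only, assuming `path` points at a writable JSON file (the database
# writer takes too many project-specific columns to stub usefully here):
def record_last(id, page, literature_index, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({'id': id, 'page': page, 'index': literature_index}, f)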