class EconPapers:
    """Fetch and parse content from the EconPapers website.

    :param str journal: journal name; forwarded to the literature-page parser
    :param str journal_web: URL of the journal's home page; used as the
        prefix for every literature page address
    """

    def __init__(self, journal=None, journal_web=None):
        self.journal_web = journal_web
        # NOTE(review): SiteScraper is defined elsewhere; presumably it crawls
        # the links reachable from ``journal_web`` -- confirm.
        self.scraper = SiteScraper(journal_web)
        # Journal name
        self.journal = journal
        # Collection of literature page URLs (None until
        # to_literature_websites() has been called)
        self.literature_websites = None
        # List of literature information records
        self.literature_info = list()

    def to_literature_websites(self, condition=None, filter=None):
        """Build the list of literature page URLs.

        Asks the scraper for links, optionally keeps only the pages whose
        path matches ``filter``, and stores the result (each page prefixed
        with ``journal_web``) in ``self.literature_websites``.

        :param str condition: selection condition handed to the scraper's
            ``get_links``
        :param str filter: regular expression; only pages for which
            ``re.search`` finds a match are kept. ``None`` keeps every page.
        :return: None
        """
        self.scraper.get_links(page_url="", condition=condition)
        pages = self.scraper.pages
        if filter is not None:
            # Compile once instead of looking the pattern up for every page.
            pattern = re.compile(filter)
            pages = [page for page in pages if pattern.search(page) is not None]
        self.literature_websites = [self.journal_web + page for page in pages]

    def get_literature_info(self, websites=None):
        """Collect literature information from the given pages.

        Each page is parsed with ``EconPapersLitPageParser`` and the parser's
        ``literature_info`` is appended to ``self.literature_info``. The
        zero-based index of the page being processed is printed as a simple
        progress indicator.

        :param str,list websites: one page address or a list of addresses;
            defaults to ``self.literature_websites``. If both are ``None``
            the iteration raises ``TypeError``, so build the list with
            ``to_literature_websites`` first.
        :return: None
        """
        if websites is None:
            websites = self.literature_websites
        if isinstance(websites, str):
            # Accept a single address as a convenience.
            websites = [websites]

        for index, web in enumerate(websites):
            print(index)
            econ_parser = EconPapersLitPageParser(page=web, journal=self.journal)
            self.literature_info.append(econ_parser.literature_info)

    def export_literature_websites(self, file):
        """Export the literature page URLs as JSON.

        :param str file: name of the output file (overwritten if it exists)
        :return: None
        """
        # Use a context manager so the handle is flushed and closed
        # deterministically instead of being left to the garbage collector.
        with open(file, 'w') as fp:
            json.dump(self.literature_websites, fp=fp)
def __init__(self, journal=None, journal_web=None):
    """Initialise the scraper state for one journal.

    NOTE(review): this looks like a stray duplicate of
    ``EconPapers.__init__`` sitting outside the class -- confirm whether it
    is still needed.

    :param str journal: journal name
    :param str journal_web: URL of the journal's home page
    """
    self.journal_web = journal_web
    self.scraper = SiteScraper(journal_web)

    # Journal name
    self.journal = journal

    # Collection of web pages
    self.literature_websites = None

    # List of literature information
    self.literature_info = list()