def set_discontinue_journal(url):
    """Record a journal link that should NOT be crawled again.

    :param url: the journal URL to blacklist.
    :return: None
    """
    manager = name_manager()
    # NOTE: "discontiune" is the spelling used by the storage API.
    manager.save_discontiune_journal(url)
def create_and_save_execel(section, *name):
    """Create an Excel workbook and fill it with the article data queued
    under *section*, then save it to EXECEL_PATH.

    :param section: key used to drain article JSON records from
                    name_manager(); also the default basis for the
                    output file name.
    :param name: optional single positional override — name[0] is used
                 to build the file name instead of section.
    :return: None (the workbook is written to EXECEL_PATH + file name).
    """
    # An explicit name wins over the section-derived one.
    if name:
        execel_name = create_execl_name(name[0])
    else:
        execel_name = create_execl_name(section)
    wb = openpyxl.Workbook()
    sheet = wb.create_sheet("sheet1", 0)
    line = 2  # row 1 is the header written by write_first_line
    write_first_line(sheet)
    while True:
        article_data = name_manager().get_article_data(section)
        if article_data is None:
            break  # queue exhausted
        article = json.loads(article_data)
        # File name without its extension (assumes a 5-char suffix such
        # as ".xlsx" — TODO confirm against create_execl_name), plus a
        # 1-based serial number.
        sheet.cell(line, Row_Name.COLUME_NUM.get(Row_Name.EXCELNAME) + 1, execel_name[:-5])
        sheet.cell(line, Row_Name.COLUME_NUM.get(Row_Name.SERIAL_NUMBER) + 1, line - 1)
        for key in article.keys():
            num = Row_Name.COLUME_NUM.get(key, None)
            if num is not None:
                try:
                    sheet.cell(line, num + 1, article[key])
                except Exception:
                    # Value rejected by openpyxl (e.g. illegal character);
                    # log it and keep going instead of aborting the export.
                    print("错误:", line, num + 1, article[key])
        line += 1
    wb.save(EXECEL_PATH + execel_name)
def write_log(num, file_path):
    """Drain queued error messages and append them to *file_path*.

    :param num: 1 -> journal-level errors, 2 -> article-level errors
                (both pulled from name_manager()).
    :param file_path: log file, opened in append mode with UTF-8.
    :raises ValueError: if num is neither 1 nor 2 (the original code
                        would have raised UnboundLocalError here).
    :return: None
    """
    nm = name_manager()
    # Context manager guarantees the handle is closed (the original
    # opened the file and never closed it).
    with open(file_path, "a+", encoding="utf-8") as file:
        while True:
            if num == 1:
                temp_data = nm.get_journal_error_massage()
            elif num == 2:
                temp_data = nm.get_article_error_massage()
            else:
                raise ValueError("num must be 1 or 2")
            if temp_data is None:
                break  # queue exhausted
            file.write(temp_data + "\n")
def run_article_error(file):
    """Re-run every failed article-level link recorded in an error log.

    Each log line is a JSON array whose element [1] is itself a
    JSON-encoded issue record. Records are deduplicated by
    "<journal>_<volume>_<issue>", then each one is re-fetched through
    its publisher's scraper module and saved.

    :param file: path to the error log file.
    :return: None; results are persisted via each publisher module's
             article.save_data(), and the publisher tally is passed to
             write_data().
    """
    errs = {}
    pubs = {}
    nm = name_manager()  # NOTE(review): created but unused, as in the original
    # The original opened the file and never closed it; 'fh' avoids
    # shadowing the 'file' parameter and the context manager closes it.
    with open(file) as fh:
        for raw_line in fh:
            temp = json.loads(json.loads(raw_line)[1])
            new_dict = {}
            print(temp)
            # ISSN/EISSN are optional; everything else is required.
            if Row_Name.EISSN in temp:
                new_dict[Row_Name.EISSN] = temp[Row_Name.EISSN]
            if Row_Name.ISSN in temp:
                new_dict[Row_Name.ISSN] = temp[Row_Name.ISSN]
            new_dict[Row_Name.JOURNAL_TITLE] = temp[Row_Name.JOURNAL_TITLE]
            new_dict[Row_Name.PUBLISHER] = temp[Row_Name.PUBLISHER]
            new_dict[Row_Name.STRING_COVER_DATE] = temp[Row_Name.STRING_COVER_DATE]
            new_dict[Row_Name.YEAR] = temp[Row_Name.YEAR]
            new_dict[Row_Name.VOLUME] = temp[Row_Name.VOLUME]
            new_dict[Row_Name.ISSUE] = temp[Row_Name.ISSUE]
            new_dict[Row_Name.TEMP_URL] = temp[Row_Name.TEMP_URL]
            # Dedup key: one retry per journal/volume/issue triple.
            string = new_dict[Row_Name.JOURNAL_TITLE] + "_" + new_dict[
                Row_Name.VOLUME] + "_" + new_dict[Row_Name.ISSUE]
            if string not in errs:
                errs[string] = new_dict
    for key in errs.keys():
        pub = errs[key][Row_Name.PUBLISHER]
        if pub not in pubs:
            pubs[pub] = 1  # tally distinct publishers for write_data
        # Import the publisher-specific scraper and re-run this issue.
        # (Re-imported per key in the original flattened source; kept
        # that way — __import__ caches modules, so it is cheap.)
        pyfile = __import__("journals.website." + pub, fromlist=True)
        ac = getattr(pyfile, "article")
        m_get = getattr(ac(), "do_run")
        ais = m_get(errs[key])
        m_save = getattr(ac(), "save_data")
        m_save(ais, errs[key])
    write_data(pubs)
def __init__(self):
    """Initialize an empty method list and a name_manager handle."""
    self.method = []
    self.nm = name_manager()
def __init__(self):
    """Initialize an empty item list and a name_manager handle."""
    self.list = []
    self.nm = name_manager()
def __init__(self, website):
    """Bind the target website identifier and a name_manager handle.

    :param website: identifier of the publisher website to work on.
    """
    self.website = website
    self.nm = name_manager()
def __init__(self, section, items, update):
    """Worker thread bound to one section of items.

    :param section: section key the thread will process.
    :param items: work items assigned to this thread.
    :param update: flag controlling update behaviour (semantics defined
                   by the run() implementation — not visible here).
    """
    threading.Thread.__init__(self)
    self.nm = name_manager()
    self.section = section
    self.items = items
    self.update = update