Example #1
    def read_items(self):
        self.create()
        eb_list = []
        # Skip the header row: data rows run from 1 to nrows - 1.
        for row in range(self.r_sheet.nrows - 1):
            eb = name_manager.execl_bean()
            eb.row_num = row + 1
            eb.sourcename = self.r_sheet.cell(eb.row_num, self.nums[0]).value
            issn = self.r_sheet.cell(eb.row_num, self.nums[1]).value
            eissn = self.r_sheet.cell(eb.row_num, self.nums[2]).value
            if issn == "":
                eb.eissn = eissn
            elif (eissn == ""):
                eb.eissn = issn
            else:
                eb.eissn = issn + "-" + eissn
            eb.waibuaid = self.r_sheet.cell(eb.row_num, self.nums[3]).value
            eb.pinjie = self.r_sheet.cell(eb.row_num, self.nums[4]).value
            eb.full_url = self.r_sheet.cell(eb.row_num, self.nums[5]).value
            eb.abs_url = self.r_sheet.cell(eb.row_num, self.nums[6]).value
            eb.full_path = self.r_sheet.cell(eb.row_num, self.nums[7]).value
            # The optional page-count column only exists in some sheets.
            if len(self.list) > self.nums[7] + 1:
                page_num = self.r_sheet.cell(eb.row_num,
                                             self.nums[7] + 1).value
                if page_num:
                    eb.page = int(page_num)

            eb.check()
            eb_list.append(eb)
        return eb_list
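
The ISSN/eISSN merge above is a small rule worth isolating: keep whichever identifier is present, and join both with a hyphen when both exist. A standalone sketch of the same logic (the helper name is hypothetical):

def combine_issn(issn: str, eissn: str) -> str:
    """Keep whichever identifier is present; join both when available."""
    if issn == "":
        return eissn
    if eissn == "":
        return issn
    return issn + "-" + eissn


assert combine_issn("", "2049-3630") == "2049-3630"
assert combine_issn("0317-8471", "") == "0317-8471"
assert combine_issn("0317-8471", "2049-3630") == "0317-8471-2049-3630"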
Example #2
 def back_file_to_excel(self, back_file_path):
     # Replay a backup file: each line is one serialized execl_bean record.
     with open(back_file_path, "r") as back_file:
         for line in back_file:
             eb = name_manager.execl_bean()
             eb.paser(line)
             self.excel_write(eb)
     self.wb.save(self.file_path)
Example #3
    def run(self):
        logger.info("URL_THREAD - " + self.name + " - " + self.sourcename +
                    " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                self.um.set_done(self.sourcename, self.step)
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url_dict = {}
            if eb.sourcename == "PMC":
                if eb.waibuaid != "":
                    url_dict[EXCEL_ITEM.WAIBUAID] = (
                        "https://www.ncbi.nlm.nih.gov/pmc/articles/" +
                        eb.waibuaid)
            if eb.abs_url != "":
                url_dict[EXCEL_ITEM.ABS_URL] = eb.abs_url
            if eb.full_url != "":
                url_dict[EXCEL_ITEM.FULL_URL] = eb.full_url
            if eb.pinjie != "":
                url_dict[EXCEL_ITEM.PINJIE] = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            html_ = htmls.HTML(eb,
                               jcb,
                               self.tm,
                               self.sourcename,
                               test_file=self.create_test_file_path())
            try:
                logger.info("URL_THREAD - " + self.name + " - " +
                            self.sourcename + " get download url form: " +
                            str(url_dict))
                url, full_url = parser_url(url_dict,
                                           html_,
                                           name=self.name + " - " +
                                           self.sourcename)
            except Exception:
                logger.error(self.sourcename + " download url has err!url列表:" +
                             str(url_dict),
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ".retry次数超过5次,不再重试。")
                    eb.err_and_step = str(self.step) + ":请求下载url错误超过五次"
                    self.um.save(eb, self.err_step)
                continue

            eb.full_url = full_url
            eb.abs_url = url
            # eb.full_url = eb.pinjie
            # eb.abs_url = eb.pinjie
            self.um.save(eb, self.step)
        logger.info("URL_THREAD - " + self.name + " - " + self.sourcename +
                    " download_url finsh.")
Example #4
    def run(self):
        logger.info(self.sourcename + " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url = ""
            if eb.sourcename == "PMC":
                url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + eb.waibuaid
            else:
                url = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            file_path = self.creat_filename()
            try:
                html_ = htmls.HTML(eb, jcb, self.tm, self.sourcename)
                if eb.full_url == "":
                    # logger.info("URL_THREAD - "+self.name+" - "+self.sourcename+" get download url form: "+url)
                    print("+++++++++++++++++")
                    print(eb.full_url)
                    full_url = html_.run(url)
                else:
                    print("=====================", eb.full_url)
                    if html_.test_full_url(eb.full_url):
                        full_url = eb.full_url
                    else:
                        full_url = html_.run(eb.full_url)
                print("下载pdf...", full_url)
                htmls.download(full_url, file_path)
                eb.page = htmls.checkpdf(file_path)
                print("下载成功")
            except NoConfError:
                logger.info(eb.eissn + " no usable conf.")
                eb.err_and_step = str(self.url_step) + ": no usable conf"
                self.um.save(eb, self.err_step)
            except Exception:
                logger.error(self.sourcename + " download url " + url +
                             " has err",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue

            eb.full_url = full_url
            eb.abs_url = url
            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
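
The last three lines of the loop keep only the final directory and file name of the downloaded PDF. A portable equivalent using pathlib (a sketch; relative_full_path is a hypothetical name):

from pathlib import Path


def relative_full_path(file_path: str) -> str:
    """'a/b/c/doc.pdf' -> 'c/doc.pdf': keep the last directory and file name."""
    p = Path(file_path)
    return p.parent.name + "/" + p.name


assert relative_full_path("data/pdfs/2019/doc.pdf") == "2019/doc.pdf"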
Example #5
    def run(self):
        logger.info(self.sourcename + " start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break

            file_path = self.creat_filename()
            eb = nm.execl_bean()
            eb.paser(string)
            url_dict = {}

            if eb.abs_url != "":
                url_dict[EXCEL_ITEM.ABS_URL] = eb.abs_url
            if eb.full_url != "":
                url_dict[EXCEL_ITEM.FULL_URL] = eb.full_url
            if eb.pinjie != "":
                url_dict[EXCEL_ITEM.PINJIE] = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            html_ = htmls.HTML(eb, jcb, self.tm, self.sourcename)
            try:
                logger.info("URL_THREAD - " + self.name + " - " +
                            self.sourcename + " get download url form: " +
                            str(url_dict))
                url, full_url = parser_url(url_dict, html_)
                htmls.download(full_url, file_path)
                eb.page = htmls.checkpdf(file_path)
            except NoConfError:
                logger.info(self.sourcename + "-" + eb.eissn +
                            " no usable conf.")
                continue
            except Exception:
                logger.error(self.sourcename + " download url " +
                             str(url_dict) + " has err",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue

            logger.info("URL_THREAD - " + self.name + " - " + self.sourcename +
                        " 下载成功!")
            eb.full_url = full_url
            eb.abs_url = url

            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
Example #6
    def run(self):
        logger.info("PDF_THREAD - " + self.name + " - " + self.sourcename +
                    " download start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                if self.um.get_done(self.sourcename,
                                    self.step - 1) == self.um.DONE:
                    self.um.set_done(self.sourcename, self.step)
                    break
                else:
                    logger.info("PDF_THREAD - " + self.name + " - " +
                                self.sourcename + " wait for download...")
                    time.sleep(30)
                    continue
            eb = nm.execl_bean()
            eb.paser(string)
            file_path = self.creat_filename()

            try:
                logger.info("PDF_THREAD - " + self.name + " - " +
                            self.sourcename + " : download pdf.download url:" +
                            eb.full_url + " 下载页面链接:" + eb.pinjie)
                htmls.download(eb.full_url, file_path)
                logger.info("PDF_THREAD - " + self.name + " - " +
                            self.sourcename + " :check pdf. pdf path:" +
                            file_path)
                eb.page = htmls.checkpdf(file_path)
            except Exception:
                logger.error(self.sourcename + " download " + eb.full_url +
                             " has err",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) + "retry次数超过5次,不再重试。")
                    eb.err_and_step = str(self.step) + ":下载pdf错误超过五次"
                    self.um.save(eb, self.err_step)
                continue

            logger.info("PDF_THREAD - " + self.name + " - " + self.sourcename +
                        " :pdf下载成功。")
            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.step)
        logger.info("URL_THREAD - " + self.name + " - " + self.sourcename +
                    " download finsh.")
Example #7
 def write(self):
     logger.info("Writing to Excel...")
     # Mirror every record to a plain-text backup alongside the workbook.
     with open(self.file_path.replace(".xls", "_back.txt"), "w+") as back_file:
         for sn in self.um.get_sourcenames():
             while True:
                 url_name = self.um.fix(sn, self.write_step)
                 string = self.um.get_eb(url_name)
                 if string is None:
                     break
                 back_file.write(string + "\n")
                 eb = name_manager.execl_bean()
                 eb.paser(string)
                 self.excel_write(eb)
     self.wb.save(self.file_path)
     logger.info("Excel write complete.")
Example #8
    def read(self, file_path, sourcename="doaj", issn="d"):
        logger.info("设定sourcename为:" + sourcename + ",开始读取文件...")
        self.um.save_sourcenames(sourcename)
        with open(file_path, "r", encoding="utf-8") as f:
            for line_index, line in enumerate(f):
                line = line.strip()
                if line == "":
                    continue
                eb = name_manager.execl_bean()
                eb.sourcename = sourcename
                eb.eissn = issn + str(line_index)
                eb.pinjie = line
                eb.row_num = -1

                self.um.save(eb, self.step)
Example #9
 def write(self, file_path):
     logger.info("Writing file...")
     # write_file gets the exported records; back_file mirrors the raw strings.
     with open(file_path, "w+", encoding="utf-8") as write_file, \
          open(file_path.replace(".txt", "_back.txt"), "w+",
               encoding="utf-8") as back_file:
         for sn in self.um.get_sourcenames():
             while True:
                 url_name = self.um.fix(sn, self.write_step)
                 string = self.um.get_eb(url_name)
                 if string is None:
                     break
                 back_file.write(string + "\n")
                 eb = name_manager.execl_bean()
                 eb.paser(string)
                 write_file.write(eb.pinjie + "##" + eb.full_url + "##" +
                                  eb.full_path + "\n")
Example #10
    def run(self):
        logger.info(self.sourcename + " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url = eb.pinjie
            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            file_path = self.creat_filename()

            try:
                d_url = self.get_d_url(url)

                logger.info(self.sourcename + " get download url form: " +
                            d_url)
                htmls.download(d_url, file_path)
                eb.page = htmls.checkpdf(file_path)
            except NoConfError:
                logger.info(eb.eissn + " no usable conf.")
                eb.err_and_step = str(self.url_step) + ": no usable conf"
                self.um.save(eb, self.err_step)
            except Exception:
                logger.error(self.sourcename + " download url " + url +
                             " has err",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue
            eb.full_url = d_url
            eb.abs_url = url
            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
Example #11
    def run(self):
        logger.info(self.sourcename + " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            file_path = self.creat_filename()
            try:
                # time.sleep(random.random() * 3 + 1)
                logger.info(self.sourcename + " start downloading: " + url)

                r = requests.get(url)
                # The PDF endpoint expects these session cookies echoed back;
                # default to "" so the cookie dict below never hits a NameError.
                c1 = c2 = c3 = ""
                try:
                    c1 = r.cookies['BIGipServerlbapp_tc3']
                    c2 = r.cookies['BIGipServerwww.osti.gov_pool']
                    c3 = r.cookies['JSESSIONID']
                except KeyError:
                    pass
                soup = BeautifulSoup(r.text, "html.parser")

                mate = soup.find("meta", {"name": "citation_pdf_url"})
                if mate == None:
                    start_break = False
                    for div1 in soup.find_all("div",
                                              class_="biblio-secondary-group"):
                        for div2 in div1.find_all(
                                "div", class_="biblio-secondary-item small"):
                            for a in div2.find_all("a"):
                                if "href" in a.attrs.keys():
                                    if "https://doi.org" in a["href"]:
                                        pdf_url = a["href"]
                                        cp = htmls.config_parser()
                                        ht = htmls.HTML(None, None, None, None)
                                        for conf in cp.get_all_conf():
                                            if ht.test(conf, pdf_url):
                                                result = ht.do_run(
                                                    conf, pdf_url)
                                                r2 = requests.get(result)
                                                r2.encoding = 'utf-8'
                                                with open(file_path,
                                                          "wb") as f:
                                                    f.write(r2.content)
                                                break

                                        start_break = True
                                        break
                            if start_break:
                                break
                        if start_break:
                            break

                else:
                    pdf_url = mate["content"]
                    cookies = {
                        'BIGipServerlbapp_tc3': c1,
                        'BIGipServerwww.osti.gov_pool': c2,
                        'JSESSIONID': c3,
                        '__utma':
                        '249692800.1749221367.1564467097.1564467097.1564467097.1',
                        '__utmc': '249692800',
                        '__utmz':
                        '249692800.1564467097.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
                        '_ga': 'GA1.2.1749221367.1564467097',
                        '_gid': 'GA1.2.298248318.1564467099',
                        '__utmt': '1',
                        '__utmb': '249692800.63.10.1564467097'
                    }

                    r2 = requests.get(pdf_url, cookies=cookies)
                    r2.encoding = 'utf-8'
                    with open(file_path, "wb") as f:
                        f.write(r2.content)
                eb.page = htmls.checkpdf(file_path)
                full_url = pdf_url

            except NoConfError:
                logger.info(eb.eissn + " 无可用的conf.")
                eb.err_and_step = str(self.url_step) + ":  无可用的conf"
                self.um.save(eb, self.err_step)
            except Exception:
                logger.error(self.sourcename + " download url " + url +
                             " has err",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue

            eb.full_url = full_url
            eb.abs_url = url
            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
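
The manual cookie plumbing above (capturing the BIGipServer and JSESSIONID values from the landing page and echoing them back on the PDF request) is what requests.Session does automatically. A minimal sketch of the same two-step fetch with a session; fetch_pdf and its arguments are hypothetical:

import requests


def fetch_pdf(page_url: str, pdf_url: str, out_path: str) -> None:
    """GET the landing page first so its Set-Cookie values (load balancer,
    JSESSIONID) are stored and sent along on the follow-up PDF request."""
    with requests.Session() as s:
        s.get(page_url)  # session keeps whatever cookies the server sets
        r = s.get(pdf_url)
        r.raise_for_status()
        with open(out_path, "wb") as f:
            f.write(r.content)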