Example #1
    def run(self, url):
        if self.load(self.jcb):
            logger.info(self.sn + " load success!")
            return self.do_run(self.jcb.conf, url)
        else:
            logger.info(self.sn + " load failed, testing default confs...")
            confs = self.tm.get_default(self.jcb)
            for conf in confs:
                new_jcb = name_manager.json_conf_bean(self.jcb.get_sourcename(), self.eb.eissn)
                new_jcb.paser(conf)
                if self.test(new_jcb.conf, url):
                    logger.info(self.sn + " found a conf from default!")
                    self.tm.save(new_jcb)
                    return self.do_run(new_jcb.conf, url)

            logger.info(self.sn + " testing common confs...")
            confs = self.tm.get_common_conf()
            for conf in confs:
                new_jcb = name_manager.json_conf_bean(self.jcb.get_sourcename(), self.eb.eissn)
                new_jcb.paser(conf)
                if self.test(new_jcb.conf, url):
                    logger.info(self.sn + " found a conf from common!")
                    self.tm.save(new_jcb)
                    return self.do_run(new_jcb.conf, url)
            logger.error("There is no conf available!")
            raise NoConfError("no conf")
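
These examples raise and catch a NoConfError that is never defined in any snippet. A minimal sketch of what it presumably looks like; only the name comes from the code above, the body is an assumption:

class NoConfError(Exception):
    """Raised when no working conf can be found for a source (assumed)."""
    pass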
Example #2
    def read_backup(self):
        # Each backup line is "#"-separated: sourcename#eissn#conf.
        with open(self.backup_file, encoding="utf-8") as file:
            for line in file:
                args = line.rstrip("\n").split("#")
                jcb = name_manager.json_conf_bean(args[0], args[1])
                jcb.paser(args[2])
                self.tm.save(jcb)
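
read_backup implies that each backup line is "#"-separated: sourcename#eissn#conf. A hypothetical write counterpart under that assumption; write_backup, jcb.dump(), and the jcb.eissn attribute are invented names, not part of the original code:

    def write_backup(self, jcb):
        # Append one record per conf; jcb.dump() stands in for whatever
        # serializer produces the string that jcb.paser() later consumes.
        with open(self.backup_file, "a", encoding="utf-8") as f:
            f.write(jcb.get_sourcename() + "#" + jcb.eissn + "#" + jcb.dump() + "\n")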
Example #3
    def run(self):
        logger.info("URL_THREAD - " + self.name + " - " + self.sourcename +
                    " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                self.um.set_done(self.sourcename, self.step)
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url_dict = {}
            if eb.sourcename == "PMC":
                if eb.waibuaid != "":
                    url_dict[
                        EXCEL_ITEM.
                        WAIBUAID] = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + eb.waibuaid
            if eb.abs_url != "":
                url_dict[EXCEL_ITEM.ABS_URL] = eb.abs_url
            if eb.full_url != "":
                url_dict[EXCEL_ITEM.FULL_URL] = eb.full_url
            if eb.pinjie != "":
                url_dict[EXCEL_ITEM.PINJIE] = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            html_ = htmls.HTML(eb,
                               jcb,
                               self.tm,
                               self.sourcename,
                               test_file=self.create_test_file_path())
            try:
                logger.info("URL_THREAD - " + self.name + " - " +
                            self.sourcename + " get download url form: " +
                            str(url_dict))
                url, full_url = parser_url(url_dict,
                                           html_,
                                           name=self.name + " - " +
                                           self.sourcename)
            except Exception:
                logger.error(self.sourcename + " download url has an error! url list: " +
                             str(url_dict),
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ".retry次数超过5次,不再重试。")
                    eb.err_and_step = str(self.step) + ":请求下载url错误超过五次"
                    self.um.save(eb, self.err_step)
                continue

            eb.full_url = full_url
            eb.abs_url = url
            self.um.save(eb, self.step)
        logger.info("URL_THREAD - " + self.name + " - " + self.sourcename +
                    " download_url finsh.")
Example #4
    def run(self):
        logger.info(self.sourcename + " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url = ""
            if eb.sourcename == "PMC":
                url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + eb.waibuaid
            else:
                url = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            file_path = self.creat_filename()
            try:
                html_ = htmls.HTML(eb, jcb, self.tm, self.sourcename)
                if eb.full_url == "":
                    # logger.info("URL_THREAD - "+self.name+" - "+self.sourcename+" get download url form: "+url)
                    print("+++++++++++++++++")
                    print(eb.full_url)
                    full_url = html_.run(url)
                else:
                    print("=====================", eb.full_url)
                    if html_.test_full_url(eb.full_url):
                        full_url = eb.full_url
                    else:
                        full_url = html_.run(eb.full_url)
                print("下载pdf...", full_url)
                htmls.download(full_url, file_path)
                eb.page = htmls.checkpdf(file_path)
                print("下载成功")
            except NoConfError:
                logger.info(eb.eissn + " no conf available.")
                eb.err_and_step = str(self.url_step) + ": no conf available"
                self.um.save(eb, self.err_step)
            except Exception:
                logger.error(self.sourcename + " download url " + url +
                             " has an error",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue

            eb.full_url = full_url
            eb.abs_url = url
            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
Example #5
    def paser(self):
        self.conf.read(self.file_name, encoding="utf-8")
        sections = self.conf.sections()
        for section in sections:
            # Section names are "<sourcename>_<eissn>" or "common_<n>".
            rne = section.split("_")
            if rne[0] == "common":
                self.tm.save_common_conf(self.conf.items(section))
            else:
                jcb = name_manager.json_conf_bean(rne[0], rne[1])
                jcb.set_conf(self.conf.items(section))
                self.tm.save(jcb)
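
paser() implies an ini layout in which a section name is either "common_<n>" for defaults shared by all sources or "<sourcename>_<eissn>" for one journal. A self-contained illustration of that split; the section contents are made up:

import configparser

conf = configparser.ConfigParser()
conf.read_string("""
[common_1]
timeout = 30

[PMC_1234-5678]
pdf_selector = citation_pdf_url
""")
for section in conf.sections():
    rne = section.split("_")
    if rne[0] == "common":
        print("common conf:", dict(conf.items(section)))
    else:
        print("journal conf:", rne[0], rne[1], dict(conf.items(section)))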
Example #6
    def run(self):
        logger.info(self.sourcename + " start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break

            file_path = self.creat_filename()
            eb = nm.execl_bean()
            eb.paser(string)
            url_dict = {}

            if eb.abs_url != "":
                url_dict[EXCEL_ITEM.ABS_URL] = eb.abs_url
            if eb.full_url != "":
                url_dict[EXCEL_ITEM.FULL_URL] = eb.full_url
            if eb.pinjie != "":
                url_dict[EXCEL_ITEM.PINJIE] = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            html_ = htmls.HTML(eb, jcb, self.tm, self.sourcename)
            try:
                logger.info("URL_THREAD - " + self.name + " - " +
                            self.sourcename + " get download url form: " +
                            str(url_dict))
                url, full_url = parser_url(url_dict, html_)
                htmls.download(full_url, file_path)
                eb.page = htmls.checkpdf(file_path)
            except NoConfError:
                logger.info(self.sourcename + "-" + eb.eissn + " 无可用的conf.")
                continue
            except Exception:
                logger.error(self.sourcename + " download url " +
                             str(url_dict) + " has an error",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue

            logger.info("URL_THREAD - " + self.name + " - " + self.sourcename +
                        " 下载成功!")
            eb.full_url = full_url
            eb.abs_url = url

            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
Example #7
    def run(self):
        logger.info(self.sourcename + " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url = eb.pinjie
            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            file_path = self.creat_filename()

            try:
                d_url = self.get_d_url(url)

                logger.info(self.sourcename + " get download url from: " +
                            d_url)
                htmls.download(d_url, file_path)
                eb.page = htmls.checkpdf(file_path)
            except NoConfError:
                logger.info(eb.eissn + " no conf available.")
                eb.err_and_step = str(self.url_step) + ": no conf available"
                self.um.save(eb, self.err_step)
            except Exception:
                logger.error(self.sourcename + " download url " + url +
                             " has an error",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue
            eb.full_url = d_url
            eb.abs_url = url
            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
Example #8
    def run(self):
        logger.info(self.sourcename + " download_url start...")
        while True:
            string = self.um.get_eb(self.url_set_name)
            if string is None:
                break
            eb = nm.execl_bean()
            eb.paser(string)
            url = eb.pinjie

            jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
            file_path = self.creat_filename()
            try:
                logger.info(self.sourcename + " start downloading: " + url)

                r = requests.get(url)
                # The session cookies are reused later when fetching the PDF;
                # default to None in case the server did not set them.
                c1 = c2 = c3 = None
                try:
                    c1 = r.cookies['BIGipServerlbapp_tc3']
                    c2 = r.cookies['BIGipServerwww.osti.gov_pool']
                    c3 = r.cookies['JSESSIONID']
                except KeyError:
                    pass
                soup = BeautifulSoup(r.text, "html.parser")

                pdf_url = None
                meta = soup.find("meta", {"name": "citation_pdf_url"})
                if meta is None:
                    start_break = False
                    for div1 in soup.find_all("div",
                                              class_="biblio-secondary-group"):
                        for div2 in div1.find_all(
                                "div", class_="biblio-secondary-item small"):
                            for a in div2.find_all("a"):
                                if "href" in a.attrs.keys():
                                    if "https://doi.org" in a["href"]:
                                        pdf_url = a["href"]
                                        cp = htmls.config_parser()
                                        ht = htmls.HTML(None, None, None, None)
                                        for conf in cp.get_all_conf():
                                            print(conf)
                                            if ht.test(conf, pdf_url):
                                                result = ht.do_run(
                                                    conf, pdf_url)
                                                r2 = requests.get(result)
                                                r2.encoding = 'utf-8'
                                                # print(r2.text)
                                                file = open(file_path, "wb+")
                                                file.write(r2.content)
                                                file.close()
                                                break

                                        start_break = True
                                        break
                            if start_break:
                                break
                        if start_break:
                            break

                else:
                    pdf_url = meta["content"]
                    # Replay the load-balancer/session cookies captured above,
                    # plus analytics cookies recorded from a browser session.
                    cookies = {
                        'BIGipServerlbapp_tc3': c1,
                        'BIGipServerwww.osti.gov_pool': c2,
                        'JSESSIONID': c3,
                        '__utma': '249692800.1749221367.1564467097.1564467097.1564467097.1',
                        '__utmc': '249692800',
                        '__utmz': '249692800.1564467097.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
                        '_ga': 'GA1.2.1749221367.1564467097',
                        '_gid': 'GA1.2.298248318.1564467099',
                        '__utmt': '1',
                        '__utmb': '249692800.63.10.1564467097'
                    }

                    r2 = requests.get(pdf_url, cookies=cookies)
                    with open(file_path, "wb") as file:
                        file.write(r2.content)
                # If neither the meta tag nor a DOI link yielded a PDF url,
                # treat the page as having no usable conf.
                if pdf_url is None:
                    raise NoConfError("no conf")
                eb.page = htmls.checkpdf(file_path)
                full_url = pdf_url

            except NoConfError:
                logger.info(eb.eissn + " no conf available.")
                eb.err_and_step = str(self.url_step) + ": no conf available"
                self.um.save(eb, self.err_step)
            except Exception:
                logger.error(self.sourcename + " download url " + url +
                             " has an error",
                             exc_info=True)
                if eb.retry < collect.DOWNLOAD_URL_RETRY:
                    logger.info("retry time:" + str(eb.retry))
                    eb.retry += 1
                    self.um.save(eb, self.url_step - 1)
                else:
                    logger.info("retry:" + str(eb.retry) +
                                ". retry次数超过5次,不再重试。")
                    self.um.save(eb, self.err_step)
                continue

            eb.full_url = full_url
            eb.abs_url = url
            dirs = file_path.split("/")
            eb.full_path = dirs[-2] + "/" + dirs[-1]
            self.um.save(eb, self.finsh_step)
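
The start_break flag in Example #8 exists only to escape three nested loops. The same search reads more cleanly as a helper that returns early; a sketch against the same BeautifulSoup structure used above:

def find_doi_link(soup):
    # Return the first https://doi.org link inside the secondary biblio
    # blocks, or None if the page has none.
    for div1 in soup.find_all("div", class_="biblio-secondary-group"):
        for div2 in div1.find_all("div", class_="biblio-secondary-item small"):
            for a in div2.find_all("a", href=True):
                if "https://doi.org" in a["href"]:
                    return a["href"]
    return None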