Example #1
    def generate_record(self):  # build records from the scraped raw material
        title_start_with = "linksrc=docsum_title\">"  # marker for the start of the title
        title_end_with = "</a>"  # marker for the end of the title
        journal_start_with = 'title='  # marker for the start of the journal name
        journal_end_with = '\">'  # marker for the end of the journal name

        m = 0
        while (m < len(self.pmid)):  # loop over every scraped entry
            msg.stat("record", "proc")  # processed records +1
            pmid = str(self.pmid[m])[4:-5]  # extract the pmid first, then decide whether to go further
            if not (self.pmid_check(pmid)):  # only proceed if this article has not been seen before
                author = str(self.author[m])[16:-4]
                author_list = author.split(", ")  # author list

                title_start = str(self.title[m]).find(title_start_with) + 22
                title = str(self.title[m])[title_start:-8].replace(
                    '<b>', '').replace('</b>', '')  # paper title

                issue = re.search("[1-2][09][0-9]{2}",
                                  str(self.issue[m])).group(0)  # issue, i.e. the publication year

                journal_end = str(self.journal[m]).find(
                    journal_end_with)  # position where the journal name ends
                journal = str(self.journal[m])[26:journal_end].replace(
                    '<b>', '').replace('</b>', '')  # journal name
                journal_detail = jn.journal_detail(
                    journal)  # look up the journal's official name, impact factor and ranking partition

                paper_detail = self.crawl_detail(
                    pmid)  # fetch the abstract, keyword list, institution list and full-text link list

                if paper_detail:  # record only if a valid abstract came back; otherwise leave it for the next crawl (not recorded, so still treated as a new paper)
                    mh.add_new_content(self.project_name, self.key_words,
                                       ut.time_str("full"), "pm", pmid, title,
                                       author_list, journal, journal_detail[0],
                                       journal_detail[1],
                                       journal_detail[2], issue,
                                       str(paper_detail[0]), paper_detail[1],
                                       paper_detail[2], paper_detail[3])
                    self.pmid_set.append(pmid)  # add this newly crawled pmid to the pmid list
                    # paper_detail[0] is the abstract, [1] the keywords,
                    # [2] the institution list and [3] the full-text download links
                    msg.stat("record", "succ")  # successful records +1
                    msg.display(ut.time_str("time"), "retrieved record: " +
                                str(pmid) + "; total retrieved: " +
                                str(stats.success_record), "info")  # display: record retrieved
                    msg.log(self.task_name, ut.time_str("full"),
                            "retrieved record: " + str(pmid),
                            "info")  # log: record retrieved
            else:
                msg.stat("record", "skip")  # skipped records +1
                msg.display(
                    ut.time_str("time"), "skipped record: " + str(pmid) +
                    "; total skipped: " + str(stats.skipped_record), "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "skipped record: " + str(pmid), "info")
            m += 1
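
A standalone sketch of the year extraction used above: the regex picks out the first year-like four-digit number (e.g. 19xx or 20xx) in the "details" text. The sample string here is hypothetical.

import re

details = "Nature. 2019 Mar;567(7747):123-6."  # hypothetical details text
year = re.search("[1-2][09][0-9]{2}", details).group(0)
print(year)  # -> 2019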
Example #2
    def crawl_detail(self, pmid):  # crawl the detail page of a single article
        link = "https://www.ncbi.nlm.nih.gov/pubmed/" + pmid
        key_words_list = []  # keyword collection
        institues_list = []  # institution names
        full_links_list = []  # full-text links (downloadable PDFs, not the abstract)

        tries = 3  # try up to 3 times; report failure if none succeeds
        while (tries > 0):
            try:
                opener = requests.Session()  # open a new session
                doc = opener.get(
                    link,
                    timeout=self.request_time_out,
                    headers=agents.get_header()).text  # note: a random user agent is used for every request
                soup = BeautifulSoup(doc)

                abstract_raw = soup.findAll(name="abstracttext")
                abstract = ut.regexp_replace(str(abstract_raw),
                                             ut.re_html)[1:-1]  # clean the abstract HTML on the fly

                key_words_raw = soup.findAll(name="div",
                                             attrs={"class": "keywords"})
                if key_words_raw:  # if keywords are present (many articles have none)
                    key_words_raw = str(key_words_raw)[45:-11].replace(
                        "; ", ";")
                    key_words_list = key_words_raw.split(';')

                institues_raw = soup.findAll(name='dl')
                if institues_raw:  # if institutions are present (most articles have them)
                    institues_raw = institues_raw[0]
                    institues_raw = re.findall("<dd>.*?</dd>",
                                               str(institues_raw))
                    for institues in institues_raw:
                        institues_list.append(institues[4:-5])

                full_content = soup.findAll(name='div',
                                            attrs={"class": "icons portlet"})
                full_links_raw = re.findall("<a href=.*?ref=",
                                            str(full_content))
                if full_links_raw:  # if full-text links are present
                    for full_link in full_links_raw:
                        full_links_list.append(full_link[9:-6].replace(
                            "&amp;", "&"))

                return abstract, key_words_list, institues_list, full_links_list  # one string plus three lists

            except Exception as e:
                tries -= 1
                msg.display(
                    ut.time_str("time"), "retrying record: " + str(pmid) +
                    "; " + str(tries) + " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry record: " + str(pmid), "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
                time.sleep(self.request_refresh_wait)  # if the fetch failed, sleep (3 seconds by default) before retrying
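
The retry loop above follows a general pattern: attempt the request up to a fixed number of times, sleep between failures, and fall through once every attempt has failed. A minimal sketch of that pattern in isolation; fetch_with_retries, fetch, tries and wait are hypothetical names, not part of the original code.

import time

def fetch_with_retries(fetch, tries=3, wait=3):
    # Call `fetch` up to `tries` times, sleeping `wait` seconds after each
    # failure; return None once every attempt has failed.
    while tries > 0:
        try:
            return fetch()
        except Exception:
            tries -= 1
            time.sleep(wait)
    return None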
Example #3
    def loadScores(self):
        self.printMessage('Loading Scores...')

        QApplication.processEvents()
        self.ui.textBrowser.repaint()

        self.diagnostics["threads"] = []
        regionList = []

        if self.ui.injector_cb.isChecked():
            self.getScoreData("Gun to TD11-LEM", regionList)

        if self.ui.copper_cb.isChecked():
            self.getScoreData("Cu Linac", regionList)

        if self.ui.bsyltuhard_cb.isChecked():
            self.getScoreData("Hard BSY thru LTU", regionList)

        if self.ui.bsyltusoft_cb.isChecked():
            self.getScoreData("Soft BSY thru LTU", regionList)

        if self.ui.undhard_cb.isChecked():
            for region in ["UNDH Taper", "UNDH"]:
                self.getScoreData(region, regionList)

        if self.ui.undsoft_cb.isChecked():
            for region in ["UNDS Taper", "UNDS"]:
                self.getScoreData(
                    region,
                    regionList)  # 2020FIX 5/28/2020 score names correct to here

        # Put message in message log that scores are being loaded
        for region in regionList:
            message = ("Loading SCORE from " + self.scoreInfo["dateChosen"] +
                       " " + self.scoreInfo["timeChosen"] + " for " + region)

            self.printMessage(message)
            log("facility=pythonenergychange " + message)

            # Have a thread subclass to handle this (defined at bottom of this
            # file); normal threading class returns NONE
            t = Utils.ThreadWithReturnValue(target=self.scoreThread,
                                            args=(region, ))

            self.diagnostics["threads"].append(t)

        for thread in self.diagnostics["threads"]:
            thread.start()
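
Utils.ThreadWithReturnValue is referenced above but defined elsewhere; a common way to build such a subclass is to capture the target's return value in run() and hand it back from join(), since the stock threading.Thread.join() always returns None. The sketch below is illustrative only and is not the Utils implementation.

import threading

class ThreadWithReturnValue(threading.Thread):
    # Illustrative sketch: a Thread subclass whose join() returns the
    # target's return value.
    def __init__(self, target=None, args=(), kwargs=None):
        super(ThreadWithReturnValue, self).__init__()
        self._my_target = target
        self._my_args = args
        self._my_kwargs = kwargs or {}
        self._return = None

    def run(self):
        if self._my_target is not None:
            self._return = self._my_target(*self._my_args, **self._my_kwargs)

    def join(self, timeout=None):
        super(ThreadWithReturnValue, self).join(timeout)
        return self._return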
Example #4
    def crawl_direct(self):  # crawl the summary page directly; only the first page is reachable, but this is fast because phantomjs is not used
        msg.stat("sum_page", "proc")  # count as processed
        tries = self.tries_request  # number of attempts (3 by default)
        while (tries > 0):
            try:
                opener = requests.Session()
                raw = opener.get(
                    self.url,
                    timeout=self.request_time_out,
                    headers=agents.get_header()).text  # the header can still be randomized
                soup = BeautifulSoup(raw)

                number_raw = soup.findAll(name="input",
                                          attrs={"id": "pageno"})  # find the element holding the total page count
                number_start = str(number_raw).find("last=") + 6  # position where the total starts
                number_end = str(number_raw).find("\" />")  # position where the total ends
                max_number = int(
                    str(number_raw)[number_start:number_end])  # actual maximum page count, as an integer

                if max_number < self.sum_page_number:  # if the actual maximum is smaller than the computed value
                    self.sum_page_number = max_number  # use the actual value; otherwise keep it unchanged
                    msg.display(ut.time_str("time"),
                                "max sum page changed: " + str(max_number),
                                "notice")
                    msg.log(self.task_name, ut.time_str("full"),
                            "changed sum page number: " + str(max_number),
                            "notice")

                msg.display(ut.time_str("time"),
                            "loaded: NO.1 sum page (requests)", "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "load sum page: NO.1 (requests)", "info")

                self.author = soup.findAll(name='p', attrs={"class": "desc"})
                self.journal = soup.findAll(name="span",
                                            attrs={'class': 'jrnl'})
                self.title = soup.findAll(name='p', attrs={"class": "title"})
                self.issue = soup.findAll(name="p", attrs={'class': 'details'})
                self.pmid = soup.findAll(name="dd")

                self.generate_record()  # generate the records directly
                msg.stat("sum_page", "succ")
                break

            except Exception as e:
                print(e)
                tries -= 1
                msg.display(
                    ut.time_str("time"),
                    "load retrying: NO.1 sum page (requests); " + str(tries) +
                    " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry sum page: NO.1 (requests)", "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
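
A standalone sketch of the page-count parsing above, assuming the summary page markup contains an <input id="pageno" ... last="NN" /> element; the sample string below stands in for str() of the findAll result and is hypothetical.

number_raw = '[<input id="pageno" last="42" />]'  # hypothetical markup
number_start = number_raw.find("last=") + 6  # skip past 'last="'
number_end = number_raw.find('" />')  # position of the closing quote
max_number = int(number_raw[number_start:number_end])
print(max_number)  # -> 42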
Example #5
    def crawl_phantom(self):  # crawl summary pages with phantomjs; any number of pages can be crawled, but it is slow
        rest_page_number = self.sum_page_number  # number of pages left to crawl
        tries_1st_sp = self.tries_1st_sp
        tries_other_sp = self.tries_other_sp

        if self.sum_page_number > 1:  # do not start the browser unless there is more than one page
            dcap = dict(DesiredCapabilities.PHANTOMJS)  # set the userAgent
            dcap["phantomjs.page.settings.userAgent"] = (
                self.phantomjs_headers)  # the header is randomized each time phantomjs starts, but stays fixed until the browser closes
            dcap["phantomjs.page.settings.loadImages"] = False  # skip loading images to speed things up
            # browser = webdriver.PhantomJS(executable_path='C:\Python27\Scripts\phantomjs.exe', desired_capabilities=dcap)  # load the browser; use this line on Windows
            path = cur_file_dir() + "/browser/phantomjs"  # browser path
            browser = webdriver.PhantomJS(executable_path=path,
                                          desired_capabilities=dcap)  # load the browser
            browser.set_page_load_timeout(
                self.phantomjs_time_out)  # page-load timeout; slower pages are abandoned

        while (self.sum_page_number > 1 and tries_1st_sp > 0):
            try:
                browser.get(self.url)
                WebDriverWait(browser, self.phantomjs_time_out).until(
                    EC.presence_of_element_located((By.ID, "footer")))
                msg.display(ut.time_str("time"),
                            "loaded: NO.1 sum page (phantomjs)", "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "load sum page: NO.1 (phantomjs)", "info")
                msg.stat("sum_page", "succ")
                break

            except Exception as e:
                tries_1st_sp -= 1
                msg.display(
                    ut.time_str("time"),
                    "load retrying: NO.1 sum page (phantomjs); " +
                    str(tries_1st_sp) + " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry sum page: NO.1 (phantomjs)", "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")

                browser.refresh()
                browser.implicitly_wait(self.phantomjs_refresh_wait)

        else:
            msg.display(ut.time_str("time"),
                        "load failed: NO.1 sum page (phantomjs)", "error")
            msg.log(self.task_name, ut.time_str("full"),
                    "fail sum page: NO.1 (phantomjs)", "error")

        while (
                rest_page_number > 1 and tries_1st_sp > 0
        ):  # a second page is needed only if there is more than one summary page and the first page loaded; loop from here until every page has been crawled
            msg.stat("sum_page", "proc")
            tries_other_sp = self.tries_other_sp

            while (tries_other_sp > 0):  # number of attempts (5 by default); give up once they are exhausted
                try:
                    browser.find_element_by_link_text(
                        "Next >").click()  # click "Next >" to move on, starting from the second page
                    WebDriverWait(browser, self.phantomjs_time_out).until(
                        EC.presence_of_element_located((By.ID, "footer")))

                    msg.display(
                        ut.time_str("time"),
                        "loaded: NO." + str(stats.success_sum_page + 1) +
                        " sum page (phantomjs)", "info")
                    msg.log(
                        self.task_name, ut.time_str("full"),
                        "load sum page: NO." +
                        str(stats.success_sum_page + 1) + " (phantomjs)",
                        "info")

                    soup = BeautifulSoup(browser.page_source)
                    self.author = soup.findAll(name='p',
                                               attrs={"class": "desc"})
                    self.journal = soup.findAll(name="span",
                                                attrs={'class': 'jrnl'})
                    self.title = soup.findAll(name='p',
                                              attrs={"class": "title"})
                    self.issue = soup.findAll(name="p",
                                              attrs={'class': 'details'})
                    self.pmid = soup.findAll(name="dd")
                    self.generate_record()  # generate the records directly

                    msg.stat("sum_page", "succ")
                    rest_page_number -= 1
                    break

                except Exception as e:
                    tries_other_sp -= 1
                    msg.display(
                        ut.time_str("time"), "load retrying: NO." +
                        str(stats.success_sum_page + 1) +
                        " sum page (phantomjs); " + str(tries_other_sp) +
                        " tries left", "notice")
                    msg.log(
                        self.task_name, ut.time_str("full"),
                        "retry sum page: NO." +
                        str(stats.success_sum_page + 1) + " (phantomjs)",
                        "notice")
                    msg.log(self.task_name, ut.time_str("full"), str(e),
                            "error")

                    browser.refresh()
                    browser.implicitly_wait(self.phantomjs_refresh_wait)

            else:
                msg.stat("sum_page", "fail")
                msg.display(
                    ut.time_str("time"), "load failed: NO." +
                    str(stats.success_sum_page + 1) + " sum page (phantomjs)",
                    "error")
                msg.log(
                    self.task_name, ut.time_str("full"), "fail sum page: NO." +
                    str(stats.success_sum_page + 1) + " (phantomjs)", "error")
                break

        if self.sum_page_number > 1:
            browser.quit()  # close the browser; if something goes wrong, remember to kill PhantomJS in the task manager
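
Both retry loops above rely on Python's while/else: the else branch runs only when the loop condition becomes false without hitting a break, which is how the failure path is reached after the tries are exhausted (the else also runs if the loop body never executes at all). A minimal runnable sketch of the pattern; do_request is a hypothetical stand-in for the page load.

import random

def do_request():
    # Hypothetical flaky call standing in for the page load above.
    if random.random() < 0.5:
        raise IOError("simulated failure")
    return "page source"

tries = 3
while tries > 0:
    try:
        result = do_request()
        break  # success: the else branch below is skipped
    except Exception:
        tries -= 1
else:
    result = None  # reached only when the loop ends without a break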
Example #6
                break

            except Exception as e:
                tries -= 1
                msg.display(
                    ut.time_str("time"), "retrying record: " + str(pmid) +
                    "; " + str(tries) + " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry record: " + str(pmid), "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
                time.sleep(self.request_refresh_wait)  # if the fetch failed, sleep (3 seconds by default) before retrying

        else:
            msg.display(ut.time_str("time"),
                        "retrieve record fail: " + str(pmid), "error")
            msg.log(self.task_name, ut.time_str("full"),
                    "failed record: " + str(pmid), "error")
            msg.stat("record", "fail")
            return 0


#=====================================================================================
# the actual crawling part starts here

    def crawl_direct(self):  # crawl the summary page directly; only the first page is reachable, but this is fast because phantomjs is not used
        msg.stat("sum_page", "proc")  # count as processed
        tries = self.tries_request  # number of attempts (3 by default)
        while (tries > 0):
            try:
                opener = requests.Session()
                raw = opener.get(
                    self.url,