def __init__(self, task_name, project_name, key_words, record_number):
        self.key_words = key_words  # search keywords, given as a comma-separated string
        self.record_number = record_number  # how many records to crawl
        self.project_name = project_name  # project name
        self.task_name = task_name

        self.request_time_out = 30  # timeout for requests (seconds)
        self.phantomjs_time_out = 60  # timeout for PhantomJS (seconds)
        self.request_refresh_wait = 3  # wait before retrying a request (seconds)
        self.phantomjs_refresh_wait = 5  # wait before refreshing the browser (seconds)

        self.tries_request = 3
        self.tries_1st_sp = 3  # attempts allowed for the first summary page
        self.tries_other_sp = 5  # attempts allowed for every other summary page

        self.sum_page_number = int(math.ceil(self.record_number /
                                             20.0))  # 20 records per page; 20.0 avoids Python 2 integer division
        self.url = "https://www.ncbi.nlm.nih.gov/pubmed/?term=" + self.key_words.replace(
            ",", "+")  # initial query URL

        self.phantomjs_headers = agents.get_header()  # pick a random user agent for the browser
        self.pmid_set = mh.read_pmid_all()  # read only once

        # per-instance containers
        self.content = []  # collected sc_content objects; cleared for every new page
        self.author = []  # authors; cleared for every new page
        self.journal = []  # journals; cleared for every new page
        self.title = []  # titles and links; cleared for every new page
        self.issue = []  # publication years; cleared for every new page
        self.pmid = []  # PMIDs; cleared for every new page
def get_official_name(journal_name_raw,
                      proxy=None):  # look up a journal's official full name; fuzzy matching, return only the best match
    url = "http://www.letpub.com.cn/journalappAjax.php?querytype=autojournal&term=" + \
        journal_name_raw.replace("&", "%26").replace(" ", "+")
    tries = config.request_dp_tries
    while tries > 0:
        try:
            opener = requests.Session()
            doc = opener.get(url, timeout=20, headers=agents.get_header()).text
            entries = doc.split('},{')  # split the response into entries; only the best match is used
            journal_name_start = entries[0].find("label") + 8
            journal_name_end = entries[0].find("\",\"", journal_name_start)
            journal_name = entries[0][journal_name_start:journal_name_end]
            journal_name = journal_name.upper()  # the retrieved name is stored in upper case
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    journal_name, "debug", msg.display)
            return journal_name
        except Exception as e:
            msg.msg("journal name", journal_name_raw, "web retrieved", "retried",
                    "debug", msg.display)
            msg.msg("journal name", journal_name_raw, "web retrieved", str(e),
                    "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)
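# The function above slices the LetPub response by hard-coded character offsets.
# Below is a minimal standalone sketch of the same lookup that parses the response
# as JSON instead; the "label" field and the list-of-dicts response shape are
# assumptions inferred from the offsets used above, not a verified API contract.
import json

import requests


def get_official_name_json(journal_name_raw, timeout=20):
    url = ("http://www.letpub.com.cn/journalappAjax.php"
           "?querytype=autojournal&term="
           + journal_name_raw.replace("&", "%26").replace(" ", "+"))
    resp = requests.get(url, timeout=timeout)
    try:
        entries = json.loads(resp.text)  # expected: a list of {"label": ...} dicts
    except ValueError:  # malformed or empty response
        return None
    if entries and "label" in entries[0]:
        return entries[0]["label"].upper()  # best match only, upper-cased like above
    return None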
def adjust_record_number(project, sstr, record_number):  # determine the true number of available records
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # retry up to the configured number of times
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # the header can still be random
            max_record_number_start = content.find(
                "<h3 class=\"result_count left\">Items:") + 37  # start of the result-count text
            max_record_number_end = content.find(
                '</h3>', max_record_number_start)
            record_number_str = content[max_record_number_start:max_record_number_end]
            max_record_number = int(record_number_str.split(" ")[-1])
            if max_record_number < record_number:  # fewer records exist than requested
                record_number = max_record_number
                msg.msg("record number", "", "changed", str(
                    record_number), "notice", msg.log, msg.display)
            return record_number
        except Exception as e:
            msg.msg("record number", "", "read", str(e), "error", msg.log)
            msg.msg("record number", "", "read",
                    "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
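# The offset arithmetic above (find(...) + 37, then split(" ")[-1]) is tied to the
# exact result-count markup. A hedged standalone sketch that extracts the same
# total with a regular expression, assuming the old PubMed header
# <h3 class="result_count left">Items: 1 to 20 of 1234</h3>:
import re


def extract_result_count(html):
    match = re.search(r'result_count[^>]*>Items:([^<]*)</h3>', html)
    if not match:
        return None
    return int(match.group(1).split()[-1])  # the last token is the total count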
    def crawl_detail(self, pmid):  # crawl an individual record page
        link = "https://www.ncbi.nlm.nih.gov/pubmed/" + pmid
        key_words_list = []  # keywords
        institues_list = []  # institution names
        full_links_list = []  # full-text links (downloadable PDFs, not the abstract)

        tries = 3  # try up to 3 times, then give up with an error
        while tries > 0:
            try:
                opener = requests.Session()  # a new session keeps cookies between requests
                doc = opener.get(
                    link,
                    timeout=self.request_time_out,
                    headers=agents.get_header()).text  # note: a fresh random agent on every request
                soup = BeautifulSoup(doc)

                abstract_raw = soup.findAll(name="abstracttext")
                abstract = ut.regexp_replace(str(abstract_raw),
                                             ut.re_html)[1:-1]  # strip HTML from the abstract right away

                key_words_raw = soup.findAll(name="div",
                                             attrs={"class": "keywords"})
                if key_words_raw:  # only if keywords exist; many articles have none
                    key_words_raw = str(key_words_raw)[45:-11].replace(
                        "; ", ";")
                    key_words_list = key_words_raw.split(';')

                institues_raw = soup.findAll(name='dl')
                if institues_raw:  # only if affiliations exist; most articles have them
                    institues_raw = institues_raw[0]
                    institues_raw = re.findall("<dd>.*?</dd>",
                                               str(institues_raw))
                    for institues in institues_raw:
                        institues_list.append(institues[4:-5])

                full_content = soup.findAll(name='div',
                                            attrs={"class": "icons portlet"})
                full_links_raw = re.findall("<a href=.*?ref=",
                                            str(full_content))
                if full_links_raw:  # only if full-text links exist
                    for full_link in full_links_raw:
                        full_links_list.append(full_link[9:-6].replace(
                            "&amp;", "&"))

                return abstract, key_words_list, institues_list, full_links_list  # one string plus three lists

            except Exception as e:
                tries -= 1
                msg.display(
                    ut.time_str("time"), "retrying record: " + str(pmid) +
                    "; " + str(tries) + " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry record: " + str(pmid), "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
                time.sleep(self.request_refresh_wait)  # on failure, wait before retrying
def crawl_name():  # crawl a specific page
    link = "https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations"
    opener = requests.Session()  # a new session keeps cookies between requests
    content = opener.get(link, timeout=config.request_time_out,
                         headers=agents.get_header()).text  # note: a fresh random agent on every request
    selector = etree.HTML(content.encode("utf-8"))
    name_element = selector.xpath("//*[@id=\"bodyContent\"]//table//tr/td//span")

    print(len(name_element))
    for item in name_element:
        print(item.xpath('string(.)'))
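# A variant of the scrape above that returns the extracted strings instead of
# printing them; same URL and XPath, shown as a self-contained sketch.
import requests
from lxml import etree


def crawl_name_list(timeout=30):
    link = "https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations"
    content = requests.get(link, timeout=timeout).text
    selector = etree.HTML(content.encode("utf-8"))
    spans = selector.xpath('//*[@id="bodyContent"]//table//tr/td//span')
    return [item.xpath('string(.)') for item in spans]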
    def crawl_direct(self):  # crawl the summary page directly; first page only, but fast because PhantomJS is not used
        msg.stat("sum_page", "proc")  # mark as being processed
        tries = self.tries_request  # retry up to 3 times
        while tries > 0:
            try:
                opener = requests.Session()
                raw = opener.get(
                    self.url,
                    timeout=self.request_time_out,
                    headers=agents.get_header()).text  # the header can still be random
                soup = BeautifulSoup(raw)

                number_raw = soup.findAll(name="input",
                                          attrs={"id": "pageno"})  # input element that carries the page total
                number_start = str(number_raw).find("last=") + 6  # where the total starts
                number_end = str(number_raw).find("\" />")  # where the total ends
                max_number = int(
                    str(number_raw)[number_start:number_end])  # the actual maximum page count

                if max_number < self.sum_page_number:  # if the real page count is smaller than the computed one
                    self.sum_page_number = max_number  # use the real value; otherwise keep the computed one
                    msg.display(ut.time_str("time"),
                                "max sum page changed: " + str(max_number),
                                "notice")
                    msg.log(self.task_name, ut.time_str("full"),
                            "changed sum page number: " + str(max_number),
                            "notice")

                msg.display(ut.time_str("time"),
                            "loaded: NO.1 sum page (requests)", "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "load sum page: NO.1 (requests)", "info")

                self.author = soup.findAll(name='p', attrs={"class": "desc"})
                self.journal = soup.findAll(name="span",
                                            attrs={'class': 'jrnl'})
                self.title = soup.findAll(name='p', attrs={"class": "title"})
                self.issue = soup.findAll(name="p", attrs={'class': 'details'})
                self.pmid = soup.findAll(name="dd")

                self.generate_record()  # produce the records directly
                msg.stat("sum_page", "succ")
                break

            except Exception as e:
                print(e)
                tries -= 1
                msg.display(
                    ut.time_str("time"),
                    "load retrying: NO.1 sum page (requests); " + str(tries) +
                    " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry sum page: NO.1 (requests)", "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
def crawl_phantom(project, sstr, record_number, endwith, endtime):
    url = parse_url(project, sstr)

    sum_page_number = int(math.ceil(record_number / 200.0))  # how many pages are needed; 200.0 avoids Python 2 integer division
    rest_page_number = sum_page_number  # pages left; equals the total at the start

    tries_1st_sp = config.phantom_1st_sp_tries

    phantomjs_headers = agents.get_header()  # pick a random user agent for the browser
    dcap = dict(DesiredCapabilities.PHANTOMJS)  # set the userAgent capability
    dcap["phantomjs.page.settings.userAgent"] = (
        phantomjs_headers)  # the header is random each time PhantomJS starts, but fixed until the browser closes
    dcap["phantomjs.page.settings.loadImages"] = False  # do not load images, for speed
    sargs = ["--disk-cache=yes", '--ignore-ssl-errors=true']  # enable the disk cache, ignore SSL errors
    # browser = webdriver.PhantomJS(executable_path='C:\Python27\Scripts\phantomjs.exe', desired_capabilities=dcap)  # load the browser (Windows)
    path = ut.cur_file_dir() + "/browser/phantomjs"  # path to the browser binary
    pm_browser = webdriver.PhantomJS(
        # executable_path=path, desired_capabilities=dcap, service_args=sargs)  # load the browser
        executable_path=path, desired_capabilities=dcap)  # load the browser
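    # --- Not part of the original snippet: a hedged sketch of how the browser
    # object is typically driven and released afterwards. The 60 s value mirrors
    # the phantomjs_time_out used in the class above; the real function may differ.
    try:
        pm_browser.set_page_load_timeout(60)  # assumed timeout value
        pm_browser.get(url)                   # load the first summary page
        html = pm_browser.page_source         # hand the HTML to the parsers used elsewhere
    finally:
        pm_browser.quit()                     # always release the PhantomJS process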
def crawl_direct(project, sstr):  # crawl the summary page directly; first page only, but fast because PhantomJS is not used
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # retry up to the configured number of times
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # the header can still be random
            msg.msg("sum page", "1", "loaded", "proc", "info", msg.display)
            pmid_list = extract_new_pmid(content)  # extract PMIDs, then drop the ones already known
            if pmid_list:
                mh.add_new_pmid_many(
                    project, sstr, ut.time_str("full"), "pm", pmid_list)
            msg.msg("sum page", "1", "loaded", "succ",
                    "info", msg.display, msg.log)
            break
        except Exception as e:
            msg.msg("sum page", "1", "loaded", str(e), "error", msg.log)
            msg.msg("sum page", "1", "loaded",
                    "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
def crawl_detail(pmid, proxy=None):  # crawl an individual record page
    link = "https://www.ncbi.nlm.nih.gov/pubmed/" + str(pmid)
    tries = config.request_dp_tries  # retry as many times as configured
    msg.msg("record", pmid, "retrieved", "proc", "info", msg.display, msg.stat)
    while tries > 0:
        try:
            authors = []
            institues = []
            countries = []
            flinks = []
            opener = requests.Session()  # a new session keeps cookies between requests
            content = opener.get(
                link,
                timeout=config.request_time_out,
                headers=agents.get_header()).text  # note: a fresh random agent on every request
            selector = etree.HTML(content.encode("utf-8"))
            title_element = selector.xpath(
                "//div[@class = \"rprt abstract\"]//h1")
            if len(title_element):
                title = title_element[0].xpath('string(.)')
            author_element = selector.xpath("//div[@class = \"auths\"]//a")
            if len(author_element):
                for author in author_element:
                    authors.append(author.xpath('string(.)'))
            journal_element = selector.xpath("//a[@alsec=\"jour\"]/@title")
            if len(journal_element):
                journal = journal_element[0]
                if journal:
                    journal_detail = jn.journal_detail(journal)
                    ojournal = journal_detail[0]
                    journal_if = journal_detail[1]
                    journal_zone = journal_detail[2]
            abstract_element = selector.xpath(
                "//*[@id=\"maincontent\"]/div/div[5]/div/div[4]")
            if len(abstract_element):
                abstract = abstract_element[0].xpath('string(.)')[8:]
            key_words_element = selector.xpath(
                "//*[@id=\"maincontent\"]/div/div[5]/div/div[5]/p")
            if len(key_words_element):
                key_words = key_words_element[0].xpath('string(.)').split("; ")
            else:
                key_words = []
            issue_element = selector.xpath("//div[@class = \"cit\"]")  # publication year
            if len(issue_element):
                issue_raw = issue_element[0].xpath('string(.)')
                issue_start = issue_raw.find(".")
                issue = issue_raw[issue_start + 2:issue_start + 6]
            institues_element = selector.xpath("//div[@class=\"afflist\"]//dd")
            if len(institues_element):
                for institue in institues_element:
                    institue = institue.xpath('string(.)')
                    institue = ut.regexp_replace(
                        institue, ut.re_email_pm)  # strip PubMed-style email sentences
                    institue = ut.regexp_replace(
                        institue, ut.re_email_general)  # strip any remaining email addresses
                    institue = institue.replace(" ,", ",")
                    institues.append(institue)
                    institue = institue.replace(", ", ",").replace(".", "")
                    institue_strs = institue.split(",")
                    institue_strs.reverse()  # the country name usually comes last
                    i = 0
                    while i < len(institue_strs):
                        if institue_strs[i] in dictionary.country_names.keys(
                        ):  # known country name
                            country_name = dictionary.country_names[
                                institue_strs[i]]  # direct lookup
                            if not country_name in countries:
                                countries.append(country_name)
                            break
                        else:
                            i += 1
            flink_element = selector.xpath(
                "//div[@class=\"icons portlet\"]//a/@href")
            if len(flink_element):
                for flink in flink_element:
                    flinks.append(flink)
            mh.add_new_content(pmid, title, authors, journal, ojournal,
                               journal_if, journal_zone, issue, abstract,
                               key_words, institues, countries, flinks)
            msg.msg("record", pmid, "retrieved", "succ", "info", msg.display,
                    msg.stat)
            break
        except Exception as e:
            msg.msg("record", pmid, "retrieved", "retried", "notice",
                    msg.display)
            msg.msg("record", pmid, "retrieved", str(e), "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)  # on failure, wait before retrying
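# A minimal standalone sketch of the same detail-page scrape, reduced to title and
# authors. It reuses the "rprt abstract" / "auths" XPaths from crawl_detail above,
# so it assumes the legacy PubMed layout those expressions target; the live site
# may no longer match.
import requests
from lxml import etree


def crawl_title_authors(pmid, timeout=30):
    link = "https://www.ncbi.nlm.nih.gov/pubmed/" + str(pmid)
    content = requests.get(link, timeout=timeout).text
    selector = etree.HTML(content.encode("utf-8"))
    title_nodes = selector.xpath('//div[@class = "rprt abstract"]//h1')
    title = title_nodes[0].xpath('string(.)') if title_nodes else None
    authors = [a.xpath('string(.)')
               for a in selector.xpath('//div[@class = "auths"]//a')]
    return title, authors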