Example no. 1
def project_delete(project_name):
    for project in project_set:
        if project[0] == project_name:
            project_set.remove(project)  # actually drop the record; `del project` only unbound the loop variable
            dh.csv_write(project_set, "universal", "project", "wb")
            op.output("Project deleted", "notice", ut.time_str("full"))
            break
    else:  # for-else: only runs when the loop finishes without a break
        op.output('No project found', "notice", ut.time_str("full"))
Example no. 2
def project_des_update(project_name, project_description):
    for i in range(len(project_set)):  # iterate by index so the record can be replaced in place
        if project_set[i].split(",|,")[0] == project_name:
            # rebuild the record with the new description; the ",|," field layout is assumed from the split above
            project_set[i] = ",|,".join(
                [project_name, project_description, ut.time_str("full")])
            op.output("Project description updated",
                      "notice", ut.time_str("full"))
            break
    else:  # no record matched
        op.output('No project found', "notice", ut.time_str("full"))
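
A minimal sketch of the record layout assumed by the rewritten update above: three fields (name, description, timestamp) joined by the ",|," separator. This is an inference from the split call and from project_add in Example no. 5, not something the source states explicitly.

# Assumed record layout: name ,|, description ,|, timestamp
example_record = "demo_project,|,a short description,|,2017-10-10 10:10:10"
assert example_record.split(",|,")[0] == "demo_project"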
Example no. 3
    def crawl_detail(self, pmid):  # crawl the detail page for one record
        link = "https://www.ncbi.nlm.nih.gov/pubmed/" + pmid
        key_words_list = []  # keyword collection
        institues_list = []  # institution names
        full_links_list = []  # full-text links (downloadable PDFs, not the abstract)

        tries = 3  # try up to 3 times; report an error if all attempts fail
        while (tries > 0):
            try:
                opener = requests.Session()  # new session to keep state between requests
                doc = opener.get(
                    link,
                    timeout=self.request_time_out,
                    headers=agents.get_header()).text  # note: a random user agent is used for every request
                soup = BeautifulSoup(doc)

                abstract_raw = soup.findAll(name="abstracttext")
                abstract = ut.regexp_replace(str(abstract_raw),
                                             ut.re_html)[1:-1]  # clean the abstract HTML right away

                key_words_raw = soup.findAll(name="div",
                                             attrs={"class": "keywords"})
                if key_words_raw:  # if keywords exist (many articles have none)
                    key_words_raw = str(key_words_raw)[45:-11].replace(
                        "; ", ";")
                    key_words_list = key_words_raw.split(';')

                institues_raw = soup.findAll(name='dl')
                if institues_raw:  # if institutions exist (most articles have them)
                    institues_raw = institues_raw[0]
                    institues_raw = re.findall("<dd>.*?</dd>",
                                               str(institues_raw))
                    for institues in institues_raw:
                        institues_list.append(institues[4:-5])

                full_content = soup.findAll(name='div',
                                            attrs={"class": "icons portlet"})
                full_links_raw = re.findall("<a href=.*?ref=",
                                            str(full_content))
                if full_links_raw:  # if full-text links exist
                    for full_link in full_links_raw:
                        full_links_list.append(full_link[9:-6].replace(
                            "&amp;", "&"))

                return abstract, key_words_list, institues_list, full_links_list  # one string plus three lists

            except Exception as e:
                tries -= 1
                msg.display(
                    ut.time_str("time"), "retrying record: " + str(pmid) +
                    "; " + str(tries) + " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry record: " + str(pmid), "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
                time.sleep(self.request_refresh_wait)  # on failure, rest (3 s by default) before retrying
Example no. 4
    def generate_record(self):  # build records from the raw crawled material
        title_start_with = "linksrc=docsum_title\">"  # marker for the start of the title
        title_end_with = "</a>"  # marker for the end of the title
        journal_start_with = 'title='  # marker for the start of the journal
        journal_end_with = '\">'  # marker for the end of the journal

        m = 0
        while (m < len(self.pmid)):  # repeat for every pmid found
            msg.stat("record", "proc")  # processed records +1
            pmid = str(self.pmid[m])[4:-5]  # get the pmid first, then decide whether to continue
            if not (self.pmid_check(pmid)):  # if this article has not been seen before
                author = str(self.author[m])[16:-4]
                author_list = author.split(", ")  # author list

                title_start = str(self.title[m]).find(title_start_with) + 22
                title = str(self.title[m])[title_start:-8].replace(
                    '<b>', '').replace('</b>', '')  # paper title

                issue = re.search("[1-2][09][0-9]{2}",
                                  str(self.issue[m])).group(0)  # issue, i.e. the publication year

                journal_end = str(self.journal[m]).find(
                    journal_end_with)  # end position of the journal name
                journal = str(self.journal[m])[26:journal_end].replace(
                    '<b>', '').replace('</b>', '')  # journal name
                journal_detail = jn.journal_detail(
                    journal)  # official journal name, impact factor and partition info

                paper_detail = self.crawl_detail(
                    pmid)  # abstract, keyword list, institution list and full-text link list

                if paper_detail:  # record only if a valid abstract came back; otherwise leave it for the next crawl (not recorded, treated as new)
                    mh.add_new_content(self.project_name, self.key_words,
                                       ut.time_str("full"), "pm", pmid, title,
                                       author_list, journal, journal_detail[0],
                                       journal_detail[1],
                                       journal_detail[2], issue,
                                       str(paper_detail[0]), paper_detail[1],
                                       paper_detail[2], paper_detail[3])
                    self.pmid_set.append(pmid)  # add the freshly crawled pmid to the pmid list
                    # paper_detail[0] is the abstract, [1] the keywords, [2] the institution list, [3] the full-text download links
                    msg.stat("record", "succ")  # successful records +1
                    msg.display(ut.time_str("time"), "retrieved record: " +
                                str(pmid) + "; total retrieved: " +
                                str(stats.success_record), "info")  # display: record retrieved
                    msg.log(self.task_name, ut.time_str("full"),
                            "retrieved record: " + str(pmid),
                            "info")  # log: record retrieved
            else:
                msg.stat("record", "skip")  # skipped records +1
                msg.display(
                    ut.time_str("time"), "skipped record: " + str(pmid) +
                    "; total skipped: " + str(stats.skipped_record), "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "skipped record: " + str(pmid), "info")
            m += 1
Example no. 5
def project_add(project_name, project_description):
    if not dh.check_folders(project_name, "folder"):  # if no matching folder exists yet
        dh.new_project_files(project_name)  # create the project files
        project_set = dh.text_read("universal", "project").split("\n")
        time.sleep(0.1)  # make sure the file is closed after reading
        new_project = project_name, project_description, ut.time_str("full")
        project_set.append(new_project)
        dh.text_write(project_set, "universal", "project", "w")
    else:
        op.output("Folder already exists", "warning", ut.time_str("full"))
Example no. 6
def retrieve_proxy(proxy_number):
    api_url = "http://vtp.daxiangdaili.com/ip/?tid=559131754091145&num=" + \
        str(proxy_number) + "&delay=1&sortby=time"
    proxies = requests.get(api_url, timeout=10).text
    proxy_pool = []
    for proxy in proxies.split("\n"):
        proxy_record = ut.time_str("full"), proxy, 0, 0, 0
        proxy_pool.append(proxy_record)
    return proxy_pool
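
A hedged usage sketch for the pool returned above: each entry is a tuple of a timestamp, the proxy address, and three counters whose meaning the source does not state (assumed here to be usage/success/failure counts). Assuming the API returns one "host:port" address per line, a requests-style proxies dict could be built like this:

# Sketch only: take the first proxy from the pool and use it with requests.
pool = retrieve_proxy(5)
created_at, address, used, succeeded, failed = pool[0]  # counter names are assumptions
proxies = {"http": "http://" + address, "https": "http://" + address}
# requests.get(some_url, proxies=proxies, timeout=10)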
Example no. 7
def run_task(project, sstr):  # run one crawl task: the pmid crawl first, then the detail crawl
    record_number, mrmins, endwith = get_task_config(project, sstr)
    endtime = ut.time_str("full", mrmins)
    msg.msg("crawl pmid", project + sstr, "started", "succ", "important", msg.display, msg.log, msg.stat)
    pc.run_pmid_crawler(project, sstr, record_number, endwith, endtime)
    msg.msg("crawl pmid", project + sstr, "finished", "succ", "important", msg.display, msg.log, msg.stat)

    msg.msg("crawl detail", project + sstr, "started", "succ", "important", msg.display, msg.log, msg.stat)
    dc.run_detail_crawler(project, sstr, record_number)
    msg.msg("crawl detail", project + sstr, "finished", "succ", "important", msg.display, msg.log, msg.stat)
Example no. 8
def task_validator(project, sstr, endwith, endtime):  # decide whether to stop early
    if stats.c_skipped_pmid >= config.pmid_max_c_skip and endwith:  # early finish enabled and the consecutive-skip threshold reached
        msg.msg("crawl pmid", project + sstr, "repeat end",
                "succ", "notice", msg.display, msg.log)
        return False  # False means stop
    if endtime < ut.time_str("full") and endwith:  # early finish enabled and the time limit reached
        msg.msg("crawl pmid", project + sstr, "time end",
                "succ", "notice", msg.display, msg.log)
        return False
    else:
        return True  # True means keep going
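
A sketch of how the validator would be consulted from a crawl loop; crawl_next_sum_page is a hypothetical placeholder for the real pmid-crawl step, which is not shown here:

# Hypothetical polling loop: keep crawling until the validator says stop.
while task_validator(project, sstr, endwith, endtime):
    crawl_next_sum_page(project, sstr)  # placeholder for the real crawl step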
Example no. 9
def stat(when, who, identifier, action, result, info_type):  # statistics bookkeeping
    if result == "succ":
        if who == "sum page":
            stats.success_sum_page += 1
        elif who == "record":
            stats.success_record += 1
        elif who == "pmid":
            stats.success_pmid += 1
            stats.c_skipped_pmid = 0
        elif who == "crawl pmid":
            if action == "started":  # the start/finish flag is carried in `action`, not `result`
                stats.crawl_pmid_start = ut.time_str("full")
            elif action == "finished":
                stats.crawl_pmid_finish = ut.time_str("full")
        elif who == "crawl detail":
            if action == "started":
                stats.crawl_detail_start = ut.time_str("full")
            elif action == "finished":
                stats.crawl_detail_finish = ut.time_str("full")
    elif result == "fail":
        if who == "sum page":
            stats.failed_sum_page += 1
        elif who == "record":
            stats.failed_record += 1
        elif who == "pmid":
            stats.failed_pmid += 1
    elif result == "proc":
        if who == "sum page":
            stats.processed_sum_page += 1
        elif who == "record":
            stats.processed_record += 1
        elif who == "pmid":
            stats.processed_pmid += 1
    elif result == "skip":
        if who == "sum page":
            stats.skipped_sum_page += 1
        elif who == "record":
            stats.skipped_record += 1
        elif who == "pmid":
            stats.skipped_pmid += 1
            stats.c_skipped_pmid += 1
Example no. 10
    def crawl_direct(self):  # crawl the sum page directly: first page only, but fast because it skips phantomjs
        msg.stat("sum_page", "proc")  # count as processed
        tries = self.tries_request  # try up to 3 times
        while (tries > 0):
            try:
                opener = requests.Session()
                raw = opener.get(
                    self.url,
                    timeout=self.request_time_out,
                    headers=agents.get_header()).text  # the header can still be randomized
                soup = BeautifulSoup(raw)

                number_raw = soup.findAll(name="input",
                                          attrs={"id": "pageno"})  # element holding the total page count
                number_start = str(number_raw).find("last=") + 6  # start position of the total
                number_end = str(number_raw).find("\" />")  # end position of the total
                max_number = int(
                    str(number_raw)[number_start:number_end])  # actual maximum page number, as an integer

                if max_number < self.sum_page_number:  # if the actual maximum is smaller than the calculated value
                    self.sum_page_number = max_number  # use the actual value; otherwise keep the calculated one
                    msg.display(ut.time_str("time"),
                                "max sum page changed: " + str(max_number),
                                "notice")
                    msg.log(self.task_name, ut.time_str("full"),
                            "changed sum page number: " + str(max_number),
                            "notice")

                msg.display(ut.time_str("time"),
                            "loaded: NO.1 sum page (requests)", "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "load sum page: NO.1 (requests)", "info")

                self.author = soup.findAll(name='p', attrs={"class": "desc"})
                self.journal = soup.findAll(name="span",
                                            attrs={'class': 'jrnl'})
                self.title = soup.findAll(name='p', attrs={"class": "title"})
                self.issue = soup.findAll(name="p", attrs={'class': 'details'})
                self.pmid = soup.findAll(name="dd")

                self.generate_record()  # generate records directly
                msg.stat("sum_page", "succ")
                break

            except Exception as e:
                print e
                tries -= 1
                msg.display(
                    ut.time_str("time"),
                    "load retrying: NO.1 sum page (requests); " + str(tries) +
                    " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry sum page: NO.1 (requests)", "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
Example no. 11
def crawl_direct(project, sstr):  # crawl the sum page directly: first page only, but fast because it skips phantomjs
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # number of attempts (3 by default)
    while(tries > 0):
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # the header can still be randomized
            msg.msg("sum page", "1", "loaded", "proc", "info", msg.display)
            pmid_list = extract_new_pmid(content)  # extract pmids, then drop the ones already seen
            if pmid_list:
                mh.add_new_pmid_many(
                    project, sstr, ut.time_str("full"), "pm", pmid_list)
            msg.msg("sum page", "1", "loaded", "succ",
                    "info", msg.display, msg.log)
            break
        except Exception as e:
            msg.msg("sum page", "1", "loaded", str(e), "error", msg.log)
            msg.msg("sum page", "1", "loaded",
                    "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
Example no. 12
                return abstract, key_words_list, institues_list, full_links_list  # one string plus three lists

            except Exception as e:
                tries -= 1
                msg.display(
                    ut.time_str("time"), "retrying record: " + str(pmid) +
                    "; " + str(tries) + " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry record: " + str(pmid), "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
                time.sleep(self.request_refresh_wait)  # on failure, rest (3 s by default) before retrying

        else:
            msg.display(ut.time_str("time"),
                        "retrieve record fail: " + str(pmid), "error")
            msg.log(self.task_name, ut.time_str("full"),
                    "failed record: " + str(pmid), "error")
            msg.stat("record", "fail")
            return 0


#=====================================================================================
# the actual crawling part starts here

    def crawl_direct(self):  # crawl the sum page directly: first page only, but fast because it skips phantomjs
        msg.stat("sum_page", "proc")  # count as processed
        tries = self.tries_request  # try up to 3 times
        while (tries > 0):
            try:
Example no. 13
def save_png(browser):
    browser.save_screenshot(
        ut.cur_file_dir() + "/browser/" + ut.time_str("time") + ".png")
    msg.msg("screenshot", "", "saved", "succ", "debug", msg.display, msg.log)
Example no. 14
                 "info", msg.display, msg.stat)
         WebDriverWait(pm_browser, config.phantom_time_out).until(
             EC.presence_of_element_located((By.ID, "footer")))  # the most reliable way to wait for the page to finish loading
         pm_browser.find_elements_by_name("Display")[2].click()  # find the display-count dropdown and click it
         pm_browser.implicitly_wait(5)  # wait briefly for the dropdown to finish expanding
         time.sleep(2)
         pm_browser.find_element_by_xpath(
             "//*[@id=\"ps200\"]").click()  # 下拉菜单找到200这个值,点击
         WebDriverWait(pm_browser, config.phantom_time_out).until(
             EC.presence_of_element_located((By.ID, "footer")))  # the page refreshes itself; wait for the refresh to finish
         msg.msg("sum page", "1", "display number",
                 "clicked", "debug", msg.display, msg.log)
         pm_browser.implicitly_wait(5)
         pmid_list = extract_new_pmid(pm_browser.page_source)
         if pmid_list:
             mh.add_new_pmid_many(project, sstr, ut.time_str(
                 "full"), "pm", pmid_list)  # 把pmid存起来
         msg.msg("sum page", "1", "loaded", "succ",
                 "info", msg.log, msg.display, msg.stat)
         rest_page_number -= 1
         # dc.run_detail_crawler(project, sstr, 200)
         break
     except Exception as e:
         tries_1st_sp -= 1
         # time.sleep(config.phantom_refresh_wait)
         pm_browser.refresh()
         pm_browser.implicitly_wait(config.phantom_refresh_wait)
         msg.msg("sum page", "1", "loaded",
                 "retried", "notice", msg.display)
         msg.msg("sum page", "1", "loaded", str(e), "error", msg.log)
 else:
     msg.msg("sum page", "1", "loaded", "fail",
Example no. 15

def count_task(project, sstr):
    number = get_db("task").count({"project": project, "sstr": sstr})
    return number


def count_project_task(project):  # count how many tasks have been run under this project
    number = get_db("task").count({"project": project})
    return number


def finish_task(project, sstr):  # mark the task as finished
    data = {"status": 1}
    get_db("task").update_one({
        "project": project,
        "sstr": sstr
    }, {"$set": data})


if __name__ == "__main__":
    # add_new_project("organ on chip", "organ simulator, organ on chip", ut.time_str("full"))
    add_new_sstr("cancer", "liver,cancer", ut.time_str("full"), "key_words")
    # add_new_task("cancer", "breast,cancer", "2017-10-10 10:10:10", 5000, 6, 0, 0)
    # finish_task("cancer", "breast,cancer")
    # print count_task("cancer", "breast,cancer")
    # add_new_pmid("cancer", "lung,cancer", "2017-10-10 10:10:10", "pm", 29027110)
    # print read_empty_pmid("organ on chip", 10000)
    # print read_content("cancer", "lung,cancer", 1)
    # pass
Example no. 16
def msg(who, identifier, action, result, info_type, *args):
    '''*args can be one or more of log, display, stat'''
    for fn in args:
        fn(ut.time_str("full"), who, identifier, action, result, info_type)
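
For reference, this is how the dispatcher is invoked elsewhere in the source (run_task in Example no. 7 and task_validator in Example no. 8), assuming project and sstr are defined as in those examples: the same event is fanned out to whichever handler functions are passed in.

# Fan one event out to display, log and stat:
msg.msg("crawl pmid", project + sstr, "started", "succ", "important",
        msg.display, msg.log, msg.stat)
# Display and log only:
msg.msg("crawl pmid", project + sstr, "repeat end", "succ", "notice",
        msg.display, msg.log)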
Example no. 17
    def crawl_phantom(self):  # crawl sum pages with phantomjs: any number of pages, but slower
        rest_page_number = self.sum_page_number  # pages still to crawl
        tries_1st_sp = self.tries_1st_sp
        tries_other_sp = self.tries_other_sp

        if self.sum_page_number > 1:  # do not start the browser unless there is more than one page
            dcap = dict(DesiredCapabilities.PHANTOMJS)  # set the userAgent
            dcap["phantomjs.page.settings.userAgent"] = (
                self.phantomjs_headers)  # the header is random for each phantomjs launch, but stays fixed until the browser closes
            dcap["phantomjs.page.settings.loadImages"] = False  # skip loading images, to speed things up
            # browser = webdriver.PhantomJS(executable_path='C:\Python27\Scripts\phantomjs.exe', desired_capabilities=dcap)  # load the browser (Windows)
            path = cur_file_dir() + "/browser/phantomjs"  # path to the browser binary
            browser = webdriver.PhantomJS(executable_path=path,
                                          desired_capabilities=dcap)  # load the browser
            browser.set_page_load_timeout(
                self.phantomjs_time_out)  # page-load timeout; give up once exceeded

        while (self.sum_page_number > 1 and tries_1st_sp > 0):
            try:
                browser.get(self.url)
                WebDriverWait(browser, self.phantomjs_time_out).until(
                    EC.presence_of_element_located((By.ID, "footer")))
                msg.display(ut.time_str("time"),
                            "loaded: NO.1 sum page (phantomjs)", "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "load sum page: NO.1 (phantomjs)", "info")
                msg.stat("sum_page", "succ")
                break

            except Exception as e:
                tries_1st_sp -= 1
                msg.display(
                    ut.time_str("time"),
                    "load retrying: NO.1 sum page (phantomjs); " +
                    str(tries_1st_sp) + " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry sum page: NO.1 (phantomjs)", "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")

                browser.refresh()
                browser.implicitly_wait(self.phantomjs_refresh_wait)

        else:
            msg.display(ut.time_str("time"),
                        "load failed: NO.1 sum page (phantomjs)", "error")
            msg.log(self.task_name, ut.time_str("full"),
                    "fail sum page: NO.1 (phantomjs)", "error")

        while (
                rest_page_number > 1 and tries_1st_sp > 0
        ):  # make sure another page is needed: skip if there is only one sum page or the first page failed to load; loop from here until every page has been crawled
            msg.stat("sum_page", "proc")
            tries_other_sp = self.tries_other_sp

            while (tries_other_sp > 0):  # number of attempts (5 by default); give up if all fail
                try:
                    browser.find_element_by_link_text(
                        "Next >").click()  # 直接就点开“下一页”,从第二页开始
                    WebDriverWait(browser, self.phantomjs_time_out).until(
                        EC.presence_of_element_located((By.ID, "footer")))

                    msg.display(
                        ut.time_str("time"),
                        "loaded: NO." + str(stats.success_sum_page + 1) +
                        " sum page (phantomjs)", "info")
                    msg.log(
                        self.task_name, ut.time_str("full"),
                        "load sum page: NO." +
                        str(stats.success_sum_page + 1) + " (phantomjs)",
                        "info")

                    soup = BeautifulSoup(browser.page_source)
                    self.author = soup.findAll(name='p',
                                               attrs={"class": "desc"})
                    self.journal = soup.findAll(name="span",
                                                attrs={'class': 'jrnl'})
                    self.title = soup.findAll(name='p',
                                              attrs={"class": "title"})
                    self.issue = soup.findAll(name="p",
                                              attrs={'class': 'details'})
                    self.pmid = soup.findAll(name="dd")
                    self.generate_record()  # generate records directly

                    msg.stat("sum_page", "succ")
                    rest_page_number -= 1
                    break

                except Exception as e:
                    tries_other_sp -= 1
                    msg.display(
                        ut.time_str("time"), "load retrying: NO." +
                        str(stats.success_sum_page + 1) +
                        " sum page (phantomjs); " + str(tries_other_sp) +
                        " tries left", "notice")
                    msg.log(
                        self.task_name, ut.time_str("full"),
                        "retry sum page: NO." +
                        str(stats.success_sum_page + 1) + " (phantomjs)",
                        "notice")
                    msg.log(self.task_name, ut.time_str("full"), str(e),
                            "error")

                    browser.refresh()
                    browser.implicitly_wait(self.phantomjs_refresh_wait)

            else:
                msg.stat("sum_page", "fail")
                msg.display(
                    ut.time_str("time"), "load failed: NO." +
                    str(stats.success_sum_page + 1) + " sum page (phantomjs)",
                    "error")
                msg.log(
                    self.task_name, ut.time_str("full"), "fail sum page: NO." +
                    str(stats.success_sum_page + 1) + " (phantomjs)", "error")
                break

        if self.sum_page_number > 1:
            browser.quit()  # close the browser; if an exception occurred, remember to kill leftover PhantomJS processes in the task manager
Example no. 18
def time_box():  # print the time-related summary
    print " ---------------------------------------------------------------"
    print " │     Current Time    │      Start Time     │   Elapsed Time  │"
    print " │ " + ut.time_str("full") + " │ " + ut.time_str("full") + " │ " + "  ?? Hr ?? min " + " │"
    print " ---------------------------------------------------------------"
Example no. 19
def generate_tasks(project, sstr):
    config = get_task_config(project, sstr)
    mh.add_new_task(project, sstr, ut.time_str(
        "full"), config[0], config[1], config[2], 0)