class Paper:
    logger = get_logger(TAG=__name__, file_name=__name__)

    def __init__(self):
        # ACL ID
        self._id = ""
        self.url_id = ""
        self.title = ""
        self.authors = []
        self.authors_full_name = []
        self.venue = ""
        self.year = 0
        self.abstract = ""
        # Papers this paper cites --> paper._id
        self.out_citations = []
        # Papers that cite this paper --> paper._id
        self.in_citations = []
        # Each citation is a dict with: paper_id (--> paper._id),
        # line (the line it occurs on), sentence (the citing sentence)
        self.citing_sentences = []
        self.session = ""

    def save(self):
        # Write a single document to MongoDB
        try:
            col_paper.insert_one(self.__dict__)
            self.logger.info("Saved\tid:" + self._id)
        except Exception as e:
            self.logger.error("id:%s\turl_id:%s\t%s" % (self._id, self.url_id, e))
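# A minimal usage sketch for the Paper model above (illustrative, not part of
# the original module); assumes the MongoDB collection col_paper from util.dao
# is reachable. The ids and field values are made up.
paper = Paper()
paper._id = "P00-1001"      # ACL ID
paper.url_id = "1"          # numeric id used in aan.how URLs
paper.title = "Example Title"
paper.year = 2000
# save() dumps the instance's __dict__ straight into MongoDB, so the stored
# document has exactly the attribute names defined in __init__.
paper.save()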
def __init__(self, things, thread_num=6):
    """
    :param things: list of author-page URLs
    :param thread_num: number of worker threads
    """
    self.thread_num = thread_num
    self.things = things
    self.loggers = [
        get_logger(__name__ + str(x), __name__ + str(x))
        for x in range(1, thread_num + 1)
    ]
import time

import requests
from fake_useragent import UserAgent


# Throttle and get_logger come from this project's util package
class Downloader:
    logger = get_logger(__name__)

    def __init__(self, url, delay=5,
                 user_agent=r"Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)",
                 num_retries=0):
        """
        :param url: page to download
        :param delay: minimum interval (seconds) enforced by Throttle
        :param user_agent: fallback UA string; in practice a random UA from
                           fake_useragent is sent with every request
        :param num_retries: how many times to retry after a request failure
        """
        self.throttle = Throttle(delay)
        self.user_agent = UserAgent()
        self.num_retries = num_retries
        self.url = url

    def __call__(self):
        return self.dow(self.url)

    def dow(self, url):
        result = self.download(url=url,
                               headers={"User-Agent": self.user_agent.random},
                               num_retries=self.num_retries)
        return result

    def download(self, url, headers, num_retries):
        self.logger.info("Downloading: " + url)
        try:
            req = requests.get(url, headers=headers)
            html = req.text
            if 500 <= req.status_code < 600:
                # Server error: wait, then re-issue the request once and
                # check the *new* response, not the stale one
                time.sleep(2)
                req = requests.get(url, headers=headers)
                html = req.text
                if 500 <= req.status_code < 600:
                    html = ""
        except requests.exceptions.RequestException as e:
            self.logger.error(e)
            html = None
            # requests exceptions carry no HTTP status code, so retry on any
            # request failure while retries remain
            if num_retries > 0:
                html = self.download(url=url, headers=headers,
                                     num_retries=num_retries - 1)
        # if html is None:
        #     self.throttle.wait(url)
        #     html = self.download(url=url, headers=headers, num_retries=num_retries)
        return html
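# Usage sketch (illustrative): Downloader is a callable wrapper, invoked the
# same way AuthorPage does with Downloader(host + self._id)().
html = Downloader("http://aan.how/browse/paper/1", delay=5, num_retries=2)()
if html:
    from lxml import etree
    selector = etree.HTML(html)  # parse with lxml, as the page classes do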
"""
Author       : Carl
Author_email : [email protected]
Date         :
File Name    : meta_data.py
Description  :
-------------------------------------------------
# If this run wrong, don't ask me , I don't know why;
# If this run right, thank god, and I don't know why.
# Maybe the answer, my friend, is blowing in the wind.
-------------------------------------------------
"""
__author__ = 'Carl'

import re

from util.dao import col_paper, col_author
from util.Log import get_logger

logger = get_logger(TAG="paper", file_name="paper")


class MetaData:
    def __init__(self):
        self.paper_id = None
        self.title = None
        self.authors = None
        self.venue = None
        self.year = None


def write_data():
    with open("../data_set/acl-metadata.txt", "r", encoding='UTF-8') as f:
        while True:
            try:
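# Hedged sketch of how the truncated write_data loop could parse one record,
# assuming acl-metadata.txt uses the "key = {value}" blocks of the ACL/AAN
# metadata dump. The field names below are assumptions, not verified against
# the actual file.
RECORD_RE = re.compile(r"(\w+)\s*=\s*\{(.*)\}")


def parse_block(lines):
    """Turn one blank-line-delimited record into a MetaData instance."""
    meta = MetaData()
    for line in lines:
        m = RECORD_RE.match(line.strip())
        if not m:
            continue
        key, value = m.group(1), m.group(2)
        if key == "id":
            meta.paper_id = value
        elif key == "title":
            meta.title = value
        elif key == "author":
            meta.authors = [a.strip() for a in value.split(";")]
        elif key == "venue":
            meta.venue = value
        elif key == "year":
            meta.year = int(value)
    return meta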
# -*- coding:utf8 -*-
import threading
from time import ctime, time

from spider.paper_page2 import PaperPage
from util.Log import get_logger
from logging import Logger

logger = get_logger("mutil_thread_paper")
lock = threading.Lock()


class PageThread(object):
    def __init__(self, length, thread_num=6):
        self.thread_num = thread_num
        self.length = length
        self.loggers = [
            get_logger(__name__ + str(x), __name__ + str(x))
            for x in range(1, thread_num + 1)
        ]
        logger.info("tasks: %s\tthreads: %s" % (str(length), str(thread_num)))

    def get_range(self):
        # Split [0, length) evenly across threads; the last thread also
        # takes the remainder
        ranges = []
        length = self.length
        offset = int(int(length) / self.thread_num)
        for i in range(self.thread_num):
            if i == (self.thread_num - 1):
                ranges.append((i * offset, length))
            else:
                ranges.append((i * offset, (i + 1) * offset))
        return ranges
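# For example, with length=100 and the default six threads, get_range assigns
# the remainder to the last thread (illustrative demo, not part of the module):
thread = PageThread(length=100, thread_num=6)
print(thread.get_range())
# [(0, 16), (16, 32), (32, 48), (48, 64), (64, 80), (80, 100)]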
class Author:
    logger = get_logger(TAG=__name__, file_name=__name__)

    def __init__(self, _id=None, full_name=None):
        """
        :param _id: author id, stored as a string
        :param full_name:
        """
        self._id = str(_id)
        self.full_name = full_name
        self.papers_count = 0
        # For ease of computation, partners include the author themselves
        self.partners_full_name = []
        self.partners_id = []
        self.affiliations = []
        self.collaborators = []
        # Papers published by this author --> paper.url_id
        self.publications = []
        self.insert_flag = True
        self.search()

    def search(self):
        """
        Look up this author in the database by id and, if found,
        hydrate the instance from the stored document.
        """
        try:
            au = col_author.find_one({"_id": self._id})
            if au is not None:
                if not self.full_name:
                    self.full_name = au["full_name"]
                self.partners_full_name = au['partners_full_name']
                self.partners_id = au['partners_id']
                self.papers_count = au['papers_count']
                self.affiliations = au['affiliations']
                self.collaborators = au['collaborators']
                # Papers published by this author --> paper.url_id
                self.publications = au['publications']
                self.insert_flag = False
        except Exception as e:
            self.logger.error("id:%s\t%s" % (self._id, e))

    def add_partner_full_name(self, authors_name: list):
        rs = set(self.partners_full_name)
        rs = rs.union(authors_name)
        self.partners_full_name = list(rs)

    def add_partner_id(self, _id: list):
        rs = set(self.partners_id)
        rs = rs.union(_id)
        self.partners_id = list(rs)

    def save(self):
        # Write to MongoDB: insert new authors, replace existing ones
        try:
            if self.insert_flag:
                col_author.insert_one(self.__dict__)
            else:
                col_author.replace_one({"_id": self._id}, self.__dict__)
            self.logger.info("Saved\tid:" + self._id)
        except Exception as e:
            self.logger.error("id:%s\t%s" % (self._id, e))
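# Intended flow (illustrative ids; requires the MongoDB connection from
# util.dao): search() runs inside the constructor, so an existing record is
# hydrated automatically, and save() picks insert vs. replace via insert_flag.
author = Author(_id="42", full_name="Jane Doe")
# add_partner_* use set union, so repeated names/ids are deduplicated
author.add_partner_full_name(["John Smith", "Jane Doe"])
author.add_partner_id(["7", "42"])
author.save()  # insert_one if new, replace_one if already in col_author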
class AuthorPage(object):
    # Validity of the page data; invalid pages are neither parsed nor stored
    valid = False
    logger = get_logger(__name__, __name__)

    def __init__(self, _id, content=None, **kwargs):
        self._id = str(_id)
        self.author = Author(_id=self._id)
        if not self.author.full_name:
            self.valid = True
        else:
            return
        if self.author.insert_flag:
            if content is None:
                self.content = Downloader(host + self._id)()
                if self.content:
                    self.valid = True
                else:
                    self.logger.error("Page is empty, nothing to parse\t_id:" + self._id)
                    self.valid = False
                    return
            else:
                self.valid = True
                self.content = content
            self.selector = etree.HTML(self.content)

    def run(self):
        if not self.valid:
            self.logger.info("Author already exists\t_id:" + self._id)
            return
        self.logger.info("Start parsing author: " + self._id)
        self.main_page()
        if not self.author.insert_flag:
            self.logger.info("Invalid page, discarded: " + self._id)
            return
        self.get_partners()
        self.get_papers()
        self.author.save()
        self.save_publications()
        self.logger.info("Finished author: " + self._id)

    def main_page(self):
        self.author.full_name = deep_select(self.selector, 0,
                                            xpath="//head/title/text()").replace("AAN: ", "")
        if "ValueError" in self.author.full_name:
            self.author.insert_flag = False
            self.valid = False
            return
        self.author.publications = deep_select(self.selector, 0,
                                               xpath="//table/tbody/tr[1]/td/text()")
        self.author.affiliations = deep_select(self.selector, return_type="list",
                                               xpath="//table/tbody/tr[5]/td/ul/li/text()")

    def get_partners(self):
        if not self.valid:
            return
        self.selector = etree.HTML(
            Downloader('http://aan.how/browse/author/collaborators/' + self._id)())
        name = deep_select(self.selector, return_type="list",
                           xpath="//tr[@class='gradeA']/td[1]/a/text()")
        self.author.partners_full_name = name
        # Number of co-authored papers per collaborator
        num = deep_select(self.selector, return_type="list",
                          xpath="//tr[@class='gradeA']/td[2]/text()")
        for x in range(len(name)):
            papers_id = deep_select(self.selector, return_type="list",
                                    xpath="//tr[@class='gradeA'][" + str(x + 1) + "]/td[3]/a/text()")
            self.author.collaborators.append(
                {"author": name[x], "num": num[x], "papers_id": papers_id})
        partners_id = deep_select(self.selector, return_type="list",
                                  xpath="//tr[@class='gradeA']/td[1]/a/@href")
        self.author.partners_id = [to_num(x) for x in partners_id]

    def get_papers(self):
        if not self.valid:
            return
        self.selector = etree.HTML(
            Downloader('http://aan.how/browse/author/publications/' + self._id)())
        papers_url_id = deep_select(self.selector, return_type="list",
                                    xpath="//tr[@class='gradeA']/td[2]/a/@href")
        self.author.papers_count = len(papers_url_id)
        self.author.publications = [to_num(x) for x in papers_url_id]
        # Crawl the publication pages with multiple threads
        # paper_thread = PaperPageThread(self.author.publications, 10)
        # paper_thread.start()

    def save_many(self, papers: list):
        # Insert several Paper objects at once
        ids = [x._id for x in papers]
        try:
            col_paper.insert_many([x.__dict__ for x in papers])
        except Exception as e:
            self.logger.error("ids:%s\t%s" % (ids, e))

    def save_publications(self):
        for x in self.author.publications:
            col_page.update_one({"_id": x}, {'$set': {"used": False}}, True)
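# The per-author pipeline is driven through run() (sketch; "123" is an
# illustrative aan.how author id):
page = AuthorPage("123")  # downloads host + "123" if the author is new
page.run()                # main page, collaborators, publications, then save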
"""
Description :
-------------------------------------------------
# If this run wrong, don't ask me , I don't know why;
# If this run right, thank god, and I don't know why.
# Maybe the answer, my friend, is blowing in the wind.
-------------------------------------------------
"""
__author__ = 'Carl'

from util.downloader import Downloader
from util.Log import get_logger
from model.paper import Paper
from util.stringUtil import *
from util.dao import col_paper
from lxml import etree

logger = get_logger("paper_page", "paper_page")
host = "http://aan.how/browse/paper/"


class PaperPage(object):
    # Validity of the page data; invalid pages are neither parsed nor stored
    valid = False

    def __init__(self, _id, content=None, **kwargs):
        self._id = str(_id)
        self.paper = Paper()
        page_data = col_paper.find_one({"url_id": self._id})
        if page_data:
            # Already in the database: nothing to do
            return
        if content is None:
            self.content = Downloader(host + self._id)()