def __init__(self):
    """Read crawl settings from the config and register the channel URLs.

    Populates ``threshold``, ``days``, ``maxPage`` and ``decoding`` from the
    config section named 'Section', builds the
    ``similarity.TextSimilarity`` instance, stamps this run with a
    ``scan_id``, stores one URL per channel and adds all of them to
    ``self.start_urls``.

    ``self.start_urls`` is not created here; it must already exist
    (presumably a class-level list -- confirm against the class body).
    """
    config = tools.load_config()

    def option(name):
        # Every setting read here lives in the section literally named 'Section'.
        return config.get('Section', name)

    self.threshold = float(option('threshold'))
    self.days = int(option('days'))
    self.maxPage = int(option('maxPage'))
    self.decoding = option('decoding')

    target_file = option('target_path')
    stopwords_file = option('stopwords_path')
    corpus_file = option('corpus')
    self.s = similarity.TextSimilarity(target_file, stopwords_file, corpus_file)

    # Scan batch: string form of the current timestamp.
    self.scan_id = str(time.time())

    # Home page
    self.science_url = 'https://pacaio.match.qq.com/irs/rcd?cid=58&token=c232b098ee7611faeffc46409e836360&ext=tech&page='
    # Internet
    # NOTE(review): this is the only URL ending in 'page=1' rather than a
    # bare 'page=' -- confirm that is intentional.
    self.internet_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=614,603,605,611,612,613,615,620,618&page=1'
    # IT
    self.it_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=604,609&page='
    # Blockchain
    self.blockchain_url = 'https://pacaio.match.qq.com/tags/tag2articles?id=276813&num=15&page='
    # AI
    self.ai_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=602,608,622&page='
    # Entrepreneurship and innovation
    self.innovate_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=619,617,610&page='
    # Frontier technology
    self.leadingSci_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=607,616,623,624&page='

    # Add every channel URL to start_urls, in the order listed above.
    self.start_urls.extend([
        self.science_url,
        self.internet_url,
        self.it_url,
        self.blockchain_url,
        self.ai_url,
        self.innovate_url,
        self.leadingSci_url,
    ])
def __init__(self):
    """Read crawl settings from the config and build the similarity helper.

    Populates ``threshold``, ``days`` and ``maxPage`` from the config
    section named 'Section', builds the ``similarity.TextSimilarity``
    instance from the configured target and stop-word paths, and stamps
    this run with a ``scan_id``.
    """
    config = load_config()

    def option(name):
        # Every setting read here lives in the section literally named 'Section'.
        return config.get('Section', name)

    self.threshold = float(option('threshold'))
    self.days = int(option('days'))
    self.maxPage = int(option('maxPage'))

    target_file = option('target_path')
    stopwords_file = option('stopwords_path')
    self.s = similarity.TextSimilarity(target_file, stopwords_file)

    # Scan batch: string form of the current timestamp.
    self.scan_id = str(time.time())
def __init__(self):
    """Read crawl settings from the config and initialise crawl state.

    Populates ``threshold``, ``days``, ``maxPage`` and ``decoding`` from the
    config section named 'Section', builds the
    ``similarity.TextSimilarity`` instance, stamps this run with a
    ``scan_id``, and sets the starting values of ``category_urls``,
    ``page`` and ``tech_url``.
    """
    config = tools.load_config()

    def option(name):
        # Every setting read here lives in the section literally named 'Section'.
        return config.get('Section', name)

    self.threshold = float(option('threshold'))
    self.days = int(option('days'))
    self.maxPage = int(option('maxPage'))

    target_file = option('target_path')
    corpus_file = option('corpus')
    stopwords_file = option('stopwords_path')
    self.decoding = option('decoding')
    self.s = similarity.TextSimilarity(target_file, stopwords_file, corpus_file)

    # Scan batch: string form of the current timestamp.
    self.scan_id = str(time.time())

    self.category_urls = []
    self.page = 1

    # URL
    self.tech_url = 'http://news.sina.com.cn/roll/#pageid=153&lid=2515&page='