Ejemplo n.º 1
0
 def __init__(self):
     """Load crawl settings, build the similarity scorer and register feed URLs.

     Reads ``threshold``, ``days``, ``maxPage``, ``decoding``,
     ``target_path``, ``stopwords_path`` and ``corpus`` from the ``Section``
     section of the configuration returned by ``tools.load_config()``, then
     stores the channel URLs on the instance and adds them to
     ``self.start_urls``.
     """
     cf = tools.load_config()
     self.threshold = float(cf.get('Section', 'threshold'))
     self.days = int(cf.get('Section', 'days'))
     self.maxPage = int(cf.get('Section', 'maxPage'))
     self.decoding = cf.get('Section', 'decoding')
     target_path = cf.get('Section', 'target_path')
     stopwords_path = cf.get('Section', 'stopwords_path')
     dict_path = cf.get('Section', 'corpus')
     self.s = similarity.TextSimilarity(target_path, stopwords_path,
                                        dict_path)
     # Scan batch identifier: the current timestamp rendered as a string.
     self.scan_id = str(time.time())
     # Home page
     self.science_url = 'https://pacaio.match.qq.com/irs/rcd?cid=58&token=c232b098ee7611faeffc46409e836360&ext=tech&page='
     # Internet
     # NOTE(review): unlike every other URL below, this one already ends in
     # ``page=1`` rather than a bare ``page=``. If the consumer appends a page
     # number to each entry this would request ``page=1N`` -- confirm intent.
     self.internet_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=614,603,605,611,612,613,615,620,618&page=1'
     # IT
     self.it_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=604,609&page='
     # Blockchain
     self.blockchain_url = 'https://pacaio.match.qq.com/tags/tag2articles?id=276813&num=15&page='
     # AI
     self.ai_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=602,608,622&page='
     # Entrepreneurship and innovation
     self.innovate_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=619,617,610&page='
     # Frontier technology
     self.leadingSci_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=607,616,623,624&page='
     # Add the channel URLs to start_urls.
     # Nothing in this method creates ``start_urls``, so the name resolves to
     # an attribute defined elsewhere (presumably a class-level list). Bind a
     # per-instance copy instead of appending in place, so that building a
     # second instance does not pile duplicate URLs onto the shared list.
     self.start_urls = list(self.start_urls) + [
         self.science_url,
         self.internet_url,
         self.it_url,
         self.blockchain_url,
         self.ai_url,
         self.innovate_url,
         self.leadingSci_url,
     ]
Ejemplo n.º 2
0
 def __init__(self):
     """Read the ``Section`` settings and build the text-similarity helper.

     Stores ``threshold`` (float), ``days`` and ``maxPage`` (ints) on the
     instance, constructs ``similarity.TextSimilarity`` from the configured
     target and stop-word paths, and stamps the instance with a scan id.
     """
     config = load_config()
     section = 'Section'

     # Numeric settings are converted from their raw configuration values.
     self.threshold = float(config.get(section, 'threshold'))
     self.days = int(config.get(section, 'days'))
     self.maxPage = int(config.get(section, 'maxPage'))

     # Arguments are evaluated left to right: target path, then stop words.
     self.s = similarity.TextSimilarity(
         config.get(section, 'target_path'),
         config.get(section, 'stopwords_path'),
     )

     # Scan batch identifier: the current timestamp rendered as a string.
     self.scan_id = str(time.time())
Ejemplo n.º 3
0
 def __init__(self):
     """Read the ``Section`` settings and initialise crawl state.

     Stores ``threshold``, ``days``, ``maxPage`` and ``decoding`` on the
     instance, constructs ``similarity.TextSimilarity`` from the configured
     target, stop-word and corpus paths, and sets the scan id, an empty
     ``category_urls`` list, the starting ``page`` and the base URL.
     """
     config = tools.load_config()
     section = 'Section'

     # Numeric settings are converted from their raw configuration values.
     self.threshold = float(config.get(section, 'threshold'))
     self.days = int(config.get(section, 'days'))
     self.maxPage = int(config.get(section, 'maxPage'))

     # Remaining options are read in the same order as before.
     target = config.get(section, 'target_path')
     corpus = config.get(section, 'corpus')
     stopwords = config.get(section, 'stopwords_path')
     self.decoding = config.get(section, 'decoding')
     self.s = similarity.TextSimilarity(target, stopwords, corpus)

     # Scan batch identifier: the current timestamp rendered as a string.
     self.scan_id = str(time.time())

     self.category_urls, self.page = [], 1
     # Base URL; it ends in ``page=``.
     self.tech_url = 'http://news.sina.com.cn/roll/#pageid=153&lid=2515&page='