Code Example #1
File: ESOperate.py Project: Daliji/easycrawler
    def show_index_info(self):
        '''
        Fetch and log the mapping information for the index.
        :return:
        '''
        _mapping = self.es.indices.get_mapping(index=self.index_name)
        log.info(_mapping)
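A minimal, self-contained sketch of how such a helper might be driven, assuming the official elasticsearch-py client, a node at localhost:9200, and a hypothetical index name (uestc_news is only an illustration, not taken from the snippet):

# Hedged sketch: client setup and index name are assumptions, not project code.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

def show_index_info(es, index_name):
    # Fetch and print the mapping of the given index.
    mapping = es.indices.get_mapping(index=index_name)
    print(mapping)

show_index_info(es, "uestc_news")  # hypothetical index name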
Code Example #2
File: crawler.py Project: Daliji/easycrawler
    def start(self, *args, **kwargs):
        # status: 2 = fully stopped, 1 = running; any other value makes the loop exit.
        if self.status != 2:
            log.error("crawler has not fully stopped")
            return

        i = 0
        self.status = 1
        while self.status == 1:
            page_index = i % 10 + 1  # cycle through pages 1-10
            i += 1
            log.info('now get page {} .'.format(page_index))
            contents = self.crawler.page_contents(page_index)
            need_sleep = False
            if contents is None:
                log.error("page {} error, now sleep!".format(page_index))
                need_sleep = True
            else:
                if len(contents) == 0: continue
                self.crawler.save_to_db(contents)
                self.data_count += len(contents)
            if i % 10 == 0:  # sleep after every full pass over pages 1-10
                log.info('crawl finished, crawler sleeping for ten minutes')
                need_sleep = True
            if need_sleep: time.sleep(600)
        self.status = 2
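The status field doubles as a stop flag: 2 means fully stopped, 1 means running, and assigning any other value asks the loop above to exit, after which start() resets it to 2. A self-contained sketch of that protocol, using a stub class that only mirrors the flag handling (the stub and its sleep interval are illustrative, not the project's crawler):

import threading
import time

class CrawlerStub:
    # Minimal stand-in mirroring the status protocol of start() above:
    # 2 = fully stopped, 1 = running, anything else = stop requested.
    def __init__(self):
        self.status = 2

    def start(self):
        if self.status != 2:
            return
        self.status = 1
        while self.status == 1:
            time.sleep(0.1)  # the real loop fetches and stores a page here
        self.status = 2

crawler = CrawlerStub()
worker = threading.Thread(target=crawler.start, daemon=True)
worker.start()
time.sleep(1)
crawler.status = 0  # request a stop; the loop exits at the next check
worker.join()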
Code Example #3
File: uestcnews.py Project: Daliji/easycrawler
    def page_contents(self, page_index):
        '''
        Fetch the URL and related information for every article on one listing page.
        :return: a list of article dicts, or None on error
        '''
        url_head = self._get_news_page_url(page_index)
        try:
            contents = []
            url_response = requests.get(url_head)
            url_soup = BeautifulSoup(url_response.text, "html.parser")
            news_menus = url_soup.find_all('a', class_="cell")
            for news_menu in news_menus:
                # Each menu entry is one news category; crawl its listing page.
                news_type = news_menu.get_text().replace('\n', '').strip()
                url_tail = news_menu.attrs['href']
                page_url = 'https://news.uestc.edu.cn' + str(url_tail) + '&page=' + str(page_index)
                response = requests.get(page_url)
                soup = BeautifulSoup(response.text, "html.parser")
                news = soup.select('div[id="Degas_news_list"] > ul > li')
                for item in news:
                    title = item.select('h3 > a')[0].get_text().replace('\n', '').replace("'", r"\'").strip()
                    brief = item.find_all('p', class_="desc")[0].get_text().replace('\n', '').replace("'", r"\'").strip()
                    address_a = item.select('h3 > a')[0].attrs['href']
                    url = 'https://news.uestc.edu.cn' + str(address_a)
                    if url in self.url_list:
                        continue  # already crawled
                    log.info('crawling: ' + url)
                    page = requests.get(url)
                    contents_a = BeautifulSoup(page.text, "html.parser").find_all(
                        'div', class_="Degas_news_content")[0].contents[1]
                    content = str(contents_a).replace("'", r"\'")
                    create_time = item.find_all('span', class_="time")[0].get_text().replace('\n', '').strip()
                    contents.append({
                        'title': title,
                        'brief': brief,
                        'url': url,
                        'content': content,
                        'news_type': news_type,
                        'create_time': create_time
                    })
                    self.url_list.append(url)
            return contents
        except Exception as e:
            log.error('page_contents failed: {}'.format(e))
            return None
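The selectors above can be exercised without touching the live site. A hedged sketch that runs the same extraction against inline HTML (the markup below is invented to match the selectors, not copied from news.uestc.edu.cn):

from bs4 import BeautifulSoup

# Invented markup shaped like the listing page the selectors expect.
sample_html = '''
<div id="Degas_news_list">
  <ul>
    <li>
      <h3><a href="/show-123.html">Sample headline</a></h3>
      <p class="desc">Sample brief text.</p>
      <span class="time">2020-01-01</span>
    </li>
  </ul>
</div>
'''

soup = BeautifulSoup(sample_html, "html.parser")
for item in soup.select('div[id="Degas_news_list"] > ul > li'):
    title = item.select('h3 > a')[0].get_text().strip()
    brief = item.find_all('p', class_="desc")[0].get_text().strip()
    href = item.select('h3 > a')[0].attrs['href']
    created = item.find_all('span', class_="time")[0].get_text().strip()
    print(title, '|', brief, '|', href, '|', created)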
Code Example #4
File: uestcnews.py Project: Daliji/easycrawler
    def save_to_db(self, data_list):
        log.info('insert into db ...')
        for d in data_list:
            try:
                # Values were single-quote-escaped in page_contents(); corp_id and
                # index_name are fixed for this crawler.
                sql = "insert into uestc_news(title,brief,address,content,create_time,corp_id,news_menu,index_name) values ('{}','{}','{}','{}','{}',{},'{}','{}')".format(
                    d['title'], d['brief'], d['url'], d['content'],
                    d['create_time'] + " 00:00:00", 136, d['news_type'],
                    'uestc_news')
                self.db.update(sql)
            except Exception:
                log.error('insert into db error: ' + d['url'])
                continue
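The INSERT above interpolates values with str.format and relies on the quote escaping done in page_contents(). A common alternative is a parameterized query; since the project's self.db wrapper is not shown here, the sketch below uses pymysql directly, and the connection details are assumptions:

import pymysql

# Hedged sketch: host, credentials and database name are placeholders.
conn = pymysql.connect(host="localhost", user="root", password="", database="crawler")

def save_to_db(conn, data_list):
    sql = ("insert into uestc_news"
           "(title, brief, address, content, create_time, corp_id, news_menu, index_name) "
           "values (%s, %s, %s, %s, %s, %s, %s, %s)")
    with conn.cursor() as cursor:
        for d in data_list:
            # The driver handles quoting, so no manual escaping is needed.
            cursor.execute(sql, (d['title'], d['brief'], d['url'], d['content'],
                                 d['create_time'] + " 00:00:00", 136,
                                 d['news_type'], 'uestc_news'))
    conn.commit()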
Code Example #5
File: ESOperate.py Project: Daliji/easycrawler
    def put_data(self, data):
        actions = []

        for d in data:
            action = {"_index": self.index_name, "_source": d}
            if 'id' in d: action['_id'] = d['id']
            actions.append(action)

        success, _ = bulk(self.es,
                          actions,
                          index=self.index_name,
                          raise_on_error=True)
        log.info('Performed %d actions' % success)
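An end-to-end sketch of the same bulk pattern, assuming a local node; the index name and documents below are illustrative:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch("http://localhost:9200")
index_name = "uestc_news"  # assumed index name

docs = [
    {"id": 1, "title": "Example title", "content": "Example content"},
    {"title": "Document without an explicit id"},
]

actions = []
for d in docs:
    action = {"_index": index_name, "_source": d}
    if "id" in d:
        action["_id"] = d["id"]  # reuse the document's own id when present
    actions.append(action)

success, _ = bulk(es, actions, raise_on_error=True)
print("Performed %d actions" % success)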
Code Example #6
File: uestcnews.py Project: Daliji/easycrawler
def _run():
    c = UESTCCrawler()
    i = 0
    while True:
        page_index = i % 10 + 1  # cycle through pages 1-10
        i += 1
        log.info('now get page {} .'.format(page_index))
        contents = c.page_contents(page_index)
        need_sleep = False
        if contents is None:
            log.error("page {} error, now sleep!".format(page_index))
            need_sleep = True
        else:
            c.save_to_db(contents)
        if i % 10 == 0:  # sleep after every full pass over pages 1-10
            log.info('crawl finished, crawler sleeping for ten minutes')
            need_sleep = True
        if need_sleep: time.sleep(600)
Code Example #7
File: uestc_news_es.py Project: Daliji/easycrawler
                    "search_analyzer": "ik_smart"
                },
                "content": {  # 内容
                    "type": "text",
                    "analyzer": "ik_smart",
                    "search_analyzer": "ik_smart",
                    "similarity": {
                        "my_custom_similarity": {
                            "type": "BM25",  # 使用BM25算法索引和查找
                            "k1": 2,
                            "b": 0.75
                        }
                    }
                },
                "address": {  # 文档地址
                    "type": "text",
                    "index": "false"
                }
            }
        }
        return content_mapping


# for test
if __name__ == '__main__':
    es = UESTCNEWS()
    es.load_data()
    log.info("############## search result ##################")
    log.info(es.query_data(sentence='习近平', topn=5))
    log.info("###############################################")