def sendCrawledNewsFromHostToHost(from_host_type, to_host_type, from_index,
                                  to_index, logger):
     """Copy crawled news documents from one Elasticsearch host to another.

     The documents are whatever
     ``ElasticSearchUtils.getAllNewsFromHostInOneDay(from_index,
     from_host_type)`` returns. Nothing is done when ``from_index`` does not
     exist on the source host. When ``to_index`` is missing on the
     destination host, it is first created through
     ``MappingElasticSearch.mappingIndicesToHost`` and its max result window
     is set to 5000000.

     Every document is written under its original ``_id``. If its source has
     an ``images`` list, entries whose length is 2000 or more are removed
     (in place) before indexing. The response of each index call is printed.

     ``logger`` is accepted but currently unused.
     """
     source_client = create_client_elastic_search(from_host_type)
     target_client = create_client_elastic_search(to_host_type)

     # Guard clause: without a source index there is nothing to transfer.
     if not source_client.indices.exists(index=from_index):
         return

     daily_news = ElasticSearchUtils.getAllNewsFromHostInOneDay(
         from_index, from_host_type)

     # Prepare the destination index before the first write.
     if not target_client.indices.exists(index=to_index):
         MappingElasticSearch.mappingIndicesToHost(to_index, to_host_type)
         ElasticSearchUtils.settingMaxResultSearch(to_index, to_host_type,
                                                   5000000)

     for document in daily_news:
         payload = document['_source']
         if 'images' in payload:
             # Keep only image entries shorter than 2000.
             payload['images'] = [
                 image for image in payload['images'] if len(image) < 2000
             ]
         outcome = target_client.index(index=to_index,
                                       id=document['_id'],
                                       body=payload)
         print(outcome)
 def getAllTalentNewsFromHost(host_type, index):
     """Return every hit stored in ``index`` on the given host.

     Runs a ``match_all`` search repeatedly, 100 hits per request, advancing
     the ``from`` offset until a request comes back with no hits.

     If any exception is raised while connecting or searching, the index
     mapping is created via ``MappingElasticSearch.mappingIndicesToHost``,
     its max result window is set to 5000000, and an empty list is returned
     (hits collected so far are discarded).
     """
     page_size = 100
     offset = 0
     collected = []
     try:
         client = create_client_elastic_search(host_type)
         while True:
             query = {
                 "query": {
                     "match_all": {}
                 },
                 "size": page_size,
                 "from": offset
             }
             page = client.search(index=index, body=query)['hits']['hits']
             if not page:
                 # An empty page marks the end of the result set.
                 return collected
             collected.extend(page)
             offset += page_size
     except Exception:
         # Best-effort recovery: make sure the index exists and is configured.
         MappingElasticSearch.mappingIndicesToHost(index=index,
                                                   host_type=host_type)
         ElasticSearchUtils.settingMaxResultSearch(index=index,
                                                   host_type=host_type,
                                                   max_result=5000000)
         return []
 def mappingIndicesToHost(index, host_type):
     """Create ``index`` on the given host with the news mapping.

     The index defines a custom ``news_analyzer`` (standard tokenizer,
     ``html_strip`` char filter, ``lowercase`` + ``asciifolding`` filters)
     used by the ``title``, ``summary`` and ``content`` text fields.
     ``url``, ``source`` and ``images`` are keywords; ``published_date`` and
     ``indexed_date`` are dates.

     The outcome is only reported with ``print``; nothing is returned.
     """

     def analyzed_text():
         # Build a fresh dict per field so the text fields share no state.
         return {
             "type": "text",
             "analyzer": "news_analyzer",
             "search_analyzer": "news_analyzer"
         }

     news_analyzer = {
         "type": "custom",
         "tokenizer": "standard",
         "char_filter": ["html_strip"],
         "filter": ["lowercase", "asciifolding"]
     }
     properties = {
         "url": {"type": "keyword"},
         "title": analyzed_text(),
         "summary": analyzed_text(),
         "content": analyzed_text(),
         "source": {"type": "keyword"},
         "images": {"type": "keyword"},
         "published_date": {"type": "date"},
         "indexed_date": {"type": "date"}
     }
     mapping = {
         "settings": {
             "analysis": {
                 "analyzer": {
                     "news_analyzer": news_analyzer
                 }
             }
         },
         "mappings": {
             "properties": properties
         }
     }

     client = create_client_elastic_search(host_type)
     # ignore=400 is passed so an HTTP 400 (presumably "index already
     # exists") comes back as an error body, which is inspected below.
     response = client.indices.create(index=index, body=mapping, ignore=400)
     print("done")

     if 'acknowledged' not in response:
         if 'error' in response:
             failure = response['error']
             print("ERROR:", failure['root_cause'])
             print("TYPE:", failure['type'])
     elif response['acknowledged']:
         print("INDEX MAPPING SUCCESS FOR INDEX:", response['index'])
     print(response)
 def getNumOfCrawledNewsInOneDay(host_type, index):
     """Count documents in ``index`` indexed within the one-day window.

     The window is an Elasticsearch date-math range on ``indexed_date`` from
     ``now-1d/d`` to ``now/d`` (both bounds inclusive, rounded to the day).
     Returns the ``count`` field of the count response.
     """
     client = create_client_elastic_search(host_type)
     day_window = {"gte": "now-1d/d", "lte": "now/d"}
     query = {"query": {"range": {"indexed_date": day_window}}}
     return client.count(index=index, body=query)['count']
 def process_item(self, item, spider):
     """Index ``item`` into Elasticsearch when the spider enables this pipeline.

     The pipeline is active only when ``"TalentCrawledDataPipeline"`` is
     listed in ``spider.pipelines`` (a missing attribute counts as an empty
     list). When it is not active the item is returned untouched, so later
     pipeline stages still receive it instead of ``None``.

     When active, ``item["indexed_date"]`` is stamped with
     ``get_instance_time_iso_format()`` and, unless the item's ``url`` is the
     ``https://baomoi.com/404`` page, ``dict(item)`` is indexed into
     ``talent_crawled_index`` under the id from ``create_item_id(item)``.

     Returns:
         The (possibly stamped) item, on every path.
     """
     if "TalentCrawledDataPipeline" not in getattr(spider, 'pipelines', []):
         # Not our spider: hand the item on unchanged rather than dropping it.
         return item

     index_id = create_item_id(item)
     item["indexed_date"] = get_instance_time_iso_format()
     if "https://baomoi.com/404" != item['url']:
         try:
             host = create_client_elastic_search(SERVER_HOST_NAME)
             host.index(index=talent_crawled_index, id=index_id, body=dict(item))
         except elasticsearch.exceptions.NotFoundError:
             # NOTE(review): the write above targets talent_crawled_index,
             # while the recovery below creates
             # settings['ELASTIC_SEARCH_INDEX']; the item is also not
             # re-indexed after the mapping is created. Confirm both are
             # intended.
             MappingElasticSearch.mappingIndicesToHost(index=self.settings['ELASTIC_SEARCH_INDEX'], host_type=SERVER_HOST_NAME)
             ElasticSearchUtils.settingMaxResultSearch(index=self.settings['ELASTIC_SEARCH_INDEX'], host_type=SERVER_HOST_NAME, max_result= 5000000)
     return item
 def getCrawledNewsFormHostInOneDay(host_type, index, from_value,
                                    size_value):
     """Fetch one page of hits from ``index`` within the one-day window.

     The search filters ``indexed_date`` with the Elasticsearch date-math
     range ``now-1d/d`` .. ``now/d`` and is paged with ``from_value`` (offset)
     and ``size_value`` (page size). Returns the hits as a new list.
     """
     client = create_client_elastic_search(host_type)
     query = {
         "query": {
             "range": {
                 "indexed_date": {
                     "gte": "now-1d/d",
                     "lte": "now/d"
                 }
             }
         },
         "size": size_value,
         "from": from_value
     }
     response = client.search(index=index, body=query)
     return list(response['hits']['hits'])
def create_index(index, index_id, new_item, host_type, logger):
    """Index a single document into ``index`` on the given host.

    The index mapping is created first (through
    ``MappingElasticSearch.mappingIndicesToHost``) when the index does not
    exist yet; the document is then written with ``helpers.bulk``.

    Args:
        index: Name of the target index.
        index_id: Document id to store the item under.
        new_item: Mapping-like item; converted with ``dict()`` before it is
            indexed and logged.
        host_type: Host selector passed to ``create_client_elastic_search``.
        logger: Logger used to report success or a missing index.

    An Elasticsearch ``NotFoundError`` is logged instead of raised; any other
    exception propagates to the caller.
    """
    try:
        server_elastic_search = create_client_elastic_search(host_type)
        if not server_elastic_search.indices.exists(index=index):
            MappingElasticSearch.mappingIndicesToHost(index, host_type)
        # Convert once and reuse for both the document and the log line, so
        # a non-dict mapping item cannot break json.dumps after the write.
        source = dict(new_item)
        actions = [{'_index': index, '_id': index_id, '_source': source}]
        helpers.bulk(server_elastic_search,
                     actions,
                     chunk_size=1000,
                     request_timeout=200)
        # NOTE(review): fixed 3 s pause after every write - presumably
        # throttling; confirm it is still needed.
        time.sleep(3)
        logger.info("index success " + json.dumps(source))
        # Drop the local references before forcing a collection so the client
        # and payload can be reclaimed immediately.
        del server_elastic_search
        del source
        del actions
        gc.collect()
    except elasticsearch.exceptions.NotFoundError:
        logger.error("Not found index " + str(index) + " in host " +
                     str(host_type))
 def getNODocOfIndex(host_type, index):
     """Return the ``count`` reported by an unfiltered count on ``index``."""
     client = create_client_elastic_search(host_type)
     return client.count(index=index)["count"]
 def settingMaxResultSearch(index, host_type, max_result):
     """Set the ``max_result_window`` index setting of ``index`` to ``max_result``."""
     window_settings = {"index": {"max_result_window": max_result}}
     client = create_client_elastic_search(host_type)
     client.indices.put_settings(index=index, body=window_settings)