class Ncli:
    """Command-line helper for the Elasticsearch indexes described in a YAML file.

    The configuration is read once in ``__init__`` and kept in
    ``self.parameters``. The keys this class reads are ``logging.format``,
    ``elasticsearch.host``, ``elasticsearch.port``, ``elasticsearch.indexes``
    (a list of entries with ``type``, ``name`` and ``doctype``) and
    ``directories.articles``.
    """

    _version = 1.0
    _yaml = 'nets.yaml'

    def __init__(self, yamlfile):
        """Load the configuration, set up logging and connect to Elasticsearch.

        Args:
            yamlfile: path of the YAML configuration file.

        Raises:
            SystemExit: when the Elasticsearch client raises
                ``ElasticsearchException`` while connecting.
        """
        with open(yamlfile) as f:
            # safe_load instead of a bare yaml.load(f): the latter needs an
            # explicit Loader on recent PyYAML releases and can build
            # arbitrary Python objects on older ones.
            self.parameters = yaml.safe_load(f)
        logging.basicConfig(format=self.parameters['logging']['format'])
        self.logger = logging.getLogger("NETS")
        # Use the public API rather than assigning the ``level`` attribute.
        self.logger.setLevel(logging.INFO)
        es_conf = self.parameters['elasticsearch']
        try:
            self.es = Elasticsearch(
                hosts=[{'host': es_conf['host'], 'port': es_conf['port']}])
            info = self.es.info()
            self.logger.info("Connected to Elasticsearch v. %s, name: %s",
                             info['version']['number'], info['name'])
        except ElasticsearchException:
            self.logger.info("Elasticsearch is not available.")
            # NOTE(review): the process exits with status 0 although the
            # connection failed; kept as-is in case callers rely on it.
            raise SystemExit(0)

    def indexinfo(self, target):
        """Look up the configured index whose ``type`` equals *target*.

        Args:
            target: value compared against each entry's ``type`` key.

        Returns:
            A ``(name, doctype)`` tuple for the first matching entry, or
            ``None`` when no entry matches.
        """
        for item in self.parameters['elasticsearch']['indexes']:
            if item['type'] == target:
                return item['name'], item['doctype']
        return None

    def status(self):
        """Log document counts for the raw-article and enhanced-article indexes.

        For each index that exists the total document count is logged. For the
        raw-article index the number of documents with ``status`` 1 is logged
        as well; an index that does not exist is reported as such.
        """
        idx_client = IndicesClient(self.es)
        for idx in ['raw-article', 'enhanced-article']:
            es_index = self.indexinfo(idx)[0]
            if idx_client.exists(es_index):
                self.logger.info("%s contains %s documents.",
                                 idx, self.es.count(index=es_index)['count'])
                # The original compared against 'article', which is never one
                # of the loop values, so this count was never reported.
                # 'raw-article' is the index whose ``status`` field is queried
                # by pipeline() and reset().
                if idx == 'raw-article':
                    query = {"query": {"term": {"status": 1}}}
                    self.logger.info(
                        "%s articles have been processed.",
                        self.es.count(index=es_index, body=query)['count'])
            else:
                self.logger.info("%s does not exist", es_index)

    def initialize(self, idx):
        """Drop (if present) and recreate the index configured for *idx*.

        Args:
            idx: index type as used in the configuration; when it is
                ``'event'`` the mapping returned by ``event_mapping()`` is
                installed on the new index.
        """
        es_index, es_doctype = self.indexinfo(idx)
        self.logger.info("Initializing %s", es_index)
        idx_client = IndicesClient(self.es)
        if idx_client.exists(es_index):
            idx_client.delete(es_index)
        idx_client.create(es_index)
        if idx == 'event':
            idx_client.put_mapping(doc_type=es_doctype, index=[es_index],
                                   body=event_mapping())
        self.logger.info("%s ready.", es_index)

    def pipeline(self, n):
        """Fetch up to *n* raw articles with ``status`` 0 and run them as a batch.

        Args:
            n: maximum number of hits requested from Elasticsearch.
        """
        self.eventpipeline = Pipeline(self.parameters)
        es_index, es_doctype = self.indexinfo('raw-article')
        self.logger.info("Send %s articles through the pipeline", n)
        query = '{"query": { "bool": { "must": { "match": { "status" : 0 }}}}}'
        result = self.es.search(index=es_index, doc_type=es_doctype,
                                size=n, body=query)
        articles = result['hits']['hits']
        self.eventpipeline.batch(articles)

    def load(self):
        """Index every article found in the configured articles directory.

        Each regular file directly inside ``directories.articles`` is read as
        one JSON document per line. Blank lines are skipped, a top-level
        ``_id`` key is removed, and the document is indexed into the
        raw-article index.
        """
        self.logger.info("Load articles")
        es_index, es_doctype = self.indexinfo('raw-article')
        path = self.parameters['directories']['articles']
        files = [join(path, f) for f in listdir(path) if isfile(join(path, f))]
        for filename in files:
            with open(filename) as data_file:
                # Parse the whole file before indexing so a malformed line
                # aborts before any of this file's documents are sent.
                # Blank lines (e.g. a trailing empty line) are ignored
                # instead of crashing json.loads.
                rows = [json.loads(row) for row in data_file if row.strip()]
            for article in rows:
                if '_id' in article:
                    del article['_id']
                self.es.index(index=es_index, doc_type=es_doctype,
                              body=article)

    def reset(self, n):
        """Set ``status`` back to 0 on up to *n* raw articles that have status 1.

        Args:
            n: maximum number of hits requested from Elasticsearch.
        """
        resetpayload = {"doc": {"status": 0}}
        self.logger.info("reset %s raw articles", n)
        es_index, es_doctype = self.indexinfo('raw-article')
        query = '{"query": { "bool": { "must": { "match": { "status": "1" }}}}}'
        result = self.es.search(index=es_index, doc_type=es_doctype,
                                size=n, body=query)
        articles = result['hits']['hits']
        tic = 0
        for article in articles:
            self.es.update(index=es_index, doc_type=es_doctype,
                           id=article["_id"], body=resetpayload)
            # Progress marker: printed once per 500 updates, then the
            # counter starts over.
            tic = tic + 1
            if tic == 500:
                print("...", tic)
                tic = 0