class NewsDelInfo(object):

    def __init__(self):
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, ignore=404)

    def es_ping(self):
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, ignore=404)

    def run(self):
        while True:
            rowkey = self.redis_con.get_yy_rowkey("es:news:del:info")
            _id = trans_md5(rowkey)
            self.es_ping()
            try:
                boo = self.es.exists(index="xw_info", doc_type="sino", id=_id)
                if boo:
                    self.es.delete(index="xw_info", doc_type="sino", id=_id)
            except Exception as e:
                log_info = "news info delete error %s" % str(e)
                logging.error(log_info)
                boo = self.es.exists(index="xw_info", doc_type="sino", id=_id)
                if boo:
                    self.es.delete(index="xw_info", doc_type="sino", id=_id)
def test_elkhost(eshost):
    try:
        es_conn = Elasticsearch(eshost)
        es_conn.exists(index="logeureka-*", doc_type="_all", id=1)
    except:
        print("ELK host not ready", eshost)
        print("Error:", sys.exc_info()[0])
        sys.exit(3)
def deleteESItem(elasticsearchDomain, documentId):
    host = elasticsearchDomain
    if (documentId):
        service = 'es'
        ss = boto3.Session()
        credentials = ss.get_credentials()
        region = ss.region_name
        awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service,
                           session_token=credentials.token)

        es = Elasticsearch(
            hosts=[{'host': host, 'port': 443}],
            http_auth=awsauth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection)

        if es.exists(index="textract", doc_type="document", id=documentId):
            es.delete(index="textract", doc_type="document", id=documentId)
            print("Deleted document: {}".format(documentId))
def validate_split(es: Elasticsearch, resolution_updates: Dict[str, any], config: dict) -> None:
    prev_res_num = 0
    prev_para_num = None
    for res in resolution_updates['resolutions']:
        res_num = int(res.metadata['id'].split('resolution-')[1])
        if res_num != prev_res_num + 1:
            raise ValueError(
                f'invalid sequence of resolution numbers: prev num {prev_res_num}, curr num: {res_num}'
            )
        prev_res_num = res_num
        for para in res.paragraphs:
            para_num = int(para.metadata['id'].split('-para-')[1])
            if prev_para_num is None:
                pass
            elif para_num != prev_para_num + 1:
                raise ValueError(
                    f'invalid sequence of paragraph numbers: prev num {prev_para_num}, curr num: {para_num}'
                )
            prev_para_num = para_num
    for match in resolution_updates['remove_matches']:
        match_id = make_hash_id(match)
        if not es.exists(index=config['phrase_match_index'], id=match_id):
            message = f'unknown phrase match id {match_id} (text id: {match.text_id}), phrase match cannot be removed'
            raise ValueError(message)
    return None
def annotate(config, documentId):
    if "getPosTags" in config and config["getPosTags"] == False:
        return
    esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    corpusIndex = config["corpus"]["index"]
    corpusType = config["corpus"]["type"]
    corpusFields = config["corpus"]["text_fields"]
    processorIndex = config["processor"]["index"]
    processorType = config["processor"]["type"]
    document = esClient.get(index=corpusIndex, doc_type=corpusType, id=documentId, fields=corpusFields)
    content = ""
    if "fields" in document:
        for field in corpusFields:
            if field in document["fields"]:
                if type(document["fields"][field]) is list:
                    for element in document["fields"][field]:
                        content += element + ". "
                else:
                    content += document["fields"][field] + ". "
    annotatedDocument = {}
    sentences = nltk.sent_tokenize(content)
    posTaggedSentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) > 1:
            sentence = sentence.replace("-", " ")
            sentenceWords = nltk.word_tokenize(sentence.lower())
            sentenceWords = map(lambda x: x.replace(".", ""), sentenceWords)
            posTags = nltk.pos_tag(sentenceWords)
            posTaggedSentences.append(posTags)
    if esClient.exists(index=processorIndex, doc_type=processorType, id=document["_id"]):
        annotatedDocument = esClient.get(index=processorIndex, doc_type=processorType, id=document["_id"])["_source"]
    annotatedDocument["pos_tagged_sentences"] = posTaggedSentences
    esClient.index(index=processorIndex, doc_type=processorType, id=document["_id"], body=annotatedDocument)
    config["logger"].info("pos-processor: Annotated document '" + document["_id"] + "'")
class ElasticSearch(object):

    def check_node_status(self):
        res = requests.get('http://localhost:9200')
        if res.status_code == 200:
            return (res.content)
        return None

    def connect_es(self):
        self.es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    def add_document(self, id, data):
        self.es.index(index='green_bond', doc_type='report', id=id, body=data)

    def update_document(self, index, doc, id, data):
        self.es.delete(index=index, doc_type=doc, id=id)
        self.add_document(id, data)

    def check_document_exists(self, index, doc, id):
        return self.es.exists(index=index, doc_type=doc, id=id)

    def find_document(self, index, doc, id):
        return self.es.get(index=index, doc_type=doc, id=id)

    def get_all_document(self, index, doc):
        return self.es.search(index=index, doc_type=doc, size=1000, pretty=1)
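# A brief usage sketch for the wrapper class above; it assumes a node running on
# localhost:9200, and the report id and payload are made up for illustration.
client = ElasticSearch()
if client.check_node_status():
    client.connect_es()
    report = {"issuer": "Example Corp", "amount_usd": 1000000}  # illustrative payload
    if client.check_document_exists(index='green_bond', doc='report', id=1):
        # replace the stored report (the wrapper deletes, then re-indexes)
        client.update_document(index='green_bond', doc='report', id=1, data=report)
    else:
        client.add_document(id=1, data=report)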
def get(self, job_listing_id):
    print("Request for job listing with id: " + job_listing_id)
    es = Elasticsearch(hosts=["elastic"])
    if (es.exists(index='joblistings', doc_type='job-listing', id=job_listing_id)):
        print('Found the document in ElasticSearch')
        doc = es.get(index='joblistings', doc_type='job-listing', id=job_listing_id)
        return doc['_source']
    print('Not found in ElasticSearch, trying a scrape')
    with ClusterRpcProxy(CONFIG) as rpc:
        listing = rpc.stack_overflow_job_listings_scraping_microservice.get_job_listing_info(job_listing_id)
        print("Microservice returned with a result - storing in ElasticSearch")
        es.index(index='joblistings', doc_type='job-listing', id=job_listing_id, body=listing)
        return listing
def setup_index():
    """Sets up index and mapping if needed"""
    try:
        es = Elasticsearch(get_elasticsearch_endpoint()).indices
        # create index with mapping if needed
        if not es.exists(index=get_elasticsearch_index()):
            es.create(
                index=get_elasticsearch_index(),
                body='{"mappings": {"properties": {"filter": {' +
                     '"type": "keyword"},"autocomplete": {"type": "completion"' +
                     ',"contexts": [{"name": "filter","type": "category","path":' +
                     ' "filter"}]}}}}')
        # add mapping if needed
        elif len(
                es.get_field_mapping(fields='autocomplete',
                                     index=get_elasticsearch_index())
                [get_elasticsearch_index()]['mappings']) == 0:
            es.put_mapping(
                index=get_elasticsearch_index(),
                body='{"mappings": {"properties": {"filter": {' +
                     '"type": "keyword"},"autocomplete": {"type": "completion"' +
                     ',"contexts": [{"name": "filter","type": "category","path":' +
                     ' "filter"}]}}}}')
        # continue workflow
        check_records_to_load_into_xse()
    except Exception as e:
        logging.error(e)
        raise SystemExit('Exiting! Connection error with elastic search')
def __check_botometer(screen_name: str):
    """Calls the ES API to get the cached BotOMeter scoring of whether a screen name is a bot or not

    Arguments:
        screen_name {str} -- Twitter Screen Name

    Returns:
        [dict] -- BotOMeter API Response
    """
    settings = Settings()
    es = Elasticsearch(settings.ELASTICSEARCH_URL)
    logger.debug("Checking ES Botometer Info 🤖: %s" % screen_name)
    if es.exists(index=settings.ELASTICSEARCH_BOT_INDEX, doc_type='res', id=screen_name):
        logger.debug("Botometer Info 🤖 found for: %s" % screen_name)
        res = es.get(index=settings.ELASTICSEARCH_BOT_INDEX, doc_type='res', id=screen_name)
        delta = arrow.utcnow() - arrow.get(res['_source']['updated_at'])
        if MAX_DELTA_BOTOMETER < delta.total_seconds():
            logger.debug("Deprecated ES Botometer Info 🤖: %s" % screen_name)
            return False
        else:
            logger.debug("Found ES Botometer Info 🤖: %s" % screen_name)
            return res['_source']
    else:
        logger.debug("NOT Found ES Botometer Info 🤖: %s" % screen_name)
        return False
def load_answers(es: Elasticsearch, data_f):
    with open(data_f, 'r', encoding="utf8") as csvfile:  # open data file
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')  # setup reader with delimiter
        for row in reader:  # loop over rows in csv
            if len(row) != 8:  # check if row is valid
                continue
            try:  # try to set up the data for Elasticsearch, if it fails just continue with the next row
                index = int(row[0])
                if es.exists('goeievraag', 'answers', index):  # if data already loaded into elasticsearch -> continue
                    continue
                data = {  # create json payload
                    "answerId": index,
                    "date": datetime.datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S'),
                    "userId": int(row[2]),
                    "questionId": int(row[3]),
                    "answer": str(row[4]),
                    "thumbsDown": int(row[5]),
                    "thumbsUp": int(row[6]),
                    "isBestAnswer": to_boolean(row[7])
                }
            except ValueError:
                print('Invalid answer', row[0])  # log to command line which answers are invalid
                continue
            es.create('goeievraag', 'answers', index, data)  # send data to elasticsearch
class ElasticSearch(Endpoint):
    """
    ElasticSearch Endpoint implementation

    :param url: URL of the Endpoint
    :type url: str
    :param auth: Authentication information
    :type auth: (str, str)
    :param port: Port of the endpoint
    :type port: int
    """

    def register(self):
        """
        Register the endpoint with init resources

        :return: Endpoint
        """
        self.endpoint = ES(self.url, auth=self.auth, port=self.port)
        return self.endpoint

    def create(self, name, settings):
        """
        Create an index

        :param name: Name of the index
        :param settings: Setting for the index
        :return: Bool
        """
        return self.endpoint.create(name, body=settings)

    def exists(self, name):
        """
        Check if an index exists

        :param name: Name of the index to check
        :return: Indication of existence as boolean
        :rtype: Bool
        """
        return self.endpoint.exists(name)
class DB:
    doc_type = 'article'

    def __init__(self, host='localhost', port=9200, index_name='articles'):
        self.es = Elasticsearch([{'host': host, 'port': port}])
        self.index_name = index_name
        self.logger = logging.getLogger(__name__)
        self.es.ping()
        self.create_index_if_not_exists()

    def create_index_if_not_exists(self):
        if self.es.indices.exists(self.index_name):
            return
        configuration = {
            "mappings": {
                self.doc_type: get_resource('elasticsearch/index_mapping.json')
            }
        }
        self.es.indices.create(index=self.index_name, body=configuration)
        self.logger.debug("Elasticsearch index %s created." % self.index_name)

    def index_document(self, document, id_):
        self.es.create(body=document, id=id_, index=self.index_name, doc_type=self.doc_type)

    def id_exists(self, id_):
        return self.es.exists(id=id_, index=self.index_name, doc_type=self.doc_type)
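# A usage sketch for the DB helper above, assuming a local node and the project's
# mapping resource; the article payload and id are illustrative.
db = DB()
article = {"title": "Example headline", "body": "Illustrative text"}  # made-up document
if not db.id_exists("article-1"):
    db.index_document(article, "article-1")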
def index_tweets():
    es = Elasticsearch(["http://mixednode1:9200"], use_ssl=False)
    inputs = glob("filtered/*/*.json")
    logging.info(inputs)
    for filename in inputs:
        logging.info("going to index %s" % filename)
        with open(filename, 'r') as input:
            docs = json.loads(input.read())
        if len(docs) == 0:
            continue
        if es.exists(index="tweets", id=docs[0]["id"]):
            logging.info("Skipping")
            continue
        total = len(docs)
        i = 0
        index_doc = []
        for doc in docs:
            # es.index(index="tweets", doc_type="opinion", id=doc["id"], body=doc)
            i += 1
            doc["created_at"] = parse_date(doc["created_at"])
            index_doc.append({
                "_index": "tweets",
                "_type": "tweet",
                "_id": doc["id"],
                "_source": doc
            })
            if len(index_doc) == 500:
                logging.info("indexing %s/%s" % (i, total))
                helpers.bulk(es, index_doc)
                index_doc = []
        if len(index_doc) != 0:
            logging.info("indexing %s/%s" % (i, total))
            helpers.bulk(es, index_doc)
def get_rdap_asn(asn):
    es = Elasticsearch()
    does_exist = es.exists(index='whois', doc_type='asn_rdap', id=asn)
    print(does_exist)
    if does_exist is True:
        status = 200
        print("Found it!")
        get_record = es.get(index='rdap', doc_type='asn', id=asn)
        results = jsonify(get_record['_source'])
    else:
        try:
            url = 'http://hailey.opendnsbl.net:8080/rdapbootstrap/autnum/%s' % asn
            r = requests.get(url)
            status = 200
            b = r.json()
            # c = json.loads(b)
            # d = c['entities']
            # print(d)
            # e = json.dumps(c)
            # es.index(index='rwhois', doc_type='asn', id=asn, body=json.dumps(b))
            results = jsonify(b)
        except Exception as e:
            print(e)
            results_raw = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
def retrieve_resolution_by_id(es: Elasticsearch, resolution_id: str, config: dict) -> Union[Resolution, None]:
    if es.exists(index=config['resolution_index'], id=resolution_id):
        response = es.get(index=config['resolution_index'], id=resolution_id)
        return json_to_republic_resolution(response['_source'])
    else:
        return None
class ElasticsearchService(object):

    def __init__(self, host, port):
        self._es = Elasticsearch([{'host': host, 'port': port}])

    def search(self, *args, **kwargs):
        return self._es.search(*args, **kwargs)

    def create(self, *args, **kwargs):
        return self._es.create(*args, **kwargs)

    def get(self, *args, **kwargs):
        return self._es.get(*args, **kwargs)

    def exists(self, *args, **kwargs):
        return self._es.exists(*args, **kwargs)

    def msearch(self, *args, **kwargs):
        return self._es.msearch(*args, **kwargs)

    def index(self, *args, **kwargs):
        return self._es.index(*args, **kwargs)

    def update(self, *args, **kwargs):
        return self._es.update(*args, **kwargs)

    def delete(self, *args, **kwargs):
        return self._es.delete(*args, **kwargs)

    def put_template(self, *args, **kwargs):
        return self._es.indices.put_template(*args, **kwargs)
def lambda_handler(event, context):
    id = event['id']
    user = event['user']
    ask = event['ask']
    host = os.environ["NAME_ES_DOMAIN"]
    if "https" in host:
        es = Elasticsearch(
            [host],
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection)
    else:
        es = Elasticsearch([host])
    user['id'] = id
    datas = {
        "text": ask['text'],
        "tags": ask['tags'],
        "lang": user['lang'] if "lang" in user else "FR",
        "user": user,
        "date": datetime.date.today(),
        "timestamp": int(time.time())
    }
    if not es.exists(index="questions", doc_type="question", id=int(ask['id'])):
        es.index(index="questions", doc_type="question", id=int(ask['id']), body=datas)
    return True
class PeragroClient():
    """ An audio search client """

    def __init__(self):
        """ initialize client object with elasticsearch object """
        self.es = Elasticsearch()

    def set_index(self, index):
        """ set index to look up in elasticsearch

        Input:
        - index: an elasticsearch index
        """
        self.index = index

    def get_sound(self, id_):
        """ Get sound by its id

        input:
        - id: id of sound

        output:
        - sound: sound details if it exists, otherwise None

        Usage:
        >>> id = "X2VFAB12GH"
        >>> sound = c.get_sound(id)
        """
        if self.es.exists(index=self.index, doc_type='_all', id=id_):
            res = self.es.get(index=self.index, id=id_)
            return res
        else:
            return None

    def text_search(self, query):
        """ Get sound results based on text query. It also has support for field queries.

        Usage:
        >>> query = "tum hi ho"
        >>> sounds = c.text_search(query)
        >>> # OR field query
        >>> query = "tags:'interscope' genre:'hip hop'"
        >>> sounds = c.text_search(query)
        """
        # print self.index
        # print self.es.search(index=self.index)
        res = self.es.search(index=self.index, q=query)
        print("Got %d Hits:" % res['hits']['total'])
        return res
class EventoConsumer(AbstractBaseConsumer):
    """
    consumes the messages in cc-zem (json structure based on data of dalite repository)
    and should create on the fly the elasticsearch doc for the zem index
    aim: make it rather simple and usable for prototyping the zem frontend
    """

    def createDoc(self, message):
        course = json.loads(message)
        transformations = EventoESTransformation(course, self.edu_utilities)
        transformations.set_configuration(self.configuration.configuration)
        transformations.make_structure()
        result = transformations.es_structure
        return result

    def __init__(self, config_path: str, configrepshare: str = None, **kwargs):
        super().__init__(config_path, ConsumerConfig, **kwargs)
        self._initialize()

    def _initialize(self):
        if self.configuration["ES"]["active"]:
            self.es = Elasticsearch((self.configuration["ES"]["hosts"]).split("#"),
                                    index=self.configuration["ES"]["index"])
            self.indexClient = self.es.indices
            self.dI = self.configuration["ES"]["index"]
        self.all_docs = []
        self.edu_utilities = EduplatformUtilities(self.configuration.configuration)

    def _index_doc(self, key, message):
        if self.configuration["ES"]["active"]:
            doc = self.createDoc(message)
            # bug in update https://github.com/elastic/elasticsearch/issues/41625
            # response = self.es.update(index="zem", id=key, body=doc) if self.es.exists(index="zem", id=key) else self.es.create(index="zem", id=key, body=doc)
            if not self.es.exists(index=self.dI, id=doc["id"]):
                response = self.es.create(index=self.dI, id=doc["id"], body=doc)

    def _append_doc_for_dump(self, key, message):
        self.all_docs.append(json.loads(message))

    def process(self):
        # test = self.indexClient.get_mapping(index=self.dI)
        message = next(self._consumer, None)
        while (message is not None):
            value = message.value.decode('utf-8')
            key = message.key.decode('utf-8')
            self._index_doc(key, value)
            # self._append_doc_for_dump(key, value)
            message = next(self._consumer, None)
def populate(self):
    if self.download():
        es = Elasticsearch(self.es_url)
        f = open('%s/%s' % (self.assests_dir, self.l8_metadata_filename), 'r')
        # Read the first line for all the headers
        headers = f.readline().split(',')
        # Read the rest of the document
        rows = f.readlines()
        added_counter = 0
        skipped_counter = 0
        for row in rows:
            fields = row.split(',')
            obj = {}
            for header in headers:
                try:
                    obj[header.replace('\n', '')] = float(
                        fields[headers.index(header)].replace('\n', ''))
                except ValueError:
                    obj[header.replace('\n', '')] = fields[
                        headers.index(header)].replace('\n', '')
            try:
                if not es.exists(index=self.es_main_index,
                                 doc_type=self.es_main_type,
                                 id=obj['sceneID']):
                    es.create(index=self.es_main_index,
                              doc_type=self.es_main_type,
                              id=obj['sceneID'],
                              body=json.dumps(obj),
                              ignore=409)
                    # print('%s-%s created' % (counter, obj['sceneID']))
                    added_counter += 1
                    print('%s new records added' % added_counter, end='\r')
                else:
                    skipped_counter += 1
                    # New meta data is added to the top of the document.
                    # When the script starts to see existing records, it means
                    # that all new records are added and it's safe to break
                    # the loop.
                    if skipped_counter > 10:
                        break
            except ConnectionError:
                print('There was a connection error. Check your Elastic' +
                      ' Search setting and make sure Elastic Search is' +
                      ' running.')
                return False
            except:
                print('An unexpected error: %s' % (sys.exc_info()[0]))
                return False
        print('The update is completed. %s new records were added.' % added_counter)
        return True
def _update_status_ES(status_id: int, json_data: dict):
    settings = Settings()
    es = Elasticsearch(settings.ELASTICSEARCH_URL)
    if es.exists(index=settings.ELASTICSEARCH_STATUS_INDEX, doc_type='status', id=status_id):
        _index_status_ES(status_id, json_data)
    else:
        return None
class SyncElasticSearch(object):
    host = settings.NAME_ES_DOMAIN
    index = None
    doc_type = None

    def __init__(self, id):
        self.id = id
        if "https" in self.host:
            self.es = Elasticsearch([self.host],
                                    use_ssl=True,
                                    verify_certs=True,
                                    connection_class=RequestsHttpConnection)
        else:
            self.es = Elasticsearch([self.host])

    def transform_user_to_dict(self, instance):
        return {
            "first_name": instance.first_name,
            "last_name": instance.last_name,
            "id": instance.id,
            "username": instance.username,
            "photo": str(instance.photo.url)
        }

    def create(self):
        self.es.index(index=self.index, doc_type=self.doc_type, id=int(self.id), body=self.object)

    def delete(self):
        if self.es.exists(index=self.index, doc_type=self.doc_type, id=int(self.id)):
            self.es.delete(index=self.index, doc_type=self.doc_type, id=int(self.id))

    def update(self):
        if self.es.exists(index=self.index, doc_type=self.doc_type, id=int(self.id)):
            self.es.update(index=self.index, doc_type=self.doc_type, id=int(self.id), body={"doc": self.object})
def process_item(self, item, spider):
    # get the sentiment classification
    q = lstm_predict(item['content'])
    # q = 0
    es2 = Elasticsearch(hosts=['192.168.3.15'])
    # store in MySQL and in Elasticsearch at the same time
    res2 = es2.exists(index="spider", doc_type='article', id=item['article_id'])
    if res2 is not True:
        try:
            # insert the row
            self.cursor.execute(
                "INSERT INTO weibo (id,article_id,content,url,media,publish_time,create_time,qinggan,comm_num,read_num,fav_num,env_num,user_id,user_name) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                ('0', str(item['article_id']), str(item['content']), str(item['url']),
                 "微博", item['time'], item['create_time'], str(q),
                 str(item['comm_num']), str(item['read_num']), str(item['fav_num']),
                 str(item['env_num']), str(item['user_id']), item['user_name']))
            self.connect.commit()
            print('mysql: one row inserted successfully')
        except Exception as e:
            # print the error log if the insert fails
            print('mysql error', e)
        # store the data into Elasticsearch
        es = connections.create_connection(Sina_type._doc_type.using)
        try:
            print("04%" * 30)
            art = Sina_type()
            content = ''.join(item['content']).replace(
                u'\u3000', u' ').replace(u'\xa0', u' ').replace(
                u'\\u3000', u' ').replace(u'\\xa0', u' ')
            art.content = remove_tags(content)
            art.media = '微博'
            art.publish_time = item['time']
            art.create_time = item['create_time']
            art.url = item['url']
            art.qinggan = q
            art.comm_num = int(item['comm_num'])
            art.read_num = int(item['read_num'])
            art.fav_num = int(item['fav_num'])
            art.env_num = int(item['env_num'])
            art.hot_value = (int(item['comm_num']) + int(item['read_num']) +
                             int(item['fav_num']) + int(item['env_num']))
            art.user_id = item['user_id']
            art.user_name = item['user_name']
            art.meta.id = item['article_id']
            art.save()
            print("elasticsearch: stored one document", item['article_id'])
        except Exception as e:
            print(e)
            print("03" * 30)
    return item
def connect(self, host, port, wait=5):
    """
    Tries to connect to the given elasticsearch database.
    Stores the connection on self.es and returns True on success, False otherwise.
    """
    start = time.time()
    while True:
        try:
            es = Elasticsearch(hosts=host, port=port)
            es.exists(index="test", id=1)
        except exceptions.ConnectionError:
            if time.time() - start > wait:
                break
        else:
            self.es = es
            return True
    return False
def get_meeting_by_date(es: Elasticsearch, date: RepublicDate) -> Union[None, Meeting]:
    # pre-session ID for old index
    doc_id = f'meeting-{date.isoformat()}-session-1'
    if es.exists(index=index, doc_type=doc_type, id=doc_id):
        response = es.get(index=index, doc_type=doc_type, id=doc_id)
        return session_from_json(response['_source'])
    else:
        return None
def get_whois_ip(ip, refresh=None):
    es = Elasticsearch()
    print(repr(ip))
    id_num = str(ip).replace(".", "0")
    does_exist = es.exists(index='rwhois2', doc_type='ipaddr', id=id_num)
    print(does_exist)
    if does_exist is True and refresh is None:
        status = 200
        print("Found it!")
        get_record = es.get(index='rwhois2', doc_type='ipaddr', id=id_num)
        results = jsonify(get_record['_source'])
    elif does_exist is True and refresh is not None:
        status = 200
        print("Forcing refresh!")
        es.delete(index='rwhois2', doc_type='ipaddr', id=id_num)
        try:
            ipwhois.net.socks.setdefaultproxy(ipwhois.net.socks.SOCKS5, "localhost")
            obj = IPWhois(ip)
            try:
                results_raw = obj.lookup_whois(get_referral=True, inc_nir=True)
            except:
                results_raw = obj.lookup_whois()
            status = 200
            results = jsonify(results_raw)
            es.index(index='rwhois2', doc_type='ipaddr', id=id_num, body=results_raw)
        except Exception as e:
            print(e)
            results = jsonify({'status': "not_found"})
            status = 404
    else:
        try:
            obj = IPWhois(ip)
            try:
                results_raw = obj.lookup_whois(get_referral=True)
            except:
                results_raw = obj.lookup_whois()
            status = 200
            results = jsonify(results_raw)
            id_num = str(ip).replace(".", "0")
            print(results)
            try:
                es.index(index='rwhois2', doc_type='ipaddr', id=id_num, body=results_raw)
            except Exception as e:
                print("Elasticsearch encountered a problem ", e)
                pass
        except Exception as e:
            # print(results_raw)
            print(e)
            results_raw = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
def get_inventory_metadata(es: Elasticsearch, inventory_num: int, config: dict) -> Union[dict, None]:
    if not es.exists(index=config["inventory_index"],
                     doc_type=config["inventory_doc_type"],
                     id=inventory_num):
        return None
    response = es.get(index=config["inventory_index"],
                      doc_type=config["inventory_doc_type"],
                      id=inventory_num)
    return response["_source"]
def populate(self):
    if self.download():
        es = Elasticsearch(self.es_url)
        f = open('%s/%s' % (self.assests_dir, self.l8_metadata_filename), 'r')
        # Read the first line for all the headers
        headers = f.readline().split(',')
        # Read the rest of the document
        rows = f.readlines()
        added_counter = 0
        skipped_counter = 0
        for row in rows:
            fields = row.split(',')
            obj = {}
            for header in headers:
                try:
                    obj[header.replace('\n', '')] = float(fields[
                        headers.index(header)].replace('\n', ''))
                except ValueError:
                    obj[header.replace('\n', '')] = fields[
                        headers.index(header)].replace('\n', '')
            try:
                if not es.exists(index=self.es_main_index,
                                 doc_type=self.es_main_type,
                                 id=obj['sceneID']):
                    es.create(index=self.es_main_index,
                              doc_type=self.es_main_type,
                              id=obj['sceneID'],
                              body=json.dumps(obj),
                              ignore=409)
                    # print('%s-%s created' % (counter, obj['sceneID']))
                    added_counter += 1
                else:
                    skipped_counter += 1
                print('%s added | %s skipped' % (added_counter, skipped_counter), end='\r')
            except ConnectionError:
                print('There was a connection error. Check your Elastic' +
                      ' Search setting and make sure Elastic Search is' +
                      ' running.')
                return False
            except:
                print('An unexpected error: %s' % (sys.exc_info()[0]))
                return False
        print('The update is completed. %s new records were added.' % added_counter)
        return True
class send_data:

    def __init__(self):
        self.es = Elasticsearch([{'host': 'localhost', 'port': '9200'}])

    def read_from_file(self, text_file):
        # counter = 0
        with open(text_file, 'r') as file_read:
            next(file_read)
            while True:
                line = file_read.readline()
                # line = line.strip('\n')
                record = line.split('###')
                # print(record)
                self.send_to_index(record)
                # if not line or counter == 5:
                #     break
                # counter += 1
                if not line:
                    break

    """ Sends data to ElasticSearch for indexing """
    def send_to_index(self, a_webpage):
        webpage_id = int(a_webpage[1])
        img_link = a_webpage[2]
        web_link = a_webpage[3]
        webpage_title = a_webpage[4]
        web_content = a_webpage[5]
        # print(webpage_id, '\t', img_link, '\t', web_link, '\t', webpage_title)
        if not self.es.exists(index="final_kellyhe", id=webpage_id):
            doc = {'title': webpage_title,
                   'content': web_content,
                   'image': img_link,
                   'link': web_link}
            self.es.index(index='final_kellyhe', id=webpage_id, body=doc)
            print('\tSuccess! ID is: ', webpage_id)
        else:
            print('Duplicate webpage id exist! ID is: ', webpage_id, webpage_title, web_link)
            exit()

    """ Deletes an index """
    def delete_an_index(self, index_name):
        if self.es.indices.exists(index=index_name):
            print('index name exists!')
            self.es.indices.delete(index=index_name)
            print('index deleted!')
        else:
            print('index name NOT exists!')
def check_id_in_es(es: Elasticsearch, index: str, id: str):
    """
    Check if the news has already been indexed in ElasticSearch
    :param es: ElasticSearch connection
    :param index: index in elastic
    :param id: url of the news
    :return: true if already exists, false otherwise
    """
    return es.exists(index, id)
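# A small caller sketch for the helper above; the endpoint, index name and URL are
# illustrative. Note that newer elasticsearch-py clients expect exists() to be called
# with keyword arguments (index=..., id=...), so the positional call inside the helper
# may need adjusting on recent client versions.
es = Elasticsearch("http://localhost:9200")  # illustrative endpoint
news_url = "https://example.com/news/article-123"  # hypothetical news URL used as document id
if not check_id_in_es(es, index="news", id=news_url):
    print("not indexed yet:", news_url)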
def retrieve_page_by_id(es: Elasticsearch, page_id: str, config) -> Union[PageXMLPage, None]:
    if not es.exists(index=config['page_index'], id=page_id):
        return None
    response = es.get(index=config['page_index'], id=page_id)
    if '_source' in response:
        page_doc = json_to_pagexml_page(response['_source'])
        return page_doc
    else:
        return None
def _get_status_ES(status_id: int):
    settings = Settings()
    es = Elasticsearch(settings.ELASTICSEARCH_URL)
    if es.exists(index=settings.ELASTICSEARCH_STATUS_INDEX, doc_type='status', id=status_id):
        res = es.get(index=settings.ELASTICSEARCH_STATUS_INDEX, doc_type='status', id=status_id)
        return res['_source']
    else:
        return None
def main():
    global resh
    es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
    temp_list = []
    if system("wc -w dump.pickle > /dev/null 2>&1 &") == 0:
        pickle_in = open("dump.pickle", "rb")
        indexing = pickle.load(pickle_in)
        system("rm dump.pickle > /dev/null 2>&1 &")
        for i in indexing:
            temps = i
            i = re.sub("[^a-zA-Z]", "", i)
            str = ""
            i = str.join(i)
            if i == "":
                pass
            else:
                resh = es.exists(index='storage', doc_type='dbs', id=i.lower())
                # print(i)
                if resh == True:
                    try:
                        # print("i is {}".format(i))
                        res = es.get(index='storage', doc_type='dbs', id=i.lower())
                        temp_list = res['_source'][i.lower()]
                        for m in indexing[temps]:
                            temp_list.append(m)
                        data = {
                            i.lower(): temp_list,
                        }
                        res = es.index(index="storage", doc_type='dbs', body=data, id=i.lower())
                        if res['_shards']['successful'] == 1:
                            print("modified")
                        else:
                            print("ERROR!!!")
                    except:
                        pass
                elif resh == False:
                    data = {
                        i.lower(): indexing[temps],
                    }
                    try:
                        res = es.index(index="storage", doc_type='dbs', body=data, id=i.lower())
                        if res['_shards']['successful'] == 1:
                            print("Uploaded")
                        else:
                            print("not uploaded")
                    except:
                        pass
                else:
                    pass
def retrieve_inventory_metadata(es: Elasticsearch, inventory_num: int, config):
    if not es.exists(index=config['inventory_index'],
                     doc_type=config['inventory_doc_type'],
                     id=inventory_num):
        raise ValueError(
            'No inventory metadata available for inventory num {}'.format(inventory_num))
    response = es.get(index=config['inventory_index'],
                      doc_type=config['inventory_doc_type'],
                      id=inventory_num)
    return response['_source']
def get_lastmodified(self, docid, parameters={}):
    es = Elasticsearch()
    doc_exists = es.exists(index=self.config['index'], doc_type="document", id=docid)
    # if doc with id exists in index, read modification date
    if doc_exists:
        doc = es.get(index=self.config['index'], doc_type="document", id=docid,
                     _source=False, fields="file_modified_dt")
        last_modified = doc['fields']['file_modified_dt'][0]
    else:
        last_modified = None
    return last_modified
def get_rdap_ip(ip):
    es = Elasticsearch()
    does_exist = es.exists(index='rdap', doc_type='ipaddr', id=ip)
    print(does_exist)
    if does_exist is True:
        status = 200
        print("Found it!")
        get_record = es.get(index='rdap', doc_type='ipaddr', id=ip)
        results = jsonify(get_record['_source'])
    else:
        try:
            obj = IPWhois(ip)
            results_raw = obj.lookup_rdap(depth=1)
            status = 200
            results = jsonify(results_raw)
            es.index(index='rdap', doc_type='ipaddr', id=ip, body=json.dumps(results_raw))
        except Exception as e:
            print(e)
            results = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
def get_whois_domain(domain, refresh=None):
    es = Elasticsearch()
    id_num = domain
    does_exist = es.exists(index='domain', doc_type='domain', id=domain)
    print(does_exist)
    if does_exist is True and refresh is None:
        status = 200
        print("Found it!")
        get_record = es.get(index='domain', doc_type='domain', id=domain)
        results = jsonify(get_record['_source'])
    elif does_exist is True and refresh is not None:
        status = 200
        print("Forcing refresh!")
        es.delete(index='domain', doc_type='domain', id=domain)
        try:
            obj = whois.whois(domain)
            status = 200
            results = jsonify(obj)
            es.index(index='domain', doc_type='domain', id=domain, body=obj)
        except Exception as e:
            print(e)
            results_raw = jsonify({'status': "not_found"})
            status = 404
    else:
        try:
            obj = whois.whois(domain)
            status = 200
            results = jsonify(obj)
            es.index(index='domain', doc_type='domain', id=domain, body=obj)
        except Exception as e:
            print(e)
            results_raw = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
class Search(object): """Search Repository""" def __init__(self, config): if 'ELASTICSEARCH' in config: options = {"host": config["ELASTICSEARCH"]["host"], "port": config["ELASTICSEARCH"]["port"]} if 'url_prefix' in config["ELASTICSEARCH"]: options['url_prefix'] = config["ELASTICSEARCH"]['url_prefix'] self.search_index = Elasticsearch(options) self.triplestore = TripleStore(config) self.body = None def __get_id_or_value__(self, value): """Helper function takes a dict with either a value or id and returns the dict value Args: value(dict) Returns: string or None """ if [str, float, int, bool].count(type(value)) > 0: return value elif '@value' in value: return value.get('@value') elif '@id' in value: result = self.triplestore.__get_id__(value.get('@id')) if len(result) > 0: return result[0]['uuid']['value'] return value.get('@id') return value def __generate_body__(self, graph, prefix=None): """Internal method generates the body for indexing into Elastic search based on the JSON-LD serializations of the Fedora Commons Resource graph. Args: graph -- rdflib.Graph of Resource prefix -- Prefix filter, will only index if object starts with a prefix, default is None to index everything. """ self.body = dict() graph_json = json.loads( graph.serialize( format='json-ld', context=CONTEXT).decode()) if '@graph' in graph_json: for graph in graph_json.get('@graph'): # Index only those graphs that have been created in the # repository if 'fedora:created' in graph: for key, val in graph.items(): if key in [ 'fedora:lastModified', 'fedora:created', 'fedora:uuid' ]: self.__set_or_expand__(key, val) elif key.startswith('@type'): for name in val: #! prefix should be a list if prefix: if name.startswith(prefix): self.__set_or_expand__('type', name) else: self.__set_or_expand__('type', name) elif key.startswith('@id'): self.__set_or_expand__('fedora:hasLocation', val) elif not key.startswith('fedora') and not key.startswith('owl'): self.__set_or_expand__(key, val) def __index__(self, subject, graph, doc_type, index, prefix=None): self.__generate_body__(graph, prefix) doc_id = str(graph.value( subject=subject, predicate=FEDORA.uuid)) self.__generate_suggestion__(subject, graph, doc_id) self.search_index.index( index=index, doc_type=doc_type, id=doc_id, body=self.body) def __set_or_expand__(self, key, value): """Helper method takes a key and value and either creates a key with either a list or appends an existing key-value to the value Args: key value """ if key not in self.body: self.body[key] = [] if type(value) == list: for row in value: self.body[key].append(self.__get_id_or_value__(row)) else: self.body[key].append(self.__get_id_or_value__(value)) def __update__(self, **kwargs): """Helper method updates a stored document in Elastic Search and Fuseki. 
Method must have doc_id Keyword args: doc_id -- Elastic search document ID field -- Field name to update index, raises exception if None value -- Field value to update index, raises exception if None """ doc_id, doc_type, index = kwargs.get('doc_id'), None, None if not doc_id: raise falcon.HTTPMissingParam("doc_id") field = kwargs.get('field') if not field: raise falcon.HTTPMissingParam("field") value = kwargs.get('value') if not value: raise falcon.HTTPMissingParam("field") for row in self.search_index.indices.stats()['indices'].keys(): # Doc id should be unique across all indices if self.search_index.exists(index=row, id=doc_id): result = self.search_index.get(index=row, id=doc_id) doc_type = result['_type'] index=row break if doc_type is None or index is None: raise falcon.HTTPNotFound() self.search_index.update( index=index, doc_type=doc_type, id=doc_id, body={"doc": { field: self.__get_id_or_value__(value) }}) result = self.triplestore.__get_subject__(uuid=doc_id) if len(result) == 1: self.triplestore.__update_triple__( result[0]['subject']['value'], field, value) def on_get(self, req, resp): """Method takes a a phrase, returns the expanded result. Args: req -- Request resp -- Response """ phrase = req.get_param('phrase') or '*' size = req.get_param('size') or 25 resource_type = req.get_param('resource') or None if resource_type: resp.body = json.dumps(self.search_index.search( q=phrase, doc_type=resource_type, size=size)) else: resp.body = json.dumps(self.search_index.search( q=phrase, size=size)) resp.status = falcon.HTTP_200 def on_patch(self, req, resp): """Method takes either sparql statement or predicate and object and updates the Resource. Args: req -- Request resp -- Response """ doc_uuid = req.get_param('uuid') if not doc_uuid: raise falcon.HTTPMissingParam('uuid') predicate = req.get_param('predicate') or None if not predicate: raise falcon.HTTPMissingParam('predicate') object_ = req.get_param('object') or None if not object_: raise falcon.HTTPMissingParam('object') doc_type = req.get_param('doc_type') or None if self.__update__( doc_id=doc_uuid, doc_type=doc_type, field=predicate, value=object_): resp.status = falcon.HTTP_202 resp.body = json.dumps(True) else: raise falcon.HTTPInternalServerError( "Error with PATCH for {}".format(doc_uuid), "Failed setting {} to {}".format( predicate, object_))
try:
    index_exist = es.indices.exists(index="activity")
    if not index_exist:
        es.indices.create(index="activity", ignore=400)
except Exception as r:
    print(Exception, ":", r)

s_re = scan(es, query={"query": {"match_all": {}}, "size": 1000}, index="20130901", doc_type='bci')
bulk_action = []
# new uid record to es
count_index = 0
while 1:
    try:
        item = next(s_re)['_source']
    except:
        break
    user_id = item['user']
    doc_exist = es.exists(index="activity", id=user_id)
    if not doc_exist:
        activity_info = {}
        activity_info['uid'] = user_id
        activity_info['max_index'] = item['user_index']
        activity_info['min_index'] = item['user_index']
        activity_info['index_number'] = 1
        activity_info['lower_than_average_number'] = 0
        activity_info['remove'] = 0  # 0 denotes not remove
        xdata = expand_index_action(activity_info)
        bulk_action.extend([xdata[0], xdata[1]])
        count_index += 1
        if count_index % 2000 == 0:
            test_speed(es, count_index, bulk_action)
            bulk_action = []
#! coding: utf-8
from elasticsearch import Elasticsearch
import httplib2
from os import getcwd

# test elastic in python
# product: parsed into JSON -> (if _source is empty) fill in _source in the elastic document
# https://elasticsearch-py.readthedocs.org/en/master/api.html

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
doc = '{"name":"one_name", "arrs":{"el1":1,"els2":2}}'
# res = es.index(index="gearbest_index", doc_type='product_type', id=5, body=doc)
# res = es.create(index="gearbest_index", doc_type='product_type', body=doc, id=1)
b = es.exists(index="gearbest_index", doc_type="product_type", id=1)
if (b == True):
    print('OK')
__author__ = 'terry'

import sys
from elasticsearch import Elasticsearch
import time

if __name__ == '__main__':
    time.sleep(5)
    # create a connection to the Elasticsearch database
    client = Elasticsearch(['pureelk-elasticsearch:9200'], retry_on_timeout=True)
    if client.exists(index='.kibana', doc_type='index-pattern', id='pureelk-global-arrays'):
        sys.exit(0)
    else:
        sys.exit(1)
class Archiver(object): """ A mailman 3 archiver that forwards messages to pony mail. """ if config.has_section('mailman') and config.has_option('mailman', 'plugin'): implementer(IArchiver) # This is a list of the headers we're interested in publishing. keys = [ "archived-at", "delivered-to", "from", "cc", "to", "date", "in-reply-to", "message-id", "subject", "x-message-id-hash", "references", "x-mailman-rule-hits", "x-mailman-rule-misses", ] def __init__(self, parseHTML=False): """ Just initialize ES. """ self.html = parseHTML if parseHTML: import html2text self.html2text = html2text.html2text self.dbname = config.get("elasticsearch", "dbname") ssl = config.get("elasticsearch", "ssl", fallback="false").lower() == 'true' self.consistency = config.get('elasticsearch', 'write', fallback='quorum') self.cropout = config.get("debug", "cropout", fallback=None) uri = config.get("elasticsearch", "uri", fallback="") dbs = [ { 'host': config.get("elasticsearch", "hostname"), 'port': int(config.get("elasticsearch", "port")), 'use_ssl': ssl, 'url_prefix': uri, 'http_auth': auth }] # Backup ES? backup = config.get("elasticsearch", "backup", fallback="") if backup != "": dbs.append( { 'host': backup, 'port': int(config.get("elasticsearch", "port")), 'use_ssl': ssl, 'url_prefix': uri, 'http_auth': auth } ) self.es = Elasticsearch(dbs, max_retries=5, retry_on_timeout=True ) def msgfiles(self, msg): attachments = [] contents = {} for part in msg.walk(): part_meta, part_file = parse_attachment(part) if part_meta: attachments.append(part_meta) contents[part_meta['hash']] = part_file return attachments, contents def msgbody(self, msg): body = None firstHTML = None if msg.is_multipart(): for part in msg.walk(): try: if part.is_multipart(): for subpart in part.walk(): if subpart.get_content_type() == 'text/plain' and not body: body = subpart.get_payload(decode=True) if subpart.get_content_type() == 'text/enriched' and not body: body = subpart.get_payload(decode=True) elif subpart.get_content_type() == 'text/html' and self.html and not firstHTML: firstHTML = subpart.get_payload(decode=True) elif part.get_content_type() == 'text/plain' and not body: body = part.get_payload(decode=True) elif part.get_content_type() == 'text/html' and self.html and not firstHTML: firstHTML = part.get_payload(decode=True) except Exception as err: print(err) elif msg.get_content_type() == 'text/plain': body = msg.get_payload(decode=True) elif msg.get_content_type() == 'text/enriched': body = msg.get_payload(decode=True) elif msg.get_content_type() == 'text/html' and self.html and not firstHTML: firstHTML = msg.get_payload(decode=True) # this requires a GPL lib, user will have to install it themselves if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)): body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML) for charset in pm_charsets(msg): try: body = body.decode(charset) if type(body) is bytes else body except: body = body.decode('utf-8', errors='replace') if type(body) is bytes else body return body def compute_updates(self, lid, private, msg): """Determine what needs to be sent to the archiver. :param lid: The list id :param msg: The message object. :return None if the message could not be parsed """ ojson = None if not lid: lid= msg.get('list-id') if self.cropout: crops = self.cropout.split(" ") # Regex replace? if len(crops) == 2: lid = re.sub(crops[0], crops[1], lid) # Standard crop out? 
else: lid = lid.replace(self.cropout, "") defaultEmptyString = lambda value: value and str(value) or "" msg_metadata = dict([(k, defaultEmptyString(msg.get(k))) for k in self.keys]) mid = hashlib.sha224(str("%s-%s" % (lid, msg_metadata['archived-at'])).encode('utf-8')).hexdigest() + "@" + (lid if lid else "none") for key in ['to','from','subject','message-id']: try: hval = "" if msg_metadata.get(key): for t in email.header.decode_header(msg_metadata[key]): if t[1] == None or t[1].find("8bit") != -1: hval += t[0].decode('utf-8') if type(t[0]) is bytes else t[0] else: hval += t[0].decode(t[1],errors='ignore') msg_metadata[key] = hval except Exception as err: print("Could not decode headers, ignoring..: %s" % err) if not msg_metadata.get('message-id'): msg_metadata['message-id'] = mid mdate = None uid_mdate = 0 # mdate for UID generation try: mdate = email.utils.parsedate_tz(msg_metadata.get('date')) uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid except: pass if not mdate and msg_metadata.get('archived-at'): mdate = email.utils.parsedate_tz(msg_metadata.get('archived-at')) elif not mdate: print("Date (%s) seems totally wrong, setting to _now_ instead." % mdate) mdate = time.gmtime() # Get a standard 9-tuple mdate = mdate + (0, ) # Fake a TZ (10th element) mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(email.utils.mktime_tz(mdate))) body = self.msgbody(msg) try: if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1: body = convertToWrapped(body, character_set="utf-8") if isinstance(body, str): body = body.encode('utf-8') except Exception as err: try: body = body.decode(chardet.detect(body)['encoding']) except Exception as err: try: body = body.decode('latin-1') except: try: if isinstance(body, str): body = body.encode('utf-8') except: body = None attachments, contents = self.msgfiles(msg) irt = "" if body is not None or attachments: pmid = mid try: # Use full message as bytes for mid? if archiver_generator == "full": mid = "%s@%s" % (hashlib.sha224(msg.as_bytes()).hexdigest(), lid) elif archiver_generator == "medium": xbody = body if type(body) is bytes else body.encode('ascii', 'ignore') xbody += bytes(lid, encoding='ascii') xbody += bytes(mdatestring, encoding='ascii') mid = "%s@%s" % (hashlib.sha224(xbody).hexdigest(), lid) else: # Or revert to the old way? mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid) except Exception as err: if logger: logger.warn("Could not generate MID: %s" % err) mid = pmid if 'in-reply-to' in msg_metadata: try: try: irt = "".join(msg_metadata['in-reply-to']) except: irt = msg_metadata.get('in-reply-to').__str__() except: irt = "" ojson = { 'from_raw': msg_metadata['from'], 'from': msg_metadata['from'], 'to': msg_metadata['to'], 'subject': msg_metadata['subject'], 'message-id': msg_metadata['message-id'], 'mid': mid, 'cc': msg_metadata.get('cc'), 'epoch': email.utils.mktime_tz(mdate), 'list': lid, 'list_raw': lid, 'date': mdatestring, 'private': private, 'references': msg_metadata['references'], 'in-reply-to': irt, 'body': body.decode('utf-8', 'replace') if type(body) is bytes else body, 'attachments': attachments } self.msg_metadata = msg_metadata self.irt = irt return ojson, contents def archive_message(self, mlist, msg): """Send the message to the archiver. :param mlist: The IMailingList object. :param msg: The message object. 
:return (lid, mid) """ lid = normalize_lid(mlist.list_id) private = False if hasattr(mlist, 'archive_public') and mlist.archive_public == True: private = False elif hasattr(mlist, 'archive_public') and mlist.archive_public == False: private = True elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public: private = True ojson, contents = self.compute_updates(lid, private, msg) if not ojson: id = msg.get('message-id') or msg.get('Subject') or msg.get("Date") raise Exception("Could not parse message %s for %s" % (id,lid)) if args.dry: print("**** Dry run, not saving message to database *****") return lid, ojson['mid'] msg_metadata = self.msg_metadata irt = self.irt if contents: for key in contents: self.es.index( index=self.dbname, doc_type="attachment", id=key, body = { 'source': contents[key] } ) self.es.index( index=self.dbname, doc_type="mbox", id=ojson['mid'], consistency = self.consistency, body = ojson ) self.es.index( index=self.dbname, doc_type="mbox_source", id=ojson['mid'], consistency = self.consistency, body = { "message-id": msg_metadata['message-id'], "source": self.mbox_source(msg) } ) # If MailMan and list info is present, save/update it in ES: if hasattr(mlist, 'description') and hasattr(mlist, 'list_name') and mlist.description and mlist.list_name: self.es.index( index=self.dbname, doc_type="mailinglists", id=lid, consistency = self.consistency, body = { 'list': lid, 'name': mlist.list_name, 'description': mlist.description, 'private': private } ) if logger: logger.info("Pony Mail archived message %s successfully" % mid) oldrefs = [] # Is this a direct reply to a pony mail email? if irt != "": dm = re.search(r"pony-([a-f0-9]+)-([a-f0-9]+)@", irt) if dm: cid = dm.group(1) mid = dm.group(2) if self.es.exists(index = self.dbname, doc_type = 'account', id = cid): doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid) if doc: oldrefs.append(cid) # N.B. no index is supplied, so ES will generate one self.es.index( index=self.dbname, doc_type="notifications", consistency = self.consistency, body = { 'type': 'direct', 'recipient': cid, 'list': lid, 'private': private, 'date': ojson['date'], 'from': msg_metadata['from'], 'to': msg_metadata['to'], 'subject': msg_metadata['subject'], 'message-id': msg_metadata['message-id'], 'in-reply-to': irt, 'epoch': ojson['epoch'], 'mid': mid, 'seen': 0 } ) if logger: logger.info("Notification sent to %s for %s" % (cid, mid)) # Are there indirect replies to pony emails? if msg_metadata.get('references'): for im in re.finditer(r"pony-([a-f0-9]+)-([a-f0-9]+)@", msg_metadata.get('references')): cid = im.group(1) mid = im.group(2) if self.es.exists(index = self.dbname, doc_type = 'account', id = cid): doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid) # does the user want to be notified of indirect replies? if doc and 'preferences' in doc['_source'] and doc['_source']['preferences'].get('notifications') == 'indirect' and not cid in oldrefs: oldrefs.append(cid) # N.B. 
no index is supplied, so ES will generate one self.es.index( index=self.dbname, consistency = self.consistency, doc_type="notifications", body = { 'type': 'indirect', 'recipient': cid, 'list': lid, 'private': private, 'date': ojson['date'], 'from': msg_metadata['from'], 'to': msg_metadata['to'], 'subject': msg_metadata['subject'], 'message-id': msg_metadata['message-id'], 'in-reply-to': irt, 'epoch': ojson['epoch'], 'mid': mid, 'seen': 0 } ) if logger: logger.info("Notification sent to %s for %s" % (cid, mid)) return lid, ojson['mid'] def mbox_source(self, msg): # Common method shared with import-mbox policy = msg.policy.clone(max_line_length=0) # don't wrap headers return msg.as_bytes(policy=policy).decode('utf-8', errors='replace') def list_url(self, mlist): """ Required by MM3 plugin API """ return None def permalink(self, mlist, msg): """ Required by MM3 plugin API """ return None
class Archiver(object): """ A mailman 3 archiver that forwards messages to pony mail. """ if __name__ != '__main__': implementer(IArchiver) name = "ponymail" # This is a list of the headers we're interested in publishing. keys = [ "archived-at", "delivered-to", "from", "cc", "to", "date", "in-reply-to", "message-id", "subject", "x-message-id-hash", "references", "x-mailman-rule-hits", "x-mailman-rule-misses", ] def __init__(self): """ Just initialize ES. """ global config, auth, parseHTML ssl = False self.cropout = None self.html = parseHTML self.dbname = config.get("elasticsearch", "dbname") self.consistency = 'quorum' if config.has_option("elasticsearch", "ssl") and config.get("elasticsearch", "ssl").lower() == 'true': ssl = True if config.has_option("elasticsearch", "write") and config.get("elasticsearch", "write") != "": self.consistency = config.get('elasticsearch', 'write') if config.has_option("debug", "cropout") and config.get("debug", "cropout") != "": self.cropout = config.get("debug", "cropout") uri = "" if config.has_option("elasticsearch", "uri") and config.get("elasticsearch", "uri") != "": uri = config.get("elasticsearch", "uri") dbs = [ { 'host': config.get("elasticsearch", "hostname"), 'port': int(config.get("elasticsearch", "port")), 'use_ssl': ssl, 'url_prefix': uri, 'http_auth': auth }] # Backup ES? if config.has_option("elasticsearch", "backup") and config.get("elasticsearch", "backup") != "": backup = config.get("elasticsearch", "backup") dbs.append( { 'host': config.get("elasticsearch", "backup"), 'port': int(config.get("elasticsearch", "port")), 'use_ssl': ssl, 'url_prefix': uri, 'http_auth': auth } ) self.es = Elasticsearch(dbs, max_retries=5, retry_on_timeout=True ) def msgfiles(self, msg): attachments = [] contents = {} if msg.is_multipart(): for part in msg.walk(): part_meta, part_file = parse_attachment(part) if part_meta: attachments.append(part_meta) contents[part_meta['hash']] = part_file return attachments, contents def msgbody(self, msg): body = None firstHTML = None if msg.is_multipart(): for part in msg.walk(): try: if part.is_multipart(): for subpart in part.walk(): if subpart.get_content_type() == 'text/plain' and not body: body = subpart.get_payload(decode=True) elif subpart.get_content_type() == 'text/html' and self.html and not firstHTML: firstHTML = subpart.get_payload(decode=True) elif part.get_content_type() == 'text/plain' and not body: body = part.get_payload(decode=True) elif part.get_content_type() == 'text/html' and self.html and not firstHTML: firstHTML = part.get_payload(decode=True) except Exception as err: print(err) elif msg.get_content_type() == 'text/plain': body = msg.get_payload(decode=True) elif msg.get_content_type() == 'text/html' and self.html and not firstHTML: firstHTML = msg.get_payload(decode=True) # this requires a GPL lib, user will have to install it themselves if firstHTML and (not body or len(body) <= 1): body = html2text.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML) for charset in pm_charsets(msg): try: body = body.decode(charset) if type(body) is bytes else body except: body = body.decode('utf-8', errors='replace') if type(body) is bytes else body return body def archive_message(self, mlist, msg): """Send the message to the archiver. :param mlist: The IMailingList object. :param msg: The message object. 
""" lid = None m = re.search(r"(<.+>)", mlist.list_id.replace("@", ".")) if m: lid = m.group(1) else: lid = "<%s>" % mlist.list_id.strip("<>").replace("@", ".") if self.cropout: crops = self.cropout.split(" ") # Regex replace? if len(crops) == 2: lid = re.sub(crops[0], crops[1], lid) # Standard crop out? else: lid = lid.replace(self.cropout, "") format = lambda value: value and str(value) or "" msg_metadata = dict([(k, format(msg.get(k))) for k in self.keys]) mid = hashlib.sha224(str("%s-%s" % (lid, msg_metadata['archived-at'])).encode('utf-8')).hexdigest() + "@" + (lid if lid else "none") for key in ['to','from','subject','message-id']: try: hval = "" if msg_metadata.get(key): for t in email.header.decode_header(msg_metadata[key]): if t[1] == None or t[1].find("8bit") != -1: hval += t[0].decode('utf-8') if type(t[0]) is bytes else t[0] else: hval += t[0].decode(t[1],errors='ignore') msg_metadata[key] = hval except Exception as err: print("Could not decode headers, ignoring..: %s" % err) if not msg_metadata.get('message-id'): msg_metadata['message-id'] = mid mdate = None uid_mdate = 0 # mdate for UID generation try: mdate = email.utils.parsedate_tz(msg_metadata.get('date')) uid_mdate = email.utils.mktime_tz(mdate) # Only set if Date header is valid except: pass if not mdate and msg_metadata.get('archived-at'): mdate = email.utils.parsedate_tz(msg_metadata.get('archived-at')) elif not mdate: print("Date seems totally wrong, setting to _now_ instead.") mdate = time.gmtime() mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(email.utils.mktime_tz(mdate))) body = self.msgbody(msg) try: if 'content-type' in msg_metadata and msg_metadata['content-type'].find("flowed") != -1: body = convertToWrapped(body, character_set="utf-8") if isinstance(body, str): body = body.encode('utf-8') except Exception as err: try: body = body.decode(chardet.detect(body)['encoding']) except Exception as err: try: body = body.decode('latin-1') except: try: if isinstance(body, str): body = body.encode('utf-8') except: body = None if body: attachments, contents = self.msgfiles(msg) private = False if hasattr(mlist, 'archive_public') and mlist.archive_public == True: private = False elif hasattr(mlist, 'archive_public') and mlist.archive_public == False: private = True elif hasattr(mlist, 'archive_policy') and mlist.archive_policy is not ArchivePolicy.public: private = True pmid = mid try: mid = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid) except Exception as err: if logger: logger.warn("Could not generate MID: %s" % err) mid = pmid irt = "" if 'in-reply-to' in msg_metadata: try: try: irt = "".join(msg_metadata['in-reply-to']) except: irt = msg_metadata.get('in-reply-to').__str__() except: irt = "" ojson = { 'from_raw': msg_metadata['from'], 'from': msg_metadata['from'], 'to': msg_metadata['to'], 'subject': msg_metadata['subject'], 'message-id': msg_metadata['message-id'], 'mid': mid, 'cc': msg_metadata.get('cc'), 'epoch': email.utils.mktime_tz(mdate), 'list': lid, 'list_raw': lid, 'date': mdatestring, 'private': private, 'references': msg_metadata['references'], 'in-reply-to': irt, 'body': body.decode('utf-8', 'replace') if type(body) is bytes else body, 'attachments': attachments } if contents: for key in contents: self.es.index( index=self.dbname, doc_type="attachment", id=key, body = { 'source': contents[key] } ) self.es.index( index=self.dbname, doc_type="mbox", id=mid, consistency = self.consistency, body = ojson ) 
self.es.index( index=self.dbname, doc_type="mbox_source", id=mid, consistency = self.consistency, body = { "message-id": msg_metadata['message-id'], "source": msg.as_string() } ) # If MailMan and list info is present, save/update it in ES: if hasattr(mlist, 'description') and hasattr(mlist, 'list_name') and mlist.description and mlist.list_name: self.es.index( index=self.dbname, doc_type="mailinglists", id=lid, consistency = self.consistency, body = { 'list': lid, 'name': mlist.list_name, 'description': mlist.description, 'private': private } ) if logger: logger.info("Pony Mail archived message %s successfully" % mid) oldrefs = [] # Is this a direct reply to a pony mail email? if irt != "": dm = re.search(r"pony-([a-f0-9]+)-([a-f0-9]+)@", irt) if dm: cid = dm.group(1) mid = dm.group(2) if self.es.exists(index = self.dbname, doc_type = 'account', id = cid): doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid) if doc: oldrefs.append(cid) self.es.index( index=self.dbname, doc_type="notifications", consistency = self.consistency, body = { 'type': 'direct', 'recipient': cid, 'list': lid, 'private': private, 'date': msg_metadata['date'], 'from': msg_metadata['from'], 'to': msg_metadata['to'], 'subject': msg_metadata['subject'], 'message-id': msg_metadata['message-id'], 'in-reply-to': irt, 'epoch': email.utils.mktime_tz(mdate), 'mid': mid, 'seen': 0 } ) if logger: logger.info("Notification sent to %s for %s" % (cid, mid)) # Are there indirect replies to pony emails? if msg_metadata.get('references'): for im in re.finditer(r"pony-([a-f0-9]+)-([a-f0-9]+)@", msg_metadata.get('references')): cid = im.group(1) mid = im.group(2) if self.es.exists(index = self.dbname, doc_type = 'account', id = cid): doc = self.es.get(index = self.dbname, doc_type = 'account', id = cid) # does the user want to be notified of indirect replies? if doc and 'preferences' in doc['_source'] and doc['_source']['preferences'].get('notifications') == 'indirect' and not cid in oldrefs: oldrefs.append(cid) self.es.index( index=self.dbname, consistency = self.consistency, doc_type="notifications", body = { 'type': 'indirect', 'recipient': cid, 'list': lid, 'private': private, 'date': msg_metadata['date'], 'from': msg_metadata['from'], 'to': msg_metadata['to'], 'subject': msg_metadata['subject'], 'message-id': msg_metadata['message-id'], 'in-reply-to': mirt, 'epoch': email.utils.mktime_tz(mdate), 'mid': mid, 'seen': 0 } ) if logger: logger.info("Notification sent to %s for %s" % (cid, mid)) return lid def list_url(self, mlist): """ Gots to be here """ return None def permalink(self, mlist, msg): """ Gots to be here """ return None
class IndexMgr(object): index_pattern = "result_*" doc_feedback = "feedback" doc_percent = "percent" doc_module = "module" def __init__(self, *args, **kwargs): self.server = kwargs.get("host", "localhost") self.es = Elasticsearch([{"host": self.server}]) def feedback_create(self): self.es.indices.delete(index=self.current(), ignore=[400, 404]) self.es.indices.create(index=self.current(), ignore=[400]) mapping = { "feedback": { "_timestamp": {"enabled": "true", "path": "tsapi.received"}, "properties": { "tsapi.product.name": {"type": "string", "index": "not_analyzed"}, "md5": {"type": "string", "index": "not_analyzed"}, "tag": {"type": "string", "index": "not_analyzed"}, "autoflag": {"type": "string", "index": "not_analyzed"}, "filetype": {"type": "string", "index": "not_analyzed"}, "assessment": {"type": "string", "index": "not_analyzed"}, "hit_modules.name": {"type": "string", "index": "not_analyzed"}, }, } } rt = self.es.indices.put_mapping(index=self.current(), doc_type=self.doc_feedback, body=mapping) mapping = {"percent": {"properties": {"timestamp": {"type": "date"}}}} rt = self.es.indices.put_mapping(index=self.current(), doc_type=self.doc_percent, body=mapping) mapping = {"module": {"properties": {"created": {"type": "date"}}}} rt = self.es.indices.put_mapping(index=self.current(), doc_type=self.doc_module, body=mapping) mp = self.es.indices.get_mapping(index=self.current()) print "Index created", rt, "\n" def delete(self): self.es.indices.delete(index="result_*", ignore=[400, 404]) def module_insert(self, idd, doc): try: res = self.es.index(index=self.current(), doc_type=self.doc_module, id=idd, body=doc) except elasticsearch.ElasticsearchException as e: print "Insert -", e.info def percent_insert(self, idd, doc): try: res = self.es.index(index=self.current(), doc_type=self.doc_percent, id=idd, body=doc) except elasticsearch.ElasticsearchException as e: print "Insert -", e.info def feedback_get(self, idd): doc = None try: doc = self.es.get(index=self.current(), doc_type=self.doc_feedback, id=idd) except elasticsearch.ElasticsearchException as e: print "Get -", e.info return doc def feedback_exists(self, idd): IsExists = False try: IsExists = self.es.exists(index=self.current(), doc_type=self.doc_feedback, id=idd) except elasticsearch.ElasticsearchException as e: print "Exists -", e.info except: pass return IsExists def feedback_insert(self, idd, doc): res = None try: res = self.es.index(index=self.current(), doc_type=self.doc_feedback, id=idd, body=doc) except elasticsearch.ElasticsearchException as e: print "Insert -", e.info return res def feedback_update(self, idd, doc): res = None try: res = self.es.update(index=self.current(), doc_type=self.doc_feedback, id=idd, body=doc) except elasticsearch.ElasticsearchException as e: print "Update (%s-%s)-" % (idd, doc), e.info return res def feedback_search(self, content, sz=10): res = None try: res = self.es.search(index=self.current(), doc_type=self.doc_feedback, body=content, size=sz) except elasticsearch.ElasticsearchException as e: print e.info return res def build_query(self, **kwargs): query = {"query": {"filtered": {"filter": {"bool": {"must": [], "must_not": []}}}}} must = query["query"]["filtered"]["filter"]["bool"]["must"] must_not = query["query"]["filtered"]["filter"]["bool"]["must_not"] dt_range = kwargs.get("daterange", None) vt_result = kwargs.get("vt_result", None) vt_detected = kwargs.get("vt_detected", None) tscp_score = kwargs.get("tscp_score", None) tag = kwargs.get("tag", None) assessment = kwargs.get("assessment", 
None) autoflag = kwargs.get("autoflag", None) not_gt_cca = kwargs.get("not_gt_cca", None) if dt_range: must.append({"range": dt_range}) if vt_result == 0 or vt_result == 1: must.append({"term": {"virustotal.result": vt_result}}) if vt_detected >= 0: must.append({"term": {"virustotal.value": vt_detected}}) if tscp_score >= 0: must.append({"term": {"threatscope.score": tscp_score}}) if tag: must.append({"term": {"tag": tag}}) if assessment: must.append({"term": {"assessment": assessment}}) if autoflag: must.append({"term": {"autoflag": autoflag}}) if not_gt_cca >= 0: # must.append({"exists" : {"field": "threatscope.score"}}) must_not.append({"range": {"cca_result_count": {"gt": 0}}}) return query def get_count(self, **kwargs): res = self.feedback_search(self.build_query(**kwargs), 0) return int(res["hits"]["total"]) def get_rules_count(self, **kwargs): agg = {"aggs": {"modules": {"terms": {"field": "hit_modules.name"}}}} if len(kwargs) > 0: agg["query"] = self.build_query(**kwargs)["query"] res = self.feedback_search(agg, 0) if res and res["hits"]["total"] > 0: return res["aggregations"]["modules"]["buckets"] return [] def current(self): index = "result_{0}".format(date.today().strftime("%Y%m%d")) return index
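# A hypothetical usage sketch for the result_* IndexMgr above: build a
# filtered query against today's index and count matching feedback documents.
# The host, the "tsapi.received" range field and the filter values are
# illustrative assumptions; the keyword arguments come from build_query().
from datetime import date, timedelta

mgr = IndexMgr(host="localhost")
mgr.feedback_create()

yesterday = (date.today() - timedelta(days=1)).isoformat()
total = mgr.get_count(
    daterange={"tsapi.received": {"gte": yesterday}},
    tag="malware",
    vt_result=1,
)
print("feedback docs flagged by VirusTotal since yesterday: %s" % total)

# Per-module hit counts via the terms aggregation on hit_modules.name.
for bucket in mgr.get_rules_count(tag="malware"):
    print("%s: %s" % (bucket["key"], bucket["doc_count"]))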
class AnnotationWorker: def __init__(self, config): self.config = config self.logger = config["logger"] self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"])) self.corpusIndex = config["corpus"]["index"] self.corpusType = config["corpus"]["type"] self.corpusFields = config["corpus"]["text_fields"] self.corpusSize = 0 self.workerName = "bayzee.annotation.worker" self.timeout = 6000 self.processorIndex = config["processor"]["index"] self.processorType = config["processor"]["type"] self.processorPhraseType = config["processor"]["type"] + "__phrase" self.analyzerIndex = self.corpusIndex + "__analysis__" self.worker = DurableChannel(self.workerName, config) self.dispatchers = {} def annotate(self): while True: message = self.worker.receive() if message["content"] == "kill": message["responseId"] = message["requestId"] self.worker.close(message) if len(self.dispatchers) == 0: self.worker.end() break else: self.worker.send(content="kill", to=self.workerName) continue elif message["content"]["type"] == "annotate": if message["content"]["from"] not in self.dispatchers: self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config) self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher) documentId = message["content"]["documentId"] document = self.esClient.get(index=self.corpusIndex, doc_type=self.corpusType, id = documentId, fields=self.corpusFields) if "fields" in document: for field in self.corpusFields: shingles = [] if field in document["fields"]: if type(document["fields"][field]) is list: for element in document["fields"][field]: if len(element) > 0: shingleTokens = self.esClient.indices.analyze(index=self.analyzerIndex, body=element, analyzer="analyzer_shingle") shingles += shingleTokens["tokens"] else: if len(document["fields"][field]) > 0: shingles = self.esClient.indices.analyze(index=self.analyzerIndex, body=document["fields"][field], analyzer="analyzer_shingle")["tokens"] shingles = map(self.__replaceUnderscore, shingles) shingles = filter(self.__filterTokens, shingles) if shingles != None and len(shingles) > 0: for shingle in shingles: phrase = shingle["token"] key = self.__keyify(phrase) if len(key) > 0: data = {"phrase": phrase,"phrase__not_analyzed": phrase,"document_id": document["_id"]} if not self.esClient.exists(index=self.processorIndex, doc_type=self.processorPhraseType, id=key): self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=key, body=data) sleep(1) for processorInstance in self.config["processor_instances"]: processorInstance.annotate(self.config, documentId) self.worker.reply(message, {"documentId": documentId, "status" : "processed", "type" : "reply"}, self.timeout) self.logger.info("Terminating annotation worker") def unregisterDispatcher(self, dispatcher, message): if message == "dying": self.dispatchers.pop(dispatcher, None) if len(self.dispatchers) == 0: self.worker.send(content="kill", to=self.workerName) def __keyify(self, phrase): phrase = phrase.strip() if len(phrase) == 0: return "" key = re.sub("[^A-Za-z0-9]", " ", phrase) key = " ".join(phrase.split()) key = key.lower() key = "-".join(phrase.split()) return key def __replaceUnderscore(self,shingle): token = shingle["token"] token = token.replace("_","") token = re.sub('\s+', ' ', token).strip() shingle["token"] = token return shingle def __filterTokens(self, shingle): global esStopWords tokens = shingle["token"].split(" ") firstToken = tokens[0] lastToken = 
tokens[-1] isValid = True isValid = (isValid and lastToken is not None) isValid = (isValid and len(lastToken) > 1) isValid = (isValid and not firstToken.replace(".","",1).isdigit()) isValid = (isValid and not lastToken.replace(".","",1).isdigit()) isValid = (isValid and firstToken not in esStopWords) isValid = (isValid and lastToken not in esStopWords) return isValid
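# The __keyify helper above reassigns key from the original phrase at every
# step, so the punctuation stripping and lower-casing never reach the result.
# A small sketch of the derivation it appears to intend, chaining each
# transformation (the function name here is hypothetical):
import re

def keyify(phrase):
    """Turn an analyzed shingle into a stable, URL-safe document id."""
    phrase = phrase.strip()
    if not phrase:
        return ""
    key = re.sub("[^A-Za-z0-9]", " ", phrase)  # drop punctuation
    key = " ".join(key.split())                # collapse whitespace
    key = key.lower()
    return "-".join(key.split())               # hyphen-join the tokens

# e.g. keyify("Data-Driven  Annotation!") == "data-driven-annotation"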
class ModelRegistry(object): BATCH_SIZE = 100 def __init__(self, es=None): if es is None: logging.info('Attempting to connect to ES: {0}'.format(OS_ELASTICSEARCH_ADDRESS)) self.es = Elasticsearch(hosts=[OS_ELASTICSEARCH_ADDRESS]) logging.info('Successful connection to ES') else: self.es = es @staticmethod def table_name_for_package(datapackage_owner, datapackage_name): return model_name(datapackage_owner, datapackage_name) def save_model(self, name, datapackage_url, datapackage, model, dataset_name, author): """ Save a model in the registry :param name: name for the model :param datapackage_url: origin URL for the datapackage which is the source for this model :param datapackage: datapackage object from which this model was derived :param model: model to save """ document = { # Fields used by babbage API 'id': name, 'model': model, 'package': datapackage, 'origin_url': datapackage_url, # Extra fields available in search 'dataset': dataset_name, 'author': author } self.es.index(index='packages', doc_type='package', body=document, id=name) # Make sure that the data is saved self.es.indices.flush('packages') def list_models(self): """ List all available models in the DB :return: A generator yielding strings (one per model) """ try: count = self.es.count(index='packages', doc_type='package', q='*')['count'] from_ = 0 while from_ < count: ret = self.es.search(index='packages', doc_type='package', q='*', size=self.BATCH_SIZE, from_=from_, _source=PACKAGE_FIELDS) for hit in ret.get('hits',{}).get('hits',[]): yield hit['_source']['id'] from_ += self.BATCH_SIZE except NotFoundError: return def has_model(self, name): """ Check if a model exists in the registry :param name: model name to test :return: True if yes """ return self.es.exists(index='packages', doc_type='package', id=name) def get_model(self, name): """ Return the model associated with a specific name. Raises KeyError in case the model doesn't exist. :param name: model name to fetch :return: Python object representing the model """ try: ret = self.es.get(index='packages', doc_type='package', id=name, _source=PACKAGE_FIELDS) if ret['found']: return ret['_source']['model'] raise KeyError(name) except NotFoundError: raise KeyError(name) def get_package(self, name): """ Return the original package contents associated with a specific name. Raises KeyError in case the model doesn't exist. :param name: model name to fetch :return: Python object representing the package """ try: rec = self.es.get(index='packages', doc_type='package', id=name, _source=PACKAGE_FIELDS) if rec['found']: ret = rec['_source']['package'] ret['__origin_url'] = rec['_source']['origin_url'] return ret raise KeyError(name) except NotFoundError: raise KeyError(name)
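# A hypothetical end-to-end use of ModelRegistry: register a model, check for
# it with has_model() (a thin wrapper around es.exists), and read it back.
# The owner, package name, URL and payloads are invented; only the method
# signatures come from the class above.
registry = ModelRegistry()  # connects to OS_ELASTICSEARCH_ADDRESS by default

name = ModelRegistry.table_name_for_package("alice", "boost-budget")
registry.save_model(
    name=name,
    datapackage_url="http://example.org/boost-budget/datapackage.json",
    datapackage={"name": "boost-budget", "resources": []},
    model={"dimensions": {}, "measures": {}},
    dataset_name="Boost Budget",
    author="alice",
)

if registry.has_model(name):
    model = registry.get_model(name)   # raises KeyError if the id is missing
    print("model keys: %s" % list(model.keys()))

for model_name in registry.list_models():  # generator, paged BATCH_SIZE at a time
    print(model_name)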
class Docstore(): hosts = None indexname = None facets = None es = None def __init__(self, hosts=config.DOCSTORE_HOST, index=config.DOCSTORE_INDEX, connection=None): self.hosts = hosts self.indexname = index if connection: self.es = connection else: self.es = Elasticsearch(hosts, timeout=config.DOCSTORE_TIMEOUT) def __repr__(self): return "<%s.%s %s:%s>" % ( self.__module__, self.__class__.__name__, self.hosts, self.indexname ) def print_configs(self): print('CONFIG_FILES: %s' % config.CONFIG_FILES) print('') print('DOCSTORE_HOST: %s' % config.DOCSTORE_HOST) print('DOCSTORE_INDEX: %s' % config.DOCSTORE_INDEX) print('') def health(self): return self.es.cluster.health() def index_exists(self, index): """ """ return self.es.indices.exists(index=index) def status(self): """Returns status information from the Elasticsearch cluster. >>> docstore.Docstore().status() { u'indices': { u'ddrpublic-dev': { u'total': { u'store': { u'size_in_bytes': 4438191, u'throttle_time_in_millis': 0 }, u'docs': { u'max_doc': 2664, u'num_docs': 2504, u'deleted_docs': 160 }, ... }, ... } }, ... } """ return self.es.indices.stats() def index_names(self): """Returns list of index names """ return [name for name in self.status()['indices'].keys()] def aliases(self): """ @param hosts: list of dicts containing host information. """ return _parse_cataliases( self.es.cat.aliases(h=['index','alias']) ) def delete_alias(self, alias, index): """Remove specified alias. @param alias: Name of the alias @param index: Name of the alias' target index. """ logger.debug('deleting alias %s -> %s' % (alias, index)) alias = make_index_name(alias) index = make_index_name(index) if alias not in [alias for index,alias in self.aliases()]: logger.error('Alias does not exist: "%s".' % alias) return result = self.es.indices.delete_alias(index=index, name=alias) logger.debug(result) logger.debug('DONE') return result def create_alias(self, alias, index): """Point alias at specified index; create index if doesn't exist. IMPORTANT: There should only ever be ONE alias per index. Existing aliases are deleted before specified one is created. @param alias: Name of the alias @param index: Name of the alias' target index. """ logger.debug('creating alias %s -> %s' % (alias, index)) alias = make_index_name(alias) index = make_index_name(index) # delete existing alias for i,a in self.aliases(): removed = '' if a == alias: self.es.indices.delete_alias( # NOTE: "i" is probably not the arg "index". That's what # we want. We only want the arg "index". index=i, name=alias ) removed = ' (removed)' print('%s -> %s%s' % (a,i,removed)) result = self.es.indices.put_alias(index=index, name=alias, body='') logger.debug(result) logger.debug('DONE') return result def target_index(self, alias): """Get the name of the index to which the alias points >>> es.cat.aliases(h=['alias','index']) u'documents0 wd5000bmv-2 \n' @param alias: Name of the alias @returns: name of target index """ alias = make_index_name(alias) target = [] for i,a in _parse_cataliases(self.es.cat.aliases(h=['index','alias'])): if a == alias: target = i return target def create_index(self, index=None): """Creates the specified index if it does not already exist. 
@returns: JSON dict with status codes and responses """ if not index: index = self.indexname logger.debug('creating new index: %s' % index) body = { 'settings': {}, 'mappings': {} } status = self.es.indices.create(index=index, body=body) logger.debug(status) statuses = self.init_mappings() self.model_fields_lists() logger.debug('DONE') def delete_index(self, index=None): """Delete the specified index. @returns: JSON dict with status code and response """ if not index: index = self.indexname logger.debug('deleting index: %s' % index) if self.index_exists(index): status = self.es.indices.delete(index=index) else: status = '{"status":500, "message":"Index does not exist"}' logger.debug(status) return status def init_mappings(self): """Initializes mappings for Elasticsearch objects Mappings for objects in (ddr-defs)repo_models.elastic.ELASTICSEARCH_CLASSES @returns: JSON dict with status code and response """ logger.debug('registering doc types') statuses = [] for class_ in ELASTICSEARCH_CLASSES['all']: logger.debug('- %s' % class_['doctype']) print('- %s' % class_) status = class_['class'].init(index=self.indexname, using=self.es) statuses.append( {'doctype':class_['doctype'], 'status':status} ) return statuses def model_fields_lists(self): """ Lists of class-specific fields for each class, in order, so documents may be emitted as OrderedDicts with fields in order. HOSTS:PORT/INDEX/modelfields/collection/ HOSTS:PORT/INDEX/modelfields/entity/ HOSTS:PORT/INDEX/modelfields/segment/ HOSTS:PORT/INDEX/modelfields/file/ identifier.MODEL_REPO_MODELS Identifier.fields_module """ DOCTYPE = 'esobjectfields' EXCLUDED = [ 'id', 'title', 'description', ] for model in MODEL_REPO_MODELS.keys(): module = module_for_name(MODEL_REPO_MODELS[model]['module'] ) fields = [ f['name'] for f in module.FIELDS if f['elasticsearch']['public'] and (f['name'] not in EXCLUDED) ] data = { 'model': model, 'fields': fields, } self.post_json( doc_type=DOCTYPE, document_id=model, json_text=json.dumps(data), ) def get_mappings(self, raw=False): """Get mappings for ESObjects @param raw: boolean Use lower-level function to get all mappings @returns: str JSON """ if raw: return self.es.indices.get_mapping(self.indexname) return { class_['doctype']: elasticsearch_dsl.Mapping.from_es( index=self.indexname, doc_type=class_['doctype'], using=self.es, ).to_dict() for class_ in ELASTICSEARCH_CLASSES['all'] } def post_vocabs(self, path=config.VOCABS_URL): """Posts ddr-vocab facets,terms to ES. curl -XPUT 'http://localhost:9200/meta/facet/format' -d '{ ... }' >>> elasticsearch.post_facets( '192.168.56.120:9200', 'meta', '/opt/ddr-local/ddr-vocab' ) @param path: Absolute path to dir containing facet files. 
@returns: JSON dict with status code and response """ logger.debug('index_facets(%s, %s)' % (self.indexname, path)) vocabs = vocab.get_vocabs(path) # get classes from ddr-defs Facet = ELASTICSEARCH_CLASSES_BY_MODEL['facet'] FacetTerm = ELASTICSEARCH_CLASSES_BY_MODEL['facetterm'] # push facet data statuses = [] for v in vocabs.keys(): fid = vocabs[v]['id'] facet = Facet() facet.meta.id = fid facet.id = fid facet.model = 'facet' facet.links_html = fid facet.links_json = fid facet.links_children = fid facet.title = vocabs[v]['title'] facet.description = vocabs[v]['description'] logging.debug(facet) status = facet.save(using=self.es, index=self.indexname) statuses.append(status) for t in vocabs[v]['terms']: tid = t.get('id') facetterm_id = '-'.join([ str(fid), str(tid), ]) term = FacetTerm() term.meta.id = facetterm_id term.facet = fid term.term_id = tid term.links_html = facetterm_id term.links_json = facetterm_id # TODO doesn't handle location_geopoint for field in FacetTerm._doc_type.mapping.to_dict()[ FacetTerm._doc_type.name]['properties'].keys(): if t.get(field): setattr(term, field, t[field]) term.id = facetterm_id # overwrite term.id from original logging.debug(term) status = term.save(using=self.es, index=self.indexname) statuses.append(status) forms_choices = { 'topics-choices': vocab.topics_choices( vocab.get_vocabs(config.VOCABS_URL)['topics'], ELASTICSEARCH_CLASSES_BY_MODEL['facetterm'] ), 'facility-choices': vocab.form_vocab_choices( vocab.get_vocabs(config.VOCABS_URL)['facility'], 'facility' ), 'format-choices': vocab.form_vocab_choices( vocab.get_vocabs(config.VOCABS_URL)['format'], 'format' ), 'genre-choices': vocab.form_vocab_choices( vocab.get_vocabs(config.VOCABS_URL)['genre'], 'genre' ), 'rights-choices': vocab.form_vocab_choices( vocab.get_vocabs(config.VOCABS_URL)['rights'], 'rights' ), } self.post_json('forms', 'forms-choices', forms_choices) return statuses def facet_terms(self, facet, order='term', all_terms=True, model=None): """Gets list of terms for the facet. 
$ curl -XGET 'http://192.168.56.101:9200/ddr/entity/_search?format=yaml' -d '{ "fields": ["id"], "query": { "match_all": {} }, "facets": { "genre_facet_result": { "terms": { "order": "count", "field": "genre" } } } }' Sample results: { u'_type': u'terms', u'missing': 203, u'total': 49, u'other': 6, u'terms': [ {u'term': u'photograph', u'count': 14}, {u'term': u'ephemera', u'count': 6}, {u'term': u'advertisement', u'count': 6}, {u'term': u'book', u'count': 5}, {u'term': u'architecture', u'count': 3}, {u'term': u'illustration', u'count': 2}, {u'term': u'fieldnotes', u'count': 2}, {u'term': u'cityscape', u'count': 2}, {u'term': u'blank_form', u'count': 2}, {u'term': u'portrait, u'count': 1'} ] } @param facet: Name of field @param order: term, count, reverse_term, reverse_count @param model: (optional) Type of object ('collection', 'entity', 'file') @returns raw output of facet query """ payload = { "fields": ["id"], "query": { "match_all": {} }, "facets": { "results": { "terms": { "size": MAX_SIZE, "order": order, "all_terms": all_terms, "field": facet } } } } results = self.es.search(index=self.indexname, doc_type=model, body=payload) return results['facets']['results'] def _repo_org(self, path, doctype, remove=False): """ seealso DDR.models.common.DDRObject.to_esobject """ # get and validate file data = load_json(path) if (not (data.get('id') and data.get('repo'))): raise Exception('Data file is not well-formed.') oi = Identifier(id=data['id']) d = OrderedDict() d['id'] = oi.id d['model'] = oi.model d['parent_id'] = oi.parent_id(stubs=1) # links d['links_html'] = oi.id d['links_json'] = oi.id d['links_img'] = '%s/logo.png' % oi.id d['links_thumb'] = '%s/logo.png' % oi.id d['links_parent'] = oi.parent_id(stubs=1) d['links_children'] = oi.id # title,description d['title'] = data['title'] d['description'] = data['description'] d['url'] = data['url'] # ID components (repo, org, cid, ...) as separate fields idparts = deepcopy(oi.idparts) idparts.pop('model') for k in ID_COMPONENTS: d[k] = '' # ensure all fields present for k,v in idparts.iteritems(): d[k] = v # add/update if remove and self.exists(doctype, oi): results = self.es.delete( index=self.indexname, doc_type=doctype, id=oi.id ) else: results = self.es.index( index=self.indexname, doc_type=doctype, id=oi.id, body=d ) return results def repo(self, path, remove=False): """Add/update or remove base repository metadata. @param path: str Absolute path to repository.json @param remove: bool Remove record from ES @returns: dict """ return self._repo_org(path, 'repository', remove) def org(self, path, remove=False): """Add/update or remove base organization metadata. @param path: str Absolute path to organization.json @param remove: bool Remove record from ES @returns: dict """ return self._repo_org(path, 'organization', remove) def narrators(self, path): """Add/update or remove narrators metadata. @param path: str Absolute path to narrators.json @returns: dict """ DOC_TYPE = 'narrator' data = load_json(path) for document in data['narrators']: document['model'] = 'narrator' has_published = document.get('has_published', '') if has_published.isdigit(): has_published = int(has_published) if has_published: result = self.post_json(DOC_TYPE, document['id'], json.dumps(document)) logging.debug(document['id'], result) else: logging.debug('%s not published' % document['id']) if self.get(DOC_TYPE, document['id'], fields=[]): self.delete(document['id']) def post_json(self, doc_type, document_id, json_text): """POST the specified JSON document as-is. 
@param doc_type: str @param document_id: str @param json_text: str JSON-formatted string @returns: dict Status info. """ logger.debug('post_json(%s, %s, %s)' % ( self.indexname, doc_type, document_id )) return self.es.index( index=self.indexname, doc_type=doc_type, id=document_id, body=json_text ) def post(self, document, public_fields=[], additional_fields={}, parents={}, force=False): """Add a new document to an index or update an existing one. This function can produce ElasticSearch documents in two formats: - old-style list-of-dicts used in the DDR JSON files. - normal dicts used by ddr-public. DDR metadata JSON files are structured as a list of fieldname:value dicts. This is done so that the fields are always in the same order, making it possible to easily see the difference between versions of a file. [IMPORTANT: documents MUST contain an 'id' field!] In ElasticSearch, documents are structured in a normal dict so that faceting works properly. curl -XPUT 'http://localhost:9200/ddr/collection/ddr-testing-141' -d '{ ... }' @param document: Collection,Entity,File The object to post. @param public_fields: list @param additional_fields: dict @param parents: dict Basic metadata for parent documents. @param force: boolean Bypass status and public checks. @returns: JSON dict with status code and response """ logger.debug('post(%s, %s, %s)' % ( self.indexname, document, force )) if force: publishable = True public = False else: if not parents: parents = _parents_status([document.identifier.path_abs()]) publishable = _publishable([document.identifier.path_abs()], parents) public = True if not publishable: return {'status':403, 'response':'object not publishable'} d = document.to_esobject(public_fields=public_fields, public=public) logger.debug('saving') status = d.save(using=self.es, index=self.indexname) logger.debug(str(status)) return status def post_multi(self, path, recursive=False, force=False): """Publish (index) specified document and (optionally) its children. After receiving a list of metadata files, index() iterates through the list several times. The first pass weeds out paths to objects that can not be published (e.g. object or its parent is unpublished). In the final pass, a list of public/publishable fields is chosen based on the model. Additional fields not in the model (e.g. parent ID, parent organization/collection/entity ID) are packaged. Then everything is sent off to post(). @param path: Absolute path to directory containing object metadata files. @param recursive: Whether or not to recurse into subdirectories. @param force: boolean Just publish the damn collection already. @returns: number successful,list of paths that didn't work out """ logger.debug('index(%s, %s, %s, %s)' % (self.indexname, path, recursive, force)) publicfields = _public_fields() # process a single file if requested if os.path.isfile(path): paths = [path] else: # files listed first, then entities, then collections paths = util.find_meta_files(path, recursive, files_first=1) # Store value of public,status for each collection,entity. # Values will be used by entities and files to inherit these values # from their parent. 
parents = _parents_status(paths) # Determine if paths are publishable or not paths = _publishable(paths, parents, force=force) skipped = 0 successful = 0 bad_paths = [] num = len(paths) for n,path in enumerate(paths): oi = path.get('identifier') # TODO write logs instead of print print('%s | %s/%s %s %s %s' % ( datetime.now(config.TZ), n+1, num, path['action'], oi.id, path['note']) ) if not oi: path['note'] = 'No identifier' bad_paths.append(path) continue try: document = oi.object() except Exception as err: path['note'] = 'Could not instantiate: %s' % err bad_paths.append(path) continue if not document: path['note'] = 'No document' bad_paths.append(path) continue # see if document exists existing_v = None d = self.get(oi.model, oi.id) if d: existing_v = d.meta.version # post document if path['action'] == 'POST': created = self.post(document, parents=parents, force=True) # force=True bypasses _publishable in post() function # delete previously published items now marked incomplete/private elif existing_v and (path['action'] == 'SKIP'): print('%s | %s/%s DELETE' % (datetime.now(config.TZ), n+1, num)) self.delete(oi.id) if path['action'] == 'SKIP': skipped += 1 continue # version is incremented with each updated posted_v = None # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment' es_model = ELASTICSEARCH_CLASSES_BY_MODEL[oi.model]._doc_type.name d = self.get(es_model, oi.id) if d: posted_v = d.meta.version # success: created, or version number incremented status = 'ERROR - unspecified' if posted_v and not existing_v: status = 'CREATED' successful += 1 elif (existing_v and posted_v) and (existing_v < posted_v): status = 'UPDATED' successful += 1 elif not posted_v: status = 'ERROR: not created' bad_paths.append(path) print(status) logger.debug('INDEXING COMPLETED') return {'total':len(paths), 'skipped':skipped, 'successful':successful, 'bad':bad_paths} def exists(self, model, document_id): """ @param model: @param document_id: """ return self.es.exists(index=self.indexname, doc_type=model, id=document_id) def get(self, model, document_id, fields=None): """ @param model: @param document_id: @param fields: boolean Only return these fields """ if self.exists(model, document_id): ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[model] return ES_Class.get(document_id, using=self.es, index=self.indexname) return None def count(self, doctypes=[], query={}): """Executes a query and returns number of hits. The "query" arg must be a dict that conforms to the Elasticsearch query DSL. See docstore.search_query for more info. @param doctypes: list Type of object ('collection', 'entity', 'file') @param query: dict The search definition using Elasticsearch Query DSL @returns raw ElasticSearch query output """ logger.debug('count(index=%s, doctypes=%s, query=%s' % ( self.indexname, doctypes, query )) if not query: raise Exception("Can't do an empty search. Give me something to work with here.") doctypes = ','.join(doctypes) logger.debug(json.dumps(query)) return self.es.count( index=self.indexname, doc_type=doctypes, body=query, ) def delete(self, document_id, recursive=False): """Delete a document and optionally its children. 
@param document_id: @param recursive: True or False """ identifier = Identifier(id=document_id) if recursive: if identifier.model == 'collection': doc_type = 'collection,entity,file' elif identifier.model == 'entity': doc_type = 'entity,file' elif identifier.model == 'file': doc_type = 'file' query = 'id:"%s"' % identifier.id try: return self.es.delete_by_query( index=self.indexname, doc_type=doc_type, q=query ) except TransportError: pass else: try: return self.es.delete( index=self.indexname, doc_type=identifier.model, id=identifier.id ) except TransportError: pass def search(self, doctypes=[], query={}, sort=[], fields=[], from_=0, size=MAX_SIZE): """Executes a query, get a list of zero or more hits. The "query" arg must be a dict that conforms to the Elasticsearch query DSL. See docstore.search_query for more info. @param doctypes: list Type of object ('collection', 'entity', 'file') @param query: dict The search definition using Elasticsearch Query DSL @param sort: list of (fieldname,direction) tuples @param fields: str @param from_: int Index of document from which to start results @param size: int Number of results to return @returns raw ElasticSearch query output """ logger.debug('search(index=%s, doctypes=%s, query=%s, sort=%s, fields=%s, from_=%s, size=%s' % ( self.indexname, doctypes, query, sort, fields, from_, size )) if not query: raise Exception("Can't do an empty search. Give me something to work with here.") doctypes = ','.join(doctypes) logger.debug(json.dumps(query)) _clean_dict(sort) sort_cleaned = _clean_sort(sort) fields = ','.join(fields) results = self.es.search( index=self.indexname, doc_type=doctypes, body=query, sort=sort_cleaned, from_=from_, size=size, _source_include=fields, ) return results def reindex(self, source, dest): """Copy documents from one index to another. @param source: str Name of source index. @param dest: str Name of destination index. @returns: number successful,list of paths that didn't work out """ logger.debug('reindex(%s, %s)' % (source, dest)) if self.index_exists(source): logger.info('Source index exists: %s' % source) else: return '{"status":500, "message":"Source index does not exist"}' if self.index_exists(dest): logger.info('Destination index exists: %s' % dest) else: return '{"status":500, "message":"Destination index does not exist"}' version = self.es.info()['version']['number'] logger.debug('Elasticsearch version %s' % version) if version >= '2.3': logger.debug('new API') body = { "source": {"index": source}, "dest": {"index": dest} } results = self.es.reindex( body=json.dumps(body), refresh=None, requests_per_second=0, timeout='1m', wait_for_active_shards=1, wait_for_completion=False, ) else: logger.debug('pre-2.3 legacy API') from elasticsearch import helpers results = helpers.reindex( self.es, source, dest, #query=None, #target_client=None, #chunk_size=500, #scroll=5m, #scan_kwargs={}, #bulk_kwargs={} ) return results
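# A minimal sketch of the read/write path through the Docstore above: check
# that a document exists, fetch it via its elasticsearch_dsl class, and fall
# back to posting raw JSON when it is absent. The host, index name and ids
# are assumptions for illustration.
import json

ds = Docstore(hosts="127.0.0.1:9200", index="ddrpublic-dev")

doc_id = "ddr-testing-141"
if ds.exists("collection", doc_id):
    collection = ds.get("collection", doc_id)   # DSL object, or None
    print(collection.to_dict())
else:
    # post_json() indexes the JSON string as-is under the given doc_type/id.
    ds.post_json(
        doc_type="collection",
        document_id=doc_id,
        json_text=json.dumps({"id": doc_id, "title": "Test collection"}),
    )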
if jobIDMatch: jobID = re.search("\d+\.\d+", jobIDMatch.group()) jO1 = jobObject(jobID.group(), dateTime.isoformat()) # create new job object with jobID and the time the job started at jobDict[jO1.jobID] = jO1 # put this jobObject in the dict under its jobID else: jobIDMatch = re.search("\(\d+\.\d+\)", line) if(jobIDMatch): # check if logline is related to a job process jobID = re.search("\d+\.\d+", jobIDMatch.group()) if jobID.group() in jobDict: jO1 = jobDict[jobID.group()] jobTerminateMatch = re.search("Job \d+\.\d+ terminated", line) if not jobTerminateMatch: jobTerminateMatch = re.search("terminating job \d+\.\d+", line) if jobTerminateMatch: jO1.setEndTime(dateTime.isoformat()) if es.exists(index = "htcondor", doc_type = "mongoData", id = jO1.jobID): print "jobID: " + str(jobID.group()) es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.jobShadowStartTime = dateTime", "params" : { "dateTime" : jO1.jobShadowStartTime } } ) es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.jobEndTime = dateTime", "params" : { "dateTime" : jO1.jobEndTime } }) es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.jobTimeSHADOW = diff", "params" : { "diff" : jO1.jobTimeShadow } }) res = es.get(index = "htcondor", doc_type = "mongoData", id = jO1.jobID) if "QDate" in res["_source"]: jO1.setLagTime((res["_source"])["QDate"]) es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.lagTimeSecondsSHADOW = diff", "params" : { "diff" : jO1.lagTimeShadow } }) es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.lagTimeMinutesSHADOW = diff", "params" : { "diff" : jO1.lagTimeShadow/60.0 } }) es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.lagTimeHoursSHADOW = diff", "params" : { "diff" : jO1.lagTimeShadow/3600.0 } }) es.update(index = "htcondor", doc_type = "mongoData", id = jO1.jobID, body = {"script" : "ctx._source.jobMessages = jM", "params" : { "jM" : jO1.jobMessages } }) else: # creates new jobMessages entry that includes the jobTimestamp and jobMessage or appends the jobMessage if key already exists messageMatch = re.search(": .*", line) message = messageMatch.group(0)[2:] if dateTime.isoformat() in jO1.jobMessages:
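# A condensed sketch of the update pattern used by the HTCondor log parser
# above: only touch the "htcondor" document when it already exists, then apply
# a scripted partial update. The host, job id and timestamp are placeholders;
# the top-level "script"/"params" body shape follows the (pre-5.x style) calls
# above.
from elasticsearch import Elasticsearch

es = Elasticsearch([{"host": "localhost"}])
job_id = "1234.0"  # placeholder HTCondor job id

if es.exists(index="htcondor", doc_type="mongoData", id=job_id):
    es.update(
        index="htcondor", doc_type="mongoData", id=job_id,
        body={
            "script": "ctx._source.jobEndTime = endTime",
            "params": {"endTime": "2016-06-01T12:34:56"},
        },
    )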
class IndexMgr(object): index_pattern='event_*' doc_feedback='log' def __init__(self, *args, **kwargs): self.server = kwargs.get('host', 'localhost') self.es = Elasticsearch([{'host': self.server}]) def feedback_create(self): self.es.indices.delete(index=self.current(), ignore=[400, 404]) self.es.indices.create(index=self.current(), ignore=[400]) mapping = { "log" : { "_timestamp": { "enabled": True }, "properties" : { "vendor" : { "type" : "string", "index" : "not_analyzed" }, "URL" : { "type" : "string", "index" : "not_analyzed" }, "source" : { "type" : "string", "index" : "not_analyzed" }, "appid_action" : { "type" : "string", "index" : "not_analyzed" } } } } rt=self.es.indices.put_mapping(index=self.current(), doc_type=self.doc_feedback, body=mapping) mp=self.es.indices.get_mapping(index=self.current()) print 'Index created', rt, '\n' def delete(self): self.es.indices.delete(index=self.index_pattern, ignore=[400, 404]) def feedback_get(self, idd): doc=None try: doc = self.es.get(index=self.current(), doc_type=self.doc_feedback, id=idd) except elasticsearch.ElasticsearchException as e: print 'Get -', e.info return doc def feedback_exists(self, idd): IsExists = False try: IsExists = self.es.exists(index=self.current(), doc_type=self.doc_feedback, id=idd) except elasticsearch.ElasticsearchException as e: print 'Exists -', e.info except: pass return IsExists def feedback_insert(self, idd, doc): res = None try: res = self.es.index(index=self.current(), doc_type=self.doc_feedback, id=idd, body=doc) except elasticsearch.ElasticsearchException as e: print 'Insert -', e.info return res def current(self): index = 'event_{0}'.format(date.today().strftime('%Y%m%d')) return index
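# A hypothetical driver for the event_* IndexMgr above: create today's index,
# then insert a log document only if its id is not already present. The host
# and document values are invented; the field names follow the mapping above.
import hashlib

mgr = IndexMgr(host="localhost")
mgr.feedback_create()

doc = {
    "vendor": "acme",
    "URL": "http://example.org/login",
    "source": "proxy",
    "appid_action": "allow",
}
doc_id = hashlib.md5(doc["URL"].encode("utf-8")).hexdigest()
if not mgr.feedback_exists(doc_id):
    mgr.feedback_insert(doc_id, doc)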
class Docstore(): def __init__(self, hosts=settings.DOCSTORE_HOSTS, index=settings.DOCSTORE_INDEX, connection=None): self.hosts = hosts self.indexname = index if connection: self.es = connection else: self.es = Elasticsearch(hosts) def health(self): return self.es.cluster.health() def index_exists(self, index): return self.es.indices.exists(index=index) def exists(self, model, document_id): """ @param model: @param document_id: """ return self.es.exists(index=self.indexname, doc_type=model, id=document_id) def get(self, model, document_id, fields=None): """ @param model: @param document_id: @param fields: boolean Only return these fields """ if self.exists(model, document_id): ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[model] return ES_Class.get(document_id, using=self.es, index=self.indexname) return None def count(self, doctypes=[], query={}): """Executes a query and returns number of hits. The "query" arg must be a dict that conforms to the Elasticsearch query DSL. See docstore.search_query for more info. @param doctypes: list Type of object ('collection', 'entity', 'file') @param query: dict The search definition using Elasticsearch Query DSL @returns raw ElasticSearch query output """ logger.debug('count(index=%s, doctypes=%s, query=%s' % ( self.indexname, doctypes, query )) if not query: raise Exception("Can't do an empty search. Give me something to work with here.") doctypes = ','.join(doctypes) logger.debug(json.dumps(query)) return self.es.count( index=self.indexname, doc_type=doctypes, body=query, ) def search(self, doctypes=[], query={}, sort=[], fields=[], from_=0, size=MAX_SIZE): """Executes a query, get a list of zero or more hits. The "query" arg must be a dict that conforms to the Elasticsearch query DSL. See docstore.search_query for more info. @param doctypes: list Type of object ('collection', 'entity', 'file') @param query: dict The search definition using Elasticsearch Query DSL @param sort: list of (fieldname,direction) tuples @param fields: str @param from_: int Index of document from which to start results @param size: int Number of results to return @returns raw ElasticSearch query output """ logger.debug('search(index=%s, doctypes=%s, query=%s, sort=%s, fields=%s, from_=%s, size=%s' % ( self.indexname, doctypes, query, sort, fields, from_, size )) if not query: raise Exception("Can't do an empty search. Give me something to work with here.") doctypes = ','.join(doctypes) logger.debug(json.dumps(query)) _clean_dict(sort) sort_cleaned = _clean_sort(sort) fields = ','.join(fields) results = self.es.search( index=self.indexname, doc_type=doctypes, body=query, sort=sort_cleaned, from_=from_, size=size, _source_include=fields, ) return results
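# A short sketch of the search path in this slimmer Docstore: both count() and
# search() refuse an empty query, so a minimal Query DSL body is always passed.
# The host, index and field names are assumptions for illustration.
ds = Docstore(hosts="127.0.0.1:9200", index="ddrpublic-dev")

query = {"query": {"match": {"title": "photograph"}}}
total = ds.count(doctypes=["entity"], query=query)["count"]
print("matching entities: %s" % total)

results = ds.search(
    doctypes=["entity"],
    query=query,
    sort=[("id", "asc")],
    fields=["id", "title"],
    from_=0,
    size=25,
)
for hit in results["hits"]["hits"]:
    print(hit["_source"])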
# try:
#     es.indices.delete(index=INDEX)
# except:
#     pass
es.indices.create(index=INDEX, ignore=400)

import phonenumbers
import pycountry

for r in range(0, 500):
    phone = fake.phone_number().split("x")[0]
    Id = str(int("".join([c if c.isdigit() else "" for c in phone])))
    body = dict(phone=Id,
                name=fake.first_name(),
                age=random.choice(range(18, 35)),
                gender=random.choice(['male', 'female']),
                location=fake.city(),
                status=random.choice([1, 0]),
                status_message=" ".join(fake.text().split()[:10]))
    p = ("+%s" % Id.strip())
    try:
        phone_number = phonenumbers.parse(p, None)
        locale_code = phonenumbers.region_code_for_country_code(phone_number.country_code)
        country = pycountry.countries.get(alpha2=locale_code)
        body['country_name'] = country.name
        body['locale_code'] = locale_code
        if not (es.exists(index=INDEX, doc_type=DOC, id=Id)):
            es.index(index=INDEX, doc_type=DOC, id=Id, body=body)
    except Exception as e:
        print "Error :%s" % str(e)
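# A standalone sketch of the phone-number-to-country lookup performed in the
# loop above, without the Elasticsearch indexing. The sample number is made
# up; note that recent pycountry releases spell the keyword alpha_2 rather
# than alpha2.
import phonenumbers
import pycountry

raw = "+31 20 794 0800"  # placeholder number
parsed = phonenumbers.parse(raw, None)
region = phonenumbers.region_code_for_country_code(parsed.country_code)
country = pycountry.countries.get(alpha_2=region)
print("%s -> %s (%s)" % (raw, country.name, region))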
"person either by simply appending the fields of the former to the fields of " "the latter or by nesting the fields of the former into the document of type" "person.") p.add_argument('--index', metavar='<str>', dest='index', type=str, required=True, help='Name of index') p.add_argument('--node', metavar='<str>', dest='node', type=str, required=True, help='Url and port of node') args = p.parse_args() es = Elasticsearch([args.node]) same_as = es.search(index=args.index, doc_type='person', _source=['owl:sameAs'], body='{"query":{"exists":{"field":"owl:sameAs"}}}', size=1000)['hits']['hits'] ref_viaf = dict() for e in same_as: if es.exists(index=args.index, doc_type='person', id=e['_id']): print('Retrieving document ' + e['_source']['owl:sameAs'] + ' in order to add it to document ' + e['_id'] + '.') query_body = '{"query":{"ids":{"values":["' + e['_source']['owl:sameAs'][21:] + '"]}}}' viaf_entry = es.search(index=args.index, doc_type='viaf', _source=True, body=query_body)['hits']['hits'][0]['_source'] inner_viaf = es.get(index=args.index, doc_type='person', id=e['_id'], _source = True)['_source'] embedded_viaf = es.get(index=args.index, doc_type='person', id=e['_id'], _source = True)['_source'] print('Updating document ' + e['_id'] + '.')