class SQLToESImporter(object):
    """Copy the top-``company_count`` companies (ranked by worker count)
    from a Crunchbase-style SQL database into an ElasticSearch index.

    Typical use::

        importer = SQLToESImporter(company_count=50)
        importer.reimport()
    """

    # Number of companies (ordered by worker_count) to import.
    company_count: int
    # Rendered SELECT statement for the company rows.
    companies_select: str
    sql_engine: Engine
    es_client: Elasticsearch
    es_index: str
    # Running count of successfully indexed documents; doubles as the ES doc id.
    insertions: int

    def __init__(self, company_count: int = 100, cb_connect: str = SQL_CONNECT,
                 es_connect: List[Dict] = ES_CONNECT, es_index: str = ES_INDEX):
        """Connect to both data stores.

        :param company_count: how many top companies to select
        :param cb_connect: SQLAlchemy connection URL for the source database
        :param es_connect: list of ElasticSearch host dicts
        :param es_index: name of the target ES index
        :raises ValueError: if the ElasticSearch cluster does not answer a ping
        """
        self.company_count = company_count
        # Prep company selection: select top-<limit> companies with most workers.
        self.companies_select = CMPS_SELECT.format(limit=self.company_count)
        # Connect to the MySQL crunchbase database.
        self.sql_engine = create_engine(cb_connect)
        # Connect to the ES instance.
        self.es_client = Elasticsearch(list(es_connect))
        self.es_index = es_index
        self.insertions = 0
        if not self.es_client.ping():
            raise ValueError("ElasticSearch Ping Failed")

    def pull(self) -> 'Iterator[Dict]':
        """Yield one document dict per selected company, with its events.

        This is a generator (the original annotation said ``Dict``).  Any
        SQLAlchemyError or KeyError propagates to the caller unchanged —
        the original wrapped these in ``except ...: raise`` blocks, which
        is behaviorally identical to not catching them at all.
        """
        with self.sql_engine.connect() as conn:
            companies_result: ResultProxy = conn.execute(self.companies_select)
            for company in companies_result:
                # NOTE(review): query text is built with str.format;
                # company_id comes from our own database, but SQLAlchemy
                # parameter binding would still be safer than interpolation.
                events_select = EVENTS_SELECT.format(
                    company_id=company['company_id'])
                company_events = [
                    dict(event_date=event['event_date'],
                         event_code=event['event_code'],
                         event_desc=event['event_desc'],
                         event_url=event['event_url'])
                    for event in conn.execute(events_select)
                ]
                yield dict(
                    company_id=company['company_id'],
                    company_name=company['company_name'],
                    homepage_url=company['homepage_url'],
                    logo_url=company['logo_url'],
                    founded_date=company['founded_date'],
                    country=company['country'],
                    industry=company['industry'],
                    location=company['location'],
                    worker_count=company['worker_count'],
                    events=company_events,
                )

    def push(self, company_document: Dict) -> bool:
        """Index one company document; return True if ES created it.

        Uses ``self.insertions`` as the document id, so the counter only
        advances when a *new* document is created.
        """
        es_result = self.es_client.index(index=self.es_index,
                                         doc_type='company',
                                         id=self.insertions,
                                         body=company_document)
        # 'created' is the ES 2.x index-response flag; it is False when an
        # existing document with the same id was overwritten instead.
        if es_result['created']:
            self.insertions += 1
            return True
        return False

    def delete_index(self):
        """Drop the target index, ignoring missing-index (400/404) errors."""
        self.es_client.indices.delete(index=self.es_index, ignore=(400, 404))

    def reimport(self) -> int:
        """Rebuild the index from scratch; return the number of insertions."""
        self.insertions = 0
        self.delete_index()
        for company_document in self.pull():
            self.push(company_document)
        return self.insertions
# coding: utf-8
"""Smoke-test script: index one sample document into ElasticSearch,
then search it back with a phrase query and print the hits."""
from elasticsearch2 import Elasticsearch
from datetime import datetime

es = Elasticsearch(hosts="10.10.6.6")

# Index a single sample record (fixed id 3, so re-runs overwrite it).
es.index(index="keti10_10", doc_type="keti10_10", id=3,
         body={"bdcdyh": "123", "lx": '1',
               'postDate': '2017-12-30 12:11:06', 'qx': '北京',
               'records': 2, 'uuid': '00123dfad', 'zl': '北京海淀区'})

# Phrase search on the 'zl' field.
res = es.search(index="keti10_10", body={"query": {
    "match_phrase": {
        "zl": '北京'
    }
}})
for hit in res['hits']['hits']:
    hitmap = hit['_source']
    # Parenthesized single-argument print: runs on both Python 2 and 3
    # (the original used a Python-2-only print statement).
    print("%(zl)s %(postDate)s" % hitmap)