def IndexData(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    for file in fileHolder:
        index = file['segment_name'].lower()
        rawfiles = file['rawfiles']
        data_for_es = file['dataFrames']

        try:
            es.delete_index(index.replace(" ", ""))
        except Exception:  # index may not exist yet
            pass
        es.create_index(index.replace(" ", ""))

        # Loop over the dataframe and bulk-index the records
        docs = json.loads(data_for_es.to_json(orient='records'))
        es.bulk((es.index_op(doc) for doc in docs),
                index=index.replace(" ", ""),
                doc_type=index)

        # Create segment template
        file_names = [file.name for file in rawfiles]
        segment = Segments(name=index,
                           files_added=",".join(file_names),
                           es_index=index.replace(" ", ""))
        segment.save()

    segment = Segments.objects.get(name=index)
    return render(request, 'analyse.html', {'segment': segment})
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    es.bulk([es.index_op(doc) for doc in entries],
            index='memex',
            doc_type='page')
def add_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    index = os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex'
    doc_type = os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page'
    es.bulk([es.index_op(doc) for doc in entries],
            index=index,
            doc_type=doc_type)
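# Usage sketch (assumption, not part of the original snippets): driving the
# env-var-aware add_document() above. The documents and their fields are hypothetical.
import os

os.environ.setdefault('ELASTICSEARCH_SERVER', 'http://localhost:9200/')
os.environ.setdefault('ELASTICSEARCH_INDEX', 'memex')
os.environ.setdefault('ELASTICSEARCH_DOC_TYPE', 'page')

add_document([
    {'url': 'http://example.com/a', 'title': 'Page A'},
    {'url': 'http://example.com/b', 'title': 'Page B'},
])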
class Indexer(object):
    def __init__(self, input):
        self.input = input
        self.es = ElasticSearch()
        self.index_name = "psim"
        self.doc_type = 'book'

    def delete_index(self):
        # Delete index if already found one
        try:
            self.es.delete_index(index=self.index_name)
        except Exception:
            pass

    def create_index(self):
        self.es.create_index(index=self.index_name,
                             settings=self.get_index_settings())

    def get_index_settings(self):
        settings = {
            "mappings": {
                "book": {
                    "_all": {"enabled": "false"},
                    "properties": {
                        "codes": {"type": "string", "term_vector": "yes", "store": "true"},
                        "pid": {"type": "string"},
                        "embedding": {"type": "float", "store": "true"},
                        "magnitude": {"type": "float", "store": "true"}
                    }
                }
            }
        }
        return settings

    def documents(self):
        with open(self.input) as input_file:
            for line in input_file:
                json_doc = json.loads(line)
                yield self.es.index_op(json_doc, doc_type=self.doc_type)

    def index(self):
        self.delete_index()
        self.create_index()
        for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
            self.es.bulk(chunk, index=self.index_name, doc_type=self.doc_type)
        self.es.refresh(self.index_name)
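# Usage sketch (assumption, not part of the original snippet): rebuilding the "psim"
# index from a hypothetical newline-delimited JSON file, one document per line.
indexer = Indexer('books.jsonl')
indexer.index()  # delete, recreate, bulk-load in 1000-doc chunks, then refresh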
def update_document(entries):
    es_server = 'http://localhost:9200/'
    if os.environ.get('ELASTICSEARCH_SERVER'):
        es_server = os.environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    # es.update(index=os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex',
    #           doc_type=os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
    #           id=url,
    #           script=doc,
    #           upsert=True)
    index = os.environ['ELASTICSEARCH_INDEX'] if os.environ.get('ELASTICSEARCH_INDEX') else 'memex'
    doc_type = os.environ['ELASTICSEARCH_DOC_TYPE'] if os.environ.get('ELASTICSEARCH_DOC_TYPE') else 'page'
    es.bulk([es.update_op(doc, id=doc['url'], upsert=True) for doc in entries],
            index=index,
            doc_type=doc_type)
def commit(self):
    if len(self.buffer) > 0:
        logging.debug("Inserting {} to elasticsearch".format(len(self.buffer)))
        es = ElasticSearch(ELASTICSEARCH_URI)
        docs = []
        for doc in self.buffer:
            t = time.gmtime(int(doc['@timestamp'] / 1000))
            index = (ELASTICSEARCH_INDEX + "-" + str(t.tm_year).zfill(2) + "."
                     + str(t.tm_mon).zfill(2) + "." + str(t.tm_mday).zfill(2))
            docs.append(es.index_op(doc, index=index, doc_type=ELASTICSEARCH_DOC))
        if len(docs) > 0:
            try:
                es.bulk(docs)
                logging.debug("inserted %d records" % (len(docs)))
                self.buffer = []
            except Exception as e:
                logging.error("Insert Exception " + str(e))
def update_index(sender, created, **kwargs):
    """ A signal for indexing new coffeehouses upon creation """
    es = ElasticSearch()
    if created:
        m = sender.objects.last()
        es.bulk([
            es.index_op({
                "pk": m.pk,
                "name": m.name,
                "rating": m.rating,
                "location": {
                    "lon": m.position.longitude,
                    "lat": m.position.latitude
                }
            }),
        ], doc_type="place", index="toerh_coffee")
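# Wiring sketch (assumption, not shown above): a receiver with this signature is
# typically hooked up to Django's post_save signal; "CoffeeHouse" is a hypothetical model.
from django.db.models.signals import post_save

post_save.connect(update_index, sender=CoffeeHouse)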
def bulkpush(sendto, offline, queue, errorqueue, debug=False):
    '''
    Send data in a bulk document to target ElasticSearch clusters

    If a cluster is unreachable, data will be offloaded to a temporary
    directory until it is back online

    Keyword arguments:
    sendto -- list of online clusters to send data to
    offline -- list of offline clusters to withhold data for
    queue -- multiprocessing queue of documents ready to be sent
    errorqueue -- multiprocessing queue of error documents ready to be sent
    '''
    docs = []
    errordocs = []
    while not queue.empty():
        docs.append(queue.get())
    while not errorqueue.empty():
        errordocs.append(errorqueue.get())

    for cluster in sendto:
        # if debug:
        #     pprint(cluster)
        es = ElasticSearch(cluster['url'])
        if docs:
            r = es.bulk((es.index_op(doc) for doc in docs),
                        index=cluster['data index'],
                        doc_type=cluster['data index type'])
        if errordocs:
            r = es.bulk((es.index_op(doc) for doc in errordocs),
                        index=cluster['error index'],
                        doc_type=cluster['error index type'])
        if debug:
            # TODO: add try/except with informative errors
            if not r['errors']:
                # TODO: dump data to be sent next time the script is run
                print('\n\t', 'Bulk package was received by', cluster['name'])
            else:
                print('\n\t', 'Bulk package was not accepted by', cluster['name'])

    if offline:
        _localoffload(offline=offline, docs=docs, errordocs=errordocs, debug=debug)
"coordinates" : coords, # 4, 5 "feature_class" : row[6], "feature_code" : row[7], "country_code2" : row[8], "country_code3" : country_code3, "cc2" : row[9], "admin1_code" : row[10], "admin2_code" : row[11], "admin3_code" : row[12], "admin4_code" : row[13], "population" : row[14], "elevation" : row[15], "dem" : row[16], "timzeone" : row[17], "modification_date" : "2014-01-01" } yield es.index_op(doc, index='geonames', doc_type='geoname') except: count += 1 print 'Exception count:', count chunk_count = 0 for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500): es.bulk(chunk) chunk_count += 1 print 'Chunk count:', chunk_count es.refresh('geonames')
def __init__(self,start,**kwargs): """ Invoke a Downloader object to get data from the Record. It will check to see if the necessary files are already downloaded and use those instead of querying FDSys. Downloaders are the endpoint for raw data. Required arguments: start : In form 'YYYY-MM-DD.' This is the day/start day you want. Optional arguments: parse : Defaults to True. This tells the downloader whether you just want the raw files, or if you also want it to extract data from the HTML. (Default means yes, give me the data.) end : Same form as start. This is the end date. outpath : Output path RELATIVE TO the present working directory. Defaults to 'output' and works fine when you run it from the repo's root directory. do_mode : Specify what kind of data you want from the parser. If do_mode is not set, the downloader will do absolutely zilch. do_mode can take the following values: json : write json files in a /json directory for that day of the Record. es : Specify the URL and index of an ElasticSearch cluster with arguments es_url and index, and it will pass each file to that cluster for indexing. WARNING: This doesn't handle any mappings, and it doesn't check to see if records are already there, so it will overwrite old files in the same index WITHOUT versioning. also specify: es_url : ElasticSearch cluster url index : ElasticSearch cluster index yield : For each day of the Record the user specifies, the downloader acts like a generator, yielding that day's "crfile" dictionary. """ self.status = 'idle' logging.debug('Downloader object ready with params:') logging.debug(','.join(['='.join([key,value]) for key,value in kwargs.items()])) if 'outpath' in kwargs.keys(): outpath = kwargs['outpath'] else: outpath = 'output' if kwargs['do_mode'] == 'es': es = ElasticSearch(kwargs['es_url']) for chunk in bulk_chunks((es.index_op(crfile.crdoc,id=crfile.crdoc.pop('id')) for crfile in self.bulkdownload(start,**kwargs)), docs_per_chunk=100): es.bulk(chunk,index=kwargs['index'],doc_type='crdoc') elif kwargs['do_mode'] == 'json': # outpath called so often to make it easy to follow # the idea that we're traversing a directory tree for crfile in self.bulkdownload(start,**kwargs): filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json' outpath = os.path.split(crfile.filepath)[0] outpath = os.path.split(outpath)[0] if 'json' not in os.listdir(outpath): os.mkdir(os.path.join(outpath,'json')) outpath = os.path.join(outpath,'json',filename) with open(outpath,'w') as out_json: json.dump(crfile.crdoc,out_json) elif kwargs['do_mode'] == 'yield': self.yielded = self.bulkdownload(start,parse=True,**kwargs) elif kwargs['do_mode'] == 'noparse': self.bulkdownload(start,parse=False,**kwargs) else: return None
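# Usage sketch (assumption, not from the original source): the docstring above describes
# the do_mode options; the class name "Downloader", dates, URL and index are illustrative.
dl = Downloader('2016-01-04',
                end='2016-01-08',
                do_mode='es',
                es_url='http://localhost:9200/',
                index='congressional-record')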
    }
}

es.create_index(ELASTICSEARCH_INDEX, settings=index_settings)

for filename in FILES:
    print "Processing %s" % filename
    sf = shapefile.Reader(filename)
    shapes = sf.shapes()
    for i, shape in enumerate(shapes, start=1):
        points = [(p[0], p[1]) for p in shape.points]
        data = {
            'filename': filename,
            'location': {
                'type': 'polygon',
                'coordinates': [points]
            }
        }
        # Close the polygon ring if needed (the list object is shared with `data`)
        if points[-1] != points[0]:
            points.append(points[0])
        try:
            es.bulk([es.index_op(data)],
                    doc_type=ELASTICSEARCH_DOC,
                    index=ELASTICSEARCH_INDEX)
        except Exception:
            print "Exception"
def getFeeds():
    print "getting feeds"
    es = ElasticSearch('http://fisensee.ddns.net:9200/')
    query = {"query": {"range": {"date": {"lte": "now-1w/w"}}}}
    oldFeeds = es.search(query, size=300, index='feeds')
    if len(oldFeeds['hits']['hits']) != 0:
        es.bulk(es.delete_op(id=feed['_id'], index='feeds', doc_type='feed')
                for feed in oldFeeds['hits']['hits'])

    feedSources = FeedSource.objects.all()
    feeds = []
    defaultText = 'undefined'
    defaultDate = datetime.datetime.now().isoformat()
    utc = pytz.utc
    berlin = pytz.timezone('Europe/Berlin')
    now = datetime.datetime.today()
    dateThreshold = now - datetime.timedelta(weeks=2)

    allUrls = [feedSource.sourceUrl for feedSource in feedSources]
    urls = set(allUrls)
    for url in urls:
        source = feedparser.parse(url)
        for entry in source['items']:
            feed = {
                'title': defaultText,
                'description': defaultText,
                'link': defaultText,
                'date': defaultDate,
                'url': defaultText
            }
            if 'title' in entry:
                feed['title'] = entry['title']
            if 'description' in entry:
                feed['description'] = entry['description']
            if 'link' in entry:
                feed['link'] = entry['link']
            if 'published_parsed' in entry:
                date = datetime.datetime.fromtimestamp(
                    time.mktime(entry['published_parsed']))
                if date < dateThreshold:
                    break
                utcDate = utc.localize(date)
                feed['date'] = utcDate.astimezone(berlin).isoformat()
            # id creation should be enough for now, but it's made to fail
            if 'title' in entry or 'published_parsed' in entry:
                feed['id'] = base64.urlsafe_b64encode(
                    hashlib.sha256((feed['title'] + feed['date']).encode('utf8')).hexdigest())
            else:
                feed['id'] = base64.urlsafe_b64encode(
                    hashlib.sha256((feed['title']).encode('utf8')).hexdigest())
            feed['url'] = url
            feeds.append(feed)

    es.bulk((es.index_op(feed, **{'id': feed.pop('id')}) for feed in feeds),
            index='feeds', doc_type='feed')
    print es.refresh('feeds')
"coordinates": coords, # 4, 5 "feature_class": row[6], "feature_code": row[7], "country_code2": row[8], "country_code3": country_code3, "cc2": row[9], "admin1_code": row[10], "admin2_code": row[11], "admin3_code": row[12], "admin4_code": row[13], "population": row[14], "elevation": row[15], "dem": row[16], "timzeone": row[17], "modification_date": "2014-01-01" } yield es.index_op(doc, index='geonames', doc_type='geoname') except: count += 1 print 'Exception count:', count chunk_count = 0 for chunk in bulk_chunks(documents(reader, es), docs_per_chunk=500): es.bulk(chunk) chunk_count += 1 print 'Chunk count:', chunk_count es.refresh('geonames')
class MySync(object): ts = 0 #last chunk time log_file = None log_pos = None def __init__(self): self.config = yaml.load(open('./etc/config.yaml')) self.mark_path = self.config['binlog']['mark'] self.bulk_size = self.config['es']['bulk_size'] self.excludes_fields = self.config['slave']['excludes_fields'] self.es = ElasticSearch('http://{host}:{port}/'.format( host=self.config['es']['host'], port=self.config['es']['port'] )) #set logger logging.basicConfig( level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', filename=self.config['log']['run'] ) logging.getLogger('elasticsearch').setLevel(logging.INFO) logging.getLogger('elasticsearch.trace').setLevel(logging.INFO) logging.getLogger('elasticsearch.trace').addHandler(logging.StreamHandler()) #resume stream if os.path.isfile(self.mark_path): with open(self.mark_path, 'r') as y: mark = yaml.load(y) self.log_file = mark.get('log_file') self.log_pos = mark.get('log_pos') logging.info('resume stream >> file:%s, pos:%s' % (self.log_file, self.log_pos)) """ mark binlog position """ def mark_binlog(self): if self.log_file and self.log_pos: with open(self.mark_path, 'w') as y: logging.info('mark binlog >> file:%s, pos:%s' % (self.log_file, self.log_pos)) yaml.safe_dump({'log_file':self.log_file, 'log_pos':self.log_pos}, y, default_flow_style=False) """ format fields """ def _format(self, dat): for k,v in dat.items(): if isinstance(v, datetime): dat[k] = v.strftime('%Y-%m-%d %H:%M:%S') elif isinstance(v, date): dat[k] = v.strftime('%Y-%m-%d') if k in self.excludes_fields: del dat[k] return dat """ mysql binlog event handle """ def proc_binlog(self): stream = BinLogStreamReader( connection_settings = self.config['mysql'], server_id = self.config['slave']['server_id'], log_file = self.log_file, log_pos = self.log_pos, only_schemas = self.config['slave']['schemas'], blocking = True, resume_stream = bool(self.log_file and self.log_pos), only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent] ) for binlogevent in stream: #binlogevent.dump() self.log_file = stream.log_file self.log_pos = stream.log_pos for row in binlogevent.rows: pk = binlogevent.primary_key table = binlogevent.table schema = binlogevent.schema if isinstance(binlogevent, WriteRowsEvent): yield self.es.index_op(self._format(row['values']), doc_type=table, index=schema, id=row['values'][pk]) elif isinstance(binlogevent, UpdateRowsEvent): yield self.es.update_op(self._format(row['after_values']), doc_type=table, index=schema, id=row['after_values'][pk]) elif isinstance(binlogevent, DeleteRowsEvent): yield self.es.delete_op(doc_type=table, index=schema, id=row['values'][pk]) else: continue stream.close() """ notify exception """ def send_email(self, msg): import smtplib from email.mime.text import MIMEText msg = MIMEText(msg, 'plain', 'utf-8') msg['From'] = self.config['email']['from']['user'] msg['To'] = ','.join(self.config['email']['to']) msg['Subject'] = 'Binlog Sync Exception:' try: s = smtplib.SMTP() s.connect(self.config['email']['host'], self.config['email']['port']) s.ehlo() s.starttls() s.login(user=self.config['email']['from']['user'], password=self.config['email']['from']['passwd']) s.sendmail(msg['From'], self.config['email']['to'], msg.as_string()) s.quit() except Exception: import traceback logging.error(traceback.format_exc()) """ bulk chunk check every second """ def bulk_chunks(self, actions, docs_per_chunk=300, bytes_per_chunk=None): chunk = [] docs = bytes = 0 for action in actions: next_len = len(action) + 1 #+1 for \n if 
chunk and ( (docs_per_chunk and docs >= docs_per_chunk) or (bytes_per_chunk and bytes + next_len > bytes_per_chunk) or (self.ts+1 < int(time.time())) ): #print(">>>chunk:%d" % len(chunk)) yield chunk chunk = [] docs = bytes = 0 self.ts = int(time.time()) chunk.append(action) docs += 1 bytes += next_len if chunk: yield chunk """ run entry """ def run(self): try: for chunk in self.bulk_chunks(self.proc_binlog(), docs_per_chunk=self.bulk_size): #time.sleep(1) self.es.bulk(chunk) self.mark_binlog() except KeyboardInterrupt: pass except Exception: import traceback logging.error(traceback.format_exc()) self.send_email(msg=traceback.format_exc()) raise
    return data

es = ElasticSearch('http://localhost:9200/')
es.delete_index('pet')

spider = Spider()
breeds = spider.getPetBreeds()
p = Pinyin()
for breed in breeds:
    flg = 1
    page = 1
    pet_list = []
    while flg:
        pets = spider.getPets(breed, (page - 1) * spider.limit)
        if not pets:
            flg = 0
        else:
            page = page + 1
            for pet in pets:
                pet_obj = {}
                pet_obj['name'] = pet['name']
                pet_obj['img'] = pet['img']
                pet_obj['type'] = breed['ename']
                pet_list.append(pet_obj)
                #print pet['name'] + '\t' + p.get_pinyin(pet['name'], '')
    print breed['ename'] + '\n'
    if not pet_list:
        continue
    doc_type = p.get_pinyin(breed['ename'].replace('宠物', ''), '')
    es.bulk((es.index_op(pet_obj) for pet_obj in pet_list),
            doc_type=doc_type, index='pet')

es.refresh('pet')
class ElasticSearch(object): conn = None url = settings.ELASTICSEARCH_URL index_name = settings.ELASTICSEARCH_INDEX_NAME stdout = None stderr = None def __init__(self, index_name=None, stdout=None, stderr=None): self.conn = PyElasticSearch() if index_name: self.index_name = index_name if stdout: self.stdout = stdout if stderr: self.stderr = stderr def create_index(self, delete=True): if delete: try: self.conn.delete_index(self.index_name) except ElasticHttpNotFoundError as e: pass mappings = dict( (k, v) for k, v in get_elasticsearch_properties().items()) self.conn.create_index(self.index_name, settings={'mappings': mappings}) def index_activity_by_id(self, activity_id): activity = HistoricalActivity.objects.get(pk=activity_id) return self.index_activity(activity) def delete_activity_by_id(self, activity_id): activity = HistoricalActivity.objects.get(pk=activity_id) return self.delete_activity(activity) def index_activity(self, activity): for doc_type in DOC_TYPES_ACTIVITY: docs = self.get_activity_documents(activity, doc_type=doc_type) if len(docs) > 0: try: self.conn.bulk((self.conn.index_op( doc, id=doc.pop('id'), parent=doc.pop('_parent', None)) for doc in docs), index=self.index_name, doc_type=doc_type) except BulkError as e: for error in e.errors: msg = '%s: %s on ID %s' % ( error['index']['error']['type'], error['index']['error']['reason'], error['index']['_id']) if 'caused_by' in error['index']['error']: msg += ' (%s: %s)' % ( error['index']['error']['caused_by']['type'], error['index']['error']['caused_by']['reason']) self.stderr and self.stderr.write(msg) def index_investor(self, investor): for doc_type in DOC_TYPES_INVESTOR: docs = self.get_investor_documents(investor, doc_type=doc_type) if len(docs) > 0: try: self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs), index=self.index_name, doc_type=doc_type) except BulkError as e: for error in e.errors: msg = '%s: %s on ID %s' % ( error['index']['error']['type'], error['index']['error']['reason'], error['index']['_id']) if 'caused_by' in error['index']['error']: msg += ' (%s: %s)' % ( error['index']['error']['caused_by']['type'], error['index']['error']['caused_by']['reason']) self.stderr and self.stderr.write(msg) def index_activity_documents(self, activity_identifiers=[]): activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter( fk_status__in=( HistoricalActivity.STATUS_ACTIVE, HistoricalActivity.STATUS_PENDING, HistoricalActivity.STATUS_OVERWRITTEN, HistoricalActivity.STATUS_DELETED)).distinct().values_list( 'activity_identifier', flat=True).distinct() for doc_type in DOC_TYPES_ACTIVITY: docs = [] # Collect documents self.stdout and self.stdout.write( 'Collect %ss for %i deals...' % (doc_type, len(activity_identifiers))) for activity_identifier in activity_identifiers: for activity in self.get_activity_versions( activity_identifier): docs.extend( self.get_activity_documents(activity, doc_type=doc_type)) # Bulk index documents self.stdout and self.stdout.write('Index %i %ss...' 
% (len(docs), doc_type)) if len(docs) > 0: paginator = Paginator(docs, 1000) for page in paginator.page_range: try: self.conn.bulk( (self.conn.index_op(doc, id=doc.pop('id'), parent=doc.pop( '_parent', None)) for doc in paginator.page(page)), index=self.index_name, doc_type=doc_type) except BulkError as e: for error in e.errors: msg = '%s: %s on ID %s' % ( error['index']['error']['type'], error['index']['error']['reason'], error['index']['_id']) if 'caused_by' in error['index']['error']: msg += ' (%s: %s)' % (error['index']['error'] ['caused_by']['type'], error['index']['error'] ['caused_by']['reason']) self.stderr and self.stderr.write(msg) self.conn.refresh() def index_investor_documents(self): investors = Investor.objects.public().order_by( 'investor_identifier', '-id').distinct('investor_identifier') for doc_type in DOC_TYPES_INVESTOR: docs = [] # Collect documents self.stdout and self.stdout.write( 'Collect %ss for %i investors...' % (doc_type, investors.count())) for investor in investors: docs.extend( self.get_investor_documents(investor, doc_type=doc_type)) # Bulk index documents self.stdout and self.stdout.write('Index %i %ss...' % (len(docs), doc_type)) if len(docs) > 0: try: self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs), index=self.index_name, doc_type=doc_type) except BulkError as e: for error in e.errors: msg = '%s: %s on ID %s' % ( error['index']['error']['type'], error['index']['error']['reason'], error['index']['_id']) if 'caused_by' in error['index']['error']: msg += ' (%s: %s)' % ( error['index']['error']['caused_by']['type'], error['index']['error']['caused_by']['reason']) self.stderr and self.stderr.write(msg) #def index_activity_by_version(self, activity_identifier): # for doc_type in get_elasticsearch_properties().keys(): # docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type) # if len(docs) > 0: # try: # self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs), # index=self.index_name, # doc_type=doc_type) # except BulkError as e: # for error in e.errors: # stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % ( # error['index']['error']['type'], # error['index']['error']['reason'], # error['index']['error']['caused_by']['type'], # error['index']['error']['caused_by']['reason'], # error['index']['_id'] # )) def get_activity_versions(self, activity_identifier): versions = [] # get the newest non-pending, readable historic version: try: newest = HistoricalActivity.objects.filter( activity_identifier=activity_identifier, fk_status__in=( HistoricalActivity.STATUS_ACTIVE, HistoricalActivity.STATUS_OVERWRITTEN, HistoricalActivity.STATUS_DELETED)).distinct().latest() if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED: versions.append(newest) except HistoricalActivity.DoesNotExist: newest = None # get newer pendings pendings = HistoricalActivity.objects.filter( activity_identifier=activity_identifier, fk_status_id=HistoricalActivity.STATUS_PENDING).distinct() if newest: pendings.filter(history_date__gt=newest.history_date) versions.extend(pendings) return versions def get_activity_documents(self, activity, doc_type='deal'): docs = [] deal_attrs = { 'id': activity.id, 'activity_identifier': activity.activity_identifier, 'historical_activity_id': activity.id, 'status': activity.fk_status_id, } # Todo: Is there a nice way to prevent this extra Activity query? # e.g. 
if we save is_public/deal_scope as ActivityAttributes public_activity = Activity.objects.filter( activity_identifier=activity.activity_identifier).order_by( '-id').first() if public_activity: deal_attrs.update({ 'is_public': public_activity.is_public, 'deal_scope': public_activity.deal_scope, 'deal_size': public_activity.deal_size, 'current_negotiation_status': public_activity.negotiation_status, 'top_investors': public_activity.top_investors, 'fully_updated_date': public_activity.fully_updated_date, }) else: # Fixme: This should not happen self.stderr and self.stderr.write( _('Missing activity for historical activity %i (Activity identifier: #%i)' % (activity.id, activity.activity_identifier))) #except Activity.MultipleObjectsReturned: # # Fixme: This should not happen # self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % ( # activity.id, # activity.activity_identifier # ))) for a in activity.attributes.select_related('fk_group__name').order_by( 'fk_group__name'): # do not include the django object id if a.name == 'id': continue attribute = None attribute_key = '%s_attr' % a.name if attribute_key in get_elasticsearch_properties( )['deal']['properties'].keys(): attribute = { 'value': a.value, 'value2': a.value2, 'date': a.date, 'is_current': a.is_current, } value = a.value # Area field? if a.name and 'area' in a.name and a.polygon is not None: # Get polygon #value = json.loads(a.polygon.json) # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work #value['type'] = 'multipolygon' value = a.polygon.json or '' # do not include empty values if value is None or value == '': continue # Doc types: location, data_source or contract group_match = a.fk_group and a.fk_group.name or '' group_match = re.match( '(?P<doc_type>location|data_source|contract)_(?P<count>\d+)', group_match) if group_match: dt, count = group_match.groupdict()['doc_type'], int( group_match.groupdict()['count']) if doc_type == dt: while len(docs) < count: docs.append({ '_parent': activity.activity_identifier, 'id': a.id, #'%i_%i' % (a.id, count), }) docs[count - 1][a.name] = [ value, ] # Set doc type counter within deal doc type (for location/data_source/contract) elif doc_type == 'deal': # Set counter key = '%s_count' % dt if key not in deal_attrs.keys(): deal_attrs[key] = count elif deal_attrs[key] < count: deal_attrs[key] = count # Create list with correct length to ensure formset values have the same index if not a.name in deal_attrs: deal_attrs[a.name] = [''] * count if attribute: deal_attrs[attribute_key] = [''] * count else: while len(deal_attrs[a.name]) < count: deal_attrs[a.name].append('') if attribute: deal_attrs[attribute_key].append('') deal_attrs[a.name][count - 1] = value if attribute: deal_attrs['%s_attr' % a.name][count - 1] = attribute # Doc type: deal and not formset elif doc_type == 'deal': if a.name in deal_attrs: deal_attrs[a.name].append(value) if '%s_attr' % a.name in get_elasticsearch_properties( )['deal']['properties'].keys(): deal_attrs['%s_attr' % a.name].append(attribute) else: deal_attrs[a.name] = [ value, ] if '%s_attr' % a.name in get_elasticsearch_properties( )['deal']['properties'].keys(): deal_attrs['%s_attr' % a.name] = [ attribute, ] if doc_type == 'deal': # Additionally save operational company attributes oc = Investor.objects.filter( investoractivityinvolvement__fk_activity__activity_identifier= activity.activity_identifier) if oc.count() > 0: oc = oc.first() for field in Investor._meta.fields: 
if isinstance(field, ForeignKey): deal_attrs['operational_company_%s' % field.name] = getattr( oc, '%s_id' % field.name) else: deal_attrs['operational_company_%s' % field.name] = getattr(oc, field.name) else: pass #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier) # Create single document for each location # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now? spatial_names = list(get_spatial_properties()) for i in range(deal_attrs.get('location_count', 0)): doc = deal_attrs.copy() for name in spatial_names: if not name in doc: continue if len(deal_attrs[name]) > i: doc[name] = deal_attrs[name][i] else: doc[name] = '' # Set unique ID for location (deals can have multiple locations) doc['id'] = '%s_%i' % (doc['id'], i) point_lat = doc.get('point_lat', None) point_lon = doc.get('point_lon', None) if point_lat and point_lon: # Parse values try: parsed_lat, parsed_lon = float(point_lat), float(point_lon) doc['geo_point'] = '%s,%s' % (point_lat, point_lon) except ValueError: doc['geo_point'] = '0,0' else: doc['point_lat'] = '0' doc['point_lon'] = '0' doc['geo_point'] = '0,0' # FIXME: we dont really need 'point_lat' and 'point_lon' here, # so we should pop them from doc when adding 'geo_point' docs.append(doc) # Update docs with export values for doc in docs: doc.update(self.get_export_properties(doc, doc_type=doc_type)) return docs def get_export_properties(self, doc, doc_type='deal'): if doc_type == 'investor': return ExportInvestorForm.export(doc) elif doc_type == 'involvement': return InvestorVentureInvolvementForm.export(doc) else: properties = { 'deal_scope_export': doc.get('deal_scope', ''), 'is_public_export': doc.get('is_public', False) and str(_('Yes')) or str(_('No')), 'deal_size_export': doc.get('deal_size', ''), 'current_negotiation_status_export': doc.get('current_negotiation_status', ''), 'top_investors_export': doc.get('top_investors', ''), 'fully_updated_date_export': doc.get('fully_updated_date', ''), } # Doc types: deal, location, contract and data_source for form in ChangeDealView.FORMS: formset_name = hasattr(form, "form") and form.Meta.name or None form = formset_name and form.form or form properties.update(form.export(doc, formset=formset_name)) properties.update( ExportInvestorForm.export(doc, prefix='operational_company_')) return properties def get_investor_documents(self, investor, doc_type='investor'): docs = [] # Doc types: involvement and investor if doc_type == 'involvement': ivis = InvestorVentureInvolvement.objects.filter( Q(fk_venture=investor) | Q(fk_investor=investor)) for ivi in ivis: doc = {} for field in ivi._meta.local_fields: if isinstance(field, ForeignKey): doc[field.name] = getattr(ivi, '%s_id' % field.name) else: doc[field.name] = getattr(ivi, field.name) docs.append(doc) elif doc_type == 'investor': doc = {} for field in investor._meta.local_fields: if isinstance(field, ForeignKey): doc[field.name] = getattr(investor, '%s_id' % field.name) else: doc[field.name] = getattr(investor, field.name) docs.append(doc) # Update docs with export values for doc in docs: doc.update(self.get_export_properties(doc, doc_type=doc_type)) return docs def refresh_index(self): self.conn.refresh(self.index_name) def search(self, elasticsearch_query, doc_type='deal', sort=[]): """ Executes paginated queries until all results have been retrieved. @return: The full list of hits. 
""" start = 0 size = 10000 # 10000 is the default elasticsearch max_window_size (pagination is cheap, so more is not necessarily better) raw_result_list = [] done = False while not done: query = { 'query': elasticsearch_query, 'from': start, 'size': size, } if sort: query['sort'] = sort query_result = self.conn.search(query, index=self.index_name, doc_type=doc_type) raw_result_list.extend(query_result['hits']['hits']) results_total = query_result['hits']['total'] if len(raw_result_list) >= results_total: done = True else: start = len(raw_result_list) print('\nElasticsearch returned %i documents from a total of %i \n\n' % (len(raw_result_list), query_result['hits']['total'])) return raw_result_list def delete_activity(self, activity): for doc_type in DOC_TYPES_ACTIVITY: try: if doc_type == 'deal': self.conn.delete(id=activity.activity_identifier, index=self.index_name, doc_type=doc_type) else: self.conn.delete_by_query(query={ "parent_id": { "type": "deal", "id": str(activity.activity_identifier), } }, index=self.index_name, doc_type=doc_type) except ElasticHttpNotFoundError as e: pass def get_deals_by_activity_identifier(self, activity_identifier, doc_type='deal'): return self.search({ "constant_score": { "filter": { "term": { "activity_identifier": activity_identifier } } } })
    Rthandler.setFormatter(formatter)
    logging.getLogger().addHandler(Rthandler)


def get_para_5m_raw_data():
    examples = []
    lines = io.open(PATH, 'r', encoding='utf-8').readlines()
    for i in lines:
        s1 = i.split("\t")[0].lower()
        s2 = i.split("\t")[1].lower()
        examples.append({'content': s1, 'type': 'origin'})
        examples.append({'content': s2, 'type': 'para'})
    return examples


def document(sentences):
    for s in sentences:
        dic = {'content': s['content'], 'type': s['type']}
        yield es.index_op(dic)


if __name__ == '__main__':
    # init_log()
    sentences = get_para_5m_raw_data()
    doc_num = 0
    for chunk in bulk_chunks(document(sentences), docs_per_chunk=1000,
                             bytes_per_chunk=100000):
        es.bulk(chunk, doc_type='sentence', index='para-nmt-50m')
        doc_num += 1000
        print("indexed " + str(doc_num) + " docs")
        logging.info("indexed " + str(doc_num) + " docs")
class SearchIndex(object):
    def __init__(self, model):
        self.es = ElasticSearch()
        self.model = model

    def put_mapping(self, index, doc_type):
        mapping = {
            doc_type: {
                "properties": {
                    "location": {
                        "type": "geo_point"
                    },
                }
            }
        }
        self.es.put_mapping(index=index, doc_type=doc_type, mapping=mapping)

    def bulk_items(self, index, doc_type):
        for m in self.model.objects.all():
            self.es.bulk([
                self.es.index_op({
                    "pk": m.pk,
                    "name": m.name,
                    "rating": m.rating,
                    "address": m.address,
                    "description": m.description,
                    "location": {
                        "lon": m.longitude,
                        "lat": m.latitude
                    }
                }),
            ], doc_type=doc_type, index=index)

    def search(self, index, question, longitude, latitude, size=10):
        #self.es.delete_index(index)
        try:
            self.es.create_index(index)
            self.put_mapping(index, "place")
            self.bulk_items(index, "place")
        except IndexAlreadyExistsError:
            pass

        query = {
            "query": {
                "function_score": {
                    "query": {
                        "bool": {
                            "should": [
                                {"match": {"name": question}},
                                {"match": {"_all": {
                                    "query": question,
                                    "operator": "or",
                                    "fuzziness": "auto",
                                    "zero_terms_query": "all"
                                }}}
                            ]
                        }
                    },
                    "functions": [
                        {"exp": {"rating": {"origin": 5, "scale": 1, "offset": 0.1}}},
                    ]
                }
            }
        }
        if longitude and longitude is not None:
            query['query']['function_score']['functions'] = [
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude},
                                 "offset": "550m", "scale": "1km"}
                }},
                {'gauss': {
                    "location": {"origin": {"lat": latitude, "lon": longitude},
                                 "offset": "500m", "scale": "2km"}
                }},
            ]

        results = self.es.search(query, index=index, size=size)
        self.es.refresh()
        return results
    }
}

es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)

dir = DATA_FILES_JSON + '/' + dateYMD
for filename in os.listdir(dir):
    if filename.endswith('.json'):
        with open(dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                    index='write-ads', doc_type='ad')

es.refresh("write-ads")

res = es.search('website:com', index='write-ads')
print("Got %d Hits for .com websites" % res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit["_source"])

res = es.search('website:in', index='write-ads')
print("Got %d Hits for .in websites" % res['hits']['total'])

res = es.search('category:entertainment', index='write-ads')
print("Got %d Hits for category:Entertainment" % res['hits']['total'])
class MySync(object): log_file = None log_pos = None def __init__(self): print '[INFO] starting ...' self.config = yaml.load(open('./etc/config.yaml')) self.mark_path = self.config['binlog']['mark'] self.bulk_size = self.config['es']['bulk_size'] self.excludes_fields = self.config['slave']['excludes_fields'] self.es = ElasticSearch('http://{host}:{port}/'.format(host=self.config['es']['host'], port=self.config['es']['port'])) """ resume stream """ if os.path.isfile(self.mark_path): with open(self.mark_path, 'r') as y: mark = yaml.load(y) self.log_file = mark.get('log_file') self.log_pos = mark.get('log_pos') logging.info("resume stream : file: {file}, pos: {pos}".format(file=self.log_file,pos=self.log_pos)) def mark_binlog(self): if self.log_file and self.log_pos: with open(self.mark_path, 'w') as y: logging.info("mark binlog: binlog_file: {file}, pos: {pos}".format(file=self.log_file, pos=self.log_pos)) yaml.safe_dump({"log_file": self.log_file, "log_pos": self.log_pos}, y, default_flow_style=False) def _format(self, dat): for k,v in dat.items(): if isinstance(v, datetime): dat[k] = v.strftime('%Y-%m-%d %H:%M:%S') elif isinstance(v, date): dat[k] = v.strftime('%Y-%m-%d') if k in self.excludes_fields: del dat[k] return dat def proc_binlog(self): stream = BinLogStreamReader( connection_settings = self.config['mysql'], server_id = self.config['slave']['server_id'], log_file = self.log_file, log_pos = self.log_pos, only_schemas = self.config['slave']['schemas'], blocking = True, resume_stream = bool(self.log_file and self.log_pos), only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent] ) for binlogevent in stream: self.log_file = stream.log_file self.log_pos = stream.log_pos for row in binlogevent.rows: pk = binlogevent.primary_key table = binlogevent.table schema = binlogevent.schema if isinstance(binlogevent, WriteRowsEvent): yield self.es.index_op(self._format(row['values']), doc_type=table, index=schema, id=row['values'][pk]) elif isinstance(binlogevent, UpdateRowsEvent): yield self.es.update_op(self._format(row['after_values']), doc_type=table, index=schema, id=row['after_values'][pk]) elif isinstance(binlogevent, DeleteRowsEvent): yield self.es.delete_op(doc_type=table, index=schema, id=row['values'][pk]) else: logging.warning("unsupport event type") continue stream.close() def send_email(self, msg): import smtplib from email.mime.text import MIMEText msg = MIMEText(msg, 'plain', 'utf-8') msg['From'] = self.config['email']['from']['user'] msg['To'] = ','.join(self.config['email']['to']) msg['Subject'] = 'Binlog Sync Exception:' try: s = smtplib.SMTP(); s.connect(self.config['email']['host'], self.config['email']['port']) s.ehlo() s.starttls() s.login(user=self.config['email']['from']['user'], password=self.config['email']['from']['passwd']) s.sendmail(msg['From'], self.config['email']['to'], msg.as_string()) s.quit() except Exception: import traceback logging.error(traceback.format_exc()) def run(self): try: if self.bulk_size < 2: for action in self.proc_binlog(): self.es.bulk([action]) self.mark_binlog() else: for chunk in bulk_chunks(self.proc_binlog(), docs_per_chunk=self.bulk_size): self.es.bulk(chunk) self.mark_binlog() except KeyboardInterrupt: pass except Exception: import traceback logging.error(traceback.format_exc()) self.send_email(msg=traceback.format_exc()) raise
class IbbdElasticSearch:
    """
    ES operations wrapper
    Docs: http://pyelasticsearch.readthedocs.io/en/latest/
    """
    es = None
    config = {}
    mapping_is_set = False  # whether the ES mapping has already been set

    def __init__(self, config):
        """
        Initialize ES.

        Config keys:
            host:         ES connection string
            indexName:    name of the index
            deleteIndex:  whether to delete an existing index; defaults to false
            settings:     index settings (see the ES docs for the options)
            settingsFile: index settings as a JSON file (see the ES docs)
            mappings:     mappings configuration (see the ES docs)
            mappingsFile: mappings configuration as a JSON file (see the ES docs)
            idField:      id field, for data that already carries its own id

        Note: at most one of settings/settingsFile may be given,
        and likewise at most one of mappings/mappingsFile.
        """
        self.es = ElasticSearch(config['host'])
        if 'docType' not in config:
            config['docType'] = config['indexName']
        self.config = config

        if 'deleteIndex' in config and config['deleteIndex']:
            try:
                self.es.delete_index(config['indexName'])
                print('delete index ' + config['indexName'] + ' success!')
            except ElasticHttpNotFoundError:
                # if the index does not exist, just report it
                print('Index ' + config['indexName']
                      + ' not found, nothing to delete!')
            except Exception:
                raise Exception('Index ' + config['indexName'] + ' delete error!')

        try:
            if 'settings' in config:
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            elif 'settingsFile' in config:
                with open(config['settingsFile'], 'r') as f:
                    config['settings'] = json.loads(f.read())
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            else:
                self.es.create_index(config['indexName'])
            print('create index ' + config['indexName'] + ' success!')
        except Exception:
            raise Exception("create index " + config['indexName'] + ' error!')

    def _putMapping(self, row):
        """
        Set the ES mapping.
        A default configuration can be generated from row.
        """
        try:
            if 'mappingsFile' in self.config:
                with open(self.config['mappingsFile'], 'r') as f:
                    self.config['mappings'] = json.loads(f.read())
            if 'mappings' in self.config:
                self.es.put_mapping(self.config['indexName'],
                                    self.config['docType'],
                                    self.config['mappings'])
                print("put mapping " + self.config['indexName'] + ' success!')
        except Exception:
            raise Exception("put mapping " + self.config['indexName'] + ' error!')

    def read(self):
        pass

    def batchRead(self):
        pass

    def write(self, row):
        """Write a single record."""
        return self.batchWrite([row])

    def batchWrite(self, rows):
        """Write multiple records."""
        if not self.mapping_is_set:
            # set the mapping once, based on the first row
            self.mapping_is_set = True
            self._putMapping(rows[0])

        if 'idField' in self.config:
            docs = (self.es.index_op(doc, id=doc.pop(self.config['idField']))
                    for doc in rows)
        else:
            docs = (self.es.index_op(doc) for doc in rows)

        self.es.bulk(docs,
                     index=self.config['indexName'],
                     doc_type=self.config['docType'])
        return True
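# Usage sketch (assumption, not from the original source): the config keys below mirror
# the docstring above; the host, index name and rows are hypothetical.
writer = IbbdElasticSearch({
    'host': 'http://localhost:9200/',
    'indexName': 'ibbd_demo',
    'deleteIndex': True,   # drop and recreate the index
    'idField': 'id',       # rows carry their own id
})
writer.batchWrite([
    {'id': 1, 'title': 'first doc'},
    {'id': 2, 'title': 'second doc'},
])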
def __init__(self,start,**kwargs): """ Invoke a Downloader object to get data from the Record. It will check to see if the necessary files are already downloaded and use those instead of querying FDSys. Downloaders are the endpoint for raw data. Required arguments: start : In form 'YYYY-MM-DD.' This is the day/start day you want. Optional arguments: parse : Defaults to True. This tells the downloader whether you just want the raw files, or if you also want it to extract data from the HTML. (Default means yes, give me the data.) end : Same form as start. This is the end date. outpath : Output path RELATIVE TO the present working directory. Defaults to 'output' and works fine when you run it from the repo's root directory. do_mode : Specify what kind of data you want from the parser. If do_mode is not set, the downloader will do absolutely zilch. do_mode can take the following values: json : write json files in a /json directory for that day of the Record. es : Specify the URL and index of an ElasticSearch cluster with arguments es_url and index, and it will pass each file to that cluster for indexing. WARNING: This doesn't handle any mappings, and it doesn't check to see if records are already there, so it will overwrite old files in the same index WITHOUT versioning. also specify: es_url : ElasticSearch cluster url index : ElasticSearch cluster index yield : For each day of the Record the user specifies, the downloader acts like a generator, yielding that day's "crfile" dictionary. """ self.status = 'idle' logging.debug('Downloader object ready with params:') logging.debug(','.join(['='.join([key,value]) for key,value in list(kwargs.items())])) if 'outpath' in list(kwargs.keys()): outpath = kwargs['outpath'] else: outpath = 'output' if kwargs['do_mode'] == 'es': es = ElasticSearch(kwargs['es_url']) for chunk in bulk_chunks((es.index_op(crfile.crdoc,id=crfile.crdoc.pop('id')) for crfile in self.bulkdownload(start,**kwargs)), docs_per_chunk=100): es.bulk(chunk,index=kwargs['index'],doc_type='crdoc') elif kwargs['do_mode'] == 'json': # outpath called so often to make it easy to follow # the idea that we're traversing a directory tree for crfile in self.bulkdownload(start,**kwargs): filename = os.path.split(crfile.filepath)[-1].split('.')[0] + '.json' outpath = os.path.split(crfile.filepath)[0] outpath = os.path.split(outpath)[0] if 'json' not in os.listdir(outpath): os.mkdir(os.path.join(outpath,'json')) outpath = os.path.join(outpath,'json',filename) with open(outpath,'w') as out_json: json.dump(crfile.crdoc,out_json) elif kwargs['do_mode'] == 'yield': self.yielded = self.bulkdownload(start,parse=True,**kwargs) elif kwargs['do_mode'] == 'noparse': self.bulkdownload(start,parse=False,**kwargs) else: return None
def loadlocal(debug=False): """ Check for data offloaded to disk and retry sending if cluster(s) are now online """ # TODO: have each cluster checked on a unqiue cluster basis instead of on a per json basis. # TODO: give user control of where data and tmp folder is stored basedir = './cfg/tmp/' datadir = basedir + 'data/' checkmakedir(basedir) checkmakedir(datadir) sendto = [] dumpconfigs = [ basedir + file for file in os.listdir(basedir) if file.endswith('.json') ] if dumpconfigs: for configfile in dumpconfigs: try: cluster = {} with open(configfile, 'r') as f: cluster = json.load(f) name = cluster['name'] cluster['alive'] = False i = 1 # i is to be cast as a string but incremented as an integer since we're using a Dict/JSON while not cluster['alive']: if not 'http://' in cluster[str(i)].encode(): if debug: print('Cluster', name, 'was missing \'http://\'') cluster[str(i)] = 'http://' + cluster[str(i)] if not cluster[str(i)][-1] == '/': if debug: print('Cluster', name, 'was missing \'/\'') cluster[str(i)] += '/' try: if requests.get(cluster[str(i)]).status_code == 200: cluster['url'] = cluster[str(i)] cluster['data index'] = make_index( cluster['url'], cluster['data index']) if cluster['log errors to index']: make_index(cluster['url'], cluster['error index']) cluster['alive'] = True cluster['dumpconfigs'] = configfile sendto.append(cluster) if debug: print('\nDumped cluster', name, 'will send to', cluster[str(i)] + cluster['data index']) if cluster['log errors to index']: print( 'Errors will be sent to', cluster[str(i)] + cluster['error index']) else: if debug: print('\nDumped cluster', name, 'master', i, 'cannot be reached. Trying next...') i += 1 except Exception as e: if debug: print(e) print('Dumped cluster', name, 'master', i, 'cannot be reached. Trying next...') i += 1 except Exception as e: if debug: print(e) print( 'Dumped cluster', name, 'has zero master nodes to send to! Skipping for now.') for cluster in sendto: try: es = ElasticSearch(cluster['url']) for pickleid in cluster['pickle']: with open(datadir + pickleid, 'rb') as f: docpile = pickle.load(f) r = es.bulk((es.index_op(doc) for doc in docpile), index=cluster['data index'], doc_type=cluster['data index type']) for errpickleid in cluster['err_pickle']: with open(datadir + errpickleid, 'rb') as f: errdocpile = pickle.load(f) r = es.bulk((es.index_op(doc) for doc in errdocpile), index=cluster['error index'], doc_type=cluster['error index type']) if debug: if r['errors']: print(r['errors']) raise Exception os.remove(cluster['dumpconfigs']) except Exception as e: print(e) print("thought the cluster was up but it really isn't") _cleanupdump(debug)
     'age': 32,
     'title': '抽象tv Coder'},
    {'id': 2,
     'name': 'Jessica Coder',
     'age': 31,
     'title': 'Programmer'},
    {'id': 3,
     'name': 'Freddy Coder抽',
     'age': 29,
     'title': 'Office Assistant'}]

es.bulk((es.index_op(doc, id=doc.pop('id')) for doc in docs),
        index='test',
        doc_type='test')
es.refresh('test')

res1 = es.get('test', 'test', 1)

# Full-text match; note that Chinese and English are tokenized differently.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
res8 = es.search(index='test', size=2, query={"query": {
    "query_string": {
        "query": "抽"
    }
class ESWrapper(BaseDB): def __init__(self, index_name, host='http://localhost', port=9200): self.eserver = ElasticSearch(urls=host, port=port, timeout=60, max_retries=3) self._base_query = { "query": { "bool": { "must": { "match": { "name.raw": "" } } } } } self._geo_filter = { "geo_distance": { "distance": "20km", "coordinates": {} } } self._index = index_name self._doctype = "places" def query(self, qkey, qtype="exact"): """ qtype values are exact, relaxed or geo_distance """ q = self._base_query.copy() if qtype == "exact": q["query"]["bool"]["must"]["match"]["name.raw"] = qkey elif qtype == "relaxed": q["query"]["bool"]["must"]["match"]["name"] = qkey q["query"]["bool"]["must"]["match"].pop("name.raw") elif qtype == "geo_distance": q = { "query": { "bool": { "must": { "match_all": {} } }, "filter": { "geo_distance": { "distance": "20km", "coordinates": qkey } } } } return self.eserver.search(q, index=self._index, doc_type=self._doctype) def near_geo(self, geo_point): q = { "query": { "bool": { "must": { "match_all": {} } }, "filter": self._geo_filter } } q["query"]["bool"]["geo_distance"]["coordinates"] = geo_point return self.eserver.search(q, index=self._index, doc_type=self._doctype) def create(self, datacsv, confDir="../data/"): with open(os.path.join(confDir, "es_settings.json")) as jf: settings = json.load(jf) self.eserver.create_index(index='geonames', settings=settings) for chunk in bulk_chunks(self._opLoader(datacsv, confDir), docs_per_chunk=1000): self.eserver.bulk(chunk, index='geonames', doc_type='places') print "..", self.eserver.refresh('geonames') def _opLoader(self, datacsv, confDir): with DataReader(datacsv, os.path.join(confDir, 'geonames.conf')) as reader: cnt = 0 for row in reader: row['coordinates'] = [ float(row['longitude']), float(row['latitude']) ] del (row['latitude']) del (row['longitude']) row['alternatenames'] = row['alternatenames'].split(",") cnt += 1 #if cnt > 100: #break yield self.eserver.index_op(row, index="geonames", doc_type="places")
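# Usage sketch (assumption, not from the original source): building the geonames index
# from a CSV via the ESWrapper above and running an exact-name query; the file paths
# and query term are hypothetical.
db = ESWrapper(index_name='geonames')
db.create('allCountries.txt', confDir='../data/')
hits = db.query('Boston', qtype='exact')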
class ESWrapper(BaseDB):

    def __init__(self, index_name, doc_type, host='http://localhost', port=9200):
        self.eserver = ElasticSearch(urls=host, port=port, timeout=60, max_retries=3)
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(q, index=self._index,
                                   doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self, qkey, **kwargs):
        q = {"query": {"bool": {}}}
        query_name = "should"
        q["query"]["bool"]["minimum_number_should_match"] = 1
        kwargs.pop("qtype", "")

        # Drop stop words and trailing-dot tokens; if the leading token is a short
        # abbreviation, also try the name without it.
        placetokens = [l.strip() for l in tokenizer.split(qkey)
                       if l and l not in STOP_WORDS and l[-1] != '.']
        if placetokens:
            reduced_placename = u" ".join(placetokens[0:])
            if len(placetokens[0]) < 3 and len(placetokens) > 1 and 3.0 / len(placetokens) >= .5:
                reduced_placename = u" ".join(placetokens[1:])
        else:
            reduced_placename = qkey

        maincondition = [
            {"bool": {"must": [{"multi_match": {"query": qkey,
                                                "fields": ["name.raw^5", "asciiname^5", "alternatenames"],
                                                "operator": "and"}},
                               {"terms": {"featureClass": ["a", "p"]}}]}},
            {"term": {"name.raw": {"value": qkey}}},
            {"term": {"asciiname.raw": {"value": qkey}}},
            {"term": {"normalized_asciiname": {"value": qkey}}},
            {"term": {"alternatenames": {"value": qkey}}},
            {"multi_match": {"query": reduced_placename if 'fuzzy' in kwargs
                             else unicode(unidecode(reduced_placename)),
                             'fuzziness': kwargs.pop("fuzzy", 0),
                             "max_expansions": kwargs.pop("max_expansion", 10),
                             "prefix_length": kwargs.pop("prefix_length", 1),
                             'operator': kwargs.pop("operator", "and"),
                             "fields": ["name^3", "asciiname^3", "alternatenames",
                                        "normalized_asciiname^3"]}}
        ]

        q["query"]["bool"][query_name] = maincondition

        if kwargs:
            filter_cond = []
            if 'min_popln' in kwargs:
                popln = kwargs.pop("min_popln")
                if popln is not None:
                    filter_cond.append({"range": {"population": {"gte": popln}}})

            for key, val in kwargs.viewitems():
                if not isinstance(val, basestring):
                    filter_cond.append({"terms": {key: list(val)}})
                else:
                    filter_cond.append({"term": {key: val}})

            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}

        q['from'] = 0
        q['size'] = 50
        return self.eserver.search(q, index=self._index, doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        max_score = res['max_score']
        gps = []
        if max_score == 0.0:
            # Elasticsearch matched nothing relevant and only returned very
            # low-scoring hits; treat that as an empty result.
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                # A continent match dominates everything else.
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            # With a single candidate, score by name-length similarity instead.
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def _oldquery(self, qkey, qtype="exact", analyzer=None, min_popln=None, size=10, **kwargs):
        """
        qtype values are exact, relaxed or combined.
        Results are always limited to 10.
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{"term": {"name.raw": {"value": qkey}}},
                             {"term": {"asciiname.raw": {"value": qkey}}},
                             {"term": {"alternatenames": {"value": qkey}}}]
            if analyzer and isinstance(maincondition, dict):
                # Stale override from an earlier dict-shaped condition; guarded so
                # passing an analyzer with qtype="exact" no longer raises.
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

        elif qtype == "combined":
            maincondition = [
                {"bool": {"must": {"multi_match": {"query": qkey,
                                                   "fields": ["name.raw", "asciiname", "alternatenames"]}},
                          "filter": {"bool": {"should": [
                              {"range": {"population": {"gte": 5000}}},
                              {"terms": {"featureCode": ["pcla", "pcli", "cont", "rgn",
                                                         "admd", "adm1", "adm2"]}}]}}}},
                {"term": {"name.raw": {"value": qkey}}},
                {"term": {"asciiname.raw": {"value": qkey}}},
                {"term": {"alternatenames": {"value": qkey[1:]}}},
                {"match": {"alternatenames": {"query": qkey,
                                              'fuzziness': kwargs.pop("fuzzy", 0),
                                              "max_expansions": kwargs.pop("max_expansion", 5),
                                              "prefix_length": kwargs.pop("prefix_length", 1)}}}
            ]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

        if min_popln:
            filter_cond = [{"range": {"population": {"gte": min_popln}}}]
        else:
            filter_cond = []

        if kwargs:
            filter_cond += [{"term": {key: val}} for key, val in kwargs.viewitems()]
            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
        elif min_popln:
            filter_cond = [{"range": {"population": {"gte": min_popln}}},
                           {"terms": {"featureCode": ["ppla", "pplx"]}}]
            q["query"]["bool"]["filter"] = {"bool": {"should": filter_cond}}

        return self.eserver.search(q, index=self._index, doc_type=self._doctype)

    def oldquery(self, qkey, min_popln=None, **kwargs):
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        max_score = res['max_score']
        gps = []
        if max_score == 0.0:
            # Elasticsearch matched nothing relevant; treat as an empty result.
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {"query": {"bool": {"must": {"match_all": {}},
                                 "filter": [{"geo_distance": {"distance": "30km",
                                                              "coordinates": geo_point}},
                                            {"terms": {"featureClass": ["a", "h", "l", "t", "p", "v"]}}]}},
              "sort": {"population": "desc"}}
        if kwargs:
            for key in kwargs:
                q2['query']['bool']['filter'].append({"term": {key: kwargs[key]}})

        res = self.eserver.search(q2, index=self._index,
                                  doc_type=self._doctype)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop('places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            # Index already exists: recreate it from scratch.
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir), docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        ere = re.compile(r"[^\sa-zA-Z0-9]")
        with DataReader(datacsv, os.path.join(confDir, 'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [float(row['longitude']), float(row['latitude'])]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del row['latitude']
                    del row['longitude']
                    row['alternatenames'] = row['alternatenames'].lower().split(",")
                    row['normalized_asciiname'] = re.sub(r'\s+', r' ',
                                                         ere.sub("", row['asciiname'])).strip()
                    cnt += 1
                    yield self.eserver.index_op(row, index=self._index, doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue

    def remove_dynamic_stopwords(self, term):
        # Drop tokens that occur so often in the index that they act like stop words.
        words = [w for t in term.split("-") for w in t.split() if len(w) > 1]
        if len(words) == 1:
            return term

        stopword_removed = ""
        for word in words:
            try:
                if self.eserver.count(word)['count'] >= 20000:
                    continue
            except:
                pass

            stopword_removed += (word + " ")

        return stopword_removed.strip()
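Usage sketch (not part of the original snippets): the wrapper above is meant to sit on top of a GeoNames-style index and resolve free-text place names. The index name "geonames", doc type "places", the CSV filename, and the example place name below are assumptions for illustration only.

# Hedged usage sketch -- "geonames"/"places" and the file name are assumed,
# not taken from the class above.
db = ESWrapper(index_name="geonames", doc_type="places")

# One-time bulk load through create()/_opLoader(); commented out because it
# rebuilds the index from scratch.
# db.create("allCountries.txt", confDir="../data/")

# Resolve a free-text place name to scored GeoPoint candidates.
# Alternate names are lowercased at index time, so the query is lowercase too.
for pt in db.query(u"new delhi", fuzzy=1):
    print pt.name, pt._score   # _score is each hit's score divided by the top score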
class ESWrapper(BaseDB):

    def __init__(self, index_name, doc_type, host='http://localhost', port=9200):
        self.eserver = ElasticSearch(urls=host, port=port, timeout=60, max_retries=3)
        self._geo_filter = {"distance": "20km", "coordinates": {}}
        self._population_filter = {'population': {'gte': 5000}}
        self._index = index_name
        self._doctype = doc_type

    def getByid(self, geonameId):
        maincondition = {"match": {"id": geonameId}}
        q = {"query": {"bool": {"must": maincondition}}}
        return self.eserver.search(q, index=self._index,
                                   doc_type=self._doctype)['hits']['hits'][0]['_source']

    def _query(self, qkey, qtype="exact", analyzer=None, min_popln=None, size=10, **kwargs):
        """
        qtype values are exact, relaxed or combined.
        Results are always limited to 10.
        """
        q = {"query": {"bool": {}}}
        query_name = kwargs.pop('query_name', 'must')
        query_name = "should"
        if query_name == "should":
            q["query"]["bool"]["minimum_number_should_match"] = 1

        maincondition = {}
        if qtype == "exact":
            maincondition = [{"term": {"name.raw": {"value": qkey}}},
                             {"term": {"asciiname.raw": {"value": qkey}}},
                             {"term": {"alternatenames": {"value": qkey}}}]
            if analyzer and isinstance(maincondition, dict):
                # Stale override from an earlier dict-shaped condition; guarded so
                # passing an analyzer with qtype="exact" no longer raises.
                maincondition["match"]["name.raw"]["analyzer"] = analyzer

        elif qtype == "relaxed":
            maincondition["match"] = {"alternatenames": {"query": qkey}}
            if analyzer:
                maincondition["match"]["alternatenames"]["analyzer"] = analyzer

        elif qtype == "combined":
            maincondition = [
                {"bool": {"must": {"multi_match": {"query": qkey,
                                                   "fields": ["name.raw", "asciiname", "alternatenames"]}},
                          "filter": {"bool": {"should": [
                              {"range": {"population": {"gte": 5000}}},
                              {"terms": {"featureCode": ["pcla", "pcli", "cont", "rgn",
                                                         "admd", "adm1", "adm2"]}}]}}}},
                {"term": {"name.raw": {"value": qkey}}},
                {"term": {"asciiname.raw": {"value": qkey}}},
                {"term": {"alternatenames": {"value": qkey[1:]}}},
                {"match": {"alternatenames": {"query": qkey,
                                              'fuzziness': kwargs.pop("fuzzy", 0),
                                              "max_expansions": kwargs.pop("max_expansion", 5),
                                              "prefix_length": kwargs.pop("prefix_length", 1)}}}
            ]

        if maincondition:
            q["query"]["bool"][query_name] = maincondition

        if min_popln:
            filter_cond = [{"range": {"population": {"gte": min_popln}}}]
        else:
            filter_cond = []

        if kwargs:
            filter_cond += [{"term": {key: val}} for key, val in kwargs.viewitems()]
            q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}}
        elif min_popln:
            filter_cond = [{"range": {"population": {"gte": min_popln}}},
                           {"terms": {"featureCode": ["ppla", "pplx"]}}]
            q["query"]["bool"]["filter"] = {"bool": {"should": filter_cond}}

        return self.eserver.search(q, index=self._index, doc_type=self._doctype)

    def query(self, qkey, min_popln=None, **kwargs):
        res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']
        max_score = res['max_score']
        gps = []
        if max_score == 0.0:
            # Elasticsearch matched nothing relevant; treat as an empty result.
            res['hits'] = []

        for t in res['hits']:
            t['_source']['geonameid'] = t["_source"]["id"]
            t['_source']['_score'] = t['_score'] / max_score
            pt = GeoPoint(**t["_source"])
            if t['_source']['featureCode'].lower() == "cont":
                gps = [pt]
                break

            gps.append(pt)

        if len(gps) == 1:
            gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) /
                             max(float(len(gps[0].name)), float(len(qkey))))

        return gps

    def near_geo(self, geo_point, min_popln=5000, **kwargs):
        q2 = {"query": {"bool": {"must": {"match_all": {}},
                                 "filter": [{"geo_distance": {"distance": "30km",
                                                              "coordinates": geo_point}},
                                            {"terms": {"featureCode": ["pcli", "ppl", "ppla2", "adm3"]}}]}},
              "sort": {"population": "desc"}}
        res = self.eserver.search(q2, index=self._index, doc_type=self._doctype,
                                  **kwargs)['hits']['hits'][0]['_source']
        res['confidence'] = 1.0
        return [GeoPoint(**res)]

    def create(self, datacsv, confDir="../data/"):
        with open(os.path.join(confDir, "es_settings.json")) as jf:
            settings = json.load(jf)
            settings['mappings'][self._doctype] = settings['mappings'].pop('places')

        try:
            self.eserver.create_index(index=self._index, settings=settings)
        except:
            # Index already exists: recreate it from scratch.
            self.eserver.delete_index(self._index)
            self.eserver.create_index(index=self._index, settings=settings)

        for chunk in bulk_chunks(self._opLoader(datacsv, confDir), docs_per_chunk=1000):
            self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype)
            print "..",

        self.eserver.refresh(self._index)

    def _opLoader(self, datacsv, confDir):
        with DataReader(datacsv, os.path.join(confDir, 'geonames.conf')) as reader:
            cnt = 0
            for row in reader:
                try:
                    row['coordinates'] = [float(row['longitude']), float(row['latitude'])]
                    try:
                        row['population'] = int(row["population"])
                    except:
                        row['population'] = -1

                    try:
                        row['elevation'] = int(row['elevation'])
                    except:
                        row['elevation'] = -1

                    del row['latitude']
                    del row['longitude']
                    row['alternatenames'] = row['alternatenames'].split(",")
                    cnt += 1
                    yield self.eserver.index_op(row, index=self._index, doc_type=self._doctype)
                except:
                    print json.dumps(row)
                    continue
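For the reverse direction, a minimal sketch of coordinate-based lookup through near_geo(); the [lon, lat] pair and the "geonames"/"places" names are illustrative assumptions, not values from the class above.

# Hedged reverse-lookup sketch: coordinates -> most populous place within 30 km.
db = ESWrapper(index_name="geonames", doc_type="places")
nearest = db.near_geo([77.2090, 28.6139])   # _opLoader stores coordinates as [lon, lat]
print nearest[0].name                       # near_geo returns a single GeoPoint in a list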