def handle_command(self, doc, namespace, timestamp):
    # Flush the buffer before handling the command.
    self.commit()
    db = namespace.split(".", 1)[0]

    if doc.get("dropDatabase"):
        dbs = self.command_helper.map_db(db)
        for _db in dbs:
            self.elastic.indices.delete(index=_db.lower())

    if doc.get("renameCollection"):
        raise errors.OperationFailed(
            "elastic_doc_manager does not support renaming a mapping.")

    if doc.get("create"):
        db, coll = self.command_helper.map_collection(db, doc["create"])
        if db and coll:
            self.elastic.indices.put_mapping(
                index=db.lower(), doc_type=coll,
                body={"_source": {"enabled": True}})

    if doc.get("drop"):
        db, coll = self.command_helper.map_collection(db, doc["drop"])
        if db and coll:
            # This deletes the documents in coll, but does not remove the
            # mapping itself.
            warnings.warn(
                "Deleting all documents of type %s on index %s. "
                "The mapping definition will persist and must be "
                "removed manually." % (coll, db))
            responses = streaming_bulk(
                self.elastic,
                (dict(result, _op_type="delete") for result in scan(
                    self.elastic, index=db.lower(), doc_type=coll)))
            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Error occurred while deleting ElasticSearch "
                        "document during handling of 'drop' command: %r"
                        % resp)
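# Hedged usage sketch for handle_command above: mongo-connector feeds it
# command documents taken from oplog entries on a database's "$cmd"
# namespace. The database/collection names below ("blog", "posts") and
# the manager instance "dm" are hypothetical.
#
#   dm.handle_command({"create": "posts"}, "blog.$cmd", timestamp)
#   dm.handle_command({"drop": "posts"}, "blog.$cmd", timestamp)
#   dm.handle_command({"dropDatabase": 1}, "blog.$cmd", timestamp)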
def handle_command(self, doc, namespace, timestamp):
    db, _ = namespace.split('.', 1)

    if doc.get('dropDatabase'):
        for new_db in self.command_helper.map_db(db):
            self.solr.delete(q="ns:%s.*" % new_db,
                             commit=(self.auto_commit_interval == 0))

    if doc.get('renameCollection'):
        raise errors.OperationFailed(
            "solr_doc_manager does not support replication of "
            "renameCollection")

    if doc.get('create'):
        # Solr needs no explicit action before accepting documents for a
        # new collection; nothing to do.
        pass

    if doc.get('drop'):
        new_db, coll = self.command_helper.map_collection(db, doc['drop'])
        if new_db:
            self.solr.delete(q="ns:%s.%s" % (new_db, coll),
                             commit=(self.auto_commit_interval == 0))
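# For reference, a hedged sketch of what the dropDatabase branch above
# issues against Solr: every indexed document carries an "ns" field (set
# in _clean_doc), so dropping a whole database maps to a wildcard
# delete-by-query (the database name "blog" is hypothetical).
#
#   self.solr.delete(q="ns:blog.*", commit=True)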
def get_last_doc(self):
    """Returns the last document stored in the Elastic engine."""
    try:
        result = self.elastic.search(
            index="_all",
            body={
                "query": {"match_all": {}},
                "sort": [{"_ts": "desc"}],
            },
            size=1)["hits"]["hits"]
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed(
            "Could not retrieve last document from Elastic Search")
    return result[0]["_source"] if len(result) > 0 else None
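# Hedged usage sketch: mongo-connector consults get_last_doc() when
# handling rollbacks, relying on the "_ts" sort above to find the newest
# indexed document. "dm" is a hypothetical configured doc manager.
#
#   last = dm.get_last_doc()
#   if last is not None:
#       print(last["ns"], last["_ts"])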
def bulk_upsert(self, docs):
    """Update or insert multiple documents into Elastic.

    docs may be any iterable.
    """
    def docs_to_upsert():
        doc = None
        for doc in docs:
            index = doc["ns"]
            doc[self.unique_key] = str(doc[self.unique_key])
            doc_id = doc[self.unique_key]
            # Emit the bulk-API action line, then the document itself.
            yield {"index": {"_index": index,
                             "_type": self.doc_type,
                             "_id": doc_id}}
            yield doc
        if doc is None:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search")
    try:
        self.elastic.bulk(doc_type=self.doc_type,
                          body=docs_to_upsert(),
                          refresh=(self.auto_commit_interval == 0))
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed(
            "Could not bulk-insert documents into Elastic")
    except errors.EmptyDocsError:
        # This can happen when mongo-connector starts up: there is no
        # config file, and there is nothing to dump.
        pass
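# For reference: docs_to_upsert() above emits the Elasticsearch bulk-API
# format, an action line followed by the document source, e.g. (all
# values hypothetical):
#
#   {"index": {"_index": "test.posts", "_type": "string", "_id": "42"}}
#   {"_id": "42", "ns": "test.posts", "title": "hello", "_ts": 6466222080}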
def _stream_search(self, *args, **kwargs):
    """Helper method for iterating over ES search results."""
    try:
        first_response = self.elastic.search(
            *args, search_type="scan", scroll="10m", size=100, **kwargs)
        scroll_id = first_response.get("_scroll_id")
        expected_count = first_response.get("hits", {}).get("total", 0)
        results_returned = 0
        while results_returned < expected_count:
            next_response = self.elastic.scroll(scroll_id=scroll_id,
                                                scroll="10m")
            hits = next_response["hits"]["hits"]
            if not hits:
                # Guard against an infinite loop if the scroll returns
                # fewer documents than the reported total.
                break
            results_returned += len(hits)
            for doc in hits:
                yield doc["_source"]
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed(
            "Could not retrieve documents from Elastic Search")
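# Hedged usage sketch: _stream_search is a generator, so callers can
# lazily iterate every matching document (the index name is
# hypothetical):
#
#   for source in self._stream_search(
#           index="test.posts",
#           body={"query": {"match_all": {}}}):
#       handle(source)
#
# Note that search_type="scan" was deprecated in Elasticsearch 2.1 and
# later removed; against newer clusters the equivalent would be
# elasticsearch.helpers.scan.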
def upsert(self, doc):
    """Update or insert a document into Elastic.

    If you'd like to have different types of document in your database,
    you can store the doc type as a field in Mongo and set doc_type to
    that field, e.g. doc_type = doc['_type'].
    """
    doc_type = self.doc_type
    index = doc['ns']
    doc[self.unique_key] = str(doc["_id"])
    doc_id = doc[self.unique_key]
    try:
        self.elastic.index(index=index, doc_type=doc_type,
                           body=bsjson.dumps(doc), id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
    except es_exceptions.ConnectionError:
        raise errors.ConnectionFailed(
            "Could not connect to Elastic Search")
    except es_exceptions.TransportError:
        raise errors.OperationFailed(
            "Could not index document: %s" % bsjson.dumps(doc))
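# A minimal sketch of the per-document type idea from the docstring,
# assuming documents carry a hypothetical "_type" field; the subclass
# name is illustrative, not part of the manager.
#
#   class PerTypeDocManager(DocManager):
#       def upsert(self, doc):
#           self.doc_type = doc.get("_type", self.doc_type)
#           super(PerTypeDocManager, self).upsert(doc)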
def _clean_doc(self, doc, namespace, timestamp):
    """Reformats the given document before insertion into Solr.

    This method reformats the document in the following ways:
        - removes extraneous fields that aren't defined in schema.xml
        - unwinds arrays in order to find and later flatten sub-documents
        - flattens the document so that there are no sub-documents, and
          every value is associated with its dot-separated path of keys
        - inserts namespace and timestamp metadata into the document in
          order to handle rollbacks

    An example:
        {"a": 2, "b": {"c": {"d": 5}}, "e": [6, 7, 8]}
    becomes:
        {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}
    """
    # Translate the _id field to whatever unique key we're using.
    # _id may not exist in the doc, if we retrieved it from Solr
    # as part of an update.
    if '_id' in doc:
        doc[self.unique_key] = u(doc.pop("_id"))

    # Update namespace and timestamp metadata.
    if 'ns' in doc or '_ts' in doc:
        raise errors.OperationFailed(
            'Need to set "ns" and "_ts" fields, but these fields already '
            'exist in the document %r!' % doc)
    logging.debug(doc)
    doc['ns'] = namespace
    doc['_ts'] = timestamp

    # Coerce numeric metadata fields, defaulting NaN or missing values.
    if 'type' in doc and not math.isnan(doc['type']):
        doc['type'] = int(doc['type'])
    else:
        doc['type'] = 0
    if 'popularity' in doc and not math.isnan(doc['popularity']):
        doc['popularity'] = float(doc['popularity'])
    else:
        doc['popularity'] = 0
    if 'usefulness' in doc and not math.isnan(doc['usefulness']):
        doc['usefulness'] = float(doc['usefulness'])
    else:
        doc['usefulness'] = 0

    # Replace the wroteAtPlan sub-document with its id, hoisting the
    # day/time fields out of its noteInfo sub-document.
    if 'wroteAtPlan' in doc:
        wroteAtPlan = doc['wroteAtPlan']
        if '_id' in wroteAtPlan:
            doc['wroteAtPlan'] = u(wroteAtPlan["_id"])
        if 'noteInfo' in wroteAtPlan:
            noteInfo = wroteAtPlan["noteInfo"]
            if 'day' in noteInfo and not math.isnan(noteInfo['day']):
                doc['wroteAtPlanDay'] = int(noteInfo["day"])
            if 'time' in noteInfo and not math.isnan(noteInfo['time']):
                doc['wroteAtPlanTime'] = int(noteInfo["time"])

    # Concatenate all HTML fragments in 'content' into a single string.
    contentStr = u("")
    if 'content' in doc:
        noteContent = doc['content']
        if isinstance(noteContent, list):
            for content in noteContent:
                if content.get('type') == "html":
                    contentStr += content['data']
        doc["content"] = contentStr

    # Flatten location sub-documents into "lat,lon" strings for Solr.
    if 'location' in doc:
        location = doc['location']
        if 'cc' in location:
            doc['cc'] = location['cc']
        if 'loc' in location:
            loc = location['loc']
            if 'coordinates' in loc:
                coordinates = loc['coordinates']
                doc['loc'] = str(coordinates[1]) + ',' + str(coordinates[0])
        if 'address' in location:
            doc['address'] = location['address']
    if 'locationWrote' in doc:
        locationWrote = doc['locationWrote']
        if 'loc' in locationWrote:
            loc = locationWrote['loc']
            if 'coordinates' in loc:
                coordinates = loc['coordinates']
                doc['locWrote'] = str(coordinates[1]) + ',' + str(coordinates[0])
    if 'locWrote' in doc and isinstance(doc['locWrote'], list):
        doc['locWrote'] = doc['locWrote'][0]
    if 'loc' in doc and isinstance(doc['loc'], list):
        doc['loc'] = doc['loc'][0]

    # Solr cannot index fields within sub-documents, so flatten documents
    # with the dot-separated path to each value as the respective key.
    # Flattening via self._formatter is currently disabled; the document
    # is passed through as-is.
    # flat_doc = self._formatter.format_document(doc)
    flat_doc = doc

    # Only include fields that are explicitly provided in the schema or
    # match one of the dynamic field patterns, if we were able to
    # retrieve the schema.
    if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
        def include_field(field):
            return field in self.field_list or any(
                regex.match(field) for regex in self._dynamic_field_regexes)
        return dict(
            (k, v) for k, v in flat_doc.items() if include_field(k))
    return flat_doc
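# Worked example of the location handling above (values hypothetical):
# given
#   {"location": {"cc": "KR",
#                 "loc": {"type": "Point", "coordinates": [127.0, 37.5]},
#                 "address": "Seoul"}}
# the document gains the flat fields
#   {"cc": "KR", "loc": "37.5,127.0", "address": "Seoul"}
# i.e. the coordinates are swapped from GeoJSON's [longitude, latitude]
# order into the "lat,lon" string that Solr's spatial field types expect.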
def _clean_doc(self, doc, namespace, timestamp):
    """Reformats the given document before insertion into Solr.

    This method reformats the document in the following ways:
        - removes extraneous fields that aren't defined in schema.xml
        - unwinds arrays in order to find and later flatten sub-documents
        - flattens the document so that there are no sub-documents, and
          every value is associated with its dot-separated path of keys
        - inserts namespace and timestamp metadata into the document in
          order to handle rollbacks

    An example:
        {"a": 2, "b": {"c": {"d": 5}}, "e": [6, 7, 8]}
    becomes:
        {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}
    """
    # Translate the _id field to whatever unique key we're using.
    # _id may not exist in the doc, if we retrieved it from Solr
    # as part of an update.
    if '_id' in doc:
        doc[self.unique_key] = u(doc.pop("_id"))

    # Update namespace and timestamp metadata.
    if 'ns' in doc or '_ts' in doc:
        raise errors.OperationFailed(
            'Need to set "ns" and "_ts" fields, but these fields already '
            'exist in the document %r!' % doc)
    doc['ns'] = namespace
    doc['_ts'] = timestamp

    # Flatten the document up front.
    doc = self._formatter.format_document(doc)

    # Cap the length of tag* fields in the document (basestring: this
    # module targets Python 2).
    for k, v in doc.items():
        if k[0:3] == "tag" and v and isinstance(v, basestring):
            doc[k] = v[0:9000]

    # Get the MongoDB collection name.
    collection_name = self._get_collection_name(namespace)

    # Handle documents from the user-activity collection.
    if collection_name == "b_dynamic":
        logging.info("Processing doc from b_dynamic; the doc is %s"
                     % str(doc[self.unique_key]))
        return self._parse_user_dynamic_collection(doc)

    # Handle documents from the user collection.
    if collection_name == "T_USER":
        logging.info("Processing doc from T_USER; the doc is %s"
                     % str(doc[self.unique_key]))
        return self._parse_t_user_collection(doc)

    # Handle content documents.
    logging.info("Processing doc from b_content; the doc is %s"
                 % str(doc[self.unique_key]))
    doctemp = self._parse_content_doc(doc)
    if doctemp is None:
        logging.info("Not sending doc to Solr; the doc is %s" % str(doc))
        return None
    if isinstance(doctemp, list) and len(doctemp) == 0:
        logging.info("Not sending doc to Solr; the doc is %s" % str(doc))
        return None
    if isinstance(doctemp, list) and len(doctemp) > 1:
        logging.info("Parsed b_content doc is a list; the doc is %s"
                     % str(doc[self.unique_key]))
        flat_doc = []
        for docvalue in doctemp:
            flat_doc.append(self._parse_doc_to_solr_doc(docvalue))
        return flat_doc
    if isinstance(doctemp, list):
        logging.info("Parsed b_content doc is a one-value list; "
                     "the doc is %s" % str(doc[self.unique_key]))
        return self._parse_doc_to_solr_doc(doctemp[0])
    logging.info("Parsed b_content doc is an object; the doc is %s"
                 % str(doc[self.unique_key]))
    return self._parse_doc_to_solr_doc(doctemp)
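# Worked example of the tag* cap above (sample key and value
# hypothetical): after flattening, a key such as "tags.0" begins with
# "tag", so an oversized string value is truncated before indexing.
#
#   doc = {"tags.0": "x" * 20000}
#   # after the loop: len(doc["tags.0"]) == 9000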