    def handle_command(self, doc, namespace, timestamp):
        # Flush the buffer before handling the command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(),
                    doc_type=coll,
                    body={"_source": {
                        "enabled": True
                    }})

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn("Deleting all documents of type %s on index %s."
                              "The mapping definition will persist and must be"
                              "removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type="delete") for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r" % resp)
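    # Illustrative sketch (not part of the original source): examples of the
    # oplog command documents this handler inspects. "dm", the database name
    # "blog", and the collection name "posts" are assumptions; the final
    # index/type names depend on how command_helper maps them.
    #
    #   dm.handle_command({"dropDatabase": 1}, "blog.$cmd", ts)  # delete mapped indices
    #   dm.handle_command({"create": "posts"}, "blog.$cmd", ts)  # put mapping for blog/posts
    #   dm.handle_command({"drop": "posts"}, "blog.$cmd", ts)    # bulk-delete docs of that type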
    def handle_command(self, doc, namespace, timestamp):
        db, _ = namespace.split('.', 1)
        if doc.get('dropDatabase'):
            for new_db in self.command_helper.map_db(db):
                self.solr.delete(q="ns:%s.*" % new_db,
                                 commit=(self.auto_commit_interval == 0))

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "solr_doc_manager does not support replication of "
                "renameCollection")

        if doc.get('create'):
            # nothing to do
            pass

        if doc.get('drop'):
            new_db, coll = self.command_helper.map_collection(db, doc['drop'])
            if new_db:
                self.solr.delete(q="ns:%s.%s" % (new_db, coll),
                                 commit=(self.auto_commit_interval == 0))
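    # Illustrative sketch (not from the original source) of the delete-by-query
    # strings issued above, assuming a mapped database "blog" and a collection
    # "posts":
    #
    #   dropDatabase on "blog"  -> q="ns:blog.*"
    #   drop "posts" in "blog"  -> q="ns:blog.posts"
    #
    # Both rely on the "ns" metadata field that _clean_doc stamps on every
    # document before it is sent to Solr.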
    def get_last_doc(self):
        """Returns the last document stored in the Elastic engine.
        """
        try:
            result = self.elastic.search(index="_all",
                                         body={
                                             "query": {
                                                 "match_all": {}
                                             },
                                             "sort": [{
                                                 "_ts": "desc"
                                             }]
                                         },
                                         size=1)["hits"]["hits"]
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed(
                "Could not retrieve last document from Elastic Search")

        return result[0]["_source"] if len(result) > 0 else None
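    # Usage sketch (dm is a hypothetical doc-manager instance, not part of the
    # original source): the caller can read the '_ts' field of the returned
    # document to decide where to resume replication.
    #
    #   last = dm.get_last_doc()
    #   resume_ts = last["_ts"] if last else None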
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Elastic

        docs may be any iterable
        """
        def docs_to_upsert():
            doc = None
            for doc in docs:
                index = doc["ns"]
                doc[self.unique_key] = str(doc[self.unique_key])
                doc_id = doc[self.unique_key]
                yield {
                    "index": {
                        "_index": index,
                        "_type": self.doc_type,
                        "_id": doc_id
                    }
                }
                yield doc
            if not doc:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            self.elastic.bulk(doc_type=self.doc_type,
                              body=docs_to_upsert(),
                              refresh=(self.auto_commit_interval == 0))
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Elastic")
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up: there is no
            # config file yet and nothing to dump.
            pass
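    # Illustrative sketch (field names and values assumed, not from the
    # original source) of the action/source pairs docs_to_upsert() yields,
    # which is the body format the Elasticsearch bulk API expects:
    #
    #   {"index": {"_index": "blog.posts", "_type": "<doc_type>", "_id": "5f1c..."}}
    #   {"_id": "5f1c...", "ns": "blog.posts", "title": "hello", "_ts": 6304112...}
    #
    # Each document is immediately preceded by the "index" action describing
    # where it should be written.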
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results"""
        try:
            first_response = self.elastic.search(*args,
                                                 search_type="scan",
                                                 scroll="10m",
                                                 size=100,
                                                 **kwargs)
            scroll_id = first_response.get("_scroll_id")
            expected_count = first_response.get("hits", {}).get("total", 0)
            results_returned = 0
            while results_returned < expected_count:
                next_response = self.elastic.scroll(scroll_id=scroll_id,
                                                    scroll="10m")
                results_returned += len(next_response["hits"]["hits"])
                for doc in next_response["hits"]["hits"]:
                    yield doc["_source"]
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed(
                "Could not retrieve documents from Elastic Search")
    def upsert(self, doc):
        """Update or insert a document into Elastic

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])

        """
        doc_type = self.doc_type
        index = doc['ns']
        doc[self.unique_key] = str(doc["_id"])
        doc_id = doc[self.unique_key]
        try:
            self.elastic.index(index=index,
                               doc_type=doc_type,
                               body=bsjson.dumps(doc),
                               id=doc_id,
                               refresh=(self.auto_commit_interval == 0))
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed("Could not index document: %s" %
                                         (bsjson.dumps(doc)))
    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in order
            to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        logging.debug(doc)

        doc['ns'] = namespace
        doc['_ts'] = timestamp

        if 'type' in doc and not math.isnan(doc['type']):
            doc['type'] = int(doc['type'])
        else:
            doc['type'] = 0

        if 'popularity' in doc and not math.isnan(doc['popularity']):
            doc['popularity'] = float(doc['popularity'])
        else:
            doc['popularity'] = 0

        if 'usefulness' in doc and not math.isnan(doc['usefulness']):
            doc['usefulness'] = float(doc['usefulness'])
        else:
            doc['usefulness'] = 0

        if 'wroteAtPlan' in doc:
            wroteAtPlan = doc['wroteAtPlan']
            if "_id" in wroteAtPlan:
                doc['wroteAtPlan'] = u(wroteAtPlan["_id"])
                if 'noteInfo' in wroteAtPlan:
                    noteInfo = wroteAtPlan["noteInfo"]
                    if 'day' in noteInfo and not math.isnan(noteInfo['day']):
                        doc['wroteAtPlanDay'] = int(noteInfo["day"])
                    if 'time' in noteInfo and not math.isnan(noteInfo['time']):
                        doc['wroteAtPlanTime'] = int(noteInfo["time"])

        contentStr = u("")

        if 'content' in doc:
            noteContent = doc['content']
            if isinstance(noteContent, list):
                for content in noteContent:
                    if 'type' in content:
                        if content['type'] == "html":
                            contentStr += content['data']
                doc["content"] = contentStr

        if 'location' in doc:
            location = doc['location']
            if 'cc' in location:
                doc['cc'] = location['cc']
            if 'loc' in location:
                loc = location['loc']
                if 'coordinates' in loc:
                    coordinates = loc['coordinates']
                    doc['loc'] = str(coordinates[1]) + ',' + str(
                        coordinates[0])
            if 'address' in location:
                doc['address'] = location['address']

        if 'locationWrote' in doc:
            locationWrote = doc['locationWrote']
            if 'loc' in locationWrote:
                loc = locationWrote['loc']
                if 'coordinates' in loc:
                    coordinates = loc['coordinates']
                    doc['locWrote'] = str(coordinates[1]) + ',' + str(
                        coordinates[0])

        if 'locWrote' in doc:
            if isinstance(doc['locWrote'], list):
                doc['locWrote'] = doc['locWrote'][0]

        if 'loc' in doc:
            if isinstance(doc['loc'], list):
                doc['loc'] = doc['loc'][0]

        # Solr cannot index fields within sub-documents, so the document would
        # normally be flattened with the dot-separated path to each value as
        # the respective key. Flattening is disabled here, and the document is
        # passed through as-is:
        # flat_doc = self._formatter.format_document(doc)

        flat_doc = doc

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:

            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)

            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc
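    # Illustrative input/output for the location handling above (field values
    # are invented). MongoDB's GeoJSON points store coordinates as
    # [longitude, latitude], while Solr's LatLonType expects "lat,lon", which
    # is why the indices are swapped when building doc['loc']:
    #
    #   in:  {"location": {"cc": "KR",
    #                      "loc": {"coordinates": [126.97, 37.56]},
    #                      "address": "Seoul"}}
    #   out: doc["cc"] == "KR"
    #        doc["loc"] == "37.56,126.97"
    #        doc["address"] == "Seoul"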
    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in order
            to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        doc['ns'] = namespace
        doc['_ts'] = timestamp

        # Flatten the document up front.
        doc = self._formatter.format_document(doc)
        # Cap the length of string values in "tag*" fields.
        for k, v in doc.items():
            if (k[0:3] == "tag" and v and isinstance(v, basestring)):
                doc[k] = v[0:9000]

        # Get the MongoDB collection name from the namespace.
        collection_name = self._get_collection_name(namespace)

        # Handle documents from the user-activity collection.
        if collection_name == "b_dynamic":
            logging.info("processing doc from b_dynamic, _id is %s" %
                         str(doc[self.unique_key]))
            return self._parse_user_dynamic_collection(doc)

        # Handle documents from the user collection.
        if collection_name == "T_USER":
            logging.info("processing doc from T_USER, _id is %s" %
                         str(doc[self.unique_key]))
            return self._parse_t_user_collection(doc)

        # Process the content data.
        logging.info("processing doc from b_content, _id is %s" %
                     str(doc[self.unique_key]))
        doctemp = self._parse_content_doc(doc)

        if doctemp is None:
            logging.info("not sending doc to Solr, the doc is %s" % str(doc))
            return None

        if isinstance(doctemp, list) and len(doctemp) == 0:
            logging.info("not sending doc to Solr, the doc is %s" % str(doc))
            return None

        if isinstance(doctemp, list) and len(doctemp) > 1:
            logging.info(
                "doc from b_content parsed into a list, _id is %s" %
                str(doc[self.unique_key]))
            flat_doc = []
            for docvalue in doctemp:
                flat_doc.append(self._parse_doc_to_solr_doc(docvalue))
            return flat_doc

        if isinstance(doctemp, list):
            logging.info(
                "doc from b_content parsed into a one-element list, _id is %s" %
                str(doc[self.unique_key]))
            return self._parse_doc_to_solr_doc(doctemp[0])

        logging.info(
            "doc from b_content parsed into a single object, _id is %s" %
            str(doc[self.unique_key]))
        return self._parse_doc_to_solr_doc(doctemp)
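    # Routing summary (illustrative restatement of the branches above, not
    # additional behavior): documents are dispatched by their source
    # collection before being converted into Solr documents.
    #
    #   "b_dynamic" -> self._parse_user_dynamic_collection(doc)
    #   "T_USER"    -> self._parse_t_user_collection(doc)
    #   otherwise   -> self._parse_content_doc(doc), then each result is run
    #                  through self._parse_doc_to_solr_doc(...)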