Example #1
 def test_serializes_numpy_ndarray(self):
     self.assertEqual(
         '{"d":[0,0,0,0,0]}',
         JSONSerializer().dumps({"d": np.zeros((5, ), dtype=np.uint8)}),
     )
     # This isn't useful for Elasticsearch, just want to make sure it works.
     self.assertEqual(
         '{"d":[[0,0],[0,0]]}',
         JSONSerializer().dumps({"d": np.zeros((2, 2), dtype=np.uint8)}),
     )
Example #2
    def test_serializes_pandas_category(self):
        cat = pd.Categorical(["a", "c", "b", "a"], categories=["a", "b", "c"])
        self.assertEqual(
            '{"d":["a","c","b","a"]}', JSONSerializer().dumps({"d": cat}),
        )

        cat = pd.Categorical([1, 2, 3], categories=[1, 2, 3])
        self.assertEqual(
            '{"d":[1,2,3]}', JSONSerializer().dumps({"d": cat}),
        )
Example #3
    def test_class_serializer(self):
        class A(object):
            def to_serializable(self):
                return {"a": "b"}

        a = A()
        self.assertEquals('{"a": "b"}', JSONSerializer().dumps(a))
Example #4
 def test_chunks_are_chopped_by_chunk_size(self):
     self.assertEqual(
         10,
         len(
             list(
                 helpers._chunk_actions(self.actions, 10, 99999999,
                                        JSONSerializer()))))
Example #5
 def test_chunks_are_chopped_by_byte_size(self):
     self.assertEqual(
         100,
         len(
             list(helpers._chunk_actions(self.actions, 100000, 1, JSONSerializer()))
         ),
     )
Example #6
def main(args):
    parser = argparse.ArgumentParser(
        usage='%(prog)s [options] command\n\nVersion\n  %(prog)s version '
        + str(__version__))
    parser.add_argument('--version', action='version',
        version='%(prog)s ' + str(__version__))
    parser.add_argument('lognames', metavar='lognames', nargs='+',
        help="log files to parse")

    options = parser.parse_args(args)
    if len(options.lognames) < 1:
        sys.stderr.write("error: not enough arguments")
        parser.print_help()
        return 1

    serializer = JSONSerializer()
    for logname in options.lognames:
        with open(logname) as logfile:
            for event in generate_events(logfile, logname):
                # The elasticsearch serializer does have a dumps method, but
                # we don't use it because it turns off json.dumps' ensure_ascii.
                # We want to enforce ASCII because it isn't actually specified
                # what encoding the log file is in, and we were also getting
                # invalid utf-8 sequences.
                sys.stdout.write(json.dumps(event, default=serializer.default))
                sys.stdout.write('\n')
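The comment above draws a real distinction. A minimal sketch of it, assuming elasticsearch-py's JSONSerializer and the standard-library json module (the sample event dict is hypothetical): serializer.dumps() disables ensure_ascii and leaves non-ASCII characters untouched, while json.dumps(..., default=serializer.default) keeps the ASCII escaping but still routes datetimes and other extended types through the serializer's default() hook.

import json
from datetime import datetime

from elasticsearch.serializer import JSONSerializer

serializer = JSONSerializer()
event = {"msg": "héllo", "ts": datetime(2010, 10, 1, 2, 30)}

# ensure_ascii is turned off by the serializer, so "héllo" is emitted as-is.
print(serializer.dumps(event))

# Plain json.dumps keeps ensure_ascii=True (escaping to \u00e9) while the
# default hook still converts the datetime to its ISO-8601 string.
print(json.dumps(event, default=serializer.default))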
Example #7
    def handle(self, *args, **options):
        json = JSONSerializer()

        if "datasource" not in options:
            self.stderr.write("You need to specify datasource to export")
            return

        config = django_apps.app_configs[options["datasource"]]
        ElasticModel = config.elastic_model
        all_docs = ElasticModel.search()

        if options["to"] is not None:
            all_docs = all_docs.query(
                "match_all")[options["from"]:options["to"]]
            total_count = all_docs.count()
            all_docs = all_docs.execute()
        elif options["from"]:
            all_docs = all_docs.query("match_all")[options["from"]:]
            total_count = all_docs.count()
            all_docs = all_docs.execute()
        else:
            total_count = all_docs.count()
            all_docs = all_docs.scan()

        for doc in tqdm.tqdm(all_docs, total=total_count):
            doc_json = doc.to_dict()
            if not options["keep_service_fields"]:
                for f in self.service_fields:
                    if f in doc_json:
                        del doc_json[f]

            options["outfile"].write(json.dumps(doc_json) + "\n")
Example #8
 def test_serializes_pandas_na(self):
     if not hasattr(pd, "NA"):  # pandas.NA added in v1
         raise SkipTest("pandas.NA required")
     self.assertEqual(
         '{"d":null}',
         JSONSerializer().dumps({"d": pd.NA}),
     )
Example #9
def main():
    root = sys.argv[1]
    key = sys.argv[2]

    outname = 'tmp/%s' % sanitize_filename(key)
    if os.path.exists(outname):
        sys.stderr.write("'%s' already done\n" % str(outname))
        sys.exit(0)

    from elasticsearch.serializer import JSONSerializer
    serializer = JSONSerializer()

    try:
        with gzip.open(outname, 'wb') as out:
            with open(os.path.join(root, key), mode='rb') as logfile:
                gzfile = gzip.GzipFile(fileobj=logfile, mode='rb')
                for event in generate_events(enumerate(gzfile), key):
                    # The elasticsearch serializer does have a dumps method,
                    # but we don't use it because it turns off json.dumps'
                    # ensure_ascii. We want to enforce ASCII because it isn't
                    # actually specified what encoding the log file is in, and
                    # we were also getting invalid utf-8 sequences.
                    # The gzip file is opened in binary mode, so encode the
                    # ASCII-only JSON before writing.
                    out.write(json.dumps(event, default=serializer.default).encode("ascii"))
                    out.write(b"\n")

    except Exception as err:
        if os.path.exists(outname):
            os.remove(outname)
        raise err
Example #10
 def test_uuid_serialization(self):
     self.assertEqual(
         '{"d":"00000000-0000-0000-0000-000000000003"}',
         JSONSerializer().dumps(
             {"d": uuid.UUID("00000000-0000-0000-0000-000000000003")}
         ),
     )
Example #11
 def test_serializes_numpy_floats(self):
     ser = JSONSerializer()
     for np_type in (
             np.float_,
             np.float32,
             np.float64,
     ):
         self.assertRegexpMatches(ser.dumps({"d": np_type(1.2)}),
                                  r'^\{"d":1\.2[\d]*}$')
Example #12
 def test_chunks_are_chopped_by_byte_size_properly(self):
     max_byte_size = 170
     chunks = list(
         helpers._chunk_actions(self.actions, 100000, max_byte_size,
                                JSONSerializer()))
     self.assertEqual(25, len(chunks))
     for chunk_data, chunk_actions in chunks:
         chunk = u"".join(chunk_actions)
         chunk = chunk if isinstance(chunk, str) else chunk.encode("utf-8")
         self.assertLessEqual(len(chunk), max_byte_size)
Example #13
    def search(self, body):
        """ Execute a search query.

            The passed query must be a valid Elasticsearch query. It is passed
            to the connection along with the configured index, and the result
            is returned.
        """
        self.logger.debug('Execute search: %s', JSONSerializer().dumps(body))

        return self.__connection.search(
            body=body, index='syslog')  #config.system.es.index)
Example #14
    def _query_backend(self):
        consumer = KafkaConsumer(
            bootstrap_servers=KAFKA_HOST,
            value_deserializer=lambda v: JSONSerializer().loads(v.decode('utf-8')))

        tp = TopicPartition(self.topic, 0)
        consumer.assign([tp])

        count = consumer.position(tp)

        consumer.seek(tp, 0)

        metrics = []
        for i in range(count):
            metrics.append(next(consumer))

        return metrics
Example #15
    def test_serializes_numpy_integers(self):
        ser = JSONSerializer()
        for np_type in (
                np.int_,
                np.int8,
                np.int16,
                np.int32,
                np.int64,
        ):
            self.assertEqual(ser.dumps({"d": np_type(-1)}), '{"d":-1}')

        for np_type in (
                np.uint8,
                np.uint16,
                np.uint32,
                np.uint64,
        ):
            self.assertEqual(ser.dumps({"d": np_type(1)}), '{"d":1}')
Example #16
    def handle(self, *args, **options):
        json = JSONSerializer()

        all_decls = Search(index=options["indexes"]).doc_type(
            NACPDeclaration, Declaration)

        if options["to"] is not None:
            all_decls = all_decls.query('match_all')[options["from"]:options["to"]].execute()
        elif options["from"]:
            all_decls = all_decls.query('match_all')[options["from"]:].execute()
        else:
            all_decls = all_decls.scan()

        for i, decl in enumerate(all_decls):
            decl_json = decl.api_response(options["sections"])
            options["outfile"].write(json.dumps(decl_json) + "\n")

            if i and i % 1000 == 0:
                self.stderr.write("Exported %s declarations" % i)
Example #17
class SospiderPipeline(object):
    items_buffer = []
    serializer = JSONSerializer()

    def process_item(self, item, spider):
        data = dict(item)
        url_key = spider.url_key(item['url'])
        #if spider.redis_cache.exists(url_key):
        #    return item
        #else:
        #    spider.redis_cache.set(url_key, item['url'])
        #    spider.redis_cache.expire(url_key,
        #                              spider.conf_dict['expire_seconds'])
        extra_data = data.pop('extra')
        data.update(extra_data)
        self.index_item(data, spider)
        return item

    def index_item(self, item, spider):
        index_action = {
            '_index': spider.es_index,
            '_type': 'fulltext',
            '_source': item,
            '_id': uuid.uuid1(),
        }
        logging.info('get %s' % item['url'])
        try:
            self.serializer.dumps(index_action)
            self.items_buffer.append(index_action)
        except Exception as e:
            logging.info('dumps failed: %s', e)
        if len(self.items_buffer) > BUF_SIZ:
            self.send_item(spider)
            self.items_buffer = []

    def send_item(self, spider):
        res = helpers.bulk(spider.es, self.items_buffer)
        logging.info('bulk %s' % str(res))

    def close_spider(self, spider):
        if len(self.items_buffer):
            self.send_item(spider)
Example #18
def to_json(data):
    """Convert Python structure to JSON used by Elasticsearch

    This is a helper method that uses the elasticsearch-py
    JSONSerializer to serialize the structure. This is the serializer
    that elasticsearch-py uses to serialize data for Elasticsearch and
    handles dates.

    :arg data: Python structure (e.g. dict, list, ...)

    :returns: string

    Examples:

    >>> to_json({'query': {'match': {'message': 'test message'}}})
    '{"query": {"match": {"message": "test message"}}}'

    >>> from elasticutils import S
    >>> some_s = S().query(message__match='test message')
    >>> to_json(some_s.build_search())
    '{"query": {"match": {"message": "test message"}}}'

    """
    return JSONSerializer().dumps(data)
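Since the docstring calls out date handling, one more hedged illustration may help (the query fields are hypothetical, and the exact output spacing depends on the elasticsearch-py version; shown here matching the docstring's examples):

>>> from datetime import date
>>> to_json({'range': {'created': {'gte': date(2014, 1, 1)}}})
'{"range": {"created": {"gte": "2014-01-01"}}}'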
Example #19
 def test_strings_are_left_untouched(self):
     self.assertEqual("你好", JSONSerializer().dumps("你好"))
Example #20
 def assertDictEqual(self, a, b):
     default = JSONSerializer().default
     self.assertEqual(
         json.dumps(a, sort_keys=True, default=default),
         json.dumps(b, sort_keys=True, default=default),
     )
Example #21
def main(args):

    # Specify the arguments.
    parser = argparse.ArgumentParser(
        description=
        '''A tool which takes a weighted listing of keyword searches and presents aggregations of this data to the user.'''
    )

    parser.add_argument('-a',
                        '--allocationid',
                        metavar='int',
                        dest='allocation_id',
                        default=-1,
                        help='The allocation ID of the job.')
    parser.add_argument('-j',
                        '--jobid',
                        metavar='int',
                        dest='job_id',
                        default=-1,
                        help='The job ID of the job.')
    parser.add_argument('-s',
                        '--jobidsecondary',
                        metavar='int',
                        dest='job_id_secondary',
                        default=0,
                        help='The secondary job ID of the job (default : 0).')
    parser.add_argument(
        '-t',
        '--target',
        metavar='hostname:port',
        dest='target',
        default=None,
        help=
        'An Elasticsearch server to be queried. This defaults to the contents of environment variable "CAST_ELASTIC".'
    )
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='Displays the top --size logs matching the --errormap mappings.')
    parser.add_argument(
        '--size',
        metavar='size',
        dest='size',
        default=10,
        help='The number of results to be returned. (default=10)')
    parser.add_argument('-H',
                        '--hostnames',
                        metavar='host',
                        dest='hosts',
                        nargs='*',
                        default=None,
                        help='A list of hostnames to filter the results to.')
    parser.add_argument(
        '--errormap',
        metavar="file",
        dest="err_map_file",
        default=None,
        help='A map of errors to scan the user jobs for, including weights.')

    args = parser.parse_args()

    # If the target wasn't specified check the environment for the target value, printing help on failure.
    if args.target is None:
        if TARGET_ENV in os.environ:
            args.target = os.environ[TARGET_ENV]
        else:
            parser.print_help()
            print("Missing target, '%s' was not set." % TARGET_ENV)
            return 2

    # Load the weighted error mapping.
    error_map = None
    if args.err_map_file:
        error_map = JSONSerializer().loads(open(args.err_map_file).read())

    if error_map is None:
        parser.print_help()
        print("Error map '%s', could not be loaded" % args.err_map_file)
        return 2

    # Open a connection to the elastic cluster; if this fails, something is wrong on the server.
    es = Elasticsearch(args.target,
                       sniff_on_start=True,
                       sniff_on_connection_fail=True,
                       sniffer_timeout=60)

    # Execute the query on the cast-allocation index.
    try:
        tr_res = cast.search_job(es, args.allocation_id, args.job_id,
                                 args.job_id_secondary)
    except exceptions.RequestError as e:
        cast.print_request_error(e)
        return 4

    total_hits = cast.deep_get(tr_res, "hits", "total")
    # Finding no matches with valid search criteria is a legit case.
    # return 0, not 3
    if total_hits is None:
        print("# Sorry. Could not find any matching results.")
        return 0
    if total_hits != 1:
        print(
            "This implementation only supports queries where the hit count is equal to 1."
        )
        return 3

    # TODO make this code more fault tolerant
    hits = cast.deep_get(tr_res, "hits", "hits")
    tr_data = cast.deep_get(hits[0], "_source", "data")

    # ---------------------------------------------------------------------------------------------

    # Build the hostnames string:
    if args.hosts is None:
        args.hosts = tr_data.get("compute_nodes")

    hostnames = {
        "multi_match": {
            "query": " ".join(args.hosts),
            "type": "best_fields",
            "fields": ["hostname", "source"],
            "tie_breaker": 0.3,
            "minimum_should_match": 1
        }
    }

    # ---------------------------------------------------------------------------------------------

    (ranges, should_match) = cast.build_timestamp_range(
        tr_data.get("begin_time"), cast.deep_get(tr_data, "history",
                                                 "end_time"))

    ranges.append(hostnames)

    # ---------------------------------------------------------------------------------------------
    # Build a body for the mapping query.
    body = {
        "_source": ["@timestamp"],
        "size": args.size,
    }

    # Check the keywords supplied by the json.
    results = {}
    for error in error_map:
        (category, result) = build_mapping_query(es, body.copy(), ranges,
                                                 error)
        results[category] = result

    print(" ")
    # Print the results.
    for category, response in sorted(
            results.iteritems(),
            key=lambda (k, v): cast.deep_get(v, "hits", "max_score"),
            reverse=True):

        # Get aggregations.
        aggregations = response.get("aggregations", [])
        total = cast.deep_get(response, "hits", "total")

        print("\"{0}\" Max Score : {1}".format(
            category, cast.deep_get(response, "hits", "max_score")))
        print("\"{0}\" Count : {1}".format(category, total))

        if aggregations is not None:
            # Sort aggregations by document count.
            for (aggregation, value) in sorted(aggregations.iteritems(),
                                               key=lambda
                                               (k, v): v.get("doc_count"),
                                               reverse=True):
                print("  \"{0}\" : {1}".format(aggregation,
                                               value.get("doc_count")))

        if args.verbose:
            hits = cast.deep_get(response, "hits", "hits")

            print("\nTop {0} \"{1}\" Results:".format(len(hits), category))
            print("-" * 42)
            for hit in hits:
                print(json.dumps(hit["_source"]))
        print("=" * 42)
        print(" ")
Example #22
 def test_serializes_pandas_timestamp(self):
     self.assertEqual(
         '{"d":"2010-10-01T02:30:00"}',
         JSONSerializer().dumps({"d": pd.Timestamp("2010-10-01T02:30:00")}),
     )
Example #23
 def test_serializes_numpy_datetime(self):
     self.assertEqual(
         '{"d":"2010-10-01T02:30:00"}',
         JSONSerializer().dumps({"d":
                                 np.datetime64("2010-10-01T02:30:00")}),
     )
Example #24
 def test_serializes_pandas_series(self):
     self.assertEqual(
         '{"d":["a","b","c","d"]}',
         JSONSerializer().dumps({"d": pd.Series(["a", "b", "c", "d"])}),
     )
Example #25
 def test_serializes_numpy_bool(self):
     self.assertEqual('{"d":true}',
                      JSONSerializer().dumps({"d": np.bool_(True)}))
Example #26
 def test_decimal_serialization(self):
     if sys.version_info[:2] == (2, 6):
         raise SkipTest("Float rounding is broken in 2.6.")
     self.assertEqual('{"d":3.8}',
                      JSONSerializer().dumps({"d": Decimal("3.8")}))
Example #27
 def test_datetime_serialization(self):
     self.assertEqual(
         '{"d":"2010-10-01T02:30:00"}',
         JSONSerializer().dumps({"d": datetime(2010, 10, 1, 2, 30)}),
     )
Example #28
 def test_raises_serialization_error_pandas_nat(self):
     if not hasattr(pd, "NaT"):
         raise SkipTest("pandas.NaT required")
     self.assertRaises(SerializationError,
                       JSONSerializer().dumps, {"d": pd.NaT})
Example #29
 def test_raises_serialization_error_on_load_error(self):
     self.assertRaises(SerializationError, JSONSerializer().loads, object())
     self.assertRaises(SerializationError, JSONSerializer().loads, "")
     self.assertRaises(SerializationError, JSONSerializer().loads, "{{")
Example #30
 def test_raises_serialization_error_on_dump_error(self):
     self.assertRaises(SerializationError, JSONSerializer().dumps, object())