def post(self, *args):
    # Add entry into bucket and flag as multipart upload
    if self.bucket_name and self.object_name:
        bucket_name = self.bucket_name
        object_name = self.object_name
    else:
        bucket_name, object_name = args
    if bucket_name not in self._get_bucket_names():
        self._error(code=404, s3code='NSB')
        return
    original_name = urllib.unquote(object_name)
    bucket_object = Binary(self.request.body)
    object_size = len(bucket_object)
    object_md5 = self._object_md5(bucket_object)
    if self.uploadId:
        # We have a multipart upload, so iterate over the parts to generate
        # the md5 hash and calculate the size. This is the last call made
        # after the multipart upload with the uploadId.
        mupmd5 = hashlib.md5()
        mupsize = 0
        for mup in self.application.S3[bucket_name].find({'object_name': object_name}):
            mupmd5.update(mup['object'])
            mupsize += mup['size']
        self.application.S3[bucket_name].insert_one({
            'object_name': object_name,
            'object': bucket_object,
            'multipart': True,
            'md5': mupmd5.hexdigest(),
            'size': mupsize,
            'added': datetime.datetime.utcnow(),
            'updated': datetime.datetime.utcnow(),
        })
    self.render_xml({"InitiateMultipartUploadResult": {
        "Bucket": bucket_name,
        "Prefix": self.prefix,
        "Key": object_name,
        "UploadId": object_name
    }})
def test_binary(self):
    a_string = "hello world"
    a_binary = Binary(b("hello world"))
    self.assertTrue(a_binary.startswith(b("hello")))
    self.assertTrue(a_binary.endswith(b("world")))
    self.assertTrue(isinstance(a_binary, Binary))
    self.assertFalse(isinstance(a_string, Binary))
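# A minimal companion sketch: Binary also carries a subtype alongside the raw
# bytes, and the subtype participates in equality. The values below are
# illustrative, not from the test suite above.
from bson.binary import Binary

plain = Binary(b"hello world")        # default subtype 0
custom = Binary(b"hello world", 128)  # user-defined subtype
assert plain.subtype == 0
assert custom.subtype == 128
assert plain != custom                # same bytes, different subtype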
def put(self, *args):
    if self.bucket_name and self.object_name:
        bucket_name = self.bucket_name
        object_name = self.object_name
    else:
        bucket_name, object_name = args
    original_name = urllib.unquote(object_name)
    if bucket_name not in self._get_bucket_names():
        self._error(code=404, s3code='NSB')
        return
    # Insert the object, then calculate the computed md5 and size of the
    # stored object, update the record and return.
    # If the object already exists, replace its contents and set an updated
    # timestamp.
    existing = self.application.S3[bucket_name].find({"object_name": original_name})
    if existing.count() > 0 and self.partNumber is None:
        existing_id = existing.next()['_id']
        update_object = Binary(self.request.body)
        object_size = len(update_object)
        object_md5 = self._object_md5(update_object)
        self.application.S3[bucket_name].update(
            {"_id": existing_id},
            {'$set': {'object': update_object,
                      'md5': object_md5,
                      'updated': datetime.datetime.utcnow(),
                      'size': object_size}})
        self.set_header('etag', '"%s"' % object_md5)
        self.finish()
        return
    if self.partNumber:
        tobeinserted = {'object_name': original_name,
                        'object': Binary(self.request.body),
                        'partNumber': self.partNumber}
    else:
        tobeinserted = {'object_name': original_name,
                        'object': Binary(self.request.body)}
    inserted_object_id = self.application.S3[bucket_name].insert_one(tobeinserted).inserted_id
    inserted_object = self._get_bucket_object(bucket_name=bucket_name,
                                              _id=inserted_object_id)
    object_size = len(inserted_object['object'])
    object_md5 = self._object_md5(inserted_object['object'])
    self.application.S3[bucket_name].update(
        {'_id': inserted_object_id},
        {'$set': {'md5': object_md5,
                  'updated': datetime.datetime.utcnow(),
                  'added': datetime.datetime.utcnow(),
                  'size': object_size}})
    self.set_header('etag', '"%s"' % object_md5)
    self.finish()
def test_cursor(self):
    db = self.db
    db.drop_collection("test")
    docs = [
        {'foo': [1, 2]},
        {'bar': {'hello': 'world'}},
        {'code': Code("function x() { return 1; }")},
        {'bin': Binary(b("\x00\x01\x02\x03\x04"))},
        {'dbref': {'_ref': DBRef('simple',
                                 ObjectId('509b8db456c02c5ab7e63c34'))}},
    ]
    db.test.insert(docs)
    reloaded_docs = json_util.loads(json_util.dumps(db.test.find()))
    for doc in docs:
        self.assertTrue(doc in reloaded_docs)
def forwards(self):
    phash = {}
    db = get_db()
    metrics = db.noc.ts.metrics
    bulk = metrics.initialize_unordered_bulk_op()
    n = 0
    for m in metrics.find({}).sort("name", 1):
        phash[m["name"]] = m["hash"]
        if "." in m["name"]:
            pn = ".".join(m["name"].split(".")[:-1])
            parent = phash[pn]
        else:
            parent = Binary("\x00" * 8)
        bulk.find({"_id": m["_id"]}).update({
            "$set": {
                "local": m["name"].split(".")[-1],
                "parent": parent
            }
        })
        n += 1
    if n:
        bulk.execute()
def __flush_data(self, data):
    """Flush `data` to a chunk.
    """
    # Ensure the index, even if there's nothing to write, so
    # the filemd5 command always succeeds.
    self.__ensure_indexes()
    self._file['md5'].update(data)

    if not data:
        return
    assert(len(data) <= self.chunk_size)

    chunk = {"files_id": self._file["_id"],
             "n": self._chunk_number,
             "data": Binary(data)}

    try:
        self._chunks.insert_one(chunk)
    except DuplicateKeyError:
        self._raise_file_exists(self._file['_id'])
    self._chunk_number += 1
    self._position += len(data)
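# A hedged read-side counterpart to __flush_data: reassembling file data by
# sorting chunks on "n". Assumes a chunks collection shaped like the writer
# above; chunks_coll and file_id are illustrative names, not part of gridfs.
def read_file_data(chunks_coll, file_id):
    parts = []
    for chunk in chunks_coll.find({"files_id": file_id}).sort("n", 1):
        parts.append(bytes(chunk["data"]))  # Binary is a bytes subclass
    return b"".join(parts)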
def write(self, arctic_lib, version, symbol, item, previous_version):
    try:
        # If it's encodeable, then ship it
        b = bson.BSON.encode({'data': item})
        if len(b) < _MAX_BSON_ENCODE:
            version['data'] = item
            return
    except InvalidDocument:
        pass

    # Pickle, chunk and store the data
    collection = arctic_lib.get_top_level_collection()

    # Try to pickle it. This is best effort
    version['blob'] = _MAGIC_CHUNKEDV2
    pickled = cPickle.dumps(item, protocol=cPickle.HIGHEST_PROTOCOL)

    data = compress_array([pickled[i * _CHUNK_SIZE:(i + 1) * _CHUNK_SIZE]
                           for i in xrange(int(len(pickled) / _CHUNK_SIZE + 1))])

    for seg, d in enumerate(data):
        segment = {'data': Binary(d)}
        segment['segment'] = seg
        seg += 1
        sha = checksum(symbol, segment)
        collection.update_one({'symbol': symbol, 'sha': sha},
                              {'$set': segment,
                               '$addToSet': {'parent': version['_id']}},
                              upsert=True)
def binarize_image(image):
    from PIL import Image
    binary = None
    try:
        im = Image.open(image)
        thumb = im.copy()
        thumb.thumbnail((260, 260))
        image_buffer = StringIO()
        thumb.save(image_buffer, "JPEG")
        binary = Binary(image_buffer.getvalue(), BINARY_SUBTYPE)
    except IOError as e:
        logging.error("failed to binarize image: " + str(e))
        return None
    # if image is a file object, rewind it
    finally:
        try:
            image.seek(0)
        except AttributeError:
            pass
    return binary
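# A hedged usage sketch for binarize_image; `photos` is an assumed pymongo
# collection and the path is illustrative, not part of the original module.
with open("avatar.jpg", "rb") as f:
    thumb = binarize_image(f)
if thumb is not None:
    photos.insert_one({"filename": "avatar.jpg", "thumbnail": thumb})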
def test_read_with_base_version_id():
    self = create_autospec(PickleStore)
    version = {'_id': sentinel._id,
               'base_version_id': sentinel.base_version_id,
               'blob': '__chunked__'}
    coll = Mock()
    arctic_lib = Mock()
    coll.find.return_value = [{'data': Binary(compressHC(cPickle.dumps(object))),
                               'symbol': 'sentinel.symbol',
                               'segment': 1}]
    arctic_lib.get_top_level_collection.return_value = coll

    assert PickleStore.read(self, arctic_lib, version, sentinel.symbol) == object
    assert coll.find.call_args_list == [call({'symbol': sentinel.symbol,
                                              'parent': sentinel.base_version_id})]
def upload_file():
    uploaded_file = request.files['file']
    app_path = "./static/uploads/"
    app_name = uploaded_file.filename
    if uploaded_file.filename != '':
        file_path = os.path.join(app_path, app_name)
        uploaded_file.save(file_path)
        format_status = check_format(uploaded_file, app_path, app_name, file_path)
        if format_status == True:
            fs, coll = create_connection_mongo_cloud()
            with open(file_path, "rb") as fp:
                encoded = Binary(fp.read())
            flink = fs.put(encoded, filename=app_name)
            coll.insert_one({"filename": app_name, "file": flink})
            os.remove(file_path)
            return json.dumps({'status': 'Zip uploaded successfully'}), 200
        else:
            remove_file_uploads(app_path, app_name, file_path)
            return format_status
for entry in dbx.files_list_folder(sys.argv[2]).entries:
    try:
        meta = dbx.files_alpha_get_metadata(sys.argv[2] + entry.name)
    except db.exceptions.ApiError:
        print("not found !")
    names.append(entry.name)
    if (meta.size <= 8 * 1024**6) and (entry.name.split('.')[-1] in
            {"pdf", "docx", "doc", "ppt", "pptx", "jpeg", "jpg", "png"}):
        try:
            m, r = dbx.files_download(sys.argv[2] + entry.name)
        except db.exceptions.ApiError:
            print("download failure !")
        try:
            filestore.update({"name": entry.name},
                             {"$set": {"name": entry.name,
                                       "file": Binary(r.content),
                                       "meta": str(m)}},
                             upsert=True)
        except pymongo.errors.OperationFailure:
            print("Update failed !")

print(filestore.count())
removefromdb()
#time.sleep( 1 )
def append(self, symbol, item):
    """
    Appends data from item to symbol's data in the database

    Parameters
    ----------
    symbol: str
        the symbol for the given item in the DB
    item:
        the data to append
    """
    sym = self._get_symbol_info(symbol)
    if not sym:
        raise NoDataFoundException("Symbol does not exist. Cannot append")
    if isinstance(item, Series) and sym['type'] == 'df':
        raise Exception("cannot append a series to a dataframe")
    if isinstance(item, DataFrame) and sym['type'] == 'series':
        raise Exception("cannot append a dataframe to a series")

    records = []
    ranges = []
    dtype = None

    for start, end, record in self.chunker.to_chunks(item, sym['chunk_size']):
        '''
        if we have a multiindex there is a chance that part of the
        append will overlap an already written chunk, so we need to
        update where the date part of the index overlaps
        '''
        if item.index.nlevels > 1:
            df = self.read(symbol, chunk_range=self.chunker.to_range(start, end))
            if not df.empty:
                if df.equals(record):
                    continue
                record = record.combine_first(df)
                self.update(symbol, record)
                sym = self._get_symbol_info(symbol)
                continue
        r, dtype = serialize(record, string_max_len=self.STRING_MAX)
        records.append(r)
        ranges.append((start, end))

    if len(records) > 0:
        item = np.array([r for record in records for r in record]).flatten()

        if sym.get('shape', [-1]) != [-1, ] + list(item.shape)[1:]:
            raise UnhandledDtypeException()

        item = item.astype(dtype)

        if str(dtype) != sym['dtype']:
            raise Exception("Dtype mismatch - cannot append")

        data = item.tostring()
        sym['len'] += len(item)
        if len(item) > 0:
            sym['chunk_count'] += len(records)
            sym['append_count'] += len(records)
            sym['append_size'] += len(data)

        chunks = [r.tostring() for r in records]
        chunks = compress_array(chunks)

        for chunk, rng in zip(chunks, ranges):
            start = rng[0]
            end = rng[-1]

            segment = {'data': Binary(chunk)}
            segment['start'] = start
            segment['end'] = end
            self._collection.update_one({'symbol': symbol,
                                         'sha': checksum(symbol, segment)},
                                        {'$set': segment},
                                        upsert=True)

        self._symbols.replace_one({'symbol': symbol}, sym)
def _encode_uuid(name, value, dummy, opts):
    """Encode uuid.UUID."""
    uuid_representation = opts.uuid_representation
    binval = Binary.from_uuid(value, uuid_representation=uuid_representation)
    return _encode_binary(name, binval, dummy, opts)
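# A minimal round-trip sketch of the public API _encode_uuid relies on,
# assuming PyMongo 3.11+ where UuidRepresentation lives in bson.binary.
import uuid
from bson.binary import Binary, UuidRepresentation

u = uuid.uuid4()
binval = Binary.from_uuid(u, uuid_representation=UuidRepresentation.STANDARD)
assert binval.subtype == 4  # standard BSON UUID subtype
assert binval.as_uuid(UuidRepresentation.STANDARD) == u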
    @client_context.require_test_commands
    def run_scenario(self):
        self.run_scenario(scenario_def, test)

    return run_scenario


test_creator = TestCreator(create_test, TestSpec, SPEC_PATH)
test_creator.create_tests()

# Prose Tests

LOCAL_MASTER_KEY = base64.b64decode(
    b'Mng0NCt4ZHVUYUJCa1kxNkVyNUR1QURhZ2h2UzR2d2RrZzh0cFBwM3R6NmdWMDFBMUN3YkQ'
    b'5aXRRMkhGRGdQV09wOGVNYUMxT2k3NjZKelhaQmRCZGJkTXVyZG9uSjFk')
LOCAL_KEY_ID = Binary(base64.b64decode(b'LOCALAAAAAAAAAAAAAAAAA=='),
                      UUID_SUBTYPE)
AWS_KEY_ID = Binary(base64.b64decode(b'AWSAAAAAAAAAAAAAAAAAAA=='),
                    UUID_SUBTYPE)


def create_with_schema(coll, json_schema):
    """Create and return a Collection with a jsonSchema."""
    coll.with_options(write_concern=WriteConcern(w='majority')).drop()
    return coll.database.create_collection(
        coll.name, validator={'$jsonSchema': json_schema}, codec_options=OPTS)


def create_key_vault(vault, *data_keys):
    """Create the key vault collection with optional data keys."""
    vault = vault.with_options(
        write_concern=WriteConcern(w='majority'),
        codec_options=OPTS)
def _encode_task(self, task):
    for prefix in self.ENCODE_FIELDS_PREFIX:
        for k in list(task.keys()):
            if k.startswith(prefix):
                task[k] = Binary(pickle.dumps(task[k]))
    return task
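# A hedged inverse of _encode_task; the method name and the assumption that
# every prefixed field was pickled are illustrative, not the original
# module's. pickle.loads accepts Binary directly since it subclasses bytes.
def _decode_task(self, task):
    for prefix in self.ENCODE_FIELDS_PREFIX:
        for k in list(task.keys()):
            if k.startswith(prefix):
                task[k] = pickle.loads(task[k])
    return task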
def _authenticate_scram(credentials, sock_info, mechanism):
    """Authenticate using SCRAM."""
    username = credentials.username
    if mechanism == 'SCRAM-SHA-256':
        digest = "sha256"
        digestmod = hashlib.sha256
        data = saslprep(credentials.password).encode("utf-8")
    else:
        digest = "sha1"
        digestmod = hashlib.sha1
        data = _password_digest(username, credentials.password).encode("utf-8")
    source = credentials.source
    cache = credentials.cache

    # Make local
    _hmac = hmac.HMAC

    user = username.encode("utf-8").replace(b"=", b"=3D").replace(b",", b"=2C")
    nonce = standard_b64encode(os.urandom(32))
    first_bare = b"n=" + user + b",r=" + nonce

    cmd = SON([('saslStart', 1),
               ('mechanism', mechanism),
               ('payload', Binary(b"n,," + first_bare)),
               ('autoAuthorize', 1)])
    res = sock_info.command(source, cmd)

    server_first = res['payload']
    parsed = _parse_scram_response(server_first)
    iterations = int(parsed[b'i'])
    if iterations < 4096:
        raise OperationFailure("Server returned an invalid iteration count.")
    salt = parsed[b's']
    rnonce = parsed[b'r']
    if not rnonce.startswith(nonce):
        raise OperationFailure("Server returned an invalid nonce.")

    without_proof = b"c=biws,r=" + rnonce
    if cache.data:
        client_key, server_key, csalt, citerations = cache.data
    else:
        client_key, server_key, csalt, citerations = None, None, None, None

    # Salt and / or iterations could change for a number of different
    # reasons. Either changing invalidates the cache.
    if not client_key or salt != csalt or iterations != citerations:
        salted_pass = _hi(digest, data, standard_b64decode(salt), iterations)
        client_key = _hmac(salted_pass, b"Client Key", digestmod).digest()
        server_key = _hmac(salted_pass, b"Server Key", digestmod).digest()
        cache.data = (client_key, server_key, salt, iterations)
    stored_key = digestmod(client_key).digest()
    auth_msg = b",".join((first_bare, server_first, without_proof))
    client_sig = _hmac(stored_key, auth_msg, digestmod).digest()
    client_proof = b"p=" + standard_b64encode(_xor(client_key, client_sig))
    client_final = b",".join((without_proof, client_proof))

    server_sig = standard_b64encode(
        _hmac(server_key, auth_msg, digestmod).digest())

    cmd = SON([('saslContinue', 1),
               ('conversationId', res['conversationId']),
               ('payload', Binary(client_final))])
    res = sock_info.command(source, cmd)

    parsed = _parse_scram_response(res['payload'])
    if not compare_digest(parsed[b'v'], server_sig):
        raise OperationFailure("Server returned an invalid signature.")

    # Depending on how it's configured, Cyrus SASL (which the server uses)
    # requires a third empty challenge.
    if not res['done']:
        cmd = SON([('saslContinue', 1),
                   ('conversationId', res['conversationId']),
                   ('payload', Binary(b''))])
        res = sock_info.command(source, cmd)
        if not res['done']:
            raise OperationFailure('SASL conversation failed to complete.')
def default(obj, json_options=DEFAULT_JSON_OPTIONS):
    # We preserve key order when rendering SON, DBRef, etc. as JSON by
    # returning a SON for those types instead of a dict.
    if isinstance(obj, ObjectId):
        return {"$oid": str(obj)}
    if isinstance(obj, DBRef):
        return _json_convert(obj.as_doc(), json_options=json_options)
    if isinstance(obj, datetime.datetime):
        if (json_options.datetime_representation ==
                DatetimeRepresentation.ISO8601):
            if not obj.tzinfo:
                obj = obj.replace(tzinfo=utc)
            if obj >= EPOCH_AWARE:
                off = obj.tzinfo.utcoffset(obj)
                if (off.days, off.seconds, off.microseconds) == (0, 0, 0):
                    tz_string = 'Z'
                else:
                    tz_string = obj.strftime('%z')
                millis = int(obj.microsecond / 1000)
                fracsecs = ".%03d" % (millis,) if millis else ""
                return {"$date": "%s%s%s" % (
                    obj.strftime("%Y-%m-%dT%H:%M:%S"), fracsecs, tz_string)}

        millis = bson._datetime_to_millis(obj)
        if (json_options.datetime_representation ==
                DatetimeRepresentation.LEGACY):
            return {"$date": millis}
        return {"$date": {"$numberLong": str(millis)}}
    if json_options.strict_number_long and isinstance(obj, Int64):
        return {"$numberLong": str(obj)}
    if isinstance(obj, (RE_TYPE, Regex)):
        flags = ""
        if obj.flags & re.IGNORECASE:
            flags += "i"
        if obj.flags & re.LOCALE:
            flags += "l"
        if obj.flags & re.MULTILINE:
            flags += "m"
        if obj.flags & re.DOTALL:
            flags += "s"
        if obj.flags & re.UNICODE:
            flags += "u"
        if obj.flags & re.VERBOSE:
            flags += "x"
        if isinstance(obj.pattern, text_type):
            pattern = obj.pattern
        else:
            pattern = obj.pattern.decode('utf-8')
        if json_options.json_mode == JSONMode.LEGACY:
            return SON([("$regex", pattern), ("$options", flags)])
        return {'$regularExpression': SON([("pattern", pattern),
                                           ("options", flags)])}
    if isinstance(obj, MinKey):
        return {"$minKey": 1}
    if isinstance(obj, MaxKey):
        return {"$maxKey": 1}
    if isinstance(obj, Timestamp):
        return {"$timestamp": SON([("t", obj.time), ("i", obj.inc)])}
    if isinstance(obj, Code):
        if obj.scope is None:
            return {'$code': str(obj)}
        return SON([('$code', str(obj)),
                    ('$scope', _json_convert(obj.scope, json_options))])
    if isinstance(obj, Binary):
        return _encode_binary(obj, obj.subtype, json_options)
    if PY3 and isinstance(obj, bytes):
        return _encode_binary(obj, 0, json_options)
    if isinstance(obj, uuid.UUID):
        if json_options.strict_uuid:
            binval = Binary.from_uuid(
                obj, uuid_representation=json_options.uuid_representation)
            return _encode_binary(binval, binval.subtype, json_options)
        else:
            return {"$uuid": obj.hex}
    if isinstance(obj, Decimal128):
        return {"$numberDecimal": str(obj)}
    if isinstance(obj, bool):
        return obj
    if (json_options.json_mode == JSONMode.CANONICAL and
            isinstance(obj, integer_types)):
        if -2 ** 31 <= obj < 2 ** 31:
            return {'$numberInt': text_type(obj)}
        return {'$numberLong': text_type(obj)}
    if json_options.json_mode != JSONMode.LEGACY and isinstance(obj, float):
        if math.isnan(obj):
            return {'$numberDouble': 'NaN'}
        elif math.isinf(obj):
            representation = 'Infinity' if obj > 0 else '-Infinity'
            return {'$numberDouble': representation}
        elif json_options.json_mode == JSONMode.CANONICAL:
            # repr() will return the shortest string guaranteed to produce the
            # original value, when float() is called on it. str produces a
            # shorter string in Python 2.
            return {'$numberDouble': text_type(repr(obj))}
    raise TypeError("%r is not JSON serializable" % obj)
def run_operation(self, sessions, collection, operation):
    original_collection = collection
    name = camel_to_snake(operation['name'])
    if name == 'run_command':
        name = 'command'
    elif name == 'download_by_name':
        name = 'open_download_stream_by_name'
    elif name == 'download':
        name = 'open_download_stream'

    def parse_options(opts):
        if 'readPreference' in opts:
            opts['read_preference'] = parse_read_preference(
                opts.pop('readPreference'))
        if 'writeConcern' in opts:
            opts['write_concern'] = WriteConcern(
                **dict(opts.pop('writeConcern')))
        if 'readConcern' in opts:
            opts['read_concern'] = ReadConcern(
                **dict(opts.pop('readConcern')))
        return opts

    database = collection.database
    collection = database.get_collection(collection.name)
    if 'collectionOptions' in operation:
        collection = collection.with_options(
            **dict(parse_options(operation['collectionOptions'])))

    object_name = self.get_object_name(operation)
    if object_name == 'gridfsbucket':
        # Only create the GridFSBucket when we need it (for the gridfs
        # retryable reads tests).
        obj = GridFSBucket(
            database, bucket_name=collection.name, disable_md5=True)
    else:
        objects = {
            'client': database.client,
            'database': database,
            'collection': collection,
            'testRunner': self
        }
        objects.update(sessions)
        obj = objects[object_name]

    # Combine arguments with options and handle special cases.
    arguments = operation.get('arguments', {})
    arguments.update(arguments.pop("options", {}))
    parse_options(arguments)

    cmd = getattr(obj, name)
    for arg_name in list(arguments):
        c2s = camel_to_snake(arg_name)
        # PyMongo accepts sort as list of tuples.
        if arg_name == "sort":
            sort_dict = arguments[arg_name]
            arguments[arg_name] = list(iteritems(sort_dict))
        # Named "key" instead not fieldName.
        if arg_name == "fieldName":
            arguments["key"] = arguments.pop(arg_name)
        # Aggregate uses "batchSize", while find uses batch_size.
        elif arg_name == "batchSize" and name == "aggregate":
            continue
        # Requires boolean returnDocument.
        elif arg_name == "returnDocument":
            arguments[c2s] = arguments.pop(arg_name) == "After"
        elif c2s == "requests":
            # Parse each request into a bulk write model.
            requests = []
            for request in arguments["requests"]:
                bulk_model = camel_to_upper_camel(request["name"])
                bulk_class = getattr(operations, bulk_model)
                bulk_arguments = camel_to_snake_args(request["arguments"])
                requests.append(bulk_class(**dict(bulk_arguments)))
            arguments["requests"] = requests
        elif arg_name == "session":
            arguments['session'] = sessions[arguments['session']]
        elif name == 'command' and arg_name == 'command':
            # Ensure the first key is the command name.
            ordered_command = SON([(operation['command_name'], 1)])
            ordered_command.update(arguments['command'])
            arguments['command'] = ordered_command
        elif name == 'open_download_stream' and arg_name == 'id':
            arguments['file_id'] = arguments.pop(arg_name)
        elif name == 'with_transaction' and arg_name == 'callback':
            callback_ops = arguments[arg_name]['operations']
            arguments['callback'] = lambda _: self.run_operations(
                sessions, original_collection, copy.deepcopy(callback_ops),
                in_with_transaction=True)
        else:
            arguments[c2s] = arguments.pop(arg_name)

    result = cmd(**dict(arguments))
    if name == "aggregate":
        if arguments["pipeline"] and "$out" in arguments["pipeline"][-1]:
            # Read from the primary to ensure causal consistency.
            out = collection.database.get_collection(
                arguments["pipeline"][-1]["$out"],
                read_preference=ReadPreference.PRIMARY)
            return out.find()
    if name == "map_reduce":
        if isinstance(result, dict) and 'results' in result:
            return result['results']
    if 'download' in name:
        result = Binary(result.read())

    if isinstance(result, Cursor) or isinstance(result, CommandCursor):
        return list(result)

    return result
def array_to_binary(x):
    '''Numpy array to bson binary'''
    return Binary(pickle.dumps(x))
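# A hedged round-trip counterpart to array_to_binary; the name is illustrative.
def binary_to_array(b):
    '''Bson binary back to numpy array'''
    return pickle.loads(b)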
def post(self):
    from app.base.document import ImageDocument

    form = NodeAvatarSetForm(self.request.arguments)
    if not form.validate() or 'avatar' not in self.request.files:
        raise HTTPError(404)

    node_id = form.node_id.data
    x = form.x.data
    y = form.y.data
    w = form.w.data
    h = form.h.data
    target_width = form.target_width.data

    node = yield NodeDocument.find_one({'_id': ObjectId(node_id)})
    if not node:
        raise HTTPError(404)

    upload_file = self.request.files['avatar'][0]
    now = datetime.now()

    document = {
        'name': upload_file['filename'],
        'body': Binary(upload_file['body']),
        'content_type': upload_file['content_type'].split('/')[1].upper(),
        'uploader': DBRef(UserDocument.meta['collection'],
                          ObjectId(self.current_user['_id'])),
        'upload_time': now
    }

    image = Image.open(StringIO(upload_file['body']))
    if image.size[0] < target_width:
        target_width = image.size[0]

    scale = image.size[0] * 1.0 / target_width
    x = int(x * scale)
    y = int(y * scale)
    w = int(w * scale)
    h = int(h * scale)
    box = (x, y, x + w, y + h)

    image = image.crop(box)
    output = StringIO()
    image = image.resize((64, 64), Image.ANTIALIAS).save(
        output, document['content_type'], quality=100)
    document.update({'thumbnail': Binary(output.getvalue())})
    output.close()

    yield NodeAvatarDocument.remove_one({
        'node': DBRef(NodeDocument.meta['collection'], ObjectId(node_id))
    })

    image_id = yield ImageDocument.insert(document)

    document = {
        'node': DBRef(NodeDocument.meta['collection'], ObjectId(node_id)),
        'image': DBRef(ImageDocument.meta['collection'], ObjectId(image_id)),
        'uploader': DBRef(UserDocument.meta['collection'],
                          ObjectId(self.current_user['_id'])),
        'upload_time': now
    }
    yield NodeAvatarDocument.insert(document)
    self.finish()
def _element_to_bson(key, value, check_keys, uuid_subtype):
    if not isinstance(key, basestring):
        raise InvalidDocument("documents must have only string keys, "
                              "key was %r" % key)

    if check_keys:
        if key.startswith("$"):
            raise InvalidDocument("key %r must not start with '$'" % key)
        if "." in key:
            raise InvalidDocument("key %r must not contain '.'" % key)

    name = _make_c_string(key, True)
    if isinstance(value, float):
        return BSONNUM + name + struct.pack("<d", value)

    if _use_uuid:
        if isinstance(value, uuid.UUID):
            # Java Legacy
            if uuid_subtype == JAVA_LEGACY:
                # Python 3.0(.1) returns a bytearray instance for bytes (3.1
                # and newer just return a bytes instance). Convert that to
                # binary_type (here and below) for compatibility.
                from_uuid = binary_type(value.bytes)
                as_legacy_java = from_uuid[0:8][::-1] + from_uuid[8:16][::-1]
                value = Binary(as_legacy_java, subtype=OLD_UUID_SUBTYPE)
            # C# legacy
            elif uuid_subtype == CSHARP_LEGACY:
                # Microsoft GUID representation.
                value = Binary(binary_type(value.bytes_le),
                               subtype=OLD_UUID_SUBTYPE)
            # Python
            else:
                value = Binary(binary_type(value.bytes), subtype=uuid_subtype)

    if isinstance(value, Binary):
        subtype = value.subtype
        if subtype == 2:
            value = struct.pack("<i", len(value)) + value
        return (BSONBIN + name +
                struct.pack("<i", len(value)) + b(chr(subtype)) + value)
    if isinstance(value, Code):
        cstring = _make_c_string(value)
        if not value.scope:
            length = struct.pack("<i", len(cstring))
            return BSONCOD + name + length + cstring
        scope = _dict_to_bson(value.scope, False, uuid_subtype, False)
        full_length = struct.pack("<i", 8 + len(cstring) + len(scope))
        length = struct.pack("<i", len(cstring))
        return BSONCWS + name + full_length + length + cstring + scope
    if isinstance(value, binary_type):
        if PY3:
            # Python3 special case. Store 'bytes' as BSON binary subtype 0.
            return (BSONBIN + name +
                    struct.pack("<i", len(value)) + ZERO + value)
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return BSONSTR + name + length + cstring
    if isinstance(value, unicode):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return BSONSTR + name + length + cstring
    if isinstance(value, dict):
        return BSONOBJ + name + _dict_to_bson(value, check_keys,
                                              uuid_subtype, False)
    if isinstance(value, (list, tuple)):
        as_dict = SON(zip([str(i) for i in range(len(value))], value))
        return BSONARR + name + _dict_to_bson(as_dict, check_keys,
                                              uuid_subtype, False)
    if isinstance(value, ObjectId):
        return BSONOID + name + value.binary
    if value is True:
        return BSONBOO + name + ONE
    if value is False:
        return BSONBOO + name + ZERO
    if isinstance(value, int):
        # TODO this is an ugly way to check for this...
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        if value > MAX_INT32 or value < MIN_INT32:
            return BSONLON + name + struct.pack("<q", value)
        return BSONINT + name + struct.pack("<i", value)
    # 2to3 will convert long to int here since there is no long in python3.
    # That's OK. The previous if block will match instead.
    if isinstance(value, long):
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        return BSONLON + name + struct.pack("<q", value)
    if isinstance(value, datetime.datetime):
        if value.utcoffset() is not None:
            value = value - value.utcoffset()
        millis = int(calendar.timegm(value.timetuple()) * 1000 +
                     value.microsecond / 1000)
        return BSONDAT + name + struct.pack("<q", millis)
    if isinstance(value, Timestamp):
        time = struct.pack("<I", value.time)
        inc = struct.pack("<I", value.inc)
        return BSONTIM + name + inc + time
    if value is None:
        return BSONNUL + name
    if isinstance(value, (RE_TYPE, Regex)):
        pattern = value.pattern
        flags = ""
        if value.flags & re.IGNORECASE:
            flags += "i"
        if value.flags & re.LOCALE:
            flags += "l"
        if value.flags & re.MULTILINE:
            flags += "m"
        if value.flags & re.DOTALL:
            flags += "s"
        if value.flags & re.UNICODE:
            flags += "u"
        if value.flags & re.VERBOSE:
            flags += "x"
        return BSONRGX + name + _make_c_string(pattern, True) + \
            _make_c_string(flags)
    if isinstance(value, DBRef):
        return _element_to_bson(key, value.as_doc(), False, uuid_subtype)
    if isinstance(value, MinKey):
        return BSONMIN + name
    if isinstance(value, MaxKey):
        return BSONMAX + name

    raise InvalidDocument("cannot convert value of type %s to bson" %
                          type(value))
def create_model(self, applicationName, model='default', query=None,
                 project=None, labels={}, transduction={}, colors={},
                 drop=False):
    '''
    Creates a trained model by querying the corresponding collection and
    fitting the corresponding pipeline for the application. Clustering is
    also run, and the resulting dendrogram and fitted pipeline are stored
    in the _models collection.
    '''
    query = from_str(query)
    project = from_str(project)

    # convert labels to tokens
    tokens = defaultdict(lambda: len(tokens))
    token_labels = {k: tokens[v] for k, v in transduction.items()}

    if self.verbose:
        print(f'Finding application <{applicationName}>', end='...', flush=True)
    application = self.db.applications_\
        .find_one({'_id': applicationName})
    if application:
        # pipelineName = application['pipeline']
        # if self.verbose:
        #     print(f'OK\nFinding pipeline <{pipelineName}>', end='...', flush=True)
        # pipeline = self.db.pipelines_\
        #     .find_one({'_id': pipelineName})
        collectionName = application['collection']
        collection = self.db[collectionName]
        if self.verbose:
            print(f'OK\nQuerying collection <{collectionName}> <{query}>',
                  end='...', flush=True)
        X = list(collection.find(query or {}))
        if len(X):
            print(f'found {len(X)}...OK')
            y = [token_labels.get(xi['_id'], -1) for xi in X]
            index = [x['_id'] for x in X]
            if self.verbose:
                print('Transforming data', end='...', flush=True)
            umap = UMAP()
            base_pipeline = pydoc.locate(application['pipeline'])
            pipeline = Pipeline(base_pipeline.steps + [('umap', umap)])\
                .set_params(**application.get('params', {}))
            X_transform = pipeline.fit_transform(X, y)
            if self.verbose:
                print('OK\nClustering data', end='...', flush=True)
            parents, costs = cluster(X_transform, connectivity=umap.graph_,
                                     linkage='ward')
            if self.verbose:
                print('OK')
            obj_computed = {
                'pipeline': Binary(pickle.dumps(pipeline)),
                'parents': parents.tolist(),
                'costs': costs.tolist(),
                'instances': index,
                'X': X_transform.tolist(),
                'tokens': sorted(tokens, key=tokens.get),
            }
            if project and project != {}:
                if self.verbose:
                    print(f'Projecting data for histograms {project}',
                          end='...', flush=True)
                data = collection.aggregate([
                    {'$match': {'_id': {'$in': index}}},
                    {'$project': project}
                ])
                df = pd.DataFrame(list(data))\
                    .set_index('_id')\
                    .loc[index]
                obj_computed['hist'] = df.fillna(df.median(axis=0))\
                    .to_dict(orient='list')
                if self.verbose:
                    print('OK')
            obj = {
                '_id': {'application': applicationName, 'model': model},
                'labels': labels,
                'colors': colors,
                'date': datetime.datetime.utcnow(),
                'query': json.dumps(query, indent=2),
                'project': json.dumps(project, indent=2),
                'size': len(index),
                '_id_computed': self.transduction_.put(
                    bson.BSON.encode(obj_computed))
            }
            if drop:
                for doc in self.db.transduction_.find({'_id': obj['_id']}):
                    self.transduction_.delete(doc['_id_computed'])
                    self.db.transduction_.delete_one({'_id': doc['_id']})
            self.db.transduction_\
                .insert_one(obj)
            if self.verbose:
                print('done.')
            obj.update(obj_computed)
            return obj
def _capture_screenshot(self):
    # bloody selenium, how I hate it
    # pain
    # suffering
    # zero documentation
    img_byte_arr = BytesIO()
    url = 'http://%s:%s/' % (self._host["ip"], self._host["port"])
    self._host["data"]["screenshot"] = None
    self._logger.debug("Obtaining driver")
    driver = None
    img = None
    try:
        caps = dict(DesiredCapabilities.CHROME)
        caps['args'] = ["--proxy-server",
                        "socks5://%s:9050" % cnf.stalker.proxy]
        driver = webdriver.Remote(
            command_executor="http://%s:4444/wd/hub" % cnf.stalker.HTTP.screenshots.selenium,
            desired_capabilities=caps)
        driver.set_window_size(cnf.stalker.HTTP.screenshots.width,
                               cnf.stalker.HTTP.screenshots.height)
        driver.set_page_load_timeout(cnf.stalker.HTTP.screenshots.load_timeout)
        driver.set_script_timeout(cnf.stalker.HTTP.screenshots.script_timeout)
        self._logger.debug("Loading %s:%s", self._host['ip'], self._host['port'])
        driver.get(url)
        time.sleep(cnf.stalker.HTTP.screenshots.pause)
        driver.execute_script("window.scrollTo(0, 0);")
        img = driver.get_screenshot_as_png()
    except Exception as e:
        raise e
    finally:
        if driver:
            driver.quit()
    self._logger.debug("Finished for %s:%s", self._host['ip'], self._host['port'])
    with Image.open(BytesIO(img)) as img:
        img = img.crop((0, 0, cnf.stalker.HTTP.screenshots.width,
                        cnf.stalker.HTTP.screenshots.height))
        extrema = img.convert("L").getextrema()
        if not extrema == (0, 0):
            img.save(img_byte_arr, format='PNG')
            self._host["data"]["screenshot"] = Binary(
                zlib.compress(img_byte_arr.getvalue()))
            img_byte_arr.close()
            self._logger.debug("Saved screen of %s:%s (e: %s)" %
                               (self._host["ip"], self._host["port"], extrema))
        else:
            self._logger.debug("Not saving screen of %s:%s, as it is empty",
                               self._host["ip"], self._host["port"])
def _element_to_bson(key, value, check_keys, uuid_subtype):
    if not isinstance(key, basestring):
        raise InvalidDocument("documents must have only string keys, "
                              "key was %r" % key)

    if check_keys:
        if key.startswith("$"):
            raise InvalidDocument("key %r must not start with '$'" % key)
        if "." in key:
            raise InvalidDocument("key %r must not contain '.'" % key)

    name = _make_c_string(key, True)
    if isinstance(value, float):
        return "\x01" + name + struct.pack("<d", value)

    # Use Binary w/ subtype 3 for UUID instances
    if _use_uuid:
        if isinstance(value, uuid.UUID):
            value = Binary(value.bytes, subtype=uuid_subtype)

    if isinstance(value, Binary):
        subtype = value.subtype
        if subtype == 2:
            value = struct.pack("<i", len(value)) + value
        return "\x05%s%s%s%s" % (name, struct.pack("<i", len(value)),
                                 chr(subtype), value)
    if isinstance(value, Code):
        cstring = _make_c_string(value)
        if not value.scope:
            length = struct.pack("<i", len(cstring))
            return "\x0D" + name + length + cstring
        scope = _dict_to_bson(value.scope, False, uuid_subtype, False)
        full_length = struct.pack("<i", 8 + len(cstring) + len(scope))
        length = struct.pack("<i", len(cstring))
        return "\x0F" + name + full_length + length + cstring + scope
    if isinstance(value, str):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return "\x02" + name + length + cstring
    if isinstance(value, unicode):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return "\x02" + name + length + cstring
    if isinstance(value, dict):
        return "\x03" + name + _dict_to_bson(value, check_keys,
                                             uuid_subtype, False)
    if isinstance(value, (list, tuple)):
        as_dict = SON(zip([str(i) for i in range(len(value))], value))
        return "\x04" + name + _dict_to_bson(as_dict, check_keys,
                                             uuid_subtype, False)
    if isinstance(value, ObjectId):
        return "\x07" + name + value.binary
    if value is True:
        return "\x08" + name + "\x01"
    if value is False:
        return "\x08" + name + "\x00"
    if isinstance(value, int):
        # TODO this is an ugly way to check for this...
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        if value > MAX_INT32 or value < MIN_INT32:
            return "\x12" + name + struct.pack("<q", value)
        return "\x10" + name + struct.pack("<i", value)
    if isinstance(value, long):  # XXX No long type in Python 3
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        return "\x12" + name + struct.pack("<q", value)
    if isinstance(value, datetime.datetime):
        if value.utcoffset() is not None:
            value = value - value.utcoffset()
        millis = int(calendar.timegm(value.timetuple()) * 1000 +
                     value.microsecond / 1000)
        return "\x09" + name + struct.pack("<q", millis)
    if isinstance(value, Timestamp):
        time = struct.pack("<I", value.time)
        inc = struct.pack("<I", value.inc)
        return "\x11" + name + inc + time
    if value is None:
        return "\x0A" + name
    if isinstance(value, RE_TYPE):
        pattern = value.pattern
        flags = ""
        if value.flags & re.IGNORECASE:
            flags += "i"
        if value.flags & re.LOCALE:
            flags += "l"
        if value.flags & re.MULTILINE:
            flags += "m"
        if value.flags & re.DOTALL:
            flags += "s"
        if value.flags & re.UNICODE:
            flags += "u"
        if value.flags & re.VERBOSE:
            flags += "x"
        return "\x0B" + name + _make_c_string(pattern, True) + \
            _make_c_string(flags)
    if isinstance(value, DBRef):
        return _element_to_bson(key, value.as_doc(), False, uuid_subtype)
    if isinstance(value, MinKey):
        return "\xFF" + name
    if isinstance(value, MaxKey):
        return "\x7F" + name

    raise InvalidDocument("cannot convert value of type %s to bson" %
                          type(value))
def test_read_object_backwards_compat():
    self = create_autospec(PickleStore)
    version = {'blob': Binary(compressHC(cPickle.dumps(object)))}
    assert PickleStore.read(self, sentinel.arctic_lib, version,
                            sentinel.symbol) == object
def write(self, symbol, item, chunker=DateChunker(), **kwargs):
    """
    Writes data from item to symbol in the database

    Parameters
    ----------
    symbol: str
        the symbol that will be used to reference the written data
    item: Dataframe or Series
        the data to write to the database
    chunker: Object of type Chunker
        A chunker that chunks the data in item
    kwargs:
        optional keyword args that are passed to the chunker. Includes:
        chunk_size:
            used by chunker to break data into discrete chunks.
            see specific chunkers for more information about this param.
    """
    if not isinstance(item, (DataFrame, Series)):
        raise Exception("Can only chunk DataFrames and Series")

    self._arctic_lib.check_quota()

    previous_shas = []
    doc = {}

    doc[SYMBOL] = symbol
    doc[LEN] = len(item)
    doc[SERIALIZER] = self.serializer.TYPE
    doc[CHUNKER] = chunker.TYPE

    sym = self._get_symbol_info(symbol)
    if sym:
        previous_shas = set([Binary(x[SHA]) for x in self._collection.find(
            {SYMBOL: symbol},
            projection={SHA: True, '_id': False},
        )])

    op = False
    bulk = self._collection.initialize_unordered_bulk_op()
    chunk_count = 0

    for start, end, chunk_size, record in chunker.to_chunks(item, **kwargs):
        chunk_count += 1
        data = self.serializer.serialize(record)
        doc[METADATA] = {'columns': data[METADATA][COLUMNS]
                         if COLUMNS in data[METADATA] else ''}
        doc[CHUNK_SIZE] = chunk_size

        size_chunked = len(data[DATA]) > MAX_CHUNK_SIZE
        for i in xrange(int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)):
            chunk = {DATA: Binary(
                data[DATA][i * MAX_CHUNK_SIZE:(i + 1) * MAX_CHUNK_SIZE])}
            chunk[METADATA] = data[METADATA]
            if size_chunked:
                chunk[SEGMENT] = i
            else:
                chunk[SEGMENT] = -1
            chunk[START] = start
            chunk[END] = end
            chunk[SYMBOL] = symbol
            dates = [chunker.chunk_to_str(start),
                     chunker.chunk_to_str(end),
                     str(chunk[SEGMENT]).encode('ascii')]
            chunk[SHA] = self._checksum(dates, chunk[DATA])
            if chunk[SHA] not in previous_shas:
                op = True
                find = {SYMBOL: symbol,
                        START: start,
                        END: end,
                        SEGMENT: chunk[SEGMENT]}
                bulk.find(find).upsert().update_one({'$set': chunk})
            else:
                # already exists, dont need to update in mongo
                previous_shas.remove(chunk[SHA])
    if op:
        bulk.execute()

    doc[CHUNK_COUNT] = chunk_count
    doc[APPEND_COUNT] = 0

    if previous_shas:
        mongo_retry(self._collection.delete_many)(
            {SYMBOL: symbol, SHA: {'$in': list(previous_shas)}})

    mongo_retry(self._symbols.update_one)({SYMBOL: symbol},
                                          {'$set': doc},
                                          upsert=True)
def test_uuid_representation(self):
    coll = self.db.uuid
    coll.drop()

    # Test property
    self.assertEqual(UuidRepresentation.UNSPECIFIED,
                     coll.codec_options.uuid_representation)

    # Test basic query
    uu = uuid.uuid4()
    # Insert as binary subtype 3
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
    legacy_opts = coll.codec_options
    coll.insert_one({'uu': uu})
    self.assertEqual(uu, coll.find_one({'uu': uu})['uu'])
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=STANDARD))
    self.assertEqual(STANDARD, coll.codec_options.uuid_representation)
    self.assertEqual(None, coll.find_one({'uu': uu}))
    uul = Binary.from_uuid(uu, PYTHON_LEGACY)
    self.assertEqual(uul, coll.find_one({'uu': uul})['uu'])

    # Test count_documents
    self.assertEqual(0, coll.count_documents({'uu': uu}))
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
    self.assertEqual(1, coll.count_documents({'uu': uu}))

    # Test delete
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=STANDARD))
    coll.delete_one({'uu': uu})
    self.assertEqual(1, coll.count_documents({}))
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
    coll.delete_one({'uu': uu})
    self.assertEqual(0, coll.count_documents({}))

    # Test update_one
    coll.insert_one({'_id': uu, 'i': 1})
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=STANDARD))
    coll.update_one({'_id': uu}, {'$set': {'i': 2}})
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
    self.assertEqual(1, coll.find_one({'_id': uu})['i'])
    coll.update_one({'_id': uu}, {'$set': {'i': 2}})
    self.assertEqual(2, coll.find_one({'_id': uu})['i'])

    # Test Cursor.distinct
    self.assertEqual([2], coll.find({'_id': uu}).distinct('i'))
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=STANDARD))
    self.assertEqual([], coll.find({'_id': uu}).distinct('i'))

    # Test findAndModify
    self.assertEqual(None, coll.find_one_and_update({'_id': uu},
                                                    {'$set': {'i': 5}}))
    coll = self.db.get_collection(
        "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
    self.assertEqual(2, coll.find_one_and_update({'_id': uu},
                                                 {'$set': {'i': 5}})['i'])
    self.assertEqual(5, coll.find_one({'_id': uu})['i'])

    # Test command
    self.assertEqual(5, self.db.command(
        'findAndModify', 'uuid',
        update={'$set': {'i': 6}},
        query={'_id': uu},
        codec_options=legacy_opts)['value']['i'])
    self.assertEqual(6, self.db.command(
        'findAndModify', 'uuid',
        update={'$set': {'i': 7}},
        query={'_id': Binary.from_uuid(uu, PYTHON_LEGACY)})['value']['i'])
def __update(self, sym, item, combine_method=None, chunk_range=None):
    '''
    helper method used by update and append since they very closely
    resemble each other. They really differ only by the combine method:
    append will combine existing data with new data (within a chunk),
    whereas update will replace existing data with new data (within a
    chunk).
    '''
    if not isinstance(item, (DataFrame, Series)):
        raise Exception("Can only chunk DataFrames and Series")

    self._arctic_lib.check_quota()

    symbol = sym[SYMBOL]

    if chunk_range is not None:
        self.delete(symbol, chunk_range)
        sym = self._get_symbol_info(symbol)

    bulk = self._collection.initialize_unordered_bulk_op()
    op = False
    chunker = CHUNKER_MAP[sym[CHUNKER]]

    for start, end, _, record in chunker.to_chunks(item, chunk_size=sym[CHUNK_SIZE]):
        # read out matching chunks
        df = self.read(symbol, chunk_range=chunker.to_range(start, end),
                       filter_data=False)
        # assuming they exist, update them and store the original chunk
        # range for later use
        if len(df) > 0:
            record = combine_method(df, record)
            if record is None or record.equals(df):
                continue

            sym[APPEND_COUNT] += len(record)
            sym[LEN] += len(record) - len(df)
        else:
            sym[CHUNK_COUNT] += 1
            sym[LEN] += len(record)

        data = SER_MAP[sym[SERIALIZER]].serialize(record)
        op = True

        # remove old segments for this chunk in case we now have less
        # segments than we did before
        chunk_count = int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)
        seg_count = self._collection.count({SYMBOL: symbol,
                                            START: start,
                                            END: end})
        if seg_count > chunk_count:
            # if chunk count is 1, the segment id will be -1, not 1
            self._collection.delete_many(
                {SYMBOL: symbol,
                 START: start,
                 END: end,
                 SEGMENT: {'$gt': seg_count if chunk_count > 1 else -1}})

        size_chunked = chunk_count > 1
        for i in xrange(chunk_count):
            chunk = {DATA: Binary(
                data[DATA][i * MAX_CHUNK_SIZE:(i + 1) * MAX_CHUNK_SIZE])}
            chunk[METADATA] = data[METADATA]
            if size_chunked:
                chunk[SEGMENT] = i
            else:
                chunk[SEGMENT] = -1
            chunk[START] = start
            chunk[END] = end
            chunk[SYMBOL] = symbol
            dates = [chunker.chunk_to_str(start),
                     chunker.chunk_to_str(end),
                     str(chunk[SEGMENT]).encode('ascii')]
            sha = self._checksum(dates, data[DATA])
            chunk[SHA] = sha
            bulk.find({SYMBOL: symbol,
                       START: start,
                       END: end,
                       SEGMENT: chunk[SEGMENT]}).upsert().update_one({'$set': chunk})
    if op:
        bulk.execute()

    self._symbols.replace_one({SYMBOL: symbol}, sym)
def train_model(recom_vars_obj):
    '''
    This function takes the variable holder class as input and then loads
    the data for both latent matrices from the database. It checks each
    user and, if any new user is found, adds rows and columns for them in
    the corresponding U and M matrices; the new values are drawn from a
    normal Gaussian centered at 0 with S.D. 1.
    After loading the data, it calls the gradient descent function
    matrix_factorisation, which returns the updated matrices U and M.
    It then loads the updated values into the database.
    '''
    movie_counter = 0
    if recom_vars_obj.np_arrays.find({'name': 'U'}).count() > 0:
        U = pickle.loads(
            recom_vars_obj.np_arrays.find({'name': 'U'}).next()['matrix'])
        M = pickle.loads(
            recom_vars_obj.np_arrays.find({'name': 'M'}).next()['matrix'])
        user_counter = len(U)
        movie_counter = len(M)
        for movie in recom_vars_obj.movies.find():
            try:
                recom_vars_obj.movie_index_dict[movie['_id']] = movie['matrix_index']
            except:
                recom_vars_obj.movie_index_dict[movie['_id']] = movie_counter
                recom_vars_obj.movies.update(
                    {'_id': movie['_id']},
                    {'$set': {'matrix_index': movie_counter}})
                M = np.vstack([M, np.random.normal(0, 1, [1, k])])
                movie_counter += 1
        for user in recom_vars_obj.users.find():
            try:
                recom_vars_obj.user_index_dict[user['_id']] = user['matrix_index']
            except:
                recom_vars_obj.user_index_dict[user['_id']] = user_counter
                recom_vars_obj.users.update(
                    {'_id': user['_id']},
                    {'$set': {'matrix_index': user_counter}})
                U = np.vstack([U, np.random.normal(0, 1, [1, k])])
                user_counter += 1
        recom_vars_obj.ratings_array = np.empty((user_counter, movie_counter))
        recom_vars_obj.ratings_array.fill(np.nan)
        rating_indices = []
        for rating in recom_vars_obj.ratings.find():
            rating_indices.append([
                recom_vars_obj.user_index_dict[rating['user_id']],
                recom_vars_obj.movie_index_dict[rating['movie_id']]
            ])
            recom_vars_obj.ratings_array[
                recom_vars_obj.user_index_dict[rating['user_id']],
                recom_vars_obj.movie_index_dict[rating['movie_id']]] = rating['rating']
        new_U, new_M = matrix_factorisation(recom_vars_obj.ratings_array,
                                            user_counter, movie_counter,
                                            rating_indices, U, M)
    else:
        for movie in recom_vars_obj.movies.find():
            recom_vars_obj.movie_index_dict[movie['_id']] = movie_counter
            recom_vars_obj.movies.update(
                {'_id': movie['_id']},
                {'$set': {'matrix_index': movie_counter}})
            movie_counter += 1
        user_counter = 0
        for user in recom_vars_obj.users.find():
            recom_vars_obj.user_index_dict[user['_id']] = user_counter
            recom_vars_obj.users.update(
                {'_id': user['_id']},
                {'$set': {'matrix_index': user_counter}})
            user_counter += 1
        recom_vars_obj.ratings_array = np.empty((user_counter, movie_counter))
        recom_vars_obj.ratings_array.fill(np.nan)
        rating_indices = []
        print("4")
        for rating in recom_vars_obj.ratings.find():
            rating_indices.append([
                recom_vars_obj.user_index_dict[rating['user_id']],
                recom_vars_obj.movie_index_dict[rating['movie_id']]
            ])
            recom_vars_obj.ratings_array[
                recom_vars_obj.user_index_dict[rating['user_id']],
                recom_vars_obj.movie_index_dict[rating['movie_id']]] = rating['rating']
        new_U, new_M = matrix_factorisation(recom_vars_obj.ratings_array,
                                            user_counter, movie_counter,
                                            rating_indices)

    recom_vars_obj.np_arrays.update(
        {'name': 'U'},
        {'$set': {'matrix': Binary(pickle.dumps(new_U))}},
        upsert=True)
    # Use the updated matrices for the length bookkeeping: in the first-run
    # branch above, U and M are never defined, so len(U)/len(M) would raise
    # a NameError.
    recom_vars_obj.np_arrays.update(
        {'name': 'U_len'},
        {'$set': {'value': len(new_U)}},
        upsert=True)
    recom_vars_obj.np_arrays.update(
        {'name': 'M'},
        {'$set': {'matrix': Binary(pickle.dumps(new_M))}},
        upsert=True)
    recom_vars_obj.np_arrays.update(
        {'name': 'M_len'},
        {'$set': {'value': len(new_M)}},
        upsert=True)
if z in ["_id","title"]: continue for k in j: #k[0]->term_id k[1]->doc_id k[2]->term_freq A[k[0]][k[1]]=k[2] #print linalg.det(A) print A #n=6;m=4 term_freq=[sum(A[i]) for i in xrange(m)] entropy=[] for i in xrange(m): s=0 for j in xrange(n): pij=A[i][j]/term_freq[i] if pij==0: continue s+=pij*np.log10(pij) s=s/(np.log10(n)) entropy.append(s+1) print "\nentropy\n",entropy,"\n" A=np.array([ [entropy[i]*np.log10(A[i][j]+1) for j in xrange(n)] for i in xrange(m)]) print A val=(Binary(pickle.dumps(A, protocol=2), subtype=128 )) mycol = mydb["log_en"] mycol.insert({"key":val})
def write(self, symbol, item, chunk_size):
    """
    Writes data from item to symbol in the database

    Parameters
    ----------
    symbol: str
        the symbol that will be used to reference the written data
    item: dataframe or series
        the data to write to the database
    chunk_size: ?
        A chunk size that is understood by the specified chunker
    """
    doc = {}
    doc['symbol'] = symbol
    doc['chunk_size'] = chunk_size

    if isinstance(item, Series):
        doc['type'] = SeriesSerializer.TYPE
    elif isinstance(item, DataFrame):
        doc['type'] = DataFrameSerializer.TYPE
    else:
        raise Exception("Can only chunk Series and DataFrames")

    previous_shas = []
    if self._get_symbol_info(symbol):
        previous_shas = set([Binary(x['sha']) for x in self._collection.find(
            {'symbol': symbol},
            projection={'sha': True, '_id': False},
        )])

    records = []
    ranges = []
    dtype = None

    for start, end, record in self.chunker.to_chunks(item, chunk_size):
        r, dtype = serialize(record, string_max_len=self.STRING_MAX)
        records.append(r)
        ranges.append((start, end))

    item = np.array([r for record in records for r in record]).flatten()
    for record in records:
        if record.dtype.hasobject:
            raise UnhandledDtypeException()

    doc['dtype'] = str(dtype)
    doc['shape'] = (-1,) + item.shape[1:]
    doc['dtype_metadata'] = dict(dtype.metadata or {})
    doc['len'] = len(item)

    chunks = [r.tostring() for r in records]
    chunks = compress_array(chunks)

    op = False
    bulk = self._collection.initialize_unordered_bulk_op()
    for chunk, rng in zip(chunks, ranges):
        start = rng[0]
        end = rng[1]

        chunk = {'data': Binary(chunk)}
        chunk['start'] = start
        chunk['end'] = end
        chunk['symbol'] = symbol
        chunk['sha'] = checksum(symbol, chunk)

        if chunk['sha'] not in previous_shas:
            op = True
            bulk.find({'symbol': symbol,
                       'sha': chunk['sha']},
                      ).upsert().update_one({'$set': chunk})
        else:
            # already exists, dont need to update in mongo
            previous_shas.remove(chunk['sha'])
    if op:
        bulk.execute()

    doc['chunk_count'] = len(chunks)
    doc['append_size'] = 0
    doc['append_count'] = 0

    if previous_shas:
        mongo_retry(self._collection.delete_many)(
            {'sha': {'$in': list(previous_shas)}})

    mongo_retry(self._symbols.update_one)({'symbol': symbol},
                                          {'$set': doc},
                                          upsert=True)
def __setitem__(self, url, result):
    record = {'result': Binary(zlib.compress(pickle.dumps(result))),
              'timestamp': datetime.utcnow()}
    self.db.webpage.update({'_id': url}, {'$set': record}, upsert=True)
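# A hedged __getitem__ counterpart for the cache above, mirroring the field
# names used by __setitem__ and dict-style KeyError semantics on a miss.
def __getitem__(self, url):
    record = self.db.webpage.find_one({'_id': url})
    if record:
        return pickle.loads(zlib.decompress(record['result']))
    raise KeyError(url + ' does not exist')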
def update(self, symbol, item):
    """
    Merges data from item onto existing data in the database for symbol.
    Data that exists in both symbol and item for the same
    index/multiindex will be overwritten by the data in item.

    Parameters
    ----------
    symbol: str
        the symbol for the given item in the DB
    item:
        the data to update
    """
    sym = self._get_symbol_info(symbol)
    if not sym:
        raise NoDataFoundException("Symbol does not exist. Cannot update")

    records = []
    ranges = []
    orig_ranges = []
    for start, end, record in self.chunker.to_chunks(item, sym['chunk_size']):
        # read out matching chunks
        df = self.read(symbol, chunk_range=self.chunker.to_range(start, end))
        # assuming they exist, update them and store the original chunk
        # range for later use
        if not df.empty:
            if df.equals(record):
                continue
            record = record.combine_first(df)
            orig_ranges.append((self.chunker.to_start_end(record)))
        else:
            orig_ranges.append((None, None))

        r, _ = serialize(record, string_max_len=self.STRING_MAX)
        records.append(r)
        ranges.append((start, end))

    if len(records) > 0:
        chunks = [r.tostring() for r in records]
        lens = [len(i) for i in chunks]
        chunks = compress_array(chunks)

        seg_count = 0
        seg_len = 0

        bulk = self._collection.initialize_unordered_bulk_op()
        for chunk, rng, orig_rng, rec_len in zip(chunks, ranges, orig_ranges, lens):
            start = rng[0]
            end = rng[1]
            orig_start = orig_rng[0]
            if orig_start is None:
                sym['len'] += rec_len
                seg_count += 1
                seg_len += rec_len

            segment = {'data': Binary(chunk)}
            segment['start'] = start
            segment['end'] = end
            sha = checksum(symbol, segment)
            segment['sha'] = sha
            if orig_start is None:
                # new chunk
                bulk.find({'symbol': symbol,
                           'sha': sha,
                           'start': segment['start']}
                          ).upsert().update_one({'$set': segment})
            else:
                bulk.find({'symbol': symbol,
                           'start': orig_start}
                          ).update_one({'$set': segment})
        if len(chunks) > 0:
            bulk.execute()

        if seg_count != 0:
            sym['chunk_count'] += seg_count
            sym['append_size'] += seg_len
            sym['append_count'] += seg_count
        self._symbols.replace_one({'symbol': symbol}, sym)
def run_operation(self, sessions, collection, operation):
    original_collection = collection
    name = camel_to_snake(operation['name'])
    if name == 'run_command':
        name = 'command'
    elif name == 'download_by_name':
        name = 'open_download_stream_by_name'
    elif name == 'download':
        name = 'open_download_stream'

    database = collection.database
    collection = database.get_collection(collection.name)
    if 'collectionOptions' in operation:
        collection = collection.with_options(
            **self.parse_options(operation['collectionOptions']))

    object_name = self.get_object_name(operation)
    if object_name == 'gridfsbucket':
        # Only create the GridFSBucket when we need it (for the gridfs
        # retryable reads tests).
        obj = GridFSBucket(database, bucket_name=collection.name,
                           disable_md5=True)
    else:
        objects = {
            'client': database.client,
            'database': database,
            'collection': collection,
            'testRunner': self
        }
        objects.update(sessions)
        obj = objects[object_name]

    # Combine arguments with options and handle special cases.
    arguments = operation.get('arguments', {})
    arguments.update(arguments.pop("options", {}))
    self.parse_options(arguments)

    cmd = getattr(obj, name)

    with_txn_callback = functools.partial(self.run_operations, sessions,
                                          original_collection,
                                          in_with_transaction=True)
    prepare_spec_arguments(operation, arguments, name, sessions,
                           with_txn_callback)

    if name == 'run_on_thread':
        args = {'sessions': sessions, 'collection': collection}
        args.update(arguments)
        arguments = args
    result = cmd(**dict(arguments))

    if name == "aggregate":
        if arguments["pipeline"] and "$out" in arguments["pipeline"][-1]:
            # Read from the primary to ensure causal consistency.
            out = collection.database.get_collection(
                arguments["pipeline"][-1]["$out"],
                read_preference=ReadPreference.PRIMARY)
            return out.find()
    if name == "map_reduce":
        if isinstance(result, dict) and 'results' in result:
            return result['results']
    if 'download' in name:
        result = Binary(result.read())

    if isinstance(result, Cursor) or isinstance(result, CommandCursor):
        return list(result)

    return result
def object_hook(dct, compile_re=True):
    if "$oid" in dct:
        return ObjectId(str(dct["$oid"]))
    if "$ref" in dct:
        return DBRef(dct["$ref"], dct["$id"], dct.get("$db", None))
    if "$date" in dct:
        dtm = dct["$date"]
        # mongoexport 2.6 and newer
        if isinstance(dtm, str):
            # datetime.datetime.strptime is new in python 2.5
            naive = datetime.datetime(
                *(time.strptime(dtm[:19], "%Y-%m-%dT%H:%M:%S")[0:6]))
            # The %f format is new in python 2.6
            micros = int(dtm[20:23]) * 1000
            aware = naive.replace(microsecond=micros, tzinfo=utc)
            offset = dtm[23:]
            if not offset or offset == 'Z':
                # UTC
                return aware
            else:
                if len(offset) == 5:
                    # Offset from mongoexport is in format (+|-)HHMM
                    secs = (int(offset[1:3]) * 3600 + int(offset[3:]) * 60)
                elif ':' in offset and len(offset) == 6:
                    # RFC-3339 format (+|-)HH:MM
                    hours, minutes = offset[1:].split(':')
                    secs = (int(hours) * 3600 + int(minutes) * 60)
                else:
                    # Not RFC-3339 compliant or mongoexport output.
                    raise ValueError("invalid format for offset")
                if offset[0] == "-":
                    secs *= -1
                return aware - datetime.timedelta(seconds=secs)
        # mongoexport 2.6 and newer, time before the epoch (SERVER-15275)
        elif isinstance(dtm, dict):
            secs = float(dtm["$numberLong"]) / 1000.0
        # mongoexport before 2.6
        else:
            secs = float(dtm) / 1000.0
        return EPOCH_AWARE + datetime.timedelta(seconds=secs)
    if "$regex" in dct:
        flags = 0
        # PyMongo always adds $options but some other tools may not.
        for opt in dct.get("$options", ""):
            flags |= _RE_OPT_TABLE.get(opt, 0)
        if compile_re:
            return re.compile(dct["$regex"], flags)
        else:
            return Regex(dct["$regex"], flags)
    if "$minKey" in dct:
        return MinKey()
    if "$maxKey" in dct:
        return MaxKey()
    if "$binary" in dct:
        if isinstance(dct["$type"], int):
            dct["$type"] = "%02x" % dct["$type"]
        subtype = int(dct["$type"], 16)
        if subtype >= 0xffffff80:  # Handle mongoexport values
            subtype = int(dct["$type"][6:], 16)
        return Binary(base64.b64decode(dct["$binary"].encode()), subtype)
    if "$code" in dct:
        return Code(dct["$code"], dct.get("$scope"))
    if bson.has_uuid() and "$uuid" in dct:
        return bson.uuid.UUID(dct["$uuid"])
    if "$undefined" in dct:
        return None
    if "$numberLong" in dct:
        # 2to3 will change this to int. PyMongo 3.0 supports
        # a new type, Int64, to avoid round trip issues.
        return int(dct["$numberLong"])
    if "$timestamp" in dct:
        tsp = dct["$timestamp"]
        return Timestamp(tsp["t"], tsp["i"])
    return dct
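# A hedged usage sketch: object_hook is designed to be handed to json.loads;
# the sample document below is illustrative.
import json
from bson.binary import Binary

doc = json.loads('{"blob": {"$binary": "aGVsbG8=", "$type": "00"}}',
                 object_hook=object_hook)
assert doc["blob"] == Binary(b"hello", 0)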
def run_operation(self, sessions, collection, operation):
    original_collection = collection
    name = camel_to_snake(operation['name'])
    if name == 'run_command':
        name = 'command'
    elif name == 'download_by_name':
        name = 'open_download_stream_by_name'
    elif name == 'download':
        name = 'open_download_stream'

    database = collection.database
    collection = database.get_collection(collection.name)
    if 'collectionOptions' in operation:
        collection = collection.with_options(
            **self.parse_options(operation['collectionOptions']))

    object_name = self.get_object_name(operation)
    if object_name == 'gridfsbucket':
        # Only create the GridFSBucket when we need it (for the gridfs
        # retryable reads tests).
        obj = GridFSBucket(database, bucket_name=collection.name,
                           disable_md5=True)
    else:
        objects = {
            'client': database.client,
            'database': database,
            'collection': collection,
            'testRunner': self
        }
        objects.update(sessions)
        obj = objects[object_name]

    # Combine arguments with options and handle special cases.
    arguments = operation.get('arguments', {})
    arguments.update(arguments.pop("options", {}))
    self.parse_options(arguments)

    cmd = getattr(obj, name)
    for arg_name in list(arguments):
        c2s = camel_to_snake(arg_name)
        # PyMongo accepts sort as list of tuples.
        if arg_name == "sort":
            sort_dict = arguments[arg_name]
            arguments[arg_name] = list(iteritems(sort_dict))
        # Named "key" instead not fieldName.
        if arg_name == "fieldName":
            arguments["key"] = arguments.pop(arg_name)
        # Aggregate uses "batchSize", while find uses batch_size.
        elif ((arg_name == "batchSize" or arg_name == "allowDiskUse") and
                name == "aggregate"):
            continue
        # Requires boolean returnDocument.
        elif arg_name == "returnDocument":
            arguments[c2s] = arguments.pop(arg_name) == "After"
        elif c2s == "requests":
            # Parse each request into a bulk write model.
            requests = []
            for request in arguments["requests"]:
                bulk_model = camel_to_upper_camel(request["name"])
                bulk_class = getattr(operations, bulk_model)
                bulk_arguments = camel_to_snake_args(request["arguments"])
                requests.append(bulk_class(**dict(bulk_arguments)))
            arguments["requests"] = requests
        elif arg_name == "session":
            arguments['session'] = sessions[arguments['session']]
        elif (name in ('command', 'run_admin_command') and
                arg_name == 'command'):
            # Ensure the first key is the command name.
            ordered_command = SON([(operation['command_name'], 1)])
            ordered_command.update(arguments['command'])
            arguments['command'] = ordered_command
        elif name == 'open_download_stream' and arg_name == 'id':
            arguments['file_id'] = arguments.pop(arg_name)
        elif name != 'find' and c2s == 'max_time_ms':
            # find is the only method that accepts snake_case max_time_ms.
            # All other methods take kwargs which must use the server's
            # camelCase maxTimeMS. See PYTHON-1855.
            arguments['maxTimeMS'] = arguments.pop('max_time_ms')
        elif name == 'with_transaction' and arg_name == 'callback':
            callback_ops = arguments[arg_name]['operations']
            arguments['callback'] = lambda _: self.run_operations(
                sessions, original_collection, copy.deepcopy(callback_ops),
                in_with_transaction=True)
        elif name == 'drop_collection' and arg_name == 'collection':
            arguments['name_or_collection'] = arguments.pop(arg_name)
        elif name == 'create_collection' and arg_name == 'collection':
            arguments['name'] = arguments.pop(arg_name)
        elif name == 'create_index' and arg_name == 'keys':
            arguments['keys'] = list(arguments.pop(arg_name).items())
        elif name == 'drop_index' and arg_name == 'name':
            arguments['index_or_name'] = arguments.pop(arg_name)
        else:
            arguments[c2s] = arguments.pop(arg_name)

    if name == 'run_on_thread':
        args = {'sessions': sessions, 'collection': collection}
        args.update(arguments)
        arguments = args
    result = cmd(**dict(arguments))

    if name == "aggregate":
        if arguments["pipeline"] and "$out" in arguments["pipeline"][-1]:
            # Read from the primary to ensure causal consistency.
            out = collection.database.get_collection(
                arguments["pipeline"][-1]["$out"],
                read_preference=ReadPreference.PRIMARY)
            return out.find()
    if name == "map_reduce":
        if isinstance(result, dict) and 'results' in result:
            return result['results']
    if 'download' in name:
        result = Binary(result.read())

    if isinstance(result, Cursor) or isinstance(result, CommandCursor):
        return list(result)

    return result