async def create_mongodb_doc(self, doc, mime_type, extension, mongodb_doc_id, mongodb):
    """Build the MongoDB record for one document and insert it.

    The document text is parsed with BeautifulSoup (restricted by
    ``self.only_html``) to extract the ``href`` of its ``singularity`` tag,
    which is stored as the record's URL next to the raw bytes of ``doc``.

    Args:
        doc: Document text; it is parsed as HTML and stored UTF-8 encoded.
        mime_type: Stored under ``mime_type``.
        extension: Stored under ``file_extension``.
        mongodb_doc_id: Used as the record's ``_id``.
        mongodb: Passed through unchanged to ``self.insert_mongodb_doc``.

    Returns:
        None. When the URL cannot be extracted the error is printed and
        nothing is inserted.
    """
    soup = BeautifulSoup(doc, 'html.parser', parse_only=self.only_html)
    try:
        url = soup.singularity['href']
    except (TypeError, KeyError) as e:
        # TypeError: no <singularity> tag was found, so the lookup indexed None.
        # KeyError: the tag exists but carries no href attribute.
        print(e)
        return

    data = {
        '_id': mongodb_doc_id,
        # TODO file_type to select dynamically
        'mime_type': mime_type,
        'url': url,
        'file_extension': extension,
        'file_data': Binary(bytes(doc, "utf-8")),
        # Stored as the string "False", not a boolean.
        'parsed': "False",
    }
    await self.insert_mongodb_doc("https3", data, mongodb)
def run(self):
    """Consume URLs from the ``m_sohu_task`` Redis list forever.

    Each URL not yet in the ``visited_urls`` set is fetched through
    ``self.spider``; a non-empty page is stored compressed and pickled in
    ``sohu_data_coll`` (keyed by a hash of the URL) unless a document with
    that id already exists, and is then handed to ``self.spider.parse``.
    ``self.spider.status`` is WORKING while a URL is handled and IDLE
    afterwards.
    """
    while True:
        # Block on the server instead of hammering it with LPOP in a tight
        # loop; the short timeout just lets the loop come round again.
        popped = redis_client.blpop('m_sohu_task', timeout=1)
        if not popped or not popped[1]:
            continue
        self.spider.status = SpiderStatus.WORKING
        current_url = popped[1].decode('utf-8')
        # SADD reports how many members were newly added, so this is an
        # atomic "mark visited if not already" with no check-then-act race
        # between workers.
        if redis_client.sadd('visited_urls', current_url):
            html_page = self.spider.fetch(current_url)
            if html_page not in [None, '']:
                hasher = hasher_proto.copy()
                hasher.update(current_url.encode('utf-8'))
                doc_id = hasher.hexdigest()
                if not sohu_data_coll.find_one({'_id': doc_id}):
                    sohu_data_coll.insert_one({
                        '_id': doc_id,
                        'url': current_url,
                        'page': Binary(zlib.compress(pickle.dumps(html_page)))
                    })
                self.spider.parse(html_page)
        self.spider.status = SpiderStatus.IDLE
def store_files(userid):
    """Insert the mock-recall fixture files into ``mongo.db.documents``.

    Each file is stored as one document holding ``userid``, the file's index
    in the list as ``docid`` and its raw bytes as ``data``.

    Args:
        userid: Value stored in every inserted document's ``userid`` field.
    """
    oldcwd = os.getcwd()
    # NOTE(review): machine-specific absolute path; the relative paths below
    # are resolved against it.
    os.chdir("/Users/WillMichael/Documents/git/FlaskFarmer/project/tests")
    try:
        file_paths = [
            "../server/files/Mock/1/APPENDIX A- Recall Team_MR.xlsx",
            "../server/files/Mock/1/APPENDIX B-Agency-Press-Supplier-Customer Contact List_MR.xlsx",
            "../server/files/Mock/1/APPENDIX D-General Communication Log_MR.xlsx",
            "../server/files/Mock/1/APPENDIX H-Ingredients Receipts Record_MR.xlsx",
            "../server/files/Mock/1/APPENDIX I-Production Batch Sheet_MR.xlsx",
            "../server/files/Mock/1/APPENDIX K-Product Distribution record_MR.xlsx",
            "../server/files/Mock/1/APPENDIX N-Product Reconciliation_MR.xlsx",
            "../server/files/Mock/1/Mock Recall/APPENDIX G1-Mock Recall Record_MR.xlsx",
            "../server/files/Mock/1/Mock Recall/APPENDIX G2-Mock Recall Log_MR.xlsx",
            "../server/files/Mock/1/Mock Recall/APPENDIX O3-Recall Notification via Phone_MR.docx"
        ]
        for idx, fp in enumerate(file_paths):
            # Office documents are binary: read them as bytes and close the
            # handle as soon as the content is in memory.
            with open(fp, 'rb') as fo:
                bin_file = fo.read()
            data = {"userid": userid, "docid": idx, "data": Binary(bin_file)}
            mongo.db.documents.insert_one(data)
    finally:
        # Restore the caller's working directory even if a read or insert fails.
        os.chdir(oldcwd)
def test_del_item_should_delete_pair_in_the_collection(self):
    """``del`` on a MongoDict removes that document; a missing key raises KeyError."""
    seed_pairs = (
        ('testing', '123'),
        ('bla bla bla', '3.14'),
    )
    for doc_id, raw_value in seed_pairs:
        self.collection.insert({
            '_id': doc_id,
            'value': Binary(encode(raw_value))
        })

    my_dict = MongoDict(**self.config)
    del my_dict['testing']

    # The first remaining document must be the one that was not deleted.
    survivor = list(self.collection.find())[0]
    self.assertEqual(survivor['_id'], 'bla bla bla')
    self.assertEqual(decode(survivor['value']), '3.14')

    with self.assertRaises(KeyError):
        del my_dict['non ecxiste']
def _convert_data_for_mongo(data): if isinstance(data, dict): result = dict() for key, value in data.iteritems(): result[str(key).replace('.', '_')] = _convert_data_for_mongo(value) return result if isinstance(data, np.ndarray): return data.tolist() if isinstance(data, np.float32): return float(data) if isinstance(data, list): return [_convert_data_for_mongo(x) for x in data] if isinstance(data, Decimal): return float(data) if isinstance(data, dt.datetime): return localize_datetime(data) if isinstance(data, dt.date): return localize_date(data) if isinstance(data, str): try: return data.decode('utf8') except Exception as e: logging.warning(e.message) return Binary(data, 0) return data
def test_bson_classes(self):
    """Each MockupDB BSON class matches itself and its PyMongo counterpart."""
    _id = '5a918f9fa08bff9c7688d3e1'
    class_pairs = [
        (Binary(b'foo'), mockup_bson.Binary(b'foo')),
        (Code('foo'), mockup_bson.Code('foo')),
        (Code('foo', {'x': 1}), mockup_bson.Code('foo', {'x': 1})),
        (DBRef('coll', 1), mockup_bson.DBRef('coll', 1)),
        (DBRef('coll', 1, 'db'), mockup_bson.DBRef('coll', 1, 'db')),
        (Decimal128('1'), mockup_bson.Decimal128('1')),
        (MaxKey(), mockup_bson.MaxKey()),
        (MinKey(), mockup_bson.MinKey()),
        (ObjectId(_id), mockup_bson.ObjectId(_id)),
        (Regex('foo', 'i'), mockup_bson.Regex('foo', 'i')),
        (Timestamp(1, 2), mockup_bson.Timestamp(1, 2)),
    ]

    for pymongo_value, mockup_value in class_pairs:
        self_msg = "MockupDB %r doesn't equal itself" % (mockup_value, )
        cross_msg = "PyMongo %r != MockupDB %r" % (pymongo_value, mockup_value)

        # Basic case.
        spec = Matcher(Command(y=mockup_value))
        self.assertTrue(spec.matches(Command(y=mockup_value)), self_msg)

        # First Command argument is special, try comparing the second also.
        spec = Matcher(Command('x', y=mockup_value))
        self.assertTrue(spec.matches(Command('x', y=mockup_value)), self_msg)

        # In practice, users pass PyMongo classes in message specs.
        spec = Matcher(Command(y=mockup_value))
        self.assertTrue(spec.matches(Command(y=pymongo_value)), cross_msg)

        spec = Matcher(Command('x', y=mockup_value))
        self.assertTrue(spec.matches(Command('x', y=pymongo_value)), cross_msg)
def run(self):
    """Open per-thread Redis/MongoDB connections and consume crawl tasks forever.

    The connections are also published on ``thread_local`` (``redis_client``
    and ``mongo_db``). Each URL popped from the ``m_sohu_task`` list that is
    not yet in the ``visited_urls`` set is fetched through ``self.spider``; a
    non-empty page is stored compressed and pickled in ``msohu.webpages``
    (keyed by a hash of the URL) unless that id already exists, and is then
    handed to ``self.spider.parse``. ``self.spider.status`` is WORKING while
    a URL is handled and IDLE afterwards.
    """
    # NOTE(review): host and credentials are hard-coded here.
    redis_client = redis.Redis(host='1.2.3.4', port=6379, password='******')
    mongo_client = pymongo.MongoClient(host='1.2.3.4', port=27017)
    thread_local.redis_client = redis_client
    thread_local.mongo_db = mongo_client.msohu
    # Loop-invariant: resolve the target collection once.
    sohu_data_coll = mongo_client.msohu.webpages
    while True:
        # Block on the server instead of hammering it with LPOP in a tight
        # loop; the short timeout just lets the loop come round again.
        popped = redis_client.blpop('m_sohu_task', timeout=1)
        if not popped or not popped[1]:
            continue
        self.spider.status = SpiderStatus.WORKING
        current_url = popped[1].decode('utf-8')
        # SADD reports how many members were newly added, so this is an
        # atomic "mark visited if not already" with no check-then-act race
        # between workers.
        if redis_client.sadd('visited_urls', current_url):
            html_page = self.spider.fetch(current_url)
            if html_page not in [None, '']:
                hasher = hasher_proto.copy()
                hasher.update(current_url.encode('utf-8'))
                doc_id = hasher.hexdigest()
                if not sohu_data_coll.find_one({'_id': doc_id}):
                    sohu_data_coll.insert_one({
                        '_id': doc_id,
                        'url': current_url,
                        'page': Binary(zlib.compress(pickle.dumps(html_page)))
                    })
                self.spider.parse(html_page)
        self.spider.status = SpiderStatus.IDLE
def docify(self, df):
    """
    Convert a Pandas DataFrame to SON.

    Every column is converted with ``self._convert_types``, serialised to
    bytes and compressed; the compressed columns are concatenated into one
    blob, and the metadata records each column's byte range, dtype string
    and (when present) compressed mask.

    Parameters
    ----------
    df: DataFrame
        The Pandas DataFrame to encode

    Returns
    -------
    SON
        Document with the blob under ``DATA`` and, under ``METADATA``, the
        ``COLUMNS``, ``MASK``, ``LENGTHS`` and ``DTYPE`` entries.

    Raises
    ------
    Exception
        Whatever the per-column conversion raised, after logging the
        offending column's inferred type.
    """
    dtypes = {}
    masks = {}
    lengths = {}
    columns = []
    data = Binary(b'')
    start = 0

    arrays = []
    for c in df:
        try:
            columns.append(str(c))
            arr, mask = self._convert_types(df[c].values)
            dtypes[str(c)] = arr.dtype.str
            if mask is not None:
                # tobytes() replaces the deprecated ndarray.tostring() alias.
                masks[str(c)] = Binary(compress(mask.tobytes()))
            arrays.append(arr.tobytes())
        except Exception:
            typ = infer_dtype(df[c], skipna=False)
            msg = "Column '{}' type is {}".format(str(c), typ)
            logging.warning(msg)
            # Bare raise keeps the original traceback intact.
            raise

    arrays = compress_array(arrays)

    for index, c in enumerate(df):
        d = Binary(arrays[index])
        # Inclusive (first_byte, last_byte) offsets of this column in the blob.
        lengths[str(c)] = (start, start + len(d) - 1)
        start += len(d)
        data += d

    doc = SON({DATA: data, METADATA: {}})
    doc[METADATA] = {
        COLUMNS: columns,
        MASK: masks,
        LENGTHS: lengths,
        DTYPE: dtypes
    }

    return doc
def run_scenario(self):
    """Run every test of the enclosing ``scenario_def`` against GridFS.

    For each test the database is initialised, the requested operation is
    called with its (snake-cased) arguments, and the outcome is compared
    with the test's ``assert`` section: either an expected error type, or
    the expected result and/or the expected contents of the chunks and
    files collections.
    """
    # Run tests.
    self.assertTrue(scenario_def['tests'], "tests cannot be empty")
    for test in scenario_def['tests']:
        self.init_db(scenario_def['data'], test)

        # Run GridFs Operation.
        operation = self.str_to_cmd[test['act']['operation']]
        args = test['act']['arguments']
        extra_opts = args.pop("options", {})
        if "contentType" in extra_opts:
            extra_opts["metadata"] = {
                "contentType": extra_opts.pop("contentType")}

        args.update(extra_opts)

        converted_args = {camel_to_snake(c): v for c, v in args.items()}

        error = None
        # Reset per test: when the operation raises, ``result`` would
        # otherwise be unbound on the first test, or stale from the previous
        # one on later tests.
        result = None
        try:
            result = operation(**converted_args)

            if 'download' in test['act']['operation']:
                result = Binary(result.read())
        except Exception as exc:
            error = exc

        self.init_expected_db(test, result)

        # Asserts.
        errors = {"FileNotFound": NoFile,
                  "ChunkIsMissing": CorruptGridFile,
                  "ExtraChunk": CorruptGridFile,
                  "ChunkIsWrongSize": CorruptGridFile,
                  "RevisionNotFound": NoFile}

        if test['assert'].get("error", False):
            self.assertIsNotNone(error)
            self.assertIsInstance(error, errors[test['assert']['error']])
        else:
            self.assertIsNone(error)

        if 'result' in test['assert']:
            # 'void' means the operation is expected to return nothing.
            if test['assert']['result'] == 'void':
                test['assert']['result'] = None
            self.assertEqual(result, test['assert'].get('result'))

        if 'data' in test['assert']:
            # Create alphabetized list
            self.assertEqual(
                set(self.sorted_list(self.db.fs.chunks, True)),
                set(self.sorted_list(self.db.expected.chunks, True)))

            self.assertEqual(
                set(self.sorted_list(self.db.fs.files, False)),
                set(self.sorted_list(self.db.expected.files, False)))
def put(self, task, priority, schedule_time=None):
    """Store a pickled task with its priority in the collection.

    Raises:
        SpiderMisuseError: if ``schedule_time`` is given.
    """
    wants_delay = schedule_time is not None
    if wants_delay:
        raise SpiderMisuseError('Mongo task queue does not support delayed task')

    pickled_task = Binary(pickle.dumps(task))
    self.collection.save({
        'task': pickled_task,
        'priority': priority,
    })
def str2binary(data):
    """Convert a string to BSON binary format.

    Text is encoded as UTF-8 first; byte strings are wrapped as they are.

    :param data: text or byte string to wrap
    :return: ``Binary`` holding the bytes of ``data``
    """
    # ``bytes`` is ``str`` on Python 2, so only ``unicode`` text gets encoded
    # there, exactly as before; on Python 3 this encodes ``str`` text.
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    # The former StringIO round trip returned the same bytes unchanged.
    return Binary(data)
def create_plant_from_form(self, form):
    """Return a plant object from a plant form object.

    ``names`` and ``cultivars`` are split on ``', '`` (empty tuple when the
    field is empty); the image is read into a ``Binary`` when present,
    otherwise ``None``.
    """
    binomial = form.binomial.data

    if form.names.data:
        names = form.names.data.split(', ')
    else:
        names = ()

    if form.cultivars.data:
        cultivars = form.cultivars.data.split(', ')
    else:
        cultivars = ()

    if form.image.data:
        image = Binary(form.image.data.read())
    else:
        image = None

    return Plant(binomial=binomial, names=names, cultivars=cultivars, image=image)
def represent(self, obj, fieldtype):
    """Convert ``obj`` into the value stored in MongoDB for ``fieldtype``.

    ``None`` is passed through for date, time and blob fields. Dates and
    times become ``datetime`` objects, blobs become ``bson.Binary``, and
    reference / id values (including the items of ``list:reference``
    fields) are converted with ``self.object_id``.
    """
    # The base adapter does not support MongoDB ObjectId, so those bypass it.
    if isinstance(obj, self.ObjectId):
        value = obj
    else:
        value = NoSQLAdapter.represent(self, obj, fieldtype)

    if fieldtype == 'date':
        if value is None:
            return value
        # MongoDB has no date type, so the date is stored as a datetime at
        # midnight; the time part can be stripped again based on the fieldtype.
        midnight = datetime.time(0, 0, 0)
        return datetime.datetime.combine(value, midnight)

    if fieldtype == 'time':
        if value is None:
            return value
        # MongoDB has no time type, so the time is stored as a datetime on a
        # fixed day; the date part can be stripped again based on the fieldtype.
        fixed_day = datetime.date(2000, 1, 1)
        return datetime.datetime.combine(fixed_day, value)

    if fieldtype == "blob":
        if value is None:
            return value
        from bson import Binary
        if isinstance(value, Binary):
            return value
        if isinstance(value, basestring):
            return Binary(value)
        return Binary(str(value))

    fieldtype_is_text = isinstance(fieldtype, basestring)

    if fieldtype_is_text and fieldtype.startswith('list:'):
        if fieldtype.startswith('list:reference'):
            return [self.object_id(item) for item in value]
        return value

    # Reference types must be converted to ObjectId.
    if ((fieldtype_is_text and fieldtype.startswith("reference"))
            or isinstance(fieldtype, Table)
            or fieldtype == "id"):
        value = self.object_id(value)
    return value
def checksum(symbol, doc):
    """
    Checksum the passed in dictionary

    The SHA-1 covers ``symbol`` followed by the values of ``doc``, visited
    in reverse-sorted key order. Byte strings are hashed as they are; every
    other value is hashed as the ASCII encoding of its ``str()`` form.

    Returns the raw digest wrapped in ``Binary``.
    """
    def as_bytes(value):
        # hashlib only accepts bytes on Python 3; on Python 2 ``bytes`` is
        # ``str`` so byte strings pass through exactly as before.
        if isinstance(value, bytes):
            return value
        return str(value).encode('ascii')

    sha = hashlib.sha1()
    sha.update(as_bytes(symbol))
    # Iterating the dict yields its keys on both Python 2 and 3
    # (``iterkeys`` is Python-2-only).
    for k in sorted(doc, reverse=True):
        sha.update(as_bytes(doc[k]))
    return Binary(sha.digest())
def save_image(image_name_resized, image_bytes):
    """Insert a resized image into ``db.images`` and return the insert result.

    The document stores the name under ``image_resized_name`` and the bytes,
    wrapped in ``Binary``, under ``image_resized_file``.
    """
    logger.info('Saving image %s to DB', image_name_resized)
    record = {
        'image_resized_name': image_name_resized,
        'image_resized_file': Binary(image_bytes),
    }
    return db.images.insert_one(record)
def test_jsonify_Binary(self):
    """``jsonify`` renders a Binary like Flask renders its extended-JSON form."""
    bson_payload = {'a': 1, 'bin': Binary(b"hello")}
    # "aGVsbG8=" is base64 for b"hello".
    plain_payload = {'a': 1, 'bin': {'$binary': "aGVsbG8=", "$type": "00"}}

    actual_response = jsonify(bson_payload).response
    expected_response = flask_jsonify(plain_payload).response

    assert actual_response == expected_response
def fileUpload(db, ssl_file, ssl_type, host):
    """Store or refresh an SSL key/certificate in the ``ssl`` files/chunks collections.

    The file is named ``<host>.key`` when ``ssl_type`` is ``"key"`` and
    ``<host>.crt`` otherwise. An existing entry gets its md5, length and
    chunk data updated; otherwise a files document and a single chunk
    (``n`` = 0) are inserted.

    Returns:
        The ``_id`` of the files document.
    """
    if ssl_type == "key":
        extension, path = "key", "/opt/waf/conf/ssl/keys"
    else:
        extension, path = "crt", "/opt/waf/conf/ssl/certificates"
    filename = "{}.{}".format(host, extension)

    existing = db.ssl.files.find_one({"filename": filename})

    if not (existing and "_id" in existing):
        # First upload for this host: create the file entry and its chunk.
        new_entry = {
            "filename": filename,
            "chunkSize": 261120,
            "length": len(ssl_file),
            "md5": hashlib.md5(ssl_file).hexdigest(),
            "metadata": {
                "path": path,
                "is_dir": False
            }
        }
        file_id = db.ssl.files.insert_one(new_entry).inserted_id
        db.ssl.chunks.insert_one({
            "n": 0,
            "data": Binary(ssl_file, 0),
            "files_id": file_id
        })
        print(colorize("green", "[+] File {} added to DB".format(filename)))
        return file_id

    # Known file: refresh the metadata and overwrite the chunk data.
    metadata_patch = {
        "$set": {
            "md5": hashlib.md5(ssl_file).hexdigest(),
            "length": len(ssl_file)
        }
    }
    db.ssl.files.update({"filename": filename}, metadata_patch)
    chunk_patch = {"$set": {"data": Binary(ssl_file, 0)}}
    db.ssl.chunks.update({"files_id": ObjectId(existing["_id"])}, chunk_patch)
    file_id = existing["_id"]
    print(colorize("green", "[+] File {} updated".format(filename)))
    return file_id
def to_mongo(self, value):
    """Serialise a private key to encrypted PKCS8 DER bytes wrapped in ``Binary``."""
    # NOTE(review): the passphrase is hard-coded in source; it should
    # presumably come from configuration. Confirm before relying on this
    # encryption for protection.
    encryption = serialization.BestAvailableEncryption(b"passphrase")
    der_bytes = value.private_bytes(
        encoding=serialization.Encoding.DER,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=encryption,
    )
    return Binary(der_bytes)
def put(self, task, priority, schedule_time=None):
    """Store a pickled task with its priority and schedule time.

    When ``schedule_time`` is omitted, the current UTC time is used.
    """
    when = datetime.utcnow() if schedule_time is None else schedule_time
    pickled_task = Binary(pickle.dumps(task))
    self.collection.save({
        'task': pickled_task,
        'priority': priority,
        'schedule_time': when,
    })
def create_user(self, db_connection: IConnection, username: str,
                data: BytesIO, client_id: str):
    """Insert a user record holding the image read from ``data``.

    Returns whatever ``db_connection.insert`` returns.
    """
    record = {
        'username': username,
        'image': Binary(data.read()),
        'client_id': client_id
    }
    return db_connection.insert(record)
def checksum(self, from_idx, to_idx):
    """Return the cached checksum, computing it on first use.

    On the first call, every chunk yielded by ``self.generator_bytes`` for
    the requested index range is compressed and folded into an incremental
    checksum; its digest is cached in ``self._checksum`` as ``Binary``.
    Later calls return the cached value whatever the arguments.
    """
    if self._checksum is not None:
        return self._checksum

    self._lazy_init()
    running_sha = None
    for raw_chunk, _dtype in self.generator_bytes(from_idx=from_idx, to_idx=to_idx):
        # TODO: what about compress_array here in batches?
        packed = compress(raw_chunk)
        running_sha = incremental_checksum(packed, curr_sha=running_sha, is_bytes=True)
    self._checksum = Binary(running_sha.digest())
    return self._checksum
def str2hex(jsn):
    """Replace ``{"$hex": ...}`` wrappers in ``jsn`` with ``Binary`` values, in place.

    Only the keys ``data``, ``source`` and ``result`` are converted; nested
    dicts and the items of nested lists are processed recursively.
    """
    hex_fields = ("data", "source", "result")
    for name, raw in jsn.items():
        if name in hex_fields and "$hex" in raw:
            jsn[name] = Binary(bytes_from_hex(raw['$hex']))

        # Re-read the slot: it may just have been replaced above.
        current = jsn[name]
        if isinstance(current, dict):
            str2hex(current)
        if isinstance(current, list):
            for element in current:
                str2hex(element)
def test_reading_an_existing_key_should_read_saved_information(self):
    """A value pickled straight into the collection is readable through the document."""
    pickled = pickle.dumps('value', protocol=pickle.HIGHEST_PROTOCOL)
    stored = {
        '_id': 'id:{}:key'.format(self.fake_id),
        'v': Binary(pickled)
    }
    self.db.main.insert(stored)

    self.assertEqual(self.document['key'], 'value')
def test_match_querying_with_binary(self):
    """A BinaryField saved from ``BIN_VALUE`` is found when queried with a ``Binary``."""

    class MyDocument(Document):
        bin_field = BinaryField()

    MyDocument.drop_collection()

    saved = MyDocument(bin_field=BIN_VALUE).save()

    query = MyDocument.objects(bin_field=Binary(BIN_VALUE))
    found = query.first()
    self.assertEqual(found.id, saved.id)
def save_model_to_mongo(self, model: Any, trained_from: date = None,
                        trained_upto: date = None):
    """Upload a serialised model to GridFS and record it as a ``PythonModel``.

    When ``self.keras`` is set, the model is written to a temporary
    ``.hdf5`` file with ``save_model`` and that file's content is uploaded;
    otherwise the output of ``dumps(model)`` is uploaded. The GridFS id is
    then saved together with the model name, symbol and training range.
    """
    fs = connect_grid()

    def upload(raw_bytes):
        # Wrap the payload in a file-like object and store it in 2 MiB chunks.
        with BytesIO(Binary(raw_bytes)) as stream:
            return fs.put(stream, filename=self.model_name, chunk_size=2097152)

    if self.keras:
        with NamedTemporaryFile(suffix='.hdf5', delete=True) as ntf:
            save_model(model, ntf.name, overwrite=True)
            objectId = upload(ntf.read())
    else:
        objectId = upload(dumps(model))

    record = PythonModel(
        grid_fileid=objectId,
        model_name=self.model_name,
        symbol=self.symbol,
        trained_from=trained_from,
        trained_upto=trained_upto,
    )
    record.save()
def convert_to_document(question):
    """Build the document dict for a question.

    The feature vector is pickled with protocol 2 and wrapped in ``Binary``;
    every other field is copied from the question as is.
    """
    pickled_vector = Binary(pickle.dumps(question.feature_vector, protocol=2))
    return {
        'text': question.text,
        'answer': question.answer,
        'feature_vector': pickled_vector,
        'category': question.category,
        'keywords': question.keywords,
        'morphs': question.morphs
    }
def update_user_image(self, db_connection: IConnection, username: str,
                      data: BytesIO, client_id: str):
    """Replace the stored image of the user matching ``username`` and ``client_id``.

    Returns whatever ``db_connection.update`` returns.
    """
    new_image = Binary(data.read())
    selector = {'username': username, 'client_id': client_id}
    change = {'$set': {'image': new_image}}
    return db_connection.update(selector, change)
def create_version_dict(self, path=None, version=None): """Create and return a version dict. If a project path is given, the project folder is zipped and compared to the latest existing zip archive. If it differs, a new dict is created with a higher version number. """ # Get the latest version number or 1 if it doesn't exist if version == None: version = self.get_latest_version_number() version_dict = self.get_version(version) # Create an empty dict of the manifest doesn't have one if version_dict == 0 or version_dict == None: version_dict = {} # If a path is given, zip it and compare to the existing hash if path is not None: now = datetime.today().strftime('%Y%m%d%H%M%S') version_dict['version_name'] = now + '_v' + str( version) + self.reduced_manifest['name'] version_dict['version_number'] = version # Make sure there is a zipfile to compare # Zip the project path if not os.path.exists(self.temp_dir): os.makedirs(self.temp_dir) new_zipfile_path = self.temp_dir + '/' + version_dict[ 'version_name'] + '.zip' new_zipfile = zipfile.ZipFile(new_zipfile_path, 'w', zipfile.ZIP_DEFLATED) rootlen = len(path) + 1 for base, _, files in os.walk(path): # Create local paths and write them to the new zipfile for file in files: fn = os.path.join(base, file) new_zipfile.write(fn, fn[rootlen:]) # Compare the hashes if 'zipfile' in version_dict and version_dict[ 'zipfile'] is not None: result = self.compare_files(version_dict['zipfile'], new_zipfile) # If the zipfiles are not the same iterate the version if result == False: version = version + 1 version_dict['version_number'] = version with open(new_zipfile_path, 'rb') as f: version_dict['zipfile'] = Binary(f.read()) # Now remove the temporary zipfile new_zipfile.close() os.remove(new_zipfile_path) # else: # version_dict['zipfile'] = Binary(new_zipfile) return version_dict
def checksum(symbol, doc):
    """
    Checksum the passed in dictionary

    The SHA-1 covers the ASCII-encoded ``symbol`` followed by the values of
    ``doc``, visited in reverse-sorted key order; binary values are hashed
    as they are, anything else as the ASCII encoding of its ``str()`` form.
    """
    digest = hashlib.sha1()
    digest.update(symbol.encode('ascii'))
    for key in sorted(doc.keys(), reverse=True):
        value = doc[key]
        if not isinstance(value, six.binary_type):
            value = str(value).encode('ascii')
        digest.update(value)
    return Binary(digest.digest())
def GET(self, name):
    """Return the image for tile ``name``, fetching and caching it on a miss.

    A cached tile is served from the ``tiles`` collection. Otherwise the
    tile is downloaded from ``server + name``, stored together with the
    zoom/x/y parts taken from the name, and its bytes are returned.
    """
    data = tiles.find_one({'name': name})
    if data is not None:
        return data['image']

    # The parenthesised single-argument form prints the same on Python 2 and 3.
    print('fetch ' + name)
    tmp = name.split('/')
    # NOTE(review): [:-3] drops the last three characters of the final path
    # part; for a name ending in ".png" that leaves a trailing dot in 'y' -
    # confirm against the expected name format.
    doc = {'name': name, 'zoom': tmp[0], 'x': tmp[1], 'y': tmp[2][:-3]}
    # NOTE(review): the response status is not checked, so an error body
    # would be cached as a tile.
    req = requests.get(server + name)
    image_data = req.content
    doc['image'] = Binary(image_data)
    tiles.insert(doc)
    # Return the raw bytes: str() on bytes yields the "b'...'" repr on
    # Python 3, and was a no-op on Python 2.
    return image_data