def test_db(self):
    '''Exercise the database CRUD operations end to end.

    Runs through list, create, update, delete, and bulk-delete against
    the test repository, printing progress along the way.
    '''
    # Show whatever is currently stored
    self.load_all_items_from_database()
    # Create a sample record and read it back
    json_data = {
        "title": "Wordpress website for Freelancers",
        "description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc molestie. ",
        "price": 250,
        "assigned_to": "John Doe"
    }
    new_object = DB_Object.build_from_dict(json_data)
    self.test_create(new_object)
    # Modify the record and verify the update round-trips
    new_object.price = 350
    self.test_update(new_object)
    # Remove the record and verify it can no longer be read
    self.test_delete(new_object)
    # Insert a handful of rows, then verify bulk deletion empties the DB
    for _ in range(3):
        self.repository.create(DB_Object.build_from_dict(json_data))
    self.test_delete_all()
def get_ingredients_as_list(p_list_or_i):
    '''
    Queries the products and ingredients DBs for ingredients contained
    within the products given by the input list of ObjectIds.

    Changing the tokenizer type variable 'T_TYPE' to ingredient causes this
    function to expect ObjectIds referring to ingredients as input.

    Note:
    Each DB query is performed once using all object IDs simultaneously.
    This function performs no more than 2 queries when run.
    '''
    # Side effect: appends one comodegenic score per product read to the
    # module-level PROD_COMO list (consumed later by build_people_model).
    global PROD_COMO
    if not p_list_or_i:
        # Nothing to look up (None or empty input)
        return []
    elif type(p_list_or_i) is str or type(p_list_or_i) is ObjectId:
        # Query a single ObjectId
        prod_fltr = {'_id': p_list_or_i}
    else:
        # Build list of ingredient ObjectIds contained in the product list
        prod_fltr = {'_id': {'$in': p_list_or_i}}
    if T_TYPE == 'product':
        prod_prjctn = {
            '_id': False,
            'ingredient_list': True,
            'comodegenic': True}
        db_objects = PRODUCTS_DB.read(prod_fltr, projection=prod_prjctn)
        # Get ObjectIds from all product ingredients
        ing_list = set()  # Using set eliminates duplicate values
        for i in db_objects:
            ing = DB_Object.build_from_dict(i)
            # NOTE: default '' is harmless here — updating a set from an
            # empty string adds nothing when 'ingredient_list' is missing.
            ing_list.update(ing.get('ingredient_list', ''))
            PROD_COMO.append(ing.get('comodegenic', 0))  # Create column of comodegenic scores
        # Build list of all ingredient names
        ing_fltr = {'_id': {'$in': list(ing_list)}}
        ing_prjctn = {'_id': False, 'ingredient_name': True}
        db_objects = INGREDIENTS_DB.read(ing_fltr, projection=ing_prjctn)
        return [DB_Object.build_from_dict(i).get('ingredient_name', '')
                for i in db_objects]
    elif T_TYPE == 'OCR_list':
        return get_db_ingredients(p_list_or_i)
    else:
        # Return the ingredient name
        # NOTE(review): when the input is a LIST of ingredient ObjectIds this
        # filter matches '_id' against the whole list rather than using the
        # '$in' filter built above — confirm intended for T_TYPE='ingredient'.
        ing_fltr = {'_id': p_list_or_i}
        ing_prjctn = {'_id': False, 'ingredient_name': True}
        db_objects = INGREDIENTS_DB.read(ing_fltr, projection=ing_prjctn)
        return [DB_Object.build_from_dict(i).get('ingredient_name', '')
                for i in db_objects]
def build_product_model(host, port, **kwargs):
    '''
    Build TF-IDF training data for the product model and persist it.

    Reads all products that have an integer comodegenic score, vectorizes
    each product's ingredient list with a TF-IDF vectorizer (ingredients
    are fetched from the DB by a custom tokenizer), then stores the
    resulting matrix and labels both in the model database and on disk.
    '''
    prod_model_data = 'prod_model_data.pickle'
    print("Loading products from database:")
    prod_filt = {'comodegenic': {'$type': 'int'}}  # Only return entries with comodegenic score
    prod_prjctn = {
        'ingredient_list': True,
        'comodegenic': True}
    db_objects = PRODUCTS_DB.read(prod_filt, projection=prod_prjctn)
    products = [DB_Object.build_from_dict(p) for p in db_objects]
    # The tfidf_vect will ignore the following words
    stop_words = [
        '', 'water', 'glycerin', 'titanium dioxide', 'iron oxides',
        'beeswax', 'methylparaben', 'propylparaben', 'propylene glycol',
        'panthenol', 'mica']

    # Tokenizer for product ingredient lists; each "document" handed to the
    # vectorizer is a whole product DB_Object, not a string.
    def get_prod_ings_as_list(product):
        '''
        Queries the ingredients DB for a given product's ingredient list and
        returns the ingredient list as a list of ingredient strings.

        Note:
        The DB query is performed once using all ingredient object IDs
        simultaneously.
        '''
        fltr = {'_id': {'$in': product.get('ingredient_list', [])}}
        ing_prjctn = {'_id': False, 'ingredient_name': True}
        db_objects = INGREDIENTS_DB.read(fltr, projection=ing_prjctn)
        return [DB_Object.build_from_dict(i).get('ingredient_name', '')
                for i in db_objects]

    print('Vectorizing product ingredient lists')
    tfidf_vect = TfidfVectorizer(
        tokenizer=get_prod_ings_as_list,
        lowercase=False,
        stop_words=stop_words)
    X = tfidf_vect.fit_transform(products)
    y = [p['comodegenic'] for p in products]
    print('Storing vectorized data and training labels')
    # X stays a CSR sparse matrix; it is pickled as-is below
    model = {
        'X': X,
        'y': y
    }
    print("Saving model data to disk for next time")
    # Insert the model into the model database
    MODEL_DB.create_file(pdumps(model, protocol=2), filename="ml_product_data")
    # Save model data to disk
    with open(prod_model_data, "wb") as pickle_out:
        pdump(model, pickle_out)
    print('[SUCCESS] Product model data post-processed and stored')
def check_authentication(s, auth_str):
    """Check given credentials against the people DB.

    Parameters:
        s: handler instance; on success its person_data attribute is set
           to the matching person DB_Object.
        auth_str: HTTP Authorization header value, e.g. "Basic <base64>".

    Returns:
        True when exactly one person matches the credential, else False.
    """
    # BUG FIX: the original used .strip('Basic '), which treats its argument
    # as a SET of characters and also removes leading/trailing 'B', 'a', 's',
    # 'i', 'c' and spaces from the base64 token itself, corrupting valid
    # credentials. Remove the scheme prefix explicitly instead.
    in_auth = auth_str.strip()
    if in_auth.startswith('Basic '):
        in_auth = in_auth[len('Basic '):]
    query = PEOPLE_DB.read({'auth': in_auth}, limit=1)
    if query.count() == 1:
        s.person_data = DB_Object.build_from_dict(query[0])
        return True
    else:
        return False
def test_update(self, new_object):
    '''Persist a modified object and print what the DB now holds for it.'''
    print("\n\nUpdating new_object in database")
    self.repository.update(new_object)
    print("new_object updated in database")
    print("Reloading new_object from database")
    cursor = self.repository.read({'_id': new_object._id})
    for doc in cursor:
        reloaded = DB_Object.build_from_dict(doc)
        print("new_object = {}".format(reloaded.get_as_dict()))
def get_prod_ings_as_list(product):
    '''
    Queries the ingredients DB for a given product's ingredient list and
    returns the ingredient list as a list of ingredient strings.

    Note:
    The DB query is performed once using all ingredient object IDs
    simultaneously.
    '''
    id_filter = {'_id': {'$in': product.get('ingredient_list', [])}}
    name_only = {'_id': False, 'ingredient_name': True}
    rows = INGREDIENTS_DB.read(id_filter, projection=name_only)
    names = []
    for row in rows:
        names.append(DB_Object.build_from_dict(row).get('ingredient_name', ''))
    return names
def load_all_items_from_database(self):
    '''Print a one-line summary for every item in the repository.'''
    print("Loading all items from database:")
    found_any = False
    for doc in self.repository.read():
        found_any = True
        item = DB_Object.build_from_dict(doc)
        print("ID = {} | Title = {} | Price = {}".format(
            item._id, item.title, item.price))
    if not found_any:
        print("No items in the database")
def dump_db_to_json(host, port, dump_db):
    """Dump one collection (or all of them) to 'db_dump_<name>.json'.

    Parameters:
        host, port: unused here; kept for interface consistency with the
                    other CLI entry points in this module.
        dump_db: one of 'people', 'products', 'ingredients', 'testing',
                 'comodegenic', or 'all'. Anything else is a no-op.
    """
    valid = [
        "people", "products", "ingredients", "testing", "comodegenic", "all"
    ]
    repos = [PEOPLE_DB, PRODUCTS_DB, INGREDIENTS_DB, TEST_DB, COMODEGENIC_DB]
    out_list = {}
    # Input validation.
    # BUG FIX: the original used `dump_db is ""` — an identity comparison
    # that is not guaranteed to match an equal empty string. A falsy check
    # covers both None and "".
    if not dump_db:
        return
    if dump_db not in valid:
        return
    # Dump the specified DB.
    print("Dumping database/s: '" + dump_db + "'")
    # (Removed a dead `repos[0].read()` call that ran before branching and
    # whose result was always discarded.)
    if dump_db == 'all':
        targets = repos
    else:
        targets = [repos[valid.index(dump_db)]]
    for repo in targets:
        db_objects = repo.read()
        at_least_one_item = False
        out_list[repo.collection] = []
        for p in db_objects:
            at_least_one_item = True
            # CONSISTENCY FIX: the 'all' branch previously appended raw
            # DB_Object instances while the single-collection branch
            # appended plain dicts; serialize uniformly via get_as_dict().
            out_list[repo.collection].append(
                DB_Object.build_from_dict(p).get_as_dict())
        if not at_least_one_item:
            print("No items in ", repo.collection, " database")
    with open('db_dump_%s.json' % dump_db, 'w') as f:
        json.dump(out_list, f, cls=JSONEncoder)
def test_create(self, new_object):
    '''Insert new_object, record its generated _id, and read it back.'''
    print("\n\nSaving new_object to database")
    result = self.repository.create(new_object)
    if not result.acknowledged:
        print("[FAILED] Could not save object")
    else:
        new_object['_id'] = result.inserted_id
    print("new_object saved to database")
    print("Loading new_object from database")
    for doc in self.repository.read({'_id': new_object._id}):
        reloaded = DB_Object.build_from_dict(doc)
        print("new_object = {}".format(reloaded.get_as_dict()))
def test_delete(self, new_object):
    '''Delete new_object, then verify it can no longer be read back.'''
    print("\n\nDeleting new_object from database")
    self.repository.delete(new_object)
    print("new_object deleted from database")
    print("Trying to reload new_object from database")
    still_present = False
    for doc in self.repository.read({'_id': new_object._id}):
        still_present = True
        reloaded = DB_Object.build_from_dict(doc)
        print("new_object = {}".format(reloaded.get_as_dict()))
    if not still_present:
        print("Item with id = {} was not found in the database".format(
            new_object._id))
def get_ingredient_vocabulary(host, port, **kwargs):
    '''
    Return the set of all unique ingredient names, including synonyms.

    Parameters:
        host, port: unused here; kept for interface consistency with the
                    other builders in this module.

    Returns:
        set[str] of ingredient names and their synonyms.
    '''
    # Build list of all ingredient names
    ing_fltr = {}  # Empty filter matches every ingredient document
    ing_prjctn = {
        '_id': False,
        'ingredient_name': True,
        'synonym_list': True}
    db_objects = INGREDIENTS_DB.read(ing_fltr, projection=ing_prjctn)
    ingredients = [DB_Object.build_from_dict(i) for i in db_objects]
    ret = set()
    for ingredient in ingredients:
        ret.add(ingredient.get('ingredient_name', ''))
        # BUG FIX: the original loop body re-added the ingredient name
        # instead of the synonym, so synonyms never made it into the
        # vocabulary despite the docstring's promise.
        for synonym in ingredient.get('synonym_list', []):
            ret.add(synonym)
    return ret
def get_suggestions(s, search_str, col='ingredient'):
    '''Return up to 100 DB_Objects matching a full-text search, ranked by
    MongoDB text-relevance score.

    col selects the collection: 'ingredient' (default) or anything else
    for products. An empty search string yields an empty list.
    '''
    if not search_str:
        return []
    # Fields common to both collections, plus the text-relevance meta score
    base_fields = {
        'cancer_score': True,
        'allergy_imm_tox_score': True,
        'dev_reprod_tox_score': True,
        'score': {'$meta': 'textScore'}}
    if col == 'ingredient':
        collection = INGREDIENTS_DB
        prjctn = dict(base_fields)
        prjctn['ingredient_name'] = True
        prjctn['ingredient_score'] = True
    else:
        collection = PRODUCTS_DB
        prjctn = dict(base_fields)
        prjctn['product_name'] = True
        prjctn['product_score'] = True
    cursor = collection.read(
        {'$text': {
            '$search': unquote_plus(search_str)
        }},
        limit=100,
        projection=prjctn)
    ranked = cursor.sort([('score', {'$meta': 'textScore'})])
    return [DB_Object.build_from_dict(item) for item in ranked]
def build_people_model(host, port, **kwargs):
    '''
    Build training data for the people model and persist it.

    Streams people from the DB in batches, expands each person into one
    row per acne product, vectorizes demographics (DictVectorizer) and
    product ingredients (TfidfVectorizer), and derives 3-class labels
    from comodegenic score x demographic multiplier. The result is saved
    both to the model database and to a local pickle file.

    Keyword args:
        batch_size: number of people pulled per batch (default 10000).
    '''
    # PROD_COMO is filled as a side effect of get_ingredients_as_list (the
    # TF-IDF tokenizer): one comodegenic score per product row.
    global PROD_COMO
    ppl_model_data = 'ppl_model_data.pickle'
    batch_size = kwargs.get('batch_size', 10000)
    vocabulary = get_ingredient_vocabulary(host, port)
    # The tfidf_vect will ignore the following words
    stop_words = [
        '', 'water', 'glycerin', 'titanium dioxide', 'iron oxides',
        'beeswax', 'methylparaben', 'propylparaben', 'propylene glycol',
        'panthenol', 'mica']
    # Create vectorizers
    d_vect = DictVectorizer(sparse=False)
    tfidf_vect = TfidfVectorizer(
        tokenizer=get_ingredients_as_list,
        lowercase=False,
        stop_words=stop_words,
        vocabulary=vocabulary)
    print("Loading people from database, batch_size:", str(batch_size))
    ppl_filt = {}
    ppl_prjctn = {
        '_id': False,
        'race': True,
        'birth_sex': True,
        'age': True,
        'acne': True,
        'skin': True,
        'acne_products': True}  # Don't include any PII
    db_objects = PEOPLE_DB.read(ppl_filt, projection=ppl_prjctn)
    y, demo_mult = [], []
    batch_num, pulled = 0, 0
    X = None
    # Work in batches to build dataset
    while pulled <= db_objects.count(with_limit_and_skip=True):
        # Initialize per-batch accumulators
        X_demo_lst, X_prod_lst = [], []
        people = []
        print('Parsing batch:', batch_num)
        try:
            # Build a batch by pulling documents off the cursor
            for i in range(batch_size):
                people.append(DB_Object.build_from_dict(db_objects.next()))
                pulled += 1
        except StopIteration:
            # End of available data
            # NOTE(review): breaking here discards any partially-filled
            # final batch (its rows are never vectorized) — confirm intended.
            break
        # Extract features
        for person in people:
            # Create new entry for each product
            # Note: Model is only applicable to entries with products
            for product_id in person.pop('acne_products'):
                # Pull product ingredients info
                X_prod_lst.append([product_id])
                # Pull demographic info
                X_demo_lst.append(person)
                # Generate demographic multiplier
                mult = get_multiplier(person)
                demo_mult.append(mult)
        # Vectorize data
        X_demo = d_vect.fit_transform(X_demo_lst)  # X_demo is now a numpy array
        X_prod = tfidf_vect.fit_transform(X_prod_lst)  # X_prod is now a CSR sparse matrix
        # Add batch result to output matrix
        if X is not None:
            X_t = hstack([csr_matrix(X_demo), X_prod], format="csr")
            try:
                X = vstack([X, X_t], format="csr")
            except ValueError:
                # Column mismatch between batches: stop accumulating
                break
        else:
            # Initialize X from the first batch
            X = hstack([csr_matrix(X_demo), X_prod], format="csr")
        batch_num += 1
    # Label each product row: 0/1/2 severity buckets from
    # comodegenic score scaled by the demographic multiplier
    for como, mult in zip(PROD_COMO, demo_mult):
        val = como * mult
        if val < 6:
            y.append(0)
        elif val < 12:
            y.append(1)
        else:
            y.append(2)
    print('Storing vectorized data and training labels')
    # Bundle matrix, labels, and fitted vectorizers for later reuse
    model = {
        'X': X,
        'y': y,
        'd_vect': d_vect,
        'tfidf_vect': tfidf_vect,
        'vocabulary': vocabulary
    }
    print("Saving model data to disk for next time")
    # Insert the model into the model database
    MODEL_DB.create_file(pdumps(model, protocol=2), filename="ml_people_data")
    # Save model data to disk
    with open(ppl_model_data, "wb") as pickle_out:
        pdump(model, pickle_out)
    print('[SUCCESS] People model data post-processed and stored')
def create_new_user(s, recv_data):
    '''Insert a new person document built from recv_data and return the
    DB write result.'''
    new_person = DB_Object.build_from_dict(recv_data)
    return PEOPLE_DB.create(new_person)
def generate_people(host, port, num_generate_people=10000):
    '''
    Destroy and repopulate the people collection with randomly generated
    synthetic users (demographics, credentials, and acne-product links).

    Prompts for confirmation and for the number of people to generate;
    num_generate_people is the fallback when input is invalid.
    '''
    # Connect to the required databases
    products_db = DB_CRUD(host, port, db='capstone', col='products')
    people_db = DB_CRUD(host, port, db='capstone', col='people')
    # Category values
    races = [
        'American Indian', 'Asian', 'Black', 'Pacific Islander', 'White',
        'mixed_other'
    ]
    birth_sexes = ['female', 'male']
    skin_types = ['normal', 'oily', 'dry']
    # Sampling probabilities (aligned index-wise with the lists above)
    race_probs = [0.009, 0.048, 0.126, 0.002, 0.724, 0.091]
    sex_probs = [0.508, 0.492]
    skin_probs = [1.0 / 3, 1.0 / 3, 1.0 / 3]
    # Make sure user wants to destroy existing DB
    ppl_qstn = '[WARNING] This will erase the people database. Continue?'
    if not query_yes_no(ppl_qstn, default='no'):
        print("No actions taken")
        return
    # Get number of people to generate
    try:
        usr_input = int(input("# people to generate: "))
        num_generate_people = usr_input
    except ValueError:
        print("Invalid input, using default value", num_generate_people)
        pass
    print("Nuking people database")
    people_db.nuke()
    print("Creating search indexes")
    people_db.createIndex([('user_name', ASCENDING)],
                          unique=True,
                          default_language='english')
    # Generate random people data
    print("Generating race data")
    ppl_race = np.random.choice(races, num_generate_people, p=race_probs)
    print("Generating sex data")
    ppl_sex = np.random.choice(birth_sexes, num_generate_people, p=sex_probs)
    print("Generating age and acne data")
    ppl_ages, ppl_acne = generate_age_acne_lists(num_generate_people)
    print("Generating skin data")
    ppl_skins = np.random.choice(skin_types, num_generate_people, p=skin_probs)
    print("Generating names")
    ppl_names = [get_sex_name(s) for s in ppl_sex]
    print("Generating usernames")
    ppl_unames = [get_unique_username(full_name) for full_name in ppl_names]
    print("Generating user authentications")
    # HTTP Basic style credential: base64("<username>:1234")
    ppl_auths = [
        base64.b64encode(str(u_name + ":1234").encode()).decode()
        for u_name in ppl_unames
    ]
    # Generate dict of people
    print("Creating list of people dicts")
    fields = [
        'name', 'race', 'birth_sex', 'age', 'acne',
        'skin', 'auth', 'user_name'
    ]
    p_data = zip(ppl_names, ppl_race, ppl_sex, ppl_ages, ppl_acne, ppl_skins,
                 ppl_auths, ppl_unames)
    p_list = [dict(zip(fields, d)) for d in p_data]
    # Get comodegenic products
    print("Getting list of comodegenic products")
    # 0 value comodegenic scores are null data
    db_objects = products_db.read({'comodegenic': {"$gt": 0}})
    products = [DB_Object.build_from_dict(p) for p in db_objects]
    # Set scaling for comodegenic-ness of products.
    # The scale value is 1 divided by the maximum comodegenic score
    # in the products database which works regardless of the scoring
    # method used when building the db.
    prod_filt = {'comodegenic': {'$type': 'int'}}
    prod_prjctn = {'comodegenic': True}
    db_objects = products_db.read(prod_filt,
                                  projection=prod_prjctn,
                                  sort=[("comodegenic", DESCENDING)],
                                  limit=1)
    como_scale = 1.0 / DB_Object.build_from_dict(db_objects[0])['comodegenic']
    print("Adding people to database")
    # Populate acne causing products for each person
    for person in p_list:
        p_products = []
        # NOTE(review): np.random.choice(10) yields 0-9 products per person,
        # although the comment below says "0 to 5" — confirm which is intended.
        for i in range(np.random.choice(10)):
            # NOTE(review): abs(choice(n) - 1) maps both 0 and 2 to index 1,
            # slightly skewing product selection — confirm intended.
            rand_idx = np.abs(np.random.choice(len(products)) - 1)
            prod_como = products[rand_idx]['comodegenic']
            probs = [como_scale * prod_como, 1 - (como_scale * prod_como)]
            if person['acne']:
                # If a person has acne, probabilisticly add 0 to 5 known
                # comodegenic products. Otherwise probabilisticly add
                # 0 to 5 non-comodegenic products
                if np.random.choice([True, False], p=probs):
                    p_products.append(products[rand_idx]['_id'])
            else:
                if np.random.choice([False, True], p=probs):
                    p_products.append(products[rand_idx]['_id'])
        person['acne_products'] = p_products
        # Add person to database
        new_person = DB_Object.build_from_dict(person)
        people_db.create(new_person)
    print("[SUCCESS] people database is populated")
def build_db(host, port, **kwargs):
    """Rebuild the products, ingredients, and comodegenic collections
    from the JSON source files.

    Keyword args:
        i_path: path to the ingredients JSON file.
        p_path: path to the products JSON file.
        c_path: path to the comodegenic JSON file.
        score_max: if True a product's comodegenic score is the max of its
                   ingredients' scores; otherwise scores are summed.

    Raises:
        KeyError: if a product references an ingredient id missing from
                  the ingredients JSON.
        Exception: on failed inserts or an insert/read count mismatch.
    """
    # Get required file paths
    i_path = kwargs.get('i_path', '')
    p_path = kwargs.get('p_path', '')
    c_path = kwargs.get('c_path', '')
    score_max = kwargs.get('score_max', False)
    # Connect to the required databases
    products_db = DB_CRUD(host, port, db='capstone', col='products')
    ingredients_db = DB_CRUD(host, port, db='capstone', col='ingredients')
    comodegenic_db = DB_CRUD(host, port, db='capstone', col='comodegenic')
    # Make sure user wants to destroy existing DB
    db_qstn = ('[WARNING] This will erase the products, ingredients, '
               'and comodegenic items databases. Continue?')
    if not query_yes_no(db_qstn, default='no'):
        print("No actions taken")
        return
    # Drop databases
    print("Deleting products database")
    products_db.nuke()
    print("Deleting ingredients database")
    ingredients_db.nuke()
    print("Deleting comodegenic database")
    comodegenic_db.nuke()
    # Open files and load JSON data, exit if unsuccessful
    print("Attempting to open .json files.")
    try:
        i_f = open(i_path, 'rb')
        p_f = open(p_path, 'rb')
        c_f = open(c_path, 'rb')
    except IOError as e:
        print(e)
        exit()
    with i_f:
        ingredients_dict = json.load(i_f)
        ing_ins_len = len(ingredients_dict)
    with p_f:
        products_dict = json.load(p_f)
        prod_ins_len = len(products_dict)
    with c_f:
        cmdgnc_list = json.load(c_f)
    print("Populating comodegenic information")
    for entry in cmdgnc_list:
        # Create DB object from entry and insert it
        new_entry = DB_Object.build_from_dict(entry)
        comodegenic_db.create(new_entry)
    # Text index enables the $text searches below
    comodegenic_db.createIndex([('ingredient', TEXT)])
    # Clean and load ingredients into ingredient database
    print("Populating ingredients")
    for ingredient_id in list(ingredients_dict.keys()):
        ingredient = ingredients_dict[ingredient_id]
        # Remove the old id entry from ingredients_dict to avoid storing
        # redundant info in the DB; ingredient entries remain accessible
        # via ingredient_id when the product entries are added.
        del (ingredient['ingredient_id'])
        # Get comodegenic info: exact-phrase text search on the name
        search_term = '"' + ingredient.get('ingredient_name', '') + '"'
        db_objects = comodegenic_db.read({'$text': {"$search": search_term}})
        entries = [DB_Object.build_from_dict(entry) for entry in db_objects]
        # Try to find ingredient in comodegenic DB, fall back to synonyms
        if entries:
            ingredient['comodegenic'] = int(entries[0]['level'])
        else:
            for synonym in ingredient.get('synonym_list', []):
                search_term = '"' + synonym + '"'
                db_objects = comodegenic_db.read(
                    {'$text': {
                        "$search": search_term
                    }})
                entries = [
                    DB_Object.build_from_dict(entry) for entry in db_objects
                ]
                if entries:
                    ingredient['comodegenic'] = int(entries[0]['level'])
                    break
        # Set null value for ingredients without comodegenic score information
        if 'comodegenic' not in ingredient:
            ingredient['comodegenic'] = None
        # Normalize text fields
        ingredient['ingredient_name'] = ingredient.get('ingredient_name',
                                                       '').strip().lower()
        norm_synonyms = []
        synonym_list = ingredient.get('synonym_list', [])
        for synonym in synonym_list:
            norm_synonyms.append(synonym.strip().lower())
        if synonym_list:
            # BUG FIX: the normalized list was previously discarded — the
            # original re-assigned the untouched synonym_list, so synonyms
            # were never lower-cased/stripped in the DB.
            ingredient['synonym_list'] = norm_synonyms
        # Create DB object from ingredient and insert it
        new_ingredient = DB_Object.build_from_dict(ingredient)
        db_op_res = ingredients_db.create(new_ingredient)
        # Record the new mongoDB id in the existing ingredients dictionary
        # if the insertion was successful
        if db_op_res.acknowledged:
            ingredient['_id'] = db_op_res.inserted_id
        else:
            err_msg = ("[FAIL] Database insertion for " +
                       str(new_ingredient) + " was unsuccessful")
            raise Exception(err_msg)
    print("Populating products")
    for product_id in list(products_dict.keys()):
        # Convert ingredient list IDs to Mongo DB object IDs
        new_ing_ids = []
        product = products_dict[product_id]
        for ingredient_id in product.get('ingredient_list', []):
            new_ing_id = ingredients_dict.get(ingredient_id,
                                              {}).get('_id', None)
            if new_ing_id:
                new_ing_ids.append(new_ing_id)
                # Set product comodegenic score. score_max selects between
                # taking the max ingredient score or summing the scores.
                ing_como = ingredients_dict[ingredient_id].get(
                    'comodegenic', 0)
                prod_como = product.get('comodegenic', 0)
                if score_max:
                    product['comodegenic'] = max(prod_como, ing_como)
                else:
                    product[
                        'comodegenic'] = prod_como + ing_como if ing_como else prod_como
            else:
                raise KeyError(
                    "Check scraper, key should exist in ingredients JSON!\nKey: '{}'"
                    .format(ingredient_id))
        if new_ing_ids:
            product['ingredient_list'] = new_ing_ids
        # Set null value for products without comodegenic score information
        if 'comodegenic' not in product:
            product['comodegenic'] = None
        # Remove old style product id
        del (product['product_id'])
        # Create DB object from product and insert it
        new_product = DB_Object.build_from_dict(product)
        products_db.create(new_product)
    # Test the build
    print("Testing data integrity")
    ing_read_len = ingredients_db.read().count()
    prod_read_len = products_db.read().count()
    print("Ingredients inserted: {} Ingredients read: {}".format(
        ing_ins_len, ing_read_len))
    print("Products inserted: {} Products read: {}".format(
        prod_ins_len, prod_read_len))
    if ing_read_len != ing_ins_len or prod_read_len != prod_ins_len:
        raise Exception("[FAIL] The number of inserted items does not match!")
    print("Creating search indexes")
    ingredients_db.createIndex([('ingredient_name', TEXT),
                                ('synonym_list', TEXT)],
                               weights={'ingredient_name': 10},
                               default_language='english')
    products_db.createIndex([('product_name', TEXT)],
                            default_language='english')
    products_db.createIndex([('comodegenic', DESCENDING)],
                            default_language='english')
    print("[SUCCESS] Database is populated")