def make_npy(self):
    # build the digit training set from the KNN image base
    base = self.knn_num
    samples, labels = [], []
    paths = hp.get_paths(base)
    for path in paths:
        img = hp.get_image(base, path)
        sample = self.__buddy_hog(img)
        label = hp.get_name(path)
        labels.append(label)
        samples.append(sample)
    samples = np.float32(samples)
    labels = np.array(labels)
    np.save(self.samples_num_file, samples)
    np.save(self.labels_num_file, labels)

    # build the symbol training set; symbol filenames encode labels offset by 10
    base = self.knn_sym
    samples, labels = [], []
    paths = hp.get_paths(base)
    for path in paths:
        img = hp.get_image(base, path)
        sample = self.__buddy_hog(img)
        label = hp.get_name(path) - 10
        labels.append(label)
        samples.append(sample)
    samples = np.float32(samples)
    labels = np.array(labels)
    np.save(self.samples_sym_file, samples)
    np.save(self.labels_sym_file, labels)
def make_npy(self):
    # same routine as the KNN variant, but reading from the SVM image base
    base = self.svm_num
    samples, labels = [], []
    paths = hp.get_paths(base)
    for path in paths:
        img = hp.get_image(base, path)
        sample = self.__buddy_hog(img)
        label = hp.get_name(path)
        labels.append(label)
        samples.append(sample)
    samples = np.float32(samples)
    labels = np.array(labels)
    np.save(self.samples_num_file, samples)
    np.save(self.labels_num_file, labels)

    base = self.svm_sym
    samples, labels = [], []
    paths = hp.get_paths(base)
    for path in paths:
        img = hp.get_image(base, path)
        sample = self.__buddy_hog(img)
        label = hp.get_name(path) - 10
        labels.append(label)
        samples.append(sample)
    samples = np.float32(samples)
    labels = np.array(labels)
    np.save(self.samples_sym_file, samples)
    np.save(self.labels_sym_file, labels)
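# __buddy_hog is not defined in these snippets. A minimal sketch of what it
# plausibly computes, assuming the HOG descriptor from OpenCV's digits.py
# sample (the SVM(C=2.67, gamma=5.383) parameters in a later snippet come from
# that sample). The quadrant split and bin count are assumptions here:
import cv2
import numpy as np

def buddy_hog_sketch(img, bin_n=16):
    # per-pixel gradients via Sobel filters
    gx = cv2.Sobel(img, cv2.CV_32F, 1, 0)
    gy = cv2.Sobel(img, cv2.CV_32F, 0, 1)
    mag, ang = cv2.cartToPolar(gx, gy)
    # quantize orientation into bin_n bins; the modulo guards the 2*pi edge
    bins = np.int32(bin_n * ang / (2 * np.pi)) % bin_n
    h, w = img.shape[:2]
    # one magnitude-weighted orientation histogram per image quadrant
    cells = [(bins[:h // 2, :w // 2], mag[:h // 2, :w // 2]),
             (bins[h // 2:, :w // 2], mag[h // 2:, :w // 2]),
             (bins[:h // 2, w // 2:], mag[:h // 2, w // 2:]),
             (bins[h // 2:, w // 2:], mag[h // 2:, w // 2:])]
    hists = [np.bincount(b.ravel(), m.ravel(), bin_n) for b, m in cells]
    return np.hstack(hists)  # 4 * bin_n feature vector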
def load(dir_num, dir_sym):
    # appends into module-level lists: num_images, num_labels, sym_images, sym_labels
    print '\n##############'
    print 'loading nums...'
    num_paths = hp.get_paths(dir_num)
    for path in num_paths:
        num_images.append(hp.get_gray_image(dir_num, path))
        num_labels.append(hp.get_test(path, "num")[0])
    print 'loading syms...'
    sym_paths = hp.get_paths(dir_sym)
    for path in sym_paths:
        sym_images.append(hp.get_gray_image(dir_sym, path))
        sym_labels.append(hp.get_test(path, "sym")[0])
    print '##############'
def make_hog_num_file():
    base = hp.small_test_num_images_15x20
    samples, labels = [], []
    paths = hp.get_paths(base)
    for path in paths:
        img = hp.get_image(base, path)
        sample = buddy_hog(img)
        label = hp.get_name(path)
        labels.append(label)
        samples.append(sample)
    samples = np.float32(samples)
    labels = np.array(labels)
    labels = list(labels)

    # header line: <number of samples> <features per sample> <number of classes>
    document = ""
    head1 = str(len(samples)) + " "
    head2 = str(len(samples[0])) + " "
    head3 = str("10") + "\n"
    head = head1 + head2 + head3
    document += head

    # each sample contributes an input line followed by a target line
    for sample, label in zip(samples, labels):
        input = ' '.join(map(str, list(sample)))
        output = make_output_text_vector(int(label), 10)
        document += input + "\n" + output
    print document

    with open(os.path.join("ann/test/", "test-hog-nums.dat"), 'wb') as temp_file:
        temp_file.write(document)
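# make_output_text_vector is referenced above but not shown. The header line
# ("<samples> <inputs> <outputs>") followed by alternating input/target lines
# matches the FANN training-data layout, so the helper is presumably a one-hot
# target row. A hypothetical sketch under that assumption:
def make_output_text_vector(index, length):
    # e.g. make_output_text_vector(3, 10) -> "0 0 0 1 0 0 0 0 0 0\n"
    return ' '.join('1' if i == index else '0' for i in range(length)) + '\n'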
def __make_knn_from_images(self, mode):
    base = ""
    modifier = 0
    samples, labels = [], []
    if mode == "num":
        base = self.knn_num
        modifier = 0
    elif mode == "sym":
        base = self.knn_sym
        modifier = 10  # symbol filenames encode labels offset by 10
    paths = hp.get_paths(base)
    for path in paths:
        img = hp.get_image(base, path)
        sample = self.__buddy_hog(img)
        label = hp.get_name(path) - modifier
        labels.append(label)
        samples.append(sample)
    samples = np.float32(samples)
    labels = np.array(labels)
    model = KNearest(k=5)
    model.train(samples, labels)
    return model
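# KNearest is not defined in these snippets. Given the k=5 constructor and the
# train() call above, it is plausibly the thin cv2 wrapper from OpenCV's
# digits.py sample (pre-3.0 API). A minimal sketch under that assumption:
class KNearestSketch(object):
    def __init__(self, k=3):
        self.k = k
        self.model = cv2.KNearest()

    def train(self, samples, responses):
        self.model.train(samples, responses)

    def predict(self, samples):
        # returns the majority label among the k nearest training samples
        retval, results, neigh_resp, dists = self.model.find_nearest(samples, self.k)
        return results.ravel()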
def __make_svm_from_images(self, dir_train, mode):
    samples, labels = [], []
    paths = hp.get_paths(dir_train)
    modifier = 0
    if mode == "num":
        modifier = 0
    elif mode == "sym":
        modifier = 10
    for path in paths:
        img = hp.get_image(dir_train, path)
        sample = self.__buddy_hog(img)
        label = hp.get_name(path) - modifier
        labels.append(label)
        samples.append(sample)
    samples = np.array(samples)
    labels = np.array(labels)
    # shuffle samples and labels together with a fixed seed for reproducibility
    rand = np.random.RandomState(321)
    shuffle = rand.permutation(len(samples))
    samples, labels = samples[shuffle], labels[shuffle]
    samples = np.float32(samples)
    model = SVM(C=2.67, gamma=5.383)
    model.train(samples, labels)
    return model
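# SVM(C=2.67, gamma=5.383) matches the RBF-kernel wrapper from OpenCV's
# digits.py sample (pre-3.0 API); a minimal sketch under that assumption:
class SVMSketch(object):
    def __init__(self, C=1, gamma=0.5):
        self.params = dict(kernel_type=cv2.SVM_RBF,
                           svm_type=cv2.SVM_C_SVC,
                           C=C, gamma=gamma)
        self.model = cv2.SVM()

    def train(self, samples, responses):
        # samples are float32 feature vectors, responses integer class ids
        self.model.train(samples, responses, params=self.params)

    def predict(self, samples):
        return self.model.predict_all(samples).ravel()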
def run(self):
    # busy-wait loop: poll the queue for directories of plate images
    while True:
        if not self.__queue.empty():
            print 'process dir'
            dir_in = self.__queue.get()
            paths = hp.get_paths(dir_in)
            for num, path in enumerate(paths, start=1):
                # emit each plate image with a random id attached
                meta = {'id': random.randint(1000, 1000000000),
                        'plate': hp.get_image(dir_in, path)}
                self.plate_captured.emit(meta)
                # print path
            self.__queue.task_done()
def process_directory(directory, pmanager, file_types, path_ignore):
    if debug:
        print 'process_directory: ' + directory
    file_paths, directory_paths = get_paths(directory)
    i = 0
    for file_path in file_paths:
        fname, extension = os.path.splitext(file_path)
        extension = extension.replace('.', '')
        if extension in file_types:
            if '.' not in fname:
                i = i + 1
                sync_local_photo(file_path, pmanager, path_ignore)
            else:
                print file_path + ' skipped due to . in name'
    print "Syncing %i photos in local db" % i
def global_testing(input_dir, set_name, mode, ann, knn, msp, svm):
    print "Start recognition:"
    print "run test", (set_name, mode)
    # histogram of ensemble failures, keyed by how many distinct answers the voters gave
    abc = {3: 0, 2: 0, 1: 0}
    problem = 0
    paths = hp.get_paths(input_dir)
    # err starts at total and is decremented for every correct ensemble answer
    total, err = len(paths), len(paths)
    for path in paths:
        # recognition
        img = hp.get_image(input_dir, path)
        at, ann_i = ann.rec(img, mode)
        kt, knn_i = knn.rec(img, mode)
        mt, msp_i = msp.rec(img, mode)
        st, svm_i = svm.rec(img, mode)
        # test
        tlit, ti = hp.get_test(path, mode)
        up_count_dict(tlit)
        up_dict(ann_dict, at, tlit)
        up_dict(knn_dict, kt, tlit)
        up_dict(msp_dict, mt, tlit)
        up_dict(svm_dict, st, tlit)
        rec = None
        # majority vote over ANN, KNN and SVM (MSP is tallied above but does not vote)
        xyz = {at, kt, st}
        if len(xyz) < 3:
            t = [at, kt, st]
            c = Counter(t)
            rec = max(c, key=c.get)
            if tlit == rec:
                up_dict(common_dict, rec, tlit)
                err -= 1
            else:
                rec = None
        if rec is None:
            abc[len(xyz)] += 1
            if len(xyz) == 1:
                # all three voters agreed on a wrong answer: save the image for inspection
                cv2.imwrite("try/awesome/" + str([at, kt, st]) + "__" + str(tlit) + "__" + path, img)
    hp.print_result(total, err, set_name, mode)
    print "awesome:", abc[1]
    print "great:", abc[2]
    print "f**k this shit: ", abc[3]
    print " "
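# The voting logic above in miniature: len({at, kt, st}) < 3 means at least two
# of the three voters agree, and Counter picks that majority answer. A tiny
# self-contained illustration with hypothetical labels:
from collections import Counter

votes = ['7', '7', '1']          # hypothetical at, kt, st
if len(set(votes)) < 3:          # not a three-way disagreement
    c = Counter(votes)
    winner = max(c, key=c.get)
    assert winner == '7'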
def testing_svm_from_image_base(input_dir, set_name, mode, svm):
    success, error = "", ""
    counter, err = 1, 0
    paths = hp.get_paths(input_dir)
    for path in paths:
        # recognition
        img = hp.get_image(input_dir, path)
        lit, i = svm.rec(img, mode)
        # test
        tlit, ti = hp.get_test(path, mode)
        if tlit != lit:
            err += 1
            error += "{0}\n".format(path)
            hp.write_image("try/error", path, img)
        else:
            success += "{0}\n".format(path)
        counter += 1
    hp.print_result(counter, err, set_name, mode)
def testing_from_image_base(input_dir, set_name, mode, ann):
    # init block
    success, error = "", ""
    counter, err = 1, 0
    paths = hp.get_paths(input_dir)
    for path in paths:
        # recognition
        img = hp.get_image(input_dir, path)
        lit, i = ann.rec(img, mode)
        # test
        tlit, ti = hp.get_test(path, mode)
        if tlit != lit:
            err += 1
            error += "{0}\n".format(path)
            hp.write_image("try/error", path, img)
        else:
            success += "{0}\n".format(path)
        counter += 1
    hp.print_result(counter, err, set_name, mode)
def visualize_text_clustering_result(km_model):
    '''
    Save images to different folders according to the kmeans clustering labels.
    INPUT: km_model: trained KMeans model
    OUTPUT: None
    '''
    text_clustering_result = collections.defaultdict(list)
    for i, label in enumerate(km_model.labels_):
        text_clustering_result[label].append(int(X.index[i]))
    paths = get_paths(category)
    for i in text_clustering_result:
        for j in text_clustering_result[i]:
            product_id = sofa_df.ix[j].product_id
            for path in paths:
                if product_id in path:
                    image = skimage.io.imread('wayfair/images/' + category + '/' + path)
                    new_path = 'wayfair/images/' + category + '/text_clustering/' + str(i) + '/' + path
                    skimage.io.imsave(new_path, image)


if __name__ == '__main__':
    categories = ['sofa', 'coffee_table', 'office', 'dining', 'bookcase',
                  'nightstand', 'bed', 'dresser']
    for category in categories:
        client = MongoClient()
        db = client['furniture']
def build_all_vec_info_json(category):
    """
    For each image within the category, calculate the dominant color and the
    PCAed vector. For each item within the category, use tfidf to vectorize
    the description. Merge the info above, together with product info (title,
    price, URL etc.), into one dataframe, then save it to a json file.
    INPUT: category: string
    OUTPUT: None
    """
    paths = get_paths(category)
    print len(paths)

    # Get dominant color:
    start_time = time.time()
    domi_color_dict = get_domi_color(paths, category)
    time_1 = time.time()
    print "Time to run get_domi_color (s): ", time_1 - start_time

    # Get PCAed features and pickle transformers:
    feature_dict_pca, pca_scaler, pca_model = image_featurizer(category, pca=True)
    path = "wayfair/pickle/" + category + "_pca_scaler.pkl"
    with open(path, "w") as f:
        pickle.dump(pca_scaler, f)
    path = "wayfair/pickle/" + category + "_pca_model.pkl"
    with open(path, "w") as f:
        pickle.dump(pca_model, f)

    # Building dataframe from dictionaries:
    domi_pca_dict = defaultdict(dict)
    for i in domi_color_dict:
        domi_pca_dict[i]["domi"] = domi_color_dict[i]
    for i in feature_dict_pca:
        domi_pca_dict[i]["pca"] = feature_dict_pca[i]
    domi_pca_df = pd.DataFrame(domi_pca_dict).T
    domi_pca_df = domi_pca_df.reset_index()
    domi_pca_df.rename(columns={"index": "path"}, inplace=True)
    domi_pca_df["product_id"] = domi_pca_df["path"].apply(
        lambda x: x.split(".")[0].split("_")[-2])

    client = MongoClient()
    db = client["furniture"]
    collection = db[category]
    products_df = pd.DataFrame(list(collection.find()))
    products_df_small = products_df[
        ["product_id", "title", "price", "url", "description_all",
         "rating_avg", "rating_count"]
    ]
    all_info_df = pd.merge(domi_pca_df, products_df_small, how="inner",
                           left_on="product_id", right_on="product_id")

    # Train a tfidf vectorizer & pickle it:
    X = all_info_df["description_all"]
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words="english",
                            max_df=0.8, max_features=1000, ngram_range=(1, 2))
    tfidf_matrix = tfidf.fit_transform(X).todense()
    path = "wayfair/pickle/" + category + "_tfidf.pkl"
    with open(path, "w") as f:
        pickle.dump(tfidf, f)

    # Add tfidf vectors to the dataframe:
    tfidf_matrix = np.array(tfidf_matrix)
    tfidf_dict = defaultdict(dict)
    for i in xrange(tfidf_matrix.shape[0]):
        index = X.index[i]
        tfidf_dict[index]["tfidf_vec"] = tfidf_matrix[i, :]
    tfidf_df = pd.DataFrame(tfidf_dict).T
    tfidf_df = tfidf_df.reset_index()
    tfidf_df.rename(columns={"index": "ixx"}, inplace=True)
    all_info_df = pd.merge(all_info_df, tfidf_df, how="inner",
                           left_index=True, right_on="ixx")
    all_info_df.drop("ixx", axis=1, inplace=True)

    # Some final cleaning & save to json:
    all_info_df = all_info_df[all_info_df["domi"].notnull()]
    all_info_df = all_info_df[all_info_df["pca"].notnull()]
    all_info_df = all_info_df[all_info_df["description_all"].notnull()]
    path = "wayfair/" + category + "_vec_info.json"
    all_info_df.to_json(path)
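# Hedged usage sketch: the pickled vectorizer written above can be reloaded to
# vectorize a new description at query time. The path assumes the "sofa"
# category and the query string is hypothetical:
import pickle

with open("wayfair/pickle/sofa_tfidf.pkl") as f:
    tfidf = pickle.load(f)
query_vec = tfidf.transform(["mid-century walnut sofa"]).todense()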
def __load_sym_patterns(self, input_dir):
    paths = hp.get_paths(input_dir)
    self.__patterns_sym = [hp.get_gray_image(input_dir, path) for path in paths]
    self.__labels_sym = [hp.get_test(path, "sym")[0] for path in paths]
import helper

args = parse_arguments()
hot_reload = args.hot_reload
write_info_file(args)

if hot_reload:
    client = docker.from_env()
    cwd = os.getcwd()
    docker_env = args.docker_env
    docker_image = args.image_name
    env = None
    config_file = None
    target_folder, mdai_folder, config_path = helper.get_paths(args)

    # Detect config file if exists
    if os.path.exists(config_path):
        config_file = config_path

    # Prioritize config file values if it exists
    if config_file is not None:
        config = helper.process_config_file(config_file)
        placeholder_values = hot_reload_values
        helper.resolve_parent_image(placeholder_values, config, helper.PARENT_IMAGE_DICT)
        helper.add_env_variables(placeholder_values, config.get("env"))

    relative_mdai_folder = os.path.relpath(mdai_folder, target_folder)
    os.chdir(os.path.join(BASE_DIRECTORY, "mdai"))
def clean(make_output, clean_data, orders_imported, verbose, run_pricing,
          ordernum, orderid, email, customerid, product, productid,
          lineitem, lineid, shipmentorder, shipmentid):
    '''
    param make_output <Boolean> output the data to different files, i.e. move from tsv to csv
    param clean_data <Boolean> clean the data and apply the proper fields we expect
    param orders_imported <Boolean> if the order record has been imported, flip to true
        so that we can create the linkage between order : record
    '''
    # field names for swapping down below
    ORDERC = 'Order__c'
    ACCOUNTC = 'AccountId__c'
    PRODUCTC = 'Product__c'
    LINEITEMC = 'OrderProduct__c'
    SHIPC = 'Shipment__c'

    # make logs
    logs = helper.Logger(verbose)

    # the actual good stuff
    dir_path = os.path.join(os.path.dirname(os.path.realpath(__file__)))
    logs.write("Current file directory :: %s" % dir_path)
    directories = [
        d for d in os.listdir(dir_path)
        if os.path.isdir(os.path.join(dir_path, d))
    ]
    print("Please choose a directory")
    for ix, dir in enumerate(directories):
        print(ix, dir)
    directoryindex = int(input(">> "))
    row = None
    if directoryindex < len(directories):
        # get the directory that we want to clean/run through
        clean_directory = directories[directoryindex]
        # retrieve root, dirs, and files from walk, retain only filenames
        _, _, filenames = next(
            os.walk(os.path.join(dir_path, clean_directory, 'tsv')),
            (None, None, []))
        # get the pricing files that we need
        if run_pricing:
            pricingfile = pricingmap.create_pricing(clean_directory)
            prices = pricingmap.return_pricing(pricingfile)
        else:
            pricingfile = os.path.join(pricingmap.make_dir(clean_directory),
                                       'order_product_pricing.csv')
            prices = pricingmap.return_pricing(pricingfile)
        import_folder = os.path.join(dir_path, 'imports')
        # generate the order map
        # order_path = os.path.join(import_folder, 'imported_orders')
        # ordermap = helper.OrderMap(order_path, ordernum, orderid)
        # ordermapping = ordermap.get_order_map()
        # add a place to get customer mappings too
        customer_directory = os.path.join(import_folder, 'imported_customers')
        customermap = helper.CustomerMap(customer_directory, email, customerid)
        customermapping = customermap.get_customer_map()
        # and product mappings
        product_directory = os.path.join(import_folder, 'imported_products')
        productmap = helper.ProductMap(product_directory, product, productid)
        productmapping = productmap.get_product_map()
        # and finally order products
        lineitem_directory = os.path.join(import_folder, 'imported_orderproducts')
        linemap = helper.LineItemMap(lineitem_directory, lineitem, lineid)
        lineitemmapping = linemap.get_lineitem_map()
        # and shipments too
        shipment_directory = os.path.join(import_folder, 'imported_shipments')
        shipmentmap = helper.ShipmentMap(shipment_directory, shipmentorder, shipmentid)
        shipmentmapping = shipmentmap.get_shipment_map()
        # for each file, let's clean out the tsvs and export to csv
        for filename in filenames:
            logs.write('Reading %s' % filename)
            if filename[0] == '.' or filename[0] == '_':
                logs.write('Skipping, invalid file %s' % filename)
            else:
                filetype = helper.chooser(filename)
                # directory to read from, read to, and clean to
                p_in, p_out, p_clean = helper.get_paths(dir_path, clean_directory)
                # get headers
                order_type, header = helper.get_headers(
                    os.path.join(p_in, filename), logs)
                # need to handle this encoding because we have weird bytes
                with open(os.path.join(p_in, filename),
                          encoding='ISO-8859-1') as tsvfile:
                    # use a DictReader to preserve header names
                    reader = csv.DictReader(tsvfile, delimiter='\t')
                    if make_output:
                        # backup the stuff from tsv to csv
                        out_filename = '%s%s.csv' % (filename.split('.')[0], order_type)
                        with open(os.path.join(p_out, out_filename), 'w') as csvfile:
                            csvwriter = csv.writer(csvfile, delimiter=',',
                                                   quotechar='"',
                                                   quoting=csv.QUOTE_ALL)
                            try:
                                csvwriter.writerow(header)
                                for row in reader:
                                    csvwriter.writerow(v for k, v in row.items())
                            except Exception as e:
                                logs.write(row) if row else logs.write('Exception !!')
                                logs.write(e)
                                break
                if clean_data:
                    logs.write('cleaning %s' % filename)
                    # make a new list for objects that dont have record
                    # missingorders = []
                    # missing_filename = '%s%s-Clean.csv' % (filename.split('.')[0], '_MissingOrderLink')
                    with open(os.path.join(p_in, filename),
                              encoding='ISO-8859-1') as tsvfile:
                        clean_reader = csv.DictReader(tsvfile, delimiter='\t')
                        # and one for the clean file
                        if make_output:
                            # and create a new clean file for this
                            clean_filename = '%s%s-Clean.csv' % (
                                filename.split('.')[0], order_type)
                            with open(os.path.join(p_clean, clean_filename), 'w') as cleanfile:
                                cleanwriter = csv.writer(cleanfile, delimiter=',',
                                                         quotechar='"',
                                                         quoting=csv.QUOTE_ALL)
                                try:
                                    if filetype == 1 or filetype == 2:
                                        header.append('Product_External_ID__c')
                                        header.append('Order_External_ID__c')
                                    elif filetype == 4:
                                        header.append('Custom_Field_1__c')
                                        header.append('Order_External_Id__c')
                                    elif filetype == 5:
                                        header.append('TLA_Shipment_Provider_Carrier__c')
                                        header.append('Order_External_ID__c')
                                        header.append('Order_Site_Id__c')
                                    cleanwriter.writerow(header)
                                    for crow in clean_reader:
                                        if filetype == 3:
                                            helper.order_cleaner(crow, prices, ordernum)
                                            helper.map__c(crow, customermapping, ACCOUNTC)
                                        if orders_imported and filetype != 3:
                                            if filetype == 1:
                                                # order product
                                                helper.order_product_cleaner(crow)
                                                helper.map__c(crow, productmapping, PRODUCTC)
                                            elif filetype == 2:
                                                # shipment product
                                                helper.shipment_product_cleaner(crow)
                                                helper.map__c(crow, lineitemmapping, LINEITEMC)
                                                helper.map__c(crow, shipmentmapping, SHIPC)
                                            elif filetype == 4:
                                                # payment
                                                helper.payment_cleaner(crow)
                                            elif filetype == 5:
                                                # shipment
                                                helper.shipment_cleaner(crow)
                                        cleanwriter.writerow(v for k, v in crow.items())
                                except Exception as e:
                                    logs.write(crow) if crow else logs.write('Exception !!')
                                    logs.write(e)
                                    raise e