dd.set_return_format(dd.RETURN_PYTHON) autokill = args.auto_kill # Create a service if args.create: description = 'image classification service' mllib = 'caffe' model = {'repository': args.create} parameters_input = { 'connector': 'image', 'width': args.img_width, 'height': args.img_height } parameters_mllib = {'nclasses': args.nclasses} parameters_output = {} dd.put_service(args.sname, model, description, mllib, parameters_input, parameters_mllib, parameters_output) else: pass list_bench_files = [] with open(args.list_bench_files) as f: for l in f: list_bench_files.append(args.remote_bench_data_dir + '/' + l.rstrip()) init_batch_size = 1 batch_sizes = [] l = init_batch_size while l <= args.max_batch_size: batch_sizes.append(l) if l < 32: l = l * 2 else:
ntrees = 100 metric = 'angular' # or 'euclidean' # creating ML service model_repo = os.getcwd() + '/model' model = {'repository': model_repo, 'templates': '../templates/caffe/'} parameters_input = {'connector': 'image', 'width': width, 'height': height} # Only indexing needs the template. if args.index: parameters_mllib = {'nclasses': nclasses, 'template': 'googlenet'} else: parameters_mllib = {'nclasses': nclasses} parameters_output = {} dd.put_service(sname, model, description, mllib, parameters_input, parameters_mllib, parameters_output, mltype) # reset call params parameters_input = {} parameters_mllib = {'gpu': True, 'extract_layer': extract_layer} parameters_output = {'binarized': binarized} if args.index: try: os.remove('names.bin') except: pass s = shelve.open('names.bin') # list files in image repository c = 0
host = 'localhost' port = 8080 sname = 'test' description = 'clustering' mllib = 'tsne' dd = DD(host) dd.set_return_format(dd.RETURN_PYTHON) training_repo = 'http://deepdetect.com/dd/datasets/mnist_csv/mnist_test.csv' # service creation model = {'repository':model_repo} parameters_input = {'connector':'csv'} parameters_mllib = {} parameters_output = {} dd.put_service(sname,model,description,mllib, parameters_input,parameters_mllib,parameters_output,'unsupervised') # training train_data = [training_repo] parameters_input = {'id':'','separator':',','label':'label'} parameters_mllib = {'iterations':500} parameters_output = {} predout = dd.post_train(sname,train_data,parameters_input,parameters_mllib,parameters_output,async=True) time.sleep(1) train_status = '' while True: train_status = dd.get_train(sname,job=1,timeout=3) if train_status['head']['status'] == 'running': print train_status['body']['measure'] else:
parser.add_argument('--img-out', help='transformed image', required=True)
parser.add_argument('--gpu',
                    help='whether to run on GPU',
                    action='store_true')
args = parser.parse_args()

# Service creation call: register the model found at --model-in-path under
# the fixed service name 'testggan' on the torch backend. The image
# connector is configured with a square input of side --img-size.
model = {'repository': args.model_in_path}
parameters_input = {
    'connector': 'image',
    'width': args.img_size,
    'height': args.img_size
}
parameters_mllib = {'gpu': args.gpu}
parameters_output = {}
try:
    jout = dd.put_service('testggan', model, 'gan generator inference test',
                          'torch', parameters_input, parameters_mllib,
                          parameters_output)
except Exception:
    # Best effort: a failure here is treated as "service already there" and
    # the script carries on to the prediction call. Only Exception
    # subclasses are caught so that Ctrl-C and sys.exit() still propagate.
    # NOTE(review): any other client error is reported with the same
    # message -- confirm this is acceptable.
    print('model already exists')

# Inference call parameters: these replace the creation-time dictionaries
# above and are meant for the prediction request that follows.
data = [args.img_in]
parameters_input = {
    'rgb': True,
    'scale': 0.00392,  # approximately 1 / 255
    "mean": [0.5, 0.5, 0.5],
    "std": [0.5, 0.5, 0.5]
}
parameters_mllib = {'extract_layer': 'last'}
parameters_output = {}
# copy new model recentmodel = most_recent_iteration(args.builddir) print('Using model ' + recentmodel) shutil.copy2(os.path.join('builds', args.builddir, 'snapshots', recentmodel), 'dedemodel/model.caffemodel') # setup DeepDetect service if necessary dd = DD('localhost') dd.set_return_format(dd.RETURN_PYTHON) model = {'repository': '/dockershare/ssd/dedemodel'} parameters_input = {'connector': 'image', 'width': 512, 'height': 512} parameters_mllib = {'nclasses': 7} parameters_output = {} detect = dd.delete_service('ssd') detect = dd.put_service('ssd', model, 'single-shot detector', 'caffe', parameters_input, parameters_mllib, parameters_output, 'supervised') # recursively process input directory for root, dirs, files in os.walk(folder_input): for name in sorted(files): name, ext = os.path.splitext(name) if (ext.lower().endswith(('.mp4', '.avi', '.mov')) and os.path.exists(os.path.join(root, name + '.txt')) and (args.video == 'v' or args.video == name)): # start processing the video print('Processing video ' + name + '...') # video specific pathing output_jpg_unannotated = os.path.join(folder_output,
template = service['template'] activation = service['activation'] test_split = float(service["test_split"]) min_count = int(service["min_count"]) min_word_length = int(service["min_word_length"]) batch_size = int(service["batch_size"]) test_interval = int(service["test_interval"]) mllib = 'caffe' model = {'templates':'/var/deepdetect/templates/caffe/','repository':root_repository+service_name} parameters_input_service = {'connector':'txt'} if template == "mlp": parameters_mllib_service = {'template':template,'nclasses':nclasses,'layers':layers,'activation':activation} elif template == "lregression": parameters_mllib_service = {'template':template,'nclasses':nclasses,'activation':activation} parameters_output_service = {'measure':['mcll','f1']} dd.put_service(service_name,model,description,mllib,parameters_input_service,parameters_mllib_service,parameters_output_service) #Start training the service iterations = int(service['iterations']) solver_type = service['solver_type'] base_lr = float(service['base_lr']) parameters_input_training = {'shuffle':True,'test_split':test_split,'min_count':min_count,'min_word_length':min_word_length,'count':False} parameters_mllib_training = {'gpu':True,'solver':{'iterations':iterations,'test_interval':test_interval,'base_lr':base_lr,'solver_type':solver_type},'net':{'batch_size':batch_size}} parameters_output_training = {'measure':['mcll','f1','cmdiag','cmfull']} train_data = [root_repository+'dataset/'] training_service = dd.post_train(service_name.lower(),train_data,parameters_input_training,parameters_mllib_training,parameters_output_training,async=True) job_number = training_service['head']['job'] #Get training data while the service is running sleep(20) status_code = 200 count_job_data = 1 while status_code == 200:
def segment(image, nclasses=150, port=8080, host="localhost"):
    """Segment an image through a DeepDetect service and mask the original.

    The function creates (if needed) the ``segserv`` caffe service, asks it
    for a per-pixel class map of ``image``, paints every class with a
    pseudo-random colour (the RNG is seeded, so colours are reproducible),
    keeps only the pixels whose colour is exactly ``[47, 197, 233]`` and
    multiplies that mask with the original image. The result is written to
    ``result.png`` in the current directory and returned.

    Args:
        image: file name of the image, relative to the hard-coded
            directories used below.
        nclasses: number of segmentation classes of the model.
        port: DeepDetect server port.
        host: DeepDetect server host.

    Returns:
        The masked image as a numpy array.
    """
    random.seed(134124)
    model_dir = '/home/model'
    sname = 'segserv'
    description = 'image segmentation'
    mllib = 'caffe'
    mltype = 'unsupervised'
    dd = DD(host, port)
    dd.set_return_format(dd.RETURN_PYTHON)

    def random_color():
        ''' generate rgb using a list comprehension '''
        return [random.randint(0, 255) for _ in range(3)]

    # NOTE(review): the image is read locally from /home/ubuntu/model/ but
    # handed to the server as /home/model/ -- presumably the same directory
    # seen from two different mount points; confirm.
    raw_img = plt.imread("/home/ubuntu/model/" + image).astype("float32") / 255
    # Image arrays are indexed (row, column): the first axis is the height.
    height, width = raw_img.shape[:2]

    # creating ML service
    model = {'repository': model_dir}
    parameters_input = {'connector': 'image', 'width': width, 'height': height}
    parameters_mllib = {'nclasses': nclasses}
    parameters_output = {}
    try:
        dd.put_service(sname, model, description, mllib, parameters_input,
                       parameters_mllib, parameters_output, mltype)
    except Exception:
        # most likely the service already exists
        pass

    # prediction call
    parameters_input = {'segmentation': True}
    parameters_mllib = {'gpu': True, 'gpuid': 0}
    parameters_output = {}
    data = ["/home/model/" + image]
    detect = dd.post_predict(sname, data, parameters_input, parameters_mllib,
                             parameters_output)
    prediction = detect['body']['predictions'][0]
    # Build a real list first: on Python 3 ``map`` is lazy and np.array()
    # would wrap the iterator object instead of its values.
    pixels = np.array([int(val) for val in prediction['vals']])
    imgsize = prediction['imgsize']
    out_shape = (imgsize['height'], imgsize['width'])

    # visual output: one colour per class, drawn in class order so the
    # seeded RNG always yields the same palette.
    label_colours = np.array([random_color() for _ in range(nclasses)])

    # Each channel starts as a copy of the class map; pixels whose class is
    # outside [0, nclasses) therefore keep their raw class value.
    planes = [pixels.copy() for _ in range(3)]
    for label in range(nclasses):
        mask = pixels == label
        for channel, plane in enumerate(planes):
            plane[mask] = label_colours[label, channel]

    rgb = np.zeros(out_shape + (3,))
    for channel, plane in enumerate(planes):
        rgb[:, :, channel] = np.reshape(plane, out_shape) / 255.0
    print(rgb[0, 0])

    # Element-wise mask: 1 where a channel value equals the matching
    # component of the target colour, 0 elsewhere.
    body_mask = np.where(rgb * 255 == np.array([47, 197, 233]), 1, 0)
    result = body_mask * raw_img
    plt.imsave("result.png", result)
    return result
host = 'localhost'
port = 8080
sname = 'test'
description = 'clustering'
mllib = 'tsne'
dd = DD(host, port)
dd.set_return_format(dd.RETURN_PYTHON)

training_repo = 'http://deepdetect.com/dd/datasets/mnist_csv/mnist_test.csv'

# service creation: an unsupervised t-SNE service fed through the CSV
# connector.
model = {'repository': model_repo}
parameters_input = {'connector': 'csv'}
parameters_mllib = {}
parameters_output = {}
dd.put_service(sname, model, description, mllib, parameters_input,
               parameters_mllib, parameters_output, 'unsupervised')

# training
train_data = [training_repo]
parameters_input = {'id': '', 'separator': ',', 'label': 'label'}
parameters_mllib = {'iterations': 500}
parameters_output = {}
# The asynchronous flag is passed positionally: spelling it ``async=True``
# is a SyntaxError since Python 3.7, where ``async`` is a reserved word.
# NOTE(review): assumes the flag is the sixth positional parameter of
# post_train -- confirm against the installed dd_client.
predout = dd.post_train(sname, train_data, parameters_input,
                        parameters_mllib, parameters_output, True)
# Give the server a moment to start the job before it is polled.
time.sleep(1)
train_status = ''
nclasses = args.nclasses layer_size = 512 # auto anyways width = height = 300 dd = DD(host) dd.set_return_format(dd.RETURN_PYTHON) ntrees = 1000 metric = 'angular' # or 'euclidean' # creating ML service model_repo = os.getcwd() + '/' + args.model_dir model = {'repository':model_repo,'templates':'../templates/caffe/'} parameters_input = {'connector':'image','width':width,'height':height} parameters_mllib = {'nclasses':nclasses} parameters_output = {} try: dd.put_service(sname,model,description,mllib, parameters_input,parameters_mllib,parameters_output,mltype) except: pass # reset call params parameters_input = {} parameters_mllib = {'gpu':True} parameters_output = {'rois':'rois','confidence_threshold':args.confidence_threshold,'best':1} if args.index: try: os.remove('data.bin') except: pass s = shelve.open('data.bin')
class DNNFeatureExtractor(FeatureGenerator):
    """Index and search images using predictions of a DeepDetect service.

    When ``dnnmodel.extract_layer`` is set the service runs as
    'unsupervised' and the values of the extracted layer are indexed;
    otherwise it runs as 'supervised' and the predicted classes (tags) are
    indexed, with the tags of every image additionally kept in a shelve
    file (``tags.bin``) inside the index repository.
    """

    def __init__(self,
                 dnnmodel,
                 image_files,
                 index_repo,
                 batch_size=32,
                 dd_host='localhost',
                 dd_port=8080,
                 dd_description='image classification',
                 meta_in='',
                 meta_out='',
                 captions_in='',
                 captions_out='',
                 mapi_in='',
                 mapi_out=''):
        """Store the configuration, open the tag store, reset the service.

        The index lives in ``index_repo + '/' + dnnmodel.name``; the
        directory is created when missing. Any DeepDetect service already
        registered under the model name is deleted so that the next call
        starts from a fresh service.
        """
        self.dd_host = dd_host
        self.dd_port = dd_port
        self.dd_description = dd_description
        self.dd_mllib = 'caffe'
        self.meta_in = meta_in
        self.meta_out = meta_out
        self.captions_in = captions_in
        self.captions_out = captions_out
        self.mapi_in = mapi_in
        self.mapi_out = mapi_out
        self.gpuid = 0
        self.dnnmodel = dnnmodel
        if self.dnnmodel.extract_layer:
            self.dd_mltype = 'unsupervised'
        else:
            self.dd_mltype = 'supervised'
        self.image_files = image_files
        self.batch_size = batch_size
        self.binarized = False
        self.dd = DD(self.dd_host, self.dd_port)
        self.dd.set_return_format(self.dd.RETURN_PYTHON)
        self.index_repo = index_repo + '/' + self.dnnmodel.name
        try:
            os.mkdir(self.index_repo)
        except OSError:
            # Best effort: the directory may already exist.
            pass
        self.st = {}  # shelve used for full tags storage
        self.stm = {}  # in memory tmp storage
        if self.dd_mltype == 'supervised':
            self.st = shelve.open(self.index_repo + '/tags.bin')
        self.delete_dd_service()

    def __del__(self):
        """Flush the in-memory tags to the shelve and close it."""
        if self.dd_mltype == 'supervised':
            for uri, tags in self.stm.items():
                self.st[uri] = tags
            self.st.close()

    @staticmethod
    def _as_prediction_list(predictions):
        """Return ``predictions`` as a list, wrapping a lone prediction dict."""
        if isinstance(predictions, dict):
            return [predictions]
        return predictions

    def _prediction_parameters(self):
        """Build the (input, mllib, output) dictionaries of a predict call."""
        parameters_input = {}
        parameters_mllib = {
            'gpu': True,
            'gpuid': self.gpuid,
            'extract_layer': self.dnnmodel.extract_layer
        }
        if self.dd_mltype == 'unsupervised':
            parameters_output = {'binarized': self.binarized}
        else:
            parameters_output = {'best': self.dnnmodel.best}
        return parameters_input, parameters_mllib, parameters_output

    def create_dd_service(self):
        """Create the DeepDetect service for the model.

        Raises:
            Exception: when the returned status code is neither 201 nor 403.
                (403 presumably signals that the service already exists --
                TODO confirm against the DeepDetect API.)
        """
        model = {'repository': self.dnnmodel.model_repo}
        parameters_input = {
            'connector': 'image',
            'width': self.dnnmodel.img_width,
            'height': self.dnnmodel.img_height
        }
        parameters_mllib = {
            'nclasses': self.dnnmodel.nclasses,
            'gpu': True,
            'gpuid': self.gpuid
        }
        parameters_output = {}
        screate = self.dd.put_service(self.dnnmodel.name, model,
                                      self.dd_description, self.dd_mllib,
                                      parameters_input, parameters_mllib,
                                      parameters_output, self.dd_mltype)
        outcode = screate['status']['code']
        if outcode not in (201, 403):
            logger.error('failed creation of DNN service ' +
                         self.dnnmodel.name)
            raise Exception('failed creating DNN service ' +
                            self.dnnmodel.name)

    def delete_dd_service(self):
        """Delete the DeepDetect service registered under the model name."""
        self.dd.delete_service(self.dnnmodel.name, clear='')

    def preproc(self):
        """No preprocessing is needed with dd at the moment."""
        return

    def index(self):
        """Predict every image in batches and add the results to the index.

        The service is created first and deleted at the end. In
        unsupervised mode one image is predicted up front to learn the size
        of the extracted layer; if that call fails the service is deleted
        and the method returns without indexing. A failing batch is logged
        and skipped.
        """
        self.create_dd_service()
        parameters_input, parameters_mllib, parameters_output = \
            self._prediction_parameters()
        if self.dd_mltype == 'unsupervised':
            # pass one image to get the size of the output layer
            classif = self.dd.post_predict(self.dnnmodel.name,
                                           [self.image_files[0]],
                                           parameters_input, parameters_mllib,
                                           parameters_output)
            if classif['status']['code'] != 200:
                print('response= ' + str(classif))
                logger.error(
                    'failed (index) initial prediction call to model ' +
                    self.dnnmodel.name + ' via dd')
                self.delete_dd_service()
                return
            first = self._as_prediction_list(
                classif['body']['predictions'])[0]
            dim = len(first['vals'])
        else:
            dim = self.dnnmodel.nclasses

        c = 0
        logger.info('dnn feature prediction and indexing for service ' +
                    self.dnnmodel.name + ' with layer of size ' + str(dim))
        with Indexer(dim, self.index_repo) as indexer:
            for x in batch(self.image_files, self.batch_size):
                classif = self.dd.post_predict(self.dnnmodel.name, x,
                                               parameters_input,
                                               parameters_mllib,
                                               parameters_output)
                if classif['status']['code'] != 200:
                    print('response= ' + str(classif))
                    logger.error(
                        'failed (index) batch prediction call to model ' +
                        self.dnnmodel.name + ' via dd')
                    continue
                # Decide on the shape of the response, not on batch_size:
                # a trailing batch of one image must be wrapped as well.
                predictions = self._as_prediction_list(
                    classif['body']['predictions'])
                for p in predictions:
                    if self.dd_mltype == 'unsupervised':
                        indexer.index_single(c, p['vals'], p['uri'])
                        if c > 0 and c % self.batch_size == 0:
                            logger.info('indexed ' + str(c) + ' images')
                    else:
                        puri = str(p['uri'])
                        indexer.index_tags_single(p['classes'], p['uri'])
                        self.stm[puri] = [pc['cat'] for pc in p['classes']]
                    c = c + 1
            indexer.build_index()
            indexer.save_index()
        logger.info('indexed a total of ' + str(c) + ' images')
        self.delete_dd_service()

    def search(self, jdataout=None):
        """Predict every image and look up its nearest neighbours.

        Args:
            jdataout: dictionary handed over to ``to_json``; a fresh empty
                dictionary is used when omitted.

        Returns:
            Whatever ``to_json`` returns for the collected results.

        Raises:
            Exception: when a batch prediction call does not return status
                code 200 (the service is deleted first).
        """
        if jdataout is None:
            # A literal ``{}`` default would be shared between calls.
            jdataout = {}
        self.create_dd_service()
        parameters_input, parameters_mllib, parameters_output = \
            self._prediction_parameters()

        logger.info('dnn feature prediction and searching for service ' +
                    self.dnnmodel.name)
        results = {}
        with Searcher(self.index_repo, search_size=500) as searcher:
            searcher.load_index()
            for x in batch(self.image_files, self.batch_size):
                classif = self.dd.post_predict(self.dnnmodel.name, x,
                                               parameters_input,
                                               parameters_mllib,
                                               parameters_output)
                if classif['status']['code'] != 200:
                    print('response= ' + str(classif))
                    logger.error(
                        'failed batch (search) prediction call to model ' +
                        self.dnnmodel.name + ' via dd')
                    self.delete_dd_service()
                    print(classif)
                    raise Exception(
                        'failed batch (search) prediction call to model ' +
                        self.dnnmodel.name)
                predictions = self._as_prediction_list(
                    classif['body']['predictions'])
                for p in predictions:
                    if self.dd_mltype == 'unsupervised':
                        nns = searcher.search_single(p['vals'], p['uri'])
                    else:
                        puri = str(p['uri'])
                        nns = searcher.search_tags_single(p['classes'], puri)
                        # Attach the stored tags of every neighbour.
                        nns['tags_out_all'] = [
                            self.st[str(nn)] for nn in nns['nns_uris']
                        ]
                    results[p['uri']] = nns
        self.delete_dd_service()
        return self.to_json(results, '/img/reuters/', '/img/tate/',
                            self.dnnmodel.name, self.dnnmodel.description,
                            jdataout, self.meta_in, self.meta_out,
                            self.captions_in, self.captions_out,
                            self.mapi_in, self.mapi_out)
info = dd.info() print(info) sys.exit() if args.delete: delete_service = dd.delete_service(args.model_name,clear='') print(delete_service) sys.exit() # creating ML service if args.create_service: model = {'repository':model_config['path']+args.model_name} parameters_input = {'connector':'image','width':model_config['width'],'height':model_config['height']} parameters_mllib = {'nclasses':model_config['nclasses'],'gpu':True} parameters_output = {} creation = dd.put_service(args.model_name,model,args.model_name,model_config['backend'], parameters_input,parameters_mllib,parameters_output,'supervised') print(creation) if args.img_url: parameters_input = {} parameters_mllib = {} parameters_output = {'best':3} if args.model_name == 'voc0712': parameters_output['bbox'] = True parameters_output['confidence_threshold'] = 0.01 data = [args.img_url] classify = dd.post_predict(args.model_name,data, parameters_input,parameters_mllib,parameters_output) print classify
args = parser.parse_args()

# Service creation call: register the model found at --model-in-path under
# the fixed service name 'testggan' on the backend chosen with --backend.
# The image connector is configured with a square input of side --img-size.
model = {'repository': args.model_in_path}
parameters_input = {
    'connector': 'image',
    'width': args.img_size,
    'height': args.img_size
}
parameters_mllib = {'gpu': args.gpu}
if args.backend == 'tensorrt':
    # Only the tensorrt backend receives a maximum batch size.
    parameters_mllib['maxBatchSize'] = args.maxbatchsize
parameters_output = {}
try:
    jout = dd.put_service('testggan', model, 'gan generator inference test',
                          args.backend, parameters_input, parameters_mllib,
                          parameters_output)
except Exception:
    # Best effort: a failure here is treated as "service already there" and
    # the script carries on to the prediction call. Only Exception
    # subclasses are caught so that Ctrl-C and sys.exit() still propagate.
    # NOTE(review): any other client error is reported with the same
    # message -- confirm this is acceptable.
    print('model already exists')

# Inference call parameters: these replace the creation-time dictionaries
# above and are meant for the prediction request that follows.
data = [args.img_in]
parameters_input = {
    'rgb': True,
    'scale': 0.00392,  # approximately 1 / 255
    "mean": [0.5, 0.5, 0.5],
    "std": [0.5, 0.5, 0.5]
}
parameters_mllib = {'extract_layer': 'last'}
parameters_output = {}
class InformationExtractor(object):
    """ Module with functions for information Extraction """
    wordnet_lemmatizer = WordNetLemmatizer()
    # External service URLs
    google_service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    probase_service_url = "https://concept.research.microsoft.com/api/Concept/ScoreByProb"
    # DD constants
    height = width = 224
    nclasses_clothing = 304
    nclasses_bags = 37
    nclasses_footwear = 51
    nclasses_fabric = 233
    # setting up DD client
    mllib = 'caffe'

    def __init__(self, word_vectors, companies, styles, materials, items,
                 probase_brands, probase_materials, patterns,
                 top_category_items, deep_detectStartup, confFilePath, tfidf):
        """Load the JSON configuration, the word vectors and domain lists.

        When ``deep_detectStartup`` is truthy a DeepDetect client is
        created from the configured host/port and the configured services
        are started.
        """
        # Close the configuration file deterministically.
        with open(confFilePath) as conf_file:
            self.conf = json.load(conf_file)
        self.tfidf = tfidf
        self.api_key = self.conf["google_api_key_path"]
        self.deep_detect_models = self.conf["deep_detect_models"]
        self.CAPTION_FACTOR = self.conf["caption_factor"]
        self.COMMENTS_FACTOR = self.conf["comments_factor"]
        self.USERTAG_FACTOR = self.conf["usertag_factor"]
        self.HASHTAG_FACTOR = self.conf["hashtag_factor"]
        if deep_detectStartup:
            self.dd = DD(self.conf["deep_detect_host"],
                         port=self.conf["deep_detect_port"])
            self.startup_deep_detect()
        self.wordvec_model = gensim.models.KeyedVectors.load_word2vec_format(
            word_vectors, binary=False)
        self.companies = companies
        self.styles = styles
        self.materials = materials
        self.items = items
        self.brands_keywords_google = []
        self.materials_keywords_google = []
        self.probase_brands = probase_brands
        self.probase_materials = probase_materials
        self.colors = []
        self.patterns = patterns
        self.top_category_items = top_category_items
        self.lemmatize()

    def lemmatize(self):
        """ Lemmatize domain lists (maps each lemma to its original word)"""
        lemma_of = self.wordnet_lemmatizer.lemmatize
        self.styles_lemmas = {lemma_of(style): style for style in self.styles}
        self.materials_lemmas = {
            lemma_of(material): material
            for material in self.materials
        }
        self.items_lemmas = {lemma_of(item): item for item in self.items}

    def _rank_topic(self, weighted_sources, topic, doc_id, num,
                    similarity_fn):
        """Score topic candidates against several weighted token lists.

        For every token of every ``(tokens, factor)`` pair the ``num`` most
        similar candidates receive their similarity as additional score.
        Returns the ``num`` best ``(candidate, score)`` pairs.
        """
        topic = [x.decode('utf-8', 'ignore').encode("utf-8") for x in topic]
        if not topic:
            return []
        freq_scores = {}
        for x in topic:
            freq_scores[x] = 0.0
        # The lowered form and lemma of a candidate do not depend on the
        # token, so they are computed once instead of once per token.
        prepared = []
        for x in topic:
            lowered = x.lower()
            prepared.append(
                (x, lowered, self.wordnet_lemmatizer.lemmatize(lowered)))
        by_score = lambda pair: pair[1]
        for tokens, factor in weighted_sources:
            for token in tokens:
                doc_tfidf = self.tfidf[doc_id]
                scores = [(x, similarity_fn(token, lowered, lemma, factor,
                                            doc_tfidf))
                          for x, lowered, lemma in prepared]
                for x, score in sorted(scores, reverse=True,
                                       key=by_score)[:num]:
                    freq_scores[x] = freq_scores[x] + score
        return sorted(freq_scores.items(), reverse=True, key=by_score)[:num]

    def find_closest_semantic(self, caption, comments, tags, hashtags,
                              segmented_hashtags, num, topic, id):
        """ Finds num semantically closest candidates for a given topic"""
        sources = [
            (caption, self.CAPTION_FACTOR),
            (comments, self.COMMENTS_FACTOR),
            (hashtags, self.HASHTAG_FACTOR),
            (segmented_hashtags, self.HASHTAG_FACTOR),
            (tags, self.USERTAG_FACTOR),
        ]
        return self._rank_topic(sources, topic, id, num,
                                self.token_similarity)

    def token_similarity(self, token, token2, token2Lemma, factor, tfidf):
        """ Returns similarity between two tokens using cosine similarity
        between embeddings, edit distance and TFIDF weighting"""
        # ``bytes`` is ``str`` on Python 2, so byte strings are decoded
        # exactly as before.
        if isinstance(token, bytes):
            token = token.decode("utf-8", "ignore")
        tokenLemma = self.wordnet_lemmatizer.lemmatize(token)
        vocab = self.wordvec_model.wv.vocab
        distance = edit_distance(tokenLemma, token2Lemma)
        if tokenLemma in vocab and token2Lemma in vocab:
            if distance == 0:
                # Identical lemmas get a ten-fold boost.
                factor = factor * 10
            similarity = factor * math.pow(
                float(self.wordvec_model.wv.similarity(tokenLemma,
                                                       token2Lemma)), 2)
        else:
            # Out-of-vocabulary fallback based on edit distance only.
            dist = factor * distance
            similarity = float(1) / float(1 + math.pow(dist, 2))
        tfidf_score = 0.0
        if token in tfidf:
            tfidf_score = tfidf[token]
        encoded = token.encode("utf-8")
        if encoded in tfidf:
            tfidf_score = tfidf[encoded]
        # Floor the weight so that unknown tokens still contribute a little.
        tfidf_score = max(tfidf_score, 0.0001)
        return similarity * tfidf_score

    def find_closest_syntactic(self, caption, comments, tags, hashtags,
                               segmented_hashtags, num, topic, id):
        """ Finds num syntactically closest candidates for a given topic"""
        sources = [
            (caption, self.CAPTION_FACTOR),
            (comments, self.COMMENTS_FACTOR),
            (hashtags, self.HASHTAG_FACTOR),
            (segmented_hashtags, self.HASHTAG_FACTOR),
            (tags, self.USERTAG_FACTOR),
        ]
        return self._rank_topic(sources, topic, id, num,
                                self.token_similarity_syntactic_only)

    def token_similarity_syntactic_only(self, token, token2, token2Lemma,
                                        factor, tfidf):
        """ Returns similarity between two tokens using edit distance and
        TFIDF weighting"""
        tokenLemma = self.wordnet_lemmatizer.lemmatize(token)
        dist = edit_distance(tokenLemma, token2Lemma)
        if dist == 0:
            # Identical lemmas get a ten-fold boost.
            factor = factor * 10
        similarity = factor * (float(1) / float(1 + dist))
        tfidf_score = 0.0
        if token in tfidf:
            tfidf_score = tfidf[token]
        encoded = token.encode("utf-8")
        if encoded in tfidf:
            tfidf_score = tfidf[encoded]
        tfidf_score = max(tfidf_score, 0.0001)
        return similarity * tfidf_score

    def lookup_google(self, params):
        """ Lookup in Google Search

        Returns one dictionary per element of the response's
        ``itemListElement``, holding whichever of ``resultScore``,
        ``detailedDescription``, ``description`` and ``url`` are present.
        """
        #curl "https://kgsearch.googleapis.com/v1/entities:search?query=bebe&key=<key>&limit=2&indent=True&types=Organization"
        url = self.google_service_url + '?' + urllib.urlencode(params)
        #result score = an indicator of how well the entity matched the request constraints.
        response = json.loads(urllib.urlopen(url).read())
        results = []
        if "itemListElement" in response:
            for element in response['itemListElement']:
                dict_result = {}
                if "resultScore" in element:
                    dict_result["resultScore"] = element['resultScore']
                if "result" in element:
                    entity = element["result"]
                    for field in ("detailedDescription", "description",
                                  "url"):
                        if field in entity:
                            dict_result[field] = entity[field]
                results.append(dict_result)
        return results

    @staticmethod
    def _rank_google_results(results, keywords):
        """Return 1 if any keyword occurs in a description field, else 0.0."""
        for result in results:
            for keyword in keywords:
                for field in ("detailedDescription", "description"):
                    if field in result and keyword in result[field]:
                        return 1
        return 0.0

    def rank_google_result_company(self, results):
        """ Binary rank of google search results"""
        return self._rank_google_results(results,
                                         self.brands_keywords_google)

    def rank_google_result_material(self, results):
        """ Binary rank of google search results"""
        # Results only carry the description fields the service returned,
        # so presence is checked before the keyword test (no KeyError).
        return self._rank_google_results(results,
                                         self.materials_keywords_google)

    @staticmethod
    def _rank_probase_result(result, known_concepts):
        """Return 1 + best score among known concepts, or 0.5 if none."""
        scores = [result[x] for x in known_concepts if x in result]
        if scores:
            return 1 + max(scores)
        return 0.5

    def rank_probase_result_company(self, result):
        """Probase ranking: 1 + best brand score, 0.5 without a match"""
        return self._rank_probase_result(result, self.probase_brands)

    def rank_probase_result_material(self, result):
        """Probase ranking: 1 + best material score, 0.5 without a match"""
        return self._rank_probase_result(result, self.probase_materials)

    def lookup_probase(self, params):
        """Probase lookup"""
        #curl "https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance=adidas&topK=10"
        url = self.probase_service_url + '?' + urllib.urlencode(params)
        response = json.loads(urllib.urlopen(url).read())
        return response

    def get_liketoknowitlinks(self, tokens):
        """ Extract liketoknowit links"""
        links = []
        for token in tokens:
            match = re.search(r"http://liketk.it/([^\s]+)", token)
            if match is not None:
                links.append(match.group(0))
        return links

    def lda_topic_models(self, num_topics, num_iter, min_occ, docs):
        """ Extract LDA topic models

        Returns one string per topic made of its five highest-weighted
        words, separated by spaces.
        """
        cvectorizer = CountVectorizer(min_df=min_occ, stop_words="english")
        cvz = cvectorizer.fit_transform(docs)
        lda_model = lda.LDA(n_topics=num_topics, n_iter=num_iter)
        # Called for its fitting side effect; the per-document topic
        # distribution it returns is not used.
        lda_model.fit_transform(cvz)
        topic_summaries = []
        topic_word = lda_model.topic_word_  # all topic words
        n_top_words = 5
        vocab = np.array(cvectorizer.get_feature_names())
        for topic_dist in topic_word:
            # Indices sorted by ascending weight, read backwards.
            topic_words = vocab[np.argsort(topic_dist)][:-(n_top_words +
                                                           1):-1]
            topic_summaries.append(' '.join(topic_words))
        return topic_summaries

    def get_top_num(self, coll, num):
        """ Extract the num most common items (empty list for empty input)"""
        return [item for item, _ in Counter(coll).most_common(num)]

    def get_wikipedia_vote(self, query):
        """ Wikipedia lookup binary rank"""
        pages = wikipedia.search(query)
        for pageName in pages:
            try:
                page = wikipedia.page(pageName)
                content = page.content.lower()
                for keyword in self.brands_keywords_google:
                    if keyword in content:
                        return 1
            except Exception:
                # Best effort: the first page that cannot be loaded ends
                # the lookup with a negative vote.
                return 0.0
        return 0.0

    def get_google_search_vote(self, query):
        """ Google search lookup binary rank"""
        try:
            response = GoogleSearch().search(query)
            for result in response.results:
                text = result.getText().lower()
                title = result.title.lower()
                for keyword in self.brands_keywords_google:
                    if keyword in text or keyword in title:
                        return 1
        except Exception:
            return 0
        return 0

    def emoji_classification(self, emojis, num):
        """ Emoji classification """
        items = {}
        for item in self.items_lemmas.keys():
            items[item] = 0.0
        for emoji in emojis:
            for item_m in self.emoji_to_item(emoji):
                items[item_m] = items[item_m] + 1
        top = sorted(items.items(), reverse=True,
                     key=lambda pair: pair[1])[:num]
        return top

    def emoji_to_item(self, token):
        """Classify item based on emojis"""
        # NOTE(review): the emoji literals below look mis-encoded (several
        # branches compare against the same text); they are kept unchanged
        # -- restore the intended emoji characters from the original file.
        if token == u"π":
            return ["shirt", "top"]
        if token == u"π":
            return ["jean", "trouser", "legging", "jogger"]
        if token == u"π":
            return ["dress"]
        if token == u"π":
            return ["blouse", "shirt"]
        if token == u"π":
            # The list was built but never returned before.
            return ["purse", "bag", "handbag"]
        if token == u"π":
            return ["bag", "handbag"]
        if token == u"π" or token == u"π ":
            return ["bag"]
        if token == u"π":
            return ["shoe", "boot"]
        if token == u"π":
            return ["trainer", "shoe", "boot"]
        if token == u"π " or token == u"π‘ " or token == u"π’":
            return ["heel", "shoe"]
        if token == u"π" or token == u"π©":
            return ["hat"]
        return []

    def map_candidates_to_ontology(self, candidates):
        """ Map candidates from external APIs to our classes

        ``candidates`` holds ``(name, score)`` pairs; every entry of
        ``top_category_items`` has the form ``"label,word word ..."``.
        Returns a dictionary label -> accumulated similarity.
        """
        topic = [
            x.decode('utf-8', 'ignore').encode("utf-8")
            for x in self.top_category_items
        ]
        freq_scores = {}
        for x in topic:
            freq_scores[x.split(",")[0]] = 0.0
        # No document is associated with external candidates, so an empty
        # TF-IDF table is passed: token_similarity then applies its minimum
        # weight uniformly. (The argument used to be missing altogether,
        # which made every call fail with a TypeError.)
        no_tfidf = {}
        for token in candidates:
            for x in topic:
                parts = x.split(",")
                label = parts[0]
                words = parts[1].split(" ")
                scores = []
                for word in words:
                    token2 = word.lower()
                    token2Lemma = self.wordnet_lemmatizer.lemmatize(token2)
                    similarity = self.token_similarity(
                        token[0], token2, token2Lemma, self.CAPTION_FACTOR,
                        no_tfidf)
                    scores.append(similarity * math.pow(token[1], 2))
                # The best matching word of the label counts.
                freq_scores[label] = freq_scores[label] + max(scores)
        return freq_scores

    def liketkit_classification(self, url):
        """ Liketkit link scraping

        Returns the text nodes of every product page linked from ``url``;
        whatever was collected so far is returned when an error occurs.
        """
        text = []
        driver = None
        try:
            driver = webdriver.PhantomJS()
            driver.get(url)
            p_element = driver.find_element_by_class_name("ltk-products")
            products = p_element.find_elements_by_xpath(".//*")
            product_urls = [prod.get_attribute("href") for prod in products]
            for product_url in product_urls:
                driver.get(product_url)
                html = driver.page_source
                soup = BeautifulSoup(html, "lxml")
                data = soup.findAll(text=True, recursive=True)
                text.extend(list(data))
            return text
        except Exception:
            print("error in liketkit classification")
            return text
        finally:
            # Always shut the browser process down.
            if driver is not None:
                driver.quit()

    def google_vision_lookup(self, imagePath):
        """ Google vision API lookup """
        item_candidates = []
        try:
            # Instantiates a client
            client = vision.ImageAnnotatorClient()
            # The name of the image file to annotate
            file_name = os.path.join(os.path.dirname(__file__), imagePath)
            # Loads the image into memory
            with io.open(file_name, 'rb') as image_file:
                content = image_file.read()
            image = types.Image(content=content)
            # Performs label detection on the image file
            response = client.label_detection(image=image)
            for label in response.label_annotations:
                item_candidates.append((label.description, label.score))
            return item_candidates
        except Exception:
            print("error in google_vision_LF")
            return item_candidates

    def _collect_predictions(self, sname, data, target, weight=None):
        """Append (item, probability) pairs predicted by one service.

        Every predicted category is split on commas; each part is appended
        to ``target`` with the class probability, multiplied by ``weight``
        when one is given.
        """
        res = self.dd.post_predict(sname, data, {}, {}, {'best': 10})
        classes = res[u"body"][u"predictions"][0][u"classes"]
        for c in classes:
            items = c[u"cat"].strip(" ").split(",")
            prob = c[u"prob"]
            if weight is not None:
                prob = weight * prob
            for item in items:
                target.append((item, prob))

    def deep_detect_lookup(self, link):
        """ Deep detect local lookup"""
        items_and_fabrics = {}
        items_and_fabrics["items"] = []
        items_and_fabrics["fabrics"] = []
        try:
            data = [link]
            # NOTE(review): the self.sname_* attributes are not assigned in
            # the part of the class visible here -- confirm where they are
            # set, otherwise this method always ends in the except branch.
            self._collect_predictions(self.sname_clothing, data,
                                      items_and_fabrics["items"])
            self._collect_predictions(self.sname_bags, data,
                                      items_and_fabrics["items"], 0.5)
            self._collect_predictions(self.sname_footwear, data,
                                      items_and_fabrics["items"], 0.5)
            self._collect_predictions(self.sname_fabric, data,
                                      items_and_fabrics["fabrics"])
            return items_and_fabrics
        except Exception:
            # Best effort: keep whatever was collected before the failure.
            print("error in deep_detect_LF")
            return items_and_fabrics

    def startup_deep_detect(self):
        """ Startup services for deep detect classification """
        self.dd.set_return_format(self.dd.RETURN_PYTHON)
        for model in self.deep_detect_models:
            # The service expects a {"repository": ...} description, not the
            # whole configuration entry.
            m = {"repository": model["path"]}
            parameters_input = {
                'connector': 'image',
                'width': self.width,
                'height': self.height
            }
            # NOTE(review): every service is created with the clothing
            # class count although bags/footwear/fabric counts exist as
            # class constants -- confirm this is intended.
            parameters_mllib = {'nclasses': self.nclasses_clothing}
            parameters_output = {}
            self.dd.put_service(model["name"], m, model["description"],
                                self.mllib, parameters_input,
                                parameters_mllib, parameters_output)

    def deepomatic_lookup(self, link):
        """ Deepomatic API lookup """
        item_candidates = []
        try:
            client = Client(529372386976, self.conf["deepomatic_api_key"])
            task = client.helper.get("/detect/fashion/?url=" + link)
            taskid = task[u"task_id"]
            # Poll the task at most ten times, 100ms apart.
            for _ in range(10):
                sleep(0.1)  #100ms
                res = client.helper.get("/tasks/" + str(taskid) + "/")
                task = res[u"task"]
                if task[u"status"] != u"success":
                    continue
                boxes = task[u"data"][u"boxes"]
                for item in boxes.keys():
                    probability = 0.0
                    for inf in boxes[item]:
                        probability = probability + inf[u"proba"]
                    item_candidates.append(
                        (item.encode("utf-8"), probability))
                break
            return item_candidates
        except Exception:
            print("error in deepomaticLF")
            return item_candidates

    def clarifai_lookup(self, link):
        """ Clarifai API lookup"""
        item_candidates = []
        try:
            app = ClarifaiApp(api_key=self.conf["clarifai_api_key"])
            model = app.models.get('apparel')
            image = ClImage(url=link)
            res = model.predict([image])
            for output in res[u"outputs"]:
                for concept in output[u"data"][u"concepts"]:
                    val = concept[u"value"]
                    # Every word of a concept name becomes a candidate.
                    for part in concept[u"name"].encode("utf-8").split(" "):
                        item_candidates.append((part, val))
            return item_candidates
        except Exception:
            print("error in clarifai LF")
            return item_candidates

    def find_closest_semantic_hierarchy(self, caption, comments, tags,
                                        hashtags, topic, id, num):
        """ Finds num semantically closest candidates for a given topic with multiple words per topic"""
        topic = map(lambda x: x.decode('utf-8', 'ignore').encode("utf-8"),
                    topic)
        freq_scores = {}
        for x in topic:
            parts = x.split(",")
            label = parts[0]
            freq_scores[label] = 0.0
        for token in caption:
            for x in topic:
                parts = x.split(",")
                label = parts[0]
                words = parts[1].split(" ")
                acc_sim = 0
scores = [] for word in words: token2 = word.lower() token2Lemma = self.wordnet_lemmatizer.lemmatize(token2) similarity = self.token_similarity(token, token2, token2Lemma, self.CAPTION_FACTOR, self.tfidf[id]) scores.append(similarity) acc_sim = acc_sim + max(scores) freq_scores[label] = freq_scores[label] + acc_sim for token in comments: for x in topic: parts = x.split(",") label = parts[0] words = parts[1].split(" ") acc_sim = 0 scores = [] for word in words: token2 = word.lower() token2Lemma = self.wordnet_lemmatizer.lemmatize(token2) similarity = self.token_similarity(token, token2, token2Lemma, self.COMMENTS_FACTOR, self.tfidf[id]) scores.append(similarity) acc_sim = acc_sim + max(scores) freq_scores[label] = freq_scores[label] + acc_sim for token in hashtags: for x in topic: parts = x.split(",") label = parts[0] words = parts[1].split(" ") acc_sim = 0 scores = [] for word in words: token2 = word.lower() token2Lemma = self.wordnet_lemmatizer.lemmatize(token2) similarity = self.token_similarity(token, token2, token2Lemma, self.HASHTAG_FACTOR, self.tfidf[id]) scores.append(similarity) acc_sim = acc_sim + max(scores) freq_scores[label] = freq_scores[label] + acc_sim for token in tags: for x in topic: parts = x.split(",") label = parts[0] words = parts[1].split(" ") acc_sim = 0 scores = [] for word in words: token2 = word.lower() token2Lemma = self.wordnet_lemmatizer.lemmatize(token2) similarity = self.token_similarity(token, token2, token2Lemma, self.USERTAG_FACTOR, self.tfidf[id]) scores.append(similarity) acc_sim = acc_sim + similarity acc_sim = acc_sim + max(scores) freq_scores[label] = freq_scores[label] + acc_sim top = sorted([(k, v) for k, v in freq_scores.iteritems()], reverse=True, key=lambda x: x[1])[:num] return top def find_closest_syntactic_hierarchy(self, caption, comments, tags, hashtags, topic, id, num): """ Finds num syntactically closest candidates for a given topic, with multiple words per topic""" topic = map(lambda x: x.decode('utf-8', 
'ignore').encode("utf-8"), topic) freq_scores = {} for x in topic: parts = x.split(",") label = parts[0] freq_scores[label] = 0.0 for token in caption: for x in topic: parts = x.split(",") label = parts[0] words = parts[1].split(" ") acc_sim = 0 scores = [] for word in words: token2 = word.lower() token2Lemma = self.wordnet_lemmatizer.lemmatize(token2) similarity = self.token_similarity_syntactic_only( token, token2, token2Lemma, self.CAPTION_FACTOR, self.tfidf[id]) scores.append(similarity) acc_sim = acc_sim + max(scores) freq_scores[label] = freq_scores[label] + acc_sim for token in comments: for x in topic: parts = x.split(",") label = parts[0] words = parts[1].split(" ") acc_sim = 0 scores = [] for word in words: token2 = word.lower() token2Lemma = self.wordnet_lemmatizer.lemmatize(token2) similarity = self.token_similarity_syntactic_only( token, token2, token2Lemma, self.COMMENTS_FACTOR, self.tfidf[id]) scores.append(similarity) acc_sim = acc_sim + max(scores) freq_scores[label] = freq_scores[label] + acc_sim for token in hashtags: for x in topic: parts = x.split(",") label = parts[0] words = parts[1].split(" ") acc_sim = 0 scores = [] for word in words: token2 = word.lower() token2Lemma = self.wordnet_lemmatizer.lemmatize(token2) similarity = self.token_similarity_syntactic_only( token, token2, token2Lemma, self.HASHTAG_FACTOR, self.tfidf[id]) scores.append(similarity) acc_sim = acc_sim + max(scores) freq_scores[label] = freq_scores[label] + acc_sim for token in tags: for x in topic: parts = x.split(",") label = parts[0] words = parts[1].split(" ") acc_sim = 0 scores = [] for word in words: token2 = word.lower() token2Lemma = self.wordnet_lemmatizer.lemmatize(token2) similarity = self.token_similarity_syntactic_only( token, token2, token2Lemma, self.USERTAG_FACTOR, self.tfidf[id]) scores.append(similarity) acc_sim = acc_sim + similarity acc_sim = acc_sim + max(scores) freq_scores[label] = freq_scores[label] + acc_sim top = sorted([(k, v) for k, v in 
freq_scores.iteritems()], reverse=True, key=lambda x: x[1])[:num] return top
class ModelTrainer:
    """ Prediction Model trainer class
    binary char-based model training class
    """

    def __init__(self, structure, logger, config):
        """ Instantiate a model trainer

        :param dic structure: Model Trainer specific settings

        eg: {"model-repo":"../models/mymodel","training-repo":"../training/mytraining","sname":"MyTrainer","test_split":0.01,"base-lr":0.01,"clevel":False,"sequence":140,"iterations":50000,"test_interval":1000,"stepsize":15000,"destroy":True,"resume":False,"finetune":False,"weights":"","nclasses":2,"documents":True,"batch-size":128,"test-batch-size":16,"gpuid":0,"mllib":"xgboost","lregression":False}

        *model-repo* location of the model
        *training-repo* location of the training files
        *sname* service name
        *test_split* training split between 0 and < 1,type=float,default=0.01
        *base_lr* initial learning rate,default=0.01,type=float
        *clevel* character-level convolutional net,type=boolean
        *sequence* sequence length for character level models,default=140,type=int
        *iterations* number of iterations,default=50000,type=int
        *test_interval* test interval,default=1000,type=int
        *stepsize* lr policy stepsize,default=15000,type=int
        *destroy* whether to destroy model,type=boolean
        *resume* whether to resume training,type=boolean
        *finetune* whether to finetune,type=boolean
        *weights* pre-trained weight file, when finetuning
        *nclasses* number of classes,type=int,default=2
        *documents* whether to train from text documents (as opposed to sentences in one doc),type=boolean
        *batch_size* batch size,type=int,default=128
        *test_batch_size* test batch size,type=int,default=16
        *gpu* enable gpu usage if True, default=False
        *gpuid* specify gpu id,type=int,default=0
        *mllib* caffe or xgboost,default='caffe'
        *lregression* whether to use logistic regression,type=boolean

        Settings read through ``_setting`` (base_lr, batch_size,
        test_interval) may be spelled with underscores or with hyphens.

        :param obj logger: DFM logger object
        :param obj config: DFM global config object
        :returns: ModelTrainer object (instance of a modeltrainer class)
        """
        self.config = config
        self.structure = structure
        self.logger = logger
        self.nclasses = self.structure['nclasses']
        self.description = 'classifier'
        self.sname = self.structure['sname']
        self.mllib = self.structure['mllib']
        # Only used by character-level models; 140 is the documented default.
        self.sequence = self.structure.get('sequence', 140)
        self.dd = DD(config['DEEP_DETECT_URI'], config['DEEP_DETECT_PORT'])
        self.dd.set_return_format(self.dd.RETURN_PYTHON)

    def _setting(self, name):
        """ Return a structure setting, accepting both key spellings

        The documented example dictionary spells some keys with hyphens
        ("batch-size") while the code reads them with underscores
        ("batch_size"); the underscore spelling wins when both are present.

        :param str name: setting name, spelled with underscores
        :raises KeyError: when neither spelling is present
        """
        if name in self.structure:
            return self.structure[name]
        return self.structure[name.replace('_', '-')]

    def createMLTrainerService(self):
        """ Create ML Trainer service in DeepDetect

        Picks the network template from the structure flags (a
        character-level convnet takes precedence over logistic regression,
        the default is an MLP), stores the request parameters on the
        instance and creates the service.

        :returns: DeepDetect put_service result
        """
        char_level = self.structure['clevel']
        if char_level:
            self.template = 'convnet'
            self.layers = ['1CR256', '1CR256', '4CR256', '1024', '1024']
        elif self.structure['lregression']:
            self.template = 'lregression'
            self.layers = []
        else:
            self.template = 'mlp'
            self.layers = [800, 500, 200]

        self.model = {'templates': '../templates/caffe/',
                      'repository': self.structure['model-repo']}

        # 'sentences' is False at service creation whatever the 'documents'
        # setting is.
        self.parameters_input = {'connector': 'txt',
                                 'sentences': False,
                                 'characters': char_level,
                                 'read_forward': True}
        if char_level:
            self.parameters_input['sequence'] = self.structure.get('sequence', 140)
            #parameters_input['alphabet'] = 'abcdef0123456789' # hex
            # parameters_input['alphabet'] = '_-,:?/.(){}*%0123456789abcdefghijklmnopqrstuvwxyz' # opcode
            #parameters_input['alphabet'] = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?'"#\"/\\|_@#$%^&*~`+-=<>"

        self.parameters_mllib = {'template': self.template,
                                 'nclasses': self.nclasses,
                                 'db': True,
                                 'dropout': 0.5}
        if self.mllib == 'xgboost':
            self.parameters_mllib['db'] = False
        if self.template != 'lregression':
            # Send the layers matching the selected template (the convnet
            # layers for character-level models).
            self.parameters_mllib['layers'] = self.layers
        if self.structure['finetune']:
            self.parameters_mllib['finetuning'] = True
            if not self.structure['weights']:
                self.logger.error('Finetuning requires weights file')  # server will fail on service creation anyways
            else:
                self.parameters_mllib['weights'] = self.structure['weights']
        self.parameters_output = {}

        call_args = (self.structure['sname'], self.model, self.description,
                     self.mllib, self.parameters_input, self.parameters_mllib,
                     self.parameters_output)
        self.logger.debug("dd.put_service(" + ",".join(str(arg) for arg in call_args) + ")")
        return self.dd.put_service(*call_args)

    def trainModel(self):
        """ Train the model.

        Starts an asynchronous training job and polls it until its status is
        no longer 'running', logging the measures reported meanwhile.

        :returns: last training status returned by DeepDetect
        """
        self.train_data = [self.structure['training-repo']]
        self.parameters_input = {'test_split': self.structure['test_split'],
                                 'shuffle': True,
                                 'db': True}
        if not self.structure['clevel']:
            # word-based model
            self.parameters_input['min_word_length'] = 5
            self.parameters_input['min_count'] = 10
            self.parameters_input['count'] = False
            if self.mllib == 'xgboost':
                self.parameters_input['tfidf'] = True
                self.parameters_input['db'] = False
        else:
            # character-based model
            self.parameters_input['sentences'] = True
            self.parameters_input['characters'] = True
            self.parameters_input['sequence'] = self.structure.get('sequence', 140)
        if self.structure['documents']:
            self.parameters_input['sentences'] = False

        if self.mllib == 'caffe':
            self.parameters_input['db'] = True
            self.parameters_mllib = {
                'gpu': self.structure.get('gpu', False),
                'gpuid': self.structure['gpuid'],
                'resume': self.structure['resume'],
                'net': {
                    'batch_size': self._setting('batch_size')
                },
                'solver': {
                    'test_interval': self._setting('test_interval'),
                    'test_initialization': False,
                    'base_lr': self._setting('base_lr'),
                    'solver_type': 'ADAM',
                    'iterations': self.structure['iterations']
                }
            }#,'lr_policy':'step','stepsize':self.structure['stepsize'],'gamma':0.5,'weight_decay':0.0001}}
        elif self.mllib == 'xgboost':
            self.parameters_mllib = {
                'iterations': self.structure['iterations'],
                'objective': 'multi:softprob',
                'booster_params': {'max_depth': 50}
            }

        self.parameters_output = {'measure': ['mcll', 'f1', 'cmdiag', 'cmfull']}
        if self.nclasses == 2:
            self.parameters_output['measure'].append('auc')

        self.logger.debug("dd.post_train(" + str(self.structure['sname']) + ","
                          + str(self.train_data) + ","
                          + str(self.parameters_input) + ","
                          + str(self.parameters_mllib) + ","
                          + str(self.parameters_output)
                          + ",async=" + str(True) + ")")
        # 'async' is a reserved word from Python 3.7 on, so the keyword is
        # passed through dict unpacking; the client receives the same argument.
        self.dd.post_train(self.structure['sname'], self.train_data,
                           self.parameters_input, self.parameters_mllib,
                           self.parameters_output, **{'async': True})

        time.sleep(1)
        train_status = ''
        while True:
            # get_train is called with timeout=10 on every poll.
            train_status = self.dd.get_train(self.sname, job=1, timeout=10)
            if train_status['head']['status'] == 'running':
                self.logger.debug(train_status['body']['measure'])
            else:
                self.logger.debug(train_status)
                break
        return train_status

    def clearMLTrainerService(self, clear=''):
        """ delete the service, keeping the model

        :param str clear: use clear='lib' to clear the model as well, default empty.
        :returns: DeepDetect delete result
        """
        return self.dd.delete_service(self.sname, clear=clear)