def getcursorandimgsrcs(webfile_prepped, imgnum_needed, progressdata): robo.whereami(sys._getframe().f_code.co_name) imgurl_list = [] # specific to going to big image page, not thumbnail imgsrc_list = [] img2url_dict = {} cursor = None basetag = progressdata["basetag"] thistag = progressdata["thistag"] #build a list of images for de-dupe. with a refactor, i would make a single list of # imgnum_needed urls and pass that to a download module... for now i ll check the logfile imgs_existing = robo.imgs_existing_build(progressdata["img2url_file"]) #get cursor - \/(.+)Load more cursor_match = re.findall( r'(.+)Load More', webfile_prepped.read() ) try: cursor_thegoodpart = cursor_match[0].split("?cursor=")[1] cursor = cursor_thegoodpart.replace('">','') except: pass #regex issue webfile_prepped.seek(0) imgs_in_file = re.findall( r'\/p\/(.{11})', webfile_prepped.read() ) for img_loc in imgs_in_file: imgdlfile_url_a = imgdlfile_url_prefix.replace("https","https:") + img_loc imgdlfile_url = imgdlfile_url_a.replace('"','')#strip trailing quotes imgurl_list.append(imgdlfile_url) #now go to primary page to get useful sized image if len(imgsrc_list) < imgnum_needed: print "imgdlfile_url", imgdlfile_url try: imgdlfile_url_txt = urlopen(imgdlfile_url) rawimg_url_big = re.findall( r'(.+)img-fluid', imgdlfile_url_txt.read() ) try: rawimg_url = rawimg_url_big[0].split('"')[1] if rawimg_url not in imgs_existing: #prevent dupes #this is rhe download list imgsrc_list.append(rawimg_url) # this is dict for img2url logfile for imgurl in imgsrc_list:img2url_dict[rawimg_url] = [imgdlfile_url] else: print cfg.color.magenta + "Dupe file rejected!" + cfg.color.white except: pass #regex issue, skip it except: print cfg.color.magenta + "FAILED: urlopen("+imgdlfile_url+"). moving on"+ cfg.color.white cursor_and_imgs = [cursor, imgsrc_list, img2url_dict] if len(imgsrc_list) < 1: print cfg.color.magenta + ''' ================================= ***** WARNING ***** no JPG images found online! 
=================================''' print cfg.color.white return cursor_and_imgs
def main(retrain_dict): robo.whereami(sys._getframe().f_code.co_name) print cfg.color.yellow + "retrain params:" + cfg.color.white for k,v in retrain_dict.items(): print k, ":" ,v print timestart = time.strftime("%H%M%S") #### ACTUAL WORK HERE retrained_output = retrain_tensorflow(retrain_dict) print "retrained_output:", retrained_output ##################### #AFTER -- send a message timeend = time.strftime("%H%M%S") timespent = float(timeend) - float(timestart) if cfg.twilio_active == True: sms_msg = "tensorflow retrained and new model created in "+str(timespent)+" seconds! boop boop." sms_status = robo.sendsms(sms_msg) print "exiting robo_retrain..." return sys.exit(1) #shouldnt get here, but just in case...
def processclassifiedimages(origfilename, labelresults, basetag): robo.whereami(sys._getframe().f_code.co_name) #setup the new namescheme (could prolly be small func) labelname = labelresults[0] labelscore = labelresults[1] score_suffix = labelscore[2:5] filename_parts = origfilename.split(".") newfilename = filename_parts[ 0] + "_" + score_suffix + "." + filename_parts[1] rootdir = cfg.path_to_testimgs + cfg.dd + basetag + cfg.dd + cfg.sorted_dirname + cfg.dd + labelname subdir = rootdir + cfg.belowminlabel_dir_suffix if float(labelscore) > float(cfg.confidence_min): print cfg.color.green + "yay! score HIGH: " + cfg.color.black + cfg.bkcolor.green + " " + labelscore + " " + cfg.bkcolor.resetall print "== moved to: ..." + cfg.sorted_dirname + cfg.dd + labelname shutil.move( cfg.path_to_testimgs + cfg.dd + basetag + cfg.dd + origfilename, rootdir + cfg.dd + newfilename) proc_result = (newfilename, labelname, labelscore) else: print cfg.color.magenta + "sad score low: " + cfg.color.white + cfg.bkcolor.magenta + " " + labelscore + " " + cfg.bkcolor.resetall print "==== moved to dir: ..." + cfg.sorted_dirname + cfg.dd + labelname + cfg.belowminlabel_dir_suffix shutil.move( cfg.path_to_testimgs + cfg.dd + basetag + cfg.dd + origfilename, subdir + cfg.dd + newfilename) proc_result = (newfilename, labelname + cfg.belowminlabel_dir_suffix, labelscore) print return proc_result
def retrain_dict_setup_modeltype(): robo.whereami(sys._getframe().f_code.co_name) print cfg.color.cyan + "1. RETRAIN TENSORFLOW MODEL?" + cfg.color.white print "(default:" + cfg.retrain_model_default+") Enter [i]nceptionv3 or [m]obile:" modeltype_raw = raw_input() if modeltype_raw == "i": modeltype = cfg.inception_model mobilepercent = None elif modeltype_raw == "m": modeltype = cfg.mobile_model robo.makebeep() mp_raw = raw_input("Please enter percent of pretrained '"+cfg.mobile_model+"' model to use: [25], [50], [75] or [100] : ") if mp_raw == "25" : mobilepercent = "0.25" elif mp_raw == "50" : mobilepercent = "0.50" elif mp_raw == "75" : mobilepercent = "0.75" elif mp_raw == "100": mobilepercent = "1.0" else: mobilepercent = cfg.retrain_mobile_percent_default robo.makebeep() print"Woops, non-valid choice, so '"+str(cfg.retrain_mobile_percent_default)+"' was chosen for you." else: modeltype = cfg.retrain_model_default mobilepercent = cfg.retrain_mobile_percent_default return (modeltype, mobilepercent)
def add_accuracy_to_modeldir(path_to_trainingsumm_name, tf_final_acc):
    """Rename the model/summary directory so its name carries the final
    test accuracy.

    tf_final_acc looks like:
    'INFO/tensorflow/Final test accuracy = 80.8% (N=73)'
    """
    robo.whereami(sys._getframe().f_code.co_name)
    # keep only the number after '=' (strip the trailing '% (N' fragment)
    final_acc = tf_final_acc.split("=")[1].replace("% (N", "")
    # append final_accuracy to modeldir name
    shutil.move(path_to_trainingsumm_name,
                path_to_trainingsumm_name + "_acc" + final_acc)
    return
def urlbuild(vars_dict):
    """Build the next webstagram scrape URL for this tag, appending the
    pagination cursor when one exists; store it in
    vars_dict["url_built"] and return vars_dict."""
    robo.whereami(sys._getframe().f_code.co_name)
    thiscursor = vars_dict["cursor"]
    thistag = vars_dict["thistag"]
    base = scrapeurl.replace("https", "https:") + cfg.dd + thistag
    if thiscursor == None:
        vars_dict["url_built"] = base
    else:
        vars_dict["url_built"] = base + scrapeurl_pagenum + thiscursor
    return vars_dict
def urlbuild(vars_dict):
    """Build the next imgur API scrape URL for this tag/page, bump the
    page counter, store the URL in vars_dict["url_built"], and return
    vars_dict."""
    robo.whereami(sys._getframe().f_code.co_name)
    tag = vars_dict["thistag"]
    pagenum = vars_dict["scrapeurl_pagenum"]
    ending = scrape_sort + cfg.dd + str(pagenum)
    vars_dict["url_built"] = (scrapeurl.replace("https", "https:")
                              + cfg.dd + tag + cfg.dd + ending)
    vars_dict["scrapeurl_pagenum"] = pagenum + 1
    return vars_dict
def retrain_dict_setup_imgsize(): robo.whereami(sys._getframe().f_code.co_name) print cfg.color.cyan + "3. IMAGE SIZE?" + cfg.color.white print "(default: "+str(cfg.retrain_imgsize_default)+") Enter [128], [160], [192], or [224] pixels... " raw_imagesize = raw_input() try: if int(raw_imagesize) not in (128,160,192,224): imgsize = cfg.retrain_imgsize_default else: imgsize = int(raw_imagesize) except: imgsize = cfg.retrain_imgsize_default return imgsize
def retrain_dict_setup_batchsize(): robo.whereami(sys._getframe().f_code.co_name) print cfg.color.cyan + "5. BATCH SIZE?" + cfg.color.white print "(min:"+str(cfg.retrain_batchsize_min)+") While there is debate on a best number, from 10 to 100 (or more if you have many thousand of images) is a good starting point... " raw_batchsize = raw_input() try: if int(raw_batchsize) < cfg.retrain_batchsize_min: bs = cfg.retrain_batchsize_min else: bs = int(raw_batchsize) except: bs = cfg.retrain_batchsize_min return bs
def retrain_dict_setup_testper(): robo.whereami(sys._getframe().f_code.co_name) print cfg.color.cyan + "4. TESTING PERCENT?" + cfg.color.white print "(min:"+str(cfg.retrain_testper_min)+") Depending how many images in total, between 10 and 50 is a good starting point... " raw_testper = raw_input() try: if int(raw_testper) < cfg.retrain_testper_min: testper = cfg.retrain_testper_min else: testper = int(raw_testper) except: testper = cfg.retrain_testper_min return testper
def getretrainedlabels(model_data):
    """Read this model's retrained-labels file and return the labels as a
    list, newline-stripped and with spaces converted to underscores.

    model_data must carry "basetag" and "model_dir".
    """
    robo.whereami(sys._getframe().f_code.co_name)
    basetag = model_data["basetag"]
    model_dir = model_data["model_dir"]
    path_to_labels = (cfg.path_to_trainingsumms + cfg.dd + basetag + cfg.dd
                      + model_dir + cfg.dd + cfg.retrainedlabels_file)
    labels_list = []
    # BUGFIX: the original leaked the file handle (opened, never closed);
    # use a with-block so it is closed even on error.
    with open(path_to_labels, "rU") as f:
        for line in f:
            labels_list.append(line.replace("\n", "").replace(" ", "_"))
    return labels_list
def webfile_prep(fwebname): robo.whereami(sys._getframe().f_code.co_name) try: webfile_prepped = open(fwebname, "r") return webfile_prepped except: print cfg.color.magenta print "Hmm, Unable to load WEBSTAGRAM response: "+fwebname print "(usually, the tag has no images. so check the txt file, and also give it a look online.)" print cfg.color.white robo.goodbye() sys.exit(1) #shouldnt get here, but for safety
def retrain_dict_setup_trainsteps(): robo.whereami(sys._getframe().f_code.co_name) print cfg.color.cyan + "2. TRAINING STEPS?" + cfg.color.white print "(default:"+str(cfg.retrain_steps_min)+") Too many leads to overfitting, too few leads to weak results," print "so between 500 and 4000 is a good choice... " raw_steps = raw_input() try: if int(raw_steps) < cfg.retrain_steps_min: trainsteps = cfg.retrain_steps_min else: trainsteps = int(raw_steps) except: trainsteps = cfg.retrain_steps_min return trainsteps
def webfile_prep(fwebname): robo.whereami(sys._getframe().f_code.co_name) with open(fwebname, "r") as webfile_local: try: webfile_prepped = json.load(webfile_local) return webfile_prepped except: print cfg.color.magenta print "Hmm, Unable to load JSON from IMGUR API response." print "(usually, the tag has no images. so check the txt file above, and give it a look online.)" print cfg.color.white robo.goodbye() sys.exit(1) #shouldnt get here, but for safety
def getnexturl(vars_dict):
    """Read the locally-logged URL list and stash the most recent URL in
    vars_dict["nexturl"]; when it is the no-more-urls sentinel, hand off
    to iscomplete(). Returns vars_dict."""
    robo.whereami(sys._getframe().f_code.co_name)
    with open(vars_dict["localurlfile"], "rU") as f:
        urls_list = [line for line in f]
    # the last line of the file is the most recently logged URL
    nexturl_raw = urls_list[(len(urls_list) - 1)]
    nexturl = nexturl_raw.replace("\n", "")
    print "nexturl " + nexturl
    vars_dict["nexturl"] = nexturl
    if nexturl == cfg.nomoreurls:
        # NOTE(review): `progressdata` is not defined in this function --
        # presumably a module-level global; if not, this raises NameError
        # when the sentinel is reached. TODO confirm.
        iscomplete(progressdata)
    return vars_dict
def getwebfile(webfileurl): robo.whereami(sys._getframe().f_code.co_name) print "get data from:", webfileurl try: webfile = urlopen(webfileurl) return webfile except: print cfg.color.magenta print "doh, didnt get file from "+webfileurl+"!" print "usually this is random. good next steps:" print "1. maybe check the url in a browser or\n2. wait like 15 seconds and try again." print cfg.color.white robo.goodbye() sys.exit(1) #shouldnt get here, but for safety
def classify_image(testimg, model_data): robo.whereami(sys._getframe().f_code.co_name) basetag = model_data["basetag"] model_type = model_data["model_type"] model_dir = model_data["model_dir"] path_to_retrainedgraph = cfg.path_to_trainingsumms + cfg.dd + basetag + cfg.dd + model_dir + cfg.dd + cfg.retrainedgraph_file path_to_labels = cfg.path_to_trainingsumms + cfg.dd + basetag + cfg.dd + model_dir + cfg.dd + cfg.retrainedlabels_file # build a command if model_type == cfg.mobile_model: testimgcommand = "python ../scripts/label_image.py \ --graph='" + path_to_retrainedgraph + "' \ --labels='" + path_to_labels + "' \ --input_height=224 \ --input_width=224 \ --input_mean=128 \ --input_std=128 \ --image=" + testimg elif model_type == cfg.inception_model: testimgcommand = "python ../scripts/label_image.py \ --graph='" + path_to_retrainedgraph + "' \ --labels='" + path_to_labels + "' \ --input_height=299 \ --input_width=299 \ --input_mean=128 \ --input_std=128 \ --input_layer='Mul' \ --image=" + testimg else: robo.goodbye("woops no classification model! Program stopping...") print "image: " + testimg # use the tensorflow label_image script try: imagelabel_raw = subprocess.check_output(testimgcommand, shell=True) except Exception: # just remove file for now (rather than store for later analysis) # because they are broken images, not misclassified os.remove(testimg) imagelabel_raw = False return imagelabel_raw
def getwebfile(webfileurl): robo.whereami(sys._getframe().f_code.co_name) print "API for:", webfileurl imgur_client_id = 'Client-ID ' + os.environ.get('IMGURAPI_ID') req = Request(webfileurl) req.add_header('Authorization', imgur_client_id) try: webfile = urlopen(req) return webfile except: print cfg.color.magenta print "doh, probably 'urllib2.HTTPError: HTTP Error 500: Internal Server Error'" print "(something wrong with imgur API. happens all the time. try again in a min.)" print cfg.color.white robo.goodbye() sys.exit(1) #shouldnt get here, but for safety
def imgurapi_clientid_confirm(): robo.whereami(sys._getframe().f_code.co_name) try: imgur_client_id = 'Client-ID ' + os.environ.get('IMGURAPI_ID') return imgur_client_id except: print cfg.color.magenta + ''' Whelp! no Imgur API Client-ID found in environment variables. (and thus, no ability to download images from imgur.com...) ''' print cfg.color.white + ''' SOLUTIONS: 1. Change 'scrapesite_default' in config file, or 2. Set yourself up an API key at: https://apidocs.imgur.com/ then take 10 seconds to add it to your environment (on mac) at: http://osxdaily.com/2015/07/28/set-enviornment-variables-mac-os-x/''' robo.goodbye() sys.exit(1) # for safety
def process_imagelabel_for_final(label_raw):
    """Parse the raw label_image.py output into (label_name, score).

    The result line normally looks like "some label 0.98765". Since the
    jan-2018 tensorflow script change an "Evaluation" line may precede
    it, in which case the real result sits on the fourth line.
    """
    robo.whereami(sys._getframe().f_code.co_name)
    lines = label_raw.split("\n")
    ## change in tensorflow script it now adds an evaluation string,
    ## so there is a new conditional here (jan 2018) -- mmc
    if "Evaluation" in lines[1]:
        result_line = lines[3]
    else:
        result_line = lines[0]
    # score is the last space-separated token; everything before it is
    # the label (spaces become underscores)
    imagelabel_score = result_line.split(" ")[-1]
    imagelabel_name = result_line.replace(" " + imagelabel_score, "").replace(" ", "_")
    return (imagelabel_name, imagelabel_score)
def makesortedlabels_dirs(model_data):
    """Create the sorted-output directory tree for this model: the master
    sorted dir, one dir per label, and a below-confidence-min sibling dir
    for each label. Returns True (placeholder status)."""
    robo.whereami(sys._getframe().f_code.co_name)
    path_to_sortedimg_basetag = (cfg.path_to_testimgs + cfg.dd
                                 + model_data["basetag"] + cfg.dd + cfg.sorted_dirname)
    # make master sorted_dir
    if not os.path.exists(path_to_sortedimg_basetag):
        os.makedirs(path_to_sortedimg_basetag)
    for label in getretrainedlabels(model_data):
        abovemin_dir = path_to_sortedimg_basetag + cfg.dd + label
        # each label gets a normal dir plus one for images that are
        # under the confidence min
        for dirpath in (abovemin_dir, abovemin_dir + cfg.belowminlabel_dir_suffix):
            if not os.path.exists(dirpath):
                os.makedirs(dirpath)
    return True  # make this a check
def getnexturl(vars_dict):
    """Derive the next imgur API URL by incrementing the page number at
    the end of the most recently logged URL; store the URL and the new
    page number in vars_dict and return it."""
    robo.whereami(sys._getframe().f_code.co_name)
    with open(vars_dict["localurlfile"], "rU") as f:
        urls_list = [line for line in f]
    #should have format like: https://api.imgur.com/3/gallery/t/robot/time/3
    # get/increment number at end, and update scrapeurl_pagenum
    nexturl_raw = urls_list[(len(urls_list) - 1)]
    nexturl_parts = nexturl_raw.replace("\n", "").split("/")
    nexturl_increment = nexturl_parts[-1]
    nexturl_increment_added = str(int(nexturl_increment) + 1)
    nexturl_ending = scrape_sort + cfg.dd + nexturl_increment_added
    nexturl = scrapeurl.replace(
        "https", "https:") + cfg.dd + vars_dict["thistag"] + cfg.dd + nexturl_ending
    vars_dict["nexturl"] = nexturl
    vars_dict["scrapeurl_pagenum"] = int(nexturl_increment_added)
    if nexturl == cfg.nomoreurls:
        # NOTE(review): `progressdata` is not defined in this function --
        # presumably a module-level global; if not, this raises NameError
        # when the sentinel URL is reached. TODO confirm.
        iscomplete(progressdata)
    return vars_dict
def getcursorandimgsrcs(webfile_prepped, imgnum_needed, progressdata): robo.whereami(sys._getframe().f_code.co_name) imgsrc_list = [] img2url_dict = {} ## WHELP... cursor is used to check if no mo data, but for imgurapi rewrite ## going with this always exists/true for now cursor = 1 #build a list of images for de-dupe. with a refactor, i would make a single list of # imgnum_needed urls and pass that to a download module... for now i ll check the logfile imgs_existing = robo.imgs_existing_build(progressdata["img2url_file"]) #get images from imgur api json response imgs_in_json = re.findall(r'i.imgur.com/(.{7})(.jpg)', str(webfile_prepped)) for img in imgs_in_json: if len(imgsrc_list) < imgnum_needed: imgdlfile_url = imgdlfile_url_prefix.replace( "https", "https:") + img[0] + imgdlfile_url_suffix if imgdlfile_url not in imgs_existing: #prevent dupes imgsrc_list.append(imgdlfile_url) for imgurl in imgsrc_list: img2url_dict[imgurl] = [imgurl] cursor_and_imgs = [cursor, imgsrc_list, img2url_dict] if len(imgsrc_list) < 1: print cfg.color.magenta + ''' ================================= ***** WARNING ***** no JPG images found online! =================================''' print cfg.color.white return cursor_and_imgs
def retrain_tensorflow(retrain_dict):
    """Assemble and run the tensorflow retrain.py command described by
    retrain_dict, stream its stderr output, rename the model dir with the
    final accuracy, and return the Popen object."""
    robo.whereami(sys._getframe().f_code.co_name)
    #SETUP
    basetag = retrain_dict["basetag"]
    thistag = retrain_dict["thistag"]
    imagesize = str(retrain_dict["imagesize"])
    steps = str(retrain_dict["steps"])
    testpercent = str(retrain_dict["testpercent"])
    batchsize = str(retrain_dict["batchsize"])
    modeltype = retrain_dict["modeltype"]
    mobilepercent = retrain_dict["mobilepercent"]
    # the summary-dir name encodes every hyperparameter; mobilenet also
    # encodes the pretrained-model percent
    if modeltype == "mobilenet":
        trainsumm_name = modeltype + "_" + str(mobilepercent) + "_batch" + str(batchsize) + "_steps" + str(steps) + "_test" + str(testpercent) + "_img" + str(imagesize) + ""
    else:
        trainsumm_name = modeltype + "_batch" + str(batchsize) + "_steps" + str(steps) + "_test" + str(testpercent) + "_img" + str(imagesize) + ""
    path_to_trainingsumm_name = cfg.path_to_trainingsumms + cfg.dd + basetag + cfg.dd + trainsumm_name
    path_to_trainimgs_basetag = cfg.path_to_trainingimgs + cfg.dd + basetag
    path_to_output_graph = path_to_trainingsumm_name + cfg.dd + cfg.retrainedgraph_file
    path_to_output_labels = path_to_trainingsumm_name + cfg.dd + cfg.retrainedlabels_file
    #build up shared commands
    cmd1 = "../scripts/retrain.py"
    cmd2 = "--bottleneck_dir=" + cfg.path_to_bottlenecks
    cmd3 = "--model_dir=" + cfg.path_to_trainingmodels
    cmd4 = "--how_many_training_steps=" + steps
    cmd5 = "--train_batch_size=" + batchsize
    cmd6 = "--testing_percentage=" + testpercent
    cmd7 = "--summaries_dir=" + path_to_trainingsumm_name
    cmd8 = "--output_graph=" + path_to_output_graph
    cmd9 = "--output_labels=" + path_to_output_labels
    cmd10 = "--image_dir=" + path_to_trainimgs_basetag
    cmd11 = ""
    #BUILD an array of CMDS based on model
    # NOTE: the leading '1' is argv[0]; the real interpreter comes from
    # executable="python" in the Popen call below.
    if modeltype == "inceptionv3":
        cmds = ['1', cmd1, cmd2, cmd3, cmd4, cmd5, cmd6, cmd7, cmd8, cmd9, cmd10]
    else:
        # build a command WITH ARCHITECTURE, since not default
        mobilepercent = retrain_dict["mobilepercent"]
        ARCHITECTURE = modeltype + "_" + str(mobilepercent) + "_" + str(imagesize)
        cmd11 = "--architecture=" + ARCHITECTURE
        cmds = ['1', cmd1, cmd2, cmd3, cmd4, cmd5, cmd6, cmd7, cmd8, cmd9, cmd10, cmd11]
    print "\n------------------------------"
    print "start retraining tensorflow model/graph"
    print "when it breaks, look for 'RuntimeError: Error during processing file' "
    print cfg.color.yellow + "retraining command:" + cfg.color.white
    for cmd in cmds:
        print cmd
    # use the tensorflow RETRAIN script
    try:
        #training_results = subprocess.check_output(retrain_command, shell=True)
        training_results = Popen(cmds, shell=False, stderr=PIPE, bufsize=1, executable="python")
        # stream retrain.py's stderr line by line, remembering the final
        # accuracy line for the model-dir rename below
        for line in iter(training_results.stderr.readline, b''):
            print line
            if line.startswith("INFO:tensorflow:Final test accuracy"):
                tf_final_acc = line
        training_results.wait()  # wait for the subprocess to exit
    except Exception:
        ### log something or?
        ### remove specific image or modeldir? regex thru output to find it-- or just skip?
        pass
    # see need/description at this function
    # NOTE(review): if the accuracy line never appeared (or Popen raised),
    # tf_final_acc -- and possibly training_results -- is unbound here and
    # this raises NameError. TODO confirm/guard.
    add_accuracy_to_modeldir(path_to_trainingsumm_name, tf_final_acc)
    return training_results
def main(model_data): robo.whereami(sys._getframe().f_code.co_name) basetag = model_data["basetag"] model_type = model_data["model_type"] path_to_sortedimgs_basetag = cfg.path_to_testimgs + cfg.dd + basetag + cfg.dd + cfg.sorted_dirname if makesortedlabels_dirs(model_data) == False: robo.goodbye( "Dirs for final output NOT created or available. Check your permissions. Stopping program..." ) #get images images_list = robo.getimagelist_fromdir(cfg.path_to_testimgs + cfg.dd + basetag) #call the labeling function testimages_dict = {} for testimage in images_list: timestart = time.strftime("%H%M%S") ##### call tensorflow label_image script imagelabel_raw = classify_image(testimage, model_data) ####################################### # process the label if imagelabel_raw: imagelabel_processed = process_imagelabel_for_final(imagelabel_raw) testimage_clean = testimage.replace( cfg.path_to_testimgs + cfg.dd + basetag + cfg.dd, "") #process the classifiedimages proc_classed_images_list = {} proc_result = processclassifiedimages(testimage_clean, imagelabel_processed, basetag) # save to dict for later analysis timeend = time.strftime("%H%M%S") timespent = float(timeend) - float(timestart) testimages_dict[proc_result[0]] = (proc_result[1], proc_result[2], timespent) #make a CLASSIFICATION log file filetitle = cfg.imagelog_prefix + model_type + "_" + logtime + cfg.imagelog_suffix path_to_thisfile = path_to_sortedimgs_basetag + cfg.dd + filetitle robo.createfilefromdict(path_to_thisfile, testimages_dict) #make a MODEL INFO log file thatfile = cfg.imagelog_prefix + cfg.modeldatalog_name + logtime + cfg.imagelog_suffix path_to_thatfile = path_to_sortedimgs_basetag + cfg.dd + thatfile robo.createfilefromdict(path_to_thatfile, model_data) #AFTER -- send a message if cfg.twilio_active == True: num_of_testimages = len(testimages_dict) sms_msg = "roboclassified and moved " + str( num_of_testimages ) + " imgs! now, do a manual QA sortcheck (and/or re-sort) for best longterm success. 
boop boop." sms_status = robo.sendsms(sms_msg) print "exiting robo_classify..." return sys.exit(1) #shouldnt get here, but just in case...
def functionsloaded(): robo.whereami(sys._getframe().f_code.co_name) print cfg.color.yellow + "DOWNLOAD SOURCE: IMGUR API" + cfg.color.white print "(choose 'imgurapi' or 'webstagram' in the config file)\n" return