def get_image_matrices(train_imagepath, test_imagepath, trainDF, valDF, testDF):
    '''
    load image features from pkl files and convert to matrices
    '''
    plog("Loading train image features from %s..." % train_imagepath)
    with open(train_imagepath, 'rb') as f:
        imageDF = pkl.load(f)
    if test_imagepath is not None:
        plog("Loading test image features from %s..." % test_imagepath)
        with open(test_imagepath, 'rb') as f:
            test_imageDF = pkl.load(f)
        test_image_matrix = test_imageDF.as_matrix()
        test_image_matrix = test_image_matrix[:testDF.shape[0], :]
        assert test_image_matrix.shape[0] == testDF.shape[0]
    else:
        test_image_matrix = None
    image_matrix = imageDF.as_matrix()
    train_image_matrix = image_matrix[:trainDF.shape[0], :]
    val_image_matrix = image_matrix[trainDF.shape[0]:trainDF.shape[0] + valDF.shape[0], :]
    return (train_image_matrix, val_image_matrix, test_image_matrix)
def build_text_matrices(datadir, tokenizer_path, trainDF, valDF, testDF):
    '''
    use a bag-of-words representation to convert descriptions
    into bag-of-words matrices
    '''
    plog("Building text matrices...")
    # pickles must be opened in binary mode
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pkl.load(f)
    train_text_matrix_path = datadir + 'train_text.pkl'
    val_text_matrix_path = datadir + 'val_text.pkl'
    test_text_matrix_path = datadir + 'test_text.pkl'
    bow_train, idx_train = bag_of_words.series_to_bag_of_words(
        trainDF.description_clean, tokenizer, train_text_matrix_path, mode="binary")
    bow_val, idx_val = bag_of_words.series_to_bag_of_words(
        valDF.description_clean, tokenizer, val_text_matrix_path, mode="binary")
    bow_test, idx_test = bag_of_words.series_to_bag_of_words(
        testDF.description_clean, tokenizer, test_text_matrix_path, mode="binary")
    plog("bow_train type: %s" % type(bow_train))
    return (bow_train, bow_val, bow_test)
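# bag_of_words.series_to_bag_of_words is not included in this excerpt. A
# minimal sketch of what it is assumed to do, presuming a Keras-style
# Tokenizer (texts_to_matrix) and caching the result to outpath; the
# signature and caching behavior are inferred from the call sites only.
import cPickle as pkl

def series_to_bag_of_words(series, tokenizer, outpath, mode="binary"):
    texts = series.fillna('').astype(str).tolist()
    # (n_docs, vocab_size) matrix; mode="binary" gives 0/1 term presence
    bow = tokenizer.texts_to_matrix(texts, mode=mode)
    with open(outpath, 'wb') as f:
        pkl.dump((bow, series.index), f)
    return bow, series.index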
def prepDFs(datadir,
            train_samples=10000,
            test_samples=1000,
            val_portion=0.1,
            debug=False):
    '''
    load the train and test csv's, then shuffle and downsample each

    returns: trainDF, testDF
    '''
    if debug:
        trainpath = datadir + 'head_train_set.csv'
        testpath = datadir + 'head_test_set.csv'
        train_samples = 90
        test_samples = 90
    else:
        trainpath = datadir + 'train_set.csv'
        testpath = datadir + 'test_set.csv'
    plog("Loading train csv...")
    trainDF = pd.read_csv(trainpath, header=0, index_col=0, low_memory=False)
    plog("Loading test csv...")
    testDF = pd.read_csv(testpath, header=0, index_col=0, low_memory=False)
    trainDF = shuffle_and_downsample(trainDF, train_samples)
    testDF = shuffle_and_downsample(testDF, test_samples)
    return trainDF, testDF
def build_image_network():
    '''
    builds CNN for image feature extraction.
    CNN is designed to match the pretrained VGG network.

    returns:
        net: dict of layers
    '''
    plog("Building lasagne net...")
    net = {}
    net['input'] = InputLayer((None, 3, 224, 224))
    net['conv1'] = ConvLayer(net['input'], num_filters=96, filter_size=7, stride=2)
    # caffe has alpha = alpha * pool_size
    net['norm1'] = NormLayer(net['conv1'], alpha=0.0001)
    net['pool1'] = PoolLayer(net['norm1'], pool_size=3, stride=3, ignore_border=False)
    net['conv2'] = ConvLayer(net['pool1'], num_filters=256, filter_size=5)
    net['pool2'] = PoolLayer(net['conv2'], pool_size=2, stride=2, ignore_border=False)
    net['conv3'] = ConvLayer(net['pool2'], num_filters=512, filter_size=3, pad=1)
    net['conv4'] = ConvLayer(net['conv3'], num_filters=512, filter_size=3, pad=1)
    net['conv5'] = ConvLayer(net['conv4'], num_filters=512, filter_size=3, pad=1)
    net['pool5'] = PoolLayer(net['conv5'], pool_size=3, stride=3, ignore_border=False)
    net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
    net['drop6'] = DropoutLayer(net['fc6'], p=0.5)
    net['fc7'] = DenseLayer(net['drop6'], num_units=4096)
    net['drop7'] = DropoutLayer(net['fc7'], p=0.5)
    net['fc8'] = DenseLayer(net['drop7'], num_units=1000,
                            nonlinearity=lasagne.nonlinearities.softmax)
    output_layer = net['fc8']
    lasagne.layers.set_all_param_values(output_layer, PRETRAINED_VGG['values'])
    return net
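# A hedged sketch of how fc7 features might be pulled from this network
# for feature extraction. The compiled-function approach is standard
# lasagne/theano usage, but no extraction function is shown in this
# excerpt, so the helper name is illustrative; lasagne is assumed to be
# imported as in the surrounding module.
import theano
import theano.tensor as T

def compile_feature_fn(net):
    X = T.tensor4('X')
    # deterministic=True disables the dropout layers at inference time
    feats = lasagne.layers.get_output(net['fc7'], X, deterministic=True)
    return theano.function([X], feats)

# usage: feature_fn = compile_feature_fn(build_image_network())
#        fc7 = feature_fn(image_batch)  # image_batch: (N, 3, 224, 224) float32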
def __init__(self, port, cfgs, reload_sign):
    self.port: int = port
    self.cfgs = cfgs
    self.reload_sign: queue.Queue = reload_sign
    self.domain_match_dict = self.make_domain_match_dict()
    plog(self.domain_match_dict)
    self.ssl_ctx = {}
    self.default_ssl_ctx = None
    self.load_ssl_ctxs()
def main():
    conf_file = sys.argv[1]
    conf = load_config(conf_file)
    while 1:
        try:
            do_main(conf)
            sleep_minute = conf.get("sleep", 30)
            plog(u"next check will run in %d minutes", sleep_minute)
            time.sleep(sleep_minute * 60)
        except KeyboardInterrupt, _:
            sys.exit(0)
        except Exception, e:
            plog(str(e.message))
            time.sleep(60)
def conditional_hstack(other, bow, image, dataset_name):
    '''
    assumes other is present.
    if bag of words is not None, hstack it
    if image is not None, hstack it
    '''
    if other is not None:
        X = other
    if bow is not None:
        assert bow.shape[0] == X.shape[0]
        X = np.hstack((X, bow))
    else:
        plog("Bag of words data missing from %s" % dataset_name)
    if image is not None:
        assert image.shape[0] == X.shape[0]
        X = np.hstack((X, image))
    else:
        plog("Image data missing from %s" % dataset_name)
    return X
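# A small usage sketch of conditional_hstack: a 2-row one-hot "other"
# block plus a bag-of-words block, with image features absent. Shapes
# here are illustrative only.
demo_brands = np.eye(2)                        # (2, 2) brand features
demo_bow = np.array([[1, 0, 1], [0, 1, 1]])    # (2, 3) binary bag-of-words
demo_X = conditional_hstack(demo_brands, demo_bow, None, 'train')
assert demo_X.shape == (2, 5)                  # image block was skipped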
def build_brand_matrices(trainDF, valDF, testDF):
    '''
    one-hot encode brand indexes
    '''
    brand_list = get_brand_index(trainDF, valDF, testDF)
    # NOTE: datadir is read from module scope here, not passed in
    with open(datadir + 'brand_list.pkl', 'wb') as f:
        pkl.dump(brand_list, f)
    plog("Building brand matrices...")
    enc = OneHotEncoder()
    train_vect = np.reshape(trainDF.brand_num.values, (-1, 1))
    brands_train = enc.fit_transform(train_vect).toarray()
    val_vect = np.reshape(valDF.brand_num.values, (-1, 1))
    brands_val = enc.transform(val_vect).toarray()
    test_vect = np.reshape(testDF.brand_num.values, (-1, 1))
    brands_test = enc.transform(test_vect).toarray()
    return (brands_train, brands_val, brands_test)
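# One caveat: enc.transform raises a ValueError if a val/test brand index
# never appeared in the training split. A hedged alternative, assuming a
# scikit-learn version whose OneHotEncoder supports handle_unknown; unseen
# brands then encode as all-zero rows instead of raising.
#
#   enc = OneHotEncoder(handle_unknown='ignore')
#   brands_train = enc.fit_transform(
#       np.reshape(trainDF.brand_num.values, (-1, 1))).toarray()
#   brands_val = enc.transform(
#       np.reshape(valDF.brand_num.values, (-1, 1))).toarray()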
def merge_data(bows, images, others):
    '''
    merge together the datasets to be used in the model
    args:
        bows, images, others: (train, val, test) triples, or None
    returns:
        2D float32 numpy arrays
    '''
    #HACK: splitting None into 3
    if bows is None:
        bows = (None, None, None)
    if images is None:
        images = (None, None, None)
    plog("Merging data...")
    X_train = conditional_hstack(others[0], bows[0], images[0], 'train')
    X_val = conditional_hstack(others[1], bows[1], images[1], 'val')
    X_test = conditional_hstack(others[2], bows[2], images[2], 'test')
    return (X_train.astype(np.float32),
            X_val.astype(np.float32),
            X_test.astype(np.float32))
def get_selected_image_features(df,
                                datadir,
                                dataset,
                                iloc0,
                                iloc1,
                                save_freq,
                                out_pickle_name='image_features.pkl',
                                batch_size=256,
                                width=224,
                                filetype='jpg'):
    '''
    for a given index range, download and resize the images,
    then save to directory

    args:
        df: dataframe where image urls are
        datadir: data directory
        dataset: string 'train' or 'test' or other identifier
        iloc0: int or None. first iloc of range of images to download
        iloc1: int or None. last iloc of range of images to download
        save_freq: how many batches before saving
        out_pickle_name: name of outfile
        batch_size: rows per batch
    returns:
        None
    '''
    plog("Beginning feature extraction...")
    # treat None as "from the start" / "to the end" of the dataframe
    if iloc0 is None:
        iloc0 = 0
    if iloc1 is None:
        iloc1 = df.shape[0]
    assert iloc0 <= df.shape[0]
    assert iloc1 <= df.shape[0]
    image_urls = df.large_image_URL.iloc[iloc0:iloc1]
    iloc = iloc0
    prev_iloc = iloc0
    batch_num = 0
    featureDF = pd.DataFrame()
    for batch in iterate_minibatches(image_urls, batch_size):
        plog("extracting image features for batch %i, iloc %i" % (batch_num, iloc))
        batch_featureDF = batch_extract_features(batch, dataset, datadir, width, filetype)
        featureDF = featureDF.append(batch_featureDF, verify_integrity=True)
        iloc += batch_size
        batch_num += 1
        if iloc > iloc0 and (batch_num % save_freq == 0 or iloc >= iloc1 - 1):
            plog("Saving from image iloc %i to image iloc %i" % (prev_iloc, iloc))
            #Append to csv here
            with open('csv_fn.csv', 'a') as outf:
                featureDF.to_csv(outf, header=False)
            #with open(datadir + out_pickle_name + '_' + str(prev_iloc) + '_' + str(iloc) + '.pkl', 'wb') as outf:
            #    pkl.dump(featureDF, outf)
            prev_iloc = iloc
            #reset featureDF to save memory
            featureDF = pd.DataFrame()
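# iterate_minibatches is not defined in this excerpt. A minimal sketch of
# the behavior the loop above assumes: yield consecutive slices of the
# series, with a short final slice rather than dropping the remainder.
def iterate_minibatches(series, batch_size):
    for start in range(0, len(series), batch_size):
        yield series.iloc[start:start + batch_size]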
def do_main(conf):
    for info in conf.get("domains"):
        domain = info["domain"]
        hosts = info["hosts"]
        plog(u"checking domain: %s", domain)
        # pick the fastest host
        fastest_host = pick_fastest_ping(hosts)
        plog(u"fastest-responding host: %s", fastest_host)
        # update the DNS record
        new_value = update_record(domain, fastest_host)
        plog(u"new record value: %s => %s", domain, new_value)
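# pick_fastest_ping is called above but not shown here. A minimal sketch
# under the assumption that it shells out to the system ping and returns
# the host with the lowest average round-trip time; the name and return
# convention are taken from the call site only.
import subprocess

def pick_fastest_ping(hosts, count=3):
    def avg_rtt(host):
        try:
            out = subprocess.check_output(
                ["ping", "-c", str(count), host]).decode()
            # parse the "rtt min/avg/max/mdev = a/b/c/d ms" summary line
            summary = [l for l in out.splitlines() if "min/avg" in l][0]
            return float(summary.split("=")[1].split("/")[1])
        except Exception:
            return float("inf")  # unreachable hosts sort last
    return min(hosts, key=avg_rtt)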
#! python3
from utils import getIntegers, plog

logfile = r"\\192.168.99.91\shares\scripts\BartenderPrint\testlog_.log"

cnt = 0
with open(logfile, "r") as f:
    for line in f:
        cnt += getIntegers(line)[6]
print(cnt)
plog(
    logfile, "total printed in February: " + str(cnt) +
    " thermal labels for the groups, which comes to " + str(int(cnt / 300)) +
    " rolls, or " + str(cnt / 300 / 60) + " boxes")
plog(
    logfile,
    "the totals do not account for thermal labels for the loaders or for "
    "other uses, such as stickers for vegetables or fruit")
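# getIntegers is imported from utils but not shown. A plausible minimal
# sketch, assuming it returns every integer found on a line (the script
# above relies on the count sitting in field 7, i.e. index 6):
import re

def getIntegers(line):
    return [int(tok) for tok in re.findall(r"\d+", line)]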
def main(datadir,
         train_samples=10000,
         test_samples=1000,
         val_portion=0.1,
         use_images=True,
         use_text=True,
         train_image_fn='train_image_features_0_2500.pkl',
         test_image_fn='test_image_features_0_2500.pkl',
         debug=False):
    '''
    1. run train_val_split on training
    1b. run shuffle on test
    2. if text:
        a. train tokenizer
        b. convert text data to bag of words matrix
    3. if images:
        a. extract image data
    4. merge datasets

    returns:
        data = (train_data, val_data, test_data), n_values
        where each *_data is (X, y1, y2, y3)
    '''
    if debug:
        trainpath = datadir + 'head_train_set.csv'
        testpath = datadir + 'head_test_set.csv'
        train_imagepath = datadir + 'train_image_features_0_2500.pkl'
        test_imagepath = datadir + 'test_image_features_0_2500.pkl'
        train_samples = 90
        test_samples = 90
    else:
        trainpath = datadir + 'train_set.csv'
        testpath = datadir + 'test_set.csv'
        train_imagepath = datadir + train_image_fn
        test_imagepath = datadir + test_image_fn

    dstart = datetime.now()
    plog("Checking to see if prepped data already available...")
    outpath = datadir + 'model_data_%i_%r_%s_%s.pkl' % (
        train_samples, val_portion, use_images, use_text)
    if os.path.exists(outpath):
        plog("Data found. Loading...")
        with open(outpath, 'rb') as f:
            data, n_values = pkl.load(f)
        dfin = datetime.now()
        plog("Data loading time: %s" % (dfin - dstart))
        return data, n_values

    plog("Prepped data not available. Preparing data...")
    plog("Loading train csv...")
    trainDF = pd.read_csv(trainpath, header=0, index_col=0, low_memory=False)
    plog("Loading test csv...")
    testDF = pd.read_csv(testpath, header=0, index_col=0, low_memory=False)
    trainDF = shuffle_and_downsample(trainDF, train_samples)
    trainDF, valDF = train_val_split(trainDF, val_portion)
    testDF = shuffle_and_downsample(testDF, test_samples)

    #Load text data
    t0 = datetime.now()
    if use_text:
        bow_data = build_text_matrices(datadir, 'tokenizer_5000.pkl',
                                       trainDF, valDF, testDF)
        t1 = datetime.now()
        plog("Time to load text: %s" % str(t1 - t0))
    else:
        bow_data = None

    #Load image data
    t1 = datetime.now()
    if use_images:
        image_data = get_image_matrices(train_imagepath, test_imagepath,
                                        trainDF, valDF, testDF)
        t2 = datetime.now()
        plog("Time to load images: %s" % str(t2 - t1))
    else:
        image_data = None

    #Load other data
    y1_train, y2_train, y3_train = get_targets(trainDF)
    y1_val, y2_val, y3_val = get_targets(valDF)
    y1_test, y2_test, y3_test = get_targets(testDF)
    other_data = build_brand_matrices(trainDF, valDF, testDF)

    X_train, X_val, X_test = merge_data(bow_data, image_data, other_data)
    train_data = X_train, y1_train, y2_train, y3_train
    val_data = X_val, y1_val, y2_val, y3_val
    test_data = X_test, y1_test, y2_test, y3_test

    keys = ['y_1', 'y_2', 'y_3']
    values = [max(d) + 1 for d in train_data[1:]]
    n_values = dict(zip(keys, values))

    data = (train_data, val_data, test_data)
    plog("Data loaded. Saving to %s" % outpath)
    with open(outpath, 'wb') as f:
        pkl.dump((data, n_values), f)
    dfin = datetime.now()
    plog("Data loading time: %s" % (dfin - dstart))
    return data, n_values
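# train_val_split is called above but not included in this excerpt. A
# minimal sketch of the assumed behavior: peel the last val_portion of
# the (already shuffled) training frame off as a validation set.
def train_val_split(df, val_portion):
    n_val = int(df.shape[0] * val_portion)
    split = df.shape[0] - n_val
    return df.iloc[:split], df.iloc[split:]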
def stitch_files(basename, idx_start=0, idx_finish=None):
    '''
    Cycles through all files in the basename directory
    and stacks them together.
    args:
        basename: name without indexes, e.g. 'train_image_features'
        idx_start: starting index (usually 0)
        idx_finish: last index of the output file, or None for no upper bound
    returns:
        None. saves pickle of images stitched together
    '''
    #datadir = '../data/'
    datadir = '/scratch/cdg356/spring/data/'
    featuredir = datadir + basename + '/'

    #Get list of indexes
    iloc0_list = []
    iloc1_list = []
    for root, dirs, files in os.walk(featuredir):
        for fname in files:
            idx_range = get_indexes(fname)
            if idx_range[0] is not None and idx_range[1] is not None:
                # treat idx_finish=None as "no upper bound"
                if idx_range[0] >= idx_start and (idx_finish is None
                                                  or idx_range[1] <= idx_finish):
                    iloc0_list.append(idx_range[0])
                    iloc1_list.append(idx_range[1])
    iloc0_list.sort()
    iloc1_list.sort()

    #Make sure there are no duplicates present
    assert len(iloc0_list) == len(set(iloc0_list))
    assert len(iloc1_list) == len(set(iloc1_list))

    #Make sure there are no gaps, i.e. that iloc1 of one file = iloc0 of the next
    for i in range(len(iloc0_list) - 1):
        assert iloc0_list[i + 1] == iloc1_list[i]

    #Load files
    for i, iloc0 in enumerate(iloc0_list):
        iloc1 = iloc1_list[i]
        fname = basename + "_%i_%i.pkl" % (iloc0, iloc1)
        plog("loading %s..." % fname)
        with open(featuredir + fname, 'rb') as f:
            if i == 0:
                df = pkl.load(f)
            else:
                df2 = pkl.load(f)
                df = pd.concat([df, df2])
        plog("df shape: %s" % str(df.shape))

    max_index = max(iloc1_list)
    # A couple sanity checks
    assert max_index == iloc1_list[-1]
    if idx_finish is not None:
        assert idx_finish == max_index
    assert idx_start == iloc0_list[0]

    outname = datadir + basename + '_%i_%i.pkl' % (idx_start, max_index)
    plog("writing to %s..." % outname)
    with open(outname, 'wb') as f:
        pkl.dump(df, f)
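# get_indexes is assumed to pull the (iloc0, iloc1) pair out of filenames
# like 'train_image_features_0_2500.pkl'; it is not shown in this excerpt,
# so this regex-based sketch is illustrative only.
import re

def get_indexes(fname):
    m = re.search(r"_(\d+)_(\d+)\.pkl$", fname)
    if m is None:
        return (None, None)
    return (int(m.group(1)), int(m.group(2)))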
#TODO:
#batch these up into batches of 256 or 512 images
from utils import create_log, plog
create_log(__file__)

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import theano
import cPickle as pkl
import download_images_to_directory as dl
from datetime import datetime

plog("Theano device: %s" % theano.config.device)

#dnn requires GPU
import lasagne
from lasagne.layers import InputLayer, DenseLayer, DropoutLayer
from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
from lasagne.layers import MaxPool2DLayer as PoolLayer
from lasagne.layers import LocalResponseNormalization2DLayer as NormLayer
from lasagne.utils import floatX

# ### Load the model parameters and metadata
    previoustime = os.path.getmtime(file)
except:
    previoustime = os.path.getmtime(file)

# no need to sort
# compute the diff
try:
    previousjson  # raises NameError on the first pass, skipping the diff
    currentjson = utils.loadjson(file, quiet=True)
    currentcustomtime = ntpath.getmtime(file)
    #print(currentcustomtime)
    #print(currentjson["group1"]["lastnum"] > previousjson["group1"]["lastnum"])
    if currentjson["group1"]["lastnum"] > previousjson["group1"]["lastnum"]:
        utils.plog(
            logfile, "printed " +
            str(currentjson["group1"]["lastnum"] -
                previousjson["group1"]["lastnum"]) +
            " tags for group 1", currentcustomtime)
    if currentjson["group2"]["lastnum"] > previousjson["group2"]["lastnum"]:
        utils.plog(
            logfile, "printed " +
            str(currentjson["group2"]["lastnum"] -
                previousjson["group2"]["lastnum"]) +
            " tags for group 2", currentcustomtime)
    if currentjson["group3"]["lastnum"] > previousjson["group3"]["lastnum"]:
        utils.plog(
            logfile, "printed " +
            str(currentjson["group3"]["lastnum"] -
                previousjson["group3"]["lastnum"]) +
            " tags for group 3", currentcustomtime)
    if currentjson["group4"]["lastnum"] > previousjson["group4"]["lastnum"]:
'''
data_prep.py

Starting with the csv's, ending with X_train, y_train, X_val, y_val, X_test, y_test
Where X's are feature vectors and y's are classifier integers
'''
__author__ = 'Charlie Guthrie'

from utils import create_log, plog
create_log(__file__)

plog('importing modules...')
from datetime import datetime
import os
import pandas as pd
import numpy as np
import pdb
import cPickle as pkl
import bag_of_words
from sklearn.preprocessing import OneHotEncoder


def shuffle_and_downsample(df, samples):
    '''
    shuffle dataframe, including previous indexes, then downsample
    args:
        samples: number of samples
    '''
    #random seed 9 makes sure we always get the same shuffle.
    np.random.seed(9)
    assert df.shape[0] > 2
def load_pretrained_model(datadir):
    plog("Loading vgg model...")
    # pickles must be opened in binary mode; with-block closes the handle
    with open(datadir + 'vgg_cnn_s.pkl', 'rb') as f:
        model = pkl.load(f)
    #CLASSES = model['synset words']
    mean_image = model['mean image']
    return model, mean_image
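# A hedged sketch of how a downloaded RGB image would typically be prepared
# for the network built in build_image_network, using the mean image loaded
# above. BGR channel order and mean subtraction follow the Caffe convention
# the pretrained weights assume; the helper name is illustrative, and
# mean_image is assumed to be a (3, 224, 224) array.
def prep_image(im, mean_image, width=224):
    # im: HxWx3 uint8 RGB array, already resized/cropped to width x width
    im = im.astype(np.float32)
    im = im[:, :, ::-1]               # RGB -> BGR, per the Caffe convention
    im = np.transpose(im, (2, 0, 1))  # HWC -> CHW
    im = im - mean_image              # subtract the dataset mean image
    return im[np.newaxis]             # add batch axis: (1, 3, 224, 224)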
'''
main.py

End-to-end script for running all processes.
'''
__author__ = 'Charlie Guthrie'

from utils import create_log, plog, fplog
create_log(__file__)

import sys

#Command-line arguments
if len(sys.argv) < 2:
    plog("Usage: python main.py [num_train_samples] [use_images|use_text]")
    sys.exit()
else:
    train_samples = int(sys.argv[1])
    if 'use_images' in sys.argv:
        use_images = True
    else:
        use_images = False
    if 'use_text' in sys.argv:
        use_text = True
    else:
        use_text = False

plog('importing main.py modules...')
import os
import data_prep
import models
import pdb
from datetime import datetime