def spectral_cluster():
    t0 = time()
    # Cluster the precomputed affinity matrix into 100 groups
    S = spectral_clustering(
        loadPickle('./models/trump_sample_affinity.pickle'),
        n_clusters=100)
    savePickle(S, './models/trump_sample_spectral.pickle')
    print(S)
    print("Spectral clustering took {}s".format(time() - t0))
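# Every snippet in this collection leans on small pickle helpers
# (savePickle/loadPickle and variants) whose definitions are not shown.
# Note that the argument order differs between projects: some call
# savePickle(obj, path), others savePickle(path, obj) or keyword forms
# like savePickle(var=..., filename=...). A minimal sketch of the
# obj-first variant used directly above (an assumption, not the
# confirmed helper from any of these repos):
import pickle

def savePickle(obj, filename):
    # Serialize obj to filename with the standard pickle module.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

def loadPickle(filename):
    # Load and return the object pickled at filename.
    with open(filename, 'rb') as f:
        return pickle.load(f)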
def store_all_org(self, codelist=None):
    # Prefer an explicitly passed codelist; fall back to the cached one,
    # then to fetching it from the TGA client. (The original ignored a
    # passed-in codelist entirely.)
    if codelist:
        self.codelist = codelist
    elif not self.codelist:
        self.codelist = self.tga.getCodeList()
    for code in self.codelist:
        self.store_one_org(code)
    utils.savePickle(var=self.sofar, filename="orgs_before_error")
    self.solve_error()
    return self.sofar
def kmeans():
    t0 = time()
    K = k_means(loadPickle('./models/trump_sample_vectors.pickle'),
                n_clusters=100, n_jobs=-1)
    savePickle(K, './models/trump_sample_kmeans.pickle')
    print(K)
    print("K-means took {}s".format(time() - t0))
def TrainInitialModelSample():
    train_data = read_OnlyTrainData(dropFileName=True)
    train_data = train_data[:20]
    X_train = train_data.drop('label', axis=1).values
    y_train = train_data['label']

    test_data = read_OnlyTestData(dropFileName=True, returnXy=False)
    X_test_set = test_data.drop('label', axis=1).values
    y_test_set = test_data['label']

    svmClassifier = SVC(C=10, kernel='linear', gamma=0.001,
                        probability=True, random_state=500156)
    logRegClassifier = LogisticRegression(random_state=789)
    rfClassifier = RandomForestClassifier(criterion='entropy', random_state=4528)
    classifiers = {
        type(svmClassifier).__name__: svmClassifier,
        type(logRegClassifier).__name__: logRegClassifier,
        type(rfClassifier).__name__: rfClassifier,
    }

    # Train all 3 classifiers with the initial data samples
    experiments = []
    scores = {}
    for clfname, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test_set)
        # accuracy_score expects (y_true, y_pred)
        score = accuracy_score(y_test_set, y_pred)
        pred_probs = clf.predict_proba(X_test_set)
        modelObj = {}
        modelObj['classifier_name'] = clfname
        modelObj['acc_score'] = score
        modelObj['pred_probs'] = pred_probs
        modelObj['clf_obj'] = clf
        scores[clfname] = score
        # save models to pickle
        savePickle(clf, clfname)
        # print(scores)
        # save initial scores to pickle
        savePickle([scores[clfname]], clfname + '_scores')
        experiments.append(modelObj)
    return scores
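# A minimal usage sketch, assuming the read_OnlyTrainData/read_OnlyTestData
# helpers and the pickle utilities are importable in this environment:
initial_scores = TrainInitialModelSample()
# prints a dict keyed by classifier name, e.g.
# {'SVC': ..., 'LogisticRegression': ..., 'RandomForestClassifier': ...}
print(initial_scores)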
def generateOracleData(df_samples):
    oracleSamples = []
    savePickle(df_samples, 'finalOracleSamples15')
    for df in df_samples:
        clfname = df['classifier'][0]
        for clip_name in df['sample']:
            instance = {'name': clip_name[-1],
                        'clip': 'static/datafiles/audiofiles/' + clip_name[-1] + '.mp3'}
            oracleSamples.append(instance)
    return oracleSamples
def train():
    data_file = '~/data/twitter/ece901/161112politics0.csv'
    model_base = './models/w2v'
    model_pickle = model_base + '.pickle'
    model_bin = model_base + '.bin'
    data = TweetIterator(data_file, False, 'tokenized_tweet')
    t0 = time()
    model = word2vec.Word2Vec(data, workers=multiprocessing.cpu_count(), sg=1)
    print("Training word2vec model took {}s".format(time() - t0))
    savePickle(model, model_pickle)
    saveWord2Vec(model, model_bin)
def get_vecs():
    t0 = time()
    tweet2vec = Tweet2Vec(model_file, char=False, chrd=True, word=True)
    print("Loading model took {}s".format(time() - t0))
    source = pd.read_csv(source_file, header=None, sep=chr(1))
    text = source[0]
    t0 = time()
    M = tweet2vec[text]
    print(M)
    print(M.shape)
    print("Grabbing {} vectors took {}s".format(len(text), time() - t0))
    savePickle(M, './models/trump_sample_vectors.pickle')
def PrepareHashtags(source, top_n=2000):
    '''
    This function picks out the `top_n` most frequent hashtags
    and saves them as `./models/hashtags.txt`.
    You can then make a MultiLabelBinarizer object with MakeMLB().
    '''
    print("Processing {} and creating MultiLabelBinarizer object".format(source))
    model_dir = './models'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    counts = {}
    counts_details = {}
    num_tweets = 0
    for i, hashtags in enumerate(TweetIterator(source, True, 'hashtags')):
        num_tweets += 1
        if num_tweets % 1000 == 0:
            print("Processed {} tweets".format(num_tweets))
        for h in hashtags:
            if h not in counts:
                counts[h] = 1
                counts_details[h] = [i]
            else:
                counts[h] += 1
                counts_details[h].append(i)
    counts_sorted = sorted(counts.keys(), key=lambda x: -counts[x])
    top_hashtags = counts_sorted[:top_n]
    hashtag_file = os.path.join(model_dir, 'hashtags.txt')
    hashtag_count_file = os.path.join(model_dir, 'hashtag_counts.pickle')
    saveList(top_hashtags, hashtag_file)
    savePickle(counts, hashtag_count_file)
def trainModels(trainingData):
    # get test data of 50 instances
    X_test_set, y_test_set = read_OnlyTestData(dropFileName=True, returnXy=True)
    # load the previously pickled classifiers
    LogRegression = getPicklefile('LogisticRegression')
    Svc = getPicklefile('SVC')
    RFClassifier = getPicklefile('RandomForestClassifier')
    classifiers = [LogRegression, Svc, RFClassifier]
    scores = {}
    for execNo in range(len(trainingData)):
        tdata = trainingData[execNo]
        tdata = mapping(tdata)
        X = tdata.drop(['label', 'audio_name'], axis=1).values
        y = tdata['label']
        clf = classifiers[execNo]
        clfname = type(clf).__name__
        # retrain the model
        clf.fit(X, y)
        y_pred = clf.predict(X_test_set)
        # accuracy_score expects (y_true, y_pred)
        score = accuracy_score(y_test_set, y_pred)
        scores[clfname] = score
        # save models to pickle
        savePickle(clf, clfname)
        # get the previous scores, append the new one, and save them back
        oldscores = getPicklefile(clfname + '_scores')
        oldscores.append(score)
        # print("OLD", oldscores)
        savePickle(oldscores, clfname + '_scores')
    return scores
def MakeMLB(top_n=1000):
    '''
    This function produces the "MultiLabelBinarizer" object and saves it
    as a pickle file in the ./models directory.

    The MultiLabelBinarizer is the object that turns a list of hashtags
    into a binary vector, for labels for our model.

    Loads `top_n` hashtags in `./models/hashtags.txt` and makes a
    MultiLabelBinarizer object.
    '''
    model_dir = './models'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    output_mlb = os.path.join(model_dir, 'mlb.pickle')
    hashtag_file = os.path.join(model_dir, 'hashtags.txt')
    top_hashtags = loadList(hashtag_file)
    top_hashtags = top_hashtags[:top_n]
    mlb = MultiLabelBinarizer(sparse_output=False).fit([top_hashtags])
    savePickle(mlb, output_mlb)
    print("Final set of hashtags: {}".format(mlb.classes_))
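# A minimal end-to-end sketch of the two functions above; the source
# path and the example hashtags are hypothetical:
PrepareHashtags('tweets.csv', top_n=2000)
MakeMLB(top_n=1000)
mlb = loadPickle('./models/mlb.pickle')
# transform() ignores hashtags outside the fitted vocabulary
# (sklearn warns about unknown classes rather than failing)
y = mlb.transform([['election', 'debate']])
print(y.shape)  # (1, number of fitted hashtags)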
def getSamples_toAnnotate():
    constDiff = 20
    data = json.loads(request.data)
    runCount = int(data['stepSampleCount'])
    sampleCounter = runCount + constDiff
    if sampleCounter == 420:
        return render_template('index.html')
    entropied_samples = compute.computeOracle(sampleCounter)
    logRegSample = entropied_samples[0:5]
    svmSample = entropied_samples[20:25]
    rfSample = entropied_samples[40:45]
    savePickle(logRegSample, 'LogisticRegression_Samples')
    savePickle(svmSample, 'SVC_Samples')
    savePickle(rfSample, 'RandomForestClassifier_Samples')
    print(sampleCounter, " samples")
    return json.dumps([logRegSample, svmSample, rfSample])
def preprocessDeepModel(sequencesPath, outputPath, maxLen=None):
    """
    Preprocess the sequences to make them trainable by a deep model.

    Parameters:
    -----------
    sequencesPath (str): where the raw sequences are stored
    outputPath (str): where the preprocessed sequences will be stored
    maxLen (int): size of the padded sequences (defaults to the length
        of the longest sequence)

    Returns:
    --------
    (np.array): the padded, re-indexed sequences.
    """
    modelPath = "./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"
    # Download the model if needed
    if not os.path.isfile(modelPath):
        # The leading space separates the output path from the URL
        link = " http://embeddings.org/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"
        os.system("wget -O " + modelPath + link)
    # Load the model
    w2v = word2vec.load(modelPath)
    vocab = set(w2v.vocab)
    # Load the encoder
    encoder = openPickle("./Data/dict.pkl")
    decoder = {encoder[key]: key for key in encoder}
    if not os.path.isfile(outputPath):
        fromOldToNew = reIndexToken(w2v, decoder)
        if not os.path.isfile("./Data/newDict.pkl"):
            newCoder = {"pad": 0, "unk": len(decoder) - 1}
            for key in decoder:
                if fromOldToNew[key] != len(decoder) - 1:
                    newCoder[decoder[key]] = fromOldToNew[key]
            savePickle("./Data/newDict.pkl", newCoder)
        else:
            newCoder = openPickle("./Data/newDict.pkl")
        if not os.path.isfile(sequencesPath):
            raise FileNotFoundError("Please run studyWord2Vec.py")
        sequences = openPickle(sequencesPath)
        sequences = reIndexSequences(sequences, fromOldToNew)
        savePickle(outputPath, sequences)
    else:
        sequences = openPickle(outputPath)
    if maxLen is None:
        maxLength = max([len(seq) for seq in sequences])
    else:
        maxLength = maxLen
    return pad_sequences(sequences, maxlen=maxLength)
import json

import numpy as np

import params
import utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

captions = utils.readCaptions(params.TRAIN_CAPTIONS_PATH)
stemmed_dict = utils.stemming(captions)

# Flatten the per-image caption dict into a single corpus list
corpus = []
for x in stemmed_dict.keys():
    for y in stemmed_dict.get(x):
        corpus.append(y)

bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=3)
transformer = TfidfTransformer(smooth_idf=False)
counts = bigram_vectorizer.fit_transform(corpus)
transformer.fit(counts)

utils.savePickle(object=bigram_vectorizer, PATH=params.REDUCED_BIGRAM_MODEL)
utils.savePickle(object=transformer, PATH=params.REDUCED_TF_IDF_MODEL)
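# A short sketch of reusing the fitted objects above on an unseen
# caption; only the two estimators defined above are assumed, and the
# caption text is made up:
new_counts = bigram_vectorizer.transform(["a man riding a horse"])
new_tfidf = transformer.transform(new_counts)
print(new_tfidf.shape)  # (1, vocabulary size)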
from preprocessing import TfIdfTransformer, sparseBagOfWords

# Linear SVC with bag of words
model = LinearSVC(C=0.1, class_weight="balanced")
_, expectedScore = evaluateModel(model, "./Data/Learn/sequences.pkl",
                                 "./Data/Learn/labels.pkl")
preds = getPredictions(model, "./Data/Learn/sequences.pkl",
                       "./Data/Learn/labels.pkl", "./Data/Test/sequences.pkl")
mcPreds = convertLabels(preds)
name = "model__%s__preprocesser__%s__expected__%.4f.pkl" % (
    "LinearSVC(C=0.1, weight_class=balanced)", "sparseBagOfWords", expectedScore)
savePickle(pjoin("./Results/", name), mcPreds)

# Linear SVC with tf-idf
print("\n" + "#" * 50 + "\n")
model = LinearSVC(C=11., class_weight="balanced")
preprocesser = TfIdfTransformer(norm="l1")
_, expectedScore = evaluateModel(model, "./Data/Learn/sequences.pkl",
                                 "./Data/Learn/labels.pkl", preprocesser)
preds = getPredictions(model, "./Data/Learn/sequences.pkl",
                       "./Data/Learn/labels.pkl", "./Data/Test/sequences.pkl")
mcPreds = convertLabels(preds)
name = "model__%s__preprocesser__%s__expected__%.4f.pkl"
name = name % ("LinearSVC(C=11., weight_class=balanced)", preprocesser, expectedScore)
def get_affinity():
    t0 = time()
    A = rbf_kernel(loadPickle('./models/trump_sample_vectors.pickle'))
    savePickle(A, './models/trump_sample_affinity.pickle')
    print(A.shape)
    print("Computing the affinity matrix took {}s".format(time() - t0))
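# The Trump-sample functions chain through their pickle files:
# get_vecs() writes trump_sample_vectors.pickle, get_affinity() turns it
# into trump_sample_affinity.pickle, and spectral_cluster() / kmeans()
# consume those. A minimal driver in that order (the order is inferred
# from the file dependencies, not stated in the source):
get_vecs()
get_affinity()
spectral_cluster()
kmeans()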
model.add(Dense(output_dim=128))
model.compile("nadam", "mae")

print('Training...')
i = 0
for epoch in range(EPOCHS):
    random.seed(42)
    random.shuffle(text)
    print(' EPOCH:', epoch)
    # Note: the Keras `model` compiled above is not trained here; the
    # loop incrementally fits `mlp`, an incremental sklearn-style regressor
    for text_descriptors, img_descriptors in utils.getBatch(text, images, BATCH_SIZE):
        print(vstack(text_descriptors).shape)
        t0 = time.time()
        mlp.partial_fit(vstack(text_descriptors), img_descriptors)
        print(' Partial fit {} took: {} min, Score {}'.format(
            i, round((time.time() - t0) / 60, 2), mlp.loss_))

print('Saving model...')
utils.savePickle(mlp, params.CNN_MLP)
print('Done!')
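# A minimal sketch of loading the persisted regressor back for inference.
# utils.loadPickle is assumed as the counterpart of utils.savePickle, and
# some_text_batch stands in for a batch of text descriptors:
mlp = utils.loadPickle(params.CNN_MLP)
predicted_img_descriptors = mlp.predict(vstack(some_text_batch))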
    9   03797390   Mug    214
    10  02880940   Bowl   186
'''
anchor_vects, labels = getRecons(num_to_get=10, cat_label_index=8)

#%% Interpolate between 2 set reconstructions from the previous method
interpolateDesigns(anchor_vects, labels, 3, 5)

#%% Run model on all data to get latent vects and loss.
#   Used for the streamlit app and other places.
shape2loss = {}
shape2vec = {}
for sample, label in tqdm(zip(all_voxs, all_mids), unit_scale=True,
                          desc="Saving shape 2 vec: ", unit=" encodes",
                          total=len(all_voxs)):
    sample = tf.cast(sample, dtype=tf.float32)
    shape2vec[label] = model.encode(sample[None, ...], reparam=True).numpy()[0]
    shape2loss[label] = model.compute_loss(sample[None, ...]).numpy()

ut.savePickle(os.path.join(lg.root_dir, "shape2vec.pkl"), shape2vec)
ut.savePickle(os.path.join(lg.root_dir, "shape2loss.pkl"), shape2loss)

#%% Shapetime journey code for fun. Shapetime journey methods:
def showRandIndices(num_to_show=100):
    for i in np.random.randint(0, len(shape2vec), size=num_to_show):
        vox = shapemodel.decode(shape2vec[mids[i]][None, ...], apply_sigmoid=True)[0, ..., 0]
        ut.plotVox(vox, step=2, limits=cf_limits, title=i)

def journey(journey_length=20, vects_sample=8, max_dist=8, interp_points=6,
            plot_step=2, start_index=715):
    model.training = False
    journey_vecs = []
    visited_indices = [start_index]
    journey_mids = []
    mids = list(shape2vec.keys())
print "Code:", code org = self.tga.getOrgDetails(code) self.sofar.append(org) if org.DeliveryNotifications and org.Scopes: if code in code2cid.keys(): print "Cid:", code2cid[code] score, qualsno = self.tga.get_org_scale(org) print "Score:", score self.cid2scale[(code2cid[code], code)] = (score, qualsno) else: print "New company {} found!".format(code) self.new_company.append(code) except Exception, e: self.error[code] = e time.sleep(15) utils.savePickle(var=self.sofar, filename="all_TGA") utils.savePickle(var=self.cid2scale, filename="cid2scale") utils.savePickle(var=self.error, filename="scalepipeError") utils.savePickle(var=self.new_company, filename="new_company") # for cid, code in self.cid2scale.keys(): # print "Updating company with RTOcode: {}".format(code) # self._updateCompany('scale_score', self.cid2scale[(cid, code)], companyID=cid) # print "Update finished" # self.updated.append(code) def _addContact2Company(self, vid, companyId): print "Adding contacts to company" url = 'https://api.hubapi.com/companies/v2/companies/{0}/contacts/{1}?hapikey={2}'.format(str(companyId), str(vid),
from preprocessing import getMeanVectors
from utils import openPickle, savePickle

if not os.path.isfile("./Data/Learn/embeddedMeanSequences.pkl"):
    encoder = openPickle("./Data/dict.pkl")
    decoder = {encoder[key]: key for key in encoder}
    w2v = word2vec.load("./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin")
    preprocesser = lambda x: getMeanVectors(x, w2v, decoder)
    sequences = np.array(openPickle("./Data/Learn/correctedSequences.pkl"))
    # Embed the sequences in chunks of 5000 to bound memory usage
    for i in range(len(sequences) // 5000):
        if i == 0:
            embeddedSeq = preprocesser(sequences[0:5000])
        else:
            embeddedSeq = np.vstack((embeddedSeq,
                                     preprocesser(sequences[5000 * i: 5000 * (i + 1)])))
        print("Processed until i = %s" % i)
    # Embed the remainder: the original restarted at 5000 * i, which
    # re-embedded the last full chunk; 5000 * (i + 1) picks up the tail
    embeddedSeq = np.vstack((embeddedSeq, preprocesser(sequences[5000 * (i + 1):])))
    savePickle("./Data/Learn/embeddedMeanSequences.pkl", embeddedSeq)

model = XGBClassifier(n_estimators=500, max_depth=5, reg_alpha=10., reg_lambda=20.)
_, expectedScore = evaluateModel(model, "./Data/Learn/embeddedMeanSequences.pkl",
                                 "./Data/Learn/labels.pkl", lambda x: x)
print("")
paddedTrainSeq = preprocessDeepModel("./Data/Learn/correctedSequences.pkl",
                                     "./Data/Learn/kerasSequences.pkl", 409)
labels = toBoolList(openPickle("./Data/Learn/labels.pkl"))
trainInd, testInd = getTrainTest(labels)
X_train, X_val = paddedTrainSeq[trainInd], paddedTrainSeq[testInd]
y_train, y_val = labels[trainInd], labels[testInd]

# Test data
sequences = openPickle("./Data/Test/sequences.pkl")
correcter = openPickle("./Resources/tokenCorrecter.pkl")
correctedSequences = sequencesCorrecter(sequences, correcter)
savePickle("./Data/Test/correctedSequences.pkl", correctedSequences)
paddedSeq = preprocessDeepModel("./Data/Test/correctedSequences.pkl",
                                "./Data/Test/kerasSequences.pkl", 409)

# CNN
CNNPath = "./Resources/CNNWeight/CNNWeight.h5"
cnn = load_model(CNNPath)

## Evaluate score
trainPreds = cnn.predict(X_train).flatten()
# f1_score expects (y_true, y_pred)
print("Training score: %.4f" % f1_score(y_train, trainPreds > 0.5))
valPreds = cnn.predict(X_val).flatten()