# Experiment setup: choose the vectoriser(s), build the label matrix and the
# train / cross-validation split, then fit every vectoriser on all tweet text.
# NOTE(review): tfidf1, t, t2, u and np are bound outside this excerpt.
vectorizers = [tfidf1]
tfidf = vectorizers[0]

# Free-text tag for this run; the previous run's tag is kept for reference.
#comment = 'lsa = 1, tfidf2, 175000 -> 1000'
comment = 'tfidf1, transition 75'

# Materialise the tweet column once instead of on every use.
tweet_list = t['tweet'].tolist()

# Label matrix: every column from position 4 onwards (an earlier experiment
# used [:, 9:]).  `.iloc` replaces `.ix`, which was removed in pandas 1.0.
# NOTE(review): assumes the column labels are not integers, so the old `.ix`
# slice was positional as well -- confirm against the loaded frame.
y = np.array(t.iloc[:, 4:])
y_original = np.array(t.iloc[:, 4:])

# Fraction of rows held out for cross-validation.
cv_split = 0.2
n = len(tweet_list)
train_end = int(np.round(n * (1 - cv_split)))
# With cv_split == 0 the final 20% of rows is still used as the CV set.
cv_beginning = int(np.round(n * (1 - cv_split if cv_split > 0 else 0.8)))

train = tweet_list[0:train_end]
cv_X_original = np.array(tweet_list[cv_beginning:])
cv_y = np.array(y[cv_beginning:])
c = u.strings_to_classes(t['state'])

if cv_split == 0:
    # No hold-out: train on every tweet (a copy, as the original built a new list).
    train = list(tweet_list)
else:
    # Trim the labels to the same rows as `train`.
    y = y[0:train_end]

# Accumulators used by the rest of the script.
prediction_grand_all = 0
predict_cv_grand_all = 0
list_predictions = []
list_predictions_test = []

for tfidf in vectorizers:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('fitting vectorizer...')
    # Fit on the tweets of both frames (t and t2).
    tfidf.fit(tweet_list + t2['tweet'].tolist())
    print('transforming train set...')
    #train = tfidf.transform(train)
# Tail of a repair-aggregation loop, followed by construction and saving of
# the training matrix (Python 2 print statements).
# NOTE(review): the line breaks/indentation of this fragment were lost, and the
# loop and `if` headers that own the first statements (including the second
# `else:` that bumps error_count) lie outside this excerpt, so the code is left
# exactly as found -- restore the layout from the original script.
#
# Aggregation: the first time a repair_key is seen, a 7-element record
#   [entry[-1], timespan.days, entry[0], entry[1], entry[2], entry[3], sales]
# is stored in dict_repair; for later occurrences entry[-1] is added to the
# record's first element.
#
# Matrix build: each record is copied into `data` and turned into array X;
# record positions 0, 1, 2, 3 and 6 are kept.  Columns 2 and 3 of that
# selection go through u.strings_to_classes and then u.create_t_matrix
# (presumably a one-hot style encoding -- TODO confirm in util).  The final X is
# the float32 cast of columns 0, 1 and 4 stacked horizontally with t1 and t2;
# its shape is printed and the array is saved to a hard-coded absolute path.
sales = dict_sales[key][0] if repair_key not in dict_repair: dict_repair[repair_key] = [entry[-1],timespan.days,entry[0],entry[1],entry[2],entry[3],sales] else: dict_repair[repair_key][0] += entry[-1] else: error_count += 1 data = [] for value in dict_repair.values(): data.append([ele for ele in value]) X = np.array(data) X = X[:,[0,1,2,3,6]] fac1 = u.strings_to_classes(X[:,2]) fac2 = u.strings_to_classes(X[:,3]) t1 = u.create_t_matrix(fac1) t2 = u.create_t_matrix(fac2) X = np.hstack([np.float32(X[:,[0,1,4]]),t1,t2]) print X.shape np.save('/home/tim/Downloads/repair/train.npy',X) print 'Saved!' #TODO: use util to create categories #print(t1.ix[0:5,:]) #print(t2.ix[0:5,:])