def generateTeacherScore(X_train, X_cross, X_test, X_eval, X_predict, Y_train, Y_predict):
    teacher = pd.DataFrame(X_train['teacher_acctid_x'])
    teacher_cross = pd.DataFrame(X_cross['teacher_acctid_x'])
    teacher_test = pd.DataFrame(X_test['teacher_acctid_x'])
    teacher_eval = pd.DataFrame(X_eval['teacher_acctid_x'])
    teacher_predict = pd.DataFrame(X_predict['teacher_acctid_x'])

    vectorizer = DictVectorizer(sparse=True)
    teacher_dummies_train = vectorizer.fit_transform(teacher.T.to_dict().values())
    teacher_dummies_cross = vectorizer.transform(teacher_cross.T.to_dict().values())
    teacher_dummies_test = vectorizer.transform(teacher_test.T.to_dict().values())

    vectorizer2 = DictVectorizer(sparse=True)
    teacher_dummies_predict = vectorizer2.fit_transform(teacher_predict.T.to_dict().values())
    teacher_dummies_eval = vectorizer2.transform(teacher_eval.T.to_dict().values())

    logit = LogisticRegression(penalty='l2', dual=False, tol=1, fit_intercept=True,
                               C=0.00000001, intercept_scaling=1, class_weight='auto',
                               random_state=423)
    logit.fit(teacher_dummies_train, Y_train)
    X_train_teacher = logit.predict_proba(teacher_dummies_train)[:, 1]
    X_cross_teacher = logit.predict_proba(teacher_dummies_cross)[:, 1]
    X_test_teacher = logit.predict_proba(teacher_dummies_test)[:, 1]

    logit2 = LogisticRegression(penalty='l2', dual=False, tol=1, fit_intercept=True,
                                C=0.00000001, intercept_scaling=1, class_weight='auto',
                                random_state=423)
    logit2.fit(teacher_dummies_predict, Y_predict)
    X_predict_teacher = logit2.predict_proba(teacher_dummies_predict)[:, 1]
    X_eval_teacher = logit2.predict_proba(teacher_dummies_eval)[:, 1]

    return X_train_teacher, X_cross_teacher, X_test_teacher, X_eval_teacher, X_predict_teacher
def main():
    print "Loading training set..."
    data = list(csv.DictReader(open('adult.csv', 'rU')))
    data = remove_missing(data)
    data_refined, target = refine_data(data)

    # using DictVectorizer to get data in a Scikit-Learn-usable form
    vec = DictVectorizer()
    data_refined = vec.fit_transform(data_refined).toarray()
    data_train, data_test, target_train, target_test = train_test_split(
        data_refined, target, test_size=0.4)

    print "Fitting the nearest neighbor model..."
    n = KNeighborsClassifier(n_neighbors=20)
    n.fit(data_train, target_train)
    print "Score of nearest neighbour algorithm on cross-validation set:", n.score(data_test, target_test)

    print "Loading test set..."
    data = list(csv.DictReader(open('test.csv', 'rU')))
    data = remove_missing(data)
    data_refined, target = refine_data(data)

    # reuse the vectorizer fitted on the training set so the test columns line up
    data_refined = vec.transform(data_refined).toarray()
    print "Score of nearest neighbour algorithm on test set:", float(n.score(data_refined, target)) * 100, "%"
def _dic_list_to_matrix(self, processedData, normalize):
    vectorizer = DictVectorizer()
    if normalize:
        res = preprocessing.normalize(vectorizer.fit_transform(processedData), norm='l2')
    else:
        res = vectorizer.fit_transform(processedData)
    return vectorizer.get_feature_names(), res
def learn_classify__svm_individual(data, folds, test_fold=4):
    test_folds = [0, 1, 2, 3, 4]
    X_train = []
    y_train = []
    X_test = []
    y_test = []

    for i in test_folds:
        if i == test_fold:
            continue
        for name in folds[i]:
            c, ind = parse_filename(name)
            X_train.append(data[c][ind]['features'])
            y_train.append(data[c][ind]['meta']['stance'])

    for i in test_folds:
        if i != test_fold:
            continue
        for name in folds[i]:
            c, ind = parse_filename(name)
            X_test.append(data[c][ind]['features'])
            y_test.append(data[c][ind]['meta']['stance'])

    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    # transform (not fit_transform) so the test matrix uses the training feature space
    X_test = vectorizer.transform(X_test)

    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)
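# Several snippets in this collection call fit_transform on both the training and
# the held-out dictionaries, which re-fits the vocabulary and misaligns the columns.
# A minimal sketch of the intended pattern (toy data of my own, standard
# scikit-learn API): fit the DictVectorizer once on the training dicts, then only
# transform held-out dicts so unseen categories are dropped rather than re-indexed.
from sklearn.feature_extraction import DictVectorizer

train_dicts = [{'color': 'red', 'size': 1}, {'color': 'blue', 'size': 3}]
test_dicts = [{'color': 'red', 'size': 2}, {'color': 'green', 'size': 4}]  # 'green' is unseen

vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(train_dicts)   # learns ['color=blue', 'color=red', 'size']
X_test = vec.transform(test_dicts)         # same columns; 'color=green' is ignored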
class Projects: def __init__(self, outcome_file): self.state_feature_index = 7 self.zip_feature_index = 8 self.binary_feature_index = [12, 13, 14, 15, 16, 17, 19, 20, 32, 33] self.categorical_feature_index = [18, 21, 22, 25, 26, 27, 28] self.numerical_feature_index = [29, 30, 31] self.date_feature_index = 34 self.vec = DictVectorizer(sparse=False) self.load_projects(outcome_file) def load_projects(self, outcome_file): fin = open(outcome_file) self.project_feature_names = fin.next().strip().split(',') self.projects = dict((line.strip().split(',')[0], line.strip().split(','))\ for line in fin) fin.close() def all_features(self, pids): measurements_state = map(lambda k: {str(self.state_feature_index): self.projects[k][self.state_feature_index]}, pids) measurements_zip = map(lambda k: {str(self.zip_feature_index): self.projects[k][self.zip_feature_index][:3]}, pids) measurements_bin = map(lambda k: dict((str(fi), self.projects[k][fi]) for fi in self.binary_feature_index), pids) measurements_cat = map(lambda k: dict((str(fi), self.projects[k][fi]) for fi in self.categorical_feature_index), pids) #measurements_num = map(lambda k: [float(self.projects[k][fi]) for fi in self.numerical_feature_index], pids) measurements_num = map(lambda k: dict((str(fi), str(discretize_num(float(self.projects[k][fi])))) for fi in self.numerical_feature_index), pids) return self.vec.fit_transform(measurements_state), self.vec.fit_transform(measurements_zip), self.vec.fit_transform(measurements_bin), self.vec.fit_transform(measurements_cat), self.vec.fit_transform(measurements_num)#,np.array(measurements_num)
def rw(data, Alpha=0.1, Beta=0.1, Lambda=1.0, M=50000, trajectory=False): # code cues cues = DictVectorizer(dtype=np.int, sparse=False) D = cues.fit_transform([explode(c) for c in data.Cues]) # code outcomes out = DictVectorizer(dtype=np.int, sparse=False) O = out.fit_transform([explode(c) for c in data.Outcomes]) # weight matrix W = np.zeros((len(cues.get_feature_names()), len(out.get_feature_names()))) E = data.Frequency / sum(data.Frequency) rand = alias.multinomial(E) history = dict() iter = 0 while iter < M: iter += 1 item = rand.draw() rwUpdate(W, D[item,:], O[item,:], Alpha, Beta, Lambda) if trajectory: history[iter] = pd.DataFrame(W, columns=out.get_feature_names(), index=cues.get_feature_names(), copy=True) if trajectory: return pd.Panel.from_dict(history) else: return pd.DataFrame(W, columns=out.get_feature_names(), index=cues.get_feature_names())
def main(): global data # baseline using equilibrium equations data = pd.read_csv('serbian.csv') W0 = ndl.ndl(data) diff = np.zeros_like(W0) W = np.zeros_like(W0) # simulate learning for R individuals R = 1000 now = time() P = Pool(6) for i,W1 in P.imap_unordered(simulate,xrange(R)): diff += abs(W1 - W0) W += W1 print >>sys.stderr,i,time()-now diff = diff / R W = W / R # get cue-outcome co-occurrence frequencies cues = DictVectorizer(dtype=int,sparse=False) D = cues.fit_transform([ndl.explode(c) for c in data.Cues]) out = DictVectorizer(dtype=int,sparse=False) X = out.fit_transform([ndl.explode(c) for c in data.Outcomes]) * data.Frequency[:,np.newaxis] O = np.zeros_like(W0) for i in xrange(len(X)): for nz in np.nonzero(D[i]): O[nz] += X[i] # save results np.savez('serbian-rw',diff=diff,W0=W0.as_matrix(),O=O,W=W)
def create_dataset(dataset_name, features):
    print('Creating "%s" dataset with the following features: %s.' % (dataset_name, ', '.join(features)), file=sys.stderr)

    dv = DictVectorizer()
    train_data = read_data('train.csv')
    train_customers, train_y, train_x, train_weights = zip(*make_features(slice_and_group(train_data), features))
    train_x = dv.fit_transform(train_x)

    os.mkdir(j(DATA_DIR, dataset_name))
    save('per-customer-train', dataset_name, dv, train_customers, train_y, train_x, train_weights)

    test_data = read_data('test.csv')
    test_customers, test_y, test_x, test_weights = zip(*make_features(test_data.groupby('customer_ID'), features))
    test_x = dv.transform(test_x)
    save('per-customer-test', dataset_name, dv, test_customers, test_y, test_x, test_weights)

    for cv_i, (train_raw, test_raw) in enumerate(cv(train_data, CV_GROUPS_COUNT)):
        dv = DictVectorizer()
        train_customers, train_y, train_x, train_weights = zip(*make_features(slice_and_group(train_raw), features))
        train_x = dv.fit_transform(train_x)
        save('cv%02d_per-customer-train' % cv_i, dataset_name, dv, train_customers, train_y, train_x, train_weights)

        test_customers, test_y, test_x, test_weights = zip(*make_features(slice_and_group(test_raw), features))
        test_x = dv.transform(test_x)
        save('cv%02d_per-customer-test' % cv_i, dataset_name, dv, test_customers, test_y, test_x, test_weights)
def add_sentence_level_features(sentence_objects, articles, service, sentiment=True):
    """
    add feature vector to sentence object.
    features:
        - bow
        - keywords
    """
    extra_stop = ["n't", "'s", "'m"]

    # BOW features
    vect_bow = DictVectorizer()
    X_bow = vect_bow.fit_transform(Counter(tokenize_sent(sent, extra_stop)) for sent in sentence_objects)

    # keyword features
    vect_keywords = DictVectorizer()
    keywords = get_all_keywords(articles)
    X_keyword = vect_keywords.fit_transform(make_keyword_dict(sent, keywords) for sent in sentence_objects)

    print "sentiment analysis"
    # sentiment features
    if sentiment:
        X_sentiment = [get_sentiment(sent["spacy_sent"], service) for sent in sentence_objects]

    # concatenate
    if sentiment:
        vects = hstack([X_bow, X_keyword, X_sentiment])
    else:
        vects = hstack([X_bow, X_keyword])

    # add vector to sentence_object
    num = 0
    for vect in vects.toarray():
        sentence_objects[num]["feature_vector"] = vect
        num += 1
    return vects
def bigram_word_feats(self, words):
    n = 200
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n)
    vec = DictVectorizer()
    measurements = dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
    vec.fit_transform(measurements).toarray()
    return vec
def getYearFeature(frame):
    all = getAllData().Publication_year
    labels = all.unique().tolist()
    label_dict = dict(zip(labels, range(len(labels))))
    dv = DictVectorizer()
    dv.fit_transform(label_dict)
    list_of_dicts = [{year: label_dict[year]} for year in frame.Publication_year]
    return dv.transform(list_of_dicts)
def get_feature_vectorizer(df, cols):
    """
    :return: a DictVectorizer fitted with a 1-hot encoding of the given feature columns
    """
    feature_vect = DictVectorizer(sparse=True)
    feature_vect.fit_transform(df[cols].to_dict(orient='records'))
    return feature_vect
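# A minimal usage sketch for the helper above (hypothetical DataFrame and column
# names): the fitted vectorizer is reused with transform() so new rows share the
# training column layout, and categories it has never seen are silently dropped.
import pandas as pd

df = pd.DataFrame({'city': ['NY', 'SF', 'NY'], 'beds': [1, 2, 3]})
vec = get_feature_vectorizer(df, ['city', 'beds'])

new_rows = pd.DataFrame({'city': ['SF', 'LA'], 'beds': [2, 1]})  # 'LA' is unseen
X_new = vec.transform(new_rows.to_dict(orient='records'))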
def categoricalFeatures():
    from sklearn.feature_extraction import DictVectorizer
    onehot_encoder = DictVectorizer()
    instances = [
        {'city': 'New York'},
        {'city': 'San Francisco'},
        {'city': 'Chapel Hill'}
    ]
    print onehot_encoder.fit_transform(instances).toarray()
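# For reference, a sketch of what the snippet above prints (DictVectorizer sorts
# feature names alphabetically when it is fitted, so 'city=Chapel Hill' comes first):
#
#   feature names -> ['city=Chapel Hill', 'city=New York', 'city=San Francisco']
#   fit_transform(instances).toarray()
#   -> [[0., 1., 0.],
#       [0., 0., 1.],
#       [1., 0., 0.]]
#
# (get_feature_names_out() is the scikit-learn >= 1.0 spelling; older releases,
# which several snippets here target, expose get_feature_names() instead.)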
def getLanguageFeature(frame):
    all = getAllData().Language
    sub = frame.Language
    all, sub = checkAndReplaceNan(all, sub, unicode(-1))
    labels = all.unique().tolist()
    label_dict = dict(zip(labels, range(len(labels))))
    dv = DictVectorizer()
    dv.fit_transform(label_dict)
    list_of_dicts = [{lan: label_dict[unicode(lan)]} for lan in sub]
    return dv.transform(list_of_dicts)
class SearnModelCla(SearnModel_Legacy): def __init__(self, feature_extractor, cr_tags, base_learner_fact, beta_decay_fn=lambda b: b - 0.1, positive_val=1, sparse=True): super(SearnModelCla, self).__init__(feature_extractor=feature_extractor, cr_tags=cr_tags, base_learner_fact=base_learner_fact, beta_decay_fn=beta_decay_fn, positive_val=positive_val, sparse=sparse) def train_parse_models(self, examples): models = {} self.current_parser_dict_vectorizer = DictVectorizer(sparse=True) xs = self.current_parser_dict_vectorizer.fit_transform(examples.xs) for action in PARSE_ACTIONS: # xgboost needs values in [0,1] ys = [1 if i > 0 else 0 for i in examples.get_labels_for(action)] weights = examples.get_weights_for(action) # the cost matrix has 4 cols - [fp,fn,tp,tn] # based on how we compute the costs, the fn cost will be non-zero for positive ground truth # else the fp cost will be non-zero. The other 3 cols will be zero lst_cost_mat = [] for lbl, cost in zip(ys, weights): fp,fn,tp,tn = 0.05,0.05,0.05,0.05 if lbl > 0: fn = cost else: fp = cost lst_cost_mat.append([fp,fn,tp,tn]) cost_mat = np.asanyarray(lst_cost_mat, dtype=np.float) mdl = self.base_learner_fact() mdl.fit(xs, ys, cost_mat) models[action] = mdl self.current_parser_models = models self.parser_models.append(models) def train_crel_models(self, examples): self.current_crel_dict_vectorizer = DictVectorizer(sparse=True) xs = self.current_crel_dict_vectorizer.fit_transform(examples.xs) ys = examples.get_labels() # all costs are equal cost_mat = np.ones((len(ys),4),dtype=np.float) # Keep this simple as not weighted model = self.base_learner_fact() model.fit(xs, ys, cost_mat) self.current_crel_model = model self.crel_models.append(model)
def dynamic_cross_val_predict(estimator, fv, esa_feature_list, unigram_feature_list, dynamic_X, y=None, cv=None, verbose=0, fit_params=None): print "dynamic predict cross val mit %s" % esa_feature_list + unigram_feature_list vec = DictVectorizer() tfidf = TfidfTransformer() X = vec.fit_transform(fv).toarray() # X = tfidf.fit_transform(X).toarray() X, y = cross_validation.indexable(X, y) cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator)) preds_blocks = [] cross_val_step = 0 for train, test in cv: fv_copy = copy.deepcopy(fv) #baue X in jedem Schritt neu for i in range(0,len(fv)): #jedes i steht für einen featuredict feature_dict = fv_copy[i] dynamic_vec = dynamic_X[cross_val_step] #zeigt auf esa_vec for feature in esa_feature_list: feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i]) #das i-te feature-dict mit esa-feature updaten for feature in unigram_feature_list: feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i]) #das i-te feature-dict mit esa-feature updaten X = vec.fit_transform(fv_copy).toarray() # X = tfidf.fit_transform(X).toarray() preds_blocks.append(cross_validation._fit_and_predict(cross_validation.clone(estimator), X, y, train, test, verbose, fit_params)) cross_val_step+=1 preds = [p for p, _ in preds_blocks] locs = np.concatenate([loc for _, loc in preds_blocks]) if not cross_validation._check_is_partition(locs, cross_validation._num_samples(X)): raise ValueError('cross_val_predict only works for partitions') inv_locs = np.empty(len(locs), dtype=int) inv_locs[locs] = np.arange(len(locs)) # Check for sparse predictions if sp.issparse(preds[0]): preds = sp.vstack(preds, format=preds[0].format) else: preds = np.concatenate(preds) return preds[inv_locs]
def show_tree(clf):
    """
    Create the data.dot file for the trained decision tree.
    :param clf:
    :return:
    """
    vec = DictVectorizer()
    vec.fit_transform(FEATURE_LIST)
    with open('data.dot', 'w') as f:
        f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
    print("[+] dot file created successfully")
def make_sessions_features(data, df_sessions): # Drop row with nan values from the "user_id" column as they're useless df_sessions = df_sessions.dropna(subset=["user_id"]) # print df_sessions # Frequency of devices - by user device_freq = df_sessions.groupby('user_id').device_type.value_counts() # Frequency of actions taken - by user action_freq = df_sessions.groupby('user_id').action.value_counts() # Total list of users users = data.id.values def feature_dict(df): f_dict = dict(list(df.groupby(level='user_id'))) res = {} for k, v in f_dict.items(): v.index = v.index.droplevel('user_id') res[k] = v.to_dict() return res # Make a dictionary with the frequencies { 'user_id' : {"IPhone": 2, "Windows": 1}} action_dict = feature_dict(action_freq) device_dict = feature_dict(device_freq) # Transform to a list of dictionaries action_rows = [action_dict.get(k, {}) for k in users] device_rows = [device_dict.get(k, {}) for k in users] device_transf = DictVectorizer() tf = device_transf.fit_transform(device_rows) action_transf = DictVectorizer() tf2 = action_transf.fit_transform(action_rows) # Concatenate the two datasets # Those are row vectors with the frequencies of both device and actions [0, 0, 0, 2, 0, 1, ...] features = sp.hstack([tf, tf2]) # We create a dataframe with the new features and we write it to disk df_sess_features = pd.DataFrame(features.todense()) df_sess_features['id'] = users #left joining data and sessions on user_id final = pd.merge(data, df_sess_features, how='left', left_on='id', right_on='id') final.ix[:, final.columns != 'age_bucket'].fillna(-1, inplace=True) # Using inplace because I have 8GB of RAM # final.ix[:, final.columns != 'age_bucket'] = final.ix[:, final.columns != 'age_bucket'].fillna(-1) final.drop(['id'], axis=1, inplace=True) return final
class StructuredModel: def __init__(self, category, attribute_function): self.category = category self.feature_dict = DictVectorizer() self.label_dict = DictVectorizer() self.get_attributes = attribute_function def train(self, X, Y_all, Y_star, Y_lim, n_iter=10, alpha_sgd=0.1, every_iter=None, adagrad=False, l1=0.0): logging.info('Converting into matrices') X = self.feature_dict.fit_transform(X) logging.info('X: %d x %d', *X.shape) Y_all = self.label_dict.fit_transform(Y_all) logging.info('Y_all: %d x %d', *Y_all.shape) Y_star = numpy.array(Y_star) logging.info('Y_star: %d', *Y_star.shape) Y_lim = numpy.array(Y_lim) logging.info('Y_lim: %d x %d', *Y_lim.shape) self.model = structlearn.StructuredClassifier(n_iter=n_iter, alpha_sgd=alpha_sgd) if every_iter: # call every_iter with StructuredModel and not StructuredClassifier every_iter2 = lambda it, model: every_iter(it, self) else: every_iter2 = every_iter if adagrad and l1: logging.info('Using Adagrad and L1 regularization, lambda:{}'.format(l1)) self.model.fit(X, Y_all, Y_star, Y_lim, every_iter=every_iter2, Adagrad=adagrad, l1_lambda=l1) def score_all(self, inflections, features): X = self.feature_dict.transform([features]) Y_all = [] for i, (tag, _) in enumerate(inflections): label = {attr: 1 for attr in self.get_attributes(self.category, tag)} Y_all.append(label) Y_all = self.label_dict.transform(Y_all) scores = self.model.predict_log_proba(X, Y_all) return [(score, tag, inflection) for score, (tag, inflection) in zip(scores, inflections)] @property def output_features(self): for label in self.label_dict.get_feature_names(): yield label def weights(self, label): j = self.label_dict.feature_names_.index(label) for i, feature in enumerate(self.feature_dict.get_feature_names()): yield feature, self.model.weights[i, j] for k, other_label in enumerate(self.label_dict.get_feature_names()): yield other_label, self.model.y_weights[j, k]
def reverse_transformation(self, bow_dict):
    """
    Reverse the transformation of a dictionary representation of BOW into numpy vectors
    :return:
    """
    assert isinstance(bow_dict, BaseDict) or isinstance(bow_dict, dict)
    vec = DictVectorizer()
    vec.fit_transform(bow_dict)
    return vec
def getTermStatistics(all_hits, es_index='memex', es_doc_type='page', es=None): if es is None: es = Elasticsearch('http://localhost:9200/') stats = [] docs = [] ttf = {} for i in range(0, len(all_hits), 100): hits = all_hits[i:i+100] term_res = es.mtermvectors(index=es_index, doc_type=es_doc_type, term_statistics=True, fields=['text'], ids=hits) #pprint.pprint(term_res['docs']) for doc in term_res['docs']: #pprint.pprint(doc) if doc.get('term_vectors'): if 'text' in doc['term_vectors']: docs.append(doc['_id']) res = terms_from_es_json(doc) stats.append(res) for k in res.keys(): ttf[k] = res[k]['ttf'] #else: # pprint.pprint(doc) #pprint.pprint(tfidfs) tfidfs = [] for stat in stats: tfidf={} for k in stat.keys(): tfidf[k] =stat[k]['tfidf'] tfidfs.append(tfidf) tfs = [] for stat in stats: tf={} for k in stat.keys(): tf[k] =stat[k]['tf'] tfs.append(tf) v_tfidf = DictVectorizer() v_tf = DictVectorizer() result = [v_tfidf.fit_transform(tfidfs), v_tf.fit_transform(tfs), ttf, v_tfidf.get_feature_names()] return result
def vector(train, test):
    X_train = train.ix[0:, 1:]
    y_train = train.Hazard
    vec = DictVectorizer()
    X_train = X_train.T.to_dict().values()
    X_train = vec.fit_transform(X_train)
    X_test = test.T.to_dict().values()
    # transform with the fitted vectorizer so train and test share the same columns
    X_test = vec.transform(X_test)
    return X_train, y_train, X_test
class myCityModel(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
    def __init__(self):
        self.v = DictVectorizer(sparse=False)

    def fit(self, X, y=None):
        city = [{'city': x['city']} for x in X]
        self.v.fit_transform(city)
        return self

    def transform(self, X):
        city = [{'city': x['city']} for x in X]
        retval = self.v.transform(city)
        return retval
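# A minimal usage sketch for the transformer above (hypothetical records and an
# arbitrary downstream estimator): because it implements fit/transform, it can be
# chained in a scikit-learn Pipeline ahead of a regressor.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

records = [{'city': 'Austin', 'stars': 4.0}, {'city': 'Boston', 'stars': 3.5}]
targets = [4.1, 3.4]

pipe = Pipeline([
    ('city_onehot', myCityModel()),   # one-hot encodes only the 'city' key
    ('reg', LinearRegression()),
])
pipe.fit(records, targets)
pipe.predict(records)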
def get_mushroom_data(): with open(mushroom_file, 'r') as dest_f: data_iter = csv.reader(dest_f, delimiter=",") # Prepare the data as dictionaries. Its not very neat, but it works. data = [ ( {row[0]: 1}, { feature_index: category for feature_index, category in enumerate(row[1:]) } ) for row in data_iter ] # It's always good to randomize the order random.shuffle(data) # Get the labels and the features as lists labels_, features_ = zip(*data) feature_vectorizer = DictVectorizer(sparse=False, dtype=np.uint8) label_vectorizer = DictVectorizer(sparse=False, dtype=np.uint8) # Create features and labels as numpy arrays with one hot encoding features = feature_vectorizer.fit_transform(features_) labels = label_vectorizer.fit_transform(labels_) # Print the meaning of the one-hot encoded features # for i, f in enumerate(feature_vectorizer.get_feature_names()): # print(f) # # for i, f in enumerate(label_vectorizer.get_feature_names()): # print(f) # Split into train and test. split_at = int(len(data) * 0.7) X_training, X_test = features[:split_at, :], features[split_at:, :] y_training, y_test = labels[:split_at, :], labels[split_at:, :] train = IndexableDataset({ 'features': X_training.astype(np.uint8), 'targets': y_training.astype(np.uint8) }) test = IndexableDataset({ 'features': X_test.astype(np.uint8), 'targets': y_test.astype(np.uint8) }) return train, test
def get_numerical_data(data_file): sf_df_data = pd.read_csv(data_file) # sf_df_test = pd.read_csv(test_file) ###### Change Date to Month and Year ######### sf_df_data["Dates"] = pd.to_datetime(sf_df_data["Dates"]) sf_df_data["Year"],sf_df_data["Month"] = sf_df_data['Dates'].apply(lambda x: str(x.year)), sf_df_data['Dates'].apply(lambda x: str(x.month)) # sf_df_test["Dates"] = pd.to_datetime(sf_df_test["Dates"]) # sf_df_test["Year"],sf_df_test["Month"] = sf_df_test['Dates'].apply(lambda x: str(x.year)), sf_df_test['Dates'].apply(lambda x: str(x.month)) print len(pd.unique(sf_df_data['Category'].values.ravel()).tolist()) print pd.unique(sf_df_data['Category'].values.ravel()).tolist() ######## To deal with categorical variables, we can make use of Pandas and DictVectorizer ########### cat_cols = ['Year','DayOfWeek','PdDistrict'] num_cols = ['X','Y'] num_data_X = sf_df_data[num_cols].as_matrix() # num_test_X = sf_df_test[num_cols].as_matrix() max_data = np.amax(abs(num_data_X),0) # max_test = np.amax(abs(num_test_X),0) ### Normalising data num_data_X = num_data_X/max_data # num_test_X = num_test_X/max_test cat_df_data_X = sf_df_data[cat_cols] cat_df_data_Y = sf_df_data[['Category']] cat_dict_data_X = cat_df_data_X.T.to_dict().values() # A list of dictionaries. cat_dict_data_Y = cat_df_data_Y.T.to_dict().values() # cat_df_test_X = sf_df_test[cat_cols] #cat_df_test_Y = sf_df_test[['Category']] # cat_dict_test_X = cat_df_test_X.T.to_dict().values() #cat_dict_test_Y = cat_df_test_Y.T.to_dict().values() vectorizer = DV(sparse=False) vec_data_X = vectorizer.fit_transform(cat_dict_data_X) data_Y = vectorizer.fit_transform(cat_dict_data_Y) # vec_test_X = vectorizer.fit_transform(cat_dict_test_X) #vec_test_Y = vectorizer.fit_transform(cat_dict_test_Y) # data_X = np.hstack((vec_data_X,num_data_X)) ##### remove the lat. and long. from the input data. # test_X = np.hstack((vec_test_X,num_test_X)) data_X = vec_data_X print 'Done converting categorical data' return (data_X,data_Y)
class Q4Transformer(base.BaseEstimator, base.TransformerMixin): ''' class variable: self.col; self.vectorizer ''' def __init__(self): self.col = 'attributes' # initialize the column name # flatten out dics of dicts def flatten_dict(self, Xdict): p_dict = Xdict.copy() for key in p_dict.keys(): #print key, p_dict[key], type(p_dict[key]) if type(p_dict[key]) == dict: # son is a dict, flatten son_dict = self.flatten_dict(p_dict[key]).copy() for son_key in son_dict.keys(): son_dict[key+'_'+son_key] = son_dict.pop(son_key) del p_dict[key] p_dict.update(son_dict) elif type(p_dict[key]) in [unicode,str]: # son is a string, concatate to key son_str = p_dict[key] p_dict[key] = 1 p_dict[key+'_'+son_str] = p_dict.pop(key) elif type(p_dict[key]) not in [bool, int, float]: raise ValueError("type error in flatten_dict!") return p_dict def fit(self, X, y=None): # flatten the train dict attr_train = [self.flatten_dict(record[self.col]) for record in X] # transform the training records self.vectorizer = DictVectorizer(sparse=False) self.vectorizer.fit_transform(attr_train) return self def transform(self, X): # transform the test record if type(X) is list: attr_X = [self.flatten_dict(record[self.col]) for record in X] else: attr_X = self.flatten_dict(X[self.col]) X_trans = self.vectorizer.transform(attr_X) return X_trans
def export(self, query, n_topics, n_words, title="PCA Export", fname="PCAExport"): vec = DictVectorizer() rows = topics_to_vectorspace(self.model, n_topics, n_words) X = vec.fit_transform(rows) pca = skPCA(n_components=2) X_pca = pca.fit(X.toarray()).transform(X.toarray()) match = [] for i in range(n_topics): topic = [t[1] for t in self.model.show_topic(i, len(self.dictionary.keys()))] m = None for word in topic: if word in query: match.append(word) break pyplot.figure() for i in range(X_pca.shape[0]): pyplot.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5) pyplot.text(X_pca[i, 0], X_pca[i, 1], s=' '.join([str(i), match[i]])) pyplot.title(title) pyplot.savefig(fname) pyplot.close()
def get_vector(name, feature_names, full_vector):
    """ Returns a complete feature vector """
    name_features = {}
    name_features["last_letter"] = name[-1]
    name_features["last_two"] = name[-2:]
    # 1 when the final letter is a vowel, 0 otherwise
    name_features["last_is_vowel"] = 1 if name[-1] in "aeiouy" else 0

    vectorizer = DictVectorizer()
    small_vector = vectorizer.fit_transform(name_features).toarray()[0]
    small_feature_names = vectorizer.get_feature_names()

    hit_count = 0
    for index, feature_name in enumerate(feature_names):
        if feature_name in small_feature_names:
            full_vector[index] = small_vector[small_feature_names.index(feature_name)]
            hit_count += 1
        else:
            full_vector[index] = 0

    assert hit_count == len(small_feature_names) == small_vector.shape[0]
    assert full_vector.shape[0] == len(feature_names)
    return full_vector
def pair_vectors(pairs, features, words, output_path): vectorizer = DictVectorizer() vectors = vectorizer.fit_transform(x[1] for x in features) vector_map = {word:vector for word, vector in itertools.izip((x[0].split('/')[0] for x in features), vectors)} # Positive examples positive = [] record = [] for specific, general in pairs: positive.append(vector_map[general] - vector_map[specific]) record.append( (specific, general, 1) ) pair_set = set([tuple(x) for x in pairs]) non_positive = [] for i in range(len(positive)): first = second = None while first == second or (first, second) in pair_set: first = words[random.randint(len(words))] second = words[random.randint(len(words))] non_positive.append(vector_map[second] - vector_map[first]) record.append( (first, second, 0) ) data = vstack(positive + non_positive) target = [1]*len(positive) + [0]*len(non_positive) # Save dataset with open(os.path.join(output_path,'wn-noun-dependencies.mat'), 'wb') as data_file: dump_svmlight_file(data, target, data_file) with open(os.path.join(output_path,'wn-noun-dependencies.json'), 'w') as record_file: json.dump(record, record_file)
def extractData(features, examples=None, scaler=None, featureOrder=None, scaling=False): vec = DictVectorizer() samples = vec.fit_transform(features) featureNames = vec.get_feature_names() if (featureOrder != None): indices = [featureNames.index(feature) for feature in featureOrder] samples = samples[:, indices] imp = pp.Imputer(missing_values='NaN', strategy='mean') if (examples == None): imp.fit(samples) else : imp.fit(examples) impSamples = imp.transform(samples) if (impSamples.shape == samples.shape): samples = impSamples else: print("too few samples to replace missing values, using 0's") samples[shouldReplace(samples)]=0 # if (scaler == None): # scaler = pp.StandardScaler(with_mean=False) # scaler.fit(samples) # samples = scaler.transform(samples) if (scaling): samples = pp.scale(samples,with_mean=False) if (sprs.isspmatrix(samples)): samples = samples.todense() return [samples, featureNames,imp,scaler]
# Reshape the raw rows into a training set
feature_list = []
label_list = []
for row in reader:
    label_list.append(row[-1])
    row_dict = {}
    for i in range(1, len(row) - 1):
        row_dict[head[i]] = row[i]
    feature_list.append(row_dict)
print(feature_list)
# print(label_list)

# Convert to numeric features: a categorical column such as age with three
# levels becomes three indicator positions, e.g. 001, 010, 100
vec = DictVectorizer()
dummyX = vec.fit_transform(feature_list).toarray()
print("dummyX: " + str(dummyX))
print(vec.get_feature_names())

lab = preprocessing.LabelBinarizer()
dummyY = lab.fit_transform(label_list)
# le = preprocessing.LabelEncoder()
# le.fit_transform()  # maps a column with n categories to the integers 0..n-1
print("dummyY: " + str(dummyY))

# Prepare for training and call the model
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("ctf: " + str(clf))

# Visualize model
tf_idf = TfidfVectorizer(min_df=5)
train_full_descr_transformed = tf_idf.fit_transform(
    train_data['FullDescription'].values.astype('U'), y=None)
test_full_descr_transformed = tf_idf.transform(
    test_data['FullDescription'].values.astype('U'))

train_data['LocationNormalized'].fillna('nan', inplace=True)
train_data['ContractTime'].fillna('nan', inplace=True)

from sklearn.feature_extraction import DictVectorizer
enc = DictVectorizer()
X_train_categ = enc.fit_transform(
    train_data[['LocationNormalized', 'ContractTime']].to_dict('records'))
X_test_categ = enc.transform(
    test_data[['LocationNormalized', 'ContractTime']].to_dict('records'))

"""
print ('X_train_categ size: ', X_train_categ.size, '\n')
print ('X_test_categ size: ', X_test_categ.size, '\n')
print ('test_data[[LocationNormalized, ContractTime]: ', test_data[['LocationNormalized', 'ContractTime']], '\n')
print ('X_train_categ: ', X_train_categ, '\n')
print ('train_full_descr_transformed size: ', train_full_descr_transformed.size, '\n')
print ('train_data[LocationNormalized] size: ', train_data['LocationNormalized'].size, '\n')
"""

from scipy.sparse import hstack
transformed_data = hstack(
    [train_full_descr_transformed, X_train_categ])
class BotView: """ This class starts with a temporal bot cluster. If the cluster has above a minimum number of edges connecting the members, then it will examine the followers sets and find common followers who are highly connectd to many of the cluster members. It will then get the followers of these highly connected followers and repeat the process until enough steps have been taken or no more followers satisfy the criterion. """ def __init__(self, tokens_ar, users, file_path): """ Initializes the data structures, connects to the PostgreSQL database, and sets up the Twitter API connection :param tokens_ar: array of tokens for Twitter API :param cluster: Seed cluster for this object :return: """ # Clique is the current set of all highly-connected nodes at all levels self.clique = set() # to_check is the nodes that we need to find followers for self.to_check = set() # user_followers is the dictionary of node_id: followers_list for this round. Resets every round. WHY?? self.got_followers = set() self.current_level_users = [] # followers is the set of all followers for all nodes in to_check. Resets every round. WHY?? self.followers = [] self.user_info = {} self.users = set() self.current_level_timelines = {} self.ignore_users = set() self.current_filepath = file_path self.stream_filepath = '/home/amanda/bigDisk/Twitter/Debot2/stream' self.user_features = {} self.features_list = [] self.vec = DictVectorizer() try: load_dotenv('/home/amanda/bigDisk/Twitter/creds/.env') username = os.getenv('DATABASE_USER') password = os.getenv('DATABASE_PASSWORD') conn_string = "dbname='twitter' user="******" password = "******"Don't have id for: " + user self.clique.add((user, self.level)) self.to_check.add(user) self.n = float(len(self.clique)) self.original_n = self.n def explore(self): """ Pops items from to_check and adds their followers to user_followers :return: """ i = 0 # Need to reset followers and user_followers for this round self.followers = [] self.current_level_users = [] while self.to_check: user = self.to_check.pop() # if we haven't already found the followers for this user if user not in self.got_followers and user not in self.ignore_users: self.cur.execute("SELECT followers FROM name_id WHERE user_id = %s;", (str(user),)) f = self.cur.fetchone() # If we have queried this user in the past it will be in the db, so we don't have to waste a query on it if f: if f[0]: if f[0] == '[]': self.ignore_users.add(user) continue try: followers = ast.literal_eval(f[0]) except ValueError: self.ignore_users.add(user) continue self.got_followers.add(user) self.followers.extend(followers) continue # Otherwise we query the Twitter API for this user's followers self.cur.execute('SELECT deleted, suspended, other_error FROM followers WHERE user_id = %s;', (str(user),)) f = self.cur.fetchone() if f: if f[0] or f[1] or f[2]: self.ignore_users.add(user) continue self.query_api(user) def query_api(self, user): """ Query Twitter API for the followers of a given user. 
Add this entry to user_followers, add to followers, and add to database :param user: The user of interest :return: """ try: followers = self.api.followers_ids(user) self.got_followers.add(user) self.followers.extend(followers) self.cur.execute('SELECT * FROM name_id WHERE user_id = %s;', (str(user),)) f = self.cur.fetchone() if f: self.cur.execute('UPDATE name_id SET followers = (%s) WHERE user_id = %s', (str(followers), str(user))) else: self.cur.execute('INSERT INTO name_id (user_id, followers) VALUES (%s, %s);', (str(user), str(followers))) self.con.commit() #print "Added followers for " + str(user) except tweepy.TweepError: traceback.print_exc() print '>>>>>>>>>>>>>>> exception: ' + str(user) self.ignore_users.add(user) self.cur.execute('SELECT * FROM name_id WHERE user_id = %s;', (str(user),)) f = self.cur.fetchone() if f: self.cur.execute('UPDATE name_id SET followers = (%s) WHERE user_id = %s', ('[]', str(user))) else: self.cur.execute('INSERT INTO name_id (user_id, followers) VALUES (%s, %s);', (str(user), '[]')) self.con.commit() def find_bots(self, priors): print "Getting all user info..." self.users_to_query = set() followers_set = set(self.followers) print "Number of followers: " + str(len(self.followers)) follower_counts = Counter(self.followers).most_common() # should fix this to be a more precise measure size_to_keep = int(.15*len(self.followers)) connectedness_threshold = floor(0.3*self.n) tmp_followers = [f[0] for f in follower_counts if f[1] >= connectedness_threshold] if len(tmp_followers) < size_to_keep: tmp_followers.extend([f[0] for f in follower_counts[:size_to_keep] if f[1] > 1]) followers_set = set(tmp_followers) print "Number of connected followers: " + str(len(followers_set)) for follower in followers_set: user_info = None follower = str(follower) if follower not in self.users and follower not in self.ignore_users: self.cur.execute('SELECT suspended, deleted, other_error, user_info_json FROM followers WHERE user_id = %s', (follower,)) record = self.cur.fetchone() if record: if record[0] or record[1] or record[2]: self.ignore_users.add(follower) # print "User is suspended or deleted" continue if record[3]: # print "Already have profile information for user number " + follower self.user_info[follower] = ast.literal_eval(record[3]) continue self.users_to_query.add(follower) get_user_info(self) print "Getting all timeline info and extracting features" for follower in followers_set: timeline = None follower = str(follower) if follower not in self.users and follower not in self.ignore_users: self.users.add(follower) self.cur.execute('SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s', (follower,)) record = self.cur.fetchone() if record: if record[0] or record[1] or record[2]: self.ignore_users.add(follower) # print "User is suspended or deleted" continue if record[3]: # print "Already have timeline information for user number " + follower # Have to read in file to get timeline info timeline = get_timeline_from_file(self, follower) else: timeline = get_user_timeline(self, follower) else: timeline = get_user_timeline(self, follower) if timeline and self.user_info.get(follower) and len(timeline) > 50: gf = GetFeatures(follower, self.user_info[follower], timeline) try: gf.user_features() gf.collect_tweets() gf.content_features() gf.temporal_features() except Exception as e: print "ERROR GETTING FEATURES" print e print follower print self.user_info[follower] # need to incorporate other network features #gf.features['num_shared_edges'] = 
follower_counts[user] #cself.user_features[user] = gf.features self.current_level_users.append(follower) self.features_list.append(gf.features) # Axis=0 should be vertical len_priors = len(priors) current_features = priors current_features.extend(self.features_list) print "Performing anomaly detection" #json.dump(priors, open('test.json', 'w'), indent=4, separators=(',', ': ')) X = self.vec.fit_transform(current_features).toarray() current_features = {} X_norm = normalize(X) #print np.any(np.isnan(X)) #print np.all(np.isfinite(X)) print X.shape # X = np.stack([current_features, priors], axis=0) Every round will find outliers, how do we stop exploring? clf = LocalOutlierFactor(n_neighbors=20) clf.fit(X) check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"]) if X is not None: X = check_array(X, accept_sparse='csr') y_pred = clf._decision_function(X) else: y_pred = clf.negative_outlier_factor_ #y_pred = clf.fit_predict(X) y_pred_new = y_pred[len_priors:] # Do anomaly detection and set connected followers to certain outliers # this line is a stand-in users_scores = zip(self.current_level_users, y_pred_new) connected_followers = [u[0] for u in users_scores if u[1] <= clf.threshold_] #How do I add back in the outliers to the anomaly detection? Mueen said not to so I will leave for now self.level += 1 # Add highly connected followers to the clique and to_check for follower in connected_followers: self.clique.add((follower, self.level)) self.to_check.add(follower) print self.clique self.n = float(len(self.clique)) print "Current size of cluster: " + str(self.n)
df >>= (
    rename(action_type='action', counts='count')
    >> arrange('minute')
    >> group_by('country', 'product', 'site', 'action_type')
    >> mutate(**{'counts_t-%02.f' % i: X.counts.shift(i) for i in range(1, time_step)})
    >> mutate(**{'counts_t+%02.f' % i: X.counts.shift(-i) for i in range(1, time_step + 1)})
    >> r(X.dropna()))

#########################################################################################################
######## The code below is for production use (train the model on all data up to the current time point)
########################################### MLP #########################################################
df_feature = df.iloc[:, :5]
vec = DictVectorizer(sparse=False)
data_feature = vec.fit_transform(df_feature.to_dict('records'))
counts_index = int(where(pd.Series(vec.get_feature_names()) == 'counts'))

df_x = df.iloc[:, 4:5 + time_step] >> drop('minute')
df_x = df_x[df_x.columns.sort_values(ascending=False)]
df_y = df.iloc[:, 5 + time_step:]

data_x = df_x.as_matrix().reshape(df_x.shape[0], df_x.shape[1], 1)
data_y = df_y.as_matrix().reshape(df_y.shape[0], df_y.shape[1], 1)

data_feature = data_feature.reshape(data_feature.shape[0], 1, data_feature.shape[1])
data_feature = np.concatenate([data_feature for i in range(10)], axis=1)
data_feature[:, :, counts_index] = data_x[:, :, 0]

train_x = torch.from_numpy(data_feature).float()
train_y = torch.from_numpy(data_y).float()
class SklearnClassifier(ClassifierI): """Wrapper for scikit-learn classifiers.""" def __init__(self, estimator, dtype=float, sparse=True): """ :param estimator: scikit-learn classifier object. :param dtype: data type used when building feature array. scikit-learn estimators work exclusively on numeric data. The default value should be fine for almost all situations. :param sparse: Whether to use sparse matrices internally. The estimator must support these; not all scikit-learn classifiers do (see their respective documentation and look for "sparse matrix"). The default value is True, since most NLP problems involve sparse feature sets. Setting this to False may take a great amount of memory. :type sparse: boolean. """ self._clf = estimator self._encoder = LabelEncoder() self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse) def __repr__(self): return "<SklearnClassifier(%r)>" % self._clf def classify_many(self, featuresets): """Classify a batch of samples. :param featuresets: An iterable over featuresets, each a dict mapping strings to either numbers, booleans or strings. :return: The predicted class label for each input sample. :rtype: list """ X = self._vectorizer.transform(featuresets) classes = self._encoder.classes_ return [classes[i] for i in self._clf.predict(X)] def prob_classify_many(self, featuresets): """Compute per-class probabilities for a batch of samples. :param featuresets: An iterable over featuresets, each a dict mapping strings to either numbers, booleans or strings. :rtype: list of ``ProbDistI`` """ X = self._vectorizer.transform(featuresets) y_proba_list = self._clf.predict_proba(X) return [self._make_probdist(y_proba) for y_proba in y_proba_list] def labels(self): """The class labels used by this classifier. :rtype: list """ return list(self._encoder.classes_) def train(self, labeled_featuresets): """ Train (fit) the scikit-learn estimator. :param labeled_featuresets: A list of ``(featureset, label)`` where each ``featureset`` is a dict mapping strings to either numbers, booleans or strings. """ X, y = list(zip(*labeled_featuresets)) X = self._vectorizer.fit_transform(X) y = self._encoder.fit_transform(y) self._clf.fit(X, y) return self def _make_probdist(self, y_proba): classes = self._encoder.classes_ return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})
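# A short usage sketch for the wrapper above, assuming the familiar
# nltk.classify.scikitlearn.SklearnClassifier interface (featuresets are plain
# dicts, labels are strings; the estimator here is just an example choice):
from sklearn.naive_bayes import MultinomialNB

train_set = [
    ({'last_letter': 'a', 'length': 5}, 'female'),
    ({'last_letter': 'k', 'length': 4}, 'male'),
]
classif = SklearnClassifier(MultinomialNB()).train(train_set)
classif.classify_many([{'last_letter': 'a', 'length': 6}])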
for post in posts:
    for comment in post['comments']:  # get the reader comments under each Gossiping-board post
        l = comment['content'].strip()  # strip newlines and surrounding characters
        if l and comment['score'] != 0:
            d = defaultdict(int)
            for w in jieba.cut(l):  # w is a token obtained by segmenting the text in l
                d[w] += 1
            if len(d) > 0:
                c_scores.append(1 if comment['score'] > 0 else 0)  # label for each comment (upvote/downvote)
                c_words.append(d)

# convert to vectors
c_dvec = DictVectorizer()
c_tfidf = TfidfTransformer()
c_vector = c_dvec.fit_transform(c_words)
c_X = c_tfidf.fit_transform(c_vector)  # turn the segmented comment counts from all 1000 posts into vectors and compute tf-idf

# build and train the classifier
c_svc = LinearSVC()
c_svc.fit(c_X, c_scores)

# classify the sentiment of a comment
def comment_sentiment_classifier(model, dvec, tfidf, text):
    l = text.strip()  # strip newlines and surrounding characters
    d = defaultdict(int)
    for w in jieba.cut(l):  # w is a token obtained by segmenting the text in l
        d[w] += 1
testdata = pd.read_csv(this_folder + "/insurance-test.csv")

X_train = traindata[[
    'Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel',
    'Vintage'
]]
Y_train = traindata['Response']
X_test = testdata[[
    'Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel',
    'Vintage'
]]

vec = DictVectorizer()
X_train = vec.fit_transform(X_train.to_dict(orient="records"))
X_test = vec.transform(X_test.to_dict(orient="records"))

gnb = GaussianNB()
gnb.fit(X_train.toarray(), Y_train)  # input X,y for training
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

Y_test1 = gnb.predict(X_test.toarray())
output = pd.DataFrame({'id': testdata['id'], 'Response': Y_test1})
output.to_csv('Bayes_gnb.csv', index=False)

Y_test2 = mnb.predict(X_test.toarray())
output = pd.DataFrame({'id': testdata['id'], 'Response': Y_test2})
output.to_csv('Bayes_mnb.csv', index=False)
pData.feature_dicts[d - 1] for d in combined_partitions_ids[0] ] test_dicts = pData.feature_dicts = [ pData.feature_dicts[d - 1] for d in combined_partitions_ids[1] ] train_dicts = pData.feature_dicts = [ pData.feature_dicts[d - 1] for d in combined_partitions_ids[2] ] binary_genres = combined_partitions_binary_genres[0] ## Begin function block #convert to tf-idf model tfidf = TfidfTransformer() vec = DictVectorizer() vect = vec.fit_transform(feature_select_dicts) adjusted = tfidf.fit_transform(vect) term_indices = list(vec.vocabulary_.items()) #alphabetical order term_indices.sort(key=operator.itemgetter(1)) term_list = [i[0] for i in term_indices] data = adjusted.toarray() p_tuples = [] for column in data.T: p, c = spearmanr(column, binary_genres) f_tuple = (p, c) p_tuples.append(f_tuple)
def byte_ngram(files_list, addrlength=32, n=1): dicts_list = [] total_files = len(files_list) bad_files_names = [] for idx, file_name in enumerate(files_list): bytes_file = DATASET_DIR + file_name + '.bytes.gz' try: with gzip.open(bytes_file, 'rt') as fp: bytedict = {} hex_seq = "" for line in fp.readlines(): if not line.strip(): continue else: address = int(addrlength / 4) # hex to bytes # ensure that addresses values will not be counted # in the ngram calculation hex_seq = hex_seq + line[address:].strip() hex_seq = hex_seq.replace(" ", "") for i in range(0, len(hex_seq) - 1, 2): # ignore bytes that contain the "?" character if hex_seq[i] == "?" or hex_seq[i + 1] == "?": continue if 2 * n + i > len(hex_seq): break gram = hex_seq[i:(2 * n + i)] if gram not in bytedict.keys(): bytedict[gram] = 1 else: bytedict[gram] += 1 dicts_list.append(bytedict) except Exception as e: bad_files_names.append(file_name) log_exception(e, sys.argv[0], bytes_file) # progress bars always save my sanity progress_bar(idx + 1, total_files, 50) # log the corrupted files for future reference if len(bad_files_names) > 0: with open('bad_bytes_files.txt', 'w') as bfp: for name in bad_files_names: bfp.write(name + '.bytes\n') # convert list of dictionaries to a byte ngram count numpy array vec = DictVectorizer() ngram_freq = vec.fit_transform(dicts_list).toarray() ngram_freq_df = pd.DataFrame(ngram_freq, columns=vec.get_feature_names()) # store frequency of each byte ngram ngram_freq_df.to_csv('features/' + str(n) + 'gram_byte_freq.csv') save_obj(ngram_freq_df, str(n) + 'gram_byte_freq') # transform ngram frequency array to ngram tfidf array transformer = TfidfTransformer(smooth_idf=False) ngram_tfidf = transformer.fit_transform(ngram_freq) # store tfidf of each byte ngram ngram_tfidf_df = pd.DataFrame(ngram_tfidf.todense(), columns=vec.get_feature_names()) ngram_tfidf_df.to_csv('features/' + str(n) + 'gram_byte_tfidf.csv') save_obj(ngram_tfidf_df, str(n) + 'gram_byte_tfidf') return ngram_tfidf_df
def ReadProcTrainData(features, labels): train_TXT = [] train_data = read_data(train, train_TXT) #test = read_data(test) print train_data[ 'Leon is an East Village gem: casual but hip, with well prepared basic French bistro fare, good specials, a warm and lively atmosphere.'] print train_data[ 'Leon is an East Village gem: casual but hip, with well prepared basic French bistro fare, good specials, a warm and lively atmosphere.'][ 1] print train_data[ 'Leon is an East Village gem: casual but hip, with well prepared basic French bistro fare, good specials, a warm and lively atmosphere.'][ 1][0] print train_data[ 'Leon is an East Village gem: casual but hip, with well prepared basic French bistro fare, good specials, a warm and lively atmosphere.'][ 1][1] print train_data[ 'Leon is an East Village gem: casual but hip, with well prepared basic French bistro fare, good specials, a warm and lively atmosphere.'][ 1][2] print train_data[ 'Leon is an East Village gem: casual but hip, with well prepared basic French bistro fare, good specials, a warm and lively atmosphere.'][ 1][3] ############ split data before proceeding ########### #train_TXT = train_TXT[:100] ########################################## GLOBAL ########################################## ############################################################################################################################################# featuredicts = [] ##################################### LOCAL and APPEND ####################################################### l = len(train_TXT) pl = negl = nutl = el = 0 prev_pol = "" prev_idn = "" pol_arr = [] for i in range(l): dlen = len(train_data[train_TXT[i]]) for k in range(dlen): #global_features.append(txt_corp_feats[i]) sentence = train_TXT[i] target = train_data[train_TXT[i]][k][0] cat = train_data[train_TXT[i]][k][1] pol = train_data[train_TXT[i]][k][2] frm = train_data[train_TXT[i]][k][3] to = train_data[train_TXT[i]][k][4] idn = train_data[train_TXT[i]][k][5] if idn == prev_idn: pol_arr.append(prev_pol) else: pol_arr = [] if pol == 'positive' or pol == 'negative' or pol == "neutral": t = TextClass(sentence, target, cat, pol, frm, to, idn, prev_pol, prev_idn, pol_arr) featuredicts.append(t.baselinefeatures()) if t.pol == 'positive': labels.append(1) pl += 1 elif t.pol == 'negative': labels.append(0) negl += 1 elif t.pol == "neutral": labels.append(2) nutl += 1 else: el += 1 prev_pol = t.pol prev_idn = idn print "counts ", pl, negl, nutl, el #sys.exit(0) vec = DictVectorizer() local_features = vec.fit_transform(featuredicts).toarray() global GLOBAL_VEC GLOBAL_VEC = vec #### append local and global features ''' features = [] for i in range(len(global_features)): features.append(np.concatenate((global_features[i], local_features[i]))) features = np.asarray(features) ''' #print local_features print() features.append(local_features) labels = np.array(labels)
from sklearn.feature_extraction import DictVectorizer

'''one-hot encoding'''
onehot_encoder = DictVectorizer()
X = [{'city': 'New York'}, {'city': 'San Francisco'}, {'city': 'Chapel Hill'}]
print(onehot_encoder.fit_transform(X).toarray())

'''feature standardization'''
# equivalent to StandardScaler
from sklearn import preprocessing
import numpy as np

X = np.array([[0., 0., 5., 13., 9., 1.],
              [0., 0., 13., 15., 10., 15.],
              [0., 3., 15., 2., 0., 11.]])
print(preprocessing.scale(X))

# handles outliers better
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)
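# A small supplementary sketch (toy dicts of my own) showing how DictVectorizer
# treats mixed records: string values are expanded into "feature=value" indicator
# columns, while numeric values are passed through unchanged.
from sklearn.feature_extraction import DictVectorizer

mixed = [
    {'city': 'New York', 'temperature': 21.0},
    {'city': 'Chapel Hill', 'temperature': 28.5},
]
vec = DictVectorizer(sparse=False)
print(vec.fit_transform(mixed))
# columns (alphabetical): city=Chapel Hill, city=New York, temperature
# [[ 0.   1.  21. ]
#  [ 1.   0.  28.5]]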
def train(data, classifier_file): # do not change the heading of the function data_list = data model_x = [] model_y = [] vo_list = { 'IH', 'UW', 'OY', 'AH', 'ER', 'EY', 'AO', 'AW', 'AY', 'EH', 'AE', 'UH', 'IY', 'AA', 'OW' } co_list = { 'W', 'K', 'HH', 'G', 'JH', 'Z', 'Y', 'N', 'V', 'SH', 'L', 'NG', 'S', 'CH', 'R', 'D', 'B', 'TH', 'F', 'DH', 'T', 'P', 'M', 'ZH' } strong_suffixes = { 'al', 'ance', 'ancy', 'ant', 'ard', 'ary', 'àte', 'auto', 'ence', 'ency', 'ent', 'ery', 'est', 'ial', 'ian', 'iana', 'en', 'ésce', 'ic', 'ify', 'ine', 'ion', 'tion', 'ity', 'ive', 'ory', 'ous', 'ual', 'ure', 'wide', 'y', 'se', 'ade', 'e', 'ee', 'een', 'eer', 'ese', 'esque', 'ette', 'eur', 'ier', 'oon', 'que' } strong_prefixes = { 'ad', 'co', 'con', 'counter', 'de', 'di', 'dis', 'e', 'en', 'ex', 'in', 'mid', 'ob', 'para', 'pre', 're', 'sub', 'a', 'be', 'with', 'for' } neutral_prefixes = { 'down', 'fore', 'mis', 'over', 'out', 'un', 'under', 'up', 'anti', 'bi', 'non', 'pro', 'tri', 'contra', 'counta', 'de', 'dis', 'extra', 'inter', 'intro', 'multi', 'non', 'post', 'retro', 'super', 'trans', 'ultra' } neutral_suffixes = { 'able', 'age', 'al', 'ate', 'ed', 'en', 'er', 'est', 'ful', 'hood', 'ible', 'ing', 'ile', 'ish', 'ism', 'ist', 'ize', 'less', 'like', 'ly' 'man', 'ment', 'most', 'ness', 'old', 's', 'ship', 'some', 'th', 'ward', 'wise', 'y' } suffixes = { 'inal', 'ain', 'tion', 'sion', 'osis', 'oon', 'sce', 'que', 'ette', 'eer', 'ee', 'aire', 'able', 'ible', 'acy', 'cy', 'ade', 'age', 'al', 'al', 'ial', 'ical', 'an', 'ance', 'ence', 'ancy', 'ency', 'ant', 'ent', 'ant', 'ent', 'ient', 'ar', 'ary', 'ard', 'art', 'ate', 'ate', 'ate', 'ation', 'cade', 'drome', 'ed', 'ed', 'en', 'en', 'ence', 'ency', 'er', 'ier', 'er', 'or', 'er', 'or', 'ery', 'es', 'ese', 'ies', 'es', 'ies', 'ess', 'est', 'iest', 'fold', 'ful', 'ful', 'fy', 'ia', 'ian', 'iatry', 'ic', 'ic', 'ice', 'ify', 'ile', 'ing', 'ion', 'ish', 'ism', 'ist', 'ite', 'ity', 'ive', 'ive', 'ative', 'itive', 'ize', 'less', 'ly', 'ment', 'ness', 'or', 'ory', 'ous', 'eous', 'ose', 'ious', 'ship', 'ster', 'ure', 'ward', 'wise', 'ize', 'phy', 'ogy' } prefixes = { 'ac', 'ad', 'af', 'ag', 'al', 'an', 'ap', 'as', 'at', 'an', 'ab', 'abs', 'acer', 'acid', 'acri', 'act', 'ag', 'acu', 'aer', 'aero', 'ag', 'agi', 'ig', 'act', 'agri', 'agro', 'alb', 'albo', 'ali', 'allo', 'alter', 'alt', 'am', 'ami', 'amor', 'ambi', 'ambul', 'ana', 'ano', 'andr', 'andro', 'ang', 'anim', 'ann', 'annu', 'enni', 'ante', 'anthrop', 'anti', 'ant', 'anti', 'antico', 'apo', 'ap', 'aph', 'aqu', 'arch', 'aster', 'astr', 'auc', 'aug', 'aut', 'aud', 'audi', 'aur', 'aus', 'aug', 'auc', 'aut', 'auto', 'bar', 'be', 'belli', 'bene', 'bi', 'bine', 'bibl', 'bibli', 'biblio', 'bio', 'bi', 'brev', 'cad', 'cap', 'cas', 'ceiv', 'cept', 'capt', 'cid', 'cip', 'cad', 'cas', 'calor', 'capit', 'capt', 'carn', 'cat', 'cata', 'cath', 'caus', 'caut', 'cause', 'cuse', 'cus', 'ceas', 'ced', 'cede', 'ceed', 'cess', 'cent', 'centr', 'centri', 'chrom', 'chron', 'cide', 'cis', 'cise', 'circum', 'cit', 'civ', 'clam', 'claim', 'clin', 'clud', 'clus claus', 'co', 'cog', 'col', 'coll', 'con', 'com', 'cor', 'cogn', 'gnos', 'com', 'con', 'contr', 'contra', 'counter', 'cord', 'cor', 'cardi', 'corp', 'cort', 'cosm', 'cour', 'cur', 'curr', 'curs', 'crat', 'cracy', 'cre', 'cresc', 'cret', 'crease', 'crea', 'cred', 'cresc', 'cret', 'crease', 'cru', 'crit', 'cur', 'curs', 'cura', 'cycl', 'cyclo', 'de', 'dec', 'deca', 'dec', 'dign', 'dei', 'div', 'dem', 'demo', 'dent', 'dont', 'derm', 'di', 'dy', 'dia', 'dic', 'dict', 'dit', 'dis', 'dif', 'dit', 
'doc', 'doct', 'domin', 'don', 'dorm', 'dox', 'duc', 'duct', 'dura', 'dynam', 'dys', 'ec', 'eco', 'ecto', 'en', 'em', 'end', 'epi', 'equi', 'erg', 'ev', 'et', 'ex', 'exter', 'extra', 'extro', 'fa', 'fess', 'fac', 'fact', 'fec', 'fect', 'fic', 'fas', 'fea', 'fall', 'fals', 'femto', 'fer', 'fic', 'feign', 'fain', 'fit', 'feat', 'fid', 'fid', 'fide', 'feder', 'fig', 'fila', 'fili', 'fin', 'fix', 'flex', 'flect', 'flict', 'flu', 'fluc', 'fluv', 'flux', 'for', 'fore', 'forc', 'fort', 'form', 'fract', 'frag', 'frai', 'fuge', 'fuse', 'gam', 'gastr', 'gastro', 'gen', 'gen', 'geo', 'germ', 'gest', 'giga', 'gin', 'gloss', 'glot', 'glu', 'glo', 'gor', 'grad', 'gress', 'gree', 'graph', 'gram', 'graf', 'grat', 'grav', 'greg', 'hale', 'heal', 'helio', 'hema', 'hemo', 'her', 'here', 'hes', 'hetero', 'hex', 'ses', 'sex', 'h**o', 'hum', 'human', 'hydr', 'hydra', 'hydro', 'hyper', 'hypn', 'an', 'ics', 'ignis', 'in', 'im', 'in', 'im', 'il', 'ir', 'infra', 'inter', 'intra', 'intro', 'ty', 'jac', 'ject', 'join', 'junct', 'judice', 'jug', 'junct', 'just', 'juven', 'labor', 'lau', 'lav', 'lot', 'lut', 'lect', 'leg', 'lig', 'leg', 'levi', 'lex', 'leag', 'leg', 'liber', 'liver', 'lide', 'liter', 'loc', 'loco', 'log', 'logo', 'ology', 'loqu', 'locut', 'luc', 'lum', 'lun', 'lus', 'lust', 'lude', 'macr', 'macer', 'magn', 'main', 'mal', 'man', 'manu', 'mand', 'mania', 'mar', 'mari', 'mer', 'matri', 'medi', 'mega', 'mem', 'ment', 'meso', 'meta', 'meter', 'metr', 'micro', 'migra', 'mill', 'kilo', 'milli', 'min', 'mis', 'mit', 'miss', 'mob', 'mov', 'mot', 'mon', 'mono', 'mor', 'mort', 'morph', 'multi', 'nano', 'nasc', 'nat', 'gnant', 'nai', 'nat', 'nasc', 'neo', 'neur', 'nom', 'nom', 'nym', 'nomen', 'nomin', 'non', 'non', 'nov', 'nox', 'noc', 'numer', 'numisma', 'ob', 'oc', 'of', 'op', 'oct', 'oligo', 'omni', 'onym', 'oper', 'ortho', 'over', 'pac', 'pair', 'pare', 'paleo', 'pan', 'para', 'pat', 'pass', 'path', 'pater', 'patr', 'path', 'pathy', 'ped', 'pod', 'pedo', 'pel', 'puls', 'pend', 'pens', 'pond', 'per', 'peri', 'phage', 'phan', 'phas', 'phen', 'fan', 'phant', 'fant', 'phe', 'phil', 'phlegma', 'phobia', 'phobos', 'phon', 'phot', 'photo', 'pico', 'pict', 'plac', 'plais', 'pli', 'ply', 'plore', 'plu', 'plur', 'plus', 'pneuma', 'pneumon', 'pod', 'poli', 'poly', 'pon', 'pos', 'pound', 'pop', 'port', 'portion', 'post', 'pot', 'pre', 'pur', 'prehendere', 'prin', 'prim', 'prime', 'pro', 'proto', 'psych', 'punct', 'pute', 'quat', 'quad', 'quint', 'penta', 'quip', 'quir', 'quis', 'quest', 'quer', 're', 'reg', 'recti', 'retro', 'ri', 'ridi', 'risi', 'rog', 'roga', 'rupt', 'sacr', 'sanc', 'secr', 'salv', 'salu', 'sanct', 'sat', 'satis', 'sci', 'scio', 'scientia', 'scope', 'scrib', 'script', 'se', 'sect', 'sec', 'sed', 'sess', 'sid', 'semi', 'sen', 'scen', 'sent', 'sens', 'sept', 'sequ', 'secu', 'sue', 'serv', 'sign', 'signi', 'simil', 'simul', 'sist', 'sta', 'stit', 'soci', 'sol', 'solus', 'solv', 'solu', 'solut', 'somn', 'soph', 'spec', 'spect', 'spi', 'spic', 'sper', 'sphere', 'spir', 'stand', 'stant', 'stab', 'stat', 'stan', 'sti', 'sta', 'st', 'stead', 'strain', 'strict', 'string', 'stige', 'stru', 'struct', 'stroy', 'stry', 'sub', 'suc', 'suf', 'sup', 'sur', 'sus', 'sume', 'sump', 'super', 'supra', 'syn', 'sym', 'tact', 'tang', 'tag', 'tig', 'ting', 'tain', 'ten', 'tent', 'tin', 'tect', 'teg', 'tele', 'tem', 'tempo', 'ten', 'tin', 'tain', 'tend', 'tent', 'tens', 'tera', 'term', 'terr', 'terra', 'test', 'the', 'theo', 'therm', 'thesis', 'thet', 'tire', 'tom', 'tor', 'tors', 'tort', 'tox', 'tract', 'tra', 'trai', 'treat', 
'trans', 'tri', 'trib', 'tribute', 'turbo', 'typ', 'ultima', 'umber', 'umbraticum', 'un', 'uni', 'vac', 'vade', 'vale', 'vali', 'valu', 'veh', 'vect', 'ven', 'vent', 'ver', 'veri', 'verb', 'verv', 'vert', 'vers', 'vi', 'vic', 'vicis', 'vict', 'vinc', 'vid', 'vis', 'viv', 'vita', 'vivi', 'voc', 'voke', 'vol', 'volcan', 'volv', 'volt', 'vol', 'vor', 'with', 'zo' } neutral_prefixes = upper(neutral_prefixes) neutral_suffixes = upper(neutral_suffixes) strong_prefixes = upper(strong_prefixes) strong_suffixes = upper(strong_suffixes) full_suffixes_set = upper(suffixes) full_prefixes_set = upper(prefixes) suffix = {"1", "2", "0"} for line in data_list: dict = {} vow_index = [] vowelCount = 0 pattern = "" y = "" dict["pos"] = nltk.pos_tag([line.split(":")[0]])[0][1] word = line.split(":")[0] temp = check_prefix(word, neutral_prefixes) if temp: dict['neu_pre'] = temp temp = check_suffix(word, neutral_suffixes) if temp: dict['neu_suf'] = temp temp = check_prefix(word, strong_prefixes) if temp: dict['str_pre'] = temp temp = check_suffix(word, strong_suffixes) if temp: dict['str_suf'] = temp temp = check_prefix(word, full_suffixes_set) if temp: dict['ful_pre'] = temp temp = check_suffix(word, full_prefixes_set) if temp: dict['ful_suf'] = temp line = line.split(":")[1].strip() syllables = line.split(" ") l = [] for i in syllables: l.append(i if not (i[-1].isdigit()) else i[:-1]) dict.update(Counter({''.join(i) for i in get_ngrams(l)})) dict['len'] = len(syllables) out = '' for i in range(len(syllables)): syl = syllables[i] if syl[-1] in suffix: vowelCount += 1 vow_index.append(i) out += syl[-1] # if syl[-1]=="1": # model_y.append(vowelCount) pattern += "V" else: pattern += "C" model_y.append(out) vowelCount = 0 dict["pattern"] = pattern dict['vow_len'] = len(vow_index) for i in vow_index: vowelCount += 1 if i - 1 >= 0: dict["onset2_" + str(vowelCount)] = syllables[i - 1] if i + 1 < len(syllables): dict["coda1_" + str(vowelCount)] = syllables[i + 1] dict["nucleus_" + str(vowelCount)] = syllables[i][:-1] model_x.append(dict) # print(pd.DataFrame(model_x)) # print(model_y) v = DictVectorizer(sparse=True) X = v.fit_transform(model_x) classifier = LogisticRegression(penalty='l2', class_weight='balanced') classifier.fit(X, model_y) with open(classifier_file, 'wb') as f: pickle.dump(classifier, f) pickle.dump(v, f)
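# Hedged usage sketch (not part of the original snippet): the classifier and the
# fitted DictVectorizer were pickled back to back into the same file, so they are
# read back in the same order; a new word would have to go through the same
# feature-building steps as in the loop above before calling predict().
with open(classifier_file, 'rb') as f:
    loaded_classifier = pickle.load(f)
    loaded_vectorizer = pickle.load(f)
# feature_dict is a hypothetical dict built exactly like `dict` above
# print(loaded_classifier.predict(loaded_vectorizer.transform([feature_dict])))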
print('-' * 30)
print(train_data.describe(include=['O']))
print('-' * 30)
print(train_data.head())
print('-' * 30)
print(train_data.tail())

# step 2: data cleaning
# Fill missing Age values with the mean age
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
print(train_data['Embarked'].value_counts())
# Fill missing Embarked values with the most common port of embarkation
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S', inplace=True)

# step 3: feature selection
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]

# Turn the symbolic (categorical) features into 0/1 indicator columns
dvec = DictVectorizer(sparse=False)
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
print(dvec.feature_names_)
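# A minimal follow-up sketch (not from the original snippet; the variable name
# test_features_encoded is mine): the already fitted DictVectorizer should be
# reused on the test split with transform(), so the test matrix gets the
# identical column layout as the training matrix.
test_features_encoded = dvec.transform(test_features.to_dict(orient='records'))
print(test_features_encoded.shape)  # same number of columns as the training matrix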
# Handle missing values
x["age"].fillna(x["age"].mean(), inplace=True)

# Convert the DataFrame into a list of dicts
x = x.to_dict(orient="records")

# Split the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# Dict feature extraction
from sklearn.feature_extraction import DictVectorizer
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# Decision tree estimator
from sklearn.tree import DecisionTreeClassifier, export_graphviz
estimator = DecisionTreeClassifier(criterion="entropy", max_depth=8)
estimator.fit(x_train, y_train)

# Model evaluation
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("Element-wise comparison of predictions and true labels:\n", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)

# Visualize the decision tree
## http://webgraphviz.com/
export_graphviz(estimator,
x_temp_list.append(current_dictX)
y_temp_list.append(current_Y)
stack, graph = transition.empty_stack(stack, graph)

for word in sentence:
    word['head'] = graph['heads'][word['id']]

x_list.extend(x_temp_list)
y_list.extend(y_temp_list)

print("Encoding the features and classes...")
# Vectorize the feature matrix and carry out a one-hot encoding
vec = DictVectorizer(sparse=True)
X = vec.fit_transform(x_list)
# The dense variant below would consume a considerable amount of memory:
# X = vec.fit_transform(X_dict).toarray()
# print(vec.get_feature_names())

y, nbr_to_class, classes_to_nbr = encode_classes(y_list)

print("Training the model...")
classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
model = classifier.fit(X, y)
print(model)

print('Predicting')
# print(transitions)
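# Hedged usage sketch (not part of the original): predictions come back as class
# numbers, and `nbr_to_class` returned by encode_classes() is assumed to map
# those numbers back to the transition names.
y_pred = model.predict(X)
predicted_transitions = [nbr_to_class[nbr] for nbr in y_pred]
print(predicted_transitions[:10])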
class ImitationLearner(object): # initialize the classifier to be learned def __init__(self): # Any classifier could be used here self.model = LogisticRegression() self.vectorizer = DictVectorizer() self.labelEncoder = LabelEncoder() # this function predicts an instance given the state # state keeps track the various actions taken # it does not change the instance in any way, # it does change the state # the predicted structured output is returned in the end def predict(self, structured_instance, state=None, expert_policy_prob=0.0): if state == None: state = self.transitionSystem( structured_instance=structured_instance) # predict all remaining actions # if we do not have any actions we are done while len(state.agenda) > 0: # for each action # pop it from the queue current_action = state.agenda.popleft() # extract features and add them to the action # (even for the optimal policy, it doesn't need the features but they are needed later on) current_action.features = state.extractFeatures( structured_instance=structured_instance, action=current_action) # the first condition is to avoid un-necessary calls to random which give me reproducibility headaches if (expert_policy_prob == 1.0) or (expert_policy_prob > 0.0 and random.random() < expert_policy_prob): current_action.label = state.expert_policy( structured_instance, current_action) else: # predict (probably makes sense to parallelize across instances) # vectorize the features: vectorized_features = self.vectorizer.transform( current_action.features) # predict using the model normalized_label = self.model.predict(vectorized_features) # get the actual label (returns an array, get the first and only element) current_action.label = self.labelEncoder.inverse_transform( normalized_label)[0] # add the action to the state making any necessary updates state.updateWithAction(current_action, structured_instance) # OK return the final state reached return state class params(object): def __init__(self): self.learningParam = 0.1 self.iterations = 40 def train(self, structuredInstances, params): # create the dataset trainingFeatures = [] trainingLabels = [] # for each iteration for iteration in range(params.iterations): # set the expert policy prob expertPolicyProb = pow(1 - params.learningParam, iteration) print("Iteration:" + str(iteration) + ", expert policy prob:" + str(expertPolicyProb)) for structuredInstance in structuredInstances: # so we obtain the predicted output and the actions taken are in state # this prediction uses the gold standard since we need this info for the expert policy actions final_state = self.predict(structuredInstance, expert_policy_prob=expertPolicyProb) # initialize a second state to avoid having to roll-back stateCopy = self.transitionSystem( structured_instance=structuredInstance) # The agenda seems to initialized fine for action in final_state.actionsTaken: # DAgger just ask the expert stateCopy.agenda.popleft() expert_action_label = stateCopy.expert_policy( structuredInstance, action) # add the labeled features to the training data trainingFeatures.append(action.features) trainingLabels.append(expert_action_label) # take the original action chosen to proceed stateCopy.updateWithAction(action, structuredInstance) # OK, let's save the training data and learn some classifiers # vectorize the training data collected training_data = self.vectorizer.fit_transform(trainingFeatures) # encode the labels encoded_labels = self.labelEncoder.fit_transform(trainingLabels) # train self.model.fit(training_data, encoded_labels)
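# Small illustrative aside (uses the default params values above, 0.1 and 40):
# the probability of querying the expert policy decays geometrically with the
# iteration number, so DAgger leans on the expert early and on the learned
# classifier in later iterations.
schedule = [pow(1 - 0.1, it) for it in range(40)]
print(schedule[0], round(schedule[10], 3), round(schedule[39], 3))  # 1.0 0.349 0.016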
import copy


def extract_feats(dataset, class_name):
    X_dict = copy.deepcopy(dataset)
    y_symbols = [obs.pop(class_name, None) for obs in X_dict]
    return X_dict, y_symbols


X_dict, y_symbols = extract_feats(dataset, 'will_wait')
y = [0 if symb == 'No' else 1 for symb in y_symbols]

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)  # Should be true
X = vec.fit_transform(X_dict)  # y values not included
features = vec.get_feature_names()
param = vec.get_params(deep=True)

################ MAIN CALLING DECISION TREE LEARNING ###############
# and printing
import decision_tree_learning as dtl

# DECISION TREE BUILDING
examples = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
root = dtl.decision_tree_learning_(examples, attributes, None, None, None, features, y, X)
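# Illustrative aside (the attribute values here are hypothetical, not taken from
# the dataset): with sparse=False, DictVectorizer one-hot encodes each string
# attribute into its own "attr=value" column, which is exactly what
# get_feature_names() reports above.
demo_vec = DictVectorizer(sparse=False)
demo_X = demo_vec.fit_transform([{'Patrons': 'Full', 'Hungry': 'Yes'},
                                 {'Patrons': 'Some', 'Hungry': 'No'}])
print(demo_vec.get_feature_names())  # e.g. ['Hungry=No', 'Hungry=Yes', 'Patrons=Full', 'Patrons=Some']
print(demo_X)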
                 sep=' ', engine='python', quoting=csv.QUOTE_NONE,
                 error_bad_lines=False, nrows=50000)
print(df.head())
print(df.isnull().sum())

# Forward-fill missing values (each NaN inherits the value of the previous row)
df = df.fillna(method='ffill')
print(df.groupby('Tag').size().reset_index(name='counts'))

X = df.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df.Tag.values
classes = np.unique(y)
del (df)
classes = classes.tolist()

#nbrtest=int(X.shape[0]*0.33)
#nbrtrain=X.shape[0]-nbrtest
#X_train=X[:nbrtrain]
#X=np.delete(X,range(nbrtrain), 0)
#X_test=X
#del(X)
#y_train=y[:nbrtrain]
#y=np.delete(y, range(nbrtrain))
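# A hedged alternative to the commented-out manual slicing above: sklearn's
# train_test_split produces the same kind of 67/33 partition in one call (note
# that it shuffles by default, unlike the sequential slices it replaces).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)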
scaler = preprocessing.StandardScaler()
X_train.loc[:, numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test.loc[:, numeric_features] = scaler.transform(X_test[numeric_features])
#normalizer = preprocessing.Normalizer()
#X_train.loc[:, numeric_features] = normalizer.fit_transform(X_train[numeric_features])
#X_test.loc[:, numeric_features] = normalizer.transform(X_test[numeric_features])

# Oversample first-class survivors until the classes are roughly balanced
np.random.seed(0)  # call the seeding function; assigning to it (np.random.seed = 0) has no effect
while len(y[y == 1]) < len(y[y == 0]) - 207:
    ind = np.random.choice(
        data[(data['Survived'] == 1) & (data['Pclass'] == 1)].index)
    # DataFrame.append and Series.set_value were removed in newer pandas; these are the equivalents
    X_train = pd.concat([X_train, X_train.loc[[ind]]], ignore_index=True)
    y.at[len(y) + 1] = y.loc[ind]

encoder = DV(sparse=False)
encoded_train_data = encoder.fit_transform(
    X_train[category_features].T.to_dict().values())
encoded_test_data = encoder.transform(
    X_test[category_features].T.to_dict().values())

X_train['Combo1'] = X_train['Pclass'] * X_train['Fare']  # * X_train['Family'] * X_train['Age']
X_test['Combo1'] = X_test['Pclass'] * X_test['Fare']  # * X_test['Family'] * X_test['Age']
#X['Combo1'] = ((X['Combo1'] ** 0.1 - 1) / 0.1)
#X_test['Combo1'] = ((X_test['Combo1'] ** 0.1 - 1) / 0.1)
#X['Combo2'] = X['Age'] * X['Fare']
#X_test['Combo2'] = X_test['Age'] * X_test['Fare']

numeric_features = ['Age', 'Pclass', 'Fare', 'Family', 'Combo1']
#polinom = preprocessing.PolynomialFeatures(2, interaction_only = True)
#poly_features = polinom.fit_transform(X_train[numeric_features])
#poly_test_features = polinom.transform(X_test[numeric_features])
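# Hedged sketch of a plausible next step (not in the original snippet; it assumes
# X_train/X_test contain every column in numeric_features): stack the scaled
# numeric columns with the one-hot matrices produced by the DictVectorizer so a
# single design matrix can be handed to a classifier.
import numpy as np
train_matrix = np.hstack([X_train[numeric_features].values, encoded_train_data])
test_matrix = np.hstack([X_test[numeric_features].values, encoded_test_data])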
    sd_y = stats.stdev(y)
    for observation in x:
        score_x.append((observation - mean_x) / sd_x)
    for observation in y:  # fixed typo: the loop variable must be the value being standardized
        score_y.append((observation - mean_y) / sd_y)
    return (sum([i * j for i, j in zip(score_x, score_y)])) / (n - 1)


print(pearson(x, y))

#################
staff = [{'name': 'Steve Miller', 'age': 33.},
         {'name': 'Lyndon Jones', 'age': 12.},
         {'name': 'Baxter Morth', 'age': 18.}]

vec = DictVectorizer()
vec.fit_transform(staff).toarray()
print(vec.get_feature_names())
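# Illustrative aside (hypothetical record): a category value that was not seen
# during fit, such as a new name, maps to all-zero indicator columns, while the
# numeric 'age' field is carried through unchanged.
new_person = vec.transform([{'name': 'Ada Lovelace', 'age': 36.}]).toarray()
print(new_person)  # the age column holds 36.0, every name=... column is 0.0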
import pickle
import math
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer

ppmi_dic = defaultdict(dict)
t_c_dic, t_dic, c_dic, N = pickle.load(open('knock83.txt', 'rb'))

# Positive pointwise mutual information of a "target context" key
ppmi = lambda key: max(math.log(N * t_c_dic[key] / (t_dic[key.split(' ')[0]] * c_dic[key.split(' ')[1]])), 0)

for t_c, freq in t_c_dic.items():
    if freq >= 10:
        t, c = t_c.split()
        ppmi_tc = ppmi(t_c)
        if ppmi_tc > 0:
            ppmi_dic[t][c] = ppmi_tc

dicvec = DictVectorizer()
matrix = dicvec.fit_transform(ppmi_dic.values())

with open('knock84.txt', 'wb') as w_f:
    pickle.dump((matrix, list(ppmi_dic.keys())), w_f)
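# Hedged sketch of a possible follow-up (not part of the snippet; the 300
# dimensions are an assumption): the saved sparse PPMI matrix can be compressed
# with truncated SVD to obtain dense word vectors.
from sklearn.decomposition import TruncatedSVD
with open('knock84.txt', 'rb') as r_f:
    matrix, words = pickle.load(r_f)
word_vectors = TruncatedSVD(n_components=300).fit_transform(matrix)
print(word_vectors.shape)  # (number of target words, 300)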
'''
# Look at the values of the Embarked field.
# There are only 3 ports of embarkation and S is by far the most common (about 72%),
# so we set the remaining missing Embarked values to S:
print(train_data['Embarked'].value_counts())
# Fill missing Embarked values with the most common port
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S', inplace=True)

# Feature selection: pick the columns we believe are related to survival
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Select those columns
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]

# Some feature values are strings, which are awkward to handle directly, so replace
# them with numbers: e.g. female -> 1, male -> 0; Embarked (S/C/Q) becomes 0/1 indicator columns
from sklearn.feature_extraction import DictVectorizer
dvec = DictVectorizer(sparse=False)
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))  # fit_transform turns the feature dicts into a feature matrix
print(dvec.feature_names_)

#clf = DecisionTreeClassifier(criterion='entropy')
from sklearn.tree import DecisionTreeClassifier
# Build an ID3-style decision tree
clf = DecisionTreeClassifier(criterion='entropy')
# Train the decision tree
clf.fit(train_features, train_labels)
test_features = dvec.transform(test_features.to_dict(orient='records'))
# Predict with the decision tree
pred_labels = clf.predict(test_features)
# Accuracy of the decision tree (measured on the training data)
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score accuracy: %.4lf' % acc_decision_tree)
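# Hedged follow-up sketch (not in the original): clf.score on the same data used
# for fitting tends to overestimate accuracy; K-fold cross-validation on the
# training matrix gives a more honest estimate.
from sklearn.model_selection import cross_val_score
cv_acc = cross_val_score(clf, train_features, train_labels, cv=10)
print(u'cross-validated accuracy: %.4lf' % cv_acc.mean())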
class MEMMSequenceLabeler: def transform_input(self, data): return [ self.feat(x, i, padded_history(y, i, self.order)) for x, y in data for i in range(0, len(x)) ] def __init__(self, feat, train_data, order=1, **lr_params): self.order = order self.train_data = train_data self.feat = feat self.vectorizer = DictVectorizer() self.label_encoder = LabelEncoder() train_classifier_x = self.vectorizer.fit_transform( self.transform_input(train_data)) train_classifier_y = self.label_encoder.fit_transform( to_classifier_y(train_data)) self.lr = LogisticRegression(fit_intercept=False, **lr_params) self.lr.fit(train_classifier_x, train_classifier_y) self.v_weights = self.vectorizer.inverse_transform(self.lr.coef_) def weights(self, label): v_index = self.label_encoder.transform([label]) v_weights = self.v_weights[v_index[0]] return v_weights def plot_lr_weights(self, label, how_many=20, reverse=True, feat_filter=lambda s: True): v_index = self.label_encoder.transform([label]) v_weights = self.vectorizer.inverse_transform( self.lr.coef_)[v_index[0]] # print(type(v_weights.items())) filtered = [(k, v) for k, v in v_weights.items() if feat_filter(k)] sorted_weights = sorted(filtered, key=lambda t: t[1], reverse=reverse) return util.plot_bar_graph([w for _, w in sorted_weights[:how_many]], [f for f, _ in sorted_weights[:how_many]], rotation=45) def input_repr(self, x, i, y): return self.feat(x, i, padded_history(y, i, self.order)) def sklearn_repr(self, x, i, y): return self.vectorizer.transform([self.input_repr(x, i, y)]) def predict_next(self, x, i, y): scikit_x = self.vectorizer.transform([self.input_repr(x, i, y)]) return self.label_encoder.inverse_transform( self.lr.predict(scikit_x))[0] def predict_next_hist(self, x, i, hist): scikit_x = self.vectorizer.transform( [self.feat(x, i, padded_history(hist, 1, self.order))]) return self.label_encoder.inverse_transform( self.lr.predict(scikit_x))[0] def labels(self): return self.label_encoder.classes_ def predict_scores(self, x, i, y): return self.lr.predict_log_proba(self.sklearn_repr(x, i, y))[0] def predict_label_scores(self, x, i, y): scores = self.predict_scores(x, i, y) labels = self.labels() return sorted([(labels[label_index], label_score) for label_index, label_score in enumerate(scores)], key=lambda x: -x[1]) def predict_scores_hist(self, x, i, hist): scikit_x = self.vectorizer.transform( [self.feat(x, i, padded_history(hist, 1, self.order))]) return self.lr.predict_log_proba(scikit_x)[0] def predict(self, data): result = [] for x, y in data: y_guess = [] for i in range(0, len(x)): prediction = self.predict_next(x, i, y_guess) y_guess += prediction result.append(y_guess) return result
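# Hedged usage sketch (the feature choices and data names here are assumptions,
# not part of the class above): `feat` receives the token sequence x, a position
# i, and a padded history of previous labels, and must return a feature dict
# that the DictVectorizer can consume; extra keyword arguments are forwarded to
# LogisticRegression.
def simple_feat(x, i, hist):
    return {'word:' + str(x[i]): 1.0, 'prev_label:' + str(hist[0]): 1.0}

# train_seqs is assumed to be a list of (token_list, label_list) pairs
labeler = MEMMSequenceLabeler(simple_feat, train_seqs, order=1, C=10)
print(labeler.predict(train_seqs[:1]))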
class SimpleFMLearner: def __init__(self, iter=100, factor=10, use_info=True, path='./', external_fm=None): from pyfm import pylibfm self.__use_info = use_info # temp code, load ml-100k's info if self.__use_info: self.__info = Info(path) # Build and train a Factorization Machine if external_fm: print >> sys.stderr, 'Use external FM: %s' % type(external_fm) self.__fm = external_fm else: print >> sys.stderr, 'iter=%d, factor=%d, use_info=%d' % ( iter, factor, use_info) self.__fm = pylibfm.FM(num_factors=factor, num_iter=iter, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal") def fit(self, train): ''' train : [(userid, itemid, rating)...] ''' train_data = [] y_train = [] for userid, itemid, rating in train: d = self.__make_data(userid, itemid) train_data.append(d) y_train.append(rating) self.__v = DictVectorizer() X_train = self.__v.fit_transform(train_data) y_train = np.array(y_train) print >> sys.stderr, 'x_train.shape=%s, type=%s' % (str( X_train.shape), type(X_train)) print >> sys.stderr, 'y_train.shape=%s, type=%s' % (str( y_train.shape), type(y_train)) if isinstance(self.__fm, tffm.models.TFFMRegressor): self.__fm.fit(X_train, y_train, show_progress=True) else: self.__fm.fit(X_train, y_train) print >> sys.stderr, 'Train completed.' def predict(self, userid, itemid): d = self.__make_data(userid, itemid) X_test = self.__v.transform([d]) preds = self.__fm.predict(X_test) return preds[0] def __make_data(self, userid, itemid): userid = int(userid) itemid = int(itemid) d = {"user_id": str(userid), "movie_id": str(itemid)} if self.__use_info: d = self.__info.process(userid, itemid, d) return d
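# Hedged usage sketch (the rating triples are hypothetical): fit() expects
# (userid, itemid, rating) tuples and predict() scores a single user/item pair;
# use_info=False skips loading the ml-100k side information.
learner = SimpleFMLearner(iter=20, factor=8, use_info=False)
learner.fit([(1, 10, 4.0), (1, 20, 3.0), (2, 10, 5.0)])
print(learner.predict(2, 20))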
print(df.shape) # Print the shape of the transformed array print(df_encoded.shape) ------------------------------------------------------------- # Import DictVectorizer from sklearn.feature_extraction import DictVectorizer # Convert df into a dictionary: df_dict df_dict = df.to_dict("records") # Create the DictVectorizer object: dv dv = DictVectorizer() # Apply dv on df: df_encoded df_encoded = dv.fit_transform(df_dict) # Print the resulting first five rows print(df_encoded[:5,:]) # Print the vocabulary print(dv.vocabulary_) ------------------------------------------------------- # Import necessary modules from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline from sklearn.model_selection import cross_val_score # Fill LotFrontage missing values with 0 X.LotFrontage = X.LotFrontage.fillna(0)
if dataset == "train": with open('../data/train_2.dat') as f: for line in f: (userID, movieID, rating) = line.split(' ') data.append({"userID": str(userID), "movieID": str(movieID)}) try: # for matrix factorization, this was y.append(float(rating)) # y.append(float(rating)) except ValueError: print "Check line {l}".format(l=line) users.add(userID) movies.add(movieID) return (data, y, users, movies) train = get_unique_users_movies("train") test = get_unique_users_movies("test") X_train, y_train = train[0], train[1] X_test = test[0] print type(y_train) v = DictVectorizer() X_train_dv = v.fit_transform(X_train) X_test_dv = v.transform(X_test) print X_train_dv
mtx = dp.form_matrix('./data/train.json', type=2) X_train = [] y_train = [] for item in mtx: dic = {} for tag in item[1]: if tag not in dic: dic[tag] = 1 else: dic[tag] += 1 X_train.append(dic) y_train.append(item[0]) v = DictVectorizer(sparse=False) X_train = v.fit_transform(X_train) #############SVM : 0.71 #clf = svm.SVC(kernel='linear') #############MNB : 0.70 #clf = MultinomialNB() #############BNB : 0.71 #clf = BernoulliNB() #############GNB : 0.35 #clf = GaussianNB() #############RF : 0.707 clf = RandomForestClassifier(n_estimators=200, criterion='entropy')
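# Hedged continuation sketch (not from the original): the commented-out scores
# above suggest the classifiers were compared on held-out accuracy; one way to
# reproduce that kind of number is cross-validation on the vectorized tag counts.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_train, y_train, cv=3)
print(scores.mean())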
def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9, use_divrank=False, divrank_alpha=0.25): ''' compute centrality score of sentences. Args: sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ] continuous: if True, apply continuous LexRank. (see reference) sim_threshold: if continuous is False and smilarity is greater or equal to sim_threshold, link the sentences. alpha: the damping factor of PageRank and DivRank divrank: if True, apply DivRank instead of PageRank divrank_alpha: strength of self-link [0.0-1.0] (it's not the damping factor, see divrank.py) Returns: tuple ( { # sentence index -> score 0: 0.003, 1: 0.002, ... }, similarity_matrix ) Reference: Günes Erkan and Dragomir R. Radev. LexRank: graph-based lexical centrality as salience in text summarization. (section 3) http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html ''' # configure ranker ranker_params = {'max_iter': 1000} if use_divrank: ranker = divrank_scipy ranker_params['alpha'] = divrank_alpha ranker_params['d'] = alpha else: ranker = networkx.pagerank_scipy ranker_params['alpha'] = alpha graph = networkx.DiGraph() # sentence -> tf sent_tf_list = [] for sent in sentences: words = tools.word_segmenter_ja(sent) tf = collections.Counter(words) sent_tf_list.append(tf) sent_vectorizer = DictVectorizer(sparse=True) sent_vecs = sent_vectorizer.fit_transform(sent_tf_list) # compute similarities between senteces sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine') if continuous: linked_rows, linked_cols = numpy.where(sim_mat > 0) else: linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold) # create similarity graph graph.add_nodes_from(range(sent_vecs.shape[0])) for i, j in zip(linked_rows, linked_cols): if i == j: continue weight = sim_mat[i,j] if continuous else 1.0 graph.add_edge(i, j, weight=weight) scores = ranker(graph, **ranker_params) return scores, sim_mat
# inplace=True: modify the original object directly instead of creating a new one;
# inplace=False: apply the change to a new object and return it.
X['age'].fillna(X['age'].mean(), inplace=True)

# Split the original data; 25% of the passengers are used for testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

# Convert the categorical features into feature vectors
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)  # sparse=False means no sparse matrix is produced
# After the transformation, every categorical feature is split out into its own
# 0/1 column, while numeric features stay unchanged
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
# Apply the same transformation to the test features
X_test = vec.transform(X_test.to_dict(orient='records'))

# 1. Train and predict with a single decision tree
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()  # initialize the decision tree classifier with default settings
dtc.fit(X_train, y_train)  # learn the model from the training split
dtc_y_predict = dtc.predict(X_test)  # predict on the test features with the trained tree

# 2. Train and predict with a random forest ensemble
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)
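# Hedged follow-up sketch (not in the original): compare the two models on the
# held-out split with accuracy and a per-class report.
from sklearn.metrics import accuracy_score, classification_report
print('decision tree accuracy:', accuracy_score(y_test, dtc_y_predict))
print('random forest accuracy:', accuracy_score(y_test, rfc_y_pred))
print(classification_report(y_test, rfc_y_pred))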
def get_data_queue(args): users, items, labels = [], [], [] if args.dataset == 'ml-100k': data_path = os.path.join(args.data, 'ml-100k', 'u.data') elif args.dataset == 'ml-1m': data_path = os.path.join(args.data, 'ml-1m', 'ratings.dat') elif args.dataset == 'ml-10m': data_path = os.path.join(args.data, 'ml-10m', 'ratings.dat') elif args.dataset == 'youtube-small': data_path = os.path.join(args.data, 'youtube-weighted-small.npy') if 'ml' in args.dataset: # movielens dataset with open(data_path, 'r') as f: for i, line in enumerate(f.readlines()): if args.dataset == 'ml-100k': line = line.split() elif args.dataset == 'ml-1m' or args.dataset == 'ml-10m': line = line.split('::') users.append(int(line[0]) - 1) items.append(int(line[1]) - 1) labels.append(float(line[2])) labels = StandardScaler().fit_transform(np.reshape( labels, [-1, 1])).flatten().tolist() print('user', max(users), min(users)) print('item', max(items), min(items)) users, items, labels = shuffle(users, items, labels) indices = list(range(len(users))) num_train = int(len(users) * args.train_portion) num_valid = int(len(users) * args.valid_portion) if not args.mode == 'libfm': data_queue = torch.utils.data.TensorDataset( torch.tensor(users), torch.tensor(items), torch.tensor(labels)) train_queue = torch.utils.data.DataLoader( data_queue, batch_size=args.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler( indices[:num_train]), pin_memory=True) valid_queue = torch.utils.data.DataLoader( data_queue, batch_size=args.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler( indices[num_train:num_train + num_valid]), pin_memory=True) test_queue = torch.utils.data.DataLoader( data_queue, batch_size=args.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler( indices[num_train + num_valid:]), pin_memory=True) else: # prepare data format for libfm data_queue = [] for i in range(len(users)): data_queue.append({ 'user': str(users[i]), 'item': str(items[i]) }) v = DictVectorizer() data_queue = v.fit_transform(data_queue) train_queue = [ data_queue[:num_train], np.array(labels[:num_train]) ] valid_queue = [ data_queue[num_train:num_train + num_valid], np.array(labels[num_train:num_train + num_valid]) ] test_queue = [ data_queue[num_train + num_valid:], np.array(labels[num_train + num_valid:]) ] else: # 3-d dataset [ps, qs, rs, labels] = np.load(data_path).tolist() labels = StandardScaler().fit_transform(np.reshape( labels, [-1, 1])).flatten().tolist() ps = [int(i) for i in ps] qs = [int(i) for i in qs] rs = [int(i) for i in rs] print('p', max(ps), min(ps)) print('q', max(qs), min(qs)) print('r', max(rs), min(rs)) ps, qs, rs, labels = shuffle(ps, qs, rs, labels) indices = list(range(len(ps))) num_train = int(len(ps) * args.train_portion) num_valid = int(len(ps) * args.valid_portion) if not args.mode == 'libfm': data_queue = torch.utils.data.TensorDataset( torch.tensor(ps), torch.tensor(qs), torch.tensor(rs), torch.tensor(labels)) train_queue = torch.utils.data.DataLoader( data_queue, batch_size=args.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler( indices[:num_train]), pin_memory=True) valid_queue = torch.utils.data.DataLoader( data_queue, batch_size=args.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler( indices[num_train:num_train + num_valid]), pin_memory=True) test_queue = torch.utils.data.DataLoader( data_queue, batch_size=args.batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler( indices[num_train + num_valid:]), pin_memory=True) else: # prepare data format for 
            # libfm
            data_queue = []
            for i in range(len(ps)):
                data_queue.append({
                    'p': str(ps[i]),
                    'q': str(qs[i]),
                    'r': str(rs[i])
                })
            v = DictVectorizer()
            data_queue = v.fit_transform(data_queue)
            train_queue = [
                data_queue[:num_train],
                np.array(labels[:num_train])
            ]
            valid_queue = [
                data_queue[num_train:num_train + num_valid],
                np.array(labels[num_train:num_train + num_valid])
            ]
            test_queue = [
                data_queue[num_train + num_valid:],
                np.array(labels[num_train + num_valid:])
            ]

    return train_queue, valid_queue, test_queue
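# Hedged usage sketch (pyFM and its parameters here are an assumption; the
# original only builds the [features, labels] pairs): in libfm mode each queue is
# a sparse one-hot matrix plus a label array, which is the input format a
# factorization machine such as pylibfm expects.
from pyfm import pylibfm
train_queue, valid_queue, test_queue = get_data_queue(args)
fm = pylibfm.FM(num_factors=8, num_iter=10, task='regression')
fm.fit(train_queue[0], train_queue[1])
preds = fm.predict(valid_queue[0])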