Example #1
def generateTeacherScore(X_train, X_cross, X_test, X_eval, X_predict, Y_train,Y_predict):

    teacher = pd.DataFrame(X_train['teacher_acctid_x'])
    teacher_cross = pd.DataFrame(X_cross['teacher_acctid_x'])
    teacher_test = pd.DataFrame(X_test['teacher_acctid_x'])
    teacher_eval = pd.DataFrame(X_eval['teacher_acctid_x'])
    teacher_predict = pd.DataFrame(X_predict['teacher_acctid_x'])
    
    vectorizer = DictVectorizer(sparse = True)
    teacher_dummies_train = vectorizer.fit_transform(teacher.T.to_dict().values())
    teacher_dummies_cross = vectorizer.transform(teacher_cross.T.to_dict().values())
    teacher_dummies_test = vectorizer.transform(teacher_test.T.to_dict().values())
    
    vectorizer2 = DictVectorizer(sparse = True)
    teacher_dummies_predict = vectorizer2.fit_transform(teacher_predict.T.to_dict().values())
    teacher_dummies_eval = vectorizer2.transform(teacher_eval.T.to_dict().values())
    
    logit = LogisticRegression(penalty='l2',dual=False,tol=1,fit_intercept=True, C=0.00000001, intercept_scaling=1, class_weight='auto', random_state=423)
    logit.fit(teacher_dummies_train,Y_train)
    X_train_teacher = logit.predict_proba(teacher_dummies_train)[:,1]
    X_cross_teacher = logit.predict_proba(teacher_dummies_cross)[:,1]
    X_test_teacher = logit.predict_proba(teacher_dummies_test)[:,1]
    
    logit2 = LogisticRegression(penalty='l2',dual=False,tol=1,fit_intercept=True, C=0.00000001, intercept_scaling=1, class_weight='auto', random_state=423)
    logit2.fit(teacher_dummies_predict,Y_predict)
    X_predict_teacher = logit2.predict_proba(teacher_dummies_predict)[:,1]
    X_eval_teacher = logit2.predict_proba(teacher_dummies_eval)[:,1]
    
    return X_train_teacher, X_cross_teacher, X_test_teacher, X_eval_teacher, X_predict_teacher
Example #2
def main():
	print "Loading training set..."
	data = list(csv.DictReader(open('adult.csv','rU')))
	data = remove_missing(data)
	data_refined , target = refine_data(data)

	#using DictVectorizer to get data in a Scikit-Learn-usable form 
	vec = DictVectorizer()
	data_refined = vec.fit_transform(data_refined).toarray() 

	data_train , data_test , target_train , target_test = train_test_split( data_refined , target , test_size = 0.4)

	print "Fitting the nearest neighbor model..."
	n=KNeighborsClassifier(n_neighbors=20)
	n.fit(data_train , target_train)

	print "Score of nearest neighbour algorithm on cross-validation set:" , n.score(data_test,target_test)

	print "Loading test set..."
	data = list(csv.DictReader(open('test.csv','rU')))
	data = remove_missing(data)
	data_refined , target = refine_data(data)

	#reuse the DictVectorizer fitted on the training data so the test features align with the trained model
	data_refined = vec.transform(data_refined).toarray()

	print "Score of nearest neighbour algorithm on test set:" , float(n.score(data_refined, target))*100 ,"%"
 def _dic_list_to_matrix(self, processedData, normalize):
     vectorizer = DictVectorizer()
     if normalize:
         res = preprocessing.normalize(vectorizer.fit_transform(processedData), norm='l2')
     else:
         res = vectorizer.fit_transform(processedData)
     return vectorizer.get_feature_names(), res
def learn_classify__svm_individual(data, folds, test_fold=4):

    test_folds = [0, 1, 2, 3, 4]

    X_train = []
    y_train = []
    X_test = []
    y_test = []

    for i in test_folds:
        if i == test_fold: continue
        for name in folds[i]:
            c, ind = parse_filename(name)
            X_train.append(data[c][ind]['features'])
            y_train.append(data[c][ind]['meta']['stance'])

    for i in test_folds:
        if i != test_fold: continue
        for name in folds[i]:
            c, ind = parse_filename(name)
            X_test.append(data[c][ind]['features'])
            y_test.append(data[c][ind]['meta']['stance'])

    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    # transform (not fit_transform) so the test features share the training feature space
    X_test = vectorizer.transform(X_test)

    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)
class Projects:
  def __init__(self, outcome_file):
    self.state_feature_index = 7
    self.zip_feature_index = 8
    self.binary_feature_index = [12, 13, 14, 15, 16, 17, 19, 20, 32, 33]
    self.categorical_feature_index = [18, 21, 22, 25, 26, 27, 28]
    self.numerical_feature_index = [29, 30, 31]
    self.date_feature_index = 34
    self.vec = DictVectorizer(sparse=False)
    self.load_projects(outcome_file)
    
  def load_projects(self, outcome_file):
    fin = open(outcome_file)
    self.project_feature_names = fin.next().strip().split(',')
    self.projects = dict((line.strip().split(',')[0], line.strip().split(','))\
    for line in fin)
    fin.close()
    
  def all_features(self, pids):
    measurements_state = map(lambda k: {str(self.state_feature_index): self.projects[k][self.state_feature_index]}, pids)
    measurements_zip = map(lambda k: {str(self.zip_feature_index): self.projects[k][self.zip_feature_index][:3]}, pids)
    measurements_bin = map(lambda k: dict((str(fi), self.projects[k][fi]) for fi in self.binary_feature_index), pids)
    measurements_cat = map(lambda k: dict((str(fi), self.projects[k][fi]) for fi in self.categorical_feature_index), pids)
    #measurements_num = map(lambda k: [float(self.projects[k][fi]) for fi in self.numerical_feature_index], pids)
    measurements_num = map(lambda k: dict((str(fi), str(discretize_num(float(self.projects[k][fi])))) for fi in self.numerical_feature_index), pids)
    return self.vec.fit_transform(measurements_state), self.vec.fit_transform(measurements_zip), self.vec.fit_transform(measurements_bin), self.vec.fit_transform(measurements_cat), self.vec.fit_transform(measurements_num)#,np.array(measurements_num)
Example #6
def rw(data, Alpha=0.1, Beta=0.1, Lambda=1.0, M=50000, trajectory=False):

    # code cues

    cues = DictVectorizer(dtype=np.int, sparse=False)
    D = cues.fit_transform([explode(c) for c in data.Cues])
    
    # code outcomes

    out = DictVectorizer(dtype=np.int, sparse=False)
    O = out.fit_transform([explode(c) for c in data.Outcomes])

    # weight matrix

    W = np.zeros((len(cues.get_feature_names()), len(out.get_feature_names())))

    E = data.Frequency / sum(data.Frequency)
    rand = alias.multinomial(E)
    history = dict()

    iter = 0
    while iter < M:   
        iter += 1
        item = rand.draw()
        rwUpdate(W, D[item,:], O[item,:], Alpha, Beta, Lambda)
        if trajectory:
            history[iter] = pd.DataFrame(W, columns=out.get_feature_names(), index=cues.get_feature_names(), copy=True)

    if trajectory:
        return pd.Panel.from_dict(history)
    else:
        return pd.DataFrame(W, columns=out.get_feature_names(), index=cues.get_feature_names())                
        
        
Example #7
def main():

    global data
    
    # baseline using equilibrium equations
    data = pd.read_csv('serbian.csv')
    W0 = ndl.ndl(data)
    diff = np.zeros_like(W0)
    W = np.zeros_like(W0)
    
    # simulate learning for R individuals
    R = 1000
    now = time()
    P = Pool(6)
    for i,W1 in P.imap_unordered(simulate,xrange(R)):
        diff += abs(W1 - W0)
        W += W1
        print >>sys.stderr,i,time()-now
    diff = diff / R
    W = W / R

    # get cue-outcome co-occurrence frequencies
    cues = DictVectorizer(dtype=int,sparse=False)
    D = cues.fit_transform([ndl.explode(c) for c in data.Cues])
    out = DictVectorizer(dtype=int,sparse=False)
    X = out.fit_transform([ndl.explode(c) for c in data.Outcomes]) * data.Frequency[:,np.newaxis]
    O = np.zeros_like(W0)
    for i in xrange(len(X)):
        for nz in np.nonzero(D[i]):
            O[nz] += X[i]

    # save results
    np.savez('serbian-rw',diff=diff,W0=W0.as_matrix(),O=O,W=W)
def create_dataset(dataset_name, features):
    print('Creating "%s" dataset with the following features: %s.' % (dataset_name, ', '.join(features)),
          file=sys.stderr)

    dv = DictVectorizer()
    train_data = read_data('train.csv')
    train_customers, train_y, train_x, train_weights = zip(*make_features(slice_and_group(train_data), features))
    train_x = dv.fit_transform(train_x)
    os.mkdir(j(DATA_DIR, dataset_name))
    save('per-customer-train', dataset_name, dv, train_customers, train_y, train_x, train_weights)

    test_data = read_data('test.csv')
    test_customers, test_y, test_x, test_weights = zip(*make_features(test_data.groupby('customer_ID'), features))
    test_x = dv.transform(test_x)
    save('per-customer-test', dataset_name, dv, test_customers, test_y, test_x, test_weights)

    for cv_i, (train_raw, test_raw) in enumerate(cv(train_data, CV_GROUPS_COUNT)):
        dv = DictVectorizer()
        train_customers, train_y, train_x, train_weights = zip(*make_features(slice_and_group(train_raw), features))
        train_x = dv.fit_transform(train_x)
        save('cv%02d_per-customer-train' % cv_i, dataset_name, dv, train_customers, train_y, train_x, train_weights)

        test_customers, test_y, test_x, test_weights = zip(*make_features(slice_and_group(test_raw), features))
        test_x = dv.transform(test_x)
        save('cv%02d_per-customer-test' % cv_i, dataset_name, dv, test_customers, test_y, test_x, test_weights)
def add_sentence_level_features(sentence_objects, articles, service, sentiment=True):
    """
        Add a feature vector to each sentence object. Features:
        - bag-of-words (bow)
        - keywords
        - sentiment (optional)
    """
    extra_stop = ["n't", "'s", "'m"]
    vect_bow = DictVectorizer()
    #BOW features
    X_bow = vect_bow.fit_transform(Counter(tokenize_sent(sent, extra_stop)) for sent in sentence_objects)
    #keyword features
    vect_keywords = DictVectorizer()
    keywords = get_all_keywords(articles)
    X_keyword = vect_keywords.fit_transform(make_keyword_dict(sent, keywords) for sent in sentence_objects)
    print "sentiment analysis"
    #sentiment features:
    if sentiment:
        X_sentiment = [get_sentiment(sent["spacy_sent"], service) for sent in sentence_objects]

    #concatenate
    if sentiment:
        vects = hstack([X_bow, X_keyword, X_sentiment])
    else:
        vects = hstack([X_bow, X_keyword])

    #add vector to sentence_object
    num = 0
    for vect in vects.toarray():
        sentence_objects[num]["feature_vector"] = vect
        num += 1
    return vects
 def bigram_word_feats(self,words):
     n = 200
     bigram_finder = BigramCollocationFinder.from_words(words)
     bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n)
     vec = DictVectorizer()
     measurements = dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
     vec.fit_transform(measurements).toarray()
     return vec 
Example #11
def getYearFeature(frame):
    all = getAllData().Publication_year
    labels = all.unique().tolist()
    label_dict = dict(zip(labels, range(len(labels))))
    dv = DictVectorizer()
    dv.fit_transform(label_dict)
    list_of_dicts = [{year: label_dict[year]} for year in frame.Publication_year]
    return dv.transform(list_of_dicts)
Example #12
def get_feature_vectorizer (df, cols):
    """

    :return: DictVectorizer fitted to one-hot encode the given feature columns
    """

    feature_vect = DictVectorizer(sparse=True)
    feature_vect.fit_transform(df[cols].to_dict(orient='records'))
    return feature_vect
Example #13
def categoricalFeatures():
    from sklearn.feature_extraction import DictVectorizer
    onehot_encoder= DictVectorizer()
    instances=[
        {'city':'New York'},
        {'city':'San Francisco'},
        {'city': 'Chapel Hill'}
    ]
    print onehot_encoder.fit_transform(instances).toarray()
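For reference, DictVectorizer sorts its feature names alphabetically by default, so the snippet above should print something close to the following (a sketch of the expected output, not captured from an actual run):

# feature order: ['city=Chapel Hill', 'city=New York', 'city=San Francisco']
# [[ 0.  1.  0.]
#  [ 0.  0.  1.]
#  [ 1.  0.  0.]]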
Example #14
def getLanguageFeature(frame):
    all = getAllData().Language
    sub = frame.Language
    all, sub = checkAndReplaceNan(all, sub, unicode(-1))
    labels = all.unique().tolist()
    label_dict = dict(zip(labels, range(len(labels))))
    dv = DictVectorizer()
    dv.fit_transform(label_dict)
    list_of_dicts = [{lan: label_dict[unicode(lan)]} for lan in sub]
    return dv.transform(list_of_dicts)
class SearnModelCla(SearnModel_Legacy):
    def __init__(self, feature_extractor, cr_tags, base_learner_fact, beta_decay_fn=lambda b: b - 0.1, positive_val=1, sparse=True):
        super(SearnModelCla, self).__init__(feature_extractor=feature_extractor, cr_tags=cr_tags,
                                                        base_learner_fact=base_learner_fact,
                                                        beta_decay_fn=beta_decay_fn, positive_val=positive_val,
                                                        sparse=sparse)

    def train_parse_models(self, examples):
        models = {}
        self.current_parser_dict_vectorizer = DictVectorizer(sparse=True)
        xs = self.current_parser_dict_vectorizer.fit_transform(examples.xs)

        for action in PARSE_ACTIONS:

            # xgboost needs values in [0,1]
            ys = [1 if i > 0 else 0 for i in examples.get_labels_for(action)]
            weights = examples.get_weights_for(action)

            # the cost matrix has 4 cols - [fp,fn,tp,tn]
            # based on how we compute the costs, the fn cost is set for positive ground truth,
            # otherwise the fp cost is set; the remaining entries keep a small default value (0.05)

            lst_cost_mat = []
            for lbl, cost in zip(ys, weights):
                fp,fn,tp,tn = 0.05,0.05,0.05,0.05
                if lbl > 0:
                    fn = cost
                else:
                    fp = cost
                lst_cost_mat.append([fp,fn,tp,tn])
            cost_mat = np.asanyarray(lst_cost_mat, dtype=np.float)

            mdl = self.base_learner_fact()
            mdl.fit(xs, ys, cost_mat)

            models[action] = mdl

        self.current_parser_models = models
        self.parser_models.append(models)

    def train_crel_models(self, examples):

        self.current_crel_dict_vectorizer = DictVectorizer(sparse=True)

        xs = self.current_crel_dict_vectorizer.fit_transform(examples.xs)
        ys = examples.get_labels()

        # all costs are equal
        cost_mat = np.ones((len(ys),4),dtype=np.float)
        # Keep this simple as not weighted
        model = self.base_learner_fact()
        model.fit(xs, ys, cost_mat)

        self.current_crel_model = model
        self.crel_models.append(model)
def dynamic_cross_val_predict(estimator, fv, esa_feature_list, unigram_feature_list, dynamic_X, y=None, cv=None,
                              verbose=0, fit_params=None):


    print "dynamic predict cross val mit %s" % esa_feature_list + unigram_feature_list


    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)
    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))

    preds_blocks = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        #rebuild X at every step
        for i in range(0,len(fv)): #each i corresponds to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step] #points to esa_vec
            for feature in esa_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i]) #update the i-th feature dict with the ESA feature
            for feature in unigram_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i]) #update the i-th feature dict with the unigram feature


        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        preds_blocks.append(cross_validation._fit_and_predict(cross_validation.clone(estimator), X, y,
                                                      train, test, verbose,
                                                      fit_params))

        cross_val_step+=1

    preds = [p for p, _ in preds_blocks]
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not cross_validation._check_is_partition(locs, cross_validation._num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')
    inv_locs = np.empty(len(locs), dtype=int)
    inv_locs[locs] = np.arange(len(locs))

    # Check for sparse predictions
    if sp.issparse(preds[0]):
        preds = sp.vstack(preds, format=preds[0].format)
    else:
        preds = np.concatenate(preds)
    return preds[inv_locs]
def show_tree(clf):
    """
    Create the data.dot file
    :param clf:
    :return:
    """
    vec = DictVectorizer()
    vec.fit_transform(FEATURE_LIST)
    with open('data.dot', 'w') as f:
        f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)
    print("[+] 已成功创建 dot 文件")
Example #18
def make_sessions_features(data, df_sessions):
    # Drop row with nan values from the "user_id" column as they're useless
    df_sessions = df_sessions.dropna(subset=["user_id"])

    # print df_sessions

    # Frequency of devices - by user
    device_freq = df_sessions.groupby('user_id').device_type.value_counts()
    
    # Frequency of actions taken - by user
    action_freq = df_sessions.groupby('user_id').action.value_counts()

    # Total list of users
    users = data.id.values
    def feature_dict(df):
        f_dict = dict(list(df.groupby(level='user_id')))
        res = {}
        for k, v in f_dict.items():
            v.index = v.index.droplevel('user_id')
            res[k] = v.to_dict()
        return res

    # Make a dictionary with the frequencies { 'user_id' : {"IPhone": 2, "Windows": 1}}
    action_dict = feature_dict(action_freq)
    device_dict = feature_dict(device_freq)

    # Transform to a list of dictionaries
    action_rows = [action_dict.get(k, {}) for k in users]
    device_rows = [device_dict.get(k, {}) for k in users]

    device_transf = DictVectorizer()
    tf = device_transf.fit_transform(device_rows)

    action_transf = DictVectorizer()
    tf2 = action_transf.fit_transform(action_rows)

    # Concatenate the two datasets
    # Those are row vectors with the frequencies of both device and actions [0, 0, 0, 2, 0, 1, ...]
    features = sp.hstack([tf, tf2])

    # We create a dataframe with the new features and we write it to disk
    df_sess_features = pd.DataFrame(features.todense())
    
    df_sess_features['id'] = users

    #left joining data and sessions on user_id
    final = pd.merge(data, df_sess_features, how='left', left_on='id', right_on='id')
    final.ix[:, final.columns != 'age_bucket'].fillna(-1, inplace=True)

    # Using inplace because I have 8GB of RAM
    # final.ix[:, final.columns != 'age_bucket'] = final.ix[:, final.columns != 'age_bucket'].fillna(-1)

    final.drop(['id'], axis=1, inplace=True)
    return final
Example #19
class StructuredModel:
    def __init__(self, category, attribute_function):
        self.category = category
        self.feature_dict = DictVectorizer()
        self.label_dict = DictVectorizer()
        self.get_attributes = attribute_function

    def train(self, X, Y_all, Y_star, Y_lim, n_iter=10, alpha_sgd=0.1, 
             every_iter=None, adagrad=False, l1=0.0):
        logging.info('Converting into matrices')
        X = self.feature_dict.fit_transform(X)
        logging.info('X: %d x %d', *X.shape)
        Y_all = self.label_dict.fit_transform(Y_all)
        logging.info('Y_all: %d x %d', *Y_all.shape)
        Y_star = numpy.array(Y_star)
        logging.info('Y_star: %d', *Y_star.shape)
        Y_lim = numpy.array(Y_lim)
        logging.info('Y_lim: %d x %d', *Y_lim.shape)

        self.model = structlearn.StructuredClassifier(n_iter=n_iter, 
                alpha_sgd=alpha_sgd)
        if every_iter: # call every_iter with StructuredModel and not StructuredClassifier
            every_iter2 = lambda it, model: every_iter(it, self)
        else:
            every_iter2 = every_iter
        if adagrad and l1:
          logging.info('Using Adagrad and L1 regularization, lambda:{}'.format(l1))
        self.model.fit(X, Y_all, Y_star, Y_lim, every_iter=every_iter2, 
                       Adagrad=adagrad, l1_lambda=l1)

    def score_all(self, inflections, features):
        X = self.feature_dict.transform([features])
        Y_all = []
        for i, (tag, _) in enumerate(inflections):
            label = {attr: 1 for attr in self.get_attributes(self.category, tag)}
            Y_all.append(label)
        Y_all = self.label_dict.transform(Y_all)

        scores = self.model.predict_log_proba(X, Y_all)
        return [(score, tag, inflection) for score, (tag, inflection)
                in zip(scores, inflections)]

    @property
    def output_features(self):
        for label in self.label_dict.get_feature_names():
            yield label

    def weights(self, label):
        j = self.label_dict.feature_names_.index(label)
        for i, feature in enumerate(self.feature_dict.get_feature_names()):
            yield feature, self.model.weights[i, j]
        for k, other_label in enumerate(self.label_dict.get_feature_names()):
            yield other_label, self.model.y_weights[j, k]
Example #20
    def reverse_transformation(self,bow_dict):
        """
        Reverse the transformation of a dictionary representation of BOW into numpy vectors

        :return:
        """
        assert isinstance(bow_dict,BaseDict) or isinstance(bow_dict,dict)

        vec=DictVectorizer()
        vec.fit_transform(bow_dict)

        return vec
def getTermStatistics(all_hits, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = Elasticsearch('http://localhost:9200/')

    stats = []
    docs = []

    ttf = {}
    for i in range(0, len(all_hits), 100):
        hits = all_hits[i:i+100]

        term_res = es.mtermvectors(index=es_index,
                                   doc_type=es_doc_type,
                                   term_statistics=True, 
                                   fields=['text'], 
                                   ids=hits)

        #pprint.pprint(term_res['docs'])

        for doc in term_res['docs']:
            #pprint.pprint(doc)
            if doc.get('term_vectors'):
                if 'text' in doc['term_vectors']:
                    docs.append(doc['_id'])
                    res = terms_from_es_json(doc)
                    stats.append(res)
                    for k in res.keys():
                        ttf[k] = res[k]['ttf']
            #else:
             #   pprint.pprint(doc)
        #pprint.pprint(tfidfs)
    
    tfidfs = []
    for stat in stats:
        tfidf={}
        for k in stat.keys():
            tfidf[k] =stat[k]['tfidf']
        tfidfs.append(tfidf)

    tfs = []
    for stat in stats:
        tf={}
        for k in stat.keys():
            tf[k] =stat[k]['tf']
        tfs.append(tf)
    
    v_tfidf = DictVectorizer()
    v_tf = DictVectorizer()
    
    result = [v_tfidf.fit_transform(tfidfs), v_tf.fit_transform(tfs), ttf, v_tfidf.get_feature_names()]

    return result
Example #22
def vector(train,test):
    
    X_train = train.ix[0:,1:]
    y_train = train.Hazard
    
    vec = DictVectorizer()
    X_train = X_train.T.to_dict().values()
    X_train = vec.fit_transform(X_train)
    
    X_test = test.T.to_dict().values()
    # transform (not fit_transform) so the test matrix uses the training feature columns
    X_test = vec.transform(X_test)
    
    return X_train,y_train,X_test
Example #23
class myCityModel(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
    def __init__(self):
        self.v = DictVectorizer(sparse=False)

    def fit(self, X, y=None):
        city = [{'city':x['city']} for x in X] 
        self.v.fit_transform(city)    
        return self
    
    def transform(self,X):
        city = [{'city':x['city']} for x in X] 
        retval = self.v.transform(city)
        return retval
def get_mushroom_data():
    with open(mushroom_file, 'r') as dest_f:
        data_iter = csv.reader(dest_f, delimiter=",")

        # Prepare the data as dictionaries. It's not very neat, but it works.
        data = [
            (
                {row[0]: 1},
                {
                    feature_index: category
                    for feature_index, category in enumerate(row[1:])
                }
            ) for row in data_iter
        ]

    # It's always good to randomize the order
    random.shuffle(data)

    # Get the labels and the features as lists
    labels_, features_ = zip(*data)

    feature_vectorizer = DictVectorizer(sparse=False, dtype=np.uint8)

    label_vectorizer = DictVectorizer(sparse=False, dtype=np.uint8)

    # Create features and labels as numpy arrays with one hot encoding
    features = feature_vectorizer.fit_transform(features_)
    labels = label_vectorizer.fit_transform(labels_)

    # Print the meaning of the one-hot encoded features
    # for i, f in enumerate(feature_vectorizer.get_feature_names()):
    #     print(f)
    #
    # for i, f in enumerate(label_vectorizer.get_feature_names()):
    #     print(f)

    # Split into train and test.
    split_at = int(len(data) * 0.7)
    X_training, X_test = features[:split_at, :], features[split_at:, :]
    y_training, y_test = labels[:split_at, :], labels[split_at:, :]

    train = IndexableDataset({
        'features': X_training.astype(np.uint8),
        'targets': y_training.astype(np.uint8)
    })
    test = IndexableDataset({
        'features': X_test.astype(np.uint8),
        'targets': y_test.astype(np.uint8)
    })

    return train, test
def get_numerical_data(data_file):
	sf_df_data = pd.read_csv(data_file)
	# sf_df_test = pd.read_csv(test_file)

	###### Change Date to Month and Year #########
	sf_df_data["Dates"] = pd.to_datetime(sf_df_data["Dates"])
	sf_df_data["Year"],sf_df_data["Month"] = sf_df_data['Dates'].apply(lambda x: str(x.year)), sf_df_data['Dates'].apply(lambda x: str(x.month))

	# sf_df_test["Dates"] = pd.to_datetime(sf_df_test["Dates"])
	# sf_df_test["Year"],sf_df_test["Month"] = sf_df_test['Dates'].apply(lambda x: str(x.year)), sf_df_test['Dates'].apply(lambda x: str(x.month))

	print len(pd.unique(sf_df_data['Category'].values.ravel()).tolist())
	print pd.unique(sf_df_data['Category'].values.ravel()).tolist()

	######## To deal with categorical variables, we can make use of Pandas and DictVectorizer ###########
	cat_cols = ['Year','DayOfWeek','PdDistrict']
	num_cols = ['X','Y']

	num_data_X = sf_df_data[num_cols].as_matrix()
	# num_test_X = sf_df_test[num_cols].as_matrix()

	max_data = np.amax(abs(num_data_X),0)
	# max_test = np.amax(abs(num_test_X),0)   ### Normalising data

	num_data_X = num_data_X/max_data
	# num_test_X = num_test_X/max_test	

	cat_df_data_X = sf_df_data[cat_cols]
	cat_df_data_Y = sf_df_data[['Category']]
	cat_dict_data_X = cat_df_data_X.T.to_dict().values() # A list of dictionaries.
	cat_dict_data_Y = cat_df_data_Y.T.to_dict().values()

	# cat_df_test_X = sf_df_test[cat_cols]
	#cat_df_test_Y = sf_df_test[['Category']]
	# cat_dict_test_X = cat_df_test_X.T.to_dict().values()
	#cat_dict_test_Y = cat_df_test_Y.T.to_dict().values()

	vectorizer = DV(sparse=False)
	vec_data_X = vectorizer.fit_transform(cat_dict_data_X)
	data_Y = vectorizer.fit_transform(cat_dict_data_Y)
	# vec_test_X = vectorizer.fit_transform(cat_dict_test_X)
	#vec_test_Y = vectorizer.fit_transform(cat_dict_test_Y)

	# data_X = np.hstack((vec_data_X,num_data_X))   ##### remove the lat. and long. from the input data.
	# test_X = np.hstack((vec_test_X,num_test_X))

	data_X = vec_data_X

	print 'Done converting categorical data'
	return (data_X,data_Y)
class Q4Transformer(base.BaseEstimator, base.TransformerMixin):
    '''
        class variable: self.col; self.vectorizer
    '''
    def __init__(self):      
        self.col = 'attributes' # initialize the column name
   
    # flatten out dics of dicts
    def flatten_dict(self, Xdict):
        p_dict = Xdict.copy()
        for key in p_dict.keys():
            #print key, p_dict[key], type(p_dict[key])
            if type(p_dict[key]) == dict: 
                # son is a dict, flatten
                son_dict = self.flatten_dict(p_dict[key]).copy()
                for son_key in son_dict.keys():
                    son_dict[key+'_'+son_key] = son_dict.pop(son_key)
                del p_dict[key]
                p_dict.update(son_dict)
            
            elif type(p_dict[key]) in [unicode,str]:
                # son is a string, concatenate it to the key
                son_str = p_dict[key]
                p_dict[key] = 1
                p_dict[key+'_'+son_str] = p_dict.pop(key)
            
            elif type(p_dict[key]) not in [bool, int, float]:    
                raise ValueError("type error in flatten_dict!")
        return p_dict


    def fit(self, X, y=None):
        # flatten the train dict
        attr_train = [self.flatten_dict(record[self.col]) for record in X]
        
        # transform the training records
        self.vectorizer = DictVectorizer(sparse=False)
        self.vectorizer.fit_transform(attr_train)
        return self

    def transform(self, X):
        # transform the test record
        if type(X) is list:
            attr_X = [self.flatten_dict(record[self.col]) for record in X]
        else:
            attr_X = self.flatten_dict(X[self.col])
        X_trans = self.vectorizer.transform(attr_X)
        return X_trans 
Example #27
  def export(self, query, n_topics, n_words, title="PCA Export", fname="PCAExport"):
    vec = DictVectorizer()
    
    rows = topics_to_vectorspace(self.model, n_topics, n_words)
    X = vec.fit_transform(rows)
    pca = skPCA(n_components=2)
    X_pca = pca.fit(X.toarray()).transform(X.toarray())
    
    match = []
    for i in range(n_topics):
      topic = [t[1] for t in self.model.show_topic(i, len(self.dictionary.keys()))]
      for word in topic:
        if word in query:
          match.append(word)
          break
      else:
        match.append('')  # keep match aligned with topic indices when no query word is found

    pyplot.figure()
    for i in range(X_pca.shape[0]):
      pyplot.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5)
      pyplot.text(X_pca[i, 0], X_pca[i, 1], s=' '.join([str(i), match[i]]))  
     
    pyplot.title(title)
    pyplot.savefig(fname)
     
    pyplot.close()
Example #28
def get_vector(name, feature_names, full_vector):
    """
    Returns a complete feature vector
    """
    name_features = {}
    name_features["last_letter"] = name[-1]
    name_features["last_two"] = name[-2:]
    name_features["last_is_vowel"] = 0 if name[-1] in "aeiouy" else 0

    vectorizer = DictVectorizer()
    small_vector = vectorizer.fit_transform(name_features).toarray()[0]
    small_feature_names = vectorizer.get_feature_names()

    hit_count = 0
    for index, feature_name in enumerate(feature_names):
        if feature_name in small_feature_names:
            full_vector[index] = small_vector[small_feature_names.index(feature_name)]
            hit_count += 1
        else:
            full_vector[index] = 0

    assert hit_count == len(small_feature_names) == small_vector.shape[0]
    assert full_vector.shape[0] == len(feature_names)

    return full_vector
def pair_vectors(pairs, features, words, output_path):
    vectorizer = DictVectorizer()
    vectors = vectorizer.fit_transform(x[1] for x in features)

    vector_map = {word:vector for word, vector in
                  itertools.izip((x[0].split('/')[0] for x in features),
                                 vectors)}

    # Positive examples
    positive = []
    record = []
    for specific, general in pairs:
        positive.append(vector_map[general] - vector_map[specific])
        record.append( (specific, general, 1) )

    pair_set = set([tuple(x) for x in pairs])
    non_positive = []
    for i in range(len(positive)):
        first = second = None
        while first == second or (first, second) in pair_set:
            first = words[random.randint(len(words))]
            second = words[random.randint(len(words))]
        non_positive.append(vector_map[second] - vector_map[first])
        record.append( (first, second, 0) )
    
    data = vstack(positive + non_positive)
    target = [1]*len(positive) + [0]*len(non_positive)
    
    # Save dataset
    with open(os.path.join(output_path,'wn-noun-dependencies.mat'), 'wb') as data_file:
        dump_svmlight_file(data, target, data_file)

    with open(os.path.join(output_path,'wn-noun-dependencies.json'), 'w') as record_file:
        json.dump(record, record_file)
Example #30
def extractData(features, examples=None, scaler=None, featureOrder=None, scaling=False):
    vec = DictVectorizer()
    samples = vec.fit_transform(features)
    featureNames = vec.get_feature_names()
    
    if (featureOrder != None):
        indices = [featureNames.index(feature) for feature in featureOrder]
        samples = samples[:, indices]
    imp = pp.Imputer(missing_values='NaN', strategy='mean')
    if (examples == None):
        imp.fit(samples)
    else :
        imp.fit(examples)
    impSamples = imp.transform(samples)
    if (impSamples.shape == samples.shape):
        samples = impSamples
    else:
        print("too few samples to replace missing values, using 0's")
        samples[shouldReplace(samples)]=0
    
#     if (scaler == None):
#         scaler = pp.StandardScaler(with_mean=False)
#         scaler.fit(samples)
#     samples = scaler.transform(samples)
    if (scaling):
        samples = pp.scale(samples,with_mean=False)
    if (sprs.isspmatrix(samples)):
        samples = samples.todense()
    
    return [samples, featureNames,imp,scaler]
Example #31
# Convert the data into the training-set format
feature_list = []
label_list = []
for row in reader:
    label_list.append((row[-1]))
    row_dict = {}
    for i in range(1, len(row) - 1):
        row_dict[head[i]] = row[i]

    feature_list.append(row_dict)
print(feature_list)
# print(label_list)

# Then convert to a numeric feature set; e.g. if age has three categories they are encoded with three bits: 001, 010, 100
vec = DictVectorizer()
dummyX = vec.fit_transform(feature_list).toarray()

print("dummyX: " + str(dummyX))
print(vec.get_feature_names())
lab = preprocessing.LabelBinarizer()
dummyY = lab.fit_transform(label_list)
# le  = preprocessing.LabelEncoder()
# le.fit_transform() # maps a column with n categories to the integers 0..n-1
print("dummyY: " + str(dummyY))

# Prepare for training and fit the model
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("ctf: " + str(clf))

# Visualize model
Example #32
tf_idf = TfidfVectorizer(min_df=5)

train_full_descr_transformed = tf_idf.fit_transform(
    train_data['FullDescription'].values.astype('U'), y=None)
test_full_descr_transformed = tf_idf.transform(
    test_data['FullDescription'].values.astype('U'))

train_data['LocationNormalized'].fillna('nan', inplace=True)
train_data['ContractTime'].fillna('nan', inplace=True)

from sklearn.feature_extraction import DictVectorizer

enc = DictVectorizer()

X_train_categ = enc.fit_transform(
    train_data[['LocationNormalized', 'ContractTime']].to_dict('records'))
X_test_categ = enc.transform(test_data[['LocationNormalized',
                                        'ContractTime']].to_dict('records'))
"""
print ('X_train_categ size: ', X_train_categ.size, '\n')
print ('X_test_categ size: ', X_test_categ.size, '\n')
print ('test_data[[LocationNormalized, ContractTime]: ', test_data[['LocationNormalized', 'ContractTime']], '\n')
print ('X_train_categ: ', X_train_categ, '\n')
print ('train_full_descr_transformed size: ', train_full_descr_transformed.size, '\n')
print ('train_data[LocationNormalized] size: ', train_data['LocationNormalized'].size, '\n')
"""

from scipy.sparse import hstack

transformed_data = hstack(
    [train_full_descr_transformed,
class BotView:
    """
    This class starts with a temporal bot cluster. If the cluster has more than a minimum number of edges connecting
    its members, it examines the follower sets and finds common followers who are highly connected to many of the
    cluster members. It then gets the followers of these highly connected followers and repeats the process until
    enough steps have been taken or no more followers satisfy the criterion.
    """

    def __init__(self, tokens_ar, users, file_path):
        """
        Initializes the data structures, connects to the PostgreSQL database, and sets up the Twitter API connection
        :param tokens_ar: array of tokens for Twitter API
        :param users: seed cluster of users for this object
        :return:
        """
        # Clique is the current set of all highly-connected nodes at all levels
        self.clique = set()
        # to_check is the nodes that we need to find followers for
        self.to_check = set()
        # got_followers is the set of users whose followers have already been fetched
        self.got_followers = set()
        self.current_level_users = []
        # followers is the set of all followers for all nodes in to_check. Resets every round. WHY??
        self.followers = []
        self.user_info = {}
        self.users = set()
        self.current_level_timelines = {}
        self.ignore_users = set()
        self.current_filepath = file_path
        self.stream_filepath = '/home/amanda/bigDisk/Twitter/Debot2/stream'
        self.user_features = {}
        self.features_list = []
        self.vec = DictVectorizer()

        try:
            load_dotenv('/home/amanda/bigDisk/Twitter/creds/.env')
            username = os.getenv('DATABASE_USER')
            password = os.getenv('DATABASE_PASSWORD')
            conn_string = "dbname='twitter' user="******" password = "******"Don't have id for: " + user
                self.clique.add((user, self.level))
                self.to_check.add(user)
        self.n = float(len(self.clique))
        self.original_n = self.n

    def explore(self):
        """
        Pops users from to_check and collects their followers into self.followers
        :return:
        """
        i = 0
        # Need to reset followers and user_followers for this round
        self.followers = []
        self.current_level_users = []
        while self.to_check:
            user = self.to_check.pop()
            # if we haven't already found the followers for this user
            if user not in self.got_followers and user not in self.ignore_users:
                self.cur.execute("SELECT followers FROM name_id WHERE user_id = %s;", (str(user),))
                f = self.cur.fetchone()
                # If we have queried this user in the past it will be in the db, so we don't have to waste a query on it
                if f:
                    if f[0]:
                        if f[0] == '[]':
                            self.ignore_users.add(user)
                            continue
                        try:
                            followers = ast.literal_eval(f[0])
                        except ValueError:
                            self.ignore_users.add(user)
                            continue
                        self.got_followers.add(user)
                        self.followers.extend(followers)
                        continue
                # Otherwise we query the Twitter API for this user's followers
                self.cur.execute('SELECT deleted, suspended, other_error FROM followers WHERE user_id = %s;', (str(user),))
                f = self.cur.fetchone()
                if f:
                    if f[0] or f[1] or f[2]:
                        self.ignore_users.add(user)
                        continue
                self.query_api(user)

    def query_api(self, user):
        """
        Query Twitter API for the followers of a given user. Add this entry to user_followers, add to followers, and
        add to database
        :param user: The user of interest
        :return:
        """
        try:
            followers = self.api.followers_ids(user)
            self.got_followers.add(user)
            self.followers.extend(followers)
            self.cur.execute('SELECT * FROM name_id WHERE user_id = %s;', (str(user),))
            f = self.cur.fetchone()
            if f:
                self.cur.execute('UPDATE name_id SET followers = (%s) WHERE user_id = %s', (str(followers), str(user)))
            else:
                self.cur.execute('INSERT INTO name_id (user_id, followers) VALUES (%s, %s);', (str(user), str(followers)))
            self.con.commit()
            #print "Added followers for " + str(user)
        except tweepy.TweepError:
            traceback.print_exc()
            print '>>>>>>>>>>>>>>> exception: ' + str(user)
            self.ignore_users.add(user)
            self.cur.execute('SELECT * FROM name_id WHERE user_id = %s;', (str(user),))
            f = self.cur.fetchone()
            if f:
                self.cur.execute('UPDATE name_id SET followers = (%s) WHERE user_id = %s', ('[]', str(user)))
            else:
                self.cur.execute('INSERT INTO name_id (user_id, followers) VALUES (%s, %s);', (str(user), '[]'))
            self.con.commit()

    def find_bots(self, priors):
        print "Getting all user info..."
        self.users_to_query = set()
        followers_set = set(self.followers)
        print "Number of followers: " + str(len(self.followers))
        follower_counts = Counter(self.followers).most_common()
        # should fix this to be a more precise measure
        size_to_keep = int(.15*len(self.followers))
        connectedness_threshold = floor(0.3*self.n)
        tmp_followers = [f[0] for f in follower_counts if f[1] >= connectedness_threshold]
        if len(tmp_followers) < size_to_keep:
            tmp_followers.extend([f[0] for f in follower_counts[:size_to_keep] if f[1] > 1])
        followers_set = set(tmp_followers)
        print "Number of connected followers: " + str(len(followers_set))
        for follower in followers_set:
            user_info = None
            follower = str(follower)
            if follower not in self.users and follower not in self.ignore_users:
                self.cur.execute('SELECT suspended, deleted, other_error, user_info_json FROM followers WHERE user_id = %s', (follower,))
                record = self.cur.fetchone()
                if record:
                    if record[0] or record[1] or record[2]:
                        self.ignore_users.add(follower)
                        # print "User is suspended or deleted"
                        continue
                    if record[3]:
                        # print "Already have profile information for user number " + follower
                        self.user_info[follower] = ast.literal_eval(record[3])
                        continue
                self.users_to_query.add(follower)
        get_user_info(self)
        print "Getting all timeline info and extracting features"
        for follower in followers_set:
            timeline = None
            follower = str(follower)
            if follower not in self.users and follower not in self.ignore_users:
                self.users.add(follower)
                self.cur.execute('SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s', (follower,))
                record = self.cur.fetchone()
                if record:
                    if record[0] or record[1] or record[2]:
                        self.ignore_users.add(follower)
                        # print "User is suspended or deleted"
                        continue
                    if record[3]:
                        # print "Already have timeline information for user number " + follower
                        # Have to read in file to get timeline info
                        timeline = get_timeline_from_file(self, follower)
                    else:
                        timeline = get_user_timeline(self, follower)
                else:
                    timeline = get_user_timeline(self, follower)
                if timeline and self.user_info.get(follower) and len(timeline) > 50:
                    gf = GetFeatures(follower, self.user_info[follower], timeline)
                    try:
                        gf.user_features()
                        gf.collect_tweets()
                        gf.content_features()
                        gf.temporal_features()
                    except Exception as e:
                        print "ERROR GETTING FEATURES"
                        print e
                        print follower
                        print self.user_info[follower]
                    # need to incorporate other network features
                    #gf.features['num_shared_edges'] = follower_counts[user]
                    #cself.user_features[user] = gf.features
                    self.current_level_users.append(follower)
                    self.features_list.append(gf.features)
        # Axis=0 should be vertical
        len_priors = len(priors)
        current_features = priors
        current_features.extend(self.features_list)
        print "Performing anomaly detection"
        #json.dump(priors, open('test.json', 'w'), indent=4, separators=(',', ': '))
        X = self.vec.fit_transform(current_features).toarray()
        current_features = {}
        X_norm = normalize(X)
        #print np.any(np.isnan(X))
        #print np.all(np.isfinite(X))
        print X.shape
        # X = np.stack([current_features, priors], axis=0) Every round will find outliers, how do we stop exploring?
        clf = LocalOutlierFactor(n_neighbors=20)
        clf.fit(X)
        check_is_fitted(clf, ["threshold_", "negative_outlier_factor_", "n_neighbors_", "_distances_fit_X_"])
        if X is not None:
            X = check_array(X, accept_sparse='csr')
            y_pred = clf._decision_function(X)
        else:
            y_pred = clf.negative_outlier_factor_
        #y_pred = clf.fit_predict(X)
        y_pred_new = y_pred[len_priors:]
        # Do anomaly detection and set connected followers to certain outliers
        # this line is a stand-in
        users_scores = zip(self.current_level_users, y_pred_new)
        connected_followers = [u[0] for u in users_scores if u[1] <= clf.threshold_]
        #How do I add back in the outliers to the anomaly detection? Mueen said not to so I will leave for now
        self.level += 1
        # Add highly connected followers to the clique and to_check
        for follower in connected_followers:
            self.clique.add((follower, self.level))
            self.to_check.add(follower)
        print self.clique
        self.n = float(len(self.clique))
        print "Current size of cluster: " + str(self.n)
Example #34
df >>= (
    rename(action_type='action', counts='count') >> arrange('minute') >>
    group_by('country', 'product', 'site', 'action_type') >> mutate(
        **
        {'counts_t-%02.f' % i: X.counts.shift(i)
         for i in range(1, time_step)}) >> mutate(**{
             'counts_t+%02.f' % i: X.counts.shift(-i)
             for i in range(1, time_step + 1)
         }) >> r(X.dropna()))

#########################################################################################################
######## The code below is for production use (train the model on all data up to the current point in time) ########
########################################### MLP #######################################################
df_feature = df.iloc[:, :5]
vec = DictVectorizer(sparse=False)
data_feature = vec.fit_transform(df_feature.to_dict('records'))
counts_index = int(where(pd.Series(vec.get_feature_names()) == 'counts'))

df_x = df.iloc[:, 4:5 + time_step] >> drop('minute')
df_x = df_x[df_x.columns.sort_values(ascending=False)]
df_y = df.iloc[:, 5 + time_step:]
data_x = df_x.as_matrix().reshape(df_x.shape[0], df_x.shape[1], 1)
data_y = df_y.as_matrix().reshape(df_y.shape[0], df_y.shape[1], 1)
data_feature = data_feature.reshape(data_feature.shape[0], 1,
                                    data_feature.shape[1])
data_feature = np.concatenate([data_feature for i in range(10)], axis=1)
data_feature[:, :, counts_index] = data_x[:, :, 0]

train_x = torch.from_numpy(data_feature).float()
train_y = torch.from_numpy(data_y).float()
Example #35
class SklearnClassifier(ClassifierI):
    """Wrapper for scikit-learn classifiers."""

    def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

    def __repr__(self):
        return "<SklearnClassifier(%r)>" % self._clf

    def classify_many(self, featuresets):
        """Classify a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :return: The predicted class label for each input sample.
        :rtype: list
        """
        X = self._vectorizer.transform(featuresets)
        classes = self._encoder.classes_
        return [classes[i] for i in self._clf.predict(X)]

    def prob_classify_many(self, featuresets):
        """Compute per-class probabilities for a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :rtype: list of ``ProbDistI``
        """
        X = self._vectorizer.transform(featuresets)
        y_proba_list = self._clf.predict_proba(X)
        return [self._make_probdist(y_proba) for y_proba in y_proba_list]

    def labels(self):
        """The class labels used by this classifier.

        :rtype: list
        """
        return list(self._encoder.classes_)

    def train(self, labeled_featuresets):
        """
        Train (fit) the scikit-learn estimator.

        :param labeled_featuresets: A list of ``(featureset, label)``
            where each ``featureset`` is a dict mapping strings to either
            numbers, booleans or strings.
        """

        X, y = list(zip(*labeled_featuresets))
        X = self._vectorizer.fit_transform(X)
        y = self._encoder.fit_transform(y)
        self._clf.fit(X, y)

        return self

    def _make_probdist(self, y_proba):
        classes = self._encoder.classes_
        return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})
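For orientation, a minimal usage sketch of the wrapper above; the toy featuresets and the choice of LogisticRegression are illustrative assumptions, not taken from the original code.

from sklearn.linear_model import LogisticRegression

# Wrap any scikit-learn estimator; featuresets are plain dicts of feature -> value.
train_set = [
    ({"last_letter": "a", "length": 5}, "female"),
    ({"last_letter": "k", "length": 4}, "male"),
    ({"last_letter": "e", "length": 6}, "female"),
    ({"last_letter": "o", "length": 3}, "male"),
]
classif = SklearnClassifier(LogisticRegression()).train(train_set)
print(classif.classify_many([{"last_letter": "a", "length": 4}]))  # e.g. ['female']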
Example #36
for post in posts:
    for comment in post['comments']:  # get the user comments on each Gossiping-board post
        l = comment['content'].strip()  # strip leading/trailing characters such as newlines
        if l and comment['score'] != 0:
            d = defaultdict(int)
            for w in jieba.cut(l):  # w is each token obtained by word-segmenting l
                d[w] += 1
            if len(d) > 0:
                c_scores.append(
                    1 if comment['score'] > 0 else 0)  # label for each comment (upvote/downvote)
                c_words.append(d)

# convert to vectors
c_dvec = DictVectorizer()
c_tfidf = TfidfTransformer()
c_vector = c_dvec.fit_transform(c_words)
c_X = c_tfidf.fit_transform(c_vector)  # convert the token-count matrix of all comments from the 1,000 posts into tf-idf vectors

# build and train the classifier
c_svc = LinearSVC()
c_svc.fit(c_X, c_scores)


# classify the sentiment of a comment
def comment_sentiment_classifier(model, dvec, tfidf, text):
    l = text.strip()  # strip leading/trailing characters such as newlines
    d = defaultdict(int)

    for w in jieba.cut(l):  # w is each token obtained by word-segmenting l
        d[w] += 1
testdata = pd.read_csv(this_folder + "/insurance-test.csv")

X_train = traindata[[
    'Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel',
    'Vintage'
]]
Y_train = traindata['Response']
X_test = testdata[[
    'Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel',
    'Vintage'
]]

vec = DictVectorizer()
X_train = vec.fit_transform(X_train.to_dict(orient="records"))
X_test = vec.transform(X_test.to_dict(orient="records"))

gnb = GaussianNB()
gnb.fit(X_train.toarray(), Y_train)  # input X,y for training
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

Y_test1 = gnb.predict(X_test.toarray())
output = pd.DataFrame({'id': testdata['id'], 'Response': Y_test1})
output.to_csv('Bayes_gnb.csv', index=False)

Y_test2 = mnb.predict(X_test.toarray())
output = pd.DataFrame({'id': testdata['id'], 'Response': Y_test2})
output.to_csv('Bayes_mnb.csv', index=False)
Example #38
    pData.feature_dicts[d - 1] for d in combined_partitions_ids[0]
]
test_dicts = pData.feature_dicts = [
    pData.feature_dicts[d - 1] for d in combined_partitions_ids[1]
]
train_dicts = pData.feature_dicts = [
    pData.feature_dicts[d - 1] for d in combined_partitions_ids[2]
]

binary_genres = combined_partitions_binary_genres[0]
## Begin function block

#convert to tf-idf model
tfidf = TfidfTransformer()
vec = DictVectorizer()
vect = vec.fit_transform(feature_select_dicts)
adjusted = tfidf.fit_transform(vect)

term_indices = list(vec.vocabulary_.items())
#alphabetical order
term_indices.sort(key=operator.itemgetter(1))

term_list = [i[0] for i in term_indices]
data = adjusted.toarray()

p_tuples = []

for column in data.T:
    p, c = spearmanr(column, binary_genres)
    f_tuple = (p, c)
    p_tuples.append(f_tuple)
def byte_ngram(files_list, addrlength=32, n=1):
    dicts_list = []
    total_files = len(files_list)
    bad_files_names = []
    for idx, file_name in enumerate(files_list):
        bytes_file = DATASET_DIR + file_name + '.bytes.gz'
        try:
            with gzip.open(bytes_file, 'rt') as fp:
                bytedict = {}
                hex_seq = ""
                for line in fp.readlines():
                    if not line.strip():
                        continue
                    else:
                        address = int(addrlength / 4)  # address width in hex characters (4 bits per hex digit)
                        # ensure that addresses values will not be counted
                        # in the ngram calculation
                        hex_seq = hex_seq + line[address:].strip()

                hex_seq = hex_seq.replace(" ", "")
                for i in range(0, len(hex_seq) - 1, 2):
                    # ignore bytes that contain the "?" character
                    if hex_seq[i] == "?" or hex_seq[i + 1] == "?":
                        continue
                    if 2 * n + i > len(hex_seq):
                        break

                    gram = hex_seq[i:(2 * n + i)]
                    if gram not in bytedict.keys():
                        bytedict[gram] = 1
                    else:
                        bytedict[gram] += 1

                dicts_list.append(bytedict)
        except Exception as e:
            bad_files_names.append(file_name)
            log_exception(e, sys.argv[0], bytes_file)

        # progress bars always save my sanity
        progress_bar(idx + 1, total_files, 50)

    # log the corrupted files for future reference
    if len(bad_files_names) > 0:
        with open('bad_bytes_files.txt', 'w') as bfp:
            for name in bad_files_names:
                bfp.write(name + '.bytes\n')

    # convert list of dictionaries to a byte ngram count numpy array
    vec = DictVectorizer()
    ngram_freq = vec.fit_transform(dicts_list).toarray()
    ngram_freq_df = pd.DataFrame(ngram_freq, columns=vec.get_feature_names())
    # store frequency of each byte ngram
    ngram_freq_df.to_csv('features/' + str(n) + 'gram_byte_freq.csv')
    save_obj(ngram_freq_df, str(n) + 'gram_byte_freq')

    # transform ngram frequency array to ngram tfidf array
    transformer = TfidfTransformer(smooth_idf=False)
    ngram_tfidf = transformer.fit_transform(ngram_freq)
    # store tfidf of each byte ngram
    ngram_tfidf_df = pd.DataFrame(ngram_tfidf.todense(),
                                  columns=vec.get_feature_names())
    ngram_tfidf_df.to_csv('features/' + str(n) + 'gram_byte_tfidf.csv')
    save_obj(ngram_tfidf_df, str(n) + 'gram_byte_tfidf')
    return ngram_tfidf_df
def ReadProcTrainData(features, labels):
    train_TXT = []
    train_data = read_data(train, train_TXT)
    #test = read_data(test)

    sample_sentence = ('Leon is an East Village gem: casual but hip, with well prepared '
                       'basic French bistro fare, good specials, a warm and lively atmosphere.')
    print train_data[sample_sentence]
    print train_data[sample_sentence][1]
    print train_data[sample_sentence][1][0]
    print train_data[sample_sentence][1][1]
    print train_data[sample_sentence][1][2]
    print train_data[sample_sentence][1][3]
    ############ split data before proceeding ###########

    #train_TXT = train_TXT[:100]
    ##########################################   GLOBAL ##########################################

    #############################################################################################################################################

    featuredicts = []

    ##################################### LOCAL and APPEND #######################################################
    l = len(train_TXT)

    pl = negl = nutl = el = 0
    prev_pol = ""
    prev_idn = ""
    pol_arr = []

    for i in range(l):
        dlen = len(train_data[train_TXT[i]])

        for k in range(dlen):

            #global_features.append(txt_corp_feats[i])

            sentence = train_TXT[i]
            target = train_data[train_TXT[i]][k][0]
            cat = train_data[train_TXT[i]][k][1]
            pol = train_data[train_TXT[i]][k][2]
            frm = train_data[train_TXT[i]][k][3]
            to = train_data[train_TXT[i]][k][4]
            idn = train_data[train_TXT[i]][k][5]

            if idn == prev_idn:
                pol_arr.append(prev_pol)
            else:
                pol_arr = []

            if pol == 'positive' or pol == 'negative' or pol == "neutral":
                t = TextClass(sentence, target, cat, pol, frm, to, idn,
                              prev_pol, prev_idn, pol_arr)

                featuredicts.append(t.baselinefeatures())
                if t.pol == 'positive':
                    labels.append(1)
                    pl += 1
                elif t.pol == 'negative':
                    labels.append(0)
                    negl += 1
                elif t.pol == "neutral":
                    labels.append(2)
                    nutl += 1
                else:
                    el += 1

                prev_pol = t.pol
            prev_idn = idn

    print "counts ", pl, negl, nutl, el
    #sys.exit(0)

    vec = DictVectorizer()

    local_features = vec.fit_transform(featuredicts).toarray()

    global GLOBAL_VEC
    GLOBAL_VEC = vec
    #### append local and global features
    '''
    features = []
    for i in range(len(global_features)):
        features.append(np.concatenate((global_features[i], local_features[i])))
    
    features = np.asarray(features)
    '''
    #print local_features
    print
    features.append(local_features)
    labels = np.array(labels)
from sklearn.feature_extraction import DictVectorizer
'''One-hot encoding'''
onehot_encoder = DictVectorizer()
X = [{'city': 'New York'}, {'city': 'San Francisco'}, {'city': 'Chapel Hill'}]

print(onehot_encoder.fit_transform(X).toarray())
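# For reference: DictVectorizer sorts feature names alphabetically by default, so
# the columns are 'city=Chapel Hill', 'city=New York', 'city=San Francisco' and the
# three rows above map New York -> [0, 1, 0], San Francisco -> [0, 0, 1],
# Chapel Hill -> [1, 0, 0].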
'''Feature standardization'''
# equivalent to StandardScaler
from sklearn import preprocessing
import numpy as np
X = np.array([[0., 0., 5., 13., 9., 1.], [0., 0., 13., 15., 10., 15.],
              [0., 3., 15., 2., 0., 11.]])
print(preprocessing.scale(X))

# handles outliers better
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)
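# A small illustrative sketch (not part of the original): with an outlier present,
# StandardScaler's mean and std are inflated by it and the inliers get squeezed
# together, while RobustScaler centers on the median and scales by the IQR, so
# the inliers keep a sensible spread.
X_outlier = np.array([[1.], [2.], [3.], [4.], [100.]])
print(preprocessing.StandardScaler().fit_transform(X_outlier).ravel())
print(RobustScaler().fit_transform(X_outlier).ravel())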
def train(data, classifier_file):  # do not change the heading of the function
    data_list = data
    model_x = []
    model_y = []
    vo_list = {
        'IH', 'UW', 'OY', 'AH', 'ER', 'EY', 'AO', 'AW', 'AY', 'EH', 'AE', 'UH',
        'IY', 'AA', 'OW'
    }
    co_list = {
        'W', 'K', 'HH', 'G', 'JH', 'Z', 'Y', 'N', 'V', 'SH', 'L', 'NG', 'S',
        'CH', 'R', 'D', 'B', 'TH', 'F', 'DH', 'T', 'P', 'M', 'ZH'
    }
    strong_suffixes = {
        'al', 'ance', 'ancy', 'ant', 'ard', 'ary', 'àte', 'auto', 'ence',
        'ency', 'ent', 'ery', 'est', 'ial', 'ian', 'iana', 'en', 'ésce', 'ic',
        'ify', 'ine', 'ion', 'tion', 'ity', 'ive', 'ory', 'ous', 'ual', 'ure',
        'wide', 'y', 'se', 'ade', 'e', 'ee', 'een', 'eer', 'ese', 'esque',
        'ette', 'eur', 'ier', 'oon', 'que'
    }

    strong_prefixes = {
        'ad', 'co', 'con', 'counter', 'de', 'di', 'dis', 'e', 'en', 'ex', 'in',
        'mid', 'ob', 'para', 'pre', 're', 'sub', 'a', 'be', 'with', 'for'
    }

    neutral_prefixes = {
        'down', 'fore', 'mis', 'over', 'out', 'un', 'under', 'up', 'anti',
        'bi', 'non', 'pro', 'tri', 'contra', 'counta', 'de', 'dis', 'extra',
        'inter', 'intro', 'multi', 'non', 'post', 'retro', 'super', 'trans',
        'ultra'
    }

    neutral_suffixes = {
        'able', 'age', 'al', 'ate', 'ed', 'en', 'er', 'est', 'ful', 'hood',
        'ible', 'ing', 'ile', 'ish', 'ism', 'ist', 'ize', 'less', 'like', 'ly',
        'man', 'ment', 'most', 'ness', 'old', 's', 'ship', 'some', 'th',
        'ward', 'wise', 'y'
    }

    suffixes = {
        'inal', 'ain', 'tion', 'sion', 'osis', 'oon', 'sce', 'que', 'ette',
        'eer', 'ee', 'aire', 'able', 'ible', 'acy', 'cy', 'ade', 'age', 'al',
        'al', 'ial', 'ical', 'an', 'ance', 'ence', 'ancy', 'ency', 'ant',
        'ent', 'ant', 'ent', 'ient', 'ar', 'ary', 'ard', 'art', 'ate', 'ate',
        'ate', 'ation', 'cade', 'drome', 'ed', 'ed', 'en', 'en', 'ence',
        'ency', 'er', 'ier', 'er', 'or', 'er', 'or', 'ery', 'es', 'ese', 'ies',
        'es', 'ies', 'ess', 'est', 'iest', 'fold', 'ful', 'ful', 'fy', 'ia',
        'ian', 'iatry', 'ic', 'ic', 'ice', 'ify', 'ile', 'ing', 'ion', 'ish',
        'ism', 'ist', 'ite', 'ity', 'ive', 'ive', 'ative', 'itive', 'ize',
        'less', 'ly', 'ment', 'ness', 'or', 'ory', 'ous', 'eous', 'ose',
        'ious', 'ship', 'ster', 'ure', 'ward', 'wise', 'ize', 'phy', 'ogy'
    }

    prefixes = {
        'ac', 'ad', 'af', 'ag', 'al', 'an', 'ap', 'as', 'at', 'an', 'ab',
        'abs', 'acer', 'acid', 'acri', 'act', 'ag', 'acu', 'aer', 'aero', 'ag',
        'agi', 'ig', 'act', 'agri', 'agro', 'alb', 'albo', 'ali', 'allo',
        'alter', 'alt', 'am', 'ami', 'amor', 'ambi', 'ambul', 'ana', 'ano',
        'andr', 'andro', 'ang', 'anim', 'ann', 'annu', 'enni', 'ante',
        'anthrop', 'anti', 'ant', 'anti', 'antico', 'apo', 'ap', 'aph', 'aqu',
        'arch', 'aster', 'astr', 'auc', 'aug', 'aut', 'aud', 'audi', 'aur',
        'aus', 'aug', 'auc', 'aut', 'auto', 'bar', 'be', 'belli', 'bene', 'bi',
        'bine', 'bibl', 'bibli', 'biblio', 'bio', 'bi', 'brev', 'cad', 'cap',
        'cas', 'ceiv', 'cept', 'capt', 'cid', 'cip', 'cad', 'cas', 'calor',
        'capit', 'capt', 'carn', 'cat', 'cata', 'cath', 'caus', 'caut',
        'cause', 'cuse', 'cus', 'ceas', 'ced', 'cede', 'ceed', 'cess', 'cent',
        'centr', 'centri', 'chrom', 'chron', 'cide', 'cis', 'cise', 'circum',
        'cit', 'civ', 'clam', 'claim', 'clin', 'clud', 'clus claus', 'co',
        'cog', 'col', 'coll', 'con', 'com', 'cor', 'cogn', 'gnos', 'com',
        'con', 'contr', 'contra', 'counter', 'cord', 'cor', 'cardi', 'corp',
        'cort', 'cosm', 'cour', 'cur', 'curr', 'curs', 'crat', 'cracy', 'cre',
        'cresc', 'cret', 'crease', 'crea', 'cred', 'cresc', 'cret', 'crease',
        'cru', 'crit', 'cur', 'curs', 'cura', 'cycl', 'cyclo', 'de', 'dec',
        'deca', 'dec', 'dign', 'dei', 'div', 'dem', 'demo', 'dent', 'dont',
        'derm', 'di', 'dy', 'dia', 'dic', 'dict', 'dit', 'dis', 'dif', 'dit',
        'doc', 'doct', 'domin', 'don', 'dorm', 'dox', 'duc', 'duct', 'dura',
        'dynam', 'dys', 'ec', 'eco', 'ecto', 'en', 'em', 'end', 'epi', 'equi',
        'erg', 'ev', 'et', 'ex', 'exter', 'extra', 'extro', 'fa', 'fess',
        'fac', 'fact', 'fec', 'fect', 'fic', 'fas', 'fea', 'fall', 'fals',
        'femto', 'fer', 'fic', 'feign', 'fain', 'fit', 'feat', 'fid', 'fid',
        'fide', 'feder', 'fig', 'fila', 'fili', 'fin', 'fix', 'flex', 'flect',
        'flict', 'flu', 'fluc', 'fluv', 'flux', 'for', 'fore', 'forc', 'fort',
        'form', 'fract', 'frag', 'frai', 'fuge', 'fuse', 'gam', 'gastr',
        'gastro', 'gen', 'gen', 'geo', 'germ', 'gest', 'giga', 'gin', 'gloss',
        'glot', 'glu', 'glo', 'gor', 'grad', 'gress', 'gree', 'graph', 'gram',
        'graf', 'grat', 'grav', 'greg', 'hale', 'heal', 'helio', 'hema',
        'hemo', 'her', 'here', 'hes', 'hetero', 'hex', 'ses', 'sex', 'homo',
        'hum', 'human', 'hydr', 'hydra', 'hydro', 'hyper', 'hypn', 'an', 'ics',
        'ignis', 'in', 'im', 'in', 'im', 'il', 'ir', 'infra', 'inter', 'intra',
        'intro', 'ty', 'jac', 'ject', 'join', 'junct', 'judice', 'jug',
        'junct', 'just', 'juven', 'labor', 'lau', 'lav', 'lot', 'lut', 'lect',
        'leg', 'lig', 'leg', 'levi', 'lex', 'leag', 'leg', 'liber', 'liver',
        'lide', 'liter', 'loc', 'loco', 'log', 'logo', 'ology', 'loqu',
        'locut', 'luc', 'lum', 'lun', 'lus', 'lust', 'lude', 'macr', 'macer',
        'magn', 'main', 'mal', 'man', 'manu', 'mand', 'mania', 'mar', 'mari',
        'mer', 'matri', 'medi', 'mega', 'mem', 'ment', 'meso', 'meta', 'meter',
        'metr', 'micro', 'migra', 'mill', 'kilo', 'milli', 'min', 'mis', 'mit',
        'miss', 'mob', 'mov', 'mot', 'mon', 'mono', 'mor', 'mort', 'morph',
        'multi', 'nano', 'nasc', 'nat', 'gnant', 'nai', 'nat', 'nasc', 'neo',
        'neur', 'nom', 'nom', 'nym', 'nomen', 'nomin', 'non', 'non', 'nov',
        'nox', 'noc', 'numer', 'numisma', 'ob', 'oc', 'of', 'op', 'oct',
        'oligo', 'omni', 'onym', 'oper', 'ortho', 'over', 'pac', 'pair',
        'pare', 'paleo', 'pan', 'para', 'pat', 'pass', 'path', 'pater', 'patr',
        'path', 'pathy', 'ped', 'pod', 'pedo', 'pel', 'puls', 'pend', 'pens',
        'pond', 'per', 'peri', 'phage', 'phan', 'phas', 'phen', 'fan', 'phant',
        'fant', 'phe', 'phil', 'phlegma', 'phobia', 'phobos', 'phon', 'phot',
        'photo', 'pico', 'pict', 'plac', 'plais', 'pli', 'ply', 'plore', 'plu',
        'plur', 'plus', 'pneuma', 'pneumon', 'pod', 'poli', 'poly', 'pon',
        'pos', 'pound', 'pop', 'port', 'portion', 'post', 'pot', 'pre', 'pur',
        'prehendere', 'prin', 'prim', 'prime', 'pro', 'proto', 'psych',
        'punct', 'pute', 'quat', 'quad', 'quint', 'penta', 'quip', 'quir',
        'quis', 'quest', 'quer', 're', 'reg', 'recti', 'retro', 'ri', 'ridi',
        'risi', 'rog', 'roga', 'rupt', 'sacr', 'sanc', 'secr', 'salv', 'salu',
        'sanct', 'sat', 'satis', 'sci', 'scio', 'scientia', 'scope', 'scrib',
        'script', 'se', 'sect', 'sec', 'sed', 'sess', 'sid', 'semi', 'sen',
        'scen', 'sent', 'sens', 'sept', 'sequ', 'secu', 'sue', 'serv', 'sign',
        'signi', 'simil', 'simul', 'sist', 'sta', 'stit', 'soci', 'sol',
        'solus', 'solv', 'solu', 'solut', 'somn', 'soph', 'spec', 'spect',
        'spi', 'spic', 'sper', 'sphere', 'spir', 'stand', 'stant', 'stab',
        'stat', 'stan', 'sti', 'sta', 'st', 'stead', 'strain', 'strict',
        'string', 'stige', 'stru', 'struct', 'stroy', 'stry', 'sub', 'suc',
        'suf', 'sup', 'sur', 'sus', 'sume', 'sump', 'super', 'supra', 'syn',
        'sym', 'tact', 'tang', 'tag', 'tig', 'ting', 'tain', 'ten', 'tent',
        'tin', 'tect', 'teg', 'tele', 'tem', 'tempo', 'ten', 'tin', 'tain',
        'tend', 'tent', 'tens', 'tera', 'term', 'terr', 'terra', 'test', 'the',
        'theo', 'therm', 'thesis', 'thet', 'tire', 'tom', 'tor', 'tors',
        'tort', 'tox', 'tract', 'tra', 'trai', 'treat', 'trans', 'tri', 'trib',
        'tribute', 'turbo', 'typ', 'ultima', 'umber', 'umbraticum', 'un',
        'uni', 'vac', 'vade', 'vale', 'vali', 'valu', 'veh', 'vect', 'ven',
        'vent', 'ver', 'veri', 'verb', 'verv', 'vert', 'vers', 'vi', 'vic',
        'vicis', 'vict', 'vinc', 'vid', 'vis', 'viv', 'vita', 'vivi', 'voc',
        'voke', 'vol', 'volcan', 'volv', 'volt', 'vol', 'vor', 'with', 'zo'
    }
    neutral_prefixes = upper(neutral_prefixes)
    neutral_suffixes = upper(neutral_suffixes)
    strong_prefixes = upper(strong_prefixes)
    strong_suffixes = upper(strong_suffixes)
    full_suffixes_set = upper(suffixes)
    full_prefixes_set = upper(prefixes)
    suffix = {"1", "2", "0"}
    for line in data_list:
        dict = {}
        vow_index = []
        vowelCount = 0
        pattern = ""
        y = ""
        dict["pos"] = nltk.pos_tag([line.split(":")[0]])[0][1]
        word = line.split(":")[0]
        temp = check_prefix(word, neutral_prefixes)
        if temp:
            dict['neu_pre'] = temp
        temp = check_suffix(word, neutral_suffixes)
        if temp:
            dict['neu_suf'] = temp
        temp = check_prefix(word, strong_prefixes)
        if temp:
            dict['str_pre'] = temp
        temp = check_suffix(word, strong_suffixes)
        if temp:
            dict['str_suf'] = temp
        temp = check_prefix(word, full_suffixes_set)
        if temp:
            dict['ful_pre'] = temp
        temp = check_suffix(word, full_prefixes_set)
        if temp:
            dict['ful_suf'] = temp
        line = line.split(":")[1].strip()

        syllables = line.split(" ")
        l = []
        for i in syllables:
            l.append(i if not (i[-1].isdigit()) else i[:-1])
        dict.update(Counter({''.join(i) for i in get_ngrams(l)}))
        dict['len'] = len(syllables)
        out = ''
        for i in range(len(syllables)):
            syl = syllables[i]

            if syl[-1] in suffix:
                vowelCount += 1
                vow_index.append(i)
                out += syl[-1]
                # if syl[-1]=="1":
                #     model_y.append(vowelCount)
                pattern += "V"
            else:
                pattern += "C"

        model_y.append(out)
        vowelCount = 0
        dict["pattern"] = pattern
        dict['vow_len'] = len(vow_index)
        for i in vow_index:
            vowelCount += 1
            if i - 1 >= 0:
                dict["onset2_" + str(vowelCount)] = syllables[i - 1]
            if i + 1 < len(syllables):
                dict["coda1_" + str(vowelCount)] = syllables[i + 1]
            dict["nucleus_" + str(vowelCount)] = syllables[i][:-1]
        model_x.append(dict)
    # print(pd.DataFrame(model_x))
    # print(model_y)
    v = DictVectorizer(sparse=True)

    X = v.fit_transform(model_x)
    classifier = LogisticRegression(penalty='l2', class_weight='balanced')

    classifier.fit(X, model_y)
    with open(classifier_file, 'wb') as f:
        pickle.dump(classifier, f)
        pickle.dump(v, f)
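# A minimal companion sketch (not part of the original): load the classifier and
# the vectorizer back in the same order they were dumped above.
def load_model(classifier_file):
    with open(classifier_file, 'rb') as f:
        classifier = pickle.load(f)  # dumped first
        v = pickle.load(f)           # dumped second
    return classifier, v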
Exemple #43
0
print('-' * 30)
print(train_data.describe(include=['O']))
print('-' * 30)
print(train_data.head())
print('-' * 30)
print(train_data.tail())

# step2
# Data cleaning
# fill the missing Age values with the mean age
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)

print(train_data['Embarked'].value_counts())

# fill the missing Embarked values with the most common port
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S', inplace=True)

# step3
# Feature selection
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]

# Convert the categorical features into numeric 0/1 columns
dvec = DictVectorizer(sparse=False)
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
print(dvec.feature_names_)
# handle missing values
x["age"].fillna(x["age"].mean(), inplace=True)

# convert the DataFrame to a list of dicts
x = x.to_dict(orient="records")

# split the dataset
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# dictionary feature extraction (one-hot encoding)
from sklearn.feature_extraction import DictVectorizer

transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# decision tree estimator:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

estimator = DecisionTreeClassifier(criterion="entropy", max_depth=8)
estimator.fit(x_train, y_train)
# model evaluation:
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("直接比对真实值和预测值:\n", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("准确率:\n", score)
# visualize the decision tree  ## http://webgraphviz.com/
export_graphviz(estimator,
Exemple #45
0
            x_templist.append(current_dictX)
            y_templist.append(current_Y)

        stack, graph = transition.empty_stack(stack, graph)

        for word in sentence:
            word['head'] = graph['heads'][word['id']]

        x_list.extend(x_templist)
        y_list.extend(y_templist)

    print("Encoding the features and classes...")
    # Vectorize the feature matrix and carry out a one-hot encoding
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(x_list)
    # The statement below will swallow a considerable memory
    # X = vec.fit_transform(X_dict).toarray()
    # print(vec.get_feature_names())

    y, nbr_to_class, classes_to_nbr = encode_classes(y_list)

    print("Training the model...")
    classifier = linear_model.LogisticRegression(penalty='l2',
                                                 dual=True,
                                                 solver='liblinear')
    model = classifier.fit(X, y)
    print(model)
    print('Predicting')

    # print(transitions)
Exemple #46
0
class ImitationLearner(object):

    # initialize the classifier to be learned
    def __init__(self):
        # Any classifier could be used here
        self.model = LogisticRegression()
        self.vectorizer = DictVectorizer()
        self.labelEncoder = LabelEncoder()

    # this function predicts an instance given the state
    # state keeps track the various actions taken
    # it does not change the instance in any way,
    # it does change the state
    # the predicted structured output is returned in the end
    def predict(self, structured_instance, state=None, expert_policy_prob=0.0):
        if state is None:
            state = self.transitionSystem(
                structured_instance=structured_instance)

        # predict all remaining actions
        # if we do not have any actions we are done
        while len(state.agenda) > 0:
            # for each action
            # pop it from the queue
            current_action = state.agenda.popleft()
            # extract features and add them to the action
            # (the expert policy itself does not need them, but they are needed later for training)
            current_action.features = state.extractFeatures(
                structured_instance=structured_instance, action=current_action)
            # the first condition avoids unnecessary calls to random, which cause reproducibility headaches
            if (expert_policy_prob
                    == 1.0) or (expert_policy_prob > 0.0
                                and random.random() < expert_policy_prob):
                current_action.label = state.expert_policy(
                    structured_instance, current_action)
            else:
                # predict (probably makes sense to parallelize across instances)
                # vectorize the features:
                vectorized_features = self.vectorizer.transform(
                    current_action.features)
                # predict using the model
                normalized_label = self.model.predict(vectorized_features)
                # get the actual label (returns an array, get the first and only element)
                current_action.label = self.labelEncoder.inverse_transform(
                    normalized_label)[0]
            # add the action to the state making any necessary updates
            state.updateWithAction(current_action, structured_instance)

        # OK return the final state reached
        return state

    class params(object):
        def __init__(self):
            self.learningParam = 0.1
            self.iterations = 40

    def train(self, structuredInstances, params):
        # create the dataset
        trainingFeatures = []
        trainingLabels = []

        # for each iteration
        for iteration in range(params.iterations):
            # set the expert policy prob
            expertPolicyProb = pow(1 - params.learningParam, iteration)
            print("Iteration:" + str(iteration) + ", expert policy prob:" +
                  str(expertPolicyProb))

            for structuredInstance in structuredInstances:

                # so we obtain the predicted output and the actions taken are in state
                # this prediction uses the gold standard since we need this info for the expert policy actions
                final_state = self.predict(structuredInstance,
                                           expert_policy_prob=expertPolicyProb)

                # initialize a second state to avoid having to roll-back
                stateCopy = self.transitionSystem(
                    structured_instance=structuredInstance)
                # The agenda seems to be initialized fine
                for action in final_state.actionsTaken:
                    # DAgger: just ask the expert
                    stateCopy.agenda.popleft()
                    expert_action_label = stateCopy.expert_policy(
                        structuredInstance, action)

                    # add the labeled features to the training data
                    trainingFeatures.append(action.features)
                    trainingLabels.append(expert_action_label)

                    # take the original action chosen to proceed
                    stateCopy.updateWithAction(action, structuredInstance)

            # OK, let's save the training data and learn some classifiers
            # vectorize the training data collected
            training_data = self.vectorizer.fit_transform(trainingFeatures)
            # encode the labels
            encoded_labels = self.labelEncoder.fit_transform(trainingLabels)
            # train
            self.model.fit(training_data, encoded_labels)
Exemple #47
0
import copy


def extract_feats(dataset, class_name):
    X_dict = copy.deepcopy(dataset)
    y_symbols = [obs.pop(class_name, None) for obs in X_dict]
    return X_dict, y_symbols


X_dict, y_symbols = extract_feats(dataset, 'will_wait')

y = [0 if symb == 'No' else 1 for symb in y_symbols]

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)  # sparse=True would be more memory-efficient
X = vec.fit_transform(X_dict)  # y values not included

features = vec.get_feature_names()
param = vec.get_params(deep=True)

################ MAIN CALLING DECISION TREE LEARNING ###############
# and printing

import decision_tree_learning as dtl

# DECISION TREE BUILDING

examples = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

root = dtl.decision_tree_learning_(examples, attributes, None, None, None,
                                   features, y, X)
Exemple #48
0
                 sep=' ',
                 engine='python',
                 quoting=csv.QUOTE_NONE,
                 error_bad_lines=False,
                 nrows=50000)

print(df.head())

print(df.isnull().sum())
df = df.fillna(method='ffill')

print(df.groupby('Tag').size().reset_index(name='counts'))

X = df.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df.Tag.values
classes = np.unique(y)
del (df)

classes = classes.tolist()
#nbrtest=int(X.shape[0]*0.33)
#nbrtrain=X.shape[0]-nbrtest

#X_train=X[:nbrtrain]
#X=np.delete(X,range(nbrtrain), 0)
#X_test=X
#del(X)

#y_train=y[:nbrtrain]
#y=np.delete(y, range(nbrtrain))
scaler = preprocessing.StandardScaler()
X_train.loc[:,
            numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test.loc[:, numeric_features] = scaler.transform(X_test[numeric_features])
#normalizer = preprocessing.Normalizer()
#X_train.loc[:, numeric_features] = normalizer.fit_transform(X_train[numeric_features])
#X_test.loc[:, numeric_features] = normalizer.transform(X_test[numeric_features])
np.random.seed(0)  # call the function; assigning to it does not seed the RNG
while len(y[y == 1]) < len(y[y == 0]) - 207:
    ind = np.random.choice(
        data[data['Survived'] == 1][data['Pclass'] == 1].index)
    X_train = X_train.append(X_train.loc[ind], ignore_index=True)
    y = y.set_value(len(y) + 1, y.loc[ind])

encoder = DV(sparse=False)
encoded_train_data = encoder.fit_transform(
    X_train[category_features].T.to_dict().values())
encoded_test_data = encoder.transform(
    X_test[category_features].T.to_dict().values())

X_train['Combo1'] = X_train['Pclass'] * X_train[
    'Fare']  # * X_train['Family'] * X_train['Age']
X_test['Combo1'] = X_test['Pclass'] * X_test[
    'Fare']  # * X_test['Family'] * X_test['Age']
#X['Combo1'] = ((X['Combo1'] ** 0.1 - 1) / 0.1)
#X_test['Combo1'] = ((X_test['Combo1'] ** 0.1 - 1) / 0.1)
#X['Combo2'] = X['Age'] * X['Fare']
#X_test['Combo2'] = X_test['Age'] * X_test['Fare']
numeric_features = ['Age', 'Pclass', 'Fare', 'Family', 'Combo1']
#polinom = preprocessing.PolynomialFeatures(2, interaction_only = True)
#poly_features = polinom.fit_transform(X_train[numeric_features])
#poly_test_features = polinom.transform(X_test[numeric_features])
    sd_y = stats.stdev(y)

    for observation in x:
        score_x.append((observation - mean_x) / sd_x)

    for observation in y:
        score_y.append((observation - mean_y) / sd_y)

    return (sum([i * j for i, j in zip(score_x, score_y)])) / (n - 1)


print(pearson(x, y))

#################

staff = [{
    'name': 'Steve Miller',
    'age': 33.
}, {
    'name': 'Lyndon Jones',
    'age': 12.
}, {
    'name': 'Baxter Morth',
    'age': 18.
}]

vec = DictVectorizer()
vec.fit_transform(staff).toarray()

print(vec.get_feature_names())
Exemple #51
0
import pickle
import math
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer

ppmi_dic = defaultdict(dict)
t_c_dic, t_dic, c_dic, N = pickle.load(open('knock83.txt', 'rb'))

def ppmi(key):
    t, c = key.split(' ')
    return max(math.log(N * t_c_dic[key] / (t_dic[t] * c_dic[c])), 0)

for t_c, freq in t_c_dic.items():
    if freq >= 10:
        t, c = t_c.split()
        ppmi_tc = ppmi(t_c)
        if ppmi_tc > 0:
            ppmi_dic[t][c] = ppmi_tc

dicvec = DictVectorizer()
matrix = dicvec.fit_transform(ppmi_dic.values())

with open('knock84.txt','wb') as w_f:
    pickle.dump((matrix, list(ppmi_dic.keys())), w_f)
        
        

Exemple #52
0
'''
#Inspect the values taken by the Embarked column
#There are only three embarkation ports, and S is by far the most common (about 72%), so we fill the remaining missing Embarked values with 'S':
print(train_data['Embarked'].value_counts())
# fill the missing Embarked values with the most common port
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S',inplace=True)

#Feature selection: pick the features we believe may be related to the survival prediction
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
#select those columns
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]

#The feature values include strings, which are awkward to process, so encode them numerically: e.g. Sex becomes female=1/male=0 and Embarked (S, C or Q) becomes three 0/1 columns
from sklearn.feature_extraction import DictVectorizer
dvec=DictVectorizer(sparse=False)
train_features=dvec.fit_transform(train_features.to_dict(orient='records'))#fit_transform converts the feature dicts into a feature value matrix
print(dvec.feature_names_)
#clf = DecisionTreeClassifier(criterion='entropy')
from sklearn.tree import DecisionTreeClassifier # build an ID3 decision tree
clf = DecisionTreeClassifier(criterion='entropy') # decision tree training
clf.fit(train_features, train_labels)
test_features=dvec.transform(test_features.to_dict(orient='records'))
# decision tree prediction
pred_labels = clf.predict(test_features)
# decision tree accuracy (measured on the training data)
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score accuracy: %.4lf' % acc_decision_tree)
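# The score above is measured on the training data, so it is optimistic. A short
# sketch (assuming scikit-learn's cross_val_score is available) gives a fairer
# estimate of generalization:
import numpy as np
from sklearn.model_selection import cross_val_score
print(u'cross-validated accuracy: %.4lf' %
      np.mean(cross_val_score(clf, train_features, train_labels, cv=10)))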
Exemple #53
0
class MEMMSequenceLabeler:
    def transform_input(self, data):
        return [
            self.feat(x, i, padded_history(y, i, self.order)) for x, y in data
            for i in range(0, len(x))
        ]

    def __init__(self, feat, train_data, order=1, **lr_params):
        self.order = order
        self.train_data = train_data
        self.feat = feat
        self.vectorizer = DictVectorizer()
        self.label_encoder = LabelEncoder()

        train_classifier_x = self.vectorizer.fit_transform(
            self.transform_input(train_data))
        train_classifier_y = self.label_encoder.fit_transform(
            to_classifier_y(train_data))
        self.lr = LogisticRegression(fit_intercept=False, **lr_params)
        self.lr.fit(train_classifier_x, train_classifier_y)
        self.v_weights = self.vectorizer.inverse_transform(self.lr.coef_)

    def weights(self, label):
        v_index = self.label_encoder.transform([label])
        v_weights = self.v_weights[v_index[0]]
        return v_weights

    def plot_lr_weights(self,
                        label,
                        how_many=20,
                        reverse=True,
                        feat_filter=lambda s: True):
        v_index = self.label_encoder.transform([label])
        v_weights = self.vectorizer.inverse_transform(
            self.lr.coef_)[v_index[0]]
        # print(type(v_weights.items()))
        filtered = [(k, v) for k, v in v_weights.items() if feat_filter(k)]
        sorted_weights = sorted(filtered, key=lambda t: t[1], reverse=reverse)
        return util.plot_bar_graph([w for _, w in sorted_weights[:how_many]],
                                   [f for f, _ in sorted_weights[:how_many]],
                                   rotation=45)

    def input_repr(self, x, i, y):
        return self.feat(x, i, padded_history(y, i, self.order))

    def sklearn_repr(self, x, i, y):
        return self.vectorizer.transform([self.input_repr(x, i, y)])

    def predict_next(self, x, i, y):
        scikit_x = self.vectorizer.transform([self.input_repr(x, i, y)])
        return self.label_encoder.inverse_transform(
            self.lr.predict(scikit_x))[0]

    def predict_next_hist(self, x, i, hist):
        scikit_x = self.vectorizer.transform(
            [self.feat(x, i, padded_history(hist, 1, self.order))])
        return self.label_encoder.inverse_transform(
            self.lr.predict(scikit_x))[0]

    def labels(self):
        return self.label_encoder.classes_

    def predict_scores(self, x, i, y):
        return self.lr.predict_log_proba(self.sklearn_repr(x, i, y))[0]

    def predict_label_scores(self, x, i, y):
        scores = self.predict_scores(x, i, y)
        labels = self.labels()
        return sorted([(labels[label_index], label_score)
                       for label_index, label_score in enumerate(scores)],
                      key=lambda x: -x[1])

    def predict_scores_hist(self, x, i, hist):
        scikit_x = self.vectorizer.transform(
            [self.feat(x, i, padded_history(hist, 1, self.order))])
        return self.lr.predict_log_proba(scikit_x)[0]

    def predict(self, data):
        result = []
        for x, y in data:
            y_guess = []
            for i in range(0, len(x)):
                prediction = self.predict_next(x, i, y_guess)
                # append the predicted label as a single element; "+=" would
                # splice a multi-character label string into individual characters
                y_guess.append(prediction)
            result.append(y_guess)
        return result
Exemple #54
0
class SimpleFMLearner:
    def __init__(self,
                 iter=100,
                 factor=10,
                 use_info=True,
                 path='./',
                 external_fm=None):
        from pyfm import pylibfm
        self.__use_info = use_info
        # temp code, load ml-100k's info
        if self.__use_info:
            self.__info = Info(path)

        # Build and train a Factorization Machine
        if external_fm:
            print >> sys.stderr, 'Use external FM: %s' % type(external_fm)
            self.__fm = external_fm
        else:
            print >> sys.stderr, 'iter=%d, factor=%d, use_info=%d' % (
                iter, factor, use_info)
            self.__fm = pylibfm.FM(num_factors=factor,
                                   num_iter=iter,
                                   verbose=True,
                                   task="regression",
                                   initial_learning_rate=0.001,
                                   learning_rate_schedule="optimal")

    def fit(self, train):
        ''' train : [(userid, itemid, rating)...] '''
        train_data = []
        y_train = []
        for userid, itemid, rating in train:
            d = self.__make_data(userid, itemid)

            train_data.append(d)
            y_train.append(rating)

        self.__v = DictVectorizer()

        X_train = self.__v.fit_transform(train_data)
        y_train = np.array(y_train)

        print >> sys.stderr, 'x_train.shape=%s, type=%s' % (str(
            X_train.shape), type(X_train))
        print >> sys.stderr, 'y_train.shape=%s, type=%s' % (str(
            y_train.shape), type(y_train))
        if isinstance(self.__fm, tffm.models.TFFMRegressor):
            self.__fm.fit(X_train, y_train, show_progress=True)
        else:
            self.__fm.fit(X_train, y_train)
        print >> sys.stderr, 'Train completed.'

    def predict(self, userid, itemid):
        d = self.__make_data(userid, itemid)
        X_test = self.__v.transform([d])
        preds = self.__fm.predict(X_test)
        return preds[0]

    def __make_data(self, userid, itemid):
        userid = int(userid)
        itemid = int(itemid)
        d = {"user_id": str(userid), "movie_id": str(itemid)}
        if self.__use_info:
            d = self.__info.process(userid, itemid, d)
        return d
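# A minimal usage sketch (synthetic ratings, not from the original). It assumes
# the module-level imports used by the class (sys, tffm, and pyfm inside the
# constructor) are available, and use_info=False so no ml-100k info files are
# needed.
learner = SimpleFMLearner(iter=5, factor=4, use_info=False)
learner.fit([('1', '10', 4.0), ('1', '20', 3.0), ('2', '10', 5.0), ('2', '30', 2.0)])
print >> sys.stderr, 'predicted rating for user 2, item 20: %s' % learner.predict('2', '20')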
print(df.shape)

# Print the shape of the transformed array
print(df_encoded.shape)
-------------------------------------------------------------
# Import DictVectorizer
from sklearn.feature_extraction import DictVectorizer

# Convert df into a dictionary: df_dict
df_dict = df.to_dict("records")

# Create the DictVectorizer object: dv
dv = DictVectorizer()

# Apply dv on df: df_encoded
df_encoded = dv.fit_transform(df_dict)

# Print the resulting first five rows
print(df_encoded[:5,:])

# Print the vocabulary
print(dv.vocabulary_)
-------------------------------------------------------
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)
    if dataset == "train":
        with open('../data/train_2.dat') as f:
            for line in f:
                (userID, movieID, rating) = line.split(' ')
                data.append({"userID": str(userID), "movieID": str(movieID)})
                try:
                    # for matrix factorization, this was
                    y.append(float(rating))
                    # y.append(float(rating))
                except ValueError:
                    print "Check line {l}".format(l=line)
                users.add(userID)
                movies.add(movieID)
        return (data, y, users, movies)


train = get_unique_users_movies("train")
test = get_unique_users_movies("test")

X_train, y_train = train[0], train[1]

X_test = test[0]

print type(y_train)

v = DictVectorizer()
X_train_dv = v.fit_transform(X_train)
X_test_dv = v.transform(X_test)

print X_train_dv
Exemple #57
0
mtx = dp.form_matrix('./data/train.json', type=2)
X_train = []
y_train = []
for item in mtx:
    dic = {}
    for tag in item[1]:
        if tag not in dic:
            dic[tag] = 1
        else:
            dic[tag] += 1
    X_train.append(dic)
    y_train.append(item[0])

v = DictVectorizer(sparse=False)
X_train = v.fit_transform(X_train)

#############SVM : 0.71
#clf = svm.SVC(kernel='linear')

#############MNB : 0.70
#clf = MultinomialNB()

#############BNB : 0.71
#clf = BernoulliNB()

#############GNB : 0.35
#clf = GaussianNB()

#############RF : 0.707
clf = RandomForestClassifier(n_estimators=200, criterion='entropy')
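# A short evaluation sketch (not part of the original): one way to obtain accuracy
# figures like those noted in the comments above is cross-validation on the
# vectorized training data, assuming scikit-learn's cross_val_score.
from sklearn.model_selection import cross_val_score
print(cross_val_score(clf, X_train, y_train, cv=5).mean())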
Exemple #58
0
def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9,
            use_divrank=False, divrank_alpha=0.25):
    '''
    compute centrality score of sentences.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      continuous: if True, apply continuous LexRank. (see reference)
      sim_threshold: if continuous is False and similarity is greater than or
        equal to sim_threshold, link the sentences.
      alpha: the damping factor of PageRank and DivRank
      use_divrank: if True, apply DivRank instead of PageRank
      divrank_alpha: strength of self-link [0.0-1.0]
        (it's not the damping factor, see divrank.py)

    Returns: tuple
      (
        {
          # sentence index -> score
          0: 0.003,
          1: 0.002,
          ...
        },
        similarity_matrix
      )
    
    Reference:
      Günes Erkan and Dragomir R. Radev.
      LexRank: graph-based lexical centrality as salience in text
      summarization. (section 3)
      http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    '''
    # configure ranker
    ranker_params = {'max_iter': 1000}
    if use_divrank:
        ranker = divrank_scipy
        ranker_params['alpha'] = divrank_alpha
        ranker_params['d'] = alpha
    else:
        ranker = networkx.pagerank_scipy
        ranker_params['alpha'] = alpha

    graph = networkx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = tools.word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine')

    if continuous:
        linked_rows, linked_cols = numpy.where(sim_mat > 0)
    else:
        linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i == j:
            continue
        weight = sim_mat[i,j] if continuous else 1.0
        graph.add_edge(i, j, weight=weight)

    scores = ranker(graph, **ranker_params)
    return scores, sim_mat
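# A minimal usage sketch (not part of the original). It assumes the module's own
# imports (networkx, numpy, collections, scikit-learn's pairwise_distances, and
# the tools module with word_segmenter_ja) are available, and that the input
# sentences are Japanese, as in the docstring.
if __name__ == '__main__':
    sample_sentences = [u'こんにちは.', u'私の名前は飯沼です.']
    scores, sim_mat = lexrank(sample_sentences, sim_threshold=0.0)
    for idx in sorted(scores, key=scores.get, reverse=True):
        print(u'%d: %f' % (idx, scores[idx]))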
Exemple #59
0
#inplace=True: modify the original object in place without creating a new one;
# inplace=False: create and return a new object that holds the modified data.
X['age'].fillna(X['age'].mean(), inplace=True)

#split the raw data; 25% of the passenger data is held out for testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)

#convert the categorical features into feature vectors
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)  #sparse=False means the result is not a sparse matrix
#after the transformation, each categorical feature is split into its own 0/1 column, while numeric features stay unchanged
X_train = vec.fit_transform(X_train.to_dict(orient='records'))

# transform the test-set features in the same way
X_test = vec.transform(X_test.to_dict(orient='records'))

# 1. Train a single decision tree and use it for prediction
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()  #initialize the decision tree classifier with the default settings
dtc.fit(X_train, y_train)  #fit the model on the training split
dtc_y_predict = dtc.predict(X_test)  #predict on the test features with the trained decision tree

# 2. Train a random forest ensemble and use it for prediction
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)
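# A short evaluation sketch (not part of the original): compare the two models on
# the held-out 25% split with scikit-learn's accuracy_score.
from sklearn.metrics import accuracy_score
print('decision tree accuracy:', accuracy_score(y_test, dtc_y_predict))
print('random forest accuracy:', accuracy_score(y_test, rfc_y_pred))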
Exemple #60
0
def get_data_queue(args):
    users, items, labels = [], [], []
    if args.dataset == 'ml-100k':
        data_path = os.path.join(args.data, 'ml-100k', 'u.data')
    elif args.dataset == 'ml-1m':
        data_path = os.path.join(args.data, 'ml-1m', 'ratings.dat')
    elif args.dataset == 'ml-10m':
        data_path = os.path.join(args.data, 'ml-10m', 'ratings.dat')
    elif args.dataset == 'youtube-small':
        data_path = os.path.join(args.data, 'youtube-weighted-small.npy')

    if 'ml' in args.dataset:
        # movielens dataset
        with open(data_path, 'r') as f:
            for i, line in enumerate(f.readlines()):
                if args.dataset == 'ml-100k':
                    line = line.split()
                elif args.dataset == 'ml-1m' or args.dataset == 'ml-10m':
                    line = line.split('::')
                users.append(int(line[0]) - 1)
                items.append(int(line[1]) - 1)
                labels.append(float(line[2]))
        labels = StandardScaler().fit_transform(np.reshape(
            labels, [-1, 1])).flatten().tolist()

        print('user', max(users), min(users))
        print('item', max(items), min(items))

        users, items, labels = shuffle(users, items, labels)
        indices = list(range(len(users)))
        num_train = int(len(users) * args.train_portion)
        num_valid = int(len(users) * args.valid_portion)

        if not args.mode == 'libfm':
            data_queue = torch.utils.data.TensorDataset(
                torch.tensor(users), torch.tensor(items), torch.tensor(labels))

            train_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[:num_train]),
                pin_memory=True)

            valid_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train:num_train + num_valid]),
                pin_memory=True)

            test_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train + num_valid:]),
                pin_memory=True)

        else:
            # prepare data format for libfm
            data_queue = []
            for i in range(len(users)):
                data_queue.append({
                    'user': str(users[i]),
                    'item': str(items[i])
                })

            v = DictVectorizer()
            data_queue = v.fit_transform(data_queue)
            train_queue = [
                data_queue[:num_train],
                np.array(labels[:num_train])
            ]
            valid_queue = [
                data_queue[num_train:num_train + num_valid],
                np.array(labels[num_train:num_train + num_valid])
            ]
            test_queue = [
                data_queue[num_train + num_valid:],
                np.array(labels[num_train + num_valid:])
            ]

    else:
        # 3-d dataset
        [ps, qs, rs, labels] = np.load(data_path).tolist()
        labels = StandardScaler().fit_transform(np.reshape(
            labels, [-1, 1])).flatten().tolist()

        ps = [int(i) for i in ps]
        qs = [int(i) for i in qs]
        rs = [int(i) for i in rs]
        print('p', max(ps), min(ps))
        print('q', max(qs), min(qs))
        print('r', max(rs), min(rs))

        ps, qs, rs, labels = shuffle(ps, qs, rs, labels)
        indices = list(range(len(ps)))
        num_train = int(len(ps) * args.train_portion)
        num_valid = int(len(ps) * args.valid_portion)

        if not args.mode == 'libfm':
            data_queue = torch.utils.data.TensorDataset(
                torch.tensor(ps), torch.tensor(qs), torch.tensor(rs),
                torch.tensor(labels))

            train_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[:num_train]),
                pin_memory=True)

            valid_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train:num_train + num_valid]),
                pin_memory=True)

            test_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train + num_valid:]),
                pin_memory=True)

        else:
            # prepare data format for libfm
            data_queue = []
            for i in range(len(ps)):
                data_queue.append({
                    'p': str(ps[i]),
                    'q': str(qs[i]),
                    'r': str(rs[i])
                })

            v = DictVectorizer()
            data_queue = v.fit_transform(data_queue)
            train_queue = [
                data_queue[:num_train],
                np.array(labels[:num_train])
            ]
            valid_queue = [
                data_queue[num_train:num_train + num_valid],
                np.array(labels[num_train:num_train + num_valid])
            ]
            test_queue = [
                data_queue[num_train + num_valid:],
                np.array(labels[num_train + num_valid:])
            ]

    return train_queue, valid_queue, test_queue