Example #1
def dict_vectorize(dict_list):
    assert isinstance(dict_list, list)
    from sklearn.feature_extraction import DictVectorizer

    vec = DictVectorizer()
    vec.fit(dict_list)
    return vec
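A minimal usage sketch for the helper above; the feature dicts are made up for illustration, and dict_vectorize is assumed to be the function defined just above:

feature_dicts = [{"word": "cat", "length": 3}, {"word": "dog", "length": 3}]
vec = dict_vectorize(feature_dicts)                     # fit the vectorizer on the training dicts
X_new = vec.transform([{"word": "cat", "length": 3}])   # reuse the learned feature space on new data
print(X_new.toarray())                                  # dense view of the encoded row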
Example #2
  def export(self, query, n_topics, n_words, title="PCA Export", fname="PCAExport"):
    vec = DictVectorizer()
    
    rows = topics_to_vectorspace(self.model, n_topics, n_words)
    X = vec.fit_transform(rows)
    pca = skPCA(n_components=2)
    X_pca = pca.fit(X.toarray()).transform(X.toarray())
    
    match = []
    for i in range(n_topics):
      topic = [t[1] for t in self.model.show_topic(i, len(self.dictionary.keys()))]
      matched = ''
      for word in topic:
        if word in query:
          matched = word
          break
      match.append(matched)  # one entry per topic, so match[i] below always exists

    pyplot.figure()
    for i in range(X_pca.shape[0]):
      pyplot.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5)
      pyplot.text(X_pca[i, 0], X_pca[i, 1], s=' '.join([str(i), match[i]]))  
     
    pyplot.title(title)
    pyplot.savefig(fname)
     
    pyplot.close()
Example #3
def pair_vectors(pairs, features, words, output_path):
    vectorizer = DictVectorizer()
    vectors = vectorizer.fit_transform(x[1] for x in features)

    vector_map = {word:vector for word, vector in
                  itertools.izip((x[0].split('/')[0] for x in features),
                                 vectors)}

    # Positive examples
    positive = []
    record = []
    for specific, general in pairs:
        positive.append(vector_map[general] - vector_map[specific])
        record.append( (specific, general, 1) )

    pair_set = set([tuple(x) for x in pairs])
    non_positive = []
    for i in range(len(positive)):
        first = second = None
        while first == second or (first, second) in pair_set:
            first = words[random.randint(len(words))]
            second = words[random.randint(len(words))]
        non_positive.append(vector_map[second] - vector_map[first])
        record.append( (first, second, 0) )
    
    data = vstack(positive + non_positive)
    target = [1]*len(positive) + [0]*len(non_positive)
    
    # Save dataset
    with open(os.path.join(output_path,'wn-noun-dependencies.mat'), 'wb') as data_file:
        dump_svmlight_file(data, target, data_file)

    with open(os.path.join(output_path,'wn-noun-dependencies.json'), 'w') as record_file:
        json.dump(record, record_file)
Example #4
    def _train(self, train_data, resources):
        sample_length = len(train_data)
        dict_status_path = os.path.join(root_dic,
                                        'dict_vectorizer_{}.status'.
                                        format(sample_length))
        if os.path.isfile(dict_status_path):
            dictVectorizer = joblib.load(dict_status_path)
        else:
            dictVectorizer = DictVectorizer()
            dictVectorizer.fit(train_data[self.features].
                               fillna(0).
                               to_dict('records'))
            joblib.dump(dictVectorizer, dict_status_path)

        tfidf_status_path = os.path.join(root_dic,
                                         'tfidf_vectorizer_{}.status'.
                                         format(sample_length))
        if os.path.isfile(tfidf_status_path):
            tfidf = joblib.load(tfidf_status_path)
        else:
            tfidf = TfidfVectorizer(min_df=40, max_features=300)
            tfidf.fit(train_data.essay)
            joblib.dump(tfidf, tfidf_status_path)

        resources['dictVectorizer'] = dictVectorizer
        resources['tfidf'] = tfidf
        print 'Head Processing Completed'
        return train_data, resources
Example #5
def generate_matrix():
    D = []
    y = []
    fex = features.IpadicFeature()
    progress = 0
    print('create feature dictionary')
    for q, a in load_corpus():
        D.append(list(fex.transform(q)))
        a = normalize.normalize_askfm(a, h2z=False)
        y.append(isnot_shitsumon(a))
        progress += 1
        if progress % 100 == 0:
            print(progress)

    dv = DictVectorizer()
    dv.fit(itertools.chain(*D))

    progress = 0
    print('create feature vector')
    X = []
    for ds in D:
        count = None
        for d in ds:
            v = dv.transform(d)
            if count is None:
                count = v
            else:
                count += v
        X.append(count)
        progress += 1
        if progress % 100 == 0:
            print(progress)
    X = scipy.sparse.vstack(X)
    y = numpy.array(y)
    return X, y, dv
Example #6
def get_vector(name, feature_names, full_vector):
    """
    Returns a complete feature vector
    """
    name_features = {}
    name_features["last_letter"] = name[-1]
    name_features["last_two"] = name[-2:]
    name_features["last_is_vowel"] = 0 if name[-1] in "aeiouy" else 0

    vectorizer = DictVectorizer()
    small_vector = vectorizer.fit_transform(name_features).toarray()[0]
    small_feature_names = vectorizer.get_feature_names()

    hit_count = 0
    for index, feature_name in enumerate(feature_names):
        if feature_name in small_feature_names:
            full_vector[index] = small_vector[small_feature_names.index(feature_name)]
            hit_count += 1
        else:
            full_vector[index] = 0

    assert hit_count == len(small_feature_names) == small_vector.shape[0]
    assert full_vector.shape[0] == len(feature_names)

    return full_vector
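A hedged usage sketch for the function above, assuming an older scikit-learn where get_feature_names() still exists; the global feature_names list and the name are illustrative only:

import numpy as np

feature_names = ['last_letter=a', 'last_two=na', 'last_is_vowel']   # assumed global feature space
full_vector = np.zeros(len(feature_names))
print(get_vector('anna', feature_names, full_vector))               # fills the slots that match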
Example #7
def extractData(features, examples=None, scaler=None, featureOrder=None, scaling=False):
    vec = DictVectorizer()
    samples = vec.fit_transform(features)
    featureNames = vec.get_feature_names()
    
    if featureOrder is not None:
        indices = [featureNames.index(feature) for feature in featureOrder]
        samples = samples[:, indices]
    imp = pp.Imputer(missing_values='NaN', strategy='mean')
    if examples is None:
        imp.fit(samples)
    else :
        imp.fit(examples)
    impSamples = imp.transform(samples)
    if (impSamples.shape == samples.shape):
        samples = impSamples
    else:
        print("too few samples to replace missing values, using 0's")
        samples[shouldReplace(samples)]=0
    
#     if (scaler == None):
#         scaler = pp.StandardScaler(with_mean=False)
#         scaler.fit(samples)
#     samples = scaler.transform(samples)
    if (scaling):
        samples = pp.scale(samples,with_mean=False)
    if (sprs.isspmatrix(samples)):
        samples = samples.todense()
    
    return [samples, featureNames,imp,scaler]
Example #8
File: B.py Project: keyu-lai/NLP
def vectorize(train_features, test_features):
    """
    convert set of features to vector representation
    :param train_features: A dictionary with the following structure
             { instance_id: {f1:count, f2:count,...}
            ...
            }
    :param test_features: A dictionary with the following structure
             { instance_id: {f1:count, f2:count,...}
            ...
            }
    :return: X_train: A dictionary with the following structure
             { instance_id: [f1_count,f2_count, ...]}
            ...
            }
            X_test: A dictionary with the following structure
             { instance_id: [f1_count,f2_count, ...]}
            ...
            }
    """
    X_train = {}
    X_test = {}

    vec = DictVectorizer()
    vec.fit(train_features.values())
    for instance_id in train_features:
        X_train[instance_id] = vec.transform(train_features[instance_id]).toarray()[0]

    for instance_id in test_features:
        X_test[instance_id] = vec.transform(test_features[instance_id]).toarray()[0]

    return X_train, X_test
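A small usage sketch for vectorize above, with made-up per-instance feature counts; it assumes DictVectorizer is already imported at module level:

train_features = {'inst1': {'bank': 2, 'river': 1}, 'inst2': {'bank': 1, 'money': 3}}
test_features = {'inst3': {'river': 2, 'loan': 1}}          # 'loan' is unseen and is silently dropped
X_train, X_test = vectorize(train_features, test_features)
print(X_train['inst1'])                                     # dense vector over the training feature space
print(X_test['inst3'])                                      # same feature space, so the shapes match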
Example #9
def TransformIntoVectors(totalData,totalLabel):
    v = DictVectorizer(sparse=True)
    
    X =  v.fit_transform(totalData)   
    Y = array(totalLabel)
    
    return (X,Y)
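A minimal call sketch for TransformIntoVectors, with toy data; it assumes numpy's array and DictVectorizer are imported at module level as in the snippet:

total_data = [{'f1': 1.0, 'color': 'red'}, {'f1': 2.0, 'color': 'blue'}]
total_label = [0, 1]
X, Y = TransformIntoVectors(total_data, total_label)
print(X.shape, Y.shape)                                     # sparse feature matrix and label array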
Example #10
class Projects:
  def __init__(self, outcome_file):
    self.state_feature_index = 7
    self.zip_feature_index = 8
    self.binary_feature_index = [12, 13, 14, 15, 16, 17, 19, 20, 32, 33]
    self.categorical_feature_index = [18, 21, 22, 25, 26, 27, 28]
    self.numerical_feature_index = [29, 30, 31]
    self.date_feature_index = 34
    self.vec = DictVectorizer(sparse=False)
    self.load_projects(outcome_file)
    
  def load_projects(self, outcome_file):
    fin = open(outcome_file)
    self.project_feature_names = fin.next().strip().split(',')
    self.projects = dict((line.strip().split(',')[0], line.strip().split(','))\
    for line in fin)
    fin.close()
    
  def all_features(self, pids):
    measurements_state = map(lambda k: {str(self.state_feature_index): self.projects[k][self.state_feature_index]}, pids)
    measurements_zip = map(lambda k: {str(self.zip_feature_index): self.projects[k][self.zip_feature_index][:3]}, pids)
    measurements_bin = map(lambda k: dict((str(fi), self.projects[k][fi]) for fi in self.binary_feature_index), pids)
    measurements_cat = map(lambda k: dict((str(fi), self.projects[k][fi]) for fi in self.categorical_feature_index), pids)
    #measurements_num = map(lambda k: [float(self.projects[k][fi]) for fi in self.numerical_feature_index], pids)
    measurements_num = map(lambda k: dict((str(fi), str(discretize_num(float(self.projects[k][fi])))) for fi in self.numerical_feature_index), pids)
    return self.vec.fit_transform(measurements_state), self.vec.fit_transform(measurements_zip), self.vec.fit_transform(measurements_bin), self.vec.fit_transform(measurements_cat), self.vec.fit_transform(measurements_num)#,np.array(measurements_num)
Example #11
def test_dictvectorizer():
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    for sparse in (True, False):
        for dtype in (int, np.float32, np.int16):
            for sort in (True, False):
                for iterable in (True, False):
                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                    X = v.fit_transform(iter(D) if iterable else D)

                    assert_equal(sp.issparse(X), sparse)
                    assert_equal(X.shape, (3, 5))
                    assert_equal(X.sum(), 14)
                    assert_equal(v.inverse_transform(X), D)

                    if sparse:
                        # CSR matrices can't be compared for equality
                        assert_array_equal(X.A, v.transform(iter(D) if iterable
                                                            else D).A)
                    else:
                        assert_array_equal(X, v.transform(iter(D) if iterable
                                                          else D))

                    if sort:
                        assert_equal(v.feature_names_,
                                     sorted(v.feature_names_))
Example #12
def tokenize( training_data, test_data ):
	# print training_data.shape
	if 'Risk_Stripe' in categorical_fields:
		categorical_fields.remove( 'Risk_Stripe' )
	
	
	for c in categorical_fields:
		training_data[c] = training_data[c].map(str)
		test_data[c] = test_data[c].map(str)

	cat_data = training_data[categorical_fields]
	ts_cat_data = test_data[categorical_fields]
	# print cat_data.shape
	vec = DictVectorizer()
	tr_cat_data_dict = cat_data.T.to_dict().values()
	ts_cat_data_dict = ts_cat_data.T.to_dict().values()
	tr_cat_data_array = vec.fit_transform( tr_cat_data_dict ).toarray()
	ts_cat_data_array = vec.transform( ts_cat_data_dict ).toarray()
	# print tr_cat_data_array.shape
	# print ts_cat_data_array.shape
	non_cat_data = training_data.drop( categorical_fields, axis=1 )
	non_cat_data = np.array( non_cat_data ).astype(np.float)
	new_tr_data = np.concatenate( (tr_cat_data_array, non_cat_data), axis=1 )
	# print new_tr_data.shape
	non_cat_data = test_data.drop( categorical_fields, axis=1 )
	non_cat_data = np.array( non_cat_data ).astype(np.float)
	new_ts_data = np.concatenate( (ts_cat_data_array, non_cat_data), axis=1 )
	# print new_ts_data.shape
	new_tr_data = pd.DataFrame( new_tr_data, index=training_data.index )
	new_ts_data = pd.DataFrame( new_ts_data, index=test_data.index )
	return new_tr_data, new_ts_data
Example #13
def cv_prediction(feature_dict, feature, polarity, threshold, folds):
    accuracy = 0
    precision = 0
    recall = 0
    f1 = 0
    count = 0
    dicvec = DictVectorizer()
    LR = LogisticRegression()
    kfold = KFold(len(polarity), n_folds=folds)
    for train, test in kfold:
        count += 1
        x = [feature[i] for i in train]
        y = [polarity[i] for i in train]
        x.append(feature_dict)
        y.append(0)
        LR.fit(dicvec.fit_transform(x), y)
        test_label = []
        answer_label = [polarity[j] for j in test]
        for j in test:
            query = fit_feature(feature[j], feature_dict)
            result = -1 if query.shape[1] != len(feature_dict) else prediction(LR, query, threshold)
            test_label.append(result)
        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))

    return accuracy, precision, recall, f1
Example #14
 def _dic_list_to_matrix(self, processedData, normalize):
     vectorizer = DictVectorizer()
     if normalize:
         res = preprocessing.normalize(vectorizer.fit_transform(processedData), norm='l2')
     else:
         res = vectorizer.fit_transform(processedData)
     return vectorizer.get_feature_names(), res
Example #15
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train morphology generation model')
    parser.add_argument('category', help='Russian word category to (R/V/A/N/M)')
    parser.add_argument('model', help='output file for trained model')
    parser.add_argument('--penalty', help='regularization penalty', type=float, default=0.001)
    args = parser.parse_args()

    assert len(args.category) == 1
    with open(args.model, 'w') as f:
        f.write('write test / training...')

    logging.info('Extracting features for training data')

    training_features = []
    training_outputs = []
    for source, target, alignment in read_sentences(sys.stdin):
        for features, output in extract_instances(args.category, source, target, alignment):
            training_features.append(features)
            training_outputs.append(output)

    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(training_features)
    y = training_outputs

    logging.info('Training data size: %d instances x %d features', *X.shape)
    logging.info('Training model for category: %s (%d tags)', args.category, len(set(y)))

    model = LogisticRegression(C=args.penalty)
    model.fit(X, y)

    with open(args.model, 'w') as f:
        cPickle.dump((args.category, vectorizer, model), f, protocol=-1)
Example #16
def learn_classify__svm_individual(data, folds, test_fold=4):

    test_folds = [0, 1, 2, 3, 4]

    X_train = []
    y_train = []
    X_test = []
    y_test = []

    for i in test_folds:
        if i == test_fold: continue
        for name in folds[i]:
            c, ind = parse_filename(name)
            X_train.append(data[c][ind]['features'])
            y_train.append(data[c][ind]['meta']['stance'])

    for i in test_folds:
        if i != test_fold: continue
        for name in folds[i]:
            c, ind = parse_filename(name)
            X_test.append(data[c][ind]['features'])
            y_test.append(data[c][ind]['meta']['stance'])

    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)  # reuse the feature space fitted on the training folds

    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)
Example #17
def get_LinearRegression_Acc(a,b,c):

    # Convert features into vector of numbers
    from sklearn.feature_extraction import DictVectorizer
    v1 = DictVectorizer().fit(a+c)

    #define training data
    X_data_tr = v1.transform(a)
    Y_data_tr = b

    #define test data
    X_data_ts = v1.transform(c)

    #import Linear Regression classifier
    import numpy as np
    from sklearn import linear_model
    regr = linear_model.LinearRegression()
    regr.fit(X_data_tr,Y_data_tr)

    #Use trained model to classify test data
    Y_pred = regr.predict(X_data_ts)
    # Convert into nearest integer
    Y_pred = np.rint(Y_pred)

    return Y_pred
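A hedged usage sketch with toy feature dicts standing in for a, b and c (despite its name, the function returns rounded predictions rather than an accuracy score):

train_x = [{'hour': 1, 'dow': 'mon'}, {'hour': 2, 'dow': 'tue'}, {'hour': 3, 'dow': 'wed'}]
train_y = [10, 20, 30]
test_x = [{'hour': 4, 'dow': 'mon'}]
print(get_LinearRegression_Acc(train_x, train_y, test_x))   # rounded predictions for test_x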
Example #18
def main():
    # load data
    #path = 'generated/extracted_text'
    os.system("mkdir generated")
    path = 'extracted_text'
    data = map(json.loads, file(path))

    # count word for every tag
    tags = TAGS + ['boilerplate', 'boilerpipe']
    counts_per_tag = {}

    for tag in tags:
        counts = map(count, get(tag, data))
        counts_per_tag[tag] = counts

    total = sum_up(counts_per_tag, len(data))

    # vectorize
    v = DictVectorizer()
    v.fit([total])

    features = {}
    for tag in tags:
        features[tag] = v.transform(counts_per_tag[tag])

    save('text_features', features)
    save('text_vectorizer', v)
    os.system("mv generated/text_features . ")
    os.system("mv generated/text_vectorizer . ")
Example #19
def cat_to_dummy(df, cat_cols, test=False, vectorizer=None):
    """
    Convert categorical variables to dummies for either train or test dataset
    """
    num_cols = list(set(df.columns) - set(cat_cols))

    cat = df[cat_cols].fillna('NA').astype(str)  # fill NaN before casting, otherwise NaN becomes the string 'nan'
    num = df[num_cols]

    x_num = num.values

    x_cat = cat.T.to_dict().values()

    if test:
        vec_x_cat = vectorizer.transform(x_cat)
        x = np.hstack((x_num, vec_x_cat))
        return x

    else:
        vectorizer = DV(sparse=False)
        vec_x_cat = vectorizer.fit_transform(x_cat)
        x = np.hstack((x_num, vec_x_cat))
        return x, vectorizer
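A minimal sketch of the intended train/test flow for cat_to_dummy, using a made-up DataFrame; it assumes pandas, numpy and the DV alias for DictVectorizer are imported in the module above:

import numpy as np
import pandas as pd

train = pd.DataFrame({'color': ['red', 'blue'], 'size': [1.0, 2.0]})
test = pd.DataFrame({'color': ['red', 'green'], 'size': [3.0, 4.0]})

x_train, vec = cat_to_dummy(train, cat_cols=['color'])                        # fit on the training set
x_test = cat_to_dummy(test, cat_cols=['color'], test=True, vectorizer=vec)    # reuse the fitted vectorizer
print(x_train.shape, x_test.shape)                                            # unseen categories are ignored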
Example #20
def featurize(X):
	"""
	Featurizes using just word overlap features.
	Extracts the unigram features from the given train set.
	- X: numpy array with (n_train, max_seqlen, max_sentlen) 

	Return:
	- vector: A numpy array of size (n_train, n_features)
	corresponding to the featurization of X.  
	"""	
	feats = []

	n_train = len(X)
	print n_train
	for i in xrange(n_train):
		feat_dict = {}
		s1 = X[i][0]
		s2 = X[i][1]
		feat_dict.update(word_overlap_features(s1, s2))
		feats.append(feat_dict)
		    
	v = DictVectorizer(sparse=True)
	vector = v.fit_transform(feats)

	return vector
Example #21
def get_neurosynth_terms(combined_df):
    """ Grab terms for each image, decoded with neurosynth"""
    terms = list()
    from sklearn.feature_extraction import DictVectorizer
    vectorizer = DictVectorizer()
    image_ids = list()
    for row in combined_df.iterrows():
        image_id = row[1]['image_id']
        image_ids.append(int(image_id))
        print "Fetching terms for image %i" % image_id
        image_url = row[1]['url_image'].split('/')[-2]

        try:
            elevations = mem.cache(url_get)(
                        'http://neurosynth.org/decode/data/?neurovault=%s'
                        % image_url)
            data = json.loads(elevations)['data']
            data = dict([(i['analysis'], i['r']) for i in data])
        except HTTPError:
            data = {}
        terms.append(data)
    X = vectorizer.fit_transform(terms).toarray()
    term_dframe = dict([('neurosynth decoding %s' % name, X[:, idx])
                        for name, idx in vectorizer.vocabulary_.items()])
    term_dframe['image_id'] = image_ids

    return pd.DataFrame(term_dframe)
Example #22
def KFoldPredictionScore (X,y,k,header):

    from sklearn.svm import SVC
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer()

    try:
        accuracy = 0.0
        for X_train, y_train, X_test, y_test in k_fold_generator(X, y, k):

            vec = DictVectorizer()
            fit = vec.fit(X_train)

            X_train_counts = fit.transform(X_train)
            X_test_counts = fit.transform(X_test)
            clf = SVC(kernel="linear", C=0.025)
            try:
                clf.fit(X_train_counts.toarray(), y_train)
                #predict = clf.predict(X_test_counts.toarray())
                accuracy += clf.score(X_test_counts.toarray(),y_test)
                # coef = clf._get_coef()
               # print(np.argsort(coef)[-20:])
                #for i in range(0,len(X_test)):
                    #print (X_test[i]['ID']+"\t"+y_test[i]+"\t"+predict[i])
            except BaseException as b:
                    print (b)
        print (header+"\t"+str(accuracy))
    except BaseException as b:
        print (b)
Example #23
def get_similarity_of_words():
	f = open('final_rated_posts.csv', 'r')
	counters = []
	vocab_map = dict()
	index = 0
	for _ in range(5):
		counters.append(Counter())
	for line in f:
		attributes = line.split('@@')
		words = set(attributes[1].strip().split())
		words_mapped = []
		for word in words:
			if word not in vocab_map:
				vocab_map[word] = index
				index += 1
			words_mapped.append(vocab_map[word])
		label = int(attributes[2])
		if label in (1, 2, 3, 4, 5):
			counters[label-1] += Counter(list(words_mapped))
	f.close()
	vectorizer = DictVectorizer(sparse=False)
	data = vectorizer.fit_transform(counters)
	words_1 = set(counters[0].keys())
	words_5 = set(counters[4].keys())
	print len(words_1)
	print len(words_5)
	print len(words_1 - words_5)
	print len(words_5 - words_1)
	for i in xrange(len(data)):
		words_i = data[i]
		for j in xrange(len(data)):
			words_j = data[j]
			print "Comparing words of posts categorized as %d and %d" % (i + 1, j + 1)
			#print cosine_similarity(words_i, words_j)
			print jaccard_sim(set(counters[i]), set(counters[j]))
Example #24
def get_linear_regression_model(x, y):
    print 'generating linear model'
    linreg = linear_model.LinearRegression()
    d = DictVectorizer()
    t_x = d.fit_transform(x)
    linreg.fit(t_x, y)
    return d, linreg
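A hedged usage sketch with toy data; it assumes DictVectorizer and sklearn's linear_model are imported at module level as in the snippet above (and, like the snippet, it runs under Python 2 because of the print statement):

x = [{'rooms': 2, 'city': 'NY'}, {'rooms': 3, 'city': 'SF'}, {'rooms': 4, 'city': 'NY'}]
y = [100, 200, 300]
d, linreg = get_linear_regression_model(x, y)
print(linreg.predict(d.transform([{'rooms': 5, 'city': 'SF'}])))    # predict for an unseen dict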
Example #25
def create_matrix(dataCols, exclude_aids=[], return_pairs=False):
    # Find aid-pid pairs
    pairs = []
    for aid in [a for a in dataCols.values()[0] if a not in exclude_aids]:
        for pid in dataCols.values()[0][aid]:
            addpair = True
            for dataCol in dataCols.values():
                try:
                    tmp = dataCol[aid][pid]
                except KeyError:
                    addpair = False
            if addpair:
                pairs.append((aid, pid))

    # Create the list of dictionaries
    dataDict = [{} for i in pairs]
    for n in range(len(pairs)):
        aid, pid = pairs[n]
        for dataName, data in dataCols.items():
            dataDict[n][dataName] = data[aid][pid]

    # Create the matrix
    vec = DictVectorizer()
    data = vec.fit_transform(dataDict).toarray()
    colnames = vec.get_feature_names()

    # Done
    if return_pairs:
        return data, colnames, pairs
    else:
        return data, colnames
Example #26
def getTermStatistics(all_hits):
    host =  environ['ELASTICSEARCH_SERVER'] if environ.get('ELASTICSEARCH_SERVER') else 'localhost'
    es = Elasticsearch(hosts=[host])
    tfidfs = []
    docs = []

    for i in range(0, len(all_hits), 100):
        hits = all_hits[i:i+100]
        term_res = es.mtermvectors(index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex', 
                                doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page', 
                                term_statistics=True, fields=['text'], ids=hits)
        #pprint.pprint(term_res['docs'])
        for doc in term_res['docs']:
            #pprint.pprint(doc)
            if doc.get('term_vectors'):
                if 'text' in doc['term_vectors']:
                    docs.append(doc['_id'])
                    tfidfs.append(terms_from_es_json(doc))
            #else:
             #   pprint.pprint(doc)
        #pprint.pprint(tfidfs)
    
    v = DictVectorizer()

    return [v.fit_transform(tfidfs), v.get_feature_names()]
Example #27
    def extract(self):
        """
        Extract features for clustering
        """

        continuous_features = []
        discrete_features = []

        for page, text in zip(self.pages, self.texts):
        
            # continuous features
            continuous_features.append([
                process(page, text)
                for key, process in self.CONTINUOUS_FEATURES.iteritems()
            ])

            # discrete features
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            discrete_features.append(discrete_feature)

        # build numpy array
        continuous_features = preprocessing.scale(np.array(continuous_features))

        # vectorize discrete features
        vectorizer = DictVectorizer()
        discrete_features = vectorizer.fit_transform(discrete_features).toarray()

        return np.hstack([continuous_features, discrete_features]).astype(np.float32)
Example #28
def test_sklearn_nb(balanced):
    movie_words = process_plots_mp(balanced)

    training_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 != 0]
    test_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 == 0]

    vec = DictVectorizer()
    training_features = vec.fit_transform([movie.wordcounts for movie in training_movies]).toarray()
    training_labels = np.array([movie.year for movie in training_movies])
    #LOGGER.debug("Original size of feature vectors: %d (issparse: %s)" % (
        #csr_matrix(training_features[-1]).toarray().size, str(issparse(training_features))
    #))

    mnb_classifier = MultinomialNB()
    mnb_classifier.fit(training_features, training_labels)

    test_features = vec.transform([movie.wordcounts for movie in test_movies])
    test_labels = np.array([movie.year for movie in test_movies])

    results = mnb_classifier.predict(test_features)

    correct = sum([1 for i, result in enumerate(results) if result == test_labels[i]])
    LOGGER.info("skleanrn's MultinomialNB classifier predicted %d/%d correctly (%0.3f%% accuracy)" % (
        correct, len(test_labels), 100.0 * correct / len(test_labels)
    ))
Example #29
def retrain_models(username):
	train_x, train_y, body_x, body_y, head_x, head_y = model_retriever.retrieve_data_db(username)

	b_train_x = []
	b_train_y = numpy.concatenate([body_y, train_y])

	for msg in (body_x + train_x):
		b_train_x.append(extract_body_features(msg))

	body_vec = TfidfVectorizer(norm="l2")
	b_train_x = body_vec.fit_transform(b_train_x)

	h_train_x = []
	h_train_y = numpy.concatenate([head_y, train_y])

	for msg in (head_x + train_x):
		h_train_x.append(extract_header_features(msg))

	head_vec = DictVectorizer()
	h_train_x = head_vec.fit_transform(h_train_x)

	body_model = LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
	head_model = RidgeClassifier(tol=1e-2, solver="lsqr")

	body_model.fit(b_train_x, b_train_y)
	head_model.fit(h_train_x, h_train_y)

        print("Finished training models for "+username+"...")

	store_models(username, body_vec, body_model, head_vec, head_model)
Example #30
def runsvd():
    data = []
    y = []
    students=set()
    steps=set()

    #training input data
    for i in range(0, numOfLines):
        data.append({"student_id": str(student_id[i]), "step_id": str(step_full_name[i])})
        y.append(int(correct_first_attempt[i]))
        #students.add(student_id[i])
        #steps.add(step_full_name[i])

    #training output data
    data2 = []
    y2 = []
    for i in range(0, numOfLines2):
        data2.append({"student_id": str(student_id2[i]), "step_id": str(step_full_name2[i])})
        y2.append(int(correct_first_attempt2[i]))
    test_data = data2
    y_test = np.array(y2)





    train_data = data
    y_train = np.array(y)


    print(len(train_data))
    print(len(y_train))
    train_data_same = copy.copy(train_data)

    v = DictVectorizer()
    X_train = v.fit_transform(train_data)
    #so far N=40 is good, iter=55
    fm = pylibfm.FM(num_factors=40, num_iter=55, verbose=True, task="classification", initial_learning_rate=0.2, learning_rate_schedule="optimal")
    fm.fit(X_train, y_train)


    # Evaluate
    train_data_same = v.transform(train_data_same)
    test_data = v.transform(test_data)
    preds = fm.predict(test_data)
    #print(y_train)
    #print(preds)
    with open('FactorizationMachineResult.csv', 'w') as csvfile:
        fieldnames = ['Row', 'Student ID', 'Correct First Attempt', 'real y']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for x in range(numOfLines2):
            writer.writerow({'Row': row_Num2[x], 'Student ID': student_id2[x] ,'Correct First Attempt': preds[x], 'real y': y_test[x]})



    #print("FM MSE: %.4f" % mean_squared_error(y_train,preds))
    rmse = mean_squared_error(y_test, preds)**0.5
    print("RMSE: %.4f" % rmse)
    return
Example #31
## Fill the NaN values in Age with the mean age
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
## Fill the NaN values in Fare with the mean fare
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)
## Fill missing Embarked values with the most frequent embarkation port
print(train_data['Embarked'].value_counts())
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S', inplace=True)
# Feature selection
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]
## Vectorize the symbolic (categorical) features
dvec = DictVectorizer(sparse=False)
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
print(dvec.feature_names_)
# Build an ID3 decision tree
clf = DecisionTreeClassifier(criterion='entropy')
# Train the decision tree
clf.fit(train_features, train_labels)
# Predict with the decision tree
test_features = dvec.transform(test_features.to_dict(orient='records'))
pred_labels = clf.predict(test_features)
# Accuracy of the decision tree on the training set
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score accuracy: %4lf' % acc_decision_tree)
# K-fold cross-validation
acc_cross_decision_tree = np.mean(
    cross_val_score(clf, train_features, train_labels, cv=10))
Example #32
    def fit(self, X, y):
        """Build a WEASEL classifiers from the training set (X, y),

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, enforce_univariate=True)
        y = y.values if isinstance(y, pd.Series) else y

        # Window length parameter space dependent on series length
        self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0])
        self.max_window = min(self.series_length, self.max_window)
        self.window_sizes = list(range(self.min_window,
                                       self.max_window,
                                       self.win_inc))

        max_acc = -1
        self.highest_bit = (math.ceil(math.log2(self.max_window))+1)

        final_bag_vec = None

        for norm in self.norm_options:
            # transformers = []

            for w, word_length in enumerate(self.word_lengths):
                all_words = [dict() for x in range(len(X))]
                transformers = []

                for i, window_size in enumerate(self.window_sizes):
                    # if w == 0:  # only compute once, otherwise shorten
                    transformer = SFA(word_length=np.max(word_length),
                                      alphabet_size=self.alphabet_size,
                                      window_size=window_size,
                                      norm=norm,
                                      anova=self.anova,
                                      binning_method=self.binning_strategy,
                                      bigrams=self.bigrams,
                                      remove_repeat_words=False,
                                      lower_bounding=False,
                                      save_words=False)
                    sfa_words = transformer.fit_transform(X, y)
                    transformers.append(transformer)

                    # use the shortening of words trick
                    # sfa_words = transformers[i]._shorten_bags(word_length)

                    # TODO refactor? dicts not really needed here ...
                    bag = sfa_words.iloc[:, 0]

                    # chi-squared test to keep only relevant features
                    # bag_vec = DictVectorizer(sparse=False).fit_transform(bag)
                    # chi2_statistics, p = chi2(bag_vec, y)
                    # relevant_features = np.where(
                    #    chi2_statistics >= self.chi2_threshold)[0]

                    # merging bag-of-patterns of different window_sizes
                    # to single bag-of-patterns with prefix indicating
                    # the used window-length
                    for j in range(len(bag)):
                        for (key, value) in bag[j].items():
                            # if key in relevant_features:  # chi-squared test
                            # append the prefices to the words to
                            # distinguish between window-sizes
                            word = (key << self.highest_bit) | window_size
                            # X_all_words[j].append((word, value))
                            all_words[j][word] = value

                # TODO use CountVectorizer instead on actual words ... ???
                vectorizer = DictVectorizer(sparse=True)
                bag_vec = vectorizer.fit_transform(all_words)

                clf = LogisticRegression(max_iter=5000, solver="liblinear",
                                         dual=True, penalty="l2",
                                         random_state=self.random_state)
                current_acc = cross_val_score(clf, bag_vec, y, cv=5).mean()

                # clf = RandomForestClassifier(oob_score=True,
                #                              n_estimators=1000,
                #                              n_jobs=-1).fit(bag_vec, y)
                # current_acc = clf.oob_score_

                # print("Train acc:", norm, word_length, current_acc)

                if current_acc > max_acc:
                    max_acc = current_acc
                    self.vectorizer = vectorizer
                    self.clf = clf
                    self.SFA_transformers = transformers
                    self.best_word_length = word_length
                    final_bag_vec = bag_vec

                if max_acc == 1.0:
                    break  # there can be no better model than 1.0

        # # fit final model using all words
        # for i, window_size in enumerate(self.window_sizes):
        #     self.SFA_transformers[i] = \
        #         SFA(word_length=np.max(self.word_lengths),
        #             alphabet_size=self.alphabet_size,
        #             window_size=window_size,
        #             norm=norm,
        #             anova=self.anova,
        #             binning_method=self.binning_strategy,
        #             bigrams=self.bigrams,
        #             remove_repeat_words=False,
        #             lower_bounding=False,
        #             save_words=False)
        #     self.SFA_transformers[i].fit_transform(X, y)

        self.clf.fit(final_bag_vec, y)
        self._is_fitted = True
        return self
Example #33
    parser.add_argument('student_data',
                        type=argparse.FileType('r'),
                        help="the student data file in datashop format")
    args = parser.parse_args()

    if args.ft == "transaction":
        ssr_file = transaction_to_student_step(args.student_data)
        ssr_file = open(ssr_file, 'r')
    else:
        ssr_file = args.student_data

    kcs, opps, y, stu, student_label, item_label = read_datashop_student_step(
        ssr_file)

    # Get everything in the right matrix format
    sv = DictVectorizer()
    qv = DictVectorizer()
    ov = DictVectorizer()
    S = sv.fit_transform(stu)
    Q = qv.fit_transform(kcs)
    O = ov.fit_transform(opps)
    X = hstack((S, Q, O))
    y = np.array(y)

    # Regularize the student intercepts
    l2 = [1.0 for i in range(S.shape[1])]
    l2 += [0.0 for i in range(Q.shape[1])]
    l2 += [0.0 for i in range(O.shape[1])]

    # Bound the learning rates to be positive
    bounds = [(None, None) for i in range(S.shape[1])]
Example #34
n_hidden_2 = 32  # number of units in hidden layer 2
n_input = 4  # number of input variables
n_classes = 2  # number of classes; 2 here because the target is survived or not

# read the csv file
df = pd.read_csv('train.csv', header=0)
labelEncoder = preprocessing.LabelEncoder()
df['Sex'] = labelEncoder.fit_transform(df['Sex'])
# df['Cabin'] = labelEncoder.fit_transform(df['Cabin'])
# df['Embarked'] = labelEncoder.fit_transform(df['Embarked'])

#x_np = np.array(df[['Pclass', 'Sex', 'Age', 'Parch' ,'Fare']].fillna(0))
# x_np = np.array(df[['Pclass', 'Sex', 'Age' ,'Fare']].fillna(0))
x_np = np.array(df[['Pclass', 'Sex', 'Age', 'Fare']].fillna(0))
d = df[['SurvivedText']].to_dict('records')
vectorizer = DictVectorizer(sparse=False)
print(d)
y_np = vectorizer.fit_transform(d)
print(y_np)
[x_train, x_test] = np.vsplit(x_np, [train_size])  # split the inputs into training and test sets
[y_train, y_test] = np.vsplit(y_np, [train_size])  # split the labels into training and test sets

#x_train, x_test, y_train, y_test = train_test_split(x_np, y_np, test_size=0.3, random_state=0)

# tf Graph input
x = tf.placeholder("float", [None, n_input])

# two kinds of answers (survived or not)
y = tf.placeholder("float", [None, n_classes])

Example #35
                    if len(right_feature) == k + 1:
                        features['%s-%s' % (i + 1, i + k + 1)] = right_feature
                    if len(left_feature) == k + 1:
                        features['%s-%s' % (-i - 1, -i - k - 1)] = left_feature
        x.append(features)
    word_stripped = word.replace('-', '')
    return (x, [_build_feature_dict(word_stripped, k, size, size)
               for k in xrange(len(word_stripped))],
        #(np.array(y) == 0).astype(int),
        np.array(y, dtype=int) + 2,
        np.array(stress, dtype=int))


if __name__ == '__main__':
    X_train, y_train = [], []
    vect_syl = DictVectorizer(sparse=True)
    vect_stress = DictVectorizer(sparse=True)
    vect_syl.feature_names_ = set()
    vect_stress.feature_names_ = set()
    # fit vectorizers
    for _, word, stress in syllabifications('../silabe.train.xml', 10):
        if len(word.strip().replace('-', '')) != len(stress):
            print >> sys.stderr, "Skipped %s" % word
            continue
        x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict(
                word.strip(), stress, size=4)
        for x in x_dict_syl:
            for f, v in x.iteritems():
                if isinstance(v, (str, unicode)):
                    f = "%s%s%s" % (f, vect_syl.separator, v)
                vect_syl.feature_names_.add(f)
Example #36
		j += 1
		print "Progress : ", j, " / ", len(X_train), "\r",
		sys.stdout.flush()

	x_test[steal_ratio] = []
	for dataPoint in X_test:
	 	x_test[steal_ratio].append(featureHybrid(dataPoint, steal_ratio))


	# z = zip(x_train, y_train)
	# random.shuffle(z)
	# x_train = [a for a,b in z]
	# y_train = [b for a,b in z]


	v = DictVectorizer(sparse = True)
	x_train = v.fit_transform(x_train)

	print "Shape of Data     : ", x_train.shape
	print "Positive Labels   : ", sum(y_train)

	model[steal_ratio] = LinearSVC().fit(x_train, y_train)
	p_label = model[steal_ratio].predict(x_train)
	showResults(y_train, p_label)

	print "Testing set size : ", len(x_test[steal_ratio])
	print "Positive Labels  : ", sum(y_test)

	x_test[steal_ratio] = v.transform(x_test[steal_ratio])
	p_label = model[steal_ratio].predict(x_test[steal_ratio])
	showResults(y_test, p_label)
Example #37
 def __init__(self, lens, **kwargs):
     super().__init__(**kwargs)
     self.lens = lens
     self.vect = DictVectorizer()
Example #38
    def __init__(self,
                 dataset,
                 training,
                 test,
                 config='config/properties.json',
                 lr=0.001,
                 num_factors=10,
                 num_iter=100,
                 threshold=4,
                 implicit=False):

        self.dataset = dataset

        self.config_file = config

        self.properties = []

        self.implicit = implicit

        if self.implicit:

            self.threshold = 0.5

        else:

            self.threshold = threshold

        self._set_properties()

        self._read_item_attributes()

        print('finished reading item attributes')

        self.model = pylibfm.FM(num_factors=num_factors,
                                num_iter=num_iter,
                                verbose=True,
                                task="classification",
                                initial_learning_rate=lr,
                                learning_rate_schedule="optimal")

        self.x_train, self.y_train, self.train_users, self.train_items = self._load_data(
            training)

        self.x_test, self.y_test, self.test_users, self.test_items = self._load_data(
            test)

        if self.implicit:  # need to generate negative candidates for training

            num_negative_candidates = 100

            all_items = self.train_items.union(self.test_items)

            unrated_items = [
                item for item in all_items if item not in self.train_items
            ]

            unrated_items = sorted(unrated_items)

            for user in self.train_users:

                negative_candidates = list(
                    random.sample(unrated_items, num_negative_candidates))

                for item in negative_candidates:

                    self.x_train.append(self._fetch_attributes(user, item))

                    self.y_train.append(0.)

            for user in self.test_users:

                negative_candidates = list(
                    random.sample(unrated_items, num_negative_candidates))

                for item in negative_candidates:

                    self.x_test.append(self._fetch_attributes(user, item))

                    self.y_test.append(0.)

        print('finished reading data')

        self.vectorizer = DictVectorizer()

        self.x_train = self.vectorizer.fit_transform(self.x_train)

        self.x_test = self.vectorizer.transform(self.x_test)

        print('finished transforming data')

        self.model.fit(self.x_train, self.y_train)  # fit the model

        print('finished fitting model')
Example #39
            item_ids = []
        for item_id in item_ids:
            x = np.append(user_dic[user_id].A,
                          item_dic[item_id].A).reshape(1, -1)
            x_tem = np.append(x_tem, x, axis=0)
            y_data.append(vector[item_id])
            users.append(user_id)
            items.append(item_id)
        x_data = sparse.vstack((x_data, sparse.csr_matrix(x_tem[1:])))
        print('Creating X_data Now:', user_id)
    return x_data.tocsr()[1:], np.array(y_data), np.array(users), np.array(
        items)


data = open('Data_Lon.csv').readlines()

user_data, user_style, item_attribute, item_tag = Load_data(data)

v = DictVectorizer()
user_data = v.fit_transform(user_data)

user_data = sparse.hstack((user_data, user_style)).tocsr()
item_data = sparse.hstack((item_attribute, item_tag)).tocsr()

x_data, y_data, user_id, item_id = create_x_data()

sparse.save_npz('x_data.npz', x_data)
sparse.save_npz('y_data.npz', sparse.csr_matrix(y_data))
sparse.save_npz('user_id.npz', sparse.csr_matrix(user_id))
sparse.save_npz('item_id.npz', sparse.csr_matrix(item_id))
Example #40
#    #filtered = [w for w, pos in refiltered if pos.startswith('NN')]
#    # stemming
#    ps = PorterStemmer()
#    filtered = [ps.stem(w) for w in filtered]
#    return " ".join(filtered)
def clear_title(title, remove_stopwords):
    raw_text = BeautifulSoup(title, 'lxml').get_text()
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    words = letters.lower().split()
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    return ' '.join(words)


dict_vec = DictVectorizer(sparse=False)
PATH_TO_ORIGINAL_DATA = './datasets/'
#PATH_TO_PROCESSED_DATA = '/path/to/store/processed/data/'
data = pd.read_csv(PATH_TO_ORIGINAL_DATA + 'cleared_bugs', sep='\t')
selected_columns = ['Product', 'Component', 'Assignee', 'Summary']
data = data[selected_columns]
#print len(data['Product'].unique())
#print len(data['Component'].unique())
classes = data['Assignee'].unique()
n_classes = len(classes)
#print n_classes
classmap = pd.Series(data=np.arange(n_classes), index=classes)
#print classmap
data = pd.merge(data,
                pd.DataFrame({
                    'Assignee': classes,
Example #41
# First fill in the missing values in 'age'; using the mean (or the median) is the strategy that introduces the least bias into the model.
# inplace=True: modify the original object in place instead of creating a new one;
# inplace=False: create and return a new object that holds the modified result.
X['age'].fillna(X['age'].mean(), inplace=True)

# Split the original data; 25% of the passenger records are used for testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)

# Convert the categorical features into feature vectors
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)  # sparse=False means do not produce a sparse matrix
# After the transformation, each categorical feature is split out into its own column(s); numerical features stay unchanged
X_train = vec.fit_transform(X_train.to_dict(orient='records'))

# Transform the test features in the same way
X_test = vec.transform(X_test.to_dict(orient='records'))

# 1. Train and predict with a single decision tree
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()  # initialize the decision tree classifier with the default settings
dtc.fit(X_train, y_train)  # fit the model on the training split
dtc_y_predict = dtc.predict(X_test)  # predict on the test features with the trained tree

# 2. Train and predict with a random forest ensemble
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
Example #42
class FMRec:
    def __init__(self,
                 dataset,
                 training,
                 test,
                 config='config/properties.json',
                 lr=0.001,
                 num_factors=10,
                 num_iter=100,
                 threshold=4,
                 implicit=False):

        self.dataset = dataset

        self.config_file = config

        self.properties = []

        self.implicit = implicit

        if self.implicit:

            self.threshold = 0.5

        else:

            self.threshold = threshold

        self._set_properties()

        self._read_item_attributes()

        print('finished reading item attributes')

        self.model = pylibfm.FM(num_factors=num_factors,
                                num_iter=num_iter,
                                verbose=True,
                                task="classification",
                                initial_learning_rate=lr,
                                learning_rate_schedule="optimal")

        self.x_train, self.y_train, self.train_users, self.train_items = self._load_data(
            training)

        self.x_test, self.y_test, self.test_users, self.test_items = self._load_data(
            test)

        if self.implicit:  # need to generate negative candidates for training

            num_negative_candidates = 100

            all_items = self.train_items.union(self.test_items)

            unrated_items = [
                item for item in all_items if item not in self.train_items
            ]

            unrated_items = sorted(unrated_items)

            for user in self.train_users:

                negative_candidates = list(
                    random.sample(unrated_items, num_negative_candidates))

                for item in negative_candidates:

                    self.x_train.append(self._fetch_attributes(user, item))

                    self.y_train.append(0.)

            for user in self.test_users:

                negative_candidates = list(
                    random.sample(unrated_items, num_negative_candidates))

                for item in negative_candidates:

                    self.x_test.append(self._fetch_attributes(user, item))

                    self.y_test.append(0.)

        print('finished reading data')

        self.vectorizer = DictVectorizer()

        self.x_train = self.vectorizer.fit_transform(self.x_train)

        self.x_test = self.vectorizer.transform(self.x_test)

        print('finished transforming data')

        self.model.fit(self.x_train, self.y_train)  # fit the model

        print('finished fitting model')

    def _set_properties(self):

        with codecs.open(self.config_file, 'r',
                         encoding='utf-8') as config_read:

            property_file = json.loads(config_read.read())

            for typology in property_file[self.dataset]:

                for property_name in property_file[self.dataset][typology]:

                    self.properties.append(Property(property_name, typology))

    def _fetch_attributes(self, user, item):

        # create a dictionary with user item interactions and item attributes

        d = {'user_id': user, 'item_id': item}

        attribute_dict = self.item_attributes[item]

        for prop_name, attribute in attribute_dict.items():

            d[prop_name] = attribute

        return d

    def _read_item_attributes(self):

        self.item_attributes = defaultdict(
            dict)  # dict of dict containing item attributes

        for prop in self.properties:  # iterate in content based properties

            prop_name = prop.name

            if prop_name == 'feedback':  # no need for feedback data here

                pass

            if 'feedback_' in prop_name:  # no need for hybrid graphs

                prop_name = prop_name.replace('feedback_', '')

            with open('datasets/%s/graphs/%s.edgelist' %
                      (self.dataset, prop_name)) as edgelist:

                for line in edgelist:

                    line_split = line.strip('\n').split(' ')

                    item = line_split[0]

                    attribute = line_split[1]

                    self.item_attributes[item][prop_name] = attribute

    def _load_data(self, data):

        X = []

        y = []

        users = set()

        items = set()

        with open(data) as data_file:

            for line in data_file:

                line_split = line.strip('\n').split(' ')

                user = line_split[0]

                item = line_split[1]

                rating = line_split[2]

                # create a dictionary with user item interactions and item attributes

                d = self._fetch_attributes(user, item)

                X.append(d)

                if int(rating) >= self.threshold:

                    rating = 1

                else:
                    rating = 0

                y.append(float(rating))

                users.add(user)

                items.add(item)

        return X, y, users, items

    def compute_user_item_features(self, user, item, items_liked_by_user,
                                   users_liking_the_item):

        try:

            d = self._fetch_attributes(user, item)

            score = self.model.predict(self.vectorizer.transform(d))[0]

            features = [score]  # user item relatedness from fm model

        except KeyError:  # do not have user item pair in embedding

            features = [0.]

        return features

    def fit(self, x_train, y_train, qids_train):

        return 0

    def predict(self, x_test, qids_test):

        preds = x_test

        return preds

    @staticmethod
    def parse_args():

        parser = argparse.ArgumentParser(description="Run entity2rec.")

        parser.add_argument('--dimensions',
                            type=int,
                            default=200,
                            help='Number of dimensions. Default is 200.')

        parser.add_argument('--iter',
                            default=5,
                            type=int,
                            help='Number of epochs in SGD')

        parser.add_argument('--workers',
                            type=int,
                            default=8,
                            help='Number of parallel workers. Default is 8.')

        parser.add_argument('--config_file',
                            nargs='?',
                            default='config/properties.json',
                            help='Path to configuration file')

        parser.add_argument('--dataset',
                            nargs='?',
                            default='Movielens1M',
                            help='Dataset')

        parser.add_argument('--train',
                            dest='train',
                            help='train',
                            default=None)

        parser.add_argument('--test', dest='test', help='test', default=None)

        parser.add_argument('--validation',
                            dest='validation',
                            default=None,
                            help='validation')

        parser.add_argument(
            '--all_items',
            dest='all_unrated_items',
            action='store_false',
            default=True,
            help=
            'Whether keeping the rated items of the training set as candidates. '
            'Default is AllUnratedItems')
        parser.add_argument('--implicit',
                            dest='implicit',
                            action='store_true',
                            default=False,
                            help='Implicit feedback with boolean values')

        parser.add_argument('--write_features',
                            dest='write_features',
                            action='store_true',
                            default=False,
                            help='Writes the features to file')

        parser.add_argument('--read_features',
                            dest='read_features',
                            action='store_true',
                            default=False,
                            help='Reads the features from a file')

        parser.add_argument(
            '--threshold',
            dest='threshold',
            default=4,
            type=int,
            help='Threshold to convert ratings into binary feedback')

        parser.add_argument('--num_users',
                            dest='num_users',
                            type=int,
                            default=False,
                            help='Sample of users for evaluation')

        parser.add_argument('--lr',
                            dest='lr',
                            type=float,
                            default=0.001,
                            help='Starting value for the learning rate')

        parser.add_argument('--hyper_opt',
                            dest='hyper_opt',
                            default=False,
                            action='store_true',
                            help='Sample of users for evaluation')

        return parser.parse_args()
Example #43
def get_data_queue(args):
    users, items, labels = [], [], []
    if args.dataset == 'ml-100k':
        data_path = os.path.join(args.data, 'ml-100k', 'u.data')
    elif args.dataset == 'ml-1m':
        data_path = os.path.join(args.data, 'ml-1m', 'ratings.dat')
    elif args.dataset == 'ml-10m':
        data_path = os.path.join(args.data, 'ml-10m', 'ratings.dat')
    elif args.dataset == 'youtube-small':
        data_path = os.path.join(args.data, 'youtube-weighted-small.npy')

    if 'ml' in args.dataset:
        # movielens dataset
        with open(data_path, 'r') as f:
            for i, line in enumerate(f.readlines()):
                if args.dataset == 'ml-100k':
                    line = line.split()
                elif args.dataset == 'ml-1m' or args.dataset == 'ml-10m':
                    line = line.split('::')
                users.append(int(line[0]) - 1)
                items.append(int(line[1]) - 1)
                labels.append(float(line[2]))
        labels = StandardScaler().fit_transform(np.reshape(
            labels, [-1, 1])).flatten().tolist()

        print('user', max(users), min(users))
        print('item', max(items), min(items))

        users, items, labels = shuffle(users, items, labels)
        indices = list(range(len(users)))
        num_train = int(len(users) * args.train_portion)
        num_valid = int(len(users) * args.valid_portion)

        if args.mode != 'libfm':
            data_queue = torch.utils.data.TensorDataset(
                torch.tensor(users), torch.tensor(items), torch.tensor(labels))

            train_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[:num_train]),
                pin_memory=True)

            valid_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train:num_train + num_valid]),
                pin_memory=True)

            test_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train + num_valid:]),
                pin_memory=True)

        else:
            # prepare data format for libfm
            data_queue = []
            for i in range(len(users)):
                data_queue.append({
                    'user': str(users[i]),
                    'item': str(items[i])
                })

            v = DictVectorizer()
            data_queue = v.fit_transform(data_queue)
            train_queue = [
                data_queue[:num_train],
                np.array(labels[:num_train])
            ]
            valid_queue = [
                data_queue[num_train:num_train + num_valid],
                np.array(labels[num_train:num_train + num_valid])
            ]
            test_queue = [
                data_queue[num_train + num_valid:],
                np.array(labels[num_train + num_valid:])
            ]

    else:
        # 3-d dataset
        [ps, qs, rs, labels] = np.load(data_path).tolist()
        labels = StandardScaler().fit_transform(np.reshape(
            labels, [-1, 1])).flatten().tolist()

        ps = [int(i) for i in ps]
        qs = [int(i) for i in qs]
        rs = [int(i) for i in rs]
        print('p', max(ps), min(ps))
        print('q', max(qs), min(qs))
        print('r', max(rs), min(rs))

        ps, qs, rs, labels = shuffle(ps, qs, rs, labels)
        indices = list(range(len(ps)))
        num_train = int(len(ps) * args.train_portion)
        num_valid = int(len(ps) * args.valid_portion)

        if args.mode != 'libfm':
            data_queue = torch.utils.data.TensorDataset(
                torch.tensor(ps), torch.tensor(qs), torch.tensor(rs),
                torch.tensor(labels))

            train_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[:num_train]),
                pin_memory=True)

            valid_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train:num_train + num_valid]),
                pin_memory=True)

            test_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train + num_valid:]),
                pin_memory=True)

        else:
            # prepare data format for libfm
            data_queue = []
            for i in range(len(ps)):
                data_queue.append({
                    'p': str(ps[i]),
                    'q': str(qs[i]),
                    'r': str(rs[i])
                })

            v = DictVectorizer()
            data_queue = v.fit_transform(data_queue)
            train_queue = [
                data_queue[:num_train],
                np.array(labels[:num_train])
            ]
            valid_queue = [
                data_queue[num_train:num_train + num_valid],
                np.array(labels[num_train:num_train + num_valid])
            ]
            test_queue = [
                data_queue[num_train + num_valid:],
                np.array(labels[num_train + num_valid:])
            ]

    return train_queue, valid_queue, test_queue
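# A minimal illustration (added as a sketch, not part of the snippet above) of what the
# libfm branch builds: each (user, item) pair becomes one sparse one-hot row.
from sklearn.feature_extraction import DictVectorizer

demo_rows = [{'user': '0', 'item': '5'}, {'user': '1', 'item': '5'}]
print(DictVectorizer().fit_transform(demo_rows).toarray())
# two rows sharing the item=5 column, with separate user=0 / user=1 columns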
# split off a held-out test set first (the opening of this call is reconstructed from how
# ex_tv, y_tv, and ex_test are used below)
ex_tv, ex_test, y_tv, y_test = train_test_split(
    examples,
    ys,
    train_size=0.9,
    shuffle=True,
    random_state=RANDOM_SEED,
)
# split off train, validate from (tv) pieces.
ex_train, ex_vali, y_train, y_vali = train_test_split(
    ex_tv, y_tv, train_size=0.9, shuffle=True, random_state=RANDOM_SEED
)

#%% vectorize:

from sklearn.preprocessing import StandardScaler, MinMaxScaler

feature_numbering = DictVectorizer(sparse=False)
# Learn columns from training data (again)
feature_numbering.fit(ex_train)
# Translate our list of texts -> matrices of counts
rX_train = feature_numbering.transform(ex_train)
rX_vali = feature_numbering.transform(ex_vali)
rX_test = feature_numbering.transform(ex_test)

scaling = StandardScaler()
X_train = scaling.fit_transform(rX_train)
X_vali = scaling.transform(rX_vali)
X_test = scaling.transform(rX_test)

print(X_train.shape, X_vali.shape)
#%% train a model:
from sklearn.tree import DecisionTreeRegressor
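# The cell above stops at the import; a minimal continuation is sketched here, assuming
# the y_train / y_vali targets produced by the splits above.
tree_model = DecisionTreeRegressor(max_depth=8, random_state=RANDOM_SEED)
tree_model.fit(X_train, y_train)
print("train R^2:", tree_model.score(X_train, y_train))
print("vali R^2:", tree_model.score(X_vali, y_vali))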
Example #45
0
        """Here we transform each input (a string) into a python dict full of features"""
        return [self._ff(s) for s in X]


if __name__ == "__main__":

    # create Logistic Regression pipeline
    text_log_clf = Pipeline(
        [
            ('ff', FF(
                lowercase=True,
                byte_unigrams=True,
            )
            ),
            # This will convert python dicts into efficient sparse data structures
            ('dict', DictVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(max_iter=500, verbose=2, C=100., solver='sag')),
        ]
    )

    # create Naive Bayes pipeline
    text_nbc_clf = Pipeline(
        [
            ('ff', FF(
                lowercase=True,
                byte_unigrams=True,
            )
            ),
            # This will convert python dicts into efficient sparse data structures
            ('dict', DictVectorizer()),
    if dataset == "train":
        with open('../data/train_2.dat') as f:
            for line in f:
                (userID, movieID, rating) = line.split(' ')
                data.append({"userID": str(userID), "movieID": str(movieID)})
                try:
                    y.append(float(rating))
                except ValueError:
                    print("Check line {l}".format(l=line))
                users.add(userID)
                movies.add(movieID)
        return (data, y, users, movies)


train = get_unique_users_movies("train")
test = get_unique_users_movies("test")

X_train, y_train = train[0], train[1]

X_test = test[0]

print(type(y_train))

v = DictVectorizer()
X_train_dv = v.fit_transform(X_train)
X_test_dv = v.transform(X_test)

print(X_train_dv)
from sklearn.feature_extraction import DictVectorizer
'''One-hot encoding'''
onehot_encoder = DictVectorizer()
X = [{'city': 'New York'}, {'city': 'San Francisco'}, {'city': 'Chapel Hill'}]

print(onehot_encoder.fit_transform(X).toarray())
'''Feature standardization'''
# Equivalent to StandardScaler
from sklearn import preprocessing
import numpy as np
X = np.array([[0., 0., 5., 13., 9., 1.], [0., 0., 13., 15., 10., 15.],
              [0., 3., 15., 2., 0., 11.]])
print(preprocessing.scale(X))

# Handles outliers better
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)
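# A small cross-check, added as a sketch: StandardScaler performs the same column-wise
# standardization as preprocessing.scale above.
from sklearn.preprocessing import StandardScaler
print(np.allclose(StandardScaler().fit_transform(X), preprocessing.scale(X)))  # expected: True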
Example #48
0
print(train_data.columns.values.tolist())

tf_idf = TfidfVectorizer(min_df=5)

train_full_descr_transformed = tf_idf.fit_transform(
    train_data['FullDescription'].values.astype('U'), y=None)
test_full_descr_transformed = tf_idf.transform(
    test_data['FullDescription'].values.astype('U'))

train_data['LocationNormalized'].fillna('nan', inplace=True)
train_data['ContractTime'].fillna('nan', inplace=True)

from sklearn.feature_extraction import DictVectorizer

enc = DictVectorizer()

X_train_categ = enc.fit_transform(
    train_data[['LocationNormalized', 'ContractTime']].to_dict('records'))
X_test_categ = enc.transform(test_data[['LocationNormalized',
                                        'ContractTime']].to_dict('records'))
"""
print ('X_train_categ size: ', X_train_categ.size, '\n')
print ('X_test_categ size: ', X_test_categ.size, '\n')
print ('test_data[[LocationNormalized, ContractTime]: ', test_data[['LocationNormalized', 'ContractTime']], '\n')
print ('X_train_categ: ', X_train_categ, '\n')
print ('train_full_descr_transformed size: ', train_full_descr_transformed.size, '\n')
print ('train_data[LocationNormalized] size: ', train_data['LocationNormalized'].size, '\n')
"""

from scipy.sparse import hstack
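# The snippet ends at the import; a likely next step, sketched here, is to stack the TF-IDF
# text features with the one-hot categorical features into a single sparse matrix.
X_train_full = hstack([train_full_descr_transformed, X_train_categ])
X_test_full = hstack([test_full_descr_transformed, X_test_categ])
print(X_train_full.shape, X_test_full.shape)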
Example #49
0
    def __init__(self):
        # Any classifier could be used here
        self.model = LogisticRegression()
        self.vectorizer = DictVectorizer()
        self.labelEncoder = LabelEncoder()
def train(data, classifier_file):  # do not change the heading of the function
    data_list = data
    model_x = []
    model_y = []
    vo_list = {
        'IH', 'UW', 'OY', 'AH', 'ER', 'EY', 'AO', 'AW', 'AY', 'EH', 'AE', 'UH',
        'IY', 'AA', 'OW'
    }
    co_list = {
        'W', 'K', 'HH', 'G', 'JH', 'Z', 'Y', 'N', 'V', 'SH', 'L', 'NG', 'S',
        'CH', 'R', 'D', 'B', 'TH', 'F', 'DH', 'T', 'P', 'M', 'ZH'
    }
    strong_suffixes = {
        'al', 'ance', 'ancy', 'ant', 'ard', 'ary', 'àte', 'auto', 'ence',
        'ency', 'ent', 'ery', 'est', 'ial', 'ian', 'iana', 'en', 'ésce', 'ic',
        'ify', 'ine', 'ion', 'tion', 'ity', 'ive', 'ory', 'ous', 'ual', 'ure',
        'wide', 'y', 'se', 'ade', 'e', 'ee', 'een', 'eer', 'ese', 'esque',
        'ette', 'eur', 'ier', 'oon', 'que'
    }

    strong_prefixes = {
        'ad', 'co', 'con', 'counter', 'de', 'di', 'dis', 'e', 'en', 'ex', 'in',
        'mid', 'ob', 'para', 'pre', 're', 'sub', 'a', 'be', 'with', 'for'
    }

    neutral_prefixes = {
        'down', 'fore', 'mis', 'over', 'out', 'un', 'under', 'up', 'anti',
        'bi', 'non', 'pro', 'tri', 'contra', 'counta', 'de', 'dis', 'extra',
        'inter', 'intro', 'multi', 'non', 'post', 'retro', 'super', 'trans',
        'ultra'
    }

    neutral_suffixes = {
        'able', 'age', 'al', 'ate', 'ed', 'en', 'er', 'est', 'ful', 'hood',
        'ible', 'ing', 'ile', 'ish', 'ism', 'ist', 'ize', 'less', 'like', 'ly',
        'man', 'ment', 'most', 'ness', 'old', 's', 'ship', 'some', 'th',
        'ward', 'wise', 'y'
    }

    suffixes = {
        'inal', 'ain', 'tion', 'sion', 'osis', 'oon', 'sce', 'que', 'ette',
        'eer', 'ee', 'aire', 'able', 'ible', 'acy', 'cy', 'ade', 'age', 'al',
        'al', 'ial', 'ical', 'an', 'ance', 'ence', 'ancy', 'ency', 'ant',
        'ent', 'ant', 'ent', 'ient', 'ar', 'ary', 'ard', 'art', 'ate', 'ate',
        'ate', 'ation', 'cade', 'drome', 'ed', 'ed', 'en', 'en', 'ence',
        'ency', 'er', 'ier', 'er', 'or', 'er', 'or', 'ery', 'es', 'ese', 'ies',
        'es', 'ies', 'ess', 'est', 'iest', 'fold', 'ful', 'ful', 'fy', 'ia',
        'ian', 'iatry', 'ic', 'ic', 'ice', 'ify', 'ile', 'ing', 'ion', 'ish',
        'ism', 'ist', 'ite', 'ity', 'ive', 'ive', 'ative', 'itive', 'ize',
        'less', 'ly', 'ment', 'ness', 'or', 'ory', 'ous', 'eous', 'ose',
        'ious', 'ship', 'ster', 'ure', 'ward', 'wise', 'ize', 'phy', 'ogy'
    }

    prefixes = {
        'ac', 'ad', 'af', 'ag', 'al', 'an', 'ap', 'as', 'at', 'an', 'ab',
        'abs', 'acer', 'acid', 'acri', 'act', 'ag', 'acu', 'aer', 'aero', 'ag',
        'agi', 'ig', 'act', 'agri', 'agro', 'alb', 'albo', 'ali', 'allo',
        'alter', 'alt', 'am', 'ami', 'amor', 'ambi', 'ambul', 'ana', 'ano',
        'andr', 'andro', 'ang', 'anim', 'ann', 'annu', 'enni', 'ante',
        'anthrop', 'anti', 'ant', 'anti', 'antico', 'apo', 'ap', 'aph', 'aqu',
        'arch', 'aster', 'astr', 'auc', 'aug', 'aut', 'aud', 'audi', 'aur',
        'aus', 'aug', 'auc', 'aut', 'auto', 'bar', 'be', 'belli', 'bene', 'bi',
        'bine', 'bibl', 'bibli', 'biblio', 'bio', 'bi', 'brev', 'cad', 'cap',
        'cas', 'ceiv', 'cept', 'capt', 'cid', 'cip', 'cad', 'cas', 'calor',
        'capit', 'capt', 'carn', 'cat', 'cata', 'cath', 'caus', 'caut',
        'cause', 'cuse', 'cus', 'ceas', 'ced', 'cede', 'ceed', 'cess', 'cent',
        'centr', 'centri', 'chrom', 'chron', 'cide', 'cis', 'cise', 'circum',
        'cit', 'civ', 'clam', 'claim', 'clin', 'clud', 'clus claus', 'co',
        'cog', 'col', 'coll', 'con', 'com', 'cor', 'cogn', 'gnos', 'com',
        'con', 'contr', 'contra', 'counter', 'cord', 'cor', 'cardi', 'corp',
        'cort', 'cosm', 'cour', 'cur', 'curr', 'curs', 'crat', 'cracy', 'cre',
        'cresc', 'cret', 'crease', 'crea', 'cred', 'cresc', 'cret', 'crease',
        'cru', 'crit', 'cur', 'curs', 'cura', 'cycl', 'cyclo', 'de', 'dec',
        'deca', 'dec', 'dign', 'dei', 'div', 'dem', 'demo', 'dent', 'dont',
        'derm', 'di', 'dy', 'dia', 'dic', 'dict', 'dit', 'dis', 'dif', 'dit',
        'doc', 'doct', 'domin', 'don', 'dorm', 'dox', 'duc', 'duct', 'dura',
        'dynam', 'dys', 'ec', 'eco', 'ecto', 'en', 'em', 'end', 'epi', 'equi',
        'erg', 'ev', 'et', 'ex', 'exter', 'extra', 'extro', 'fa', 'fess',
        'fac', 'fact', 'fec', 'fect', 'fic', 'fas', 'fea', 'fall', 'fals',
        'femto', 'fer', 'fic', 'feign', 'fain', 'fit', 'feat', 'fid', 'fid',
        'fide', 'feder', 'fig', 'fila', 'fili', 'fin', 'fix', 'flex', 'flect',
        'flict', 'flu', 'fluc', 'fluv', 'flux', 'for', 'fore', 'forc', 'fort',
        'form', 'fract', 'frag', 'frai', 'fuge', 'fuse', 'gam', 'gastr',
        'gastro', 'gen', 'gen', 'geo', 'germ', 'gest', 'giga', 'gin', 'gloss',
        'glot', 'glu', 'glo', 'gor', 'grad', 'gress', 'gree', 'graph', 'gram',
        'graf', 'grat', 'grav', 'greg', 'hale', 'heal', 'helio', 'hema',
        'hemo', 'her', 'here', 'hes', 'hetero', 'hex', 'ses', 'sex', 'homo',
        'hum', 'human', 'hydr', 'hydra', 'hydro', 'hyper', 'hypn', 'an', 'ics',
        'ignis', 'in', 'im', 'in', 'im', 'il', 'ir', 'infra', 'inter', 'intra',
        'intro', 'ty', 'jac', 'ject', 'join', 'junct', 'judice', 'jug',
        'junct', 'just', 'juven', 'labor', 'lau', 'lav', 'lot', 'lut', 'lect',
        'leg', 'lig', 'leg', 'levi', 'lex', 'leag', 'leg', 'liber', 'liver',
        'lide', 'liter', 'loc', 'loco', 'log', 'logo', 'ology', 'loqu',
        'locut', 'luc', 'lum', 'lun', 'lus', 'lust', 'lude', 'macr', 'macer',
        'magn', 'main', 'mal', 'man', 'manu', 'mand', 'mania', 'mar', 'mari',
        'mer', 'matri', 'medi', 'mega', 'mem', 'ment', 'meso', 'meta', 'meter',
        'metr', 'micro', 'migra', 'mill', 'kilo', 'milli', 'min', 'mis', 'mit',
        'miss', 'mob', 'mov', 'mot', 'mon', 'mono', 'mor', 'mort', 'morph',
        'multi', 'nano', 'nasc', 'nat', 'gnant', 'nai', 'nat', 'nasc', 'neo',
        'neur', 'nom', 'nom', 'nym', 'nomen', 'nomin', 'non', 'non', 'nov',
        'nox', 'noc', 'numer', 'numisma', 'ob', 'oc', 'of', 'op', 'oct',
        'oligo', 'omni', 'onym', 'oper', 'ortho', 'over', 'pac', 'pair',
        'pare', 'paleo', 'pan', 'para', 'pat', 'pass', 'path', 'pater', 'patr',
        'path', 'pathy', 'ped', 'pod', 'pedo', 'pel', 'puls', 'pend', 'pens',
        'pond', 'per', 'peri', 'phage', 'phan', 'phas', 'phen', 'fan', 'phant',
        'fant', 'phe', 'phil', 'phlegma', 'phobia', 'phobos', 'phon', 'phot',
        'photo', 'pico', 'pict', 'plac', 'plais', 'pli', 'ply', 'plore', 'plu',
        'plur', 'plus', 'pneuma', 'pneumon', 'pod', 'poli', 'poly', 'pon',
        'pos', 'pound', 'pop', 'port', 'portion', 'post', 'pot', 'pre', 'pur',
        'prehendere', 'prin', 'prim', 'prime', 'pro', 'proto', 'psych',
        'punct', 'pute', 'quat', 'quad', 'quint', 'penta', 'quip', 'quir',
        'quis', 'quest', 'quer', 're', 'reg', 'recti', 'retro', 'ri', 'ridi',
        'risi', 'rog', 'roga', 'rupt', 'sacr', 'sanc', 'secr', 'salv', 'salu',
        'sanct', 'sat', 'satis', 'sci', 'scio', 'scientia', 'scope', 'scrib',
        'script', 'se', 'sect', 'sec', 'sed', 'sess', 'sid', 'semi', 'sen',
        'scen', 'sent', 'sens', 'sept', 'sequ', 'secu', 'sue', 'serv', 'sign',
        'signi', 'simil', 'simul', 'sist', 'sta', 'stit', 'soci', 'sol',
        'solus', 'solv', 'solu', 'solut', 'somn', 'soph', 'spec', 'spect',
        'spi', 'spic', 'sper', 'sphere', 'spir', 'stand', 'stant', 'stab',
        'stat', 'stan', 'sti', 'sta', 'st', 'stead', 'strain', 'strict',
        'string', 'stige', 'stru', 'struct', 'stroy', 'stry', 'sub', 'suc',
        'suf', 'sup', 'sur', 'sus', 'sume', 'sump', 'super', 'supra', 'syn',
        'sym', 'tact', 'tang', 'tag', 'tig', 'ting', 'tain', 'ten', 'tent',
        'tin', 'tect', 'teg', 'tele', 'tem', 'tempo', 'ten', 'tin', 'tain',
        'tend', 'tent', 'tens', 'tera', 'term', 'terr', 'terra', 'test', 'the',
        'theo', 'therm', 'thesis', 'thet', 'tire', 'tom', 'tor', 'tors',
        'tort', 'tox', 'tract', 'tra', 'trai', 'treat', 'trans', 'tri', 'trib',
        'tribute', 'turbo', 'typ', 'ultima', 'umber', 'umbraticum', 'un',
        'uni', 'vac', 'vade', 'vale', 'vali', 'valu', 'veh', 'vect', 'ven',
        'vent', 'ver', 'veri', 'verb', 'verv', 'vert', 'vers', 'vi', 'vic',
        'vicis', 'vict', 'vinc', 'vid', 'vis', 'viv', 'vita', 'vivi', 'voc',
        'voke', 'vol', 'volcan', 'volv', 'volt', 'vol', 'vor', 'with', 'zo'
    }
    neutral_prefixes = upper(neutral_prefixes)
    neutral_suffixes = upper(neutral_suffixes)
    strong_prefixes = upper(strong_prefixes)
    strong_suffixes = upper(strong_suffixes)
    full_suffixes_set = upper(suffixes)
    full_prefixes_set = upper(prefixes)
    suffix = {"1", "2", "0"}
    for line in data_list:
        dict = {}
        vow_index = []
        vowelCount = 0
        pattern = ""
        y = ""
        dict["pos"] = nltk.pos_tag([line.split(":")[0]])[0][1]
        word = line.split(":")[0]
        temp = check_prefix(word, neutral_prefixes)
        if temp:
            dict['neu_pre'] = temp
        temp = check_suffix(word, neutral_suffixes)
        if temp:
            dict['neu_suf'] = temp
        temp = check_prefix(word, strong_prefixes)
        if temp:
            dict['str_pre'] = temp
        temp = check_suffix(word, strong_suffixes)
        if temp:
            dict['str_suf'] = temp
        temp = check_prefix(word, full_suffixes_set)
        if temp:
            dict['ful_pre'] = temp
        temp = check_suffix(word, full_prefixes_set)
        if temp:
            dict['ful_suf'] = temp
        line = line.split(":")[1].strip()

        syllables = line.split(" ")
        l = []
        for i in syllables:
            l.append(i if not (i[-1].isdigit()) else i[:-1])
        dict.update(Counter({''.join(i) for i in get_ngrams(l)}))
        dict['len'] = len(syllables)
        out = ''
        for i in range(len(syllables)):
            syl = syllables[i]

            if syl[-1] in suffix:
                vowelCount += 1
                vow_index.append(i)
                out += syl[-1]
                # if syl[-1]=="1":
                #     model_y.append(vowelCount)
                pattern += "V"
            else:
                pattern += "C"

        model_y.append(out)
        vowelCount = 0
        dict["pattern"] = pattern
        dict['vow_len'] = len(vow_index)
        for i in vow_index:
            vowelCount += 1
            if i - 1 >= 0:
                dict["onset2_" + str(vowelCount)] = syllables[i - 1]
            if i + 1 < len(syllables):
                dict["coda1_" + str(vowelCount)] = syllables[i + 1]
            dict["nucleus_" + str(vowelCount)] = syllables[i][:-1]
        model_x.append(dict)
    # print(pd.DataFrame(model_x))
    # print(model_y)
    v = DictVectorizer(sparse=True)

    X = v.fit_transform(model_x)
    classifier = LogisticRegression(penalty='l2', class_weight='balanced')

    classifier.fit(X, model_y)
    with open(classifier_file, 'wb') as f:
        pickle.dump(classifier, f)
        pickle.dump(v, f)
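# Matching load step, sketched here as an assumption (not shown in the snippet): the
# classifier and the DictVectorizer were pickled in that order, so they are read back in
# the same order.
def load_classifier(classifier_file):
    with open(classifier_file, 'rb') as f:
        classifier = pickle.load(f)
        v = pickle.load(f)
    return classifier, v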
    sd_y = stats.stdev(y)

    for observation in x:
        score_x.append((observation - mean_x) / sd_x)

    for observation in y:
        score_y.append((observation - mean_y) / sd_y)

    return (sum([i * j for i, j in zip(score_x, score_y)])) / (n - 1)


print(pearson(x, y))

#################

staff = [{
    'name': 'Steve Miller',
    'age': 33.
}, {
    'name': 'Lyndon Jones',
    'age': 12.
}, {
    'name': 'Baxter Morth',
    'age': 18.
}]

vec = DictVectorizer()
vec.fit_transform(staff).toarray()

print(vec.get_feature_names())
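# A small follow-up sketch: transform() keeps the columns learned by fit_transform, so a
# hypothetical record with an unseen name gets zeros in every name column and only 'age'
# filled in.
print(vec.transform([{'name': 'Ada Lovelace', 'age': 40.}]).toarray())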
Example #52
0
class ImitationLearner(object):

    # initialize the classifier to be learned
    def __init__(self):
        # Any classifier could be used here
        self.model = LogisticRegression()
        self.vectorizer = DictVectorizer()
        self.labelEncoder = LabelEncoder()

    # this function predicts an instance given the state
    # state keeps track the various actions taken
    # it does not change the instance in any way,
    # it does change the state
    # the predicted structured output is returned in the end
    def predict(self, structured_instance, state=None, expert_policy_prob=0.0):
        if state is None:
            state = self.transitionSystem(
                structured_instance=structured_instance)

        # predict all remaining actions
        # if we do not have any actions we are done
        while len(state.agenda) > 0:
            # for each action
            # pop it from the queue
            current_action = state.agenda.popleft()
            # extract features and add them to the action
            # (even for the optimal policy, it doesn't need the features but they are needed later on)
            current_action.features = state.extractFeatures(
                structured_instance=structured_instance, action=current_action)
            # the first condition avoids unnecessary calls to random, which caused reproducibility headaches
            if (expert_policy_prob
                    == 1.0) or (expert_policy_prob > 0.0
                                and random.random() < expert_policy_prob):
                current_action.label = state.expert_policy(
                    structured_instance, current_action)
            else:
                # predict (probably makes sense to parallelize across instances)
                # vectorize the features:
                vectorized_features = self.vectorizer.transform(
                    current_action.features)
                # predict using the model
                normalized_label = self.model.predict(vectorized_features)
                # get the actual label (returns an array, get the first and only element)
                current_action.label = self.labelEncoder.inverse_transform(
                    normalized_label)[0]
            # add the action to the state making any necessary updates
            state.updateWithAction(current_action, structured_instance)

        # OK return the final state reached
        return state

    class params(object):
        def __init__(self):
            self.learningParam = 0.1
            self.iterations = 40

    def train(self, structuredInstances, params):
        # create the dataset
        trainingFeatures = []
        trainingLabels = []

        # for each iteration
        for iteration in range(params.iterations):
            # set the expert policy prob
            expertPolicyProb = pow(1 - params.learningParam, iteration)
            print("Iteration:" + str(iteration) + ", expert policy prob:" +
                  str(expertPolicyProb))

            for structuredInstance in structuredInstances:

                # so we obtain the predicted output and the actions taken are in state
                # this prediction uses the gold standard since we need this info for the expert policy actions
                final_state = self.predict(structuredInstance,
                                           expert_policy_prob=expertPolicyProb)

                # initialize a second state to avoid having to roll-back
                stateCopy = self.transitionSystem(
                    structured_instance=structuredInstance)
                # The agenda seems to be initialized fine
                for action in final_state.actionsTaken:
                    # DAgger just ask the expert
                    stateCopy.agenda.popleft()
                    expert_action_label = stateCopy.expert_policy(
                        structuredInstance, action)

                    # add the labeled features to the training data
                    trainingFeatures.append(action.features)
                    trainingLabels.append(expert_action_label)

                    # take the original action chosen to proceed
                    stateCopy.updateWithAction(action, structuredInstance)

            # OK, let's save the training data and learn some classifiers
            # vectorize the training data collected
            training_data = self.vectorizer.fit_transform(trainingFeatures)
            # encode the labels
            encoded_labels = self.labelEncoder.fit_transform(trainingLabels)
            # train
            self.model.fit(training_data, encoded_labels)
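# Illustration, added as a sketch: with the default learningParam of 0.1 the expert policy
# probability decays geometrically across iterations (1.0, 0.9, 0.81, 0.729, ...), so later
# DAgger-style iterations rely mostly on the learned model.
for it in range(4):
    print(it, pow(1 - 0.1, it))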
Example #53
0
def main_gp():
    import gp, GPyOpt
    from sklearn.feature_extraction import DictVectorizer

    parser = argparse.ArgumentParser()
    parser.add_argument('-a',
                        '--agent',
                        type=str,
                        default='ppo_agent',
                        help="Agent to use (ppo_agent|dqn_agent|etc)")
    parser.add_argument(
        '-g',
        '--gpu_split',
        type=float,
        default=1,
        help="Num ways we'll split the GPU (how many tabs you running?)")
    parser.add_argument('-n',
                        '--net_type',
                        type=str,
                        default='lstm',
                        help="(lstm|conv2d) Which network arch to use")
    parser.add_argument(
        '--guess',
        action="store_true",
        default=False,
        help="Run the hard-coded 'guess' values first before exploring")
    parser.add_argument(
        '--gpyopt',
        action="store_true",
        default=False,
        help=
        "Use GPyOpt library, or use basic sklearn GP implementation? GpyOpt shows more promise, but has bugs."
    )
    args = parser.parse_args()

    # Encode features
    hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
    hypers_, hardcoded = hsearch.hypers, hsearch.hardcoded
    hypers_ = {k: v for k, v in hypers_.items() if k not in hardcoded}
    hsearch.close()

    # Build a matrix of features; length = max feature size
    max_num_vals = 0
    for v in hypers_.values():
        l = len(v['vals'])
        if l > max_num_vals: max_num_vals = l
    empty_obj = {k: None for k in hypers_}
    mat = pd.DataFrame([empty_obj.copy() for _ in range(max_num_vals)])
    for k, hyper in hypers_.items():
        for i, v in enumerate(hyper['vals']):
            mat.loc[i, k] = v
    mat.ffill(inplace=True)

    # Above is Pandas-friendly stuff, now convert to sklearn-friendly & pipe through OneHotEncoder
    vectorizer = DictVectorizer()
    vectorizer.fit(mat.T.to_dict().values())
    feat_names = vectorizer.get_feature_names()

    # Map TensorForce actions to GPyOpt-compatible `domain`
    # instantiate just to get actions (get them from hypers above?)
    bounds = []
    for k in feat_names:
        hyper = hypers_.get(k, False)
        if hyper:
            bounded = hyper['type'] == 'bounded'
            min_, max_ = min(hyper['vals']), max(hyper['vals'])
        else:
            # one-hot expanded columns ("attr=val") are not in hypers_; treat them as binary
            bounded, min_, max_ = False, 0, 1
        if args.gpyopt:
            b = {'name': k, 'type': 'discrete', 'domain': (0, 1)}
            if bounded: b.update(type='continuous', domain=(min_, max_))
        else:
            b = [min_, max_] if bounded else [0, 1]
        bounds.append(b)

    def hypers2vec(obj):
        h = dict()
        for k, v in obj.items():
            if k in hardcoded: continue
            if type(v) == bool:
                h[k] = float(v)
            else:
                h[k] = v or 0.
        return vectorizer.transform(h).toarray()[0]

    def vec2hypers(vec):
        # Reverse the encoding
        # https://stackoverflow.com/questions/22548731/how-to-reverse-sklearn-onehotencoder-transform-to-recover-original-data
        # https://github.com/scikit-learn/scikit-learn/issues/4414
        if not args.gpyopt:
            vec = [vec]  # gp.py passes as flat, GPyOpt as wrapped
        reversed = vectorizer.inverse_transform(vec)[0]
        obj = {}
        for k, v in reversed.items():
            if '=' not in k:
                obj[k] = v
                continue
            if k in obj: continue  # we already handled this x=y logic (below)
            # Find the winner (max) option for this key
            score, attr, val = v, k.split('=')[0], k.split('=')[1]
            for k2, score2 in reversed.items():
                if k2.startswith(attr + '=') and score2 > score:
                    score, val = score2, k2.split('=')[1]
            obj[attr] = val

        # Bools come in as floats. Also, if the result is False they don't come in at all! So we iterate
        # over hypers here instead of nesting this logic in the reverse-transform loop above
        for k, v in hypers_.items():
            if v['type'] == 'bool':
                obj[k] = bool(round(obj.get(k, 0.)))
        return obj

    # Specify the "loss" function (which we'll maximize) as a single rl_hsearch instantiate-and-run
    def loss_fn(params):
        hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
        reward = hsearch.execute(vec2hypers(params))
        hsearch.close()
        return [reward]

    while True:
        conn = data.engine.connect()
        sql = "SELECT hypers, reward_avg FROM runs WHERE flag=:f"
        runs = conn.execute(text(sql), f=args.net_type).fetchall()
        conn.close()
        X, Y = [], []
        for run in runs:
            X.append(hypers2vec(run.hypers))
            Y.append([run.reward_avg])
        print_feature_importances(X, Y, feat_names)

        if args.guess:
            guesses = {k: v['guess'] for k, v in hypers_.items()}
            X.append(hypers2vec(guesses))
            Y.append([None])
            args.guess = False

        if args.gpyopt:
            pretrain = {'X': np.array(X), 'Y': np.array(Y)} if X else {}
            opt = GPyOpt.methods.BayesianOptimization(f=loss_fn,
                                                      domain=bounds,
                                                      maximize=True,
                                                      **pretrain)
            # using max_iter=1 because of database setup. Normally you'd go until convergence, but since we're using
            # a database for the runs we can parallelize runs across machines (connected to the same database). Then
            # between each run we can grab the result from the other machines and merge with our own; so only run
            # once, reset the model-fitting w/ the full database (which may have grown), and repeat
            opt.run_optimization(max_iter=1)
        else:
            gp.bayesian_optimisation2(n_iters=1,
                                      loss_fn=loss_fn,
                                      bounds=np.array(bounds),
                                      x_list=X,
                                      y_list=Y)
Example #54
0
__author__ = 'davidoregan'

import numpy as np
from sklearn import svm
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction import DictVectorizer


from ast import literal_eval


vec = DictVectorizer()
mydata2 = pd.read_csv('output.csv')
with open("SVM's/output.csv") as f:
       dic = literal_eval('{' + f.read() + '}')
       print(dic)


#pos_vectorized = vec.fit_transform(mydata2)









# Handle missing values
x["age"].fillna(x["age"].mean(), inplace=True)

# Convert to a list of dicts
x = x.to_dict(orient="records")

# Split the dataset
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# Dictionary feature extraction (one-hot encoding)
from sklearn.feature_extraction import DictVectorizer

transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# Decision tree estimator:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

estimator = DecisionTreeClassifier(criterion="entropy", max_depth=8)
estimator.fit(x_train, y_train)
# Model evaluation:
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("Compare predictions with the true values directly:\n", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("Accuracy:\n", score)
# Visualize the decision tree ## http://webgraphviz.com/
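# Sketch of the visualization step hinted at above (the output filename is chosen here for
# illustration): write the fitted tree to DOT text that can be pasted into http://webgraphviz.com/.
export_graphviz(estimator, out_file="titanic_tree.dot",
                feature_names=transfer.get_feature_names())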
print(df_encoded[:5, :])

# Print the shape of the original DataFrame
print(df.shape)

# Print the shape of the transformed array
print(df_encoded.shape)
-------------------------------------------------------------
# Import DictVectorizer
from sklearn.feature_extraction import DictVectorizer

# Convert df into a dictionary: df_dict
df_dict = df.to_dict("records")

# Create the DictVectorizer object: dv
dv = DictVectorizer()

# Apply dv on df: df_encoded
df_encoded = dv.fit_transform(df_dict)

# Print the resulting first five rows
print(df_encoded[:5,:])

# Print the vocabulary
print(dv.vocabulary_)
-------------------------------------------------------
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
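# The snippet stops after the imports; a hedged sketch of how these pieces are typically
# combined. Here `df` holds the feature columns and `y` the target (assumed), and the final
# estimator is a stand-in chosen for illustration, not taken from the original exercise.
from sklearn.tree import DecisionTreeRegressor

pipeline = Pipeline([
    ("ohe_onestep", DictVectorizer(sparse=False)),
    ("regressor", DecisionTreeRegressor(max_depth=4)),
])
scores = cross_val_score(pipeline, df.to_dict("records"), y, cv=5)
print(scores.mean())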
Example #57
0
traindata = pd.read_csv(this_folder + "/insurance-train.csv")
testdata = pd.read_csv(this_folder + "/insurance-test.csv")

X_train = traindata[[
    'Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel',
    'Vintage'
]]
Y_train = traindata['Response']
X_test = testdata[[
    'Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel',
    'Vintage'
]]

vec = DictVectorizer()
X_train = vec.fit_transform(X_train.to_dict(orient="records"))
X_test = vec.transform(X_test.to_dict(orient="records"))

gnb = GaussianNB()
gnb.fit(X_train.toarray(), Y_train)  # input X,y for training
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

Y_test1 = gnb.predict(X_test.toarray())
output = pd.DataFrame({'id': testdata['id'], 'Response': Y_test1})
output.to_csv('Bayes_gnb.csv', index=False)

Y_test2 = mnb.predict(X_test.toarray())
output = pd.DataFrame({'id': testdata['id'], 'Response': Y_test2})
output.to_csv('Bayes_mnb.csv', index=False)
Example #58
0
def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9,
            use_divrank=False, divrank_alpha=0.25):
    '''
    compute centrality score of sentences.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      continuous: if True, apply continuous LexRank. (see reference)
      sim_threshold: if continuous is False and similarity is greater than or
        equal to sim_threshold, link the sentences.
      alpha: the damping factor of PageRank and DivRank
      use_divrank: if True, apply DivRank instead of PageRank
      divrank_alpha: strength of self-link [0.0-1.0]
        (it's not the damping factor, see divrank.py)

    Returns: tuple
      (
        {
          # sentence index -> score
          0: 0.003,
          1: 0.002,
          ...
        },
        similarity_matrix
      )
    
    Reference:
      Günes Erkan and Dragomir R. Radev.
      LexRank: graph-based lexical centrality as salience in text
      summarization. (section 3)
      http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    '''
    # configure ranker
    ranker_params = {'max_iter': 1000}
    if use_divrank:
        ranker = divrank_scipy
        ranker_params['alpha'] = divrank_alpha
        ranker_params['d'] = alpha
    else:
        ranker = networkx.pagerank_scipy
        ranker_params['alpha'] = alpha

    graph = networkx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = tools.word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine')

    if continuous:
        linked_rows, linked_cols = numpy.where(sim_mat > 0)
    else:
        linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i == j:
            continue
        weight = sim_mat[i,j] if continuous else 1.0
        graph.add_edge(i, j, weight=weight)

    scores = ranker(graph, **ranker_params)
    return scores, sim_mat
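# Usage sketch (assumes the module's Japanese tokenizer in `tools` and the other module-level
# imports are available): scores maps sentence index -> centrality, so sorting by score
# surfaces the most central sentences.
sentences = [u'こんにちは.', u'私の名前は飯沼です.', u'今日はいい天気ですね.']
scores, sim_mat = lexrank(sentences, continuous=True)
ranked = sorted(scores, key=scores.get, reverse=True)
print([sentences[i] for i in ranked])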
Example #59
0
headers = next(reader)

print(headers)

featureList = []
labelList = []

for row in reader:
    labelList.append(row[len(row) - 1])
    rowDict = {}
    for i in range(1, len(row) - 1):
        rowDict[headers[i]] = row[i]
    featureList.append(rowDict)
print(featureList)

vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()

print('dummyX: ' + str(dummyX))
print(vec.get_feature_names())

print('labelList: ' + str(labelList))

# vectorize class labels
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print('dummyY: ' + str(dummyY))

# Using decision tree for classification
# clf = tree.DecisionTreeClassifier()
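# Sketch of the fit step the commented line above points to: train an entropy-based tree on
# the one-hot features and binarized labels.
from sklearn import tree

clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print('clf: ' + str(clf))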
Example #60
0
            transitions.append(trans)

            x_temp_list.append(current_dictX)
            y_temp_list.append(current_Y)

        stack, graph = transition.empty_stack(stack, graph)

        for word in sentence:
            word['head'] = graph['heads'][word['id']]

        x_list.extend(x_temp_list)
        y_list.extend(y_temp_list)

    print("Encoding the features and classes...")
    # Vectorize the feature matrix and carry out a one-hot encoding
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(x_list)
    # The statement below would consume a considerable amount of memory
    # X = vec.fit_transform(X_dict).toarray()
    # print(vec.get_feature_names())

    y, nbr_to_class, classes_to_nbr = encode_classes(y_list)

    print("Training the model...")
    classifier = linear_model.LogisticRegression(penalty='l2',
                                                 dual=True,
                                                 solver='liblinear')
    model = classifier.fit(X, y)
    print(model)
    print('Predicting')