Example #1
def learning_curve():
    n = 50000
    nsteps = 10
    full = cu.get_sample_data_frame(n)
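    # split the 50k sample 60/20/20 into train / cross-validation / test partitions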
    data = full.ix[0 : int(n * 0.6) - 1].reset_index()
    cval = full.ix[int(n * 0.6) : int(n * 0.8) - 1].reset_index()
    test = full.ix[int(n * 0.8) : n - 1].reset_index()
    step = len(data) / nsteps
    ndata = len(data)
    mvec = range(step, ndata + step, step)
    cval_features = features.extract_features(cval)
    data_error = []
    cval_error = []
    for i in range(len(mvec)):
        m = mvec[i]
        print "running for size", m
        train = data.ix[0 : m - 1].reset_index()
        fea = features.extract_features(train)
        rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=False, n_jobs=5)
        rf.fit(fea, train["OpenStatus"])
        new_priors = cu.load_priors("train.csv")
        old_priors = cu.compute_priors(train.OpenStatus)
        # predict train
        probs = rf.predict_proba(fea)
        # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        y_true = compute_y_true(train)
        score = multiclass_log_loss(y_true, probs)
        data_error.append(score)
        # predict cval (the held-out validation split)
        probs = rf.predict_proba(cval_features)
        # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        y_true = compute_y_true(cval)
        score = multiclass_log_loss(y_true, probs)
        cval_error.append(score)
    return mvec, data_error, cval_error
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    score()
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_little)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    #classifier = MultinomialNB()
    #classifier = KNeighborsClassifier(n_neighbors=3, weights='distance')
    classifier = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1)
    #classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1)

    classifier.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_litte)
    test_features = features.extract_features(feature_names, data)
    probs = classifier.predict_proba(test_features)

    #print("Calculating priors and updating posteriors")
    #new_priors = cu.get_priors(full_train_file)
    #old_priors = cu.get_priors(train_file)
    #probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_litte)
    cu.write_submission(submission_litte, probs)
 def test_make_dataset_from_sgf(self):
   with tempfile.NamedTemporaryFile() as sgf_file, \
       tempfile.NamedTemporaryFile() as record_file:
     sgf_file.write(TEST_SGF.encode('utf8'))
     sgf_file.seek(0)
     preprocessing.make_dataset_from_sgf(
         utils_test.BOARD_SIZE, sgf_file.name, record_file.name)
     recovered_data = self.extract_data(record_file.name)
   start_pos = go.Position(utils_test.BOARD_SIZE)
   first_move = coords.from_sgf('fd')
   next_pos = start_pos.play_move(first_move)
   second_move = coords.from_sgf('cf')
   expected_data = [
       (
           features.extract_features(utils_test.BOARD_SIZE, start_pos),
           preprocessing._one_hot(utils_test.BOARD_SIZE, coords.to_flat(
               utils_test.BOARD_SIZE, first_move)), -1
       ),
       (
           features.extract_features(utils_test.BOARD_SIZE, next_pos),
           preprocessing._one_hot(utils_test.BOARD_SIZE, coords.to_flat(
               utils_test.BOARD_SIZE, second_move)), -1
       )
   ]
   self.assertEqualData(expected_data, recovered_data)
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    #print(data['OpenStatusMod'])

    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    #print(fea.columns)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1, random_state = 0)
    print("Training the model, created RFC")
    #rf.fit(fea, data["OpenStatus"])
    rf.fit(fea, data["OpenStatusMod"])

    print("Reading test file and making predictions")
    #data = cu.get_dataframe(test_file)
    data = cu.get_dataframe(full_train_file)
    print("Reading data frame")
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    print("adding column")
    test_features = features.extract_features(feature_names, data)
    print("extract features")
    probs = rf.predict_proba(test_features)

#    print("Calculating priors and updating posteriors")
#    new_priors = cu.get_priors(full_train_file)
#    old_priors = cu.get_priors(train_file)
#    print "new priors %s" %(new_priors)
#    print "old priors %s" %(old_priors)
#    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
	start = time.time()
	print("Reading the data from " + train_file)
	data = cu.get_dataframe(train_file)

	print("Extracting features")
	fea = features.extract_features(feature_names, data)

	print("Training the model")
	clf = ExtraTreesClassifier(n_estimators=trees_count, max_features=len(feature_names), max_depth=None, min_samples_split=1, compute_importances=True, bootstrap=False, random_state=0, n_jobs=-1, verbose=2)
	clf.fit(fea, data["OpenStatus"])

	print "Listing feature importances:"
	cu.list_feature_importance(clf,feature_names)
	
	print("Reading test file and making predictions: " + test_file)
	data = cu.get_dataframe(test_file)
	test_features = features.extract_features(feature_names, data)
	probs = clf.predict_proba(test_features)

	if (update_posteriors):
		print("Calculating priors and updating posteriors")
		new_priors = cu.get_priors(full_train_file)
		old_priors = cu.get_priors(train_file)
		probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
	
	print("Saving submission to %s" % submission_file)
	cu.write_submission(submission_file, probs)
	
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False, n_iter=5, shuffle=False, verbose=1, eta0=1.0, n_jobs=-1, seed=0, class_weight="auto", warm_start=False)

	X = []
	for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
		X.append([i])
	# Must be array type object. Strings must be converted to
	# to integer values, otherwise fit method raises ValueError
	global y
	y = [] 

	print "Collecting statuses"
	
	for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
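	# a hedged one-liner alternative: y = [ques_status.index(s) for s in data["OpenStatus"]]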
            
	print "Fitting"
	percep.fit(fea, y)
	
	'''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''   
	
	print "Reading test data and features"
	test_data = cu.get_dataframe(test_file)
	test_fea = features.extract_features(feature_names,test_data)

	print "Making predictions"
	global probs
	#probs = percep.predict_proba(test_fea) # only available for binary classification
	probs = percep.predict(test_fea)
	# shape of probs is [n_samples]
	# convert probs to shape [n_samples,n_classes]
	probs = np.resize(probs, (len(probs) / 5, 5))
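	# note: np.resize merely reshapes/recycles the 1-D class predictions into an
	# (n_samples/5, 5) block; a hedged alternative that one-hot encodes each
	# predicted class would be: probs = np.eye(5)[percep.predict(test_fea).astype(int)]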
	
	#if is_full_train_set == 0:
	#	print("Calculating priors and updating posteriors")
	#	new_priors = cu.get_priors(full_train_file)
	#	old_priors = cu.get_priors(train_file)
	#	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)	

	print "writing submission to " + submission_file
	cu.write_submission(submission_file, probs)
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False)

	X = []
	for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
		X.append([i])
	# Must be array type object. Strings must be converted to
	# to integer values, otherwise fit method raises ValueError
	global y
	y = [] 

	print "Collecting statuses"
	
	for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
            
	print "Fitting"
	mten.fit(fea, y)
	
	'''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''   
	
	print "Reading test data and features"
	test_data = cu.get_dataframe(test_file)
	test_fea = features.extract_features(feature_names,test_data)

	print "Making predictions"
	global probs
	probs = mten.predict(test_fea)
	# shape of probs is [n_samples]
	# convert probs to shape [n_samples,n_classes]
	probs = np.resize(probs, (len(probs) / 5, 5))
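	# note: MultiTaskElasticNet is a regressor, so probs here holds real-valued
	# predictions recycled by np.resize into an (n_samples/5, 5) block, not
	# per-class probabilities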
	
	if is_full_train_set == 0:
		print("Calculating priors and updating posteriors")
		new_priors = cu.get_priors(full_train_file)
		old_priors = cu.get_priors(train_file)
		probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)	

	print "writing submission to " + submission_file
	cu.write_submission(submission_file, probs)
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
Example #9
def main():
    # The number of documents to analyze each iteration
    batchsize = 100

    # The total number of questions on Stack Overflow
    D = 3.3e6

    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))
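    # i.e. appends 'Topic0' through 'Topic19' as additional feature columns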

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]

    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def make_submission():
    data = None
    if os.path.exists('data.pik'):
        print("Unpickeling the data")
        data = pickle.load(open('data.pik'))
    else:
        print("Reading the data")
        data = cu.get_dataframe(full_train_file)
        pickle.dump(data,open('data.pik','w'))

    fea = None
    if os.path.exists('fea.pik'):
        print("Unpickeling the fea")
        fea = pickle.load(open('fea.pik'))
    else:
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        pickle.dump(fea,open('fea.pik','w'))
    
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=2)
    
    rf.fit(fea, data["OpenStatus"])
    print "Features Importance:"
    imps = zip(rf.feature_importances_,
               feature_names,)
    imps.sort(reverse=True)
    print '\n'.join([ str(_) for _ in imps ])
    print "Generalization Error:", rf.oob_score_

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    if True:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example #11
def gen_sentiment_vectors(reviews, word_lists, popular_words):
	sentiment_vectors = dict()

	for i in reviews:
		doc_tag = reviews[i]['title']
		prev_sentiment = None

		for line in reviews[i]['reviews']:
			if line == ("<p>", ) or line == ("</p>", ):
				continue

			sentiment = line[0]
			sentence = line[1]

			features = f.extract_features(sentence, word_lists, popular_words, doc_tag, prev_sentiment)

			if sentiment in sentiment_vectors:
				sentiment_vectors[sentiment] = f.merge_features(sentiment_vectors[sentiment], features)
			else:
				sentiment_vectors[sentiment] = features

			prev_sentiment = sentiment

#		if i % 20 == 0:
#			print "Done with " + str(i)

	for sentiment in sentiment_vectors:
		sentiment_vectors[sentiment] = f.smooth_features(sentiment_vectors[sentiment])

	return sentiment_vectors
def cross_validate():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Cross-Validating")
    rf = RandomForestClassifier(n_estimators=10,
                                verbose=1,
                                compute_importances=True,
                                n_jobs=2)
    cv = cross_validation.KFold(len(data),
                                k=10,
                                indices=False)
    results = []
    for traincv, testcv in cv:
        print "\t-- cv [%d]"%len(results)
        print "\t","extracting features"
        #...
        # traincv/testcv are boolean masks (indices=False), so index the data frame
        feacv = features.extract_features(feature_names,
                                          data[traincv])
        print "\t","learning"
        rf.fit(feacv, data["OpenStatus"][traincv])
        print "\t","predicting"
        testfea = features.extract_features(feature_names,
                                            data[testcv])
        probs = rf.predict_proba(testfea)
        print "\t","evaluating"
        # assumes llfun(actual_labels, predicted_probabilities)
        results.append(llfun(data["OpenStatus"][testcv], probs))
    print "LogLoss: " + str(np.array(results).mean())
def create_dataset(split = 0.5, size = None):
	"""
	Reads in a set of texts and split into train and test sets, tagged by author
	"""
	train_data, test_data = [],[]
	max_feats = defaultdict(float)
	for file_name in os.listdir(path)[:size]:
		base_name = os.path.basename(file_name)
		author = base_name.split('_',1)[0]

		print "Reading in from %s" % base_name
		with codecs.open(os.path.join(path, file_name),'r','utf8') as doc:
			content = doc.read()
			feat_vec = features.extract_features(content)
			for feat, value in feat_vec.iteritems():
				if value > max_feats[feat]:
					max_feats[feat]=value
			length = len(feat_vec)
			feat_vec['author']=author
			if random.random()<split:
				train_data.append(feat_vec)
			else:
				test_data.append(feat_vec)

	print "Normalizing..."
	for feat_vec in train_data:
		for feat in feat_vec:
			feat_vec[feat]/=max_feats[feat]
	for feat_vec in test_data:
		for feat in feat_vec:
			feat_vec[feat]/=max_feats[feat]

	return train_data, test_data
def main():
    
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Writing short sample features file")
    ''' preview in console '''
    print(fea.values[:4])
    print fea.describe().to_string()
    ''' save the X features data (matrix)'''
    # cu.write_submission(train_features_short_file, fea.values)
    np.savetxt(train_features_short_file, fea.values, fmt='%d', delimiter=',', newline='\n')

    
    '''train_features_short = [fea, data["OpenStatus"]]'''
    closed_reasons = data["OpenStatus"]
    closed_reasons_count = Counter(closed_reasons)

    print(closed_reasons_count.keys()[0:5])
    closed_reasons_enum = map(closed_reasons_count.keys().index, closed_reasons)    
    print(closed_reasons_enum[:9])
    
    print("Saving submission to %s" % submission_file)
    ''' save the y supervised classification data (vector) '''
    np.savetxt(train_y_short_file, closed_reasons_enum, fmt='%d', delimiter=',', newline='\n')

    '''
Example #15
def train(samples, vocabulary):
    logger.debug("Extracting features")

    X = []
    for s in samples:
        X.append(extract_features(s[0], vocabulary))
    X = sp.vstack(X, format="csr")

    y = np.array([s[1] for s in samples])

    clf = RandomForestClassifier(n_estimators=30)

    if args["-c"]:
        logger.debug("Performing N-fold cross-validation (N=%s)" % args["-f"])
        scores = cross_validation.cross_val_score(
            clf, X.toarray(), y, n_jobs=int(args["-j"]), cv=int(args["-f"]), scoring=args["-s"]
        )
        print("F1: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

    logger.debug("Training model on all data")
    clf.fit(X.toarray(), y)

    logger.debug("Done, returning model and vocabulary")

    return (clf, vocabulary)
Example #16
def predict(clf, ua, vocabulary):
    """Predict if a patient is diagnosed with a disease."""

    X = extract_features(ua, vocabulary)
    pred = clf.predict(X.toarray())

    return X, pred
def preprocess(file):
    print('  Tokenizing...')
    tokens = [line.strip() for line in file]

    # Generate all word POS tags
    print('  Tagging parts of speech...')
    pos_tagged = nltk.pos_tag(tokens)

    # Generate all word lemma forms
    print('  Finding lemma forms...')
    l = lemmatizer()
    lemma_tagged = [{'word': token, 'lemma': l(token, pos), 'pos': pos} for (token, pos) in pos_tagged]

    # Make a list of lists
    # Go through all words, appending to a list within the list
    # If the word POS is '.', then flush the current list to the sentence list with that word
    # Then start working with a new list
    sentence_chunked = []
    current_sentence = []
    for word_form in lemma_tagged:
        current_sentence.append(word_form)

        if word_form['pos'] == '.':
            sentence_chunked.append(current_sentence)
            current_sentence = []

    feature_tagged = []
    for sentence in sentence_chunked:
        for i in range(0, len(sentence)):
            word_form = sentence[i]
            feature_tagged.append(extract_features(word_form, i, sentence))

    return feature_tagged
Example #18
def train(sentences):
    """Train NER tagger.

    Parameters
    ----------
    sentences : iterable over list
        A sequence of lists of tokens.
    """
    if not isinstance(sentences, list):
        sentences = list(sentences)

    logger.debug("Extracting features")

    vocabulary = dict((t[0], i) for s in sentences for i, t in enumerate(s))
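    # maps each token string to its position index within a sentence
    # (tokens seen in several positions keep the last index encountered)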

    X = []
    for i, s in enumerate(sentences):
        X.append(extract_features(s, vocabulary))
    X = sp.vstack(X, format='csr')

    # FIXME Only BIO tags for now
    y = np.array([bio_int[tok[2][0]] for s in sentences for tok in s])

    params = {
        "loss": ["l1", "l2"],
        "multi_class": [True, False],
        "C": [1., 10., 100.],
    }
    logger.debug("Training linear SVMs")
    clf = GridSearchCV(LinearSVC(), params, n_jobs=-1).fit(X, y)
    logger.debug("Done, returning the best one")
    return (clf.best_estimator_, vocabulary)
Example #19
def classify(text):
	s = score(extract_features(text))
	print s
	if s <= MAX_HAM_SCORE:
		return "ham"
	elif s >= MIN_SPAM_SCORE:
		return "spam"
	else:
		return "unsure"
Example #20
def compile_data(input_file, label_file):
    cf = ff.extract_features(input_file)
    truth = la.do_label(cf, label_file)
    data = []
    # we can throw away the time stamps now
    for x in cf:
        data.append(x[3])

    return data, truth
Example #21
 def operate(word_form, i, sentence):
     feature_set = extract_features(word_form, i, sentence)
     key = generate_key(word_form)
     # Increment total count for sense
     output[key][0] += 1
     # For each found feature, increment
     # count for that feature-value pair
     for feature, value in feature_set.items():
         output[key][1][feature][value] += 1
Example #22
def measure_model(datasize=1000, testsize=500):
    data = cu.get_sample_data_frame(datasize)
    test = cu.get_test_data_frame(testsize)
    # data = full.ix[len(full)/4:].reset_index() # last n/4 * 3 records
    # test = full.ix[:(len(full)/4)-1].reset_index() # first n/4 records
    # data = cu.get_dataframe('train-sample.csv')
    # test = cu.get_dataframe('public_leaderboard.csv')
    fea = features.extract_features(data)
    test_features = features.extract_features(test)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=5)
    rf.fit(fea, data["OpenStatus"])
    probs = rf.predict_proba(test_features)
    new_priors = cu.load_priors("train.csv")
    old_priors = cu.compute_priors(data.OpenStatus)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    y_true = compute_y_true(test)
    score = multiclass_log_loss(y_true, probs)
    return score, rf, fea
Example #23
def make_dataset_from_selfplay(data_extracts):
    '''
    Returns an iterable of tf.Examples.
    Args:
        data_extracts: An iterable of (position, pi, result) tuples
    '''
    tf_examples = (make_tf_example(features_lib.extract_features(pos), pi, result)
                   for pos, pi, result in data_extracts)
    return tf_examples
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    X = features.extract_features(feature_names, data)
    y = [class_labels[i] for i in data["OpenStatus"]]
    skf = StratifiedKFold(y, 10)
    result_f1 = 0
    result_logloss = 0

    fold = 1
    for train, test in skf:

        print "Fold %d" % fold 
        fold+=1

        X_train = [X.ix[i] for i in train]
        y_train = [y[i] for i in train]

        X_test = [X.ix[i] for i in test]
        y_test = [y[i] for i in test]

        if (options.__dict__['classifier'] == 'erf'):
            classifier = ExtraTreesClassifier(n_estimators=100, verbose=0, compute_importances=True, n_jobs=-1)
        elif(options.__dict__['classifier'] == 'mnb'):
            classifier =  MultinomialNB()
        elif (options.__dict__['classifier'] == 'knn'):
            classifier = KNeighborsClassifier(n_neighbors=11)
        elif (options.__dict__['classifier'] == 'gbc'):
            classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1)


        classifier.fit(X_train, y_train)
        
        probs = classifier.predict_proba(X_test)
        if (options.__dict__['priors'] != 0):
            print("Calculating priors and updating posteriors")
            new_priors = cu.get_priors(full_train_file)
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        y_pred = probs
        y_test = np.array(y_test)
        logloss = multiclass_log_loss(y_test, y_pred, eps=1e-15)

        y_pred = classifier.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=None)
        print "Log Loss: %f f1: %f" % (logloss, f1)
        result_f1 += f1
        result_logloss += logloss


    print '\navg LogLoss: %f avg f1: %f' % (result_logloss/10.0, result_f1/10.0)
    def predict(self, ua):
        """Predict if a patient is diagnosed with a disease."""
        
        if re.search(r"urllib|nagios|spider|bot|google|http_request|jeeves|yahoo|http", ua, re.IGNORECASE) is not None:
            return True

        X = extract_features(ua, self.vocabulary)
        pred = self.clf.predict(X.toarray())
        if not pred[0]:
            return True

        return False
Example #26
def predict(clf, sentence, vocabulary):
    """Predict BIO labels for a single sentence."""

    X = extract_features(sentence, vocabulary)
    pred = [int_bio[y] for y in clf.predict(X)]

    # Heuristic repair: make output consistent,
    # but never worse than the raw prediction.
    for i in xrange(len(pred)):
        if pred[i] == "I" and (i == 0 or pred[i - 1] == "O"):
            pred[i] = "B"

    return pred
def profile_one_dim(im):
    im = gray_level(im)
    print(np.shape(im))
    vertical_sum = np.sum(im, axis=0)/np.shape(im)[1]
    fig = plt.figure(0)
    fig.canvas.set_window_title('Projection Profile - ' + filename)
    plt.plot(vertical_sum)
    # plt.show()
    P, X, Y = zone_division(im, vertical_sum)

    density_symmetry, roughness_max, roughness_symmetry = extract_features(im, P, X, Y)
    fv = feature_vector(density_symmetry, roughness_max, roughness_symmetry, filename)
    all_vector.append(fv)
def main():
    print("Reading the data")
    train_data = cu.get_dataframe(train_file)

    print("Extracting features")
    train_features = features.extract_features(feature_names, train_data)

    print("Reading test file and making predictions")
    test_data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, test_data)


    # print("Training random forest")
    # rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    # rf.fit(train_features, train_data["OpenStatus"])
    # probs = rf.predict_proba(test_features)

    # print("Training decision tree")
    # dt = DecisionTreeClassifier()
    # dt.fit(train_features, train_data["OpenStatus"])
    # probs = dt.predict_proba(test_features)

    # print("Training adaboost")
    # ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME").fit(train_features, train_data["OpenStatus"])
    # probs = ada.predict_proba(test_features)

    print("Training nearest neighbors")
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features_scaled = scaler.transform(train_features)
    test_features_scaled = scaler.transform(test_features)
    nbrs = KNeighborsClassifier(n_neighbors=10).fit(train_features_scaled, train_data["OpenStatus"])
    probs = nbrs.predict_proba(test_features_scaled)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    actual = cu.get_actual(test_data["OpenStatus"])
    print(cu.get_log_loss(actual, probs, 10**(-15)))
Example #29
def calc_sentiment_prob(vectors, sent_counts, sentence, sentiment, word_lists, popular_words, doc_tag = None, prev_sentiment = None):
	features = f.extract_features(sentence, word_lists, popular_words, doc_tag, prev_sentiment)

	prob = math.log(float(sent_counts[sentiment]) / float(sent_counts['total']))

	for feature in features:
		if feature not in vectors[sentiment]:
			feature = "<UNK>"

		feature_count = vectors[sentiment][feature]

		prob += math.log(float(feature_count + 1) / float(sent_counts[sentiment] + len(vectors[sentiment])))

	return prob
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=100, verbose=2, compute_importances=True, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])
    gb = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0)
    gb.fit(fea, data["OpenStatus"])
    dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    dt.fit(fea, data["OpenStatus"])
    et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
    et.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)

    probs = rf.predict_proba(test_features)
    probs2 = gb.predict_proba(test_features)
    probs3 = dt.predict_proba(test_features)
    probs4 = et.predict_proba(test_features)
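    # simple ensemble below: average the four classifiers' per-class probabilities
    # (a hedged vectorized equivalent: probs = (probs + probs2 + probs3 + probs4) / 4.0)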

    for i in range(0, len(probs)):
        for j in range(0,5):
            probs[i][j] = (probs[i][j] + probs2[i][j] + probs3[i][j] + probs4[i][j])/4

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example #31
def main():
    start = time.time()

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)
    
        i = _chunksize
        fea = None
        y = []        
        
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status: y.append(index)
        
            i = i + _chunksize
    else:
        print "Reading train data and its features from: " + train_file
        data = cu.get_dataframe(train_file)
        fea = features.extract_features(feature_names,data)
        print "Collecting statuses"
        y = []
        for element in data["OpenStatus"]:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)

    if do_cross_validation == 1:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                       fit_intercept=True, intercept_scaling=1, tol=0.0001)

        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y),k = 10)
        fold = 0
        result_sum = 0
        for train_index,test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
                
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            
            print "fitting this fold's data"
            
            logit.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)

            #_pred_probs = denormalize(logit.predict_proba(X_test))
            _pred_probs = logit.predict_proba(X_test)
            
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001)            
            # evaluating the performance
            result = eval.mcllfun(y_test,_pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold,result)
            
        print "Average MCLL score for this classifier = %0.11f" % (result_sum/10)     
    else:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                       fit_intercept=True, intercept_scaling=1, tol=0.0001) # not available: compute_importances=True

        print "Fitting"
        logit.fit(fea, y)
        
        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names,test_data)

        print "Making predictions"
        global probs
        probs = logit.predict_proba(test_fea)
        
        if is_full_train_set == 0:
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)    

        print "writing submission to " + submission_file
        cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish-start)
Example #32
def main():
    start = time.time()

    result_sum = 0

    data = cu.get_dataframe("data/train-sample.csv")
    #test_data = cu.get_dataframe("data/public_leaderboard.csv")   #use this for evaluating public_leaderboard

    print 'data loaded'

    fea = features.extract_features(feature_names, data)
    #test_fea = features.extract_features(feature_names,test_data)  #use this for evaluating public_leaderboard

    print 'features extracted'

    knn = KNeighborsClassifier(n_neighbors=10, weights='distance')

    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    y = []
    ques_status = [
        'open', 'too localized', 'not constructive', 'off topic',
        'not a real question'
    ]
    for element in data['OpenStatus']:
        for index, status in enumerate(ques_status):
            if element == status: y.append(index)

    print 'starting 10 fold verification'
    # Dividing the dataset into k = 10 folds for cross validation
    skf = StratifiedKFold(y, k=10)
    fold = 0
    for train_index, test_index in skf:
        fold += 1
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for i in train_index:
            temp = []
            temp.append(fea['ReputationAtPostCreation'][i])
            temp.append(fea['UserAge'][i])
            temp.append(fea['Title'][i])
            temp.append(fea['BodyMarkdown'][i])
            temp.append(fea['OwnerUndeletedAnswerCountAtPostTime'][i])
            X_train.append(temp)
            y_train.append(y[i])

        for i in test_index:
            temp = []
            temp.append(fea['ReputationAtPostCreation'][i])
            temp.append(fea['UserAge'][i])
            temp.append(fea['Title'][i])
            temp.append(fea['BodyMarkdown'][i])
            temp.append(fea['OwnerUndeletedAnswerCountAtPostTime'][i])
            X_test.append(temp)
            y_test.append(y[i])

        y_test = vectorize_actual(y_test)  # vectorize y_test
        knn.fit(X_train, y_train)  # train the classifier
        predictions = knn.predict_proba(X_test)  # predict the test fold

        # evaluating the performance
        result = eval_tool.mcllfun(y_test, predictions)
        result_sum += result
        print "MCLL score for fold %d = %0.11f" % (fold, result)

    print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)

    ### Use this code for evaluating public_leaderboard
    '''knn.fit(fea,y)
Example #33
def find_candidates(image, scale, overlap, xband, yband, svc, X_scaler,
                    _count):

    image = color_scale(image)

    xstart, xstop = xband
    ystart, ystop = yband

    # generate HOG features over entire search region
    if config.HOG_FEAT:
        hog_conv = convert_color(image, config.HOG_SPACE)
        hog_region = hog_conv[ystart:ystop, xstart:xstop, :]
        if scale != 1.0:
            hog_region = cv2.resize(hog_region, (0, 0),
                                    fx=1.0 / scale,
                                    fy=1.0 / scale)
        hog_channel = config.HOG_CHANNEL
        if hog_channel == 'ALL':
            hog_features = []
            for channel in range(hog_region.shape[2]):
                hog_features.append(
                    get_hog_features(hog_region[:, :, channel],
                                     feature_vec=False))
            hog_features = np.array(hog_features)
        else:
            hog_features = get_hog_features(hog_region[:, :, hog_channel],
                                            feature_vec=False)[np.newaxis, ...]

    # overlap = config.OVERLAP
    window = config.WINDOW_SIZE

    img_region = image[ystart:ystop, xstart:xstop, :]
    if scale != 1.0:
        img_region = cv2.resize(img_region, (0, 0),
                                fx=1.0 / scale,
                                fy=1.0 / scale)

    # start  window sliding rewrite
    xspan = img_region.shape[1]
    yspan = img_region.shape[0]
    pix_per_step = np.int(window * (1 - overlap))
    buff = np.int(window * overlap)
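    # pix_per_step is the sliding stride in pixels; buff is the overlap margin
    # subtracted from the span when counting how many full windows fit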
    if (xspan - buff) % pix_per_step == 0:
        xwindows = np.int((xspan - buff) / pix_per_step) - 1
    else:
        xwindows = np.int((xspan - buff) / pix_per_step)
    if (yspan - buff) % pix_per_step == 0:
        ywindows = np.int((yspan - buff) / pix_per_step) - 1
    else:
        ywindows = np.int((yspan - buff) / pix_per_step)

    if xwindows <= 0 or ywindows <= 0:
        raise Exception("Invalid config - area too small")

    print('scale,x,y', scale, xwindows, ywindows)

    boxes = []
    cars = 0
    for iy in range(ywindows):
        for ix in range(xwindows):
            leftx = ix * pix_per_step
            topy = iy * pix_per_step
            endx = leftx + window
            endy = topy + window
            subimg = img_region[topy:endy, leftx:endx]

            features = extract_features(subimg)
            features_sc = X_scaler.transform(features.reshape(1, -1))
            prediction = svc.predict(features_sc)

            if prediction == 1:
                cv2.imwrite(
                    'output_images/cars/' + str(_count) + '_' + str(scale) +
                    '.png', cv2.cvtColor(subimg * 255, cv2.COLOR_RGB2BGR))
                cars += 1
                xbox_left = np.int(leftx * scale)
                ytop_draw = np.int(topy * scale)
                win_draw = np.int(window * scale)
                box = ((xbox_left + xstart, ytop_draw + ystart),
                       (xbox_left + win_draw + xstart,
                        ytop_draw + win_draw + ystart))
                confidence = svc.decision_function(features_sc)
                pred_box = PredictionBox(box, confidence)
                boxes.append(pred_box)
    if cars != 0:
        print('!!!', scale, ':', cars)

    return boxes
 def run(self, position):
     'Return a sorted list of (probability, move) tuples'
     processed_position = features.extract_features(position)
     probabilities = self.session.run(
         self.output, feed_dict={self.x: processed_position[None, :]})[0]
     return probabilities.reshape([go.N, go.N])
import numpy as np
import pickle
import cv2
import sys
from color import convert_color
from features import extract_features
from detect_with_labels import cars_from_bboxes, draw_boxes

dist_pickle = pickle.load( open("svc_classifier.p", "rb" ) )
svc = dist_pickle["svc"]
X_scaler = dist_pickle["X_scaler"]
p = dist_pickle["parameters"]


features = extract_features(sys.argv[1:], color_space=p['color_space'],
                        spatial_size=p['spatial_size'], hist_bins=p['hist_bins'],
                        orient=p['orient'], pix_per_cell=p['pix_per_cell'],
                        cell_per_block=p['cell_per_block'],
                        hog_channel=p['hog_channel'], spatial_feat=p['spatial_feat'],
                        hist_feat=p['hist_feat'], hog_feat=p['hog_feat'])

scaled_features = X_scaler.transform(features)
for prediction in svc.predict(scaled_features):
    print('car' if prediction else 'not car')
Example #36
     "entropy"]
class_names = ["Attention", "HornsUp", "TrailArms"]

print("Extracting features and labels for window size {} and step size {}...".format(window_size, step_size))
sys.stdout.flush()

n_features = len(feature_names)

X = np.zeros((0,n_features))
y = np.zeros(0,)

for i,window_with_timestamp_and_label in slidingWindow(data, window_size, step_size):
    # omit timestamp and label from accelerometer window for feature extraction:
    window = window_with_timestamp_and_label[:,1:-1]
    # extract features over window:
    x = extract_features(window)
    # append features:
    X = np.append(X, np.reshape(x, (1,-1)), axis=0)
    # append label:
    y = np.append(y, window_with_timestamp_and_label[5, -1])

print("Finished feature extraction over {} windows".format(len(X)))
print("Unique labels found: {}".format(set(y)))
sys.stdout.flush()

# %%---------------------------------------------------------------------------
#
#		                    Plot data points
#
# -----------------------------------------------------------------------------
        while hdata[count][0] < window_with_timestamp_and_label[row][
                4] and count > 0:
            count = count - 1
            print("changed count ", count)
        #remove timestamps from accel data
        temp = np.vstack((temp, window_with_timestamp_and_label[row][:-2]))
        #add hr data to accel
        hr_label = np.append(hdata[count][1], hdata[count][2])
        window_with_timestamp_and_label[row] = np.append(
            temp[row + 1], hr_label)
        #add in label (hr_data is on form hr, t, label)
        #remove time and label for feature extraction
    window = window_with_timestamp_and_label[:, :-1]
    # extract features over window:
    x = extract_features(
        window
    )  #x, y, z, t (not reoriented)  -> x, y, z, heart rate, label/class -> x, y, z, hr
    # append features:
    # shapes into 1 row with unspecified number of columns (so just 1 row of n_features)
    X = np.append(X, np.reshape(x, (1, -1)), axis=0)
    # append label:
    y = np.append(
        y, window_with_timestamp_and_label[10,
                                           -1])  #we don't know why this is 10?

print("Finished feature extraction over {} windows".format(len(X)))
print("Unique labels found: {}".format(set(y)))
sys.stdout.flush()

# %%---------------------------------------------------------------------------
#
Example #38
def main():

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)

        i = _chunksize
        fea = None
        y = []

        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(
                    features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status: y.append(index)

            i = i + _chunksize
    else:
        print("Reading the data from:" + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status: y.append(index)

    if do_cross_validation == 1:
        depth = len(feature_names)
        print "depth=" + str(depth)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=depth,
                                        init=None,
                                        random_state=None)

        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])

            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            print "Fitting for fold " + str(fold)

            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)

            _pred_probs = rf.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [
                0.03410911204982466, 0.01173872976800856, 0.018430671606251586,
                0.926642216133641, 0.009079270442274271
            ]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs,
                                                   new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum /
                                                                   10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=len(feature_names),
                                        init=None,
                                        random_state=None)

        rf.fit(fea, y)

        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)

        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('mode', help='extract | tsne | umap')
    parser.add_argument(
        'data',
        help=
        '[features]: Filepath to an image or folder containing images to extract features from. [tsne/umap]: Filepath to a .csv file to read into a DataFrame. '
    )
    parser.add_argument('out', help='Output filepath of operation')
    parser.add_argument(
        '--feature-cols',
        '-f',
        help=
        '[tsne/umap]: Numerical data column indices to treat as features. Ex: "B,C,F", use "all" to consider all columns (excluding optional unique-col).'
    )
    parser.add_argument(
        '--unique-col',
        '-u',
        help=
        '[tsne/umap]: The column index containing unique IDs for each row (typically "ID" or "Name" column). Not required. Omitted from "all" feature-cols'
    )
    parser.add_argument(
        '--reduce',
        '-r',
        help=
        '[tsne/umap]: How many dimensions to reduce features to. Default is 2.',
        default='2')
    parser.add_argument(
        '--model',
        '-m',
        help=
        '[features]: Which model to use. ResNet50 | Xception | VGG16 | VGG19 | InceptionV3 | MobileNet. Default: ResNet50',
        default='ResNet50')

    args = parser.parse_args(argv[1:])

    # === FEATURE EXTRACTION ===
    # We expect an image filepath or folder of images
    if args.mode == 'features':
        assert os.path.exists(args.data),\
            'Features mode (data arg): File or directory not found: "{}"'\
            .format(args.data)

        # Calculate and write to args.out
        features = extract_features(args.data,
                                    model=args.model,
                                    write_to=args.out)

    # === DIMENSION REDUCTION ===
    # We expect a .csv file of features
    elif args.mode in ['tsne', 'umap']:

        # Make sure we know what columns are intended to be used numerically as a list of strings, or 'all'
        feature_cols = args.feature_cols
        if feature_cols is None:
            raise Exception(
                'Feature reduction mode: No data column indices provided. Example usage: "--feature-cols B,C,F", "--feature-cols all"'
            )
        elif feature_cols != 'all':
            feature_cols = [
                s.strip() for s in feature_cols.split(',') if s.strip() != ''
            ]

        # Parse the data into a squashed pd.DataFrame with first column being unique keys
        df = parse_data(args.data, feature_cols, args.unique_col)

        if args.mode == 'tsne':
            tsne(df, dims=int(args.reduce), write_to=args.out)

        elif args.mode == 'umap':
            umap(df, write_to=args.out)
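# example invocations (hedged; the script file name and paths are assumptions):
#   python cli.py features ./images out_features.csv --model VGG16
#   python cli.py tsne out_features.csv out_tsne.csv --feature-cols all --reduce 2
# note: the extraction branch checks args.mode == 'features', although the
# positional-argument help text advertises 'extract'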
Example #40
def yield_data(filename):
    with open(filename) as handle:
        for line in handle:
            s = line.strip().split(',')
            h1, h2 = s[-2], s[-1]
            yield from extract_features(h1, h2, include_target=True)
def predict():
    """
    Given a window of accelerometer data, predict the activity label. 
    Then use the onActivityDetected(activity) function to notify the 
    Android must use the same feature extraction that you used to 
    train the model.
    """
    # have to fix the window size
    prediction_array=np.array([])
    p_time=np.array([])
    clf = load("classifier.pickle")
    # maybe we are not even filling buffer but just running a for loop

    data_file_ss_11 = os.path.join('data', 'accel_data-12-08-BP-ss.csv')
    data_ss_11 = np.loadtxt(data_file_ss_11, delimiter=',', dtype = object, converters = {0: np.float, 1: np.float, 2: np.float, 3: lambda t: datetime.strptime(t.decode("utf-8"), "%d/%m/%Y %H:%M")})
    data_ss_11 = np.insert(data_ss_11, 3, 0, axis = 1)
    hdata_file_ss_11 = os.path.join('data', 'BPM_2017-12-08-BP-ss.csv')
    hdata_ss_11 = np.loadtxt(hdata_file_ss_11, delimiter=',', dtype = object, converters = {0: lambda t: datetime.strptime(t.decode("utf-8"), "%d/%m/%Y %H:%M"), 1: np.float})


    data = data_ss_11
    hdata = hdata_ss_11


    window_size=20
    step_size=20
    #because hr data in backwards
    count = len(hdata)-1
    for i,window_with_timestamp_and_label in slidingWindow(data, window_size, step_size):
        temp = np.zeros((1,3))
        #while time at row count is under time at accel, increase count (move to next row)
        #only have one window. Each row in window has own observation that needs hr
        for row in range(len(window_with_timestamp_and_label)):

            # print (hdata[count])
            # print(" ")
            # print (window_with_timestamp_and_label[row])


            while hdata[count][0] < window_with_timestamp_and_label[row][4] and count > 0:
                count=count-1
                print("changed count ", count)

            if row==0:
                p_time=np.append(p_time,window_with_timestamp_and_label[row][4])
            #remove timestamps from accel data
            temp = np.vstack((temp,window_with_timestamp_and_label[row][:-2]))
            #add hr data to accel
            hr_label = np.append(hdata[count][1],9)
            window_with_timestamp_and_label[row] = np.append(temp[row+1], hr_label)
            #add in label (hr_data is on form hr, t, label)
            #remove time and label for feature extraction
        window = window_with_timestamp_and_label[:,:-1]
        # extract features over window:
        # print("Buffer filled. Run your classifier.")

        prediction=clf.predict(np.reshape(extract_features(window),(1,-1)))[0]
        prediction_array=   np.append(prediction_array,prediction)
        # print prediction

    # for i in range(0,len(prediction_array)):
    #     p_time=np.append(p_time,i)

    plt.plot(p_time,prediction_array)
    plt.xlabel('Time')
    plt.ylabel('Predicted Label')
    plt.show()
    return
Example #42
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    train_file = '../../corpus/conllx/sv/swedish_talbanken05_train.conll'
    test_file = '../../corpus/conllx/sv/swedish_talbanken05_test.conll'

    train_sentences = conll.read_sentences(train_file)
    test_sentences = conll.read_sentences(test_file)
    formatted_train_corpus = conll.split_rows(train_sentences,
                                              column_names_2006)
    formatted_test_corpus = conll.split_rows(test_sentences, column_names_2006)
    for mode in [1, 3]:
        print("Extracting the features...")
        X_dict, y = features.extract_features(formatted_train_corpus, mode)
        print("Encoding the features...")
        # Vectorize the feature matrix and carry out a one-hot encoding
        vec = DictVectorizer(sparse=True)
        X = vec.fit_transform(X_dict)

        print("Training the model...")
        classifier = linear_model.LogisticRegression(penalty='l2',
                                                     dual=True,
                                                     solver='liblinear',
                                                     multi_class='ovr')
        model = classifier.fit(X, y)
        dump(model, './clfs/logres_mode=' + str(mode) + '_feats.joblib')
        dump(vec,
             './vectorizers/mode=' + str(mode) + '_feats_vectorizer.joblib')
        print("Predicting the chunks in the test set...")
    def train(self):
        # Load data
        cars = self.fill_data(self.root_car)
        notcars = self.fill_data(self.root_notcars)

        # Show an example of each kind and print number of examples
        if self.debug:
            test_car = visualizer.read_and_draw_image(cars[0], 'Car')
            test_not_car = visualizer.read_and_draw_image(notcars[0], 'No_Car')
            print("Number of Car examples: ", len(cars))
            print("Number of Non-Car examples: ", len(notcars))

        # TODO see if image ranges from 0 to 1
        car_features = features.extract_features(
            cars[0:self.num_train_examples],
            color_space=self.color_space,
            spatial_size=self.spatial_size,
            hist_bins=self.hist_bins,
            orient=self.orient,
            pix_per_cell=self.pix_per_cell,
            cell_per_block=self.cell_per_block,
            hog_channel=self.hog_channel,
            spatial_feat=self.spatial_feat,
            hist_feat=self.hist_feat,
            hog_feat=self.hog_feat)
        notcar_features = features.extract_features(
            notcars[0:self.num_train_examples],
            color_space=self.color_space,
            spatial_size=self.spatial_size,
            hist_bins=self.hist_bins,
            orient=self.orient,
            pix_per_cell=self.pix_per_cell,
            cell_per_block=self.cell_per_block,
            hog_channel=self.hog_channel,
            spatial_feat=self.spatial_feat,
            hist_feat=self.hist_feat,
            hog_feat=self.hog_feat)

        if self.debug:
            from skimage.feature import hog

            car_feat_image = cv2.cvtColor(test_car, cv2.COLOR_RGB2YCrCb)
            for channel in range(car_feat_image.shape[2]):
                channel_image = car_feat_image[:, :, channel]
                fd, hog_image = hog(channel_image,
                                    orientations=self.orient,
                                    pixels_per_cell=(self.pix_per_cell,
                                                     self.pix_per_cell),
                                    cells_per_block=(self.cell_per_block,
                                                     self.cell_per_block),
                                    visualise=True)
                visualizer.draw_two_images(channel_image,
                                           hog_image,
                                           title='Car_Channel_' + str(channel),
                                           save=True)

            notcar_feat_image = cv2.cvtColor(test_not_car, cv2.COLOR_RGB2YCrCb)
            for channel in range(notcar_feat_image.shape[2]):
                channel_image = notcar_feat_image[:, :, channel]
                fd, hog_image = hog(channel_image,
                                    orientations=self.orient,
                                    pixels_per_cell=(self.pix_per_cell,
                                                     self.pix_per_cell),
                                    cells_per_block=(self.cell_per_block,
                                                     self.cell_per_block),
                                    visualise=True)
                visualizer.draw_two_images(channel_image,
                                           hog_image,
                                           title='No_Car_Channel_' +
                                           str(channel),
                                           save=True)

        # TODO normalize data
        # TODO try different colorspaces
        # TODO try color HOG

        X = np.vstack((car_features, notcar_features)).astype(np.float64)
        print(X.shape)
        X_scaler = StandardScaler().fit(X)
        self.X_scaler = X_scaler
        scaled_X = X_scaler.transform(X)
        y = np.hstack(
            (np.ones(len(car_features)), np.zeros(len(notcar_features))))

        rand_state = np.random.randint(0, 100)
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_X, y, test_size=0.2, random_state=rand_state)

        print('Using:', self.orient, 'orientations', self.pix_per_cell,
              'pixels per cell and', self.cell_per_block, 'cells per block')
        print('Feature vector length:', len(X_train[0]))
        # Use a linear SVC
        svc = LinearSVC(C=1000)
        self.svc = svc
        # Check the training time for the SVC
        t = time.time()
        svc.fit(X_train, y_train)
        t2 = time.time()
        print(round(t2 - t, 2), 'Seconds to train SVC...')
        # Check the score of the SVC
        print('Test Accuracy of SVC = ', round(svc.score(X_test, y_test), 4))

        # Check the prediction time for a single sample
        t = time.time()
        n_predict = 10
        print('My SVC predicts: ', svc.predict(X_test[0:n_predict]))
        print('For these', n_predict, 'labels: ', y_test[0:n_predict])
        t2 = time.time()
        print(round(t2 - t, 5), 'Seconds to predict', n_predict,
              'labels with SVC')
Exemple #44
0
male_count = 0
female_count = 0

try:
  index = -1
  while True:
    index += 1
    if (male_count + female_count) % 100 == 0:
      conn.commit()
      print(f"a: {male_count + female_count}, m: {male_count}, f: {female_count}")

    c.execute("SELECT id, male, body FROM posts WHERE length(body) > 150 ORDER BY ROWID DESC LIMIT ? OFFSET ?;", (500, index * 500))
    posts = c.fetchall()
    if len(posts) == 0:
      print(f"No more posts.")
      conn.commit()
      exit()
    for post in posts:
      post_id = post[0]
      is_male = post[1]
      body = post[2] 

      x = features.extract_features(body)
      c.execute("INSERT INTO examples VALUES (?, ?, ?, ?);", (post_id, len(body), is_male, json.dumps(x)))
      if is_male:
        male_count += 1
      else:
        female_count += 1
except KeyboardInterrupt:
  conn.commit()
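The rows written to the examples table above can later be read back to build a training matrix. A minimal sketch, assuming the four columns inserted above (post id, body length, male flag, JSON-encoded feature vector) and a hypothetical posts.db file:

import json
import sqlite3

conn = sqlite3.connect("posts.db")  # hypothetical database file
c = conn.cursor()

X, y = [], []
for _post_id, _length, is_male, feature_json in c.execute("SELECT * FROM examples;"):
    X.append(json.loads(feature_json))  # feature vector stored as JSON text
    y.append(1 if is_male else 0)       # 1 = male, 0 = female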
Exemple #45
0
def load_and_train_all_features():
    # Read in car and non-car images

    non_vehicle_images = glob.glob('training_data/non-vehicles/*/*.png')
    vehicle_images = glob.glob('training_data/vehicles/*/*.png')
    train_jpeg = False
    #non_vehicle_images = glob.glob('training_data/non-vehicles_smallset/*/*.jpeg')
    #vehicle_images = glob.glob('training_data/vehicles_smallset/*/*.jpeg')
    #train_jpeg=True
    #print('USING SMALLSET JPEG IMAGES!!!! use mpimg to load in features')

    print('Training Data:')
    print('Car images:', len(vehicle_images))
    print('Non-Car images:', len(non_vehicle_images))

    cars = []
    notcars = []

    for image in vehicle_images:
        cars.append(image)

    for image in non_vehicle_images:
        notcars.append(image)

    # experiment with these values to see how your classifier
    # performs under different binning scenarios

    colorspace = 'YCrCb'  # Can be RGB, HSV, LUV, HLS, YUV, YCrCb
    orient = 9  # HOG orientations
    pix_per_cell = 8  # HOG pixels per cell
    cell_per_block = 2  # HOG cells per block
    hog_channel = 'ALL'  # Can be 0, 1, 2, or "ALL"
    spatial = 32
    spatial_size = (spatial, spatial)  # Spatial binning dimensions
    hist_bins = 32  # Number of histogram bins
    spatial_feat = True  # Spatial features on or off
    hist_feat = True  # Histogram features on or off
    hog_feat = True  # HOG features on or off
    y_start_stop = [400, 656]  # Min and max in y to search in slide_window()
    train_jpeg = False

    print('Generating Features for Cars')
    car_features = extract_features(cars,
                                    color_space=colorspace,
                                    spatial_size=(spatial, spatial),
                                    hist_bins=hist_bins,
                                    hist_range=(0, 256),
                                    orient=orient,
                                    pix_per_cell=pix_per_cell,
                                    cell_per_block=cell_per_block,
                                    hog_channel=hog_channel,
                                    spatial_feat=spatial_feat,
                                    hist_feat=hist_feat,
                                    hog_feat=hog_feat,
                                    train_jpeg=train_jpeg)

    print('Generating Features for non car images')

    notcar_features = extract_features(notcars,
                                       color_space=colorspace,
                                       spatial_size=(spatial, spatial),
                                       hist_bins=hist_bins,
                                       hist_range=(0, 256),
                                       orient=orient,
                                       pix_per_cell=pix_per_cell,
                                       cell_per_block=cell_per_block,
                                       hog_channel=hog_channel,
                                       spatial_feat=spatial_feat,
                                       hist_feat=hist_feat,
                                       hog_feat=hog_feat,
                                       train_jpeg=train_jpeg)

    # Create an array stack of feature vectors
    X = np.vstack((car_features, notcar_features)).astype(np.float64)
    # Fit a per-column scaler
    X_scaler = StandardScaler().fit(X)
    # Apply the scaler to X
    scaled_X = X_scaler.transform(X)

    # Define the labels vector
    y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))

    # Split up data into randomized training and test sets
    rand_state = np.random.randint(0, 100)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_X, y, test_size=0.2, random_state=rand_state)

    print('Using spatial binning of:', spatial, 'and', hist_bins,
          'histogram bins')
    print('Feature vector length:', len(X_train[0]))

    # Use a linear SVC
    svc = LinearSVC(C=1.0)
    #clf = CalibratedClassifierCV(svc) # performance was same on small sample set.
    clf = svc
    # Check the training time for the SVC
    t = time.time()
    clf.fit(X_train, y_train)
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to train SVC...')
    # Check the score of the SVC
    print('Test Accuracy of SVC = ', round(clf.score(X_test, y_test), 4))
    # Check the prediction time for a single sample
    t = time.time()
    n_predict = 10
    print('My SVC predicts:     ', clf.predict(X_test[0:n_predict]))
    print('For these', n_predict, 'labels: ', y_test[0:n_predict])
    t2 = time.time()
    print(round(t2 - t, 5), 'Seconds to predict', n_predict, 'labels with SVC')

    print('saving model to models/classifier-svm.pkl')
    joblib.dump(clf, 'models/classifier-svm.pkl')
    joblib.dump(X_scaler, 'models/xscaler.pkl')

    return clf, X_scaler
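At prediction time, the persisted classifier and scaler would be loaded and applied in the same order used during training. A minimal sketch, assuming a feature vector produced by the same extract_features configuration:

import joblib  # or `from sklearn.externals import joblib` on older scikit-learn

clf = joblib.load('models/classifier-svm.pkl')
X_scaler = joblib.load('models/xscaler.pkl')

def classify_window(feature_vector):
    # Scale with the scaler fitted on the training data, then predict car / not-car
    scaled = X_scaler.transform([feature_vector])
    return clf.predict(scaled)[0]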
Exemple #46
0
def predict(window):
  x = extract_features(window)
  label = int(classifier.predict(x)[0])
  onActivityDetected(class_names[label])

  return
Exemple #47
0
def play(network, args, device=None):
    ''' Plays out a self-play match, returning a MCTSPlayer object containing:
        - the final position
        - the n x 362 tensor of floats representing the mcts search probabilities
        - the n-ary tensor of floats representing the original value-net estimate
          where n is the number of moves in the game'''
    readouts = strat_args.num_readouts  # defined in strategies.py
    # Disable resign in 5% of games
    if random.random() < args.resign_disable_pct:
        resign_threshold = -1.0
    else:
        resign_threshold = None

    player = MCTSPlayer(network,
                        device=device,
                        resign_threshold=resign_threshold)

    player.initialize_game()

    # Must run this once at the start to expand the root node.
    first_node = player.root.select_leaf()
    features = extract_features(first_node.position, NEW_FEATURES)
    prob, val = network.policy_value_fn(features, device=device)
    first_node.incorporate_results(prob.flatten(), val.flatten(), first_node)

    while True:
        start = time.time()
        player.root.inject_noise()
        current_readouts = player.root.N
        # we want to do "X additional readouts", rather than "up to X readouts".
        while player.root.N < current_readouts + readouts:
            player.tree_search()

        if args.verbose >= 3:
            print(player.root.position)
            print(player.root.describe())

        if player.should_resign():
            player.set_result(-1 * player.root.position.to_play,
                              was_resign=True)
            break
        move = player.pick_move()
        player.play_move(move)
        if player.root.is_done():
            player.set_result(player.root.position.result(), was_resign=False)
            break

        if (args.verbose >= 2) or (args.verbose >= 1
                                   and player.root.position.n % 10 == 9):
            print("Q: {:.5f}".format(player.root.Q))
            dur = time.time() - start
            print("%d: %d readouts, %.3f s/100. (%.2f sec)" %
                  (player.root.position.n, readouts, dur / readouts * 100.0,
                   dur),
                  flush=True)
        if args.verbose >= 3:
            print("Played >>",
                  coords.to_gtp(coords.from_flat(player.root.fmove)))

    if args.verbose >= 2:
        utils.dbg("%s: %.3f" % (player.result_string, player.root.Q))
        utils.dbg(player.root.position, player.root.position.score())

    return player
Exemple #48
0
    def setUpClass(self):
        # Set up data for the whole TestCase
        self.train_data, self.train_images = read_data_from_file('synimg/train/data.csv', max_per_class=MAX_PER_CLASS)
        self.label_encoder, self.train_data = get_labels(self.train_data, print_classes=False)  # one-hot encode, returns in column 'style_id'
        self.X_train = extract_features(self.train_images)
        self.y_train = list(self.train_data['style_id'])
Exemple #49
0
    def extract(self,
                image_paths,
                layer_names,
                flipped=False,
                batch_size=64,
                should_reshape_vectors=True,
                verbose=2,
                spatial_pool=None):
        """
        Extract features from the image
        """
        try:
            image_paths.__getattribute__('__len__')
        except AttributeError:
            raise TypeError('image_paths must be a container of paths')
        if len(self.feature_norm_method) > 1:
            raise NotImplementedError()
        if spatial_pool not in [None, 'max', 'sum']:
            raise ValueError('Unknown spatial pool: {}'.format(spatial_pool))
        if spatial_pool is not None:
            should_reshape_vectors = False
        if not isinstance(layer_names, list):
            layer_names = [layer_names]
        if len(layer_names) > 1 and not should_reshape_vectors:
            raise ValueError(
                'Cannot stack features from several layers without reshaping')

        getter = image_getter.ImageGetterFromPaths(
            image_paths, im_shape=self.img_resize_shape, rgb_batch=True)

        feature_dict = extract_features(
            flipped,
            self,
            layer_names=layer_names,
            image_getter=getter,
            im_shape=self.img_resize_shape,
            mean=None,
            batch_size=batch_size,
            verbose=verbose,
            should_reshape_vectors=should_reshape_vectors)

        # feed the augmented images to the net stream and pool the features afterwards
        features = np.hstack(feature_dict.values())
        if spatial_pool is not None and len(features.shape) != 4:
            raise ValueError(
                'Cannot do a spatial pool on features with shape: {}'.format(
                    features.shape))
        if spatial_pool == 'max':
            features = np.max(features, axis=(1, 2))
        elif spatial_pool == 'sum':
            features = np.sum(features, axis=(1, 2))

        # print 'features.shape={}'.format(features.shape)
        if 'unit_norm' in self.feature_norm_method:
            if not should_reshape_vectors:
                raise ValueError(
                    'Cannot do unit_norm without reshaping the vectors')
            sklearn.preprocessing.normalize(features,
                                            norm='l2',
                                            axis=1,
                                            copy=False)
        assert len(features) == len(image_paths)
        return features
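A usage sketch for the method above, assuming an already-constructed extractor instance and a layer name that exists in the underlying network (both hypothetical here):

paths = ['images/img_0001.jpg', 'images/img_0002.jpg']  # hypothetical image paths
feats = extractor.extract(paths,
                          layer_names='fc7',  # assumed layer name
                          batch_size=32,
                          should_reshape_vectors=True)
assert feats.shape[0] == len(paths)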
Exemple #50
0
    def run(self, position):
        processed_position = features.extract_features(position,
                                                        features=self.features)
        probabilities = self.session.run(
            self.output, feed_dict={self.x: processed_position[None, :]})[0]
        return probabilities.reshape(go.N, go.N)
Exemple #51
0
def _make_tf_example_from_pwc(position_w_context):
    f = dual_net.get_features()
    features = features_lib.extract_features(position_w_context.position, f)
    pi = _one_hot(coords.to_flat(position_w_context.next_move))
    value = position_w_context.result
    return make_tf_example(features, pi, value)
    def evaluate_model(this, samples, targets):
        predictions = this.model.predict(samples)
        for t, p in zip(targets, predictions):
            print "%s, %s" % (t[0], this.num_class_map[p])

    def predict(this, sample):
        prediction = this.model.predict(sample)
        return this.num_class_map[prediction[0]]


if __name__ == '__main__':
    ## Parse command line arguments
    parser = argparse.ArgumentParser(
        description=random_forest_meta['program_description'])
    parser.add_argument('command', **arg_command)
    parser.add_argument('dataset', **arg_dataset)
    parser.add_argument('classes', **arg_classes)
    args = parser.parse_args()

    cmd = args.command[0]
    dataset = args.dataset[0]

    if cmd == "train":
        gt, features = extract_features(args.dataset[0])
        c = classifier(features, gt, random_forest_model)
    if cmd == "test":
        c = classifier(None, None, None, "models/random_forest.model")
        gt, features = extract_features(args.dataset[0])
        c.evaluate_model(features, gt)
Exemple #53
0
def _make_tf_example_from_pwc(position_w_context):
    features = features_lib.extract_features(position_w_context.position)
    pi = _one_hot(coords.flatten_coords(position_w_context.next_move))
    value = position_w_context.result
    return make_tf_example(features, pi, value)
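The _one_hot helper is not shown in either variant. A minimal sketch consistent with how it is used above, assuming a 19x19 board (361 points plus pass, matching the n x 362 tensor mentioned earlier):

import numpy as np

def _one_hot(index, num_moves=362):
    # Hypothetical helper: a probability vector with all mass on the played move.
    vec = np.zeros(num_moves, dtype=np.float32)
    vec[index] = 1.0
    return vec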
Exemple #54
0
n_features = len(feature_names)

X = np.zeros((0, n_features))
aX = np.zeros((0, n_features))
y = np.zeros(0, )

for i, window_with_timestamp_and_label in slidingWindow(
        data, window_size, step_size):
    # omit timestamp and label from accelerometer window for feature extraction:
    window1 = window_with_timestamp_and_label[:, 1:4]
    window2 = window_with_timestamp_and_label[:, 4:7]

    # extract features over window:
    #accel
    x = extract_features(window1)
    #gyroscope
    gx = extract_features(window2)
    #heart rate is a feature of its own, therefore don't extract

    # append features to array:
    aX = np.reshape(np.append(np.reshape(x, (1, -1)), np.reshape(gx, (1, -1))),
                    (1, -1))
    bX = np.reshape(
        np.append(aX, window_with_timestamp_and_label[(i - 1) % 20, -2]),
        (1, -1))

    X = np.append(X, bX, axis=0)
    y = np.append(y, window_with_timestamp_and_label[10, -1])

print("Finished feature extraction over {} windows".format(len(X)))
Exemple #55
0
def pipeline_v1(vehicles, non_vehicles, params=None, save=False):
    """
    This function performs feature engineering, trains a Linear SVC, and optionally saves the fitted model.
    """
    params = params or {
        # color space
        'cspace': 'YCrCb',  # Can be RGB, HSV, LUV, HLS, YUV, or YCrCb

        # spatial binning params
        'spatial_size': (24, 24),

        # color histogram params
        'hist_bins': 32,
        'hist_range': (0, 256),

        # HOG params
        'orient': 9,
        'pix_per_cell': 8,
        'cell_per_block': 2,
        'hog_channel': 'ALL'  # Can be 0, 1, 2, or "ALL"
    }

    t = time.time()
    vehicle_features = extract_features(vehicles, **params)
    non_vehicle_features = extract_features(non_vehicles, **params)
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to extract features...')

    # Create an array stack of feature vectors
    X = np.vstack((vehicle_features, non_vehicle_features)).astype(np.float64)

    # Fit a per-column scaler
    X_scaler = StandardScaler().fit(X)

    # Apply the scaler to X
    scaled_X = X_scaler.transform(X)

    # Define the labels vector
    y = np.hstack(
        (np.ones(len(vehicle_features)), np.zeros(len(non_vehicle_features))))

    # Split up data into randomized training and test sets
    X_train, X_test, y_train, y_test = train_test_split(scaled_X,
                                                        y,
                                                        test_size=0.2)

    print('Using:', params['orient'], 'orientations, ', params['pix_per_cell'],
          'pixels per cell, and', params['cell_per_block'], 'cells per block')
    print('Feature vector length:', len(X_train[0]))

    # Use a linear SVC
    svc = LinearSVC()

    # Check the training time for the SVC
    t = time.time()
    svc.fit(X_train, y_train)
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to train SVC...')

    # Check the score of the SVC
    acc = round(svc.score(X_test, y_test), 4)
    print('Test Accuracy of SVC = ', acc)

    # Check the prediction time for a single sample
    t = time.time()
    n_predict = 10
    print('My SVC predicts: ', svc.predict(X_test[0:n_predict]))
    print('For these', n_predict, 'labels: ', y_test[0:n_predict])
    t2 = time.time()

    print(round(t2 - t, 5), 'Seconds to predict', n_predict, 'labels with SVC')

    if save:
        now = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
        pickle_fname = (
            './saved_models/{}|test_acc={}|train_samples={}|test_samples={}.p'.
            format(now, acc, len(y_train), len(y_test)))
        with open(pickle_fname, 'wb') as f:
            pickle.dump(
                {
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_test': X_test,
                    'y_test': y_test,
                    'X_scaler': X_scaler,
                    'svc': svc,
                    'params': params
                }, f, pickle.HIGHEST_PROTOCOL)
            print('Saved model and params to {}'.format(pickle_fname))
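A usage sketch for this pipeline, assuming hypothetical training_data directories holding the vehicle and non-vehicle crops:

import glob

vehicles = glob.glob('training_data/vehicles/*/*.png')         # hypothetical paths
non_vehicles = glob.glob('training_data/non-vehicles/*/*.png')
pipeline_v1(vehicles, non_vehicles, save=True)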
Exemple #56
0
 def _extract_features(positions):
     return features.extract_features(self.hparams.board_size,
                                      positions)
Exemple #57
0
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    mnbayes = MultinomialNB(alpha=1.0, fit_prior=True)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    # Must be array type object. Strings must be converted
    # to integer values, otherwise fit method raises ValueError
    y = []

    for element in data['OpenStatus']:
        for index, status in enumerate(ques_status):
            if element == status: y.append(index)

    if do_cross_validation == 1:
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        #skf = StratifiedKFold(y,k = 10)
        skf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in skf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])

            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            mnbayes.fit(
                X_train, y_train
            )  #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
            y_test = vectorize_actual(y_test)  # vectorize y_test

            _pred_probs = mnbayes.predict_proba(X_test)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "Average MCLL score for this classifier = %0.11f" % (result_sum /
                                                                   10)

        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names, test_data)

        print "Fitting"
        mnbayes.fit(
            fea, y
        )  #, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])

        print "Making predictions"
        global probs
        probs = mnbayes.predict_proba(test_fea)

        #if is_full_train_set == 0:
        #	print("Calculating priors and updating posteriors")
        #	new_priors = cu.get_priors(full_train_file)
        #	old_priors = cu.get_priors(train_file)
        #	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print "writing submission to " + submission_file
        cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Exemple #58
0
n_samples = 1000
time_elapsed_seconds = (data[n_samples,0] - data[0,0]) / 1000
sampling_rate = n_samples / time_elapsed_seconds

# TODO: list the class labels that you collected data for in the order of label_index (defined in collect-labelled-data.py)
class_names = ["sitting", "walking"] #...

print("Extracting features and labels for window size {} and step size {}...".format(window_size, step_size))
sys.stdout.flush()

X = []
Y = []

for i,window_with_timestamp_and_label in slidingWindow(data, window_size, step_size):
    window = window_with_timestamp_and_label[:,1:-1]   
    feature_names, x = extract_features(window)
    X.append(x)
    Y.append(window_with_timestamp_and_label[10, -1])
    
X = np.asarray(X)
Y = np.asarray(Y)
n_features = X.shape[1]
    
print("Finished feature extraction over {} windows".format(len(X)))
print("Unique labels found: {}".format(set(Y)))
print("\n")
sys.stdout.flush()

# %%---------------------------------------------------------------------------
#
#		                Train & Evaluate Classifier
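The snippet is cut at the banner above. A minimal sketch of the step it announces, assuming a simple decision tree and the X / Y arrays built earlier (the choice of classifier is hypothetical):

from sklearn.model_selection import cross_val_score  # sklearn >= 0.18
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=3)
scores = cross_val_score(clf, X, Y, cv=10)  # 10-fold cross-validation
print("Mean accuracy over 10 folds: {:.3f}".format(scores.mean()))
clf.fit(X, Y)  # refit on all windows before deployment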
Exemple #59
0
def train(text, category):
	for f in extract_features(text):
		increase_count(f, category)
		increase_total_count(category)
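A complementary classification step, assuming hypothetical get_count / get_total_count lookups that mirror increase_count / increase_total_count, and a Naive-Bayes-style log-likelihood score with add-one smoothing:

import math

def classify(text, categories):
    best, best_score = None, float('-inf')
    for category in categories:
        total = get_total_count(category)   # hypothetical counterpart to increase_total_count
        score = 0.0
        for f in extract_features(text):
            count = get_count(f, category)  # hypothetical counterpart to increase_count
            score += math.log((count + 1.0) / (total + 1.0))
        if score > best_score:
            best, best_score = category, score
    return best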
Exemple #60
0
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    percep = Perceptron(penalty=None,
                        alpha=0.0001,
                        fit_intercept=False,
                        n_iter=5,
                        shuffle=False,
                        verbose=1,
                        eta0=1.0,
                        n_jobs=-1,
                        seed=0,
                        class_weight="auto",
                        warm_start=False)

    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Must be array type object. Strings must be converted
    # to integer values, otherwise fit method raises ValueError
    global y
    y = []

    print "Collecting statuses"

    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print "Fitting"
    percep.fit(fea, y)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Making predictions"
    global probs
    #probs = percep.predict_proba(test_fea) # only available for binary classification
    probs = percep.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    probs = np.resize(probs, (len(probs) / 5, 5))

    #if is_full_train_set == 0:
    #	print("Calculating priors and updating posteriors")
    #	new_priors = cu.get_priors(full_train_file)
    #	old_priors = cu.get_priors(train_file)
    #	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)