Example #1
def getDemoData():
    global COUNTER
    if COUNTER == 0:
        data = preprocessing.readData("data/data_master_2018-06.csv")
        COUNTER += 1
    elif COUNTER == 1:
        data = preprocessing.readData("data/data_master_2018-07.csv")
        COUNTER += 1
    elif COUNTER == 2:
        data = preprocessing.readData("data/data_master_2018-10.csv")
        COUNTER += 1
    else:
        data = preprocessing.readData("data/data_master_2018-10.csv")
        COUNTER += 1
    return data
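getDemoData steps through three monthly snapshots and then keeps returning the October file. It relies on a module-level counter that is not shown; a minimal driver sketch, assuming COUNTER starts at 0 and preprocessing.readData returns a pandas DataFrame:

COUNTER = 0  # assumed module-level state consumed by getDemoData

for _ in range(4):
    df = getDemoData()       # 2018-06, 2018-07, then 2018-10 twice
    print(COUNTER, len(df))  # inspect how many rows each month contains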
Example #2
def mode_baseline(files):
    raw_data = readData(files)
    all_scores = []
    for data_point in raw_data:
        (original_sentence, replStart, replEnd), repl, score = data_point
        all_scores.append(score)
    return stats.mode(all_scores)[0][0]
def task1():
    data = readData(
        [TASK_1 / 'train.csv', TASK_1 / 'dev.csv', EXTRA_TRAIN_TASK_1])

    scores = [score for _, _, score in data]
    plt.hist(scores)
    plt.show()
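mode_baseline and task1 unpack the same record layout from readData: one ((sentence, replStart, replEnd), repl, score) tuple per data point. A tiny illustration of that structure (the values are made up):

# Hypothetical record matching the unpacking used above.
example_point = (("The cat sat on the mat.", 4, 7), "dog", 2.5)

(sentence, start, end), replacement, score = example_point
print(sentence[start:end], "->", replacement, "scored", score)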
Example #4
def mainEnsemble():
    n_estimators = [10, 15, 20]
    folds = 10
    repeats = 3
    n_classifiers = classifiers.len_classify()
    acc = []
    result_acc = [0] * folds * repeats

    for i in range(folds * repeats):
        result_acc[i] = [0] * n_classifiers

    data = preprocessing.readData()
    samples, targets = preprocessing.splitSamples(data)

    for num in n_estimators:
        kfold = preprocessing.repeatCrossValidation(folds, repeats)
        acc = []  # reset per n_estimators value so earlier results are not reused

        for train, test in kfold.split(samples, targets):
            X_train, X_test = samples[train], samples[test]
            y_train, y_test = targets[train], targets[test]

            for i in range(n_classifiers):
                y_pred = ensemble.runBagging(X_train, y_train, X_test, i, num)
                fold_cm, fold_acc = classifiers.mensureAcc(y_pred, y_test)
                acc.append(fold_acc)

        # acc is collected classifier-by-classifier inside each of the
        # folds * repeats splits; rebuild the table row by row.
        for j in range(n_classifiers):
            for k in range(folds * repeats):
                result_acc[k][j] = acc[k * n_classifiers + j]
        file_name = "bagging" + str(num) + ".csv"
        utils.writeCSV(file_name, result_acc)
    print("Finish!!\n")
Example #5
def mean_baseline(files):
    raw_data = readData(files)
    all_scores = []
    for data_point in raw_data:
        (_, _, _), _, score = data_point
        all_scores.append(score)
    return sum(all_scores) / len(all_scores)
Example #6
def mainClassification():
    folds = 10
    n_classifiers = classifiers.len_classify()
    cm, acc = [], []
    result_cm, result_acc = [0] * folds, [0] * folds

    for i in range(folds):
        result_cm[i], result_acc[i] = [0] * n_classifiers, [0] * n_classifiers

    data = preprocessing.readData()
    samples, targets = preprocessing.splitSamples(data)
    kfold = preprocessing.crossValidation(folds)

    for train, test in kfold.split(samples, targets):
        X_train, X_test = samples[train], samples[test]
        y_train, y_test = targets[train], targets[test]

        for i in range(n_classifiers):
            y_pred = classifiers.classify(X_train, y_train, X_test, i)
            fold_cm, fold_acc = classifiers.mensureAcc(y_pred, y_test)
            cm.append(fold_cm)
            acc.append(fold_acc)

    for j in range(n_classifiers):
        for k in range(folds):
            result_cm[k][j] = cm[k * n_classifiers + j]
            result_acc[k][j] = acc[k * n_classifiers + j]
    utils.writeCSV("all_classifiers_confusion-matrix.csv", result_cm)
    utils.writeCSV("all_classifiers_accuracy.csv", result_acc)
    print("Finish!!\n")
def test():
	bias_clf = load_model(biasPKLfile)
	truth_clf = load_model(truthPKLfile)
	docBiasLabel,docTruthLabel = readLabel(testLabelPath,"test")
	title,article,numCitation = readData(testDataPath,"test")  
	testdata = list(zip(title,numCitation,article))
	truth_pred = truth_clf.predict(testdata)
	bias_pred = bias_clf.predict(testdata)
	print("Truth value accuracy: ",np.mean(truth_pred == docTruthLabel))
	print("Bias value accuracy: ",np.mean(bias_pred == docBiasLabel))
Example #8
def evaluate_baseline(baseline_score, files):
    print(baseline_score)
    raw_data = readData(files)

    square_error = 0
    val_examples = 0
    for data_point in raw_data:
        (original_sentence, replStart, replEnd), repl, score = data_point
        square_error += (baseline_score - score) ** 2
        val_examples += 1

    rmse = math.sqrt(square_error / val_examples)
    return rmse
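evaluate_baseline reports the RMSE of predicting one constant score for every example, so it plugs directly into the baselines shown above. A usage sketch; VAL_FILES is a made-up placeholder for whatever file list readData expects:

VAL_FILES = ["dev.csv"]   # hypothetical path list

for baseline in (mean_baseline, mode_baseline):
    rmse = evaluate_baseline(baseline(VAL_FILES), VAL_FILES)
    print(baseline.__name__, "RMSE:", rmse)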
Example #9
def main():
    data = preprocessing.readData()
    sil_values_ag1 = []
    db_values_ag2 = []
    cr_values_ag3 = []

    for i in range(2, 22):
        # Hierarchical clustering
        hier = hierarchical(n_clusters=i, linkage='complete').fit(data)
        sil_values_ag1.append(scoreSil(data, hier.labels_))
        db_values_ag2.append(scoreDB(data, hier.labels_))
        # cr_values_ag3.append(scoreCR(targets, hier.labels_))
    writeCSV("sil_all.csv", sil_values_ag)
    writeCSV("db_all.csv", db_values_ag)
    writeCSV("cr_all.csv", cr_values_ag)
        for row in csv_reader:
            if line_count == 0:
                ofile.write("AvSigVersion" + "," + "HasDetections" + "\n")
                t = 0
                for field in row:
                    columns[field] = t
                    t += 1
                print("Column names are {}".format(", ".join(row)))
                line_count += 1
            else:

                ofile.write(row[columns["AvSigVersion"]] + "," +
                            row[columns["HasDetections"]] + "\n")
                line_count += 1
    print('Processed {} lines.'.format(line_count))
data = preprocessing.readData('graph_data.csv')
byDate = data[["AvSigVersion", 'HasDetections']].values.tolist()
signatures = []
detections = []
dateCount = defaultdict(lambda: 0)
dateSum = defaultdict(lambda: 0)
readingErrors = 0
for line in byDate:
    si = line[0].split(".")
    try:
        signature = int(("000" + str(si[1]))[-4:] + ("000" + str(si[2]))[-4:])
        dateCount[signature] = dateCount[signature] + 1
        dateSum[signature] = dateSum[signature] + line[1]
    except:
        readingErrors += 1
print("OK, just :", readingErrors, "Reading Errors")
import preprocessing
data = preprocessing.readData('train_samples.csv')
t = 2
outFile = 'prunning_test'
while t < 30:
    preprocessing.cross_validation(data=data,
                                   k=5,
                                   file_out=outFile,
                                   min_sample=t)
    t += 1
    with open(outFile, 'a') as ofile:
        ofile.write(str(t) + "\n")
Example #12
    weights_ = np.array(weights_)+1E-10
    return (tfidf*weights_).tolist()

def extractTokuchoWord(docs, weights):
    corpus = list(set(w for doc in docs for w in doc if len(w)>1))
    ans = {}
    for word in corpus:
        ans[word] = max(calWeightedTFIDF(word,docs,weights))
    #print(ans)
    return max(ans,key=ans.get)

def min_max(x, axis=None):
    # Rescale x to [0, 1] along the given axis without shadowing the builtins.
    x_min = x.min(axis=axis, keepdims=True)
    x_max = x.max(axis=axis, keepdims=True)
    return (x - x_min) / (x_max - x_min)

if __name__ == "__main__":
    data_master = preprocessing.readData("data_master.csv")
    np.random.seed(seed=0)
    idx = np.random.randint(1000,4000,200)
    docs = data_master.loc[idx, "tango"].to_list()
    reaction = min_max(data_master.loc[idx, "reaction"].to_numpy())
    eikyoudo = data_master.loc[idx, "eikyoudo"].to_numpy()
    weights = reaction*eikyoudo
    print(weights)
    ideal_weights = [1 for _ in range(len(weights))]
    normal_tokuchogo = extractTokuchoWord(docs,ideal_weights)
    weighted_tokuchogo = extractTokuchoWord(docs,weights)
    print("Normal tokuchogo",normal_tokuchogo)
    print("Weighted tokuchogo", weighted_tokuchogo)
Example #13
    def __init__(self, use_trained_model, mode, input_file_path):

        self.use_trained_model = use_trained_model
        self.mode = mode

        print("----READING DATA----")
        #inputs
        self.X = tf.placeholder(tf.int32, [None, None])

        if self.mode != "Test":
            self.Y = tf.placeholder(tf.float32, [1, FLAGS.num_classes])

        # tf variables
        #self.tf_ideal_learning_rate = tf.placeholder(tf.float32, shape=[])
        self.tf_ideal_l2_reg_parameter = tf.placeholder(tf.float32, shape=[])
        self.sequence_length = tf.placeholder(tf.int32, [None])

        print("reading embeddings...")
        # read word embeddings
        self.vocabList, self.embeddings = preprocessing.readGloveEmbeddings(
            FLAGS.word_embed_path, FLAGS.word_embedding_size)
        self.char_list, self.char_embeddings = preprocessing.readCharEmbeddings(
            path=FLAGS.char_embed_path, embedding_size=FLAGS.embedding_dim)

        #create word embeddings
        self.tf_embeddings = tf.Variable(tf.constant(
            0.0, shape=[self.embeddings.shape[0], self.embeddings.shape[1]]),
                                         trainable=False,
                                         name="tf_embeddings")
        self.embedding_placeholder = tf.placeholder(
            tf.float32, [self.embeddings.shape[0], self.embeddings.shape[1]])
        self.embedding_init = self.tf_embeddings.assign(
            self.embedding_placeholder)

        print("transforming dictionaries...")
        # turn list to a dict for increase in performance
        self.vocabulary = {}
        self.char_vocabulary = {}

        for i in range(len(self.vocabList)):
            self.vocabulary[self.vocabList[i]] = i

        for i in range(len(self.char_list)):
            self.char_vocabulary[self.char_list[i]] = i

        del self.char_list, self.vocabList

        print("reading the text data...")
        #read tweets
        self.tr_set, self.target_val, self.seq_len = preprocessing.readData(
            input_file_path, self.mode)

        self.tweets = [row[1] for row in self.tr_set]
        self.users = [row[0] for row in self.tr_set]

        self.valid_set_size = int(
            len(self.tweets) * FLAGS.dev_sample_percentage)

        #split dataset into parts according to mode
        if mode == "Train":
            self.train_tweets = self.tweets
            self.train_users = self.users
            self.train_seqlen = self.seq_len
            print("Training set size of tweets: " +
                  str(len(self.train_tweets)))

        elif mode == "Valid":
            self.valid_tweets = self.tweets[:self.valid_set_size]
            self.train_tweets = self.tweets[self.valid_set_size:]
            self.valid_users = self.users[:self.valid_set_size]
            self.train_users = self.users[self.valid_set_size:]
            self.valid_seqlen = self.seq_len[:self.valid_set_size]
            self.train_seqlen = self.seq_len[self.valid_set_size:]
            print("Training set size of tweets: " +
                  str(len(self.train_tweets)) +
                  " Validation set size of tweets: " +
                  str(len(self.valid_tweets)))

        elif mode == "Test":
            self.test_tweets = self.tweets
            self.test_users = self.users
            self.test_seqlen = self.seq_len

            print("Test set size of tweets:" + str(len(self.test_tweets)))
Example #14
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

import pipeline
import preprocessing

np.random.seed(123456789)

classifier = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)

print("Reading train/test sets...")
X_train, y_train, X_test = preprocessing.readData()

print("Pipeline for train/test sets...")
X_train, y_train, id_train, X_test, id_test = pipeline.Pipeline(X_train, y_train, X_test)
id_test = id_test.astype(int)

print("Fitting KNN classifier...")
classifier.fit(X_train, y_train)

print("Predicting over test set...")
pred = classifier.predict(X_test)
res = pd.DataFrame({"id":id_test, "status_group":pred})
res.to_csv("../submissions/new_submission.csv", index=False)
Example #15
    now_date = dt_now.strftime("%Y年%m月%d日")
    # Weekly summary message (Japanese): announces this week's characteristic
    # word (the "chihuahua" name), when the same topic last came up, a sample
    # of that chat, the resulting level, and the summoned chihuahua.
    messageText = "----{5} 今週のチワワ----\n\
今週もお疲れさまでした!\n\
今週の皆さんの会話を聞いていたチワワの名前は……\n\
「{0}チワワ」になりました。\n\
同じ話題が過去に出てきたのは{1}でした。\n\
その時のチャットは「{2}」のような感じでした。\n\
{0}チワワのレベルは{3}になりました。\n\
{4}\
来週も頑張りましょう!".format(tokuchogo, kako_wadai[1], kako_wadai[0], level,
                    Summon_chiwawa.summon(level), now_date)
    return messageText


def getTokuchogo(dataFrame):
    docs = dataFrame["tango"].to_list()
    reaction = tokuchogo.min_max(dataFrame["reaction"].to_numpy())
    eikyoudo = dataFrame["now_eikyoudo"].to_numpy()
    weights = reaction * eikyoudo
    weighted_tokuchogo = tokuchogo.extractTokuchoWord(docs, weights)
    return weighted_tokuchogo


if __name__ == "__main__":
    now_data = preprocessing.readData("data/data_master_2018-10.csv")
    now_date = datetime.datetime.strptime("2018-10-19", "%Y-%m-%d").date()
    print(makeMessage(now_data, now_date))
    # companyId = ""
    # groupId = ""
    # print(get_today_message(companyId, groupId))
def train():    
	docBiasLabel,docTruthLabel = readLabel(trainLabelPath)
	title,article,numCitation = readData(trainDataPath)
	traindata = list(zip(title,numCitation,article))
	dataExtractor = Pipeline([('TitleArticleExtractor', TitleArticleExtractor()),])
	TfidfTitle = Pipeline([
						('selector', ItemSelector(key='title')),
						('vect', TfidfVectorizer(min_df = 0.01)),
						('to_dense', DenseTransformer()),
			     ])
	TfidfArticle = Pipeline([
						('selector', ItemSelector(key='article')),
						('vect', TfidfVectorizer(min_df = 0.01)),
						('to_dense', DenseTransformer()),
					])
	textStatsTitle = Pipeline([
					('selector', ItemSelector(key='title')),
					('stats', Text_Stats()),  
					('to_dense', DenseTransformer()),
					
				])
	textStatsArticle = Pipeline([
					('selector', ItemSelector(key='article')),
					('stats', Text_Stats()),  
					('to_dense', DenseTransformer()),					
				])

	matchNgrams =  Pipeline([
					('selector', ItemSelector(key='ngram')),
					('func', extractFeature()), 
					('to_dense', DenseTransformer()),
										
				])

	
	bias_clf = Pipeline([
			('TitleArticleExtractor', dataExtractor),
			('union', FeatureUnion(
				transformer_list=[
									('tfidf_title', TfidfTitle),
									('tfidf_article', TfidfArticle),
									('text_stats_title', textStatsTitle),
									('text_stats_body', textStatsArticle),
									('matchngrams', matchNgrams),
								],
							)),
					('clf', MultinomialNB()),
			])

	bias_clf.fit(traindata, docBiasLabel)

	with open(biasPKLfile,"wb") as f_pk:
		pickle.dump(bias_clf,f_pk,pickle.HIGHEST_PROTOCOL)
	

	truth_clf = Pipeline([			
			('TitleArticleExtractor', dataExtractor),
			('union', FeatureUnion(
				transformer_list=[
									('tfidf_title', TfidfTitle),
									('tfidf_article',TfidfArticle),
									('text_stats_headline',textStatsTitle),
									('text_stats_body', textStatsArticle),
									('matchngrams', matchNgrams),
								],
							)),
					('clf', GaussianNB()),
				])
	truth_clf.fit(traindata, docTruthLabel)

	with open(truthPKLfile,"wb") as f_pk:
		pickle.dump(truth_clf,f_pk,pickle.HIGHEST_PROTOCOL)
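train() pickles the fitted bias and truth classifiers, and test() (shown in an earlier example) reloads them and reports accuracy. A minimal driver sketch, assuming the module-level paths (trainLabelPath, trainDataPath, biasPKLfile, truthPKLfile, ...) are already configured:

if __name__ == "__main__":
    train()   # fit the pipelines and pickle bias_clf / truth_clf
    test()    # reload the pickles and print accuracy on the test split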
    col_sums = np.sum(conf_matrix, axis=0)
    diag = np.diag(conf_matrix)
    scores = 2 * diag / (row_sums + col_sums)
    return np.mean(scores)
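The four lines above are the tail of a macro-F1 helper whose opening lines (the def and the matching row_sums) were cut off. A self-contained sketch of the same computation, assuming a square confusion matrix with true classes in rows and predictions in columns (the function name is a placeholder):

import numpy as np

def macro_f1_from_confusion(conf_matrix):
    conf_matrix = np.asarray(conf_matrix, dtype=float)
    row_sums = np.sum(conf_matrix, axis=1)   # actual examples per class
    col_sums = np.sum(conf_matrix, axis=0)   # predicted examples per class
    diag = np.diag(conf_matrix)              # true positives per class
    # Per-class F1 = 2*TP / (actual + predicted), averaged over classes.
    scores = 2 * diag / (row_sums + col_sums)
    return np.mean(scores)

print(macro_f1_from_confusion([[5, 1], [2, 4]]))   # ~0.748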


accs = []
f1s = []
tiempos = []

#possibleK = [1,3,5,7,9,11]
#dims = list(range(30,50))
#possibleK=[1]*len(dims) + [3]*len(dims) + [5]*len(dims) + [7]*len(dims)
possibleK = [1]
dims = [44]
X, y, _ = preprocessing.readData()
print("El conjunto de datos tiene dimension: " + str(len(X.iloc[0])))

for k, dim in zip(possibleK, dims):
    print("K=" + str(k))
    print("Dim=" + str(dim))
    classifier = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    skf = StratifiedKFold(n_splits=5, random_state=123456789)

    scores = []
    f1_scores = []
    elapsed_times = []
    conf_matrix = []

    X_index = skf.split(X, y)
    missclassified_images = []
    with open("density_graph_data.csv",'w') as ofile:
        for row in csv_reader:
            if line_count == 0:
                ofile.write("AvSigVersion"+"\n")
                t=0
                for field in row:
                    columns[field]=t
                    t+=1
                print("Column names are {}".format(", ".join(row)))
                line_count += 1
            else:
                
                ofile.write(row[columns["AvSigVersion"]] +"\n")
                line_count += 1
    print('Processed {} lines.'.format(line_count))
data=preprocessing.readData('density_graph_data.csv')
byDate= data[["AvSigVersion"]].values.tolist()
signatures=[]
detections=[]
dateCount=defaultdict(lambda : 0)
readingErrors=0
for line in byDate:
    si = line[0].split(".")
    try:
        signature=int(("000"+str(si[1]))[-4:]+("000"+str(si[2]))[-4:])
        dateCount[signature] = dateCount[signature]+1
    except:
        readingErrors+=1
print("OK, just :",readingErrors,"Reading Errors")
    #signatures.append(int(("000"+str(si[1]))[-4:]+("000"+str(si[2]))[-4:]))
    #detections.append(line[1])
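dateCount now maps each encoded AvSigVersion key to the number of samples carrying it, which is what the density graph in the output file name presumably needs. A plotting sketch; the matplotlib usage is an assumption, not part of the original script:

import matplotlib.pyplot as plt

keys = sorted(dateCount)
plt.plot(keys, [dateCount[k] for k in keys])
plt.xlabel("AvSigVersion (encoded)")
plt.ylabel("sample count")
plt.show()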