Example #1
def readInfo(trainName, testName):
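    # NOTE: assumes readData() returns a pandas DataFrame (drop/to_numpy are used below)
    # and that `math` and `readData` are imported at module level, which this snippet does not show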
    dataframe = readData(trainName + '.csv')
    dataframe2 = readData(testName + '.csv')

    unethical_variables = [
        'estu_tieneetnia', 'estu_tipodocumento.1', 'fami_trabajolaborpadre',
        'fami_trabajolabormadre', 'estu_genero.1', 'estu_pais_reside.1',
        'estu_depto_reside.1', 'estu_cod_reside_depto.1',
        'estu_mcpio_reside.1', 'estu_cod_reside_mcpio.1', 'estu_areareside',
        'fami_nivelsisben', 'fami_pisoshogar', 'fami_tienemicroondas',
        'fami_tienehorno', 'fami_tieneautomovil.1', 'fami_tienedvd',
        'fami_tiene_nevera.1', 'estu_nacionalidad.1', 'fami_telefono.1',
        'estu_trabajaactualmente', 'estu_antecedentes', 'estu_expectativas',
        'cole_cod_dane_establecimiento', 'cole_cod_dane_sede',
        'cole_area_ubicacion', 'cole_jornada', 'cole_cod_mcpio_ubicacion',
        'cole_mcpio_ubicacion', 'cole_cod_depto_ubicacion',
        'cole_depto_ubicacion'
    ]

    NaN_variables = [
        'estu_tomo_cursopreparacion', 'estu_cursodocentesies', 'desemp_prof',
        'estu_cursoiesapoyoexterno', 'estu_cursoiesexterna',
        'estu_simulacrotipoicfes', 'estu_actividadrefuerzoareas',
        'estu_actividadrefuerzogeneric'
    ]

    no_aportan_variables = [
        'estu_estudiante.1', 'cole_sede_principal', 'cole_nombre_sede',
        'cole_codigo_icfes', 'profundiza', 'cole_nombre_establecimiento',
        'cole_nombre_establecimiento', 'cole_genero', 'cole_naturaleza',
        'periodo.1', 'estu_fechanacimiento.1', 'estu_inst_cod_departamento',
        'periodo', 'estu_consecutivo.1'
    ]

    for variable in [NaN_variables, no_aportan_variables, unethical_variables]:
        dataframe.drop(variable, axis=1, inplace=True)

    for variable in [NaN_variables, no_aportan_variables, unethical_variables]:
        dataframe2.drop(variable, axis=1, inplace=True)

    training_data = dataframe.to_numpy().tolist()
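    # replace any NaN cells with the placeholder string '-'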
    for el in training_data:
        x = 0
        for ele in el:
            if type(ele) is not str and math.isnan(ele):
                el[x] = '-'
            x += 1

    testing_data = dataframe2.to_numpy().tolist()
    for el in testing_data:
        x = 0
        for ele in el:
            if type(ele) is not str and math.isnan(ele):
                el[x] = '-'
            x += 1

    return training_data, testing_data
Example #2
def test_raise_exceptions():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    with pytest.raises(ValueError):
        myDataset = readData("test_data31.csv")
        hrmObject = hrmData(myDataset, 1, 4)
    with pytest.raises(ValueError):
        myDataset = readData("test_data31.csv")
        hrmObject = hrmData(myDataset, 2, 5)
def test_faulty_data_load():
    from readData import readData
    myDataset1 = readData("test_data28.csv")
    nanValueTime1 = 0.9
    nanValueVoltage1 = -0.345
    assert myDataset1.time[324] == nanValueTime1
    assert myDataset1.voltage[338] == nanValueVoltage1

    myDataset2 = readData("test_data30.csv")
    badDataTime = 3.86
    badDataVoltage = -0.025
    assert pytest.approx(myDataset2.time[965]) == badDataTime
    assert pytest.approx(myDataset2.voltage[972]) == badDataVoltage
def test_regular_data_read():
    from readData import readData
    myDataset1 = readData("test_data3shortTime.csv")
    timeFromCSV = [0, 0.003, 0.006, 0.008, 0.011, 0.014, 0.017, 0.019, 0.022,
                   0.025, 0.028, 0.031, 0.033, 0.036, 0.039, 0.042, 0.044,
                   0.047, 0.05, 0.053]
    assert pytest.approx(myDataset1.time[0:20]) == timeFromCSV
Example #5
def test_voltage_extremes():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    maxMinValue = (0.7875, -0.19375)
    assert pytest.approx(hrmObject.voltage_extremes) == maxMinValue
Example #6
def test_num_beats():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    numBeatsin31 = 19
    assert numBeatsin31*0.8 < hrmObject.num_beats < numBeatsin31*1.2
Example #7
def main(fileName, k):

    sourceData = readData.readData(fileName)
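    # run plain k-means and k-means++ seeding on the same data so the two clusterings can be compared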

    result1 = kmeans(sourceData, k)
    result2 = kmeansPlusPlus(sourceData, k)

    return result1, result2
Example #8
def main(args):
	ticks = time.time()
	trainingData = readData.readData(TRAINING_LABELS_PATH, TRAINING_IMAGES_PATH)
	totalInstances = 0
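	# count every feature vector across all training examples (equivalent to sum(len(t) for t in trainingData))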
	for i in range(0, len(trainingData)):
		for fV in range(0, len(trainingData[i])):
			totalInstances += 1

	nb = NaiveBayes(trainingData, totalInstances)
	nb.naiveBayes()
	testingData = readData.readData(TEST_LABELS_PATH, TEST_IMAGES_PATH)
	predictedVals = nb.predictLabels(testingData)
	ticks = time.time() - ticks
	print "Total Accuracy: 	"+str(nb.accuracy(predictedVals))
	print "Execution Time: 	"+str(ticks)
	print "Confusion Matrix:\n "
	nb.confusionmatrix(predictedVals)
Example #9
def main():
    from readData import readData
    from hrmData import hrmData
    import numpy as np
    import json
    csvFileName = "test_data31.csv"
    myDataset = readData(csvFileName)
    hrmObject = hrmData(myDataset)
    write_to_json(csvFileName, hrmObject)
Example #10
def test_subtractDCOffset():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    subtractedOffsetValues = [-0.027071875, -0.002071875, 0.029178125,
                              0.054178125, 0.079178125]
    assert hrmObject.meanSubtractedVoltage[0:5] == \
        pytest.approx(subtractedOffsetValues)
def test_default_value_for_time_segment():
    from readData import readData
    from hrmData import hrmData
    from timeSegment import timeSegment
    myDataset = readData('easytestfile.csv')
    myTimePoints = timeSegment(myDataset)
    expectedVoltageValues = [[100, 101], [102, 103], [104, 105], [106, 107],
                             [108, 109], [110, 111], [112, 113], [114, 115],
                             [116, 117], [118, 119]]
    assert myTimePoints.segmentList == expectedVoltageValues
Example #12
def test_time_of_beats():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    locationOfBeats = [0., 0.7715, 1.543,  2.3145, 3.086, 3.8575, 4.629,
                       5.4005, 6.172, 6.9435, 7.715,  8.4865, 9.258, 10.0295,
                       10.801,  11.5725, 12.344, 13.1155, 13.887]
    assert locationOfBeats == pytest.approx(hrmObject.beats)
def test_voltage_list():
    from readData import readData
    from hrmData import hrmData
    from timeSegment import timeSegment
    myDataset = readData('easytestfile.csv')
    myTimePoints = timeSegment(myDataset, 4)
    expectedVoltageValues = [[100, 101, 102, 103], [104, 105, 106, 107],
                             [108, 109, 110, 111], [112, 113, 114, 115],
                             [116, 117, 118, 119]]
    assert myTimePoints.segmentList == expectedVoltageValues
def test_time_index():
    timesAt2SecondsFile2 = [
        720, 1440, 2160, 2880, 3600, 4320, 5040, 5760, 6480, 7200, 7920, 8640,
        9360
    ]
    from readData import readData
    from hrmData import hrmData
    from timeSegment import timeSegment
    myDataset = readData("test_data2.csv")
    myTimePoints = timeSegment(myDataset, 2)
    assert myTimePoints.listOfSegmentsIdx == timesAt2SecondsFile2
Example #15
def test_intervalHR():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    actualHR = 84.38818565400844
    assert pytest.approx(hrmObject.global_mean_hr_bpm) == actualHR

    hrmObject2 = hrmData(myDataset, 2, 8)
    assert pytest.approx(hrmObject.global_mean_hr_bpm) == 84.38818565400844
def QB():
    items = readData()
    message = {
        'status': 200,
        'message': 'OK',
        'data': items
    }
    resp = jsonify(message)
    resp.status_code = 200
    print(resp)
    return(resp)
Example #17
def main(argv=None):
    """
    Does not converge.
    """
    num_repeat = 1
    MSEs = []
    abbrTrain = 'E:\python_project\happinessPredict\DataSet\happiness_train_abbr.csv'
    for _ in range(num_repeat):
        x_train, x_test, y_train, y_test = readData.readData(abbrTrain, True)
        y_train_array = np.zeros((len(y_train), 5))
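        # one-hot encode the 1-5 happiness scores: row i gets a 1 in column score-1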
        for i in range(len(y_train)):
            if y_train[i] == 1:
                y_train_array[i] = np.array([1, 0, 0, 0, 0])
            elif y_train[i] == 2:
                y_train_array[i] = np.array([0, 1, 0, 0, 0])
            elif y_train[i] == 3:
                y_train_array[i] = np.array([0, 0, 1, 0, 0])
            elif y_train[i] == 4:
                y_train_array[i] = np.array([0, 0, 0, 1, 0])
            elif y_train[i] == 5:
                y_train_array[i] = np.array([0, 0, 0, 0, 1])

        y_test_array = np.zeros((len(y_test), 5))
        for i in range(len(y_test)):
            if y_test[i] == 1:
                y_test_array[i] = np.array([1, 0, 0, 0, 0])
            elif y_test[i] == 2:
                y_test_array[i] = np.array([0, 1, 0, 0, 0])
            elif y_test[i] == 3:
                y_test_array[i] = np.array([0, 0, 1, 0, 0])
            elif y_test[i] == 4:
                y_test_array[i] = np.array([0, 0, 0, 1, 0])
            elif y_test[i] == 5:
                y_test_array[i] = np.array([0, 0, 0, 0, 1])
        # z-score standardize each feature column across the samples
        scaler = preprocessing.StandardScaler().fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        mse = train(x_train, x_test, y_train_array, y_test_array)
        MSEs.append(mse)
        tf.reset_default_graph()

    plt.figure()
    plt.grid()
    plt.xlabel('iteration$(\\times10^2)$')
    plt.ylabel('MSE')
    # plt.axis([0, len(mse), 0, 1.1])
    for i in range(num_repeat):
        plt.plot(MSEs[i])
    # save the figure
    # plt.savefig("diff_s1423 with PCA+ANN.svg", transparent=True, format='svg')
    plt.show()
Example #18
def test_determineLagTime():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    actualHR = 80
    assert 0.9*actualHR < hrmObject.heartRateList[0] < 1.1*actualHR
    assert 0.9*actualHR < hrmObject.heartRateList[1] < 1.1*actualHR
    assert 0.9*actualHR < hrmObject.heartRateList[2] < 1.1*actualHR
    assert 0.9*actualHR < hrmObject.heartRateList[3] < 1.1*actualHR
    assert 0.9*actualHR < hrmObject.heartRateList[4] < 1.1*actualHR
    assert 0.9*actualHR < hrmObject.heartRateList[5] < 1.1*actualHR
Example #19
def depthSearch(idx, depth=1000):
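	# recursively crawl linked IDs up to `depth` levels; note the depth check only prints a warning and does not stop the recursion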
	if dbase.phdExists(idx):
		return
	if depth < 0:
		print >> sys.stderr, "[Search] Exceeded search depth on", idx
	phd, tpls = readData(idx)
	if dbase.writePhD(idx, phd[1]) is None:
		print >> sys.stderr, "[Search] Failed on writing Ph. D.", idx
		return
	for aID in writeDegreeTuples(tpls):
		try:
			depthSearch(aID, depth-1)
		except Exception, e:
			print >> sys.stderr, "[Search] Exception occured processing %d:" % aID, e
Example #20
def test_convertTimeToIdx():
    from hrmData import hrmData
    from readData import readData
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    startIdx1 = 0
    endIdx1 = 6
    assert startIdx1 == hrmObject.startIdx
    assert endIdx1 == hrmObject.endIdx

    hrmObject2 = hrmData(myDataset, 2, 8)
    startIdx2 = 1
    endIdx2 = 4
    assert startIdx2 == hrmObject2.startIdx
    assert endIdx2 == hrmObject2.endIdx
def split_data(input_path):
    for split in SPLITS:
        output_path = join(input_path, 'split_files', split)
        os.makedirs(join(output_path, 'features'), exist_ok=True)
        os.makedirs(join(output_path, 'labels'), exist_ok=True)
        i = 0
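        # i numbers the output files sequentially across all CPC codes within this split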
        for code in CPC_CODES:
            for patent in readData(input_path, split, code):
                abstract = patent['abstract'].encode().decode()
                desc = patent['description'].encode().decode()
                with open(join(output_path, 'features', f'{i}.desc'),
                          'w') as file:
                    file.write(desc)
                with open(join(output_path, 'labels', f'{i}.label'),
                          'w') as file:
                    file.write(abstract)
                i += 1
Example #22
def test_write_json():
    from main import main
    from hrmData import hrmData
    from readData import readData
    import json
    from timeSegment import timeSegment
    myDataset = readData("test_data31.csv")
    hrmObject = hrmData(myDataset)
    main()
    data = {'File Name': hrmObject.rawData.csvFileName,
            'mean_hr_bpm': hrmObject.mean_hr_bpm,
            'voltage_extremes': hrmObject.voltage_extremes,
            'duration': hrmObject.duration,
            'num_beats': hrmObject.num_beats,
            'beats': hrmObject.beats}
    with open('test_data31.json') as data_file:
        data_loaded = json.load(data_file)
    data_loaded_list = data_loaded.items()
    data_list = data.items()
    assert data_loaded_list == data_list
Example #23
import pandas as pd
import numpy as np

import readData
import Cal_Acc_Gyro
import motion
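# NOTE: os.getcwd() and plt.figure() are used below, so `import os` and
# `import matplotlib.pyplot as plt` are assumed to appear elsewhere in the original file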

file_dir = os.getcwd()
print(file_dir)
data_file_number = '2'

f = 100
T = 1 / f

#   1. read data
measure_data = readData.readData(file_dir, data_file_number)
print(measure_data.frames)

#   2. calculate the norm of  acc and gyro and detect the contact
contact_flag = Cal_Acc_Gyro.contac_detection(measure_data.acc_data,
                                             measure_data.gyro_data,
                                             measure_data.frames)

#   3. calculate the attitude and position
calculate_data = motion.motion(measure_data.acc_data, measure_data.gyro_data,
                               contact_flag)

#   4. plot the signal of vel and position
n = measure_data.frames
t = np.linspace(0, n * T, n + 1)
plt.figure(1)
Example #24
from readData import readData
from Clustering import kmeans_al
from Vi import showCluster
from dataNomalization import normalization
from dimReduction import DimReduction
from oneHot import oneHotData
from topicModelling import topic
from metricLearning import metricLearning
#import pandas as pd

if __name__ == "__main__":
    fileName = '../../demographic+Data+From+mimic.csv'
    toRows = 100
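    # toRows presumably caps how many rows readData() loads from the CSV (here the first 100)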

    data = readData(fileName, toRows)
    onehotdata = oneHotData(data)
    scaledData = normalization(onehotdata)

    #df_scaledData = pd.DataFrame(scaledData)
    df_OriData = topic(scaledData, 10)

    df_NewData = metricLearning(df_OriData)

    #print (df_OriData)

    OriKmeansresult = kmeans_al(df_OriData)
    NewKmeansresult = kmeans_al(df_NewData)
    #print kmeansresult.labels_
    TwoDOriData = DimReduction(df_OriData)
    TwoDNewData = DimReduction(df_NewData)
Example #25
import readData
from frequentCount import *
from aprioriGen import *

min_sup = 2
D = readData.readData("shoppingList.csv")


def miningFrequentItemSet(D, min_sup):

    # initialized
    frequentItemSets = []
    L1 = find_frequent_1_itemsets(D, min_sup)
    frequentItemSets.extend(L1)

    # find frequent itemset Lk, until it is empty
    Lk = L1
    while len(Lk) != 0:

        # here, Ck is a candidate set, i.e. a semi-finished Lk produced by the join and prune steps
        Ck = apriori_gen(Lk)

        # obtain final frequent itemset Lk
        Lk = scanDataBase(D, min_sup, Ck)

        frequentItemSets.extend(Lk)

    return frequentItemSets


if __name__ == "__main__":
Example #26
from featureEngineer import featureEngineer
from readData import readData

# ### Set global variables
#
# Note: The first N entries in the dataset should have labels. The rest will be used for testing. The very last column should contain the labels

# In[ ]:

FILE_PATH = 'dataset.csv'
OUT_FILE = 'run1.h5'

# ### Clean and split data into arrays

# In[ ]:

X, y = readData(FILE_PATH)
X, k = featureEngineer(X)

A = np.matmul(X.T, X)
B = np.matmul(X.T, y)

A = np.linalg.pinv(A)

W = np.matmul(A, B)  #These are the learned weights
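# Equivalent to solving the normal equations (X^T X) W = X^T y with the Moore-Penrose pseudoinverse,
# i.e. ordinary least squares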

with h5py.File(OUT_FILE, 'w') as file:
    file.create_dataset('weights', W.shape)
    file['weights'][...] = W
    file.create_dataset('k', (1, ), data=k)
Example #27
from readData import readData
import numpy as np
X, Y = readData()
m = len(X)
t0 = 0
t1 = 0
alpha = .01
iter = 9
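# t0 and t1 are presumably the intercept and slope of a univariate linear model;
# alpha is the learning rate and iter the number of gradient-descent steps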
from gradientDescent import gradientDescent
t0, t1 = gradientDescent(X, Y, t0, t1, alpha, iter, m)
print t0, t1
if inputFileTr.lower() == "" or inputClassesTr == "" or (mdlName.lower() != "test"  and mdlName.lower() != "train"):# or inputFileTs.lower() == "" or inputClassesTs.lower() == "":
    print("You have NOT entered one of the required inputs!")
    sys.exit()

#inputFileTr = "X_train.txt"
#inputClassesTr = "y_train.txt"
#inputFileTs = "X_test.txt"
#inputClassesTs = "y_test.txt"

print("\nLoading saved tree ...")
tree = ET.ElementTree(file="trained_Tree.xml")
xmlRoot = tree.getroot()
root = makeTree(xmlRoot)
print("\nReading 1 set of data ...")
clases = readData(inputClassesTr)
data1 = readData(inputFileTr)
data1 = np.append(data1, clases, axis=1)

if inputClassesTs != "":
    print("\nReading 2 set of data ...")
    clases = readData(inputClassesTs)
    data2 = readData(inputFileTs)
    data2 = np.append(data2, clases, axis=1)

data = np.append(data1, data2, axis=0) if inputClassesTs != "" else data1

randIndx = np.load('invtRandData.npy') if (mdlName.lower() == "test") else np.load('randData.npy')

print("Estimating classes and calculating accuracy ...")
result = TreeResult (data[randIndx, :], root)
def reduce_tweets_words():

	[leave_tweets, stay_tweets, other_tweets] = readData()

	leave_tweets = categorizy_tweets(leave_tweets, "neg")
	new_leave = getTokenizedTweetsFile("leaveTweets/ExtraLeaveTweets.txt", "neg")
	leave_Farias = getTokenizedTweetsFile("leaveTweets/FariasLeave.txt", "neg")
	stay_tweets = categorizy_tweets(stay_tweets, "pos")
	new_stay = getTokenizedTweetsFile("stayTweets/ExtraStayTweets.txt", "pos")
	stay_Farias = getTokenizedTweetsFile("stayTweets/FariasStay.txt", "pos")
	other_tweets = categorizy_tweets(other_tweets, "neutral") 

	# The files that came from Ada through the categorizy_tweets method are all 'str'
	# I will try to make them all Unicode
	#leave_tweets = unicode_them(leave_tweets)
	#stay_tweets = unicode_them(stay_tweets)
	#other_tweets = unicode_them(other_tweets)

	tokenized_tweets = leave_tweets + new_leave + leave_Farias + stay_tweets + new_stay + stay_Farias + other_tweets + other_tweets
	all_words = []

	#import ipdb;ipdb.set_trace()
	print(len(leave_tweets))
	print(len(new_leave))
	print(len(leave_Farias))
	print(len(stay_tweets))
	print(len(new_stay))
	print(len(stay_Farias))
	print(len(other_tweets))
	print(len(other_tweets))
	print(len(tokenized_tweets))
	#############################################################################
	#
	# NOW THAT THE TWEETS ARE IN (tokenized_tweet, category) TUPLES WE WILL
	# SHRINK THE TOKENIZED TWEETS BY REMOVING STOPWORDS AND OTHER THINGS
	# THAT ADD NOTHING TO FEATURE EXTRACTION
	#
	#############################################################################

	# Run stopword removal over the document words to drop a lot of useless tokens
	stop_words = set(stopwords.words("english"))

	# Try to improve the stopword list by adding some punctuation marks that serve no purpose
	# They are unicode literals because that is how the stop_words are stored
	punctuation = [u'.', u'-', u',', u'"', u'(', u')', u':', u"'", u'--', u';', 
	u'!', u'$', u'*', u'&', u'...', u':/', u'/', u'..']
	punctuation = set(punctuation)

	punct = list(string.punctuation)
	#stop = stopwords.words('english') + punctuation + ['rt', 'via']
	global new_stop_words
	new_stop_words = stop_words.union(punct)

	twitter_symbols = [u'rt', u'#voteleave', u'#voteremain', u'#leaveeu', u'h', u'#rt', u'=', u'@', u'https',
	u'+', u'\'', u'|', u'…', u'‘', u'’', u'..', u'...']
	twitter_symbols = set(twitter_symbols)
	new_stop_words = new_stop_words.union(twitter_symbols)

	# ACTUALLY I CANNOT MANAGE TO STRIP THE @USER FROM THE RT, BUT THAT
	# WILL NOT INTERFERE, SINCE THE SAME @USER RARELY APPEARS MORE THAN ONCE
	#user_rt_pattern = "@\w+?"
	#url_pattern = 'http[s]:/'
	emotions_pattern = '\u\d+'
	url_pattern = 'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
	user_rt_pattern = '(?:@[\w_]+)'
	# "(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
 #    r'(?:[\w_]+)', # other words
 #    r'(?:\S)' # anything else
    #user_rt_pattern = '(?:@[\w_]+)'
	
	

	# filtered_tweets = [for tweet_cat in tokenized_tweets if ]
	filtered_tweets = []
	tokens_to_be_removed = []
	#print(tokenized_tweets[228:230])

	#############################################################################
	#
	# HERE I BUILD THE BIGRAMS OF EACH TWEET AND ADD THEM TO A LIST
	# WITH ALL THE BIGRAMS, THE SAME WAY I DO WITH ALL THE WORDS
	# OF THE TWEETS
	#
	# It is better to build the list of all bigrams by joining each tweet's own bigram list
	# than to build it from the list of all words, because the second approach can produce
	# bigrams that do not actually exist, since they would span one tweet and the next
	#############################################################################

	# For each (tweet_tok, category) tuple
	for tweet_cat in tokenized_tweets:

		# For each token of this tweet
		for token in tweet_cat[0]:
			# If the token is one of the stop_words or matches the URL or RT regex, we drop it
			#import ipdb;ipdb.set_trace()
			#token = token.encode('utf-8').decode('utf-8')
			if token in new_stop_words or re.match(url_pattern, token) or re.search(user_rt_pattern, token) or re.match(emotions_pattern, token):
				tokens_to_be_removed.append(token)
				#print(tokens_to_be_removed)

		# All the tokens that should be removed from this tweet have been collected
		# Now remove them
		for token in tokens_to_be_removed:
			#import ipdb;ipdb.set_trace()
			#token = token.encode('utf-8').decode('utf-8')
			tweet_cat[0].remove(token)

		# Clear tokens_to_be_removed, otherwise it keeps accumulating tokens from other tweets
		tokens_to_be_removed = []

		# Encode everything to leave Unicode and stay in UTF-8
		#tweet_cat[0] = [token.encode('utf-8') for token in tweet_cat[0]]

		# First build this tweet's bigrams, then add them to the list of all bigrams
		#print(type(tweet_cat[26][0]))
		# for token in tweet_cat[0]:
		# 	# Converting everything to unicode
		# 	if type(token) == str:
	 #  			token = token.decode('utf-8')
  # 			elif type(token) == unicode:
  # 				token = token.encode('utf-8').decode('utf-8')
		tweet_bigrams = list(bigrams(tweet_cat[0]))
		#tweet_bigrams = [(tupla[0].decode('utf-8'), tupla[1].decode('utf-8')) for tupla in tweet_bigrams]
		#import ipdb;ipdb.set_trace()
		#print(type(tweet_cat[26][0]))

		# tweet_bigrams is a list, so if I simply .append() it to all_bigrams
		# all_bigrams would end up being just a list of lists
		#tweet_bigrams = [bi.encode('utf-8') for bi in tweet_bigrams]
		for i in range(len(tweet_bigrams)):
			all_bigrams.append(tweet_bigrams[i])

		# Add the tweet without the stopwords to the new list

		##################################################################
		#
		# NOW THERE IS A NEW FIELD WITH ALL OF THE TWEET'S BIGRAMS
		# SO THE BIGRAMS ALSO GET A CATEGORY AND WILL MATTER FOR THE CLASSIFICATION
		# SO INSTEAD OF A TUPLE IT WILL BE A TRIPLE (tokens, bigrams, category)
		#
		##################################################################

		tweet_bigrams_cat = (tweet_cat[0], tweet_bigrams, tweet_cat[1])

		filtered_tweets.append(tweet_bigrams_cat)
	# Example of a tweet after stopword filtering
	# ([u'@mpvine', u'If', u'fifty', u'million', u'people', u'say',
	# u'foolish', u'thing', u"it's", u'still', u'foolish', u'thing'], 'pos')
	#print(filtered_tweets[228:230])


	#######################################################################
	#
	# DUMPING ALL THE REDUCED TWEETS INTO A FILE
	#
	#######################################################################


	# File with the new tuples of the filtered tweets
	with open('FilteredTweets2.txt', 'w') as outfile:
		for item in filtered_tweets:
  			outfile.write(str(item) + '\n')

	# File with all the bigrams
	with open('Bigrams.txt', 'w') as outfile:
		for item in all_bigrams:
  			outfile.write(str(item) + '\n')

  	return filtered_tweets
Example #30
def main(args):
	decisionTree = DecisionTree() 	
	cIndex, attributesList, data = readData.readData(args.input)
	decisionTree.makeTree(decisionTree.root, cIndex, attributesList, data[0:500])
	print accuracy(decisionTree.root, data[500:600], cIndex)
# Batch normalisation after layers -> Gaussian
# Data augmentation
# Confusion matrix

batch_size = 100
epochs = 1
validation_size = 100
num_classes = 10
result_file = "test_run_results.txt"

# input image dimensions
img_x, img_y = 32, 32

# load data sets
arr, labels, images = readData(
    'C:\\Users\\nystr\\GTSRB\\Final_Training\\Images', num_classes,
    (img_x, img_y))

v_arr, v_labels, v_images = readValidationData(
    'C:\\Users\\nystr\\GTSRB\\Final_Test\\Images', (img_x, img_y),
    validation_size)

#arr, labels, images = readData('C:/Users/Filip/Documents/Kandidat/GTSRB/Final_Training/Images', num_classes, (img_x, img_y))

#v_arr, v_labels, v_images = readValidationData('C:/Users/Filip/Documents/Kandidat/GTSRB/Final_Test/Images',
#                                      (img_x, img_y), validation_size)

x_train = np.asarray(arr)
y_train = oneHotEncode(labels, num_classes)

x_test = np.asarray(v_arr)
import readData
import costFunction
from sigmoid import sigmoidGradient
from randomInit import randomInit
from backPropagation import backPropagation
from predict import predict
from gradientChecking import gradientChecking

showcost = 0

if __name__=="__main__":
    input_layer = 400
    hidden_layer = 25
    num_labels = 10

    (X,y) = readData.readData()

    lam = 1

    if showcost:
        import numpy as np
        (ogTheta1,ogTheta2) = readData.readWeights()

        Thetas = np.reshape(ogTheta1,ogTheta1.size)
        Thetas = np.append(Thetas,ogTheta2)
        print costFunction.computeRegularizedCost(Thetas,
                                                  X,y,input_layer,hidden_layer,
                                                  num_labels,lam)

    Theta1 = randomInit(input_layer,hidden_layer)
    Theta2 = randomInit(hidden_layer,num_labels)
Created on: October 23, 2015
time: 2:01:41 PM
Function: 
'''
import numpy as np  
import scipy as sp  
from sklearn import tree  
from sklearn.metrics import precision_recall_curve  
from sklearn.metrics import classification_report  
from sklearn.cross_validation import train_test_split  
from readData import readData

  
''''' Read in the data '''
path = "E:/Desktop/Image/SVMData/loc_train.txt"
x,y = readData(path)


''''x_train training data,
    x_test  test data,
    y_train training labels,
    y_test  test labels'''
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.5)  
  
''''' Train the decision tree using information entropy as the split criterion '''
clf = tree.DecisionTreeClassifier(criterion='entropy')  


print(clf)  
clf.fit(x_train, y_train)  
  
Example #34
from numpy import *  

import SVM  
from readData import readData
################## test svm #####################  
## step 1: load data  
print "step 1: load data..."  

# fileIn = open('D:/Desktop/python study/Image/src/testSet.txt')  
# for line in fileIn.readlines():  
#     lineArr = line.strip().split() 
#     print(lineArr) 
#     dataSet.append([float(tk) for tk in lineArr[:-1]])  
#     labels.append(float(lineArr[-1]))  

dataSet,labels = readData("E:/Desktop/Image/SVMData/gender_wechat.txt")
t = int(len(labels)/5)
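# the first fifth of the samples is used for training, the remaining four fifths for testing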
dataSet = mat(dataSet)  
labels = mat(labels).T  
train_x = dataSet[0:t, :]  
train_y = labels[0:t, :]  
test_x = dataSet[t:len(labels), :]  
test_y = labels[t:len(labels), :]  
 
## step 2: training...  
print "step 2: training..."  
C = 0.6  
toler = 0.001  
maxIter = 50  
svmClassifier = SVM.trainSVM(train_x, train_y, C, toler, maxIter, kernelOption = ('rbf', 0))  
  
f_in_trn = 'Data/images_train'
f_in_tst = 'Data/images_test'
f_in_sol = 'Data/train_solutions.csv'

f_in_flat_trn = 'Data/train_.csv'
f_in_flat_tst = 'Data/test_.csv'

f_out_trn = 'Data/train_32_deskew.csv'
f_out_tst = 'Data/test_32_deskew.csv'
f_out_subm = 'Submissions/ls_32_deskew.csv'

# Process images
from readData import readData
(Xtrn, Ytrn, Xtst) = readData(f_in_trn, f_in_tst, f_in_sol, augmenting=False)
from saveData import saveData
saveData((Xtrn, Xtst), (f_out_trn, f_out_tst), colfmt='%.18e')

# Load processed images from flat file, on disk
'''
from loadData import loadData
Xtrn = loadData(f_in_flat_trn, rowskip=0)
Xtst = loadData(f_in_flat_tst, rowskip=0)
tst = loadData(f_in_flat_tst, rowskip=0)
Ytrn = loadData(f_in_sol, rowskip=1)
'''

# Fit OLS
'''
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(Xtrn, Ytrn[::, 1:])
Example #36
def main():
    [x_train, y_train, x_test, y_test] = readData.readData()
    backword(x_train, y_train, x_test, y_test)
def reduce_tweets_words():

	# Fetch the tweets from the txt files, tokenize them and put them in this array as tuples
	# (tokenized_tweet, tweet_category)
	# ([u'RT', u'@mpvine', u':', u'If', u'fifty', u'million', u'people', u'say', u'a', 
	# u'foolish', u'thing', u',', u"it's", u'still', u'a', u'foolish', u'thing', u'.'], 'pos')
	

	# The FeatureSet has 2853 tweets: 1286 Stay and 1567 Leave
	# openFile_getTokenizedTweets("StayTweets1.txt", "pos")
	# openFile_getTokenizedTweets("StayTweetsDate.txt", "pos")
	# openFile_getTokenizedTweets("StayTweetsDate2.txt", "pos")
	# openFile_getTokenizedTweets("StayJune14.txt", "pos")
	# openFile_getTokenizedTweets("StayJune15.txt", "pos")
	# openFile_getTokenizedTweets("StayJune16.txt", "pos")
	# openFile_getTokenizedTweets("StayJune17.txt", "pos")
	# openFile_getTokenizedTweets("StayJune18.txt", "pos")
	# openFile_getTokenizedTweets("StayJune19.txt", "pos")
	# openFile_getTokenizedTweets("StayJune20.txt", "pos")
	# openFile_getTokenizedTweets("StayTweetsNow.txt", "pos")

	
	###########################################################################################

	# Doing this to take only the first 1286 tweets from this file, which has 1537
	# with open("LeaveTweets1.txt") as doc:
	# 	lines = doc.readlines()
	# 	lines = lines[:1286]
	# 	#print(len(lines))
	# 	for l in lines:
	# 		# To strip emoticons in the /u2026 format, for example
	# 		l = l.decode('unicode_escape').encode('ascii','ignore')
	# 		tokens = tknzr.tokenize(l)
	# 		global tokenized_tweets
	# 		# Take each token and lowercase it
	# 		lw_tokens = [w.lower() for w in tokens]
	# 		tokenized_tweets.append((lw_tokens, "neg", l))

	# openFile_getTokenizedTweets("LeaveTweetsDate.txt", "neg")
	# openFile_getTokenizedTweets("LeaveTweetsDate2.txt", "neg")
	# openFile_getTokenizedTweets("LeaveJune14.txt", "neg")
	# openFile_getTokenizedTweets("LeaveJune15.txt", "neg")
	# openFile_getTokenizedTweets("LeaveJune16.txt", "neg")
	# openFile_getTokenizedTweets("LeaveJune17.txt", "neg")
	# openFile_getTokenizedTweets("LeaveJune18.txt", "neg")
	# openFile_getTokenizedTweets("LeaveJune19.txt", "neg")
	# openFile_getTokenizedTweets("LeaveJune20.txt", "neg")
	# openFile_getTokenizedTweets("LeaveTweetsNow.txt", "neg")

	[leave_tweets, stay_tweets, other_tweets] = readData()

	leave_tweets = categorizy_tweets(leave_tweets, "neg")
	new_leave = getTokenizedTweetsFile("ExtraLeaveTweets.txt", "neg")
	stay_tweets = categorizy_tweets(stay_tweets, "pos")
	new_stay = getTokenizedTweetsFile("ExtraStayTweets.txt", "pos")
	other_tweets = categorizy_tweets(other_tweets, "neutral") 

	tokenized_tweets = leave_tweets + new_leave + stay_tweets + new_stay + other_tweets
	all_words = []

	print(len(leave_tweets))
	print(len(new_leave))
	print(len(stay_tweets))
	print(len(new_stay))
	print(len(other_tweets))
	print(len(tokenized_tweets))
	#############################################################################
	#
	# NOW THAT THE TWEETS ARE IN (tokenized_tweet, category) TUPLES WE WILL
	# SHRINK THE TOKENIZED TWEETS BY REMOVING STOPWORDS AND OTHER THINGS
	# THAT ADD NOTHING TO FEATURE EXTRACTION
	#
	#############################################################################

	# Run stopword removal over the document words to drop a lot of useless tokens
	stop_words = set(stopwords.words("english"))

	# Try to improve the stopword list by adding some punctuation marks that serve no purpose
	# They are unicode literals because that is how the stop_words are stored
	punctuation = [u'.', u'-', u',', u'"', u'(', u')', u':', u'?', u"'", u'--', u';', 
	u'!', u'$', u'*', u'&', u'...', u':/', u'/', u'%', u'..']
	punctuation = set(punctuation)
	global new_stop_words
	new_stop_words = stop_words.union(punctuation)

	twitter_symbols = [u'rt', u'#voteleave', u'#voteremain', u'#leaveeu', u'h', u'#rt', u'=', u'@', u'https',
	u'+', u"'", u'|', u'...']
	twitter_symbols = set(twitter_symbols)
	new_stop_words = new_stop_words.union(twitter_symbols)

	# ACTUALLY I CANNOT MANAGE TO STRIP THE @USER FROM THE RT, BUT THAT
	# WILL NOT INTERFERE, SINCE THE SAME @USER RARELY APPEARS MORE THAN ONCE
	#user_rt_pattern = "@\w+?"
	#url_pattern = 'http[s]:/'
	emotions_pattern = '\u\d+'
	url_pattern = 'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
	user_rt_pattern = '(?:@[\w_]+)'
    #user_rt_pattern = '(?:@[\w_]+)'
	
	# filtered_tweets = [for tweet_cat in tokenized_tweets if ]
	filtered_tweets = []
	tokens_to_be_removed = []
	#print(tokenized_tweets[228:230])

	# For each (tweet_tok, category) tuple
	for tweet_cat in tokenized_tweets:

		# For each token of this tweet
		for token in tweet_cat[0]:
			# If the token is one of the stop_words or matches the URL or RT regex, we drop it
			#print(token)
			if token in new_stop_words or re.match(url_pattern, token) or re.search(user_rt_pattern, token) or re.match(emotions_pattern, token):
				tokens_to_be_removed.append(token)
				#print(tokens_to_be_removed)

		# All the tokens that should be removed from this tweet have been collected
		# Now remove them
		for token in tokens_to_be_removed:
			tweet_cat[0].remove(token)

		# Clear tokens_to_be_removed, otherwise it keeps accumulating tokens from other tweets
		tokens_to_be_removed = []
		# Add the tweet without the stopwords to the new list
		filtered_tweets.append(tweet_cat)

	# Example of a tweet after stopword filtering
	# ([u'@mpvine', u'If', u'fifty', u'million', u'people', u'say',
	# u'foolish', u'thing', u"it's", u'still', u'foolish', u'thing'], 'pos')
	#print(filtered_tweets[228:230])


	#######################################################################
	#
	# DUMPING ALL THE REDUCED TWEETS INTO A FILE
	#
	#######################################################################


	# File with the new tuples of the filtered tweets
	with open('FilteredTweets2.txt', 'w') as outfile:
		for item in filtered_tweets:
  			outfile.write(str(item) + '\n')


  	return filtered_tweets
from matplotlib import pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.cross_validation import KFold
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer,mean_squared_error,r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from readData import readData
from gridsearch_helper import grid_search_helper_basic,grid_search_helper


#read the sets split into training, testing and validation sets by dataSplit.py

X_train,Y_train,X_val,Y_val,X_test,Y_test=readData()



#define the no of dimensions of the feature vector and the max dimension of the expanded polynomial
#feature space
noDim=8
noPoly=2

#define the number of repetitions and folds of cross-validation to run during training
noRep=5
n_folds=5



#define scoring functions
Example #39
def __init__(self):
    self.data = readData.readData(readData.datafilePath)
Example #40
    start = 0
    end = 0
    d = []
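    # NOTE: `data`, `step`, and `isZero` are defined outside this snippet (the function's def line is not shown)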
    for i in xrange(len(data)-1):
        if isZero(data[i],data[i+1]):
            print start,end
            if end -start >=step:
                d.append(data[start:end])
            start = end
        else:
            end = i
    if end -start >=step:
        d.append(data[start:end])
    return d

data = readData("data/turn/Turn1.csv")
sf = lowPassFilter(data["AX"],0.02)
#d,vars,means = splitData(sf.tolist())
#newD = merge(d,vars,means)
#print len(newD)
#for line in newD:
#    print len(line)
newD = splitData2(sf)
for line in newD:
    print len(line)
data = readData("data/turn/Turn3.csv")
sf = lowPassFilter(data["AX"],0.02)
for line in splitData2(sf):
    newD.append(line)
for line in newD:
    sim = []
Example #41
import readData
import costFunction
from sigmoid import sigmoidGradient
from randomInit import randomInit
from backPropagation import backPropagation
from predict import predict
from gradientChecking import gradientChecking

showcost = 0

if __name__ == "__main__":
    input_layer = 400
    hidden_layer = 25
    num_labels = 10
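    # network architecture: 400 input units, one hidden layer of 25 units, 10 output classes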

    (X, y) = readData.readData()

    lam = 1

    if showcost:
        import numpy as np
        (ogTheta1, ogTheta2) = readData.readWeights()

        Thetas = np.reshape(ogTheta1, ogTheta1.size)
        Thetas = np.append(Thetas, ogTheta2)
        print costFunction.computeRegularizedCost(Thetas, X, y, input_layer,
                                                  hidden_layer, num_labels,
                                                  lam)

    Theta1 = randomInit(input_layer, hidden_layer)
    Theta2 = randomInit(hidden_layer, num_labels)
Example #42
def main(args):
    decisionTree = DecisionTree()
    cIndex, attributesList, data = readData.readData(args.input)
    decisionTree.makeTree(decisionTree.root, cIndex, attributesList, data)
    decisionTree.printTree(decisionTree.root)