def main(args):
    for Vs in VLIST:
        f = VLIST.index(Vs)
        plt.figure(f + 1)
        subfigure = read.readData(str(Vs) + '.csv')[0]
        stylelist = ['solid', 'dashed', 'dashdot']
        zoomlist = [False, True]
        for zoom in zoomlist:
            for i in range(len(PLIST)):
                plt.plot(subfigure[0], subfigure[i + 1],
                         label=PLIST[i], linewidth=4 if i == 0 else 2,
                         linestyle=stylelist[i], markersize=12)
            y = [float(S)**2 / (2 * (1 - float(S))) for S in subfigure[0]]
            plt.plot(subfigure[0], y,
                     label="Universal lower bound", linewidth=3,
                     linestyle='dotted', markersize=12)
            plt.xlabel('Traffic Intensity')
            plt.ylabel(r'$ E\left[\sum_{i,j} Q_{i,j}\right]$')
            plt.title(r'$ E\left[\sum_{i,j} Q_{i,j}\right]$ vs Traffic Intensity')
            # optionally append str(VLIST[f]) to the title
            if not zoom:
                plt.grid(True)
                plt.legend()
                plt.savefig("Avgq_Vs " + str(VLIST[f]) + ".png")
                plt.savefig("Avgq_Vs " + str(VLIST[f]) + ".eps")
                plt.savefig("Avgq_Vs " + str(VLIST[f]) + ".svg")
                plt.show()
            else:
                plt.xlim(0.9, 1.0)  # zoom in on the high traffic intensity region
                plt.grid(True)
                plt.legend()
                plt.savefig("Avgq_zoomin_Vs " + str(VLIST[f]) + ".png")
                plt.savefig("Avgq_zoomin_Vs " + str(VLIST[f]) + ".eps")
                plt.savefig("Avgq_zoomin_Vs " + str(VLIST[f]) + ".svg")
                plt.show()
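For reference, the dotted curve labeled "Universal lower bound" is just the list comprehension above written out; reading $\rho$ as the traffic intensity on the x-axis, it plots

    $E\left[\sum_{i,j} Q_{i,j}\right] \ge \frac{\rho^2}{2(1-\rho)}$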
Example #2
def main(args):
    for Vs in VLIST:
        f = VLIST.index(Vs)
        plt.figure(f + 1)
        subfigure = read.readData(str(Vs) + '.csv')[1]
        stylelist = ['solid', 'dashed', 'dashdot']
        for i in range(len(PLIST)):
            plt.plot(subfigure[0],
                     subfigure[i + 1],
                     label=PLIST[i],
                     linewidth=4 if i == 0 else 2,
                     linestyle=stylelist[i],
                     markersize=12)
        y = [float(S)**2 / 2 for S in subfigure[0]]
        plt.plot(subfigure[0],
                 y,
                 label="Universal lower bound",
                 linewidth=3,
                 linestyle='dotted',
                 markersize=12)
        plt.xlabel('Traffic Intensity')
        plt.ylabel(r'$(1-2\lambda) E[\sum_{i,j} Q_{i,j}]$')
        plt.title(r'$(1-2\lambda) E[\sum_{i,j} Q_{i,j}]$ vs Traffic Intensity')
        # optionally append str(VLIST[f]) to the title
        plt.grid(True)
        plt.legend()
        plt.savefig("EpsilonXAvgq_Vs " + str(VLIST[f]) + ".png")
        plt.savefig("EpsilonXAvgq_Vs " + str(VLIST[f]) + ".eps")
        plt.savefig("EpsilonXAvgq_Vs " + str(VLIST[f]) + ".svg")
        plt.show()
Example #3
def check_feature_rate():
    import math
    import numpy as np
    import randomForest as rf
    import read
    missing_input = 'none'  #'mean'
    transform = False
    scale = True
    use_text = False
    dummy = False
    use_feature_selection = False

    data_path = 'DorCirurgiaCategNA.csv'
    class_questionnaire = 'Q92510'
    class_name = 'Q92510_snDorPos'
    data, original_attributes, categories = read.readData(
        data_path=data_path,
        class_name=class_name,
        class_questionnaire=class_questionnaire,
        missing_input=missing_input,
        dummy=dummy,
        transform_numeric=transform,
        use_text=use_text,
        skip_class_questionnaire=True)  #skip_class_questionnaire=False)

    X = data[:, 0:-1]
    y = np.array(data[:, -1])

    ntrees = 5001
    replace = False
    mtry = math.sqrt
    max_depth = None
    missing_branch = True
    seed = np.random.randint(0, 10000)
    clf1 = rf.RandomForest(ntrees=ntrees,
                           oob_error=True,
                           random_state=seed,
                           mtry=mtry,
                           missing_branch=missing_branch,
                           prob_answer=False,
                           max_depth=max_depth,
                           replace=replace,
                           balance=True)
    clf1.fit(X, y)
    attributes_used = {}
    for tree in clf1.forest:
        for attribute in tree.feature_indices:
            attributes_used[attribute] = attributes_used.get(attribute, 0) + 1

    if len(attributes_used) != X.shape[1]:
        print(len(attributes_used))
        print(X.shape[1])
        print('not all features were used! unused fraction: %r' %
              (1 - len(attributes_used) / X.shape[1]))
    print({original_attributes[a]: b for a, b in attributes_used.items()})
    print(1 - clf1.oob_error_)  # out-of-bag accuracy
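As a usage note, the dictionary counting in check_feature_rate can be collapsed with collections.Counter; this sketch is equivalent, assuming the same clf1.forest and per-tree feature_indices attributes as above:

from collections import Counter

# Count how often each feature index appears across the forest.
attributes_used = Counter(attribute
                          for tree in clf1.forest
                          for attribute in tree.feature_indices)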
Example #4
def plot_missing_rate():
    data_path = 'RotEOmbroCirurgiaCategNAReduzido.csv'  #'Dados/risk_factors_cervical_cancer.csv'
    class_name = 'Q92510_opcForca[RotEOmbro]'
    #class_name = 'Q92510_snDorPos'
    class_questionnaire = 'Q92510'
    missing_input = 'none'  #'mean'
    transform = False
    scale = True
    use_text = False
    dummy = False
    use_feature_selection = False
    data, original_attributes, categories = read.readData(
        data_path=data_path,
        class_name=class_name,
        class_questionnaire=class_questionnaire,
        missing_input=missing_input,
        dummy=dummy,
        transform_numeric=transform,
        use_text=use_text,
        skip_class_questionnaire=True)
    X = data
    print(X.shape)

    features_missing = [0, 0, 0, 0, 0]  # feature counts per missing-rate bucket

    m = 0
    for j in range(X.shape[1]):
        cj = 0
        for i in range(X.shape[0]):
            if utils.isnan(X[i][j]):
                cj += 1
        missing_rate = cj / X.shape[0]
        if missing_rate == 0:
            print(original_attributes[j])
            features_missing[0] += 1
        elif missing_rate <= 0.25:
            features_missing[1] += 1
        elif missing_rate <= 0.5:
            features_missing[2] += 1
        elif missing_rate <= 0.75:
            features_missing[3] += 1
        elif missing_rate < 1:
            features_missing[4] += 1

        m += missing_rate
    print(m / X.shape[1])  # average missing rate over all features
    print(features_missing)

    plt.pie(
        features_missing[::-1],
        labels=['0%', '0.05% to 25%', '26% to 50%', '51% to 75%',
                '76% to 98%'][::-1],
        colors=colors,  # assumes a `colors` palette defined elsewhere in the module
        startangle=90,
        radius=1,
        autopct=lambda p: '{:.0f}'.format(p * sum(features_missing) / 100))
    plt.show()
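utils.isnan is not shown in this example; a minimal stand-in consistent with how it is called on raw cells here might look like this (an assumption, not the project's actual helper):

import math

def isnan(value):
    # Hypothetical stand-in for utils.isnan: treat None and float NaN as missing.
    if value is None:
        return True
    return isinstance(value, float) and math.isnan(value)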
Example #5
def main(args):
    for Vs in VLIST:
        f = VLIST.index(Vs)
        plt.figure(f + 1)
        subfigure = read.readData(str(VLIST.index(Vs)) + 'th Vs' + '.csv')[0]
        stylelist = ['solid', 'dashed', 'dashdot', ':']
        markerlist = ['.', ',', 'o', '^']
        zoomlist = [False, True]
        for zoom in zoomlist:
            for i in range(len(PLIST)):
                plt.rc('text', usetex=True)
                plt.rc('font', family='serif')
                plt.plot(subfigure[0],
                         subfigure[i + 1],
                         label=LABELS[i],
                         linewidth=4 if i == 0 else 2,
                         linestyle=stylelist[i],
                         marker=markerlist[i],
                         markersize=12)
            y = [float(S)**2 / (2 * (1 - float(S))) for S in subfigure[0]]
            plt.plot(subfigure[0],
                     y,
                     label="Universal lower bound",
                     linewidth=3,
                     linestyle='dotted',
                     markersize=12)
            plt.xlabel(r'\text{Traffic Intensity}', fontsize=16)
            plt.ylabel(r'$E\left[\sum_{i,j} Q_{i,j}\right]$', fontsize=16)
            plt.title(
                r'$E\left[\sum_{i,j} Q_{i,j}\right]$'
                r' \text{ vs Traffic Intensity}',
                fontsize=16)
            plt.subplots_adjust(left=0.12, right=0.97, top=0.9, bottom=0.1)

            # optionally append str(VLIST[f]) to the title
            if not zoom:
                plt.grid(True)
                plt.legend(fontsize=16)
                plt.savefig("Avgq_Vs " + str(VLIST[f]) + ".png")
                #plt.savefig("Avgq_Vs " + str(VLIST[f]) + ".eps")
                #plt.savefig("Avgq_Vs " + str(VLIST[f]) + ".svg")
                #plt.savefig("Avgq_Vs " + str(VLIST[f]))
                plt.show()
            else:
                plt.xlim(0.9, 1.0)  # zoom in on the high traffic intensity region
                plt.grid(True)
                plt.legend(fontsize=16)
                plt.savefig("Avgq_zoomin_Vs " + str(VLIST[f]) + ".png")
                #plt.savefig("Avgq_zoomin_Vs " + str(VLIST[f]) + ".eps")
                #plt.savefig("Avgq_zoomin_Vs " + str(VLIST[f]) + ".svg")
                #plt.savefig("Avgq_zoomin_Vs " + str(VLIST[f]))
                plt.show()
Example #6
import pickle

import read


def classify(pacient_filename, model_filename, class_name):
    if model_filename[-7:] != '.pickle':
        model_filename = model_filename + '.pickle'
    try:
        with open(model_filename, 'rb') as handle:
            clf = pickle.load(handle)
    except FileNotFoundError:
        print('Could not find file %r.\n' % model_filename)
        exit()

    data = read.readData(data_path=pacient_filename, class_name=class_name)
    X = data[data.columns[:-1]]
    classdict = clf.predict(X, prob=True)[0]
    outcome = max(classdict, key=classdict.get)
    print(f"Outcome {outcome} with "
          f"{classdict[outcome] / sum(classdict.values()) * 100}% probability.")
    transform_to_JSON(clf,
                      clf.feature_contribution(X),
                      out='classification_' + class_name + '.json',
                      diffsur=False,
                      addline=classdict)
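A hypothetical invocation, assuming a patient CSV and a pickled model produced elsewhere; both file names are placeholders, and the class name is the one that appears in Example #3:

# Classify one patient record with a previously trained model.
classify('patient_record.csv', 'rf_model', 'Q92510_snDorPos')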
Example #7
def main(args):
    for Vs in VLIST:
        f = VLIST.index(Vs)
        fig = plt.figure(f + 1)
        xyzs = read.readData(str(Vs) + '.csv')[0][1:]
        ax = fig.add_subplot(111, projection='3d')
        xs = [float(num) for num in xyzs[0]]
        ys = [float(num) for num in xyzs[1]]
        zs = [float(num) for num in xyzs[2]]
        ax.scatter(xs, ys, zs, c='b', marker='o')
        ax.set_xlabel(LABELS[0])
        ax.set_ylabel(LABELS[1])
        ax.set_zlabel(LABELS[2])
        plt.grid(True)
        fig.savefig("Queue Lengths 3D " + str(VLIST[f]) + ".png")
        plt.show()
Example #8
def main(args):
    for Vs in VLIST:
        f = VLIST.index(Vs)
        plt.figure(f + 1)
        subfigure = read.readData(str(Vs) + '.csv')[1]
        stylelist = ['solid', 'dashed', 'dashdot', 'dotted']
        markerlist = ['.', ',', 'o', '^']
        for i in range(len(PLIST)):
            plt.rc('text', usetex=True)
            plt.rc('font', family='serif')
            plt.plot(subfigure[0],
                     subfigure[i + 1],
                     label=LABELS[i],
                     linewidth=4 if i == 0 else 2,
                     linestyle=stylelist[i],
                     marker=markerlist[i],
                     markersize=5)
        y = [float(S)**2 / 2 for S in subfigure[0]]
        plt.plot(subfigure[0],
                 y,
                 label="Universal lower bound",
                 linewidth=3,
                 linestyle='dotted',
                 markersize=5)
        plt.xlabel(r'\text{Traffic Intensity }' r'$(\rho)$', fontsize=16)
        plt.ylabel(r'$(1-\rho) E\left[\sum_{i,j} Q_{i,j}\right]$', fontsize=16)
        plt.title(
            r'$(1-\rho)E\left[\sum_{i,j} Q_{i,j}\right]$'
            r' \text{ vs Traffic Intensity}',
            fontsize=16)
        # optionally append str(VLIST[f]) to the title
        plt.subplots_adjust(left=0.12, right=0.97, top=0.9, bottom=0.1)
        plt.grid(True)
        plt.legend(fontsize=16)
        plt.savefig("EpsilonXAvgq_Vs " + str(VLIST[f]) + ".png")
        #plt.savefig("EpsilonXAvgq_Vs " + str(VLIST[f]) + ".eps")
        #plt.savefig("EpsilonXAvgq_Vs " + str(VLIST[f]) + ".svg")
        plt.show()
Example #9
import read
import sanitize


# clear
def clearData(data):
    clear_data = []
    for item in data:
        clear_data.append(sanitize.st(item))
    return clear_data


# clear2
def clearData2(data):
    return [sanitize.st(item) for item in data]


# get
def getData(data):
    return sorted(data)[0:3]


# get@set
def getData2(data):
    return sorted(set(data))[0:3]


james = getData(clearData(read.readData('data/james.txt')))
sarah = getData(clearData2(read.readData('data/sarah.txt')))
julie = getData2(clearData2(read.readData('data/julie.txt')))

print(james)
print(sarah)
print(julie)
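sanitize.st is not included with this example; a minimal sketch consistent with cleaning the items before a plain string sort, assuming the raw entries are time strings with mixed separators (an assumption about sanitize, not its actual code):

def st(time_string):
    # Hypothetical sanitizer: normalize '2-34' or '2:34' to '2.34'
    # so that plain string sorting orders the times correctly.
    if '-' in time_string:
        splitter = '-'
    elif ':' in time_string:
        splitter = ':'
    else:
        return time_string.strip()
    mins, secs = time_string.split(splitter)
    return mins.strip() + '.' + secs.strip()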
Example #10
import shelve
import read

user = shelve.open("testUser")
while True:
    name = input("Enter the name: ")
    if not name:
        break
    age = input("Enter the age: ")
    country = input("Enter the country of origin: ")

    user[name] = (age, country)

userData = read.readData(user)  # read data back from the shelf using the function in read.py
print(userData)

user.clear()
user.close()
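read.py itself is not shown here; a minimal readData consistent with being handed the open shelf might be (an assumption, not the actual read.py):

def readData(shelf):
    # Hypothetical reader: copy the shelf contents into a plain dict for printing.
    return {name: shelf[name] for name in shelf.keys()}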
Example #11
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from read import readData
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

spark = SparkSession.builder.appName('nlp').getOrCreate()

data = readData()
data.show()

data = data.withColumn('length', length(data['text']))
data.show()
data.groupby('class').mean().show()

tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')

clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                           outputCol='features')
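The snippet is cut off after the VectorAssembler; a sketch of how the remaining imports (Pipeline, NaiveBayes, MulticlassClassificationEvaluator) would plausibly be wired up, with the 70/30 split being an assumption:

# Chain the stages declared above into one pipeline and fit it.
pipeline = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove,
                            count_vec, idf, clean_up])
cleaned = pipeline.fit(data).transform(data)

train, test = cleaned.randomSplit([0.7, 0.3])
predictor = NaiveBayes().fit(train)
test_results = predictor.transform(test)

acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
print('Accuracy: {}'.format(acc_eval.evaluate(test_results)))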
Example #12
    missing_input = 'none'  #'mean'
    transform = False
    scale = True
    use_text = False
    dummy = False
    use_feature_selection = False

    import math
    import random
    seed = random.randint(0, 10000)

    for data_path, class_name in data_paths:
        data = read.readData(
            data_path=data_path,
            class_name=class_name,
            class_questionnaire=class_questionnaire,
            missing_input=missing_input,
            dummy=dummy,
            transform_numeric=transform,
            use_text=use_text,
            skip_class_questionnaire=True)  #skip_class_questionnaire=False)

        X = data[data.columns[:-1]]
        y = data[class_name]

        ntimes = 2
        ntrees = 5
        mtry = math.sqrt
        max_depth = None
        missing_branch = True
        #seed =  89444
        replace = False
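The fragment ends at the hyperparameters; a hypothetical continuation showing how they would feed the forest, reusing the rf.RandomForest constructor from Example #3 and assuming `import randomForest as rf` somewhere in the unseen enclosing module:

        # Hypothetical continuation: evaluate the forest ntimes with these settings.
        for _ in range(ntimes):
            clf = rf.RandomForest(ntrees=ntrees,
                                  oob_error=True,
                                  random_state=seed,
                                  mtry=mtry,
                                  missing_branch=missing_branch,
                                  prob_answer=False,
                                  max_depth=max_depth,
                                  replace=replace)
            clf.fit(X, y)
            print(1 - clf.oob_error_)  # out-of-bag accuracy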
Example #13
def main():
    currDir = os.getcwd()
    path = os.path.join(currDir, 'data.csv')

    input, output = readData(path, 'Economy..GDP.per.Capita.', 'Freedom',
                             'Happiness.Score')

    # split into 80% train / 20% test
    np.random.seed(5)
    indexes = [i for i in range(len(input))]
    trainSample = np.random.choice(indexes,
                                   int(0.8 * len(input)),
                                   replace=False)
    testSample = [i for i in indexes if i not in trainSample]

    trainInputs = [input[i] for i in trainSample]
    trainOutputs = [output[i] for i in trainSample]

    testInputs = [input[i] for i in testSample]
    testOutputs = [output[i] for i in testSample]

    #data normalization
    trainInputs, testInputs, trainOutputs, testOutputs = executeNormalization(
        trainInputs, testInputs, trainOutputs, testOutputs)

    # univariate models: keep only the GDP feature of each sample
    GDPTrainInputs = [[x[0]] for x in trainInputs]
    GDPTestInputs = [[x[0]] for x in testInputs]

    print("         UNIVARIATE")
    print("SKLEARN REGRESSION")
    regressorSklearnUni = linear_model.SGDRegressor(alpha=0.005,
                                                    max_iter=1000,
                                                    average=len(trainInputs))
    regressorSklearnUni.fit(GDPTrainInputs, trainOutputs)
    w = [regressorSklearnUni.intercept_[0], regressorSklearnUni.coef_[0]]
    print("Learnt model is: f(x) = " + str(w[0]) + " + " + str(w[1]) + " * x")

    print("MANUAL REGRESSION")
    regressorMySGDRegression = MySGDRegression()
    regressorMySGDRegression.fit(GDPTrainInputs, trainOutputs)
    w = [
        regressorMySGDRegression.intercept_, regressorMySGDRegression.coef_[0]
    ]
    print("Learnt model is: f(x) = " + str(w[0]) + " + " + str(w[1]) + " * x")

    print("\n\n         BIVARIATE")
    print("SKLEARN REGRESSION")
    toolRegression = tool_regression(trainInputs, trainOutputs)
    print("MANUAL REGRESSION")
    manual_regressor = manual_regression(trainInputs, trainOutputs)

    print("\n\n ERRORS")
    print("1.TOOL UNIVARIATE ERROR:   ",
          mean_squared_error(testOutputs,
                             regressorSklearnUni.predict(GDPTestInputs)))
    print("2.MANUAL UNIVARIATE ERROR: ",
          meanSquareError(regressorMySGDRegression, GDPTestInputs, testOutputs))
    print("3.TOOL BIVARIATE ERROR:    ",
          mean_squared_error(testOutputs, toolRegression.predict(testInputs)))
    print("4.MANUAL BIVARIATE ERROR:  ",
          meanSquareError(manual_regressor, testInputs, testOutputs))
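meanSquareError is not defined in this snippet; a minimal sketch matching its call signature here (model, inputs, expected outputs), offered as an assumption rather than the original helper:

def meanSquareError(model, inputs, outputs):
    # Hypothetical helper: mean of squared residuals of the model's predictions.
    predicted = model.predict(inputs)
    return sum((p - o) ** 2 for p, o in zip(predicted, outputs)) / len(outputs)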
Example #14
#test_size=0.2,random_state=9)

# exercise_index = np.where(attributes == 'Exercício?')[0][0]
# feature_index = exercise_index
# not_nan_rows = [a for a in range(X.shape[0]) if not utils.isnan(X[:,feature_index][a])]
# Xs,ys,d = utils.split_categ(X[not_nan_rows],y[not_nan_rows],exercise_index,list(set(X[not_nan_rows,exercise_index])))
# print(utils.information_gain(y[not_nan_rows],ys))

# m.to_dot(attributes,out='out.dot')

data, original_attributes, categories = read.readData(
    data_path='../Dados/TestBaloonAdultAct.csv',
    class_name='inflated',
    dummy=dummy,
    transform_numeric=transform,
    use_text=use_text,
    missing_input='none')
X = data[:, 0:-1]
y = np.array(data[:, -1])
#import plot
# plot.plot_randomforest_accuracy(X,y,original_attributes,ntrees=100,mtry=math.sqrt,replace=False,max_depth=None,missing_branch=False)
# exit()
seeds = [10, 25, 40, 50, 120, 35, 128, 90, 97, 100]
import time

dif = []
i = 0
for seed in seeds:
    starttime = time.time()
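    # The timing loop is truncated here; a plausible completion that trains one
    # forest per seed and records the elapsed time, assuming `import randomForest
    # as rf` and `import math` as in Example #3 (the hyperparameters are placeholders).
    clf = rf.RandomForest(ntrees=100,
                          random_state=seed,
                          mtry=math.sqrt,
                          replace=False,
                          missing_branch=False)
    clf.fit(X, y)
    dif.append(time.time() - starttime)
    i += 1
print(dif)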
Example #15
import numpy as np
import read
import decisionTree as dt
from sklearn.model_selection import KFold, train_test_split
data_path = '../RotEOmbroCirurgiaCategNAReduzido.csv'
class_name = 'Q92510_opcForca[RotEOmbro]'  #'Q92510_snDorPos'
class_questionnaire = 'Q92510'
missing_input = 'none'
dummy = False
transform = False
use_text = False

data, original_attributes, categories = read.readData(
    data_path=data_path,
    class_name=class_name,
    class_questionnaire=class_questionnaire,
    missing_input=missing_input,
    dummy=dummy,
    transform_numeric=transform,
    use_text=use_text,
    skip_class_questionnaire=True)

sf = []
st = []
# confusion-matrix counters: vp/vn = true pos/neg, fp/fn = false pos/neg
vp, vp1, fp, fp1, fn, fn1, vn, vn1 = 0, 0, 0, 0, 0, 0, 0, 0
X = data[:, 0:-1]
y = np.array(data[:, -1])
n_splits = X.shape[0]
sss = KFold(n_splits=n_splits, shuffle=True, random_state=9)  # random_state requires shuffle=True

for train_index, test_index in sss.split(X, y):
    X_train = X[train_index]
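    # The cross-validation loop is cut off after its first line; this is a minimal
    # continuation of the leave-one-out split (fitting the decisionTree model is
    # omitted, since its API is not shown in this fragment).
    X_test = X[test_index]
    y_train, y_test = y[train_index], y[test_index]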