def _read_csv_file(self, delimiter="\t"):
    print("Reading CSV file")
    begin_time = time.time()
    # for Micky's big file
    # wikitree_sf = SFrame.read_csv(self._input_directory_path + self._target_file_name, delimiter="\t")
    wikitree_sf = SFrame.read_csv(self._input_directory_path +
                                  self._target_file_name,
                                  delimiter=delimiter)
    end_time = time.time()
    run_time = end_time - begin_time
    print(run_time)
    return wikitree_sf
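
This method depends on an enclosing class that is not part of the excerpt. A minimal usage sketch, assuming the class only needs to carry `_input_directory_path` and `_target_file_name` (the class name and file names here are hypothetical):

import time
from turicreate import SFrame  # older versions of this code used graphlab's SFrame

class WikiTreeImporter:  # hypothetical stand-in for the original enclosing class
    def __init__(self, input_directory_path, target_file_name):
        self._input_directory_path = input_directory_path
        self._target_file_name = target_file_name

    _read_csv_file = _read_csv_file  # reuse the function above as a method

importer = WikiTreeImporter("data/", "wikitree_dump.csv")
wikitree_sf = importer._read_csv_file(delimiter=",")  # comma-delimited variant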
Example #2
        # Configure Firefox to download the listed MIME types silently into a custom directory
        profile = webdriver.FirefoxProfile()
        profile.set_preference('browser.download.folderList', 2)  # 2 = use a custom download directory
        profile.set_preference('browser.download.dir', symlink_path)
        profile.set_preference('browser.helperApps.neverAsk.saveToDisk',
                               'text/html,application/pdf,application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
        profile.set_preference('pdfjs.disabled', True)  # download PDFs instead of opening the built-in viewer

        browser = webdriver.Firefox(profile)
        #  load classifier
        lr_clf = joblib.load('./model/pfpj_classifier.pkl')

        totalprocessos = 0
        totalerros = 0

        seeds = SFrame.read_csv('seedSP.csv', verbose=False, column_type_hints=[str, str, int])
        del seeds['Seed']

        if hasattr(args, 'a') and args.a:
            with open(args.a, 'r') as fh:
                # Run one search per line and accumulate the per-search counts
                for busca in fh:
                    numprocessos, numerro = buscaprocesso(busca.strip())
                    totalprocessos += numprocessos
                    totalerros += numerro
        else:
            buscas = args.q
            totalprocessos, totalerros = buscaprocesso(buscas)
            totalbuscas = 1

        print("Parsing has been done")
        print('Total de erros / processos: %d / %d:' % (totalerros, totalprocessos))
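
The `args` object consumed above is not defined in this excerpt. A minimal argparse setup consistent with the `args.a` / `args.q` accesses might look like this (the help texts are assumptions):

import argparse

parser = argparse.ArgumentParser(description='Scrape court cases and classify them with the PFPJ model')
parser.add_argument('-a', help='path to a file with one search term per line')
parser.add_argument('-q', help='a single search term')
args = parser.parse_args()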
Example #3

def dsigmoid(z):
    """
        Compute the derivative of the sigmoid function
        ** input : **
            *z : pre-activation function
        ** return (Float value) **
    """
    return sigmoid(z) * (1 - sigmoid(z))

def sigmoid(z):
    """
        Compute the sigmoid function
        ** input : **
            *z : pre-activation function
        ** return (Float value) from 0 to 1  **
    """
    return 1 / (1 + np.exp(-z))
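
# Added illustration (an assumption, not part of the original example): sanity-check the
# analytic derivative above against a central finite difference of sigmoid.
_eps = 1e-6
assert abs(dsigmoid(0.5) - (sigmoid(0.5 + _eps) - sigmoid(0.5 - _eps)) / (2 * _eps)) < 1e-8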

if __name__ == '__main__':
    dataset = SFrame.read_csv("adult.csv")

    CATEGORY_KEYS = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]
    CONTINU_KEYS = ["capital-gain", "fnlwgt", "hours-per-week", "age", "capital-loss", "educational-num"]

    # Encode the categorical (string) columns as integer ids
    # (a sketch of this helper follows the example below)
    dataset = columns_to_category(dataset, CATEGORY_KEYS)
    # Min-max normalize the continuous columns
    dataset = columns_to_normalize(dataset, CONTINU_KEYS)
    # Convert the output from string to binary
    dataset["income"] = dataset["income"].apply(lambda x : 1. if x == ">50K" else 0.)

    keys = CATEGORY_KEYS + CONTINU_KEYS + ["income"]
    features = []
    # Create the features matrix
    for line in dataset:
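
The two preprocessing helpers called above are not shown in this excerpt. A minimal sketch of what they might do, assuming turicreate's SFrame API (the names match the calls above; the bodies are assumptions and may differ from the originals):

def columns_to_category(sf, keys):
    # Replace each distinct string value with an integer id, column by column
    for key in keys:
        mapping = {value: index for index, value in enumerate(sf[key].unique())}
        sf[key] = sf[key].apply(lambda value, m=mapping: m[value])
    return sf

def columns_to_normalize(sf, keys):
    # Min-max scale each continuous column into [0, 1]
    for key in keys:
        low, high = float(sf[key].min()), float(sf[key].max())
        sf[key] = sf[key].apply(lambda value, lo=low, hi=high: (value - lo) / (hi - lo))
    return sf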
Example #4
    """
        Shuffle the two lists keeping the order
        ** input : **
            *features : numpy array of features
            *targets : numpy vector of targets
        ** return (numpy array of features, numpy vector of targets) **
    """
    c = list(zip(features.tolist(), targets.tolist()))
    random.shuffle(c)
    features[:], targets[:] = zip(*c)
    return np.array(features), np.array(targets)


if __name__ == '__main__':
    # Load both csv with sframe
    train_data = SFrame.read_csv("train.csv")
    test_data = SFrame.read_csv("test.csv")

    test_data["Survived"] = -1
    # We add a new column to each csv to be able to tell them apart later
    train_data["type"] = "train"
    test_data["type"] = "test"
    # We now can merge the two csv together
    data = train_data.append(test_data)

    # We extract features and targets from the csv
    train_features, train_targets, test_features = process_csv(data)

    # We initialize all variables. The weight is a one dimensional vector (one weight per feature)
    weights = np.random.randn(train_features.shape[1])
    # The bias
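
The example is truncated at the bias initialization. A hedged sketch of how training might continue, consistent with one weight per feature and a sigmoid activation as defined in the earlier example (the learning rate and update rule are assumptions, not the author's code):

bias = np.random.randn()  # hypothetical scalar bias, matching the one-weight-per-feature setup
learning_rate = 0.01      # assumed hyperparameter

# One epoch of stochastic gradient descent on the log loss
train_features, train_targets = shuffle_in_unison(train_features, train_targets)
for x, y in zip(train_features, train_targets):
    prediction = sigmoid(np.dot(x, weights) + bias)
    error = prediction - y
    weights -= learning_rate * error * x  # d(log loss)/d(weights)
    bias -= learning_rate * error         # d(log loss)/d(bias)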
Example #5
from plotly.offline import init_notebook_mode

init_notebook_mode(
    connected=True
)  # run at the start of every ipython notebook to use plotly.offline
# this injects the plotly.js source files into the notebook
#--------------------------------------------------
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
#--------------------------------------------------

# ---
# # Read data into SFrames

# In[4]:

usersSF = SFrame.read_csv("%s/users.dat" % DATADIR,
                          delimiter='::',
                          header=False,
                          verbose=False,
                          column_type_hints=[int, str, int, int, str])
usersSF = usersSF.rename({
    'X1': 'UserID',
    'X2': 'Gender',
    'X3': 'Age',
    'X4': 'Occupation',
    'X5': 'ZipCode',
})
usersDescSF = dict(zip(usersSF.column_names(), usersSF.column_types()))
print(usersDescSF)
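# With the column_type_hints above, this prints a name-to-type mapping along the lines of
# {'UserID': int, 'Gender': str, 'Age': int, 'Occupation': int, 'ZipCode': str}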

# In[5]:

ratingsSF = SFrame.read_csv("%s/ratings.dat" % DATADIR,