def _read_csv_file(self, delimiter="\t"):
    """Load the target CSV/TSV file into an SFrame, printing the load time.

    Reads ``self._input_directory_path + self._target_file_name`` via
    ``SFrame.read_csv``.

    :param delimiter: field separator passed to ``SFrame.read_csv``
                      (defaults to tab, i.e. TSV input).
    :return: SFrame holding the parsed file contents.
    """
    print("Reading CSV file ")
    begin_time = time.time()
    # NOTE(review): removed a commented-out duplicate read_csv call that
    # hard-coded delimiter="\t" ("for Micky's big file") — dead code.
    wikitree_sf = SFrame.read_csv(self._input_directory_path + self._target_file_name,
                                  delimiter=delimiter)
    end_time = time.time()
    run_time = end_time - begin_time
    print(run_time)
    return wikitree_sf
# --- Browser setup: Firefox profile that downloads files silently into
# --- symlink_path instead of prompting a save dialog.
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.dir', symlink_path)
profile.set_preference('browser.helperApps.neverAsk.saveToDisk',
                       'text/html,application/pdf,application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
profile.set_preference('pdfjs.disabled', True)
browser = webdriver.Firefox(profile)

# load classifier
lr_clf = joblib.load('./model/pfpj_classifier.pkl')

totalprocessos = 0
totalerros = 0

seeds = SFrame.read_csv('seedSP.csv', verbose=False, column_type_hints=[str, str, int])
del seeds['Seed']

if hasattr(args, 'a') and args.a:
    # BUG FIX: the original unpacked a list comprehension into two scalars
    # (only valid when the file had exactly two lines) and then called
    # `fh.close` without parentheses, so the handle was never closed.
    # Process each query line, accumulating totals, inside a `with` block.
    with open(args.a, 'r') as fh:
        for busca in fh:
            numprocessos, numerro = buscaprocesso(busca)
            totalprocessos += numprocessos
            totalerros += numerro
else:
    # Single query passed on the command line.
    buscas = args.q
    totalprocessos, totalerros = buscaprocesso(buscas)
    totalbuscas = 1

print("Parsing has been done")
print('Total de erros / processos: %d / %d:' % (totalerros, totalprocessos))
*z : pre-activation function ** return (Float value) ** """ return sigmoid(z) * (1 - sigmoid(z)) def sigmoid(z): """ Compute the sigmoid function ** input : ** *z : pre-activation function ** return (Float value) from O to 1 ** """ return 1 / (1 + np.exp(-z)) if __name__ == '__main__': dataset = SFrame.read_csv("adult.csv") CATEGORY_KEYS = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"] CONTINU_KEYS = ["capital-gain", "fnlwgt", "hours-per-week", "age", "capital-loss", "educational-num"] # Process nonlinear columns dataset = columns_to_category(dataset, CATEGORY_KEYS) # Process linear columns dataset = columns_to_normalize(dataset, CONTINU_KEYS) # Convert the output from string to binary dataset["income"] = dataset["income"].apply(lambda x : 1. if x == ">50K" else 0.) keys = CATEGORY_KEYS + CONTINU_KEYS + ["income"] features = [] # Create the features matrix for line in dataset:
""" Shuffle the two lists keeping the order ** input : ** *features : numpy array of features *targets : numpy vector of targets ** return (numpy array of features, numpy vector of targets) ** """ c = list(zip(features.tolist(), targets.tolist())) random.shuffle(c) features[:], targets[:] = zip(*c) return np.array(features), np.array(targets) if __name__ == '__main__': # Load both csv with sframe train_data = SFrame.read_csv("train.csv") test_data = SFrame.read_csv("test.csv") test_data["Survived"] = -1 # We add a new columns for each csv to be abel to differentiate them later train_data["type"] = "train" test_data["type"] = "test" # We now can merge the two csv together data = train_data.append(test_data) # We extract features and targets from the csv train_features, train_targets, test_features = process_csv(data) # We initialize all variables. The weight is a one dimensional vector (one weight per feature) weights = np.random.randn(train_features.shape[1]) # The bias
) # run at the start of every ipython notebook to use plotly.offline # this injects the plotly.js source files into the notebook #-------------------------------------------------- # %matplotlib inline # import matplotlib.pyplot as plt # import seaborn as sns #-------------------------------------------------- # --- # # Read data into SFrames # In[4]: usersSF = SFrame.read_csv("%s/users.dat" % DATADIR, delimiter='::', header=False, verbose=False, column_type_hints=[int, str, int, int, str]) usersSF = usersSF.rename({ 'X1': 'UserID', 'X2': 'Gender', 'X3': 'Age', 'X4': 'Occupation', 'X5': 'ZipCode', }) usersDescSF = dict(zip(usersSF.column_names(), usersSF.column_types())) print usersDescSF # In[5]: ratingsSF = SFrame.read_csv("%s/ratings.dat" % DATADIR,