def __init__(self, classifier_type, input_file_path, lev):
    """
    Build the tree.

    :param classifier_type: String ("LR", "SVM", "FT")
    :param input_file_path: The path for the data file
    :param lev: Maximum taxonomy depth to keep while building the tree
    """
    print("Building the tree")
    self.root = Node("Top", [], 0)
    self.root.score = 1
    self.tree_type = classifier_type
    self.input_file_path = input_file_path

    print("Loading the labels")
    self.labels = load_labels(input_file_path)

    if classifier_type == "FT":
        print("Start loading data")
        self.text = load_text(input_file_path)
    else:
        print("Start vectorizing")
        self.matrix, self.vectorizer = vectorize(input_file_path)
    # joblib.dump(vectorizer, "./load_model/" + 'vectorizerc_leadp.pkl')
    # scipy.sparse.save_npz("./load_model/" + 'lp_matrix.npz', matrix)
    # with open("./load_model/" + 'vectorizerc_leadp.pkl', 'rb') as f:
    #     self.vectorizer = pickle.load(f)
    # self.matrix = scipy.sparse.load_npz("./load_model/" + 'lp_matrix.npz')

    cur = self.root
    for i, label in enumerate(self.labels):
        if not label:
            continue
        levels = label.split("/")[1:]  # ignore the first "Top" taxonomy level
        if len(levels) > 0 and levels[0] == 'World':
            continue
        for j, key in enumerate(levels):
            if j >= lev:  # skip levels deeper than the requested depth
                continue
            child_keys = list(cur.child.keys())
            if not child_keys:
                # every internal node gets an OTHER ("*") child
                other_node = Node("*", [], cur.level + 1)
                cur.child["*"] = other_node
                other_node.pre_node = cur
            if key not in child_keys:
                new_node = Node(key, [i], cur.level + 1)
                cur.child[key] = new_node
                new_node.pre_node = cur
            cur.input.append(i)
            cur = cur.child[key]
        if "*" in cur.child.keys():
            # the label doesn't stop at a leaf of the tree: add the document
            # to the OTHER node among cur's children
            cur.child["*"].input.append(i)
        cur = self.root
        if i % 200000 == 0:
            print("{0} documents are processed".format(i))
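# A minimal sketch of the Node interface the constructor above assumes. The
# attribute set (child, input, level, pre_node, score) is inferred from usage;
# the repo's real definition may differ.
class Node:
    def __init__(self, name, input_ids, level):
        self.name = name        # taxonomy label, e.g. "Top", "Arts", or "*"
        self.input = input_ids  # indices of documents routed through this node
        self.level = level      # depth in the tree (root "Top" is 0)
        self.child = {}         # child label -> Node
        self.pre_node = None    # parent Node
        self.score = 0.0        # node score, set externally (root gets 1)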
import time

import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
# load_labels, load_text, vectorize and evaluate_accu are repo-local helpers.


def classify(level):
    a = time.time()
    train_labels = load_labels('./level/train_8_level.csv')  # !!!
    test_text = load_text('./level/test_8_level.csv')        # !!!
    test_labels = load_labels('./level/test_8_level.csv')    # !!!

    # truncate each label to the first `level` taxonomy levels (plus "Top")
    train_labels = ['/'.join(label.split('/')[:level + 1]) for label in train_labels]
    test_labels = ['/'.join(label.split('/')[:level + 1]) for label in test_labels]
    print(train_labels[0])
    assert len(train_labels[0].split('/')) == level + 1

    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate((train_labels, test_labels)))
    encoded_train_labels = le.transform(train_labels)
    encoded_test_labels = le.transform(test_labels)
    print('number of labels:')
    print(len(le.classes_))
    print(encoded_train_labels.shape)
    print(encoded_test_labels.shape)
    print('load complete')

    # svm = OneVsRestClassifier(LinearSVC())
    lg = OneVsRestClassifier(LogisticRegression())
    m, vectorizer = vectorize('./level/train_8_level.csv')  # !!!
    print('vectorize complete')

    # sup = train_supervised(input='FT_train.txt')
    # svm.fit(m, encoded_train_labels)
    lg.fit(m, encoded_train_labels)
    print("fit complete")

    # print(svm.predict(vectorizer.transform(test_text[0])))
    # print(lg.predict(vectorizer.transform(test_text[0])))
    label_predict = lg.predict(vectorizer.transform(test_text))
    label_predict = le.inverse_transform(label_predict)
    # label_predict = []
    # for text in test_text:
    #     p, prob = sup.predict(clean(text))
    #     label_predict.append(p[0][9:])
    #     print(p)

    with open('temp.txt', 'w') as f:
        for pred in label_predict:
            f.write(pred + '\n')

    b = time.time()
    print('{0} levels:'.format(level))
    print('number of labels {0}'.format(len(le.classes_)))
    print('flat LR')
    print('Time used: {0}s'.format(b - a))
    print('Accuracy: ')
    # score the file we just wrote (the original passed 'temp.csv' here,
    # which was never written)
    print(evaluate_accu('temp.txt', './level/test_8_level.csv', level))  # !!!
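# Hypothetical driver for classify(): runs the flat-LR baseline once per
# taxonomy depth. The depth range is an assumption, not taken from the repo.
if __name__ == '__main__':
    for level in [1, 2, 3]:
        classify(level)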
def getInfoFromParameters(input_file, parameters, estimator):
    """Pipeline variant: return the processed corpus and a vectorizer+estimator pipeline."""
    Corpus = preprocessing.process_data(input_file,
                                        to_lower_case=parameters.lowerCaseFlag,
                                        remove_stop_words=parameters.removeStopWordsFlag,
                                        stem=parameters.stemFlag)
    pipeline = preprocessing.vectorize(estimator,
                                       max_features=parameters.maxFeatures,
                                       ngram_range=parameters.ngramRange,
                                       tf=parameters.tfidfFlags[0],
                                       tfidf=parameters.tfidfFlags[1])
    return Corpus, pipeline
def getInfoFromParameters(input_file, parameters):
    """Count-vector variant: return the processed corpus, its feature matrix, and feature names."""
    Corpus = preprocessing.process_data(input_file,
                                        to_lower_case=parameters.lowerCaseFlag,
                                        remove_stop_words=parameters.removeStopWordsFlag,
                                        stem=parameters.stemFlag)
    counts_by_comment, names = preprocessing.vectorize(Corpus,
                                                       max_features=parameters.maxFeatures,
                                                       ngram_range=parameters.ngramRange,
                                                       tf=parameters.tfidfFlags[0],
                                                       tfidf=parameters.tfidfFlags[1])
    return Corpus, counts_by_comment, names
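# Hypothetical usage sketch for the two getInfoFromParameters variants above.
# The Parameters container and the 'comments.csv' path are assumptions; only
# the attribute names (lowerCaseFlag, removeStopWordsFlag, stemFlag,
# maxFeatures, ngramRange, tfidfFlags) are taken from the code.
from collections import namedtuple

Parameters = namedtuple('Parameters', ['lowerCaseFlag', 'removeStopWordsFlag',
                                       'stemFlag', 'maxFeatures', 'ngramRange',
                                       'tfidfFlags'])
params = Parameters(lowerCaseFlag=True, removeStopWordsFlag=True,
                    stemFlag=False, maxFeatures=5000, ngramRange=(1, 2),
                    tfidfFlags=(False, True))

# Count-vector variant (no estimator):
Corpus, counts_by_comment, names = getInfoFromParameters('comments.csv', params)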
import numpy as np
import pandas as pds
# char_set, smiles_to_seq and vectorize are assumed to be defined elsewhere
# in the repo (preprocessing module).

data_uri = '../data/nppurg2.csv'
save_uri = './np_model.ckpt'

ntrn = 81661
frac_val = 0.05
ntst = 2722

# data preparation
print('::: data preparation')

# DataFrame.as_matrix() was removed in pandas 1.0; .values is the replacement
data = pds.read_csv(data_uri).values
smiles = data[:ntrn + ntst, 0]  # 0: SMILES
Y = np.asarray(data[:ntrn + ntst, 1:], dtype=np.float32)  # 1: MolWT, 2: LogP, 3: QED

list_seq = smiles_to_seq(smiles, char_set)
Xs, X = vectorize(list_seq, char_set)

tstX = X[-ntst:]
tstXs = Xs[-ntst:]
tstY = Y[-ntst:]

# keep the full arrays so each sweep slices from the original data; the
# original re-sliced X/Xs/Y in place, so the 81661 run silently reused the
# 50000-row subset
X_all, Xs_all, Y_all = X, Xs, Y
for n in [50000, 81661]:
    ntrn = n
    X = X_all[:ntrn]
    Xs = Xs_all[:ntrn]
    Y = Y_all[:ntrn]

    nL = int(len(Y) * frac)  # frac (labeled fraction) is assumed defined earlier
    nU = len(Y) - nL
    nL_trn = int(nL * (1 - frac_val))
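# Hypothetical sketch of the helpers assumed above: smiles_to_seq maps each
# SMILES string to a fixed-length index sequence over char_set, and the
# one-hot step shows the kind of encoding vectorize would produce. The padding
# length and the exact meaning of Xs vs X are assumptions; the repo's real
# implementations may differ.
import numpy as np

def smiles_to_seq_sketch(smiles, char_set, seq_len=120):
    char_to_idx = {c: i for i, c in enumerate(char_set)}
    seqs = np.zeros((len(smiles), seq_len), dtype=np.int32)
    for i, s in enumerate(smiles):
        for j, c in enumerate(s[:seq_len]):
            seqs[i, j] = char_to_idx[c]
    return seqs

def one_hot_sketch(seqs, char_set):
    # (n_molecules, seq_len) int indices -> (n_molecules, seq_len, len(char_set))
    return np.eye(len(char_set), dtype=np.float32)[seqs]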
import time

import numpy as np
import pandas as pd
# utils, pp (preprocessing) and nn are repo-local modules; df and t_extract
# come from the preceding song-extraction step of the script.

# Setup directory for preprocessing and model storage
###############################################################################
path = utils.setup_model_dir()

# Transform data into vectors for processing by neural network
###############################################################################
print('Pre-processing extracted song data...')
df = pp.convert_byte_data(df)
df = pp.create_target_classes(df)

# Shuffle a few times
for i in range(5):
    df = df.iloc[np.random.permutation(len(df))]
df = df.fillna(0)

# Transform into NumPy matrix, normalized by column
X, y, y_map = pp.vectorize(df, 'target', path)

t_preproc = time.time()
print('Cleaned and processed', len(df.index), 'rows in',
      round((t_preproc - t_extract), 2), 'seconds.')

# Train neural network
###############################################################################
print('Training neural network...')
print('[', X.shape[1], '] x [', np.unique(y).size, ']')
model_simple = nn.deep_nn(pp.scaler(X, 'robust', path), y, 'std', path)
# nn.deep_nn(X, y)

t_nn = time.time()
print('Neural network trained in', round((t_nn - t_preproc), 2), 'seconds.')

print('Evaluating model and saving class probabilities...')
predDF = pd.DataFrame.from_records(model_simple.predict(pp.scaler(X, 'robust', path)))
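# Hypothetical sketch of the pp.scaler helper as called above: pick a
# scikit-learn scaler by name ('robust' or 'std'), fit-transform X, and
# persist the fitted scaler under `path` for reuse at prediction time.
# The file name and persistence format are assumptions.
import os
import joblib
from sklearn.preprocessing import RobustScaler, StandardScaler

def scaler_sketch(X, kind, path):
    s = RobustScaler() if kind == 'robust' else StandardScaler()
    X_scaled = s.fit_transform(X)
    joblib.dump(s, os.path.join(path, 'scaler_{0}.pkl'.format(kind)))
    return X_scaled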