Ejemplo n.º 1
0
 def __init__(self, classifier_type, input_file_path, lev):
     """
     Load the data file and grow the label tree under a "Top" root.

     Every label is split on "/" and its first segment is dropped; the
     remaining segments are inserted one tree level at a time, each node
     recording the indices of the documents routed through it.

     :param classifier_type: String ("LR", "SVM", "FT"). "FT" loads the raw
         text; any other value vectorizes the file instead.
     :param input_file_path: The path for the data file
     :param lev: Maximum number of label segments (after the dropped first
         one) that are inserted into the tree for each label
     """
     print("Building the tree")
     self.root = Node("Top", [], 0)
     self.root.score = 1
     self.tree_type = classifier_type
     self.input_file_path = input_file_path
     print("Loading the labels")
     self.labels = load_labels(input_file_path)
     if classifier_type != "FT":
         print("Start vectorizing")
         self.matrix, self.vectorizer = vectorize(input_file_path)
         # (A commented-out experiment previously sat here that cached the
         # vectorizer and matrix under ./load_model/.)
     else:
         print("Start loading data")
         self.text = load_text(input_file_path)

     for doc_idx, label in enumerate(self.labels):
         if not label:
             continue
         # drop the leading segment (the "Top" taxonomy)
         levels = label.split("/")[1:]
         if levels[:1] == ['World']:
             # labels under "World" are left out of the tree
             continue

         node = self.root
         for depth, key in enumerate(levels):
             if depth >= lev:
                 # deeper segments are ignored
                 break
             # Snapshot of the children *before* a "*" node may be added.
             known_keys = list(node.child.keys())
             if not known_keys:
                 # first time this node gets children: give it an OTHER node
                 catch_all = Node("*", [], node.level + 1)
                 node.child["*"] = catch_all
                 catch_all.pre_node = node
             if key not in known_keys:
                 fresh = Node(key, [doc_idx], node.level + 1)
                 node.child[key] = fresh
                 fresh.pre_node = node
             # NOTE(review): when the path ends on an already-existing
             # child, doc_idx is recorded on the parent only, not on that
             # child's own input list - confirm this is intended.
             node.input.append(doc_idx)
             node = node.child[key]

         # The label stopped at a node that has children: file the document
         # under that node's OTHER ("*") child.
         if "*" in node.child:
             node.child["*"].input.append(doc_idx)

         if doc_idx % 200000 == 0:
             print("{0} documents are processed".format(doc_idx))
Ejemplo n.º 2
0
def classify(level,
             train_path='./level/train_8_level.csv',
             test_path='./level/test_8_level.csv',
             pred_path='temp.csv'):
    """Train a flat one-vs-rest logistic regression and print its accuracy.

    Labels from both files are truncated to their first ``level + 1``
    "/"-separated segments, encoded with a ``LabelEncoder`` fitted on the
    union of train and test labels, and a ``OneVsRestClassifier`` wrapping
    ``LogisticRegression`` is fitted on the vectorized training file.
    Predictions for the test file are written one per line to ``pred_path``
    and that same file is then passed to ``evaluate_accu``.

    The original code wrote predictions to ``temp.txt`` but evaluated
    ``temp.csv``, so the reported accuracy did not come from the file that
    had just been written. Both steps now share ``pred_path``.

    :param level: number of label segments to keep after the leading one
    :param train_path: data file used for training labels and vectorizing
    :param test_path: data file used for test text, labels and evaluation
    :param pred_path: file the predicted labels are written to and
        evaluated from
    """
    started = time.time()
    train_labels = load_labels(train_path)

    test_text = load_text(test_path)
    test_labels = load_labels(test_path)

    # Cut every label down to the requested depth.
    train_labels = [
        '/'.join(label.split('/')[:level + 1]) for label in train_labels
    ]
    test_labels = [
        '/'.join(label.split('/')[:level + 1]) for label in test_labels
    ]
    print(train_labels[0])
    assert len(train_labels[0].split('/')) == level + 1

    # Fit the encoder on train + test so every test label has a code.
    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate((train_labels, test_labels)))
    encoded_train_labels = le.transform(train_labels)
    encoded_test_labels = le.transform(test_labels)
    print('number of labels: ')
    print(len(le.classes_))
    print(encoded_train_labels.shape)
    print(encoded_test_labels.shape)
    print('load complete')

    lg = OneVsRestClassifier(LogisticRegression())

    m, vectorizer = vectorize(train_path)
    print('vectorize complete')

    lg.fit(m, encoded_train_labels)
    print("fit complete")
    label_predict = lg.predict(vectorizer.transform(test_text))
    label_predict = le.inverse_transform(label_predict)

    # Write to the very file that is evaluated below.
    with open(pred_path, 'w') as f:
        for pred in label_predict:
            f.write(pred + '\n')

    finished = time.time()
    print('{0} levels:'.format(level))
    print('number of labels {0}'.format(len(le.classes_)))
    print('flat LR')
    print('Time used: {0}s'.format(finished - started))
    print('Accuracy: ')
    print(evaluate_accu(pred_path, test_path, level))
Ejemplo n.º 3
0
def getInfoFromParameters(input_file, parameters, estimator):
    """Preprocess ``input_file`` and build the vectorizing pipeline.

    ``parameters`` supplies the preprocessing flags (``lowerCaseFlag``,
    ``removeStopWordsFlag``, ``stemFlag``) and the vectorizer settings
    (``maxFeatures``, ``ngramRange`` and the two-element ``tfidfFlags``,
    whose items are passed as ``tf`` and ``tfidf`` respectively).

    Returns a tuple of the processed corpus and whatever
    ``preprocessing.vectorize`` returns for ``estimator``.
    """
    corpus = preprocessing.process_data(
        input_file,
        to_lower_case=parameters.lowerCaseFlag,
        remove_stop_words=parameters.removeStopWordsFlag,
        stem=parameters.stemFlag,
    )

    vectorizer_options = {
        'max_features': parameters.maxFeatures,
        'ngram_range': parameters.ngramRange,
        'tf': parameters.tfidfFlags[0],
        'tfidf': parameters.tfidfFlags[1],
    }
    return corpus, preprocessing.vectorize(estimator, **vectorizer_options)
Ejemplo n.º 4
0
def getInfoFromParameters(input_file, parameters):
    """Preprocess ``input_file`` and vectorize the resulting corpus.

    ``parameters`` supplies the preprocessing flags (``lowerCaseFlag``,
    ``removeStopWordsFlag``, ``stemFlag``) and the vectorizer settings
    (``maxFeatures``, ``ngramRange`` and the two-element ``tfidfFlags``,
    whose items are passed as ``tf`` and ``tfidf`` respectively).

    Returns a 3-tuple: the processed corpus followed by the two values
    produced by ``preprocessing.vectorize``.
    """
    corpus = preprocessing.process_data(
        input_file,
        to_lower_case=parameters.lowerCaseFlag,
        remove_stop_words=parameters.removeStopWordsFlag,
        stem=parameters.stemFlag,
    )

    vectorizer_options = {
        'max_features': parameters.maxFeatures,
        'ngram_range': parameters.ngramRange,
        'tf': parameters.tfidfFlags[0],
        'tfidf': parameters.tfidfFlags[1],
    }
    counts, feature_names = preprocessing.vectorize(corpus,
                                                    **vectorizer_options)
    return corpus, counts, feature_names
Ejemplo n.º 5
0
# Input CSV and checkpoint location.
data_uri = '../data/nppurg2.csv'
save_uri = './np_model.ckpt'

ntrn = 81661  # rows used for training
frac_val = 0.05  # share of the labelled rows held out for validation
ntst = 2722  # rows used for testing

# data preparation
print('::: data preparation')

# Parse the CSV once and slice both the SMILES column and the property
# columns from the same array. `.values` replaces `DataFrame.as_matrix()`,
# which was removed in pandas 1.0.
raw = pds.read_csv(data_uri).values[:ntrn + ntst]
smiles = raw[:, 0]  # 0: SMILES
Y = np.asarray(raw[:, 1:], dtype=np.float32)  # 1: MolWT, 2: LogP, 3: QED

# NOTE(review): `char_set` and `frac` are not defined in this excerpt; they
# are expected to be provided by surrounding code.
list_seq = smiles_to_seq(smiles, char_set)
Xs, X = vectorize(list_seq, char_set)

# The last `ntst` rows form the test split.
tstX = X[-ntst:]
tstXs = Xs[-ntst:]
tstY = Y[-ntst:]

# Keep the untruncated arrays: the loop below rebinds X / Xs / Y, and slicing
# the already-truncated arrays would cap every later run at the smallest
# earlier training size.
full_X, full_Xs, full_Y = X, Xs, Y

for n in [50000, 81661]:
    ntrn = n

    X = full_X[:ntrn]
    Xs = full_Xs[:ntrn]
    Y = full_Y[:ntrn]

    # Split sizes: labelled / unlabelled, then the labelled training part.
    nL = int(len(Y) * frac)
    nU = len(Y) - nL
    nL_trn = int(nL * (1 - frac_val))
Ejemplo n.º 6
0
# Setup directory for preprocessing and model storage
###############################################################################
# `path` is handed to the vectorizing, scaling and training calls below.
path = utils.setup_model_dir()

# Transform data into vectors for processing by neural network
###############################################################################
# NOTE(review): `df` and `t_extract` are not defined in this excerpt; they
# are expected to be set by earlier code.
print('Pre-processing extracted song data...')
df = pp.convert_byte_data(df)
df = pp.create_target_classes(df)
# Shuffle a few times (five random row permutations)
for i in range(5):
    df = df.iloc[np.random.permutation(len(df))]
# Replace missing values with 0 before vectorizing
df = df.fillna(0)
# Transform into NumPy matrix, normalized by column
X, y, y_map = pp.vectorize(df, 'target', path)
t_preproc = time.time()
# Report the row count and the time elapsed since `t_extract`
print('Cleaned and processed', len(df.index), 'rows in',
      round((t_preproc - t_extract), 2), 'seconds.')

# Train neural network
###############################################################################
print('Training neural network...')
# Prints X's second dimension and the number of distinct values in y
print('[', X.shape[1], '] x [', np.unique(y).size, ']')
# The network is trained on X after it has gone through pp.scaler with the
# 'robust' option
model_simple = nn.deep_nn(pp.scaler(X, 'robust', path), y, 'std', path)
# nn.deep_nn(X, y)
t_nn = time.time()
print('Neural network trained in', round((t_nn - t_preproc), 2), 'seconds.')

print('Evaluating model and saving class probabilities...')
predDF = pd.DataFrame.from_records(model_simple.predict(pp.scaler(X,