class ACMClassificator(BaseACMClassificator):
    """Multi-label tagger for problem statements.

    Statements are converted to bag-of-words counts, tag collections are
    binarized with a ``MultiLabelBinarizer`` and one ``ExtraTreeClassifier``
    per tag is trained in a one-vs-rest arrangement.
    """

    def __init__(self):
        # min_df / max_df are given as floats, i.e. document-frequency ratios.
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45,
                                          tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        base_tree = ExtraTreeClassifier(
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            class_weight=None,
        )
        self.classificator = OneVsRestClassifier(base_tree, n_jobs=-1)

    def _prepare_problems(self, problems):
        """Return the count matrix for the ``statement`` of every problem."""
        statements = [problem.statement for problem in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit the vectorizer, the tag binarizer and the classifier.

        ``problems`` are objects exposing a ``statement`` attribute; ``tags``
        is the per-problem collection of tags handed to the binarizer.
        """
        # The tokenizer presumably relies on the NLTK 'punkt' model -- verify.
        nltk.download('punkt', quiet=True)

        statements = [problem.statement for problem in problems]
        self.vectorizer.fit(statements)
        features = self._prepare_problems(problems)

        self.mlb = self.mlb.fit(tags)
        targets = self.mlb.transform(tags)
        # The sparse count matrix is densified before training.
        self.classificator.fit(features.toarray(), targets)

    def predict(self, problems):
        """Return the predicted tag tuples for ``problems``."""
        features = self._prepare_problems(problems)
        indicator = self.classificator.predict(features.toarray())
        return self.mlb.inverse_transform(indicator)
def fit_images():
    """Train a 5-nearest-neighbour classifier on stored histograms and pickle it.

    Reads the ``mapped_responses`` and ``labels_binary`` collections of the
    local ``image_annotation`` MongoDB database, fits a
    ``KNeighborsClassifier`` on each response's ``hist['0']`` vector against
    its ``binary_results`` and writes the fitted model to
    ``<build dir>/model.data``.

    Side effects: fills the ``indexes`` mapping (not defined in this function,
    presumably module-level) with ``row position -> image_no``.
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        db = client['image_annotation']
        responses = db['mapped_responses'].find()
        no_labels = db['labels_binary'].find()

        # One single-element label set per known label number.
        numbers = [set([int(entry["number"])]) for entry in no_labels]
        mlb = MultiLabelBinarizer()
        mlb.fit(numbers)

        train_data = []
        labels = []
        for index, instance in enumerate(responses):
            t_data = instance['hist']['0']
            indexes[index] = instance['image_no']
            train_data.append(t_data)

            label = instance['binary_results']
            # NOTE(review): the binarized form is computed but the raw
            # ``binary_results`` value is what gets trained on; the call is
            # kept so unknown labels are still reported by the binarizer.
            mlb.transform([set([int(value)]) for value in label])
            labels.append(label)

        classifier = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        classifier.fit(train_data, labels)

        build_dir = getBuildDir()
        # Protocol 1 is a binary pickle format, so the file must be opened in
        # binary mode; ``with`` guarantees the handle is flushed and closed.
        with open(join(build_dir, 'model.data'), 'wb') as model_file:
            pickle.dump(classifier, model_file, protocol=1)
    finally:
        # Release the connection even when training or dumping fails.
        client.close()
def test_multilabel_classification_report():
    """Check ``classification_report`` on multilabel input in both formats.

    Two random multilabel problems (seeds 0 and 1) serve as ground truth and
    prediction.  The report must be identical whether the labels are passed
    as returned by ``make_multilabel_classification`` or as the binary
    indicator matrices produced by ``MultiLabelBinarizer``.
    """
    n_classes = 4
    n_samples = 50
    make_ml = make_multilabel_classification
    _, y_true_ll = make_ml(n_features=1, n_classes=n_classes, random_state=0,
                           n_samples=n_samples)
    _, y_pred_ll = make_ml(n_features=1, n_classes=n_classes, random_state=1,
                           n_samples=n_samples)

    # The report is a fixed-width table, so the column padding and the blank
    # separator lines are part of the expected value.
    # NOTE(review): layout restored to the '% 11s' + 4 x '% 9s' format used by
    # classification_report with an 'avg / total' footer -- confirm against
    # the scikit-learn version this test targets.
    expected_report = """\
             precision    recall  f1-score   support

          0       0.50      0.67      0.57        24
          1       0.51      0.74      0.61        27
          2       0.29      0.08      0.12        26
          3       0.52      0.56      0.54        27

avg / total       0.45      0.51      0.46       104
"""

    lb = MultiLabelBinarizer()
    lb.fit([range(n_classes)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        report = classification_report(y_true, y_pred)
        assert_equal(report, expected_report)
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
    """Train a one-vs-rest linear SVM and write one labelled XML file per document.

    ``sentences`` / ``labels`` are the training sentences and their label
    collections.  Each entry of ``test_doc_list`` is split into sentences,
    classified, and written to the matching path in ``output_file_path_list``
    as a ``<doc>`` element holding one ``<Sent classes="...">`` per sentence.
    """
    import numpy as np
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    train_matrix, tfidf = tf_idf_fit_transform(sentences)

    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    classifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)

    for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
        test_sentences = doc2sentences([test_doc])
        sentence_matrix = tfidf.transform(test_sentences)
        print("Shape of sentence matrix : ", sentence_matrix.shape)
        predictions = classifier.predict(sentence_matrix)

        from lxml import etree
        document = etree.Element('doc')
        doc_tree = etree.ElementTree(document)
        n_labels = predictions.shape[1]
        for row, sentence in enumerate(test_sentences):
            # Names of the labels whose indicator is set for this sentence.
            assigned = [mlb.classes_[col] for col in range(n_labels)
                        if predictions[row][col] == 1]
            node = etree.SubElement(document, "Sent", classes=", ".join(assigned))
            node.text = sentence
        doc_tree.write(output_file_path)
def evaluate_solution(users, urecovered, observed_index, xs=None, E=None,
                      hidden_edges=None, threshold=0.05):
    """Evaluate the quality of the recovered user profile.

    Parameters
    ----------
    users, urecovered : arrays indexed as ``[observed_index, :]``
        True and recovered profiles; compared on the observed rows only.
    observed_index : index
        Rows used for the mean squared error.
    xs : array with one row per direction, optional
        Required when ``hidden_edges`` is non-empty.
    E : mapping, optional
        Maps an edge ``(head, tail)`` to its gold direction labels.
    hidden_edges : collection of ``(head, tail)``, optional
        Edges whose directions are predicted and scored.
    threshold : float, optional
        The second-best direction is added to an edge's prediction only when
        its difference score is below this value (default 0.05, the value
        that used to be hard-coded).

    Returns
    -------
    (mse, f1) where ``f1`` is the sample-averaged F1 score over the hidden
    edges, or ``None`` when there are no hidden edges.
    """
    mse = mean_squared_error(users[observed_index, :],
                             urecovered[observed_index, :])
    if hidden_edges is None or len(hidden_edges) < 1:
        return mse, None

    # Sort once so gold labels and predictions share the same edge order.
    edges = sorted(hidden_edges)
    labeler = MultiLabelBinarizer(classes=np.arange(xs.shape[1]))
    gold = labeler.fit_transform([E[e] for e in edges])

    heads, tails = zip(*edges)
    Cr = np.dot(urecovered, xs.T)
    Dr = np.abs(Cr[list(heads), :] - Cr[list(tails), :])

    # TODO prediction here could be better: instead of predicting the k best
    # directions all the time, look at revealed edges to compute the
    # similarity threshold.
    best_dirs = np.argsort(Dr, 1).astype(int)[:, :2]
    pred = []
    for all_dir, suggestion in zip(Dr, best_dirs):
        my_pred = [suggestion[0]]
        # With a single direction there is no runner-up to consider.
        if len(suggestion) > 1 and all_dir[suggestion[1]] < threshold:
            my_pred.append(suggestion[1])
        pred.append(my_pred)

    # The class list is fixed at construction, so transform is sufficient.
    pred = labeler.transform(pred)
    return mse, f1_score(gold, pred, average='samples')
def main():
    """Vectorize the pickled review dictionary and dump the training matrices.

    Loads ``reviewUsefulDict.pickle``, converts it to a corpus and targets
    with ``DictToList``, builds TF-IDF features and a binarized target matrix,
    then pickles the features, the targets and the fitted binarizer next to
    the input file.
    """
    data_dir = "/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/"

    # Pickle streams are byte streams: always use binary mode.
    # NOTE: pickle.load must only be used on trusted files.
    with open(data_dir + "reviewUsefulDict.pickle", 'rb') as f:
        reviewsDict = pickle.load(f)
    print("Reviews Dictionary loaded .. ")

    corpus, target = DictToList(reviewsDict)

    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5,
                                 sublinear_tf=True)
    XAll = vectorizer.fit_transform(corpus)

    mlb = MultiLabelBinarizer()
    yAll = mlb.fit_transform(target)

    with open(data_dir + "Onlyreviews.fv", 'wb') as f:
        pickle.dump(XAll, f)
    with open(data_dir + "Onlyreviews.target2", 'wb') as f:
        pickle.dump(yAll, f)
    with open(data_dir + "Onlyreviews.mlb", 'wb') as f:
        pickle.dump(mlb, f)
    print("Dumped featrue vectors .... ")
def get_training_data(window_size_ms, train_time_sec=30):
    """Interactively record spectral training samples, one key at a time.

    For every key the user confirms with an empty <enter>, windows of
    ``window_size_ms`` milliseconds are read until ``train_time_sec`` seconds
    have been accumulated.  Any non-empty input ends the session.

    Returns ``(X, y)``: the recorded spectra as an array and the binarized
    key labels.
    """
    spectra = []
    key_labels = []
    print("Training time for each key is {} seconds".format(train_time_sec))

    window_sec = window_size_ms / float(1000)
    key = 0
    while True:
        answer = raw_input('Press <enter> to begin training key {} or q-<enter> to quit'.format(key))
        if answer:
            # Anything other than a bare <enter> stops the recording loop.
            break

        elapsed = 0
        while elapsed < train_time_sec:
            elapsed += window_sec
            spectra.append(read_spectral_data_for_time(window_size_ms))
            key_labels.append([key])

        # Move on to the next key.
        key += 1

    binarized = MultiLabelBinarizer().fit_transform(key_labels)
    return np.asarray(spectra), np.asarray(binarized)
def load_data(config=None):
    """
    Load the Reuters dataset.

    Parameters
    ----------
    config : dict, optional
        Currently unused; accepted for interface compatibility.  Defaults to
        ``None`` rather than a shared mutable ``{}``.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    # The corpus encodes the split in the file id prefix.
    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {
        'train': [reuters.raw(doc_id) for doc_id in train],
        'test': [reuters.raw(doc_id) for doc_id in test],
    }

    # Vocabulary and label set are learned on the training split only.
    xs = {
        'train': vectorizer.fit_transform(docs['train']).toarray(),
        'test': vectorizer.transform(docs['test']).toarray(),
    }
    ys = {
        'train': mlb.fit_transform([reuters.categories(doc_id)
                                    for doc_id in train]),
        'test': mlb.transform([reuters.categories(doc_id)
                               for doc_id in test]),
    }

    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            # 'labels' is taken from the module-level name ``labels``.
            'labels': globals()["labels"]}
    return data
def generateTrainFeatures(L):
    """
    Generate the training feature matrix and its target label matrix.

    Input:
        L : the number of training samples to read from standard input

    Output:
        trainX -> numpy matrix with one row of TF-IDF features per sample
                  (the original notes describe it as L * 2000)
        trainY -> numpy indicator matrix of the target classes
                  (the original notes describe it as L * 185)

    Logic:
        For every sample a line of space-separated integers is read; its
        first value is discarded and the rest are the sample's classes.  The
        sample text comes from readInput().  The module-level TF-IDF
        vectorizer builds the feature matrix, and a multilabel binarizer
        turns the class lists into the (nsamples * nclasses) matrix that
        multi-label classifiers expect.

    Side effect: the global classOrder is set to the binarizer's class order.
    """
    global classOrder

    texts = []
    targets = []

    # read the input
    for _ in range(L):
        header = raw_input()
        # every token is parsed; the first value is not a class and is dropped
        class_ids = [int(token) for token in header.split(" ")][1:]
        words = readInput()
        targets.append(class_ids)
        texts.append(words)

    # construct TF-IDF matrix representing the features
    trainX = vectorizer.fit_transform(texts).toarray()

    # convert the target label list to a suitable matrix form
    binarizer = MultiLabelBinarizer()
    trainY = binarizer.fit_transform(targets)

    # for representing the order of the classes
    classOrder = binarizer.classes_
    return (trainX, trainY)
def read_all_data(p):
    """Assemble aligned image, description and category arrays.

    Only dataframe rows whose index value has an entry in the image mapping
    are kept.  Returns ``(x_images, x_desc, y_total)`` where ``y_total`` is
    the binarized category matrix.  ``p`` is not used by this function.
    """
    img_src = "images/"
    frame = pd.read_pickle("frame_no_stem.pkl")
    images = __read_all_images(img_src)
    print("Finished reading images")

    image_rows = []
    descriptions = []
    categories = []
    all_categories = set()
    for asin in frame.index.values:
        if asin not in images:
            continue
        image_rows.append(images[asin])
        item = frame.loc[asin]
        descriptions.append(item.description)
        item_categories = item.categories
        categories.append(item_categories)
        # Running set of every category seen (not returned).
        for category in item_categories:
            all_categories.add(category)
    print("Finished reading dataframe")

    y_total = MultiLabelBinarizer().fit_transform(categories)
    return np.array(image_rows), np.array(descriptions), y_total
def run_classifier(sentences, labels, test_docs):
    """Train a one-vs-rest linear SVM and write predictions to ``classified.csv``.

    ``sentences`` / ``labels`` are the training sentences and their label
    collections; ``test_docs`` is split into sentences with
    ``doc2sentences``.  Each CSV row holds a test sentence and the list of
    labels predicted for it.
    """
    import csv

    import numpy as np
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    # The estimator class is spelled ``LinearSVC``; ``linearSVC`` does not
    # exist in sklearn.svm and made this function fail on import.
    from sklearn.svm import LinearSVC

    train_matrix, tfidf = tf_idf_fit_transform(sentences)
    test_sentences = doc2sentences(test_docs)
    sentence_matrix = tfidf.transform(test_sentences)
    print("Shape of sentence matrix : ", sentence_matrix.shape)

    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    estimator = LinearSVC()
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)
    predictions = classifier.predict(sentence_matrix)

    with open("classified.csv", "w") as fl:
        writer = csv.writer(fl)
        for i, sentence in enumerate(test_sentences):
            # Names of the labels whose indicator is set for this sentence.
            curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1])
                         if predictions[i][x] == 1]
            writer.writerow((sentence, curr_pred))
def __init__(self, inter_filePath = "inter/technology_companies_of_the_united_states/"):
    """Load the embeddings, predict test categories and print F1 scores.

    Training and test category pages are read from ``inter_filePath``.  The
    one-vs-rest SVM is trained on the vectors returned by
    ``predict_vector_by_mean``; the bag-of-words / TF-IDF matrices are built
    as well but are not passed to the classifier here.
    """
    # category pages have the shape [[cat, cat, ...], ...]
    self.m = Word2Vec.load_word2vec_format("vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True)
    self.dim = 400

    train_page = self.load_category_page(inter_filePath + "category_page.txt")
    (correct_categories_train, context_categories_train) = train_page
    test_page = self.load_category_page(inter_filePath + "category_page_test.txt")
    (correct_categories_test, context_categories_test) = test_page

    # ---- features by mean vector ----
    mean_train = np.array(self.predict_vector_by_mean(context_categories_train))
    mean_test = np.array(self.predict_vector_by_mean(context_categories_test))

    # ---- bag-of-words features (alternative SVM input) ----
    corpus_train = [" ".join(cats) for cats in context_categories_train]
    corpus_test = [" ".join(cats) for cats in context_categories_test]
    counter = CountVectorizer(min_df = 1)
    bow_train = counter.fit_transform(corpus_train)
    # TF-IDF weighting of the training counts
    bow_train_tfidf = TfidfTransformer().fit_transform(bow_train)

    # Labels: fit on train + test so every category has a column.
    binarizer = MultiLabelBinarizer()
    binarizer.fit(correct_categories_train + correct_categories_test)
    targets_train = binarizer.transform(correct_categories_train)

    # test-side matrices
    bow_test = counter.transform(corpus_test)
    targets_test = binarizer.transform(correct_categories_test)

    predicted_ovr = self.ovrSVM(mean_train, targets_train, mean_test)

    print("---One versus rest---")
    print("Macro F-1: %s" % f1_score(targets_test, predicted_ovr, average='macro'))
    print("Micro F-1: %s" % f1_score(targets_test, predicted_ovr, average='micro'))
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME, train_size=ds.DEFAULT_TRAININGSET_SIZE):
    """
    Split the document ids of a database into a training and a test set.

    The tags of every document are binarized and used to stratify the split.

    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - the list returned by ``train_test_split``:
        ``[train_ids, test_ids, train_tags, test_tags]``; the first two
        entries are the training and test document ids.
    """
    # At most this many documents (plus the one that trips the check) are read.
    max_documents = 10000

    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()

    doc_ids_list = []
    all_tag_list = []
    for count, row in enumerate(all_docs.rows, 1):
        document = row.doc
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])

        tag_list = []
        # Documents without a tags field contribute an empty tag list.
        if pp.STACKEXCHANGE_TAGS_COLUM in document:
            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)
            # remove the closing tag character (last item) of every tag
            tag_list = [tag[:-1] for tag in tags_list]
        all_tag_list.append(tag_list)

        if count > max_documents:
            break

    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)
    print(len(doc_ids_list))

    # Honour the caller's train_size instead of a fixed 0.8, and hand the
    # split back as the docstring promises.
    splitted_dataset = cross_validation.train_test_split(
        doc_ids_list, tags_encoded,
        train_size=train_size, random_state=42, stratify=tags_encoded)
    return splitted_dataset
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.

    TODO: we ignore # of matches for each fbpath """

    def __init__(self, data, Xdict=None, Ydict=None):
        """Vectorize ``data``.

        When ``Xdict`` / ``Ydict`` are omitted, a new ``DictVectorizer`` /
        ``MultiLabelBinarizer`` is fitted on ``data``; otherwise the given
        (already fitted) transformers are reused, e.g. for a test set.
        """
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict

            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this.  The known classes are put in a set once so each
            # membership test is O(1) instead of a scan of classes_.
            known_classes = set(self.Ydict.classes_)
            known_lset = [set(label for label in ls if label in known_classes)
                          for ls in lset]
            lset_n = sum(len(ls) for ls in lset)
            known_lset_n = sum(len(ls) for ls in known_lset)
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)

            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.

        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities not predicted)

        Returns a dict with keys 'sklScore', 'qRecallAll', 'qRecallAny',
        'pPrec' and 'qScoreMRR'. """
        # Densify once; score, predict and scorer all take the dense matrix.
        X_dense = self.X.toarray()

        skl_score = cfier.score(X_dense, self.Y)

        # XXX: Matched paths might/could be weighted by their nMatches too...

        # Measure prediction performance
        Ypred = cfier.predict(X_dense)
        n_q = float(np.size(self.Y, axis=0))
        gold_per_q = np.sum(self.Y, axis=1)
        hits_per_q = np.sum(Ypred * self.Y, axis=1)
        # number of questions where all correct paths have been recalled
        recall_all = np.sum(gold_per_q == hits_per_q) / n_q
        # number of questions where at least one correct path has been recalled
        recall_any = np.sum((gold_per_q != 0) == (hits_per_q != 0)) / n_q
        # number of *PATHS* (not q.) that were correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, X_dense)
        # MRR of first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # number of questions where at least one correct path has been recalled in top N paths
        # TODO

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
def createDataMatrix(ngram_features, character_gram_features,tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories):
    """Assemble the normalized feature matrix and the label vector.

    Combines the precomputed n-gram, character-gram and POS features with
    hand-made per-tweet features, word-cluster memberships and several
    lexicon-based feature groups into one sparse matrix.

    Returns ``(ffeatures, y)`` where ``y`` maps 'positive' -> 1,
    'negative' -> -1 and 'UNKNOWN' -> 0.
    """
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)

    handmade_features, cll = [], []
    for tweet in tweetText:
        feat = [
            exclamations(tweet),
            questions(tweet),
            questions_and_exclamation(tweet),
            emoticon_negative(tweet),
            emoticon_positive(tweet),
        ]
        words = tokenizer_case_preserve.tokenize(tweet)  # preserving casing
        feat.append(allCaps(words))
        feat.append(elongated(words))
        # NOTE(review): raises IndexError when a tweet yields no tokens.
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))

        words = tokenizer.tokenize(tweet)
        # Remove the "_NEG" negation marker.  str.strip("_NEG") would treat
        # the argument as a character set and also eat leading/trailing
        # '_', 'N', 'E' and 'G' characters of the word itself.
        words = [_strip_negation_marker(word) for word in words]
        cll.append(getClusters(voca_clusters, words))

    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text ))
    mpqa_feat = csr_matrix(mpqa(tweetText,pos, different_pos_tags, pos_text))

    handmade_features = np.array(handmade_features)

    mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))

    # The Sentiment140 lexicon features (unigram and bigram) are disabled;
    # only the hashtag-sentiment lexicons are used.
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))

    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)

    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ, handmade_features, pos_features, cluster_memberships_binarized, bl, mpqa_feat, nrc_emo, hasht, hasht_bigrams ), dtype=float)

    y=[]
    for i in categories:
        if i=='positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            # NOTE(review): an unrecognized category is only printed, so y
            # ends up shorter than ffeatures -- confirm this cannot happen.
            print(i)

    ffeatures = normalize(ffeatures)
    return ffeatures, y


def _strip_negation_marker(word, marker="_NEG"):
    """Return ``word`` without a trailing negation ``marker``, if present."""
    if word.endswith(marker):
        return word[:-len(marker)]
    return word
def print_report(name_classificator, testing_problems, testing_tags, predicted_problems, predicted_tags):
    """Print a classification report for one classifier.

    The predictions are first re-ordered with ``make_right_order``; true and
    predicted tags are then binarized over their joint tag set and the
    per-tag report plus the label ranking average precision are printed.
    """
    predicted_problems, predicted_tags = make_right_order(
        testing_problems, predicted_problems, predicted_tags)

    # Fit on both lists so tags that occur on only one side get a column.
    binarizer = MultiLabelBinarizer()
    binarizer = binarizer.fit(testing_tags + predicted_tags)
    testing_tags = binarizer.transform(testing_tags)
    predicted_tags = binarizer.transform(predicted_tags)

    print(name_classificator)
    report = classification_report(testing_tags, predicted_tags,
                                   target_names=binarizer.classes_)
    print(report)
    print('label ranking average precision score =', label_ranking_average_precision_score(testing_tags, predicted_tags))
    print('\n', ('#'*100), '\n')
def xval(clf, x, y, train_index, test_index):
    """Fit ``clf`` on one fold and evaluate it on the held-out part.

    Returns ``(mse, acc, evals)``: the mean squared error between the
    one-hot encoded true labels and ``clf.predict_proba``, the accuracy of
    the most probable class, and ``clf.get_num_evals()``.
    """
    import numpy as np

    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train)

    y_pred = clf.predict_proba(x_test)
    classes = np.asarray(clf.classes_)

    # One column per class, aligned with the columns of predict_proba.
    y_true_onehot = label_binarize(y_test, classes=classes)
    if len(classes) == 2 and y_true_onehot.shape[1] == 1:
        # For two classes label_binarize returns a single column (the
        # indicator of classes[1]); expand it to the two-column layout.
        y_true_onehot = np.hstack((1 - y_true_onehot, y_true_onehot))
    mse = mean_squared_error(y_true_onehot, y_pred)

    # argmax yields column indices; map them back to the class labels.
    acc = accuracy_score(y_test, classes[y_pred.argmax(axis=1)])
    evals = clf.get_num_evals()
    return mse, acc, evals
def test_BRKnna_no_labels_take_closest(self):
    """Mode 'a', 2 neighbours, threshold 0.6: the query gets only the first label."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
    label_sets = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_sets)

    classifier = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
    classifier.fit(features, targets)

    query = csr.csr_matrix([[0, 1]])
    pred = classifier.predict(query).todense()
    print(pred)
    # Five distinct labels were seen, hence five indicator columns.
    np.testing.assert_array_equal([[1, 0, 0, 0, 0]], pred)
def test_BRKnna_predict_dense(self):
    """Mode 'a' fitted on a dense label matrix predicts the expected labels."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_sets = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    # Default binarizer output is a dense indicator array.
    binarizer = MultiLabelBinarizer()
    targets = binarizer.fit_transform(label_sets)

    classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    classifier.fit(features, targets)

    query = csr.csr_matrix([[1.1, 1.1]])
    pred = classifier.predict(query).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
def test_BRKnnb_predict_two_samples(self):
    """Mode 'b' with 3 neighbours labels two query points in one call."""
    features = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
    label_sets = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid5'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_sets)

    classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    classifier.fit(features, targets)

    queries = csr.csr_matrix([[0, 1], [2, 2]])
    pred = classifier.predict(queries).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], pred)
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
    """Train a one-vs-rest random forest and print precision / recall / F1.

    Labels outside the selected label set (``curr_labels``) are discarded
    from both the training and the test labels before binarization.  Micro-
    and macro-averaged scores are printed, followed by the per-class scores.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, precision_score, recall_score
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer

    all_labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane",
                  "Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations",
                  "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                  "injured_or_dead_people", "missing_trapped_or_found_people"]
    # Alternative label subsets; assign one of them to curr_labels to
    # restrict the evaluation to that domain.
    disaster_labels = ["Drought", "Earthquake", "Flood", "Hurricane",
                       "Tornado", "Tsunami", "displaced_people_and_evacuations",
                       "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]
    health_labels = ["Epidemic", "displaced_people_and_evacuations",
                     "donation_needs_or_offers_or_volunteering_services",
                     "injured_or_dead_people"]
    conflict_labels = ["Rebellion", "Terrorism", "displaced_people_and_evacuations",
                       "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]

    curr_labels = all_labels
    trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
    testLabels = [list(set(l).intersection(curr_labels)) for l in testLabels]

    mlb = MultiLabelBinarizer(classes=curr_labels)
    # fit() returns the binarizer itself, not a matrix, so its result is not
    # stored under a matrix name.
    mlb.fit(trainLabels)
    print("Labels : ", mlb.classes_)
    train_label_matrix = mlb.transform(trainLabels)
    test_label_matrix = mlb.transform(testLabels)
    print("Shape of label matrix : ", test_label_matrix.shape)

    train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
    test_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", test_matrix.shape)

    # A LinearSVC() estimator was the previously tried alternative.
    estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs = -1)
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, train_label_matrix)
    predictions = classifier.predict(test_matrix)

    print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
    print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
    print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
    print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
    # average=None yields one score per class (in mlb.classes_ order), so the
    # output is labelled as such rather than as a macro average.
    print("Per-class Precision", precision_score(test_label_matrix, predictions, average=None))
    print("Per-class Recall", recall_score(test_label_matrix, predictions, average=None))
    print("Per-class F1", f1_score(test_label_matrix, predictions, average=None))
def main():
    """Train a one-vs-rest linear SVM tagger and report on the test set.

    Selects train/test sets by tag, vectorizes the referenced files with
    counts + TF-IDF, trains the classifier, prints the classification report
    and, for every tag, the decision score found at the top-30% position of
    the test set; finally the predictions are printed.
    """
    sets = select_sets_by_tag(20,4,tag_names)
    train_ids = sets["train"]
    test_ids = sets["test"]

    train_tags = fetch_tags(train_ids)
    train_texts = id_to_filename(train_ids)

    # vectorize (the vectorizer reads the files itself)
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16", input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)

    # tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # process tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)

    # classifier
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf,processed_train_tags)
    print("classes:{}".format(clf.classes_))

    # process test set
    test_texts = id_to_filename(test_ids)
    X_test_counts = count_vect.transform(test_texts)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(test_ids)
    predicted_probs = clf.decision_function(X_test_tfidf)

    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),predicted_tags,target_names=class_list)
    print(report)

    # retrieve top 30% for each class
    top_percentage = 30
    n_test = len(test_ids)
    threshold_index = int( n_test *(top_percentage/100.0) )
    threshold_vals_dic = {}
    threshold_vals = []
    for col in range(len(class_list)):
        # Scores of this class over the test set, best first.
        ranked = sorted((predicted_probs[row, col] for row in range(n_test)),
                        reverse=True)
        cutoff = ranked[threshold_index]
        threshold_vals_dic[class_list[col]] = cutoff
        threshold_vals.append(cutoff)
    print(threshold_vals_dic)

    print_predictions(test_ids,predicted_tags_readable,class_list, class_probablities=predicted_probs,threshold_vals=threshold_vals)
class TimeSeriesLabelTransformer(BaseTaskTransformer):
    """Task transformer that encodes interval annotations as multi-hot labels."""

    def __init__(self, namespace, name, labels=None):
        '''Initialize a time-series label transformer

        Parameters
        ----------
        namespace : str
            Forwarded to the base transformer (together with a literal 0)

        name : str
            Suffix used for the ``output_<name>`` / ``mask_<name>`` keys

        labels : iterable
            The label vocabulary the encoder is fitted on.
            NOTE(review): the default ``None`` is passed straight to the
            encoder -- callers presumably always supply labels; confirm.
        '''
        super(TimeSeriesLabelTransformer, self).__init__(namespace, 0)

        encoder = MultiLabelBinarizer()
        encoder.fit([labels])
        self.encoder = encoder
        # Set of known labels, used for the membership test in transform().
        self._classes = set(encoder.classes_)
        self.name = name

    def transform(self, jam):
        '''Encode the annotation found in ``jam``.

        Returns a dict with ``output_<name>`` (the encoded intervals) and
        ``mask_<name>`` (whether an annotation was found).
        '''
        ann = self.find_annotation(jam)

        # A background interval over the whole file, carrying no label.
        intervals = np.asarray([[0.0, jam.file_metadata.duration]])
        values = [None]
        mask = False

        if ann:
            ann_int, ann_val = ann.data.to_interval_values()
            intervals = np.vstack([intervals, ann_int])
            values.extend(ann_val)
            mask = True

        # Suppress all values not in the encoder: they are encoded as an
        # empty label set, i.e. an all-zero row.
        label_sets = [[value] if value in self._classes else []
                      for value in values]
        tags = np.asarray(self.encoder.transform(label_sets))

        target = self.encode_intervals(jam.file_metadata.duration,
                                       intervals, tags)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
def get_data(train_file, test_file):
    """Load the train and test files and binarize their labels.

    The text is the second tab-separated field of every loaded line.  Label
    strings are split on '_' and the literal 'None' entry is discarded.

    Returns ``(X_train, X_test, Y_train, Y_test, classes)``.
    """
    raw_train, labels_train = load_data(train_file)
    X_train = [line.split('\t')[1] for line in raw_train]
    raw_test, labels_test = load_data(test_file)
    X_test = [line.split('\t')[1] for line in raw_test]

    binarizer = MultiLabelBinarizer()
    ignored = {'None'}
    train_sets = [set(label.split('_')) - ignored for label in labels_train]
    test_sets = [set(label.split('_')) - ignored for label in labels_test]

    # Classes are learned from the training labels only.
    Y_train = binarizer.fit_transform(train_sets)
    Y_test = binarizer.transform(test_sets)
    return X_train, X_test, Y_train, Y_test, binarizer.classes_
def test_multilabelbinarizer_vs_sklearn():
    """Compare msmbuilder's MultiLabelBinarizer with the reference one.

    The reference binarizer (``MultiLabelBinarizerR``) is fitted on the
    concatenated trajectories; the binarizer under test is fitted on the
    list of trajectories.  Their encodings of the first trajectory must
    agree.
    """
    reference = MultiLabelBinarizerR()
    reference.fit(np.concatenate(trajs))

    candidate = MultiLabelBinarizer()
    candidate.fit(trajs)

    expected_first = reference.transform(trajs[0])
    actual_first = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected_first, actual_first)
def __kfold_prob_tp_fp(self, X, y, n_folds=2):
    """Return per-class probability statistics averaged over k folds.

    For every fold the model is refitted and, per class, the predicted
    probabilities of the samples predicted as that class (``tp``) and of the
    samples not predicted as that class (``fp``) are summed and divided by
    the fold's test size.  The fold averages are cached in
    ``<prefix><dir_name>/<n_folds>FCV_prob.pickle`` and reused when present.

    Returns ``[tp_av, fp_av]``.
    """
    if isinstance(y, list):
        y = np.asarray(y)

    cache_path = self.prefix + self.dir_name + "/" + str(n_folds) + "FCV_prob.pickle"
    try:
        with open(cache_path, "rb") as f:
            [tp_av, fp_av] = pickle.load(f)
    except Exception:
        # Best-effort cache: a missing or unreadable file means "recompute".
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        kf = KFold(y.shape[0], n_folds=n_folds)
        TP_avr = []
        FP_avr = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train = y[train_index]

            model = self.model
            model = model.fit(X_train, y_train)
            y_predict = model.predict(X_test)
            y_prob_predict = model.predict_proba(X_test)

            if len(y.shape) == 1:
                # Single-label targets: expand the predictions to one
                # indicator column per class.
                y_predict = MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y_predict])

            TP = []
            FP = []
            for class_ind, class_prob in zip(y_predict.transpose(), y_prob_predict.transpose()):
                n_test = len(class_ind)
                TP.append(np.sum(class_prob[class_ind == 1]) / n_test)
                FP.append(np.sum(class_prob[class_ind == 0]) / n_test)
            TP_avr.append(TP)
            FP_avr.append(FP)

        tp_av, fp_av = np.average(TP_avr, axis=0), np.average(FP_avr, axis=0)
        with open(cache_path, "wb") as f:
            pickle.dump([tp_av, fp_av], f)

    return [tp_av, fp_av]
def load_data():
    """Build the labelled photo table and a binarizer fitted on its labels.

    Business labels from ``train.csv`` are attached to photos through
    ``train_photo_to_biz_ids.csv``; photos without labels are dropped and
    the label strings are parsed into integer lists.  Only photos whose file
    is present in ``train_photos/train244`` are kept.

    Returns ``(train_ids, mlb)``.
    """
    business_labels = pd.read_csv("train.csv")
    photo_to_business = pd.read_csv("train_photo_to_biz_ids.csv")

    labels = photo_to_business.merge(business_labels, how='left', on='business_id')
    labels = labels[pd.notnull(labels['labels'])]
    labels['labels'] = labels['labels'].map(
        lambda text: [int(token) for token in text.split(" ")])

    # The photo id is the file name up to the first dot.
    file_names = os.listdir("train_photos/train244")
    photo_ids = [int(file_name.split(".")[0]) for file_name in file_names]
    train_ids = pd.DataFrame({"photo_id": photo_ids})
    train_ids = train_ids.merge(labels, on='photo_id', how='inner')

    mlb = MultiLabelBinarizer()
    mlb.fit(train_ids['labels'].tolist())

    return train_ids, mlb
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None, multilabel=False):
    """Fit the word tokenizer on sampled captions.

    Captions come from ``coco(mode='full')`` and, when ``n_sbu`` is truthy,
    additionally from ``sbuXYFilenames(n_sbu)``.

    Returns the fitted tokenizer, or ``(tokenizer, binarizer)`` when
    ``multilabel`` is true; the binarizer is fitted on the tokenized
    captions.
    """
    print("prepping the Word Tokenizer...")

    _, _, caption_sets, _ = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _, sbu_captions, _ = sbuXYFilenames(n_sbu)
        caption_sets.extend(sbu_captions)

    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(caption_sets, n_captions)
    vect.fit(captions)

    if not multilabel:
        return vect

    mlb = MultiLabelBinarizer()
    mlb.fit(vect.transform(captions))
    return vect, mlb
def __init__(self):
    """Set up the train/test file lists, feature extractors and classifier.

    ``trainExamples`` is a fixed list of pickled movie files; every file
    returned by ``util2.getMovieDataset()`` that is not in that list becomes a
    test example. The classifier is a one-vs-rest wrapper around a pipeline
    that unions color features (``DictVectorizer``) with subtitle tf-idf
    features and feeds them to an ``SGDClassifier``.
    """
    # NOTE(review): 'birdman_or_the_unexpected_virtue_of_ignorance.p' is
    # listed twice below; kept verbatim so the list content is unchanged.
    self.trainExamples = [
        'exodus_gods_and_kings.p', 'how_to_train_your_dragon_2.p', 'bears.p',
        'see_no_evil_2.p', 'addicted.p',
        "the_internet's_own_boy_the_story_of_aaron_swartz.p",
        'the_salt_of_the_earth.p', 'the_other_woman.p', 'project_almanac.p',
        'edge_of_tomorrow.p', 'maya_the_bee_movie.p',
        'cowspiracy_the_sustainability_secret.p', "let's_be_cops.p",
        "winter's_tale.p", 'the_trip_to_italy.p', 'yellowbird.p',
        'alexander_and_the_terrible_horrible_no_good_very_bad_day.p',
        'rosewater.p', 'the_hero_of_color_city.p', 'endless_love.p',
        'dracula_untold.p', 'dumb_and_dumber_to.p', 'tomorrowland.p',
        'the_hunger_games_mockingjay_part_1.p', 'tammy.p',
        'hot_tub_time_machine_2.p', 'lucy.p', 'the_lego_movie.p',
        'the_judge.p', 'cake.p', 'st_vincent.p', 'black_or_white.p',
        'american_sniper.p', 'mr_peabody_&_sherman.p',
        'this_is_where_i_leave_you.p', 'x-men_days_of_future_past.p',
        'non-stop.p', 'get_on_up.p', 'the_fault_in_our_stars.p', 'song_one.p',
        'robocop.p', 'into_the_storm.p', 'a_most_wanted_man.p',
        'the_good_lie.p', 'wild.p', 'the_maze_runner.p',
        'beyond_the_lights.p', 'divergent.p', 'spring.p',
        'as_above_so_below.p', 'noble.p', 'hercules.p', 'i-lived&y=2015.p',
        'night_at_the_museum_secret_of_the_tomb.p', 'planes:fire_&_rescue.p',
        'old_fashioned.p', 'the_identical.p',
        'dawn_of_the_planet_of_the_apes.p', 'cabin_fever_patient_zero.p',
        'ride_along.p', 'dear_white_people.p', 'if_i_stay.p', 'red_army.p',
        'the_boxtrolls.p', 'captain_america_the_winter_soldier.p',
        'virunga.p', 'the_interview.p', 'earth_to_echo.p',
        'a_walk_among_the_tombstones.p', 'persecuted.p',
        'the_book_of_life.p', 'unbroken.p', 'the_drop.p', 'need_for_speed.p',
        'brick_mansions.p', 'maleficent.p', 'blended.p', "devil's_due.p",
        'jessabelle.p', 'fear_clinic.p', 'gone_girl.p',
        'birdman_or_the_unexpected_virtue_of_ignorance.p',
        'kill_the_messenger.p', 'my_little_pony_equestria_girls.p',
        'rio_2.p', 'big_hero_6.p', 'guardians_of_the_galaxy.p', 'noah.p',
        'the_hobbit_the_battle_of_the_five_armies.p',
        'i_frankenstein.p', 'the_november_man.p', 'the_pyramid.p',
        'and_so_it_goes.p',
        'birdman_or_the_unexpected_virtue_of_ignorance.p',
        'inherent_vice.p', 'merchants_of_doubt.p', 'iris.p',
        'lambert,_stamp.p',
    ]
    # Set lookup instead of scanning the list once per dataset entry.
    train_names = set(self.trainExamples)
    self.testExamples = [x for x in util2.getMovieDataset()
                         if x not in train_names]

    # Standard DictVectorizer fitted with all colors as the features.
    self.dVec = DictVectorizer(sparse=False)
    self.dVec.fit([dict((feature, 0) for feature in util2.getColors())])

    # Standard MultiLabelBinarizer with all genre names
    self.mlb = MultiLabelBinarizer()

    color_features = Pipeline([
        ('selector', Data_Selector(key='colors')),
        ('dVec', self.dVec),
    ])
    subtitle_features = Pipeline([
        ('selector', Data_Selector(key='subs')),
        ('tfidf', TfidfVectorizer(strip_accents='ascii', max_features=15)),
    ])
    self.pipeline = Pipeline([
        ('organizeData', Movie_Data_Aggregator()),
        ('union', FeatureUnion(
            transformer_list=[
                ('colors', color_features),
                ('subs', subtitle_features),
            ],
            # Both feature groups contribute with equal weight.
            transformer_weights={
                'colors': 0.5,
                'subs': 0.5,
            },
        )),
        ('sgd', SGDClassifier(alpha=1e-06, loss="perceptron", n_iter=150,
                              penalty="l2")),
    ])

    # OneVsRestClassifier used for prediction
    self.classif = OneVsRestClassifier(self.pipeline)
def chi2(X, y):
    """Compute the chi-squared statistic between each feature and the labels.

    ``X`` is validated with ``check_array`` (CSR sparse input accepted) and
    must not contain negative values. ``y`` is expanded to an indicator matrix
    with ``MultiLabelBinarizer``; a single-column indicator is widened to two
    complementary columns. Observed counts are ``Y.T @ X``; expected counts are
    the outer product of the class frequencies and the per-feature totals.

    :raises ValueError: if ``X`` contains a negative entry
    :return: whatever ``_chisquare(observed, expected)`` returns
    """
    X = check_array(X, accept_sparse='csr')
    entries = X.data if issparse(X) else X
    if np.any(entries < 0):
        raise ValueError("Input X must be non-negative.")

    indicator = MultiLabelBinarizer().fit_transform(y)
    if indicator.shape[1] == 1:
        # Add the complementary "negative" column in front.
        indicator = np.append(1 - indicator, indicator, axis=1)

    # n_classes * n_features
    observed = safe_sparse_dot(indicator.T, X)

    feature_totals = check_array(X.sum(axis=0))
    class_freq = check_array(indicator.mean(axis=0))
    expected = np.dot(class_freq.T, feature_totals)

    return _chisquare(observed, expected)
class BaseAttributeCluster(object):
    """Cluster citations on one comma-separated attribute.

    Subclasses set ``attribute_name`` to the ``Citation`` column to use. On
    construction the records are queried, the attribute is tokenised and
    binarised, and the result is split into train and test parts.
    """

    attribute_name = ''

    def __init__(self):
        self._binariser = None
        self._kmeans_model = None
        self._records = self.get_records()
        self.data = pd.DataFrame(self._records)
        attribute_column = self.data[self.attribute_name]
        self.binarised_data = self.transform_data(attribute_column)
        self.train, self.test = train_test(self.binarised_data,
                                           self.data['class'])

    @property
    def binariser(self):
        """Lazily built ``MultiLabelBinarizer`` fitted on every token seen."""
        if self._binariser is not None:
            return self._binariser

        vocabulary = set()
        for record in self._records:
            vocabulary.update(self.tokenise(record[self.attribute_name]))

        self._binariser = MultiLabelBinarizer()
        # A single "sample" holding every token is enough to register them all.
        self._binariser.fit([list(vocabulary)])
        return self._binariser

    def kmeans_model(self, filepath=None, ignore_no_file=True, n_clusters=2):
        """Return the KMeans model, loading, fitting and saving as needed.

        :param filepath: optional path of a dill-serialised model
        :param ignore_no_file: when false, a missing ``filepath`` raises
        :param n_clusters: cluster count used if a new model is fitted
        :raises FileNotFoundError: ``filepath`` given, absent, and
            ``ignore_no_file`` is false
        """
        has_path = filepath is not None

        if has_path and os.path.exists(filepath):
            with open(filepath, 'rb') as f:
                self._kmeans_model = dill.load(f)
            return self._kmeans_model

        if has_path and not ignore_no_file:
            raise FileNotFoundError(f'Saved model not found at {filepath}.')

        if self._kmeans_model is None:
            self._kmeans_model = KMeans(n_clusters=n_clusters)
            self._kmeans_model.fit(self.train['x'])
        if has_path:
            with open(filepath, 'wb') as f:
                dill.dump(self._kmeans_model, f)
        return self._kmeans_model

    def get_records(self):
        """Query the distinct (attribute, classification id) pairs.

        :return: list of dicts with keys ``attribute_name`` and ``'class'``;
            a ``None`` attribute value is replaced by ``''``
        """
        attr = getattr(Citation, self.attribute_name)
        class_id = ManualClassification.classification_id
        with SessionManager() as session_manager:
            query = session_manager.session.query(attr, class_id)
            query = query.join(ManualClassification,
                               Citation.doi == ManualClassification.doi)
            citations = query.group_by(attr, class_id).all()

            records = []
            for c in citations:
                value = getattr(c, self.attribute_name)
                records.append({
                    self.attribute_name: '' if value is None else value,
                    'class': c.classification_id,
                })
            return records

    @staticmethod
    def tokenise(input_string):
        """Split on commas; strip and lower-case every piece."""
        tokens = []
        for piece in input_string.split(','):
            tokens.append(piece.strip().lower())
        return tokens

    def transform_data(self, data):
        """Tokenise every entry of ``data`` and binarise the token lists."""
        token_lists = [self.tokenise(x) for x in data]
        return self.binariser.transform(token_lists)
def __init__(self, **kwargs):
    """Build a one-vs-rest linear SVM and an (unfitted) label binarizer.

    :param kwargs: forwarded unchanged to ``LinearSVC`` and kept in
        ``self.paras``
    """
    base_estimator = LinearSVC(**kwargs)
    self.model = OneVsRestClassifier(base_estimator, n_jobs=1)
    self.paras = kwargs
    self.mlb = MultiLabelBinarizer()
class PandasDataset:
    """Class to simplify pre-processing steps on a dataframe.

    Requires prior understanding of the dataset.
    """

    def __init__(self):
        # Raw frame as read from CSV.
        self.original_df = None
        # Working frame with ``text`` and ``topics`` columns.
        self.current_df = None
        # MultiLabelBinarizer set by ``encode_labels``.
        self.label_encoder = None

    def from_preprocessed(self, path: str):
        """Load from a pre-processed dataset file.

        :param path: path to a pickled PandasDataset
        :return:
        """
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use on files this project produced itself.
        with open(path, 'rb') as f:
            dataset = pickle.load(f)
        self.original_df = dataset.original_df
        self.current_df = dataset.current_df
        self.label_encoder = dataset.label_encoder

    def read_data(self, filename: str = "sentisum-evaluation-dataset.csv"):
        """Load the CSV file into a workable format.

        The file is read without a header; column 0 becomes ``text`` and the
        next 14 columns are collapsed into a ``topics`` list per row with
        empty cells removed.

        :param filename: CSV file to read
        :return: the working dataframe (``text``, ``topics``)
        """
        self.original_df = pd.read_csv(filename, header=None)
        data = self.original_df.fillna('')
        label_names = ['label_' + str(idx) for idx in range(1, 15)]
        data.columns = ['text'] + label_names
        data['topics'] = data[label_names].values.tolist()
        # .copy(): the assignment below must not write into a slice of `data`.
        out_data = data[['text', 'topics']].copy()
        out_data['topics'] = out_data['topics'].map(
            lambda x: [top for top in x if top != ''])
        self.current_df = out_data
        return self.current_df

    def replace_labels(self, label: str, target: str):
        """Replace all occurrences of a label with the target label.

        :param label: source label
        :param target: target label
        :return:
        """

        def replace_lab(x):
            return [top if top != label else target for top in x]

        self.current_df.topics = self.current_df.topics.map(replace_lab)

    def merge_labels(self,
                     minimum_samples: int = 100,
                     minority_label: str = 'others'):
        """Merge labels with fewer than ``minimum_samples`` occurrences.

        A rare label is renamed to ``"<minority_label> <last word>"`` where
        the last word is the final space-separated token of the old label.

        :param minimum_samples: labels with fewer occurrences are merged
        :param minority_label: name for the common label
        :return: the working dataframe
        """
        label_counts = self.current_df.topics.explode().value_counts()
        # Iterate (label, count) pairs instead of indexing the label-indexed
        # Series with an integer position.
        label_others = {
            label for label, count in label_counts.items()
            if count < minimum_samples
        }

        def replace_others(x):
            new_labels = []
            for top in x:
                if top in label_others:
                    sent = top.split(' ')[-1]
                    new_labels.append(' '.join([minority_label, sent]))
                else:
                    new_labels.append(top)
            return new_labels

        self.current_df.topics = self.current_df.topics.map(replace_others)
        return self.current_df

    def undersample_label(self, topic: str, fraction: float):
        """Undersample a given label. Only rows with a single label are affected.

        :param topic: label to undersample
        :param fraction: fraction to retain
        :return:
        """
        topic_lengths = self.current_df.topics.str.len()
        has_topic = self.current_df.topics.apply(lambda x: topic in x)

        temp_df = self.current_df[has_topic]
        temp_df = temp_df[temp_df.topics.str.len() == 1].sample(frac=fraction)

        single_label_data = self.current_df[topic_lengths == 1]
        drop_index = single_label_data[single_label_data.topics.apply(
            lambda x: topic in x)].index
        self.current_df = self.current_df.drop(drop_index)
        # pd.concat replaces DataFrame.append (removed in pandas 2.0).
        self.current_df = pd.concat([self.current_df, temp_df])

    def undersample_label_combo(self, topic_a: str, topic_b: str,
                                fraction: float):
        """Undersample rows whose topics are exactly ``[topic_a, topic_b]``.

        todo Add a combo with more than 2 topics

        :param topic_a: first label of the pair
        :param topic_b: second label of the pair
        :param fraction: fraction to retain
        :return:
        """
        combo = [topic_a, topic_b]
        topic_lengths = self.current_df.topics.str.len()

        temp_df = self.current_df[self.current_df.topics.apply(
            lambda x: x == combo)]
        temp_df = temp_df[temp_df.topics.str.len() == 2].sample(frac=fraction)

        double_label_data = self.current_df[topic_lengths == 2]
        drop_index = double_label_data[double_label_data.topics.apply(
            lambda x: x == combo)].index
        self.current_df = self.current_df.drop(drop_index)
        self.current_df = pd.concat([self.current_df, temp_df])

    def overview(self):
        """Give a quick overview of the current dataframe.

        :return: dict with label counts, unique labels, and the mean number of
            whitespace tokens and of ``.``-separated pieces per text
        """
        exploded = self.current_df.topics.explode()
        return {
            "value_counts": exploded.value_counts(),
            "labels": exploded.unique(),
            # These keys promise means; the previous code returned .std().
            "mean no. of tokens":
            self.current_df.text.str.split().str.len().mean(),
            "mean no. of sentences":
            self.current_df.text.str.split('.').str.len().mean()
        }

    def encode_labels(self):
        """Encode the label classes for classification using MultiLabelBinarizer.

        Topics are converted to tuples and the encoding is stored in a new
        ``encoded`` column.

        :return: class list
        """
        self.current_df.topics = self.current_df.topics.map(tuple)
        self.label_encoder = MultiLabelBinarizer()
        self.current_df['encoded'] = self.label_encoder.fit_transform(
            self.current_df.topics.tolist()).tolist()
        return self.label_encoder.classes_.tolist()

    def train_test_split(self, test_size: float = 0.2):
        """Generate train and test sets.

        :param test_size: test set fraction
        :return: train_dataset, test_dataset (both with a fresh index)
        """
        # Resolves to the module-level train_test_split, not this method.
        train_dataset, test_dataset = train_test_split(self.current_df,
                                                       test_size=test_size)
        train_dataset = train_dataset.reset_index(drop=True)
        test_dataset = test_dataset.reset_index(drop=True)
        return train_dataset, test_dataset

    def save_dataset(self, path: str):
        """Pickle this object to ``path``.

        :param path: destination file
        :return:
        """
        with open(path, 'wb') as output:
            pickle.dump(self, output)
if dat == 'forestcover': dataset = fetch_covtype(shuffle=True) X = dataset.data y = dataset.target # normal data are those with attribute 2 # abnormal those with attribute 4 s = (y == 2) + (y == 4) X = X[s, :] y = y[s] y = (y != 2).astype(int) print_outlier_ratio(y) print('--- Vectorizing data...') if dat == 'SF': lb = MultiLabelBinarizer() x1 = lb.fit_transform(X[:, 1]) X = np.c_[X[:, :1], x1, X[:, 2:]] y = (y != b'normal.').astype(int) print_outlier_ratio(y) if dat == 'SA': lb = MultiLabelBinarizer() x1 = lb.fit_transform(X[:, 1]) x2 = lb.fit_transform(X[:, 2]) x3 = lb.fit_transform(X[:, 3]) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] y = (y != b'normal.').astype(int) print_outlier_ratio(y) if dat in ('http', 'smtp'):
def main():
    """Train a small network and a linear regression on ingredient indicators.

    Loads a pickled dataframe from ``store.pckl``, keeps rows with non-empty
    ``ingredients_text``, shuffles and splits it 60/20/20, binarizes the
    ingredient lists and regresses ``nutrition-score-fr_100g`` on them with
    ``NeuralNet`` (MSE loss, Adam) for 19 epochs, printing per-epoch
    statistics. A ``LinearRegression`` is then fitted on the same training
    data. Execution stops in the debugger twice for manual inspection.
    """
    # NOTE(review): pickle.load runs arbitrary code from the file; only use
    # with a store.pckl produced by this project.
    with open('store.pckl', 'rb') as f:
        eng_data = pickle.load(f)

    eng_data = eng_data[eng_data.ingredients_text.apply(len) > 0].reset_index()
    np.random.seed(1234)
    train, validate, test = np.split(
        eng_data.sample(frac=1, random_state=134),
        [int(.6 * len(eng_data)), int(.8 * len(eng_data))])

    mlb = MultiLabelBinarizer()
    X_train = mlb.fit_transform(train['ingredients_text']).astype(np.float32)
    y_train = train['nutrition-score-fr_100g'].values
    train_dataset = torch.utils.data.TensorDataset(
        torch.from_numpy(X_train).float(),
        torch.from_numpy(y_train).float())
    all_ing = len(X_train[0])

    # neural network
    model = NeuralNet(all_ing)
    print(model)
    train_loader = DataLoader(train_dataset,
                              batch_size=200,
                              shuffle=True,
                              num_workers=4)

    # start training
    learning_rate = 0.001
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    for epoch in range(1, 20):
        total_loss = 0
        correct = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output[:, 0], target)
            loss.backward()
            optimizer.step()
            # Per-batch progress logging was disabled here:
            # if batch_idx % 10 == 0:
            #     print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            #         epoch, batch_idx * len(data), len(train_loader.dataset),
            #         100. * batch_idx / len(train_loader), loss.data[0]))

            # accumulate the loss of each minibatch
            # NOTE(review): loss.data[0] is the pre-0.4 PyTorch idiom; newer
            # releases need loss.item(). Left unchanged because the installed
            # version is not visible from here.
            total_loss += loss.data[0] * len(data)
            # compute the accuracy per minibatch
            # NOTE(review): argmax-vs-target "accuracy" on a regression
            # target looks like a leftover from a classifier; confirm.
            pred_classes = output.data.max(1, keepdim=True)[1]
            correct += pred_classes.eq(
                target.data.view_as(pred_classes).long()).sum().double()

        # compute the mean loss for each epoch
        mean_loss = total_loss / len(train_loader.dataset)
        # compute the accuracy for each epoch
        acc = correct / len(train_loader.dataset)
        print(
            'Train Epoch: {} Avg_Loss: {:.5f} Acc: {}/{} ({:.3f}%)'.format(
                epoch, mean_loss, correct, len(train_loader.dataset),
                100. * acc))

    # NOTE(review): debugger breakpoint left in; remove for unattended runs.
    pdb.set_trace()

    validate.ingredients_text = validate.ingredients_text.str.replace(
        'strawberry candy', '')
    X_val = mlb.transform(validate['ingredients_text']).astype(np.float32)
    y_test = test['nutrition-score-fr_100g'].values

    regr = lm.LinearRegression()
    # 1
    regr.fit(X_train, y_train)
    # NOTE(review): second breakpoint; X_val, y_test and regr are only
    # inspected interactively from here.
    pdb.set_trace()
def __get_data(attr=None, short=False, raw=False, mem=True, source=False):
    """Get training, test, and validation data for X and y, handling the logic
    of retrieving single-label vs. multi-label data.

    :param attr: label selector: a column name, ``"<a>_from_<b>"`` (uses
        column ``<a>``) or ``"<a>+<b>"`` (uses both columns, multi-label)
    :param short: return only the ``y_*`` splits
    :param raw: return the raw labels DataFrame
    :param mem: unused in this function
    :param source: skip the labels entirely and return only the ``x_*`` splits
    :return: the DataFrame (``raw``) or the result of ``sublocals`` for the
        requested split names
    """
    if not source:
        # Read the labels, and split them between training, validation, and test sets.
        print('reading the labels... ', end='', flush=True)
        df = pd.read_csv(labels_path)
        print('done!', flush=True)
        if raw:
            return df

        if attr:
            if 'from' in attr:
                y = df[re.search(r'(.+)_from_(.+)', attr).groups()[0]]
            elif '+' in attr:
                # Raw string: '\+' in a plain literal is an invalid escape.
                y = df[list(re.search(r'(.+)\+(.+)', attr).groups())]
            else:
                y = df[attr]
        assert (attr is not None)

        # Convert the class vectors to binary class matrices.
        if '+' in attr:
            # two options:
            #   I.  multi-labeled matrix (each observation has multiple
            #       columns set to 1)
            #   II. expanded categories (each observation has one column
            #       set to 1)
            print('extracting multi-label data via one-hot-encoding... ',
                  end='', flush=True)
            # Each cell becomes "<column name><value>" so values from
            # different columns cannot collide.
            multilabels = np.tile(y.columns, (y.shape[0], 1)).astype(object)
            multilabels += df[y.columns].values.astype(str)
            if MULTI_AS_ONEHOT_DISTINCT_CLASSES:
                # this is option II: one class per combination of values
                multilabels = multilabels.sum(axis=1)
                combo_binarizer = LabelBinarizer()
                combo_binarizer.fit(np.unique(multilabels))
                y = combo_binarizer.transform(multilabels)
                print()
                print(multilabels)
                print(y)
                pickle_save(combo_binarizer, binarizer_path.format(attr))
            else:
                # this is option I
                known_labels = []
                for column in y.columns:
                    template = column + '{}'
                    for value in df[column].unique():
                        known_labels.append(template.format(value))
                multi_binarizer = MultiLabelBinarizer()
                multi_binarizer.fit([known_labels])
                y = multi_binarizer.transform(multilabels)
                pickle_save(multi_binarizer,
                            binarizer_path.format(attr + '__multi'))
        else:
            print('extracting label data via one-hot-encoding... ',
                  end='', flush=True)
            y = to_categorical(y)

        # The y_* names are looked up by name through locals() below; do not
        # rename them.
        y_train = y[:num_train]
        y_valid = y[num_train:num_train + num_valid]
        y_test = y[num_train + num_valid:]
        print('done!', flush=True)
        if short:
            return sublocals(locals(), 'y_train', 'y_test', 'y_valid')

    # Read the images, and split them between training, validation, and test sets.
    print('reading the scaled features... ', end='', flush=True)
    X = np.load(scaled_data_path)
    # The x_* names are also read through locals().
    x_train = X[:num_train, :]
    x_valid = X[num_train:num_train + num_valid, :]
    x_test = X[num_train + num_valid:, :]

    print('--------------------------------------------')
    print(x_train.shape[0], 'training samples')
    print('x_train shape:', x_train.shape)
    print(x_valid.shape[0], 'validation samples')
    print('x_valid shape:', x_valid.shape)
    print(x_test.shape[0], 'test samples')
    print('x_test shape:', x_test.shape)
    if source:
        return sublocals(locals(), 'x_train', 'x_test', 'x_valid')

    print('y_train.shape:', y_train.shape)
    print('y_valid.shape:', y_valid.shape)
    print('y_test.shape:', y_test.shape)
    print('--------------------------------------------')
    return sublocals(locals(), 'x_train', 'x_test', 'x_valid', 'y_train',
                     'y_test', 'y_valid')
# Vectorize the documents; the vocabulary is learned on the training set only.
vectorizer = TfidfVectorizer(tokenizer=tokenize)
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)
print(vectorised_train_documents.shape)
print(vectorised_test_documents.shape)

# Compute the tf-idf index
tfidf_transformer = TfidfTransformer()
vectorised_train_tfidf_documents = tfidf_transformer.fit_transform(
    vectorised_train_documents)
print(vectorised_train_tfidf_documents.shape)

# Extract the expected values (one list of Reuters categories per document)
train_categories = [reuters.categories(doc_id) for doc_id in train_docs_id]
test_categories = [reuters.categories(doc_id) for doc_id in test_docs_id]
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_categories)
test_labels = mlb.transform(test_categories)

# Prediction and analysis of the results
multinomial, bernoulli, macqa, bacqa, macc, bacc = ([] for _ in range(6))
iter = [170, 200, 500, 800, 1000, 2000, 5000, 10000, 15000, 18000]
def __init__(self, **hyperparams):
    """Keep the hyperparameters and build the wrapped ``Op`` from them.

    :param hyperparams: keyword arguments forwarded unchanged to ``Op``
    """
    self._hyperparams = hyperparams
    self._wrapped_model = Op(**hyperparams)
embeddings = numpy.load(embedding_file)
# Context manager so the vocabulary file handle is closed right away.
with open(vocab_file) as vocab_fh:
    vocab = json.load(vocab_fh)
int2vocab = {i: w for w, i in vocab.items()}
embedding_dim = embeddings[0].shape[0]

print('Loading corpus', corpus_file)
dataset: SplitDataSet = split_corpus(corpus_file)
print(len(dataset.y_train), 'training samples')
print(len(dataset.y_test), 'test samples')
print(len(dataset.y_dev), 'dev samples')

# Label columns are defined by the training split only.
mlb = MultiLabelBinarizer().fit(dataset.y_train)
num_classes = mlb.classes_.shape[0]
train_y = mlb.transform(dataset.y_train)
test_y = mlb.transform(dataset.y_test)
dev_y = mlb.transform(dataset.y_dev)

# Map every text to the ids of its in-vocabulary, lower-cased word tokens.
# Out-of-vocabulary words are dropped. The pattern is a raw string ('\w' in a
# plain literal is an invalid escape) and is compiled once for all splits.
word_pattern = re.compile(r'\w+')
train_x_int, test_x_int, dev_x_int = (
    [[vocab[w] for w in word_pattern.findall(text.lower()) if w in vocab]
     for text in split]
    for split in (dataset.x_train, dataset.x_test, dataset.x_dev)
)
def audio_tagging_results(reference, estimated):
    """audio_tagging_results. Returns clip-level F1 Scores

    Both frames are reduced to clip level, their ``event_label`` entries are
    encoded with one shared ``MultiLabelBinarizer``, and the frames are
    outer-merged on ``filename``. Precision, recall and F1 are reported per
    label, followed by one ``macro`` and one ``micro`` row.

    :param reference: The ground truth dataframe as pd.DataFrame
    :param estimated: Predicted labels by the model ( thresholded )
    :return: DataFrame with columns ``label, f1, precision, recall``; it has
        no rows when ``estimated`` is empty
    """
    if "event_label" in reference.columns:
        # One encoder over the labels of both frames so the columns line up.
        classes = (reference.event_label.dropna().unique().tolist() +
                   estimated.event_label.dropna().unique().tolist())
        encoder = MultiLabelBinarizer().fit([classes])
        reference = get_audio_tagging_df(reference)
        estimated = get_audio_tagging_df(estimated)
        ref_labels, _ = utils.encode_labels(reference['event_label'],
                                            encoder=encoder)
        reference['event_label'] = ref_labels.tolist()
        est_labels, _ = utils.encode_labels(estimated['event_label'],
                                            encoder=encoder)
        estimated['event_label'] = est_labels.tolist()

    matching = reference.merge(estimated,
                               how='outer',
                               on="filename",
                               suffixes=["_ref", "_pred"])

    def na_values(val):
        # Files present on only one side get NaN from the outer merge;
        # treat them as "no label active".
        if isinstance(val, np.ndarray):
            return val
        if isinstance(val, list):
            return np.array(val)
        if pd.isna(val):
            return np.zeros(len(encoder.classes_))
        return val

    ret_df = pd.DataFrame(columns=['label', 'f1', 'precision', 'recall'])
    if estimated.empty:
        return ret_df

    matching['event_label_pred'] = matching.event_label_pred.apply(na_values)
    matching['event_label_ref'] = matching.event_label_ref.apply(na_values)

    y_true = np.vstack(matching['event_label_ref'].values)
    y_pred = np.vstack(matching['event_label_pred'].values)
    ret_df.loc[:, 'label'] = encoder.classes_
    for avg in [None, 'macro', 'micro']:
        avg_f1 = skmetrics.f1_score(y_true, y_pred, average=avg)
        avg_pre = skmetrics.precision_score(y_true, y_pred, average=avg)
        avg_rec = skmetrics.recall_score(y_true, y_pred, average=avg)

        if avg is None:
            # Add for each label non pooled stats
            ret_df.loc[:, 'precision'] = avg_pre
            ret_df.loc[:, 'recall'] = avg_rec
            ret_df.loc[:, 'f1'] = avg_f1
        else:
            # Append macro and micro results in last 2 rows.
            # pd.concat replaces DataFrame.append (removed in pandas 2.0).
            pooled_row = pd.DataFrame([{
                'label': avg,
                'precision': avg_pre,
                'recall': avg_rec,
                'f1': avg_f1,
            }])
            ret_df = pd.concat([ret_df, pooled_row], ignore_index=True)
    return ret_df
def dump_multilabel_classification(
        model, suffix="", folder=None, allow_failure=None, verbose=False,
        label_string=False, first_class=0, comparable_outputs=None,
        target_opset=None):
    """
    Trains and dumps a model for a multi-label classification problem.
    The function trains a model and calls
    :func:`dump_data_and_model`.

    The model is fitted and dumped twice: first on a fixed six-sample problem
    with three labels (basename infix ``Mcl``), then on 40 samples produced by
    ``make_multilabel_classification`` restricted to two features (basename
    infix ``RndMla``). ``target_opset`` is forwarded to both conversions.

    Every created filename will follow the pattern:
    ``<folder>/<prefix><task><classifier-name><suffix>.<data|expected|model|onnx>.<pkl|onnx>``.
    """
    input_types = [("input", FloatTensorType([None, 2]))]

    # --- first problem: small hand-written dataset ---
    X = [[0, 1], [1, 1], [2, 0], [0.5, 0.5], [1.1, 1.1], [2.1, 0.1]]
    X = numpy.array(X, dtype=numpy.float32)
    if label_string:
        y = [["l0"], ["l1"], ["l2"], ["l0", "l1"], ["l1"], ["l2"]]
    else:
        y = [[0 + first_class], [1 + first_class], [2 + first_class],
             [0 + first_class, 1 + first_class],
             [1 + first_class], [2 + first_class]]
    y = MultiLabelBinarizer().fit_transform(y)
    model.fit(X, y)
    if verbose:
        print("[make_multilabel_classification] model '{}'".format(
            model.__class__.__name__))
    model_onnx, prefix = convert_model(
        model, "multi-class classifier", input_types,
        target_opset=target_opset)
    if verbose:
        print("[make_multilabel_classification] model was converted")
    dump_data_and_model(
        X.astype(numpy.float32),
        model,
        model_onnx,
        folder=folder,
        allow_failure=allow_failure,
        basename=prefix + "Mcl" + model.__class__.__name__ + suffix,
        verbose=verbose,
        comparable_outputs=comparable_outputs,
    )

    # --- second problem: random multi-label dataset ---
    X, y = make_multilabel_classification(40, n_features=4, random_state=42,
                                          n_classes=3)
    X = X[:, :2]
    model.fit(X, y)
    if verbose:
        print("[make_multilabel_classification] model '{}'".format(
            model.__class__.__name__))
    # target_opset was previously dropped here, so the second model could be
    # converted with a different opset than the first.
    model_onnx, prefix = convert_model(
        model, "multi-class classifier", input_types,
        target_opset=target_opset)
    if verbose:
        print("[make_multilabel_classification] model was converted")
    dump_data_and_model(
        X[:10].astype(numpy.float32),
        model,
        model_onnx,
        folder=folder,
        allow_failure=allow_failure,
        basename=prefix + "RndMla" + model.__class__.__name__ + suffix,
        verbose=verbose,
        comparable_outputs=comparable_outputs,
    )
def tokenSequenceToPianoRoll(token_sequence, int_to_combi):
    """Convert a token sequence into a binary piano roll.

    Every token is looked up in ``int_to_combi`` to get its collection of
    values, and each collection is binarized against the fixed class set
    0..127.

    :param token_sequence: iterable of keys of ``int_to_combi``
    :param int_to_combi: mapping from token to a collection of values
    :return: indicator matrix with one row per token and 128 columns
    """
    binarizer = MultiLabelBinarizer()
    # One sample containing every value registers all 128 classes.
    binarizer.fit([list(range(128))])
    combos = []
    for token in token_sequence:
        combos.append(int_to_combi[token])
    return binarizer.transform(combos)
clean_txt = re.sub('[^a-z\s]+',' ',text) # replacing spcl chars, punctuations by space clean_txt = re.sub('(\s+)',' ',clean_txt) # replacing multiple spaces by single space min_length = 3 words = map(lambda word: word.lower(), word_tokenize(clean_txt)) # tokenizing, lowercase words = [word for word in words if word not in Stop_Words] # filtering stopwords words = filter(lambda t: len(t)>=min_length, words) # filtering words of length <=2 tokens =(list(map(lambda token: PorterStemmer().stem(token),words))) # stemming tokens return tokens n_classes = 10 labels = categories stop_words = stopwords.words("english") mlb = MultiLabelBinarizer() docs = {} docs['train'] = [reuters.raw(doc_id) for doc_id in train] docs['test'] = [reuters.raw(doc_id) for doc_id in test] trd = docs['train'] tstd = docs['test'] y_tr = mlb.fit_transform(x1) y_tst = mlb.fit_transform(x2) t_d_tr = [tokenize(dd) for dd in trd]#tokenized training docs t_d_tst = [tokenize(ddd) for ddd in tstd] def read_corpus(fname):
imgs_train, labels_train, imgs_valid, labels_valid = dataset.train_test_split(
    test_shape=0.1)
# NOTE(review): the training split returned above is replaced by the complete
# original data here, so the validation samples may also be trained on.
# Confirm this is intended.
labels_train = np.array(dataset.labels_origin)
imgs_train = np.array(dataset.imgs_origin)

# Each label string holds several labels joined by "_".
labels_train_splite = [labels.split("_") for labels in labels_train]
labels_valid_splite = [labels.split("_") for labels in labels_valid]

mlb = MultiLabelBinarizer()
labels_train = mlb.fit_transform(labels_train_splite)
# transform(), not fit_transform(): refitting on the validation labels would
# redefine the columns whenever the validation set lacks a class.
labels_valid = mlb.transform(labels_valid_splite)
labels_train = io.label_smooth(labels_train, [0, 1, 4, 5])

for (i, label) in enumerate(mlb.classes_):
    print("{}. {}".format(i + 1, label))

# %% Data preprocessing
imgs_train = np.array(imgs_train, dtype="float32")
imgs_valid = np.array(imgs_valid, dtype="float32")

normalization_datagen = ImageDataGenerator(featurewise_center=True,
                                           featurewise_std_normalization=True)
class BeautyDataLoader:
    """Build stratified K-fold image datasets with multi-label targets.

    On construction the CSV at ``config.dataset_path`` is read, split into
    ``config.KFolds`` stratified folds on ``isbeauty``, and the
    ``[isbeauty, skill]`` pairs of every fold are binarized.
    """

    def __init__(self):
        self.config, _ = config.get_config()
        self.df = pd.read_csv(self.config.dataset_path)
        self.df['id'] = list(range(1, len(self.df) + 1))
        # NOTE: an earlier ``self.df.set_index('id')`` discarded its result
        # and was a no-op. It is deliberately not "fixed": the fold code below
        # looks rows up by the default 0-based index.
        self.startified_splits()
        self.generate_label_encodings()
        self.transform_labels()

    def data_loader(self):
        pass

    def train_test_split(self, split=.10):
        pass

    def startified_splits(self):
        """Fill ``self.dataset_split`` with per-fold file and label lists."""
        self.dataset_split = {
            'train_X': [],
            'train_y': [],
            'valid_X': [],
            'valid_y': [],
        }
        skf = SKF(
            n_splits=self.config.KFolds,
            shuffle=True,
            random_state=42
        )
        data = self.df['file_path']
        labels = self.df['isbeauty']
        beauty = self.df['isbeauty']
        skill = self.df['skill']

        for train_index, test_index in skf.split(data, labels):
            # ``d in data`` / ``d in labels`` test membership in the index.
            self.dataset_split['train_X'].append(
                [data[d] for d in train_index if d in data])
            self.dataset_split['valid_X'].append(
                [data[d] for d in test_index if d in data])
            self.dataset_split['train_y'].append(
                [[beauty[d], skill[d]] for d in train_index if d in labels])
            self.dataset_split['valid_y'].append(
                [[beauty[d], skill[d]] for d in test_index if d in labels])

    def get_label_names(self):
        return self.mlb.classes_

    def generate_label_encodings(self):
        """Fit ``self.mlb`` on the ``[isbeauty, skill]`` pairs of the frame."""
        labels = []
        # NOTE(review): the first row is skipped and rows equal to the column
        # names are ignored, presumably to guard against a repeated header
        # line. Confirm against the data.
        for b, sk in zip(self.df.isbeauty[1:], self.df.skill[1:]):
            if b != 'isbeauty' and sk != 'skill':
                labels.append([b, sk])
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(labels)

    def transform_labels(self):
        """Replace the per-fold label pairs by their binarized rows."""
        train_labels, valid_labels = [], []
        for train_y_set_i, valid_y_set_i in zip(self.dataset_split['train_y'],
                                                self.dataset_split['valid_y']):
            train_labels.append(list(self.mlb.transform(train_y_set_i)))
            valid_labels.append(list(self.mlb.transform(valid_y_set_i)))
        self.dataset_split['train_y'] = train_labels
        self.dataset_split['valid_y'] = valid_labels

    def normalize_img(self, filenames, label):
        """
        Function to normalize image between 0 and 1
        Args:
            filenames ([str]): complete path of the image file
            label ([list]): multi-class label for the image
        Returns:
            (image, label); ``None`` if an exception was caught and printed.
        """
        try:
            img = tf.io.read_file(filenames)
            image_vec = tf.image.decode_jpeg(img, channels=CHANNELS)
            # resize and normalize
            img_norm = tf.image.resize(image_vec, [IMG_SIZE, IMG_SIZE]) / 255.0
            return img_norm, label
        except Exception as e:
            print(e)

    def create_dataset(self, fold=0, is_training=True):
        """
        Build the batched train and validation datasets of one fold.
        Here fold 0 is first dataset from all the folds created.
        Args:
            fold (int, optional): index of the fold to use. Defaults to 0.
            is_training (bool, optional): currently unused. Defaults to True.
        Returns:
            dict with keys 'train' and 'valid'.
        """
        # Adapt preprocessing and prefetching dynamically
        AUTOTUNE = tf.data.experimental.AUTOTUNE

        splits = [
            ('train',
             self.dataset_split['train_X'][fold],
             self.dataset_split['train_y'][fold]),
            # The validation set was previously built from train_X/train_y,
            # i.e. the model was validated on its own training data.
            ('valid',
             self.dataset_split['valid_X'][fold],
             self.dataset_split['valid_y'][fold]),
        ]

        dataset = {}
        for typ, filenames, labels in splits:
            split_data = tf.data.Dataset.from_tensor_slices((filenames, labels))
            # normalize images
            split_data = split_data.map(
                self.normalize_img,
                num_parallel_calls=AUTOTUNE
            )
            # Batch the data for multiple steps
            split_data = split_data.batch(32, drop_remainder=True)
            # Fetch batches in the background while the model is training.
            split_data = split_data.prefetch(buffer_size=self.config.autotune)
            dataset[typ] = split_data
        print('Dataset Creation done for {} fold'.format(fold))
        return dataset
def find_label_issues(
    labels,
    pred_probs,
    *,
    confident_joint=None,
    filter_by="prune_by_noise_rate",
    return_indices_ranked_by=None,
    rank_by_kwargs={},
    multi_label=False,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    min_examples_per_class=1,
    n_jobs=None,
    verbose=False,
):
    """
    Identifies potential label issues in the dataset using confident learning.

    Returns a boolean mask for the entire dataset where ``True`` represents
    a label issue and ``False`` represents an example that is
    confidently/accurately labeled.

    Instead of a mask, you can obtain *indices* of the label issues in your dataset
    by setting `return_indices_ranked_by` to specify the label quality
    score used to order the label issues.

    The number of indices returned is controlled by `frac_noise`: reduce its
    value to identify fewer label issues. If you aren't sure, leave this set to 1.0.

    Tip: if you encounter the error "pred_probs is not defined", try setting
    ``n_jobs=1``.

    Parameters
    ----------
    labels : np.array
      A discrete vector of noisy labels, i.e. some labels may be erroneous.
      *Format requirements*: for dataset with K classes, labels must be in 0, 1, ..., K-1.
      Must have the same length as `pred_probs`.

    pred_probs : np.array
      An array of shape ``(N, K)`` of model-predicted probabilities,
      ``P(label=k|x)``. Each row of this matrix corresponds
      to an example `x` and contains the model-predicted probabilities that
      `x` belongs to each possible class, for each of the K classes. The
      columns must be ordered such that these probabilities correspond to
      class 0, 1, ..., K-1.

      **Caution**: `pred_probs` from your model must be out-of-sample!
      You should never provide predictions on the same examples used to train the model,
      as these will be overfit and unsuitable for finding label-errors.
      To obtain out-of-sample predicted probabilities for every datapoint in your dataset, you can use :ref:`cross-validation <pred_probs_cross_val>`.
      Alternatively it is ok if your model was trained on a separate dataset and you are only evaluating
      data that was previously held-out.

    confident_joint : np.array, optional
      An array of shape ``(K, K)`` representing the confident joint, the matrix used for identifying label issues, which
      estimates a confident subset of the joint distribution of the noisy and true labels, ``P_{noisy label, true label}``.
      Entry ``(j, k)`` in the matrix is the number of examples confidently counted into the pair of ``(noisy label=j, true label=k)`` classes.
      The `confident_joint` can be computed using :py:func:`count.compute_confident_joint <cleanlab.count.compute_confident_joint>`.
      If not provided, it is computed from the given (noisy) `labels` and `pred_probs`.
      It is also recomputed (and any provided value ignored) when
      ``filter_by='confident_learning'``.

    filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given'}, default='prune_by_noise_rate'
      Method used for filtering/pruning out the label issues:

      - ``'prune_by_noise_rate'``: works by removing examples with *high probability* of being mislabeled for every non-diagonal in the confident joint (see `prune_counts_matrix` in `filter.py`). These are the examples where (with high confidence) the given label is unlikely to match the predicted label for the example.
      - ``'prune_by_class'``: works by removing the examples with *smallest probability* of belonging to their given class label for every class.
      - ``'both'``: Removes only the examples that would be filtered by both ``'prune_by_noise_rate'`` and ``'prune_by_class'``.
      - ``'confident_learning'``: Returns the examples in the off-diagonals of the confident joint. These are the examples that are confidently predicted to be a different label than their given label.
      - ``'predicted_neq_given'``: Find examples where the predicted class (i.e. argmax of the predicted probabilities) does not match the given label.

    return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default=None
      If ``None``, returns a boolean mask (``True`` if example at index is label error).
      If not ``None``, returns an array of the label error indices
      (instead of a boolean mask) where error indices are ordered:

      - ``'normalized_margin'``: ``normalized margin (p(label = k) - max(p(label != k)))``
      - ``'self_confidence'``: ``[pred_probs[i][labels[i]] for i in label_issues_idx]``
      - ``'confidence_weighted_entropy'``: ``entropy(pred_probs) / self_confidence``

    rank_by_kwargs : dict, optional
      Optional keyword arguments to pass into scoring functions for ranking by
      label quality score (see :py:func:`rank.get_label_quality_scores
      <cleanlab.rank.get_label_quality_scores>`).

    multi_label : bool, optional
      If ``True``, labels should be an iterable (e.g. list) of iterables, containing a
      list of labels for each example, instead of just a single label.
      The multi-label setting supports classification tasks where an example has 1 or more labels.
      Example of a multi-labeled `labels` input: ``[[0,1], [1], [0,2], [0,1,2], [0], [1], ...]``.

    frac_noise : float, default=1.0
      Used to only return the "top" ``frac_noise * num_label_issues``. The choice of which "top"
      label issues to return is dependent on the `filter_by` method used. It works by reducing the
      size of the off-diagonals of the `joint` distribution of given labels and true labels
      proportionally by `frac_noise` prior to estimating label issues with each method
      (i.e. roughly ``frac_noise * number_of_mislabeled_examples_in_class_k`` per class).
      This parameter only applies for `filter_by=both`, `filter_by=prune_by_class`, and
      `filter_by=prune_by_noise_rate` methods and currently is unused by other methods.
      When ``frac_noise=1.0``, return all "confident" estimated noise indices (recommended).

    num_to_remove_per_class : array_like
      An iterable of length K, the number of classes.
      E.g. if K = 3, ``num_to_remove_per_class=[5, 0, 1]`` would return
      the indices of the 5 most likely mislabeled examples in class 0,
      and the most likely mislabeled example in class 2.

      Note
      ----
      Only set this parameter if ``filter_by='prune_by_class'``.
      You may use with ``filter_by='prune_by_noise_rate'``, but
      if ``num_to_remove_per_class=k``, then either k-1, k, or k+1
      examples may be removed for any class due to rounding error. If you need
      exactly 'k' examples removed from every class, you should use
      ``filter_by='prune_by_class'``.

    min_examples_per_class : int, default=1
      Minimum number of examples per class to avoid flagging as label issues.
      This is useful to avoid deleting too much data from one class
      when pruning noisy examples in datasets with rare classes.

    n_jobs : optional
      Number of worker processes used by multiprocessing. Default ``None``
      sets to the number of cores on your CPU.
      Set this to 1 to *disable* parallel processing (if its causing issues).
      Windows users may see a speed-up with ``n_jobs=1``.

    verbose : optional
      If ``True``, prints when multiprocessing happens and the number of
      label issues found.

    Returns
    -------
    label_issues : np.array
      A boolean mask for the entire dataset where ``True`` represents a
      label issue and ``False`` represents an example that is accurately
      labeled with high confidence.

      Note
      ----
      You can also return the *indices* of the label issues in your dataset by setting
      `return_indices_ranked_by`.

    Raises
    ------
    AssertionError
      If `filter_by` is not one of the supported methods, if `labels` and
      `pred_probs` differ in length, or if `n_jobs` is given and is < 1.
    ValueError
      If `num_to_remove_per_class` is set together with
      ``filter_by='confident_learning'`` or ``filter_by='predicted_neq_given'``.
    """

    assert filter_by in [
        "prune_by_noise_rate",
        "prune_by_class",
        "both",
        "confident_learning",
        "predicted_neq_given",
    ]  # TODO: change default to confident_learning ?
    assert len(labels) == len(pred_probs)
    # frac_noise / num_to_remove_per_class only make sense for the pruning
    # filters; warn when they are combined with the other two methods.
    if filter_by in [
            "confident_learning", "predicted_neq_given"
    ] and (frac_noise != 1.0 or num_to_remove_per_class is not None):
        warn_str = (
            "WARNING! frac_noise and num_to_remove_per_class parameters are only supported"
            " for filter_by 'prune_by_noise_rate', 'prune_by_class', and 'both'. They "
            "are not supported for methods 'confident_learning' or "
            "'predicted_neq_given'.")
        warnings.warn(warn_str)
    if (num_to_remove_per_class is not None) and (filter_by in [
            "confident_learning", "predicted_neq_given"
    ]):
        # TODO - add support for these two filters
        raise ValueError(
            "filter_by 'confident_learning' or 'predicted_neq_given' is not supported (yet) when setting 'num_to_remove_per_class'"
        )

    # Set-up number of multiprocessing processes
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    else:
        assert n_jobs >= 1

    # Number of examples in each class of labels
    if multi_label:
        # Flatten the per-example label lists before counting.
        label_counts = value_counts([i for lst in labels for i in lst])
    else:
        label_counts = value_counts(labels)
    # Number of classes labels
    K = len(pred_probs.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(labels) > 1e8
    # Ensure labels are of type np.array()
    labels = np.asarray(labels)

    # 'confident_learning' needs the off-diagonal indices, so the joint is
    # always recomputed for that method even when one was supplied.
    if confident_joint is None or filter_by == "confident_learning":
        from cleanlab.count import compute_confident_joint

        confident_joint, cl_error_indices = compute_confident_joint(
            labels=labels,
            pred_probs=pred_probs,
            multi_label=multi_label,
            return_indices_of_off_diagonals=True,
        )

    if filter_by in ["prune_by_noise_rate", "prune_by_class", "both"]:
        # Create `prune_count_matrix` with the number of examples to remove in each class and
        # leave at least min_examples_per_class examples per class.
        # `prune_count_matrix` is transposed relative to the confident_joint.
        prune_count_matrix = _keep_at_least_n_per_class(
            prune_count_matrix=confident_joint.T,
            n=min_examples_per_class,
            frac_noise=frac_noise,
        )

        if num_to_remove_per_class is not None:
            # Estimate joint probability distribution over label issues
            psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
            noise_per_s = psy.sum(axis=1) - psy.diagonal()
            # Calibrate s.t. noise rates sum to num_to_remove_per_class
            tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
            np.fill_diagonal(tmp, label_counts - num_to_remove_per_class)
            prune_count_matrix = round_preserving_row_totals(tmp)

        # Prepare multiprocessing shared data
        if n_jobs > 1:
            if multi_label:
                _labels = RawArray("I", int2onehot(labels).flatten())
            else:
                _labels = RawArray("I", labels)
            _label_counts = RawArray("I", label_counts)
            _prune_count_matrix = RawArray("I", prune_count_matrix.flatten())
            _pred_probs = RawArray("f", pred_probs.flatten())
        else:  # Multiprocessing is turned off. Create tuple with all parameters
            args = (
                labels,
                label_counts,
                prune_count_matrix,
                pred_probs,
                multi_label,
                min_examples_per_class,
            )

    # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes
    if filter_by == "prune_by_class" or filter_by == "both":
        if n_jobs > 1:  # parallelize
            with multiprocessing.Pool(
                    n_jobs,
                    initializer=_init,
                    initargs=(
                        _labels,
                        _label_counts,
                        _prune_count_matrix,
                        prune_count_matrix.shape,
                        _pred_probs,
                        pred_probs.shape,
                        multi_label,
                        min_examples_per_class,
                    ),
            ) as p:
                if verbose:  # pragma: no cover
                    print("Parallel processing label issues by class.")
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    label_issues_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_class, range(K)), total=K), )
                else:
                    label_issues_masks_per_class = p.map(
                        _prune_by_class, range(K))
        else:  # n_jobs = 1, so no parallelization
            label_issues_masks_per_class = [
                _prune_by_class(k, args) for k in range(K)
            ]
        # An example is flagged if any per-class mask flags it.
        label_issues_mask = np.stack(label_issues_masks_per_class).any(axis=0)

    if filter_by == "both":
        # Keep the by-class result; it is intersected with the by-noise-rate
        # result further below.
        label_issues_mask_by_class = label_issues_mask

    if filter_by == "prune_by_noise_rate" or filter_by == "both":
        if n_jobs > 1:  # parallelize
            with multiprocessing.Pool(
                    n_jobs,
                    initializer=_init,
                    initargs=(
                        _labels,
                        _label_counts,
                        _prune_count_matrix,
                        prune_count_matrix.shape,
                        _pred_probs,
                        pred_probs.shape,
                        multi_label,
                        min_examples_per_class,
                    ),
            ) as p:
                if verbose:  # pragma: no cover
                    print("Parallel processing label issues by noise rate.")
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    label_issues_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K))
                else:
                    label_issues_masks_per_class = p.map(
                        _prune_by_count, range(K))
        else:  # n_jobs = 1, so no parallelization
            label_issues_masks_per_class = [
                _prune_by_count(k, args) for k in range(K)
            ]
        label_issues_mask = np.stack(label_issues_masks_per_class).any(axis=0)

    if filter_by == "both":
        label_issues_mask = label_issues_mask & label_issues_mask_by_class

    if filter_by == "confident_learning":
        # Flag exactly the examples counted in the off-diagonals of the joint.
        label_issues_mask = np.zeros(len(labels), dtype=bool)
        for idx in cl_error_indices:
            label_issues_mask[idx] = True

    if filter_by == "predicted_neq_given":
        label_issues_mask = find_predicted_neq_given(labels,
                                                     pred_probs,
                                                     multi_label=multi_label)

    # Remove label issues if given label == model prediction
    if multi_label:
        pred = _multiclass_crossval_predict(labels, pred_probs)
        # Rebinds `labels` to its binarized form; this is also what is passed
        # to order_label_issues() below in the multi-label case.
        labels = MultiLabelBinarizer().fit_transform(labels)
    else:
        pred = pred_probs.argmax(axis=1)
    for i, pred_label in enumerate(pred):
        if (multi_label and np.all(pred_label == labels[i])
                or not multi_label and pred_label == labels[i]):
            label_issues_mask[i] = False

    if verbose:
        print("Number of label issues found: {}".format(
            sum(label_issues_mask)))

    # TODO: run count.num_label_issues() and adjust the total issues found here to match
    if return_indices_ranked_by is not None:
        er = order_label_issues(
            label_issues_mask=label_issues_mask,
            labels=labels,
            pred_probs=pred_probs,
            rank_by=return_indices_ranked_by,
            rank_by_kwargs=rank_by_kwargs,
        )
        return er
    return label_issues_mask
def pianoRollToTokenSequence(piano_roll, combi_to_int):
    """Translate a piano roll into a sequence of integer tokens.

    Each row of ``piano_roll`` is decoded into the tuple of active class
    indices (0..127) and that tuple is looked up in ``combi_to_int``.

    Args:
        piano_roll: indicator matrix accepted by
            ``MultiLabelBinarizer.inverse_transform`` -- presumably shaped
            ``(n_steps, 128)`` with 0/1 entries; verify against the caller.
        combi_to_int: mapping from a tuple of active indices to its token id.

    Returns:
        list with one token id per row of ``piano_roll``.

    Raises:
        KeyError: if a decoded combination is missing from ``combi_to_int``.
    """
    binarizer = MultiLabelBinarizer()
    # Fit on a single sample holding every index so the classes are 0..127.
    binarizer.fit([list(range(128))])

    tokens = []
    for combination in binarizer.inverse_transform(piano_roll):
        tokens.append(combi_to_int[combination])
    return tokens
X_train = np.array(norm_corpus) Y_train = [] for q in category: qstring = str(q) min_list = qstring.split(',') max_list = [] for m in min_list: max_list.append(m.strip()) Y_train.append(max_list) X_test = np.array(norm_query_docs) target_names = ['Academic', 'Mess', 'Internet', 'Maintainance'] mlb = MultiLabelBinarizer() Y = mlb.fit_transform(Y_train) classifier = Pipeline([('vectorizer', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', OneVsRestClassifier(LinearSVC()))]) classifier.fit(X_train, Y) predicted = classifier.predict(X_test) all_labels = mlb.inverse_transform(predicted) print mlb.classes_ temp_categories = [] for item, labels in zip(X_test, all_labels):
STYLES = { style['id']: style for style in STYLES if (style['articleType'] == article_type and style['gender'] in genders and style['usage'] in usages) } image_paths = [ *filter(lambda p: p.split(os.path.sep)[-1][:-4] in STYLES.keys(), image_paths) ] X, y = load_images_and_labels(image_paths, STYLES, (64, 64)) X = X.astype('float') / 255.0 mlb = MultiLabelBinarizer() y = mlb.fit_transform(y) (X_train, X_test, y_train, y_test) = train_test_split(X, y, stratify=y, test_size=0.2, random_state=SEED) (X_train, X_valid, y_train, y_valid) = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=SEED) model = build_network(width=64, height=64, depth=3, classes=len(mlb.classes_)) model.compile(loss='binary_crossentropy',
def main():
    """Train one cross-validation fold of a multi-label image classifier with MXNet.

    Parses command-line options, one-hot encodes the ``attribute_ids`` column
    of ``train.csv.zip``, builds train/dev loaders for the selected fold, and
    trains for up to ``--n-epochs`` epochs. After every epoch the dev
    predictions are scored at several thresholds (raw and max-normalised);
    the best-scoring checkpoint and its metadata are written under
    ``models/<exp-name>/fold_<fold>``. When the score has not improved for
    ``--patience`` epochs the learning rate is halved, and training stops
    after more than ``max_lr_changes`` such reductions.

    Raises:
        ValueError: if ``--model`` names an unsupported architecture.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--data_path', type=str, default='data')
    arg('--model', type=str, default='pnasnet5large')
    arg('--exp-name', type=str, default='pnasnet5large_2')

    arg('--batch-size', type=int, default=32)
    arg('--lr', type=float, default=1e-2)
    arg('--patience', type=int, default=4)
    arg('--n-epochs', type=int, default=15)

    arg('--n-folds', type=int, default=10)
    arg('--fold', type=int, default=0)

    arg('--random-seed', type=int, default=314159)

    arg('--num-workers', type=int, default=6)
    arg('--gpus', type=str, default='0')

    arg('--resize', type=int, default=331)
    arg('--crop', type=int, default=331)
    arg('--scale', type=str, default='0.4, 1.0')
    arg('--mean', type=str, default='0.485, 0.456, 0.406')
    arg('--std', type=str, default='0.229, 0.224, 0.225')

    args = parser.parse_args()
    print(args)

    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

    # os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '1'
    # os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
    # os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
    # os.environ['MXNET_USE_TENSORRT'] = "0"
    # os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
    # os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
    # os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"

    random_seed = args.random_seed
    set_random_seed(random_seed)

    path_to_data = Path(args.data_path)
    labels = pd.read_csv(path_to_data / 'labels.csv')
    num_classes = len(labels)

    train = pd.read_csv(path_to_data / 'train.csv.zip')

    n_folds = args.n_folds
    make_folds(train, n_folds, random_seed)

    # `classes` must be passed by keyword: recent scikit-learn releases
    # reject it as a positional argument.
    mlb = MultiLabelBinarizer(classes=[str(i) for i in range(num_classes)])
    s = train['attribute_ids'].str.split()
    res = pd.DataFrame(mlb.fit_transform(s),
                       columns=mlb.classes_,
                       index=train.index)
    # One indicator column per class, followed by the image file name and fold.
    train = pd.concat([res, train['id'] + '.png', train['fold']], axis=1)

    gpu_count = len(args.gpus.split(','))
    batch_size = args.batch_size

    resize = args.resize
    crop = args.crop
    scale = tuple(float(x) for x in args.scale.split(','))
    mean = [float(x) for x in args.mean.split(',')]
    std = [float(x) for x in args.std.split(',')]

    # jitter_param = 0.4
    # lighting_param = 0.1
    labels_ids = [str(i) for i in range(num_classes)]
    num_workers = args.num_workers

    fold = args.fold
    train_transformer = get_train_transform(resize=resize,
                                            crop=crop,
                                            scale=scale,
                                            mean=mean,
                                            std=std)
    train_loader = mx.gluon.data.DataLoader(MXDataset(
        path_to_data / 'train', train[train['fold'] != fold].copy(),
        labels_ids, train_transformer),
                                            batch_size=batch_size * gpu_count,
                                            shuffle=True,
                                            num_workers=num_workers,
                                            pin_memory=True)
    test_transformer = get_test_transform(resize=resize,
                                          crop=crop,
                                          mean=mean,
                                          std=std)
    dev_loader = mx.gluon.data.DataLoader(MXDataset(
        path_to_data / 'train', train[train['fold'] == fold].copy(),
        labels_ids, test_transformer),
                                          batch_size=batch_size * gpu_count,
                                          shuffle=False,
                                          num_workers=num_workers,
                                          pin_memory=True)
    fp16 = True
    if args.model == 'pnasnet5large':
        net = get_pnasnet5large(num_classes)
    else:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError(f'No such model {args.model}')

    if fp16:
        net.cast('float16')
    ctx = [mx.gpu(i) for i in range(gpu_count)]
    net.collect_params().reset_ctx(ctx)

    epoch_size = len(train_loader)
    lr = args.lr * batch_size / 256
    steps = [step * epoch_size for step in [7, 9]]
    factor = 0.5
    warmup_epochs = 5
    warmup_mode = 'linear'
    schedule = mx.lr_scheduler.MultiFactorScheduler(
        step=steps,
        factor=factor,
        base_lr=lr,
        warmup_steps=warmup_epochs * epoch_size,
        warmup_mode=warmup_mode)

    if fp16:
        # The loss is multiplied by `weight` and the gradients divided by it
        # again via rescale_grad.
        weight = 128
        opt = mx.optimizer.Adam(
            multi_precision=True,
            learning_rate=lr,
            rescale_grad=1 / weight,
            lr_scheduler=schedule,
        )
    else:
        opt = mx.optimizer.Adam(
            learning_rate=lr,
            lr_scheduler=schedule,
        )
    trainer = mx.gluon.Trainer(net.collect_params(), opt)
    if fp16:
        loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(weight=weight)
    else:
        loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()

    path_to_models = Path('models')
    path_to_model = path_to_models / args.exp_name
    path_to_exp = path_to_model / f'fold_{fold}'
    path_to_exp.mkdir(parents=True, exist_ok=True)

    patience = args.patience
    lr_reset_epoch = 1
    lr_changes = 0
    max_lr_changes = 2
    n_epochs = args.n_epochs
    best_dev_f2 = th2 = 0
    train_losses = []
    dev_losses, dev_f2s, dev_ths = [], [], []
    dev_met1, dev_met2 = [], []
    for epoch in range(1, n_epochs + 1):
        train_loss, all_predictions, all_targets = epoch_step(
            train_loader,
            desc=f'[ Training {epoch}/{n_epochs}.. ]',
            fp16=fp16,
            ctx=ctx,
            net=net,
            loss=loss,
            trainer=trainer)
        train_losses.append(train_loss)

        # No trainer is passed for the dev pass.
        dev_loss, all_predictions, all_targets = epoch_step(
            dev_loader,
            desc=f'[ Validating {epoch}/{n_epochs}.. ]',
            fp16=fp16,
            ctx=ctx,
            net=net,
            loss=loss)
        dev_losses.append(dev_loss)

        # Score the raw predictions at a range of thresholds.
        metrics = {}
        argsorted = all_predictions.argsort(axis=1)
        for threshold in [0.01, 0.05, 0.1, 0.15, 0.2]:
            metrics[f'valid_f2_th_{threshold:.2f}'] = get_score(
                binarize_prediction(all_predictions, threshold, argsorted),
                all_targets)
        dev_met1.append(metrics)

        dev_f2 = 0
        for th in dev_met1[-1]:
            if dev_met1[-1][th] > dev_f2:
                dev_f2 = dev_met1[-1][th]
                th2 = th

        # Repeat with each row scaled by its maximum prediction.
        all_predictions = all_predictions / all_predictions.max(1,
                                                                keepdims=True)
        metrics = {}
        argsorted = all_predictions.argsort(axis=1)
        for threshold in [0.05, 0.1, 0.2, 0.3, 0.4]:
            metrics[f'valid_norm_f2_th_{threshold:.2f}'] = get_score(
                binarize_prediction(all_predictions, threshold, argsorted),
                all_targets)
        dev_met2.append(metrics)

        for th in dev_met2[-1]:
            if dev_met2[-1][th] > dev_f2:
                dev_f2 = dev_met2[-1][th]
                th2 = th

        dev_f2s.append(dev_f2)
        dev_ths.append(th2)
        if dev_f2 > best_dev_f2:
            best_dev_f2 = dev_f2
            best_th = th2
            if fp16:
                # Checkpoints are saved in float32, then the net goes back to fp16.
                net.cast('float32')
                net.save_parameters((path_to_exp / 'model').as_posix())
                net.cast('float16')
            else:
                net.save_parameters((path_to_exp / 'model').as_posix())
            save_dict(
                {
                    'dev_loss': dev_loss,
                    'dev_f2': best_dev_f2,
                    'dev_th': best_th,
                    'epoch': epoch,
                    'dev_f2s': dev_f2s,
                    'dev_ths': dev_ths,
                    'dev_losses': dev_losses,
                    'dev_met1': dev_met1,
                    'dev_met2': dev_met2,
                }, path_to_exp / 'meta_data.pkl')
        elif (patience and epoch - lr_reset_epoch > patience
              and max(dev_f2s[-patience:]) < best_dev_f2):
            # "patience" epochs without improvement
            lr_changes += 1
            if lr_changes > max_lr_changes:
                break
            lr *= factor
            print(f'lr updated to {lr}')
            lr_reset_epoch = epoch
            # The replacement optimizer is created without the LR schedule.
            if fp16:
                weight = 128
                opt = mx.optimizer.Adam(multi_precision=True,
                                        learning_rate=lr,
                                        rescale_grad=1 / weight)
            else:
                opt = mx.optimizer.Adam(learning_rate=lr)
            trainer = mx.gluon.Trainer(net.collect_params(), opt)

        plot_all(path_to_exp, train_losses, dev_losses, dev_f2s, dev_ths,
                 dev_met1, dev_met2)
def main():
    """Sweep BIRCH thresholds over the evaluation files and print clustering scores.

    For every evaluation CSV and every threshold in ``np.arange(0.0, 4.1, 0.2)``:
    read the CSV and its annotation file, one-hot encode the ``themes``,
    ``locations`` and ``counts`` columns, cluster the stacked sparse features
    with BIRCH, write the clusters to ``output.txt`` (overwritten on each
    iteration) and print one comma-separated line:
    threshold, rand index, precision, recall, F1, adjusted Rand index, NMI.
    """
    # parameters
    write_whole_cluster = False  # not read anywhere in this function
    perform_pca = False
    birch_thresh = 2.0  # overwritten by the threshold sweep below
    eval_file_names = [
        'filtered_eval_three_event.csv', 'filtered_eval_five_event.csv',
        'filtered_eval_seven_event.csv'
    ]
    annotated_file_names = [
        'annotated_three_event.txt', 'annotated_five_event.txt',
        'annotated_seven_event.txt'
    ]
    # The bare string below is disabled debugging code; it has no effect.
    '''for i in range(1,179): if(i not in temp): print(i) '''
    for m in range(0, len(eval_file_names)):
        fileName = eval_file_names[m]
        file_prefix = 'output'
        print(fileName)

        for birch_thresh in np.arange(0.0, 4.1, 0.2):
            df = pd.read_csv(fileName, header=None, encoding='latin-1')
            # Ground-truth label per 1-based row number in the annotation file.
            class_labels = [None] * len(df)

            temp = {}
            with open(annotated_file_names[m], "r") as ins:
                # Each non-empty, non-comment line lists the row numbers of one
                # ground-truth group; groups are numbered from 1 in file order.
                label = 1
                for line in ins:
                    line = line.strip()
                    if line.startswith("#"):
                        continue
                    if line:
                        line = line.split(',')
                        # print(line)
                        for item in line:
                            class_labels[int(item) - 1] = label
                            temp[int(item)] = True
                        label += 1

            df.columns = [
                'record_id', 'date', 'url', 'counts', 'themes', 'locations',
                'persons', 'organizations', 'tone'
            ]

            # Drop rows without themes or without locations.
            df = df[pd.notnull(df['themes'])]
            df = df[pd.notnull(df['locations'])]

            df_locations = pd.DataFrame(df['locations'])
            df_counts = pd.DataFrame(df['counts'])
            df_counts.fillna('#', inplace=True)
            df_counts = pd.DataFrame(
                df_counts['counts'].str.split(';'))  # splitting counts

            # The per-row lists are edited in place through the tuple attribute.
            for row in df_counts.itertuples():
                for i in range(0, len(row.counts)):
                    try:
                        # Keep only '#'-separated fields 0, 1 and 5 of each entry.
                        temp_list = row.counts[i].split('#')
                        row.counts[i] = temp_list[0] + '#' + temp_list[
                            1] + '#' + temp_list[5]
                        # print(row.locations[i])
                    # NOTE(review): bare except hides every error, not only the
                    # IndexError from short entries -- consider narrowing.
                    except:
                        continue
                if len(row.counts) == 1 and row.counts[0] == '':
                    row.counts.append(
                        '#'
                    )  # so that news with no counts are clustered together
                    row.counts.pop(0)
                # Drop the empty trailing entry left by a terminating ';'.
                if row.counts[len(row.counts) - 1] == '':
                    row.counts.pop()
            # df_counts.to_csv('countsonly.csv', sep=',')
            row_dict = df.copy(deep=True)
            row_dict.fillna('', inplace=True)
            row_dict.index = range(len(row_dict))
            row_dict = row_dict.to_dict(
                'index')  # dictionary that maps row number to row

            identifier_dict = {
            }  # dictionary that maps GKG Record Id to Row Number
            i = 0
            for index, row in df.iterrows():
                identifier_dict[row['record_id']] = i
                i += 1

            # Keep only the fifth column (themes) for feature extraction.
            df = df[df.columns[[4]]]
            df.columns = ['themes']

            df = pd.DataFrame(df['themes'].str.split(';'))  # splitting themes

            df_locations = pd.DataFrame(df_locations['locations'].str.split(
                ';'))  # splitting locations

            for row in df_locations.itertuples():
                for i in range(0, len(row.locations)):
                    try:
                        row.locations[i] = (row.locations[i].split('#'))[
                            3]  # for retaining only ADM1 Code
                    # NOTE(review): bare except, see the counts loop above.
                    except:
                        continue
                # merged = list(itertools.chain(*row.locations))
                # df_locations.loc[row.Index, 'locations'] = merged

            df = df[pd.notnull(df['themes'])]

            # One-hot encode each column separately, then stack the sparse
            # blocks side by side into a single feature matrix.
            mlb = MultiLabelBinarizer(sparse_output=True)
            sparse_themes = mlb.fit_transform(df['themes'])
            mlb2 = MultiLabelBinarizer(sparse_output=True)
            sparse_locations = mlb2.fit_transform(df_locations['locations'])
            mlb3 = MultiLabelBinarizer(sparse_output=True)
            sparse_counts = mlb3.fit_transform(df_counts['counts'])
            df = hstack([sparse_themes, sparse_locations, sparse_counts])

            # Reducing dimensions through principal component analysis
            if perform_pca:
                pca = PCA(n_components=None)
                df = pd.DataFrame(pca.fit_transform(df))

            # print("Starting clustering")
            brc = Birch(branching_factor=50,
                        n_clusters=None,
                        threshold=birch_thresh,
                        compute_labels=True)

            predicted_labels = brc.fit_predict(df)

            # Group the original rows (as value lists) by predicted cluster id.
            clusters = {}
            n = 0
            for item in predicted_labels:
                if item in clusters:
                    clusters[item].append(list((row_dict[n]).values(
                    )))  # since row_dict[n] is itself a dictionary
                else:
                    clusters[item] = [list((row_dict[n]).values())]
                n += 1

            # print(n)
            label = 0
            # NOTE(review): class_labels has one slot per row read from the CSV,
            # while cluster_labels has one per row left after the null filters;
            # the lengths differ if any row was dropped -- confirm inputs.
            cluster_labels = [None] * n
            with open(file_prefix + '.txt', 'w', encoding='utf-8') as file:
                for item in clusters:
                    file.write("\n\nCluster " + str(item) + "\n")
                    for i in range(0, len(clusters[item])):
                        # Index 0 of each stored row is the record_id column.
                        gkg_record_id = clusters[item][i][0]
                        file.write(
                            str(identifier_dict[gkg_record_id] + 1) + '\n' +
                            clusters[item][i][2] + '\n' +
                            clusters[item][i][3] + '\n\n')  # appending url
                        cluster_labels[identifier_dict[gkg_record_id]] = label
                    label += 1
            #print(cluster_labels)
            # cluster_labels = predicted_labels

            matrix = metrics.cluster.contingency_matrix(
                class_labels, cluster_labels)
            rand_index, precision, recall, f1 = precision_recall_fmeasure(
                matrix)

            ari = metrics.cluster.adjusted_rand_score(class_labels,
                                                      cluster_labels)
            # print("AdjustedRI:", ari)

            nmi = metrics.normalized_mutual_info_score(class_labels,
                                                       cluster_labels)
            # print("NMI       :", nmi)

            print(birch_thresh, ",", rand_index, ",", precision, ",", recall,
                  ",", f1, ",", ari, ",", nmi)
def __init__(self, cursor):
    """Build interaction matrices and item features for rent and deal data.

    Reads the ``rentLog``, ``dealLog``, ``rent`` and ``deal`` attributes of
    the database returned by ``getdatabase(cursor)`` and stores on the
    instance: id-mapping dictionaries, user/item counts, sparse interaction
    matrices, identity indicator features, and item feature matrices that
    stack the indicator features with one-hot category features.
    """
    db = getdatabase(cursor)
    rent_log_source = db.rentLog
    deal_log_source = db.dealLog
    rent_item_source = db.rent
    deal_item_source = db.deal

    # Each mapping hands out the next free integer the first time a key is
    # looked up (the default is the mapping's current size).
    self.lens_to_internal_rentuser_ids = defaultdict(
        lambda: len(self.lens_to_internal_rentuser_ids))
    self.lens_to_internal_rentitem_ids = defaultdict(
        lambda: len(self.lens_to_internal_rentitem_ids))
    self.lens_to_internal_dealuser_ids = defaultdict(
        lambda: len(self.lens_to_internal_dealuser_ids))
    self.lens_to_internal_dealitem_ids = defaultdict(
        lambda: len(self.lens_to_internal_dealitem_ids))

    # Interaction logs -> ratings plus user/item counts.
    rent_ratings, self.n_users_rent, self.n_items_rent = self.raw_ratings(
        rent_log_source, 'rent')
    deal_ratings, self.n_users_deal, self.n_items_deal = self.raw_ratings(
        deal_log_source, 'deal')

    self.sparse_rent = self.interactions_list_to_sparse_matrix(
        rent_ratings, self.n_users_rent, self.n_items_rent)
    self.sparse_deal = self.interactions_list_to_sparse_matrix(
        deal_ratings, self.n_users_deal, self.n_items_deal)

    # Identity matrices: one indicator feature per user / item.
    self.user_indicator_features_rent = sparse.identity(self.n_users_rent)
    self.item_indicator_features_rent = sparse.identity(self.n_items_rent)
    self.user_indicator_features_deal = sparse.identity(self.n_users_deal)
    self.item_indicator_features_deal = sparse.identity(self.n_items_deal)

    # Item categories, listed in internal-id order (titles are not used).
    rent_category_lookup, _rent_titles = self.items_categories(
        rent_item_source)
    deal_category_lookup, _deal_titles = self.items_categories(
        deal_item_source)

    rent_categories = []
    for item_id in range(self.n_items_rent):
        rent_categories.append(rent_category_lookup[item_id])
    deal_categories = []
    for item_id in range(self.n_items_deal):
        deal_categories.append(deal_category_lookup[item_id])

    rent_onehot = MultiLabelBinarizer().fit_transform(rent_categories)
    deal_onehot = MultiLabelBinarizer().fit_transform(deal_categories)

    # Final item features: [indicator | category one-hot].
    self.full_rentitem_features = sparse.hstack(
        [self.item_indicator_features_rent,
         sparse.coo_matrix(rent_onehot)])
    self.full_dealitem_features = sparse.hstack(
        [self.item_indicator_features_deal,
         sparse.coo_matrix(deal_onehot)])
# Convert release years to integers; values int() cannot parse become 0.
yearList = []
years = data["year"].tolist()
for year in years:
    try:
        yearList.append(int(year))
    except ValueError:
        yearList.append(0)
yearList = pd.Series(yearList)

# Pre-process genres
# Each entry is a ", "-separated string; split it into a list of genre names.
genres = data["genres"].tolist()
genreList = [group.split(", ") for group in genres]

mlb = MultiLabelBinarizer()
mlb.fit(genreList)

# Number of unique genres
numGenres = len(mlb.classes_)

encoded_genres = mlb.transform(genreList)

# Existence matrix to show if each genre corresponds to each track or not (0 or 1)
genreDF = pd.DataFrame(encoded_genres, columns=mlb.classes_)
# 'track_id' stays a regular column because the merge below joins on it.
# (The former `genreDF.set_index('track_id')` call discarded its result and
# therefore did nothing, so it was removed.)
genreDF['track_id'] = trackids

data['encoded_artist'] = encoded_artists.values
data['encoded_country'] = encoded_countries.values
data['encoded_year'] = yearList.values
data = data.merge(genreDF, on='track_id')
from collections import OrderedDict

import joblib
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Load the persisted classifier and TF-IDF vectoriser, and rebuild the label
# binarizer from the tag column of the dataset so that model output can be
# mapped back to tag names.
multilabel = MultiLabelBinarizer()
model = joblib.load('lps_model.pkl')
tfidf = joblib.load('tfidf_model.pkl')

df = pd.read_csv('https://raw.githubusercontent.com/nagappanm/Python-Machine-Learning/master/Multi_Label_Text_Classification_with_Skmultilearn/data/so_dataset_updated_blank.csv')
# Missing tags become "", so every row splits into a list of strings.
df['Tagsupdated'] = df['Tagsupdated'].fillna("")
df['Tagsupdated'] = df['Tagsupdated'].apply(lambda tags: tags.split(','))
y = multilabel.fit_transform(df['Tagsupdated'])

# Predict tags for one sample question.
x = ['how to write code in query and php Histogram:']
y_predict = model.predict(tfidf.transform(x))

print("The Output from model is:", (y_predict.toarray()))
print("The Output from model is:", multilabel.inverse_transform(y_predict))

# Flatten the per-sample tag tuples, strip whitespace and drop duplicates
# while keeping first-seen order.
inverseTransformList = multilabel.inverse_transform(y_predict)
out = []
for tag_tuple in inverseTransformList:
    for item in tag_tuple:
        out.append(item.strip())
out = list(OrderedDict.fromkeys(out))
print("The List distinct is:", out)
# Load the file names (reviews) and their labels into memory, one entry per
# line of each text file.
with open("FILE_NAMES.txt") as f:
    FILE_NAMES = f.read().split("\n")

with open("FILE_LABELS.txt") as f:
    FILE_LABELS = f.read().split("\n")

# Tokenise every name on whitespace. The loop variable used to be spelled
# FILE_NAMES while the expression read FILE_NAME, which raised NameError.
FILE_NAMES_tokens = [file_name.split() for file_name in FILE_NAMES]

# MultiLabelBinarizer turns each token list into a binary indicator vector,
# i.e. a one-hot style encoding of the FILE NAME tokens.
from sklearn.preprocessing import MultiLabelBinarizer

one_hot_encoding = MultiLabelBinarizer()
one_hot_encoding.fit(FILE_NAMES_tokens)

# Split the data into training and test sets (20% held out for testing).
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(FILE_NAMES_tokens, FILE_LABELS, test_size = 0.2, random_state = None)

# LinearSVC is the SVM classifier that is created and trained next.
from sklearn.svm import LinearSVC
# NOTE(review): `utils.config` is imported before the project root is added to
# sys.path below -- confirm this import resolves when run as a script.
from utils.config import root
import os
import numpy as np
import sys
# Put the directory two levels above this file on the import path.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from model.bert.metrics import f1_np

mlb = MultiLabelBinarizer()

# Label vocabulary. The strings are runtime data and must stay byte-identical.
# They appear to be Chinese school-curriculum tags (subjects, grade level and
# topic names) -- NOTE(review): confirm against the dataset.
all_labels = [
    '生物性污染', '细胞有丝分裂不同时期的特点', '液泡的结构和功能', '组成细胞的化学元素', '兴奋在神经纤维上的传导',
    '不完全显性', '免疫系统的组成', '生物技术在其他方面的应用', '群落的结构', '中央官制——三公九卿制',
    '核糖体的结构和功能', '人体免疫系统在维持稳态中的作用', '皇帝制度', '激素调节', '伴性遗传',
    '地球运动的地理意义', '宇宙中的地球', '地球运动的基本形式', '基因工程的原理及技术', '体液免疫的概念和过程',
    '基因的分离规律的实质及应用', '蛋白质的合成', '地球的内部圈层结构及特点', '人口增长与人口问题', '经济学常识',
    '劳动就业与守法经营', '器官移植', '生物技术实践', '垄断组织的出现', '基因工程的概念',
    '神经调节和体液调节的比较', '人口与城市', '组成细胞的化合物', '地理', '文艺的春天',
    '生物工程技术', '基因的自由组合规律的实质及应用', '郡县制', '人体水盐平衡调节', '内质网的结构和功能',
    '人体的体温调节', '免疫系统的功能', '科学社会主义常识', '与细胞分裂有关的细胞器', '太阳对地球的影响',
    '古代史', '清末民主革命风潮', '复等位基因', '人工授精、试管婴儿等生殖技术', '“重农抑商”政策',
    '生态系统的营养结构', '减数分裂的概念', '地球的外部圈层结构及特点', '细胞的多样性和统一性', '政治',
    '工业区位因素', '细胞大小与物质运输的关系', '夏商两代的政治制度', '农业区位因素', '溶酶体的结构和功能',
    '生产活动与地域联系', '内环境的稳态', '遗传与进化', '胚胎移植', '生物科学与社会',
    '近代史', '第三产业的兴起和“新经济”的出现', '公民道德与伦理常识', '中心体的结构和功能', '社会主义市场经济的伦理要求',
    '高中', '选官、用官制度的变化', '减数分裂与有丝分裂的比较', '遗传的细胞基础', '地球所处的宇宙环境',
    '培养基与无菌技术', '生活中的法律常识', '高尔基体的结构和功能', '社会主义是中国人民的历史性选择', '人口迁移与人口流动',
    '现代史', '地球与地图', '走进细胞', '生物', '避孕的原理和方法',
    '血糖平衡的调节', '现代生物技术专题', '海峡两岸关系的发展', '生命活动离不开细胞', '兴奋在神经元之间的传递',
    '历史', '分子与细胞', '拉马克的进化学说', '遗传的分子基础', '稳态与环境'
]
def __init__(self, vectors, clf):
    """Keep the embeddings and set up the ranking classifier and binarizer.

    Args:
        vectors: object stored unchanged as ``self.embeddings``.
        clf: base classifier; it is wrapped in ``TopKRanker`` before being
            stored as ``self.clf``.
    """
    ranker = TopKRanker(clf)
    # sparse_output=True asks the binarizer for a sparse indicator matrix.
    label_binarizer = MultiLabelBinarizer(sparse_output=True)

    self.embeddings = vectors
    self.clf = ranker
    self.binarizer = label_binarizer
class ClassificationProcessor(BaseProcessor):
    """
    Corpus pre-processor for classification tasks.

    Single-label targets are mapped through ``label2idx`` and one-hot
    encoded; multi-label targets are encoded with a ``MultiLabelBinarizer``
    whose classes are the keys of ``label2idx``.
    """

    def __init__(self, multi_label=False, **kwargs):
        """
        Args:
            multi_label: when true, every sample carries a collection of
                labels instead of a single one.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        super(ClassificationProcessor, self).__init__(**kwargs)
        self.multi_label = multi_label
        # Stays None until _build_label_dict creates the binarizer.
        self.multi_label_binarizer: MultiLabelBinarizer = None

    def info(self):
        """Return the parent's info dict plus the task type and multi-label flag."""
        details = super(ClassificationProcessor, self).info()
        details['task'] = kashgari.CLASSIFICATION
        details['config']['multi_label'] = self.multi_label
        return details

    def _build_label_dict(self, labels: List[str]):
        """
        Build ``label2idx`` / ``idx2label`` and the multi-label binarizer.

        Args:
            labels: one label per sample, or (when ``multi_label`` is set)
                one iterable of labels per sample.
        """
        if self.multi_label:
            unique_labels = set()
            for sample_labels in labels:
                unique_labels.update(sample_labels)
        else:
            unique_labels = set(labels)

        # Indices follow the sorted label order, starting at 0.
        self.label2idx = {
            label: position
            for position, label in enumerate(sorted(unique_labels))
        }
        self.idx2label = {
            position: label for label, position in self.label2idx.items()
        }
        self.dataset_info['label_count'] = len(self.label2idx)
        self.multi_label_binarizer = MultiLabelBinarizer(
            classes=list(self.label2idx))

    def process_y_dataset(self,
                          data: List[str],
                          max_len: Optional[int] = None,
                          subset: Optional[List[int]] = None) -> np.ndarray:
        """
        Encode target labels as a matrix.

        Args:
            data: labels of the samples.
            max_len: accepted but not used by this method.
            subset: optional indices; when given, only the entries selected
                by ``utils.get_list_subset`` are encoded.

        Returns:
            The binarizer's output in multi-label mode, otherwise the
            ``to_categorical`` encoding of the label indices.
        """
        if subset is None:
            target = data
        else:
            target = utils.get_list_subset(data, subset)

        if self.multi_label:
            return self.multi_label_binarizer.fit_transform(target)

        label_indices = self.numerize_label_sequences(target)
        return to_categorical(label_indices, len(self.label2idx))

    def numerize_token_sequences(self, sequences: List[List[str]]):
        """
        Convert token sequences to token-index sequences.

        Tokens missing from ``token2idx`` get the index of ``token_unk``;
        when ``add_bos_eos`` is set each sequence is first wrapped in the
        BOS / EOS tokens.
        """
        numerized = []
        for tokens in sequences:
            if self.add_bos_eos:
                tokens = [self.token_bos] + tokens + [self.token_eos]
            fallback = self.token2idx[self.token_unk]
            numerized.append(
                [self.token2idx.get(token, fallback) for token in tokens])
        return numerized

    def numerize_label_sequences(self, sequences: List[str]) -> List[int]:
        """
        Map each label to its index in ``label2idx``.

        Args:
            sequences: labels, list of str

        Returns:
            the matching label indices, list of int

        Raises:
            KeyError: if a label is not present in ``label2idx``.
        """
        lookup = self.label2idx
        return [lookup[name] for name in sequences]

    def reverse_numerize_label_sequences(self, sequences, **kwargs):
        """
        Convert encoded labels back to label names.

        In multi-label mode ``sequences`` is handed to the binarizer's
        ``inverse_transform``; otherwise each entry is looked up in
        ``idx2label``. Extra keyword arguments are ignored.
        """
        if not self.multi_label:
            return [self.idx2label[index] for index in sequences]
        return self.multi_label_binarizer.inverse_transform(sequences)
def get_encoded_dataset(self):
    '''
    Load the train, validation and test splits and encode them for the model.

    Documents are read from the ``HPISection`` column and turned into integer
    sequences that are padded / truncated at the front to the length of the
    longest training document. Labels are read from the comma-separated
    ``Dx`` column and binarized with a MultiLabelBinarizer fitted on the
    training split only.

    Side effects: sets ``TARGET_NAMES``, ``MAX_SEQ_LENGTH`` and ``NUM_LABELS``
    on self, calls ``set_num_words``, replaces the ``Dx`` column of the train
    and val frames with lists, and hands the formatted test frame to
    ``save_gs``.

    returns: x_train, y_train, x_val, y_val, x_test, y_test,
             X_test_original, X_test_original_index, word2idx, mlb, test['id']
    '''
    print("\nLoading data (train, val, test)...")
    mlb = MultiLabelBinarizer()
    # NOTE(review): stemming / stop word removal presumably happen inside
    # load_preprocess_data -- confirm there.
    train, val, test, test_raw = load_preprocess_data()

    # training documents (HPISection) and their labels (Dx)
    X_train = train['HPISection']
    train['Dx'] = train['Dx'].str.split(',')
    y_train = mlb.fit_transform(list(train['Dx']))

    # the fitted classes are the prediction labels
    self.TARGET_NAMES = mlb.classes_

    # validation documents and labels, encoded with the already-fitted binarizer
    X_val = val['HPISection']
    val['Dx'] = val['Dx'].str.split(',')
    y_val = mlb.transform(list(val['Dx']))

    # format the test split, then store it as the gold standard
    test, y_test = format_test(test, mlb)
    save_gs(test)

    # test documents: processed text plus the raw text and its index
    X_test = test['HPISection']
    X_test_original = test_raw['HPISection']
    X_test_original_index = test_raw['index']

    self.MAX_SEQ_LENGTH = max(len(doc.split()) for doc in X_train)
    self.NUM_LABELS = y_train.shape[1]

    # fit a tokenizer on the training documents and take its word -> integer map
    tokenizer = self.create_tokenizer(X_train)
    word2idx = tokenizer.word_index

    print("\n Setting vocabulary size...")
    self.set_num_words(word2idx)

    # save tokenizer
    # with open(OUTPUT_PATH + model_.MODEL_NAME + '_tokenizer.pickle', 'wb') as handle:
    #     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def encode(documents):
        # Texts -> integer sequences, then pad / truncate at the front.
        sequences = tokenizer.texts_to_sequences(documents)
        return pad_sequences(sequences, maxlen=self.MAX_SEQ_LENGTH,
                             padding='pre', truncating='pre')

    x_train = encode(X_train)
    x_test = encode(X_test)
    x_val = encode(X_val)

    return (x_train, y_train, x_val, y_val, x_test, y_test,
            X_test_original, X_test_original_index, word2idx, mlb,
            test['id'])
# Labels of every training business, collected into an array.
# NOTE(review): if getLabels returns lists of differing lengths this builds a
# ragged array, which recent NumPy versions reject -- confirm the label shape.
train_labels = numpy.array(
    list(map(getLabels, train_business_features['label'])))

# Feature vectors of every training business.
train_features = numpy.array(
    list(map(getFeatureVectors, train_business_features['feature vector'])))

# Feature vectors of every test business.
test_features = numpy.array(
    list(map(getFeatureVectors, test_business_features['feature vector'])))

# Binarize the training labels into an indicator matrix for multi-label
# classification.
mul_bin = MultiLabelBinarizer()
train_labels_bin = mul_bin.fit_transform(train_labels)

# Hold out 20% of the training data so an F1 score can be computed on it.
# The seeded RandomState makes the split repeatable.
random_state = numpy.random.RandomState(0)
train_feat, test_feat, train_lab, test_lab = train_test_split(
    train_features,
    train_labels_bin,
    test_size=0.2,
    random_state=random_state,
)

# One linear-kernel SVM per label, with probability estimates enabled.
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))

# Train on the 80% portion.
classifier.fit(train_feat, train_lab)

# Predict labels for the held-out 20%.
predict_test_lab = classifier.predict(test_feat)