class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(ExtraTreeClassifier(criterion="gini",
                                                                     max_depth=None,
                                                                     min_samples_split=2,
                                                                     min_samples_leaf=1,
                                                                     min_weight_fraction_leaf=0.,
                                                                     max_features="auto",
                                                                     max_leaf_nodes=None,
                                                                     class_weight=None),
                                                 n_jobs=-1
                                                 )

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(mat.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        predicted = self.classificator.predict(mat.toarray())
        return self.mlb.inverse_transform(predicted)
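# A minimal, self-contained sketch of the fit/predict pattern the class above
# relies on (toy data; the CountVectorizer/ExtraTreeClassifier settings here
# are illustrative, not the original configuration).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import ExtraTreeClassifier

statements = ["sort an array of integers",
              "shortest path in a graph",
              "sort the edges then build a graph"]
tags = [{"sorting"}, {"graphs"}, {"sorting", "graphs"}]

vectorizer = CountVectorizer()
mlb = MultiLabelBinarizer()
clf = OneVsRestClassifier(ExtraTreeClassifier(random_state=0))

X = vectorizer.fit_transform(statements)
Y = mlb.fit_transform(tags)
clf.fit(X.toarray(), Y)

predicted = clf.predict(vectorizer.transform(statements).toarray())
print(mlb.inverse_transform(predicted))  # e.g. [('sorting',), ('graphs',), ('graphs', 'sorting')]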
def fit_images():
    client = pymongo.MongoClient('localhost', 27017)
    db = client['image_annotation']
    responses = db['mapped_responses'].find()
    no_labels = db['labels_binary'].find()
    numbers = []
    for i in no_labels:
        numbers.append(set([int(i["number"])]))
    train_data = []
    labels = []
    indexes = {}
    mlb = MultiLabelBinarizer()
    mlb.fit(numbers)
    for index, instance in enumerate(responses):
        t_data =  instance['hist']['0']
        indexes[index] = instance['image_no']
        train_data.append(t_data)
        label = instance['binary_results']
        new_labels = []
        for key, value in enumerate(label):
            value1 = int(value)
            new_labels.append(set([value1]))
        new_labels = mlb.transform(new_labels)
        labels.append(label)
    classifier = KNeighborsClassifier(n_neighbors = 5, weights='uniform')
    classifier.fit(train_data, labels)
    build_dir = getBuildDir()
    with open(join(build_dir, 'model.data'), 'wb') as f:
        pickle.dump(classifier, f, protocol=1)
    client.close()
def test_multilabel_classification_report():
    n_classes = 4
    n_samples = 50
    make_ml = make_multilabel_classification
    _, y_true_ll = make_ml(n_features=1, n_classes=n_classes, random_state=0,
                           n_samples=n_samples)
    _, y_pred_ll = make_ml(n_features=1, n_classes=n_classes, random_state=1,
                           n_samples=n_samples)

    expected_report = """\
             precision    recall  f1-score   support

          0       0.50      0.67      0.57        24
          1       0.51      0.74      0.61        27
          2       0.29      0.08      0.12        26
          3       0.52      0.56      0.54        27

avg / total       0.45      0.51      0.46       104
"""

    lb = MultiLabelBinarizer()
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        report = classification_report(y_true, y_pred)
        assert_equal(report, expected_report)
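# Side note (sketch): fitting on [range(4)], as above, registers the classes
# 0..3 in a single call, because each element of the outer list is treated as
# one sample's set of labels.
from sklearn.preprocessing import MultiLabelBinarizer

lb = MultiLabelBinarizer().fit([range(4)])
print(lb.classes_)             # [0 1 2 3]
print(lb.transform([[0, 3]]))  # [[1 0 0 1]]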
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
	import numpy as np

	train_matrix, tfidf = tf_idf_fit_transform(sentences)

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer()
	label_matrix = mlb.fit_transform(labels)

	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	estimator = LinearSVC()
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, label_matrix)

	for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
		test_sentences = doc2sentences([test_doc])
		sentence_matrix = tfidf.transform(test_sentences)
		print("Shape of sentence matrix : ", sentence_matrix.shape)
		predictions = classifier.predict(sentence_matrix)

		from lxml import etree
		document = etree.Element('doc')
		doc_tree = etree.ElementTree(document)
		for i in range(len(test_sentences)):
			curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x]==1]
			etree.SubElement(document, "Sent", classes=", ".join(curr_pred)).text = test_sentences[i]
		doc_tree.write(output_file_path)
Example #5
def evaluate_solution(users, urecovered, observed_index, xs=None, E=None,
                      hidden_edges=None):
    """Evaluate the quality of the recovered user profile"""
    mse = mean_squared_error(users[observed_index, :],
                             urecovered[observed_index, :])
    if hidden_edges is None or len(hidden_edges) < 1:
        return mse, None
    labeler = MultiLabelBinarizer(classes=np.arange(xs.shape[1]))
    gold = labeler.fit_transform([E[e] for e in sorted(hidden_edges)])
    # gold = np.array([E[e] for e in sorted(hidden_edges)])
    eh = sorted(hidden_edges)
    heads, tails = zip(*eh)
    Cr = np.dot(urecovered, xs.T)
    Dr = np.abs(Cr[heads, :] - Cr[tails, :])
    # TODO prediction here could be better: instead of predict the k best
    # directions all the time, look at revealed edge to compute threshold of
    # similarity (i.e replace 0.05)
    best_dirs = np.argsort(Dr, 1).astype(int)[:, :2]
    pred = []
    for all_dir, suggestion in zip(Dr, best_dirs):
        my_pred = [suggestion[0]]
        if all_dir[suggestion[1]] < 0.05:
            my_pred.append(suggestion[1])
        pred.append(my_pred)
    pred = labeler.fit_transform(pred)
    return mse, f1_score(gold, pred, average='samples')
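# Sketch of why the repeated fit_transform calls above stay consistent: with
# classes fixed at construction time, the column order is always the same, so
# the gold and pred matrices line up (toy labels, illustrative only).
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

labeler = MultiLabelBinarizer(classes=np.arange(4))
gold = labeler.fit_transform([[0, 2], [1]])
pred = labeler.fit_transform([[0], [1, 3]])
print(gold)  # [[1 0 1 0]
             #  [0 1 0 0]]
print(pred)  # [[1 0 0 0]
             #  [0 1 0 1]]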
def main():
    #Explore the data for how many class labels
    reviewsDict = {}
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/reviewUsefulDict.pickle") as f:
        reviewsDict = pickle.load(f)
    print "Reviews Dictionary loaded .. "
    '''
    usefulCountDict = {}
    for key, value in reviewsDict.iteritems():
        if value not in usefulCountDict:
            usefulCountDict[value] = 1
        else:
            usefulCountDict[value] = usefulCountDict[value]+1
    pprint(usefulCountDict)
    '''
    corpus, target = DictToList(reviewsDict)
    
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True)
    XAll = vectorizer.fit_transform(corpus)
    mlb = MultiLabelBinarizer()
    yAll = mlb.fit_transform(target)
    
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.fv", 'w') as f:
        pickle.dump(XAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.target2", 'w') as f:
        pickle.dump(yAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.mlb", 'w') as f:
        pickle.dump(mlb, f)
    
    print "Dumped featrue vectors .... "
def get_training_data(window_size_ms, train_time_sec=30):
	#loop until empty input is detected
	X = []
	y = []

	print "Training time for each key is {} seconds".format(train_time_sec)
	i = 0
	while True:
		s = raw_input('Press <enter> to begin training key {} or q-<enter> to quit'.format(i))
		if s: break

		j = 0
		while j < train_time_sec:
			j += (window_size_ms / float(1000))
			freq_spect = read_spectral_data_for_time(window_size_ms)
			X.append(freq_spect)
			y.append([i])

		#increment key counter
		i += 1

	mb = MultiLabelBinarizer()
	y = mb.fit_transform(y)

	X = np.asarray(X)
	y = np.asarray(y)
	return X, y
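# Sketch: wrapping each key index in a one-element list, as done above, makes
# MultiLabelBinarizer produce a plain one-hot matrix (toy indices only).
from sklearn.preprocessing import MultiLabelBinarizer

y = [[0], [1], [1], [2]]
print(MultiLabelBinarizer().fit_transform(y))
# [[1 0 0]
#  [0 1 0]
#  [0 1 0]
#  [0 0 1]]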
Example #8
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
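# Sketch of the fit-on-train / transform-on-test pattern used above, on toy
# category lists (illustrative only; the real code feeds reuters.categories()).
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform([['grain', 'wheat'], ['ship']])
y_test = mlb.transform([['wheat'], ['ship', 'grain']])
print(mlb.classes_)  # ['grain' 'ship' 'wheat']
print(y_train)       # [[1 0 1] [0 1 0]]
print(y_test)        # [[0 0 1] [1 1 0]]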
Example #9
def generateTrainFeatures(L):
    """
    This function generates the training data features and its target labels.
    Input: L : The number of training data
    Output: trainX -> a (L * 2000) numpy matrix representing the 2000 features for each of the
                        L training samples
            trainY -> (L * 185) numpy matrix representing the target class of the training samples
    Logic:
    The input text is read, preprocessed to remove stop words, and is appended to a list.
    Similarly, each of the target class values are read into a list.
    Sklearn package TFIDF vectorizer is used for generating TFIDF matrix for the 2000 frequent
    words. 
    The multi-label classification algorithms require a target Y variable of the form,
    (nsamples * nclasses), multilabel binarizer is used for converting the list of classes
    to a matrix form.
    """
    global classOrder
    X = []
    Y = []
    # read the input
    for i in range(L):
        categories = raw_input()
        target = [int(y) for y in categories.split(" ")]
        del target[0]
        meaningfulWords = readInput()
        Y.append(target)
        X.append(meaningfulWords)
    # construct TF-IDF matrix representing the features
    trainX = vectorizer.fit_transform(X).toarray()
    # convert the target label list to a suitable matrix form
    mlb = MultiLabelBinarizer()
    trainY = mlb.fit_transform(Y)
    # for representing the order of the classes
    classOrder = mlb.classes_
    return (trainX, trainY)
Example #10
def read_all_data(p):
    img_src = "images/"

    df = pd.read_pickle("frame_no_stem.pkl")
    images = __read_all_images(img_src) 
    print("Finished reading images")

    x_images = []
    x_desc = []
    y_category = []
    all_categories = set()

    for asin in df.index.values:
        if asin in images:
            data = images[asin]
            x_images.append(data)

            item = df.loc[asin]
            x_desc.append(item.description)
            cate = item.categories
            y_category.append(cate)
            for c in cate:
                all_categories.add(c)

    print("Finished reading dataframe")
    mlb = MultiLabelBinarizer()
    y_total = mlb.fit_transform(y_category)
    x_images = np.array(x_images)
    x_desc = np.array(x_desc)

    
    return x_images,x_desc, y_total
def run_classifier(sentences, labels, test_docs):
	import numpy as np

	train_matrix, tfidf = tf_idf_fit_transform(sentences)

	test_sentences = doc2sentences(test_docs)
	sentence_matrix = tfidf.transform(test_sentences)
	print("Shape of sentence matrix : ", sentence_matrix.shape)

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer()
	label_matrix = mlb.fit_transform(labels)

	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	# estimator = SVC(kernel='linear')
	estimator = LinearSVC()
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, label_matrix)
	predictions = classifier.predict(sentence_matrix)

	import csv
	with open("classified.csv", "w") as fl:
		writer = csv.writer(fl)
		for i in range(len(test_sentences)):
			curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x]==1]
			writer.writerow((test_sentences[i], curr_pred))
Example #12
    def __init__(self, inter_filePath = "inter/technology_companies_of_the_united_states/"):
        # [[cat,cat...]...]
        self.m = Word2Vec.load_word2vec_format("vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True) 
        self.dim = 400

        (correct_categories_train, context_categories_train) = self.load_category_page(inter_filePath + "category_page.txt")  
        (correct_categories_test, context_categories_test) = self.load_category_page(inter_filePath + "category_page_test.txt")
        ## ----  By mean ---
        Xvectors = np.array(self.predict_vector_by_mean(context_categories_train))
        Xvectors_test = np.array(self.predict_vector_by_mean(context_categories_test))


        ## ----  By mean --- *

        ## ----  By SVM ---
        corpus_train = [" ".join(i) for i in context_categories_train]
        corpus_test = [" ".join(i) for i in context_categories_test]
        cv = CountVectorizer(min_df = 1)
        X = cv.fit_transform(corpus_train)
        ##TFIDF
        transformer = TfidfTransformer()
        X_tfidf = transformer.fit_transform(X)
        #Labels
        mlb = MultiLabelBinarizer()
        mlb.fit(correct_categories_train + correct_categories_test)
        Y = mlb.transform(correct_categories_train) ###Transform to multilabel indicator
        #predict test labels
        X_test = cv.transform(corpus_test)
        Y_test = mlb.transform(correct_categories_test)
        #Y_predict_ovr = self.ovrSVM(X, Y, X_test)
        Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test)
        #Y_predict_ovo = self.ovoSVM(X, Y, X_test)
        print "---One versus rest---"
        print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro')
        print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME,
                                        train_size=ds.DEFAULT_TRAININGSET_SIZE):
    
    """
    Get all document_ids of the given database and split them according to the
    given train_size.
    The tricky part is that we n
    
    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - List of lists 
                    [[DEFAULT_DATASET_LIST_INDEX_TRAINING], 
                    [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    
    doc_ids_list = []
    all_tag_list = []
    
    i = 0
    
    for row in all_docs.rows:
        
        document = row.doc
        #append the document id to doc_ids_list
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])
        
        tag_list = []
        
        #if document has tags than split and add them
        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():
            
            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
            
            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)
            
            for tag in tags_list:
                
                #remove the closing tag (last item)
                tag_list.append(tag[:-1])
        #append the list of document tags to all_tag_list        
        all_tag_list.append(tag_list)
        
        i += 1
        
        if i > 10000:
            break
    
    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)

    
    print(len(doc_ids_list))
    
    splitted_dataset = cross_validation.train_test_split(doc_ids_list, tags_encoded,
                                               train_size=train_size, random_state=42,
                                               stratify=tags_encoded)

    return splitted_dataset
Example #14
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.

    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict

            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this
            known_lset = [set([label for label in ls if label in self.Ydict.classes_]) for ls in lset]
            lset_n = sum([len(ls) for ls in lset])
            known_lset_n = sum([len(ls) for ls in known_lset])
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)

            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.

        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities not predicted) """
        skl_score = cfier.score(self.X.toarray(), self.Y)

        # XXX: Matched paths might/could be weighted by their nMatches too...

        # Measure prediction performance
        Ypred = cfier.predict(self.X.toarray())
        n_q = float(np.size(self.Y, axis=0))
        # number of questions where all correct paths have been recalled
        recall_all = np.sum(np.sum(self.Y, axis=1) == np.sum(Ypred * self.Y, axis=1)) / n_q
        # number of questions where at least one correct path has been recalled
        recall_any = np.sum((np.sum(self.Y, axis=1) != 0) == (np.sum(Ypred * self.Y, axis=1) != 0)) / n_q
        # number of *PATHS* (not q.) that were correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, self.X.toarray())
        # MRR of first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # number of questions where at least one correct path has been recalled in top N paths
        # TODO

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
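# Sketch of the unknown-label filtering done in VectorizedData.__init__ above
# (toy labels; transform() behaviour on unseen labels varies across sklearn
# versions, which is why the filtering is explicit).
from sklearn.preprocessing import MultiLabelBinarizer

ydict = MultiLabelBinarizer().fit([['a', 'b'], ['c']])
lset = [{'a', 'z'}, {'c'}]  # 'z' was never seen at fit time
known = [{l for l in ls if l in ydict.classes_} for ls in lset]
print(ydict.transform(known))  # [[1 0 0]
                               #  [0 0 1]]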
def createDataMatrix(ngram_features, character_gram_features,tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories):
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)
    handmade_features, cll, cll2 = [], [], []
    for tweet in tweetText:
        feat = []
        feat.append(exclamations(tweet))
        feat.append(questions(tweet))
        feat.append(questions_and_exclamation(tweet))
        feat.append(emoticon_negative(tweet))
        feat.append(emoticon_positive(tweet))
        words = tokenizer_case_preserve.tokenize(tweet) #preserving casing
        feat.append(allCaps(words))
        feat.append(elongated(words))
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))
        words = tokenizer.tokenize(tweet)
        words = [word.strip("_NEG") for word in words]
        cll.append(getClusters(voca_clusters, words))
        #cll2.append(getClusters(voca_handmade, words))


    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text ))
    mpqa_feat = csr_matrix(mpqa(tweetText,pos, different_pos_tags, pos_text))
    handmade_features = np.array(handmade_features)
    mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))
    #mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_handmade.values())))
    #cluster_memberships_binarized_2 = csr_matrix(mlb.fit_transform(cll2))
    
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
#    sent140aff_data = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
#    sent140affBigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))
    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, 
#                             sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
#    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ, handmade_features, pos_features, cluster_memberships_binarized, bl, mpqa_feat, nrc_emo, hasht, hasht_bigrams ), dtype=float)

#     print ngram_features.shape, character_gram_features.shape, cluster_memberships_binarized.shape, handmade_features.shape, pos_features.shape, 
#     sent140affBigrams.shape, hasht_bigrams, hasht.shape, sent140aff_data.shape, bl.shape, mpqa_feat.shape, nrc_emo.shape
    y=[]
    for i in categories:
        if i=='positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            print i
    ffeatures = normalize(ffeatures)
#     ffeatures, y = shuffle(ffeatures,y)
    return ffeatures, y
Example #16
def print_report(name_classificator, testing_problems, testing_tags, predicted_problems, predicted_tags):
    predicted_problems, predicted_tags = make_right_order(testing_problems, predicted_problems, predicted_tags)
    mlb = MultiLabelBinarizer().fit(testing_tags + predicted_tags)
    testing_tags = mlb.transform(testing_tags)
    predicted_tags = mlb.transform(predicted_tags)
    print(name_classificator)
    print(classification_report(testing_tags, predicted_tags, target_names=mlb.classes_))
    print('label ranking average precision score =',
          label_ranking_average_precision_score(testing_tags, predicted_tags))
    print('\n', ('#'*100), '\n')
def xval(clf, x, y, train_index, test_index):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train)
    mlb = MultiLabelBinarizer()
    y_pred = clf.predict_proba(x_test)
    mse = mean_squared_error(mlb.fit_transform(label_binarize(y_test, clf.classes_)), y_pred)
    acc = accuracy_score(y_test, y_pred.argmax(axis=1))
    evals = clf.get_num_evals()
    return mse, acc, evals
Example #18
    def test_BRKnna_no_labels_take_closest(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
        train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=True)
        y = mlb.fit_transform(train_ids)
        knn = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
        print(pred)
        np.testing.assert_array_equal([[1, 0, 0, 0, 0]], pred)
Example #19
    def test_BRKnna_predict_dense(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
        np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
Example #20
    def test_BRKnnb_predict_two_samples(self):
        data = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid5'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=True)
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[0, 1], [2, 2]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], pred)
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
	all_labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane", \
			"Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations", \
			"donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage", \
			"injured_or_dead_people", "missing_trapped_or_found_people"]
	disaster_labels = ["Drought", "Earthquake", "Flood", "Hurricane", \
			"Tornado", "Tsunami", "displaced_people_and_evacuations", \
			"donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage", \
			"injured_or_dead_people", "missing_trapped_or_found_people"]
	health_labels = ["Epidemic", "displaced_people_and_evacuations", \
			"donation_needs_or_offers_or_volunteering_services", \
			"injured_or_dead_people"]
	conflict_labels = ["Rebellion", "Terrorism", "displaced_people_and_evacuations", \
			"infrastructure_and_utilities_damage", \
			"injured_or_dead_people", "missing_trapped_or_found_people"]
	import numpy as np
	curr_labels = all_labels

	trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
	testLabels = [list(set(l).intersection(curr_labels))for l in testLabels]

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer(classes=curr_labels)
	mlb.fit(trainLabels)
	print("Labels : ", mlb.classes_)
	train_label_matrix = mlb.transform(trainLabels)
	test_label_matrix = mlb.transform(testLabels)
	print("Shape of label matrix : ", test_label_matrix.shape)

	train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
	test_matrix = tfidf.transform(testSentences)
	print("Shape of sentence matrix : ", test_matrix.shape)


	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	from sklearn.ensemble import RandomForestClassifier
	# estimator = LinearSVC()
	estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs = -1)
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, train_label_matrix)
	predictions = classifier.predict(test_matrix)

	from sklearn.metrics import f1_score, precision_score, recall_score
	print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
	print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
	print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
	print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
	print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
	print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
	print("Macro-Precision", precision_score(test_label_matrix, predictions, average=None))
	print("Macro-Recall", recall_score(test_label_matrix, predictions, average=None))
	print("Macro-F1", f1_score(test_label_matrix, predictions, average=None))
Example #22
def main():
    #sets = select_by_trait(10,2,tags=["Comedy","Human","Sad","Dark"])
    sets = select_sets_by_tag(20,4,tag_names)
    #sets = random_select_sets(30,6)
    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])#txt_to_list(sets["train"])
    #vectorize
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16", input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)

    #tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    #process tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)
    #print(processed_train_tags)
    #classifier
    #clf = OneVsRestClassifier(MultinomialNB())
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf,processed_train_tags)
    print("classes:{}".format(clf.classes_))
    #process test set

    test_texts = id_to_filename(sets["test"])#txt_to_list(sets["test"])
    X_test_counts = count_vect.transform(test_texts)
    #print("X_test_counts inverse transformed: {}".format(count_vect.inverse_transform(X_test_counts)))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    predicted_probs = clf.decision_function(X_test_tfidf)
    #predicted_probs = clf.get_params(X_test_tfidf)
    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),predicted_tags,target_names=class_list)
    print(report)
    #retrieve top 30% for each class
    top_percentage = 30
    threshold_index = int( len(sets["test"]) *(top_percentage/100.0) )
    threshold_vals_dic = {}
    threshold_vals = []
    num_classes = len(class_list)
    for i in range(num_classes):
        z = [ predicted_probs[j,i] for j in range(len(sets["test"]))]
        z.sort(reverse=True)
        threshold_vals_dic[class_list[i]]= z[threshold_index]
        threshold_vals.append(z[threshold_index])
    print(threshold_vals_dic)


    print_predictions(sets["test"],predicted_tags_readable,class_list, class_probablities=predicted_probs,threshold_vals=threshold_vals)
Example #23
class TimeSeriesLabelTransformer(BaseTaskTransformer):

    def __init__(self, namespace, name, labels=None):
        '''Initialize a time-series label transformer

        Parameters
        ----------
        namespace : str
            The JAMS namespace for this task

        name : str
            The name of this transformer

        labels : list of str
            The labels to encode with the internal
            sklearn.preprocessing.MultiLabelBinarizer
        '''

        super(TimeSeriesLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):

        ann = self.find_annotation(jam)

        intervals = np.asarray([[0.0, jam.file_metadata.duration]])
        values = [None]
        mask = False

        if ann:
            ann_int, ann_val = ann.data.to_interval_values()
            intervals = np.vstack([intervals, ann_int])
            values.extend(ann_val)
            mask = True

        # Suppress all intervals not in the encoder
        tags = []
        for v in values:
            if v in self._classes:
                tags.extend(self.encoder.transform([[v]]))
            else:
                tags.extend(self.encoder.transform([[]]))

        tags = np.asarray(tags)
        target = self.encode_intervals(jam.file_metadata.duration,
                                       intervals,
                                       tags)
        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
def get_data(train_file, test_file):
    X_train, Y_train = load_data(train_file)
    X_train = [ln.split('\t')[1] for ln in X_train]
    X_test, Y_test = load_data(test_file)
    X_test = [ln.split('\t')[1] for ln in X_test]

    mlb = MultiLabelBinarizer()
    Y_train = [set(s.split('_')) - {'None'} for s in Y_train]
    Y_test = [set(s.split('_')) - {'None'} for s in Y_test]
    Y_train = mlb.fit_transform(Y_train)
    Y_test = mlb.transform(Y_test)

    return X_train, X_test, Y_train, Y_test, mlb.classes_
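# Sketch of the label parsing used above: underscore-separated tags with the
# literal 'None' token dropped before binarization (toy strings only).
from sklearn.preprocessing import MultiLabelBinarizer

raw = ["Flood_Earthquake", "None", "Flood_None"]
sets = [set(s.split('_')) - {'None'} for s in raw]
mlb = MultiLabelBinarizer()
print(mlb.fit_transform(sets))  # rows align with raw, columns with mlb.classes_
print(mlb.classes_)             # ['Earthquake' 'Flood']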
def test_multilabelbinarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.MultiLabelBinarizer
    # with sklearn.preprocessing.MultiLabelBinarizer

    multilabelbinarizerr = MultiLabelBinarizerR()
    multilabelbinarizerr.fit(np.concatenate(trajs))

    multilabelbinarizer = MultiLabelBinarizer()
    multilabelbinarizer.fit(trajs)

    y_ref1 = multilabelbinarizerr.transform(trajs[0])
    y1 = multilabelbinarizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
    def __kfold_prob_tp_fp(self, X, y, n_folds=2):
        # if isinstance(X, csr_matrix) and isinstance(y, np.ndarray):
        #     X=X.toarray()
        # elif isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
        #     if len(y.shape)==1:
        #         y=MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y])
        #     elif len(y.shape)==2:
        #         pass
        if isinstance(y, list):
            y = np.asarray(y)

        try:
            with open(self.prefix + self.dir_name + "/" + str(n_folds) + "FCV_prob.pickle", "rb") as f:
                [tp_av, fp_av] = pickle.load(f)
        except:
            kf = KFold(y.shape[0], n_folds=n_folds)
            TP_avr = []
            FP_avr = []
            for train_index, test_index in kf:
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model = self.model
                model = model.fit(X_train, y_train)
                y_predict = model.predict(X_test)
                y_prob_predict = model.predict_proba(X_test)
                TP = []
                FP = []
                if len(y.shape) == 1:
                    y_predict = MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y_predict])
                elif len(y.shape) == 2:
                    pass
                for class_ind, class_prob in zip(y_predict.transpose(), y_prob_predict.transpose()):
                    TP_class = []
                    FP_class = []
                    for ind, prob in zip(class_ind, class_prob):
                        if ind == 1:
                            TP_class.append(prob)
                        elif ind == 0:
                            FP_class.append(prob)
                    TP.append(np.sum(TP_class) / len(class_ind))
                    FP.append(np.sum(FP_class) / len(class_ind))
                TP_avr.append(TP)
                FP_avr.append(FP)
            tp_av, fp_av = np.average(TP_avr, axis=0), np.average(FP_avr, axis=0)
            with open(self.prefix + self.dir_name + "/" + str(n_folds) + "FCV_prob.pickle", "wb") as f:
                pickle.dump([tp_av, fp_av], f)
                f.close()
            # print('tp, fp by prob', tp_av, fp_av)
        return [tp_av, fp_av]
Example #27
def load_data():
    labels=pd.read_csv("train.csv")
    bismatch=pd.read_csv("train_photo_to_biz_ids.csv")
    labels=bismatch.merge(labels,how='left',on='business_id')
    labels=labels[pd.isnull(labels['labels'])==False]
    labels['labels']=labels['labels'].map(lambda x:[int(i) for i in x.split(" ")])
    training_=os.listdir("train_photos/train244")
    train_ids=pd.DataFrame({"photo_id":[int(i.split(".")[0]) for i in training_]})
    train_ids=train_ids.merge(labels,on='photo_id',how='inner')
#    val_ids=val_ids.merge(labels,on='photo_id',how='inner')
    mlb=MultiLabelBinarizer()
    mlb.fit(train_ids['labels'].tolist())
#    X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32)
#    X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32)
    return train_ids,mlb
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None,
             multilabel=False):
    print "prepping the Word Tokenizer..."
    _0, _1, trY, _3 = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _4, sbuY, _5 = sbuXYFilenames(n_sbu)
        trY.extend(sbuY)
    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(trY, n_captions)
    vect.fit(captions)
    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb.fit(vect.transform(captions))
        return vect, mlb
    # if not multilabel:
    return vect
Example #29
    def __init__(self):
        self.trainExamples = ['exodus_gods_and_kings.p', 'how_to_train_your_dragon_2.p', 'bears.p', 'see_no_evil_2.p', 'addicted.p', "the_internet's_own_boy_the_story_of_aaron_swartz.p", 'the_salt_of_the_earth.p', 'the_other_woman.p', 'project_almanac.p', 'edge_of_tomorrow.p', 'maya_the_bee_movie.p', 'cowspiracy_the_sustainability_secret.p', "let's_be_cops.p", "winter's_tale.p", 'the_trip_to_italy.p', 'yellowbird.p', 'alexander_and_the_terrible_horrible_no_good_very_bad_day.p', 'rosewater.p', 'the_hero_of_color_city.p', 'endless_love.p', 'dracula_untold.p', 'dumb_and_dumber_to.p', 'tomorrowland.p', 'the_hunger_games_mockingjay_part_1.p', 'tammy.p', 'hot_tub_time_machine_2.p', 'lucy.p', 'the_lego_movie.p', 'the_judge.p', 'cake.p', 'st_vincent.p', 'black_or_white.p', 'american_sniper.p', 'mr_peabody_&_sherman.p', 'this_is_where_i_leave_you.p', 'x-men_days_of_future_past.p', 'non-stop.p', 'get_on_up.p', 'the_fault_in_our_stars.p', 'song_one.p', 'robocop.p', 'into_the_storm.p', 'a_most_wanted_man.p', 'the_good_lie.p', 'wild.p', 'the_maze_runner.p', 'beyond_the_lights.p', 'divergent.p', 'spring.p', 'as_above_so_below.p', 'noble.p', 'hercules.p', 'i-lived&y=2015.p', 'night_at_the_museum_secret_of_the_tomb.p', 'planes:fire_&_rescue.p', 'old_fashioned.p', 'the_identical.p', 'dawn_of_the_planet_of_the_apes.p', 'cabin_fever_patient_zero.p', 'ride_along.p', 'dear_white_people.p', 'if_i_stay.p', 'red_army.p', 'the_boxtrolls.p', 'captain_america_the_winter_soldier.p', 'virunga.p', 'the_interview.p', 'earth_to_echo.p', 'a_walk_among_the_tombstones.p', 'persecuted.p', 'the_book_of_life.p', 'unbroken.p', 'the_drop.p', 'need_for_speed.p', 'brick_mansions.p', 'maleficent.p', 'blended.p', "devil's_due.p", 'jessabelle.p', 'fear_clinic.p', 'gone_girl.p', 'birdman_or_the_unexpected_virtue_of_ignorance.p', 'kill_the_messenger.p', 'my_little_pony_equestria_girls.p', 'rio_2.p', 'big_hero_6.p', 'guardians_of_the_galaxy.p', 'noah.p', 'the_hobbit_the_battle_of_the_five_armies.p', 'i_frankenstein.p', 'the_november_man.p', 'the_pyramid.p', 'and_so_it_goes.p', 'birdman_or_the_unexpected_virtue_of_ignorance.p', 'inherent_vice.p', 'merchants_of_doubt.p', 'iris.p', 'lambert,_stamp.p']
        self.testExamples = [x for x in util2.getMovieDataset() if x not in self.trainExamples]
        # Standard DictVectorizer fitted with all colors as the features.
        self.dVec = DictVectorizer(sparse=False)
        self.dVec.fit([dict((feature,0) for feature in util2.getColors())])
        # Standard MultiLabelBinarizer with all genre names 
        self.mlb = MultiLabelBinarizer()
        self.pipeline = Pipeline([
            ('organizeData', Movie_Data_Aggregator()),
            ('union', FeatureUnion(
                transformer_list = [
                ('colors', Pipeline([
                    ('selector', Data_Selector(key='colors')),
                    ('dVec', self.dVec),
                    ])),
                ('subs', Pipeline([
                    ('selector', Data_Selector(key='subs')),
                    ('tfidf', TfidfVectorizer(strip_accents='ascii', max_features=15)),
                    ])),
                ],
                transformer_weights={
                'colors': 0.5,
                'subs': 0.5,
                },
                )),
            ('sgd', SGDClassifier(alpha= 1e-06, loss="perceptron", n_iter= 150, penalty="l2")),
            ])
        # OneVsRestClassifier used for prediction
        self.classif = OneVsRestClassifier(self.pipeline)
def chi2(X, y):
    X = check_array(X, accept_sparse='csr')
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = MultiLabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)          # n_classes * n_features

    feature_count = check_array(X.sum(axis=0))
    class_prob = check_array(Y.mean(axis=0))
    expected = np.dot(class_prob.T, feature_count)

    return _chisquare(observed, expected)
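# For comparison, a minimal sketch using the stock scikit-learn chi2 on a toy
# count matrix (the _chisquare helper above is a scikit-learn internal; the
# public sklearn.feature_selection.chi2 is the usual entry point).
import numpy as np
from sklearn.feature_selection import chi2 as skl_chi2

X = np.array([[1, 0, 3], [0, 2, 1], [4, 0, 0], [0, 1, 2]])
y = [0, 1, 0, 1]
scores, pvals = skl_chi2(X, y)
print(scores)
print(pvals)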
Example #31
class BaseAttributeCluster(object):
    attribute_name = ''

    def __init__(self):
        self._binariser = None
        self._kmeans_model = None
        self._records = self.get_records()
        self.data = pd.DataFrame(self._records)
        self.binarised_data = self.transform_data(
            self.data[self.attribute_name])
        self.train, self.test = train_test(self.binarised_data,
                                           self.data['class'])

    @property
    def binariser(self):
        if self._binariser is None:
            all_labels = list(
                set([
                    item for r in self._records
                    for item in self.tokenise(r[self.attribute_name])
                ]))
            self._binariser = MultiLabelBinarizer()
            self._binariser.fit([all_labels])
        return self._binariser

    def kmeans_model(self, filepath=None, ignore_no_file=True, n_clusters=2):
        if filepath is not None and os.path.exists(filepath):
            with open(filepath, 'rb') as f:
                self._kmeans_model = dill.load(f)
        elif filepath is not None and not ignore_no_file:
            raise FileNotFoundError(f'Saved model not found at {filepath}.')
        else:
            if self._kmeans_model is None:
                self._kmeans_model = KMeans(n_clusters=n_clusters)
                self._kmeans_model.fit(self.train['x'])
            if filepath is not None:
                with open(filepath, 'wb') as f:
                    dill.dump(self._kmeans_model, f)
        return self._kmeans_model

    def get_records(self):
        attr = getattr(Citation, self.attribute_name)

        with SessionManager() as session_manager:
            citations = session_manager.session.query(attr,
                                                      ManualClassification.classification_id) \
                .join(ManualClassification, Citation.doi == ManualClassification.doi) \
                .group_by(attr, ManualClassification.classification_id).all()

        return [{
            self.attribute_name:
            getattr(c, self.attribute_name)
            if getattr(c, self.attribute_name) is not None else '',
            'class':
            c.classification_id
        } for c in citations]

    @staticmethod
    def tokenise(input_string):
        return [t.strip().lower() for t in input_string.split(',')]

    def transform_data(self, data):
        binarised_data = self.binariser.transform(
            [self.tokenise(x) for x in data])
        return binarised_data
Example #32
 def __init__(self, **kwargs):
     self.model = OneVsRestClassifier(LinearSVC(**kwargs), n_jobs=1)
     self.paras = kwargs
     self.mlb = MultiLabelBinarizer()
Example #33
class PandasDataset:
    """Class to simplify pre processing steps on dataframe. Requires prioir understanding of the dataset

    """
    def __init__(self):
        self.original_df = None
        self.current_df = None
        self.label_encoder = None

    def from_preprocessed(self, path: str):
        """Load from pre processed dataset file

        :param path: path to pre processed PandasDataset file
        :return:
        """
        with open(path, 'rb') as f:
            dataset = pickle.load(f)
            self.original_df = dataset.original_df
            self.current_df = dataset.current_df
            self.label_encoder = dataset.label_encoder
            f.close()

    def read_data(self, filename: str = "sentisum-evaluation-dataset.csv"):
        """Load the CSV file into a workable format

        :param:
        :return: pd dataset
        """
        self.original_df = pd.read_csv(filename, header=None)
        data = self.original_df.fillna('')

        column_names = ['text']
        label_names = []
        for idx in range(1, 15):
            name = 'label_' + str(idx)
            label_names.append(name)
            column_names.append(name)

        data.columns = column_names

        data['topics'] = data[label_names].values.tolist()

        out_data = data[['text', 'topics']]

        def clean_topics(x):
            return [top for top in x if top != '']

        out_data['topics'] = out_data['topics'].map(clean_topics)

        self.current_df = out_data

        return self.current_df

    def replace_labels(self, label: str, target: str):
        """Replace occurances of all labels with the target label

        :param label: source label
        :param target: target label
        :return:
        """
        def replace_lab(x):
            return [top if top != label else target for top in x]

        self.current_df.topics = self.current_df.topics.map(replace_lab)

    def merge_labels(self,
                     minimum_samples: int = 100,
                     minority_label: str = 'others'):
        """Merge Labels with less than minimum samples

        :param minimum_samples:
        :param minority_label: name for the common label
        :return:
        """
        label_counts = self.current_df.topics.explode().value_counts()
        label_names = label_counts.index

        label_others = []
        for idx, label in enumerate(label_names):
            if label_counts[idx] < minimum_samples:
                label_others.append(label)

        def replace_others(x):
            new_labels = []
            for top in x:
                sent = top.split(' ')[-1]
                if top in label_others:
                    new_labels.append(' '.join([minority_label, sent]))
                else:
                    new_labels.append(top)
            return new_labels

        self.current_df.topics = self.current_df.topics.map(replace_others)

        return self.current_df

    def undersample_label(self, topic: str, fraction: float):
        """Undersample a given label. Selectively works on single occurances

        :param topic:
        :param fraction: fraction to retain
        :return:
        """
        temp_df = self.current_df[self.current_df.topics.apply(
            lambda x: topic in x)]
        temp_df = temp_df[temp_df.topics.str.len() == 1].sample(frac=fraction)

        single_label_data = self.current_df[self.current_df.topics.str.len() ==
                                            1]
        drop_index = single_label_data[single_label_data.topics.apply(
            lambda x: topic in x)].index
        self.current_df = self.current_df.drop(drop_index)
        self.current_df = self.current_df.append(temp_df)

    def undersample_label_combo(self, topic_a: str, topic_b: str,
                                fraction: float):
        """Under sample a given combination of labels.
        todo Add a combo with more than 2 topics
        :param topic_a:
        :param topic_b:
        :param fraction: fraction to retain
        :return:
        """
        temp_df = self.current_df[self.current_df.topics.apply(
            lambda x: x == [topic_a, topic_b])]
        temp_df = temp_df[temp_df.topics.str.len() == 2].sample(frac=fraction)

        double_label_data = self.current_df[self.current_df.topics.str.len() ==
                                            2]
        drop_index = double_label_data[double_label_data.topics.apply(
            lambda x: x == [topic_a, topic_b])].index

        self.current_df = self.current_df.drop(drop_index)
        self.current_df = self.current_df.append(temp_df)

    def overview(self):
        """Gives a quick overview of the current dataframe

        :return:
        """
        return {
            "value_counts":
            self.current_df.topics.explode().value_counts(),
            "labels":
            self.current_df.topics.explode().unique(),
            "mean no. of tokens":
            self.current_df.text.str.split().str.len().std(),
            "mean no. of sentences":
            self.current_df.text.str.split('.').str.len().std()
        }

    def encode_labels(self):
        """Encode the label classes for classification using MultiLabelBinarizer

        :return: class list
        """
        def l2t(x):
            return tuple(x)

        self.current_df.topics = self.current_df.topics.map(l2t)

        self.label_encoder = MultiLabelBinarizer()

        self.current_df['encoded'] = self.label_encoder.fit_transform(
            self.current_df.topics.tolist()).tolist()

        return self.label_encoder.classes_.tolist()

    def train_test_split(self, test_size: float = 0.2):
        """Generate train and test sets

        :param test_size: test set fraction
        :return: train_dataset, test_dataset
        """
        train_dataset, test_dataset = train_test_split(self.current_df,
                                                       test_size=test_size)
        train_dataset = train_dataset.reset_index(drop=True)
        test_dataset = test_dataset.reset_index(drop=True)
        return train_dataset, test_dataset

    def save_dataset(self, path: str):
        """

        :param path:
        :return:
        """
        output = open(path, 'wb')
        pickle.dump(self, output)
        output.close()
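# A possible end-to-end use of PandasDataset (sketch; assumes the default CSV
# layout expected by read_data and that pandas/sklearn are available).
ds = PandasDataset()
ds.read_data("sentisum-evaluation-dataset.csv")
ds.merge_labels(minimum_samples=100, minority_label="others")
classes = ds.encode_labels()
train_df, test_df = ds.train_test_split(test_size=0.2)
print(len(classes), "label classes;", len(train_df), "train rows")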
    if dat == 'forestcover':
        dataset = fetch_covtype(shuffle=True)
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)
        print_outlier_ratio(y)

    print('--- Vectorizing data...')

    if dat == 'SF':
        lb = MultiLabelBinarizer()
        x1 = lb.fit_transform(X[:, 1])
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != b'normal.').astype(int)
        print_outlier_ratio(y)

    if dat == 'SA':
        lb = MultiLabelBinarizer()
        x1 = lb.fit_transform(X[:, 1])
        x2 = lb.fit_transform(X[:, 2])
        x3 = lb.fit_transform(X[:, 3])
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != b'normal.').astype(int)
        print_outlier_ratio(y)

    if dat in ('http', 'smtp'):
Example #35
def main():

    f = open('store.pckl', 'rb')
    eng_data = pickle.load(f)
    f.close()

    eng_data = eng_data[eng_data.ingredients_text.apply(len) > 0].reset_index()

    np.random.seed(1234)
    train, validate, test = np.split(
        eng_data.sample(frac=1, random_state=134),
        [int(.6 * len(eng_data)),
         int(.8 * len(eng_data))])

    mlb = MultiLabelBinarizer()
    X_train = mlb.fit_transform(train['ingredients_text']).astype(np.float32)
    y_train = train['nutrition-score-fr_100g'].values

    train_dataset = torch.utils.data.TensorDataset(
        torch.from_numpy(X_train).float(),
        torch.from_numpy(y_train).float())

    all_ing = len(X_train[0])

    # neural network
    model = NeuralNet(all_ing)

    print(model)
    train_loader = DataLoader(train_dataset,
                              batch_size=200,
                              shuffle=True,
                              num_workers=4)

    # start training
    learning_rate = 0.001
    #    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    for epoch in range(1, 20):

        total_loss = 0
        correct = 0

        for batch_idx, (data, target) in enumerate(train_loader):

            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)

            loss = criterion(output[:, 0], target)

            loss.backward()
            optimizer.step()
            '''
            if batch_idx % 10 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.data[0]))
            '''
            # accumulate the loss of each minibatch
            total_loss += loss.data[0] * len(data)

            # compute the accuracy per minibatch
            pred_classes = output.data.max(1, keepdim=True)[1]
            correct += pred_classes.eq(
                target.data.view_as(pred_classes).long()).sum().double()

        # compute the mean loss for each epoch
        mean_loss = total_loss / len(train_loader.dataset)

        # compute the accuracy for each epoch
        acc = correct / len(train_loader.dataset)

        print(
            'Train Epoch: {}   Avg_Loss: {:.5f}   Acc: {}/{} ({:.3f}%)'.format(
                epoch, mean_loss, correct, len(train_loader.dataset),
                100. * acc))

    pdb.set_trace()
    validate.ingredients_text = validate.ingredients_text.str.replace(
        'strawberry candy', '')

    X_val = mlb.transform(validate['ingredients_text']).astype(np.float32)
    y_test = test['nutrition-score-fr_100g'].values
    regr = lm.LinearRegression()  # 1
    regr.fit(X_train, y_train)
    pdb.set_trace()
Example #36
def __get_data(attr=None, short=False, raw=False, mem=True, source=False):
    """Get training, test, and validation data for X and y, handling
	the logic of retrieving single-label vs. multi-label data."""
    if not source:
        # Read the labels, and split them between training, validation, and test sets.

        print('reading the labels... ', end='', flush=True)
        df = pd.read_csv(labels_path)
        print('done!', flush=True)
        if raw: return df
        if attr:
            if 'from' in attr:
                y = df[re.search('(.+)_from_(.+)', attr).groups()[0]]
            elif '+' in attr:
                y = df[list(re.search('(.+)\+(.+)', attr).groups())]
            else:
                y = df[attr]
        #elif MULTI: y = df[list(attributes)]
        #else: y = df[attribute]

        assert (attr is not None)

        # Convert the class vectors to binary class matrices.
        if '+' in attr:
            """
			two options:
				I. multi-labeled matrix (each observation has multiple columns set to 1)
				II. expanded categories (each observation has one column set to 1)
			"""
            print('extracting multi-label data via one-hot-encoding... ',
                  end='',
                  flush=True)

            if MULTI_AS_ONEHOT_DISTINCT_CLASSES:

                #this is option II
                multilabels = np.tile(y.columns,
                                      (y.shape[0], 1)).astype(object)
                multilabels += df[y.columns].values.astype(str)
                multilabels = multilabels.sum(axis=1)

                lb = LabelBinarizer()
                lb.fit(np.unique(multilabels))

                y = lb.transform(multilabels)
                print()
                print(multilabels)
                print(y)

                pickle_save(lb, binarizer_path.format(attr))

            else:
                #this is option I
                labels = []
                for a in y.columns:
                    l = a + '{}'
                    for v in df[a].unique():
                        labels.append(l.format(v))

                binarizer_multi = MultiLabelBinarizer()
                binarizer_multi.fit([labels])

                multilabels = np.tile(y.columns,
                                      (y.shape[0], 1)).astype(object)
                multilabels += df[y.columns].values.astype(str)

                y = binarizer_multi.transform(multilabels)
                pickle_save(binarizer_multi, binarizer_path.format(attr + '__multi'))

        else:
            print('extracting label data via one-hot-encoding... ',
                  end='',
                  flush=True)
            y = to_categorical(y)

        y_train = y[:num_train]
        y_valid = y[num_train:num_train + num_valid]
        y_test = y[num_train + num_valid:]

        print('done!', flush=True)
        if short: return sublocals(locals(), 'y_train', 'y_test', 'y_valid')

    # Read the images, and split them between training, validation, and test sets.
    print('reading the scaled features... ', end='', flush=True)
    X = np.load(scaled_data_path)

    x_train = X[:num_train, :]
    x_valid = X[num_train:num_train + num_valid, :]
    x_test = X[num_train + num_valid:, :]

    print('--------------------------------------------')
    print(x_train.shape[0], 'training samples')
    print('x_train shape:', x_train.shape)
    print(x_valid.shape[0], 'validation samples')
    print('x_valid shape:', x_valid.shape)
    print(x_test.shape[0], 'test samples')
    print('x_test shape:', x_test.shape)
    if source: return sublocals(locals(), 'x_train', 'x_test', 'x_valid')
    print('y_train.shape:', y_train.shape)
    print('y_valid.shape:', y_valid.shape)
    print('y_test.shape:', y_test.shape)
    print('--------------------------------------------')
    return sublocals(locals(), 'x_train', 'x_test', 'x_valid', 'y_train',
                     'y_test', 'y_valid')
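# A minimal standalone sketch of the two encodings described in the docstring above
# (the attribute names below are illustrative, not part of the original script):
# option I builds a multi-hot row per observation with MultiLabelBinarizer, while
# option II fuses the attribute values into one combined class and one-hot encodes it.
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

obs = [['smiling1', 'glasses0'],
       ['smiling0', 'glasses1'],
       ['smiling1', 'glasses1']]

# Option I: one column per attribute/value label, two 1s per row.
mlb_demo = MultiLabelBinarizer()
print(mlb_demo.fit_transform(obs))
print(mlb_demo.classes_)

# Option II: each distinct combination becomes its own class, one 1 per row.
combined = ['_'.join(sorted(o)) for o in obs]
lb_demo = LabelBinarizer()
print(lb_demo.fit_transform(combined))
print(lb_demo.classes_)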
Example #37
vectorizer = TfidfVectorizer(tokenizer=tokenize)
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)
print(vectorised_train_documents.shape)
print(vectorised_test_documents.shape)

# Compute the tf-idf index

tfidf_transformer = TfidfTransformer()
vectorised_train_tfidf_documents = tfidf_transformer.fit_transform(
    vectorised_train_documents)
print(vectorised_train_tfidf_documents.shape)

# Extract the expected (target) labels

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(
    [reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform(
    [reuters.categories(doc_id) for doc_id in test_docs_id])

# Prediction and result analysis

multinomial = []
bernoulli = []
macqa = []
bacqa = []
macc = []
bacc = []

iter = [170, 200, 500, 800, 1000, 2000, 5000, 10000, 15000, 18000]
 def __init__(self, **hyperparams):
     self._hyperparams = hyperparams
     self._wrapped_model = Op(**self._hyperparams)
Example #39
    embeddings = numpy.load(embedding_file)
    vocab = json.load(open(vocab_file))
    int2vocab = {i: w for w, i in vocab.items()}
    embedding_dim = embeddings[0].shape[0]

    print('Loading corpus', corpus_file)
    dataset: SplitDataSet = split_corpus(corpus_file)
    print(len(dataset.y_train), 'training samples')
    print(len(dataset.y_test), 'test samples')
    print(len(dataset.y_dev), 'dev samples')

    # oov_counts = count_oovs(dataset.x_train)
    # breakpoint()

    mlb = MultiLabelBinarizer().fit(dataset.y_train)
    num_classes = mlb.classes_.shape[0]
    train_y = mlb.transform(dataset.y_train)
    test_y = mlb.transform(dataset.y_test)
    dev_y = mlb.transform(dataset.y_dev)

    train_x_int = [[
        vocab[w] for w in re.findall(r'\w+', x_.lower()) if w in vocab
    ] for x_ in dataset.x_train]
    test_x_int = [[
        vocab[w] for w in re.findall(r'\w+', x_.lower()) if w in vocab
    ] for x_ in dataset.x_test]
    dev_x_int = [[
        vocab[w] for w in re.findall(r'\w+', x_.lower()) if w in vocab
    ] for x_ in dataset.x_dev]
Example #40
def audio_tagging_results(reference, estimated):
    """audio_tagging_results. Returns clip-level F1 Scores

    :param reference: The ground truth dataframe as pd.DataFrame
    :param estimated: Predicted labels by the model ( thresholded )
    """
    if "event_label" in reference.columns:
        classes = reference.event_label.dropna().unique().tolist(
        ) + estimated.event_label.dropna().unique().tolist()
        encoder = MultiLabelBinarizer().fit([classes])
        reference = get_audio_tagging_df(reference)
        estimated = get_audio_tagging_df(estimated)
        ref_labels, _ = utils.encode_labels(reference['event_label'],
                                            encoder=encoder)
        reference['event_label'] = ref_labels.tolist()
        est_labels, _ = utils.encode_labels(estimated['event_label'],
                                            encoder=encoder)
        estimated['event_label'] = est_labels.tolist()

    matching = reference.merge(estimated,
                               how='outer',
                               on="filename",
                               suffixes=["_ref", "_pred"])

    def na_values(val):
        if type(val) is np.ndarray:
            return val
        elif isinstance(val, list):
            return np.array(val)
        if pd.isna(val):
            return np.zeros(len(encoder.classes_))
        return val

    ret_df = pd.DataFrame(columns=['label', 'f1', 'precision', 'recall'])
    if not estimated.empty:
        matching['event_label_pred'] = matching.event_label_pred.apply(
            na_values)
        matching['event_label_ref'] = matching.event_label_ref.apply(na_values)

        y_true = np.vstack(matching['event_label_ref'].values)
        y_pred = np.vstack(matching['event_label_pred'].values)
        ret_df.loc[:, 'label'] = encoder.classes_
        for avg in [None, 'macro', 'micro']:
            avg_f1 = skmetrics.f1_score(y_true, y_pred, average=avg)
            avg_pre = skmetrics.precision_score(y_true, y_pred, average=avg)
            avg_rec = skmetrics.recall_score(y_true, y_pred, average=avg)
            # avg_auc = skmetrics.roc_auc_score(y_true, y_pred, average=avg)

            if avg is None:
                # Add for each label non pooled stats
                ret_df.loc[:, 'precision'] = avg_pre
                ret_df.loc[:, 'recall'] = avg_rec
                ret_df.loc[:, 'f1'] = avg_f1
                # ret_df.loc[:, 'AUC'] = avg_auc
            else:
                # Append macro and micro results in last 2 rows
                ret_df = ret_df.append(
                    {
                        'label': avg,
                        'precision': avg_pre,
                        'recall': avg_rec,
                        'f1': avg_f1,
                        # 'AUC': avg_auc
                    },
                    ignore_index=True)
    return ret_df
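# For reference, a self-contained sketch of the clip-level scoring this function performs,
# without the DataFrame merge or the helper functions above (toy event labels, assumed names):
import numpy as np
from sklearn import metrics as skmetrics
from sklearn.preprocessing import MultiLabelBinarizer

reference_events = [{'dog', 'speech'}, {'speech'}, {'alarm'}]
estimated_events = [{'dog'}, {'speech', 'alarm'}, {'alarm'}]

encoder = MultiLabelBinarizer().fit(reference_events + estimated_events)
y_true = encoder.transform(reference_events)
y_pred = encoder.transform(estimated_events)

# Per-class scores (average=None) plus pooled macro/micro scores, mirroring the loop above.
for avg in (None, 'macro', 'micro'):
    print(avg, np.round(skmetrics.f1_score(y_true, y_pred, average=avg), 3))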
Example #41
def dump_multilabel_classification(
        model,
        suffix="",
        folder=None,
        allow_failure=None,
        verbose=False,
        label_string=False,
        first_class=0,
        comparable_outputs=None,
        target_opset=None):
    """
    Trains and dumps a model for a multi-label classification problem.
    The function trains a model and calls
    :func:`dump_data_and_model`.

    Every created filename will follow the pattern:
    ``<folder>/<prefix><task><classifier-name><suffix>.<data|expected|model|onnx>.<pkl|onnx>``.
    """
    X = [[0, 1], [1, 1], [2, 0], [0.5, 0.5], [1.1, 1.1], [2.1, 0.1]]
    X = numpy.array(X, dtype=numpy.float32)
    if label_string:
        y = [["l0"], ["l1"], ["l2"], ["l0", "l1"], ["l1"], ["l2"]]
    else:
        y = [[0 + first_class], [1 + first_class], [2 + first_class],
             [0 + first_class, 1 + first_class],
             [1 + first_class], [2 + first_class]]
    y = MultiLabelBinarizer().fit_transform(y)
    model.fit(X, y)
    if verbose:
        print("[make_multilabel_classification] model '{}'".format(
            model.__class__.__name__))
    model_onnx, prefix = convert_model(
        model, "multi-class classifier",
        [("input", FloatTensorType([None, 2]))],
        target_opset=target_opset)
    if verbose:
        print("[make_multilabel_classification] model was converted")
    dump_data_and_model(
        X.astype(numpy.float32),
        model,
        model_onnx,
        folder=folder,
        allow_failure=allow_failure,
        basename=prefix + "Mcl" + model.__class__.__name__ + suffix,
        verbose=verbose,
        comparable_outputs=comparable_outputs,
    )

    X, y = make_multilabel_classification(40, n_features=4, random_state=42,
                                          n_classes=3)
    X = X[:, :2]
    model.fit(X, y)
    if verbose:
        print("[make_multilabel_classification] model '{}'".format(
            model.__class__.__name__))
    model_onnx, prefix = convert_model(model, "multi-class classifier",
                                       [("input", FloatTensorType([None, 2]))])
    if verbose:
        print("[make_multilabel_classification] model was converted")
    dump_data_and_model(
        X[:10].astype(numpy.float32),
        model,
        model_onnx,
        folder=folder,
        allow_failure=allow_failure,
        basename=prefix + "RndMla" + model.__class__.__name__ + suffix,
        verbose=verbose,
        comparable_outputs=comparable_outputs,
    )
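# A hedged usage sketch: the estimator below is an arbitrary choice, and convert_model /
# dump_data_and_model are assumed to be importable from the surrounding test helpers.
from sklearn.tree import DecisionTreeClassifier

dump_multilabel_classification(
    DecisionTreeClassifier(max_depth=3),
    suffix="DTC",
    verbose=True,
)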
def tokenSequenceToPianoRoll(token_sequence, int_to_combi):
    mlb = MultiLabelBinarizer()
    mlb.fit([np.arange(128).tolist()])
    combi_pairs = [int_to_combi[i] for i in token_sequence]
    piano_roll = mlb.transform(combi_pairs)
    return piano_roll
Example #43
    clean_txt = re.sub(r'[^a-z\s]+', ' ', text)  # replace special chars and punctuation with spaces
    clean_txt = re.sub(r'\s+', ' ', clean_txt)  # collapse multiple spaces into one
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(clean_txt))  # tokenizing, lowercase
    words = [word for word in words if word not in Stop_Words]  # filtering stopwords
    words = filter(lambda t: len(t)>=min_length, words)  # filtering words of length <=2
    tokens =(list(map(lambda token: PorterStemmer().stem(token),words)))  # stemming tokens
    return tokens


n_classes = 10
labels = categories
stop_words = stopwords.words("english")


mlb = MultiLabelBinarizer()
docs = {}
docs['train'] = [reuters.raw(doc_id) for doc_id in train]
docs['test'] = [reuters.raw(doc_id) for doc_id in test]

trd = docs['train']
tstd = docs['test']

y_tr = mlb.fit_transform(x1)
y_tst = mlb.transform(x2)  # reuse the classes fitted on the training labels

t_d_tr = [tokenize(dd) for dd in trd]#tokenized training docs
t_d_tst = [tokenize(ddd) for ddd in tstd]


def read_corpus(fname):
Example #44
imgs_train, labels_train, imgs_valid, labels_valid = dataset.train_test_split(
    test_shape=0.1)
labels_train = np.array(dataset.labels_origin)
imgs_train = np.array(dataset.imgs_origin)

labels_train_splite = []
for labels in labels_train:
    label = labels.split("_")
    labels_train_splite.append(label)

labels_valid_splite = []
for labels in labels_valid:
    label = labels.split("_")
    labels_valid_splite.append(label)

mlb = MultiLabelBinarizer()
labels_train = mlb.fit_transform(labels_train_splite)
labels_valid = mlb.transform(labels_valid_splite)  # reuse the classes fitted on the training labels
labels_train = io.label_smooth(labels_train, [0, 1, 4, 5])

for (i, label) in enumerate(mlb.classes_):
    print("{}. {}".format(i + 1, label))

# %% Data preprocessing

imgs_train = np.array(imgs_train, dtype="float32")
imgs_valid = np.array(imgs_valid, dtype="float32")

normalization_datagen = ImageDataGenerator(featurewise_center=True,
                                           featurewise_std_normalization=True)
class BeautyDataLoader:

    def __init__(self):
        self.config, _ = config.get_config() 
        self.df = pd.read_csv(self.config.dataset_path)        
        self.df['id'] = list(range(1, len(self.df)+1))
        # self.df = shuffle(self.df)
        self.df.set_index('id')

        # self.train_test_split()
        self.stratified_splits()
        self.generate_label_encodings()
        self.transform_labels()

    def data_loader(self):
        pass
    
    def train_test_split(self, split=.10):
        pass
        # self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, random_state=42)

        # self.X_train.reset_index(), self.X_test.reset_index(), self.y_train.reset_index(), self.y_test.reset_index()

    def stratified_splits(self):

        self.dataset_split = {
            'train_X': [], 'train_y': [],
            'valid_X': [], 'valid_y': [],
        }
        

        skf = SKF(
            n_splits=self.config.KFolds, shuffle=True, random_state=42
            )

        data = self.df['file_path']
        labels = self.df['isbeauty']
        for train_index, test_index in skf.split(data, labels):
            
            self.dataset_split['train_X'].append([data[d] for d in train_index if d in data][:])
            self.dataset_split['valid_X'].append([data[d] for d in test_index if d in data][:])
            
            self.dataset_split['train_y'].append([
                [self.df['isbeauty'][d], self.df['skill'][d]] for d in train_index if d in labels][:])

            self.dataset_split['valid_y'].append(
               [[self.df['isbeauty'][d], self.df['skill'][d]] for d in test_index if d in labels][:])            
            
        # print(self.dataset_split)
    
    def get_label_names(self):
        return self.mlb.classes_


    def generate_label_encodings(self):
        # train_labels, valid_labels = [], []
        labels = []
        for b, sk in zip(self.df.isbeauty[1:], self.df.skill[1:]):
            if b != 'isbeauty' and sk!='skill':
                labels.append([b, sk])
        # print("Labels:")
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(labels)

        # Loop over all labels and show them
        n_labels = len(self.mlb.classes_)

        # for (i, label) in enumerate(self.mlb.classes_):
        #     print("{}. {}".format(i, label))
        
    def transform_labels(self):
        
        train_labels, valid_labels = [], []

        for train_y_set_i, valid_y_set_i in zip(self.dataset_split['train_y'], self.dataset_split['valid_y']):
            train_labels.append(list(self.mlb.transform(train_y_set_i)))
            valid_labels.append(list(self.mlb.transform(valid_y_set_i)))
        
        self.dataset_split['train_y'] = train_labels
        self.dataset_split['valid_y'] = valid_labels

    
    def normalize_img(self, filenames, label):
        """ Function to normalize image between 0 and 1

        Args:
            filenames ([str]): complete path of the image file
            label ([list]): multi-label target for the image
        """
        try:
            img = tf.io.read_file(filenames)
            image_vec = tf.image.decode_jpeg(img, channels=CHANNELS)

            # resize and normalize
            img_norm = tf.image.resize(image_vec, [IMG_SIZE, IMG_SIZE])/255.0

            return img_norm, label
        except Exception as e:
            print(e)

    
    def create_dataset(self, fold=0, is_training=True):        
        """ Here fold 0 is first dataset from all the folds created.

        Args:
            fold (int, optional): [description]. Defaults to 0.
            dataset (str, optional): [description]. Defaults to 'train'.
            is_training (bool, optional): [description]. Defaults to True.
        """
        AUTOTUNE = tf.data.experimental.AUTOTUNE # Adapt preprocessing and prefetching dynamically
        SHUFFLE_BUFFER_SIZE = 1024

        dataset = {}

        train_files = self.dataset_split['train_X'][fold]
        train_labels = self.dataset_split['train_y'][fold]
        valid_files = self.dataset_split['valid_X'][fold]
        valid_labels = self.dataset_split['valid_y'][fold]

        for typ, filenames, labels in [('train', train_files, train_labels), ('valid', valid_files, valid_labels)]:
            train_data = tf.data.Dataset.from_tensor_slices((filenames, labels))
            # print(list(train_data.as_numpy_iterator())[0])
            # normalize images        
            train_data = train_data.map(
                self.normalize_img, num_parallel_calls=AUTOTUNE
                )

            # if is_training == True:
            #     # This is a small dataset, only load it once, and keep it in memory.
            #     train_data = train_data.cache()
            #     # Shuffle the data each buffer size
            #     train_data = train_data.shuffle(buffer_size=1024)
            
            # Batch the data for multiple steps
            train_data = train_data.batch(32, drop_remainder=True)
            # Fetch batches in the background while the model is training.
            train_data = train_data.prefetch(buffer_size=self.config.autotune)

            
            dataset[typ] = train_data
            del train_data
            # print(train_data)
        print('Dataset Creation done for {} fold'.format(fold))
        return dataset
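# A hedged driver sketch for the loader above, assuming the config file, the CSV it points to,
# and the module-level IMG_SIZE / CHANNELS constants are in place.
if __name__ == '__main__':
    loader = BeautyDataLoader()
    print(loader.get_label_names())  # classes learned by the MultiLabelBinarizer

    fold_data = loader.create_dataset(fold=0)
    for images, targets in fold_data['train'].take(1):
        print(images.shape, targets.shape)  # (32, IMG_SIZE, IMG_SIZE, CHANNELS), (32, n_labels)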
Example #46
def find_label_issues(
    labels,
    pred_probs,
    *,
    confident_joint=None,
    filter_by="prune_by_noise_rate",
    return_indices_ranked_by=None,
    rank_by_kwargs={},
    multi_label=False,
    frac_noise=1.0,
    num_to_remove_per_class=None,
    min_examples_per_class=1,
    n_jobs=None,
    verbose=False,
):
    """
    Identifies potential label issues in the dataset using confident learning.

    Returns a boolean mask for the entire dataset where ``True`` represents
    a label issue and ``False`` represents an example that is confidently/accurately labeled.

    Instead of a mask, you can obtain *indices* of the label issues in your
    dataset by setting `return_indices_ranked_by` to specify the label quality
    score used to order the label issues.

    The number of indices returned is controlled by `frac_noise`: reduce its
    value to identify fewer label issues. If you aren't sure, leave this set to 1.0.

    Tip: if you encounter the error "pred_probs is not defined", try setting
    ``n_jobs=1``.

    Parameters
    ----------
    labels : np.array
      A discrete vector of noisy labels, i.e. some labels may be erroneous.
      *Format requirements*: for dataset with K classes, labels must be in 0, 1, ..., K-1.

    pred_probs : np.array
      An array of shape ``(N, K)`` of model-predicted probabilities,
      ``P(label=k|x)``. Each row of this matrix corresponds
      to an example `x` and contains the model-predicted probabilities that
      `x` belongs to each possible class, for each of the K classes. The
      columns must be ordered such that these probabilities correspond to
      class 0, 1, ..., K-1.

      **Caution**: `pred_probs` from your model must be out-of-sample!
      You should never provide predictions on the same examples used to train the model,
      as these will be overfit and unsuitable for finding label-errors.
      To obtain out-of-sample predicted probabilities for every datapoint in your dataset, you can use :ref:`cross-validation <pred_probs_cross_val>`.
      Alternatively it is ok if your model was trained on a separate dataset and you are only evaluating
      data that was previously held-out.

    confident_joint : np.array, optional
      An array of shape ``(K, K)`` representing the confident joint, the matrix used for identifying label issues, which
      estimates a confident subset of the joint distribution of the noisy and true labels, ``P_{noisy label, true label}``.
      Entry ``(j, k)`` in the matrix is the number of examples confidently counted into the pair of ``(noisy label=j, true label=k)`` classes.
      The `confident_joint` can be computed using :py:func:`count.compute_confident_joint <cleanlab.count.compute_confident_joint>`.
      If not provided, it is computed from the given (noisy) `labels` and `pred_probs`.

    filter_by : {'prune_by_class', 'prune_by_noise_rate', 'both', 'confident_learning', 'predicted_neq_given'}, default='prune_by_noise_rate'

      Method used for filtering/pruning out the label issues:

      - ``'prune_by_noise_rate'``: works by removing examples with *high probability* of being mislabeled for every non-diagonal in the confident joint (see `prune_counts_matrix` in `filter.py`). These are the examples where (with high confidence) the given label is unlikely to match the predicted label for the example.
      - ``'prune_by_class'``: works by removing the examples with *smallest probability* of belonging to their given class label for every class.
      - ``'both'``: Removes only the examples that would be filtered by both ``'prune_by_noise_rate'`` and ``'prune_by_class'``.
      - ``'confident_learning'``: Returns the examples in the off-diagonals of the confident joint. These are the examples that are confidently predicted to be a different label than their given label.
      - ``'predicted_neq_given'``: Find examples where the predicted class (i.e. argmax of the predicted probabilities) does not match the given label.

    return_indices_ranked_by : {None, 'self_confidence', 'normalized_margin', 'confidence_weighted_entropy'}, default=None
      If ``None``, returns a boolean mask (``True`` if example at index is label error).
      If not ``None``, returns an array of the label error indices
      (instead of a boolean mask) where error indices are ordered:

      - ``'normalized_margin'``: ``normalized margin (p(label = k) - max(p(label != k)))``
      - ``'self_confidence'``: ``[pred_probs[i][labels[i]] for i in label_issues_idx]``
      - ``'confidence_weighted_entropy'``: ``entropy(pred_probs) / self_confidence``

    rank_by_kwargs : dict, optional
      Optional keyword arguments to pass into scoring functions for ranking by
      label quality score (see :py:func:`rank.get_label_quality_scores
      <cleanlab.rank.get_label_quality_scores>`).

    multi_label : bool, optional
      If ``True``, labels should be an iterable (e.g. list) of iterables, containing a
      list of labels for each example, instead of just a single label.
      The multi-label setting supports classification tasks where an example has 1 or more labels.
      Example of a multi-labeled `labels` input: ``[[0,1], [1], [0,2], [0,1,2], [0], [1], ...]``.

    frac_noise : float, default=1.0
      Used to only return the "top" ``frac_noise * num_label_issues``. The choice of which "top"
      label issues to return is dependent on the `filter_by` method used. It works by reducing the
      size of the off-diagonals of the `joint` distribution of given labels and true labels
      proportionally by `frac_noise` prior to estimating label issues with each method.
      This parameter only applies for `filter_by=both`, `filter_by=prune_by_class`, and
      `filter_by=prune_by_noise_rate` methods and currently is unused by other methods.
      When ``frac_noise=1.0``, return all "confident" estimated noise indices (recommended).

      frac_noise * number_of_mislabeled_examples_in_class_k.

    num_to_remove_per_class : array_like
      An iterable of length K, the number of classes.
      E.g. if K = 3, ``num_to_remove_per_class=[5, 0, 1]`` would return
      the indices of the 5 most likely mislabeled examples in class 0,
      and the most likely mislabeled example in class 2.

      Note
      ----
      Only set this parameter if ``filter_by='prune_by_class'``.
      You may use with ``filter_by='prune_by_noise_rate'``, but
      if ``num_to_remove_per_class=k``, then either k-1, k, or k+1
      examples may be removed for any class due to rounding error. If you need
      exactly 'k' examples removed from every class, you should use
      ``filter_by='prune_by_class'``.

    min_examples_per_class : int, default=1
      Minimum number of examples per class to avoid flagging as label issues.
      This is useful to avoid deleting too much data from one class
      when pruning noisy examples in datasets with rare classes.

    n_jobs : optional
      Number of processing threads used by multiprocessing. Default ``None``
      sets to the number of cores on your CPU.
      Set this to 1 to *disable* parallel processing (if it's causing issues).
      Windows users may see a speed-up with ``n_jobs=1``.

    verbose : optional
      If ``True``, prints when multiprocessing happens.

    Returns
    -------
    label_issues : np.array
      A boolean mask for the entire dataset where ``True`` represents a
      label issue and ``False`` represents an example that is accurately
      labeled with high confidence.

      Note
      ----
      You can also return the *indices* of the label issues in your dataset by setting
      `return_indices_ranked_by`.
    """

    assert filter_by in [
        "prune_by_noise_rate",
        "prune_by_class",
        "both",
        "confident_learning",
        "predicted_neq_given",
    ]  # TODO: change default to confident_learning ?
    assert len(labels) == len(pred_probs)
    if filter_by in [
            "confident_learning", "predicted_neq_given"
    ] and (frac_noise != 1.0 or num_to_remove_per_class is not None):
        warn_str = (
            "WARNING! frac_noise and num_to_remove_per_class parameters are only supported"
            " for filter_by 'prune_by_noise_rate', 'prune_by_class', and 'both'. They "
            "are not supported for methods 'confident_learning' or "
            "'predicted_neq_given'.")
        warnings.warn(warn_str)
    if (num_to_remove_per_class is not None) and (filter_by in [
            "confident_learning", "predicted_neq_given"
    ]):
        # TODO - add support for these two filters
        raise ValueError(
            "filter_by 'confident_learning' or 'predicted_neq_given' is not supported (yet) when setting 'num_to_remove_per_class'"
        )

    # Set-up number of multiprocessing threads
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    else:
        assert n_jobs >= 1

    # Number of examples in each class of labels
    if multi_label:
        label_counts = value_counts([i for lst in labels for i in lst])
    else:
        label_counts = value_counts(labels)
    # Number of classes labels
    K = len(pred_probs.T)
    # Boolean set to true if dataset is large
    big_dataset = K * len(labels) > 1e8
    # Ensure labels are of type np.array()
    labels = np.asarray(labels)
    if confident_joint is None or filter_by == "confident_learning":
        from cleanlab.count import compute_confident_joint

        confident_joint, cl_error_indices = compute_confident_joint(
            labels=labels,
            pred_probs=pred_probs,
            multi_label=multi_label,
            return_indices_of_off_diagonals=True,
        )
    if filter_by in ["prune_by_noise_rate", "prune_by_class", "both"]:
        # Create `prune_count_matrix` with the number of examples to remove in each class and
        # leave at least min_examples_per_class examples per class.
        # `prune_count_matrix` is transposed relative to the confident_joint.
        prune_count_matrix = _keep_at_least_n_per_class(
            prune_count_matrix=confident_joint.T,
            n=min_examples_per_class,
            frac_noise=frac_noise,
        )

        if num_to_remove_per_class is not None:
            # Estimate joint probability distribution over label issues
            psy = prune_count_matrix / np.sum(prune_count_matrix, axis=1)
            noise_per_s = psy.sum(axis=1) - psy.diagonal()
            # Calibrate so that noise rates sum to num_to_remove_per_class
            tmp = (psy.T * num_to_remove_per_class / noise_per_s).T
            np.fill_diagonal(tmp, label_counts - num_to_remove_per_class)
            prune_count_matrix = round_preserving_row_totals(tmp)

        # Prepare multiprocessing shared data
        if n_jobs > 1:
            if multi_label:
                _labels = RawArray("I", int2onehot(labels).flatten())
            else:
                _labels = RawArray("I", labels)
            _label_counts = RawArray("I", label_counts)
            _prune_count_matrix = RawArray("I", prune_count_matrix.flatten())
            _pred_probs = RawArray("f", pred_probs.flatten())
        else:  # Multiprocessing is turned off. Create tuple with all parameters
            args = (
                labels,
                label_counts,
                prune_count_matrix,
                pred_probs,
                multi_label,
                min_examples_per_class,
            )

    # Perform Pruning with threshold probabilities from BFPRT algorithm in O(n)
    # Operations are parallelized across all CPU processes
    if filter_by == "prune_by_class" or filter_by == "both":
        if n_jobs > 1:  # parallelize
            with multiprocessing.Pool(
                    n_jobs,
                    initializer=_init,
                    initargs=(
                        _labels,
                        _label_counts,
                        _prune_count_matrix,
                        prune_count_matrix.shape,
                        _pred_probs,
                        pred_probs.shape,
                        multi_label,
                        min_examples_per_class,
                    ),
            ) as p:
                if verbose:  # pragma: no cover
                    print("Parallel processing label issues by class.")
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    label_issues_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_class, range(K)),
                                  total=K), )
                else:
                    label_issues_masks_per_class = p.map(
                        _prune_by_class, range(K))
        else:  # n_jobs = 1, so no parallelization
            label_issues_masks_per_class = [
                _prune_by_class(k, args) for k in range(K)
            ]
        label_issues_mask = np.stack(label_issues_masks_per_class).any(axis=0)

    if filter_by == "both":
        label_issues_mask_by_class = label_issues_mask

    if filter_by == "prune_by_noise_rate" or filter_by == "both":
        if n_jobs > 1:  # parallelize
            with multiprocessing.Pool(
                    n_jobs,
                    initializer=_init,
                    initargs=(
                        _labels,
                        _label_counts,
                        _prune_count_matrix,
                        prune_count_matrix.shape,
                        _pred_probs,
                        pred_probs.shape,
                        multi_label,
                        min_examples_per_class,
                    ),
            ) as p:
                if verbose:  # pragma: no cover
                    print("Parallel processing label issues by noise rate.")
                sys.stdout.flush()
                if big_dataset and tqdm_exists:
                    label_issues_masks_per_class = list(
                        tqdm.tqdm(p.imap(_prune_by_count, range(K)), total=K))
                else:
                    label_issues_masks_per_class = p.map(
                        _prune_by_count, range(K))
        else:  # n_jobs = 1, so no parallelization
            label_issues_masks_per_class = [
                _prune_by_count(k, args) for k in range(K)
            ]
        label_issues_mask = np.stack(label_issues_masks_per_class).any(axis=0)

    if filter_by == "both":
        label_issues_mask = label_issues_mask & label_issues_mask_by_class

    if filter_by == "confident_learning":
        label_issues_mask = np.zeros(len(labels), dtype=bool)
        for idx in cl_error_indices:
            label_issues_mask[idx] = True

    if filter_by == "predicted_neq_given":
        label_issues_mask = find_predicted_neq_given(labels,
                                                     pred_probs,
                                                     multi_label=multi_label)

    # Remove label issues if given label == model prediction
    if multi_label:
        pred = _multiclass_crossval_predict(labels, pred_probs)
        labels = MultiLabelBinarizer().fit_transform(labels)
    else:
        pred = pred_probs.argmax(axis=1)
    for i, pred_label in enumerate(pred):
        if (multi_label and np.all(pred_label == labels[i])
                or not multi_label and pred_label == labels[i]):
            label_issues_mask[i] = False

    if verbose:
        print("Number of label issues found: {}".format(
            sum(label_issues_mask)))

    # TODO: run count.num_label_issues() and adjust the total issues found here to match
    if return_indices_ranked_by is not None:
        er = order_label_issues(
            label_issues_mask=label_issues_mask,
            labels=labels,
            pred_probs=pred_probs,
            rank_by=return_indices_ranked_by,
            rank_by_kwargs=rank_by_kwargs,
        )
        return er
    return label_issues_mask
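# A minimal end-to-end sketch of calling find_label_issues with out-of-sample probabilities
# obtained via cross-validation; the dataset and model below are placeholders.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X_demo, labels_demo = make_classification(n_samples=200, n_classes=3,
                                          n_informative=5, random_state=0)

# Out-of-sample predicted probabilities, as required by the docstring above.
pred_probs_demo = cross_val_predict(LogisticRegression(max_iter=1000),
                                    X_demo, labels_demo,
                                    cv=5, method='predict_proba')

issue_mask = find_label_issues(labels_demo, pred_probs_demo)
print('flagged', issue_mask.sum(), 'of', len(labels_demo), 'examples')

# Alternatively, return indices ordered by a label-quality score.
issue_idx = find_label_issues(labels_demo, pred_probs_demo,
                              return_indices_ranked_by='self_confidence')
print(issue_idx[:10])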
def pianoRollToTokenSequence(piano_roll, combi_to_int):
    mlb = MultiLabelBinarizer()
    mlb.fit([np.arange(128).tolist()])
    combi_pairs = mlb.inverse_transform(piano_roll)
    return [combi_to_int[combi] for combi in combi_pairs]
Example #48
X_train = np.array(norm_corpus)
Y_train = []

for q in category:
    qstring = str(q)
    min_list = qstring.split(',')
    max_list = []
    for m in min_list:
        max_list.append(m.strip())

    Y_train.append(max_list)

X_test = np.array(norm_query_docs)
target_names = ['Academic', 'Mess', 'Internet', 'Maintainance']

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(Y_train)

classifier = Pipeline([('vectorizer', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train, Y)

predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

print(mlb.classes_)

temp_categories = []
for item, labels in zip(X_test, all_labels):
Example #49
    STYLES = {
        style['id']: style
        for style in STYLES
        if (style['articleType'] == article_type and style['gender'] in genders
            and style['usage'] in usages)
    }

image_paths = [
    *filter(lambda p: p.split(os.path.sep)[-1][:-4] in STYLES.keys(),
            image_paths)
]

X, y = load_images_and_labels(image_paths, STYLES, (64, 64))
X = X.astype('float') / 255.0

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

(X_train, X_test, y_train, y_test) = train_test_split(X,
                                                      y,
                                                      stratify=y,
                                                      test_size=0.2,
                                                      random_state=SEED)
(X_train, X_valid, y_train, y_valid) = train_test_split(X_train,
                                                        y_train,
                                                        stratify=y_train,
                                                        test_size=0.2,
                                                        random_state=SEED)

model = build_network(width=64, height=64, depth=3, classes=len(mlb.classes_))
model.compile(loss='binary_crossentropy',
Example #50
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--data_path', type=str, default='data')

    arg('--model', type=str, default='pnasnet5large')
    arg('--exp-name', type=str, default='pnasnet5large_2')

    arg('--batch-size', type=int, default=32)
    arg('--lr', type=float, default=1e-2)
    arg('--patience', type=int, default=4)
    arg('--n-epochs', type=int, default=15)

    arg('--n-folds', type=int, default=10)
    arg('--fold', type=int, default=0)

    arg('--random-seed', type=int, default=314159)

    arg('--num-workers', type=int, default=6)
    arg('--gpus', type=str, default='0')

    arg('--resize', type=int, default=331)
    arg('--crop', type=int, default=331)
    arg('--scale', type=str, default='0.4, 1.0')
    arg('--mean', type=str, default='0.485, 0.456, 0.406')
    arg('--std', type=str, default='0.229, 0.224, 0.225')

    args = parser.parse_args()
    print(args)

    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

    #  os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '1'
    #  os.environ['MXNET_UPDATE_ON_KVSTORE'] = "0"
    #  os.environ['MXNET_EXEC_ENABLE_ADDTO'] = "1"
    #  os.environ['MXNET_USE_TENSORRT'] = "0"
    #  os.environ['MXNET_GPU_WORKER_NTHREADS'] = "2"
    #  os.environ['MXNET_GPU_COPY_NTHREADS'] = "1"
    #  os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = "54"

    random_seed = args.random_seed
    set_random_seed(random_seed)

    path_to_data = Path(args.data_path)
    labels = pd.read_csv(path_to_data / 'labels.csv')
    num_classes = len(labels)

    train = pd.read_csv(path_to_data / 'train.csv.zip')

    n_folds = args.n_folds
    make_folds(train, n_folds, random_seed)

    mlb = MultiLabelBinarizer(classes=[str(i) for i in range(num_classes)])
    s = train['attribute_ids'].str.split()
    res = pd.DataFrame(mlb.fit_transform(s),
                       columns=mlb.classes_,
                       index=train.index)
    train = pd.concat([res, train['id'] + '.png', train['fold']], axis=1)

    gpu_count = len(args.gpus.split(','))
    batch_size = args.batch_size

    resize = args.resize
    crop = args.crop
    scale = tuple(float(x) for x in args.scale.split(','))
    mean = [float(x) for x in args.mean.split(',')]
    std = [float(x) for x in args.std.split(',')]

    #  jitter_param = 0.4
    #  lighting_param = 0.1
    labels_ids = [str(i) for i in range(num_classes)]
    num_workers = args.num_workers

    fold = args.fold
    train_transformer = get_train_transform(resize=resize,
                                            crop=crop,
                                            scale=scale,
                                            mean=mean,
                                            std=std)
    train_loader = mx.gluon.data.DataLoader(MXDataset(
        path_to_data / 'train', train[train['fold'] != fold].copy(),
        labels_ids, train_transformer),
                                            batch_size=batch_size * gpu_count,
                                            shuffle=True,
                                            num_workers=num_workers,
                                            pin_memory=True)

    test_transformer = get_test_transform(resize=resize,
                                          crop=crop,
                                          mean=mean,
                                          std=std)
    dev_loader = mx.gluon.data.DataLoader(MXDataset(
        path_to_data / 'train', train[train['fold'] == fold].copy(),
        labels_ids, test_transformer),
                                          batch_size=batch_size * gpu_count,
                                          shuffle=False,
                                          num_workers=num_workers,
                                          pin_memory=True)
    fp16 = True
    if args.model == 'pnasnet5large':
        net = get_pnasnet5large(num_classes)
    else:
        raise ValueError(f'No such model {args.model}')

    if fp16:
        net.cast('float16')
    ctx = [mx.gpu(i) for i in range(gpu_count)]
    net.collect_params().reset_ctx(ctx)

    epoch_size = len(train_loader)
    lr = args.lr * batch_size / 256
    steps = [step * epoch_size for step in [7, 9]]
    factor = 0.5
    warmup_epochs = 5
    warmup_mode = 'linear'
    schedule = mx.lr_scheduler.MultiFactorScheduler(
        step=steps,
        factor=factor,
        base_lr=lr,
        warmup_steps=warmup_epochs * epoch_size,
        warmup_mode=warmup_mode)

    if fp16:
        weight = 128
        opt = mx.optimizer.Adam(
            multi_precision=True,
            learning_rate=lr,
            rescale_grad=1 / weight,
            lr_scheduler=schedule,
        )
    else:
        opt = mx.optimizer.Adam(
            learning_rate=lr,
            lr_scheduler=schedule,
        )
    trainer = mx.gluon.Trainer(net.collect_params(), opt)
    if fp16:
        loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(weight=weight)
    else:
        loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()

    path_to_models = Path('models')
    path_to_model = path_to_models / args.exp_name
    path_to_exp = path_to_model / f'fold_{fold}'
    if not path_to_exp.exists():
        path_to_exp.mkdir(parents=True)

    patience = args.patience
    lr_reset_epoch = 1
    lr_changes = 0
    max_lr_changes = 2
    n_epochs = args.n_epochs
    best_dev_f2 = th2 = 0
    train_losses = []
    dev_losses, dev_f2s, dev_ths = [], [], []
    dev_met1, dev_met2 = [], []
    for epoch in range(1, n_epochs + 1):
        train_loss, all_predictions, all_targets = epoch_step(
            train_loader,
            desc=f'[ Training {epoch}/{n_epochs}.. ]',
            fp16=fp16,
            ctx=ctx,
            net=net,
            loss=loss,
            trainer=trainer)
        train_losses.append(train_loss)

        dev_loss, all_predictions, all_targets = epoch_step(
            dev_loader,
            desc=f'[ Validating {epoch}/{n_epochs}.. ]',
            fp16=fp16,
            ctx=ctx,
            net=net,
            loss=loss)
        dev_losses.append(dev_loss)

        metrics = {}
        argsorted = all_predictions.argsort(axis=1)
        for threshold in [0.01, 0.05, 0.1, 0.15, 0.2]:
            metrics[f'valid_f2_th_{threshold:.2f}'] = get_score(
                binarize_prediction(all_predictions, threshold, argsorted),
                all_targets)
        dev_met1.append(metrics)

        dev_f2 = 0
        for th in dev_met1[-1]:
            if dev_met1[-1][th] > dev_f2:
                dev_f2 = dev_met1[-1][th]
                th2 = th

        all_predictions = all_predictions / all_predictions.max(1,
                                                                keepdims=True)
        metrics = {}
        argsorted = all_predictions.argsort(axis=1)
        for threshold in [0.05, 0.1, 0.2, 0.3, 0.4]:
            metrics[f'valid_norm_f2_th_{threshold:.2f}'] = get_score(
                binarize_prediction(all_predictions, threshold, argsorted),
                all_targets)
        dev_met2.append(metrics)

        for th in dev_met2[-1]:
            if dev_met2[-1][th] > dev_f2:
                dev_f2 = dev_met2[-1][th]
                th2 = th

        dev_f2s.append(dev_f2)
        dev_ths.append(th2)
        if dev_f2 > best_dev_f2:
            best_dev_f2 = dev_f2
            best_th = th2
            if fp16:
                net.cast('float32')
                net.save_parameters((path_to_exp / 'model').as_posix())
                net.cast('float16')
            else:
                net.save_parameters((path_to_exp / 'model').as_posix())
            save_dict(
                {
                    'dev_loss': dev_loss,
                    'dev_f2': best_dev_f2,
                    'dev_th': best_th,
                    'epoch': epoch,
                    'dev_f2s': dev_f2s,
                    'dev_ths': dev_ths,
                    'dev_losses': dev_losses,
                    'dev_met1': dev_met1,
                    'dev_met2': dev_met2,
                }, path_to_exp / 'meta_data.pkl')
        elif (patience and epoch - lr_reset_epoch > patience
              and max(dev_f2s[-patience:]) < best_dev_f2):
            # "patience" epochs without improvement
            lr_changes += 1
            if lr_changes > max_lr_changes:
                break
            lr *= factor
            print(f'lr updated to {lr}')
            lr_reset_epoch = epoch
            if fp16:
                weight = 128
                opt = mx.optimizer.Adam(multi_precision=True,
                                        learning_rate=lr,
                                        rescale_grad=1 / weight)
            else:
                opt = mx.optimizer.Adam(learning_rate=lr)
            trainer = mx.gluon.Trainer(net.collect_params(), opt)

        plot_all(path_to_exp, train_losses, dev_losses, dev_f2s, dev_ths,
                 dev_met1, dev_met2)
Example #51
def main():

    # parameters
    write_whole_cluster = False
    perform_pca = False
    birch_thresh = 2.0

    eval_file_names = [
        'filtered_eval_three_event.csv', 'filtered_eval_five_event.csv',
        'filtered_eval_seven_event.csv'
    ]
    annotated_file_names = [
        'annotated_three_event.txt', 'annotated_five_event.txt',
        'annotated_seven_event.txt'
    ]
    '''for i in range(1,179):
        if(i not in temp):
            print(i)
    '''

    for m in range(0, len(eval_file_names)):

        fileName = eval_file_names[m]
        file_prefix = 'output'
        print(fileName)

        for birch_thresh in np.arange(0.0, 4.1, 0.2):
            df = pd.read_csv(fileName, header=None, encoding='latin-1')

            class_labels = [None] * len(df)
            temp = {}
            with open(annotated_file_names[m], "r") as ins:
                label = 1
                for line in ins:
                    line = line.strip()
                    if line.startswith("#"):
                        continue
                    if line:
                        line = line.split(',')
                        # print(line)
                        for item in line:
                            class_labels[int(item) - 1] = label
                            temp[int(item)] = True
                        label += 1

            df.columns = [
                'record_id', 'date', 'url', 'counts', 'themes', 'locations',
                'persons', 'organizations', 'tone'
            ]

            df = df[pd.notnull(df['themes'])]
            df = df[pd.notnull(df['locations'])]

            df_locations = pd.DataFrame(df['locations'])
            df_counts = pd.DataFrame(df['counts'])
            df_counts.fillna('#', inplace=True)

            df_counts = pd.DataFrame(
                df_counts['counts'].str.split(';'))  # splitting counts

            for row in df_counts.itertuples():
                for i in range(0, len(row.counts)):
                    try:
                        temp_list = row.counts[i].split('#')
                        row.counts[i] = temp_list[0] + '#' + temp_list[
                            1] + '#' + temp_list[5]
                        # print(row.locations[i])
                    except:
                        continue
                if len(row.counts) == 1 and row.counts[0] == '':
                    row.counts.append(
                        '#'
                    )  # so that news with no counts are clustered together
                    row.counts.pop(0)

                if row.counts[len(row.counts) - 1] == '':
                    row.counts.pop()

            # df_counts.to_csv('countsonly.csv', sep=',')

            row_dict = df.copy(deep=True)
            row_dict.fillna('', inplace=True)
            row_dict.index = range(len(row_dict))
            row_dict = row_dict.to_dict(
                'index')  # dictionary that maps row number to row

            identifier_dict = {
            }  # dictionary that maps GKG Record Id to Row Number
            i = 0
            for index, row in df.iterrows():
                identifier_dict[row['record_id']] = i
                i += 1

            df = df[df.columns[[4]]]
            df.columns = ['themes']

            df = pd.DataFrame(df['themes'].str.split(';'))  # splitting themes

            df_locations = pd.DataFrame(df_locations['locations'].str.split(
                ';'))  # splitting locations

            for row in df_locations.itertuples():
                for i in range(0, len(row.locations)):
                    try:
                        row.locations[i] = (row.locations[i].split('#'))[
                            3]  # for retaining only ADM1 Code
                    except:
                        continue
                # merged = list(itertools.chain(*row.locations))
                # df_locations.loc[row.Index, 'locations'] = merged

            df = df[pd.notnull(df['themes'])]

            mlb = MultiLabelBinarizer(sparse_output=True)
            sparse_themes = mlb.fit_transform(df['themes'])

            mlb2 = MultiLabelBinarizer(sparse_output=True)
            sparse_locations = mlb2.fit_transform(df_locations['locations'])

            mlb3 = MultiLabelBinarizer(sparse_output=True)
            sparse_counts = mlb3.fit_transform(df_counts['counts'])

            df = hstack([sparse_themes, sparse_locations, sparse_counts])

            # Reducing dimensions through principal component analysis

            if perform_pca:
                pca = PCA(n_components=None)
                df = pd.DataFrame(pca.fit_transform(df))

            # print("Starting clustering")
            brc = Birch(branching_factor=50,
                        n_clusters=None,
                        threshold=birch_thresh,
                        compute_labels=True)
            predicted_labels = brc.fit_predict(df)

            clusters = {}
            n = 0

            for item in predicted_labels:
                if item in clusters:
                    clusters[item].append(list((row_dict[n]).values(
                    )))  # since row_dict[n] is itself a dictionary
                else:
                    clusters[item] = [list((row_dict[n]).values())]
                n += 1

            # print(n)
            label = 0
            cluster_labels = [None] * n
            with open(file_prefix + '.txt', 'w', encoding='utf-8') as file:
                for item in clusters:
                    file.write("\n\nCluster " + str(item) + "\n")
                    for i in range(0, len(clusters[item])):
                        gkg_record_id = clusters[item][i][0]
                        file.write(
                            str(identifier_dict[gkg_record_id] + 1) + '\n' +
                            clusters[item][i][2] + '\n' +
                            clusters[item][i][3] + '\n\n')  # appending url
                        cluster_labels[identifier_dict[gkg_record_id]] = label
                    label += 1

            #print(cluster_labels)
            # cluster_labels = predicted_labels

            matrix = metrics.cluster.contingency_matrix(
                class_labels, cluster_labels)
            rand_index, precision, recall, f1 = precision_recall_fmeasure(
                matrix)

            ari = metrics.cluster.adjusted_rand_score(class_labels,
                                                      cluster_labels)
            # print("AdjustedRI:", ari)

            nmi = metrics.normalized_mutual_info_score(class_labels,
                                                       cluster_labels)
            # print("NMI       :", nmi)

            print(birch_thresh, ",", rand_index, ",", precision, ",", recall,
                  ",", f1, ",", ari, ",", nmi)
Example #52
    def __init__(self, cursor):

        DB = getdatabase(cursor)
        rentLog = DB.rentLog
        dealLog = DB.dealLog
        rentitems = DB.rent
        dealitems = DB.deal

        self.lens_to_internal_rentuser_ids = defaultdict(
            lambda: len(self.lens_to_internal_rentuser_ids))
        self.lens_to_internal_rentitem_ids = defaultdict(
            lambda: len(self.lens_to_internal_rentitem_ids))
        self.lens_to_internal_dealuser_ids = defaultdict(
            lambda: len(self.lens_to_internal_dealuser_ids))
        self.lens_to_internal_dealitem_ids = defaultdict(
            lambda: len(self.lens_to_internal_dealitem_ids))

        # itemLogs
        rentLog, self.n_users_rent, self.n_items_rent = self.raw_ratings(
            rentLog, 'rent')
        dealLog, self.n_users_deal, self.n_items_deal = self.raw_ratings(
            dealLog, 'deal')

        self.sparse_rent = self.interactions_list_to_sparse_matrix(
            rentLog, self.n_users_rent, self.n_items_rent)
        self.sparse_deal = self.interactions_list_to_sparse_matrix(
            dealLog, self.n_users_deal, self.n_items_deal)

        self.user_indicator_features_rent = sparse.identity(self.n_users_rent)
        self.item_indicator_features_rent = sparse.identity(self.n_items_rent)
        self.user_indicator_features_deal = sparse.identity(self.n_users_deal)
        self.item_indicator_features_deal = sparse.identity(self.n_items_deal)

        # itemCategories
        rentitems_categories, rentitems_titles = self.items_categories(
            rentitems)
        dealitems_categories, dealitems_titles = self.items_categories(
            dealitems)

        rent_categories = [
            rentitems_categories[internal_id]
            for internal_id in range(self.n_items_rent)
        ]
        deal_categories = [
            dealitems_categories[internal_id]
            for internal_id in range(self.n_items_deal)
        ]

        rent_categories_features = MultiLabelBinarizer().fit_transform(
            rent_categories)
        deal_categories_features = MultiLabelBinarizer().fit_transform(
            deal_categories)

        rent_categories_features = sparse.coo_matrix(rent_categories_features)
        deal_categories_features = sparse.coo_matrix(deal_categories_features)

        # result
        self.full_rentitem_features = sparse.hstack(
            [self.item_indicator_features_rent, rent_categories_features])
        self.full_dealitem_features = sparse.hstack(
            [self.item_indicator_features_deal, deal_categories_features])
Example #53
yearList = []
years = data["year"].tolist()
for year in years:
	try:
		yearList.append(int(year))	
	except ValueError as err:
		yearList.append(0) 
yearList = pd.Series(yearList)

# Pre-process genres
# Store as elements of list
genres = data["genres"].tolist()
genreList = []
for group in genres:
	genreList.append(group.split(", "))
mlb = MultiLabelBinarizer()
mlb.fit(genreList)

# Number of unique genres
numGenres = len(mlb.classes_)
encoded_genres = mlb.transform(genreList) 

# Existence matrix to show if each genre corresponds to each track or not (0 or 1)
genreDF = pd.DataFrame(encoded_genres,columns = mlb.classes_)
genreDF['track_id'] = trackids
genreDF.set_index('track_id')

data['encoded_artist'] = encoded_artists.values
data['encoded_country'] = encoded_countries.values
data['encoded_year'] = yearList.values
data = data.merge(genreDF, on='track_id')
Example #54
import joblib
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
multilabel = MultiLabelBinarizer()

model = joblib.load('lps_model.pkl')
tfidf = joblib.load('tfidf_model.pkl')

df = pd.read_csv('https://raw.githubusercontent.com/nagappanm/Python-Machine-Learning/master/Multi_Label_Text_Classification_with_Skmultilearn/data/so_dataset_updated_blank.csv')
df['Tagsupdated']=df['Tagsupdated'].fillna("")
df['Tagsupdated'] = df['Tagsupdated'].apply(lambda x: x.split(','))

y = multilabel.fit_transform(df['Tagsupdated'])

x = [ 'how to write code in query and php Histogram:']

y_predict = model.predict(tfidf.transform(x))

print("The Output from model is:",(y_predict.toarray()));

print("The Output from model is:",multilabel.inverse_transform(y_predict));

inverseTransformList = multilabel.inverse_transform(y_predict)

out = [item.strip() for t in inverseTransformList for item in t]
out
from collections import OrderedDict
out = list(OrderedDict.fromkeys(out))
print("The List distinct is:",out)
Example #55
File: SVM.py  Project: goonhu/NN
# load names (reviews) files and labels files inside the ZIP file into the memory

with open("FILE_NAMES.txt") as f:
    FILE_NAMES = f.read().split("\n")

with open("FILE_LABELS.txt") as f:
    FILE_LABELS = f.read().split("\n")

FILE_NAMES_tokens = [FILE_NAME.split() for FILE_NAME in FILE_NAMES]

# Load the module that transforms the names (reviews) into binary vectors:
# MultiLabelBinarizer gives a one-hot encoding of the FILE_NAMES tokens
from sklearn.preprocessing import MultiLabelBinarizer

one_hot_encoding = MultiLabelBinarizer()
one_hot_encoding.fit(FILE_NAMES_tokens)

# need to divide the data into training and test

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(FILE_NAMES_tokens, FILE_LABELS, test_size = 0.2, random_state = None)

# Create an SVM classifier with LinearSVC and then train it

from sklearn.svm import LinearSVC
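# The snippet is cut off here; a hedged sketch of how the training and evaluation steps
# described in the comments might continue, reusing the variables defined above.
from sklearn.metrics import accuracy_score

clf = LinearSVC()
clf.fit(one_hot_encoding.transform(X_train), Y_train)

predictions = clf.predict(one_hot_encoding.transform(X_test))
print('accuracy:', accuracy_score(Y_test, predictions))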
from utils.config import root
import os
import numpy as np
import sys
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))

from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from model.bert.metrics import f1_np

mlb = MultiLabelBinarizer()
all_labels = [
    '生物性污染', '细胞有丝分裂不同时期的特点', '液泡的结构和功能', '组成细胞的化学元素', '兴奋在神经纤维上的传导', '不完全显性',
    '免疫系统的组成', '生物技术在其他方面的应用', '群落的结构', '中央官制——三公九卿制', '核糖体的结构和功能',
    '人体免疫系统在维持稳态中的作用', '皇帝制度', '激素调节', '伴性遗传', '地球运动的地理意义', '宇宙中的地球',
    '地球运动的基本形式', '基因工程的原理及技术', '体液免疫的概念和过程', '基因的分离规律的实质及应用', '蛋白质的合成',
    '地球的内部圈层结构及特点', '人口增长与人口问题', '经济学常识', '劳动就业与守法经营', '器官移植', '生物技术实践',
    '垄断组织的出现', '基因工程的概念', '神经调节和体液调节的比较', '人口与城市', '组成细胞的化合物', '地理', '文艺的春天',
    '生物工程技术', '基因的自由组合规律的实质及应用', '郡县制', '人体水盐平衡调节', '内质网的结构和功能', '人体的体温调节',
    '免疫系统的功能', '科学社会主义常识', '与细胞分裂有关的细胞器', '太阳对地球的影响', '古代史', '清末民主革命风潮',
    '复等位基因', '人工授精、试管婴儿等生殖技术', '“重农抑商”政策', '生态系统的营养结构', '减数分裂的概念',
    '地球的外部圈层结构及特点', '细胞的多样性和统一性', '政治', '工业区位因素', '细胞大小与物质运输的关系', '夏商两代的政治制度',
    '农业区位因素', '溶酶体的结构和功能', '生产活动与地域联系', '内环境的稳态', '遗传与进化', '胚胎移植', '生物科学与社会',
    '近代史', '第三产业的兴起和“新经济”的出现', '公民道德与伦理常识', '中心体的结构和功能', '社会主义市场经济的伦理要求', '高中',
    '选官、用官制度的变化', '减数分裂与有丝分裂的比较', '遗传的细胞基础', '地球所处的宇宙环境', '培养基与无菌技术',
    '生活中的法律常识', '高尔基体的结构和功能', '社会主义是中国人民的历史性选择', '人口迁移与人口流动', '现代史', '地球与地图',
    '走进细胞', '生物', '避孕的原理和方法', '血糖平衡的调节', '现代生物技术专题', '海峡两岸关系的发展', '生命活动离不开细胞',
    '兴奋在神经元之间的传递', '历史', '分子与细胞', '拉马克的进化学说', '遗传的分子基础', '稳态与环境'
]
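The snippet stops after declaring the binarizer and the full label inventory. A plausible next step, sketched here with made-up label sets (y_true_lists and y_pred_lists are illustrative, not from the original), is to fit the binarizer on the fixed inventory and score predictions with the imported f1_score:

# fit on the fixed inventory so the column order matches all_labels
mlb.fit([all_labels])

# toy true / predicted label sets, purely illustrative
y_true_lists = [['生物', '分子与细胞'], ['历史', '皇帝制度']]
y_pred_lists = [['生物'], ['历史', '郡县制']]

y_true = mlb.transform(y_true_lists)
y_pred = mlb.transform(y_pred_lists)

print(f1_score(y_true, y_pred, average='micro'))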
Example #57
    def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)
Example #58
class ClassificationProcessor(BaseProcessor):
    """
    Corpus Pre Processor class
    """
    def __init__(self, multi_label=False, **kwargs):
        super(ClassificationProcessor, self).__init__(**kwargs)
        self.multi_label = multi_label
        self.multi_label_binarizer: MultiLabelBinarizer = None

    def info(self):
        info = super(ClassificationProcessor, self).info()
        info['task'] = kashgari.CLASSIFICATION
        info['config']['multi_label'] = self.multi_label
        return info

    def _build_label_dict(self, labels: List[str]):
        if self.multi_label:
            label_set = set()
            for i in labels:
                label_set = label_set.union(list(i))
        else:
            label_set = set(labels)
        self.label2idx = {}
        for idx, label in enumerate(sorted(label_set)):
            self.label2idx[label] = len(self.label2idx)

        self.idx2label = dict([(value, key)
                               for key, value in self.label2idx.items()])
        self.dataset_info['label_count'] = len(self.label2idx)
        self.multi_label_binarizer = MultiLabelBinarizer(
            classes=list(self.label2idx.keys()))

    def process_y_dataset(self,
                          data: List[str],
                          max_len: Optional[int] = None,
                          subset: Optional[List[int]] = None) -> np.ndarray:
        if subset is not None:
            target = utils.get_list_subset(data, subset)
        else:
            target = data
        if self.multi_label:
            return self.multi_label_binarizer.fit_transform(target)
        else:
            numerized_samples = self.numerize_label_sequences(target)
            return to_categorical(numerized_samples, len(self.label2idx))

    def numerize_token_sequences(self, sequences: List[List[str]]):
        result = []
        for seq in sequences:
            if self.add_bos_eos:
                seq = [self.token_bos] + seq + [self.token_eos]
            unk_index = self.token2idx[self.token_unk]
            result.append(
                [self.token2idx.get(token, unk_index) for token in seq])
        return result

    def numerize_label_sequences(self, sequences: List[str]) -> List[int]:
        """
        Convert label sequence to label-index sequence
        ``['O', 'O', 'B-ORG'] -> [0, 0, 2]``

        Args:
            sequences: label sequence, list of str

        Returns:
            label-index sequence, list of int
        """
        return [self.label2idx[label] for label in sequences]

    def reverse_numerize_label_sequences(self, sequences, **kwargs):
        if self.multi_label:
            return self.multi_label_binarizer.inverse_transform(sequences)
        else:
            return [self.idx2label[label] for label in sequences]
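For contrast with process_y_dataset above, a minimal standalone sketch (plain scikit-learn / NumPy calls, not part of the class) of the two target encodings it switches between:

from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# multi-label path: every sample carries a set of labels
mlb = MultiLabelBinarizer(classes=['a', 'b', 'c'])
print(mlb.fit_transform([['a', 'c'], ['b']]))
# [[1 0 1]
#  [0 1 0]]

# single-label path: every sample carries exactly one label index,
# one-hot encoded (what to_categorical produces)
label2idx = {'a': 0, 'b': 1, 'c': 2}
idxs = [label2idx[label] for label in ['a', 'c']]
print(np.eye(len(label2idx), dtype=int)[idxs])
# [[1 0 0]
#  [0 0 1]]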
Example #59
    def get_encoded_dataset(self):
        '''
            Loads training, validation and test data and performs preprocessing
            (stemming, stop-word removal, padding/truncation).

            returns: processed train, val and test data, the original test documents and their
                     indices, the word-to-index mapping, the fitted label encoder, and the test document ids
            rtype: dataframe, MultiLabelBinarizer, list
        '''

        print("\nLoading data (train, val, test)...")

        mlb = MultiLabelBinarizer()

        train, val, test, test_raw = load_preprocess_data()

        # extract training document (HPISection) and corresponding label (Dx)
        X_train = train['HPISection']
        train['Dx'] = train['Dx'].str.split(',')
        y_train = train['Dx']
        y_train = mlb.fit_transform(list(y_train))

        # set prediction labels
        self.TARGET_NAMES = mlb.classes_

        # extract validation set
        X_val = val['HPISection']
        val['Dx'] = val['Dx'].str.split(',')
        y_val = val['Dx']
        y_val = mlb.transform(list(y_val))

        # extract test data and format
        test, y_test = format_test(test, mlb)

        # save gold standard
        save_gs(test)

        # Extract test set
        X_test = test['HPISection']
        X_test_original = test_raw['HPISection']
        X_test_original_index = test_raw['index']

        self.MAX_SEQ_LENGTH = max([len(s.split()) for s in X_train])

        self.NUM_LABELS = y_train.shape[1]

        # fit a tokenizer
        tokenizer = self.create_tokenizer(X_train)

        # get word -> integer mapping
        word2idx = tokenizer.word_index

        print("\n Setting vocabulary size...")
        # Set vocabulary size
        self.set_num_words(word2idx)

        # save tokenizer
        # with open(OUTPUT_PATH + model_.MODEL_NAME + '_tokenizer.pickle', 'wb') as handle:
        #    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # pad sequences (train)
        x_train = tokenizer.texts_to_sequences(X_train)
        x_train = pad_sequences(x_train, maxlen=self.MAX_SEQ_LENGTH, padding='pre', truncating='pre')

        # pad sequences (test)
        x_test = tokenizer.texts_to_sequences(X_test)
        x_test = pad_sequences(x_test, maxlen=self.MAX_SEQ_LENGTH, padding='pre', truncating='pre')

        # pad sequences (val)
        x_val = tokenizer.texts_to_sequences(X_val)
        x_val = pad_sequences(x_val, maxlen=self.MAX_SEQ_LENGTH, padding='pre', truncating='pre')

        return x_train, y_train, x_val, y_val, x_test, y_test, X_test_original, X_test_original_index, word2idx, mlb, \
               test['id']
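Because all three splits are padded and truncated to MAX_SEQ_LENGTH with padding='pre' and truncating='pre', short sequences are zero-padded on the left and long ones lose their leading tokens. A minimal standalone sketch of that behaviour (assuming the Keras pad_sequences helper the snippet above appears to use):

from tensorflow.keras.preprocessing.sequence import pad_sequences

# maxlen=3: [1, 2] is left-padded, [3, 4, 5, 6] loses its leading token
print(pad_sequences([[1, 2], [3, 4, 5, 6]], maxlen=3, padding='pre', truncating='pre'))
# [[0 1 2]
#  [4 5 6]]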
Example #60

#Get labels of all the train businesses as arrays
train_labels = numpy.array(
    [getLabels(y) for y in train_business_features['label']])

#Get feature vectors of all the train businesses as arrays
train_features = numpy.array(
    [getFeatureVectors(x) for x in train_business_features['feature vector']])

#Get feature vectors of all the test businesses as arrays
test_features = numpy.array(
    [getFeatureVectors(x) for x in test_business_features['feature vector']])

#Convert train labels into a binary indicator format for multi-label classification
mul_bin = MultiLabelBinarizer()
train_labels_bin = mul_bin.fit_transform(train_labels)

#Split the train data set to predict f1 score on 20% of the train data
random_state = numpy.random.RandomState(0)
train_feat, test_feat, train_lab, test_lab = train_test_split(
    train_features, train_labels_bin, test_size=.2, random_state=random_state)

#Initialize the linear svm classifier
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))

#Train svm using 80% of train data
classifier.fit(train_feat, train_lab)

#Predict labels of 20% of train data
predict_test_lab = classifier.predict(test_feat)
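The snippet ends after predicting on the held-out 20%; the f1 score mentioned in the split comment above would presumably be computed along these lines (a sketch, assuming sklearn.metrics.f1_score):

from sklearn.metrics import f1_score

#Micro-averaged F1 over the binary indicator matrices produced above
print(f1_score(test_lab, predict_test_lab, average='micro'))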