Beispiel #1
0
    def __init__(self, inter_filePath = "inter/technology_companies_of_the_united_states/"):
        """Train a one-vs-rest SVM on mean category vectors and print its F1.

        Loads the pre-trained word2vec model into ``self.m``, reads the train
        and test category pages found under ``inter_filePath``, turns every
        page's context categories into a vector via
        ``self.predict_vector_by_mean`` and reports the macro and micro F1 of
        ``self.ovrSVM`` on the test pages.

        Parameters
        ----------
        inter_filePath : str
            Path prefix (including the trailing separator) that is prepended
            to ``category_page.txt`` and ``category_page_test.txt``.
        """
        # [[cat,cat...]...]
        self.m = Word2Vec.load_word2vec_format("vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True)
        self.dim = 400

        (correct_categories_train, context_categories_train) = self.load_category_page(inter_filePath + "category_page.txt")
        (correct_categories_test, context_categories_test) = self.load_category_page(inter_filePath + "category_page_test.txt")

        ## ----  Features by mean embedding ---
        Xvectors = np.array(self.predict_vector_by_mean(context_categories_train))
        Xvectors_test = np.array(self.predict_vector_by_mean(context_categories_test))

        ## ----  Bag-of-words features (only used by the disabled variants below) ---
        corpus_train = [" ".join(i) for i in context_categories_train]
        corpus_test = [" ".join(i) for i in context_categories_test]
        cv = CountVectorizer(min_df = 1)
        X = cv.fit_transform(corpus_train)
        ## TFIDF
        # NOTE(review): X_tfidf is computed but never consumed in this method.
        transformer = TfidfTransformer()
        X_tfidf = transformer.fit_transform(X)

        ## ----  Labels ---
        # Fit on train + test labels so both splits share one column layout.
        mlb = MultiLabelBinarizer()
        mlb.fit(correct_categories_train + correct_categories_test)
        Y = mlb.transform(correct_categories_train)  # multilabel indicator matrix
        X_test = cv.transform(corpus_test)
        Y_test = mlb.transform(correct_categories_test)

        ## ----  Predict test labels ---
        #Y_predict_ovr = self.ovrSVM(X, Y, X_test)
        Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test)
        #Y_predict_ovo = self.ovoSVM(X, Y, X_test)

        # Single-argument print calls produce the same text on Python 2 and 3.
        print("---One versus rest---")
        print("Macro F-1: %s" % f1_score(Y_test, Y_predict_ovr, average='macro'))
        print("Micro F-1: %s" % f1_score(Y_test, Y_predict_ovr, average='micro'))
Beispiel #2
0
def print_report(name_classificator, testing_problems, testing_tags, predicted_problems, predicted_tags):
    """Print a classification report for one classifier's predicted tags.

    The predictions are first aligned with the testing problems through
    ``make_right_order``. True and predicted tags are then binarized with a
    single encoder fitted on both, so the two indicator matrices share the
    same columns.
    """
    _, aligned_tags = make_right_order(testing_problems, predicted_problems, predicted_tags)

    binarizer = MultiLabelBinarizer()
    binarizer.fit(testing_tags + aligned_tags)
    y_true = binarizer.transform(testing_tags)
    y_pred = binarizer.transform(aligned_tags)

    print(name_classificator)
    report = classification_report(y_true, y_pred, target_names=binarizer.classes_)
    print(report)
    ranking_score = label_ranking_average_precision_score(y_true, y_pred)
    print('label ranking average precision score =', ranking_score)
    print('\n', ('#'*100), '\n')
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
    """Train a one-vs-rest random forest on TF-IDF features and print scores.

    Labels outside the active label set are dropped from every sample before
    binarization. Sentences are vectorized with ``tf_idf_fit_transform``
    (defined elsewhere in the file), which is fitted on the training
    sentences and then reused for the test sentences.

    Parameters
    ----------
    trainSentences, testSentences
        Inputs handed to ``tf_idf_fit_transform`` / the fitted vectorizer.
    trainLabels, testLabels
        One iterable of label names per sentence.

    Returns
    -------
    None
        Micro, macro and per-class precision/recall/F1 are printed.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, precision_score, recall_score
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer

    all_labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane",
                  "Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations",
                  "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                  "injured_or_dead_people", "missing_trapped_or_found_people"]
    # Alternative label subsets; assign one of them to curr_labels to restrict
    # the experiment to a single event family.
    disaster_labels = ["Drought", "Earthquake", "Flood", "Hurricane",
                       "Tornado", "Tsunami", "displaced_people_and_evacuations",
                       "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]
    health_labels = ["Epidemic", "displaced_people_and_evacuations",
                     "donation_needs_or_offers_or_volunteering_services",
                     "injured_or_dead_people"]
    conflict_labels = ["Rebellion", "Terrorism", "displaced_people_and_evacuations",
                       "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]
    curr_labels = all_labels

    # Keep only the labels that belong to the active label set.
    trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
    testLabels = [list(set(l).intersection(curr_labels)) for l in testLabels]

    # A fixed ``classes`` list pins the column order of the indicator matrices.
    mlb = MultiLabelBinarizer(classes=curr_labels)
    mlb.fit(trainLabels)
    print("Labels : ", mlb.classes_)
    train_label_matrix = mlb.transform(trainLabels)
    test_label_matrix = mlb.transform(testLabels)
    print("Shape of label matrix : ", test_label_matrix.shape)

    train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
    test_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", test_matrix.shape)

    # sklearn.svm.LinearSVC() is a drop-in alternative base estimator.
    estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, train_label_matrix)
    predictions = classifier.predict(test_matrix)

    print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
    print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
    print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
    print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
    # average=None yields one score per label (in mlb.classes_ order), so these
    # rows are per-class values rather than macro averages.
    print("Per-class Precision", precision_score(test_label_matrix, predictions, average=None))
    print("Per-class Recall", recall_score(test_label_matrix, predictions, average=None))
    print("Per-class F1", f1_score(test_label_matrix, predictions, average=None))
Beispiel #4
0
class TimeSeriesLabelTransformer(BaseTaskTransformer):
    '''Encode interval label annotations of a JAMS object as binary tag targets.'''

    def __init__(self, namespace, name, labels=None):
        '''Initialize a time-series label transformer

        Parameters
        ----------
        namespace
            Forwarded to the ``BaseTaskTransformer`` initializer.

        name : str
            Suffix of the ``output_<name>`` / ``mask_<name>`` keys produced
            by ``transform``.

        labels : iterable
            The label vocabulary; it is given to the encoder as one sample,
            so every element becomes a class.
        '''

        super(TimeSeriesLabelTransformer, self).__init__(namespace, 0)

        # NOTE(review): the default ``labels=None`` is handed to the encoder
        # unchanged - confirm that callers always supply a label collection.
        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        # Set copy of the vocabulary for cheap membership tests in transform().
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):
        '''Build the encoded target and mask for one JAMS object.

        Parameters
        ----------
        jam
            Object exposing ``file_metadata.duration``; its annotation is
            looked up with ``self.find_annotation``.

        Returns
        -------
        dict
            ``output_<name>``: result of ``self.encode_intervals``;
            ``mask_<name>``: True when an annotation was found, else False.
        '''

        ann = self.find_annotation(jam)

        duration = jam.file_metadata.duration

        # Start with one unlabeled interval spanning the whole track.
        intervals = np.asarray([[0.0, duration]])
        values = [None]
        mask = False

        if ann:
            ann_int, ann_val = ann.data.to_interval_values()
            intervals = np.vstack([intervals, ann_int])
            values.extend(ann_val)
            mask = True

        # Suppress all values not in the encoder: they become empty label sets
        # (all-zero rows). One batched call replaces one call per value.
        tags = np.asarray(self.encoder.transform(
            [[v] if v in self._classes else [] for v in values]))

        target = self.encode_intervals(duration, intervals, tags)
        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
def test_multilabel_classification_report():
    # The report must be identical whether the labels are given in the form
    # returned by make_multilabel_classification or as binarizer output.
    n_classes = 4
    n_samples = 50

    def _random_labels(seed):
        # Only the label part of the generated dataset is needed.
        _, labels = make_multilabel_classification(n_features=1,
                                                   n_classes=n_classes,
                                                   random_state=seed,
                                                   n_samples=n_samples)
        return labels

    y_true_ll = _random_labels(0)
    y_pred_ll = _random_labels(1)

    expected_report = """\
             precision    recall  f1-score   support

          0       0.50      0.67      0.57        24
          1       0.51      0.74      0.61        27
          2       0.29      0.08      0.12        26
          3       0.52      0.56      0.54        27

avg / total       0.45      0.51      0.46       104
"""

    binarizer = MultiLabelBinarizer()
    binarizer.fit([range(n_classes)])
    y_true_bi = binarizer.transform(y_true_ll)
    y_pred_bi = binarizer.transform(y_pred_ll)

    label_pairs = [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]
    for y_true, y_pred in label_pairs:
        assert_equal(classification_report(y_true, y_pred), expected_report)
Beispiel #6
0
def load_data(config=None):
    """
    Load the Reuters dataset.

    Documents are split by their ``test/`` / ``training/`` file-id prefix,
    vectorized with TF-IDF (fitted on the training documents only) and their
    categories are binarized with an encoder fitted on the training
    categories.

    Parameters
    ----------
    config : dict, optional
        Currently unused. Defaults to None rather than a shared mutable dict.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {
        'train': [reuters.raw(doc_id) for doc_id in train],
        'test': [reuters.raw(doc_id) for doc_id in test],
    }

    # Dense feature matrices; the vocabulary comes from the training split.
    xs = {}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()

    ys = {}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])

    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            # Read from the module-level ``labels`` global, defined elsewhere.
            'labels': globals()["labels"]}
    return data
def fit_images():
    """Fit a 5-nearest-neighbour classifier on stored image histograms.

    Reads the ``mapped_responses`` and ``labels_binary`` collections of the
    local ``image_annotation`` MongoDB database, fits a
    ``KNeighborsClassifier`` on each response's ``hist['0']`` entry against
    its ``binary_results`` and pickles the model to ``model.data`` inside the
    build directory. The image number of every response is recorded in
    ``indexes`` (a name defined outside this function).
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        db = client['image_annotation']
        responses = db['mapped_responses'].find()
        no_labels = db['labels_binary'].find()

        # One single-element label set per known label number.
        numbers = [set([int(entry["number"])]) for entry in no_labels]
        mlb = MultiLabelBinarizer()
        mlb.fit(numbers)

        train_data = []
        labels = []
        for index, instance in enumerate(responses):
            t_data = instance['hist']['0']
            indexes[index] = instance['image_no']
            train_data.append(t_data)
            label = instance['binary_results']
            # NOTE(review): the binarized form is computed but the raw
            # ``label`` is what gets stored for training - confirm intent.
            mlb.transform([set([int(value)]) for value in label])
            labels.append(label)

        classifier = KNeighborsClassifier(n_neighbors = 5, weights='uniform')
        classifier.fit(train_data, labels)

        build_dir = getBuildDir()
        # Pickle protocol 1 is a binary format: write in binary mode and make
        # sure the file handle is closed.
        with open(join(build_dir, 'model.data'), 'wb') as model_file:
            pickle.dump(classifier, model_file, protocol=1)
    finally:
        # Release the connection even when loading or fitting fails.
        client.close()
class ACMClassificator(BaseACMClassificator):
    """Tag classifier: token counts -> one-vs-rest extremely randomized tree."""

    def __init__(self):
        """Create the (unfitted) vectorizer, tag binarizer and classifier."""
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        base_tree = ExtraTreeClassifier(
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            class_weight=None,
        )
        self.classificator = OneVsRestClassifier(base_tree, n_jobs=-1)

    def _prepare_problems(self, problems):
        """Vectorize the ``statement`` attribute of every problem."""
        statements = [problem.statement for problem in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit vectorizer, tag binarizer and classifier on the given data."""
        nltk.download('punkt', quiet=True)
        statements = [problem.statement for problem in problems]
        self.vectorizer.fit(statements)
        features = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        targets = self.mlb.transform(tags)
        # The count matrix is densified before it reaches the classifier.
        self.classificator.fit(features.toarray(), targets)

    def predict(self, problems):
        """Return the predicted tags of every problem, decoded by the binarizer."""
        features = self._prepare_problems(problems)
        indicator = self.classificator.predict(features.toarray())
        return self.mlb.inverse_transform(indicator)
Beispiel #9
0
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.

    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        """ Vectorize features and labels of every question in data.

        Xdict, Ydict -> already fitted vectorizer / label binarizer to reuse
        (typically the ones of the training set); when None, fresh ones are
        fitted on this data. """
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict

            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this.  The known classes are copied into a set once so
            # that each membership test does not scan the classes_ array.
            known_classes = set(self.Ydict.classes_)
            known_lset = [set(label for label in ls if label in known_classes) for ls in lset]
            lset_n = sum(len(ls) for ls in lset)
            known_lset_n = sum(len(ls) for ls in known_lset)
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)

            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.

        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities not predicted)

        Returns a dict with keys sklScore, qRecallAll, qRecallAny, pPrec
        and qScoreMRR. """
        # Densify the feature matrix once and reuse it for every call below.
        X = self.X.toarray()

        skl_score = cfier.score(X, self.Y)

        # XXX: Matched paths might/could be weighted by their nMatches too...

        # Measure prediction performance
        Ypred = cfier.predict(X)
        n_q = float(np.size(self.Y, axis=0))
        n_correct = np.sum(self.Y, axis=1)           # correct paths per question
        n_hit = np.sum(Ypred * self.Y, axis=1)       # correct paths predicted
        # number of questions where all correct paths have been recalled
        recall_all = np.sum(n_correct == n_hit) / n_q
        # number of questions where at least one correct path has been recalled
        recall_any = np.sum((n_correct != 0) == (n_hit != 0)) / n_q
        # number of *PATHS* (not q.) that were correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, X)
        # MRR of first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # number of questions where at least one correct path has been recalled in top N paths
        # TODO

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
Beispiel #10
0
class ncClassifier(object):
    """Multi-label node classifier working on pre-computed embeddings."""

    def __init__(self, emb_dict, clf):
        """Store the embedding lookup and wrap ``clf`` in a ``TopKRanker``.

        emb_dict: mapping indexed by the items later passed as ``X``.
        clf: base classifier handed to ``TopKRanker`` (here clf is LR).
        """
        self.embeddings = emb_dict
        self.clf = TopKRanker(clf)  # here clf is LR
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def split_train_evaluate(self, X, Y, train_precent, seed=None):
        """Shuffle, split, train on the first part and evaluate on the rest.

        X: items that index ``self.embeddings``; Y: their label lists.
        train_precent: fraction of the samples used for training.
        seed: seed for numpy's global RNG while splitting and training.

        The caller's global RNG state is captured *before* seeding and put
        back afterwards (also when training raises), so calling this method
        does not disturb random numbers drawn elsewhere.

        Returns the dict produced by ``evaluate`` on the held-out part.
        """
        state = np.random.get_state()
        try:
            np.random.seed(seed=seed)
            training_size = int(train_precent * len(X))
            shuffle_indices = np.random.permutation(np.arange(len(X)))
            X_train = [X[shuffle_indices[i]] for i in range(training_size)]
            Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]
            X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
            Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]
            self.train(X_train, Y_train, Y)
        finally:
            np.random.set_state(state)
        return self.evaluate(X_test, Y_test)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on ``Y_all`` and the classifier on ``(X, Y)``."""
        # to support multi-labels, fit means dict mapping {orig cat: binarized vec}
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[x] for x in X]
        # the binarizer is already fitted on Y_all, so a plain transform is enough
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train, Y)

    def predict(self, X, top_k_list):
        """Predict labels for ``X``; ``top_k_list`` is forwarded to the ranker."""
        X_ = np.asarray([self.embeddings[x] for x in X])
        # see TopKRanker(OneVsRestClassifier)
        # the top k probs to be output...
        Y = self.clf.predict(X_, top_k_list=top_k_list)
        return Y

    def evaluate(self, X, Y):
        """Print and return micro/macro/samples/weighted F1 on ``(X, Y)``."""
        # multi-labels, diff len of labels of each node
        top_k_list = [len(l) for l in Y]
        Y_ = self.predict(X, top_k_list)  # pred val of X_test i.e. Y_pred
        Y = self.binarizer.transform(Y)  # true val i.e. Y_test
        averages = ["micro", "macro", "samples", "weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        print(results)
        return results
Beispiel #11
0
class Classifier(object):
    """Multi-label classifier operating on pre-computed embedding vectors."""

    def __init__(self, vectors, clf):
        """Keep the embedding lookup and wrap ``clf`` in a ``TopKRanker``."""
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on ``Y_all``, then the ranker on ``(X, Y)``."""
        self.binarizer.fit(Y_all)
        features = [self.embeddings[item] for item in X]
        targets = self.binarizer.transform(Y)
        self.clf.fit(features, targets)

    def evaluate(self, X, Y):
        """Print and return the F1 score under four averaging modes."""
        # Each sample is asked for as many labels as it really has.
        label_counts = [len(labels) for labels in Y]
        predicted = self.predict(X, label_counts)
        expected = self.binarizer.transform(Y)
        results = {
            average: f1_score(expected, predicted, average=average)
            for average in ["micro", "macro", "samples", "weighted"]
        }
        print(results)
        return results

    def predict(self, X, top_k_list):
        """Return the ranker's prediction for the embeddings of ``X``."""
        vectors = numpy.asarray([self.embeddings[item] for item in X])
        return self.clf.predict(vectors, top_k_list=top_k_list)

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        """Shuffle with ``seed``, train on the first part, evaluate the rest.

        The global numpy RNG state is saved before seeding and restored once
        training has finished.
        """
        saved_state = numpy.random.get_state()

        sample_count = len(X)
        training_size = int(train_precent * sample_count)
        numpy.random.seed(seed)
        order = numpy.random.permutation(numpy.arange(sample_count))

        train_positions = range(training_size)
        test_positions = range(training_size, sample_count)
        X_train = [X[order[pos]] for pos in train_positions]
        Y_train = [Y[order[pos]] for pos in train_positions]
        X_test = [X[order[pos]] for pos in test_positions]
        Y_test = [Y[order[pos]] for pos in test_positions]

        self.train(X_train, Y_train, Y)
        numpy.random.set_state(saved_state)
        return self.evaluate(X_test, Y_test)
Beispiel #12
0
def load_data(train_set, multilabel=True):
    """Turn (text, labels) pairs into padded sequences and label tensors.

    Side effects: writes ``class_dict.json`` (label -> class index) and
    ``word_index.json`` (the tokenizer's word index) to the working
    directory and prints progress information.

    Parameters
    ----------
    train_set : iterable
        Yields ``(vector, target)`` pairs; ``vector`` is fed to the
        tokenizer as text and ``target`` is a sequence of labels.
    multilabel : bool
        When True the labels are encoded with ``MultiLabelBinarizer``,
        otherwise with ``to_categorical``.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val, nb_classes, word_index)`` with a
        90/10 train/validation split (``random_state=42``).
    """
    X_data = []
    y_data = []
    for c, (vector, target) in enumerate(train_set):  # load one vector into memory at a time
        X_data.append(vector)
        y_data.append(target)
        if c % 10000 == 0:
            print(c)

    print('%d training examples' % len(X_data))

    # Dictionary of classes.  Plain Python ints (rather than numpy integers)
    # keep the mapping JSON-serializable.
    class_list = list(set([y for y_seq in y_data for y in y_seq]))
    nb_classes = len(class_list)
    print('%d classes' % nb_classes)
    class_dict = {label: index for index, label in enumerate(class_list)}
    with open('class_dict.json', 'w') as fp:
        json.dump(class_dict, fp)
    print('Exported class dictionary')

    # NOTE(review): only the first label of every sample is kept, also in
    # multilabel mode - confirm this is intended.
    y_data_int = [[class_dict[y_seq[0]]] for y_seq in y_data]

    # Tokenize and pad text.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(X_data)
    X_data = tokenizer.texts_to_sequences(X_data)
    word_index = tokenizer.word_index
    print('Found %s unique tokens' % len(word_index))
    with open('word_index.json', 'w') as fp:
        json.dump(word_index, fp)
    print('Exported word dictionary')
    X_data = pad_sequences(X_data,
                           maxlen=MAX_SEQUENCE_LENGTH,
                           padding='post',
                           truncating='post',
                           dtype='float32')
    print('Shape of data tensor: %s' % (X_data.shape,))

    if multilabel:
        mlb = MultiLabelBinarizer()
        # Fit on the full index range so every class gets a column.
        mlb.fit([list(class_dict.values())])
        y_data = mlb.transform(y_data_int)
    else:
        # The former second to_categorical call referenced an undefined
        # name and its result was never used, so it has been dropped.
        y_data = to_categorical(y_data_int)

    print('Shape of label tensor: %s' % (y_data.shape,))

    X_train, X_val, y_train, y_val = train_test_split(X_data,
                                                      y_data,
                                                      test_size=0.1,
                                                      random_state=42)

    return X_train, X_val, y_train, y_val, nb_classes, word_index
Beispiel #13
0
def one_hot_encoding(data):
    """One-hot encode ``data`` by treating each value as a one-label sample.

    The encoder is fitted on the distinct values (``data.unique()``) and then
    applied to all of ``data.values``; the indicator matrix is returned.
    """
    vocabulary = [[value] for value in data.unique().tolist()]
    samples = [[value] for value in data.values]

    encoder = MultiLabelBinarizer()
    encoder.fit(vocabulary)
    return encoder.transform(samples)
def binarize_labels(class_list, train, val, test):
    """Binarize the label sets of the train, validation and test splits.

    The encoder is created with ``classes=class_list`` and fitted on the
    training labels only; validation and test labels are merely transformed,
    so all three matrices share the same column layout.

    Returns
    -------
    tuple
        ``(train, val, test)`` as binary indicator matrices.
    """
    labelencoder = MultiLabelBinarizer(classes=class_list)
    train = labelencoder.fit_transform(train)
    # Reuse the fitted encoder instead of refitting it on held-out data.
    val = labelencoder.transform(val)
    test = labelencoder.transform(test)
    # shape[1] is the number of class columns and also works for empty splits.
    print(
        "\nTotal classes detected in each set: \n Train = {}, \n Val = {}, \n Test= {}"
        .format(train.shape[1], val.shape[1], test.shape[1]))
    return train, val, test
Beispiel #15
0
def full_jrc_(jrc_data_home,
              langs,
              train_years,
              test_years,
              outpath,
              cat_policy='all',
              most_common_cat=300):
    """Build the 'JRC-Acquis-full' multilingual dataset and save it.

    Training documents are fetched for ``train_years`` with the categories
    selected by ``inspect_eurovoc``; test documents are fetched for
    ``test_years`` restricted to the label names of the training fetch.
    For every language the texts are run through a ``CountVectorizer``
    analyzer, number-masked with ``_mask_numbers`` and stored together with
    the binarized categories and document ids.

    :param jrc_data_home: data path forwarded to the fetch helpers
    :param langs: languages to include
    :param train_years: years used for the training documents
    :param test_years: years used for the test documents
    :param outpath: destination passed to ``MultilingualDataset.save``
    :param cat_policy: ``select`` argument of ``inspect_eurovoc``
    :param most_common_cat: ``most_frequent`` argument of the training fetch
    """

    print('fetching the datasets')
    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(langs=langs,
                                                 data_path=jrc_data_home,
                                                 years=train_years,
                                                 cat_filter=cat_list,
                                                 cat_threshold=1,
                                                 parallel=None,
                                                 most_frequent=most_common_cat)
    test_docs, _ = fetch_jrcacquis(langs=langs,
                                   data_path=jrc_data_home,
                                   years=test_years,
                                   cat_filter=label_names,
                                   parallel='force')

    def _group_by_lang(doc_list, langs):
        # Map every language to the documents written in it.
        return {
            lang: [d for d in doc_list if d.lang == lang]
            for lang in langs
        }

    training_docs = _group_by_lang(training_docs, langs)
    test_docs = _group_by_lang(test_docs, langs)

    # All label names are given as one sample so each becomes a class.
    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    # The name belongs on the dataset being built; the previous code assigned
    # it through the undefined name ``data``.
    dataset.dataset_name = 'JRC-Acquis-full'
    for lang in langs:
        analyzer = CountVectorizer(strip_accents='unicode',
                                   min_df=3,
                                   stop_words=stopwords.words(
                                       NLTK_LANGMAP[lang])).build_analyzer()

        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories,
                                d.parallel_id + '__' + d.id)
                               for d in training_docs[lang] if d.lang == lang])
        Xte, Yte, IDte = zip(*[(d.text, d.categories,
                                d.parallel_id + '__' + d.id)
                               for d in test_docs[lang] if d.lang == lang])
        # Re-join the analyzed tokens into whitespace-separated text.
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte,
                    IDtr, IDte)

    dataset.save(outpath)
Beispiel #16
0
def apply_multilabel_binarizer(data_frame):
    """Binarize the tokenized ``classification`` column of ``data_frame``.

    Returns a triple ``(indicator_matrix, class_names, number_of_classes)``.
    """
    print('###  multi_label_binarizer  ###')

    def _tokenize_row(row):
        # classification text -> label collection for this row
        return th.tokenize_complex_text_in_set(row['classification'])

    # from text to binary matrix, e.g. [[0, 1, 0],[1, 0, 1]]
    label_sets = data_frame.apply(_tokenize_row, axis=1).tolist()

    binarizer = MultiLabelBinarizer()
    binarizer.fit(label_sets)
    class_names = list(binarizer.classes_)
    indicator = binarizer.transform(label_sets)
    return indicator, class_names, len(class_names)
Beispiel #17
0
def build_juxtaposed_matrices(dataset_name,
                              langs,
                              training_docs,
                              test_docs,
                              label_names,
                              preprocess=True):
    """
    Builds the document-by-term weighted matrices for each language on one shared (juxtaposed) feature space:
    a single TF-IDF vectorizer is fitted on the training documents of all languages stacked together, so the
    per-language representations are not independent of each other.
    :param dataset_name: the name of the dataset (str)
    :param langs: list of languages (str)
    :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param label_names: list of names of labels (str)
    :param preprocess: whether or not to pass the documents through preprocess_documents (language-specific)
    :return: a MultilingualDataset holding, for each language, the weighted matrices, the binarized labels and
    the document ids of the training and test sets
    """

    result = MultilingualDataset()
    result.dataset_name = dataset_name

    binarizer = MultiLabelBinarizer()
    binarizer.fit([label_names])
    result.set_labels(binarizer.classes_)

    # First pass: collect the (optionally preprocessed) raw texts per language
    # and stack every training text for the shared vectorizer.
    stacked_training_texts = []
    for lang in langs:
        lang_train = training_docs[lang]
        lang_test = test_docs[lang]
        print("\nprocessing %d training and %d test for language <%s>" %
              (len(lang_train), len(lang_test), lang))
        tr_data, tr_labels, tr_ID = zip(*lang_train)
        te_data, te_labels, te_ID = zip(*lang_test)
        if preprocess:
            tr_data = preprocess_documents(tr_data, lang)
            te_data = preprocess_documents(te_data, lang)
        stacked_training_texts.extend(tr_data)
        result.add(lang, tr_data, tr_labels, te_data, te_labels,
                   tr_ID, te_ID)

    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 min_df=3,
                                 sublinear_tf=True)
    vectorizer.fit(stacked_training_texts)

    # Second pass: replace the stored raw texts and label lists by their
    # weighted / binarized counterparts.
    for lang in langs:
        print("\nweighting documents for language <%s>" % (lang))
        train_part, test_part = result[lang]
        tr_data, tr_labels, tr_ID = train_part
        te_data, te_labels, te_ID = test_part
        result.add(lang,
                   vectorizer.transform(tr_data),
                   binarizer.transform(tr_labels),
                   vectorizer.transform(te_data),
                   binarizer.transform(te_labels),
                   tr_ID, te_ID)

    result.show_dimensions()
    return result
Beispiel #18
0
def loading_json():
    """Load train/test/validation JSON files into DataFrames.

    Reads ``train.json``, ``test.json`` and ``validation.json`` from the
    working directory. For train and validation the ``images`` and
    ``annotations`` tables are inner-joined on ``imageId``; test only has
    ``images``. The ``labelId`` column is binarized (encoder fitted on train)
    solely to print the classes and matrix shapes.

    Returns
    -------
    tuple
        ``(train, test, validation)`` DataFrames.
    """
    script_start_time = time.time()

    def _minutes_elapsed():
        return (time.time() - script_start_time)/60

    def _read_json(path):
        with open(path) as json_data:
            return json.load(json_data)

    def _images_with_annotations(raw):
        # JSON to DataFrame: one row per image joined with its annotation.
        images = pd.DataFrame(raw['images'])
        annotations = pd.DataFrame(raw['annotations'])
        return pd.merge(images, annotations, on='imageId', how='inner')

    print('%0.2f min: Start loading data'%(_minutes_elapsed()))

    train = _read_json('train.json')
    test = _read_json('test.json')
    validation = _read_json('validation.json')

    print('Train No. of images: %d'%(len(train['images'])))
    print('Test No. of images: %d'%(len(test['images'])))
    print('Validation No. of images: %d'%(len(validation['images'])))

    train = _images_with_annotations(train)
    test = pd.DataFrame(test['images'])
    validation = _images_with_annotations(validation)

    for frame in (train, test, validation):
        frame['imageId'] = frame['imageId'].astype(np.uint32)

    print('%0.2f min: Finish loading data'%(_minutes_elapsed()))
    print('='*50)

    print('%0.2f min: Start converting label'%(_minutes_elapsed()))

    binarizer = MultiLabelBinarizer()
    train_label = binarizer.fit_transform(train['labelId'])
    validation_label = binarizer.transform(validation['labelId'])
    dummy_label_col = list(binarizer.classes_)
    print(dummy_label_col)
    print('%0.2f min: Finish converting label'%(_minutes_elapsed()))

    for table in [validation_label, train_label, test]:
        print(table.shape)

    return train, test, validation
Beispiel #19
0
def binarize(y_train, y_val, y_test):
    """Binarize three label splits with an encoder fitted on the training set.

    The classes found in ``y_train`` are printed with their column index.

    Returns
    -------
    tuple
        ``(y_train_bin, y_val_bin, y_test_bin, N_LABELS)`` where ``N_LABELS``
        is the number of classes seen in the training labels.
    """
    print("Labels:")
    encoder = MultiLabelBinarizer()
    encoder.fit(y_train)

    # Show every label next to its position in the indicator matrix.
    n_labels = len(encoder.classes_)
    for position, label in enumerate(encoder.classes_):
        print("{}. {}".format(position, label))

    # The same fitted encoder is applied to all three splits.
    encoded_splits = tuple(
        encoder.transform(split) for split in (y_train, y_val, y_test))
    return encoded_splits + (n_labels,)
Beispiel #20
0
def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home,
                                        train_years, test_years, cat_policy,
                                        most_common_cat, outpath):
    """Rebuild a JRC dataset restricted to the document ids of a saved one.

    The train/test ids (and thereby the languages) are read from the dataset
    stored at ``datasetpath``. JRC-Acquis documents are fetched again, only
    those whose ``<parallel_id>__<id>`` key occurs among the stored ids are
    kept, and the analyzed texts, binarized categories and ids are saved to
    ``outpath`` as a new ``MultilingualDataset``.
    """

    tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
    assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
    langs = list(tr_ids.keys())

    print('fetching the datasets')

    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(
        langs=langs,
        data_path=jrc_data_home,
        years=train_years,
        cat_filter=cat_list,
        cat_threshold=1,
        parallel=None,
        most_frequent=most_common_cat)
    test_docs, _ = fetch_jrcacquis(
        langs=langs,
        data_path=jrc_data_home,
        years=test_years,
        cat_filter=label_names,
        parallel='force')

    def _doc_key(doc):
        # Identifier format used by the stored id lists.
        return doc.parallel_id + '__' + doc.id

    def filter_by_id(doclist, ids):
        # ``ids`` maps language -> ids; flatten it into one lookup set.
        wanted = frozenset(itertools.chain.from_iterable(ids.values()))
        return [doc for doc in doclist if _doc_key(doc) in wanted]

    def _columns_for(doclist, lang):
        # (texts, categories, keys) of the documents written in ``lang``.
        rows = [(doc.text, doc.categories, _doc_key(doc))
                for doc in doclist if doc.lang == lang]
        return zip(*rows)

    training_docs = filter_by_id(training_docs, tr_ids)
    test_docs = filter_by_id(test_docs, te_ids)

    print('jrc: {} train, {} test, {} categories'.format(
        len(training_docs), len(test_docs), len(label_names)))

    binarizer = MultiLabelBinarizer()
    binarizer.fit([label_names])

    dataset = MultilingualDataset()
    for lang in langs:
        lang_stopwords = stopwords.words(NLTK_LANGMAP[lang])
        analyzer = CountVectorizer(strip_accents='unicode',
                                   min_df=3,
                                   stop_words=lang_stopwords).build_analyzer()

        Xtr, Ytr, IDtr = _columns_for(training_docs, lang)
        Xte, Yte, IDte = _columns_for(test_docs, lang)
        # Analyzed tokens are re-joined into whitespace-separated text.
        Xtr = [' '.join(analyzer(text)) for text in Xtr]
        Xte = [' '.join(analyzer(text)) for text in Xte]
        Ytr = binarizer.transform(Ytr)
        Yte = binarizer.transform(Yte)
        dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    dataset.save(outpath)
Beispiel #21
0
def print_multilabel_results(resfile, outdir, args=None, n_strats=1):
    """Compute multilabel performance statistics and write them to text files.

    The pickled ``resfile`` must contain a sequence of results where
    ``result[0]`` is a list of true label lists and ``result[1]`` the matching
    list of predicted label lists.  Result number ``idx`` is attributed to
    strategy ``idx % n_strats``; one report ``<outdir>/<strategy>.txt`` is
    written per strategy.

    Parameters
    ----------
    resfile : path of the pickled results file.
    outdir : directory receiving the reports (created if missing).
    args : optional run arguments echoed into the report header.
    n_strats : number of interleaved strategies stored in ``resfile``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only read
    # result files that come from a trusted source.
    with open(resfile, 'rb') as f:
        results = pickle.load(f)

    # results = [
    #     0 => ([x, y, z],   <-- true
    #           [x, y, k])   <-- pred
    # ]
    y_trues = [[] for _ in range(n_strats)]
    y_preds = [[] for _ in range(n_strats)]
    for idx, result in enumerate(results):
        y_trues[idx % n_strats] += result[0]
        y_preds[idx % n_strats] += result[1]

    os.makedirs(str(outdir), exist_ok=True)

    for strat, (y_true, y_pred) in enumerate(zip(y_trues, y_preds)):
        # Fit once on the union of true and predicted labels.  The classes are
        # the sorted set of all labels, exactly what repeatedly re-fitting for
        # every unseen predicted label produced, without the quadratic cost.
        bnz = MultiLabelBinarizer()
        bnz.fit(y_true + y_pred)
        y_true = bnz.transform(y_true)
        y_pred = bnz.transform(y_pred)

        labels = bnz.classes_
        report = metrics.classification_report(y_true, y_pred, target_names=labels)
        f1w = metrics.f1_score(y_true, y_pred, average='weighted')
        f1i = metrics.f1_score(y_true, y_pred, average='micro')
        f1a = metrics.f1_score(y_true, y_pred, average='macro')
        pw = metrics.precision_score(y_true, y_pred, average='weighted')
        pi = metrics.precision_score(y_true, y_pred, average='micro')
        pa = metrics.precision_score(y_true, y_pred, average='macro')
        rw = metrics.recall_score(y_true, y_pred, average='weighted')
        ri = metrics.recall_score(y_true, y_pred, average='micro')
        ra = metrics.recall_score(y_true, y_pred, average='macro')

        file_header = (
            "# MULTILABEL EXPERIMENT REPORT\n" +
            time.strftime("# Generated %c\n#\n") +
            ('#\n# Args: {}\n#\n'.format(args) if args else '') +
            "# 3 FOLD CROSS VALIDATION WITH {} CHANGESETS\n".format(len(y_true)) +
            "# F1 SCORE : {:.3f} weighted, {:.3f} micro-avg'd, {:.3f} macro-avg'd\n".format(f1w, f1i, f1a) +
            "# PRECISION: {:.3f} weighted, {:.3f} micro-avg'd, {:.3f} macro-avg'd\n".format(pw, pi, pa) +
            "# RECALL   : {:.3f} weighted, {:.3f} micro-avg'd, {:.3f} macro-avg'd\n#\n".format(rw, ri, ra) +
            "# {:-^55}\n#".format("CLASSIFICATION REPORT") + report.replace('\n', "\n#")
        )
        # The report lives entirely in the header; the data array is empty.
        savetxt("{}/{}.txt".format(outdir, strat),
                np.array([]), fmt='%d', header=file_header, delimiter=',',
                comments='')
def data_loader(params, is_rebuild_dataset=False):
    """Return train/test arrays, the vocabulary and a fitted label binarizer.

    When the cached training array exists and ``is_rebuild_dataset`` is false,
    everything is read back from the paths in ``config``.  Otherwise the raw
    CSV is tokenized, padded, binarized, split 80/20 and the results are
    written to those same paths.

    Returns ``(x_train, x_test, y_train, y_test, vocab, mlb)``.
    """
    cache_present = os.path.exists(config.train_x_path)
    if cache_present and not is_rebuild_dataset:
        x_train = np.load(config.train_x_path)
        x_test = np.load(config.test_x_path)
        y_train = np.load(config.train_y_path)
        y_test = np.load(config.test_y_path)

        # The vocabulary file holds one "<word>\t<index>" pair per line.
        vocab = {}
        with open(config.vocab_save_path, 'r', encoding='utf-8') as vocab_file:
            for raw_line in vocab_file:
                word, index = raw_line.strip().split('\t')
                vocab[word] = int(index)

        # Multi-label encoding, fitted on the label column of the label file.
        label_df = pd.read_csv(config.data_label_path)
        mlb = MultiLabelBinarizer()
        mlb.fit([label_df['label']])

        return x_train, x_test, y_train, y_test, vocab, mlb

    # Rebuild path: column 0 is the label string, column 1 the text.
    df = pd.read_csv(config.data_path, header=None)
    df = df.rename(columns={0: 'label', 1: 'content'})
    df = parallelize(df, proc)

    text_preprocesser = tf.keras.preprocessing.text.Tokenizer(
        num_words=params['vocab_size'], oov_token="<UNK>")
    text_preprocesser.fit_on_texts(df['content'])

    vocab = text_preprocesser.word_index
    with open(config.vocab_save_path, 'w', encoding='utf-8') as f:
        for k, v in vocab.items():
            f.write(f'{k}\t{str(v)}\n')

    sequences = text_preprocesser.texts_to_sequences(df['content'])
    x = tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=params['padding_size'],
        padding='post',
        truncating='post')

    # Labels are whitespace-separated within each row.
    df['label'] = df['label'].apply(lambda x: x.split())
    mlb = MultiLabelBinarizer()
    mlb.fit(df['label'])
    y = mlb.transform(df['label'])

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)

    for target_path, array in ((config.train_x_path, x_train),
                               (config.test_x_path, x_test),
                               (config.train_y_path, y_train),
                               (config.test_y_path, y_test)):
        np.save(target_path, array)

    return x_train, x_test, y_train, y_test, vocab, mlb
def main():
    """Train the model, predict on the test directory and report macro F1.

    Predictions are always pickled.  The F1 evaluation needs reference labels,
    which ``predict`` only returns when ``is_submission`` is false; in
    submission mode the evaluation is skipped.

    :return: None
    """

    # Gets or creates a logger
    logging.config.fileConfig('logging.conf')
    logger = logging.getLogger(__name__)

    logger.info("********** NEW RUN **********")

    # Toggle to select the training directory (see the branches below).
    train_on_full_dataset = False
    is_submission = False

    if train_on_full_dataset:
        train_data_dir = "Data/train_data"
        test_data_dir = "Data/reuters_test_data"
    else:
        train_data_dir = "Data/reuters_train_data"
        test_data_dir = "Data/reuters_test_data"

    logger.info("Initiating training with data from '%s' directory",
                train_data_dir)
    knn_model = model.Model(train_data_dir)

    logger.info(
        "Predicting testing with data with countries from '%s' directory",
        test_data_dir)
    if is_submission:
        predictions = knn_model.predict(test_data_dir, is_submission=True)
        # No reference labels exist in submission mode.
        reference = None
    else:
        predictions, reference = knn_model.predict(test_data_dir)

    logger.info("Prediction complete")

    pickleHelper.save_to_pickle("predictions", predictions)

    if reference is None:
        # Previously this path crashed with NameError on `reference`.
        logger.info("No reference labels available; skipping F1 evaluation")
        return

    mlb = MultiLabelBinarizer()
    r = mlb.fit_transform(reference)
    p = mlb.transform(predictions)
    try:
        score = sklearn.metrics.f1_score(y_true=r, y_pred=p, average='macro')
        print(score)
        logger.info("The f1 score is: %s", score)
    except ValueError as ex:
        logger.error("result value is invalid: %s", ex)
def calculate_multilabel_metrics_text(path,
                                      model,
                                      metric='Hamming',
                                      threshold=0.1,
                                      verbose=False):
    """Evaluate ``model`` on a labelled text file with the chosen metric.

    Each line of ``path`` is split on single spaces; the label tokens are
    taken from the odd positions of the leading ``2 * label_count`` tokens,
    where ``label_count`` is the number of ``__label__`` markers in the line.
    The rest of the line (minus its final character) is passed to
    ``model.predict(..., k=-1)``.

    ``metric`` selects the result: ``'Hamming'`` prints the Hamming loss and
    score and returns the score, ``'MAP'`` returns ``MAP(...)`` and
    ``'Report'`` returns a classification report.  Any other value returns
    ``None``.  ``threshold`` is accepted but currently unused.
    """
    mlb = MultiLabelBinarizer()
    mlb.fit([model.labels])

    ground_truth = []
    predicted = []
    probabilities = []
    with open(path, 'r') as test_data:
        for line in test_data:
            parts = line.split(' ')
            label_count = line.count('__label__')
            labels = [parts[i] for i in range(1, label_count * 2 + 1, 2)]
            ground_truth.append(labels)

            text = ' '.join(parts[label_count * 2:])
            # text[:-1] drops the last character of the line.
            predicted_labels, probability = model.predict(
                text[:-1], k=-1)  # , threshold=threshold)

            # Re-order the scores so that column j matches mlb.classes_[j].
            ordered_probabilities = np.zeros(len(mlb.classes_))
            for i, label in enumerate(predicted_labels):
                column = np.where(mlb.classes_ == label)
                ordered_probabilities[column] = probability[i]
            probabilities.append(ordered_probabilities)

            predicted_labels = get_best_labels(predicted_labels, probability)
            predicted.append(list(predicted_labels))
            if verbose:
                print(labels, "###", predicted_labels)

    predicted = mlb.transform(predicted)
    ground_truth = mlb.transform(ground_truth)

    if metric == 'Hamming':
        print('Hamming loss: {0}'.format(
            hamming_loss(ground_truth, predicted)))
        print('Hamming_score: {0}'.format(
            hamming_score(ground_truth, predicted)))
        return hamming_score(ground_truth, predicted)
    if metric == "MAP":
        return MAP(ground_truth, probabilities)
    if metric == "Report":
        return classification_report(ground_truth,
                                     predicted,
                                     target_names=mlb.classes_)
    return None
Beispiel #25
0
    def fitClassifier(self, train_profiles, train_labels):
        """Fit a one-vs-rest logistic regression on binarized labels.

        Returns the fitted classifier together with the fitted
        ``MultiLabelBinarizer`` used to encode ``train_labels``.
        """
        mlb = MultiLabelBinarizer().fit(train_labels)
        label_matrix = mlb.transform(train_labels)

        base_estimator = LogisticRegression(C=1.0,
                                            solver='lbfgs',
                                            max_iter=10000)
        clf = OneVsRestClassifier(base_estimator)
        clf.fit(train_profiles, label_matrix)

        return clf, mlb
Beispiel #26
0
def one_hot_encoded_multiclass(df, feature, default_value_name):
    """Binarize a comma-separated column and encode a default value.

    ``df[feature]`` is split on ``','`` and replaced by the output of
    ``MultiLabelBinarizer.fit_transform``.  ``default_value_name`` is encoded
    with the same binarizer and its first row is returned alongside ``df``.
    """
    mlb = MultiLabelBinarizer()
    # NOTE(review): fit_transform returns a 2-D indicator matrix; assigning it
    # to a single column presumably only works when exactly one distinct
    # value exists -- confirm the intended storage format.
    df[feature] = mlb.fit_transform(df[feature].str.split(','))

    default_rows = np.array(default_value_name).reshape(-1, 1)
    encoded_default = mlb.transform(default_rows)

    return df, encoded_default[0]
Beispiel #27
0
 def add_pos_count_cols(df, tokenized_col):
     """Return ``df`` joined with one indicator column per part-of-speech tag.

     Every cell of ``df[tokenized_col]`` must be an iterable of objects with a
     ``pos_`` attribute.  Despite the function name, the new columns hold 0/1
     presence flags (``MultiLabelBinarizer`` output), not occurrence counts.

     The input frame is left untouched: the tag lists are kept in a local
     Series instead of a helper ``'temp'`` column, which used to leak into the
     caller's DataFrame.
     """
     pos_tags = df[tokenized_col].apply(lambda x: [ent.pos_ for ent in x])
     mlb = MultiLabelBinarizer()
     indicators = pd.DataFrame(mlb.fit_transform(pos_tags),
                               columns=mlb.classes_,
                               index=df.index)
     return df.join(indicators)
Beispiel #28
0
class MyMultiLabelBinarizer(TransformerMixin):
    """Wrapper giving ``MultiLabelBinarizer`` a ``fit(x, y)`` / ``transform(x, y)`` signature.

    The ``y`` argument of both methods is accepted and ignored.
    """

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded to the wrapped encoder.
        self.encoder = MultiLabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` and return this wrapper."""
        wrapped = self.encoder
        wrapped.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the indicator matrix the wrapped encoder produces for ``x``."""
        indicator = self.encoder.transform(x)
        return indicator
Beispiel #29
0
    # Constructor that runs the whole experiment: loads word vectors and the
    # train/test category pages, builds features, trains a one-vs-rest SVM via
    # self.ovrSVM and prints macro/micro F1 on the test split.
    # NOTE: written for Python 2 (print statements).
    def __init__(
            self,
            inter_filePath="inter/technology_companies_of_the_united_states/"):
        # [[cat,cat...]...]
        # Pre-trained binary word2vec model; self.dim is its vector size.
        self.m = Word2Vec.load_word2vec_format(
            "vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5",
            binary=True)
        self.dim = 400

        # Each load returns (correct categories, context categories).
        (correct_categories_train, context_categories_train
         ) = self.load_category_page(inter_filePath + "category_page.txt")
        (correct_categories_test, context_categories_test
         ) = self.load_category_page(inter_filePath + "category_page_test.txt")
        ## ----  By mean ---
        # Dense features from predict_vector_by_mean (presumably the mean of
        # the context categories' embeddings -- confirm in that method).
        Xvectors = np.array(
            self.predict_vector_by_mean(context_categories_train))
        Xvectors_test = np.array(
            self.predict_vector_by_mean(context_categories_test))

        ## ----  By mean --- *

        ## ----  By SVM ---
        # Bag-of-words / TF-IDF features. X, X_tfidf and X_test are only
        # referenced by the commented-out calls below.
        corpus_train = [" ".join(i) for i in context_categories_train]
        corpus_test = [" ".join(i) for i in context_categories_test]
        cv = CountVectorizer(min_df=1)
        X = cv.fit_transform(corpus_train)
        ##TFIDF
        transformer = TfidfTransformer()
        X_tfidf = transformer.fit_transform(X)
        #Labels
        # Fitted on train + test labels so both splits share one column layout.
        mlb = MultiLabelBinarizer()
        mlb.fit(correct_categories_train + correct_categories_test)
        Y = mlb.transform(
            correct_categories_train)  ###Transform to multilabel indicator
        #predict test labels
        X_test = cv.transform(corpus_test)
        Y_test = mlb.transform(correct_categories_test)
        #Y_predict_ovr = self.ovrSVM(X, Y, X_test)
        # Active configuration: train and predict on the dense vectors.
        Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test)
        #Y_predict_ovo = self.ovoSVM(X, Y, X_test)
        print "---One versus rest---"
        print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro')
        print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
Beispiel #30
0
def binarize_labels(pred_labels, true_labels):
    """Binarize predicted and true label collections into aligned matrices.

    Parameters
    ----------
    pred_labels : dict mapping a source id to an iterable of predicted labels.
    true_labels : dict mapping a source id to an iterable of true labels.

    Returns
    -------
    (pred_mat, true_mat) : indicator matrices whose rows both follow the key
        order of ``pred_labels``, so row ``i`` of each matrix describes the
        same source id.

    Raises
    ------
    KeyError : if a source id of ``pred_labels`` is missing in ``true_labels``.
    """
    srcids = list(pred_labels.keys())
    # The label vocabulary covers every label seen in either dict.
    tot_labels = [
        list(labels)
        for labels in list(pred_labels.values()) + list(true_labels.values())
    ]
    mlb = MultiLabelBinarizer().fit(tot_labels)
    # Index both dicts by the same id sequence; iterating each dict's own
    # values() could pair rows belonging to different source ids.
    pred_mat = mlb.transform([pred_labels[srcid] for srcid in srcids])
    true_mat = mlb.transform([true_labels[srcid] for srcid in srcids])
    return pred_mat, true_mat
Beispiel #31
0
def binarize_labels(true_labels, pred_labels, excluding_labels=None):
    """Binarize true and predicted labels, ignoring ``excluding_labels``.

    Parameters
    ----------
    true_labels : dict mapping a source id to an iterable of true labels.
    pred_labels : dict mapping a source id to an iterable of predicted labels.
    excluding_labels : labels left out of the encoding.  When omitted,
        ``['building-ebu3b']`` is used; previously any value passed here was
        silently replaced by that list.

    Returns
    -------
    (true_mat, pred_mat) : indicator matrices whose rows both follow the key
        order of ``pred_labels``.

    Raises
    ------
    KeyError : if a source id of ``pred_labels`` is missing in ``true_labels``.
    """
    if excluding_labels is None:
        excluding_labels = ['building-ebu3b']
    excluded = set(excluding_labels)

    def keep(labels):
        # Drop excluded labels up front so transform never sees unknown ones.
        return [label for label in labels if label not in excluded]

    srcids = list(pred_labels.keys())
    tot_labels = [
        keep(labels)
        for labels in list(pred_labels.values()) + list(true_labels.values())
    ]
    mlb = MultiLabelBinarizer().fit(tot_labels)
    # Use one id sequence for both matrices so the rows stay aligned.
    pred_mat = mlb.transform([keep(pred_labels[srcid]) for srcid in srcids])
    true_mat = mlb.transform([keep(true_labels[srcid]) for srcid in srcids])
    return true_mat, pred_mat
class EncodeMultilabel(object):
    """Callable turning ``(track_id, labels)`` into ``(track_id, indicator vector)``."""

    # Binarizer fitted once on the full label vocabulary.
    mlb: MultiLabelBinarizer

    def __init__(self, label_names: list[str]):
        encoder = MultiLabelBinarizer()
        # A single "sample" holding every label fixes the column layout.
        encoder.fit([label_names])
        self.mlb = encoder

    def __call__(self, sample: tuple[int, list[str]]) -> tuple[int, np.ndarray]:
        track_id, label = sample
        # transform expects a batch, so wrap the sample and unwrap row 0.
        encoded_batch = self.mlb.transform([label])
        return track_id, encoded_batch[0]
Beispiel #33
0
def main(argv):
    """Train and evaluate a multi-label text classifier from CLI arguments.

    Loads the train and dev sets, binarizes the labels (vocabulary taken from
    the training labels), fits the classifier, prints every evaluation metric
    as ``name<TAB>value`` and optionally saves the trained model.  Returns 0.
    """
    options = argparser().parse_args(argv[1:])

    train_texts, train_labels = load_data(
        options.train, options.input_format, options.multiclass)
    dev_texts, dev_labels = load_data(
        options.dev, options.input_format, options.multiclass)
    num_train_examples = len(train_texts)

    label_encoder = MultiLabelBinarizer()
    label_encoder.fit(train_labels)
    train_Y = label_encoder.transform(train_labels)
    dev_Y = label_encoder.transform(dev_labels)
    num_labels = len(label_encoder.classes_)

    classifier, tokenizer, optimizer, config = prepare_classifier(
        num_train_examples, num_labels, options)
    config.multiclass = options.multiclass

    tokenize = make_tokenization_function(tokenizer, options.seq_len)
    train_X = tokenize(train_texts)
    dev_X = tokenize(dev_texts)

    # The training history returned by fit() is not used.
    classifier.fit(
        train_X,
        train_Y,
        epochs=options.epochs,
        batch_size=options.batch_size,
        validation_data=(dev_X, dev_Y),
    )

    metrics_values = classifier.evaluate(
        dev_X, dev_Y, batch_size=options.batch_size)
    for name, value in zip(classifier.metrics_names, metrics_values):
        print(f'{name}\t{value}')

    if options.save_model is None:
        return 0

    save_trained_model(options.save_model, classifier, tokenizer,
                       label_encoder.classes_, config)
    return 0
Beispiel #34
0
def fungo_test_wrapper(name='cellcycle_FUN'):
    """Load a FunGO dataset and return ``(X_train, Y_train, X_test, Y_test)``.

    The label matrices are built from each document's ``'class_idx'`` entry in
    ``tree.id2doc_ancestors``, with columns fixed to ``tree.class_idx``.
    """
    X_train, X_test, train_ids, test_ids, id2doc, nodes = read_fungo(name)
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    args = conf()
    # The same id2doc mapping is passed as both `id2doc` and `id2doc_a`.
    tree = Tree(args,
                train_ids,
                test_ids,
                id2doc=id2doc,
                id2doc_a=id2doc,
                nodes=nodes,
                rootname='Top')

    def class_indices(doc_ids):
        return [tree.id2doc_ancestors[docid]['class_idx'] for docid in doc_ids]

    mlb = MultiLabelBinarizer(classes=tree.class_idx)
    Y_train = mlb.fit_transform(class_indices(train_ids))
    Y_test = mlb.transform(class_indices(test_ids))
    return X_train, Y_train, X_test, Y_test
Beispiel #35
0
def classification(X, y, testSize=0.2):
    """Evaluate features with one-vs-rest logistic regression and top-k decoding.

    The data is split randomly (``testSize`` is the test fraction).  For each
    test sample with ``k`` true labels, the ``k`` highest-probability classes
    are predicted.  A sample with no true labels gets no predicted labels.

    Returns a dict with the ``'micro'`` and ``'macro'`` F1 scores.
    """
    clf = OneVsRestClassifier(LogisticRegression(max_iter=10000))
    binarizer = MultiLabelBinarizer()
    trainX, testX, trainY, testY = train_test_split(X,
                                                    y,
                                                    test_size=testSize,
                                                    shuffle=True)
    # Fit on all labels so train and test share one column layout.
    binarizer.fit(y)
    clf.fit(trainX, binarizer.transform(trainY))

    probs = np.asarray(clf.predict_proba(np.asarray(testX)))
    predictions = np.zeros_like(probs)
    for i, true_labels in enumerate(testY):
        k = len(true_labels)
        if k == 0:
            # argsort()[-0:] would select every class instead of none.
            continue
        top_k = clf.classes_[probs[i, :].argsort()[-k:]].tolist()
        predictions[i, top_k] = 1

    testY = binarizer.transform(testY)
    return {
        'micro': f1_score(testY, predictions, average='micro'),
        'macro': f1_score(testY, predictions, average='macro')
    }
Beispiel #36
0
def computeF1(cleanedFigure, predictions):
    """Return the support-weighted average of per-item micro F1 scores.

    ``cleanedFigure`` and ``predictions`` are walked in lockstep.  Each pair
    is binarized with its own ``MultiLabelBinarizer`` fitted on the
    concatenation of both sides, and the item's weight is ``len(true)``.
    """

    def item_f1(true, pred):
        # A fresh binarizer per item: only the labels occurring here matter.
        binarizer = MultiLabelBinarizer().fit(true + pred)
        return f1_score(binarizer.transform(true),
                        binarizer.transform(pred),
                        average='micro')

    weights = []
    scores = []
    for true, pred in zip(cleanedFigure, predictions):
        weights.append(len(true))
        scores.append(item_f1(true, pred))

    return np.average(scores, weights=weights)
Beispiel #37
0
def main():
    """Train and evaluate a one-vs-rest TF-IDF classifier on the Reuters corpus.

    Documents whose id starts with ``"train"`` form the training set and all
    others the test set.  Per-class average precision, recall and F1 are
    passed to ``log_results``.  ``classifier.score`` is printed as
    "Hard precision".
    """
    collection_stats()

    print("Staring classifier ..")

    X_train, y_train = [], []
    X_test, y_test = [], []

    print("Reading training and testing data ..")

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            X_train.append(reuters.raw(doc_id))
            y_train.append(reuters.categories(doc_id))
        else:
            X_test.append(reuters.raw(doc_id))
            y_test.append(reuters.categories(doc_id))

    X_train = numpy.array(X_train)
    X_test = numpy.array(X_test)
    # The label lists are ragged (different lengths per document), so they
    # stay plain Python lists: numpy.array() rejects ragged input on current
    # NumPy, and MultiLabelBinarizer accepts lists directly.

    binarizer = MultiLabelBinarizer(classes=reuters.categories())
    # Binarize once; the class set is fixed, so re-fitting gives the same result.
    Y_train = binarizer.fit_transform(y_train)
    Y_test = binarizer.transform(y_test)

    classifier = Pipeline([
        ('vectorizer',
         TfidfVectorizer(tokenizer=tokenize,
                         # An absolute count of 1 keeps every term, the same
                         # effect as the former 0, which newer scikit-learn
                         # releases reject as an integer.
                         min_df=1,
                         max_df=0.90,
                         max_features=3000,
                         use_idf=True,
                         sublinear_tf=True)),
        # ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])
    print("Training classifier ..")
    classifier.fit(X_train, Y_train)
    print("Testing classifier ..")
    res = classifier.predict(X_test)

    hard_precision = classifier.score(X_test, Y_test)

    # scikit-learn metrics take (y_true, y_pred). The arguments used to be
    # swapped, which made the reported "recall" the precision.
    precision = average_precision_score(Y_test, res, average=None)
    recall = recall_score(Y_test, res, average=None)
    f1score = f1_score(Y_test, res, average=None)
    print("Hard precision: " + str(hard_precision))

    log_results(reuters.categories(), precision, recall, f1score)
Beispiel #38
0
def test_normalize_option_multilabel_classification():
    """Check the ``normalize`` option of metrics in the multilabel case.

    For every metric in ``METRICS_WITH_NORMALIZE_OPTION`` the normalized value
    must be positive and equal to the unnormalized value divided by the number
    of samples, for both the list-of-lists and the indicator-matrix formats.
    """
    n_classes = 4
    n_samples = 100
    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    _, y_true = make_ml(n_features=1, n_classes=n_classes,
                        random_state=0, n_samples=n_samples)
    _, y_pred = make_ml(n_features=1, n_classes=n_classes,
                        random_state=1, n_samples=n_samples)

    # Be sure to have at least one empty label
    y_true += ([], )
    y_pred += ([], )
    n_samples += 1

    lb = MultiLabelBinarizer().fit([range(n_classes)])
    indicator_true = lb.transform(y_true)
    indicator_pred = lb.transform(y_pred)

    positive_msg = "We failed to test correctly the normalize option"

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        failure_msg = "Failed with %s" % name

        # List of list of labels
        measure = assert_warns(DeprecationWarning, metrics, y_true, y_pred,
                               normalize=True)
        assert_greater(measure, 0, msg=positive_msg)
        raw_total = ignore_warnings(metrics)(y_true, y_pred, normalize=False)
        assert_almost_equal(raw_total / n_samples, measure,
                            err_msg=failure_msg)

        # Indicator matrix format
        measure = metrics(indicator_true, indicator_pred, normalize=True)
        assert_greater(measure, 0, msg=positive_msg)
        raw_total = metrics(indicator_true, indicator_pred, normalize=False)
        assert_almost_equal(raw_total / n_samples, measure,
                            err_msg=failure_msg)
Beispiel #39
0
def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=None, preprocess=True):
    """
    Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
    i.e., each language-specific matrix lies in a dedicated feature space.
    :param dataset_name: the name of the dataset (str)
    :param langs: list of languages (str)
    :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param label_names: list of names of labels (str)
    :param wiki_docs: map {lang:doc-list} (optional); if specified (non-empty), the wiki docs of each language are
    projected into the feature space built for that language
    :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
    :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
    by language the processed wikipedia documents in their respective language-specific feature spaces
    """

    # A single "sample" containing every label name fixes the label columns.
    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    lW = {}

    multilingual_dataset = MultilingualDataset()
    multilingual_dataset.dataset_name = dataset_name
    multilingual_dataset.set_labels(mlb.classes_)
    for lang in langs:
        print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
              (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang))

        tr_data, tr_labels, IDtr = zip(*training_docs[lang])
        te_data, te_labels, IDte = zip(*test_docs[lang])

        if preprocess:
            tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True,
                                    tokenizer=NLTKStemTokenizer(lang, verbose=True),
                                    stop_words=stopwords.words(NLTK_LANGMAP[lang]))
        else:
            tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)

        # The vectorizer is fitted on the training documents of this language
        # only; test and wiki documents are projected into that space.
        Xtr = tfidf.fit_transform(tr_data)
        Xte = tfidf.transform(te_data)
        if wiki_docs:
            lW[lang] = tfidf.transform(wiki_docs[lang])

        Ytr = mlb.transform(tr_labels)
        Yte = mlb.transform(te_labels)

        multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    multilingual_dataset.show_dimensions()
    multilingual_dataset.show_category_prevalences()

    if wiki_docs:
        return multilingual_dataset, lW
    return multilingual_dataset
Beispiel #40
0
def main():
    """Train a one-vs-rest linear SVM tag classifier and print its predictions.

    Fits count + TF-IDF features and a ``MultiLabelBinarizer`` on the training
    set, prints a classification report for the test set, then derives for
    every class the decision value found at the top-30% position of the
    descending ranking and hands everything to ``print_predictions``.
    """
    sets = select_sets_by_tag(20, 4, tag_names)
    train_ids = sets["train"]
    test_ids = sets["test"]

    train_tags = fetch_tags(train_ids)
    train_texts = id_to_filename(train_ids)

    # Vectorize: the texts are file names read by the vectorizer itself.
    count_vect = CountVectorizer(stop_words='english',
                                 encoding="utf-16",
                                 input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)

    # tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Binarize the tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)

    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf, processed_train_tags)
    print("classes:{}".format(clf.classes_))

    # Project the test set into the fitted feature space
    test_texts = id_to_filename(test_ids)
    X_test_counts = count_vect.transform(test_texts)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(test_ids)
    predicted_probs = clf.decision_function(X_test_tfidf)

    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),
                                           predicted_tags,
                                           target_names=class_list)
    print(report)

    # Retrieve the decision value at the top-30% rank for each class
    top_percentage = 30
    n_test = len(test_ids)
    threshold_index = int(n_test * (top_percentage / 100.0))
    threshold_vals_dic = {}
    threshold_vals = []
    for i, class_name in enumerate(class_list):
        ranked = sorted((predicted_probs[j, i] for j in range(n_test)),
                        reverse=True)
        cutoff = ranked[threshold_index]
        threshold_vals_dic[class_name] = cutoff
        threshold_vals.append(cutoff)
    print(threshold_vals_dic)

    print_predictions(test_ids,
                      predicted_tags_readable,
                      class_list,
                      class_probablities=predicted_probs,
                      threshold_vals=threshold_vals)
def get_data(train_file, test_file):
    """Load train/test texts and binarized label matrices.

    Each loaded text line is reduced to its second tab-separated field.  Each
    label string is split on ``'_'`` and the literal ``'None'`` is dropped.
    The binarizer is fitted on the training labels only.

    Returns ``(X_train, X_test, Y_train, Y_test, classes)``.
    """

    def second_field(lines):
        return [ln.split('\t')[1] for ln in lines]

    def label_sets(raw_labels):
        return [set(s.split('_')) - {'None'} for s in raw_labels]

    X_train, Y_train = load_data(train_file)
    X_train = second_field(X_train)
    X_test, Y_test = load_data(test_file)
    X_test = second_field(X_test)

    mlb = MultiLabelBinarizer()
    train_sets = label_sets(Y_train)
    test_sets = label_sets(Y_test)
    Y_train = mlb.fit_transform(train_sets)
    Y_test = mlb.transform(test_sets)

    return X_train, X_test, Y_train, Y_test, mlb.classes_
def test_multilabelbinarizer_vs_sklearn():
    """Compare the two binarizers on the first trajectory.

    ``MultiLabelBinarizerR`` is fitted on the concatenated trajectories and
    applied to ``trajs[0]``; ``MultiLabelBinarizer`` is fitted on ``trajs``
    and the first entry of its transform is taken.  Both must agree.
    """
    # Binarizer fitted on the concatenated trajectories
    reference = MultiLabelBinarizerR()
    reference.fit(np.concatenate(trajs))
    y_ref1 = reference.transform(trajs[0])

    # Binarizer fitted on the list of trajectories
    candidate = MultiLabelBinarizer()
    candidate.fit(trajs)
    y1 = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Beispiel #43
0
def test_normalize_option_multilabel_classification():
    """Verify that ``normalize=True`` equals ``normalize=False`` / n_samples.

    Runs every metric of ``METRICS_WITH_NORMALIZE_OPTION`` on multilabel data
    given as a sequence of label sequences (expected to raise a
    ``DeprecationWarning``) and as binary indicator matrices.
    """
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100
    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    shared_kwargs = dict(n_features=1, n_classes=n_classes,
                         n_samples=n_samples)
    _, y_true = make_ml(random_state=0, **shared_kwargs)
    _, y_pred = make_ml(random_state=1, **shared_kwargs)

    # Be sure to have at least one empty label
    y_true += ([], )
    y_pred += ([], )
    n_samples += 1

    lb = MultiLabelBinarizer().fit([range(n_classes)])
    y_true_binary_indicator = lb.transform(y_true)
    y_pred_binary_indicator = lb.transform(y_pred)

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]

        # List of list of labels
        normalized = assert_warns(DeprecationWarning, metrics, y_true, y_pred,
                                  normalize=True)
        assert_greater(normalized, 0,
                       msg="We failed to test correctly the normalize option")
        silent_metric = ignore_warnings(metrics)
        total = silent_metric(y_true, y_pred, normalize=False)
        assert_almost_equal(total / n_samples, normalized,
                            err_msg="Failed with %s" % name)

        # Indicator matrix format
        normalized = metrics(y_true_binary_indicator,
                             y_pred_binary_indicator, normalize=True)
        assert_greater(normalized, 0,
                       msg="We failed to test correctly the normalize option")
        total = metrics(y_true_binary_indicator,
                        y_pred_binary_indicator, normalize=False)
        assert_almost_equal(total / n_samples, normalized,
                            err_msg="Failed with %s" % name)
class ACMClassificator(BaseACMClassificator):
    """Tag classifier: bag-of-words statements fed to a one-vs-rest SVC."""

    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05,
                                          max_df=0.45,
                                          tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(SVC(), n_jobs=-1)

    def _prepare_problems(self, problems):
        """Vectorize the ``statement`` attribute of every problem."""
        statements = [p.statement for p in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit the vectorizer, the tag binarizer and the classifier."""
        # Fetch the NLTK 'punkt' resource before the vectorizer runs.
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        features = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        tag_matrix = self.mlb.transform(tags)
        # The classifier is given a dense array.
        self.classificator.fit(features.toarray(), tag_matrix)

    def predict(self, problems):
        """Return the predicted tags for each problem."""
        features = self._prepare_problems(problems)
        indicator = self.classificator.predict(features.toarray())
        return self.mlb.inverse_transform(indicator)
Beispiel #45
0
class GlobalLabelTransformer(BaseTaskTransformer):
    '''Encode the annotation values of a JAMS object as one multi-hot vector.'''

    def __init__(self, namespace, name, labels=None):
        '''Initialize a global label transformer

        Parameters
        ----------
        namespace : str
            Annotation namespace, forwarded to the base transformer

        name : str
            Suffix used for the output keys ``output_<name>`` and
            ``mask_<name>``

        labels : iterable
            The label vocabulary; the encoder is fitted on ``[labels]``
        '''

        super(GlobalLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):
        '''Return the multi-hot target and the mask for ``jam``.

        Parameters
        ----------
        jam : jams.JAMS
            The JAMS object container

        Returns
        -------
        dict
            ``output_<name>``: vector with one entry per known label, set to
            1 for every annotation value present in the vocabulary;
            ``mask_<name>``: True when a matching annotation was found.
        '''

        ann = self.find_annotation(jam)

        intervals = np.asarray([[0, 1]])
        values = [None]
        mask = False

        if ann:
            values = list(ann.data.value)
            intervals = np.tile(intervals, [len(values), 1])
            mask = True

        # Suppress all intervals not in the encoder
        tags = [v for v in values if v in self._classes]
        if len(tags):
            target = self.encoder.transform([tags]).max(axis=0)
        else:
            # np.int was only an alias of the builtin int and was removed in
            # NumPy 1.24; the builtin yields the same dtype.
            target = np.zeros(len(self._classes), dtype=int)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
def run_classifierAccuracy(terms, labels, testSentences, testLabels):
	"""Label test sentences by cosine similarity to class terms and print scores.

	`terms` is vectorized with `tf_idf_fit_transform` (one row per class is
	assumed, in the order of the fixed label list below -- confirm against
	the caller). Each test sentence is compared to every class row, the
	similarities are passed through `binary_rel`, and a label is predicted
	wherever the result equals 1.

	NOTE: the `labels` argument is ignored; it is immediately replaced by the
	fixed label list below. The parameter is kept for caller compatibility.

	Prints matrix shapes, the label order, and micro, macro and per-class
	("All-") precision, recall and F1. Returns None.
	"""
	labels = [
		"Drought", "Earthquake", "Flood", "Epidemic", "Hurricane",
		"Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations",
		"donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
		"injured_or_dead_people", "missing_trapped_or_found_people",
	]
	class_terms_matrix, tfidf = tf_idf_fit_transform(terms)

	sentence_matrix = tfidf.transform(testSentences)

	print("Shape of sentence matrix : ", sentence_matrix.shape)

	from sklearn.metrics.pairwise import cosine_similarity
	similarity_matrix = cosine_similarity(sentence_matrix, class_terms_matrix)
	similarity_matrix = binary_rel(similarity_matrix)

	# One list of predicted label names per test sentence.
	n_columns = similarity_matrix.shape[1]
	predictions = [
		[labels[col] for col in range(n_columns) if similarity_matrix[row][col] == 1]
		for row in range(len(testSentences))
	]

	from sklearn.preprocessing import MultiLabelBinarizer
	# Passing `classes` pins the column order to `labels`.
	mlb = MultiLabelBinarizer(classes=labels)
	test_label_matrix = mlb.fit_transform(testLabels)
	predictions = mlb.transform(predictions)
	print("Shape of label matrix : ", test_label_matrix.shape)
	print("Labels : ", mlb.classes_)

	from sklearn.metrics import f1_score, precision_score, recall_score
	print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
	print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
	print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
	print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
	print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
	print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
	# average=None yields one score per class, so these are captioned "All-"
	# rather than repeating the "Macro-" captions above.
	print("All-Precision", precision_score(test_label_matrix, predictions, average=None))
	print("All-Recall", recall_score(test_label_matrix, predictions, average=None))
	print("All-F1", f1_score(test_label_matrix, predictions, average=None))
#
# where each image is a flat 1x784 array and each label is an integer between 0 and 9.
#

# Load MNIST; every integer label is wrapped in a one-element list because
# the binarizer below takes a collection of labels per sample.
mnist = input_data.read_data_sets("MNIST_data/")
x_train, x_test = mnist.train.images, mnist.test.images
y_train = [[digit] for digit in mnist.train.labels]
y_test = [[digit] for digit in mnist.test.labels]

# One-hot encode labels (classes are learned from the training labels only)
one_hot = MultiLabelBinarizer()
y_train = one_hot.fit_transform(y_train)
y_test = one_hot.transform(y_test)

# Example 1: Fully connected neural network model
# A 'sequential' model chains its layers in the order listed:
# two ReLU hidden layers (32 and 16 units) and a 10-unit softmax output.
model = keras.Sequential([
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(10, activation='softmax'),
])

# Train the model:
optimizer = tf.train.AdamOptimizer(0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
Beispiel #48
0
def test_multilabel_representation_invariance():
    """Check that multilabel metrics are invariant to the label representation.

    For every metric named in MULTILABELS_METRICS, the score computed on the
    dense binary indicator matrices is taken as the reference and compared
    with the score obtained from sparse indicator matrices, from shuffled
    label sets, from (deprecated) sequences of label sequences, and from
    label sequences containing repeated labels. Mixing the two input
    representations must raise ValueError.
    """
    # Generate some data
    n_classes = 4
    n_samples = 50
    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    _, y1 = make_ml(n_features=1, n_classes=n_classes, random_state=0,
                    n_samples=n_samples)
    _, y2 = make_ml(n_features=1, n_classes=n_classes, random_state=1,
                    n_samples=n_samples)

    # Be sure to have at least one empty label
    y1 += ([], )
    y2 += ([], )

    # NOTE: The "sorted" trick is necessary to shuffle labels, because it
    # allows to return the shuffled tuple.
    # Every sort key is a fresh draw from `rng`, so the results below depend
    # on the order in which the following statements consume random numbers.
    rng = check_random_state(42)
    shuffled = lambda x: sorted(x, key=lambda *args: rng.rand())
    y1_shuffle = [shuffled(x) for x in y1]
    y2_shuffle = [shuffled(x) for x in y2]

    # Let's have redundant labels
    # (each label sequence is repeated 1 to 3 times)
    y2_redundant = [x * rng.randint(1, 4) for x in y2]

    # Binary indicator matrix format
    lb = MultiLabelBinarizer().fit([range(n_classes)])
    y1_binary_indicator = lb.transform(y1)
    y2_binary_indicator = lb.transform(y2)

    y1_sparse_indicator = sp.coo_matrix(y1_binary_indicator)
    y2_sparse_indicator = sp.coo_matrix(y2_binary_indicator)

    y1_shuffle_binary_indicator = lb.transform(y1_shuffle)
    y2_shuffle_binary_indicator = lb.transform(y2_shuffle)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = 'tmp'
            metric.__name__ = name

        # Reference score: dense binary indicator input.
        measure = metric(y1_binary_indicator, y2_binary_indicator)

        # Check representation invariance
        assert_almost_equal(metric(y1_sparse_indicator,
                                   y2_sparse_indicator),
                            measure,
                            err_msg="%s failed representation invariance  "
                                    "between dense and sparse indicator "
                                    "formats." % name)

        # Check shuffling invariance with dense binary indicator matrix
        assert_almost_equal(metric(y1_shuffle_binary_indicator,
                                   y2_shuffle_binary_indicator), measure,
                            err_msg="%s failed shuffling invariance "
                                    " with dense binary indicator format."
                                    % name)

        # Check deprecation warnings related to sequence of sequences
        # (the wrapper asserts the DeprecationWarning and forwards the call)
        deprecated_metric = partial(assert_warns, DeprecationWarning, metric)

        # Check representation invariance
        assert_almost_equal(deprecated_metric(y1, y2),
                            measure,
                            err_msg="%s failed representation invariance  "
                                    "between list of list of labels "
                                    "format and dense binary indicator "
                                    "format." % name)

        # Check invariance with redundant labels with list of labels
        assert_almost_equal(deprecated_metric(y1, y2_redundant), measure,
                            err_msg="%s failed rendundant label invariance"
                                    % name)

        # Check shuffling invariance with list of labels
        assert_almost_equal(deprecated_metric(y1_shuffle, y2_shuffle), measure,
                            err_msg="%s failed shuffling invariance "
                                    "with list of list of labels format."
                                    % name)

        # Check raises error with mix input representation
        assert_raises(ValueError, deprecated_metric, y1, y2_binary_indicator)
        assert_raises(ValueError, deprecated_metric, y1_binary_indicator, y2)
Beispiel #49
0
# Report how long the preceding step took and the shape of X_train
# (`duration` and `X_train` are set earlier in the script, outside this view).
print("done in %fs" % (duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print("")

# Transform the test questions with the already-fitted vectorizer and time it.
print("Extracting features from the test data using the vectorizer")
t0 = time()
X_test = vectorizer.transform(questions_test)
duration = time() - t0
print("done in %fs" % (duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print("")
# Keep the feature names as an array when the vectorizer reports any.
feature_names = vectorizer.get_feature_names()
if feature_names:
    feature_names = np.asarray(feature_names)

# Binarize the tag lists with the label binarizer prepared earlier in the
# script; `tags` lists the classes in the binarizer's column order.
y_train = mlb.transform(tags_train)
y_test = mlb.transform(tags_test)
tags = list(mlb.classes_)
print("n_unique_tags = %d" % len(tags))

print("")

# chi2 can be used to reduce the number of features to the top k most relevant
# (the selection step below is currently disabled)

# if opts.select_chi2:
#     print("Extracting %d best features by a chi-squared test" %
#           opts.select_chi2)
#     t0 = time()
#     ch2 = SelectKBest(chi2, k=opts.select_chi2)
#     X_train = ch2.fit_transform(X_train, y_train)
#     X_test = ch2.transform(X_test)
Beispiel #50
0
# Factorize building_id, display_address, manager_id, street_address
for col in ('building_id', 'display_address', 'manager_id', 'street_address'):
    X_train, X_test = factorize(X_train, X_test, col)


# Create binarized features
def fmt(feat):
    # Format features: drop "\u00a0", trim, lowercase, spaces -> underscores.
    return [s.replace("\u00a0", "").strip().lower().replace(" ", "_") for s in feat]


X_train["features"] = X_train["features"].apply(fmt)
X_test["features"] = X_test["features"].apply(fmt)
features = [f for f_list in list(X_train["features"]) + list(X_test["features"]) for f in f_list]
ps = pd.Series(features)
grouped = ps.groupby(ps).agg(len)
features = grouped[grouped >= 10].index.sort_values().values    # limit to features with >=10 observations
mlb = MultiLabelBinarizer().fit([features])
columns = ['feature_' + s for s in mlb.classes_]
# Build the lookup set once: testing membership against the classes_ array
# rescans the whole array for every feature of every row.
known_features = set(mlb.classes_)


def flt(l):
    # Filter out features not present in the MultiLabelBinarizer.
    return [i for i in l if i in known_features]


X_train = X_train.join(pd.DataFrame(data=mlb.transform(X_train["features"].apply(flt)), columns=columns, index=X_train.index))
X_test = X_test.join(pd.DataFrame(data=mlb.transform(X_test["features"].apply(flt)), columns=columns, index=X_test.index))







# Save

##X_train = X_train.sort_index(axis=1).sort_values(by="listing_id")
##X_test = X_test.sort_index(axis=1).sort_values(by="listing_id")
##columns_to_drop = ["photos", "pred_0","pred_1", "pred_2", "description", "features", "created"]
##X_train.drop([c for c in X_train.columns if c in columns_to_drop], axis=1).\
## to_csv("../data_prepared/train_ManStatsListFCFQ_leak.csv", index=False, encoding='utf-8')
# Prefer MultiLabelBinarizer; fall back to LabelBinarizer when it cannot be
# imported from the installed scikit-learn.
# NOTE: this snippet is Python 2 (old except syntax and print statements).
try:
    from sklearn.preprocessing import MultiLabelBinarizer

    lb = MultiLabelBinarizer()
except ImportError, e:
    from sklearn.preprocessing import LabelBinarizer

    lb = LabelBinarizer()


# Currently keeps every sample; the trailing "/ 10" is a disabled subsampling.
TRIM_SAMPLES = len(tags)  # / 10
tags = tags[:TRIM_SAMPLES]
learn_data = learn_data[:TRIM_SAMPLES]

# Learn the tag vocabulary and binarize the same tags.
lb.fit(tags)
labels = lb.transform(tags)

print "using\t", TRIM_SAMPLES, "samples"
print "\t", len(keywords), "keywords"
print "\t", len(lb.classes_), "tags"
# Row sums of learn_data; presumably per-document word counts, judging by
# the captions printed below -- confirm against how learn_data is built.
metadata = learn_data.sum(axis=1)

print "\t", metadata.mean(), "avg words in document"
print "\t", metadata.max(), "biggest document"
print "\t", metadata.min(), "smallest document"


# plt.figure(figsize=(8, 6))
# plot_subfigure(learn_data, labels, 1, "With unlabeled samples + CCA", "cca")
# plot_subfigure(learn_data, labels, 2, "With unlabeled samples + PCA", "pca")
# plt.subplots_adjust(.04, .02, .97, .94, .09, .2)
Beispiel #52
0
# Baseline: predict the most frequent training category for every row.
classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
           'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING',
           'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
           'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY',
           'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY',
           'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']

category = train_frame['Category']
# Class name -> position in `classes`.
mapping = dict(zip(classes, range(len(classes))))

# (category, count) pairs, most frequent first; reused for the report below.
category_counts = Counter(category).most_common()
most_freq_class = category_counts[0][0]

baseline_code = mapping[most_freq_class]
predicted = category.apply(lambda _cat: baseline_code)
expected = category.apply(lambda cat: mapping[cat])

# Binarize the integer codes; the encoder is fit on the expected codes.
mlb = MultiLabelBinarizer()
expected_b = mlb.fit_transform(to_singleton(expected))
predicted_b = mlb.transform(to_singleton(predicted))

for clazz, count in category_counts:
    print("{}\t{}".format(clazz, count))

# todo: use validation.py
training_accuracy = accuracy_score(expected_b, predicted_b)
print("Accuracy on training: {}".format(training_accuracy))
training_log_loss = log_loss(expected_b, predicted_b)
print("Log los on training: {}".format(training_log_loss))

# Every submission row is a copy of the first (constant) prediction row.
baseline_row = predicted_b[0]
test_prediction = np.full((submission_size, len(baseline_row)), baseline_row)
create_submission(test_prediction, 'baseline_sub.csv')

            else:
                continue
                idx_test_cur = [ind for ind in range(0,len(y_predict_cate)) if cate_cur in y_predict_cate[ind]]
                idx_train_cur = [ind for ind in range(0,len(y_train_cate)) if cate_cur in y_train_cate[ind]]
                x_train_cur = x_train[idx_train_cur]
                x_test_cur = x_test[idx_test_cur]
                y_train_code_cur = []
                y_test_code_cur = []
                for category_predict_tuple in y_test_code[idx_test_cur]:
                    codes = []
                    if len(category_predict_tuple) == 0:
                        codes.append(defaultcode[cate_cur])
                    else:
                        codes.extend([v for v in category_predict_tuple if v.startswith(cate_cur)])
                    y_test_code_cur.append(codes)
                y_test_code_cur_map = ml.transform(y_test_code_cur)
                for y_train_code_tuple in y_train_code[idx_train_cur]:
                    codes = []
                    if len(y_train_code_tuple) == 0:
                        codes.append(defaultcode[cate_cur])
                    else:
                        codes.extend([v for v in y_train_code_tuple if v.startswith(cate_cur)])
                    y_train_code_cur.append(codes)
                y_train_code_cur_map = ml.transform(y_train_code_cur)

                model_code = DecisionTreeClassifier()
                model_code.fit(x_train_cur,y_train_code_cur_map)
                y_predict_code_map = model_code.predict(x_test_cur)
                y_predict_code_map_prob = model_code.predict_proba(x_test_cur)
                y_text_new,y_predict_new = transfer_multilabel(y_predict_code_map,y_test_code_cur_map,ml,y_predict_code_map_prob,cate_cur)
                report_y_predict.extend(y_predict_new)
Beispiel #54
0
    # We want to convert the labels into vectors. For example, if we have:
    # keywords = [
    #             ['solar', 'physics', 'astronomy'],
    #             ['physics', 'lasers'],
    #             ['astronomy']
    #           ]
    # this would become:
    # keywords_binarised = [
    #             [1, 1, 1, 0],
    #             [0, 1, 0, 1],
    #             [0, 0, 1, 0]
    #           ]
    mlb = MultiLabelBinarizer()
    mlb.fit(keywords)
    keywords_vector = mlb.transform(keywords)

    # We generate a transform from words -> vector space. This is very similar
    # to the above conversion of the keywords. In this scenario, the entire
    # corpus from our training set is converted into an id -> word sparse-
    # matrix.
    bow_transform = CountVectorizer(analyzer=text_to_vector).fit(' '.join(text))

    # We transform our corpus into the unique vector space
    bow_vector = bow_transform.transform(text)

    # We convert the vector into a term frequency - inverse document frequency
    # Term frequency: f_t (number of times term t occurs in a document)
    # Inverse document frequency: log(N/n_t) (number of documents divided by
    #                                         the number of documents that
    #                                         contain term t)
Beispiel #55
0
    mlb.fit(train_ids['business_id'].tolist())
#    X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32)
#    X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32)
    return train_ids,mlb
def load_train(train_list):
    """Load the training photos named in ``train_list`` as one float32 array.

    Each id is read from ``train_photos/train244/<id>.jpg`` with ``imread``;
    the stacked result is cast to float32 and divided by 255.0.
    """
    images = []
    for photo_id in train_list:
        path = 'train_photos/train244/' + str(photo_id) + ".jpg"
        images.append(imread(path))
    stacked = np.array(images).astype(np.float32)
    return stacked / 255.0
# Build a random train/validation split and the label matrices.
# NOTE: this snippet is Python 2 (print statement below).
train_ids,mlb=load_data()
labels=pd.read_csv("train.csv")
# Drop rows without labels, then draw one uniform random number per row;
# it decides the split further down.
labels=labels[pd.isnull(labels['labels'])==False].reset_index(drop=True)
labels['assignment']=np.random.uniform(size=(labels.shape[0],1))

MLB=MultiLabelBinarizer()
train_ids=train_ids.merge(labels[['business_id','assignment']],on='business_id',how='left')
# NOTE(review): MLB is fit on train_ids['labels'] before labels['labels'] is
# split into integer lists on the next line -- confirm what train_ids['labels']
# holds at this point (load_data is not visible here).
MLB.fit(train_ids['labels'].tolist()) 
labels['labels']=labels['labels'].map(lambda x:[int(i) for i in x.split(" ")])
BETA=MLB.transform(labels.sort('business_id')['labels'])
# Rows with assignment >= .9 form the validation set; the rest stay in training.
val_ids=train_ids[train_ids['assignment']>=.9].reset_index(drop=True)
val_Y=MLB.transform(val_ids['labels'])
train_ids=train_ids[train_ids['assignment']<.9].reset_index(drop=True)
Y_test=mlb.transform(val_ids['business_id'].tolist())
print Y_test.shape
# The seed is set after the 'assignment' draw above, so it does not fix the split.
np.random.seed(42)
#train_ids=train_ids.sort('business_id').reset_index(drop=True)
# NOTE(review): reindex returns a new frame and both results are discarded,
# so these two lines do not appear to shuffle anything -- confirm intent.
train_ids.reindex(np.random.permutation(train_ids.index))
val_ids.reindex(np.random.permutation(val_ids.index))
# Load at most the first 10000 validation photos as float32 scaled by 1/255.
validate=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()[0:10000]]).astype(np.float32)/255.0
datagen = ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
	"""Train a one-vs-rest LinearSVC on TF-IDF features and print label metrics.

	Training labels are first restricted to the known label vocabulary below;
	test labels are then restricted to the labels that actually occur in the
	filtered training data. Sentences are vectorized with
	`tf_idf_fit_transform` (fit on the training sentences).

	Prints the class order, matrix shapes, and per-class ("All-"), micro and
	macro precision, recall and F1. Returns None.
	"""
	all_labels = [
		'tsunami', 'heat_wave', 'cold_wave', 'forest_fire', 'limnic_erruptions',
		'storm', 'avalanches', 'blizzard', 'earthquake', 'floods', 'hurricane',
		'drought', 'volcano', 'fire', 'cyclone', 'hail_storms', 'land_slide',
		'intensity', 'epicentre', 'temperature', 'depth', 'speed', 'magnitude',
		'terrorist_attack', 'suicide_attack', 'normal_bombing', 'shoot_out',
		'aviation_hazard', 'train_collision', 'industrial_accident',
		'vehicular_collision', 'surgical_strikes', 'transport_hazards', 'riots',
		'epidemic', 'famine', 'time', 'place', 'type', 'reason', 'after_effects',
		'casualties', 'name', 'participant',
	]
	allowed = set(all_labels)

	# Keep only known labels in training, then only training-seen labels in test.
	trainLabels = [list(set(sample).intersection(allowed)) for sample in trainLabels]
	seen_labels = {label for sample in trainLabels for label in sample}
	testLabels = [list(set(sample).intersection(seen_labels)) for sample in testLabels]

	from sklearn.preprocessing import MultiLabelBinarizer
	# Sorting pins the column order; a bare set would order the classes (and
	# therefore the per-class score arrays) differently from run to run.
	mlb = MultiLabelBinarizer(classes=sorted(seen_labels))
	train_label_matrix = mlb.fit_transform(trainLabels)
	print("Labels : ", mlb.classes_)
	test_label_matrix = mlb.transform(testLabels)
	print("Shape of label matrix : ", test_label_matrix.shape)

	train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
	test_matrix = tfidf.transform(testSentences)
	print("Shape of sentence matrix : ", test_matrix.shape)

	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	classifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
	classifier.fit(train_matrix, train_label_matrix)
	predictions = classifier.predict(test_matrix)

	from sklearn.metrics import f1_score, precision_score, recall_score
	# average=None reports one score per class, in the order printed above.
	print("All-Precision", precision_score(test_label_matrix, predictions, average=None))
	print("All-Recall", recall_score(test_label_matrix, predictions, average=None))
	print("All-F1", f1_score(test_label_matrix, predictions, average=None))
	print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
	print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
	print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
	print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
	print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
	print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
# Per-epoch training loop (Python 2: uses xrange).
# Naming caution: in this snippet X_train / Y_train hold image arrays from
# loadimages, while X_test / Y_test hold binarized label matrices.
X_train=loadimages(tphotos['photo_id'])
for epoch in xrange(0,epochs):
    # Rebuild the per-business photo table for the training businesses.
    tphotos=train.groupby('business_id').apply(extract_images).reset_index()
    # NOTE(review): the reindex result is discarded, so this does not appear
    # to shuffle tphotos -- confirm intent.
    tphotos.reindex(np.random.permutation(tphotos.index))
    tphotos.columns=['business_id','photo_id']
    tphotos=tphotos.merge(labels,on='business_id',how='left')
    tphotos['labels']=tphotos['labels'].map(lambda x:[int(i) for i in x.split(" ")])
    # Same preparation for the held-out businesses.
    tstphotos=test.groupby('business_id').apply(extract_images).reset_index()
    # NOTE(review): result discarded here as well.
    tstphotos.reindex(np.random.permutation(tstphotos.index))
    tstphotos.columns=['business_id','photo_id']
    tstphotos=tstphotos.merge(labels,on='business_id',how='left')
    tstphotos['labels']=tstphotos['labels'].map(lambda x:[int(i) for i in x.split(" ")])
    #Y_train=mlb.transform(tphotos['labels'])
    # Held-out images and labels are loaded once, on the first epoch only.
    if epoch==0:
        Y_train=loadimages(tstphotos['photo_id'])
        Y_test=mlb.transform(tstphotos['labels'])
    X_test=mlb.transform(tphotos['labels'])
    X_train=loadimages(tphotos['photo_id'])
#    X_train=np.random.uniform(size=(X_test.shape[0],3,224,224))
    # NOTE(review): this drops into the debugger on every epoch -- looks like
    # leftover debugging; confirm before running unattended.
    pdb.set_trace()
    # One graph input per image slot (axis 1 of X_train), plus the label target.
    inputkeys={"input"+str(i):X_train[:,i,:,:,:] for i in xrange(0,n_images)}
    inputkeys['output1']=X_test
    graph.fit(inputkeys,nb_epoch=1,batch_size=16)
#    graph.fit({"input1":X_train,"input2":X_train,'output1':X_test},nb_epoch=2)
#    model.fit(X_train,X_test,batch_size=128,nb_epoch=1,verbose=0)
    # Predict on the held-out images and round the outputs to the nearest integer.
    inputkeys={"input"+str(i):Y_train[:,i,:,:,:] for i in xrange(0,n_images)}   
    prob=graph.predict(inputkeys)['output1']
    pred=np.round(prob)
#    probs=graph.predict_proba({"input1":Y_train[:,0,:,:,:],"input2":Y_train[:,1,:,:,:]})
#    print prob.mean(axis=0)
#    print prob.max(axis=0)
Beispiel #58
0
# Read the test data chunk by chunk, dropping rows with missing values in
# place before collecting each chunk.
for chunk in reader:
    chunk.dropna(inplace=True) 
    chunks.append(chunk)

test = pd.concat(chunks)

# Release the list of chunks now that they are concatenated.
del(chunks)

# Split the tags by spaces
train_labels = train['Tags'].map(lambda x: x.split())
test_labels = test['Tags'].map(lambda x: x.split())

# The label binarizer takes all the tags and turns them into a big sparse matrix
# (it is fit on train and test tags together; only the train tags are
# transformed here)
mlb = MultiLabelBinarizer()
mlb.fit(pd.concat([train_labels, test_labels]))
labels = mlb.transform(train_labels)

# Turn the tokens into a sparse matrix
vect = CountVectorizer(
    # Get text from html
    preprocessor = preprocess,
    # Turn the text into tokens
    tokenizer = tokenize,
    # Generate ngrams
    ngram_range = (1, 2),
    # Remove extremely common tokens
    max_df = 0.5,
    # Remove extremely uncommon tokens
    min_df = 0.001
)