Code Example #1
File: SVMs.py  Project: pkumusic/HCE
    def __init__(self, inter_filePath = "inter/technology_companies_of_the_united_states/"):
        # [[cat,cat...]...]
        self.m = Word2Vec.load_word2vec_format("vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True) 
        self.dim = 400

        (correct_categories_train, context_categories_train) = self.load_category_page(inter_filePath + "category_page.txt")  
        (correct_categories_test, context_categories_test) = self.load_category_page(inter_filePath + "category_page_test.txt")
        ## ----  By mean ---
        Xvectors = np.array(self.predict_vector_by_mean(context_categories_train))
        Xvectors_test = np.array(self.predict_vector_by_mean(context_categories_test))


        ## ----  By mean --- *

        ## ----  By SVM ---
        corpus_train = [" ".join(i) for i in context_categories_train]
        corpus_test = [" ".join(i) for i in context_categories_test]
        cv = CountVectorizer(min_df = 1)
        X = cv.fit_transform(corpus_train)
        ##TFIDF
        transformer = TfidfTransformer()
        X_tfidf = transformer.fit_transform(X)
        #Labels
        mlb = MultiLabelBinarizer()
        mlb.fit(correct_categories_train + correct_categories_test)
        Y = mlb.transform(correct_categories_train) ###Transform to multilabel indicator
        #predict test labels
        X_test = cv.transform(corpus_test)
        Y_test = mlb.transform(correct_categories_test)
        #Y_predict_ovr = self.ovrSVM(X, Y, X_test)
        Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test)
        #Y_predict_ovo = self.ovoSVM(X, Y, X_test)
        print "---One versus rest---"
        print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro')
        print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
Code Example #2
def fit_images():
    client = pymongo.MongoClient('localhost', 27017)
    db = client['image_annotation']
    responses = db['mapped_responses'].find()
    no_labels = db['labels_binary'].find()
    numbers = []
    for i in no_labels:
        numbers.append(set([int(i["number"])]))
    train_data = []
    labels = []
    indexes = {}  # maps row index to image number (used below)
    mlb = MultiLabelBinarizer()
    mlb.fit(numbers)
    for index, instance in enumerate(responses):
        t_data =  instance['hist']['0']
        indexes[index] = instance['image_no']
        train_data.append(t_data)
        label = instance['binary_results']
        new_labels = []
        for key, value in enumerate(label):
            value1 = int(value)
            new_labels.append(set([value1]))
        new_labels = mlb.transform(new_labels)
        labels.append(label)
    classifier = KNeighborsClassifier(n_neighbors = 5, weights='uniform')
    classifier.fit(train_data, labels)
    build_dir = getBuildDir()
    pickle.dump(classifier, open(join(build_dir, 'model.data'), 'wb'), protocol=1)
    client.close()
Code Example #3
def test_multilabel_classification_report():
    n_classes = 4
    n_samples = 50
    make_ml = make_multilabel_classification
    _, y_true_ll = make_ml(n_features=1, n_classes=n_classes, random_state=0,
                           n_samples=n_samples)
    _, y_pred_ll = make_ml(n_features=1, n_classes=n_classes, random_state=1,
                           n_samples=n_samples)

    expected_report = """\
             precision    recall  f1-score   support

          0       0.50      0.67      0.57        24
          1       0.51      0.74      0.61        27
          2       0.29      0.08      0.12        26
          3       0.52      0.56      0.54        27

avg / total       0.45      0.51      0.46       104
"""

    lb = MultiLabelBinarizer()
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        report = classification_report(y_true, y_pred)
        assert_equal(report, expected_report)
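
Note the fit call above: lb.fit([range(4)]) wraps the label collection in an outer list. MultiLabelBinarizer.fit expects an iterable of label collections (one per sample), so the outer list makes the whole range a single "sample" whose elements become the classes. The same idiom, encoder.fit([labels]), appears in several of the transformer examples below. A minimal sketch of what this produces, using a toy label set:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([range(4)])             # one outer list -> one "sample" containing every class
print(mlb.classes_)             # [0 1 2 3]
print(mlb.transform([[0, 2]]))  # [[1 0 1 0]]
print(mlb.transform([[]]))      # [[0 0 0 0]] -- an empty label set maps to all zeros
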
Code Example #4
File: tags.py  Project: jfsantos/crema
class TimeSeriesLabelTransformer(BaseTaskTransformer):

    def __init__(self, namespace, name, labels=None):
        '''Initialize a time-series label transformer

        Parameters
        ----------
        namespace : str
            The JAMS namespace for this task

        name : str
            The name of this transformer object

        labels : list of str [optional]
            The list of labels for this task
        '''

        super(TimeSeriesLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):

        ann = self.find_annotation(jam)

        intervals = np.asarray([[0.0, jam.file_metadata.duration]])
        values = [None]
        mask = False

        if ann:
            ann_int, ann_val = ann.data.to_interval_values()
            intervals = np.vstack([intervals, ann_int])
            values.extend(ann_val)
            mask = True

        # Suppress all intervals not in the encoder
        tags = []
        for v in values:
            if v in self._classes:
                tags.extend(self.encoder.transform([[v]]))
            else:
                tags.extend(self.encoder.transform([[]]))

        tags = np.asarray(tags)
        target = self.encode_intervals(jam.file_metadata.duration,
                                       intervals,
                                       tags)
        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
Code Example #5
def test_multilabelbinarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.MultiLabelBinarizer
    # with sklearn.preprocessing.MultiLabelBinarizer

    multilabelbinarizerr = MultiLabelBinarizerR()
    multilabelbinarizerr.fit(np.concatenate(trajs))

    multilabelbinarizer = MultiLabelBinarizer()
    multilabelbinarizer.fit(trajs)

    y_ref1 = multilabelbinarizerr.transform(trajs[0])
    y1 = multilabelbinarizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Code Example #6
File: keras_test.py  Project: TELSER1/yelp_recruit
def load_data():
    labels=pd.read_csv("train.csv")
    bismatch=pd.read_csv("train_photo_to_biz_ids.csv")
    labels=bismatch.merge(labels,how='left',on='business_id')
    labels=labels[pd.isnull(labels['labels'])==False]
    labels['labels']=labels['labels'].map(lambda x:[int(i) for i in x.split(" ")])
    training_=os.listdir("train_photos/train244")
    train_ids=pd.DataFrame({"photo_id":[int(i.split(".")[0]) for i in training_]})
    train_ids=train_ids.merge(labels,on='photo_id',how='inner')
#    val_ids=val_ids.merge(labels,on='photo_id',how='inner')
    mlb=MultiLabelBinarizer()
    mlb.fit(train_ids['labels'].tolist())
#    X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32)
#    X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32)
    return train_ids,mlb
Code Example #7
class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(ExtraTreeClassifier(criterion="gini",
                                                                     max_depth=None,
                                                                     min_samples_split=2,
                                                                     min_samples_leaf=1,
                                                                     min_weight_fraction_leaf=0.,
                                                                     max_features="auto",
                                                                     max_leaf_nodes=None,
                                                                     class_weight=None),
                                                 n_jobs=-1
                                                 )

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(mat.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        predicted = self.classificator.predict(mat.toarray())
        return self.mlb.inverse_transform(predicted)
Code Example #8
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None,
             multilabel=False):
    print "prepping the Word Tokenizer..."
    _0, _1, trY, _3 = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _4, sbuY, _5 = sbuXYFilenames(n_sbu)
        trY.extend(sbuY)
    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(trY, n_captions)
    vect.fit(captions)
    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb.fit(vect.transform(captions))
        return vect, mlb
    # if not multilabel:
    return vect
Code Example #9
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
	all_labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane", \
			"Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations", \
			"donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage", \
			"injured_or_dead_people", "missing_trapped_or_found_people"]
	disaster_labels = ["Drought", "Earthquake", "Flood", "Hurricane", \
			"Tornado", "Tsunami", "displaced_people_and_evacuations", \
			"donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage", \
			"injured_or_dead_people", "missing_trapped_or_found_people"]
	health_labels = ["Epidemic", "displaced_people_and_evacuations", \
			"donation_needs_or_offers_or_volunteering_services", \
			"injured_or_dead_people"]
	conflict_labels = ["Rebellion", "Terrorism", "displaced_people_and_evacuations", \
			"infrastructure_and_utilities_damage", \
			"injured_or_dead_people", "missing_trapped_or_found_people"]
	import numpy as np
	curr_labels = all_labels

	trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
	testLabels = [list(set(l).intersection(curr_labels))for l in testLabels]

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer(classes=curr_labels)
	train_label_matrix = mlb.fit(trainLabels)
	print("Labels : ", mlb.classes_)
	train_label_matrix = mlb.transform(trainLabels)
	test_label_matrix = mlb.transform(testLabels)
	print("Shape of label matrix : ", test_label_matrix.shape)

	train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
	test_matrix = tfidf.transform(testSentences)
	print("Shape of sentence matrix : ", test_matrix.shape)


	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	from sklearn.ensemble import RandomForestClassifier
	# estimator = LinearSVC()
	estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs = -1)
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, train_label_matrix)
	predictions = classifier.predict(test_matrix)

	from sklearn.metrics import f1_score, precision_score, recall_score
	print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
	print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
	print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
	print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
	print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
	print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
	print("Macro-Precision", precision_score(test_label_matrix, predictions, average=None))
	print("Macro-Recall", recall_score(test_label_matrix, predictions, average=None))
	print("Macro-F1", f1_score(test_label_matrix, predictions, average=None))
Code Example #10
File: tags.py  Project: jfsantos/crema
class GlobalLabelTransformer(BaseTaskTransformer):

    def __init__(self, namespace, name, labels=None):
        '''Initialize a global label transformer

        Parameters
        ----------
        namespace : str
            The JAMS namespace for this task

        name : str
            The name of this transformer object

        labels : list of str [optional]
            The list of labels for this task
        '''

        super(GlobalLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):

        ann = self.find_annotation(jam)

        intervals = np.asarray([[0, 1]])
        values = [None]
        mask = False

        if ann:
            values = list(ann.data.value)
            intervals = np.tile(intervals, [len(values), 1])
            mask = True

        # Suppress all intervals not in the encoder
        tags = [v for v in values if v in self._classes]
        if len(tags):
            target = self.encoder.transform([tags]).max(axis=0)
        else:
            target = np.zeros(len(self._classes), dtype=np.int)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
Code Example #11
class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(SVC(), n_jobs=-1)

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(mat.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        predicted = self.classificator.predict(mat.toarray())
        return self.mlb.inverse_transform(predicted)
Code Example #12
datasets_single, queries_single = zip(*preprocessed_data_list_single)

q_fasttext = preprocessing.fast_text_embeddings(queries)
documentation_file_parameteropt.write("fasttext Evaluation \n")
documentation_file_modelopt.write("fasttext Evaluation \n")

#split data in training and test data
d_train, d_test, q_train, q_test = train_test_split(datasets,
                                                    q_fasttext,
                                                    test_size=0.2)

#encode labels, for abstracts with MultiLabelBinarizer as one sample can have multiple labels, for
#citation contexts use LabelEncoder
label_encoder = MultiLabelBinarizer()
#label_encoder = LabelEncoder()
label_encoder.fit(datasets)
d_train_encoded = label_encoder.transform(d_train)
pickle.dump(label_encoder, open('label_encoder_fasttext.sav', 'wb'))

#Linear SVM: optimizing parameters with grid search
print("SVM model evaluation")
svm_dict = dict(estimator__C=[1, 2, 5, 10, 50, 100])
classifier_svm = RandomizedSearchCV(estimator=OneVsRestClassifier(
    svm.LinearSVC()),
                                    param_distributions=svm_dict,
                                    n_iter=5,
                                    n_jobs=1)
classifier_svm.fit(np.asarray(q_train), np.asarray(d_train_encoded))
documentation_file_parameteropt.write(
    "Linear SVM: Best parameters {}, reached score: {} \n".format(
        classifier_svm.best_params_, classifier_svm.best_score_))
Code Example #13
train_json = json.load(open('train.json'))

#filenames=["./imaterial_train/"+str(i)+".jpg" for i in range(1,201)]
#print(filenames)

y_train = []
for i in range(1, 201):
    labels = train_json['annotations'][i]['labelId']
    labels = np.array(list(map(int, labels)))
    y_train.append(labels)
y_train = np.array(y_train)

all_labels = []
for i in range(1, 229):
    all_labels.append([i])
mlb = MultiLabelBinarizer()
mlb.fit(all_labels)  # fitting multilabelbinarizer to all labels
y_train = mlb.transform(y_train)

#print('smallest vertical:',min(i.shape[0] for i in X_train))
#print('smallest horizontal:',min(i.shape[1] for i in X_train))

#All images resized to the smallest dimensions
X_train_resized = [transform.resize(img, (200, 128, 3)) for img in X_train]
X_train_flat = np.array([img.flatten() for img in X_train_resized])

os.chdir('./imaterial_validation')
X_test = np.array([io.imread(str(i) + '.jpg') for i in range(1, 201)])
os.chdir('..')

X_test_resized = [transform.resize(img, (200, 128, 3)) for img in X_test]
X_test_flat = np.array([img.flatten() for img in X_test_resized])
Code Example #14
# print(firstlast)

def tags_for_question(question_id):
    return df_tags[df_tags['Id'] == question_id].Tag.values

def add_tags_column(row):
    row['Tags'] = tags_for_question(row['Id'])
    return row

df_questions = df_text.apply(add_tags_column, axis=1)
# print(df_questions[['Id', 'Text', 'Tags']].head())


multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions.Tags)
Y = multilabel_binarizer.transform(df_questions.Tags)

count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(df_questions.Text.values.astype('U'))

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

ros = RandomOverSampler(random_state=9000)
X_tfidf_resampled, Y_tfidf_resampled = ros.fit_sample(X_tfidf, Y)

x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf_resampled, Y_tfidf_resampled, test_size=0.2, random_state=9000)


Code Example #15
>>> x=pd.DataFrame([["I have 12345678 tomatoes"],["12345678"])
	       
SyntaxError: invalid syntax
>>> x=pd.DataFrame([["I have 12345678 tomatoes"],["12345678"]])
>>> x
                          0
0  I have 12345678 tomatoes
1                  12345678
>>> x.T
                          0         1
0  I have 12345678 tomatoes  12345678
>>> x=pd.DataFrame([["I have 12345678 tomatoes","12345678"]])
>>> x
                          0         1
0  I have 12345678 tomatoes  12345678
>>> binarizer = MultiLabelBinarizer.fit(x)
Traceback (most recent call last):
  File "<pyshell#13>", line 1, in <module>
    binarizer = MultiLabelBinarizer.fit(x)
TypeError: fit() missing 1 required positional argument: 'y'
>>> binarizer = MultiLabelBinarizer.fit(x[0])
Traceback (most recent call last):
  File "<pyshell#14>", line 1, in <module>
    binarizer = MultiLabelBinarizer.fit(x[0])
TypeError: fit() missing 1 required positional argument: 'y'
>>> binarizer = MultiLabelBinarizer.fit(x[0],x[1])
Traceback (most recent call last):
  File "<pyshell#15>", line 1, in <module>
    binarizer = MultiLabelBinarizer.fit(x[0],x[1])
  File "/Users/montana/miniconda3/lib/python3.6/site-packages/sklearn/preprocessing/label.py", line 696, in fit
    if self.classes is None:
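
The TypeError in the session above comes from calling fit on the MultiLabelBinarizer class itself rather than on an instance, so the DataFrame ends up bound to self (which is also why the last call gets as far as the self.classes check before failing). A minimal sketch of the intended usage, treating each of the two strings from the session as a single-label row:

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

x = pd.DataFrame([["I have 12345678 tomatoes", "12345678"]])
binarizer = MultiLabelBinarizer()                              # instantiate first
encoded = binarizer.fit_transform([[v] for v in x.iloc[0]])    # one label per row
print(binarizer.classes_)  # ['12345678' 'I have 12345678 tomatoes']
print(encoded)             # [[0 1]
                           #  [1 0]]
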
Code Example #16
def get_or_make_label_encoder(params,
                              problem,
                              mode,
                              label_list=None,
                              zero_class=None):
    """Simple function to create or load existing label encoder
    If mode is train, always create a new label_encoder

    Arguments:
        problem {str} -- problem name
        mode {mode} -- mode

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})
        zero_class {str} -- what to assign as 0 (default: {'O'})

    Returns:
        LabelEncoder -- label encoder
    """
    if label_list is None:
        return None
    problem_path = params.ckpt_dir
    create_path(problem_path)
    le_path = os.path.join(problem_path, '%s_label_encoder.pkl' % problem)
    is_seq2seq_text = params.problem_type[problem] == 'seq2seq_text'
    is_multi_cls = params.problem_type[problem] == 'multi_cls'
    is_seq2seq_tag = params.problem_type[problem] == 'seq2seq_tag'

    if mode == 'train' and not os.path.exists(le_path):

        if is_seq2seq_text:
            label_encoder = load_transformer_tokenizer(
                params.transformer_tokenizer_name)
            pickle.dump(label_encoder, open(le_path, 'wb'))

        elif is_multi_cls:
            label_encoder = MultiLabelBinarizer()
            label_encoder.fit(label_list)
            pickle.dump(label_encoder, open(le_path, 'wb'))

        else:
            if isinstance(label_list[0], list):
                label_list = [
                    item for sublist in label_list for item in sublist
                ]
                if is_seq2seq_tag:
                    label_list.extend([BOS_TOKEN, EOS_TOKEN])
            label_encoder = LabelEncoder()

            label_encoder.fit(label_list, zero_class=zero_class)
            label_encoder.dump(le_path)

    else:

        if is_seq2seq_text or is_multi_cls:
            label_encoder = pickle.load(open(le_path, 'rb'))
        else:
            label_encoder = LabelEncoder()
            label_encoder.load(le_path)

    if not is_seq2seq_text:
        if is_multi_cls:
            params.num_classes[problem] = label_encoder.classes_.shape[0]
        else:
            params.num_classes[problem] = len(label_encoder.encode_dict)
            if EOS_TOKEN in label_encoder.encode_dict:
                params.eos_id[problem] = int(
                    label_encoder.transform([EOS_TOKEN])[0])
    else:
        params.num_classes[problem] = len(label_encoder.vocab)
        params.eos_id[problem] = label_encoder.convert_tokens_to_ids(
            [EOS_TOKEN])

    return label_encoder
Code Example #17
class HumanDataset(Dataset):
    def __init__(self, images_df, base_path, augument=True, mode="train"):
        if not isinstance(base_path, pathlib.Path):
            base_path = pathlib.Path(base_path)
        self.images_df = images_df.copy()
        self.augument = augument
        self.images_df.Id = self.images_df.Id.apply(lambda x: base_path / x)
        self.mlb = MultiLabelBinarizer(
            classes=np.arange(0, config.num_classes))
        self.mlb.fit(np.arange(0, config.num_classes))
        self.mode = mode

    def __len__(self):
        return len(self.images_df)

    def __getitem__(self, index):
        X = self.read_images(index)
        if not self.mode == "test":
            labels = np.array(
                list(map(int, self.images_df.iloc[index].Target.split(' '))))
            y = np.eye(config.num_classes, dtype=np.float)[labels].sum(axis=0)
        else:
            y = str(self.images_df.iloc[index].Id.absolute())
        if self.augument:
            X = self.augumentor(X)
        #X = T.Compose([T.ToPILImage(),T.ToTensor(),T.Normalize([0.08069, 0.05258, 0.05487, 0.08282], [0.13704, 0.10145, 0.15313, 0.13814])])(X)
        X = T.Compose([T.ToPILImage(), T.ToTensor()])(X)
        return X.float(), y

    def read_images(self, index):
        row = self.images_df.iloc[index]
        filename = str(row.Id.absolute())
        #use only rgb channels
        if config.channels == 4:
            images = np.zeros(shape=(512, 512, 4))
        else:
            images = np.zeros(shape=(512, 512, 3))
        r = np.array(Image.open(filename + "_red.png"))
        g = np.array(Image.open(filename + "_green.png"))
        b = np.array(Image.open(filename + "_blue.png"))
        y = np.array(Image.open(filename + "_yellow.png"))
        images[:, :, 0] = r.astype(np.uint8)
        images[:, :, 1] = g.astype(np.uint8)
        images[:, :, 2] = b.astype(np.uint8)
        if config.channels == 4:
            images[:, :, 3] = y.astype(np.uint8)
        images = images.astype(np.uint8)
        #images = np.stack(images,-1)
        if config.img_height == 512:
            return images
        else:
            return cv2.resize(images, (config.img_weight, config.img_height))

    def augumentor(self, image):
        augment_img = iaa.Sequential([
            iaa.OneOf([
                iaa.Affine(rotate=90),
                iaa.Affine(rotate=180),
                iaa.Affine(rotate=270),
                iaa.Affine(shear=(-16, 16)),
                iaa.Fliplr(0.5),
                iaa.Flipud(0.5),
            ])
        ],
                                     random_order=True)

        image_aug = augment_img.augment_image(image)
        return image_aug
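
In this dataset class the fitted self.mlb is never actually used inside __getitem__; the target is built with np.eye(config.num_classes)[labels].sum(axis=0) instead. For label lists without repeats the two encodings agree up to dtype. A small sketch under the assumption of 28 classes (the value used in the related protein-atlas examples):

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

num_classes = 28                        # assumed for illustration
labels = np.array([0, 5, 25])           # e.g. parsed from a Target string "0 5 25"

mlb = MultiLabelBinarizer(classes=np.arange(0, num_classes))
mlb.fit(np.arange(0, num_classes))      # classes come from the constructor argument
via_mlb = mlb.transform([labels])[0]    # multi-hot row, dtype int

via_eye = np.eye(num_classes)[labels].sum(axis=0)  # multi-hot row, dtype float
assert (via_mlb == via_eye.astype(int)).all()
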
Code Example #18
File: workbench.py  Project: nishnik/QuoRecommender
# unique_tags = []

# with open("../logs/tags.txt") as top_tag_list:
#     for line in top_tag_list:
#         line = line.split('\n')[0]
#         if cnt[line] > 0:
#             unique_tags.append(line) 

# for key in data:
#     for tag in data[key]:
#         if tag not in unique_tags:
#             data[key].remove(tag)

tags = data.values()
mlb = MultiLabelBinarizer()
mlb.fit(tags)
print("Saving trained LabelBinarizer to disk")
joblib.dump(mlb, '../dump/pkl/' + str(mlb)[:5] + '.pkl')
print("")

# Split corpus into training and test sets
questions_train, questions_test, tags_train, tags_test = train_test_split(questions, tags, test_size=0.2, random_state = random.randint(1, 100))

print("Extracting features from the training data using the vectorizer")
t0 = time()
X_train = vectorizer.transform(questions_train)
duration = time() - t0
print("done in %fs" % (duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print("")
Code Example #19
import os
import pathlib

import pandas as pd
import numpy as np
import matplotlib.image as mpimg
from sklearn.preprocessing import MultiLabelBinarizer
from PIL import Image

import torch
from torchvision import datasets, transforms, models
from torch.utils import data

CLASSES = np.arange(0, 28)
multilabel_binarizer = MultiLabelBinarizer(CLASSES)
multilabel_binarizer.fit(CLASSES)

INPUT_DIR = '../input'
TRAIN_IMAGES_DIR = pathlib.Path(INPUT_DIR, 'train').as_posix()
TEST_IMAGES_DIR = pathlib.Path(INPUT_DIR, 'test').as_posix()
TARGETS_COLUMN_NAME = 'Target'
COLORS = ('red', 'green', 'blue', 'yellow')
IMAGE_FILE_EXT = 'png'


class HumanProteinAtlasDataset(data.Dataset):
    def __init__(self, images_description_df, transform=None, train_mode=True):

        self.images_description_df = images_description_df.copy()
        self.transform = transform
        self.train_mode = train_mode
Code Example #20
def binarizer_labels(data):
    multilabel_binarizer = MultiLabelBinarizer()
    multilabel_binarizer.fit(data)
    # transform target variable
    y = multilabel_binarizer.transform(data)
    return y, multilabel_binarizer
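
A short usage sketch for the helper above, with a made-up tag list, showing that the separate fit and transform calls are equivalent to a single fit_transform:

from sklearn.preprocessing import MultiLabelBinarizer

data = [["python", "pandas"], ["python"], ["numpy", "pandas"]]  # hypothetical tags

y, mlb = binarizer_labels(data)
print(mlb.classes_)   # ['numpy' 'pandas' 'python']
print(y)              # [[0 1 1]
                      #  [0 0 1]
                      #  [1 1 0]]

assert (MultiLabelBinarizer().fit_transform(data) == y).all()  # same result in one call
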
Code Example #21
dataset['genre_list'] = dataset['genre_list'].apply(
    lambda x: ast.literal_eval(x))
train['genre_list'] = train['genre_list'].apply(lambda x: ast.literal_eval(x))
test['genre_list'] = test['genre_list'].apply(lambda x: ast.literal_eval(x))
val['genre_list'] = val['genre_list'].apply(lambda x: ast.literal_eval(x))

labels = {}

for genre in test['genre_list']:
    if len(genre) in labels:
        labels[len(genre)] += 1
    else:
        labels[len(genre)] = 1

mlb = MultiLabelBinarizer()
mlb.fit(dataset['genre_list'].tolist())

transformed_labels = mlb.fit_transform(dataset['genre_list'].tolist())

train_labels = mlb.transform(train['genre_list'].tolist())

test_labels = mlb.transform(test['genre_list'].tolist())

val_labels = mlb.transform(val['genre_list'].tolist())

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = text.translate(str.maketrans('', '', punctuation))
Code Example #22
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) +
                    '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        #example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch,
                                    doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:",len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps(
                                                [block, labels, doc_idx]) +
                                            '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all(
                                        [type(y) is int for y in window])
                                    out_f.write(
                                        json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:",len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                        except IndexError:
                            out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) +
                                    '\n')
                        else:
                            out_f.write(
                                json.dumps(
                                    [get_window(example, START_POS), labels]) +
                                '\n')
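
MultiLabelBinarizer(sparse_output=True), as used above, returns a scipy.sparse CSR matrix, which is why the code recovers the positive label indices with labels.nonzero()[1].tolist() before JSON-dumping. A tiny sketch of that round trip with a toy label vocabulary:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)
mlb.fit([["GO:1", "GO:2", "GO:3"]])       # toy label set, passed inside a list as above

row = mlb.transform([["GO:3", "GO:1"]])   # 1 x 3 sparse CSR matrix
print(row.toarray())                      # [[1 0 1]]
print(row.nonzero()[1].tolist())          # [0, 2] -- column indices of the positive labels
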
Code Example #23
class DynamicLabelTransformer(BaseTaskTransformer):
    '''Time-series label transformer.

    Attributes
    ----------
    name : str
        The name of this transformer object

    namespace : str
        The JAMS namespace for this task

    labels : list of str [optional]
        The list of labels for this task.

        If not provided, it will attempt to infer the label set from the
        namespace definition.

    sr : number > 0
        The audio sampling rate

    hop_length : int > 0
        The hop length for annotation frames

    See Also
    --------
    StaticLabelTransformer
    '''
    def __init__(self, name, namespace, labels=None, sr=22050, hop_length=512):
        super(DynamicLabelTransformer, self).__init__(name=name,
                                                      namespace=namespace,
                                                      sr=sr,
                                                      hop_length=hop_length)

        if labels is None:
            labels = jams.schema.values(namespace)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)

        self.register('tags', [None, len(self._classes)], np.bool)

    def empty(self, duration):
        '''Empty label annotations.

        Constructs a single observation with an empty value (None).

        Parameters
        ----------
        duration : number > 0
            The duration of the annotation
        '''
        ann = super(DynamicLabelTransformer, self).empty(duration)
        ann.append(time=0, duration=duration, value=None)
        return ann

    def transform_annotation(self, ann, duration):
        '''Transform an annotation to dynamic label encoding.

        Parameters
        ----------
        ann : jams.Annotation
            The annotation to convert

        duration : number > 0
            The duration of the track

        Returns
        -------
        data : dict
            data['tags'] : np.ndarray, shape=(n, n_labels)
                A time-varying binary encoding of the labels
        '''
        intervals, values = ann.to_interval_values()

        # Suppress all intervals not in the encoder
        tags = []
        for v in values:
            if v in self._classes:
                tags.extend(self.encoder.transform([[v]]))
            else:
                tags.extend(self.encoder.transform([[]]))

        tags = np.asarray(tags)
        target = self.encode_intervals(duration, intervals, tags)

        return {'tags': target}

    def inverse(self, encoded, duration=None):
        '''Inverse transformation'''

        ann = jams.Annotation(namespace=self.namespace, duration=duration)
        for start, end, value in self.decode_intervals(encoded,
                                                       duration=duration):
            # Map start:end to frames
            f_start, f_end = time_to_frames([start, end],
                                            sr=self.sr,
                                            hop_length=self.hop_length)

            confidence = np.mean(encoded[f_start:f_end + 1, value])

            value_dec = self.encoder.inverse_transform(np.atleast_2d(value))[0]

            for vd in value_dec:
                ann.append(time=start,
                           duration=end - start,
                           value=vd,
                           confidence=confidence)

        return ann
Code Example #24
class StaticLabelTransformer(BaseTaskTransformer):
    '''Static label transformer.

    Attributes
    ----------
    name : str
        The name of this transformer object

    namespace : str
        The JAMS namespace for this task

    labels : list of str [optional]
        The list of labels for this task.

        If not provided, it will attempt to infer the label set from the
        namespace definition.

    See Also
    --------
    DynamicLabelTransformer
    '''
    def __init__(self, name, namespace, labels=None):
        super(StaticLabelTransformer, self).__init__(name=name,
                                                     namespace=namespace,
                                                     sr=1,
                                                     hop_length=1)

        if labels is None:
            labels = jams.schema.values(namespace)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.register('tags', [len(self._classes)], np.bool)

    def transform_annotation(self, ann, duration):
        '''Transform an annotation to static label encoding.

        Parameters
        ----------
        ann : jams.Annotation
            The annotation to convert

        duration : number > 0
            The duration of the track

        Returns
        -------
        data : dict
            data['tags'] : np.ndarray, shape=(n_labels,)
                A static binary encoding of the labels
        '''
        intervals = np.asarray([[0, 1]])
        values = list([obs.value for obs in ann])
        intervals = np.tile(intervals, [len(values), 1])

        # Suppress all intervals not in the encoder
        tags = [v for v in values if v in self._classes]
        if len(tags):
            target = self.encoder.transform([tags]).astype(np.bool).max(axis=0)
        else:
            target = np.zeros(len(self._classes), dtype=np.bool)

        return {'tags': target}

    def inverse(self, encoded, duration=None):
        '''Inverse static tag transformation'''

        ann = jams.Annotation(namespace=self.namespace, duration=duration)

        if np.isrealobj(encoded):
            detected = (encoded >= 0.5)
        else:
            detected = encoded

        for vd in self.encoder.inverse_transform(np.atleast_2d(detected))[0]:
            vid = np.flatnonzero(self.encoder.transform(np.atleast_2d(vd)))
            ann.append(time=0,
                       duration=duration,
                       value=vd,
                       confidence=encoded[vid])
        return ann
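
The inverse path above relies on MultiLabelBinarizer.inverse_transform, which maps a binary indicator row back to a tuple of class labels. A minimal round-trip sketch with a toy tag vocabulary:

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

encoder = MultiLabelBinarizer()
encoder.fit([['guitar', 'piano', 'vocals']])            # one "sample" holding all labels

print(encoder.transform([['vocals', 'guitar']]))        # [[1 0 1]]
detected = np.array([[0.9, 0.2, 0.7]]) >= 0.5           # thresholded model output
print(encoder.inverse_transform(detected)[0])           # ('guitar', 'vocals')
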
Code Example #25
buki_data = pd.read_csv(f'{ORIG_DIR}/statink-weapon2.csv')

# For comparison before the category rename below
display(buki_data.loc[buki_data['category2'] == 'maneuver'].head(3))

# Rename the category because its name collides with a weapon name
buki_data.loc[buki_data['category2'] == 'maneuver',
              'category2'] = 'maneuver_cat'
display(buki_data.loc[buki_data['category2'] == 'maneuver_cat'].head(3))

# +
#https://prob.space/competitions/game_winner
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()  # multilabel one-hot encoding (1s can appear in multiple columns)
mlb.fit([set(train['A1-weapon'].unique())])


def trans_weapon(df,
                 columns=['A1-weapon', 'A2-weapon', 'A3-weapon', 'A4-weapon']):
    """指定列をonehot化"""
    weapon = df.fillna('none')
    weapon_binarized = mlb.transform(weapon[columns].values)
    return pd.DataFrame(weapon_binarized, columns=mlb.classes_)


def make_input_output(df, with_y=False):
    """各武器列をonehot化して結合"""
    a_weapon = trans_weapon(
        df, ['A1-weapon', 'A2-weapon', 'A3-weapon', 'A4-weapon'])
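
Here the binarizer is fit on a single set containing every unique A1 weapon, and trans_weapon then passes it whole rows of the four weapon columns, so each match row collapses into one multi-hot vector over the weapon vocabulary. A small sketch with made-up weapon names and hypothetical match data:

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.DataFrame({'A1-weapon': ['shooter', 'roller'],
                   'A2-weapon': ['charger', 'roller'],
                   'A3-weapon': ['roller', None],
                   'A4-weapon': ['brush', 'charger']})        # hypothetical data

mlb = MultiLabelBinarizer()
mlb.fit([{'brush', 'charger', 'none', 'roller', 'shooter'}])  # all weapons as one set

weapon = df.fillna('none')
cols = ['A1-weapon', 'A2-weapon', 'A3-weapon', 'A4-weapon']
print(mlb.classes_)                        # ['brush' 'charger' 'none' 'roller' 'shooter']
print(mlb.transform(weapon[cols].values))  # [[1 1 0 1 1]
                                           #  [0 1 1 1 0]]
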
Code Example #26
File: data.py  Project: remstef/neuralnetworking
class SpamDataset(torch.utils.data.Dataset):

    '''
    maxlength = size of padding
    '''
    def __init__(self, maxlength = 30, subset='all'):
      super(SpamDataset, self).__init__()
      self.maxlength = maxlength
      self.subset = subset
      self.samples, self.labels = self.load_data()
      self.samples_vectors = self.load_samples_vectors()

    def load_data(self):
      # do some preprocessing if preprocessed file does not exist
      if not os.path.isfile('SMSSpamCollection_normalized'):  # same path as to_pickle/read_pickle below
        #import spacy; nlp=spacy.load('en')
        print('Applying spacy.')
        import en_core_web_sm
        nlp = en_core_web_sm.load()

        def normalize(message):
          doc = nlp(message)
          normalized = doc
          normalized = filter(lambda t : t.is_alpha and not t.is_stop, doc)
          normalized = map(lambda t : t.text, normalized)
          normalized = list(normalized)
          return normalized

        messages = pandas.read_csv(
            'SMSSpamCollection',
            sep='\t',
            quoting=csv.QUOTE_NONE,
            names=['label', 'message'],
            encoding='UTF-8')
        messages['normalized'] = messages.message.apply(normalize)
        messages.to_pickle('SMSSpamCollection_normalized')

      # load normalized messages
      messages = pandas.read_pickle('SMSSpamCollection_normalized')
      samples = messages.normalized.tolist()
      labels = messages.label.tolist()

      self.classes = list(set(labels))
      self.classes.sort()   #Otherwise it is inconsistent which class is 0 and which is 1
      self.oh_classes = MultiLabelBinarizer()
      self.oh_classes.fit([[label] for label in self.classes])
      self.classes = dict([(l,i) for (i,l) in enumerate(self.oh_classes.classes_)])

      labels = [self.classes[l] for l in labels]
      cut_off = int(len(samples) * 0.2)
      if self.subset == 'train':
        samples = samples[cut_off:]
        labels =labels[cut_off:]
      elif self.subset == 'test':
        samples= samples[:cut_off]
        labels = labels[:cut_off]
      return samples, labels

    def __len__(self):
      return len(self.samples)

    """Returns (item as matrix of token vectors, label, index) instead of the normal (item, label). See constructor for more info"""
    def __getitem__(self, index):
      msg = self.samples[index]
      int_label = self.classes[self.labels[index]]   #Convert label to integer. Is this slow?
      #mat = np.zeros( (self.maxlength, self.ftmodel.numpy_normalized_vectors.shape[1]) )
      mat = np.zeros( (self.maxlength, 300) )   #Hard coded for the wikipedia data that has dim=300

      for i, token in enumerate(msg):
        if i >= self.maxlength:
          break
        v = emb.getVector(token)
        mat[i,:] = v
        oh_label = self.oh_classes.transform([[self.labels[index]]])
        oh_label = oh_label[0]
        #print(oh_label)

      return ( torch.tensor(mat), int_label, oh_label, index )
Code Example #27
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import numpy as np

from keywords import *

path = Path.cwd() / "../../pdf-reports/"
df = read_plaintext_with_keywords(path)
df = add_chapter_fields(df)

print(df['num_keywords'].value_counts())
df_keywords = df[df.num_keywords > 0]

mlb = MultiLabelBinarizer()
mlb.fit(df_keywords.keywords)
print(len(mlb.classes_))
print(mlb.classes_)

y = mlb.transform(df_keywords.keywords)
print(y.shape)

tv = TfidfVectorizer(ngram_range=(1, 2),
                     preprocessor=preproc,
                     stop_words='english')
tv1 = TfidfVectorizer(ngram_range=(1, 1),
                      preprocessor=preproc,
                      stop_words='english')
tv2 = TfidfVectorizer(ngram_range=(2, 2),
                      preprocessor=preproc,
                      stop_words='english')
Code Example #28
try:
    from sklearn.preprocessing import MultiLabelBinarizer

    lb = MultiLabelBinarizer()
except ImportError, e:
    from sklearn.preprocessing import LabelBinarizer

    lb = LabelBinarizer()


TRIM_SAMPLES = len(tags)  # / 10
tags = tags[:TRIM_SAMPLES]
learn_data = learn_data[:TRIM_SAMPLES]

lb.fit(tags)
labels = lb.transform(tags)

print "using\t", TRIM_SAMPLES, "samples"
print "\t", len(keywords), "keywords"
print "\t", len(lb.classes_), "tags"
metadata = learn_data.sum(axis=1)

print "\t", metadata.mean(), "avg words in document"
print "\t", metadata.max(), "biggest document"
print "\t", metadata.min(), "smallest document"


# plt.figure(figsize=(8, 6))
# plot_subfigure(learn_data, labels, 1, "With unlabeled samples + CCA", "cca")
# plot_subfigure(learn_data, labels, 2, "With unlabeled samples + PCA", "pca")
Code Example #29
    lambda x: len(x) == 10)]  # use only rows with exactly 10 tags
train = train[:5000]  # use a sample of 5000 training rows
n_tags = 500  # use only the 500 most frequent tags
n_titles = 500  # use only the 500 most frequent title morphemes
undup_tags = np.array((pd.Series(np.concatenate(
    train['tags'].values)).value_counts()[:n_tags].index))
undup_gnr = np.unique(np.concatenate(meta['song_gn_gnr_basket']))
undup_dtl_gnr = np.unique(np.concatenate(meta['song_gn_dtl_gnr_basket']))
undup_title = np.array(
    (pd.Series(np.concatenate(
        decom_train['keywords'].values)).value_counts()[:n_titles].index))
enc = MultiLabelBinarizer()
enc_gnr = MultiLabelBinarizer()
enc_dtl_gnr = MultiLabelBinarizer()
enc_title = MultiLabelBinarizer()
enc.fit([undup_tags])
enc_gnr.fit([undup_gnr])
enc_dtl_gnr.fit([undup_dtl_gnr])
enc_title.fit([undup_title])


class MF():
    def __init__(self, rating_mat, dim_latent, l2, alpha, l_rate, n_epochs):
        self.r_mat = rating_mat
        self.n_users, self.n_items = rating_mat.shape
        self.dim_latent = dim_latent
        self.l2 = l2
        self.alpha = alpha
        self.l_rate = l_rate
        self.n_epochs = n_epochs
Code Example #30
# Main Code
source_data_dir = '../train_data'
test_data_dir = '../test_data'
data_dir = 'copied'
print('pre stage start')
print('read data start')
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
copy_data(source_data_dir, data_dir)
log_collection = read_data(data_dir)
test_data = read_data(test_data_dir)
print('read data end')
print('prepare data start')
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(log_collection['labels'])

y_train = multilabel_binarizer.transform(log_collection['labels'])
y_test = multilabel_binarizer.transform(test_data['labels'])
X_train = log_collection['clean_text']
X_test = test_data['clean_text']
id_train = np.arange(len(log_collection['clean_text']))
id_test = np.arange(len(test_data['clean_text']))
X_train, X_test, tfidf_vocab = prepare_data(X_train, X_test)
tfidf_reversed_vocab = {i: word for word, i in tfidf_vocab.items()}

print('prepare data end')

# Training
print('train model start')
model = train(X_train, y_train)
Code Example #31
#     image_ids.append(path[start:end])
print('image_ids:', image_ids)

data = pd.read_csv("MovieGenre.csv", encoding="ISO-8859-1")
y = []
parsed_movies = []

classes = utils.list_genres(7)
# classes = set(classes)
print('classes:', classes)
print(len(classes))
y = utils.list_movies(classes, image_ids)
print('y:', y)
mlb = MultiLabelBinarizer()
#print('mlb:',mlb)
mlb.fit(y)
#print('mlb:',mlb)
y = mlb.transform(y)
print('y:', y)
print('y.shape:', y.shape)
x = []
x.append(utils.get_image(img_path))
x = np.asarray(x)

print('x.shape:', x.shape)


def predict(X):
    init = tf.global_variables_initializer()

    x, _ = utils.create_placeholders(150, 150, 3, 7)
Code Example #32
count=0

for i in range(0,len(training_all)):
    
    image_id=training_all[i][0]
    count +=1    
    label_str=training_all[i][1]
    cur_label_list=label_str.split()

    cur_label_list=tuple([int(ff) for ff in cur_label_list])
    label_list.append(cur_label_list)
    image_id_list.append(image_id)
        

mlb = MultiLabelBinarizer()
mlb.fit(label_list)



X_train_name, X_test_name, y_train_label, y_test_label = train_test_split\
    (image_id_list, label_list, test_size=0.015)
    

train_label=mlb.transform(y_train_label)

y_test=mlb.transform(y_test_label)


bb_image=[]
for i_name in X_test_name:
    cur_image=load_image(basepath,i_name)    
Code Example #33
class DataGenerator(Sequence):
    def __init__(self, data_path, tokenize_func, options, bg_data_path=None, bg_sample_rate=1., max_chars=None, label_encoder=None):
        texts, labels = load_data(data_path, options, max_chars=max_chars)
        self.num_examples = len(texts)
        self.batch_size = options.batch_size
        #self.seq_len = options.seq_len
        self.X = tokenize_func(texts)

        if label_encoder is None:
            self.label_encoder = MultiLabelBinarizer()
            self.label_encoder.fit(labels)
        else:
            self.label_encoder = label_encoder

        self.Y = self.label_encoder.transform(labels)
        self.num_labels = len(self.label_encoder.classes_)

        if bg_data_path is not None:
            self.bg_sample_rate = bg_sample_rate
            self.bg_num_examples, self.bg_X, self.bg_Y = [], [], []
            for path in bg_data_path.split():
                bg_texts, bg_labels = load_data(path, options, max_chars=max_chars)
                self.bg_num_examples.append(len(bg_texts))
                self.bg_X.append(tokenize_func(bg_texts))

                self.bg_Y.append(self.label_encoder.transform(bg_labels))
            #self.bg_num_labels = len(self.label_encoder.classes_)
            self.bg_num_corpora = len(self.bg_num_examples)
        else:
            self.bg_sample_rate = 0
            self.bg_num_examples = [0]
            self.bg_num_corpora = 0

        self.on_epoch_end()


    def on_epoch_end(self):
        self.indexes = np.arange(self.num_examples)
        np.random.shuffle(self.indexes)

        if self.bg_sample_rate > 0:
            if hasattr(self, "bg_indexes"):
                for i, bg_indexes in enumerate(self.bg_indexes):
                    seen_bg_idxs = bg_indexes[:len(self.indexes)]
                    unseen_bg_idxs = bg_indexes[len(self.indexes):]
                    np.random.shuffle(seen_bg_idxs)
                    self.bg_indexes[i] = np.concatenate([unseen_bg_idxs, seen_bg_idxs])
            else:
                self.bg_indexes = [np.arange(x) for x in self.bg_num_examples]
                for i,_ in enumerate(self.bg_indexes):
                    np.random.shuffle(self.bg_indexes[i])

        self.index = 0


    def __len__(self):
        return int((self.num_examples//self.batch_size)*(1+self.bg_sample_rate)) * self.bg_num_corpora

    def __getitem__(self, index):
        if np.random.random() <= 1/(self.bg_sample_rate+self.bg_num_corpora):
            batch_indexes = self.indexes[self.index*self.batch_size:(self.index+1)*self.batch_size]
            self.index += 1
            X, Y = self.X, self.Y
        else:
            i = np.random.randint(0, self.bg_num_corpora)
            try:
                batch_indexes = self.bg_indexes[i][self.index*self.batch_size:(self.index+1)*self.batch_size]
            except IndexError:
                end = ((self.index+1)*self.batch_size) % len(self.bg_indexes[i])
                if end < self.batch_size:
                    end = self.batch_size
                    beg = 0
                else:
                    beg = end-self.batch_size
                batch_indexes = self.bg_indexes[i][beg:end]

            X, Y = self.bg_X[i], self.bg_Y[i]

        batch_X = {}
        for key in self.X:
            batch_X[key] = np.empty((self.batch_size, *X[key].shape[1:]))
            for j, idx in enumerate(batch_indexes):
                batch_X[key][j] = X[key][idx]

        batch_y = np.empty((self.batch_size, *Y.shape[1:]), dtype=int)
        for j, idx in enumerate(batch_indexes):
            batch_y[j] = Y[idx]

        return batch_X, batch_y
Code Example #34
def load_data(train_set):
    X_data = []
    y_data = []
    for c, (vector, target) in enumerate(train_set):
        X_data.append(vector)
        y_data.append(target)
        if c % 10000 == 0:
            print(c)
    print(len(X_data), 'training examples')

    class_freqs = Counter([y for y_seq in y_data for y in y_seq]).most_common()
    class_list = [y[0] for y in class_freqs]
    nb_classes = len(class_list)
    print(nb_classes, 'classes')
    class_dict = dict(zip(class_list, np.arange(len(class_list))))

    with open('data_path_save/attention_blstm/class_dict.pkl', 'wb') as fp:
        pickle.dump(class_dict, fp)
    print('Exported class dictionary')

    y_data_int = []
    for y_seq in y_data:
        y_data_int.append([class_dict[y] for y in y_seq])

    X_data_flat = []
    for raw_text in X_data:
        flat_text = []
        for sent in raw_text:
            flat_text.extend(sent)
        X_data_flat.append(flat_text)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token=1)
    tokenizer.fit_on_texts(X_data_flat)
    X_data_int = np.zeros((len(X_data), MAX_SEQ_LEN, MAX_SENT_LENGTH))
    for idx, raw_text in enumerate(X_data):
        sents_batch = np.zeros((MAX_SEQ_LEN, MAX_SENT_LENGTH))
        tokens = tokenizer.texts_to_sequences(raw_text)
        sents = pad_sequences(tokens,
                              maxlen=MAX_SENT_LENGTH,
                              padding='post',
                              truncating='post',
                              dtype='float32')
        for j, sent in enumerate(sents):
            if j >= MAX_SEQ_LEN:
                break
            sents_batch[j, :] = sent
        X_data_int[idx, :, :] = sents_batch
    X_data = X_data_int
    print('Shape of data tensor:', X_data.shape)

    word_index = tokenizer.word_index
    print('Found %s unique tokens' % len(word_index))
    with open('data_path_save/attention_blstm/word_index.json', 'w') as fp:
        json.dump(word_index, fp)
    print('Exported word dictionary')

    mlb = MultiLabelBinarizer()
    mlb.fit([class_dict.values()])
    y_data = mlb.transform(y_data_int)
    print('Shape of label tensor:', y_data.shape)

    X_train, X_val, y_train, y_val = train_test_split(X_data,
                                                      y_data,
                                                      train_size=0.8,
                                                      test_size=0.2,
                                                      random_state=42)

    return X_train, X_val, y_train, y_val, nb_classes, word_index
コード例 #35
def select_data(XX,YY, ctype, min_samples, outputfolder):
    # convert multilabel to multi-hot
    mlb = MultiLabelBinarizer()

    if ctype == 'diagnostic':
        X = XX[YY.diagnostic_len > 0]
        Y = YY[YY.diagnostic_len > 0]
        mlb.fit(Y.diagnostic.values)
        y = mlb.transform(Y.diagnostic.values)
    elif ctype == 'subdiagnostic':
        counts = pd.Series(np.concatenate(YY.subdiagnostic.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.subdiagnostic = YY.subdiagnostic.apply(lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['subdiagnostic_len'] = YY.subdiagnostic.apply(lambda x: len(x))
        X = XX[YY.subdiagnostic_len > 0]
        Y = YY[YY.subdiagnostic_len > 0]
        mlb.fit(Y.subdiagnostic.values)
        y = mlb.transform(Y.subdiagnostic.values)
    elif ctype == 'superdiagnostic':
        counts = pd.Series(np.concatenate(YY.superdiagnostic.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.superdiagnostic = YY.superdiagnostic.apply(lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['superdiagnostic_len'] = YY.superdiagnostic.apply(lambda x: len(x))
        X = XX[YY.superdiagnostic_len > 0]
        Y = YY[YY.superdiagnostic_len > 0]
        mlb.fit(Y.superdiagnostic.values)
        y = mlb.transform(Y.superdiagnostic.values)
    elif ctype == 'form':
        # filter
        counts = pd.Series(np.concatenate(YY.form.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.form = YY.form.apply(lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['form_len'] = YY.form.apply(lambda x: len(x))
        # select
        X = XX[YY.form_len > 0]
        Y = YY[YY.form_len > 0]
        mlb.fit(Y.form.values)
        y = mlb.transform(Y.form.values)
    elif ctype == 'rhythm':
        # filter 
        counts = pd.Series(np.concatenate(YY.rhythm.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.rhythm = YY.rhythm.apply(lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['rhythm_len'] = YY.rhythm.apply(lambda x: len(x))
        # select
        X = XX[YY.rhythm_len > 0]
        Y = YY[YY.rhythm_len > 0]
        mlb.fit(Y.rhythm.values)
        y = mlb.transform(Y.rhythm.values)
    elif ctype == 'all':
        # filter 
        counts = pd.Series(np.concatenate(YY.all_scp.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.all_scp = YY.all_scp.apply(lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['all_scp_len'] = YY.all_scp.apply(lambda x: len(x))
        # select
        X = XX[YY.all_scp_len > 0]
        Y = YY[YY.all_scp_len > 0]
        mlb.fit(Y.all_scp.values)
        y = mlb.transform(Y.all_scp.values)
    else:
        pass

    # save LabelBinarizer
    with open(outputfolder+'mlb.pkl', 'wb') as tokenizer:
        pickle.dump(mlb, tokenizer)

    return X, Y, y, mlb
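
Every branch above repeats the same filter / select / binarise steps on a different label column (only the 'diagnostic' branch skips the frequency filter). Below is a hedged refactoring sketch of that shared pattern; the helper name filter_and_binarize and its label_col argument are illustrative and not part of the original code.

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

def filter_and_binarize(XX, YY, label_col, min_samples):
    # keep only labels occurring more than min_samples times
    counts = pd.Series(np.concatenate(YY[label_col].values)).value_counts()
    counts = counts[counts > min_samples]
    YY[label_col] = YY[label_col].apply(
        lambda x: list(set(x).intersection(set(counts.index.values))))
    YY[label_col + '_len'] = YY[label_col].apply(len)
    # drop rows that no longer carry any label, then multi-hot encode the rest
    X = XX[YY[label_col + '_len'] > 0]
    Y = YY[YY[label_col + '_len'] > 0]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(Y[label_col].values)
    return X, Y, y, mlb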
コード例 #36
ファイル: data_torch.py プロジェクト: doublechenching/hpi
class HumanDataset(Dataset):
    def __init__(self,
                 images_df,
                 base_path,
                 target_shape=(512, 512),
                 augument=True,
                 use_yellow=False,
                 mode="train"):
        if not isinstance(base_path, pathlib.Path):
            base_path = pathlib.Path(base_path)

        self.images_df = images_df.copy()
        self.augument = augument
        self.images_df.Id = self.images_df.Id.apply(lambda x: base_path / x)
        self.mlb = MultiLabelBinarizer(classes=np.arange(0, cfg.num_classes))
        self.mlb.fit(np.arange(0, cfg.num_classes))
        self.mode = mode
        self.target_shape = target_shape
        self.use_yellow = use_yellow

    def __len__(self):
        return len(self.images_df)

    def __getitem__(self, index):
        X = self.read_images(index)
        if not self.mode == "test":
            labels = np.array(
                list(map(int, self.images_df.iloc[index].Target.split(' '))))
            y = np.eye(cfg.num_classes, dtype=np.float)[labels].sum(axis=0)
        else:
            y = str(self.images_df.iloc[index].Id.absolute())

        if self.augument:
            X = self.augumentor(X)

        X = T.Compose([T.ToPILImage(), T.ToTensor()])(X)

        return X.float(), y

    def read_images(self, index):
        row = self.images_df.iloc[index]
        filename = str(row.Id.absolute())
        if 'ENSG' in filename:
            filename = os.path.split(filename)[-1]
            filename = os.path.join(cfg.extra_data, filename)
            images = np.array(Image.open(filename + ".png"))

        else:
            r = np.array(Image.open(filename + "_red.png"))
            g = np.array(Image.open(filename + "_green.png"))
            b = np.array(Image.open(filename + "_blue.png"))
            images = [r, g, b]
            if self.use_yellow:
                y = np.array(Image.open(filename + "_yellow.png"))
                images.append(y)
            images = np.stack(images, axis=-1)

        images = images.astype(np.uint8)

        if self.target_shape == (512, 512) and images.shape[:2] == (512, 512):
            return images
        else:
            return cv2.resize(images, self.target_shape)

    def augumentor(self, image):
        sometimes = lambda aug: iaa.Sometimes(0.8, aug)
        augment_img = iaa.Sequential([
            iaa.Fliplr(0.5),
            iaa.Flipud(0.5),
            iaa.BilateralBlur(),
            iaa.Affine(rotate=90),
            iaa.ContrastNormalization((0.8, 1.3)),
            sometimes(
                iaa.Affine(scale={
                    "x": (0.8, 1.2),
                    "y": (0.8, 1.2)
                },
                           translate_percent={
                               "x": (-0.1, 0.1),
                               "y": (-0.1, 0.1)
                           },
                           rotate=(-30, 30),
                           shear=(-5, 5)))
        ],
                                     random_order=True)

        image_aug = augment_img.augment_image(image)

        return image_aug
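
For reference, the one-hot sum used in __getitem__ (np.eye(cfg.num_classes)[labels].sum(axis=0)) and the binarizer set up in __init__ produce the same multi-hot target when a sample's label ids are unique. A small hedged check; num_classes=8 and the example labels are stand-ins for cfg.num_classes and a real Target string.

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

num_classes = 8                       # stand-in for cfg.num_classes
labels = np.array([2, 5])             # e.g. parsed from Target == "2 5"

# multi-hot target built by summing one-hot rows, as in __getitem__
y_eye = np.eye(num_classes)[labels].sum(axis=0)

# the same target via the binarizer from __init__
mlb = MultiLabelBinarizer(classes=np.arange(num_classes))
mlb.fit(np.arange(num_classes))       # y is ignored here because classes are given explicitly
y_mlb = mlb.transform([labels])[0]

assert np.array_equal(y_eye.astype(int), y_mlb)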
コード例 #37
ファイル: rnd.py プロジェクト: jonnybazookatone/ace
    keywords_test = keywords.pop(-1)

    # We want to convert the labels into vectors. For example, if we have:
    # keywords = [
    #             ['solar', 'physics', 'astronomy'],
    #             ['physics', 'lasers'],
    #             ['astronomy']
    #           ]
    # this would become:
    # keywords_binarised = [
    #             [1, 1, 1, 0],
    #             [0, 1, 0, 1],
    #             [0, 0, 1, 0]
    #           ]
    mlb = MultiLabelBinarizer()
    mlb.fit(keywords)
    keywords_vector = mlb.transform(keywords)

    # We generate a transform from words -> vector space. This is very similar
    # to the above conversion of the keywords. In this scenario, the entire
    # corpus from our training set is converted into an id -> word sparse-
    # matrix.
    bow_transform = CountVectorizer(analyzer=text_to_vector).fit(' '.join(text))

    # We transform our corpus into the unique vector space
    bow_vector = bow_transform.transform(text)

    # We convert the vector into a term frequency - inverse document frequency
    # Term frequency: f_t (the number of times term t appears in a document)
    # Inverse document frequency: log(N/n_t) (the number of documents divided by
    #                                         the number of documents that
    #                                         contain term t)
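
A quick, self-contained check of the keyword binarisation described in the comments above, using the same scikit-learn MultiLabelBinarizer. Note that the fitted classes_ are sorted, so the columns come out in the order ['astronomy', 'lasers', 'physics', 'solar'] rather than the first-seen ordering sketched in the comment.

from sklearn.preprocessing import MultiLabelBinarizer

keywords = [
    ['solar', 'physics', 'astronomy'],
    ['physics', 'lasers'],
    ['astronomy'],
]

mlb = MultiLabelBinarizer()
keywords_vector = mlb.fit_transform(keywords)

print(mlb.classes_)       # ['astronomy' 'lasers' 'physics' 'solar']
print(keywords_vector)
# [[1 0 1 1]
#  [0 1 1 0]
#  [1 0 0 0]]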
コード例 #38
        # Save and close session
        logger.info("Commiting changes to database.")
        try:
            db_session.commit()
            db_session.close()
        except:
            db_session.rollback()
            raise

    # Get the features into X, and multilabel y indicator format
    # -------------------------------------------------------------------- #
    logger.info("Preparing training and input interactions.")
    X_train, y_train = interactions_to_Xy_format(training.all(), selection)
    X_test, _ = interactions_to_Xy_format(testing, selection)
    mlb = MultiLabelBinarizer(classes=sorted(labels))
    mlb.fit(y_train)
    y_train = mlb.transform(y_train)

    logger.info("Computing class distributions.")
    counter = {l: int(c) for l, c in zip(mlb.classes, y_train.sum(axis=0))}
    counter["n_samples"] = int(y_train.shape[0])
    json.dump(counter,
              fp=open("{}/training_distribution.json".format(direc), 'w'),
              indent=4,
              sort_keys=True)

    logger.info("Computing usable feature proportions in testing samples.")

    def separate_features(row):
        features = row[0].upper().split(',')
        interpro = set(term for term in features if 'IPR' in term)
コード例 #39
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
	all_labels = ['tsunami', 'heat_wave', 'cold_wave', 'forest_fire', 'limnic_erruptions', \
				'storm', 'avalanches', 'blizzard', 'earthquake', 'floods', 'hurricane', \
				'drought', 'volcano', 'fire', 'cyclone', 'hail_storms', 'land_slide', \
				'intensity', 'epicentre', 'temperature', 'depth', 'speed', 'magnitude', \
				'terrorist_attack', 'suicide_attack', 'normal_bombing', 'shoot_out', \
				'aviation_hazard', 'train_collision', 'industrial_accident', \
				'vehicular_collision', 'surgical_strikes', 'transport_hazards', 'riots', \
				'epidemic', 'famine', 'time', 'place', 'type', 'reason', 'after_effects', \
				'casualties', 'name', 'participant']
	disaster_labels = ['tsunami', 'heat_wave', 'cold_wave', 'forest_fire', 'limnic_erruptions', \
				'storm', 'avalanches', 'blizzard', 'earthquake', 'floods', 'hurricane', \
				'drought', 'volcano', 'fire', 'cyclone', 'hail_storms', 'land_slide', \
				'intensity', 'epicentre', 'temperature', 'depth', 'speed', 'magnitude', \
				'time', 'place', 'type', 'reason', 'after_effects', \
				'casualties', 'name', 'participant']
	health_labels = ['epidemic', 'famine', 'time', 'place', 'type', 'reason', 'after_effects', \
				'casualties', 'name', 'participant']
	conflict_labels = ['terrorist_attack', 'suicide_attack', 'normal_bombing', 'shoot_out', \
				'aviation_hazard', 'train_collision', 'industrial_accident', \
				'vehicular_collision', 'surgical_strikes', 'transport_hazards', 'riots', \
				'time', 'place', 'type', 'reason', 'after_effects', \
				'casualties', 'name', 'participant']
	import numpy as np
	curr_labels = set(all_labels)

	trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
	curr_labels = []
	for l in trainLabels:
		curr_labels.extend(l)
	curr_labels = set(curr_labels)
	testLabels = [list(set(l).intersection(curr_labels))for l in testLabels]

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer(classes=list(curr_labels))
	train_label_matrix = mlb.fit(trainLabels)
	print("Labels : ", mlb.classes_)
	train_label_matrix = mlb.transform(trainLabels)
	test_label_matrix = mlb.transform(testLabels)
	print("Shape of label matrix : ", test_label_matrix.shape)

	train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
	test_matrix = tfidf.transform(testSentences)
	print("Shape of sentence matrix : ", test_matrix.shape)


	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	from sklearn.ensemble import RandomForestClassifier
	estimator = LinearSVC()
	# estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs = -1)
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, train_label_matrix)
	predictions = classifier.predict(test_matrix)

	from sklearn.metrics import f1_score, precision_score, recall_score
	print("All-Precision", precision_score(test_label_matrix, predictions, average=None))
	print("All-Recall", recall_score(test_label_matrix, predictions, average=None))
	print("All-F1", f1_score(test_label_matrix, predictions, average=None))
	print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
	print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
	print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
	print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
	print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
	print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
コード例 #40
#freq_words(train['clean_plot'], 50)
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)


train['clean_plot'] = train['clean_plot'].apply(lambda x: remove_stopwords(x))

#freq_words(train['clean_plot'], 50)

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(train['Genre'])

Y = multilabel_binarizer.transform(train['Genre'])

X_train, X_test, y_train, y_test = train_test_split(
    train['clean_plot'],
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=np.random.randint(1000))
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33) # 0.25 x 0.8 = 0.2

X_train.shape
y_train.shape
#X_val.shape
#y_val.shape
コード例 #41
ファイル: data.py プロジェクト: remstef/neuralnetworking
class NewsGroupDataset(torch.utils.data.Dataset):

    '''
    maxlength = size of padding
    '''
    def __init__(self, subset='all',  maxlength = 30):
      super(NewsGroupDataset, self).__init__()
      self.maxlength = maxlength
      self.subset = subset
      self.file_name = "20_newsgroup_normalized_" + subset
      self.samples, self.labels = self.load_data()
      self.samples_vectors = self.load_samples_vectors()


    def load_data(self):
      messages = fetch_20newsgroups(subset=self.subset, remove=('headers','footers','quotes'), shuffle=True, random_state=42)
      # do some preprocessing if preprocessed file does not exist
      if not os.path.isfile(self.file_name):
        #import spacy; nlp=spacy.load('en')
        print('Applying spacy.')
        import en_core_web_sm
        nlp = en_core_web_sm.load()

        def normalize(message):
          doc = nlp(message)
          normalized = doc
          normalized = filter(lambda t : t.is_alpha and not t.is_stop, doc)
          normalized = map(lambda t : t.text, normalized)
          normalized = list(normalized)
          return normalized

        messages_normalized = list(map(normalize, messages.data))
        with open(self.file_name, 'wb') as f:
          pickle.dump(messages_normalized, f)

      with open(self.file_name, 'rb') as f:
        messages_normalized = pickle.load(f)

      samples = messages_normalized
      labels = messages.target

      self.classes = list(set(labels))
      self.classes.sort()   #Otherwise it is inconsistent which class is 0 and which is 1
      self.oh_classes = MultiLabelBinarizer()
      self.oh_classes.fit([[label] for label in self.classes])
      self.classes = dict([(l,i) for (i,l) in enumerate(self.oh_classes.classes_)])

      return samples, labels

    def __len__(self):
      return len(self.samples)

    """Returns (item as matrix of token vectors, label, index) instead of the normal (item, label). See constructor for more info"""
    def __getitem__(self, index):
      msg = self.samples[index]
      label = self.classes[self.labels[index]]   #Convert label to integer. Is this slow?
      oh_label = self.oh_classes.transform([[self.labels[index]]])
      oh_label = oh_label[0]
      #mat = np.zeros( (self.maxlength, self.ftmodel.numpy_normalized_vectors.shape[1]) )
      mat = np.zeros( (self.maxlength, 300) )   #Hard coded for the wikipedia data that has dim=300


      for i, token in enumerate(msg):
        if i >= self.maxlength:
          break
        if not emb.containsWord(token):
          continue
        v = emb.getVector(token)
        mat[i,:] = v

      return ( torch.tensor(mat), label, oh_label,  index )

    def get_sample(self, index):
        return self.samples[index]
コード例 #42
ファイル: idbusiness2.py プロジェクト: TELSER1/yelp_recruit
#    val_ids=val_ids.merge(labels,on='photo_id',how='inner')
    mlb=LabelEncoder()
    mlb.fit(train_ids['business_id'].tolist())
#    X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32)
#    X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32)
    return train_ids,mlb
def load_train(train_list):
    return(np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_list]).astype(np.float32)/255.0)
train_ids,mlb=load_data()
labels=pd.read_csv("train.csv")
labels=labels[pd.isnull(labels['labels'])==False].reset_index(drop=True)
labels['assignment']=np.random.uniform(size=(labels.shape[0],1))

MLB=MultiLabelBinarizer()
train_ids=train_ids.merge(labels[['business_id','assignment']],on='business_id',how='left')
MLB.fit(train_ids['labels'].tolist()) 
labels['labels']=labels['labels'].map(lambda x:[int(i) for i in x.split(" ")])
BETA=MLB.transform(labels.sort('business_id')['labels'])
val_ids=train_ids[train_ids['assignment']>=.9].reset_index(drop=True)
val_Y=MLB.transform(val_ids['labels'])
train_ids=train_ids[train_ids['assignment']<.9].reset_index(drop=True)
Y_test=mlb.transform(val_ids['business_id'].tolist())
print Y_test.shape
np.random.seed(42)
#train_ids=train_ids.sort('business_id').reset_index(drop=True)
train_ids.reindex(np.random.permutation(train_ids.index))
val_ids.reindex(np.random.permutation(val_ids.index))
validate=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()[0:10000]]).astype(np.float32)/255.0

datagen = ImageDataGenerator(
    featurewise_center=True,
コード例 #43
class Evaluator(object):
    def __init__(self, config, model):
        self.config = config
        if config.model_name.endswith("flat"):
            self.n_classes = config.fn_classes
        else:
            self.n_classes = config.hn_classes
        self.model = model
        self.loss = model.loss
        self.logits = model.logits
        self.mlb = MultiLabelBinarizer()
        if not self.config.model_name.endswith("flat"):
            self.preds = model.preds
            self.scores = model.scores
            if config.data_from == "reuters":
                self.mlb.fit([[
                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17, 18, 19, 20
                ]])
            elif config.data_from == "20newsgroup":
                self.mlb.fit([[
                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
                ]])
            elif config.data_from == "ice":
                hcl_ids = [_ for _ in range(self.config.EOS + 1)]
                self.mlb.fit([hcl_ids])
        else:
            if config.data_from == "reuters":
                self.mlb.fit([[
                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17
                ]])  # for transforming preds
            elif config.data_from == "20newsgroup":
                self.mlb.fit([[
                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17, 18, 19
                ]])
            elif config.data_from == "ice":
                hcl_ids = [_ for _ in range(0, self.config.EOS - 2)]
                self.mlb.fit([hcl_ids])

    def get_metric(self,
                   preds,
                   labels,
                   average=None,
                   about="all",
                   data_type="dev"):
        precisions, recalls, fscores, _ = precision_recall_fscore_support(
            labels, preds, average=average)
        if about == "all":
            print('%s average precision recall f1-score: %f %f %f' %
                  (average, precisions, recalls, fscores))
        f1_summary = tf.Summary(value=[
            tf.Summary.Value(tag='{}:{}:{}/f1'.format(data_type, about,
                                                      average),
                             simple_value=fscores)
        ])
        self.summaries.append(f1_summary)

    def get_evaluation(self, sess, batch):
        batch_idx, batch_ds = batch
        feed_dict = self.model.get_feed_dict(batch, False)
        if self.config.model_name.endswith("flat"):
            test_size = batch_ds.get_data_size()
            logits, loss = sess.run([self.model.prob, self.loss],
                                    feed_dict=feed_dict)
            # print("logits:", logits)
            preds = np.array([[i for i in range(self.n_classes)]
                              for _ in range(test_size)])
            # print("preds:", preds.shape)
            preds = prediction_with_threshold(self.config,
                                              preds,
                                              logits,
                                              threshold=self.config.thred)
            print("preds:", preds[0:2])
            preds = self.mlb.transform(preds)
            if self.config.data_from == "20newsgroup":
                labels = batch_ds.data["y_f"]
            elif self.config.data_from == "reuters":
                labels = batch_ds.data["y_seqs"]
            elif self.config.data_from == "ice":
                labels = batch_ds.data["y_f"]
            if self.config.data_from != "ice":
                labels = self.mlb.transform(labels)
            return preds, labels

        else:
            preds, scores = sess.run([self.preds, self.scores],
                                     feed_dict=feed_dict)
            # print("check eval:", preds[0,:], scores[0,:], preds.shape, scores.shape)   # why test is not fixed?   cause keep_prob
            preds = prediction_with_threshold(self.config,
                                              preds,
                                              scores,
                                              threshold=self.config.thred)
            preds_log = preds
            preds = self.mlb.transform(preds)
            if self.config.data_from == "ice":
                labels = batch_ds.data["y_h"]
                labels_log = labels
                labels = self.mlb.transform(labels)
            elif self.config.data_from == "20newsgroup":
                labels = batch_ds.data["y_h"]
                labels_log = labels
            else:
                labels = batch_ds.data["y_seqs"]
                labels_log = labels
            print("check eval:", "\n", preds_log[0:3], "\n", labels_log[0:3])
            return preds, labels

    def get_evaluation_from_batches(self, sess, batches):
        config = self.config
        elist = [self.get_evaluation(sess, batch) for batch in batches]
        preds = [elem[0] for elem in elist]
        labels = [elem[1] for elem in elist]
        preds = np.concatenate(preds, axis=0)
        labels = np.concatenate(labels, axis=0)
        # print("preds, labels:", preds[0,:], labels[0,:], len(preds[0,:]), len(labels[0,:]))
        return Evaluation(config, preds, labels)
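
prediction_with_threshold is referenced above but not shown; the sketch below is a hedged guess at that kind of step (keep every class whose score exceeds a threshold, fall back to the argmax when nothing does) and is not the project's actual implementation. It feeds the same mlb.transform call used in get_evaluation.

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

def threshold_predictions(scores, threshold=0.5):
    # scores: (batch, n_classes) probabilities -> list of label-id lists
    preds = []
    for row in scores:
        chosen = np.where(row > threshold)[0].tolist()
        if not chosen:                          # guarantee at least one label per sample
            chosen = [int(row.argmax())]
        preds.append(chosen)
    return preds

scores = np.array([[0.1, 0.8, 0.6], [0.2, 0.3, 0.1]])
mlb = MultiLabelBinarizer()
mlb.fit([[0, 1, 2]])                            # same style of fit as in Evaluator.__init__
print(mlb.transform(threshold_predictions(scores)))
# [[0 1 1]
#  [0 1 0]]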
コード例 #44
chunks = []
for chunk in reader:
    chunk.dropna(inplace=True) 
    chunks.append(chunk)

test = pd.concat(chunks)

del(chunks)

# Split the tags by spaces
train_labels = train['Tags'].map(lambda x: x.split())
test_labels = test['Tags'].map(lambda x: x.split())

# The label binarizer takes all the tags and turns them into a big sparse matrix
mlb = MultiLabelBinarizer()
mlb.fit(pd.concat([train_labels, test_labels]))
labels = mlb.transform(train_labels)

# Turn the tokens into a sparse matrix
vect = CountVectorizer(
    # Get text from html
    preprocessor = preprocess,
    # Turn the text into tokens
    tokenizer = tokenize,
    # Generate ngrams
    ngram_range = (1, 2),
    # Remove extremely common tokens
    max_df = 0.5,
    # Remove extremely uncommon tokens
    min_df = 0.001
)
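
The comment above describes the result as a big sparse matrix; by default MultiLabelBinarizer.transform actually returns a dense ndarray. When the tag vocabulary is large, passing sparse_output=True (a standard constructor flag, not shown in the snippet) keeps the indicator matrix in scipy sparse format. A toy sketch with made-up tags:

from sklearn.preprocessing import MultiLabelBinarizer

train_labels = [['python', 'pandas'], ['c++']]           # stand-ins for the real tag lists
test_labels = [['python'], ['c++', 'templates']]

mlb = MultiLabelBinarizer(sparse_output=True)
mlb.fit(train_labels + test_labels)
labels = mlb.transform(train_labels)                     # scipy.sparse matrix, shape (2, 4)
print(labels.toarray())
# [[0 1 1 0]
#  [1 0 0 0]]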
コード例 #45
query = 'SELECT DISTINCT SUBSTRING_INDEX(SUBSTRING_INDEX(t.components, \',\', n.n), \',\', -1) value ' \
        'FROM porru_dataset.' + project + ' t CROSS JOIN ' \
                                          '(' \
                                          'SELECT a.N + b.N * 10 + 1 n ' \
                                          'FROM ' \
                                          '(SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) a ' \
                                          ',(SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) b ' \
                                          'ORDER BY n ' \
                                          ') n ' \
                                          'WHERE n.n <= 1 + (LENGTH(t.components) - LENGTH(REPLACE(t.components, \',\', \'\'))) ' \
                                          'ORDER BY value'

cursor.execute(query)
componentList = numpy.array(cursor.fetchall())
mlb = MultiLabelBinarizer()
mlb.fit(componentList)
print 'No. of label: ' + str(len(mlb.classes_))
componentLabel = [None] * len(data)
for i in range(len(data)):
    componentLabel[i] = data[i, 5].split(',')
componentBinary = mlb.transform(componentLabel)

# train MultiLabelBinarizer for Type
query = 'SELECT DISTINCT type FROM porru_dataset.' + project
cursor.execute(query)
typeList = numpy.array(cursor.fetchall())
mlb_t = MultiLabelBinarizer()
mlb_t.fit(typeList)
print 'No. of label: ' + str(len(mlb_t.classes_))
typeLabel = [None] * len(data)
for i in range(len(data)):
コード例 #46
    a_=[loadbusimage(im_) for im_ in x]
    return(np.array(a_))
labels=pd.read_csv("train.csv")
labels=labels[pd.isnull(labels['labels'])==False]
bismatch=pd.read_csv("train_photo_to_biz_ids.csv")
photo_labels=bismatch.merge(labels,how='left',on='business_id')
photo_labels=photo_labels[pd.isnull(photo_labels['labels'])==False]
photo_labels['labels']=photo_labels['labels'].map(lambda x:[int(i) for i in x.split(" ")])
np.random.seed(42)
labels['assignment']=np.random.randint(0,10,size=(labels.shape[0],1))
photo_labels=photo_labels.merge(labels[['business_id','assignment']],on='business_id')
train=photo_labels[photo_labels['assignment']<=7].reset_index(drop=True)
test=photo_labels[photo_labels['assignment']>7].reset_index(drop=True)

mlb=MultiLabelBinarizer()
mlb.fit(train['labels'].tolist()+test['labels'].tolist())
#INSERT NORMALIZATION TRAINING HERE
n_images=10
graph = Graph()
nfilters=32
for i in xrange(0,n_images):
    graph.add_input(name="input"+str(i),input_shape=(3,size,size))
graph.add_shared_node(Convolution2D(nfilters, 3, 3, border_mode='same',activation='relu'),name='conv1',inputs=["input"+str(i) for i in xrange(0,10)])
graph.add_shared_node(BatchNormalization(),name='batch1',inputs=['conv1'])
graph.add_shared_node(Convolution2D(nfilters,3,3,activation=LeakyReLU()), name='conv2', inputs=['batch1'])
graph.add_shared_node(BatchNormalization(),name='batch2',inputs=['conv2'])
graph.add_shared_node(Convolution2D(nfilters,3,3,activation=LeakyReLU()), name='conv3', inputs=['batch2'])
graph.add_shared_node(BatchNormalization(),name='batch3',inputs=['conv3'])
graph.add_shared_node(Convolution2D(nfilters,3,3,activation=LeakyReLU()), name='conv4', inputs=['batch3'])
graph.add_shared_node(BatchNormalization(),name='batch4',inputs=['conv4'])
graph.add_shared_node(Convolution2D(nfilters,3,3,activation=LeakyReLU()), name='conv5', inputs=['batch4'])
コード例 #47
import tokenization
from extract_features import InputExample, convert_examples_to_features
import numpy as np
import requests,os,json,time
import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from config import Config
config = Config()

""" 1: 准备多标签处理工具,用于将概率转为文本标签 """
all_labels = open(config.class_path,encoding="utf-8").readlines()
all_labels = [label.strip() for label in all_labels]

mlb = MultiLabelBinarizer()
mlb.fit([[label] for label in all_labels])


"""2: 初始化tokenizer,用于文本到id的转换"""
vocab_file = os.environ.get('vocab_file', './pretrained_model/roberta_zh_l12/vocab.txt')
max_token_len = os.environ.get('max_token_len', 400)
tokenizer = tokenization.FullTokenizer( vocab_file=vocab_file, do_lower_case=True)

tf.app.flags.DEFINE_string('server', '0.0.0.0:8500', 'PredictionService host:port')
FLAGS = tf.app.flags.FLAGS



"""3: 将样本转化为符合的输入格式"""	
def preprocess(text):
コード例 #48
class PCXRayDataset(Dataset):
    def __init__(self, datadir, csvpath, splitpath=None, transform=None, views=["PA","L"],
                 dataset='train', pretrained=False, min_patients_per_label=50, 
                 counter_examples=False, duplicate=False):
        """
        Data reader. Only selects labels that at least min_patients_per_label patients have.
        """
        super(PCXRayDataset, self).__init__()

        assert dataset in ['train', 'val', 'test']

        self.datadir = datadir
        self.transform = transform
        self.pretrained = pretrained
        self.threshold = min_patients_per_label
        self.views = views
        self.counter_examples = counter_examples
        self.duplicate = duplicate

        self.df = pd.read_csv(csvpath)
        self.total_samples = len(self.df.PatientID.unique())

        self._build_labels()
        self.mb = MultiLabelBinarizer(classes=self.labels)
        self.mb.fit(self.labels)

        # Split into train or validation
        if splitpath is not None:
            with open(splitpath, 'rb') as f:
                train_ids, val_ids, test_ids = pickle.load(f)
            if dataset == 'train':
                self.df = self.df[self.df.PatientID.isin(train_ids)]
            elif dataset == 'val':
                self.df = self.df[self.df.PatientID.isin(val_ids)]
            else:
                self.df = self.df[self.df.PatientID.isin(test_ids)]

            self.df = self.df.reset_index()
            
        self.nb_classes = len(self.labels)
        
        if self.duplicate:
            self.to_duplicate = np.random.choice(range(self.total_samples), self.total_samples//2)
            self.nb_classes = self.nb_classes*2
        
    def __len__(self):
        return self.total_samples

    
    def get_labels(self, idx):
        subset = self.df[self.df.PatientID == self.df.PatientID[idx * 2]]
        labels = eval(subset.Clean_Labels.tolist()[0])
        if set(labels).difference(self.labels):
            labels.append('other')
        labels = [l for l in labels if l in self.labels]
        return labels
    
    def __getitem__(self, idx):

        subset = self.df[self.df.PatientID == self.df.PatientID[idx * 2]]
        labels = self.get_labels(idx)
        encoded_labels = self.mb.transform([labels]).squeeze()
        
        sample = {}
        if "PA" in self.views:
            pa_path = subset[subset.Projection == 'PA'][['ImageID', 'ImageDir']]
            pa_path = join(self.datadir, pa_path['ImageID'].tolist()[0])
            #pa_path = join(self.datadir,'216840111366964012989926673512011108125227151_00-185-152.png')
            pa_img = np.array(Image.open(pa_path))[..., np.newaxis]
            if self.pretrained:
                pa_img = np.repeat(pa_img, 3, axis=-1)
            sample["PA"] = pa_img

        if "L" in self.views:
            l_path = subset[subset.Projection == 'L'][['ImageID', 'ImageDir']]
            l_path = join(self.datadir, l_path['ImageID'].tolist()[0])
            # l_path = './data/processed/0/46523715740384360192496023767246369337_veyewt.png'
            l_img = np.array(Image.open(l_path))[..., np.newaxis]
            if self.pretrained:
                l_img = np.repeat(l_img, 3, axis=-1)
            sample["L"] = l_img

        if self.transform is not None:
            sample = self.transform(sample)

        sample['labels'] = labels
        sample['encoded_labels'] = torch.from_numpy(encoded_labels.astype(np.float32))
        sample['sample_weight'] = torch.max(sample['encoded_labels'] * self.labels_weights)
                        
        
        if self.counter_examples:
            
            if self.duplicate:
                new_encoded_labels = torch.zeros(self.nb_classes).long()
                
                # put the labels at the lower or upper copy
                if idx in self.to_duplicate:
                    new_encoded_labels[:self.nb_classes//2] = sample['encoded_labels']
                else:
                    new_encoded_labels[self.nb_classes//2:] = sample['encoded_labels']
                
                sample['encoded_labels'] = new_encoded_labels
               
            # pick a query label
            topresent = np.random.choice(range(len(sample['encoded_labels'])), p=self.labels_weights_dup)
            sample['cond'] = torch.LongTensor([topresent]).squeeze()
            
            # is the query in this sample?
            sample['cond_target'] = sample['encoded_labels'][topresent].long().squeeze()
            
            sample['cond_weight'] = self.labels_weights[sample['cond']//2]

        return sample["PA"], sample['cond_target'], sample['cond'], sample['cond_weight']

    def _build_labels(self):
        labels_dict = {}
        for labels in self.df.Clean_Labels:
            for label in eval(labels):
                label = label.strip()
                if label not in labels_dict:
                    labels_dict[label] = 0
                labels_dict[label] += 1

        labels = []
        labels_count = []
        other_counts = []
        for k, v in labels_dict.items():
            if v > self.threshold * 2:
                labels.append(k)
                labels_count.append(v)
            else:
                other_counts.append(v)
                
        labels.append('other')
        labels_count.append(sum(other_counts))
        
        self.labels = labels
        self.labels_count = labels_count
        self.labels_weights = torch.from_numpy(np.array([(len(self) / label)
                                                         for label in labels_count], dtype=np.float32))
        self.labels_weights = self.labels_weights/self.labels_weights.max()
        self.labels_weights = self.labels_weights**2
        self.labels_weights_dup = torch.cat([self.labels_weights]*2).numpy()
        self.labels_weights_dup /= self.labels_weights_dup.sum()
        #self.labels_weights = torch.clamp(self.labels_weights * 0.1, 1., 5.)
        self.nb_labels = len(self.labels)
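
A hedged numeric walkthrough of the weighting computed just above (the counts are made up): rare labels end up with weights near 1, frequent labels with much smaller weights, and the duplicated, normalised copy is the sampling distribution used for the query label in __getitem__.

import numpy as np
import torch

total_samples = 100
labels_count = [80, 40, 10, 5]                 # stand-in for self.labels_count

w = torch.tensor([total_samples / c for c in labels_count])
w = w / w.max()                                # -> [0.0625, 0.1250, 0.5000, 1.0000]
w = w ** 2                                     # -> [0.0039, 0.0156, 0.2500, 1.0000]

w_dup = torch.cat([w] * 2).numpy()
w_dup /= w_dup.sum()                           # probabilities used by np.random.choice
print(w, w_dup.sum())                          # w_dup sums to 1.0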
コード例 #49
def createMLB():
    labels_set = get_labels_set()
    mlb = MultiLabelBinarizer()
    mlb.fit(labels_set)
    return mlb