Example #1
    def __init__(self, config):
        """
        The constructor of the DataGenerator class. It loads the training
        labels and the images.

        Parameters
        ----------
            config: object
                a configuration object with the settings the dataloader
                needs (e.g. batch size, validation split)
        """
        cwd = os.getenv("DATA_PATH")
        if cwd is None:
            print("Set your DATA_PATH env first")
            sys.exit(1)
        self.config = config
        try:
            if self.config.augment:
                pass
        except AttributeError:
            self.config.augment = False

        # Read csv file
        tmp = pd.read_csv(os.path.abspath(os.path.join(cwd, 'train.csv')),
                          delimiter=',',
                          engine='python')
        # A vector of images id.
        image_ids = tmp["Id"]
        data_path = os.path.join(cwd, 'train')
        print(data_path)
        self.n = len(image_ids)

        # For each id, a sublist with the 4 filter filenames -> shape [n, 4]
        self.filenames = np.asarray([[
            os.path.join(cwd, 'train', id + '_' + c + '.png')
            for c in ['red', 'green', 'yellow', 'blue']
        ] for id in image_ids])
        # Labels
        self.labels = tmp["Target"].values
        # Convert labels to a one-hot representation:
        # before:       ['22 0', '12 23 0']
        # after split:  [['22', '0'], ['12', '23', '0']]
        # after binarizing: a one-hot matrix of shape (n_samples, 28)
        binarizer = MultiLabelBinarizer(classes=np.arange(28))
        self.labels = [[int(c) for c in l.split(' ')] for l in self.labels]
        self.labels = binarizer.fit_transform(self.labels)

        # Build a validation set
        try:
            self.train_filenames, self.val_filenames,\
                self.train_labels, self.val_labels = train_test_split(
                    self.filenames, self.labels,
                    test_size=self.config.val_split,
                    random_state=42)
        except AttributeError:
            print('WARN: val_split not set - using 0.1')
            self.train_filenames, self.val_filenames,\
                self.train_labels, self.val_labels = train_test_split(
                    self.filenames, self.labels,
                    test_size=0.1, random_state=42)

        print("Shape of training data: {}".format(self.train_filenames.shape))
        print("Shape of training labels: {}".format(self.train_labels.shape))

        # Get list of all possible images (incl. augmented if exist)
        data_train_folder = os.path.join(cwd, 'train')

        # Augment training data if specified in config file (and if possible)
        if self.config.augment:
            print("Getting augmented dataset...")
            filter_list = ['yellow', 'red', 'blue', 'green']
            aug_train_list = []
            aug_train_labels = []

            for i in range(0, self.train_filenames.shape[0]):
                filename = self.train_filenames[i][0] \
                    .rsplit('/')[-1].rsplit('_')[0]
                print("Augmenting {}".format(filename))
                temp_rot = []
                temp_rev = []
                counter = 1
                while True:
                    test_f = os.path.join(
                        data_train_folder,
                        filename + '_rot{}'.format(counter) + '_' +
                        filter_list[0] + '.png')
                    if os.path.isfile(test_f) is False:
                        break
                    temp_rot = [
                        os.path.join(
                            data_train_folder, filename +
                            '_rot{}'.format(counter) + '_' + f + '.png')
                        for f in filter_list
                    ]
                    temp_rev = [
                        os.path.join(
                            data_train_folder, filename +
                            '_rev{}'.format(counter) + '_' + f + '.png')
                        for f in filter_list
                    ]
                    flag = True
                    if SKIP_CHECK is False:
                        try:
                            for fname in temp_rev:
                                with open(fname, 'rb') as f:
                                    # Check header of file
                                    flag = flag and (f.read(4) == b'\x89PNG')
                            for fname in temp_rot:
                                with open(fname, 'rb') as f:
                                    # Check header of file
                                    flag = flag and (f.read(4) == b'\x89PNG')
                        except IOError as e:
                            print(e)
                            flag = False
                    if flag is True:
                        aug_train_list.append(temp_rot)
                        aug_train_labels.append(self.train_labels[i])
                        aug_train_list.append(temp_rev)
                        aug_train_labels.append(self.train_labels[i])
                    else:
                        print("corrupted images found")
                        print(temp_rot)
                        print(temp_rev)

                    counter += 1

            try:
                # Append list of all aug filenames to training set
                self.train_filenames = np.vstack(
                    (self.train_filenames, np.asarray(aug_train_list)))
                self.train_labels = np.vstack(
                    (self.train_labels, np.asarray(aug_train_labels)))
                # Append list of all aug filenames to 'all' set
                self.filenames = np.vstack(
                    (self.filenames, np.asarray(aug_train_list)))
                self.labels = np.vstack(
                    (self.labels, np.asarray(aug_train_labels)))
            # aug_train_list is empty (no aug data available)
            except ValueError:
                print('No augmented data found. Please augment first')

        # New label frequency
        print("New label distribution: {}".format(
            self.train_labels.sum(axis=0)))

        self.n_train = len(self.train_labels)
        self.n_val = len(self.val_labels)
        self.n = len(self.labels)

        if hasattr(config, 'random_state'):
            random_state = config.random_state
        else:
            random_state = 42
        np.random.seed(random_state)
        if hasattr(config, 'bootstrap_size'):
            n_samples = int(config.bootstrap_size * self.n_train)
            new_indices = resample(np.arange(self.n_train),
                                   n_samples=n_samples,
                                   random_state=random_state)
            self.train_filenames = self.train_filenames[new_indices]
            self.train_labels = self.train_labels[new_indices]
            self.n_train = len(self.train_labels)

        print('Size of training set is {}'.format(self.n_train))
        print('Size of validation set is {}'.format(self.n_val))
        # Compute class weights
        self.class_weights = (self.n_train) * np.reshape(
            1 / np.sum(self.train_labels, axis=0), (1, -1))
        # Number batches per epoch
        self.train_batches_per_epoch = int(
            (self.n_train - 1) / self.config.batch_size) + 1
        self.val_batches_per_epoch = int(
            (self.n_val - 1) / self.config.batch_size) + 1
        self.all_batches_per_epoch = int(
            (self.n - 1) / self.config.batch_size) + 1
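
The label-encoding step above is easy to verify in isolation. A minimal, self-contained sketch (the raw target strings are invented to mirror the comment in the snippet):

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Raw targets as they appear in train.csv: space-separated class ids
raw_targets = ['22 0', '12 23 0']
labels = [[int(c) for c in t.split(' ')] for t in raw_targets]

binarizer = MultiLabelBinarizer(classes=np.arange(28))
one_hot = binarizer.fit_transform(labels)
print(one_hot.shape)            # (2, 28)
print(one_hot[1, [0, 12, 23]])  # [1 1 1]: classes 0, 12 and 23 set for the second sample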
Example #2
def pedicting_tag(request):
    print('inside predicting tag')
    class lemmatokenizer(object):
        def __init__(self):
            self.stemmer = SnowballStemmer('english')
            self.token_pattern = r"(?u)\b\w\w+\b"       
    #         self.wnl = WordNetLemmatizer()
        def __call__(self,doc):                                                     # here, doc is one string sentence
            token_pattern = re.compile(self.token_pattern)
            return [self.stemmer.stem(t) for t in token_pattern.findall(doc)]       # return lambda doc: token_pattern.findall(doc) 
    #         return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


    vect_title = CountVectorizer(max_df=0.5,min_df=5,stop_words='english',tokenizer=lemmatokenizer(),ngram_range=(1,3))


    # In[9]:

    tfidf_vect_title = TfidfVectorizer(smooth_idf=False,max_df=0.5,min_df=5,stop_words='english',tokenizer=lemmatokenizer(),ngram_range=(1,3))


    le = preprocessing.LabelEncoder()  
    le.fit(y_labels) 
    d_set['label_num'] = pd.Series([le.transform(ast.literal_eval(i)) for i in d_set['tag']])
    d_set.head()


    new_y_labels = d_set['label_num'].values.tolist()

    mlb = MultiLabelBinarizer() 
    mlb.fit(new_y_labels)

    y_tag_dtm = mlb.transform(new_y_labels) 

    y_tag_dtm.shape


    # In[14]:

    X_labels = d_set['title'].values.tolist()

    # print (X_labels)


    # In[15]:

    vect_title.fit(X_labels)
    X_title_dtm = vect_title.transform(X_labels)

    X_title_dtm


    from sklearn.decomposition import PCA

    pca = PCA(n_components=100).fit(X_title_dtm.toarray())
    pca_samples = pca.transform(X_title_dtm.toarray())

    pca_df = pd.DataFrame(np.round(pca_samples,4))

    print (pca_df.head())


    # In[ ]:




    # In[17]:

    new_df = pd.DataFrame(X_title_dtm.toarray(),columns=vect_title.get_feature_names())



    new_df.shape



    d = collections.Counter(vect_title.get_feature_names())

    new_df['target_list'] = [i for i in y_tag_dtm] 


    tfidf_vect_title.fit(X_labels)
    X_title_dtm_tfidf = tfidf_vect_title.transform(X_labels)

    X_title_dtm_tfidf


    # In[23]:

    new_df_of_tfidf = pd.DataFrame(X_title_dtm_tfidf.toarray(),columns=tfidf_vect_title.get_feature_names()) 


    # In[24]:

    new_df_of_tfidf['target_list'] = [i for i in y_tag_dtm] 


    # In[25]:

    y = new_df_of_tfidf['target_list'] 
    X = new_df_of_tfidf.drop('target_list',axis=1)  


    X = np.array(X.values.tolist())                           # it will convert list to numpy ndarray
    y = np.array(y.values.tolist())


    # In[28]:

    # print (X[0]) 


    # In[29]:

    pca_X = PCA(n_components=200).fit_transform(X)  
    pca_X = np.round(pca_X,4)

    pca_y = PCA(n_components=50).fit_transform(y)  
    pca_y = np.round(pca_y,4)


    # In[30]:

    print (pca_y) 


    # In[31]:

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)   


    # In[32]:

    # X_train, X_test, y_train, y_test = train_test_split(pca_X, pca_y, test_size=0.2, random_state=1)   


    # In[ ]:




    # In[33]:

    # clf = Pipeline([('classifier',OneVsRestClassifier(SVC(probability=True,random_state=0)))])  # just to for Pipeline example

    knn_clf = KNeighborsClassifier(n_neighbors=5)
    # mnb_clf = MultinomialNB()                                                                   # not working for MultiLabelinput
    # svc_clf = OneVsRestClassifier(SVC(probability=True,random_state=0))

    # time_pass_y = np.random.randint(2,size=(2838,1))                                            # produce ndarray of size 2838 X 1

    knn_clf.fit(X_train, y_train)
    # mnb_clf.fit(X_train, y_train) 

    knn_pred = knn_clf.predict(X_test)  
    # mnb_pred = mnb_clf.predict(X_test)
    # svc_pred = svc_clf.predict(X_test)


    # In[34]:

    knn_clf.score(X_test, y_test) 


    # In[53]:

    from sklearn import metrics

    knn_report = metrics.classification_report(y_test[:100], knn_pred[:100]) 
    knn_f1_score = metrics.f1_score(y_test[:], knn_pred[:], average='samples') 
    knn_precision_recall_fscore = metrics.precision_recall_fscore_support(y_test, knn_pred, average='samples')  # on full data-set
    knn_avg_precision_score = metrics.average_precision_score(y_test, knn_pred, average='samples')
    knn_roc_auc_score = metrics.roc_auc_score(y_test, knn_pred, average='samples')

    # mnb_report = metrics.classification_report(y_test[:100], mnb_pred[:100])  #throwing error mnb_clf can't work on multilabel O/P


    # In[36]:

    metrics.accuracy_score(y_true=y_test[:100], y_pred=knn_pred[:100])          # subset accuracy: exact match of the full label set per sample


    # In[37]:

    # print (knn_report)                                   # its type is str

    print "For knn_clf (KNearestNeighbours) : "
    print "precision, recall, fbeta_score, support : ",knn_precision_recall_fscore
    print "f1_score : ",knn_f1_score
    print "avg. precision_score : ",knn_avg_precision_score 
    print "roc_auc_score : ",knn_roc_auc_score


    # In[38]:

    # def does_test_tag_match(d, list_of_tags):      # no need for this function


    # In[39]:

    test = ["how to use policy iteration in ml ?"]
    # test = ["what is lstm ?"] 

    # test_dtm = vect_title.transform(test)                                           # without tfidf
    test_dtm = tfidf_vect_title.transform(test)                                       # with tfidf

    # print (test_dtm.toarray()[0])
    status = False
    for i in test_dtm.toarray()[0]:
        if (i!=0):
            status = True
            break

    ans = knn_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)

    if (len(ans[0])==0 or status==False):
        print ("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print (ans)
        
        

    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_clf = MultiOutputClassifier(forest, n_jobs=-1)
    rf_clf.fit(X_train, y_train)
    rf_pred = rf_clf.predict(X_test)


    # In[41]:

    rf_clf 


    # In[42]:

    metrics.accuracy_score(y_true=y_test[:100], y_pred=rf_pred[:100])          # subset accuracy: exact match of the full label set per sample


    # In[43]:

    rf_clf.score(X_test, y_test)

    rf_report = metrics.classification_report(y_test[:100], rf_pred[:100])
    rf_f1_score = metrics.f1_score(y_test, rf_pred, average='samples')  
    rf_precision_recall_fscore = metrics.precision_recall_fscore_support(y_test, rf_pred, average='samples')  # on full data-set
    rf_avg_precision_score = metrics.average_precision_score(y_test, rf_pred, average='samples')
    rf_roc_auc_score = metrics.roc_auc_score(y_test, rf_pred, average='samples') 


    # In[47]:

    # print (rf_report) 

    print "For rf_clf (RandomForest) : "
    print "precision, recall, fbeta_score, support : ",rf_precision_recall_fscore
    print "f1_score : ",rf_f1_score  
    print "avg. precision_score : ",rf_avg_precision_score 
    print "roc_auc_score : ",rf_roc_auc_score

    # test = ["what is reinforcement learning ?"] 

    test = ["what is ai,lstm and data visualization ?"] 

    # test_dtm = vect_title.transform(test)                                            # without tfidf
    test_dtm = tfidf_vect_title.transform(test)                                        # with tfidf

    status = False
    for i in test_dtm.toarray()[0]:
        if (i!=0):
            status = True
            break

    ans = rf_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if (len(ans[0])==0 or status==False):
        print ("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print (ans)
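
Condensed, the core of the flow above fits in a few lines. A hedged sketch with toy data (titles and tags are invented; the real code pulls them from d_set):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer

titles = ["how to use policy iteration in ml ?",
          "what is lstm ?",
          "intro to data visualization"]
tags = [["reinforcement-learning"], ["deep-learning"], ["visualization"]]

vect = TfidfVectorizer()                    # tf-idf features from the titles
X = vect.fit_transform(titles)

mlb = MultiLabelBinarizer()                 # tag lists -> indicator matrix
y = mlb.fit_transform(tags)

clf = KNeighborsClassifier(n_neighbors=1).fit(X, y)
pred = clf.predict(vect.transform(["what is lstm ?"]))
print(mlb.inverse_transform(pred))          # [('deep-learning',)]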
Example #3
"""
Imports
"""
import pandas as pd
import sklearn
from sklearn.preprocessing import MultiLabelBinarizer
import pyarrow as pa
import pyarrow.parquet as pq
"""
global variables
"""
#need to coordinate with Jason on how this is created and how to refactor using the API instead of a document.
#it is a dataframe with the entities
infile = 'jason_mimc-554_new.csv'

outputfile = 'tpot_prep-diagnosis_names_one_hot_encoded.parquet'
mlb = MultiLabelBinarizer()

def load_dataframe():
    # infile is a CSV, so read it directly by path
    return pd.read_csv(infile)

def write_dataframe(df):
    table = pa.Table.from_pandas(df)
    pq.write_table(table, outputfile)

def diagnoses_one_hot_encoding():
    df = load_dataframe()

    #create boolean mask matched non NaNs values
    mask = df['diagnosis'].notnull()
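
The function is cut off right after the mask is built. One plausible continuation, sketched under the assumption that 'diagnosis' holds comma-separated names (a guess, not the author's code; it reuses the module's imports and the global mlb):

def diagnoses_one_hot_encoding_sketch(df):
    # Keep rows with a diagnosis, split the text into lists of names,
    # one-hot encode the lists, and join the result back onto the frame
    mask = df['diagnosis'].notnull()
    diagnoses = df.loc[mask, 'diagnosis'].str.split(',')
    encoded = pd.DataFrame(mlb.fit_transform(diagnoses),
                           columns=mlb.classes_,
                           index=diagnoses.index)
    return df.join(encoded)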
Example #4
def main():
	parser = argparse.ArgumentParser('Build a model for a classifier')
	parser.add_argument('--categoriesFile',required=True,type=str,help='Category list file')
	parser.add_argument('--params',required=True,type=str,help='JSON string with parameters')
	parser.add_argument('--useTestSet',action='store_true',help='Whether to use the test set instead of the validation set')
	parser.add_argument('--inJSON',required=True,type=str,help='Filename of JSON documents')
	args = parser.parse_args()

	print("Running with --params %s" % args.params)

	params = json.loads(args.params)

	with open(args.inJSON) as f:
		documents = json.load(f)

	with open(args.categoriesFile) as f:
		categories = [ line.strip() for line in f ]


	#test_docs = [ d for d in documents if 'phase4' in d['annotations'] ]
	#documents = [ d for d in documents if not 'phase4' in d['annotations'] ]

	#viruses = {'SARS-CoV-2','SARS-CoV','MERS-CoV'}
	#documents = [ d for d in documents if any(entity['type'] == 'Virus' for entity in d['entities']) or any( v in d['annotations'] for v in viruses) ]

	train_docs = [ d for d in documents if len(d['annotations']) > 0 and d['phase'] != 'testset' ]
	test_docs = [ d for d in documents if d['phase'] == 'testset' ]
	#other_docs = [ d for d in documents if len(d['annotations']) == 0 ]

	toRemoveFromTraining = {'RemoveFromCorpus?','NotAllEnglish','NotRelevant','FixAbstract'}
	train_docs = [ d for d in train_docs if not any (f in d['annotations'] for f in toRemoveFromTraining) ]

	if not args.useTestSet:
		train_docs, test_docs = train_test_split(train_docs, test_size=0.25, random_state=42)

	train_categories = [ [ a for a in d['annotations'] if a in categories ] for d in train_docs ]
	test_categories = [ [ a for a in d['annotations'] if a in categories ] for d in test_docs ]

	encoder = MultiLabelBinarizer()
	train_targets = encoder.fit_transform(train_categories)
	test_targets = encoder.transform(test_categories)  # transform, not fit_transform: keep the class order fitted on train
	target_names = encoder.classes_
	
	assert len(target_names) == len(categories)
	
	print("len(train_docs):",len(train_docs))
	print("len(test_docs):",len(test_docs))

	print("class balance for train:", 100*sum(train_targets)/len(train_targets))
	print("class balance for test:", 100*sum(test_targets)/len(test_targets))

	sys.stdout.flush()

	clf = DocumentClassifier(params)

	print('train_targets.shape=',train_targets.shape)
	sys.stdout.flush()

	clf.fit(train_docs, train_targets, target_names)

	predictions = clf.predict(test_docs)

	print('predictions.shape=',predictions.shape)
	sys.stdout.flush()

	results = {}

	all_tn, all_fp, all_fn, all_tp = 0,0,0,0

	all_precisions, all_recalls, all_f1_scores = [],[],[]

	for i,label in enumerate(target_names):
		gold_for_label = test_targets[:,i]
		predictions_for_label = predictions[:,i] > 0.5
		
		tn, fp, fn, tp = sklearn.metrics.confusion_matrix(gold_for_label, predictions_for_label).ravel()
		tn, fp, fn, tp = map(int, [tn, fp, fn, tp])

		all_tn += tn
		all_fp += fp
		all_fn += fn
		all_tp += tp

		precision = sklearn.metrics.precision_score(gold_for_label,predictions_for_label)
		recall = sklearn.metrics.recall_score(gold_for_label,predictions_for_label)
		f1_score = sklearn.metrics.f1_score(gold_for_label,predictions_for_label)

		all_precisions.append(precision)
		all_recalls.append(recall)
		all_f1_scores.append(f1_score)

		print(f"{label}\t{precision}\t{recall}\t{f1_score}")
		sys.stdout.flush()
		results[label] = {'tn':tn,'fp':fp,'fn':fn,'tp':tp,'precision':precision,'recall':recall,'f1_score':f1_score}

	micro_precision = all_tp / (all_tp + all_fp) if (all_tp + all_fp) > 0 else 0
	micro_recall = all_tp / (all_tp + all_fn) if (all_tp + all_fn) > 0 else 0
	micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0

	macro_precision = sum(all_precisions) / len(all_precisions)
	macro_recall = sum(all_recalls) / len(all_recalls)
	macro_f1 = sum(all_f1_scores) / len(all_f1_scores)

	results['MICRO'] = {'tn':all_tn,'fp':all_fp,'fn':all_fn,'tp':all_tp,'precision':micro_precision,'recall':micro_recall,'f1_score':micro_f1}
	results['MACRO'] = {'precision':macro_precision,'recall':macro_recall,'f1_score':macro_f1}

	print("-"*30)
	print(f"MICRO\t{micro_precision}\t{micro_recall}\t{micro_f1}")
	print(f"MACRO\t{macro_precision}\t{macro_recall}\t{macro_f1}")
	print("-"*30)

	output = {'params':params, 'results':results}

	print(json.dumps(output))

	print("Done")
Example #5
    print("Calculating Daylight fingerprint...")
    data['fingerprint'] = data['mol'].apply(ft.daylight_fingerprint)
    data['fingerprint'] = data['fingerprint'].apply(
        ft.daylight_fingerprint_padding)
    print("Daylight fingerprint calculation done.")

if args.featurizer == 'ecfp':
    print("Calculating ECFP...")
    data['fingerprint'] = data['mol'].apply(ft.get_ecfp)
    print("ECFP calculation done.")

# Input (x) and label (y)
X = data['fingerprint']
X = np.array(np.stack(X), dtype=float)

mlb = MultiLabelBinarizer().fit(data['agrochemical'])
Y = mlb.transform(data['agrochemical'])

# Build neural network model
layers_dim = [X.shape[1], 512, 128, 16, 4, Y.shape[1]]
activation = ['relu', 'relu', 'relu', 'relu', 'sigmoid']

image_name = filename[:filename.rfind('.')] + '.png'

training_acc, training_loss, validation_acc, validation_loss = \
    model_func.plot_nn_loss_against_epoch(X, Y, layers_dim, activation, args.epochs, image_name,
                                          loss=args.loss, optimizer=args.optimizer)

print("Number of epochs for maximum training accuracy:",
      np.argmax(training_acc))
print("Number of epochs for minimum training loss:", np.argmin(training_loss))
Example #6
def get_word_vec(df, train_or_test='train'):
    mlb = MultiLabelBinarizer()
    wordVecs = mlb.fit_transform(df['word_list'])
    wordVecs = [[e] for e in wordVecs]

    return mlb, pd.DataFrame(wordVecs)
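
For reference, a toy call showing what get_word_vec returns (assumes pandas and MultiLabelBinarizer are imported as in the snippet's module):

import pandas as pd

df = pd.DataFrame({'word_list': [['a', 'b'], ['b', 'c']]})
mlb, vecs = get_word_vec(df)
print(mlb.classes_)  # ['a' 'b' 'c']
print(vecs.shape)    # (2, 1): each cell holds one binarized row vector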
Example #7
Labels = []
for i in range(T):
    labels = list(map(int, input().split(' ')))  # materialize: map() is a one-shot iterator in Python 3
    RawData.append(input())
    Labels.append(labels)

Queries = []
for i in range(E):
    Queries.append(input())

RawData.extend(Queries)
X = CVectorizer.fit_transform(RawData)
Xtf = TfIdfVectorizer.fit_transform(X)
del X

MLB = MultiLabelBinarizer()
Yt = MLB.fit_transform(Labels)
XtfTrain = Xtf[0:T]
XtfTest = Xtf[T:]
# loss='hinge' replaces the deprecated spelling loss='l1'
Clf = OneVsRestClassifier(LinearSVC(loss='hinge', class_weight={
    1: 100,
    0: 1
})).fit(XtfTrain, Yt)
Classes = list(MLB.classes_)

for xTest in XtfTest:
    y = Clf.decision_function(xTest)
    y1 = list(y[0])
    c1 = Classes
    lbls = [x for (y, x) in sorted(zip(y1, c1))][-10:]
    list.reverse(lbls)
Example #8
        JOIN section_content_75pct t2 ON t1.file_id=t2.file_id AND t1.section_id=t2.section_id
        """
        df = pandas.read_sql_query(con=conn, sql=sql_text)
        df_randomized_order = df.sample(frac=1, random_state=rng_seed)
        heading_plus_content_corpus = df_randomized_order[
            'abstracted_heading_plus_content']
        content_corpus = df_randomized_order['content_text_w_o_tags']
        heading_text_corpus = df_randomized_order['heading_text']
        url_corpus = df_randomized_order['url']

        # Class '2' has been merged into class '1'
        label_set = ['-', '1', '3', '4', '5', '6', '7', '8']
        labels = [
            str(x).split(',') for x in df_randomized_order['section_code']
        ]
        mlb = MultiLabelBinarizer(classes=label_set)
        labels_matrix = mlb.fit_transform(labels)

        tfidf = TfidfVectorizer(ngram_range=(1, 1),
                                analyzer='word',
                                stop_words='english')
        tfidfX = tfidf.fit_transform(heading_plus_content_corpus)

        logging.info('tfidf matrix shape: ')
        logging.info(tfidfX.shape)

        features_tfidf = pandas.DataFrame(tfidfX.todense())
        # Assign column names to make it easier to print most useful features later
        features_tfidf.columns = tfidf.get_feature_names()
        features_combined = features_tfidf
        print(i)

Example #9
valid_X = np.array(valid_X)
np.save(data_path, valid_X)

# process label
print("label preprocessing")

train_y = []
for train_id in train_list:
    train_y.append(get_labels(train_id))
valid_y = []
for valid_id in valid_list:
    valid_y.append(get_labels(valid_id))

encoder = MultiLabelBinarizer()
encoder.fit(train_y + valid_y)
train_y_onehot = encoder.transform(train_y)
valid_y_onehot = encoder.transform(valid_y)
train_y_onehot = np.delete(train_y_onehot, [2, 3, 5, 6, 7, 10, 12],
                           1)  # drop the unused label columns (incl. "No Finding")
valid_y_onehot = np.delete(valid_y_onehot, [2, 3, 5, 6, 7, 10, 12],
                           1)  # drop the unused label columns (incl. "No Finding")

with open(data_path + "/train_y_onehot.pkl", "wb") as f:
    pickle.dump(train_y_onehot, f)
with open(data_path + "/valid_y_onehot.pkl", "wb") as f:
    pickle.dump(valid_y_onehot, f)
with open(data_path + "/label_encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)
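
np.delete(..., axis=1) removes whole columns from the one-hot matrix; a minimal illustration:

import numpy as np

onehot = np.eye(3, 5, dtype=int)     # pretend 5 label columns
print(np.delete(onehot, [1, 3], 1))  # keeps columns 0, 2 and 4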
Example #10
def _label_matrix(tr_target, te_target):
    mlb = MultiLabelBinarizer(sparse_output=True)
    ytr = mlb.fit_transform(tr_target)
    yte = mlb.transform(te_target)
    print(mlb.classes_)
    return ytr, yte
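
A toy call (label names invented) showing what _label_matrix produces:

ytr, yte = _label_matrix([['news'], ['news', 'tech']], [['tech']])
print(ytr.shape, yte.shape)  # (2, 2) (1, 2): sparse indicator matrices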
Example #11
from os.path import dirname, join
import sys
from languageflow.flow import Flow
from languageflow.model import Model
from languageflow.transformer.tfidf import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from languageflow.validation.validation import TrainTestSplitValidation
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from load_data import load_dataset

if __name__ == '__main__':
    data_file = join(dirname(dirname(dirname(dirname(__file__)))), "data",
                     "fb_bank", "corpus", "train.xlsx")
    X, y = load_dataset(data_file)

    flow = Flow()
    flow.data(X, y)

    transformer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, min_df=8)
    flow.transform(MultiLabelBinarizer())
    flow.transform(transformer)
    flow.add_model(
        Model(OneVsRestClassifier(LogisticRegression()), "LogisticRegression"))
    flow.set_validation(TrainTestSplitValidation(test_size=0.1))

    flow.train()
    flow.export(model_name="LogisticRegression", export_folder="model")
Example #12
def evaluate():

    warnings.filterwarnings("ignore", category=UserWarning)

    f_test = open('data/raw/Test.csv')

    lines_test = csv.reader(f_test)

    f_train = open('data/raw/Train.csv')

    lines_train = csv.reader(f_train)

    #next(lines) #to skip the header of the csv

    true_ans_test = []
    true_ans_train = []

    train_title_feature = np.load('data/vectorized/Train_title.npy')
    train_summary_feature = np.load('data/vectorized/Train_summary.npy')

    test_title_feature = np.load('data/vectorized/Test_title.npy')
    test_summary_feature = np.load('data/vectorized/Test_summary.npy')

    for line in lines_test:
        source_uri = line[4]
        true_ans_test.append(source_uri.split(' '))

    for line in lines_train:
        source_uri = line[4]
        true_ans_train.append(source_uri.split(' '))

    f_test.close()
    f_train.close()

    f = open('reports/Test.ans')

    lines = csv.reader(f)
    #next(lines) #to skip the header of the csv

    pred_ans = []

    for line in lines:
        pred_ans.append(line[0].split(' '))

    f.close()

    classes = ['nytimes', 'indiatimes', 'washingtonpost']
    mlb = MultiLabelBinarizer(classes=classes)  # classes is keyword-only in recent scikit-learn
    pred_ans_b = mlb.fit_transform(pred_ans)
    true_ans_b = mlb.transform(true_ans_test)

    print('\n\nMLB:')

    Sub_accuracy_score = accuracy_score(true_ans_b, pred_ans_b)
    Sub_accuracy_score = str(round(Sub_accuracy_score, 3))

    print('\nSubset Accuracy: ' + Sub_accuracy_score)

    hamming_score = hamming_loss(true_ans_b, pred_ans_b)
    hamming_score = str(round(hamming_score, 3))

    print('\nHamming Loss: ' + hamming_score + '\n\n')

    strategies = ['stratified', 'uniform']

    X_test = np.squeeze(
        np.concatenate((test_title_feature, test_summary_feature), 2))
    y_test = true_ans_test

    X_train = np.squeeze(
        np.concatenate((train_title_feature, train_summary_feature), 2))
    y_train = true_ans_train

    test_scores = []
    for s in strategies:

        dclf = DummyClassifier(strategy=s, random_state=0)
        dclf = dclf.fit(X_train, y_train)

        pred_ans = []

        ans = dclf.predict(X_test)

        for a in ans:
            pred_ans.append(a)

        pred_ans_b = mlb.fit_transform(pred_ans)

        print('\n\n' + s + ':')

        Sub_accuracy_score = accuracy_score(true_ans_b, pred_ans_b)
        Sub_accuracy_score = str(round(Sub_accuracy_score, 3))

        print('\nSubset Accuracy: ' + Sub_accuracy_score)

        hamming_score = hamming_loss(true_ans_b, pred_ans_b)
        hamming_score = str(round(hamming_score, 3))

        print('\nHamming Loss: ' + hamming_score)

    print('\n\n')
Example #13
# X1 = [
#     [1,2,3,4,5],
#     [1,1,2,2,3],
#     [2,2,3,4,5],
# ]
#
# y1 = [
#     [1,2,3],
#     [1,2],
#     [4,1],
# ]
#
# classifier = OneVsRestClassifier(SVC(class_weight='auto'))
# classifier.fit(X1, y1)
# # y2 = classifier.predict(X2)

from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

c = OneVsRestClassifier(SVC())
X = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 1]]
Y = [[1], [2], [3], [1, 3]]
binarizer = MultiLabelBinarizer().fit(Y)
yyy = binarizer.transform(Y)
estimator = c.fit(X, yyy)
result = estimator.predict([[1, 0, 0]])
print(binarizer.inverse_transform(result))
# hoge = MultiLabelBinarizer().inverse_transform(result)
# print(hoge)
Example #14
    def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)
Example #15
def generate_vectors(train_url,
                     test_url=None,
                     column='article',
                     trans_type=None,
                     max_n=1,
                     min_df=1,
                     max_df=1.0,
                     max_features=1,
                     sublinear_tf=True,
                     balanced=False,
                     re_weight=0,
                     verbose=False,
                     drop_words=0,
                     multilabel_out=False,
                     label_col='subjects',
                     only_single=True,
                     shuffle=True,
                     apply_fun=None):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf', 'hashing'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences
        multilabel_out: return y as multilabel format
        label_col: col name of label
        only_single: only keep records of single label
        shuffle: shuffle (resample) the train data before vectorizing
        apply_fun: callable to be applied on label column

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))

    train_df = pd.read_csv(train_url)
    if shuffle:
        train_df = train_df.sample(frac=1)
    if only_single:
        train_df = train_df[train_df[label_col].apply(lambda x: len(x) < 2)]

    # vectorizer
    s_time = time()
    analyzer = 'word' if column == 'word_seg' else 'char'
    vec = CountVectorizer(analyzer=analyzer,
                          ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())
    sequences = train_df[column]
    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])
    X = sequences if trans_type == 'hashing' else vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))

    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    elif trans_type == 'dc':
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)
    else:
        trans = HashingVectorizer(analyzer=analyzer,
                                  ngram_range=(1, max_n),
                                  n_features=max_features,
                                  token_pattern=r'\w+',
                                  binary=not balanced)
    verbose and print(trans_type, "transformer params:", trans.get_params())

    if multilabel_out:
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_df[label_col].apply(str.split))
        verbose and print("multilabel columns:\n", mlb.classes_)
    else:
        y = train_df[label_col].apply(apply_fun).values if apply_fun is not None \
            else train_df[label_col].values
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = pd.read_csv(test_url)
        X_test = test_df[column] if trans_type == 'hashing' else vec.transform(
            test_df[column])
        X_test = trans.transform(X_test)
    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
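
A hedged usage sketch; the CSV layout ('article' and 'subjects' columns) is an assumption about the expected input, and the helper relies on the module's own imports:

import pandas as pd

# Build a tiny CSV matching the assumed layout
pd.DataFrame({'article': ['a b c', 'b c d', 'c d e'],
              'subjects': ['1', '2', '3']}).to_csv('toy_train.csv', index=False)

X, y, X_test = generate_vectors('toy_train.csv', column='article',
                                trans_type='idf', max_n=2, min_df=1,
                                max_features=100, multilabel_out=True,
                                verbose=True)
print(X.shape, y.shape)  # X_test is None because no test_url was given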
Example #16
    if args.algo == 'pane':
        Xf = utils.load_emd(path_emb + ".f", n, d // 2, n - 1)  # // so the dimension stays an integer
        Xb = utils.load_emd(path_emb + ".b", n, d // 2, n - 1)
        Xf = preprocessing.normalize(Xf, norm='l2', axis=1)
        Xb = preprocessing.normalize(Xb, norm='l2', axis=1)
        X = np.hstack([Xf, Xb])
        print(X.shape)
    else:
        X = utils.load_emd(path_emb, n, d, n - 1)

    path_label = settings.DATA_INFO[args.data]['path'] + 'labels.txt'

    maf1 = []
    mif1 = []
    if args.multi:
        y = utils.load_label(path_label, n)
        X, y = filter(X, y)
        y = MultiLabelBinarizer(sparse_output=True).fit_transform(y)
    else:
        y = utils.read_cluster(n, path_label)

    for ratio in [0.9, 0.7, 0.5, 0.3, 0.1]:
        print("labelled data ratio:" + str(1 - ratio))
        macro_f1_avg, micro_f1_avg = eval(X, y, ratio, args.multi, 3)
        maf1.append(macro_f1_avg)
        mif1.append(micro_f1_avg)
        print("macro-f1=%f, micro-f1=%f", macro_f1_avg, micro_f1_avg)

    print(maf1)
    print(mif1)
Example #17
def get_tags_vec(df, train_or_test='train'):
    mlb = MultiLabelBinarizer()
    tagVecs = mlb.fit_transform(df['tags_values'])
    tagVecs = [[e] for e in tagVecs]

    return mlb, pd.DataFrame(tagVecs)
Example #18
    def perform_five_fold(self, model, documents, annotations, doc_ids,
                          pipeline_parameters):
        metrics = list()
        # store list of documents ids per fold
        folds = list()
        # turning into numpy arrays to be able to access values with index array
        documents_np_array = np.array(documents)
        annotations_np_array = np.array(annotations, dtype=object)
        doc_ids_np_array = np.array(doc_ids)
        ann_list = list()

        for ann in annotations_np_array:
            ann_list = ann_list + list([x[2] for x in ann])
        # getting unique label names in annotations
        unique_ann_list = list(set(ann_list))

        # array to store multilabel values
        multilabel_array = []
        for ann in annotations_np_array:
            multilabel_array.append([unique_ann_list.index(x[2]) for x in ann])

        multilabel_matrix = MultiLabelBinarizer().fit_transform(
            multilabel_array)

        skf = IterativeStratification(n_splits=5, order=1)

        total_metrics = {}

        for train_index, test_index in skf.split(documents_np_array,
                                                 multilabel_matrix):
            # get annotations train and test datasets
            train_annotations = annotations_np_array[train_index]
            test_annotations = annotations_np_array[test_index]

            # get documents train and test datasets
            train_documents = documents_np_array[train_index]
            test_documents = documents_np_array[test_index]

            fold_metrics = self.perform_fold(
                model, [train_documents.tolist(),
                        train_annotations.tolist()],
                [test_documents.tolist(),
                 test_annotations.tolist()], pipeline_parameters)

            # saving docs used to train fold
            fold_doc_ids = doc_ids_np_array[train_index]
            folds.append(fold_doc_ids.tolist())

            # saving fold metrics
            metrics.append(fold_metrics)

            for key in fold_metrics.keys():
                if key not in total_metrics:
                    total_metrics[key] = {
                        "FN": 0,
                        "FP": 0,
                        "TP": 0,
                        "TN": 0,
                        "f1": 0,
                        "precision": 0,
                        "recall": 0,
                        "acc": 0
                    }
                total_metrics[key][
                    "FN"] = total_metrics[key]["FN"] + fold_metrics[key]["FN"]
                total_metrics[key][
                    "FP"] = total_metrics[key]["FP"] + fold_metrics[key]["FP"]
                total_metrics[key][
                    "TP"] = total_metrics[key]["TP"] + fold_metrics[key]["TP"]
                total_metrics[key][
                    "TN"] = total_metrics[key]["TN"] + fold_metrics[key]["TN"]

        average_metrics = {}
        for label in total_metrics.keys():
            avg_metric = {}
            avg_metric["FN"] = total_metrics[label]["FN"] / 5
            avg_metric["FP"] = total_metrics[label]["FP"] / 5
            avg_metric["TP"] = total_metrics[label]["TP"] / 5
            avg_metric["TN"] = total_metrics[label]["TN"] / 5
            if (avg_metric["TP"] + avg_metric["FN"]) != 0:
                avg_metric["recall"] = avg_metric["TP"] / (avg_metric["TP"] +
                                                           avg_metric["FN"])
            else:
                avg_metric["recall"] = 1.0
            if (avg_metric["TP"] + avg_metric["FP"]) != 0:
                avg_metric["precision"] = avg_metric["TP"] / (
                    avg_metric["TP"] + avg_metric["FP"])
            else:
                avg_metric["precision"] = 0.0
            if (avg_metric["precision"] + avg_metric["recall"]) != 0:
                avg_metric["f1"] = 2 * (
                    avg_metric["precision"] * avg_metric["recall"]) / (
                        avg_metric["precision"] + avg_metric["recall"])
            else:
                avg_metric["f1"] = 0
            avg_metric["acc"] = (avg_metric["TP"] + avg_metric["TN"]) / (
                avg_metric["TP"] + avg_metric["TN"] + avg_metric["FP"] +
                avg_metric["FN"])

            average_metrics[label] = avg_metric

        return metrics, folds, average_metrics
Example #19
            lst[i] = 'other'
    if len(lst) <= 0:
        lst = ['other']
    return lst


data['country'] = data['country'].apply(change_country_name)
# data['genre'] = data['listed_in'].apply(split_comma)
data['genre'] = data['genre'].apply(lambda row: list(set(row)))

print("nontext features processing finished")

# ----------- Convert to One-Hot representation
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)
data2 = data[['type', 'title', 'country', 'rating', 'words', 'genre']]
data2 = data2.join(
    pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(data2.pop('genre')),
                                      index=data.index,
                                      columns=mlb.classes_))
data2 = data2.join(
    pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(data2.pop('country')),
                                      index=data.index,
                                      columns=mlb.classes_))

data2 = data2.join(
    pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(data2.pop('rating')),
                                      index=data.index,
                                      columns=mlb.classes_))
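
The pop-and-join pattern above, reduced to a toy frame (data invented):

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.DataFrame({'title': ['t1', 't2'],
                   'genre': [['drama', 'comedy'], ['drama']]})
mlb = MultiLabelBinarizer(sparse_output=True)
onehot = pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(df.pop('genre')),
                                           index=df.index,
                                           columns=mlb.classes_)
print(df.join(onehot))  # 'title' plus one sparse 0/1 column per genre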
Example #20
import pandas as pd
import preprocess
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle

data = pd.read_csv("myFP_217_D2.csv", header=None)

D2 = preprocess.get_data(data)

X = preprocess.get_X(D2)
# y = pd.DataFrame(preprocess.get_target(D2))
value = preprocess.get_target(D2)
value = MultiLabelBinarizer().fit_transform(value)
y = pd.DataFrame(value)

X, y = shuffle(X, y, random_state=0)

X_train, X_test = X[:int((0.8 * len(X)))], X[int((0.8 * len(X))):]
y_train, y_test = y[:int((0.8 * len(X)))], y[int((0.8 * len(X))):]


def cnn1():
    def weight_variable(shape):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)
Example #21
    del psg_list, all_psg

# load datasets from cPickle
# pickle files must be opened in binary mode
training_ds = cPickle.load(open(Pickle_Dir + "TrainingDataset.pickle", "rb"))
testing_ds = cPickle.load(open(Pickle_Dir + "TestingDataset.pickle", "rb"))
if False:
    error_analysis_ds = cPickle.load(
        open(Pickle_Dir + "ErrorAnalysisDataset.pickle", "rb"))
print training_ds.size, testing_ds.size

# Decision Tree Classification
clf = MultiLabelDecisionTreeClassifier(fv_dimension=len(
    training_ds.fv_nparray[0, :]),
                                       min_split_entropy_threshold=0.0)
# Preprocessing - MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=training_ds.classes_, sparse_output=False)
training_ds.cl_indicator_matrix = mlb.fit_transform(
    training_ds.class_label_lists)
print training_ds.size, testing_ds.size, len(training_ds.classes_)
# Model Training
t0 = time()
clf.fit(training_ds.fv_nparray, training_ds.cl_indicator_matrix)
training_time = time() - t0
print "[main.py] Offline cost for training from %d samples is %.04f seconds." \
            % (len(training_ds.fv_nparray), training_time)

# # Accuracy & Efficient Measurement
# metrics = MultiLabelClassifierMetricsCalculator()
# t0 = time()
# for i in xrange(len(testing_ds.fv_nparray)):
#     prediction = clf.predict(testing_ds.fv_nparray[i, :])
Example #22
        results.append(result)
    return pd.DataFrame(results, index=labels)


if __name__ == '__main__':
    # if True: simple model has given equal parameters for all OvR estimators
    # if False: grid search used to find best parameters per estimator
    simple_model = False

    # load data
    DATA_PATH = Path(__file__).parent / 'data'
    X_train, train_labels = load_data(DATA_PATH / 'train.txt')
    X_test, test_labels = load_data(DATA_PATH / 'test.txt')

    # binarize target labels
    multi_label_binarizer = MultiLabelBinarizer()
    y_train = multi_label_binarizer.fit_transform(train_labels)
    y_test = multi_label_binarizer.transform(test_labels)
    labels = multi_label_binarizer.classes_

    # set initial seed
    SEED = 23249425
    random.seed(SEED)
    # set 10 seeds for randomization of models
    local_seeds = {run: random.randint(1, 2**32 - 1) for run in range(10)}

    # get best parameters for OvR Model
    ovr_best_params = None
    if not simple_model:
        ovr_best_params = ovr_hyperparameter_optimization(
            X_train, y_train, labels, SEED)
Example #23
stitch2se, se2name_mono = load_mono_se()

mono_se_dict = {
    val: i
    for i, val in enumerate(sorted(se2name_mono.keys(), reverse=False))
}

# create lists with pairs and se of each pair ---------------------
labels = list()
pairs = list()
for combo in sorted(combo2se.keys()):
    labels.append(list(combo2se[combo]))
    pairs.append(list(combo2stitch[combo]))

# one-hot-encode the target
mlb_y = MultiLabelBinarizer()
y = mlb_y.fit_transform(labels)
# y_sparse = sparse.csr_matrix(y)
del labels, combo2stitch, combo2se, se2name_mono

# transform the dataset ------------------------------------
x = list()
for pair in pairs:
    x.append([stitch2se.get(item, item) for item in pair])

left = [list(x[i][0]) for i in range(len(x))]
right = [list(x[i][1]) for i in range(len(x))]
del x, pairs, pair

mlb = MultiLabelBinarizer()
Example #24
    (train['TicketPrefix'].unique(), test['TicketPrefix'].unique()), axis=0)
ticket_prefixes = np.unique(ticket_prefixes)
ticket_prefixes

# Then, we binarize nominal features and impute missing values.  The final feature vector is shown below.

# In[74]:

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, Normalizer, MultiLabelBinarizer, Imputer

train['Embarked'].fillna('n/a', inplace=True)
test['Embarked'].fillna('n/a', inplace=True)

mapper = DataFrameMapper([('Sex', LabelEncoder()),
                          (['Pclass'], MultiLabelBinarizer()),
                          (['Age'], [Imputer(), Normalizer()]),
                          ('SibSp', None), ('Parch', None),
                          (['Fare'], [Imputer(), Normalizer()]),
                          (['Cabin'], MultiLabelBinarizer()),
                          (['Title'], MultiLabelBinarizer(classes=all_titles)),
                          (['Embarked'], MultiLabelBinarizer()),
                          (['TicketPrefix'],
                           MultiLabelBinarizer(classes=ticket_prefixes))])

training_instances = mapper.fit_transform(train)
training_labels = np.array(train['Survived'])
print("X dimensions:")
print(mapper.transformed_names_)

# # Evaluating Classifiers
Example #25
    def eval(self, model, return_preds_and_labels=False):
        """
        Performs evaluation on a given model.

        :param model: The model on which to perform evaluation
        :type model: AdaptiveModel
        :param return_preds_and_labels: Whether to add preds and labels to the returned dicts of each prediction head
        :type return_preds_and_labels: bool
        :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                             and reports generated during evaluation.
        :rtype all_results: list of dicts
        """
        model.eval()

        # init empty lists per prediction head
        loss_all = [0 for _ in model.prediction_heads]
        preds_all = [[] for _ in model.prediction_heads]
        probs_all = [[] for _ in model.prediction_heads]
        label_all = [[] for _ in model.prediction_heads]
        ids_all = [[] for _ in model.prediction_heads]
        passage_start_t_all = [[] for _ in model.prediction_heads]

        for step, batch in enumerate(
                tqdm(self.data_loader, desc="Evaluating", mininterval=10)
        ):
            batch = {key: batch[key].to(self.device) for key in batch}

            with torch.no_grad():

                logits = model.forward(**batch)
                losses_per_head = model.logits_to_loss_per_head(logits=logits, **batch)
                preds = model.logits_to_preds(logits=logits, **batch)
                probs = model.logits_to_probs(logits=logits, **batch)
                labels = model.prepare_labels(**batch)

            # stack results of all batches per prediction head
            for head_num, head in enumerate(model.prediction_heads):
                loss_all[head_num] += np.sum(to_numpy(losses_per_head[head_num]))
                preds_all[head_num] += list(to_numpy(preds[head_num]))
                probs_all[head_num] += list(to_numpy(probs[head_num]))
                label_all[head_num] += list(to_numpy(labels[head_num]))
                if head.model_type == "span_classification":
                    ids_all[head_num] += list(to_numpy(batch["id"]))
                    passage_start_t_all[head_num] += list(to_numpy(batch["passage_start_t"]))

        # Evaluate per prediction head
        all_results = []
        for head_num, head in enumerate(model.prediction_heads):
            multilabel = head.model_type == "multilabel_text_classification"
            if multilabel:
                # converting from string preds back to multi-hot encoding
                from sklearn.preprocessing import MultiLabelBinarizer
                mlb = MultiLabelBinarizer(classes=head.label_list)
                # TODO check why .fit() should be called on predictions, rather than on labels
                preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
                label_all[head_num] = mlb.transform(label_all[head_num])
            if hasattr(head, 'aggregate_preds'):
                # Needed to convert NQ ids from np arrays to strings
                ids_all_str = [x.astype(str) for x in ids_all[head_num]]
                ids_all_list = [list(x) for x in ids_all_str]
                head_ids = ["-".join(x) for x in ids_all_list]
                preds_all[head_num], label_all[head_num] = head.aggregate_preds(preds=preds_all[head_num],
                                                                                labels=label_all[head_num],
                                                                                passage_start_t=passage_start_t_all[head_num],
                                                                                ids=head_ids)

            result = {"loss": loss_all[head_num] / len(self.data_loader.dataset),
                      "task_name": head.task_name}
            result.update(
                compute_metrics(metric=head.metric, preds=preds_all[head_num], probs=probs_all[head_num],
                                labels=label_all[head_num], multilabel=multilabel)
            )

            # Select type of report depending on prediction head output type
            if self.report:
                try:
                    result["report"] = compute_report_metrics(head, preds_all[head_num], label_all[head_num])
                except Exception:
                    logger.error(f"Couldn't create eval report for head {head_num} with following preds and labels:"
                                 f"\n Preds: {preds_all[head_num]} \n Labels: {label_all[head_num]}")
                    result["report"] = "Error"

            if return_preds_and_labels:
                result["preds"] = preds_all[head_num]
                result["labels"] = label_all[head_num]
                result["probs"] = probs_all[head_num]

            all_results.append(result)

        return all_results
Example #26
def evalEmbCls(args):
	"""Evaluate graph/network embedding via the multi-lable classification

	args  - parsed arguments
	"""
	assert args, 'Valid args are expected'

	tstart = time.clock()
	tstampt = time.gmtime()
	rootdims = None  # Indices of the root dimensions
	dimrds = None  # Dimension density ratios relative to the possibly indirect super cluster (dimension), typically >= 1
	dimrws = None  # Dimension density ratios relative to the possibly indirect super cluster (dimension), typically <= 1
	dimwsim = None  # Dimension weights (significance ratios)
	dimwdis = None  # Dimension weights for the dissimilarity
	dimnds = None  # Dimensions members (nodes) number

	if args.mode == 'eval':
		# 1.1 Load labels
		mat = loadmat(args.network)  # Compressed Sparse Column format
		# A = mat[args.adj_matrix_name]
		# graph = sparse2graph(A)
		labels_matrix = mat[args.label_matrix_name]  # csc_matrix
		labels_count = labels_matrix.shape[1]
		mlb = MultiLabelBinarizer(range(labels_count))
		lbnds = labels_matrix.shape[0]  # The number of labeled nodes
	else:
		lbnds = None

	# 1.2 Load Embedding
	# model = KeyedVectors.load_word2vec_format(args.embedding, binary=False)
	dimweighted = False
	dis_features_matrix = None  # Dissimilarity features matrix
	embext = os.path.splitext(args.embedding)[1].lower()
	if embext == '.nvc':
		tld0 = time.clock()
		features_matrix, rootdims, dimrds, dimrws, dimwsim, dimwdis, dimnds = loadNvc(args.embedding)
		tldf = time.clock()
		print('Feature matrix loaded on {} sec'.format(int(tldf - tld0)))
		# Cut loaded data to rootdims if required
		if args.rdims:
			if args.dims is None:
				args.dims = 1  # or anything else <= len(rootdims)
			else:
				raise ValueError('Exclusive options --dimensions and --root-dims are specified')
		if args.dims is not None:  #rdims
			if args.dims < rootdims.size:
				args.dims = rootdims.size
			if args.dims > features_matrix.shape[1]:
				args.dims = features_matrix.shape[1]
			print('Reduction to the {} dimensions E [{}, {}] started at {} sec'
				.format(args.dims, rootdims.size, features_matrix.shape[1], int(tldf)))
			# Cut the features_matrix to args.dims E rootdims .. totaldims
			fm = dok_matrix((features_matrix.shape[0], args.dims), dtype=features_matrix.dtype)
			# First, fill the root dims
			for j, idim in enumerate(rootdims):
				fm[:, j] = features_matrix.getcol(idim)
				# print('> colmat type: {}, shape: {}, attrs: {}'.format(type(colmat), colmat.shape, dir(colmat)))
			resdims = np.empty(args.dims, np.uint16)  # rootdims
			resdims[:rootdims.size] = rootdims
			if args.dims > rootdims.size:
				# Fill the remaining dimensions with the ones having max density step and not belonging to the root
				if dimnds is not None:
					drds = [(i, d, dimnds[i]) for i, d in enumerate(dimrds)]
					# Sort by increasing density step and then number of nodes
					rdmin = min(dimrds)
					drds.sort(key=lambda x: x[1] + x[2] / features_matrix.shape[0] * rdmin)
				else:
					drds = [(i, d) for i, d in enumerate(dimrds)]
					drds.sort(key=lambda x: x[1])
				# print('drds: ', drds[:5], '..', drds[-5:])
				# print('rootdims: ', [(i, dimnds[i]) for i in rootdims[:5]], '..', [(i, dimnds[i]) for i in rootdims[-5:]])
				droot = set(rootdims)
				for j in range(rootdims.size, args.dims):
					idim = drds.pop()[0]
					while idim in droot:
						idim = drds.pop()[0]
					resdims[j] = idim
					fm[:, j] = features_matrix.getcol(idim)
			rootdims = None
			features_matrix = fm
			del fm
			trd1 = time.clock()
			print('  features_matrix reduction completed within {} sec'.format(int(trd1 - tldf)))
			# Cut the accessory arrays to rootdims
			arrs = [dimrds, dimrws, dimwsim, dimwdis]
			for ia, arr in enumerate(arrs):
				# Omit None arrays
				if arr is None:
					continue
				tarr = np.empty(resdims.size, arr.dtype)
				for i, ir in enumerate(resdims):
					tarr[i] = arr[ir]
				arrs[ia] = tarr
			resdims = None
			print('  reduction of the accessory loaded data completed on {} sec'.format(int(time.clock() - trd1)))
		allnds = features_matrix.shape[0]
		if lbnds and allnds > lbnds and adjustRows(lbnds, features_matrix, True):
			print('WARNING, embedding matrices are reduced to the number of nodes in the labels matrix: {} -> {}'
				.format(allnds, lbnds), file=sys.stderr)
		# Omit dissimilarity weighting if required
		if args.no_dissim:
			dimwdis = None
		dimweighted = args.weighted_dims and dimwsim is not None
		if dimweighted:
			print('Node vectors are corrected with the dimension weights')
			if dimwdis is not None:
				dis_features_matrix = features_matrix.copy()
			w0 = 1E-8  # Zero weight placeholder
			for (i, j), v in features_matrix.items():
				# Note: weight cutting must be applied before the dimension significance is considered;
				# w0 is used because assigning 0 inside the loop would change the dictionary size
				features_matrix[i, j] = v * dimwsim[j] if not args.dim_vmin or v >= args.dim_vmin else w0
			if dis_features_matrix is not None:
				for (i, j), v in dis_features_matrix.items():
					dis_features_matrix[i, j] = v * dimwdis[j] if not args.dim_vmin or v >= args.dim_vmin else w0
				dis_features_matrix = dis_features_matrix.toarray() #.todense() # order='C'
				if OPTIMIZED:
					sm.quantify(dis_features_matrix, sm.CMP_LE, w0, 0)
				else:
					# Zero out the w0 placeholders (np.where returns a new array)
					dis_features_matrix = np.where(dis_features_matrix > w0, dis_features_matrix, 0)
		features_matrix = features_matrix.toarray() #.todense() # order='C'
		if dimweighted:
			if OPTIMIZED:
				sm.quantify(features_matrix, sm.CMP_LE, w0, 0)
		else:
			# Zero out the w0 placeholders (np.where returns a new array)
			features_matrix = np.where(features_matrix > w0, features_matrix, 0)
	else:
		features_matrix = None
		if embext == '.mat':
			mat = loadmat(args.embedding)
			# Map nodes to their features
			features_matrix = np.array(mat['embs'], dtype=np.float32, order='C')
			del mat
		elif embext == '.csv':
			features_matrix = np.loadtxt(args.embedding, dtype=np.float32, delimiter=',')
		else:  # ssv
			# Try to parse the file as space separated values
			features_matrix = np.loadtxt(args.embedding, dtype=np.float32)
			#raise ValueError('Embedding in the unknown format is specified: ' + args.embedding)
		allnds = features_matrix.shape[0]
		# Ensure that the array can be resized if required (it owns its data rather than being a view)
		if lbnds and allnds > lbnds:
			if isinstance(features_matrix, np.ndarray) and not features_matrix.flags['OWNDATA']:
				features_matrix = features_matrix[:lbnds, ...]
			else:
				reduced = adjustRows(lbnds, features_matrix, True)
				assert reduced, 'features_matrix is expected to be reduced from {} to {} items'.format(allnds, lbnds)
			embname = os.path.splitext(args.embedding)[0]
			# Consider that .nvc embeddings support multiple loading options and should be retained
			if embext != '.nvc':
				embrds = embname + '.mat'
				embdir, namext = os.path.split(args.embedding)
				move(args.embedding, os.path.join(embdir, 'full_' + namext))
			else:
				embrds = ''.join((embname, '_rds', str(lbnds), '.mat'))
			print('WARNING, features matrix is reduced to the number of nodes in the labels matrix: {} -> {}.'
				  ' Saving the reduced features to the {}...'
				.format(allnds, lbnds, embrds), file=sys.stderr)
			savemat(embrds, mdict={'embs': features_matrix})

	# Cut weights lower than dim_vmin if required
	if args.dim_vmin and not dimweighted:
		if OPTIMIZED:
			sm.quantify(features_matrix, sm.CMP_LT, args.dim_vmin, 0)
		else:
			features_matrix = np.where(features_matrix >= args.dim_vmin, features_matrix, 0)

	# Binarize if required in case of hamming distance evaluation
	if args.binarize:
		medbin = args.metric == 'hamming'  # Binarize to the median instead of reducing mean square error
		sm.binarize(features_matrix, medbin)
		if dis_features_matrix is not None:
			sm.binarize(dis_features_matrix, medbin)

	assert args.metric != 'jacnop' or (features_matrix.max() <= 1 and features_matrix.min() >= -1), (
		'Jacnop should be applied only to the features matrix normalized to 1, i.e. max(abs(mat)) = 1')
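	# Sketch (assumption, not part of the original pipeline): if the assertion
	# above fails, scaling by the max absolute value would normalize the matrix:
	# vmax = np.abs(features_matrix).max()
	# if vmax:
	# 	features_matrix /= vmax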

	# Generate Gram (nodes similarity) matrix only -----------------------------
	if args.mode == 'gram':
		# Note: metric here is distance metric = 1 - sim_metric
		if OPTIMIZED:
			gram = np.empty((features_matrix.shape[0], features_matrix.shape[0]), dtype=ValT)
			metid = sm.sim_id(args.metric)
		else:
			metric = args.metric
			# Explicitly assign the jaccard distance function; other metric names are passed to scipy as distance metrics
			if metric == 'jaccard':
				metric = dist_jaccard
			elif metric == 'jacnop':
				metric = dist_jacnop
				# metric = lambda u, v: 1 - sm.sim_jaccard(u, v)
		if dis_features_matrix is None:
			if OPTIMIZED:
				# Note: pdist takes too much time with a custom dist function: 1m46s for cosine and 40 sec for jaccard vs 8 sec for the builtin "cosine"
				sm.pairsim(gram, features_matrix, metid)
				# gram2 = squareform(ValT(1) - pdist(X_train, metric))  # cosine, jaccard, hamming
				# print('Gram:\n', gram, '\nOrig Gram:\n', gram2)
			else:
				gram = squareform(ValT(1) - pdist(features_matrix, metric))  # cosine, jaccard, hamming
		else:
			if OPTIMIZED:
				sm.pairsimdis(gram, features_matrix, dis_features_matrix, metid)
			else:
				if metric == 'cosine':
					metric = dist_cosine
				elif metric == 'hamming':
					metric = dist_hamming
				# Note: the 1-sim fallback performs less accurately than the custom
				# dissimilarity metric (sm.dissim) of the optimized module
				dis_metric = metric
				gram = pairsimdis(features_matrix, dis_features_matrix, metric, dis_metric)
		# Save resulting Gram (network nodes similarity) matrix
		savemat(args.output, mdict={'gram': gram})
		return
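	# Note (sketch): a Gram matrix saved this way can be consumed directly by
	# scikit-learn, e.g. clf = SVC(kernel='precomputed'); clf.fit(gram, y) for
	# training, and clf.predict(gram_test) with the test-vs-train similarities.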

	# Evaluate Embedding ------------------------------------------------------
	# Map nodes to their features (note: assumes nodes are labeled as integers 1:N)
	# features_matrix = np.asarray([model[str(node)] for node in range(len(graph))])

	# 2. Shuffle, to create train/test groups
	assert labels_matrix.shape[0] == features_matrix.shape[0], 'All evaluated nodes are expected to be labeled'
	shuffles = []
	for x in range(args.num_shuffles):
		if dis_features_matrix is not None:
			shuffles.append(skshuffle(features_matrix, dis_features_matrix, labels_matrix))
		else:
			shuffles.append(skshuffle(features_matrix, labels_matrix))
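	# Note: sklearn.utils.shuffle applies one shared permutation to all arrays
	# passed in, keeping the feature rows aligned with their label rows.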

	# 3. Score each train/test group
	# all_results = defaultdict(list)

	if args.all:
		training_percents = np.asarray(range(1, 10)) * .1
	else:
		training_percents = _trainperc_dfl

	averages = ["micro", "macro"]
	res = np.full([args.num_shuffles, len(training_percents), len(averages)], np.nan, dtype=ValT)
	# for train_percent in training_percents:
	#     for shuf in shuffles:
	Xdis = None
	Xdis_train = None
	res_ave = None  # Average results
	ii = 0
	jj = 0
	try:
		for ii, train_percent in enumerate(training_percents):
			training_size = int(train_percent * features_matrix.shape[0])
			if OPTIMIZED:
				gram = np.empty((training_size, training_size), dtype=ValT)
			gram_test = np.empty((features_matrix.shape[0] - training_size, training_size), dtype=ValT)
			for jj, shuf in enumerate(shuffles):
				print('Training set #{} ({:.1%}), shuffle #{}'.format(ii, train_percent, jj))
				if dis_features_matrix is not None:
					X, Xdis, y = shuf
					#assert len(X) == len(Xdis), 'Feature matrix partitions validation failed'
				else:
					X, y = shuf

				# training_size = int(train_percent * X.shape[0])
				X_train = X[:training_size]
				if dis_features_matrix is not None:
					Xdis_train = Xdis[:training_size]
				y_train_ = y[:training_size]

				X_test = X[training_size:]
				if dis_features_matrix is not None:
					Xdis_test = Xdis[training_size:]
				if OPTIMIZED:
					y_test = sm.colindicesnz(y[training_size:].tocoo())
				else:
					cy = y[training_size:].tocoo()
					y_test = [[] for _ in range(cy.shape[0])]
					for i, j in zip(cy.row, cy.col):
						y_test[i].append(j)
					cy = None

				# find out how many labels should be predicted
				top_k_list = [len(l) for l in y_test]

				# Classification strategy and similarity matrices
				# clf = TopKRanker(SVC(kernel=args.kernel, cache_size=4096, probability=True), 1)  # TopKRanker(LogisticRegression())
				clf = None
				clweight = 'balanced' if args.balance_classes else None
				if args.solver is None:
					clf = TopKRanker(SVC(kernel=args.kernel, cache_size=4096, probability=True, class_weight=clweight, gamma='scale'))  # TopKRanker(LogisticRegression())
				else:
					clf = TopKRanker(LogisticRegression(solver=args.solver, class_weight=clweight, max_iter=512))
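				# TopKRanker (defined earlier in this example) is assumed to follow
				# the common DeepWalk-style scoring pattern: a OneVsRestClassifier
				# whose predict(X, top_k_list) returns, for each sample i, the
				# top_k_list[i] labels with the highest predicted probabilities:
				# probs = super().predict_proba(X)
				# labels = self.classes_[probs[i].argsort()[-k:]]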
				if args.solver is None and args.kernel == 'precomputed':
					# Note: metric here is distance metric = 1 - sim_metric
					if OPTIMIZED:
						metid = sm.sim_id(args.metric)
					else:
						metric = args.metric
						# Explicitly assign the jaccard distance function; other metric names are passed to scipy as distance metrics
						if metric == 'jaccard':
							metric = dist_jaccard
						elif metric == 'jacnop':
							metric = dist_jacnop
							# metric = lambda u, v: 1 - sm.sim_jaccard(u, v)
					if dis_features_matrix is None:
						if OPTIMIZED:
							# Note: pdist takes too much time with a custom dist function: 1m46s for cosine and 40 sec for jaccard vs 8 sec for the builtin "cosine"
							sm.pairsim(gram, X_train, metid)
							# gram2 = squareform(ValT(1) - pdist(X_train, metric))  # cosine, jaccard, hamming
							# print('Gram:\n', gram, '\nOrig Gram:\n', gram2)
							sm.pairsim2(gram_test, X_test, X_train, metid)
							# gram_test2 = ValT(1) - cdist(X_test, X_train, metric);
							# print('\n\nGram test:\n', gram_test, '\nOrig Gram test:\n', gram_test2)
						else:
							gram = squareform(ValT(1) - pdist(X_train, metric))  # cosine, jaccard, hamming
							gram_test = ValT(1) - cdist(X_test, X_train, metric)
					else:
						if OPTIMIZED:
							sm.pairsimdis(gram, X_train, Xdis_train, metid)
							sm.pairsimdis2(gram_test, X_test, X_train, Xdis_test, Xdis_train, metid)
						else:
							if metric == 'cosine':
								metric = dist_cosine
							elif metric == 'hamming':
								metric = dist_hamming
							# Note: the 1-sim fallback performs less accurately than the custom
							# dissimilarity metric (sm.dissim) of the optimized module
							dis_metric = metric

							gram = pairsimdis(X_train, Xdis_train, metric, dis_metric)
							# gram_test = 1 - cdist(X_test, X_train, metric);
							#gram_test = np.empty((len(X_test), training_size), dtype=ValT)
							for i in range(len(X_test)):
								for j in range(training_size):
									# gram_test[i, j] = ValT(1) - metric(X_test[i], X_train[j]) - dis_metric(Xdis_test[i], Xdis_train[j])
									# Note: a positive gram matrix yields a bit more accurate results
									gram_test[i, j] = ValT(1) - (metric(X_test[i], X_train[j]) + dis_metric(Xdis_test[i], Xdis_train[j])) / ValT(2)
					clf.fit(gram, y_train_)
					preds = clf.predict(gram_test, top_k_list)
				else:
					clf.fit(X_train, y_train_)
					preds = clf.predict(X_test, top_k_list)

				# results = {}
				#
				# for average in averages:
				#     results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)
				#
				#  all_results[train_percent].append(res)

				for kk, average in enumerate(averages):
					res[jj, ii, kk] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)
	finally:
		res_ave = np.nanmean(res, 0)
		res_std = np.nanstd(res, 0)
		print("F1 [micro macro]:")
		print(res_ave)
		if len(res_ave) >= 2:
			finres = np.nanmean(res_ave, 0)
			finstd = np.nanmean(res_std, 0)
			print("Average:  {:.4F} ({:.4F}), {:.4F}".format(finres[0], finstd[0], finres[1]))
		else:
			finres = res_ave
			finstd = res_std
		if args.output and ii + jj >= 1:  # Output only non-empty results;  np.nansum(res_ave, 0) != 0
			hbrief = np.uint16(0)
			if args.accuracy_detailed:
				# Compute a 2-byte hash of the input arguments
				hf = md5()
				hf.update(' '.join(sys.argv).encode())
				for i, b in enumerate(hf.digest()):
					hbrief = hbrief ^ (b << (8 if i % 2 else 0))
				# Output detailed accuracy results
				dname, fname = os.path.split(args.embedding)
				acrname = ''.join((dname, '/acr_', os.path.splitext(fname)[0], '_', str(hbrief), '.mat'))
				print('The detailed accuracy results are saved to: ', acrname)
				try:
					savemat(acrname, mdict={'res': res})
				except IOError as err:
					print('WARNING, saving of the detailed accuracy results to {} failed: {}'
						.format(acrname, err), file=sys.stderr)
			with open(args.output, 'a') as fres:
				# Output the Header if required
				if not fres.tell():
					fres.write('Dims\tWgh\tBin\tMetric \tNDs\tDVmin\t F1mic\tF1miSD\t F1mac\t Solver'
						'\tBCl\t ExecTime\t   Folds\t StartTime        \tInpHash\tEmbeds\n')
				# Dimensions number, dimension weighting and binarization flags
				print('{: >4}\t{: >3d}\t{: >3d}\t'.format(features_matrix.shape[1], args.weighted_dims, args.binarize)
					, file=fres, end='')
				# Similarity Metric, weighting, no-dissim and dim-val-min
				if args.solver is None and args.kernel == 'precomputed':
					print('{: <7}\t{: >3d}\t'.format(args.metric[:7]
						, args.no_dissim), file=fres, end='')
				else:
					print('{: <7}\t{: >3}\t'.format('-', '-'), file=fres, end='')
				# DVmin, then F1 micro (with SD) and F1 macro averages
				print('{:<.4F}\t {:<.4F}\t{:<.4F}\t {:<.4F}\t '.format(
					args.dim_vmin, finres[0], finstd[0], finres[1]), file=fres, end='')
				# Solver and execution time
				print('{: >6}\t{: >3}\t {: >8d}\t'.format(
					(args.kernel if args.solver is None else args.solver)[:6]
					, int(args.balance_classes), int(time.clock() - tstart)), file=fres, end='')
				# Folds and the timestamp
				# Correct folds to show counts instead of indices
				jj += 1
				if jj == args.num_shuffles:
					ii += 1
				print('{: >2}.{:0>2}/{: >2}.{:0>2}\t {}\t'.format(ii, jj, res.shape[1], res.shape[0]
					, time.strftime('%y-%m-%d_%H:%M:%S', tstampt)), file=fres, end='')
				print('{: >7}\t{}\n'.format(str(hbrief) if hbrief else '-'
					, os.path.split(args.embedding)[1]), file=fres, end='')
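For reference, the accuracy rows appended above form a tab-separated table with a single header line; a minimal sketch of loading it back for inspection (pandas assumed, and the file name res.tsv is only a placeholder for args.output):

import pandas as pd

# Read the appended evaluation rows; the header names are space-padded
df = pd.read_csv('res.tsv', sep='\t')
df.columns = df.columns.str.strip()
print(df[['Dims', 'Metric', 'F1mic', 'F1mac']].head())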
print("Number of unique questions in this dataset " + str(len(unique_ids))
      )  #this is the length of bit vector (number of unique qual_ids)

# generate the list-of-lists input that MultiLabelBinarizer.fit_transform
# expects, to produce a unique one-hot encoding per qual_id and per label
transform_ids = [[i] for i in unique_ids]

transform_labels = [[i] for i in unique_labels]

# In[5]:

# generate dictionaries that map labels and qual_ids to their respective one-hot encodings
enc = MultiLabelBinarizer()
qual_ids_1hot = (enc.fit_transform(transform_ids)).astype(float)
qual_ids_classes = enc.classes_
qual_ids_dict = dict(zip(unique_ids, qual_ids_1hot))
labels_1hot = enc.fit_transform(transform_labels).astype(float)
labels_classes = enc.classes_
labels_dict = dict(zip(unique_labels, labels_1hot))
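# Example lookup (sketch): each dict maps a raw qual_id or label to its one-hot
# row, a float vector with exactly one 1, e.g. vec = qual_ids_dict[unique_ids[0]].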

# In[6]:

# generate final encoding
final_encoding = []
second_try_flag = False
for i in student_vectors:  #loop over all the students
    interactions_vector = []
    for j in student_vectors[
Ejemplo n.º 28
0
#print(one_hot.fit_transform(feature))

#print(one_hot.classes_)

#Reverse one-Hot Encoding

#print(one_hot.inverse_transform(one_hot.transform(feature)))

#print(pd.get_dummies(feature[:,0]))

#Multiclass One-Hot encoding
multiclass_feature = [("Texas", "Florida"), ("California", "Alabama"),
                      ("Texas", "Florida"), ("Delware", "Florida"),
                      ("Texas", "Alabama")]

one_hot_multiclass = MultiLabelBinarizer()

#print(one_hot_multiclass.fit_transform(multiclass_feature))

#print(one_hot_multiclass.classes_)

#Encoding Ordinal Categories Features

dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

scale_mapper = {"Low": 1, "Medium": 2, "High": 3}

#print(dataframe['Score'].replace(scale_mapper))
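#The replace call above maps the Score column to the ordinal codes [1, 1, 2, 2, 3]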

dataframe = pd.DataFrame({
    "Score":
Ejemplo n.º 29
0
def select_data(XX, YY, ctype, min_samples, outputfolder):
    # convert multilabel to multi-hot
    mlb = MultiLabelBinarizer()

    if ctype == 'diagnostic':
        X = XX[YY.diagnostic_len > 0]
        Y = YY[YY.diagnostic_len > 0]
        mlb.fit(Y.diagnostic.values)
        y = mlb.transform(Y.diagnostic.values)
    elif ctype == 'subdiagnostic':
        counts = pd.Series(np.concatenate(
            YY.subdiagnostic.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.subdiagnostic = YY.subdiagnostic.apply(
            lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['subdiagnostic_len'] = YY.subdiagnostic.apply(lambda x: len(x))
        X = XX[YY.subdiagnostic_len > 0]
        Y = YY[YY.subdiagnostic_len > 0]
        mlb.fit(Y.subdiagnostic.values)
        y = mlb.transform(Y.subdiagnostic.values)
    elif ctype == 'superdiagnostic':
        counts = pd.Series(np.concatenate(
            YY.superdiagnostic.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.superdiagnostic = YY.superdiagnostic.apply(
            lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['superdiagnostic_len'] = YY.superdiagnostic.apply(lambda x: len(x))
        X = XX[YY.superdiagnostic_len > 0]
        Y = YY[YY.superdiagnostic_len > 0]
        mlb.fit(Y.superdiagnostic.values)
        y = mlb.transform(Y.superdiagnostic.values)
    elif ctype == 'form':
        # filter
        counts = pd.Series(np.concatenate(YY.form.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.form = YY.form.apply(
            lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['form_len'] = YY.form.apply(lambda x: len(x))
        # select
        X = XX[YY.form_len > 0]
        Y = YY[YY.form_len > 0]
        mlb.fit(Y.form.values)
        y = mlb.transform(Y.form.values)
    elif ctype == 'rhythm':
        # filter
        counts = pd.Series(np.concatenate(YY.rhythm.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.rhythm = YY.rhythm.apply(
            lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['rhythm_len'] = YY.rhythm.apply(lambda x: len(x))
        # select
        X = XX[YY.rhythm_len > 0]
        Y = YY[YY.rhythm_len > 0]
        mlb.fit(Y.rhythm.values)
        y = mlb.transform(Y.rhythm.values)
    elif ctype == 'all':
        # filter
        counts = pd.Series(np.concatenate(YY.all_scp.values)).value_counts()
        counts = counts[counts > min_samples]
        YY.all_scp = YY.all_scp.apply(
            lambda x: list(set(x).intersection(set(counts.index.values))))
        YY['all_scp_len'] = YY.all_scp.apply(lambda x: len(x))
        # select
        X = XX[YY.all_scp_len > 0]
        Y = YY[YY.all_scp_len > 0]
        mlb.fit(Y.all_scp.values)
        y = mlb.transform(Y.all_scp.values)
    else:
        raise ValueError('Unknown ctype: ' + str(ctype))

    # Save the fitted MultiLabelBinarizer
    with open(outputfolder + 'mlb.pkl', 'wb') as f:
        pickle.dump(mlb, f)

    return X, Y, y, mlb
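A minimal usage sketch (the inputs are hypothetical: XX is assumed to be the sample array and YY a DataFrame with per-record label-list columns such as diagnostic, in the PTB-XL style used above):

# Hypothetical call: keep diagnostic labels occurring more than min_samples times
X, Y, y, mlb = select_data(XX, YY, ctype='diagnostic', min_samples=0,
                           outputfolder='./output/')
print(y.shape)        # (n_selected_records, n_label_classes)
print(mlb.classes_)   # label vocabulary in the multi-hot column order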
Ejemplo n.º 30
0
    def transform(self, X):
        return MultiLabelBinarizer(classes=self.class_labels).fit_transform(X)
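A quick usage sketch of this transform (the names are illustrative; with a fixed classes list the column order is stable, so re-fitting a fresh MultiLabelBinarizer on each call is safe):

from sklearn.preprocessing import MultiLabelBinarizer

class_labels = ['a', 'b', 'c']   # assumed value of self.class_labels
X = [['a'], ['b', 'c'], []]
print(MultiLabelBinarizer(classes=class_labels).fit_transform(X))
# [[1 0 0]
#  [0 1 1]
#  [0 0 0]]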