def feature_mean_sd_by_dialect():
    train_vec_file = "data/train.vec"
    train_txt_file = "data/train.txt"

    train_ivectors = load_ivectors(train_vec_file)
    train_labels = load_labels(train_txt_file)

    # Put the data for the dialects in 4 buckets.
    data_per_dialect = {0: [], 1: [], 2: [], 3: []}
    for ivector, dialect in zip(train_ivectors, train_labels):
        data_per_dialect[dialect].append(list(ivector))

    # Convert the lists set up above into np.arrays.
    for dialect in data_per_dialect:
        data_per_dialect[dialect] = np.array(data_per_dialect[dialect])

    # Calculate the means of all features separately for all 4 dialects.
    means_by_dialect = {}
    for dialect in data_per_dialect:
        means_by_dialect[dialect] = data_per_dialect[dialect].mean(axis=0)

    # Stack the per-dialect means into one 2D array (one row per dialect).
    one, two, three, four = means_by_dialect.values()
    all_means = np.concatenate(([one], [two], [three], [four]), axis=0)

    # Calculate the standard deviations of all features separately for all
    # 4 dialects.
    sd_by_dialect = {}
    for dialect in data_per_dialect:
        sd_by_dialect[dialect] = data_per_dialect[dialect].std(axis=0)

    # Stack the per-dialect standard deviations into one 2D array.
    one, two, three, four = sd_by_dialect.values()
    all_sds = np.concatenate(([one], [two], [three], [four]), axis=0)

    return all_means, all_sds
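# `load_ivectors` and `load_labels` are helpers whose definitions are not part
# of this fragment. A minimal sketch of what they might look like, assuming the
# .vec file holds one whitespace-separated i-vector per line and the .txt file
# one integer dialect label per line (both formats are assumptions, not taken
# from this repo; the _sketch names are made up):
def load_ivectors_sketch(vec_file):
    with open(vec_file) as f:
        return np.array([[float(x) for x in line.split()] for line in f])


def load_labels_sketch(txt_file):
    with open(txt_file) as f:
        return np.array([int(line.strip()) for line in f])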
def print_topology(rules):
    label_list = dataset.load_labels()
    # Each rule is a (from_label, to_label) pair; group children under their
    # parent label.
    root_nodes = set(y for x, y in rules)
    tree = {root: [] for root in root_nodes}
    for rule in rules:
        from_label, to_label = rule
        tree[to_label].append(from_label)
    for parent, children in tree.items():
        print(label_list[parent])
        for child in children:
            print(" >", label_list[child])
def infer_topology_rules(y_values=None, verbose=False):
    if y_values is None:
        with open(DATA_FOLDER + 'labels_int.p', 'rb') as f:
            y_dict = pickle.load(f)
        y_values = y_dict.values()
    print("Inferring topology from {0} classifications".format(len(y_values)))
    label_list = dataset.load_labels()
    n_labels = len(label_list)
    topology_rules = []
    for from_label in range(n_labels):
        n = 0
        counts = np.zeros((n_labels,), dtype=np.int_)
        if verbose:
            print("Now doing", label_list[from_label])
        for labeling in y_values:
            if from_label not in labeling:
                continue
            n += 1
            for x in labeling:
                counts[x] += 1
        for to_label, count in enumerate(counts):
            if n == 0:
                # Let's not draw any conclusions from 0 occurrences.
                continue
            if to_label == from_label:
                # A label will always occur with itself; ignore.
                continue
            if count == n:
                # from_label never occurs without to_label: infer a rule.
                topology_rules.append((from_label, to_label))
                if verbose:
                    print("{0} -> {1} ({2})".format(
                        label_list[from_label], label_list[to_label], count))
    print("Done.", len(topology_rules), "rules inferred.")
    return topology_rules
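# Usage sketch: infer the co-occurrence rules from the stored labelings, then
# print the resulting hierarchy with print_topology() defined above.
if __name__ == '__main__':
    rules = infer_topology_rules(verbose=True)
    print_topology(rules)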
import learn
from sklearn.metrics import (f1_score, precision_score, recall_score,
                             zero_one_loss, hamming_loss)
import joblib  # replaces sklearn.externals.joblib, removed in newer sklearn
import numpy as np

import dataset  # needed for dataset.load_labels() below
import infer_topology

DATA_FOLDER = '../data/'

FILES = [
    72102652, 72102581, 88703198, 72120955, 76674874, 89433288,
    80114536, 94086649, 87366150, 87366200, 76745162, 114499672,
    115989647, 80045894, 88814249, 74177349, 85300096, 94087262,
    103763713, 112385287, 74177295, 111392832, 117338702, 105317679,
]
FILES = [str(f) for f in FILES]  # map() returns an iterator in Python 3

label_list = dataset.load_labels()

# Manually assigned ground-truth classifications (Dutch legal practice areas).
CLASSIFICATIONS = {
    "72102652": ["Personen- en familierecht"],
    "72102581": ["Personen- en familierecht"],
    "88703198": ["Personen- en familierecht"],
    "72120955": ["Civiel recht"],  # Burgerlijk recht; verzekeringsrecht
    "76674874": ["Civiel recht"],  # Huurrecht/Woonrecht
    "89433288": ["Strafrecht"],  # Strafrecht/Strafvordering
    "80114536": ["Strafrecht"],  # Strafrecht/Strafvordering
    "87366150": ["Bestuursrecht"],  # /Staatsrecht
    "87366200": ["Burgerlijk procesrecht"],  # Burgerlijke rechtsvordering
    "76745162": ["Burgerlijk procesrecht"],  # Burgerlijke rechtsvordering
    "114499672": ["Civiel recht"],  # Burgerlijk recht
    "94086649": ["Bestuursrecht", "Strafrecht"],  # Strafrecht/strafvordering
    "115989647": ["Bestuursrecht"],  # Staats- en bestuursrecht
    util.print_dataframe(
        merged.corr('pearson').loc[:'arousal_var', 'Affect_Anger':])

    ground_truth = ground_truth.drop(columns='Key')

    print('\nPearson between features:')
    util.print_dataframe(data.corr('pearson'))
    # print('\nKendall:')
    # util.print_dataframe(data.corr('kendall'))
    # print('\nSpearman:')
    # util.print_dataframe(data.corr('spearman'))

    print('\nMutual Information:')
    for c, cols in enumerate(data):
        print(cols,
              mutual_info_regression(data, data.iloc[:, c],
                                     discrete_features=False))
    # for col_a, col_b in itertools.combinations(data.columns.tolist(), 2):
    #     print(col_a, col_b, pearsonr(data.loc[:, col_a], data.loc[:, col_b]))


ROOT = r'D:\Datasets\Bilgi Universitesi'
info, labels = db.load_labels(os.path.join(ROOT, 'SPSS.csv'))
text_va = db.load_text_va(info, os.path.join(ROOT, 'text_va'))
# fake = pd.DataFrame(np.random.randint(0, 5, size=text_va.shape),
#                     columns=text_va.columns)
# fake['Key'] = text_va['Key']
independence_test(text_va, labels)

text_va = util.preprocess_text_va(text_va)
# fake = pd.DataFrame(np.random.randint(0, 5, size=text_va.shape),
#                     columns=text_va.columns)
# fake['Key'] = text_va['Key']
independence_test(text_va, labels)
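# `independence_test` itself is not defined in this fragment. A hypothetical
# sketch of such a test, using a chi-squared test of independence between each
# feature column and the labels (the _sketch name is made up; assumes pandas
# is imported as pd and that labels reduce to a single Series):
from scipy.stats import chi2_contingency

def independence_test_sketch(features, labels):
    y = labels.squeeze()
    for col in features.columns.drop('Key', errors='ignore'):
        table = pd.crosstab(features[col], y)
        chi2, p, dof, _ = chi2_contingency(table)
        print('{}: chi2={:.2f}, p={:.4f} (dof={})'.format(col, chi2, p, dof))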
from sklearn.preprocessing import MultiLabelBinarizer


def multilabel_binary_y(y):
    # Binarize a list of label-index sets into a fixed-width indicator matrix
    # covering every known label.
    label_list = dataset.load_labels()
    mlb = MultiLabelBinarizer(classes=list(range(len(label_list))))
    return mlb.fit_transform(y)
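# Usage sketch (made-up label sets; indicator columns follow label index order):
#   Y = multilabel_binary_y([{0, 2}, {1}])
#   Y[0]  # row with 1s at columns 0 and 2, 0s elsewhere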
import matplotlib
matplotlib.use('Agg')  # select a non-interactive backend before pyplot import
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.axes as axes
import numpy as np
import sklearn.decomposition
from sklearn.manifold import TSNE
import tensorflow as tf

# Internal
import dataset
from model import Model

COLOR_MAP = plt.cm.gist_rainbow
LABELS = dataset.load_labels()
SEED = 0x37255c25


def to_color(index):
    # Map a label index to a value in [0, 1] for the colormap.
    return index / (len(LABELS) - 1)


USE_TSNE = True


def pca(train, validate, fname=None):
    fig = plt.figure(1, figsize=(8, 6))
    if not USE_TSNE:
        pca = sklearn.decomposition.PCA(n_components=2, random_state=SEED)
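# The fragment cuts off inside pca(); a plausible continuation for the t-SNE
# branch, assuming `train` and `validate` are 2-D feature arrays (a sketch,
# not the original code):
#     else:
#         reducer = TSNE(n_components=2, random_state=SEED)
#         points = reducer.fit_transform(np.concatenate([train, validate]))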
                    type=str,
                    help="Use an existing model for generation.")
args = parser.parse_args()

# Training process.
if args.train:
    TRAIN_VEC_FILE = "data/train.vec"
    TRAIN_TXT_FILE = "data/train.txt"

    batch_size = 32
    num_epochs = 100
    lr = 1e-4

    train_ivectors = load_ivectors(TRAIN_VEC_FILE)
    train_labels = load_labels(TRAIN_TXT_FILE)
    dataset = SwissDialectDataset(train_ivectors, train_labels)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=True)

    writer = SummaryWriter()

    generator = Generator()
    discriminator = Discriminator()

    criterion = nn.BCELoss()
    discr_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)
    gen_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)

    for epoch in range(num_epochs):
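        # The original loop body is cut off here. A minimal sketch of one
        # standard GAN step with BCELoss, assuming the data loader yields
        # (ivectors, labels) batches, the Discriminator outputs a sigmoid
        # probability per sample, and the Generator takes Gaussian noise;
        # the latent size of 100 is an assumption, not the project's value:
        for ivectors, _labels in data_loader:
            real = ivectors.float()
            valid = torch.ones(real.size(0), 1)
            fake = torch.zeros(real.size(0), 1)

            # Discriminator step on a real batch and a generated batch.
            discr_optimizer.zero_grad()
            noise = torch.randn(real.size(0), 100)  # assumed latent dim
            generated = generator(noise)
            d_loss = (criterion(discriminator(real), valid)
                      + criterion(discriminator(generated.detach()), fake)) / 2
            d_loss.backward()
            discr_optimizer.step()

            # Generator step: try to make the discriminator output "real".
            gen_optimizer.zero_grad()
            g_loss = criterion(discriminator(generated), valid)
            g_loss.backward()
            gen_optimizer.step()

        writer.add_scalar("loss/discriminator", d_loss.item(), epoch)
        writer.add_scalar("loss/generator", g_loss.item(), epoch)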
"lr": 0.01, "log_interval": 50, "c": 1.0, "kernel": "rbf", "degree": 3, "max_iter": -1, } model = None best_model = None final_model = None if args.original_split: train_ivectors = load_ivectors(TRAIN_VEC_FILE) train_labels = load_labels(TRAIN_TXT_FILE) test_ivectors = load_ivectors(TEST_VEC_FILE) test_labels = load_labels(TEST_TXT_FILE) if args.gan_ivec_file: gan_ivec_file = args.gan_ivec_file split_path = os.path.split(gan_ivec_file) split_fname = re.split("-|\.", split_path[1]) gan_txt_file = os.path.join( split_path[0], split_fname[0] + "-labels-" + split_fname[2] + ".txt") gan_ivectors = load_ivectors(args.gan_ivec_file) gan_labels = load_labels(gan_txt_file) train_ivectors = np.concatenate((train_ivectors, gan_ivectors),