Example 1
def feature_mean_sd_by_dialect():
    train_vec_file = "data/train.vec"
    train_txt_file = "data/train.txt"

    train_ivectors = load_ivectors(train_vec_file)
    train_labels = load_labels(train_txt_file)

    # Put the data for the dialects in 4 buckets.
    data_per_dialect = {0: [], 1: [], 2: [], 3: []}
    for ivector, dialect in zip(train_ivectors, train_labels):
        data_per_dialect[dialect].append(list(ivector))

    # Convert the lists set up above into np.arrays
    for dialect in data_per_dialect:
        data_per_dialect[dialect] = np.array(data_per_dialect[dialect])

    # Calculate the means of all features separately for all 4 dialects.
    means_by_dialect = {}
    for dialect in data_per_dialect:
        means_by_dialect[dialect] = data_per_dialect[dialect].mean(axis=0)
    # Stack the per-dialect means into one 2-D array (one row per dialect).
    all_means = np.stack(list(means_by_dialect.values()))

    # Calculate the standard deviations of all features separately for all
    # 4 dialects.
    sd_by_dialect = {}
    for dialect in data_per_dialect:
        sd_by_dialect[dialect] = data_per_dialect[dialect].std(axis=0)
    # Stack the per-dialect standard deviations into one 2-D array.
    all_sds = np.stack(list(sd_by_dialect.values()))

    return all_means, all_sds
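For comparison, the same per-dialect statistics can be computed with boolean masks in a few vectorized lines. A minimal sketch on random stand-in data; the array shapes and the four dialect IDs are assumptions for illustration:

import numpy as np

ivectors = np.random.rand(100, 400)          # stand-in for the loaded i-vectors
labels = np.random.randint(0, 4, size=100)   # stand-in dialect labels 0..3

# One row of per-feature means / standard deviations per dialect.
all_means = np.stack([ivectors[labels == d].mean(axis=0) for d in range(4)])
all_sds = np.stack([ivectors[labels == d].std(axis=0) for d in range(4)])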
Example 2
def print_topology(rules):

    label_list = dataset.load_labels()

    root_nodes = {y for x, y in rules}
    tree = {root: [] for root in root_nodes}

    for rule in rules:
        from_label, to_label = rule
        tree[to_label].append(from_label)

    for parent, children in tree.items():
        print(label_list[parent])
        for child in children:
            print("  >", label_list[child])
Example 3
def infer_topology_rules(y_values=None, verbose=False):
    if y_values is None:
        with open(DATA_FOLDER + 'labels_int.p', 'rb') as f:
            y_dict = pickle.load(f)
            y_values = list(y_dict.values())

    print("Inferring topology from {0} classifications".format(len(y_values)))


    label_list = dataset.load_labels()
    n_labels = len(label_list)

    topology_rules = []

    for from_label in range(n_labels):
        n = 0
        counts = np.zeros((n_labels,), dtype=np.int_)  # one co-occurrence counter per label

        if verbose:
            print("Now doing", label_list[from_label])

        for labeling in y_values:
            if from_label not in labeling:
                continue

            n += 1
            for x in labeling:
                counts[x] += 1

        for to_label, count in enumerate(counts):
            if n == 0:  # Let's not draw any conclusions from 0 occurrences
                continue
            if to_label == from_label:  # A label always co-occurs with itself; ignore
                continue

            if count == n:
                topology_rules.append((from_label, to_label))

                if verbose:
                    print("{0} -> {1} ({2})".format(
                        label_list[from_label], label_list[to_label], count))

    print "Done.",len(topology_rules), "rules inferred."
    return topology_rules
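A toy illustration of the inference rule above: from_label -> to_label is added exactly when every labeling containing from_label also contains to_label (count == n). The three labelings here are made up for the example:

labelings = [[0, 1], [0, 1, 2], [1, 2]]  # hypothetical multi-label annotations
rules = []
for a in range(3):
    containing = [lab for lab in labelings if a in lab]
    for b in range(3):
        if b != a and containing and all(b in lab for lab in containing):
            rules.append((a, b))
print(rules)  # [(0, 1), (2, 1)]: labels 0 and 2 never occur without label 1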
Example 4
import learn
from sklearn.metrics import f1_score, precision_score, recall_score, zero_one_loss, hamming_loss
from sklearn.externals import joblib
import numpy as np
import dataset
import infer_topology

DATA_FOLDER = '../data/'

FILES = [
    72102652, 72102581, 88703198, 72120955, 76674874, 89433288, 80114536,
    94086649, 87366150, 87366200, 76745162, 114499672, 115989647, 80045894,
    88814249, 74177349, 85300096, 94087262, 103763713, 112385287, 74177295,
    111392832, 117338702, 105317679
]
FILES = [str(f) for f in FILES]
label_list = dataset.load_labels()

CLASSIFICATIONS = {
    "72102652": ["Personen- en familierecht"],
    "72102581": ["Personen- en familierecht"],
    "88703198": ["Personen- en familierecht"],
    "72120955": ["Civiel recht"],  #Burgerlijk recht; verzekeringsrecht
    "76674874": ["Civiel recht"],  #Huurrecht/Woonrecht
    "89433288": ["Strafrecht"],  #Strafrecht/Strafvordering
    "80114536": ["Strafrecht"],  #Strafrecht/Strafvordering
    "87366150": ["Bestuursrecht"],  #/Staatsrechts
    "87366200": ["Burgerlijk procesrecht"],  #Burgerlijke rechtsvordering
    "76745162": ["Burgerlijk procesrecht"],  #Burgerlijke rechtsvordering
    "114499672": ["Civiel recht"],  #Burgerlijk recht
    "94086649": ["Bestuursrecht", "Strafrecht"],  #Strafrecht/strafvordering
    "115989647": ["Bestuursrecht"],  #Staats-en Bestuursrecht
Example 6
        util.print_dataframe(
            merged.corr('pearson').loc[:'arousal_var', 'Affect_Anger':])
    ground_truth = ground_truth.drop(columns='Key')
    print('\nPearson between features:')
    util.print_dataframe(data.corr('pearson'))
    # print('\nKendall:')
    # util.print_dataframe(data.corr('kendall'))
    # print('\nSpearman:')
    # util.print_dataframe(data.corr('spearman'))
    print('\nMutual Information:')
    for c, col in enumerate(data):
        print(
            col,
            mutual_info_regression(data,
                                   data.iloc[:, c],
                                   discrete_features=False))
    # for col_a, col_b in itertools.combinations(data.columns.tolist(), 2):
    #     print(col_a, col_b, pearsonr(data.loc[:, col_a], data.loc[:, col_b]))


ROOT = r'D:\Datasets\Bilgi Universitesi'
info, labels = db.load_labels(os.path.join(ROOT, 'SPSS.csv'))
text_va = db.load_text_va(info, os.path.join(ROOT, 'text_va'))
# fake = pd.DataFrame(np.random.randint(0, 5, size=text_va.shape), columns=text_va.columns)
# fake['Key'] = text_va['Key']
independence_test(text_va, labels)
text_va = util.preprocess_text_va(text_va)
# fake = pd.DataFrame(np.random.randint(0, 5, size=text_va.shape), columns=text_va.columns)
# fake['Key'] = text_va['Key']
independence_test(text_va, labels)
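A self-contained sketch of the mutual-information loop used above, run on random stand-in columns (the column names are made up; the real code uses the text_va features):

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression

data = pd.DataFrame(np.random.rand(200, 3), columns=['valence', 'arousal', 'x'])
for c, col in enumerate(data):
    print(col, mutual_info_regression(data, data.iloc[:, c],
                                      discrete_features=False))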
Example 7
def multilabel_binary_y(y):
    label_list = dataset.load_labels()

    mlb = MultiLabelBinarizer(classes=range(len(label_list)))
    return mlb.fit_transform(y)
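For context, a quick illustration of what this binarization returns, using three hypothetical classes in place of the real label list:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=range(3))
print(mlb.fit_transform([[0, 2], [1]]))
# [[1 0 1]
#  [0 1 0]]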
Example 8
    matplotlib.use('Agg')

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.axes as axes
import numpy as np
import sklearn.decomposition
from sklearn.manifold import TSNE
import tensorflow as tf

# Internal
import dataset
from model import Model

COLOR_MAP = plt.cm.gist_rainbow
LABELS = dataset.load_labels()
SEED = 0x37255c25


def to_color(index):
    # Map a label index to [0, 1] for indexing into COLOR_MAP.
    return index / (len(LABELS) - 1)


USE_TSNE = True


def pca(train, validate, fname=None):
    fig = plt.figure(1, figsize=(8, 6))
    if not USE_TSNE:
        pca = sklearn.decomposition.PCA(n_components=2, random_state=SEED)
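A minimal sketch of the t-SNE branch this function appears to take when USE_TSNE is set; the embedding matrix here is a random stand-in:

import numpy as np
from sklearn.manifold import TSNE

embeddings = np.random.rand(50, 128)  # stand-in for the model's vectors
points = TSNE(n_components=2, random_state=0x37255c25).fit_transform(embeddings)
# points[:, 0] and points[:, 1] can then be scattered and colored per label.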
Example 9
                        type=str,
                        help="Use an existing model for generation.")

    args = parser.parse_args()

    # Training process.
    if args.train:
        TRAIN_VEC_FILE = "data/train.vec"
        TRAIN_TXT_FILE = "data/train.txt"

        batch_size = 32
        num_epochs = 100
        lr = 1e-4

        train_ivectors = load_ivectors(TRAIN_VEC_FILE)
        train_labels = load_labels(TRAIN_TXT_FILE)
        dataset = SwissDialectDataset(train_ivectors, train_labels)
        data_loader = torch.utils.data.DataLoader(dataset,
                                                  batch_size=batch_size,
                                                  shuffle=True)

        writer = SummaryWriter()

        generator = Generator()
        discriminator = Discriminator()

        criterion = nn.BCELoss()
        discr_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)
        gen_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)

        for epoch in range(num_epochs):
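For reference, a self-contained sketch of one vanilla GAN step matching this setup. The single-layer Generator/Discriminator bodies and the sizes are stand-ins; only the loss and optimizer wiring mirrors the code above:

import torch
import torch.nn as nn

latent_dim, ivec_dim, batch = 100, 400, 32       # assumed sizes
generator = nn.Sequential(nn.Linear(latent_dim, ivec_dim))
discriminator = nn.Sequential(nn.Linear(ivec_dim, 1), nn.Sigmoid())
criterion = nn.BCELoss()
gen_opt = torch.optim.Adam(generator.parameters(), lr=1e-4)
discr_opt = torch.optim.Adam(discriminator.parameters(), lr=1e-4)

real = torch.randn(batch, ivec_dim)              # stands in for a real batch
fake = generator(torch.randn(batch, latent_dim))

# Discriminator step: push real scores toward 1, generated scores toward 0.
discr_opt.zero_grad()
d_loss = (criterion(discriminator(real), torch.ones(batch, 1)) +
          criterion(discriminator(fake.detach()), torch.zeros(batch, 1)))
d_loss.backward()
discr_opt.step()

# Generator step: push the discriminator's score on fakes toward 1.
gen_opt.zero_grad()
g_loss = criterion(discriminator(fake), torch.ones(batch, 1))
g_loss.backward()
gen_opt.step()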
Example 11
        "lr": 0.01,
        "log_interval": 50,
        "c": 1.0,
        "kernel": "rbf",
        "degree": 3,
        "max_iter": -1,
    }

    model = None
    best_model = None
    final_model = None

    if args.original_split:

        train_ivectors = load_ivectors(TRAIN_VEC_FILE)
        train_labels = load_labels(TRAIN_TXT_FILE)
        test_ivectors = load_ivectors(TEST_VEC_FILE)
        test_labels = load_labels(TEST_TXT_FILE)

        if args.gan_ivec_file:
            gan_ivec_file = args.gan_ivec_file
            split_path = os.path.split(gan_ivec_file)
            # Derive the matching label file from the i-vector file name,
            # e.g. "gan-ivectors-100.vec" -> "gan-labels-100.txt".
            split_fname = re.split(r"-|\.", split_path[1])
            gan_txt_file = os.path.join(
                split_path[0],
                split_fname[0] + "-labels-" + split_fname[2] + ".txt")

            gan_ivectors = load_ivectors(args.gan_ivec_file)
            gan_labels = load_labels(gan_txt_file)

            train_ivectors = np.concatenate((train_ivectors, gan_ivectors),