Example #1
def split_and_train(path_to_df, text_field, label_field, split_params={}, save_dir="./", preprocessing_function=None,
                    additional_fields_and_preps={}, additional_data_paths=[], hyperparams={}, log_dir="./",
                    use_gpu=False, postfix="", verbose=True, remove_extra_labels=True):
    """
    Split the dataframe with the given params into train and validation sets, train a model on the train split,
    and evaluate it on the validation split.
    :param path_to_df: str, path to csv or parquet file
    :param text_field: str, column of the dataframe that contains the text to classify
    :param label_field: str, column of the dataframe that contains the label of the corresponding text
    :param split_params: dict, input format: {"seed": int (default 17), "fraction": float (default 0.1)}
    :param save_dir: str, directory to save the txt files
    :param preprocessing_function: function, function to apply on text_field column
    :param additional_fields_and_preps: dict. Dictionary in the following format
    {field_name1: preprocessing_function1, field_name2: preprocessing_function2} to enable custom preprocessing for
    different fields
    :param additional_data_paths: list of str, paths of fasttext format additional data to concat with train file
    :param hyperparams: dict, all hyperparams for train_supervised
    :param log_dir: str, directory to save the training files and the model
    :param use_gpu: bool, use gpu for training
    :param postfix: str, postfix to add to train and validation files
    :param verbose: bool
    :param remove_extra_labels: bool, remove datapoints whose labels appear in additional_data_paths but not in
    train_data_path
    :return: object. FastTextModel
    """

    train_data_path, val_data_path = \
        train_val_split_from_df(path_to_df=path_to_df, text_field=text_field, label_field=label_field,
                                split_params=split_params, save_dir=save_dir,
                                preprocessing_function=preprocessing_function, verbose=verbose,
                                additional_fields_and_preps=additional_fields_and_preps, postfix=postfix)
    if verbose:
        print("train path {}".format(train_data_path))
        print("val path {}".format(val_data_path))

    hypers_new = hyperparams.copy()

    if additional_fields_and_preps:
        hypers_new["result_dir"] = os.path.join(log_dir, "{}_{}".format(hash_function(preprocessing_function),
                                                                        "_".join(additional_fields_and_preps.keys())))
    else:
        hypers_new["result_dir"] = os.path.join(log_dir, hash_function(preprocessing_function))
    hypers_new["use_gpu"] = int(use_gpu)
    hypers_new["split_and_train_params"] = {
        "df_path": path_to_df, "split_params": split_params,
        "additional_fields_and_preps": additional_fields_and_preps, "remove_extra_labels": remove_extra_labels
    }

    return train_supervised(train_data_path=train_data_path, val_data_path=val_data_path,
                            additional_data_paths=additional_data_paths, hyperparams=hypers_new,
                            preprocessing_function=preprocessing_function, remove_extra_labels=remove_extra_labels,
                            log_dir=log_dir, use_gpu=use_gpu, verbose=verbose)
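A usage sketch for the function above; the file path, column names, and hyperparameter values here are illustrative placeholders, not values from the original project:

model = split_and_train(
    path_to_df="data/comments.csv",              # hypothetical input file
    text_field="text",                           # hypothetical text column
    label_field="label",                         # hypothetical label column
    split_params={"seed": 17, "fraction": 0.1},  # hold out 10% for validation
    hyperparams={"epoch": 10, "lr": 0.5},        # standard fastText options
    save_dir="./splits/",
    log_dir="./logs/",
    verbose=True,
)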
Example #2
    def send(self, msg):
        message_hash = utils.hash_function(str(msg))
        prev_hash = self.blockchain.get_latest_block_hash()
        if prev_hash is None:
            prev_hash = constants.GENESIS_HASH

        current_hash = utils.hash_together(message_hash, prev_hash)
        self.blockchain.add(current_hash)

        self.outfile.write("previous hash is " + prev_hash + "\n")
        self.outfile.write("current hash is " + current_hash + "\n")
        self.outfile.write("________________________\n")
        self.outfile.flush()

        return True
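The send method above maintains a hash chain: each message hash is combined with the previous block hash to produce the next block. A minimal sketch of the two helpers it assumes, using SHA-256 hex digests (the real utils module may differ):

import hashlib

def hash_function(data):
    # hex digest of a single string
    return hashlib.sha256(data.encode("utf-8")).hexdigest()

def hash_together(first_hash, second_hash):
    # chain two hex digests by hashing their concatenation
    return hash_function(first_hash + second_hash)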
Example #3
def check_sr(sr, sc):
    return sr == hash_function(sc + cc + secret)
Example #4
# client asking the server for authentication
sock.send('AUTH')


# client verifies the expected value of 'sr'
def check_sr(sr, sc):
    return sr == hash_function(sc + cc + secret)


while True:
    data = sock.recv(size)

    # server challenge in response to the 'AUTH' request
    if 'SC.' in data:
        sc = data[3:]
        cr = hash_function(cc + sc + secret)

        # client sends 'cr' and 'cc' to the server
        sock.send('CR.' + cr + 'CC.' + cc)

    # server sends 'sr'
    if 'SR.' in data:
        sr = data[3:]

        if check_sr(sr, sc):
            print 'TRUE'
        else:
            print 'FALSE'

    # client received an error response; abort the handshake
    if 'ERROR' in data:
        break
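Together with the server loop shown later, this client implements a mutual challenge-response handshake: each side proves knowledge of the shared secret by returning a hash over both nonces and the secret, without ever sending the secret itself. A minimal sketch of the pieces the snippet assumes (hash_function, the client nonce cc, and the pre-shared secret), written in Python 2 to match the snippet; the original code may differ:

import binascii
import hashlib
import os

def hash_function(data):
    # hex digest over the concatenated nonce strings and the shared secret
    return hashlib.sha256(data).hexdigest()

cc = binascii.hexlify(os.urandom(16))  # client nonce, fresh per session
secret = 'shared-secret'               # pre-shared key; placeholder value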
Example #5
def cross_validate(path_to_df, text_field, label_field, n_folds=5, preprocessing_function=None,
                   additional_fields_and_preps={}, additional_data_paths=[], hyperparams={}, report_top_k=True,
                   log_dir="./", use_gpu=False, return_models=False, seed=17, verbose=False, remove_extra_labels=True):
    """

    :param path_to_df: str, path to csv or parquet file
    :param text_field: str, column of the dataframe that contains the text to classify
    :param label_field: str, column of the dataframe that contains the label of the corresponding text
    :param n_folds: int, number of folds
    :param preprocessing_function: function, function to apply on text_field column
    :param additional_fields_and_preps: dict. Dictionary in the following format
    {field_name1: preprocessing_function1, field_name2: preprocessing_function2} to enable custom preprocessing for
    different fields
    :param additional_data_paths: list of str, paths of fasttext format additional data to concat with train file
    :param hyperparams: dict, all hyperparams for train_supervised
    :param report_top_k: bool. If True will return top k scores, otherwise top 1 scores
    :param log_dir: str, directory to save the training files and the model
    :param use_gpu: bool, use gpu for training
    :param return_models: bool. If True will return tuple (scores, models)
    :param seed: int
    :param verbose: bool.
    :param remove_extra_labels: bool, remove datapoints whose labels appear in additional_data_paths but not in
    train_data_path
    :return: list. The scores for each split
    """
    models, scores = [], []

    if path_to_df.endswith("parquet"):
        df = pd.read_parquet(path_to_df)
    else:
        df = pd.read_csv(path_to_df)

    for added_field, prep_f in additional_fields_and_preps.items():
        if df[added_field].dtype != "object":
            df[added_field] = df[added_field].astype(str)
        if prep_f:
            df[added_field] = df[added_field].map(prep_f)
        df[text_field] = df[text_field] + " " + df[added_field]

    for fold_number, val_mask in enumerate(split_list(len(df), n_folds, seed)):
        train_data_path, val_data_path = preprocess_and_save(df, val_mask, text_field, label_field,
                                                             preprocessing_function, additional_fields_and_preps,
                                                             "./tmp_txt/", "_split{}".format(fold_number), verbose, [])

        if verbose:
            print("train path {}".format(train_data_path))
            print("val path {}".format(val_data_path))

        hypers_new = hyperparams.copy()

        if additional_fields_and_preps:
            hypers_new["result_dir"] = os.path.join(log_dir, "{}_{}".format(hash_function(preprocessing_function),
                                                                            "_".join(
                                                                                additional_fields_and_preps.keys())))
        else:
            hypers_new["result_dir"] = os.path.join(log_dir, hash_function(preprocessing_function))
        hypers_new["use_gpu"] = int(use_gpu)
        hypers_new["split_and_train_params"] = {
            "df_path": path_to_df,
            "additional_fields_and_preps": additional_fields_and_preps, "remove_extra_labels": remove_extra_labels
        }

        model = train_supervised(train_data_path=train_data_path, val_data_path=val_data_path,
                                 additional_data_paths=additional_data_paths, hyperparams=hypers_new,
                                 preprocessing_function=preprocessing_function, remove_extra_labels=remove_extra_labels,
                                 log_dir=log_dir, use_gpu=use_gpu, verbose=verbose)

        if report_top_k:
            scores.append(model.top_k_accuracy)
        else:
            scores.append(model.top_1_accuracy)
        if return_models:
            models.append(model)
        del model
        gc.collect()
    if return_models:
        return scores, models
    return scores
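A cross-validation usage sketch; the path, column names, and hyperparameter values are placeholders, not values from the original project:

fold_scores = cross_validate(
    path_to_df="data/comments.csv",        # hypothetical input file
    text_field="text",
    label_field="label",
    n_folds=5,
    hyperparams={"epoch": 10, "lr": 0.5},  # standard fastText options
    report_top_k=False,                    # report top-1 accuracy per fold
    verbose=True,
)
print("mean top-1 accuracy: {}".format(sum(fold_scores) / len(fold_scores)))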
Example #6
    def emit_intermediate(self, key, value):
        # route the key/value pair to one of n_reducers partitions
        hash_value = hash_function(key, self.n_reducers)
        store_key = 'intermediate_' + str(hash_value)
        store_value = str(key) + ':' + str(value)
        self.fs_client.append(store_key, store_value)
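Here hash_function acts as a partitioner: it maps a key to one of n_reducers buckets so that all values for the same key land in the same intermediate store. A minimal sketch, assuming a digest-based implementation (Python's built-in hash() is not stable across processes, so a digest is the safer choice; the original helper may differ):

import hashlib

def hash_function(key, n_buckets):
    # stable bucket index in [0, n_buckets)
    digest = hashlib.md5(str(key).encode("utf-8")).hexdigest()
    return int(digest, 16) % n_buckets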
Example #7
def check_cr(cr, cc):
    return cr == hash_function(cc + sc + secret)
Example #8
    try:
        while True:
            data = connection.recv(1024)

            print data, len(data)

            if data == 'AUTH':
                connection.send('SC.' + sc)

            if 'CR.' in data:
                try:
                    cc_index = data.index('CC.')
                except ValueError:
                    connection.send('ERROR')
                    break

                cr = data[3:cc_index]
                cc = data[(cc_index + 3):]

                # server verifies 'cr'
                if check_cr(cr, cc):
                    sr = hash_function(sc + cc + secret)
                    connection.send('SR.' + sr)
                else:
                    connection.send('ERROR')
                    break

    finally:
        connection.close()
Example #9
def check_cr(cr, cc):
	return cr == hash_function(cc + sc + secret)
Example #10
	try:
		while True:
			data = connection.recv(1024)

			print data, len(data)

			if data == 'AUTH':
				connection.send('SC.' + sc)

			if 'CR.' in data:
				try:
					cc_index = data.index('CC.')
				except ValueError:
					connection.send('ERROR')
					break

				cr = data[3:cc_index]
				cc = data[(cc_index+3):]

				# server verifies 'cr'
				if check_cr(cr, cc):
					sr = hash_function(sc + cc + secret)
					connection.send('SR.' + sr)
				else:
					connection.send('ERROR')
					break

	finally:
		connection.close()
Example #11
def check_sr(sr, sc):
	return sr == hash_function(sc + cc + secret)
Example #12
sock.connect((host, port))

# client asking the server for authentication
sock.send('AUTH')

# client verifies the expected value of 'sr'
def check_sr(sr, sc):
	return sr == hash_function(sc + cc + secret)

while True:
	data = sock.recv(size)

	# server challenge in response to the 'AUTH' request
	if 'SC.' in data:
		sc = data[3:]
		cr = hash_function(cc + sc + secret)

		# client sends 'cr' and 'cc' to the server
		sock.send('CR.' + cr + 'CC.' + cc)

	# server sends 'sr'
	if 'SR.' in data:
		sr = data[3:]

		if check_sr(sr, sc):
			print 'TRUE'
			print 'Authenticated Successfully'
			#Popen([executable, 'python hybrid_attr_iden_main.py'], creationflags=CREATE_NEW_CONSOLE)
			subprocess.call('python hybrid_attr_iden_main.py', shell=True)
		else:
			print 'FALSE'
Example #13
import utils
GENESIS_HASH = utils.hash_function('0')