def experiment2(clusters_number, lists_number):
    reader = DataReader()  # "../data/test1.json"
    for i in range(lists_number - 1, lists_number):
        for j in range(0, 2):
            input_matrix = reader.get_matrix_by_list(i)
            if j == 1:
                kmeans = KMeans(n_clusters=clusters_number, init='k-means++')
            else:
                centroids = create_initial_centroids(clusters_number, input_matrix)
                kmeans = KMeans(n_clusters=clusters_number, n_init=1, init=centroids)
            clasterization = kmeans.fit_predict(input_matrix)
            sets = users_index_sets_to_users_sets(
                clusters_list_to_users_index_sets(clasterization), reader)
            print("\nClustering by list %s" % i)
            show_users_sets(sets)
            out = OutputWriter()
            out.write_rewrite(
                OutFiles.centroids_custom if j == 0 else OutFiles.centroids_embedded,
                "")
            print("Metrics:")
            for user_set in sets:
                m = TeamMetric(user_set)
def main(): """ For a given set of pseudo-random number generator (PRNG) seeds, model the function which maps the domain (X) to the range(Y), where X is the index of the sequence for the number generated and Y is the output of the PRNG at that index. """ # Read or Generate data df = Data().get_data() # print("First {} row of data:\n{}".format(3, df.head(n=3))) # Quickly visualise given (row=integer) or all (raw=None) data. line_plot_all(df=df, row=None) display_plots() # Train and predict each PRNG seed's sequence using an Latent State Model. RMSEs = [] for index, row in df.iterrows(): print("LSTM training for PRNG seed {}".format(row['seed'])) lstm = LatentStateModel(df_row=row) lstm.train_recurrent_network() lstm.predict_values() print("RMSE {}".format(lstm.rmse())) RMSEs.append(lstm.rmse()) line_plot_with_predictions(row=row, data_predicted=lstm.y_predicted) cross_correlation(row=row, data_predicted=lstm.y_predicted) display_plots() print("Average RMSE {}".format(sum(RMSEs)/len(RMSEs)))
def read_row(filenames):
    """Read a row of data from a list of H5 files."""
    reader = DataReader(filenames)
    x, y, s = reader.read_row_tf()
    x.set_shape((3, 160, 320))
    y.set_shape(1)
    s.set_shape(1)
    return x, y, s
def proba_util(self, features, proba_tol):
    preds = []
    feature = [features[0]]
    for i, x in enumerate(features[1:]):
        # features[i] is the previous row; a drop in the second-to-last column
        # marks the end of the current group.
        if features[i][-2] > x[-2]:
            preds.extend(dr.predict_from_proba(self.regr.predict_proba(feature), proba_tol))
            feature = [x]
        else:
            feature.append(x)
    # Flush the final group.
    preds.extend(dr.predict_from_proba(self.regr.predict_proba(feature), proba_tol))
    return preds
def main(argv):
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
    log = logging.getLogger(__name__)

    if len(argv) < 4:
        log.error(
            "4 arguments expected: <clinical_trials_path> <drugs_path> <pubmed_path> "
            "<output_path>")
        sys.exit(1)

    spark = get_spark_session("medical-project")

    clinical_trials_path = argv[0]
    drugs_path = argv[1]
    pubmed_path = argv[2]
    output_path = argv[3]

    log.info(f'clinical_trials_path is {clinical_trials_path}')
    log.info(f'drugs_path is {drugs_path}')
    log.info(f'pubmed_path is {pubmed_path}')
    log.info(f'output_path is {output_path}')

    clinical_trials_schema = StructType([
        StructField(CLINICAL_TRIALS_ID, StringType(), True),
        StructField(CLINICAL_TRIALS_SCIENTIFIC_TITLE, StringType(), True),
        StructField(CLINICAL_TRIALS_DATE, StringType(), True),
        StructField(CLINICAL_TRIALS_JOURNAL, StringType(), True),
    ])

    drugs_schema = StructType([
        StructField(DRUGS_ATCCODE, StringType(), True),
        StructField(DRUGS_NAME, StringType(), True)
    ])

    pubmed_schema = StructType([
        StructField(PUBMED_ID, StringType(), True),
        StructField(PUBMED_TITLE, StringType(), True),
        StructField(PUBMED_DATE, StringType(), True),
        StructField(PUBMED_JOURNAL, StringType(), True),
    ])

    # Suppose that data comes in CSV format.
    clinical_trials_df = DataReader.read_csv_with_header_and_schema(
        spark, clinical_trials_schema, clinical_trials_path)
    drugs_df = DataReader.read_csv_with_header_and_schema(
        spark, drugs_schema, drugs_path)
    pubmed_df = DataReader.read_csv_with_header_and_schema(
        spark, pubmed_schema, pubmed_path)

    DrugMentionAnalysis.find_drug_mentions(clinical_trials_df, drugs_df, pubmed_df,
                                           output_path)
def experiment1(clusters_number, lists_number):
    reader = DataReader()  # "../data/test1.json"
    kmeans = KMeans(n_clusters=clusters_number)
    clasterizations = []
    for i in range(0, lists_number):
        clasterization = kmeans.fit_predict(reader.get_matrix_by_list(i))
        sets = users_index_sets_to_users_sets(
            clusters_list_to_users_index_sets(clasterization), reader)
        print("Clustering by list %s" % i)
        show_users_sets(sets)
        for user_set in sets:
            m = TeamMetric(user_set)
            print("Metric is: " + str(m))
def validate_proba(self, file, proba_tol):
    data = dr(file).read_data(precision=self.precision)
    features = data[:, 2:-1]
    labels = data[:, -1].astype(int)
    result = []
    k_fold = KFold(n_splits=2)
    for train, test in k_fold.split(features):
        self.train(features[train], labels[train])
        preds = self.proba_util(features[test], proba_tol)
        result.append(precision_score(labels[test], preds, pos_label=1))
        dr.print_report(self.debug, preds, labels[test])  # Print classification report
    return result
def kek(self, proba, proba_tol):
    file, exp_file = 'corpus_scores\\v2_5_raw_inv.txt', 'corpus_scores\\10_opt_raw.txt'
    data = dr(file).read_data(precision=self.precision)
    features = data[:, 2:-1]
    labels = data[:, -1].astype(int)
    exp_data = dr(exp_file).read_data(precision=self.precision)
    # Features and labels from the second corpus file.
    exp_features = exp_data[:, 2:-1]
    exp_labels = exp_data[:, -1].astype(int)
    f1, f2, l1, l2 = train_test_split(features, labels, test_size=0.5, random_state=0)
    ef1, ef2, el1, el2 = train_test_split(exp_features, exp_labels, test_size=0.5,
                                          random_state=0)
    self.train(ef1, el1)
    if proba:
        preds = self.proba_util(f2, proba_tol)
        result = [precision_score(l2, preds, pos_label=1)]
        dr.print_report(self.debug, preds, l2)  # Print classification report
        self.train(ef2, el2)
        preds = self.proba_util(f1, proba_tol)
        result.append(precision_score(l1, preds, pos_label=1))
        dr.print_report(self.debug, preds, l1)  # Print classification report
    else:
        preds = self.regr.predict(f2)
        result = [precision_score(l2, preds, pos_label=1)]
        dr.print_report(self.debug, preds, l2)  # Print classification report
        self.train(ef2, el2)
        preds = self.regr.predict(f1)
        result.append(precision_score(l1, preds, pos_label=1))
        dr.print_report(self.debug, preds, l1)  # Print classification report
    return result
def cluster_users(clustering_tool, reader: DataReader, clustered_users: List[User],
                  clusters_number: int, lists_count: int) -> Clusterings:
    """
    Cluster users by lists with numbers from 0 to lists_count - 1.
    Users are taken from the reader object. The resulting clusters contain every
    user from the reader except those in clustered_users.
    :param clustering_tool: clustering tool from scikit-learn
    :param reader: DataReader object
    :param clustered_users: list of users that won't appear in the clusterings
    :param clusters_number: expected number of clusters
    :param lists_count: cluster by lists with numbers from 0 to lists_count - 1
    :return: Clusterings object, representing clusterings by different lists
    """
    clustering = Clusterings(clusters_number)
    all_users = reader.get_all_users()
    for list_number in range(0, lists_count):
        features_matrix = get_matrix_by_list_for_not_clustered_users(
            all_users, list_number, clustered_users)
        clusters_list = clustering_tool.fit_predict(features_matrix)
        clustering.add_clustering_for_list(
            convert_clusters_list_to_users_sets(reader, clusters_list, clustered_users),
            list_number)
    return clustering
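# Minimal usage sketch for cluster_users (illustrative only: the file path and the
# KMeans settings below are assumptions, not values taken from the original project).
from sklearn.cluster import KMeans

reader = DataReader("../data/users.json")
clusterings = cluster_users(KMeans(n_clusters=3), reader, clustered_users=[],
                            clusters_number=3, lists_count=2)
for list_number in range(2):
    show_users_sets(clusterings.get_clustering_by_list_number(list_number))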
def agglomerative_vs_pc(teams_number):
    need_balance = False
    for variant in variants:
        reader = DataReader(variant)
        # Note: the loop variable shadows the teams_number parameter.
        for teams_number in range(2, 10):
            clusterize_and_compare_by_desires(reader, teams_number, need_balance)
def main():
    train_path = 'C:/Users/dronp/Documents/TPC/train'
    train_filename = 'gold_labels.txt'
    data, ids = DataReader().read_gold_data(train_path, train_filename)
    texts = [example[0] for example in data]
    labels = [example[1] for example in data]

    solution = Solution()
    predicted = solution.predict(texts)

    accuracy_evaluator = AccuracyEvaluator()
    accuracy_evaluator.evaluate(labels, predicted)
    print(quality(predicted, labels))
def complete_vs_avg(teams_number):
    for variant in variants:
        reader = DataReader(variant)

        def form_line(metric: ClusteringMetric):
            return "{},{},{}".format(metric.average_metric, metric.min_metric,
                                     metric.max_metric)

        clustering_alg = UsersAgglomerativeClustering(reader, teams_number)
        clustering_alg.linkage = "complete"
        sets = clustering_alg.clusterize()
        complete = ClusteringMetric(sets)

        clustering_alg = UsersAgglomerativeClustering(reader, teams_number)
        clustering_alg.linkage = "average"
        sets = clustering_alg.clusterize()
        average = ClusteringMetric(sets)

        print(form_line(average) + "," + form_line(complete))
# Gave good metric
# [6,13,20,24]
# [7,14,15,17]
# [9,12,18,19]
# [4,10,11,25]
# [5,16,21,22]

# The best clustering ever
# [10,20,18,9]
# [16,15,19,22]
# [17,14,7,5]
# [24,25,4,13]
# [6,11,12,21]

if __name__ == '__main__':
    reader = DataReader("../data/ms-sne_names.json")
    clusters_ids = [[41, 50, 51], [42, 46, 47], [43, 54], [44, 53], [45, 48]]

    # Create clusters of users
    clusters = [[reader.get_user_by_id(user_id) for user_id in cluster_ids]
                for cluster_ids in clusters_ids]

    # Display clusters
    print("\nFinal clusters:")
    show_users_sets(clusters)

    # Display clusters metrics
    for user_set in clusters:
        metric = TeamMetric(set(user_set))
def experiment3(clustering_tool_type, clusters_number, input_data_file_name, lists_count):
    result_clusters = []
    is_all_clustered = False
    reader = DataReader(input_data_file_name)
    clustered_users = []
    users_count = len(cu.get_not_clustered_users_set(reader, clustered_users))
    max_cluster_size = int(ceil(users_count / clusters_number))

    while not is_all_clustered:
        # Get clusterings by lists
        clustering_tool = cu.ClusteringTools.build_clustering_tool(
            clusters_number, max_cluster_size, clustering_tool_type)
        clusterings = cu.cluster_users(clustering_tool, reader, clustered_users,
                                       clusters_number, lists_count)

        # Display info about the clustering (temporary)
        print("\nClustering by list %s" % 1)
        show_users_sets(clusterings.get_clustering_by_list_number(0))
        print("Clustering by list %s" % 2)
        show_users_sets(clusterings.get_clustering_by_list_number(1))

        # Find the maximum common part of the clusters of the different lists
        new_cluster = clusterings.get_max_common_part_of_clusterings()
        print("Common part: " + str([user.get_id() for user in new_cluster]))

        # Kick users until the cluster fits the maximum size
        while len(new_cluster) > max_cluster_size:
            new_cluster = cu.kick_user_from_cluster(new_cluster, lists_count)

        # Remember users which have been clustered
        clustered_users.extend(new_cluster)

        # Save the cluster and reduce the required number of clusters
        result_clusters.append(new_cluster)
        clusters_number -= 1

        # Check the termination condition
        is_all_clustered = len(result_clusters) >= CLUSTERS_COUNT

    # Display clusters before balancing
    print("\nClusters before balancing:")
    show_users_sets(result_clusters)

    # Display cluster metrics
    for user_set in result_clusters:
        if len(user_set) != 0:
            metric = TeamMetric(set(user_set))
            print(metric.get_final_metric_value())

    # Are there clusters with more than the maximum number of users? Fix it.
    result_clusters = cu.balance_after_clustering(
        result_clusters,
        cu.get_not_clustered_users_set(reader, clustered_users),
        lists_count, max_cluster_size)

    # Display final clusters
    print("\nFinal clusters:")
    show_users_sets(result_clusters)

    # Display final cluster metrics
    final_metric_value = 0
    for user_set in result_clusters:
        metric = TeamMetric(set(user_set))
        final_metric_value += metric.get_final_metric_value()
        print(metric)

    return {"clusters": result_clusters, "metric": final_metric_value}
f.write(out_string)


desires_weight = 1
need_balance = True
metric_type = MetricTypes.DESIRES
data_files_names = [
    "../data/ms-sne_names.json",
    "../data/eltech-vector.json",
    "../data/users.json"
]

results = ResultHolder(data_files_names)

for data_file_name in data_files_names:
    reader = DataReader(data_file_name)
    max_teams = int(len(reader.get_all_users()) / 2)
    for teams in range(2, max_teams + 1):
        print("\n\nTEAMS: %d\n" % teams)

        # Spectral
        clustering_alg = UsersSpectralClustering(reader, teams,
                                                 desires_weight=desires_weight,
                                                 need_balance=need_balance)
        sets = clustering_alg.cluster()
        results.add_metric_for(
            data_file_name, teams,
            ClusteringMetric(sets, metric_type).get_final_metric())
IMAGE_DIR = "/home/milton/dataset/segmentation/Materials_In_Vessels/Train_Images/"
LABEL_DIR = "/home/milton/dataset/segmentation/Materials_In_Vessels/LiquidSolidLabels/"
PRE_TRAIN_MODEL_PATH = "/home/milton/dataset/trained_models/vgg16.npy"
NUM_CLASSES = 4
EPOCHS = 5
BATCH_SIZE = 5
GPU_NUM = 2
LEARNING_RATE = 1e-5
LOGS_DIR = "/home/milton/research/code-power-logs/fcnvgg16/"
TOWER_NAME = 'tower'
log_device_placement = True

# ..................... Create Data Reader ......................................#
data_reader = DataReader(image_dir=IMAGE_DIR, label_dir=LABEL_DIR,
                         batch_size=BATCH_SIZE)
data_reader.loadDataSet()
ITERATIONS = EPOCHS * data_reader.total_train_count / (BATCH_SIZE * GPU_NUM)
print("Total Iterations {}".format(ITERATIONS))


def tower_loss(scope, images, labels, net, keep_prob):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
        scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
        images: Images. 4D tensor of shape [batch_size, height, width, 3].
        labels: Labels. 1D tensor of shape [batch_size].

    Returns:
        Tensor of shape [] containing the total loss for a batch of data
print("Walking....") walks = self.random_walk.generate(self.graph) DataWriter.write_walks(walk_filename, walks) walks_corpus = DataReader.load_walks(walk_filename) model = Skipgram(sentences=walks_corpus, size=representation_size, window=window_size, min_count=0, trim_rule=None, workers=self.workers) # model.wv.save_word2vec_format(walk_filename[:8], ) model.save(model_filename) print 'Terminal.' if __name__ == '__main__': from utils.data_reader import DataReader from utils.graph import Graph data = DataReader.load_matfile('../data/blogcatalog.mat') is_directed = False graph = Graph(data, is_directed) random_walk = RandomWalk(num_paths=5, path_length=5, alpha=0.0) algorithm = DeepWalk(graph, random_walk, is_directed) algorithm.run(walk_filename='../data/deepwalk.walks', representation_size=100, window_size=5, model_filename='../data/deepwalk.model')
def train_net(net, device, epochs=5, batch_size=1, lr=0.001, val_percent=0.1,
              save_cp=True, img_scale=0.5, mode=0, alpha=.5):
    # mode=0 ==> AP-scheduling only
    # mode=1 ==> power allocation only
    # mode=2 ==> joint AP-scheduling and power allocation
    dataset = DataReader(rawPath, mode)
    n_val = int(len(dataset) * val_percent)
    n_train = len(dataset) - n_val
    train, val = random_split(dataset, [n_train, n_val])
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True,
                              num_workers=8, pin_memory=True)
    val_loader = DataLoader(val, batch_size=batch_size, shuffle=False,
                            num_workers=8, pin_memory=True, drop_last=True)
    print("Number of training instances=" + str(n_train))
    print("Number of validation instances=" + str(n_val))

    writer = SummaryWriter(comment=f'LR_{lr}_BS_{batch_size}_SCALE_{img_scale}')
    global_step = 0

    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {batch_size}
        Learning rate:   {lr}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images scaling:  {img_scale}
    ''')

    # Define optimizer and scheduler
    optimizer = optim.RMSprop(net.parameters(), lr=lr, weight_decay=1e-8,
                              momentum=0.9)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min' if net.n_classes > 1 else 'max', patience=2)
    criterionAP = nn.BCEWithLogitsLoss()
    criterionPower = nn.MSELoss()

    # Main loop over epochs
    for epoch in range(epochs):
        net.train()
        epoch_loss = 0
        with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}',
                  unit='img') as pbar:
            for batch in train_loader:
                imgs = batch['image']
                true_masks = batch['mask']
                true_powers = batch['power']
                assert imgs.shape[1] == net.n_channels, \
                    f'Network has been defined with {net.n_channels} input channels, ' \
                    f'but loaded images have {imgs.shape[1]} channels. Please check that ' \
                    'the images are loaded correctly.'
                imgs = imgs.to(device=device, dtype=torch.float32)
                mask_type = torch.float32 if net.n_classes == 1 else torch.long
                power_type = torch.long
                true_masks = true_masks.to(device=device, dtype=mask_type)
                true_powers = true_powers.to(device=device, dtype=power_type)

                masks_pred = net(imgs)
                if "AP" in masks_pred:
                    lossAP = criterionAP(masks_pred["AP"], true_masks)
                if "Power" in masks_pred:
                    lossPower = criterionPower(masks_pred["Power"], true_powers)

                if mode == 0:
                    loss = lossAP
                elif mode == 1:
                    loss = lossPower
                else:
                    loss = alpha * lossAP + (1 - alpha) * lossPower

                epoch_loss += loss.item()
                writer.add_scalar('Loss/train', loss.item(), global_step)
                pbar.set_postfix(**{'loss (batch)': loss.item()})

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_value_(net.parameters(), 0.1)
                optimizer.step()

                pbar.update(imgs.shape[0])
                global_step += 1

                if global_step % (len(dataset) // (10 * batch_size)) == 0:
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        writer.add_histogram('weights/' + tag,
                                             value.data.cpu().numpy(), global_step)
                        writer.add_histogram('grads/' + tag,
                                             value.grad.data.cpu().numpy(), global_step)
                    val_score_AP, val_score_Power = eval_net_AP_Power(
                        net, val_loader, device)
                    if mode == 0:
                        val_score = val_score_AP
                    elif mode == 1:
                        val_score = val_score_Power
                    else:
                        val_score = alpha * val_score_AP + (1 - alpha) * val_score_Power
                    scheduler.step(val_score)
                    writer.add_scalar('learning_rate',
                                      optimizer.param_groups[0]['lr'], global_step)

                    if net.n_classes > 1:
                        logging.info('Validation cross entropy: {}'.format(val_score))
                        writer.add_scalar('Loss/test', val_score, global_step)
                    else:
                        logging.info('Validation Dice Coeff: {}'.format(val_score))
                        writer.add_scalar('Dice/test', val_score, global_step)

                    writer.add_images('images', imgs, global_step)
                    if net.n_classes == 1:
                        writer.add_images('masks/true', true_masks, global_step)
                        writer.add_images('masks/pred',
                                          torch.sigmoid(masks_pred) > 0.5, global_step)

        if save_cp:
            try:
                os.mkdir(dir_checkpoint)
                logging.info('Created checkpoint directory')
            except OSError:
                pass
            torch.save(net.state_dict(), dir_checkpoint + f'CP_epoch{epoch + 1}.pth')
            logging.info(f'Checkpoint {epoch + 1} saved !')

    writer.close()
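# Tiny standalone sketch of how the mode/alpha options above combine the two losses
# (the tensors here are made-up placeholders, not data from the training script).
import torch
import torch.nn as nn

criterionAP = nn.BCEWithLogitsLoss()
criterionPower = nn.MSELoss()

logits = torch.zeros(2, 4)             # fake AP-scheduling logits
targets = torch.ones(2, 4)             # fake AP-scheduling targets
powers_pred = torch.zeros(2, 4)        # fake power predictions
powers_true = torch.full((2, 4), 0.5)  # fake power targets

alpha = 0.5
lossAP = criterionAP(logits, targets)
lossPower = criterionPower(powers_pred, powers_true)
loss = alpha * lossAP + (1 - alpha) * lossPower  # mode == 2: joint objective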
sets_pc = pc.clusterize()
Serializer.serialize_to_file(sets_pc, "../web-visualiser/pc.json")
Serializer.serialize_to_file(sets_agg, "../web-visualiser/agg.json")
my = ClusteringMetric(sets_pc).get_average_desires_metric()
print("{};{}".format(agglomerative, my))


def agglomerative_vs_pc(teams_number):
    need_balance = False
    for variant in variants:
        reader = DataReader(variant)
        for teams_number in range(2, 10):
            clusterize_and_compare_by_desires(reader, teams_number, need_balance)


def clusterize(filename, teams_number):
    reader = DataReader(filename)
    clustering_alg = UsersAgglomerativeClustering(reader, teams_number)
    cl = clustering_alg.clusterize()
    Serializer.serialize_to_file(cl, "../web-visualiser/data.json")


reader = DataReader("../data/ms-sne.json")
# agglomerative_vs_pc(2)
clusterize_and_compare_by_desires(reader, 2, True)
# clusterize("../data/ms-sne.json", 2)
def clusterize(filename, teams_number):
    reader = DataReader(filename)
    clustering_alg = UsersAgglomerativeClustering(reader, teams_number)
    cl = clustering_alg.clusterize()
    Serializer.serialize_to_file(cl, "../web-visualiser/data.json")
import multiprocessing as mp
import fasttext as ft
import csv, os

FOLDER = "fasttext_tool/"


def saveInfoToFile(row, output):
    output.write("__label__{} {}\n".format(row['polarity'], str(row['text'])))
    return ""


def adjustForm(dataSet, fileName):
    print("Transforming...")
    with open('{}{}'.format(FOLDER, fileName), 'w+') as output:
        dataSet.apply(lambda x: saveInfoToFile(x, output), axis=1)


if __name__ == "__main__":
    dataReader = DataReader()
    evaluator = Evaluator()

    if "data.train" not in os.listdir(FOLDER):
        dataSet = dataReader.read_data_set()
        adjustForm(dataSet, "data.train")

    if "data.test" not in os.listdir(FOLDER):
        testSet = dataReader.read_test_set()
        adjustForm(testSet, "data.test")

    if "model.bin" not in os.listdir(FOLDER):
        model = ft.train_supervised(input=FOLDER + "data.train")
        model.save_model(FOLDER + "model.bin")
    else:
        model = ft.load_model(FOLDER + "model.bin")

    (_, precision, recall) = model.test(FOLDER + "data.test")
    metrics = {'precision': precision,
               'recall': recall,
               'fscore': evaluator.calculate_fscore(precision, recall)}
    metrics_str = evaluator.getString(metrics)
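# For reference, each line written by adjustForm follows fastText's supervised
# training format, "__label__<polarity> <text>". An illustrative (made-up) line:
#
#   __label__positive this film was a pleasant surprise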
import utils.clustering_utils as cu
from experiments.experiment3.values_clustering import ValuesClustering
from utils.data_reader import DataReader
from utils.json_serializer import Serializer
from utils.metrics import ClusteringMetric, MetricTypes

REPEATS_COUNT = 10
CLUSTERS_COUNT = 3
LISTS_COUNT = 2
INPUT_DATA_FILENAME = "../data/users.json"
CLUSTERING_TOOL_TYPE = cu.ClusteringTools.KMEANS
METRIC_TYPE = MetricTypes.LISTS

if __name__ == '__main__':
    reader = DataReader(INPUT_DATA_FILENAME)
    clustering = ValuesClustering(CLUSTERING_TOOL_TYPE, reader)
    result_clusters = clustering.cluster(CLUSTERS_COUNT, LISTS_COUNT, REPEATS_COUNT)
    result_metric = ClusteringMetric(
        [set(cluster) for cluster in result_clusters], METRIC_TYPE)
    print("\n%s" % str(result_metric))
    Serializer.serialize_to_file(result_clusters, "../web-visualiser/data.json")
import pickle
import sys

import numpy as np
import scipy.misc

from utils.data_reader import DataReader

# def readRecords(path):

if __name__ == "__main__":
    mainFolder = "/home/ali/SharedFolder/detector_test/unetOptimization" \
                 "/measurement_campaign_20200430/data/"
    imageFolder = mainFolder + "imgs/"
    powerFolder = mainFolder + "powers/"
    assocFolder = mainFolder + "assoc/"
    path = mainFolder + \
        "res_UL_HP10m-K16-M128-sh0_Opt(IEQpower-lmda0.0,maxMinSNR,UL-bisec-Power(lp)-IPAP(iib)-isRoun0,sci-int,sci-int,1229-76-0,InitAssMat-sta-205).pkl"
    dataSet = DataReader(path)
    sys.exit(1)  # Execution stops here; the code below is currently unreachable.

    with open(path, "rb") as file:
        data = pickle.load(file)
    numIterations = data['iiter']
    for i in np.arange(0, numIterations, 1):
        imageFile = "sample_" + str(i)
        associationMatrix = data['Ipap'][i]['APschdul'][-1]['switch_mat']
        roundedAssociationMatrix = np.around(associationMatrix, decimals=0)
        beta = np.log10(data['lscale_beta'][i])
        allocatedPower = data['pload'][i]['zzeta_opt']
        scipy.misc.toimage(beta).save(imageFile + ".jpg")
        scipy.misc.toimage(associationMatrix).save(imageFile + "_mask.jpg")
from experiments.experiment4.preferences_clustering import PreferencesClustering
from utils.data_reader import DataReader
from utils.json_serializer import Serializer
from utils.metrics import TeamDesiresMetric, ClusteringMetric

__author__ = 'Xomak'

reader = DataReader("../data/ms-sne.json")
pc = PreferencesClustering(reader.get_all_users(), 6)
result = pc.clusterize()

for current_set in result:
    output = []
    for user in current_set:
        output.append(str(user))
    print(','.join(output))
    print(TeamDesiresMetric(current_set))

print(ClusteringMetric(result).get_average_desires_metric())
Serializer.serialize_to_file(result, "../web-visualiser/data.json")