# Relies on module-level imports (xlrd, CommonUtilities) and on the
# SOURCES_BASE_LK_DICT constant defined elsewhere in this module.
def makeJsonPathFile_LK(self):
    print(f"[{type(self).__name__}] Running ---> makeJsonPathFile_LK")
    wb = xlrd.open_workbook(self.lkFileName)
    sheet = wb.sheet_by_index(0)
    for x in range(1, sheet.nrows):  # skip the header row
        CommonUtilities.progressBar(x, sheet.nrows)
        row = CommonUtilities.splitOnComma(sheet.cell_value(x, 0))
        left_r, right_r = self.__getLeftRightElment(row)
        source_lr = CommonUtilities.getSourceFromPath(left_r)
        source_rr = CommonUtilities.getSourceFromPath(right_r)
        # Register the left resource the first time it is encountered.
        if left_r not in self.dictionary_lk_path and left_r not in self.resources_founded:
            self.dictionary_lk_path[left_r] = {source_lr: [left_r]}
            self.resources_founded.append(left_r)
            self.total_object_linked += 1
            self.total_file_linkage += 1
        # Attach the right resource to the left one, grouped by its source.
        if right_r not in self.resources_founded:
            if source_rr not in self.dictionary_lk_path[left_r]:
                self.dictionary_lk_path[left_r][source_rr] = []
            self.dictionary_lk_path[left_r][source_rr].append(right_r)
            self.resources_founded.append(right_r)
            self.total_file_linkage += 1
    CommonUtilities.writeDictToJson(self.dictionary_lk_path, SOURCES_BASE_LK_DICT)
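# A minimal sketch (hypothetical data, no Excel I/O) of the structure that
# makeJsonPathFile_LK accumulates in self.dictionary_lk_path before writing it
# to SOURCES_BASE_LK_DICT: every left resource maps to per-source lists of the
# resources linked to it. The paths below are invented for illustration.
example_lk_dict = {
    "source_a/record_1": {
        "source_a": ["source_a/record_1"],
        "source_b": ["source_b/record_7", "source_b/record_9"],
    }
}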
def __AggregateAttributesFile(self):
    progressCount = 0
    print("")
    CommonUtilities.progressBar(progressCount, len(self.lk_path_dict), status="Loading ..")
    for objectSpect, filepath in self.lk_path_dict.items():
        progressCount += 1
        CommonUtilities.progressBar(progressCount, len(self.lk_path_dict),
                                    status=f"Agg: {objectSpect}")
        self.__current_json_data = CommonUtilities.loadJsonFile(
            f"{self.src_dir_name}/{filepath}")
        # As soon as the file to aggregate is loaded, build the dynamic dictionaries.
        self.dym_dict_local = SecondIterationDictionary(self.__current_json_data)
        self.dym_dict_local.Load()
        # Bind the dynamic dictionaries to the class.
        self.coll_sim_din = self.dym_dict_local.dyn_col_sim
        self.coll_inv_din = self.dym_dict_local.col_inv
        self.__AggregateAttributes()
        CommonUtilities.writeDictToJson(
            self.__current_json_data, f"{self.dst_dir_name}/{filepath}.json")
    self.__dynDict.save()
def __AggregateFileS(self):
    self.progress_bar_count = 0
    print("")
    CommonUtilities.progressBar(self.progress_bar_count, len(self.lk_path_dict),
                                status="Loading ..")
    new_merged_file = self.__AggregateFiles(self.lk_path_dict.values())
    CommonUtilities.writeDictToJson(
        new_merged_file, f"{self.dst_dir_name}/big_cluster.json")
def __AggregateFiles(self, filesList):
    mergedData = {}
    for filepath in filesList:
        jsdata = CommonUtilities.loadJsonFile(f"{self.src_dir_name}/{filepath}")
        # Concatenate the value lists of attributes that share the same name.
        for attrName, attrValue in jsdata.items():
            if attrName not in mergedData:
                mergedData[attrName] = []
            mergedData[attrName] += attrValue
        self.progress_bar_count += 1
        CommonUtilities.progressBar(self.progress_bar_count, len(self.lk_path_dict),
                                    status=f"Agg: {filepath}")
    return mergedData
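# A self-contained sketch (hypothetical data, no file I/O) of the merge
# semantics implemented by __AggregateFiles: value lists belonging to the
# same attribute name are concatenated across files.
def _merge_sketch():
    file_a = {"name": ["Alice"], "age": ["30"]}
    file_b = {"name": ["Bob"], "city": ["Rome"]}
    merged = {}
    for jsdata in (file_a, file_b):
        for attr_name, attr_value in jsdata.items():
            merged.setdefault(attr_name, []).extend(attr_value)
    return merged  # {'name': ['Alice', 'Bob'], 'age': ['30'], 'city': ['Rome']}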
def __AggregateFileSameSourceAndSpect(self):
    progressCount = 0
    print("")
    CommonUtilities.progressBar(progressCount, len(self.lk_path_dict), status="Loading ..")
    for objectSpect, objectSources in self.lk_path_dict.items():
        progressCount += 1
        CommonUtilities.progressBar(progressCount, len(self.lk_path_dict),
                                    status=f"Agg: {objectSpect}")
        self.lk_1_path_dict[objectSpect] = {}
        # Merge, per source, all the files linked to this object spect.
        for sources, filespath in objectSources.items():
            self.__findOrCreateDir(sources)
            self.lk_1_path_dict[objectSpect][sources] = f"{sources}/{self.newFileNameID}"
            new_merged_file = self.__AggregateFiles(filespath)
            CommonUtilities.writeDictToJson(
                new_merged_file, f"{self.dst_dir_name}/{sources}/{self.newFileNameID}.json")
            self.newFileNameID += 1
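# Hypothetical sketch of the index __AggregateFileSameSourceAndSpect builds in
# self.lk_1_path_dict: one merged file per (object spect, source) pair, named
# with the incrementing self.newFileNameID. Keys and paths are invented.
example_lk_1_path_dict = {
    "spect_x": {"source_a": "source_a/0", "source_b": "source_b/1"}
}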
def __AggregateFileSameSpect(self):
    progressCount = 0
    print("")
    CommonUtilities.progressBar(progressCount, len(self.lk_path_dict), status="Loading ..")
    for objectSpect, objectSources in self.lk_path_dict.items():
        progressCount += 1
        CommonUtilities.progressBar(progressCount, len(self.lk_path_dict),
                                    status=f"Agg: {objectSpect}")
        pathToMerge = []
        for sources, filespath in objectSources.items():
            pathToMerge.append(filespath)
        self.lk_2_path_dict[objectSpect] = f"{self.newFileNameID}"
        new_merged_file = self.__AggregateFiles(pathToMerge)
        CommonUtilities.writeDictToJson(
            new_merged_file, f"{self.dst_dir_name}/{self.newFileNameID}.json")
        self.newFileNameID += 1
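# The three aggregation methods appear to form a coarsening pipeline:
#   __AggregateFileSameSourceAndSpect  -> one merged file per (object spect, source)
#   __AggregateFileSameSpect           -> one merged file per object spect, all sources
#   __AggregateFileS                   -> a single big_cluster.json for everything
# Each stage records where it wrote its output in the corresponding
# lk_*_path_dict index, which the next stage consumes as its lk_path_dict.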
import pandas as pd

import CommonUtilities

# Read the ground truth into a pandas DataFrame.
pos_df = pd.read_csv('sources_3/gt_onevalue.csv')
attrNameSet = []

# Collect every attribute name that appears in the ground truth.
for index, row in pos_df.iterrows():
    left_ia = row['left_instance_attribute']
    right_ia = row['right_instance_attribute']
    attrNameSet.append(left_ia)
    attrNameSet.append(right_ia)
    CommonUtilities.progressBar(index + 1, pos_df.shape[0],
                                f"{index+1}/{pos_df.shape[0]}")

# Deduplicate, sort lexicographically, and persist the names.
attrNameSet = set(attrNameSet)
with open("attrToAnalizeFull.txt", "w") as attrFile:
    for attr in sorted(attrNameSet):
        attrFile.write(f"{attr}\n")
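# An equivalent, vectorized sketch (assuming the same CSV columns) that avoids
# the row-by-row iterrows loop: pandas collects and deduplicates both columns
# in one pass. A hypothetical alternative, not part of the pipeline itself.
def collect_gt_attribute_names(csv_path):
    import pandas as pd
    df = pd.read_csv(csv_path)
    return set(df["left_instance_attribute"]) | set(df["right_instance_attribute"])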
import json

import CommonUtilities

with open("attrToAnalizeFull.txt", "r") as AttrNamesF:
    gtAttrNames = AttrNamesF.readlines()
gtAttrNames = set(line.rstrip("\n") for line in gtAttrNames)

with open("sources_3/big_cluster3_refactor.json", 'r') as f:
    distros_dict = json.load(f)

arrResult = ["left_instance_attribute,right_instance_attribute\n"]
for key, value in distros_dict.items():
    # Generate all sorted attribute pairs (the upper triangle of the Cartesian
    # product), restricted to the names that also appear in the ground truth.
    print(f"Working On {key}")
    current_list = set(value)
    slist = sorted(current_list.intersection(gtAttrNames))
    elementCount = len(slist)
    print("List sorted")
    for x in range(elementCount):
        currente = slist[x]
        ll = [currente + "," + s + "\n" for s in slist[x + 1:]]
        arrResult += ll
        CommonUtilities.progressBar(x + 1, elementCount, f"{x + 1}/{elementCount}")
    print(f"Start Writing: {len(arrResult)} lines of {key}")
    with open("sources_3/custom_ground.csv", "a+") as gF:
        gF.writelines(arrResult)
        arrResult = []
    print("File Saved")
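# The nested slice loop above is equivalent to itertools.combinations over the
# sorted attribute list; a minimal sketch with invented attribute names:
import itertools

_attrs = sorted({"a.title", "b.title", "c.name"})
_pairs = [f"{left},{right}\n" for left, right in itertools.combinations(_attrs, 2)]
# _pairs == ['a.title,b.title\n', 'a.title,c.name\n', 'b.title,c.name\n']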