class ParameterEstimation:

    def __init__(self, network):
        self._network = network
        self._data = DataExtractor(network.name)
        # needed by _get_probabilities below
        self._values_dict = self._data.get_variable_values_sets()

    def _get_probabilities(self, X, S, S_combinations):
        data_vectors = self._data.get_data_vectors()
        N = len(data_vectors[data_vectors.keys()[0]])
        X_values = data_vectors[X]
        observed_prob_dict = {}
        # Now we look for the value x of the variable X
        for x in self._values_dict[X]:
            # finding matches for x
            x_indices = set([element_index for (element_index, element) in enumerate(X_values) if element == x])
            # float() guards against integer division
            observed_prob_dict['P(' + X + '=' + x + ')'] = (len(x_indices) / float(N)) + 0.001
            for S_combination in S_combinations:
                z_indices = self._get_z_indices(S, S_combination)
                z = z_indices
                x_z = x_indices.intersection(z)
                observed_prob_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = (len(x_z) / float(len(z))) + 0.001
        return observed_prob_dict

    def get_estimated_cpds(self):
        values_dict = self._data.get_variable_values_sets()
        cpds = []
        for node in self._network:
            parents = self._network.predecessors(node)
            value_combinations = PGMUtils.get_combinations(parents, values_dict)
            probability_dict = self._get_probabilities(node, parents, value_combinations)
            cpds.append(probability_dict)
        return cpds
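A minimal driver sketch for the class above; the 'cancer' network name and the example edges are placeholder assumptions, and DataExtractor/PGMUtils are assumed to be importable as in the snippet.

# Hypothetical usage of ParameterEstimation; network name and edges are illustrative only.
from networkx import DiGraph

network = DiGraph(name='cancer')
network.add_edges_from([('Pollution', 'Cancer'), ('Smoker', 'Cancer')])

estimator = ParameterEstimation(network)
for cpd in estimator.get_estimated_cpds():
    print(cpd)  # one dictionary of observed probabilities per node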
def __init__(self):
    super().__init__()
    self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
    self.data_extractor = DataExtractor(self.data_set_loc)
    self.util = Util()
    self.sim_act_diff_mov_tf = SimilarActorsFromDiffMovies()
def __init__(self):
    """
    Method Explanation:
        Initializes all the variables for the analysis task.
    """
    self.util = Util()
    self.data_extractor = DataExtractor()
    self.location_id_to_title_map = self.data_extractor.location_mapping()
    self.location_title_to_id_map = self.data_extractor.location_title_to_id_mapping()
    self.location_list = list(self.location_title_to_id_map.values())  # List of location ids
    self.LOCATION_COUNT = len(self.location_list)  # constant
    self.global_term_dictionary_current_index = 0  # Count of unique terms; used to index a given term in the global dictionary
    self.global_term_dictionary = dict()  # Global list of terms as keys and their indices as values
    self.global_term_index_dictionary = dict()  # Global list of terms with indices as keys and terms as values
    self.location_dictionary = dict()  # Terms of a particular location and their corresponding attributes
    self.similarity_matrix = numpy.zeros((self.LOCATION_COUNT, self.LOCATION_COUNT))  # Captures location-location similarity
def __init__(self):
    """
    Initializing the data extractor object to get data from the csv files
    """
    self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
    self.data_extractor = DataExtractor(self.data_set_loc)
def run(self):
    """ Demo for histogram of replytime """
    print('Initialized!')
    print("=====================================")
    print('Unzipping JSONs:')
    print("=====================================")
    zipper = Unzipper(os.path.abspath('zipped_data'))
    zipper.unzip_all()
    print("=====================================")
    print('Importing Raw Data from JSONs')
    print("=====================================")
    extractor = DataExtractor(directory='unzipped/',
                              features=['id_str', 'created_at', ('user', 'id_str'), 'in_reply_to_status_id'])
    extractor.save_csv()
    print("=====================================")
    print('Extracting Reply Time data')
    print("=====================================")
    wrangler = DataWrangler()
    wrangler.replytime_wrangle()
    print("=====================================")
    print('Saving Visualization as "PLOT.png"')
    print("=====================================")
    # read in replytime data
    with open('processed_data', 'rb') as fp:
        replytime_data = pickle.load(fp)
    self.save_plot(replytime_data)
    print('Ran Successfully.')
def test_quadratic_choose_versions(self):
    project = ProjectName.CommonsLang.value
    extractor = DataExtractor(project)
    extractor.choose_versions(algorithm="quadratic", strict="true", version_type=VersionType.Untyped)
    assert True
def __init__(self):
    """
    Initializing the data extractor object to get data from the csv files
    """
    super().__init__()
    self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
    self.data_extractor = DataExtractor(self.data_set_loc)
    actor_actor_matrix_obj.fetchActorActorSimilarityMatrix()
def _extract(self):
    extractor = DataExtractor(self.project)
    path = extractor.get_bugged_methods_path(self.version)
    df = pd.read_csv(path, sep=';')
    key = 'method_id'
    bugged = df.groupby(key).apply(
        lambda x: dict(zip(["is_method_buggy"], x.is_method_buggy))).to_dict()
    self.data.set_raw_data(bugged)
def __init__(self):
    self.conf = ParseConfig()
    self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
    self.data_extractor = DataExtractor(self.data_set_loc)
    self.mlmovies = self.data_extractor.get_mlmovies_data()
    self.genre_tag = GenreTag()
    self.genre_data = self.genre_tag.get_genre_data()
def __init__(self):
    self.conf = ParseConfig()
    self.data_set_loc = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                     self.conf.config_section_mapper("filePath").get("data_set_loc"))
    self.data_extractor = DataExtractor(self.data_set_loc)
    self.mlratings = self.data_extractor.get_mlratings_data()
    self.mlmovies = self.data_extractor.get_mlmovies_data()
    self.imdb_actor_info = self.data_extractor.get_imdb_actor_info_data()
    self.genome_tags = self.data_extractor.get_genome_tags_data()
def _extract(self):
    extractor = DataExtractor(self.project)
    path = extractor.get_bugged_files_path(self.version, True)
    df = pd.read_csv(path, sep=';')
    key = 'file_name'
    assert key in df.columns
    bugged = df.groupby(key).apply(
        lambda x: dict(zip(["is_buggy"], x.is_buggy))).to_dict()
    self.data.set_raw_data(bugged)
def _extract(self):
    # get version_date from apache_versions
    config = Config().config
    repository_data = config["CACHING"]["RepositoryData"]
    path = os.path.join(repository_data, config['DATA_EXTRACTION']["AllVersions"],
                        self.project.github_name, self.project.github_name + ".csv")
    df = pd.read_csv(path, sep=';')
    version_date = df[df['version_name'] == self.version]['version_date'].to_list()[0]
    version_date = datetime.strptime(version_date, '%Y-%m-%d %H:%M:%S')
    # get file list from committed_files
    path = os.path.join(repository_data, config['DATA_EXTRACTION']["CommittedFiles"],
                        self.project.github_name, self.project.github_name + ".csv")
    df = pd.read_csv(path, sep=';')
    issues_path = os.path.join(repository_data, config['DATA_EXTRACTION']["Issues"],
                               self.project.github_name, self.project.github_name + "_dummies.csv")
    issues_df = pd.read_csv(issues_path, sep=';')
    issues_df = df[['commit_id', 'issue_id']].merge(issues_df, on=['issue_id'], how='right')
    # filter commits after version date
    df = df[df.apply(lambda r: datetime.strptime(r['commit_date'], '%Y-%m-%d %H:%M:%S') < version_date, axis=1)]
    # split by file_name
    data = {}
    issues_data = {}
    extractor = DataExtractor(self.project)
    path = extractor.get_bugged_files_path(self.version, True)
    files = pd.read_csv(path, sep=';')['file_name'].to_list()
    df = df[df.apply(lambda r: r['file_name'].endswith('.java') and r['file_name'] in files, axis=1)]
    for file_name, file_df in df.groupby('file_name', as_index=False):
        norm_name = os.path.normpath(file_name).lower()
        if norm_name not in self.file_analyser.relative_paths:
            continue
        name = self.file_analyser.relative_paths[norm_name]
        data[name] = self._extract_process_features(file_df)
        issues_data[name] = self._extract_issues_features(file_df, issues_df, self._get_blame_data(file_name))
    # extract the following features:
    self.data.add(ProcessData(self.project, self.version, data=data)).add(
        IssuesProductData(self.project, self.version, data=issues_data)).add(
        IssuesProcessData(self.project, self.version, data=issues_data))
def __init__(self):
    self.data_extractor = DataExtractor()
    self.mapping = self.data_extractor.location_mapping()
    self.location_names = list(self.mapping.values())
    self.reference_model = 'CM3x3'
    self.model_list = self.init_model_list()
    self.reference_df = pd.DataFrame()
    self.df_list = self.init_df_list()
    self.data_dict = dict()
    self.minmax_scaler = MinMaxScaler()
def __init__(self):
    self.conf = ParseConfig()
    self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
    self.data_extractor = DataExtractor(self.data_set_loc)
    self.ordered_years = []
    self.ordered_movie_names = []
    self.ordered_actor_names = []
    self.print_list = ["\n\nFor Years:", "\n\nFor Movies:", "\n\nFor Actors:"]
    self.util = Util()
    self.tensor = self.fetchActorMovieYearTensor()
    self.factors = self.util.CPDecomposition(self.tensor, 5)
def generate_img_img_adj_matrix(self):
    """ Method: generate image-image similarity matrix and stash in pickle file """
    print("getting and normalizing data...")
    data_extractor = DataExtractor()
    loc_mapping = data_extractor.location_mapping()
    self.img_feature_matrix = data_extractor.prepare_dataset_for_task6(loc_mapping)
    scaler = MinMaxScaler()
    scaler.fit(list(self.img_feature_matrix.values()))
    for img, feature in self.img_feature_matrix.items():
        self.img_feature_matrix[img] = scaler.transform([feature])[0]
    self.img_ids = list(self.img_feature_matrix.keys())
def _extract(self):
    extractor = DataExtractor(self.project)
    path = extractor.get_bugged_files_path(self.version, True)
    df = pd.read_csv(path, sep=';')
    key = 'file_name'
    assert key in df.columns
    bugged = df.groupby(key).apply(
        lambda x: dict(zip(["is_buggy"], x.is_buggy))).to_dict()
    ans = dict()
    for name, value in bugged.items():
        norm_name = os.path.normpath(name.lower())
        if norm_name in self.file_analyser.relative_paths:
            ans[self.file_analyser.relative_paths[norm_name]] = value
    self.data.set_raw_data(ans)
def train_test_dog_breed_detector(do_model_test=True):
    '''
    Trains and tests the dog breed detector model
    :param do_model_test: set to True for running prediction on the test data
    :return: dog breed detector model
    '''
    # get train and validation data and labels
    data_extractor = DataExtractor('dog_images/train', 'dog_images/valid', 'dog_images/test')
    train_data = data_extractor.load_train_data()
    train_labels, train_num_dog_breeds = data_extractor.load_train_labels()
    valid_data = data_extractor.load_valid_data()
    valid_labels, valid_num_dog_breeds = data_extractor.load_valid_labels()
    assert train_num_dog_breeds == valid_num_dog_breeds

    # create and init the model
    model = DogBreedDetectorModel(num_dog_breeds=train_num_dog_breeds)
    model.init_model(train_data)

    # train the model
    if not model.load_model():
        model.train(train_data, train_labels, valid_data, valid_labels)

    # load test data and predict
    if do_model_test:
        test_data = data_extractor.load_test_data()
        test_labels, test_num_dog_breeds = data_extractor.load_test_labels()
        assert train_num_dog_breeds == test_num_dog_breeds
        model.predict(test_data, test_labels)

    return model
def get_FRIENDS_summary():
    read_path = "html/FRIENDS/summary/summary.json"
    save_dir = "data/FRIENDS/summary/"
    save_name = "summary.json"
    fh = open(read_path, "r")
    json_str = fh.read()
    fh.close()
    json_obj = json.loads(json_str)
    html = json_obj["summary"]
    extractor = DataExtractor("html")
    extractor.read(html)
    h3_tags = extractor.findAllTags("h3")
    h3_contents = extractor.extractTextFromTagList(h3_tags)
    h3_contents.append("END")
    dl_tags = extractor.findAllTags("dl")
    dl_contents = " ".join(extractor.extractTextFromTagList(dl_tags))
    dl_contents = dl_contents + " END"
    res = {}
    for i in range(len(h3_contents) - 1):
        begin_txt = h3_contents[i]
        end_txt = h3_contents[i + 1]
        begin_idx = dl_contents.index(begin_txt)
        end_idx = dl_contents.index(end_txt)
        summary = dl_contents[begin_idx + len(begin_txt) + 1:end_idx]
        try:
            strip_idx = summary.index(" ")
            summary = summary[:strip_idx]
        except Exception:
            pass
        seq_num = begin_txt[:begin_txt.index(" ")]
        dot_idx = seq_num.index(".")
        s_num = seq_num[:dot_idx]
        e_num = seq_num[dot_idx + 1:]
        seq_num = s_num.zfill(2) + e_num.zfill(2)
        res[seq_num] = summary
    print("Serializing...")
    json_str = json.dumps(res)
    print("Done.")
    print("Saving...")
    extractor.save(json_str, save_dir + save_name)
    print("Done.")
def __init__(self):
    self.conf = ParseConfig()
    self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
    self.data_extractor = DataExtractor(self.data_set_loc)
    self.max_ratings = 5
    self.ordered_ratings = [0, 1, 2, 3, 4, 5]
    self.ordered_movie_names = []
    self.ordered_tag_names = []
    self.print_list = ["\n\nFor Tags:", "\n\nFor Movies:", "\n\nFor Ratings:"]
    self.util = Util()
    self.tensor = self.fetchTagMovieRatingTensor()
    self.factors = self.util.CPDecomposition(self.tensor, 5)
def get_data(project_dirs, batch_dict):
    df_ls = []
    batches_found = []
    batches_not_found = []
    for project_dir in project_dirs:
        extractor = DataExtractor(project_dir=project_dir, accession_dict=batch_dict, lib_type='rna')
        df_project_dir = extractor.collect_data()
        df_ls.append(df_project_dir)
        batches_found.extend(list(extractor.batches_found))
        batches_not_found.extend(list(extractor.batches_not_found))
    df = pd.concat(df_ls)
    # batches not found in one project directory may be found in another
    batches_not_found_all = list(set(batches_not_found).difference(batches_found))
    return df, batches_found, batches_not_found_all
def __init__(self, network_name):
    self._network_name = network_name
    self._data = DataExtractor(network_name)
    self._values_dict = self._data.get_variable_values_sets()
    self._node_names = self._values_dict.keys()
    self._graph = None
    self._nmis = {}
def __init__(self, crawlerDAO, site):
    self.url = site
    self.crawlerDAO = crawlerDAO
    self.visited = []
    self.extractor = DataExtractor()
class DataExtractorTest(unittest.TestCase):

    def setUp(self):
        self.extractor = DataExtractor(raw_data)
        self.extractor.extract()
        self.race = self.extractor.get_race()

    def test_extracts_heat(self):
        self.assertEquals(60, self.race.heat)

    def test_extracts_race_date_and_time(self):
        self.assertEquals(datetime.date(2011, 12, 23), self.race.date)
        self.assertEquals(datetime.time(20, 36), self.race.time)

    def test_extract_driver_list(self):
        drivers = [u'CiglaR', u'CASPER', u'Brzi', u'bR1ck', u'gogoGT',
                   u'Shorty', u'dastrong', u'skrla', u'slavisha', u'VINKO']
        self.assertEquals(drivers, self.race.driver_list)
class CoactorCoactorMatrix(object):
    """
    Class to compute the Coactor Matrix which represents the number of movies
    each pair of actors have acted in, together
    """

    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)

    def fetchCoactorCoactorSimilarityMatrix(self):
        """
        Creates the coactor matrix with all the actors in a given set
        :return: coactor matrix
        """
        movie_actor_df = self.data_extractor.get_movie_actor_data()
        movie_actor_set_df = movie_actor_df.groupby(['actorid'])["movieid"].apply(set).reset_index()
        num_of_actors = len(movie_actor_df.actorid.unique())
        coactor_matrix = [[0] * num_of_actors for i in range(num_of_actors)]
        for index, movie_set in zip(movie_actor_set_df.index, movie_actor_set_df.movieid):
            for index_2, movie_set_2 in zip(movie_actor_set_df.index, movie_actor_set_df.movieid):
                if index != index_2:
                    coactor_matrix[index][index_2] = len(movie_set.intersection(movie_set_2))
        numpy.savetxt("coactor_coactor_matrix.csv", coactor_matrix, delimiter=",")
        return coactor_matrix, movie_actor_set_df.actorid.unique()
def __init__(self):
    """
    Initializing the data extractor object to get data from the csv files
    """
    super().__init__()
    self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
    self.data_extractor = DataExtractor(self.data_set_loc)
    self.sim_act_diff_mov_tf = SimilarActorsFromDiffMovies()
def __init__(self, network_name, num_BNs, max_parents):
    self._num_BNs = num_BNs
    self._network_name = network_name
    self._data_extractor = DataExtractor(network_name)
    self._node_names = self._data_extractor.get_variable_values_sets().keys()
    self._num_nodes = len(self._data_extractor.get_variable_values_sets())
    self._max_parents = max_parents
    if num_BNs > pow(2, (self._num_nodes * (self._num_nodes - 1)) / float(2)):
        # raising a bare string is invalid; use a proper exception type
        raise ValueError('Invalid number of unique bayesian networks!')
def __init__(self):
    data_extractor = DataExtractor()
    self.sorter = Sorter(contigs=data_extractor.ctgs, markers=data_extractor.mrkrs)
    self.chr_ctg = self.sorter.chr_ctg_dict()
    self.ctg_fasta = list(Fasta_B10v2_c_corr().generator())  # namedtuple('FastaRecord', ['id', 'sequence'])
    self.ctg_fasta_dict = self.dict_ctg_fasta()
    self.chr_ctg_order = self.sorter.ctg_order_in_chr()
    self.Contig = namedtuple("Contig", ['id', 'start', 'length', 'seq'])
class LdaGenreActor(GenreTag):

    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def get_lda_data(self, genre):
        """
        Does LDA on movie-actor counts and outputs movies in terms of latent semantics as U
        and actors in terms of latent semantics as Vh
        :param genre:
        :return: returns U and Vh
        """
        # Getting movie_genre_data
        movie_genre_data_frame = self.data_extractor.get_mlmovies_data()
        movie_genre_data_frame = self.split_genres(movie_genre_data_frame)
        # Getting actor_movie_data
        movie_actor_data_frame = self.data_extractor.get_movie_actor_data()
        genre_actor_frame = movie_genre_data_frame.merge(movie_actor_data_frame, how="left",
                                                         left_on="movieid", right_on="movieid")
        # genre_actor_frame = genre_actor_frame[genre_actor_frame['year'].notnull()].reset_index()
        genre_actor_frame = genre_actor_frame[["movieid", "year", "genre", "actorid", "actor_movie_rank"]]
        genre_actor_frame["actorid_string"] = pd.Series(
            [str(id) for id in genre_actor_frame.actorid], index=genre_actor_frame.index)
        genre_data_frame = genre_actor_frame[genre_actor_frame["genre"] == genre]
        actor_df = genre_data_frame.groupby(['movieid'])['actorid_string'].apply(list).reset_index()
        actor_df = actor_df.sort_values('movieid')
        actor_df.to_csv('movie_actor_lda.csv', index=True, encoding='utf-8')
        actor_df = list(actor_df.iloc[:, 1])
        (U, Vh) = util.LDA(actor_df, num_topics=4, num_features=1000)
        for latent in Vh:
            print("\n")
            print(latent)
def load_data(self):
    directory_path = filedialog.askdirectory(initialdir=os.getcwd(), mustexist=True,
                                             title="Please select the data directory...")
    extractor = DataExtractor(directory_path, self.best_nest_var.get(), self.max_sim_time_var.get())
    invalid_files, unfinished_sims = extractor.extract_data()
    self.data_set = extractor.data_set
    self.data_plot = DataPlotter(self.data_set)
    msg_string = "%s simulations had missing or blank files.\n" % invalid_files
    msg_string += "%s simulations exceeded the maximum time and so were removed." % unfinished_sims
    messagebox.showinfo('Data Loaded', msg_string)
    self.list_box.delete(0, tk.END)
    grid_row = 0
    for data in self.data_set:
        raw_data_string = ""
        for key, value in data.items():
            raw_data_string += "%s=%s, " % (key, value)
        grid_row += 1
        self.list_box.insert(tk.END, raw_data_string[:-2])
        if grid_row % 2 == 0:
            self.list_box.itemconfig(tk.END, bg='#e0e0e0')
        else:
            self.list_box.itemconfig(tk.END, bg='#f4f4f4')
    # Updating the list of options to split the data by
    options = self.data_set[0].keys()
    menu = self.split_options["menu"]
    menu.delete(0, "end")
    menu.add_command(label='none', command=lambda: self.split_on_var.set('none'))
    for string in options:
        menu.add_command(label=string,
                         command=lambda option=string: self.split_on_var.set(option))
    self.add_button.config(state=tk.ACTIVE)
def extract_data(project_ref):
    index = project_ref[0]
    project = project_ref[1]
    general_log = logging.getLogger(__name__)
    success_log = logging.getLogger("success")
    failure_log = logging.getLogger("failure")
    failure_verbose_log = logging.getLogger("failure_verbose")
    general_log.info(str(index) + ": " + project.github())
    try:
        extractor = DataExtractor(project)
        extractor.extract()
        success_log.info("Succeeded to extract {0}.".format(project.github()))
    except Exception as e:
        failure_log.error("Failed to extract {0}.".format(project.github()))
        failure_verbose_log.exception("Failed to extract {0}.".format(project.github()))
        return e
    return
def __init__(self):
    super().__init__()
    self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
    self.data_extractor = DataExtractor(self.data_set_loc)
    self.actor_matrix, self.actorids = self.fetchActorActorSimilarityMatrix()
    self.coactor_obj = CoactorCoactorMatrix()
    self.coactor_matrix, self.coactorids = self.coactor_obj.fetchCoactorCoactorSimilarityMatrix()
    self.util = Util()
def load_data(exp_params):
    '''
    Loads the data from a CSV or from the db, and it will cache the results
    '''
    # declare the global up front; declaring it after first use is invalid
    global LOADING_DB_DATA
    start = time.time()
    if CSV_LOCATION:  # uses a global variable, ugly but prettier than passing it every time
        print "reading data from local csv"
        df = pd.read_csv(CSV_LOCATION)
    elif not LOADING_DB_DATA:
        print "fetching data from db or cache"
        LOADING_DB_DATA = True
        df = DataExtractor(exp_params).get_data()
        LOADING_DB_DATA = False
    else:
        print "waiting for query to end"
        while LOADING_DB_DATA:
            time.sleep(1)
        df = DataExtractor(exp_params).get_data()

    # get date to be a string and also save the unix time for every date
    df['theday'] = df['theday'].astype('str')
    df['thedayunix'] = pd.to_datetime(df['theday']).astype(np.int64) // 10**9
    # drop any empty values
    df = df.dropna()
    end = time.time()
    print "loading data took {} seconds".format(end - start)
    return df.to_json(date_format='iso', orient='split')
def __init__(self, hyperparameter, initial_bayesian_network, tabu_list_size, max_change_count):
    self._bayesian_network = initial_bayesian_network
    self._best_score = -float('inf')
    self._best_solution = initial_bayesian_network
    self._actions_list = ['add', 'remove', 'reverse']
    self._tabu_list = OrderedDict()
    self._tabulist_size = tabu_list_size
    self._max_change_count = max_change_count
    self._data = DataExtractor(initial_bayesian_network.name)
    self._node_names = self._data.get_variable_values_sets().keys()
    values_sets = self._data.get_variable_values_sets()
    data_vectors = self._data.get_data_vectors()
    self._score_util = BDeuScoreUtil(hyperparameter, self._bayesian_network, data_vectors, values_sets)
class Crawler():

    def __init__(self, crawlerDAO, site):
        self.url = site
        self.crawlerDAO = crawlerDAO
        self.visited = []
        self.extractor = DataExtractor()

    def run(self):
        urls = [self.url]
        print(colored('STARTING CRAWL ON SITE:' + self.url, 'green'))
        start = time.time()
        self.search(urls)
        end = time.time() - start
        # Final calculations
        print(colored('[LOG] SUCCESS FINISHED CRAWL', 'green'))
        print('-------------------------------------------------------------')
        print('SEARCH FINISHED - SEARCH TIME: ' + str(end / 60))
        print('-------------------------------------------------------------')

    def sync(self, data):
        # Serialize the data object to JSON
        jsonDATA = json.dumps(data, default=lambda o: o.__dict__)
        self.crawlerDAO.insertDataJSON(jsonDATA)

    def search(self, urls):
        for url in urls:
            if url not in self.visited:
                data = Data()
                data = self.extractor.getData(url)
                # Add the new URLs found during getData to the crawl queue.
                urls = urls + data.toCrawl
                data.toCrawl = []
                # Sync the new information with the client.
                self.sync(data)
                self.visited.append(url)
                self.search(urls)
def execute(project):
    general_log = logging.getLogger(__name__)
    success_log = logging.getLogger('success')
    failure_log = logging.getLogger('failure')
    general_log.info("Extracting project {}...".format(project.github()))
    try:
        DataExtractor(project)
    except Exception as e:
        failure_log.exception("Failed to extract {0}.".format(project.github()))
        return e
    success_log.info("Succeeded to extract {0}.".format(project.github()))
    return
def setup_data(meta_data_loc):
    """
    Sets up the data so we can facilitate search queries on it.
    :param meta_data_loc: location of the meta data file
    :return: list of all data
    """
    meta_data = []
    with open(meta_data_loc) as json_file:
        meta_data = json.load(json_file)
    data = []
    for row in meta_data:
        data.append(DataExtractor(row["name_references"], row["location"], DIR_PATH))
    return data
def run(self):
    np.random.seed(self.config['seed'])
    image_dict = {}
    for i, experiment_parameters in enumerate(self.experiment_parameters_list):
        print(experiment_parameters)
        (train_data, probe_data, train_update_sets, test_update_sets, test_set) = DataExtractor().get_data()
        shadow_model = TrainShadow(experiment_parameters).get_trained_model(train_data, test_set)
        train_deltas = TrainUpdateModels('train', experiment_parameters).get_update_dataset(probe_data, train_update_sets, shadow_model)
        test_deltas = TrainUpdateModels('test', experiment_parameters).get_update_dataset(probe_data, test_update_sets, shadow_model)
        (generator, encoder) = TrainGan(train_deltas, train_update_sets, experiment_parameters).get_GAN()
        generated_images = ImageGenerator(experiment_parameters).get_images(test_update_sets, test_deltas, encoder, generator)
        Utils().add_images_to_dict(3, image_dict, generated_images, experiment_parameters)
        print(f'finished running experiment {i + 1} out of {len(self.experiment_parameters_list)}')
    plot_dict(image_dict, len(self.experiment_parameters_list))
    Utils().plot_update(test_update_sets[1])
    Utils().plot_generated(generated_images[1])
    return generated_images
def setUp(self):
    self.extractor = DataExtractor(raw_data)
    self.extractor.extract()
    self.race = self.extractor.get_race()
def __init__(self, network):
    self._network = network
    self._data = DataExtractor(network.name)
class GreedyHillClimber:

    _max_change_count = 20

    def __init__(self, hyperparameter, initial_bayesian_network, tabu_list_size, max_change_count):
        self._bayesian_network = initial_bayesian_network
        self._best_score = -float('inf')
        self._best_solution = initial_bayesian_network
        self._actions_list = ['add', 'remove', 'reverse']
        self._tabu_list = OrderedDict()
        self._tabulist_size = tabu_list_size
        self._max_change_count = max_change_count
        self._data = DataExtractor(initial_bayesian_network.name)
        self._node_names = self._data.get_variable_values_sets().keys()
        values_sets = self._data.get_variable_values_sets()
        data_vectors = self._data.get_data_vectors()
        self._score_util = BDeuScoreUtil(hyperparameter, self._bayesian_network, data_vectors, values_sets)

    def _get_score(self, action=None, edge=None):
        # We calculate the score using the BDeu score calculator
        return self._score_util.get_score(action, edge)

    def _equals(self, bayesian_network_A, bayesian_network_B):
        # Return true if two bayesian networks with identical nodes
        # also have identical edges.
        signature_A = self._get_bn_signature(bayesian_network_A)
        signature_B = self._get_bn_signature(bayesian_network_B)
        return signature_A == signature_B

    def _tabu_list_contains(self, bayesian_network):
        # Returns true if the tabu list contains the given bayesian network
        solution_signature = self._get_bn_signature(bayesian_network)
        has_solution = solution_signature in self._tabu_list
        if has_solution:
            pass  # print 'solution is contained in tabulist(length = ', len(self._tabu_list), ')'
        else:
            pass  # print 'solution is not contained in tabulist'
        return has_solution

    def _get_bn_signature(self, bayesian_network):
        # Generate a string from the edge set of the given bayesian
        # network which is unique for a given edge set
        edge_string_list = []
        for edge in bayesian_network.edges():
            edge_string = str(edge[0]) + '-' + str(edge[1])
            edge_string_list.append(edge_string)
        signature = ' '.join(edge_string_list)
        return signature

    def _add_solution_to_tabu_list(self, bayesian_network):
        # Adds the given bayesian network to the tabu list
        if len(self._tabu_list) == self._tabulist_size:
            first_key = self._tabu_list.keys()[0]
            self._tabu_list.pop(first_key)
        solution_signature = self._get_bn_signature(bayesian_network)
        self._tabu_list[solution_signature] = 'dummy'

    def _get_feasible_local_solutions(self, bayesian_network, undirected_graph, edge):
        local_solutions_action_pairs = []
        # Calculate all possible local solutions by applying
        # all the possible actions.
        temp_bn = deepcopy(bayesian_network)
        temp_graph = deepcopy(undirected_graph)
        for action in self._actions_list:
            # print action + 'ing', edge, ' in ', bayesian_network.edges()
            is_feasible = GraphUtils.apply_action(temp_bn, temp_graph, (edge), action, 2)
            if not is_feasible:
                # If the action was not feasible then try again
                # print 'Infeasible action.. trying with different action'
                continue
            if self._tabu_list_contains(temp_bn):
                # If generated solution is already in the tabu list then try again
                # print 'Solution already in tabu list trying again'
                continue
            # print 'Got ', temp_bn.edges()
            local_solutions_action_pairs.append((temp_bn, action))
            temp_bn = deepcopy(bayesian_network)
            temp_graph = deepcopy(undirected_graph)
        return local_solutions_action_pairs

    def _get_best_local_solution(self, bayesian_network, undirected_graph, edge):
        local_solutions_action_pairs = self._get_feasible_local_solutions(bayesian_network, undirected_graph, edge)
        if len(local_solutions_action_pairs) == 0:
            return self._get_score(bayesian_network), bayesian_network
        scores = [self._get_score(solution_action_pair[1], edge)
                  for solution_action_pair in local_solutions_action_pairs]
        # The solution with maximum score is the most optimal one
        sorted_scores = sorted(scores, reverse=True)
        # print 'Scores: ', scores
        best_local_solution_score = sorted_scores[0]
        best_solution_index = scores.index(best_local_solution_score)
        # print local_solutions_action_pairs[best_solution_index][1], ' action is the best action'
        best_local_solution = local_solutions_action_pairs[best_solution_index][0]
        return best_local_solution_score, best_local_solution

    def perform_GHC(self):
        current_solution = self._bayesian_network
        self._best_score = current_score = self._get_score(current_solution)
        # draw(self._bayesian_network)
        # plt.show()
        print 'Initial score :', self._best_score
        undirected_graph = current_solution.to_undirected()
        change_count = 0
        max_count = self._max_change_count
        print max_count
        while True:
            # Pick a random edge and decide the best action to be
            # applied on the edge
            random_edge = GraphUtils.get_random_edge(self._node_names)
            # print random_edge, ' is the edge selected'
            current_score, current_solution = \
                self._get_best_local_solution(current_solution, undirected_graph, random_edge)
            undirected_graph = current_solution.to_undirected()
            if current_score > self._best_score:
                change_count = 0
                # Update the new best solution
                self._best_solution = deepcopy(current_solution)
                self._best_score = current_score
                print '-----------', self._best_score, '------------------'
            else:
                change_count += 1
            self._add_solution_to_tabu_list(current_solution)
            if change_count == max_count:
                break

    def get_solution(self):
        return self._best_solution, self._best_score
class RandomBNGenerator:
    '''
    A randomized construction heuristic for generating initial
    bayesian networks with a given number of nodes.
    '''

    def __init__(self, network_name, num_BNs, max_parents):
        self._num_BNs = num_BNs
        self._network_name = network_name
        self._data_extractor = DataExtractor(network_name)
        self._node_names = self._data_extractor.get_variable_values_sets().keys()
        self._num_nodes = len(self._data_extractor.get_variable_values_sets())
        self._max_parents = max_parents
        if num_BNs > pow(2, (self._num_nodes * (self._num_nodes - 1)) / float(2)):
            # raising a bare string is invalid; use a proper exception type
            raise ValueError('Invalid number of unique bayesian networks!')

    def _generate_initial_graph(self):
        # Generate a simple ordered tree
        height = floor(log(self._num_nodes) / log(2))
        graph = balanced_tree(2, height, DiGraph())
        for node in graph.nodes():
            if int(node) >= self._num_nodes:
                graph.remove_node(node)
        # We rename the nodes according to the target bayesian
        # network we are trying to learn
        return self._rename_nodes(graph)

    def get_bayesian_networks(self):
        bayesian_network = GraphUtils.read_graph(self._network_name + '-' + str(self._max_parents))
        bayesian_networks = []
        if bayesian_network == None:
            # Generate a tree (graph) with the required number of nodes.
            bayesian_network = self._generate_initial_graph()
            bayesian_network.name = self._network_name + '-' + str(self._max_parents)
            GraphUtils.write_graph(bayesian_network)
        # theoretical bound is infinity but this also does well
        num_iterations = 4 * self._num_nodes * self._num_nodes
        # Since connectedness is only defined for undirected graphs
        # we have to keep a copy of the bayesian network
        # in which all the edges are undirected
        undirected_BN = bayesian_network.to_undirected()
        bayesian_network.name = self._network_name
        # Repeat for a large number of times.
        for i in xrange(self._num_BNs):
            count, i, j = 0, 0, 0
            while count < num_iterations:
                edge = (i, j) = GraphUtils.get_random_edge(self._node_names)
                if bayesian_network.has_edge(*edge):
                    # If (i,j) is in the graph, remove it
                    GraphUtils.apply_action(bayesian_network, undirected_BN, edge, 'remove', self._max_parents)
                else:
                    # If the edge (i,j) is not in the graph, add it.
                    GraphUtils.apply_action(bayesian_network, undirected_BN, edge, 'add', self._max_parents)
                count += 1
            bayesian_networks.append(deepcopy(bayesian_network))
        # Return the obtained graphs
        return bayesian_networks

    def _rename_nodes(self, graph):
        new_graph = DiGraph()
        # print len(self._node_names)
        for node in graph.nodes():
            # Add all the nodes with the names given in the data set
            new_node_name = self._node_names[node]
            new_graph.add_node(new_node_name)
        for edge in graph.edges():
            # Add all the edges with the names given in the data set
            new_source_node = self._node_names[edge[0]]
            new_destination_node = self._node_names[edge[1]]
            new_graph.add_edge(new_source_node, new_destination_node)
        new_graph.name = self._network_name
        return new_graph

    def _exceeded_parent_limit(self, bayesian_network, node):
        return len(bayesian_network.predecessors(node)) > self._max_parents

    @staticmethod
    def test():
        generator = RandomBNGenerator('cancer', 3, max_parents=2)
        bns = generator.get_bayesian_networks()
        for bn in bns:
            print bn.nodes()
            print GraphUtils.has_cycle(bn)
            draw(bn)
            plt.show()
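A minimal sketch of wiring the two classes above together; the 'cancer' network name, the hyperparameter value, and the tabu-list size are placeholder assumptions for illustration.

# Hypothetical driver; network name, hyperparameter and tabu-list size are placeholders.
initial_networks = RandomBNGenerator('cancer', num_BNs=1, max_parents=2).get_bayesian_networks()
climber = GreedyHillClimber(hyperparameter=1.0,
                            initial_bayesian_network=initial_networks[0],
                            tabu_list_size=10,
                            max_change_count=20)
climber.perform_GHC()
best_network, best_score = climber.get_solution()
print(best_score)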
class PC:

    _mutual_info_thresholds = [0.0005, 0.005, 0.025, 0.025]

    def __init__(self, network_name):
        self._network_name = network_name
        self._data = DataExtractor(network_name)
        self._values_dict = self._data.get_variable_values_sets()
        self._node_names = self._values_dict.keys()
        self._graph = None
        self._nmis = {}

    def _get_probabilities(self, X, Y, S, S_combinations):
        data_vectors = self._data.get_data_vectors()
        N = len(data_vectors[data_vectors.keys()[0]])
        X_values = data_vectors[X]
        Y_values = data_vectors[Y]
        observed_prob_dict = {}
        # Now we look for the value x of the variable X, and value y of the variable Y
        for x in self._values_dict[X]:
            # finding matches for x
            x_indices = set([element_index for (element_index, element) in enumerate(X_values) if element == x])
            observed_prob_dict['P(' + X + '=' + x + ')'] = len(x_indices) / float(N)
            for y in self._values_dict[Y]:
                # finding matches for y
                y_indices = set([element_index for (element_index, element) in enumerate(Y_values) if element == y])
                observed_prob_dict['P(' + Y + '=' + y + ')'] = len(y_indices) / float(N)
                xy = x_indices.intersection(y_indices)
                observed_prob_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ')'] = len(xy) / float(N)
                for S_combination in S_combinations:
                    z_indices = PGMUtils.get_z_indices(S, S_combination, data_vectors)
                    z = z_indices
                    y_z = y_indices.intersection(z)
                    x_z = x_indices.intersection(z)
                    xyz = xy.intersection(z)
                    observed_prob_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(x_z) / float(len(z))
                    observed_prob_dict['P(' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(z) / float(N)
                    observed_prob_dict['P(' + Y + '=' + y + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(y_z) / float(len(z))
                    observed_prob_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ',' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(xyz) / float(N)
        return observed_prob_dict

    def _are_dseparated(self, X, Y, S, n):
        H_X = H_Y = H_XY = 0
        S_combinations = PGMUtils.get_combinations(S, self._values_dict)
        probability_dict = self._get_probabilities(X, Y, S, S_combinations)
        for x in self._values_dict[X]:
            p_x = probability_dict['P(' + X + '=' + x + ')']
            for y in self._values_dict[Y]:
                p_y = probability_dict['P(' + Y + '=' + y + ')']
                # in case we are looking for zero order conditional dependency
                if len(S_combinations) == 0:
                    H_Y += -log(p_y + 0.001)
                    H_X += -log(p_x + 0.001)
                    p_xy = probability_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ')']
                    H_XY += -log(p_xy + 0.001)
                else:
                    for S_combination in S_combinations:
                        p_y_z = probability_dict['P(' + Y + '=' + y + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_x_z = probability_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_xyz = probability_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ',' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_z = probability_dict['P(' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        H_X += -log(p_x_z + 0.001)
                        H_Y += -log(p_y_z + 0.001)
                        H_XY += -log(p_xyz * p_z + 0.001)
        # If mutual information is greater than a certain threshold
        # then X and Y are dependent, otherwise not
        n_X = 2 * len(self._values_dict[X])
        n_Y = 2 * len(self._values_dict[Y])
        n_XY = 4 * len(S_combinations)
        if n_XY == 0:
            n_XY = 4
        MI = abs((H_X / n_X) + (H_Y / n_Y) - (H_XY / n_XY))
        # print 'MI(', X + ',' + Y + '|' + ','.join(S), ') = ', MI
        self._nmis[X + ',' + Y + '|' + ','.join(S)] = MI
        if MI < self._mutual_info_thresholds[n]:
            return True
        return False

    def _eliminate_edges(self, Sep):
        num_nodes = len(self._values_dict.keys())
        graph = complete_graph(num_nodes, Graph())
        self._graph = GraphUtils.rename_nodes(graph, self._node_names)
        n = 0
        max_allowed_degree = settings.networks_settings['genome']['max_allowed_degree']
        while n <= 3:
            print '--------------------------------------------------------'
            # We repeat the iterations unless each node X has
            # less than or equal to n neighbors
            for X in self._graph:
                for Y in self._graph:
                    if X != Y and GraphUtils.is_degree_greater(self._graph, max_allowed_degree) \
                            and is_connected(self._graph):
                        # all the neighbors of X excluding Y
                        neighbors = self._graph.neighbors(X)
                        if Y in neighbors:
                            neighbors.remove(Y)
                        # We only consider X,Y if #neighbors of X excluding Y are more than
                        # or equal to n
                        if len(neighbors) >= n:
                            # Combinations of all the adjacent nodes of X excluding Y;
                            # each subset in the observed subsets has cardinality 'n'
                            observed_subsets = combinations(neighbors, n)
                            for S in observed_subsets:
                                # We only consider the subsets which have exactly
                                S = [s for s in sorted(S)]
                                are_deseparated = self._are_dseparated(X, Y, S, n)
                                if are_deseparated:
                                    if self._graph.has_edge(X, Y):
                                        self._graph.remove_edge(X, Y)
                                        print 'Removed', X, '-', Y
                                    Sep[X + ',' + Y] = S
                                    Sep[Y + ',' + X] = S
            n += 1

    def _has_directed_path(self, A, B):
        has_directed_path = False
        paths = all_simple_paths(self._graph, A, B)
        for path in paths:
            has_directed_path = has_directed_path or has_directed_path
            if has_directed_path:
                break
            i = 0
            while i < len(path) - 1:
                src_node = path[i]
                next_node = path[i + 1]
                edge = self._graph.edge[src_node][next_node]
                if 'direction' in edge:
                    if edge['direction'] == src_node + '->' + next_node:
                        has_directed_path = True
                    else:
                        has_directed_path = False
                        break
                i += 1
        return has_directed_path

    def _all_edges_oriented(self):
        '''
        for edge in self._graph.edges():
            if 'direction' in self._graph[edge[0]][edge[1]]:
                print self._graph[edge[0]][edge[1]]['direction']
        '''
        for edge in self._graph.edges():
            if 'direction' not in self._graph[edge[0]][edge[1]]:
                return False
        return True

    def _orient_edges(self, Sep):
        triplets = []
        for source in self._graph.nodes():
            for target in self._graph.nodes():
                if source != target:
                    if not self._graph.has_edge(source, target):
                        # Each element in the triplets list will be a list of three nodes
                        # [X, Y, Z] such that X and Z are not adjacent in the graph
                        # while X,Y and Y,Z are adjacent
                        triplets.append(list(all_simple_paths(self._graph, source, target, 2)))
        for triplet in triplets:
            if triplet != []:
                X, Y, Z = triplet[0][0], triplet[0][1], triplet[0][2]
                if Y not in Sep[X + ',' + Z]:
                    # We don't have partially directed graphs in the networkx library
                    # so we attach a direction attribute to all the edges which we
                    # want to be directed
                    edgeXY = self._graph.edge[X][Y]
                    edgeXY['direction'] = X + '->' + Y
                    edgeZY = self._graph.edge[Z][Y]
                    edgeZY['direction'] = Z + '->' + Y
        while not self._all_edges_oriented():
            for edge in self._graph.edges():
                A = edge[0]
                B = edge[1]
                edgeAB = self._graph.edge[A][B]
                if 'direction' in edgeAB:
                    if edgeAB['direction'] == A + '->' + B:
                        for C in self._graph.neighbors(B):
                            # A & C are not adjacent
                            if not self._graph.has_edge(A, C):
                                edgeBC = self._graph.edge[B][C]
                                if 'direction' not in edgeBC:
                                    edgeBC['direction'] = B + '->' + C
                elif self._has_directed_path(A, B):
                    edgeAB['direction'] = A + '->' + B

    def perform_PC(self):
        # Implementation of the PC algorithm given here:
        # http://www.lowcaliber.org/influence/spirtes-causation-prediction-search.pdf
        Sep = {}
        self._eliminate_edges(Sep)
        pprint(sorted(self._nmis.iteritems(), key=itemgetter(1), reverse=True))
        print self._graph.edges()
        draw(self._graph)
        plt.show()
        if is_connected(self._graph):
            print 'The graph is connected'
        else:
            print 'The graph is not connected'
        self._orient_edges(Sep)
        pprint(self._graph.edges())

    def get_skeleton(self):
        self._graph = GraphUtils.convert_to_directed(self._graph)
        self._graph.name = self._network_name
        return self._graph
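A minimal sketch of running the PC class above; 'genome' is assumed to be a data set name that DataExtractor can load (it appears in the class's own settings lookup), and perform_PC will open matplotlib windows as written.

# Hypothetical usage; assumes the 'genome' data set is available to DataExtractor.
pc = PC('genome')
pc.perform_PC()               # eliminates edges, then orients them
skeleton = pc.get_skeleton()  # directed skeleton named after the network
print(skeleton.edges())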
'''
Created on Dec 10, 2013

@author: himanshu
'''
import json

from networkx import DiGraph, draw
from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.pgmlearner import PGMLearner
import matplotlib.pyplot as plt

from data_extractor import DataExtractor

# generate some data to use
data_ext = DataExtractor('genome', format='json')
data = data_ext.get_data_vectors()
print 'Got data with ', len(data), ' vectors'

# instantiate my learner
learner = PGMLearner()
print 'learning the structure'

# estimate structure
result = learner.discrete_constraint_estimatestruct(data, pvalparam=0.02)

# output
print json.dumps(result.E, indent=2)
graph = DiGraph()
graph.add_edges_from(result.E)
draw(graph)
plt.show()