class ParameterEstimation:
    def __init__(self, network):
        self._network = network
        self._data = DataExtractor(network.name)
        self._values_dict = self._data.get_variable_values_sets()

    def _get_probabilities(self, X, S, S_combinations):
        data_vectors = self._data.get_data_vectors()
        N = len(data_vectors[data_vectors.keys()[0]])
        X_values = data_vectors[X]
        observed_prob_dict = {}
        #  Now we look for each value x of the variable X
        for x in self._values_dict[X]:
            #  finding matches for x
            x_indices = set([element_index for (element_index, element) in enumerate(X_values) if element == x])
            observed_prob_dict['P(' + X + '=' + x + ')'] = (len(x_indices) / float(N)) + 0.001

            for S_combination in S_combinations:
                z_indices = self._get_z_indices(S, S_combination)
                z = z_indices
                x_z = x_indices.intersection(z)
                observed_prob_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = (len(x_z) / float(len(z))) + 0.001
        return observed_prob_dict

    def get_estimated_cpds(self):
        values_dict = self._data.get_variable_values_sets()
        cpds = []
        for node in self._network:
            parents = self._network.predecessors(node)
            value_combinations = PGMUtils.get_combinations(parents, values_dict)
            probability_dict = self._get_probabilities(node, parents, value_combinations)
            cpds.append(probability_dict)
        return cpds
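
A minimal usage sketch for the estimator above, assuming the network is a networkx DiGraph whose name matches a data set that this project's DataExtractor can load ('cancer' appears elsewhere in this listing; the nodes and edges below are purely illustrative):

from networkx import DiGraph

network = DiGraph()
network.add_edges_from([('Smoking', 'Cancer'), ('Pollution', 'Cancer')])  # placeholder structure
network.name = 'cancer'

estimator = ParameterEstimation(network)
for cpd in estimator.get_estimated_cpds():
    # Each entry maps strings such as 'P(Cancer=yes|Smoking,Pollution=yes,low)'
    # to a smoothed observed probability.
    print(cpd)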
 def __init__(self):
     super().__init__()
     self.data_set_loc = conf.config_section_mapper("filePath").get(
         "data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.util = Util()
     self.sim_act_diff_mov_tf = SimilarActorsFromDiffMovies()
    def __init__(self):
        """
		Method Explanation:
			Intializes all the variables for the analysis task.
		"""
        self.util = Util()
        self.data_extractor = DataExtractor()

        self.location_id_to_title_map = self.data_extractor.location_mapping()
        self.location_title_to_id_map = self.data_extractor.location_title_to_id_mapping()

        self.location_list = list(
            self.location_title_to_id_map.values())  # List of location ids
        self.LOCATION_COUNT = len(self.location_list)  # constant

        self.global_term_dictionary_current_index = 0  # Count of unique terms; used to index a given term in the global dictionary
        self.global_term_dictionary = dict()  # Maps each term to its index
        self.global_term_index_dictionary = dict()  # Maps each index back to its term
        self.location_dictionary = dict()  # Terms of a particular location and their corresponding attributes
        self.similarity_matrix = numpy.zeros((self.LOCATION_COUNT, self.LOCATION_COUNT))  # To capture location-location similarity
Example 4
 def __init__(self):
     """
     Initializing the data extractor object to get data from the csv files
     """
     self.data_set_loc = conf.config_section_mapper("filePath").get(
         "data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
Example 5
    def run(self):
        """
        Demo for histogram of replytime
        """
        print('Initialized!')
        print("=====================================")
        print('Unzipping JSONs:')
        print("=====================================")
        zipper = Unzipper(os.path.abspath('zipped_data'))
        zipper.unzip_all()
        print("=====================================")
        print('Importing Raw Data from JSONs')
        print("=====================================")
        extractor = DataExtractor(directory='unzipped/',
                                  features=[
                                      'id_str', 'created_at',
                                      ('user', 'id_str'),
                                      'in_reply_to_status_id'
                                  ])
        extractor.save_csv()

        print("=====================================")
        print('Extracting Reply Time data')
        print("=====================================")
        wrangler = DataWrangler()
        wrangler.replytime_wrangle()

        print("=====================================")
        print('Saving Visualization as "PLOT.png"')
        print("=====================================")
        # read in replytimedata
        with open('processed_data', 'rb') as fp:
            replytime_data = pickle.load(fp)
        self.save_plot(replytime_data)
        print('Ran Successfully.')
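
The demo above ends by calling self.save_plot, which is not shown in this example. A minimal standalone sketch of what such a plotting step could look like, assuming replytime_data is a flat iterable of reply times; only the 'PLOT.png' file name comes from the surrounding prints, everything else is an assumption:

import matplotlib
matplotlib.use('Agg')  # render to a file without needing a display
import matplotlib.pyplot as plt

def save_plot(replytime_data, filename='PLOT.png'):
    # Histogram of reply times, saved under the file name announced by run().
    fig, ax = plt.subplots()
    ax.hist(list(replytime_data), bins=50)
    ax.set_xlabel('Reply time')
    ax.set_ylabel('Number of replies')
    ax.set_title('Distribution of reply times')
    fig.savefig(filename)
    plt.close(fig)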
Example 6
 def test_quadratic_choose_versions(self):
     project = ProjectName.CommonsLang.value
     extractor = DataExtractor(project)
     extractor.choose_versions(algorithm="quadratic",
                               strict="true",
                               version_type=VersionType.Untyped)
     assert True
Example 7
 def __init__(self):
     """
     Initializing the data extractor object to get data from the csv files
     """
     super().__init__()
     self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     actor_actor_matrix_obj.fetchActorActorSimilarityMatrix()
Example 8
 def _extract(self):
     extractor = DataExtractor(self.project)
     path = extractor.get_bugged_methods_path(self.version)
     df = pd.read_csv(path, sep=';')
     key = 'method_id'
     bugged = df.groupby(key).apply(lambda x: dict(
         zip(["is_method_buggy"], x.is_method_buggy))).to_dict()
     self.data.set_raw_data(bugged)
 def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = self.conf.config_section_mapper("filePath").get(
         "data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.mlmovies = self.data_extractor.get_mlmovies_data()
     self.genre_tag = GenreTag()
     self.genre_data = self.genre_tag.get_genre_data()
 def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = os.path.join(os.path.abspath(os.path.dirname(__file__)), self.conf.config_section_mapper("filePath").get("data_set_loc"))
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.mlratings = self.data_extractor.get_mlratings_data()
     self.mlmovies = self.data_extractor.get_mlmovies_data()
     self.imdb_actor_info = self.data_extractor.get_imdb_actor_info_data()
     self.genome_tags = self.data_extractor.get_genome_tags_data()
Example 11
 def _extract(self):
     extractor = DataExtractor(self.project)
     path = extractor.get_bugged_files_path(self.version, True)
     df = pd.read_csv(path, sep=';')
     key = 'file_name'
     assert key in df.columns
     bugged = df.groupby(key).apply(
         lambda x: dict(zip(["is_buggy"], x.is_buggy))).to_dict()
     self.data.set_raw_data(bugged)
    def _extract(self):
        # get version_date from apache_versions
        config = Config().config
        repository_data = config["CACHING"]["RepositoryData"]
        path = os.path.join(repository_data,
                            config['DATA_EXTRACTION']["AllVersions"],
                            self.project.github_name,
                            self.project.github_name + ".csv")
        df = pd.read_csv(path, sep=';')
        version_date = df[df['version_name'] ==
                          self.version]['version_date'].to_list()[0]
        version_date = datetime.strptime(version_date, '%Y-%m-%d %H:%M:%S')
        # get file list from committed_files
        path = os.path.join(repository_data,
                            config['DATA_EXTRACTION']["CommittedFiles"],
                            self.project.github_name,
                            self.project.github_name + ".csv")
        df = pd.read_csv(path, sep=';')
        issues_path = os.path.join(repository_data,
                                   config['DATA_EXTRACTION']["Issues"],
                                   self.project.github_name,
                                   self.project.github_name + "_dummies.csv")
        issues_df = pd.read_csv(issues_path, sep=';')
        issues_df = df[['commit_id', 'issue_id']].merge(issues_df,
                                                        on=['issue_id'],
                                                        how='right')
        # filter commits after version date
        df = df[df.apply(lambda r: datetime.strptime(r[
            'commit_date'], '%Y-%m-%d %H:%M:%S') < version_date,
                         axis=1)]
        # split by file_name
        data = {}
        issues_data = {}

        extractor = DataExtractor(self.project)
        path = extractor.get_bugged_files_path(self.version, True)
        files = pd.read_csv(path, sep=';')['file_name'].to_list()
        df = df[df.apply(lambda r: r['file_name'].endswith('.java') and r[
            'file_name'] in files,
                         axis=1)]

        for file_name, file_df in df.groupby('file_name', as_index=False):
            norm_name = os.path.normpath(file_name).lower()
            if norm_name not in self.file_analyser.relative_paths:
                continue
            name = self.file_analyser.relative_paths[norm_name]
            data[name] = self._extract_process_features(file_df)
            issues_data[name] = self._extract_issues_features(
                file_df, issues_df, self._get_blame_data(file_name))
        # add the extracted process and issues feature sets to the data
        self.data.add(ProcessData(self.project, self.version, data=data)).add(
            IssuesProductData(self.project, self.version,
                              data=issues_data)).add(
                                  IssuesProcessData(self.project,
                                                    self.version,
                                                    data=issues_data))
Example 13
 def __init__(self):
     self.data_extractor = DataExtractor()
     self.mapping = self.data_extractor.location_mapping()
     self.location_names = list(self.mapping.values())
     self.reference_model = 'CM3x3'
     self.model_list = self.init_model_list()
     self.reference_df = pd.DataFrame()
     self.df_list = self.init_df_list()
     self.data_dict = dict()
     self.minmax_scaler = MinMaxScaler()
 def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.ordered_years = []
     self.ordered_movie_names = []
     self.ordered_actor_names = []
     self.print_list = ["\n\nFor Years:", "\n\nFor Movies:", "\n\nFor Actors:"]
     self.util = Util()
     self.tensor = self.fetchActorMovieYearTensor()
     self.factors = self.util.CPDecomposition(self.tensor, 5)
 def generate_img_img_adj_matrix(self):
     """ Method: generate image-image similarity matrix and stash in pickle file"""
     print("getting and normalizing data...")
     data_extractor = DataExtractor()
     loc_mapping = data_extractor.location_mapping()
     self.img_feature_matrix = data_extractor.prepare_dataset_for_task6(
         loc_mapping)
     scaler = MinMaxScaler()
     scaler.fit(list(self.img_feature_matrix.values()))
     for img, feature in self.img_feature_matrix.items():
         self.img_feature_matrix[img] = scaler.transform([feature])[0]
     self.img_ids = list(self.img_feature_matrix.keys())
 def _extract(self):
     extractor = DataExtractor(self.project)
     path = extractor.get_bugged_files_path(self.version, True)
     df = pd.read_csv(path, sep=';')
     key = 'file_name'
     assert key in df.columns
     bugged = df.groupby(key).apply(
         lambda x: dict(zip(["is_buggy"], x.is_buggy))).to_dict()
     ans = dict()
     for name, value in bugged.items():
         norm_name = os.path.normpath(name.lower())
         if norm_name in self.file_analyser.relative_paths:
             ans[self.file_analyser.relative_paths[norm_name]] = value
     self.data.set_raw_data(ans)
def train_test_dog_breed_detector(do_model_test=True):
    '''
    Trains and tests the dog breed detector model

    :param do_model_test: set to True for running prediction on the test data
    :return: dog breed detector model
    '''
    # get train and validation data and labels
    data_extractor = DataExtractor('dog_images/train', 'dog_images/valid',
                                   'dog_images/test')

    train_data = data_extractor.load_train_data()
    train_labels, train_num_dog_breeds = data_extractor.load_train_labels()

    valid_data = data_extractor.load_valid_data()
    valid_labels, valid_num_dog_breeds = data_extractor.load_valid_labels()

    assert train_num_dog_breeds == valid_num_dog_breeds

    # create and init the model
    model = DogBreedDetectorModel(num_dog_breeds=train_num_dog_breeds)
    model.init_model(train_data)

    # train the model
    if not model.load_model():
        model.train(train_data, train_labels, valid_data, valid_labels)

    # load test data and predict
    if do_model_test:
        test_data = data_extractor.load_test_data()
        test_labels, test_num_dog_breeds = data_extractor.load_test_labels()
        assert train_num_dog_breeds == test_num_dog_breeds

        model.predict(test_data, test_labels)
    return model
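
A possible entry point for the trainer above, assuming the dog_images/train, dog_images/valid and dog_images/test directories from the snippet exist locally; the __main__ guard is illustrative and not part of the original example:

if __name__ == '__main__':
    # Train (or load) the model and evaluate it on the held-out test split.
    detector_model = train_test_dog_breed_detector(do_model_test=True)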
def get_FRIENDS_summary():
    read_path = "html/FRIENDS/summary/summary.json"
    save_dir = "data/FRIENDS/summary/"
    save_name = "summary.json"

    with open(read_path, "r") as fh:
        json_str = fh.read()

    json_obj = json.loads(json_str)

    html = json_obj["summary"]

    extractor = DataExtractor("html")
    extractor.read(html)
    h3_tags = extractor.findAllTags("h3")
    h3_contents = extractor.extractTextFromTagList(h3_tags)
    h3_contents.append("END")

    dl_tags = extractor.findAllTags("dl")
    dl_contents = " ".join(extractor.extractTextFromTagList(dl_tags))
    dl_contents = dl_contents + " END"

    res = {}
    for i in range(len(h3_contents) - 1):
        begin_txt = h3_contents[i]
        end_txt = h3_contents[i + 1]
        begin_idx = dl_contents.index(begin_txt)
        end_idx = dl_contents.index(end_txt)
        summary = dl_contents[begin_idx + len(begin_txt) + 1:end_idx]
        try:
            strip_idx = summary.index("   ")
            summary = summary[:strip_idx]
        except Exception:
            pass
        seq_num = begin_txt[:begin_txt.index(" ")]
        dot_idx = seq_num.index(".")
        s_num = seq_num[:dot_idx]
        e_num = seq_num[dot_idx + 1:]
        seq_num = s_num.zfill(2) + e_num.zfill(2)
        res[seq_num] = summary

    print("Serializing...")
    json_str = json.dumps(res)
    print("Done.")

    print("Saving...")
    extractor.save(json_str, save_dir + save_name)
    print("Done.")
 def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = self.conf.config_section_mapper("filePath").get(
         "data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.max_ratings = 5
     self.ordered_ratings = [0, 1, 2, 3, 4, 5]
     self.ordered_movie_names = []
     self.ordered_tag_names = []
     self.print_list = [
         "\n\nFor Tags:", "\n\nFor Movies:", "\n\nFor Ratings:"
     ]
     self.util = Util()
     self.tensor = self.fetchTagMovieRatingTensor()
     self.factors = self.util.CPDecomposition(self.tensor, 5)
Example 20
def get_data(project_dirs, batch_dict):
    df_ls = []
    batches_found = []
    batches_not_found = []
    for project_dir in project_dirs:
        extractor = DataExtractor(project_dir=project_dir,
            accession_dict=batch_dict, lib_type='rna')
        df_project_dir = extractor.collect_data()
        df_ls.append(df_project_dir)
        batches_found.extend(list(extractor.batches_found))
        batches_not_found.extend(list(extractor.batches_not_found))
    df = pd.concat(df_ls)
    # batches not found in one project directory may be found in another
    batches_not_found_all = list(
        set(batches_not_found).difference(batches_found))
    return df, batches_found, batches_not_found_all
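
The final set difference keeps only the batches that were missing from every project directory. A tiny illustration of that behaviour with made-up batch names:

batches_found = ['B1', 'B3']        # found in at least one project directory
batches_not_found = ['B2', 'B3']    # reported missing by at least one directory
# 'B3' was found elsewhere, so only 'B2' remains unresolved.
print(list(set(batches_not_found).difference(batches_found)))  # ['B2']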
Example 21
 def __init__(self, network_name):
     self._network_name = network_name
     self._data = DataExtractor(network_name)
     self._values_dict = self._data.get_variable_values_sets()
     self._node_names = self._values_dict.keys()
     self._graph = None
     self._nmis = {}
Example 23
class DataExtractorTest(unittest.TestCase):
    def setUp(self):
        self.extractor = DataExtractor(raw_data)
        self.extractor.extract()
        self.race = self.extractor.get_race()

    def test_extracts_heat(self):
        self.assertEquals(60, self.race.heat)

    def test_extracts_race_date_and_time(self):
        self.assertEquals(datetime.date(2011,12,23), self.race.date)
        self.assertEquals(datetime.time(20,36), self.race.time)

    def test_extract_driver_list(self):
        drivers = [u'CiglaR', u'CASPER', u'Brzi', u'bR1ck', u'gogoGT', u'Shorty', u'dastrong', u'skrla', u'slavisha', u'VINKO']
        self.assertEquals(drivers, self.race.driver_list)
Example 24
class CoactorCoactorMatrix(object):
    """
    Class to compute the coactor matrix, which records the number of movies each pair of actors has acted in together
    """
    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)

    def fetchCoactorCoactorSimilarityMatrix(self):
        """
        Creates the coactor matrix with all the actors in a given set
        :return: coactor matrix
        """
        movie_actor_df = self.data_extractor.get_movie_actor_data()
        movie_actor_set_df = movie_actor_df.groupby(
            ['actorid'])["movieid"].apply(set).reset_index()
        num_of_actors = len(movie_actor_df.actorid.unique())
        coactor_matrix = [[0] * num_of_actors for i in range(num_of_actors)]
        for index, movie_set in zip(movie_actor_set_df.index,
                                    movie_actor_set_df.movieid):
            for index_2, movie_set_2 in zip(movie_actor_set_df.index,
                                            movie_actor_set_df.movieid):
                if index != index_2:
                    coactor_matrix[index][index_2] = len(
                        movie_set.intersection(movie_set_2))

        numpy.savetxt("coactor_coactor_matrix.csv",
                      coactor_matrix,
                      delimiter=",")
        return coactor_matrix, movie_actor_set_df.actorid.unique()
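
A minimal usage sketch for the class above, assuming the configured data set location contains the movie-actor CSV that this project's DataExtractor expects (the variable names below are illustrative):

coactor_obj = CoactorCoactorMatrix()
coactor_matrix, actor_ids = coactor_obj.fetchCoactorCoactorSimilarityMatrix()
# coactor_matrix[i][j] counts the movies shared by the actors at positions i and j;
# the matrix is also written to coactor_coactor_matrix.csv as a side effect.
print(len(actor_ids), 'actors in the coactor matrix')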
Example 25
 def __init__(self):
     """
     Initializing the data extractor object to get data from the csv files
     """
     super().__init__()
     self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.sim_act_diff_mov_tf = SimilarActorsFromDiffMovies()
 def __init__(self, network_name, num_BNs, max_parents):
     self._num_BNs = num_BNs
     self._network_name = network_name
     self._data_extractor = DataExtractor(network_name)
     self._node_names = self._data_extractor.get_variable_values_sets().keys()
     self._num_nodes = len(self._data_extractor.get_variable_values_sets())
     self._max_parents = max_parents
     if num_BNs > pow(2, (self._num_nodes * (self._num_nodes - 1)) / float(2)):
         raise ValueError('Invalid number of unique Bayesian networks!')
Example 27
 def __init__(self):
     data_extractor = DataExtractor()
     self.sorter = Sorter(contigs=data_extractor.ctgs,
                          markers=data_extractor.mrkrs)
     self.chr_ctg = self.sorter.chr_ctg_dict()
     self.ctg_fasta = list(Fasta_B10v2_c_corr().generator(
     ))  # namedtuple('FastaRecord', ['id', 'sequence'])
     self.ctg_fasta_dict = self.dict_ctg_fasta()
     self.chr_ctg_order = self.sorter.ctg_order_in_chr()
     self.Contig = namedtuple("Contig", ['id', 'start', 'length', 'seq'])
class LdaGenreActor(GenreTag):
    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def get_lda_data(self, genre):
        """
        Does LDA on movie-actor counts and outputs movies in terms of latent semantics as U
        and actors in terms of latent semantics as Vh
        :param genre:
        :return: returns U and Vh
        """

        # Getting movie_genre_data
        movie_genre_data_frame = self.data_extractor.get_mlmovies_data()
        movie_genre_data_frame = self.split_genres(movie_genre_data_frame)

        # Getting actor_movie_data
        movie_actor_data_frame = self.data_extractor.get_movie_actor_data()

        genre_actor_frame = movie_genre_data_frame.merge(movie_actor_data_frame, how="left", left_on="movieid",
                                                         right_on="movieid")
        # genre_actor_frame = genre_actor_frame[genre_actor_frame['year'].notnull()].reset_index()
        genre_actor_frame = genre_actor_frame[["movieid", "year", "genre", "actorid", "actor_movie_rank"]]

        genre_actor_frame["actorid_string"] = pd.Series(
            [str(id) for id in genre_actor_frame.actorid],
            index=genre_actor_frame.index)

        genre_data_frame = genre_actor_frame[genre_actor_frame["genre"]==genre]
        actor_df = genre_data_frame.groupby(['movieid'])['actorid_string'].apply(list).reset_index()
        actor_df = actor_df.sort_values('movieid')
        actor_df.to_csv('movie_actor_lda.csv', index=True, encoding='utf-8')

        actor_df = list(actor_df.iloc[:,1])

        (U, Vh) = util.LDA(actor_df, num_topics=4, num_features=1000)

        for latent in Vh:
            print ("\n")
            print(latent)
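
A minimal usage sketch for the LDA example above, assuming MovieLens-style CSVs are available at the configured data set location and that 'Thriller' is one of the genres present (the genre name is only an illustration):

lda_genre_actor = LdaGenreActor()
# Prints the four latent topics over actors for the chosen genre and writes
# movie_actor_lda.csv as a side effect.
lda_genre_actor.get_lda_data(genre='Thriller')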
Example 29
    def load_data(self):
        directory_path = filedialog.askdirectory(
            initialdir=os.getcwd(),
            mustexist=True,
            title="Please select the data directory...")

        extractor = DataExtractor(directory_path, self.best_nest_var.get(),
                                  self.max_sim_time_var.get())
        invalid_files, unfinished_sims = extractor.extract_data()

        self.data_set = extractor.data_set
        self.data_plot = DataPlotter(self.data_set)
        msg_string = "%s simulations had missing or blank files.\n" % invalid_files
        msg_string += "%s simulations exceeded than the maximum time and so were removed." % unfinished_sims
        messagebox.showinfo('Data Loaded', msg_string)

        self.list_box.delete(0, tk.END)

        grid_row = 0
        for data in self.data_set:
            raw_data_string = ""
            for key, value in data.items():
                raw_data_string += "%s=%s, " % (key, value)
            grid_row += 1
            self.list_box.insert(tk.END, raw_data_string[:-2])
            if grid_row % 2 == 0:
                self.list_box.itemconfig(tk.END, bg='#e0e0e0')
            else:
                self.list_box.itemconfig(tk.END, bg='#f4f4f4')

        # Updating the list of options to split the data by
        options = self.data_set[0].keys()
        menu = self.split_options["menu"]
        menu.delete(0, "end")
        menu.add_command(label='none',
                         command=lambda: self.split_on_var.set('none'))
        for string in options:
            menu.add_command(
                label=string,
                command=lambda option=string: self.split_on_var.set(option))

        self.add_button.config(state=tk.ACTIVE)
def extract_data(project_ref):
    index = project_ref[0]
    project = project_ref[1]

    general_log = logging.getLogger(__name__)
    success_log = logging.getLogger("success")
    failure_log = logging.getLogger("failure")
    failure_verbose_log = logging.getLogger("failure_verbose")

    general_log.info(str(index) + ": " + project.github())
    try:
        extractor = DataExtractor(project)
        extractor.extract()
        success_log.info("Succeeded to extract {0}.".format(project.github()))
    except Exception as e:
        failure_log.error("Failed to extract {0}.".format(project.github()))
        failure_verbose_log.exception("Failed to extract {0}.".format(
            project.github()))
        return e
    return
 def __init__(self):
     super().__init__()
     self.data_set_loc = conf.config_section_mapper("filePath").get(
         "data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.actor_matrix, self.actorids = self.fetchActorActorSimilarityMatrix()
     self.coactor_obj = CoactorCoactorMatrix()
     self.coactor_matrix, self.coactorids = self.coactor_obj.fetchCoactorCoactorSimilarityMatrix()
     self.util = Util()
Example 32
def load_data(exp_params):
    ''' Loads the data from a CSV or from the db, and it will cache the results '''
    global LOADING_DB_DATA  # must be declared before the flag is read or assigned below
    start = time.time()

    if CSV_LOCATION:  # uses a global variable, ugly but prettier than passing it every time
        print "reading data from local csv"
        df = pd.read_csv(CSV_LOCATION)
    elif not LOADING_DB_DATA:
        print "fetching data from db or cache"
        LOADING_DB_DATA = True
        df = DataExtractor(exp_params).get_data()
        LOADING_DB_DATA = False

    else:
        print "waiting for query to end"
        while (LOADING_DB_DATA):
            time.sleep(1)
        df = DataExtractor(exp_params).get_data()

    # get date to be a string and also saves the unix time for every date
    df['theday'] = df['theday'].astype('str')
    df['thedayunix'] = pd.to_datetime(df['theday']).astype(np.int64) // 10**9

    # drop any empty values
    df = df.dropna()

    end = time.time()
    print "loading data took {} seconds".format(end - start)

    return df.to_json(date_format='iso', orient='split')
class Crawler():
	
	def __init__(self, crawlerDAO, site):

		self.url = site

		self.crawlerDAO =  crawlerDAO
		self.visited		= []
		self.extractor  = DataExtractor()

	def run(self):

		urls = [self.url]

		print(colored('STARTING CRAWL ON SITE:' + self.url, 'green'))
		start = time.time()

		self.search(urls)

		end = time.time() - start
		# Final Calcs
		
		print(colored('[LOG] SUCCESS FINISHED CRAWL', 'green'))
		print('-------------------------------------------------------------')
		print('SEARCH FINISHED - SEARCH TIME: ' + str(end / 60))
		print('-------------------------------------------------------------')

	def sync(self, data):
		
		# Serialize the data object to JSON
		jsonDATA = json.dumps(data, default=lambda o: o.__dict__)
		self.crawlerDAO.insertDataJSON(jsonDATA)
	
	def search(self, urls):

			for url in urls:
				if url not in self.visited:

					data = Data()
					data = self.extractor.getData(url)
			
					# Add new URLs found during getData.
					urls = urls + data.toCrawl
					data.toCrawl = []

					# Sync the new information with the client.
					self.sync(data)

					self.visited.append(url)
					self.search(urls)
def execute(project):
    general_log = logging.getLogger(__name__)
    success_log = logging.getLogger('success')
    failure_log = logging.getLogger('failure')

    general_log.info("Extracting project {}...".format(project.github()))

    try:
        DataExtractor(project)
    except Exception as e:
        failure_log.exception("Failed to extract {0}.".format(
            project.github()))
        return e
    success_log.info("Succeeded to extract {0}.".format(project.github()))
    return
Example 36
def setup_data(meta_data_loc):
    """
    Sets up the data so we can facilitate search queries on it.
    :param meta_data_loc: location of the meta data file
    :return: list of all data
    """

    meta_data = []
    with open(meta_data_loc) as json_file:
        meta_data = json.load(json_file)
    data = []

    for row in meta_data:
        data.append(
            DataExtractor(row["name_references"], row["location"], DIR_PATH))

    return data
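
A possible way to call the helper above, assuming a metadata JSON file whose rows carry the "name_references" and "location" fields used in the loop (the file name below is illustrative):

documents = setup_data('meta_data.json')
# Each entry wraps one referenced document in a DataExtractor ready for search queries.
print(len(documents), 'documents indexed')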
Example 37
    def run(self):
        np.random.seed(self.config['seed'])

        image_dict = {}
        for i, experiment_parameters in enumerate(self.experiment_parameters_list):
            print(experiment_parameters)
            (train_data, probe_data, train_update_sets, test_update_sets, test_set) = DataExtractor().get_data()
            shadow_model = TrainShadow(experiment_parameters).get_trained_model(train_data, test_set)
            train_deltas = TrainUpdateModels('train', experiment_parameters).get_update_dataset(probe_data, train_update_sets, shadow_model)
            test_deltas = TrainUpdateModels('test', experiment_parameters).get_update_dataset(probe_data, test_update_sets, shadow_model)
            (generator, encoder) = TrainGan(train_deltas, train_update_sets, experiment_parameters).get_GAN()
            generated_images = ImageGenerator(experiment_parameters).get_images(test_update_sets, test_deltas, encoder, generator)
            Utils().add_images_to_dict(3, image_dict, generated_images, experiment_parameters)
            print(f'finished running experiment {i+1} out of {len(self.experiment_parameters_list)}')

        plot_dict(image_dict, len(self.experiment_parameters_list))
        Utils().plot_update(test_update_sets[1])
        Utils().plot_generated(generated_images[1])

        return generated_images
class GreedyHillClimber:
    _max_change_count = 20

    def __init__(self, hyperparameter, initial_bayesian_network, tabu_list_size, max_change_count):
        self._bayesian_network = initial_bayesian_network
        self._best_score = -float('inf')
        self._best_solution = initial_bayesian_network
        self._actions_list = ['add', 'remove', 'reverse']
        self._tabu_list = OrderedDict()
        self._tabulist_size = tabu_list_size
        self._max_change_count = max_change_count
        self._data = DataExtractor(initial_bayesian_network.name)
        self._node_names = self._data.get_variable_values_sets().keys()
        values_sets = self._data.get_variable_values_sets()
        data_vectors = self._data.get_data_vectors()
        self._score_util = BDeuScoreUtil(hyperparameter, self._bayesian_network, data_vectors, values_sets)

    def _get_score(self, action = None, edge = None):
        #  We calculate the score using the BDeu score
        #  calculator
        return self._score_util.get_score(action, edge)

    def _equals(self, bayesian_network_A, bayesian_network_B):
        #  Return True if two Bayesian networks with identical nodes
        #  also have identical edges.
        signature_A = self._get_bn_signature(bayesian_network_A)
        signature_B = self._get_bn_signature(bayesian_network_B)
        return signature_A == signature_B

    def _tabu_list_contains(self, bayesian_network):
        #  Returns true if the tabu list contains the given
        #  bayesian network
        solution_signature = self._get_bn_signature(bayesian_network)
        has_solution = solution_signature in self._tabu_list
        if has_solution:
            pass    #  print 'solution is  contained in  tabulist(length = ', len(self._tabu_list), ')'
        else:
            pass    #  print  'solution is  not contained in  tabulist'
        return has_solution

    def _get_bn_signature(self, bayesian_network):
        #  Generate a string from the edge set of the given bayesian
        #  network which is unique for a given edge set
        edge_string_list = []
        for edge in bayesian_network.edges():
            edge_string = str(edge[0]) + '-' + str(edge[1])
            edge_string_list.append(edge_string)
        signature = ' '.join(edge_string_list)
        return signature

    def _add_solution_to_tabu_list(self, bayesian_network):
        #  Adds the given bayesian network to the tabu list
        if len(self._tabu_list) == self._tabulist_size:
            first_key = self._tabu_list.keys()[0]
            self._tabu_list.pop(first_key)

        solution_signature = self._get_bn_signature(bayesian_network)
        self._tabu_list[solution_signature] = 'dummy'

    def _get_feasible_local_solutions(self, bayesian_network, undirected_graph, edge):
        local_solutions_action_pairs = []
        #  Calculate all possible local solutions by applying
        #  all the possible actions.
        temp_bn = deepcopy(bayesian_network)
        temp_graph = deepcopy(undirected_graph)
        for action in self._actions_list:
            #  print action + 'ing', edge, ' in ', bayesian_network.edges()
            is_feasible = GraphUtils.apply_action(temp_bn, temp_graph, (edge), action, 2)

            if not is_feasible:
                #  If the action was not feasible  then try again
                #  print 'Infeasible action.. trying with different action'
                continue

            if self._tabu_list_contains(temp_bn):
                #  If generated solution is already in the tabu list then try again
                #  print 'Solution already in tabu list trying again'
                continue
            #  print 'Got ', temp_bn.edges()
            local_solutions_action_pairs.append((temp_bn, action))
            temp_bn = deepcopy(bayesian_network)
            temp_graph = deepcopy(undirected_graph)
        return local_solutions_action_pairs

    def _get_best_local_solution(self, bayesian_network, undirected_graph, edge):
        local_solutions_action_pairs = self._get_feasible_local_solutions(bayesian_network, undirected_graph, edge)
        if len(local_solutions_action_pairs) == 0:
            return self._get_score(bayesian_network), bayesian_network
        scores = [self._get_score(solution_action_pair[1], edge) for solution_action_pair in local_solutions_action_pairs]
        #  The solution with the maximum score is the best local solution
        sorted_scores = sorted(scores, reverse = True)
        #  print 'Scores: ', scores
        best_local_solution_score = sorted_scores[0]
        best_solution_index = scores.index(best_local_solution_score)
        #  print local_solutions_action_pairs[best_solution_index][1], ' action is the best action'
        best_local_solution = local_solutions_action_pairs[best_solution_index][0]
        return best_local_solution_score, best_local_solution

    def perform_GHC(self):
        current_solution = self._bayesian_network
        self._best_score = current_score = self._get_score(current_solution)
        #  draw(self._bayesian_network)
        #  plt.show()
        print 'Initial score :', self._best_score
        undirected_graph = current_solution.to_undirected()
        change_count = 0
        max_count = self._max_change_count
        print max_count
        while True:
            #  Pick a random edge and decide the best action to be
            #  applied on the edge
            random_edge = GraphUtils.get_random_edge(self._node_names)
            #  print random_edge, ' is the edge selected'
            current_score, current_solution = self._get_best_local_solution(
                current_solution, undirected_graph, random_edge)
            undirected_graph = current_solution.to_undirected()

            if current_score > self._best_score:
                change_count = 0
                #  Update the new best solution
                self._best_solution = deepcopy(current_solution)
                self._best_score = current_score
                print '-----------', self._best_score , '------------------'

            else:
                change_count += 1

            self._add_solution_to_tabu_list(current_solution)

            if change_count == max_count:
                break

    def get_solution(self):
        return self._best_solution, self._best_score
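
A hedged sketch of how the hill climber above might be driven, assuming RandomBNGenerator (shown next in this listing) supplies the initial network and that the 'cancer' network name is known to DataExtractor; the hyperparameter and tabu settings are illustrative only:

initial_bn = RandomBNGenerator('cancer', num_BNs=1, max_parents=2).get_bayesian_networks()[0]
climber = GreedyHillClimber(hyperparameter=1.0,
                            initial_bayesian_network=initial_bn,
                            tabu_list_size=50,
                            max_change_count=20)
climber.perform_GHC()
best_bn, best_score = climber.get_solution()  # best network found and its score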
class RandomBNGenerator:
    '''
    A randomized construction heuristic for generating an initial Bayesian network
    with a given number of nodes.
    '''
    def __init__(self, network_name, num_BNs, max_parents):
        self._num_BNs = num_BNs
        self._network_name = network_name
        self._data_extractor = DataExtractor(network_name)
        self._node_names = self._data_extractor.get_variable_values_sets().keys()
        self._num_nodes = len(self._data_extractor.get_variable_values_sets())
        self._max_parents = max_parents
        if num_BNs > pow(2, (self._num_nodes * (self._num_nodes - 1)) / float(2)):
            raise ValueError('Invalid number of unique Bayesian networks!')

    def _generate_initial_graph(self):
        #  Generate a simple ordered balanced tree
        height = int(floor(log(self._num_nodes) / log(2)))
        graph = balanced_tree(2, height, DiGraph())
        for node in graph.nodes():
            if int(node) >= self._num_nodes:
                graph.remove_node(node)
        #  We rename the nodes according to the target bayesian
        #  network we are trying to learn
        return self._rename_nodes(graph)

    def get_bayesian_networks(self):
        bayesian_network = GraphUtils.read_graph(self._network_name + '-' + str(self._max_parents))
        bayesian_networks = []

        if bayesian_network is None:
            #  Generate a tree (graph) with required number of nodes.
            bayesian_network = self._generate_initial_graph()
            bayesian_network.name = self._network_name + '-' + str(self._max_parents)
            GraphUtils.write_graph(bayesian_network)
        #  theoretical bound is infinity but this also does well
        num_iterations = 4 * self._num_nodes * self._num_nodes

        #  Connectedness is only defined for undirected graphs,
        #  so we have to keep a copy of the bayesian network
        #  in which all the edges are undirected
        undirected_BN = bayesian_network.to_undirected()
        bayesian_network.name = self._network_name
        #  Repeat for a large number of times.
        for i in xrange(self._num_BNs):
            count = 0
            while count < num_iterations:
                edge = (i, j) = GraphUtils.get_random_edge(self._node_names)

                if bayesian_network.has_edge(*edge):
                    #  If (i,j) is in the graph, remove it
                    GraphUtils.apply_action(bayesian_network, undirected_BN, edge, 'remove', self._max_parents)
                else:
                    #  If the edge  (i,j) is not in the graph, add it.
                    GraphUtils.apply_action(bayesian_network, undirected_BN, edge, 'add', self._max_parents)
                count += 1
            bayesian_networks.append(deepcopy(bayesian_network))

        #  Return the obtained graph
        return bayesian_networks

    def _rename_nodes(self, graph):
        new_graph = DiGraph()
        #  print len(self._node_names)
        for node in graph.nodes():
            #  Add all the nodes with the names given in the data set
            new_node_name = self._node_names[node]
            new_graph.add_node(new_node_name)

        for edge in graph.edges():
            #  Add all the edges with the names given in the data set
            new_source_node = self._node_names[edge[0]]
            new_destination_node = self._node_names[edge[1]]
            new_graph.add_edge(new_source_node, new_destination_node)
        new_graph.name = self._network_name
        return new_graph

    def _exceeded_parent_limit(self, bayesian_network, node):
        return len(bayesian_network.predecessors(node)) > self._max_parents

    @staticmethod
    def test():
        generator = RandomBNGenerator('cancer', 3, max_parents = 2)
        bns = generator.get_bayesian_networks()
        for bn in bns:
            print bn.nodes()
            print GraphUtils.has_cycle(bn)
            draw(bn)
            plt.show()
Example 42
class PC:

    _mutual_info_thresholds = [0.0005, 0.005, 0.025, 0.025]

    def __init__(self, network_name):
        self._network_name = network_name
        self._data = DataExtractor(network_name)
        self._values_dict = self._data.get_variable_values_sets()
        self._node_names = self._values_dict.keys()
        self._graph = None
        self._nmis = {}

    def _get_probabilities(self, X, Y, S, S_combinations):
        data_vectors = self._data.get_data_vectors()
        N = len(data_vectors[data_vectors.keys()[0]])
        X_values = data_vectors[X]
        Y_values = data_vectors[Y]
        observed_prob_dict = {}
        #  Now we look for each value x of the variable X and each value y of the variable Y
        for x in self._values_dict[X]:    #  finding matches for x
            x_indices = set([element_index for (element_index, element) in enumerate(X_values) if element == x])
            observed_prob_dict['P(' + X + '=' + x + ')'] = len(x_indices) / float(N)
            for y in self._values_dict[Y]:
                #  finding matches for y
                y_indices = set([element_index for (element_index, element) in enumerate(Y_values) if element == y])
                observed_prob_dict['P(' + Y + '=' + y + ')'] = len(y_indices) / float(N)
                xy = x_indices.intersection(y_indices)
                observed_prob_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ')'] = len(xy) / float(N)

                for S_combination in S_combinations:
                    z_indices = PGMUtils.get_z_indices(S, S_combination, data_vectors)
                    z = z_indices
                    y_z = y_indices.intersection(z)
                    x_z = x_indices.intersection(z)
                    xyz = xy.intersection(z)
                    observed_prob_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(x_z) / float(len(z))
                    observed_prob_dict['P(' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(z) / float(N)
                    observed_prob_dict['P(' + Y + '=' + y + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(y_z) / float(len(z))
                    observed_prob_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ',' + ','.join(S) + '=' + ','.join(S_combination) + ')'] = len(xyz) / float(N)
        return observed_prob_dict

    def _are_dseparated(self, X, Y, S, n):
        H_X = H_Y = H_XY = 0
        S_combinations = PGMUtils.get_combinations(S, self._values_dict)
        probability_dict = self._get_probabilities(X, Y, S, S_combinations)
        for x in self._values_dict[X]:
            p_x = probability_dict['P(' + X + '=' + x + ')']
            for y in self._values_dict[Y]:
                p_y = probability_dict['P(' + Y + '=' + y + ')']
                #  in case we are looking for zero order conditional dependency
                if len(S_combinations) == 0:
                    H_Y += -log(p_y + 0.001)
                    H_X += -log(p_x + 0.001)
                    p_xy = probability_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ')']
                    H_XY += -log(p_xy + 0.001)
                else:
                    for S_combination in S_combinations:
                        p_y_z = probability_dict['P(' + Y + '=' + y + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_x_z = probability_dict['P(' + X + '=' + x + '|' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_xyz = probability_dict['P(' + X + '=' + x + ',' + Y + '=' + y + ',' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        p_z = probability_dict['P(' + ','.join(S) + '=' + ','.join(S_combination) + ')']
                        H_X += -log(p_x_z + 0.001)
                        H_Y += -log(p_y_z + 0.001)
                        H_XY += -log(p_xyz * p_z + 0.001)

        #  If the mutual information is greater than a certain threshold
        #  then X and Y are dependent, otherwise not
        n_X = 2 * len(self._values_dict[X])
        n_Y = 2 * len(self._values_dict[Y])
        n_XY = 4 * len(S_combinations)
        if n_XY == 0:
            n_XY = 4
        MI = abs((H_X / n_X) + (H_Y / n_Y) - (H_XY / n_XY))
        #  print 'MI(', X + ',' + Y + '|' + ','.join(S), ') = ', MI
        self._nmis[X + ',' + Y + '|' + ','.join(S)] = MI
        if MI < self._mutual_info_thresholds[n]:
            return True
        return False

    def _eliminate_edges(self, Sep):
        num_nodes = len(self._values_dict.keys())
        graph = complete_graph(num_nodes, Graph())
        self._graph = GraphUtils.rename_nodes(graph, self._node_names)
        n = 0
        max_allowed_degree = settings.networks_settings['genome']['max_allowed_degree']
        while n <= 3:
            print '--------------------------------------------------------'
            #  We repeat the iterations unless each node X has
            #  less than or equal to n neighbors
            for X in self._graph:
                for Y in self._graph:
                    if X != Y and GraphUtils.is_degree_greater(self._graph, max_allowed_degree) \
                    and is_connected(self._graph):
                        #  all the neighbors of X excluding Y
                        neighbors = self._graph.neighbors(X)
                        if Y in neighbors:
                            neighbors.remove(Y)
                            #  We only consider X,Y if #neighbors of X excluding Y are more than
                            #  or equal to n
                            if len(neighbors) >= n:
                                #  Combinations of all the adjacent nodes of X excluding Y
                                #  each subset in the observed_sets has cardinality 'n'
                                observed_subsets = combinations(neighbors, n)
                                for S in observed_subsets:
                                    #  We only consider the subsets which have exactly n elements
                                    S = [s for s in sorted(S)]
                                    are_dseparated = self._are_dseparated(X, Y, S, n)
                                    if are_dseparated:
                                        if self._graph.has_edge(X, Y):
                                            self._graph.remove_edge(X, Y)
                                            print 'Removed', X, '-', Y
                                            Sep[X + ',' + Y] = S
                                            Sep[Y + ',' + X] = S
            n += 1

    def _has_directed_path(self, A, B):
        has_directed_path = False
        paths = all_simple_paths(self._graph, A, B)
        for path in paths:
            if has_directed_path:
                break
            i = 0
            while i < len(path) - 1:
                src_node = path[i]
                next_node = path[i + 1]
                edge = self._graph.edge[src_node][next_node]
                if 'direction' in edge:
                    if edge['direction'] == src_node + '->' + next_node:
                        has_directed_path = True
                else:
                    has_directed_path = False
                    break
                i += 1
        return has_directed_path

    def _all_edges_oriented(self):
        '''
        for edge in self._graph.edges():
            if 'direction'in self._graph[edge[0]][edge[1]]:
                print self._graph[edge[0]][edge[1]]['direction']
        '''
        for edge in self._graph.edges():
            if 'direction' not in self._graph[edge[0]][edge[1]]:
                return False
        return True

    def _orient_edges(self, Sep):
        triplets = []
        for source in self._graph.nodes():
            for target in self._graph.nodes():
                if source != target:
                    if not self._graph.has_edge(source, target):
                        #  Each element in triplets lists will be a list of three nodes
                        #  [X, Y , Z] such that X and Z are not adjacent in the graph
                        #  while X,Y and Y,Z are adjacent
                        triplets.append(list(all_simple_paths(self._graph, source, target, 2)))

        for triplet in triplets:
            if triplet != []:
                X, Y , Z = triplet[0][0], triplet[0][1], triplet[0][2]
                if Y not in Sep[X + ',' + Z]:
                    #  networkx does not support partially directed graphs,
                    #  so we attach a direction attribute to all the edges which we
                    #  want to be directed
                    edgeXY = self._graph.edge[X][Y]
                    edgeXY['direction'] = X + '->' + Y
                    edgeZY = self._graph.edge[Z][Y]
                    edgeZY['direction'] = Z + '->' + Y

        while not self._all_edges_oriented():
            for edge in self._graph.edges():
                A = edge[0]
                B = edge[1]
                edgeAB = self._graph.edge[A][B]
                if 'direction' in edgeAB:
                    if edgeAB['direction'] == A + '->' + B:
                        for C in self._graph.neighbors(B):
                            #  A & C are not adjacent
                            if not self._graph.has_edge(A, C):
                                edgeBC = self._graph.edge[B][C]
                                if 'direction' not in edgeBC:
                                    edgeBC['direction'] = B + '->' + C

                elif self._has_directed_path(A, B):
                    edgeAB['direction'] = A + '->' + B

    def perform_PC(self):
        #  Implementation of the PC algorithm given here:
        #  http://www.lowcaliber.org/influence/spirtes-causation-prediction-search.pdf
        Sep = {}
        self._eliminate_edges(Sep)
        pprint(sorted(self._nmis.iteritems(), key = itemgetter(1), reverse = True))
        print self._graph.edges()
        draw(self._graph)
        plt.show()
        if is_connected(self._graph):
            print 'The graph is connected'
        else:
            print 'The graph is not connected'

        self._orient_edges(Sep)
        pprint (self._graph.edges())

    def get_skeleton(self):
        self._graph = GraphUtils.convert_to_directed(self._graph)
        self._graph.name = self._network_name
        return self._graph
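
A hedged usage sketch for the PC implementation above, assuming a network name that this project's DataExtractor and settings can resolve ('genome' matches the max_allowed_degree lookup and the script at the end of this listing):

pc = PC('genome')
#  Eliminates edges via the conditional-independence tests, then orients them,
#  printing the surviving edges and drawing the graph along the way.
pc.perform_PC()
directed_skeleton = pc.get_skeleton()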
Example 43
'''
Created on Dec 10, 2013

@author: himanshu
'''
import json
from networkx import DiGraph, draw
from libpgm.nodedata import NodeData
from libpgm.graphskeleton import GraphSkeleton
from libpgm.discretebayesiannetwork import DiscreteBayesianNetwork
from libpgm.pgmlearner import PGMLearner
import matplotlib.pyplot as plt
from data_extractor import DataExtractor


#  generate some data to use
data_ext = DataExtractor('genome', format = 'json')
data = data_ext.get_data_vectors()
print 'Got data with ', len(data), ' vectors'
#  instantiate my learner
learner = PGMLearner()

print 'learning the structure'
#  estimate structure
result = learner.discrete_constraint_estimatestruct(data, pvalparam = 0.02)

#  output
print json.dumps(result.E, indent = 2)
graph = DiGraph()
graph.add_edges_from(result.E)
draw(graph)
plt.show()