class IOrganiser(object):
    '''
    Fetches data using the Extractor and organises it in json files
    for an appropriate visualisation presenter.
    '''

    def __init__(self):
        ''' Constructor '''
        self._extractor = Extractor()
        # will store a dictionary with the necessary data
        self.vis_data = None

    def _write_data(self):
        """ Dump self.vis_data to "data.json" as indented JSON text. """
        filename = "data.json"
        with open(filename, "w") as out_file:
            json_text = json.dumps(self.vis_data, indent=4)
            out_file.write(json_text)

    def _organise_data(self, conf):
        """
        Should use the extractor to fetch what is needed and format it
        in a dictionary (stored in self.vis_data).
        @attention: must override
        """
        raise MustOverrideError

    def get_representation(self, conf):
        if conf.cache_enabled:
            self._extractor.enable_cache(conf.cache_host, conf.cache_port)
        self._organise_data(conf)
        if conf.cache_enabled and self._extractor.was_cached():
            print("Cache was hit, didn't have to query the World Bank API.")
        elif conf.cache_enabled:
            print("Data wasn't cached, queried the World Bank API.")
        #self._write_data()
        return self.vis_data
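# A minimal sketch of a concrete organiser, showing how the IOrganiser
# contract above is meant to be used. PopulationOrganiser and its choice
# of indicator are hypothetical additions for illustration; only
# _organise_data needs overriding, per the docstring.
class PopulationOrganiser(IOrganiser):
    ''' Hypothetical organiser: total population per country. '''

    def _organise_data(self, conf):
        arg = self._extractor.arg()
        arg["country_codes"] = conf.countries
        arg["indicator_codes"] = ["SP.POP.TOTL"]
        arg["interval"] = (conf.start_date, conf.end_date)
        countries = self._extractor.grab(arg)
        # store a plain dictionary so get_representation can return it
        self.vis_data = {
            country.code: country.get_indicator("SP.POP.TOTL").values
            for country in countries
        }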
def test_extractor(self):
    extractor = Extractor()
    countries = extractor.grab()
    self.assertTrue(len(countries) > 0)

    arg = extractor.arg()
    arg["country_codes"] = ["usa", "hrv"]
    arg["indicator_codes"] = ["SP.POP.TOTL", "SL.TLF.PART.MA.ZS"]
    arg["interval"] = (2005, 2006)
    countries = extractor.grab(arg)
    self.assertTrue(len(countries) > 0)

    arg = extractor.arg()
    arg["country_codes"] = ["hrv"]
    arg["indicator_codes"] = ["SP.POP.TOTL"]
    arg["interval"] = (1998, 1999)
    countries = extractor.grab(arg)
    indicator = countries[0].get_indicator("SP.POP.TOTL")
    self.assertEqual(indicator.values, [4501000.0, 4554000.0])
    self.assertEqual(indicator.dates, [1998, 1999])
#!/usr/bin/python
import sys
import fileinput

from dracula.extractor import Extractor

if __name__ == "__main__":
    try:
        desc_path = sys.argv[1]
    except IndexError:
        print("Usage:\npython extend_sgd_description.py <desired .sgd_description file>")
        exit()

    # build a mapping from World Bank country codes to country names
    extractor = Extractor()
    country_names = {}
    countries = extractor.grab_metadata("countries")
    for country in countries:
        country_names[country.code] = country.name

    # rewrite the description file in place, appending the country name
    for line in fileinput.input(desc_path, inplace=1):
        num_country, year = line.rstrip().split("-")
        number, country_code = num_country.split(" ")
        try:
            country_name = country_names[country_code]
        except KeyError:
            country_name = "????"
        sys.stdout.write("%s %s-%s-%s\n" % (number, country_code, year, country_name))
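# A hypothetical before/after illustration of what the rewrite loop above
# does to a single .sgd_description line (the sample line and numbering
# below are invented):
#
#   input line:  "17 HRV-1998"
#   output line: "17 HRV-1998-Croatia"
#
# Lines whose country code is missing from the World Bank metadata keep
# the "????" placeholder instead of a name.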
class IVisualisation(object):
    '''
    Abstract visualisation, independent of concrete implementations.
    '''

    def __init__(self):
        ''' Constructor '''
        self._counter = 0
        self._got_items = False
        # initialize default configuration options
        #TODO: IVisualisation shouldn't use the extractor at all, but rely on a data organiser
        self._extractor = Extractor()
        if conf.cache_enabled:
            self._extractor.enable_cache(conf.cache_host, conf.cache_port)

    def get_conf(self):
        return self.conf

    def _should_add_meta_marks(self):
        """
        Tells if additional graph elements (axis labels, legend) should be placed.
        @return: Boolean
        """
        return not self.conf.combine_plots or self._counter == 0

    def get_title(self):
        if conf.auto_title:
            return self._auto_graph_title()
        else:
            return conf.graph_title

    def _auto_graph_title(self):
        if conf.combine_plots:
            country_representation = ", ".join(
                [str(item).upper() for item in self._get_items()])
        else:
            country_representation = str(self._get_items()[self._counter]).upper()
        title = "%s - %s" % (country_representation, conf.title_end)
        return title

    def _get_items(self):
        """
        Get items that form independent units of data for drawing.
        Normally these are countries, but this can be overridden.
        Each item then gets passed to the _create_figure function
        to draw it on a graph.
        @return: list of items
        """
        #TODO: this should just get a data list from the data organiser
        if not self._got_items:
            arg = self._extractor.arg()
            arg["country_codes"] = conf.countries
            arg["indicator_codes"] = conf.indicators
            arg["interval"] = (conf.start_date, conf.end_date)
            self.countries = self._extractor.grab(arg)
            #TODO: preprocessor
            # self._extractor.process(conf.process_indicators,
            #                         method="slope",
            #                         look_back_years=conf.look_back_years)
            if conf.cache_enabled and self._extractor.was_cached():
                print("Cache was hit, didn't have to query the World Bank API.")
            elif conf.cache_enabled:
                print("Data wasn't cached, queried the World Bank API.")
            self._got_items = True
        return self.countries

    def _start_new_figure(self):
        """ a hook for doing pre-plot stuff """
        raise MustOverrideError

    def _finish_figure(self):
        """ a hook for doing post-plot stuff """
        raise MustOverrideError

    def _create_figure(self, item):
        """ Create a figure and return it as a matplotlib object. Must override. """
        raise MustOverrideError

    def _show(self):
        """ a hook to actually show the graph (potentially blocking code) """
        pass

    def show(self):
        if not conf.write_to_file:
            # interactive
            self._show()

    def create_all_figures(self, vis_data):
        """ Write all figures to file(s) or plot in one or more windows. """
        # we create only one figure if this is a combo plot
        if conf.combine_plots:
            self._start_new_figure()
        # iterate through items (e.g. countries)
        #TODO: actually use vis_data here and
        # in the complete_multigroup_visualisation
        for item in self._get_items():
            if not conf.combine_plots:
                self._start_new_figure()
            self._create_figure(item)
            # this counter is important for subclasses. Be careful!
            self._counter += 1
            if not conf.combine_plots:
                self._finish_figure()
        # store the plots in case this is a combined plot
        if conf.combine_plots:
            self._finish_figure()
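# A minimal sketch of a concrete visualisation, assuming matplotlib as the
# plotting backend (consistent with the _create_figure docstring above).
# LinePlotVisualisation, the indicator plotted, and the output filenames
# are hypothetical; the sketch only shows how the three hooks and
# _should_add_meta_marks are meant to interact.
import matplotlib.pyplot as plt

class LinePlotVisualisation(IVisualisation):
    ''' Hypothetical line-plot visualisation: one indicator per item. '''

    def _start_new_figure(self):
        plt.figure()

    def _create_figure(self, item):
        # item is a country object, as returned by _get_items()
        indicator = item.get_indicator(conf.indicators[0])
        plt.plot(indicator.dates, indicator.values, label=str(item).upper())
        if self._should_add_meta_marks():
            plt.title(self.get_title())
            plt.legend()

    def _finish_figure(self):
        if conf.write_to_file:
            plt.savefig("figure_%d.png" % self._counter)

    def _show(self):
        plt.show()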
class SamplesSet(object):
    '''
    Responsible for building train and test sets
    '''

    def __init__(self, look_back_years, cache_enabled=False,
                 cache_host="localhost", cache_port=27017):
        ''' Constructor '''
        self.t_loc = conf.sample_selection_file
        self.extractor = Extractor()
        self.cache_enabled = cache_enabled
        if self.cache_enabled:
            self.extractor.enable_cache(cache_host, cache_port)
        self.look_back_years = look_back_years
        self.preprocessor = Preprocessor()
        # sample set placeholders
        self.crisis_samples = []
        self.normal_samples = []
        self.metadata = Metadata(conf, look_back_years)

    def interesting_years_before(self, target_year):
        return range(target_year - self.look_back_years, target_year)

    def assign_samples(self, indicators, event_years, event_class, country_code="?"):
        """
        Create machine learning samples from indicators.

        event_years - years of crises or normal periods
                      (as specified in the sample selection file or in a rule)
        event_class - desired class corresponding to these years
        """
        samples = []
        # select only interesting values from the indicator
        for event_year in event_years:
            interesting_years = self.interesting_years_before(event_year)
            try:
                features = []
                for indicator in indicators:
                    new_features = self.preprocessor.preprocess_indicator(
                        indicator, interesting_years)
                    features.extend(new_features)
                sample_description = country_code.upper() + "-" + str(event_year)
                sample = Sample(features, event_class,
                                description=sample_description)
                samples.append(sample)
            except NonExistentDataError:
                pass
        return samples

    def convert_to_boundaries(self, event_years, look_back_years):
        """
        Convert a list of event years and look back years into a list
        of 2-tuples of boundaries (begin_year, end_year).
        """
        boundaries = []
        for event_year in event_years:
            boundaries.append((event_year - look_back_years, event_year - 1))
        return boundaries

    def events_to_boundaries(self, all_events, look_back_years):
        event_boundaries = {}
        for key, value in all_events.items():
            event_boundaries[key] = self.convert_to_boundaries(value, look_back_years)
        return event_boundaries

    def divide_single(self, samples, test_percentage):
        # divide a list of samples into train and test samples
        if test_percentage == 0:
            train_samples = samples
            test_samples = []
        else:
            number_test = int(len(samples) * test_percentage)
            test_samples = sample(samples, number_test)
            train_samples = list(set(samples).difference(set(test_samples)))
        return train_samples, test_samples

    def divide(self, crisis_samples, normal_samples, test_percentage):
        # same as divide_single, only does it for both crisis and normal
        # samples and combines them into single train and test lists
        self.train_samples, self.test_samples = self.divide_single(
            crisis_samples, test_percentage)
        new_train_samples, new_test_samples = self.divide_single(
            normal_samples, test_percentage)
        self.train_samples.extend(new_train_samples)
        self.test_samples.extend(new_test_samples)
        return self.train_samples, self.test_samples

    def build_from_crises_file(self, country_codes, feature_indicators, test_percentage):
        """
        Entry method that builds a samples set by fetching the data using
        the extractor. Classes are determined from a crisis XLS file.
        """
        # clear the sample sets
        self.crisis_samples = []
        self.normal_samples = []
        # get the years classified as crises / normal periods
        dates_input = Input()
        t_crises, t_normal = dates_input.parse_sample_selection(self.t_loc)
        crises_list, normal_list = dates_input.parse_sample_selection_to_list(self.t_loc)
        if country_codes[0] == "EVERYTHING":
            # we take everything available in the samples set
            wb_countries = self.extractor.grab_metadata("countries")
            wb_country_codes = set([country.code for country in wb_countries])
            samples_definition_codes = set(t_crises.keys()) | set(t_normal.keys())
            country_codes = list(wb_country_codes & samples_definition_codes)
            country_codes.sort()

        # we fetch all the data here
        # boundaries
        start_date = min(min(crises_list), min(normal_list)) - conf.look_back_years
        end_date = max(max(crises_list), max(normal_list))
        arg = self.extractor.arg()
        arg["country_codes"] = country_codes
        arg["indicator_codes"] = feature_indicators
        arg["interval"] = (start_date, end_date)
        arg["pause"] = conf.wb_pause
        countries = self.extractor.grab(arg)
        if self.cache_enabled and self.extractor.was_cached():
            print("Cache was hit, didn't have to query the World Bank API.")
        elif self.cache_enabled:
            print("Data wasn't cached, queried the World Bank API.")

        # assign the samples
        for country in countries:
            # fetch all the indicators for the target country
            indicators = []
            for ind_code in feature_indicators:
                indicator = country.get_indicator(ind_code)
                indicators.append(indicator)
            # create samples from those indicators - in crises...
            try:
                crisis_years = t_crises[country.code]
            except KeyError:
                continue  # we skip this country
            new_samples = self.assign_samples(indicators, crisis_years,
                                              CRISIS_CLASS, country.code)
            self.crisis_samples.extend(new_samples)
            # ... and in normal periods
            normal_years = t_normal[country.code]
            new_samples = self.assign_samples(indicators, normal_years,
                                              NORMAL_CLASS, country.code)
            self.normal_samples.extend(new_samples)

        return self.divide(self.crisis_samples, self.normal_samples, test_percentage)

    def build_by_condition(self, country_codes, indicators, feature_indicators, test_percentage):
        # determine crises according to some condition/rule
        raise NotImplementedError
# <nbformat>3.0</nbformat>

# <markdowncell>

# Reading data
# ===========
# In this part we'll get the crisis years from a published IMF data set.
#
# First let's get the standard country codes corresponding to these countries using dracula.

# <codecell>

import inspect

from dracula.extractor import Extractor
import dracula

extractor = Extractor()
country_codes = {}
countries = extractor.grab_metadata("countries")
print(inspect.getsourcelines(dracula.wb.parser.parse_multiple_countries_alone))
for country in countries:
    country_codes[country.name] = country.code
print(country_codes)

# <markdowncell>

# Manual fixing

# <codecell>

country_codes["Serbia, Republic of"] = 'SRB'
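# <markdowncell>

# With the mapping in place we can translate dataset country names to
# World Bank codes. (The lookup below is an illustrative addition; the
# sample name is invented.)

# <codecell>

print(country_codes.get("Croatia", "unknown"))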
def test_cache(self):
    #host = "localhost"
    host = "lis.irb.hr"
    extractor = Extractor()
    # enable cache, but use a test DB
    extractor.enable_cache(host, 27017, test=True)
    extractor.clear_cache()

    # grab some data
    arg = extractor.arg()
    arg["country_codes"] = ["hrv", "usa"]
    arg["interval"] = (1997, 1999)
    arg["indicator_codes"] = ["SP.POP.TOTL"]
    countries = extractor.grab(arg)

    # see if it's cached
    self.assertEqual(extractor.is_cached(arg), True,
                     "Countries must be cached after grab")
    arg["country_codes"].append("fin")
    self.assertEqual(extractor.is_cached(arg), False,
                     "Countries must match to give a cache hit")
    arg["country_codes"] = ["hrv", "usa"]
    arg["interval"] = (1996, 1999)
    self.assertEqual(extractor.is_cached(arg), False,
                     "Years must match to give a cache hit")
    arg["interval"] = (1997, 1999)
    arg["indicator_codes"].append("FR.INR.RINR")
    self.assertEqual(extractor.is_cached(arg), False,
                     "Indicators must match to give a cache hit")

    # grab some more data and see if there are duplicate countries
    countries = extractor.grab(arg)
    country_count = len([c for c in extractor._cacher.db.countries.find()])
    self.assertEqual(country_count, 2,
                     "Grabbing a wider set must not leave duplicates!")