Example #1
def model(all_x, all_x_dates, all_y, all_y_dates):
    # slope of the x series over the look_back_years values
    # preceding the most recent observation
    values = all_x[-(look_back_years + 1):-1]
    dates = all_x_dates[-(look_back_years + 1):-1]
    ind1 = Preprocessor(dates, values).slope()
    # slope of the y series over the same window
    values = all_y[-(look_back_years + 1):-1]
    dates = all_y_dates[-(look_back_years + 1):-1]
    ind2 = Preprocessor(dates, values).slope()
    # flag the sample when both slopes fall below their thresholds
    return ind1 < 0.20 and ind2 < 0.11
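
A hypothetical invocation of the rule might look like the sketch below; look_back_years and Preprocessor are assumed to come from the enclosing module, and all series values are made up for illustration.

# Hypothetical usage sketch (made-up data; look_back_years and
# Preprocessor are assumed to be defined in the enclosing module).
look_back_years = 5
all_x_dates = [2008, 2009, 2010, 2011, 2012, 2013, 2014]
all_x = [2.1, 2.3, 2.2, 2.8, 3.0, 3.1, 3.4]
all_y_dates = [2008, 2009, 2010, 2011, 2012, 2013, 2014]
all_y = [1.0, 1.1, 0.9, 1.2, 1.3, 1.2, 1.5]
flagged = model(all_x, all_x_dates, all_y, all_y_dates)
print(flagged)  # True only when both trailing slopes are below threshold
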
Example #2
def __init__(self,
             look_back_years,
             cache_enabled=False,
             cache_host="localhost",
             cache_port=27017):
    '''
    Constructor. look_back_years sets the feature window; when
    cache_enabled is True, the extractor caches World Bank
    responses at cache_host:cache_port.
    '''
    self.t_loc = conf.sample_selection_file
    self.extractor = Extractor()
    self.cache_enabled = cache_enabled
    if self.cache_enabled:
        self.extractor.enable_cache(cache_host, cache_port)
    self.look_back_years = look_back_years
    self.preprocessor = Preprocessor()
    # sample set placeholders
    self.crisis_samples = []
    self.normal_samples = []
    self.metadata = Metadata(conf, look_back_years)
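
The default cache_port of 27017 is MongoDB's default port, so the extractor cache is presumably backed by a local MongoDB instance. A minimal construction sketch (all argument values here are assumptions):

# Hypothetical construction sketch; the argument values are assumptions.
samples_set = SamplesSet(look_back_years=5,
                         cache_enabled=True,
                         cache_host="localhost",
                         cache_port=27017)
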
Example #3
def apply_slope(self, *args):
    """
    Replace self.values with rolling slopes over trailing windows.

    @param *args:
    look_back_years - integer: how many values back each slope
    window reaches
    """
    # TODO: move this method to preprocessor & get rid of the foc dependency
    look_back_years = args[0]
    new_values = []
    past_values = []
    past_dates = []
    for i in range(len(self.dates)):
        past_dates.append(self.dates[i])
        past_values.append(self.values[i])
        if i >= look_back_years - 1:
            # a full window is available: record its slope, then
            # slide the window forward by one observation
            new_values.append(
                Preprocessor(past_dates, past_values).slope())
            past_dates.pop(0)
            past_values.pop(0)
    # drop the leading dates so self.dates stays aligned with the
    # shorter list of slopes; slice deletion is safe even when the
    # series is shorter than the window
    del self.dates[:look_back_years - 1]
    self.values = new_values
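
For reference, a self-contained sketch of the same rolling-slope computation, under the assumption that Preprocessor.slope performs an ordinary least-squares linear fit (here via numpy.polyfit with degree 1):

import numpy as np

def rolling_slopes(dates, values, look_back_years):
    # least-squares slope over each trailing window of
    # look_back_years observations (assumes Preprocessor.slope
    # is equivalent to a degree-1 polynomial fit)
    slopes = []
    for i in range(look_back_years - 1, len(values)):
        window_dates = dates[i - look_back_years + 1:i + 1]
        window_values = values[i - look_back_years + 1:i + 1]
        slope, _ = np.polyfit(window_dates, window_values, 1)
        slopes.append(slope)
    return slopes

print(rolling_slopes(list(range(2000, 2006)),
                     [1.0, 1.5, 1.8, 2.6, 3.1, 3.3], 3))
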
Example #4
class SamplesSet(object):
    '''
    Responsible for building train and test sets
    '''
    def __init__(self,
                 look_back_years,
                 cache_enabled=False,
                 cache_host="localhost",
                 cache_port=27017):
        '''
        Constructor. look_back_years sets the feature window; when
        cache_enabled is True, the extractor caches World Bank
        responses at cache_host:cache_port.
        '''
        self.t_loc = conf.sample_selection_file
        self.extractor = Extractor()
        self.cache_enabled = cache_enabled
        if self.cache_enabled:
            self.extractor.enable_cache(cache_host, cache_port)
        self.look_back_years = look_back_years
        self.preprocessor = Preprocessor()
        # sample set placeholders
        self.crisis_samples = []
        self.normal_samples = []
        self.metadata = Metadata(conf, look_back_years)

    def interesting_years_before(self, target_year):
        # the look_back_years years immediately preceding target_year
        return range(target_year - self.look_back_years, target_year)

    def assign_samples(self,
                       indicators,
                       event_years,
                       event_class,
                       country_code="?"):
        # creates machine-learning samples from the given indicators
        # arguments:
        # event_years - years of crises or normal periods
        # (as specified in the sample selection file or in a rule)
        # event_class - desired class assigned to these years
        samples = []
        # select only interesting values from the indicator
        for event_year in event_years:
            interesting_years = self.interesting_years_before(event_year)
            try:
                features = []
                for indicator in indicators:
                    new_features = self.preprocessor.preprocess_indicator(
                        indicator, interesting_years)
                    features.extend(new_features)
                sample_description = country_code.upper() + "-" + str(
                    event_year)
                sample = Sample(features,
                                event_class,
                                description=sample_description)
                samples.append(sample)
            except NonExistentDataError:
                # skip event years whose indicator data is missing
                pass
        return samples

    def convert_to_boundaries(self, event_years, look_back_years):
        """
        convert a list of event years and look back years into
        a list of 2-tuples of boundaries (begin_year, end_year)
        """
        boundaries = []
        for event_year in event_years:
            boundaries.append((event_year - look_back_years, event_year - 1))
        return boundaries

    def events_to_boundaries(self, all_events, look_back_years):
        # map {key: [event years]} to {key: [(begin_year, end_year)]}
        event_boundaries = {}
        for key, value in all_events.items():
            event_boundaries[key] = self.convert_to_boundaries(
                value, look_back_years)
        return event_boundaries

    def divide_single(self, samples, test_percentage):
        # divide a list of samples into train and test subsets
        if test_percentage == 0:
            train_samples = samples
            test_samples = []
        else:
            number_test = int(len(samples) * test_percentage)
            test_samples = sample(samples, number_test)  # presumably random.sample
            train_samples = list(set(samples).difference(set(test_samples)))
        return train_samples, test_samples

    def divide(self, crisis_samples, normal_samples, test_percentage):
        # same as divide_single, but applied to both the crisis and the
        # normal samples, combining them into single train and test lists
        self.train_samples, self.test_samples = self.divide_single(
            crisis_samples, test_percentage)
        new_train_samples, new_test_samples = self.divide_single(
            normal_samples, test_percentage)
        self.train_samples.extend(new_train_samples)
        self.test_samples.extend(new_test_samples)
        return self.train_samples, self.test_samples

    def build_from_crises_file(self, country_codes, feature_indicators,
                               test_percentage):
        """
        Entry method that builds a samples set by fetching the data using the extractor.
        Classes are determined from a crisis XLS file.
        
        sparse - if True it fetches the data for the necessary years only. Shown to be non-efficient.
        """
        # clear the sample sets
        self.crisis_samples = []
        self.normal_samples = []
        # get the years classified as crises / normal periods
        dates_input = Input()
        t_crises, t_normal = dates_input.parse_sample_selection(self.t_loc)
        crises_list, normal_list = dates_input.parse_sample_selection_to_list(
            self.t_loc)

        if country_codes[0] == "EVERYTHING":
            # take every country available in the samples definition
            wb_countries = self.extractor.grab_metadata("countries")
            wb_country_codes = set([country.code for country in wb_countries])
            samples_definition_codes = set(t_crises.keys()) | set(
                t_normal.keys())
            country_codes = list(wb_country_codes & samples_definition_codes)
            country_codes.sort()

        # we fetch all the data here
        # boundaries
        start_date = min(min(crises_list),
                         min(normal_list)) - conf.look_back_years
        end_date = max(max(crises_list), max(normal_list))
        arg = self.extractor.arg()
        arg["country_codes"] = country_codes
        arg["indicator_codes"] = feature_indicators
        arg["interval"] = (start_date, end_date)
        arg["pause"] = conf.wb_pause
        countries = self.extractor.grab(arg)
        if self.cache_enabled and self.extractor.was_cached():
            print("Cache was hit, didn't have to query the World Bank API.")
        elif self.cache_enabled:
            print("Data wasn't cached, queried the World Bank API.")

        # assign the samples
        for country in countries:
            # fetch all the indicators for target country
            indicators = []
            for ind_code in feature_indicators:
                indicator = country.get_indicator(ind_code)
                indicators.append(indicator)
            # create samples from those indicators - in crises...
            try:
                crisis_years = t_crises[country.code]
            except KeyError:
                continue  # we skip this country
            new_samples = self.assign_samples(indicators, crisis_years,
                                              CRISIS_CLASS, country.code)
            self.crisis_samples.extend(new_samples)
            # ... and in normal periods
            normal_years = t_normal.get(country.code, [])  # country may have no normal periods
            new_samples = self.assign_samples(indicators, normal_years,
                                              NORMAL_CLASS, country.code)
            self.normal_samples.extend(new_samples)
        return self.divide(self.crisis_samples, self.normal_samples,
                           test_percentage)

    def build_by_condition(self, country_codes, indicators, feature_indicators,
                           test_percentage):
        # determine crises according to some condition/rule
        raise NotImplementedError
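
To tie the pieces together, a hypothetical end-to-end run might look as follows. The country codes, test percentage, and choice of indicators are illustrative assumptions; FP.CPI.TOTL.ZG (CPI inflation) and NY.GDP.MKTP.KD.ZG (GDP growth) are World Bank indicator codes.

# Hypothetical end-to-end sketch; argument values are assumptions.
samples_set = SamplesSet(look_back_years=5, cache_enabled=True)
train, test = samples_set.build_from_crises_file(
    country_codes=["USA", "ARG"],
    feature_indicators=["FP.CPI.TOTL.ZG", "NY.GDP.MKTP.KD.ZG"],
    test_percentage=0.2)
print(len(train), "training samples,", len(test), "test samples")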