Example #1
def fetch_latest_backup(destination_path, force_all=False):
    datasets = Datasets(destination_path)

    if force_all:
        files = datasets.downloader.LATEST
    else:
        files = tuple(f for f in datasets.downloader.LATEST
                      if not os.path.exists(os.path.join(destination_path, f)))

    if not files:
        log.info(
            'You already have all the latest datasets! Nothing to download.')

    return datasets.downloader.download(files)
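A quick usage sketch, assuming the Datasets/downloader API behaves as shown above (the destination path is a placeholder):

# Hypothetical call: fetch only the datasets missing from ./data
fetch_latest_backup('./data')
# Or re-download everything, ignoring files already on disk
fetch_latest_backup('./data', force_all=True)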
Example #2
def fetch_session_start_times(data_dir, pivot, session_dates):
    """
    :param data_dir: (str) directory in which the output file will be saved
    :param pivot: (int) congressperson document to use as a pivot for scraping the data
    :param session_dates: (list) datetime objects to fetch the start times for
    """
    session_start_times = SessionStartTimesDataset()
    df = session_start_times.fetch(pivot, session_dates)
    save_to_csv(df, data_dir, "session-start-times")

    log.info("Dates requested:", len(session_dates))
    found = pd.to_datetime(df['date'],
                           format="%Y-%m-%d %H:%M:%S").dt.date.unique()
    log.info("Dates found:", len(found))
    return df
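A hedged invocation sketch; the pivot document number and the dates below are placeholders:

from datetime import datetime

# Hypothetical call; 123456 stands in for a real congressperson document
dates = [datetime(2016, 5, 3), datetime(2016, 5, 4)]
df = fetch_session_start_times('data/', 123456, dates)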
Example #3
    def read_csv(self, name):
        filepath = os.path.join(self.path, name)
        log.info('Loading {}…'.format(name))
        # These identifier columns must stay as strings (np.str was
        # removed from NumPy; the built-in str is the correct dtype here)
        dtype = {
            'applicant_id': str,
            'batch_number': str,
            'cnpj_cpf': str,
            'congressperson_document': str,
            'congressperson_id': str,
            'document_id': str,
            'document_number': str,
            'document_type': str,
            'leg_of_the_trip': str,
            'passenger': str,
            'reimbursement_number': str,
            'subquota_group_description': str,
            'subquota_group_id': str,
            'subquota_number': str,
            'term_id': str,
        }
        return pd.read_csv(filepath, dtype=dtype)
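The dtype mapping matters because these columns hold identifiers, not numbers; a minimal illustration of what reading them as str prevents (the data is made up):

import io
import pandas as pd

raw = 'document_number\n007\n042\n'
# Without a dtype, pandas infers int64 and the leading zeros are lost
pd.read_csv(io.StringIO(raw))['document_number'].tolist()
# -> [7, 42]
# Forcing str keeps the identifiers intact
pd.read_csv(io.StringIO(raw),
            dtype={'document_number': str})['document_number'].tolist()
# -> ['007', '042']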
Example #4
    def _try_fetch_xml(self, attempts, url):
        # Needs module-level imports: socket, time, urllib.error,
        # urllib.request
        while attempts > 0:
            try:
                return urllib.request.urlopen(url, data=None, timeout=10)
            except urllib.error.HTTPError as err:
                log.error("HTTP Error {} when loading URL {}".format(
                    err.code, url))
                # 500 seems to be the error code for "no data found for the
                # params provided"
                if err.code == 500:
                    log.info("Skipping [HTTP Status 500] {}".format(url))
                    return None
                time.sleep(self.sleep_interval / 2)
                attempts -= 1
                if attempts > 0:
                    log.info("Trying again ({} attempts left)".format(attempts))
                else:
                    log.error("FAIL {}".format(url))
            except socket.error as socketerror:
                log.error("Socket error: {}".format(socketerror))
                time.sleep(self.sleep_interval * 10)
                attempts -= 1
                if attempts > 0:
                    log.info("Trying again ({} attempts left)".format(attempts))
                else:
                    log.error("FAIL {}".format(url))
Example #5
    def write_reimbursement_file(self, receipts):
        log.info('Casting changes to a new DataFrame…')
        df = pd.DataFrame(data=receipts)

        log.info('Writing it to file…')
        filepath = os.path.join(self.path, self.FILE_BASE_NAME)
        df.to_csv(filepath, **self.CSV_PARAMS)

        log.info('Done.')
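FILE_BASE_NAME and CSV_PARAMS are class attributes not shown in this excerpt; plausible values (purely an assumption, built from standard pandas to_csv options) would be:

# Hypothetical values for the attributes referenced above
FILE_BASE_NAME = 'reimbursements.xz'
CSV_PARAMS = {'compression': 'xz', 'encoding': 'utf-8', 'index': False}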
Example #6
    def __init__(self):
        self.credentials = None
        self.client = None
        self.config = find_config(self.CONFIG)

        if not self.config_exists:
            log.info('Could not find {} file.'.format(self.CONFIG))
            log.info('You need an Amazon section in it to interact with S3')
            log.info('(Check config.ini.example if you need a reference.)')
            return

        settings = configparser.RawConfigParser()
        settings.read(self.config)
        self.settings = partial(settings.get, 'Amazon')

        try:
            self.credentials = {
                'aws_access_key_id': self.settings('AccessKey'),
                'aws_secret_access_key': self.settings('SecretKey'),
                'region_name': self.settings('Region')
            }

            # friendly message warning the user about an old config.ini version
            region = self.credentials.get('region_name', '')
            if region and region.startswith('s3-'):
                msg = (
                    'It looks like you have an old version of the config.ini '
                    'file. The service prefix (s3) no longer needs to be '
                    'appended to the region (sa-east-1). Please update your '
                    'config.ini, replacing regions like `s3-sa-east-1` with '
                    '`sa-east-1`.')
                log.info(msg)

        except configparser.NoSectionError:
            msg = ('You need an Amazon section in {} to interact with S3 '
                   '(Check config.ini.example if you need a reference.)')
            log.info(msg.format(self.CONFIG))
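Pieced together from the keys the code reads, the expected config.ini section looks like this (values are placeholders):

[Amazon]
AccessKey = YOUR_ACCESS_KEY
SecretKey = YOUR_SECRET_KEY
Region = sa-east-1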
Example #7
def fetch_deputies(data_dir):
    """
    :param data_dir: (str) directory in which the output file will be saved
    """
    deputies = DeputiesDataset()
    df = deputies.fetch()
    save_to_csv(df, data_dir, "deputies")

    holders = df.condition == 'Holder'
    substitutes = df.condition == 'Substitute'
    log.info("Total deputies:", len(df))
    log.info("Holder deputies:", len(df[holders]))
    log.info("Substitute deputies:", len(df[substitutes]))
    return df
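A minimal invocation sketch (the directory is a placeholder):

# Hypothetical call: saves the deputies dataset under data/ and returns it
deputies = fetch_deputies('data/')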
Example #8
def fetch_presences(data_dir, deputies, date_start, date_end):
    """
    :param data_dir: (str) directory in which the output file will be saved
    :param deputies: (pandas.DataFrame) a dataframe with deputies data
    :param date_start: (str) a date in the format dd/mm/yyyy
    :param date_end: (str) a date in the format dd/mm/yyyy
    """
    presences = PresencesDataset()
    df = presences.fetch(deputies, date_start, date_end)
    save_to_csv(df, data_dir, "presences")

    log.info("Presence records:", len(df))
    log.info("Records of deputies present on a session:",
             len(df[df.presence == 'Present']))
    log.info("Records of deputies absent from a session:",
             len(df[df.presence == 'Absent']))

    return df
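A hedged usage sketch, reusing the DataFrame returned by fetch_deputies and the dd/mm/yyyy format the docstring asks for:

# Hypothetical call covering February 2016
presences = fetch_presences('data/', deputies, '01/02/2016', '29/02/2016')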
Example #9
    def group(self, receipts):
        log.info(
            'Dropping rows without document_value or reimbursement_number…')
        subset = ['document_value', 'reimbursement_number']
        receipts = receipts.dropna(subset=subset)

        # lists (not tuples) so the keys concatenate below and stay valid
        # for groupby and merge in current pandas
        groupby_keys = ['year', 'applicant_id', 'document_id']
        receipts = receipts.dropna(subset=subset + groupby_keys)

        receipts = receipts[receipts['document_value'] != 0]
        receipts = receipts[receipts['reimbursement_number'] != '0']
        receipts = receipts[receipts['year'] != 0]
        receipts = receipts[receipts['applicant_id'] != '0']
        receipts = receipts[receipts['document_id'] != '0']

        log.info('Grouping dataset by applicant_id, document_id and year…')
        grouped = receipts.groupby(groupby_keys)

        log.info('Gathering all reimbursement numbers together…')
        numbers = self.aggregate(grouped, 'reimbursement_number',
                                 'reimbursement_numbers',
                                 lambda x: ', '.join(set(x)))

        log.info('Summing all net values together…')
        net_total = self.aggregate(grouped, 'net_value', 'total_net_value',
                                   np.sum)

        log.info('Summing all reimbursement values together…')
        total = self.aggregate(grouped, 'reimbursement_value',
                               'reimbursement_value_total', np.sum)

        log.info('Generating the new dataset…')
        final = (total
                 .merge(net_total, on=groupby_keys)
                 .merge(numbers, on=groupby_keys)
                 .merge(receipts, on=groupby_keys))
        final = final.drop_duplicates(subset=groupby_keys)
        final = final.rename(columns={
            'net_value': 'net_values',
            'reimbursement_value': 'reimbursement_values',
        })
        final = final.drop(columns='reimbursement_number')
        return final
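The aggregate helper is not part of this excerpt; judging only from how it is called above, a plausible sketch (entirely an assumption) is:

    def aggregate(self, grouped, column, new_name, func):
        # Hypothetical helper: apply func to one column of the groupby,
        # rename the result and bring the group keys back as columns
        output = grouped[column].agg(func).rename(new_name)
        return output.reset_index()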
Example #10
    def receipts(self):
        log.info('Merging all datasets…')
        datasets = ["reimbursements-{}.xz".format(n) for n in self.years]
        data = (self.read_csv(name) for name in datasets)
        return pd.concat(data)