Example #1
def __exit__(self, exception_type, exception_value, tb_value):
    """Context manager exit: nothing to do if no exception was raised"""
    if any(v is not None
           for v in [exception_type, exception_value, tb_value]):
        # an exception occurred: log it together with additional context;
        # note: traceback.format_tb() returns the traceback as a list of
        # strings, whereas print_tb() writes to stderr and returns None
        logger = logging.getLogger('__main__')
        logger.warning(
            f"\nError while reading records: "
            f"{exception_type}\n{exception_value}\n"
            f"{''.join(traceback.format_tb(tb_value))}\n")
        logger.warning(
            f"Current source: \n {self.current_source}\n")
        stats_str = HF.report_stats(self.count_glob, self.continue_number)
        logger.warning(stats_str)
    return False
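
Returning False from __exit__ means the exception is not suppressed: after the diagnostics above are logged, the original exception still propagates to the caller. A minimal, self-contained sketch of this pattern, assuming nothing from lbsntransform (RecordReader and the record source below are illustrative stand-ins):

import logging
import traceback

logging.basicConfig(level=logging.WARNING)


class RecordReader:
    """Hypothetical reader: log diagnostics on error, then re-raise."""

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, tb_value):
        if exception_type is not None:
            # format_tb() returns the traceback as a list of strings
            logging.getLogger('__main__').warning(
                "Error while reading records: %s\n%s\n%s",
                exception_type, exception_value,
                ''.join(traceback.format_tb(tb_value)))
        return False  # False: do NOT suppress the exception


try:
    with RecordReader():
        raise ValueError("broken record")
except ValueError:
    print("exception was logged and then propagated, as expected")

Returning True instead would swallow the exception; for a batch reader that is usually undesirable, because the caller would continue as if the input had been read cleanly.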
Example #2
def main():
    """Main function for CLI mode: process data
    from a Postgres DB or local file input
    to a Postgres DB or local file output
    """

    # Load config; values are overwritten if CLI args are given
    config = BaseConfig()
    # Parse args
    config.parse_args()

    # initialize the mapping class
    # depending on lbsn origin
    # (e.g. 1 = Instagram,
    #       2 = Flickr, 2.1 = Flickr YFCC100m,
    #       3 = Twitter)
    importer = HF.load_importer_mapping_module(config.origin,
                                               config.mappings_path)

    # initialize lbsntransform
    lbsntransform = LBSNTransform(
        importer=importer,
        logging_level=config.logging_level,
        is_local_input=config.is_local_input,
        transfer_count=config.transfer_count,
        csv_output=config.csv_output,
        csv_suppress_linebreaks=config.csv_suppress_linebreaks,
        dbuser_output=config.dbuser_output,
        dbserveraddress_output=config.dbserveraddress_output,
        dbname_output=config.dbname_output,
        dbpassword_output=config.dbpassword_output,
        dbserverport_output=config.dbserverport_output,
        dbformat_output=config.dbformat_output,
        dbuser_input=config.dbuser_input,
        dbserveraddress_input=config.dbserveraddress_input,
        dbname_input=config.dbname_input,
        dbpassword_input=config.dbpassword_input,
        dbserverport_input=config.dbserverport_input,
        dbuser_hllworker=config.dbuser_hllworker,
        dbserveraddress_hllworker=config.dbserveraddress_hllworker,
        dbname_hllworker=config.dbname_hllworker,
        dbpassword_hllworker=config.dbpassword_hllworker,
        dbserverport_hllworker=config.dbserverport_hllworker,
        include_lbsn_bases=config.include_lbsn_bases,
        dry_run=config.dry_run,
        hmac_key=config.hmac_key)

    # initialize input reader
    input_data = LoadData(
        importer=importer,
        is_local_input=config.is_local_input,
        startwith_db_rownumber=config.startwith_db_rownumber,
        skip_until_file=config.skip_until_file,
        cursor_input=lbsntransform.cursor_input,
        input_path=config.input_path,
        recursive_load=config.recursive_load,
        local_file_type=config.local_file_type,
        endwith_db_rownumber=config.endwith_db_rownumber,
        is_stacked_json=config.is_stacked_json,
        is_line_separated_json=config.is_line_separated_json,
        csv_delim=config.csv_delim,
        use_csv_dictreader=config.use_csv_dictreader,
        input_lbsn_type=config.input_lbsn_type,
        dbformat_input=config.dbformat_input,
        geocode_locations=config.geocode_locations,
        ignore_input_source_list=config.ignore_input_source_list,
        disable_reactionpost_ref=config.disable_reactionpost_ref,
        map_relations=config.map_relations,
        transfer_reactions=config.transfer_reactions,
        ignore_non_geotagged=config.ignore_non_geotagged,
        min_geoaccuracy=config.min_geoaccuracy,
        source_web=config.source_web,
        skip_until_record=config.skip_until_record,
        zip_records=config.zip_records,
        include_lbsn_objects=config.include_lbsn_objects,
        override_lbsn_query_schema=config.override_lbsn_query_schema)

    # Manually add entries that need submission prior to parsing data
    # add_bundestag_group_example(import_mapper)

    # init time monitoring
    how_long = TimeMonitor()

    # read and process unfiltered input records
    # (local files or db, according to the settings above)
    with input_data as records:
        for record in records:
            lbsntransform.add_processed_records(record)
            # report progress
            if lbsntransform.processed_total % 1000 == 0:
                stats_str = HF.report_stats(input_data.count_glob,
                                            input_data.continue_number,
                                            lbsntransform.lbsn_records)
                print(stats_str, end='\r')
                sys.stdout.flush()
            if (config.transferlimit
                    and lbsntransform.processed_total >= config.transferlimit):
                break

    # finalize output (submit remaining records, close db connection)
    lbsntransform.log.info(
        f'\nTransferring remaining '
        f'{lbsntransform.lbsn_records.count_glob} records to db... '
        f'{HF.null_notice(input_data.import_mapper.null_island)}')
    lbsntransform.finalize_output()

    # final report
    lbsntransform.log.info(
        f'\n\n{f"(Dry Run){chr(10)}" if config.dry_run else ""}'
        f'Processed {input_data.count_glob} input records '
        f'(Input {input_data.start_number} to '
        f'{input_data.continue_number}). '
        f'\n\nIdentified {lbsntransform.processed_total} LBSN records, '
        f'with {lbsntransform.lbsn_records.count_glob_total} '
        f'distinct LBSN records overall. '
        f'{HF.get_skipped_report(input_data.import_mapper)}. '
        f'Merged {lbsntransform.lbsn_records.count_dup_merge_total} '
        f'duplicate records.')
    lbsntransform.log.info(
        f'\n{HF.get_count_stats(lbsntransform.lbsn_records)}')

    lbsntransform.log.info(f'Done. {how_long.stop_time()}')

    lbsntransform.close_log()
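
The read loop above hinges on two conventions: LoadData acts as a context manager that yields records one by one, and the transform keeps a running total that drives both the in-place progress line and the optional transfer limit. A minimal sketch of that loop shape, with hypothetical stand-ins for the reader and the counters (none of these names come from lbsntransform):

import sys


class FakeLoadData:
    """Hypothetical stand-in for a context-managed record source."""

    def __enter__(self):
        return iter(range(10500))  # pretend input records

    def __exit__(self, *exc_info):
        return False  # never suppress exceptions


processed_total = 0
transferlimit = 5000  # assumed config value; None disables the limit

with FakeLoadData() as records:
    for record in records:
        processed_total += 1  # stand-in for add_processed_records()
        if processed_total % 1000 == 0:
            # '\r' rewrites the same console line; flush() forces it out
            print(f'Processed {processed_total} records..', end='\r')
            sys.stdout.flush()
        if transferlimit and processed_total >= transferlimit:
            break

print(f'\nDone: {processed_total} records processed')

Breaking out of the loop before the source is exhausted still runs __exit__, so the reader gets a chance to clean up; in the real code, any records still buffered are then flushed by finalize_output().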