class JsonPost:
    """Handles the data to find duplicates."""

    logger = LoggerFactory.get_enhancement_logger()

    def __init__(self, json):
        """Constructor of JsonPost.

        Keeps the raw post and a stripped copy used for duplicate comparison.
        """
        self.json = json
        # Work on a deep copy so stripping fields never mutates the raw post.
        self.mod_json = self.__remove_not_needed_data(deepcopy(json))
        self.is_duplicate = False

    @staticmethod
    def __remove_not_needed_data(json):
        """Removes data from json which should not be used in the comparison."""
        JsonPost.logger.debug("__remove_not_needed_data()")
        # These fields vary between otherwise identical posts, so drop them
        # before comparing. pop() with a default tolerates missing keys.
        for key in ('link', 'source', 'id', 'post_struct'):
            json.pop(key, None)
        return json
def __init__(self, name):
    """Constructor of the scraper.

    :param name: scraper name, taken from the name of the scraper file.
    """
    # Scraper name -> Overwritten by name of the scraper file
    self.name = name
    # Dedicated logger instance for this scraper
    self.logger = LoggerFactory.get_logger(name)
    # The URLs which will be parsed and scraped
    self.urls = []
    # Error objects collected during the run (used for logging occurrences)
    self.errors = []
    # Start time of the run; set when the scraper actually starts
    self.start = None
def __init__(self, data, domain_name):
    """Constructor of Enhancer.

    :param data: list of scraped posts to enhance
    :param domain_name: name of the scraped domain, used to pick the matching
        enhancement function from the map below
    """
    self.__data = data
    self.__domain_name = domain_name
    self.logger = LoggerFactory.get_enhancement_logger()
    # All three gutetat_* domains share one enhancement routine; bind it once.
    gute_tat_handler = self.__enhance_gute_tat
    # Maps a domain name to the method implementing its enhancement. Store the
    # method reference itself — writing 'self.__enhance_...()' with brackets
    # would call the function here instead of storing it for later dispatch.
    self.__function_map = {
        'ehrenamt_hessen': self.__enhance_ehrenamt_hessen,
        'weltwaerts': self.__enhance_weltwaerts,
        'gutetat_berlin': gute_tat_handler,
        'gutetat_hamburg': gute_tat_handler,
        'gutetat_munich': gute_tat_handler,
        'ein_jahr_freiwillig': self.__enhance_ein_jahr_freiwillig,
        'bundesfreiwilligendienst': self.__enhance_bundesfreiwilligendienst,
    }
from shared.LoggerFactory import LoggerFactory

logger = LoggerFactory.get_enhancement_logger()


def add_map_address(data):
    """Dummy enhancement: initialises 'map_address' with an empty string.

    :param data: list of post dicts; each is expected to carry a
        'post_struct' dict (assumed by this function — confirm with callers).
    """
    logger.debug("add_map_address()")
    for post in data:
        post['post_struct']['map_address'] = ''
# Root Directory (/etl)
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
os.environ['ROOT_DIR'] = ROOT_DIR

# Make the data extraction and shared modules importable
sys.path.extend([f'{ROOT_DIR}/data_extraction', f'{ROOT_DIR}/shared'])

from data_enhancement import enhance_data
from data_extraction.scrape_data import run as run_extraction
from shared.utils import write_data_to_json, read_data_from_json
from shared.LoggerFactory import LoggerFactory
from data_management.DataManager import DataManager

logger = LoggerFactory.get_general_logger()

# Run the extraction; scraped data lands in data_extraction/data
run_extraction()

# Enhance every scraped file and write the result to data_enhancement/data
for entry in os.scandir(os.path.join(ROOT_DIR, 'data_extraction/data')):
    file_name = os.path.splitext(entry.name)[0]
    scraped_data = read_data_from_json(entry.path)
    enhanced_data = enhance_data.Enhancer(scraped_data, file_name).run()
    write_data_to_json(
        os.path.join(ROOT_DIR, 'data_enhancement/data', f'{file_name}.json'),
        enhanced_data)

DataManager.run_backup_process()
class DataManager:
    """Collects the enhanced data from the scraping process, manages backups
    and composes the data that will be uploaded to elasticsearch.

    - After the enhancement process, the results are stored in a backup folder
      named after the time of the backup
    - The upload folder contains the data that will be uploaded to
      elasticsearch

    For the upload data, the most recent backup data is selected. The amount
    of posts in the selected dataset is compared against older backups
    according to the value set in fallback_depth. A fallback_depth of 2 means
    that the data from the last 2 backups that are older than the currently
    selected backup is compared with the upload data. If the current dataset
    for the upload contains less than X% of the posts in the backup dataset,
    where X is the defined threshold, the backup dataset is selected instead.
    This is done on a file by file basis, meaning the upload can contain a
    mixture of files from different backups. The source of all files in the
    upload folder at the end of the process gets logged."""

    # Manages how many backups are to be kept. If the number of existing
    # backups would exceed this threshold, the oldest backup gets deleted.
    max_number_of_backups = 7
    # Manages how many backups into the past should be considered for the
    # upload.
    fallback_depth = 3
    # Percentage threshold at which data from an older backup may be used: if
    # the current data selected for upload contains less than
    # [threshold] * [number of posts in backup] posts, the data from the
    # backup is selected for upload instead.
    threshold = 0.75
    enhanced_data_location = os.path.join(ROOT_DIR, 'data_enhancement/data')
    backup_directory = os.path.join(ROOT_DIR, 'data_management', 'backups')
    upload_directory = os.path.join(ROOT_DIR, 'data_management', 'upload')
    file_upload_data_origin = os.path.join(ROOT_DIR, 'logs',
                                           'upload_data_origin.log')
    # Date format used to name backup folders (one folder per day).
    mask_timestamp = '%d.%m.%Y'
    logger = LoggerFactory.get_datamanagement_logger()
    # Maps file names in the upload folder to the backup they were taken from.
    data_origin = dict()

    @staticmethod
    def timestamp_to_datestring(timestamp):
        """Converts unix timestamp into datestring.

        :param timestamp: unix timestamp (seconds since epoch)
        :return: date string formatted with mask_timestamp
        """
        DataManager.logger.debug("timestamp_to_datestring()")
        return datetime.datetime.fromtimestamp(timestamp).strftime(
            DataManager.mask_timestamp)

    @staticmethod
    def datestring_to_timestamp(datestring):
        """Converts datestring into unix timestamp.

        :param datestring: date string formatted with mask_timestamp
        :return: unix timestamp as float
        """
        DataManager.logger.debug("datestring_to_timestamp()")
        return time.mktime(
            datetime.datetime.strptime(
                datestring, DataManager.mask_timestamp).timetuple())

    @staticmethod
    def save_upload_data_origin(upload_data_origin):
        """Saves the information about the origin of the data inside the
        upload folder into a text file.

        :param upload_data_origin: preformatted summary string, see
            build_string_data_origin()
        """
        # BUGFIX: use a context manager so the file handle is always closed,
        # even if a write raises (original used open()/close() pairs).
        with open(DataManager.file_upload_data_origin, 'w',
                  encoding='utf-8') as file:
            file.write(
                f"last upload: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
            )
            file.write("Source for upload data:")
            file.write(upload_data_origin)

    @staticmethod
    def copy_from_backup(backup):
        """Copies all the files from a backup into the upload folder and
        documents each file's origin.

        :param backup: name of the backup folder (a datestring)
        """
        DataManager.logger.debug("copy_from_backup()")
        path_backup = os.path.join(DataManager.backup_directory, backup)
        for file in os.listdir(path_backup):
            shutil.copy(os.path.join(path_backup, file),
                        os.path.join(DataManager.upload_directory, file))
            DataManager.data_origin[file] = backup

    @staticmethod
    def backup_current_data():
        """Creates a backup for the data in etl/data_enhancement/data with the
        current date as timestamp."""
        DataManager.logger.debug("backup_current_data()")
        backup_location = os.path.join(
            DataManager.backup_directory,
            DataManager.timestamp_to_datestring(time.time()))
        # Only one backup per day is kept: a rerun replaces today's backup.
        if os.path.exists(backup_location):
            DataManager.logger.warning(
                "There already exists a backup from today, deleting old backup"
            )
            shutil.rmtree(backup_location)
        os.makedirs(backup_location)
        for file in os.listdir(DataManager.enhanced_data_location):
            enhancement_file = os.path.join(DataManager.enhanced_data_location,
                                            file)
            if os.path.isfile(enhancement_file):
                shutil.copy(enhancement_file, backup_location)

    @staticmethod
    def get_sorted_list_of_backups():
        """Returns a list containing all the backup folder names sorted from
        old to new."""
        DataManager.logger.debug("get_sorted_list_of_backups()")
        # Sort the datestring folder names chronologically by their timestamp
        # value (lexicographic order would be wrong for '%d.%m.%Y').
        return sorted(os.listdir(DataManager.backup_directory),
                      key=DataManager.datestring_to_timestamp)

    @staticmethod
    def remove_old_backups():
        """Checks if the backup folder contains more than the maximum of set
        backups and deletes the surplus (oldest first)."""
        DataManager.logger.debug("remove_old_backups()")
        backups = DataManager.get_sorted_list_of_backups()
        if len(backups) > DataManager.max_number_of_backups:
            DataManager.logger.info(
                f"More than {DataManager.max_number_of_backups} backups exist({len(backups)})"
                f", deleting {len(backups)- DataManager.max_number_of_backups} backup(s)"
            )
            for file in backups[:len(backups) -
                                DataManager.max_number_of_backups]:
                DataManager.logger.info(f"Deleting backup {file}")
                shutil.rmtree(os.path.join(DataManager.backup_directory, file))

    @staticmethod
    def clear_upload():
        """Clears the upload folder as preparation for the fresh upload
        data."""
        DataManager.logger.debug("clear_upload()")
        shutil.rmtree(DataManager.upload_directory)
        os.makedirs(DataManager.upload_directory)

    @staticmethod
    def get_eligible_backups():
        """Returns list of backups that are eligible as a fallback.

        Also caps fallback_depth at the number of existing backups.
        """
        DataManager.logger.debug("get_eligible_backups()")
        if len(DataManager.get_sorted_list_of_backups()
               ) < DataManager.fallback_depth:
            DataManager.fallback_depth = len(
                DataManager.get_sorted_list_of_backups())
        # The most recent backup plus fallback_depth older ones.
        return DataManager.get_sorted_list_of_backups(
        )[-DataManager.fallback_depth - 1:]

    @staticmethod
    def initialise_upload_data(backups):
        """Copies files from all backups within fallback depth into the upload
        folder; the most recent backup is the last to get copied. As a result,
        upload then contains all files from the most recent scrape and any
        additional files from older backups within fallback range.

        :param backups: backup folder names sorted from old to new
        """
        DataManager.logger.debug("initialise_upload_data()")
        for backup_folder in backups[-DataManager.fallback_depth - 1:]:
            DataManager.copy_from_backup(backup_folder)

    @staticmethod
    def build_string_data_origin():
        """Builds a string summarising which backup each file in the upload
        folder was taken from, one right-aligned 'file : backup' row per
        line."""
        DataManager.logger.debug("build_string_data_origin()")
        # Right-align the file names to the longest one for a tidy column.
        max_length = max((len(name) for name in DataManager.data_origin),
                         default=0)
        return "".join(
            f"\n{name.rjust(max_length)} : {origin}"
            for name, origin in DataManager.data_origin.items())

    @staticmethod
    def compose_upload():
        """Composes the upload according to the general behaviour described
        for this class and the set parameters."""
        DataManager.logger.debug("compose_upload()")
        DataManager.clear_upload()
        eligible_backups = DataManager.get_eligible_backups()
        DataManager.initialise_upload_data(eligible_backups)
        eligible_backups = eligible_backups[:-1]  # ignore most recent backup
        for backup in eligible_backups:
            for upload_file in os.listdir(DataManager.upload_directory):
                if os.path.isfile(
                        os.path.join(DataManager.backup_directory, backup,
                                     upload_file)):
                    data_in_upload = read_data_from_json(
                        os.path.join(DataManager.upload_directory,
                                     upload_file))
                    data_in_backup = read_data_from_json(
                        os.path.join(DataManager.backup_directory, backup,
                                     upload_file))
                    if len(data_in_upload
                           ) < DataManager.threshold * len(data_in_backup):
                        # BUGFIX: report the configured threshold instead of a
                        # hardcoded "75%" which silently went stale whenever
                        # DataManager.threshold was changed.
                        DataManager.logger.info(
                            f"{upload_file} contains less than "
                            f"{DataManager.threshold:.0%} of the posts in backup "
                            f"'{backup}' ({len(data_in_upload)} posts vs {len(data_in_backup)} "
                            f"posts). Current data for {upload_file} will be replaced with backup "
                            f"data")
                        write_data_to_json(
                            os.path.join(DataManager.upload_directory,
                                         upload_file), data_in_backup)
                        DataManager.data_origin[upload_file] = backup
        upload_data_origin = DataManager.build_string_data_origin()
        DataManager.save_upload_data_origin(upload_data_origin)
        DataManager.logger.info(
            f"Source for upload data: {upload_data_origin}")

    @staticmethod
    def init():
        """Sets up the required folders and corrects set parameters if
        needed."""
        DataManager.logger.debug("init()")
        if not os.path.exists(DataManager.backup_directory):
            DataManager.logger.info("Creating backup directory")
            os.makedirs(DataManager.backup_directory)
        if not os.path.exists(DataManager.upload_directory):
            DataManager.logger.info("Creating upload directory")
            os.makedirs(DataManager.upload_directory)
        if DataManager.fallback_depth > DataManager.max_number_of_backups:
            DataManager.logger.warning(
                f"fallback depth exceeds maximal number of backups ("
                f"{DataManager.fallback_depth} > {DataManager.max_number_of_backups}), "
                f"fallback depth will be limited to number of backups")
            DataManager.fallback_depth = DataManager.max_number_of_backups

    @staticmethod
    def run_backup_process():
        """Runs the data management process for creating backups."""
        DataManager.logger.debug("run_backup_process()")
        DataManager.init()
        DataManager.backup_current_data()
        DataManager.remove_old_backups()

    @staticmethod
    def run_compose_upload_process():
        """Runs the data management process for composing the upload."""
        DataManager.logger.debug("run_compose_upload_process()")
        DataManager.init()
        DataManager.compose_upload()
import hashlib
import json
import os

from elasticsearch import Elasticsearch

from shared.utils import read_data_from_json
from shared.LoggerFactory import LoggerFactory

ROOT_DIR = os.environ['ROOT_DIR']

# Client for the local elasticsearch instance; no connection is made until
# the first request.
client = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
logger = LoggerFactory.get_elastic_logger()


def run_elastic_upload():
    """Recreates the 'posts' index on the local elasticsearch instance."""
    logger.debug("run_elastic_upload")
    logger.info("Starting Index Process!")
    index = 'posts'
    # Drop any pre-existing index so the mapping below starts from scratch.
    if client.indices.exists(index=index):
        client.indices.delete(index=index, ignore=[400, 404])
    # 'geo_location' must be mapped as geo_point to enable geo queries.
    index_settings = {
        'mappings': {
            'properties': {
                'geo_location': {'type': 'geo_point'},
            }
        }
    }
    client.indices.create(index=index, body=index_settings)
    logger.info("Finished Indexing!")
import os

# Root Directory (/etl); exported so the other modules can locate their data.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
os.environ['ROOT_DIR'] = ROOT_DIR

from upload_to_elasticsearch.elastic import run_elastic_upload
from shared.LoggerFactory import LoggerFactory
from data_management.DataManager import DataManager

LoggerFactory.get_elastic_logger().info("running elastic upload")

# Select which files end up in the upload folder for elastic search
DataManager.run_compose_upload_process()

# Push the selected files to elastic search
run_elastic_upload()
class LatLonEnhancer:
    """Class handling the enhancement of posts by adding geo data."""

    logger = LoggerFactory.get_enhancement_logger()
    # Local CSV cache of previously geocoded request strings.
    dict_file = os.path.join(os.getenv('ROOT_DIR'), 'data_enhancement',
                             'enhancement_location', 'geocoder_lat_lon.csv')
    lat_lon_dict = {}

    def __init__(self):
        """Initializes the enhancer."""
        self.__setup()
        self.geo_locator = Nominatim(user_agent="einander-helfen.org")
        self.__load_local_storage()

    def __setup(self):
        """Checks if the local storage file exists and creates it if it is
        missing."""
        LatLonEnhancer.logger.debug("__setup()")
        if not os.path.exists(self.dict_file):
            # BUGFIX: logger.warn is a deprecated alias of logger.warning.
            LatLonEnhancer.logger.warning(
                f"Create missing geocoder_lat_lon.csv as {self.dict_file}")
            # BUGFIX: close the file handle after creating the file (the
            # original open(..., "x") leaked the handle).
            with open(self.dict_file, "x", encoding='utf-8'):
                pass

    def enhance(self, post):
        """Adds latitude and longitude to a given post in place, if its
        'geo_location' is missing. Mutates the post; returns None.

        :param post: post dict with 'geo_location' and 'post_struct' keys
        """
        LatLonEnhancer.logger.debug("enhance()")
        # If the post already has a geo location there is nothing to do.
        if post['geo_location'] is None:
            request_string = LatLonEnhancer.get_api_request_string(post)
            lat_lon = self.__check_local_storage(request_string)
            if lat_lon is None:
                LatLonEnhancer.logger.info(f"enhancing lat lon for {post}")
                lat_lon = self.__handle_api_requests(request_string)
                if lat_lon:
                    self.__add_new_entry(request_string, lat_lon)
            post['geo_location'] = lat_lon
            post['post_struct']['geo_location'] = lat_lon

    def __check_local_storage(self, request_string):
        """Checks if local storage contains a result for the query.

        :return: the cached geo_location dict, or None if the local storage
            doesn't contain a result for the request
        """
        LatLonEnhancer.logger.debug("__check_local_storage()")
        return self.lat_lon_dict.get(request_string)

    def __load_local_storage(self):
        """Reads the local storage file (.csv) into the class attribute."""
        LatLonEnhancer.logger.debug("__load_local_storage()")
        # Write the header if the file is empty (freshly created).
        with open(self.dict_file, 'a', newline='',
                  encoding='utf-8') as csvfile:
            if not csvfile.tell():
                writer = csv.DictWriter(csvfile, ['request', 'lat', 'lon'])
                writer.writeheader()
        # Read the cache: row[0] request string, row[1] lat, row[2] lon.
        with open(self.dict_file, newline='', encoding='utf-8') as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                if row and row[0] != 'request':
                    self.lat_lon_dict[row[0]] = {
                        'lat': float(row[1]),
                        'lon': float(row[2])
                    }

    def __add_new_entry(self, request_string, geo_location):
        """Adds a new entry to the in-memory cache and the local storage file.

        :param request_string: geocoder query used as cache key
        :param geo_location: dict with 'lat' and 'lon' floats
        """
        LatLonEnhancer.logger.debug("__add_new_entry()")
        self.lat_lon_dict[request_string] = geo_location
        with open(self.dict_file, 'a', newline='',
                  encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow([
                request_string,
                str(geo_location['lat']),
                str(geo_location['lon'])
            ])
        LatLonEnhancer.logger.info(
            f'Added geo location of \'{request_string}\' to the dictionary')

    def __handle_api_requests(self, request_string):
        """Executes the geocoding API request.

        :return: dict with 'lat'/'lon' on success, else None
        """
        LatLonEnhancer.logger.debug(f"__handle_api_requests({request_string})")
        if request_string != "":
            location = self.geo_locator.geocode(request_string)
            # Nominatim's usage policy requires throttling to ~1 request/s.
            time.sleep(1)
            if location:
                return {'lat': location.latitude, 'lon': location.longitude}
        return None

    @staticmethod
    def get_api_request_string(post):
        """Builds the API request string from the post's structured data.

        Uses the first of 1. structured location, 2. structured address of
        contact, 3. structured address of organisation that yields any
        address parts.

        :return: space-joined address string, or "" if nothing is available
        """
        LatLonEnhancer.logger.debug("get_api_request_string()")
        struct_data = post['post_struct']
        for field in ('location', 'contact', 'organization'):
            address = struct_data.get(field)
            if not address:
                continue
            parts = [
                address[key] for key in ('street', 'zipcode', 'city',
                                         'country') if address.get(key)
            ]
            if parts:
                return ' '.join(parts)
        return ""