def read_from_csv(self, input_file, delimiter):
    """
    Read venues from a CSV file (header required).

    :param input_file: Path to the CSV file.
    :param delimiter: Column delimiter in CSV file (typically ',').
    :raises IllegalArgumentError: If the header is missing or a row is empty.
    """
    # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
    with codecs.open(input_file, encoding='utf8') as fp:
        logger.info("Reading venues from " + input_file + "...")
        reader = csv.reader(fp, delimiter=delimiter)
        header = next(reader, None)
        if not header:
            raise IllegalArgumentError("Missing header in CSV file.")
        # resolve the required column positions once from the header
        columns = {name: header.index(name)
                   for name in ("venue", "year", "identifier")}
        for row in reader:
            if not row:
                raise IllegalArgumentError("Wrong CSV format.")
            self.venues.append(Venue(row[columns["venue"]],
                                     row[columns["year"]],
                                     row[columns["identifier"]]))
        self.filename = os.path.basename(input_file)
        logger.info(str(len(self.venues)) + " venues have been imported.")
def read_from_csv(self, input_file, exact_matches, replace_parentheses, delimiter):
    """
    Read search queries from a CSV file (header required).

    :param input_file: Path to the CSV file.
    :param exact_matches: Only search for exact matches of query strings.
    :param replace_parentheses: Replace Wikipedia parentheses in query strings.
    :param delimiter: Column delimiter in CSV file (typically ',').
    :raises IllegalArgumentError: If the header is missing or a row is empty.
    """
    # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
    with codecs.open(input_file, encoding='utf8') as fp:
        logger.info("Reading search queries from " + input_file + "...")
        reader = csv.reader(fp, delimiter=delimiter)
        header = next(reader, None)
        if not header:
            raise IllegalArgumentError("Missing header in CSV file.")
        query_column = header.index("query")
        for row in reader:
            if not row:
                raise IllegalArgumentError("Wrong CSV format.")
            self.values.append(
                Query(row[query_column], exact_matches, replace_parentheses))
        self.filename = os.path.basename(input_file)
        logger.info(
            str(len(self.values)) + " search queries have been imported.")
def add(self, entities):
    """
    Add entities to this list.

    :param entities: An Entity, an EntityList, or a plain list of Entity objects.
    :raises IllegalArgumentError: If the argument (or a list element) has an
        unsupported type.
    """
    msg = "Argument must be object of class Entity or class EntityList."
    if isinstance(entities, Entity):
        self.entities.append(entities)
    elif isinstance(entities, EntityList):
        # build a fresh concatenated list (do not mutate in place)
        self.entities = self.entities + entities.entities
    elif isinstance(entities, list):
        # validate element by element; elements before an invalid one
        # have already been appended (matches original behavior)
        for item in entities:
            if not isinstance(item, Entity):
                raise IllegalArgumentError(msg)
            self.entities.append(item)
    else:
        raise IllegalArgumentError(msg)
    # re-link the predecessor chain after the list has changed
    self.set_predecessors()
def __init__(self, configuration, input_parameter_values, predecessor):
    """
    Initialize an entity from an entity configuration and values for its
    input parameter(s).

    :param configuration: an object of class EntityConfiguration
    :param input_parameter_values: A dictionary with values for the input
        parameters defined in the configuration.
    :param predecessor: predecessor in entity list
    :raises IllegalArgumentError: If a configured input parameter has no value.
    """
    # corresponding entity configuration
    self.configuration = configuration
    # parameters needed to identify entity (or for validation)
    self.input_parameters = OrderedDict.fromkeys(
        configuration.input_parameters)
    # parameters that should be retrieved using the API
    self.output_parameters = OrderedDict.fromkeys(
        configuration.output_parameter_mapping.keys())
    # destination path for raw download
    self.destination = None

    # copy the provided values for all configured input parameters
    for name in configuration.input_parameters:
        if name not in input_parameter_values:
            raise IllegalArgumentError("Illegal input parameter: " + name)
        self.input_parameters[name] = input_parameter_values[name]

    # build the variable mapping for the URI template, starting with the
    # input parameters
    variable_values = dict(self.input_parameters)
    # add values for API keys (named api_key_1, api_key_2, ...)
    for pos, api_key in enumerate(self.configuration.api_keys, start=1):
        variable_values["api_key_" + str(pos)] = api_key
    # add values for range variables, if provided
    for range_var in configuration.range_vars:
        if range_var in input_parameter_values:
            variable_values[range_var] = input_parameter_values[range_var]
    self.uri = self.configuration.uri_template.replace_variables(
        variable_values)

    # set predecessor
    self.predecessor = predecessor
    # root entity is set if range variables are used
    self.root_entity = None
    # store JSON response data (may be needed by callbacks)
    self.json_response = None
def write_to_csv(self, output_dir, delimiter, include_language, filename=None):
    """
    Export search results to a CSV file.

    :param output_dir: Target directory for generated CSV file.
    :param delimiter: Column delimiter in CSV file (typically ',').
    :param include_language: Add column "language" if tool was configured to
        detect languages of snippets.
    :param filename: Filename of file to export.
    """
    if filename is not None:
        self.filename = filename
    if not self.values:
        logger.info("Nothing to export.")
        return
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    file_path = os.path.join(output_dir, self.filename)
    # write search results to UTF8-encoded CSV file (see also http://stackoverflow.com/a/844443)
    with codecs.open(file_path, 'w', encoding='utf8') as fp:
        logger.info('Exporting search results to ' + file_path + '...')
        writer = csv.writer(fp, delimiter=delimiter)
        column_names = SearchResult.get_column_names(include_language)
        # header row
        writer.writerow(column_names)
        exported = 0
        try:
            for row in self.get_rows(include_language):
                if len(row) != len(column_names):
                    missing = abs(len(column_names) - len(row))
                    raise IllegalArgumentError(
                        str(missing) + ' parameter(s) is/are missing for "'
                        + str(row) + '"')
                writer.writerow(row)
                exported += 1
        except UnicodeEncodeError:
            # best-effort export: log the offending row and keep the file
            logger.error('Encoding error while writing data for: ' + str(row))
        logger.info(str(exported) + ' search results have been exported.')
def read_from_csv(self, input_file, delimiter):
    """
    Read search results from a CSV file (header required).

    :param input_file: Path to the CSV file.
    :param delimiter: Column delimiter in CSV file (typically ',').
    :raises IllegalArgumentError: If the header is missing or a row is empty.
    """
    # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
    with codecs.open(input_file, encoding='utf8') as fp:
        logger.info("Reading search results from " + input_file + "...")
        reader = csv.reader(fp, delimiter=delimiter)
        header = next(reader, None)
        if not header:
            raise IllegalArgumentError("Missing header in CSV file.")
        # resolve the required column positions once from the header
        indices = [header.index(name)
                   for name in ("query", "rank", "url", "title", "snippet")]
        for row in reader:
            if not row:
                raise IllegalArgumentError("Wrong CSV format.")
            self.values.append(SearchResult(*[row[i] for i in indices]))
        self.filename = os.path.basename(input_file)
        logger.info(
            str(len(self.values)) + " search results have been imported.")
def write_to_csv(self, output_dir, delimiter):
    """
    Export papers retrieved from venues to a CSV file.

    :param output_dir: Target directory for generated CSV file.
    :param delimiter: Column delimiter in CSV file (typically ',').
    """
    if not self.venues:
        logger.info("Nothing to export.")
        return
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    file_path = os.path.join(output_dir, self.filename)
    # write paper list to UTF8-encoded CSV file (see also http://stackoverflow.com/a/844443)
    with codecs.open(file_path, 'w', encoding='utf8') as fp:
        logger.info('Exporting papers to ' + file_path + '...')
        writer = csv.writer(fp, delimiter=delimiter)
        column_names = Paper.get_column_names()
        # header row
        writer.writerow(column_names)
        exported = 0
        for venue in self.venues:
            try:
                for row in venue.get_rows():
                    if len(row) != len(column_names):
                        raise IllegalArgumentError(
                            str(len(column_names) - len(row))
                            + " parameter(s) is/are missing for venue "
                            + venue.identifier)
                    writer.writerow(row)
                    exported += 1
            except UnicodeEncodeError:
                # best-effort export: skip the venue that cannot be encoded
                logger.error(
                    "Encoding error while writing data for venue: "
                    + venue.identifier)
        logger.info(str(exported) + ' papers have been exported.')
def replace_variables(self, variable_values):
    """
    Replace all variables in the URI template with actual values.

    Each variable value is percent-encoded with urllib.parse.quote before it
    is substituted into the template.

    :param variable_values: A dictionary with values for the variables in the
        URI template.
    :return: The final URI string.
    :raises IllegalArgumentError: If a value for one of the template variables
        is missing.
    """
    uri = self.uri_template_str
    for variable in self.get_variables():
        value = variable_values.get(variable, None)
        # use an explicit None check so that legitimate falsy values
        # (e.g. an empty string) are still substituted
        if value is None:
            # bug fix: the error object was previously created but never
            # raised, silently leaving the "{variable}" placeholder in the URI
            raise IllegalArgumentError("Value for URI variable " + variable +
                                       " missing.")
        uri = uri.replace("{" + variable + "}", urllib.parse.quote(value))
    return uri
def main():
    """
    Entry point: read the tool configuration, inspect the input CSV, and
    either retrieve search results for queries or detect snippet languages
    for existing search results.
    """
    # parse command line arguments
    parser = get_argument_parser()
    args = parser.parse_args()

    # parse config file
    config = configparser.ConfigParser()
    config.read(args.config_file)

    # read configuration
    if 'DEFAULT' not in config:
        logger.error("DEFAULT configuration missing.\nTerminating.")
        sys.exit()

    # i/o
    # bug fix: the previous str(...) wrapping turned a missing value (None)
    # into the string "None", so the required-configuration check below could
    # never fire. ConfigParser.get already returns str for present values.
    input_file = config['DEFAULT'].get('InputFile', None)
    output_dir = config['DEFAULT'].get('OutputDirectory', None)
    delimiter = config['DEFAULT'].get('Delimiter', None)
    if input_file is None or output_dir is None or delimiter is None:
        logger.error("Required configuration missing.\nTerminating.")
        sys.exit()

    # requests
    exact_matches = config['DEFAULT'].getboolean('ExactMatches', True)
    replace_parentheses = config['DEFAULT'].getboolean('ReplaceParentheses', True)
    max_results = config['DEFAULT'].getint('MaxResults', 25)
    min_wait = config['DEFAULT'].getint('MinWait', 500)
    max_wait = config['DEFAULT'].getint('MaxWait', 2000)

    # detecting languages of snippets
    detect_languages = config['DEFAULT'].getboolean('DetectLanguages', True)

    queries_only = False
    # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
    with codecs.open(input_file, encoding='utf8') as fp:
        logger.info("Checking input format in " + input_file + "...")
        reader = csv.reader(fp, delimiter=delimiter)
        # read header
        header = next(reader, None)
        if not header:
            raise IllegalArgumentError("Missing header in CSV file.")
        # a single-column file contains only query strings
        queries_only = len(header) == 1

    if queries_only:
        logger.info(
            "Input file contains only queries, retrieving search results...")
        query_list = QueryList()
        query_list.read_from_csv(input_file, exact_matches,
                                 replace_parentheses, delimiter)
        query_list.retrieve_search_results(max_results, min_wait, max_wait,
                                           detect_languages)
        query_list.write_search_results_to_csv(output_dir, delimiter,
                                               detect_languages)
    elif detect_languages:
        logger.info(
            "Input file contains search results, detecting language of snippets...")
        search_result_list = SearchResultList()
        search_result_list.read_from_csv(input_file, delimiter)
        search_result_list.detect_languages()
        search_result_list.write_to_csv(output_dir, delimiter,
                                        detect_languages)
    else:
        logger.info("No action configured, terminating...")
def get_chained_request_entities(self, chained_request_config):
    """
    Execute a chained request after retrieving the data for this entity.

    Builds the input parameters for the chained request from this entity's
    input and output parameters. A list-valued output parameter can be
    "flattened" with the "._" operator, producing one chained entity per
    list element (at most one flatten operator is supported).

    :param chained_request_config: The configuration to use for the chained request.
    :return: The entities retrieved using the chained request.
    :raises IllegalConfigurationError: If a referenced parameter is missing,
        the flatten operator is misused, or more than one flatten operator
        is configured.
    :raises IllegalArgumentError: If the provided configuration name does not
        match the chained request name of this entity's configuration.
    """
    # check if provided configuration has same name as defined for chained request in own configuration
    if self.configuration.chained_request_name == chained_request_config.name:
        # get input parameters for chained request from input and output parameters of this entity
        try:
            selected_input_parameters = self.configuration.chained_request_input_parameters[
                "input_parameters"]
            selected_output_parameters = self.configuration.chained_request_input_parameters[
                "output_parameters"]
            # simple input parameters for chained request selected from input and output parameters of this entity
            input_parameters_chained_request = {}
            # the operator "._" can be used to flatten a list output parameter for the chained request
            flatten_parameters_chained_request = {}
            # selected input parameters are copied over directly
            for parameter in selected_input_parameters:
                if parameter in self.input_parameters.keys():
                    input_parameters_chained_request[
                        parameter] = self.input_parameters[parameter]
                else:
                    raise IllegalConfigurationError(
                        "Input parameter for chained request not found: "
                        + str(parameter))
            # selected output parameters may use the flatten operator
            for parameter in selected_output_parameters:
                if "._" in parameter:  # flatten operator
                    # get parameter that should be flattened
                    flatten_parameter_match = FLATTEN_OPERATOR_REGEX.match(
                        parameter)
                    if flatten_parameter_match:
                        flatten_parameter = flatten_parameter_match.group(1)
                        # get parameter to flatten from output parameters of this entity
                        parameter_to_flatten_list = self.output_parameters[
                            flatten_parameter]
                        # NOTE: an empty/None value is silently skipped here
                        if parameter_to_flatten_list:
                            if isinstance(parameter_to_flatten_list,
                                          list):  # only lists can be flattened
                                flatten_parameters_chained_request[
                                    flatten_parameter] = parameter_to_flatten_list
                            else:
                                raise IllegalConfigurationError(
                                    "Parameter should be flattened, but is not a list: "
                                    + str(parameter))
                    else:
                        raise IllegalConfigurationError(
                            "Wrong usage of flatten operator: Expected: <parameter>._ "
                            "Actual: " + str(parameter))
                else:  # simple parameter
                    if parameter in self.output_parameters.keys():
                        input_parameters_chained_request[
                            parameter] = self.output_parameters[parameter]
                    else:
                        raise IllegalConfigurationError(
                            "Input parameter for chained request not found: "
                            + str(parameter))
            chained_request_entities = list()
            if len(flatten_parameters_chained_request
                   ) > 0:  # flatten parameters defined
                # we only support one flatten operator in the input parameter mapping for the chained request
                if len(flatten_parameters_chained_request) > 1:
                    raise IllegalConfigurationError(
                        "Only one flatten operator supported, but "
                        + str(len(flatten_parameters_chained_request))
                        + " provided.")
                for flatten_parameter in flatten_parameters_chained_request.keys():
                    parameter_to_flatten_list = flatten_parameters_chained_request[
                        flatten_parameter]
                    if len(parameter_to_flatten_list) > 0:
                        # assumes all list elements share the keys of the
                        # first element — TODO confirm against the API schema
                        inner_parameters = parameter_to_flatten_list[0].keys()
                        # check if inner parameter name conflicts with existing input parameters for chained request
                        for inner_parameter in inner_parameters:
                            if inner_parameter in input_parameters_chained_request.keys():
                                raise IllegalConfigurationError(
                                    "Inner parameter " + inner_parameter
                                    + " of " + str(flatten_parameter)
                                    + " already exists in list of chained input parameters.")
                        # extract inner parameters and combine them with outer parameters to flatten the list
                        for list_element in parameter_to_flatten_list:
                            flattened_input_parameters_chained_request = {
                                **input_parameters_chained_request
                            }
                            for inner_parameter in inner_parameters:
                                flattened_input_parameters_chained_request[inner_parameter] = \
                                    list_element[inner_parameter]
                            # one chained entity per flattened list element
                            chained_request_entities.append(
                                Entity(
                                    chained_request_config,
                                    flattened_input_parameters_chained_request,
                                    None))
            else:  # no flatten parameters defined
                chained_request_entities.append(
                    Entity(chained_request_config,
                           input_parameters_chained_request, None))
        except KeyError as e:
            raise IllegalConfigurationError(
                "Reading chained request from configuration failed: Parameter "
                + str(e) + " not found.")
    else:
        raise IllegalArgumentError(
            "Configuration <" + str(chained_request_config.name)
            + "> provided, but <"
            + str(self.configuration.chained_request_name)
            + "> needed for chained request.")
    return chained_request_entities
def apply_filter(json_response, parameter_filter):
    """
    Use an access path (e.g., ["user", "first_name"]) to filter a nested dictionary.

    :param json_response: The JSON response to filter.
    :param parameter_filter: A list with keys for filtering a nested dictionary
        or with the list matching operator "*" followed by an optional
        parameter mapping for the list elements.
    :return: The extracted value if the filter has successfully been applied
        (can be a simple value, dict, or list), the string "None" if the
        filtered value was None, None if the filter could not be applied.
    :raises IllegalArgumentError: If the filter path is malformed.
    """
    # start with whole JSON response
    filtered_response = json_response
    # apply the filter path
    for pos in range(len(parameter_filter)):
        current_filter = parameter_filter[pos]
        if current_filter == "*":  # list matching operator
            extracted_list = []
            if isinstance(filtered_response, list):
                if pos == len(
                        parameter_filter
                ) - 1:  # if no further arguments are provided, save complete list
                    for element in filtered_response:
                        extracted_list.append(element)
                elif pos == len(
                        parameter_filter
                ) - 2:  # next element is mapping for list element parameters
                    if isinstance(parameter_filter[pos + 1], dict):
                        list_element_filter = parameter_filter[pos + 1]
                        # apply the inner filter recursively to each list element
                        for element in filtered_response:
                            filtered_element = OrderedDict.fromkeys(
                                list_element_filter.keys())
                            for parameter in filtered_element.keys():
                                filtered_element[parameter] = \
                                    Entity.apply_filter(element, list_element_filter[parameter])
                            extracted_list.append(filtered_element)
                    else:
                        raise IllegalArgumentError(
                            "The list matching operator must be succeeded by a filter "
                            "object.")
                else:
                    raise IllegalArgumentError(
                        "The list matching operator must be the last or second-last element "
                        "of the filter path.")
            else:
                raise IllegalArgumentError(
                    "List matching operator reached, but current position in response is "
                    "not a list.")
            # return extracted list as defined by the list matching operator
            return extracted_list
        else:  # normal filter path
            if not isinstance(current_filter, list) and not isinstance(
                    current_filter, dict):
                try:
                    # filter may be an index for a list
                    if isinstance(filtered_response,
                                  list) and Entity.parsable_as_int(
                                      current_filter):
                        index = int(current_filter)
                        filtered_response = filtered_response[index]
                    else:
                        # use current string as dictionary key to filter the response
                        if filtered_response[current_filter] is None:
                            # sentinel string "None" distinguishes "value was
                            # null" from "filter failed" (which returns None)
                            logger.info("Result for filter " + current_filter
                                        + " was None.")
                            return "None"
                        else:
                            filtered_response = filtered_response[
                                current_filter]
                except (KeyError, IndexError):
                    # filter did not match the response structure
                    logger.error("Could not apply filter <"
                                 + str(current_filter) + "> to response "
                                 + str(filtered_response) + ".")
                    return None
            else:
                raise IllegalArgumentError(
                    "A filter path must only contain filter strings or the list matching "
                    "operator (optionally followed by a filter object).")
    return filtered_response
def read_from_csv(self, input_file, delimiter):
    """
    Read entity input parameter values from a CSV file (header required).

    Only rows in [start_index, start_index+chunk_size-1] are imported
    (all rows from start_index on if chunk_size is 0). Input parameters
    configured as [parameter, uri, response_filter] triples are resolved
    once via an HTTP request instead of being read from the CSV.

    :param input_file: Path to the CSV file.
    :param delimiter: Column delimiter in CSV file (typically ',').
    :raises IllegalArgumentError: If the header is missing or malformed, a row
        is empty, or a parameter value is empty.
    :raises IllegalConfigurationError: If a URI input parameter is malformed
        or its data cannot be retrieved.
    """
    # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
    with codecs.open(input_file, encoding='utf8') as fp:
        # describe the imported row interval for the log message
        if self.chunk_size == 0:
            interval = "[" + str(self.start_index) + ", max]"
        else:
            interval = "[" + str(self.start_index) + ", " + str(self.start_index+self.chunk_size-1) + "]"
        logger.info("Reading entities in " + interval + " from " + input_file + "...")
        reader = csv.reader(fp, delimiter=delimiter)
        # check if one of the input parameters is an URI
        uri_input_parameters = OrderedDict()
        for parameter in self.configuration.input_parameters:
            if isinstance(parameter, list):
                # NOTE(review): this condition looks suspicious — it raises only
                # when the length is wrong AND the second element starts with
                # "http"; presumably "or" was intended. Left unchanged here.
                if not len(parameter) == 3 and parameter[1].startswith("http"):
                    raise IllegalConfigurationError("Malformed URI input parameter, should be" + "[parameter, uri, response_filter].")
                uri_parameter = parameter[0]
                uri = parameter[1]
                response_filter = parameter[2]
                logger.info("Found URI input parameter: " + str(uri_parameter))
                logger.info("Retrieving data for URI input parameter " + str(uri_parameter) + "...")
                try:
                    # retrieve data
                    response = self.session.get(uri)
                    if response.ok:
                        logger.info("Successfully retrieved data for URI input parameter " + str(uri_parameter) + ".")
                        # deserialize JSON string
                        json_response = json.loads(response.text)
                        # extract the value using the configured filter path
                        filter_result = Entity.apply_filter(json_response, response_filter)
                        uri_input_parameters[uri_parameter] = filter_result
                    else:
                        raise IllegalConfigurationError("Error " + str(response.status_code) + ": Could not retrieve data for URI input parameter " + str(uri_parameter) + ". Response: " + str(response.content))
                except (gaierror, ConnectionError, MaxRetryError, NewConnectionError):
                    # network failure is logged but not fatal
                    logger.error("An error occurred while retrieving data for URI input parameter " + str(uri_parameter) + ".")
                # replace URI parameter with URI parameter name
                self.configuration.input_parameters.remove(parameter)
                self.configuration.input_parameters.append(uri_parameter)
        # dictionary to store CSV column indices for input parameters
        input_parameter_indices = OrderedDict.fromkeys(self.configuration.input_parameters)
        # read header
        header = next(reader, None)
        if not header:
            raise IllegalArgumentError("Missing header in CSV file.")
        # number of columns must equal number of input parameters minus number of uri input parameters
        if not len(header) == len(input_parameter_indices) - len(uri_input_parameters):
            raise IllegalArgumentError("Wrong number of columns in CSV file.")
        # check if columns and parameters match, store indices
        for index in range(len(header)):
            if header[index] in input_parameter_indices.keys():
                input_parameter_indices[header[index]] = index
            else:
                raise IllegalArgumentError("Unknown column name in CSV file: " + header[index])
        # read CSV file
        predecessor = None
        current_index = 0
        for row in reader:
            # only read value from start_index to start_index+chunk_size-1 (if chunk_size is 0, read until the end)
            if current_index < self.start_index:
                current_index += 1
                continue
            elif (self.chunk_size != 0) and (current_index >= self.start_index+self.chunk_size):
                current_index += 1
                break
            if row:
                # dictionary to store imported parameter values
                input_parameter_values = OrderedDict.fromkeys(self.configuration.input_parameters)
                # read parameters
                for parameter in input_parameter_values.keys():
                    # if parameter was URI input parameter, get value from dict
                    if parameter in uri_input_parameters.keys():
                        value = uri_input_parameters[parameter]
                    else:
                        # get value from CSV
                        parameter_index = input_parameter_indices[parameter]
                        value = row[parameter_index]
                        # unescape escaped double quotes
                        value = str(value).replace("\"\"", "\"")
                    if value:
                        input_parameter_values[parameter] = value
                    else:
                        raise IllegalArgumentError("No value for parameter " + parameter)
                # create entity from values in row
                new_entity = Entity(self.configuration, input_parameter_values, predecessor)
                predecessor = new_entity
                # if ignore_input_duplicates is configured, check if entity already exists
                if self.configuration.ignore_input_duplicates:
                    entity_exists = False
                    for entity in self.entities:
                        if entity.equals(new_entity):
                            entity_exists = True
                    if not entity_exists:
                        # add new entity to list
                        self.entities.append(new_entity)
                else:  # ignore_input_duplicates is false
                    # add new entity to list
                    self.entities.append(new_entity)
            else:
                raise IllegalArgumentError("Wrong CSV format.")
            current_index += 1
    logger.info(str(len(self.entities)) + " entities have been imported.")
def write_to_csv(self, output_dir, delimiter):
    """
    Export entities together with retrieved data to a CSV file.

    The output filename is derived from the configuration name (with the
    exported chunk interval appended when chunking is active). Parameters
    that are both input and output parameters are validated against each
    other before export; the retrieved (output) value is what gets written.

    :param output_dir: Target directory for generated CSV file.
    :param delimiter: Column delimiter in CSV file (typically ',').
    :raises IllegalArgumentError: If a row has fewer values than columns.
    """
    if len(self.entities) == 0:
        logger.info("Nothing to export.")
        return
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # derive output filename; include the exported index interval when chunking
    if self.chunk_size != 0:
        filename = '{0}_{1}-{2}.csv'.format(self.configuration.name, str(self.start_index),
                                            str(self.start_index + min(len(self.entities), self.chunk_size) - 1))
    else:
        filename = '{0}.csv'.format(self.configuration.name)
    file_path = os.path.join(output_dir, filename)
    # write entity list to UTF8-encoded CSV file (see also http://stackoverflow.com/a/844443)
    with codecs.open(file_path, 'w', encoding='utf8') as fp:
        logger.info('Exporting entities to ' + file_path + '...')
        writer = csv.writer(fp, delimiter=delimiter)
        # check if input and output parameters overlap -> validate these parameters later
        validation_parameters = OrderedSet(self.configuration.input_parameters).intersection(
            OrderedSet(self.configuration.output_parameter_mapping.keys())
        )
        # get column names for CSV file (start with input parameters)
        column_names = self.configuration.input_parameters + [
            parameter for parameter in self.configuration.output_parameter_mapping.keys()
            if parameter not in validation_parameters
        ]
        # check if an output parameter has been added and/or removed by a callback function and update column names
        parameters_removed = OrderedSet()
        parameters_added = OrderedSet()
        for entity in self.entities:
            parameters_removed.update(OrderedSet(self.configuration.output_parameter_mapping.keys()).difference(
                OrderedSet(entity.output_parameters.keys()))
            )
            parameters_added.update(OrderedSet(entity.output_parameters.keys()).difference(
                OrderedSet(self.configuration.output_parameter_mapping.keys()))
            )
        for parameter in parameters_removed:
            column_names.remove(parameter)
        for parameter in parameters_added:
            column_names.append(parameter)
        # write header of CSV file
        writer.writerow(column_names)
        for entity in self.entities:
            try:
                row = OrderedDict.fromkeys(column_names)
                # check validation parameters
                for parameter in validation_parameters:
                    if entity.output_parameters[parameter]:
                        # compare stringified input vs. retrieved value
                        if str(entity.input_parameters[parameter]) == str(entity.output_parameters[parameter]):
                            logger.info("Validation of parameter " + parameter + " successful for entity "
                                        + str(entity) + ".")
                        else:
                            logger.error("Validation of parameter " + parameter + " failed for entity "
                                         + str(entity) + ": Expected: " + str(entity.input_parameters[parameter])
                                         + ", Actual: " + str(entity.output_parameters[parameter])
                                         + ". Retrieved value will be exported.")
                    else:
                        logger.error("Validation of parameter " + parameter + " failed for entity "
                                     + str(entity) + ": Empty value.")
                # write data
                # (output parameters take precedence over input parameters)
                for column_name in column_names:
                    if column_name in entity.output_parameters.keys():
                        row[column_name] = entity.output_parameters[column_name]
                    elif column_name in entity.input_parameters.keys():
                        row[column_name] = entity.input_parameters[column_name]
                if len(row) == len(column_names):
                    writer.writerow(list(row.values()))
                else:
                    raise IllegalArgumentError(str(len(column_names) - len(row)) + " parameter(s) is/are missing "
                                               "for entity " + str(entity))
            except UnicodeEncodeError:
                # best-effort export: skip the entity that cannot be encoded
                logger.error("Encoding error while writing data for entity: " + str(entity))
    logger.info(str(len(self.entities)) + ' entities have been exported.')