def __get_unmatched(self, item: dict):
    """Collect the entries of ``item`` that still hold a usable value.

    An entry is kept when ``Utils.empty`` does not report its value as empty
    and ``Utils.clean_if_nan`` does not turn it into ``None``.

    Args:
        item: Mapping of leftover keys to their raw values.

    Returns:
        dict: The surviving keys mapped to their cleaned values.
    """
    leftovers = {}
    for name, raw_value in item.items():
        if Utils.empty(raw_value):
            continue
        cleaned = Utils.clean_if_nan(raw_value)
        if cleaned is None:
            continue
        leftovers[name] = cleaned
    return leftovers
def __get_unmatched(self, data, labels):
    """Collect the cells of a row that were not consumed by the mapping.

    Cells that are ``None`` are skipped. Every other cell is keyed by its
    label when ``self.__has(labels, index)`` says one exists, otherwise by
    its index rendered as a string. It is kept only if it is neither ``None``
    after ``Utils.clean_if_nan`` nor empty according to ``Utils.empty``.

    Args:
        data: Sequence of cell values for one row.
        labels: Labels for the columns, or ``None`` when there are none.

    Returns:
        dict: Label (or stringified index) mapped to the cleaned cell value.
    """
    leftovers = {}
    for position, cell in enumerate(data):
        if cell is None:
            continue
        name = labels[position] if self.__has(labels, position) else str(position)
        cleaned = Utils.clean_if_nan(cell)
        if cleaned is None or Utils.empty(cleaned):
            continue
        leftovers[name] = cleaned
    return leftovers
def __get_unmatched(self, xmlitem, inputunmatched=None):
    """Recursively convert the children of ``xmlitem`` into a nested dict.

    A child for which ``self.__has_children`` is true becomes a nested dict
    filled by a recursive call. Any other child contributes its text, as long
    as the text is neither ``None`` after ``Utils.clean_if_nan`` nor empty
    according to ``Utils.empty``. Children sharing a tag overwrite each other.

    Args:
        xmlitem: Element whose direct children are inspected.
        inputunmatched: Dict to fill in place; a new dict is created when
            ``None``.

    Returns:
        dict: The filled dict (``inputunmatched`` itself when it was given).
    """
    # Identity check on purpose: an empty dict passed in must still be reused.
    leftovers = inputunmatched if inputunmatched is not None else {}
    for child in list(xmlitem):
        if self.__has_children(child):
            nested = {}
            leftovers[child.tag] = nested
            self.__get_unmatched(child, nested)
            continue
        text = Utils.clean_if_nan(child.text)
        if text is not None and not Utils.empty(text):
            leftovers[child.tag] = text
    return leftovers
def get(self, key, subject, multiple=None, value_index=None, default=None):
    """Resolve a dotted ``key`` inside a nested, JSON-like ``subject``.

    Every dot-separated segment of ``key`` is looked up in turn. When the
    last segment resolves to a non-empty list, ``value_index`` may narrow it
    to a single element.

    Args:
        key: Dotted path, e.g. ``'a.b.c'``.
        subject: Container to search; each level must support ``in`` and
            item access with the segment.
        multiple: Accepted for interface compatibility; not used here.
        value_index: Integer position, or ``'first'`` / ``'last'``, applied
            to a non-empty list found at the final segment. Any other value
            leaves the list untouched.
        default: Value returned when the path cannot be resolved or the
            integer index is out of range.

    Returns:
        The resolved value passed through ``Utils.clean_if_nan``. When the
        path cannot be resolved, ``default`` if it is not ``None``, else
        ``False``.
    """
    # A missing value is reported as False unless the caller gave a default.
    fallback = False if default is None else default

    segments = key.split('.')
    last_position = len(segments) - 1
    for position, segment in enumerate(segments):
        if segment not in subject:
            return fallback
        subject = subject[segment]
        if position != last_position:
            continue

        # Final segment: optionally pick one element out of a non-empty list.
        if type(subject) is list and len(subject) > 0:
            if type(value_index) is int:
                try:
                    subject = subject[value_index]
                except IndexError:
                    return fallback
            elif type(value_index) is str:
                if value_index == 'first':
                    subject = subject[0]
                elif value_index == 'last':
                    subject = subject[-1]

    # Set to None if value is NaN
    subject = Utils.clean_if_nan(subject)
    return subject
def process(self):
    """Load the CSV file and translate every row into a mapped item.

    The file is read with pandas without a header row. When ``skip_header``
    is set, the first row is kept as column labels and is not mapped. Every
    other row is run through ``self.mapping``; each mapping entry is handled
    according to the first of these options it contains:

    * ``col``: take that column of the row (or the whole row for
      ``'_all_'``), then apply the optional ``default``,
      ``transformations`` and ``conditions``.
    * ``value``: use the given value; a string is evaluated as a Python
      expression in which ``$subject`` stands for the item built so far.
    * ``conditions``: compute the value with ``handle_conditions`` only.

    When ``__save_unmatched`` is set, the columns not consumed by a ``col``
    entry are stored in the item under ``__unmatched_key``.

    Returns:
        list: One mapped item per data row.

    Raises:
        Exception: If a mapping entry has none of the supported options and
            error tolerance is disabled.
    """
    self.__csv = pd.read_csv(self.filepath,
                             delimiter=self.delimiter,
                             skip_blank_lines=self.skip_blank_lines,
                             header=None)
    self.__csv = self.__csv.values

    labels = None
    result = []
    for it, data in enumerate(self.__csv):
        # Updating stats
        self.__stats['total_count_with_header'] += 1

        # Skipping the first line if needed
        if self.skip_header and it == 0:
            labels = data
            self.__stats['header_skipped'] = True
            continue

        # Updating stats
        self.__stats['total_count'] += 1

        item = {}
        cols_to_delete = []
        for map_key, map_value in self.mapping.items():
            if 'col' in map_value:
                col = map_value['col']
                default = map_value.get('default')

                if col == '_all_':
                    # Must be tested before int(): the whole row is handed
                    # over as-is (the row object itself, not a copy).
                    finalvalue = data
                else:
                    col = int(col)
                    finalvalue = data[col]
                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)
                    if finalvalue is None:
                        finalvalue = '' if default is None else default

                if 'transformations' in map_value:
                    finalvalue = handle_transformations(
                        map_value['transformations'], finalvalue,
                        error_tolerance=self.__error_tolerance)
                item = apply_value(item, map_key, finalvalue)

                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item, data)
                    item = apply_value(item, map_key, finalvalue)

                # To remember which cols have already been retrieved; the
                # '_all_' pseudo column is not a real index and is skipped.
                if self.__save_unmatched and col != '_all_':
                    cols_to_delete.append(col)

            elif 'value' in map_value:
                finalvalue = map_value['value']
                if type(finalvalue) == str:
                    finalvalue = finalvalue.replace('$subject', 'item')
                    # NOTE(security): the expression comes from the mapping
                    # and is evaluated; only use trusted mapping files.
                    # Built-in compile() replaces the `parser` module, which
                    # was removed in Python 3.10.
                    finalvalue = eval(compile(finalvalue, '', 'eval'))
                # Set to None if value is NaN
                finalvalue = Utils.clean_if_nan(finalvalue)
                item = apply_value(item, map_key, finalvalue)

                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item, data)
                    item = apply_value(item, map_key, finalvalue)

            elif 'conditions' in map_value:
                finalvalue = handle_conditions(
                    map_value['conditions'], item, data)
                item = apply_value(item, map_key, finalvalue)

            else:
                text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(map_key)
                if self.__error_tolerance:
                    Utils.log('error', text)
                    continue
                else:
                    raise Exception(text)

        # Unmatched
        if self.__save_unmatched:
            # Blank out consumed columns so only leftovers are reported.
            for col in cols_to_delete:
                data[col] = None
            item[self.__unmatched_key] = self.__get_unmatched(data, labels)

        result.append(item)

    return result
def process(self):
    """Parse the XML file and translate every item node into a mapped item.

    All elements matching ``self.item_node`` are run through
    ``self.mapping``; each mapping entry is handled according to the first
    of these options it contains:

    * ``col``: resolve the path with ``self.get`` (or take the whole element
      for ``'_all_'``), honouring the optional ``multiple``, ``index``,
      ``default`` and ``raw`` settings, then apply the optional
      ``transformations`` and ``conditions``.
    * ``value``: use the given value; a string is evaluated as a Python
      expression in which ``$subject`` stands for the item built so far.
    * ``conditions``: compute the value with ``handle_conditions`` only.

    When ``__save_unmatched`` is set, consumed paths are removed from the
    element through ``self.__delete`` and the remainder is stored in the
    item under ``__unmatched_key``.

    Returns:
        list: One mapped item per matching element.

    Raises:
        Exception: If a mapping entry has none of the supported options and
            error tolerance is disabled.
    """
    from xml.etree import ElementTree as ET

    self.xml = ET.parse(self.filepath)
    result = []
    for xmlitem in self.xml.findall(self.item_node):
        # Updating stats
        self.__stats['total_count'] += 1

        item = {}
        for yaml_key, yaml_value in self.mapping.items():
            if 'col' in yaml_value:
                col = yaml_value['col']
                multiple = yaml_value.get('multiple')
                value_index = yaml_value.get('index')
                default = yaml_value.get('default')
                # Only a real boolean overrides the default of raw=True.
                raw = True
                raw_ = yaml_value.get('raw')
                if type(raw_) is bool:
                    raw = raw_

                if col == '_all_':
                    finalvalue = xmlitem
                else:
                    finalvalue = self.get(col, xmlitem, multiple,
                                          value_index, default, raw)

                if 'transformations' in yaml_value:
                    finalvalue = handle_transformations(
                        yaml_value['transformations'], finalvalue,
                        error_tolerance=self.error_tolerance)
                item = apply_value(item, yaml_key, finalvalue)

                if 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item)
                    item = apply_value(item, yaml_key, finalvalue)

                # Deleting the value from original input object
                if self.__save_unmatched:
                    self.__delete(col, xmlitem)

            elif 'value' in yaml_value:
                finalvalue = yaml_value['value']
                if type(finalvalue) == str:
                    finalvalue = finalvalue.replace('$subject', 'item')
                    # NOTE(security): the expression comes from the mapping
                    # and is evaluated; only use trusted mapping files.
                    # Built-in compile() replaces the `parser` module, which
                    # was removed in Python 3.10.
                    finalvalue = eval(compile(finalvalue, '', 'eval'))
                # Set to None if value is NaN
                finalvalue = Utils.clean_if_nan(finalvalue)
                item = apply_value(item, yaml_key, finalvalue)

                if 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item)
                    item = apply_value(item, yaml_key, finalvalue)

            elif 'conditions' in yaml_value:
                finalvalue = handle_conditions(yaml_value['conditions'], item)
                item = apply_value(item, yaml_key, finalvalue)

            else:
                text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                    yaml_key)
                if self.error_tolerance:
                    Utils.log('error', text)
                    continue
                else:
                    raise Exception(text)

        # Unmatched
        if self.__save_unmatched:
            item[self.__unmatched_key] = self.__get_unmatched(xmlitem)

        result.append(item)

    return result
def get(self, key, subject, multiple=None, value_index=None, default=None, raw=True):
    """Resolve a dotted ``key`` below an XML element.

    Intermediate segments are followed with ``subject.find``; the last
    segment is looked up with ``subject.findall``. Segments equal to
    ``'$subject'`` are skipped.

    Args:
        key: Dotted path of tag names, e.g. ``'a.b'``.
        subject: Element the search starts from.
        multiple: When truthy the result is a list of matches, otherwise a
            single match.
        value_index: Integer position of the match to use. A falsy value
            means the first match (single mode) or all matches (multiple
            mode). A truthy non-integer value is not interpreted: single
            mode then leaves the current parent element as the result and
            multiple mode yields an empty list.
        default: Value used when nothing is found, the index is out of range
            in single mode, or the cleaned result is ``None``.
        raw: When true the ``text`` of each match is returned, otherwise the
            element itself.

    Returns:
        The resolved value passed through ``Utils.clean_if_nan``. When a
        segment has no match, ``default`` if it is not ``None``, else
        ``False``.
    """
    # A missing value is reported as False unless the caller gave a default.
    fallback = False if default is None else default

    def extract(node):
        # Either the text content or the element itself, depending on `raw`.
        return node.text if raw else node

    segments = key.split('.')
    last_position = len(segments)
    for position, segment in enumerate(segments, start=1):
        if segment == '$subject':
            continue

        is_last = position == last_position
        value = subject.findall(segment) if is_last else subject.find(segment)

        if value is None:
            return fallback
        if type(value) is list and len(value) == 0:
            return fallback

        if not is_last:
            # Descend one level and keep walking the path.
            subject = value
            continue

        if type(value) is not list:
            subject = [extract(value)] if multiple else extract(value)
        elif not multiple:
            if not value_index:
                subject = extract(value[0])
            elif type(value_index) is int:
                try:
                    subject = extract(value[value_index])
                except IndexError:
                    subject = fallback
            # Any other truthy index: subject is deliberately left unchanged.
        else:
            res = []
            if not value_index:
                res = [extract(node) for node in value]
            elif type(value_index) is int:
                try:
                    res.append(extract(value[value_index]))
                except IndexError:
                    # Out-of-range index in multiple mode gives an empty list.
                    pass
            subject = res

    # Set to None if value is NaN
    subject = Utils.clean_if_nan(subject)
    if subject is None and default is not None:
        subject = default
    return subject
def lazy_process(self):
    """Stream the XML file and hand mapped items to the callback in batches.

    The file is read incrementally with ``lxml.etree.iterparse``. Every
    element whose tag equals ``self.item_node`` is run through
    ``self.mapping``; each mapping entry is handled according to the first
    of these options it contains:

    * ``col``: resolve the path with ``self.get``, honouring the optional
      ``multiple``, ``index``, ``default`` and ``raw`` settings, then apply
      the optional ``transformations`` and ``conditions``.
    * ``value``: use the given value; a string is evaluated as a Python
      expression in which ``$subject`` stands for the item built so far.
    * ``conditions``: compute the value with ``handle_conditions`` only.

    Whenever the number of buffered items is a multiple of
    ``self.bulksize``, the buffer is passed to ``self.callback`` and then
    cleared in place. The remainder is flushed after the last element.

    Returns:
        None. Results are delivered only through ``self.callback``.

    Raises:
        Exception: If a mapping entry has none of the supported options and
            error tolerance is disabled.
    """
    import lxml.etree as ET

    self.xml = ET.iterparse(self.filepath)
    result = []
    for ev, elem in self.xml:
        if elem.tag != self.item_node:
            continue

        # Updating stats
        self.__stats['total_count'] += 1

        item = {}
        for map_key, map_value in self.mapping.items():
            if 'col' in map_value:
                col = map_value['col']
                multiple = map_value.get('multiple')
                value_index = map_value.get('index')
                default = map_value.get('default')
                # Only a real boolean overrides the default of raw=True.
                raw = True
                raw_ = map_value.get('raw')
                if type(raw_) is bool:
                    raw = raw_

                finalvalue = self.get(col, elem, multiple,
                                      value_index, default, raw)

                if 'transformations' in map_value:
                    finalvalue = handle_transformations(
                        map_value['transformations'], finalvalue,
                        error_tolerance=self.error_tolerance)
                item = apply_value(item, map_key, finalvalue)

                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item)
                    item = apply_value(item, map_key, finalvalue)

                # Deleting the value from original input object
                if self.__save_unmatched:
                    self.__delete(col, elem)

            elif 'value' in map_value:
                finalvalue = map_value['value']
                if type(finalvalue) == str:
                    finalvalue = finalvalue.replace('$subject', 'item')
                    # NOTE(security): the expression comes from the mapping
                    # and is evaluated; only use trusted mapping files.
                    # Built-in compile() replaces the `parser` module, which
                    # was removed in Python 3.10.
                    finalvalue = eval(compile(finalvalue, '', 'eval'))
                # Set to None if value is NaN
                finalvalue = Utils.clean_if_nan(finalvalue)
                item = apply_value(item, map_key, finalvalue)

                if 'conditions' in map_value:
                    finalvalue = handle_conditions(
                        map_value['conditions'], item)
                    item = apply_value(item, map_key, finalvalue)

            elif 'conditions' in map_value:
                finalvalue = handle_conditions(map_value['conditions'], item)
                item = apply_value(item, map_key, finalvalue)

            else:
                text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                    map_key)
                if self.error_tolerance:
                    Utils.log('error', text)
                    continue
                else:
                    raise Exception(text)

        # Unmatched
        if self.__save_unmatched:
            item[self.__unmatched_key] = self.__get_unmatched(elem)

        result.append(item)

        if (len(result) % self.bulksize) == 0:
            self.callback(result)
            result.clear()
            gc.collect()

        # Clearing the element now that the values have been extracted, and
        # dropping already-processed preceding siblings of it and of its
        # ancestors so the partially built tree does not keep growing.
        elem.clear()
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]

    # Flush whatever is left after the last full batch.
    if len(result) > 0:
        self.callback(result)
        result.clear()
        gc.collect()
def process(self):
    """Load the JSON file and translate every entry into a mapped item.

    The whole document is parsed into memory. The entries iterated are those
    of the document itself or, when ``__root_node`` is a string, of the
    value stored under that key. Each entry is run through
    ``self.__mapping``; a mapping entry is handled according to the first of
    these options it contains:

    * ``col``: resolve the path with ``self.get`` (or take the whole entry
      for ``'_all_'``), honouring the optional ``multiple``, ``index``
      (integers only) and ``default`` settings, then apply the optional
      ``transformations`` and ``conditions``.
    * ``value``: use the given value; a string is evaluated as a Python
      expression in which ``$subject`` stands for the item built so far.
    * ``conditions``: compute the value with ``handle_conditions`` only.

    Mapping entries with none of these options are ignored.

    When ``__save_unmatched`` is set, consumed paths are removed from the
    entry through ``self.__delete`` and the remainder is stored in the item
    under ``__unmatched_key``.

    Returns:
        list: One mapped item per JSON entry.
    """
    import json

    with open(self.__filepath, 'r') as fh:
        self.json = json.load(fh)

    result = []
    content = self.json
    if type(self.__root_node) is not str:
        iterator = iter(content)
    else:
        iterator = iter(content[self.__root_node])

    for jsonitem in iterator:
        # Updating stats
        self.__stats['total_count'] += 1

        item = {}
        for yaml_key, yaml_value in self.__mapping.items():
            if 'col' in yaml_value:
                col = yaml_value['col']
                multiple = yaml_value.get('multiple')
                default = yaml_value.get('default')
                # Only integer indexes are forwarded to self.get.
                value_index = None
                val_index = yaml_value.get('index')
                if type(val_index) is int:
                    value_index = val_index

                if col == '_all_':
                    finalvalue = jsonitem
                else:
                    finalvalue = self.get(col, jsonitem, multiple,
                                          value_index, default)

                if 'transformations' in yaml_value:
                    finalvalue = handle_transformations(
                        yaml_value['transformations'], finalvalue,
                        error_tolerance=self.__error_tolerance)
                item = apply_value(item, yaml_key, finalvalue)

                if 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item, jsonitem)
                    item = apply_value(item, yaml_key, finalvalue)

                # Deleting the value from original input object
                if self.__save_unmatched:
                    self.__delete(col, jsonitem)

            elif 'value' in yaml_value:
                finalvalue = yaml_value['value']
                if type(finalvalue) == str:
                    finalvalue = finalvalue.replace('$subject', 'item')
                    # NOTE(security): the expression comes from the mapping
                    # and is evaluated; only use trusted mapping files.
                    # Built-in compile() replaces the `parser` module, which
                    # was removed in Python 3.10.
                    finalvalue = eval(compile(finalvalue, '', 'eval'))
                # Clean if NaN
                finalvalue = Utils.clean_if_nan(finalvalue)
                item = apply_value(item, yaml_key, finalvalue)

                if 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item)
                    item = apply_value(item, yaml_key, finalvalue)

            elif 'conditions' in yaml_value:
                finalvalue = handle_conditions(yaml_value['conditions'], item)
                # Set to None if value is NaN (the cleaned value must be
                # kept, otherwise this call has no effect on the item).
                finalvalue = Utils.clean_if_nan(finalvalue)
                item = apply_value(item, yaml_key, finalvalue)

            # NOTE(review): there is no error branch here for entries with
            # none of the supported options; they are skipped silently.
            # Confirm whether that is intended.

        # Unmatched
        if self.__save_unmatched:
            item[self.__unmatched_key] = self.__get_unmatched(jsonitem)

        result.append(item)

    return result
def lazy_process(self):
    """Stream the JSON file and hand mapped items to the callback in batches.

    Entries are produced lazily by ``ijson.items`` using the prefix
    ``'item'``, or ``'<root_node>.item'`` when ``__root_node`` is a string.
    Each entry is run through ``self.__mapping``; a mapping entry is handled
    according to the first of these options it contains:

    * ``col``: resolve the path with ``self.get`` (or take the whole entry
      for ``'_all_'``), honouring the optional ``multiple``, ``index``
      (integers only) and ``default`` settings, then apply the optional
      ``transformations`` and ``conditions``.
    * ``value``: use the given value; a string is evaluated as a Python
      expression in which ``$subject`` stands for the item built so far.
    * ``conditions``: compute the value with ``handle_conditions`` only.

    Whenever the number of buffered items is a multiple of ``__bulksize``,
    the buffer is passed to ``__callback`` and a new buffer is started. The
    remainder is flushed after the last entry.

    Returns:
        None. Results are delivered only through ``self.__callback``.

    Raises:
        Exception: If a mapping entry has none of the supported options and
            error tolerance is disabled.
    """
    import ijson

    root_node = 'item'
    if self.__root_node is not None and type(self.__root_node) is str:
        root_node = '{}.item'.format(self.__root_node)

    # The file has to stay open while the ijson generator is consumed.
    with open(self.__filepath, 'rb') as fh:
        self.__content = ijson.items(fh, root_node)
        results = []

        # Iterating over JSON generator
        for jsonobject in self.__content:
            # Updating stats
            self.__stats['total_count'] += 1

            item = {}
            # For each JSON Object, iterating over the YAML mapping and
            # retrieving data
            for yaml_key, yaml_value in self.__mapping.items():
                if 'col' in yaml_value:
                    col = yaml_value['col']
                    multiple = yaml_value.get('multiple')
                    default = yaml_value.get('default')
                    # Only integer indexes are forwarded to self.get.
                    value_index = None
                    val_index = yaml_value.get('index')
                    if type(val_index) is int:
                        value_index = val_index

                    if col == '_all_':
                        finalvalue = jsonobject
                    else:
                        finalvalue = self.get(col, jsonobject, multiple,
                                              value_index, default)

                    # If transformations are defined in the mapping,
                    # applying them
                    if 'transformations' in yaml_value:
                        finalvalue = handle_transformations(
                            yaml_value['transformations'], finalvalue,
                            error_tolerance=self.__error_tolerance)
                    item = apply_value(item, yaml_key, finalvalue)

                    # Handling conditions
                    if 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item, jsonobject)
                        item = apply_value(item, yaml_key, finalvalue)

                    # Deleting the value from original input object
                    if self.__save_unmatched:
                        self.__delete(col, jsonobject)

                elif 'value' in yaml_value:
                    finalvalue = yaml_value['value']
                    if type(finalvalue) == str:
                        finalvalue = finalvalue.replace('$subject', 'item')
                        # NOTE(security): the expression comes from the
                        # mapping and is evaluated; only use trusted mapping
                        # files. Built-in compile() replaces the `parser`
                        # module, which was removed in Python 3.10.
                        finalvalue = eval(compile(finalvalue, '', 'eval'))
                    # Set to None if value is NaN
                    finalvalue = Utils.clean_if_nan(finalvalue)
                    item = apply_value(item, yaml_key, finalvalue)

                    if 'conditions' in yaml_value:
                        finalvalue = handle_conditions(
                            yaml_value['conditions'], item)
                        item = apply_value(item, yaml_key, finalvalue)

                elif 'conditions' in yaml_value:
                    finalvalue = handle_conditions(
                        yaml_value['conditions'], item)
                    item = apply_value(item, yaml_key, finalvalue)

                else:
                    text = '{} : No supported options found in mapping. Supported: [col, value, conditions]'.format(
                        yaml_key)
                    if self.__error_tolerance:
                        Utils.log('error', text)
                        continue
                    else:
                        raise Exception(text)

            # Unmatched
            if self.__save_unmatched:
                item[self.__unmatched_key] = self.__get_unmatched(
                    jsonobject)

            results.append(item)

            if len(results) % self.__bulksize == 0:
                self.__callback(results)
                # A new list is started so the batch handed to the callback
                # is not modified afterwards.
                results = []
                gc.collect()

        # Flush whatever is left after the last full batch.
        if len(results) > 0:
            self.__callback(results)
            results = []
            gc.collect()