def _merge_records(args):
    """Merge several versions of a record into one, field by field.

    The arguments arrive packed in a single sequence ``args``:

    * ``args[0]`` -- ``data``: the sequence of record versions to merge.
    * ``args[1]`` -- ``config``: mapping of field name to merge rule.
    * ``args[2]`` -- optional ``all_data`` handed to every resolver;
      when absent, ``data`` itself is used.

    A merge rule is one of:

    * a ``dict`` -- a nested config; the field is merged recursively on
      ``extract_from_tuple(data, field)`` with the same ``all_data``;
    * a ``tuple`` of resolvers -- tried in order, stopping at the first
      one whose result is not ``None`` (if all return ``None`` the field
      is ``None``; an empty tuple leaves the field out of the result);
    * a single resolver.

    A resolver may be given as a class or as an instance; classes are
    instantiated with no arguments. Resolvers are called as
    ``resolver.resolve(extract_from_tuple(data, field), all_data)``.

    Returns ``data[0]`` itself when ``data`` holds exactly one element,
    otherwise a new dict with one entry per resolved ``config`` field.
    """
    data, config = args[0], args[1]
    all_data = args[2] if len(args) > 2 else data

    # A single version has nothing to conflict with.
    if len(data) == 1:
        return data[0]

    merged = {}
    # items() instead of iteritems(): same pairs, but it also exists on
    # Python 3, where iteritems() was removed.
    for field, rule in config.items():
        if isinstance(rule, dict):
            merged[field] = _merge_records(
                (extract_from_tuple(data, field), rule, all_data))
        elif isinstance(rule, tuple):
            # Fallback chain: the first resolver with an answer wins.
            for resolver in rule:
                if isclass(resolver):
                    resolver = resolver()
                merged[field] = resolver.resolve(
                    extract_from_tuple(data, field), all_data)
                if merged[field] is not None:
                    break
        else:
            resolver = rule() if isclass(rule) else rule
            merged[field] = resolver.resolve(
                extract_from_tuple(data, field), all_data)
    return merged
def resolve(self, conflict_data, all_data):
    """Pick the conflicting value whose record carries the greatest date.

    The dates come from
    ``extract_from_tuple(all_data, self._date_field)`` and are matched to
    ``conflict_data`` by position. Entry 0 is the starting candidate; a
    later entry replaces it only when its date is strictly greater than
    the best date so far *and* its value is not ``None``.

    Note that entry 0 is accepted as the starting candidate even when its
    value is ``None``, so ``None`` is returned if no later entry has a
    strictly greater date together with a non-``None`` value.
    """
    dates = extract_from_tuple(all_data, self._date_field)
    winner = conflict_data[0]
    newest = dates[0]
    for position, stamp in enumerate(dates):
        # Date test first, value test second -- same order as before.
        if stamp > newest and conflict_data[position] is not None:
            winner, newest = conflict_data[position], stamp
    return winner
def next(self):
    """Return the feature vector computed for the next pair.

    One pair is taken from ``self.pairs`` via its ``next()`` method; any
    exception raised there propagates to the caller. Every
    ``(feature, path)`` entry of ``self.features`` is then applied:

    * the feature may be a class (instantiated with no arguments) or an
      instance;
    * ``extract_from_tuple(pair, path)`` supplies the two values passed
      to ``feature.extract``.

    Keys of the returned dict are built from the feature's class name:

    * ``'<ClassName>@<path>'`` for a non-dict result;
    * ``'<ClassName>@<path>:<name>'`` for each item of a dict result.
    """
    pair = self.pairs.next()
    features_vector = {}
    for feat, path in self.features:
        if isclass(feat):
            feat = feat()
        extracted = extract_from_tuple(pair, path)
        result = feat.extract(extracted[0], extracted[1])
        key_prefix = feat.__class__.__name__ + '@' + path
        if isinstance(result, dict):
            # items() instead of iteritems(): same pairs, and it is also
            # available on Python 3.
            for name, value in result.items():
                features_vector[key_prefix + ':' + str(name)] = value
        else:
            features_vector[key_prefix] = result
    return features_vector
def __init__(self, blocking_algorithm, unique_attribute, debug=False):
    """Gather the pairs of every block, keeping each distinct pair once.

    ``blocking_algorithm`` is iterated as an iterable of blocks, each of
    which is an iterable of pairs. A pair is identified by
    ``tuple(sorted(extract_from_tuple(pair, unique_attribute)))``, so the
    order of the extracted values does not matter; the first pair seen
    for an identity is stored in ``self._pairs`` and later ones are
    dropped.

    With ``debug`` true, start/done messages and a progress message every
    10000 examined pairs are sent to ``logger.info``.
    """
    seen_keys = set()
    self._pairs = []
    examined = 0

    if debug:
        logger.info('joinblock start')

    for group in blocking_algorithm:
        for pair in group:
            examined += 1
            if debug and examined % 10000 == 0:
                logger.info('tick ' + str(examined))
            identity = tuple(sorted(extract_from_tuple(pair, unique_attribute)))
            if identity in seen_keys:
                continue
            seen_keys.add(identity)
            self._pairs.append(pair)

    if debug:
        logger.info('joinblock done')
def resolve(self, conflict_data, all_data):
    """Return the conflicting value that belongs to the configured source.

    The sources come from
    ``extract_from_tuple(all_data, self._source_field)`` and are matched
    to ``conflict_data`` by position. The value at the first position
    whose source equals ``self._source_value`` is returned; ``None`` is
    returned when no source matches.
    """
    sources = extract_from_tuple(all_data, self._source_field)
    for position, source in enumerate(sources):
        if source == self._source_value:
            return conflict_data[position]
    return None