def test_merge(self):
    p1 = self.person('Eve 1')
    p2 = self.person('Eve 2')
    merger = CromObjectMerger()
    merger.merge(p1, p2)
    referrers = p1.referred_to_by
    self.assertEqual(len(referrers), 1)
def test_pipeline_sales(self):
    '''
    When dimensions get merged, the Unknown physical dimension classification
    (300055642) gets dropped if there are any other classifications.
    '''
    h1 = vocab.Height(ident='', content=9.0)
    h1.unit = vocab.instances.get('inches')
    self.assertEqual({c._label for c in h1.classified_as}, {'Height'})

    h2 = vocab.PhysicalDimension(ident='', content=9.0)
    self.assertEqual({c._label for c in h2.classified_as}, {'Unknown physical dimension'})

    merger = CromObjectMerger()
    h = merger.merge(h1, h2)
    self.assertEqual({c._label for c in h.classified_as}, {'Height'})
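# A minimal sketch of the classification-dropping rule that the docstring and
# test above describe, using plain label sets instead of crom objects. The
# helper name and the use of labels (rather than AAT URIs) are illustrative
# assumptions, not the CromObjectMerger implementation.
UNKNOWN = 'Unknown physical dimension'  # AAT 300055642, per the docstring above

def merged_classifications(a, b):
    '''Union two classification sets, dropping the Unknown placeholder
    whenever any more specific classification is present.'''
    merged = set(a) | set(b)
    if len(merged) > 1:
        merged.discard(UNKNOWN)
    return merged

# Matches the test: Height + Unknown merges to just Height.
assert merged_classifications({'Height'}, {UNKNOWN}) == {'Height'}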
def merge_objects(self, objects):
    r = JSONValueRewriter(self.prev_post_sales_map)
    for k in list(objects.keys()):
        data = objects[k]
        updated = r.rewrite(data)
        ident = updated['id']
        if k != ident:
            if ident in objects:
                # the rewritten identifier collides with an existing record;
                # parse both as crom models and merge them
                read = reader.Reader()
                m = read.read(json.dumps(objects[ident]))
                n = read.read(json.dumps(updated))
                merger = CromObjectMerger()
                m = merger.merge(m, n)
                objects[ident] = json.loads(factory.toString(m, False))
            else:
                objects[ident] = updated
            del objects[k]
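# A stdlib-only sketch of the kind of recursive value substitution that
# JSONValueRewriter is assumed to perform above: every string value that
# appears as a key in the mapping (e.g. prev_post_sales_map) is replaced with
# its canonical value, recursively through dicts and lists. This is an
# illustration of the idea, not the pipeline's implementation.
def rewrite_values(data, mapping):
    if isinstance(data, dict):
        return {k: rewrite_values(v, mapping) for k, v in data.items()}
    if isinstance(data, list):
        return [rewrite_values(v, mapping) for v in data]
    if isinstance(data, str):
        return mapping.get(data, data)
    return data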
def _rewrite_output_files(files, r, update_filename, worker_id, total_workers, kwargs):
    if not files:
        return
    print(f'rewrite worker partition {worker_id} called with {len(files)} files [{files[0]} .. {files[-1]}]')
    start = time.time()
    rewritten_count = 0
    processed_count = 0
    ignore_errors = kwargs.get('ignore_errors', False)
    for f in files:
        processed_count += 1
        with open(f) as data_file:
            try:
                content = data_file.read()
                if 'content_filter_re' in kwargs:
                    filter_re = kwargs['content_filter_re']
                    if not re.search(filter_re, content):
                        # skip files whose raw content does not match the filter
                        continue
                data = json.loads(content)
            except json.decoder.JSONDecodeError:
                sys.stderr.write(f'Failed to load JSON during rewriting of {f}\n')
                if ignore_errors:
                    continue
                else:
                    raise
        d = r.rewrite(data, file=f)
        if update_filename:
            newfile = filename_for(d, original_filename=f, **kwargs)
        else:
            newfile = f
        if d == data and f == newfile:
            # nothing changed; do not rewrite the file
            continue
        if newfile != f and os.path.exists(newfile):
            # the rewritten data collides with an existing file;
            # merge the two crom models before writing
            read = reader.Reader()
            merger = CromObjectMerger()
            with open(newfile, 'r') as fh:
                existing = fh.read()
            try:
                m = read.read(existing)
                n = read.read(json.dumps(d))
                merger.merge(m, n)
            except Exception as e:
                print(f'Exception caught while merging data from {newfile} ({str(e)}):')
                print(d)
                print(existing)
                if ignore_errors:
                    continue
                else:
                    raise
            d = json.loads(factory.toString(m, False))
        with open(newfile, 'w') as data_file:
            rewritten_count += 1
            json.dump(d, data_file, indent=2, ensure_ascii=False)
        if newfile != f:
            os.remove(f)
    elapsed = time.time() - start
    if rewritten_count:
        print(f'worker partition {worker_id}/{total_workers} finished with {rewritten_count}/{processed_count} files rewritten in {elapsed:.1f}s')
    else:
        print(f'worker partition {worker_id}/{total_workers} finished in {elapsed:.1f}s')
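# A sketch of how a caller might partition files across workers for
# _rewrite_output_files. The dispatch code is not shown above, so the function
# name rewrite_all, the striped partitioning, and the use of
# concurrent.futures are all assumptions; the rewriter `r` would also need to
# be picklable for process-based workers.
from concurrent.futures import ProcessPoolExecutor

def rewrite_all(files, r, update_filename=False, total_workers=4, **kwargs):
    # stripe the sorted file list so each worker gets a similar-sized share
    partitions = [files[i::total_workers] for i in range(total_workers)]
    with ProcessPoolExecutor(max_workers=total_workers) as pool:
        futures = [
            pool.submit(_rewrite_output_files, part, r, update_filename,
                        worker_id, total_workers, kwargs)
            for worker_id, part in enumerate(partitions)
        ]
        for future in futures:
            future.result()  # re-raise any exception from a worker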
with open(filename, 'r') as fh:
    content = fh.read()
canon_file = None
canon_content = None
try:
    m = read.read(content)
    id = m.id
    if id in seen:
        canon_file = seen[id]
        # print(f'*** {id} already seen in {canon_file} ; merging {filename}')
        merger = CromObjectMerger()
        with open(canon_file, 'r') as cfh:
            canon_content = cfh.read()
        n = read.read(canon_content)
        try:
            merger.merge(m, n)
        except model.DataError as e:
            print(f'Exception caught while merging data from {canon_file} ({str(e)}):')
            print(canon_content)
            print(content)
            raise
        merged_data = factory.toString(m, False)
        d = json.loads(merged_data)
        with open(canon_file, 'w') as data_file:
            json.dump(d, data_file, indent=2, ensure_ascii=False)
        os.remove(filename)
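# A sketch of the enclosing deduplication loop the fragment above appears to
# belong to. The names dedupe, files_to_dedupe, and seen are assumptions based
# on the fragment: the first file observed for each top-level crom id becomes
# canonical, and any later file with the same id is merged into it (as above)
# and then deleted.
def dedupe(files_to_dedupe):
    seen = {}
    read = reader.Reader()
    for filename in files_to_dedupe:
        with open(filename, 'r') as fh:
            m = read.read(fh.read())
        if m.id not in seen:
            # first occurrence: this file becomes the canonical one for the id
            seen[m.id] = filename
        else:
            # duplicate: merge into seen[m.id] and remove filename,
            # as in the block above
            ...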