def make_absolute_paths(config):
    """Rewrite relative ``*path`` values in ``config`` as absolute paths.

    The base directory is taken from
    ``config["streamcorpus_pipeline"]["root_path"]``; when that entry is
    missing or empty the current working directory is used, and a relative
    ``root_path`` is itself joined onto the current working directory.

    Every string value, at any nesting depth of ``config``, whose key ends
    with ``"path"`` and which does not start with ``"/"`` is replaced by
    ``os.path.join(root_path, value)``.  ``config`` is modified in place and
    nothing is returned.

    On return the resolved root is stored under the top-level key
    ``config["root_path"]``; it is no longer present inside
    ``config["streamcorpus_pipeline"]``.

    Raises:
        ConfigurationError: if ``config`` has no ``"streamcorpus_pipeline"``
            key.
    """
    if "streamcorpus_pipeline" not in config:
        logger.critical("bad config: %r", config)
        raise ConfigurationError('missing "streamcorpus_pipeline" from config')

    # ``basestring`` only exists on Python 2 (where it covers both ``str``
    # and ``unicode``); fall back to ``str`` so this also runs on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    ## remove the root_path, so it does not get extended itself
    root_path = config["streamcorpus_pipeline"].pop("root_path", None)
    if not root_path:
        root_path = os.getcwd()
    if not root_path.startswith("/"):
        root_path = os.path.join(os.getcwd(), root_path)

    def recursive_abs_path(sub_config, root_path):
        # Iterate over a snapshot because values are reassigned in the loop.
        for key, val in list(sub_config.items()):
            if isinstance(val, string_types):
                ## we have a path... is it already absolute?
                if key.endswith("path") and not val.startswith("/"):
                    ## make the path absolute
                    sub_config[key] = os.path.join(root_path, val)
            elif isinstance(val, dict):
                recursive_abs_path(val, root_path)

    recursive_abs_path(config, root_path)

    ## put the root_path back (at the top level of the config)
    config["root_path"] = root_path
def test_aligner_separate(tmp_dir_path):
    """Run the lingpipe stage, then the byte-offset aligner as its own stage.

    A single hyperlink-labeled stream item is written to ``chunk.sc`` under
    ``tmp_dir_path`` and processed in place by the ``lingpipe`` stage
    (configured with ``cleanup_tmp_files = False`` so its intermediate XML
    files can be inspected).  The ``byte_offset_align_labels`` stage is then
    run separately on the same chunk, and the re-read item is expected to
    carry 41 ``lingpipe`` sentences.
    """
    def read_tmp_file(name):
        # Read via a context manager so the file handle is closed promptly
        # rather than being left for the garbage collector.
        with open(os.path.join(tmp_dir_path, name)) as fh:
            return fh.read()

    si = make_hyperlink_labeled_test_stream_item()
    assert len(si.body.clean_visible) > 200
    ## debugging aid:
    #for x in si.body.labels['author']:
    #    print x.offsets[OffsetType.BYTES].first, x.offsets[OffsetType.BYTES].value, x.target.target_id

    c_path = os.path.join(tmp_dir_path, 'chunk.sc')
    chunk = streamcorpus.Chunk(c_path, mode='wb')
    chunk.add(si)
    chunk.close()

    lp = streamcorpus_pipeline.stages._init_stage(
        'lingpipe',
        dict(
            tmp_dir_path=tmp_dir_path,
            exit_code_on_out_of_memory=1,
            pipeline_root_path=os.path.join(os.path.dirname(__file__),
                                            '../../../third/'),
            offset_types=['BYTES'],
            offset_debugging=True,
            cleanup_tmp_files=False,
        )
    )
    logger.critical(c_path)
    lp.process_path(c_path)

    ## the intermediate files must exist and be non-empty
    assert os.path.exists(tmp_dir_path)
    assert read_tmp_file('chunk.sc-clean_visible.xml')
    assert read_tmp_file('chunk.sc-ner.xml')
    logger.critical(os.listdir(tmp_dir_path))

    ## run the aligner separately
    aligner = streamcorpus_pipeline.stages._init_stage(
        'byte_offset_align_labels',
        dict(
            annotator_id='author',
            tagger_id='lingpipe',
        )
    )
    aligner.process_path(c_path)

    ## verify the result by re-reading the chunk that was processed in place
    ## (presumably mirrors a sibling test that runs the aligner inside the
    ## lingpipe stage -- not visible here)
    si = list(streamcorpus.Chunk(c_path))[0]
    logger.critical('%d bytes clean_visible for %s',
                    len(si.body.clean_visible), si.stream_id)
    assert len(si.body.clean_visible) > 200
    logger.critical('%d sentences for %s',
                    len(si.body.sentences['lingpipe']), si.stream_id)
    ## the NER XML is only read (as the failure message) if the count is off
    assert len(si.body.sentences['lingpipe']) == 41, \
        read_tmp_file('chunk.sc-ner.xml')
    assert os.path.exists(tmp_dir_path)
def test_pipeline_run(cmd_expect_success):
    """Run a shell command and check that it exits as expected.

    ``cmd_expect_success`` is a ``(cmd, expect_success)`` pair: ``cmd`` is
    run through the shell, and its exit status must be zero when
    ``expect_success`` is true and non-zero otherwise.  The command's
    stdout and stderr are captured and logged.

    Raises:
        Exception: if the command is still running after ``max_time``
            seconds; the process is killed before the exception is raised.
    """
    # Local import: only the watchdog below needs it.
    import threading

    cmd, expect_success = cmd_expect_success
    logger.critical(cmd)
    p = subprocess.Popen(cmd, stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE, shell=True)
    start_time = time.time()
    max_time = 900

    timed_out = threading.Event()

    def kill_on_timeout():
        timed_out.set()
        try:
            # NOTE: with shell=True this signals the shell itself; any
            # grandchildren that inherited the pipes may keep them open.
            p.kill()
        except OSError:
            # the process exited on its own just before the deadline
            pass

    # communicate() drains both pipes while waiting, so a child that writes
    # more than a pipe buffer cannot stall, and no CPU is burnt polling.
    # The watchdog enforces the deadline by killing the child.
    watchdog = threading.Timer(max_time, kill_on_timeout)
    watchdog.daemon = True
    watchdog.start()
    try:
        out, err = p.communicate()
    finally:
        watchdog.cancel()
    ret = p.returncode

    logger.critical(out)
    logger.critical(err)

    if timed_out.is_set():
        raise Exception('timed out after %d seconds'
                        % (time.time() - start_time))
    if expect_success:
        assert ret == 0
    else:
        assert ret != 0