def test_google_vision_face_batch():
    stims = ['apple', 'obama', 'thai_people']
    stim_files = [join(get_test_data_path(), 'image', '%s.jpg' % s)
                  for s in stims]
    stims = [ImageStim(s) for s in stim_files]
    ext = GoogleVisionAPIFaceExtractor(batch_size=5)
    result = ext.transform(stims)
    result = merge_results(result, format='wide', extractor_names=False,
                           handle_annotations='first')
    assert result.shape == (2, 139)
    assert 'joyLikelihood' in result.columns
    assert result['joyLikelihood'][0] == 'VERY_LIKELY'
    assert result['joyLikelihood'][1] == 'VERY_LIKELY'

    video = VideoStim(join(VIDEO_DIR, 'obama_speech.mp4'))
    conv = FrameSamplingFilter(every=10)
    video = conv.transform(video)
    result = ext.transform(video)
    result = merge_results(result, format='wide', extractor_names=False)
    assert 'joyLikelihood' in result.columns
    assert result.shape == (22, 139)

    video = VideoStim(join(VIDEO_DIR, 'small.mp4'))
    video = conv.transform(video)
    result = ext.transform(video)
    result = merge_results(result, format='wide', extractor_names=False)
    assert 'joyLikelihood' not in result.columns
    assert len(result) == 0
def test_batch_transformer():
    img1 = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))
    img2 = ImageStim(join(get_test_data_path(), 'image', 'button.jpg'))
    img3 = ImageStim(join(get_test_data_path(), 'image', 'obama.jpg'))
    ext = DummyBatchExtractor()
    res = merge_results(ext.transform([img1, img2, img3]))
    assert ext.num_calls == 1
    assert res.shape == (3, 10)

    ext = DummyBatchExtractor(batch_size=1)
    res2 = merge_results(ext.transform([img1, img2, img3]))
    assert ext.num_calls == 3
    assert res.equals(res2)
def test_microsoft_vision_api_extractor_large():
    default = config.get_option('allow_large_jobs')
    default_large = config.get_option('large_job')
    config.set_option('allow_large_jobs', False)
    config.set_option('large_job', 3)

    ext = MicrosoftVisionAPITagExtractor()

    video = VideoStim(join(VIDEO_DIR, 'small.mp4'))
    with pytest.raises(ValueError):
        merge_results(ext.transform(video))

    config.set_option('allow_large_jobs', default)
    config.set_option('large_job', default_large)
def test_metric_er_as_stim():
    stim = ComplexTextStim(text='This is [MASK] test')
    ext_bert = BertLMExtractor(return_softmax=True)
    ext_metric = MetricExtractor(functions='numpy.sum')
    r = ext_metric.transform(ext_bert.transform(stim))
    df = merge_results(r, extractor_names=False)
    assert np.isclose(df['sum'][0], 1)
def extract_visual_semantics(visual_events, glove=True):
    # pd.DataFrame.from_csv was removed from pandas; read_csv with
    # index_col=0 is the equivalent modern call
    res = pd.read_csv(visual_events, index_col=0)
    onsets = res['onset']
    durations = res['duration']
    res = res.drop(['onset', 'duration', 'order', 'object_id'], axis=1)
    words = res.apply(lambda x: list(res.columns[x.values.astype('bool')]),
                      axis=1)
    texts = []
    for tags, o, d in zip(words, onsets, durations):
        for w in tags:
            # Slicing here to get rid of b''
            texts.append(TextStim(text=w[2:-1], onset=o, duration=d))

    # Output filenames follow the same convention as extract_audio_semantics:
    # glove -> *_glove_events.csv, word2vec -> *_semantic_events.csv
    if glove:
        ext = WordEmbeddingExtractor(GLOVE_PATH, binary=False)
        out = 'events/visual_glove_events.csv'
    else:
        ext = WordEmbeddingExtractor(WORD2VEC_PATH, binary=True)
        out = 'events/visual_semantic_events.csv'

    results = ext.transform(texts)
    res = merge_results(results, metadata=False, flatten_columns=True,
                        format='long')
    res = res.drop(['object_id', 'order', 'duration'], axis=1)
    res = res.groupby('onset').sum().reset_index()
    res['duration'] = durations
    res.rename(columns={'value': 'modulation',
                        'feature': 'trial_type'}, inplace=True)
    res.to_csv(out)
def test_small_pipeline_json_spec3():
    pytest.importorskip('pytesseract')
    filename = join(get_test_data_path(), 'image', 'button.jpg')
    stim = ImageStim(filename)
    nodes = {
        "roots": [{
            "transformer": "GoogleVisionAPITextConverter",
            "parameters": {
                "num_retries": 5,
                "max_results": 10
            },
            "children": [{
                "transformer": "LengthExtractor"
            }]
        }]
    }
    graph = Graph(nodes)
    result = list(graph.run([stim], merge=False))
    history = result[0].history.to_df()
    assert history.shape == (2, 8)
    assert history.iloc[0]['result_class'] == 'TextStim'
    result = merge_results(result, format='wide', extractor_names='multi')
    assert (0, 'text[Exit\n]') in result['stim_name'].values
    assert ('LengthExtractor', 'text_length') in result.columns
    assert result[('LengthExtractor', 'text_length')].values[0] == 4
def extract_image_labels(video, save_frames=False):
    frame_sampling_filter = FrameSamplingFilter(hertz=1)
    sampled_video = frame_sampling_filter.transform(video)

    if save_frames:
        # Save frames as images
        for i, f in enumerate(sampled_video):
            if i % 100 == 0:
                f.save('stims/frames/frame_%d.png' % i)

    # Use a Vision API to extract object labels
    ext = GoogleVisionAPILabelExtractor(max_results=10)
    results = ext.transform(sampled_video)
    res = merge_results(results, metadata=False, extractor_names='multi')

    # Clean and write out data
    res = res.fillna(0)
    label_key = 'GoogleVisionAPILabelExtractor'
    res[label_key] = np.round(res[label_key])
    new_cols = []
    for col in res.columns.values:
        if col[0].startswith('Google'):
            new_cols.append(col[1].encode('utf-8'))
        else:
            new_cols.append(col[0])
    res.columns = new_cols
    res.to_csv('events/raw_visual_events.csv')
def test_big_pipeline():
    pytest.importorskip('pygraphviz')
    filename = join(get_test_data_path(), 'video', 'obama_speech.mp4')
    video = VideoStim(filename)
    visual_nodes = [(FrameSamplingFilter(every=15), [
        (TesseractConverter(), [LengthExtractor()]),
        VibranceExtractor(),
        'BrightnessExtractor',
    ])]
    audio_nodes = [(VideoToAudioConverter(),
                    [WitTranscriptionConverter(), 'LengthExtractor'],
                    'video_to_audio')]
    graph = Graph()
    graph.add_nodes(visual_nodes)
    graph.add_nodes(audio_nodes)
    results = graph.run(video, merge=False)
    result = merge_results(results, format='wide', extractor_names='multi')

    # Test that pygraphviz outputs a file
    drawfile = next(tempfile._get_candidate_names())
    graph.draw(drawfile)
    assert exists(drawfile)
    os.remove(drawfile)

    assert ('LengthExtractor', 'text_length') in result.columns
    assert ('VibranceExtractor', 'vibrance') in result.columns
    # assert not result[('onset', '')].isnull().any()
    assert 'text[negotiations]' in result['stim_name'].values
    assert 'frame[90]' in result['stim_name'].values
def test_batch_transformer():
    cache_default = config.get_option('cache_transformers')
    config.set_option('cache_transformers', False)

    img1 = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))
    img2 = ImageStim(join(get_test_data_path(), 'image', 'button.jpg'))
    img3 = ImageStim(join(get_test_data_path(), 'image', 'obama.jpg'))
    ext = DummyBatchExtractor()
    res = merge_results(ext.transform([img1, img2, img3]))
    assert ext.num_calls == 1
    assert res.shape == (3, 10)

    ext = DummyBatchExtractor(batch_size=1)
    res2 = merge_results(ext.transform([img1, img2, img3]))
    assert ext.num_calls == 3
    assert res.equals(res2)

    config.set_option('cache_transformers', cache_default)
def test_metric_er_as_stim():
    stim = TextStim(text='This is a test')
    ext_sent = VADERSentimentExtractor()
    ext_metric = MetricExtractor(
        functions='numpy.sum',
        subset_idx=[f'sentiment_{d}' for d in ['neg', 'pos', 'neu']])
    r = ext_metric.transform(ext_sent.transform(stim))
    df = merge_results(r, extractor_names=False)
    assert np.isclose(df['sum'][0], 1)
def test_google_vision_api_extractor_large():
    default = config.get_option('allow_large_jobs')
    default_large = config.get_option('large_job')
    config.set_option('allow_large_jobs', False)
    config.set_option('large_job', 1)

    ext = GoogleVisionAPILabelExtractor()
    images = [ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))] * 2
    with pytest.raises(ValueError):
        merge_results(ext.transform(images))

    config.set_option('allow_large_jobs', True)
    results = merge_results(ext.transform(images))
    assert 'GoogleVisionAPILabelExtractor#apple' in results.columns
    assert results.shape == (1, 16)  # not 2, because both stims are the same instance

    config.set_option('allow_large_jobs', default)
    config.set_option('large_job', default_large)
def _to_csv(results, dataset_name, task_name):
    """ Save extracted Pliers results to file. """
    if results != [] and 'EXTRACTION_DIR' in current_app.config:
        results_df = merge_results(list(zip(*results))[1])
        outfile = Path(
            current_app.config['EXTRACTION_DIR']) / '{}_{}_{}.csv'.format(
                dataset_name, task_name,
                datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
        outfile.parents[0].mkdir(exist_ok=True)
        results_df.to_csv(outfile)
def test_google_vision_api_extractor_large():
    default = config.get_option('allow_large_jobs')
    default_large = config.get_option('large_job')
    default_cache = config.get_option('cache_transformers')
    config.set_option('allow_large_jobs', False)
    config.set_option('large_job', 1)
    config.set_option('cache_transformers', False)

    ext = GoogleVisionAPILabelExtractor()
    images = [ImageStim(join(IMAGE_DIR, 'apple.jpg')),
              ImageStim(join(IMAGE_DIR, 'obama.jpg'))]
    with pytest.raises(ValueError):
        merge_results(ext.transform(images))

    config.set_option('allow_large_jobs', True)
    results = merge_results(ext.transform(images))
    assert 'GoogleVisionAPILabelExtractor#Apple' in results.columns
    assert results.shape == (2, 32)

    config.set_option('allow_large_jobs', default)
    config.set_option('large_job', default_large)
    config.set_option('cache_transformers', default_cache)
def test_google_vision_api_extractor_large():
    default = config.get_option('allow_large_jobs')
    default_large = config.get_option('large_job')
    default_cache = config.get_option('cache_transformers')
    config.set_option('allow_large_jobs', False)
    config.set_option('large_job', 1)
    config.set_option('cache_transformers', False)

    ext = GoogleVisionAPILabelExtractor()
    images = [ImageStim(join(IMAGE_DIR, 'apple.jpg')),
              ImageStim(join(IMAGE_DIR, 'obama.jpg'))]
    with pytest.raises(ValueError):
        merge_results(ext.transform(images))

    config.set_option('allow_large_jobs', True)
    results = merge_results(ext.transform(images))
    assert 'GoogleVisionAPILabelExtractor#apple' in results.columns
    assert results.shape == (2, 32)

    config.set_option('allow_large_jobs', default)
    config.set_option('large_job', default_large)
    config.set_option('cache_transformers', default_cache)
def test_small_pipeline_json_spec2():
    pytest.importorskip('pytesseract')
    filename = join(get_test_data_path(), 'image', 'button.jpg')
    stim = ImageStim(filename)
    spec = join(get_test_data_path(), 'graph', 'simple_graph.json')
    graph = Graph(spec=spec)
    result = list(graph.run([stim], merge=False))
    history = result[0].history.to_df()
    assert history.shape == (2, 8)
    assert history.iloc[0]['result_class'] == 'TextStim'
    result = merge_results(result, format='wide', extractor_names='multi')
    assert (0, 'text[Exit]') in result['stim_name'].values
    assert ('LengthExtractor', 'text_length') in result.columns
    assert result[('LengthExtractor', 'text_length')].values[0] == 4
def test_small_pipeline2():
    filename = join(get_test_data_path(), 'image', 'button.jpg')
    nodes = [BrightnessExtractor(), VibranceExtractor()]
    graph = Graph(nodes)
    result = list(graph.run([filename], merge=False))
    history = result[0].history.to_df()
    assert history.shape == (1, 8)
    result = merge_results(result, format='wide', extractor_names='multi')
    assert ('BrightnessExtractor', 'brightness') in result.columns
    brightness = result[('BrightnessExtractor', 'brightness')].values[0]
    vibrance = result[('VibranceExtractor', 'vibrance')].values[0]
    assert_almost_equal(brightness, 0.746965, 5)
    assert ('VibranceExtractor', 'vibrance') in result.columns
    assert_almost_equal(vibrance, 841.577274, 5)
def test_small_pipeline():
    pytest.importorskip('pytesseract')
    filename = join(get_test_data_path(), 'image', 'button.jpg')
    stim = ImageStim(filename)
    nodes = [(TesseractConverter(), [LengthExtractor()])]
    graph = Graph(nodes)
    result = list(graph.run([stim], merge=False))
    history = result[0].history.to_df()
    assert history.shape == (2, 8)
    assert history.iloc[0]['result_class'] == 'TextStim'
    result = merge_results(result, format='wide', extractor_names='prepend')
    assert (0, 'text[Exit]') in result['stim_name'].values
    assert 'LengthExtractor#text_length' in result.columns
    assert result['LengthExtractor#text_length'].values[0] == 4
def transform(self, stimulus_files):
    if isinstance(self.transformer, Extractor):
        result = self.transformer.transform(stimulus_files).to_df()
    else:
        result = self.transformer.transform(stimulus_files, merge=False)
        result = merge_results(result, flatten_columns=True)
    extra_columns = list(set(['onset', 'duration', 'history', 'class',
                              'filename', 'stim_name', 'source_file'])
                         & set(result.columns))
    self.metadata_ = result[extra_columns]
    result.drop(extra_columns, axis=1, inplace=True, errors='ignore')
    # NOTE: as_matrix() was removed in pandas 1.0; on modern pandas,
    # use to_numpy() instead (see the later version of this method below)
    return result.as_matrix()
def transform(self, stimulus_files):
    if isinstance(self.transformer, Extractor):
        result = self.transformer.transform(stimulus_files).to_df()
    else:
        result = self.transformer.transform(stimulus_files, merge=False)
        result = merge_results(result, format='wide', extractor_names=False)
    extra_columns = list(set(['onset', 'duration', 'order', 'history',
                              'class', 'filename', 'stim_name',
                              'source_file', 'object_id', 'extractor'])
                         & set(result.columns))
    self.metadata_ = result[extra_columns]
    result.drop(extra_columns, axis=1, inplace=True, errors='ignore')
    return result.as_matrix()
def transform(self, stimulus_files):
    if isinstance(self.transformer, Extractor):
        result = self.transformer.transform(stimulus_files).to_df()
    else:
        result = self.transformer.transform(stimulus_files, merge=False)
        result = merge_results(result, format='wide', extractor_names=False)
    extra_columns = list({
        'onset', 'duration', 'order', 'history', 'class', 'filename',
        'stim_name', 'source_file', 'object_id', 'extractor'
    } & set(result.columns))
    self.metadata_ = result[extra_columns]
    result.drop(extra_columns, axis=1, inplace=True, errors='ignore')
    return result.to_numpy()
def test_big_pipeline_json():
    pytest.importorskip('pygraphviz')
    filename = join(get_test_data_path(), 'video', 'obama_speech.mp4')
    video = VideoStim(filename)
    nodes = {
        "roots": [{
            "transformer": "FrameSamplingFilter",
            "parameters": {
                "every": 15
            },
            "children": [{
                "transformer": "TesseractConverter",
                "children": [{
                    "transformer": "LengthExtractor"
                }]
            }, {
                "transformer": "VibranceExtractor"
            }, {
                "transformer": "BrightnessExtractor"
            }]
        }, {
            "transformer": "VideoToAudioConverter",
            "children": [{
                "transformer": "WitTranscriptionConverter",
                "children": [{
                    "transformer": "LengthExtractor"
                }]
            }]
        }]
    }
    graph = Graph(nodes)
    results = graph.run(video, merge=False)
    result = merge_results(results, format='wide', extractor_names='multi')

    # Test that pygraphviz outputs a file
    drawfile = next(tempfile._get_candidate_names())
    graph.draw(drawfile)
    assert exists(drawfile)
    os.remove(drawfile)

    assert ('LengthExtractor', 'text_length') in result.columns
    assert ('VibranceExtractor', 'vibrance') in result.columns
    # assert not result[('onset', '')].isnull().any()
    assert 'text[negotiations]' in result['stim_name'].values
    assert 'frame[90]' in result['stim_name'].values
def extract_audio_semantics(stims, glove=True):
    if glove:
        ext = WordEmbeddingExtractor(GLOVE_PATH, binary=False)
        out = 'events/audio_glove_events.csv'
    else:
        ext = WordEmbeddingExtractor(WORD2VEC_PATH, binary=True)
        out = 'events/audio_semantic_events.csv'

    results = ext.transform(stims)
    res = merge_results(results, metadata=False, flatten_columns=True,
                        format='long')
    res = res.drop(['object_id', 'order'], axis=1)
    res.rename(columns={'value': 'modulation',
                        'feature': 'trial_type'}, inplace=True)
    res.to_csv(out)
def test_batch_transformer_caching():
    cache_default = config.get_option('cache_transformers')
    config.set_option('cache_transformers', True)

    img1 = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))
    ext = DummyBatchExtractor(name='penguin')
    res = ext.transform(img1).to_df(timing=False, object_id=False)
    assert ext.num_calls == 1
    assert res.shape == (1, 1)

    img2 = ImageStim(join(get_test_data_path(), 'image', 'button.jpg'))
    img3 = ImageStim(join(get_test_data_path(), 'image', 'obama.jpg'))
    res2 = ext.transform([img1, img2, img2, img3, img3, img1, img2])
    assert ext.num_calls == 3
    assert len(res2) == 7
    assert res2[0] == res2[5] and res2[1] == res2[2] and res2[3] == res2[4]
    res2 = merge_results(res2)
    assert res2.shape == (3, 10)

    config.set_option('cache_transformers', cache_default)
def extract_faces(video):
    frame_sampling_filter = FrameSamplingFilter(hertz=1)
    sampled_video = frame_sampling_filter.transform(video)

    ext = GoogleVisionAPIFaceExtractor()
    results = ext.transform(sampled_video)
    res = merge_results(results, metadata=False, format='long',
                        extractor_names=False, object_id=False)
    res = res[res['feature'] == 'face_detectionConfidence']
    res = res.drop(['order'], axis=1)
    res = res.fillna(0)
    res['value'] = np.round(res['value'])
    res.rename(columns={'value': 'modulation',
                        'feature': 'trial_type'}, inplace=True)
    res.to_csv('events/visual_face_events.csv')
def test_small_pipeline_json_spec():
    pytest.importorskip('pytesseract')
    filename = join(get_test_data_path(), 'image', 'button.jpg')
    stim = ImageStim(filename)
    nodes = {
        "roots": [{
            "transformer": "TesseractConverter",
            "children": [{
                "transformer": "LengthExtractor",
                "children": []
            }]
        }]
    }
    graph = Graph(nodes)
    result = list(graph.run([stim], merge=False))
    history = result[0].history.to_df()
    assert history.shape == (2, 8)
    assert history.iloc[0]['result_class'] == 'TextStim'
    result = merge_results(result)
    assert (0, 'text[Exit]') in result['stim_name'].values
    assert ('LengthExtractor', 'text_length') in result.columns
    assert result[('LengthExtractor', 'text_length')].values[0] == 4
Next, we can use the `FaceRecognitionFaceLocationsExtractor` to detect and label face locations in the subset of frames. Note that since we transformed a collection of frames, the result of this operation is a *list* of `ExtractorResult` objects. To merge these objects into a single pandas DataFrame, we can use the helper function `merge_results`.

```python
from pliers.extractors import merge_results
from pliers import config

# Disable progress bar for Jupyter Book
config.set_option('progress_bar', False)

# Detect faces in selected frames
face_features = face_ext.transform(selected_frames)
merged_faces = merge_results(face_features, metadata=False)

# Show only first few rows
merged_faces.head(12)
```

```python
len(merged_faces.onset.unique())
```

There are 89 unique onsets, which indicates that faces were found in 89 of the 143 frames. The `FaceRecognitionFaceLocationsExtractor#face_locations` column also indicates the location of each face in CSS order (i.e., top, right, bottom, left).

```{tip}
In some frames (e.g., at 240 s), multiple faces were found, so there are multiple rows for a given `onset`. To disambiguate these rows, *pliers* assigns each occurrence a unique value in the `object_id` column. [Read more](http://tyarkoni.github.io/pliers/results.html#understanding-object-ids) about *object_id*.
```
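To make the role of `object_id` concrete, here is a minimal sketch (assuming the `merged_faces` DataFrame produced above) that counts detections per frame and inspects the rows for one multi-face onset:

```python
# Rows sharing an onset are distinct detections in the same frame;
# their object_id values tell them apart
faces_per_frame = merged_faces.groupby('onset')['object_id'].nunique()

# Onsets at which more than one face was detected
multi_face_onsets = faces_per_frame[faces_per_frame > 1].index

# Inspect all detections for the first such frame: each row has the
# same onset but a different object_id
merged_faces[merged_faces['onset'] == multi_face_onsets[0]]
```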