def test_word_embedding_extractor():
    pytest.importorskip('gensim')
    stims = [TextStim(text='this'), TextStim(text='sentence')]
    ext = WordEmbeddingExtractor(join(TEXT_DIR, 'simple_vectors.bin'),
                                 binary=True)
    result = merge_results(ext.transform(stims), extractor_names='multi',
                           format='wide')
    assert ('WordEmbeddingExtractor', 'embedding_dim99') in result.columns
    assert np.allclose(0.0010911,
                       result[('WordEmbeddingExtractor', 'embedding_dim0')][0])

    # Out-of-vocabulary words default to a zero vector
    unk = TextStim(text='nowaythisinvocab')
    result = ext.transform(unk).to_df()
    assert result['embedding_dim10'][0] == 0.0

    # A custom unk_vector is used for out-of-vocabulary words
    ones = np.ones(100)
    ext = WordEmbeddingExtractor(join(TEXT_DIR, 'simple_vectors.bin'),
                                 binary=True, unk_vector=ones)
    result = ext.transform(unk).to_df()
    assert result['embedding_dim10'][0] == 1.0

    # unk_vector='random' draws values in [-1, 1]
    ext = WordEmbeddingExtractor(join(TEXT_DIR, 'simple_vectors.bin'),
                                 binary=True, unk_vector='random')
    result = ext.transform(unk).to_df()
    assert result['embedding_dim10'][0] <= 1.0
    assert result['embedding_dim10'][0] >= -1.0

    # Any other unk_vector string falls back to the zero vector
    ext = WordEmbeddingExtractor(join(TEXT_DIR, 'simple_vectors.bin'),
                                 binary=True, unk_vector='nothing')
    result = ext.transform(unk).to_df()
    assert result['embedding_dim10'][0] == 0.0
def test_word_embedding_extractor():
    pytest.importorskip('gensim')
    stims = [TextStim(text='this'), TextStim(text='sentence')]
    ext = WordEmbeddingExtractor(join(TEXT_DIR, 'simple_vectors.bin'),
                                 binary=True)
    result = merge_results(ext.transform(stims))
    assert ('WordEmbeddingExtractor', 'embedding_dim99') in result.columns
    # 'in' on a pandas Series checks the index, not the values, so compare
    # against the values directly
    assert np.isclose(result[('WordEmbeddingExtractor', 'embedding_dim0')],
                      0.0010911).any()
def test_vader_sentiment_extractor():
    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
    ext = VADERSentimentExtractor()
    result = ext.transform(stim).to_df()
    assert result['sentiment_neu'][0] == 0.752

    # Canonical example from the vaderSentiment documentation
    stim2 = TextStim(text='VADER is smart, handsome, and funny!')
    result2 = ext.transform(stim2).to_df()
    assert result2['sentiment_pos'][0] == 0.746
    assert result2['sentiment_neg'][0] == 0.0
    assert result2['sentiment_neu'][0] == 0.254
    assert result2['sentiment_compound'][0] == 0.8316
def extract_visual_semantics(visual_events, glove=True):
    # pd.DataFrame.from_csv was removed from pandas; use read_csv instead
    res = pd.read_csv(visual_events, index_col=0)
    onsets = res['onset']
    durations = res['duration']
    res = res.drop(['onset', 'duration', 'order', 'object_id'], axis=1)
    words = res.apply(lambda x: list(res.columns[x.values.astype('bool')]),
                      axis=1)
    texts = []
    for tags, o, d in zip(words, onsets, durations):
        for w in tags:
            # Slicing here to get rid of b''
            texts.append(TextStim(text=w[2:-1], onset=o, duration=d))

    if glove:
        ext = WordEmbeddingExtractor(GLOVE_PATH, binary=False)
        out = 'events/visual_glove_events.csv'
    else:
        ext = WordEmbeddingExtractor(WORD2VEC_PATH, binary=True)
        out = 'events/visual_semantic_events.csv'

    results = ext.transform(texts)
    res = merge_results(results, metadata=False, flatten_columns=True,
                        format='long')
    res = res.drop(['object_id', 'order', 'duration'], axis=1)
    res = res.groupby('onset').sum().reset_index()
    res['duration'] = durations
    res.rename(columns={'value': 'modulation', 'feature': 'trial_type'},
               inplace=True)
    res.to_csv(out)
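# A minimal usage sketch for extract_visual_semantics, not part of the
# original pipeline. It assumes a hypothetical 'events/visual_events.csv'
# file with onset/duration/order/object_id columns plus one boolean column
# per object tag, and a hypothetical GLOVE_PATH pointing at text-format
# GloVe vectors.
if __name__ == '__main__':
    GLOVE_PATH = '/data/embeddings/glove.42B.300d.txt'  # hypothetical path
    extract_visual_semantics('events/visual_events.csv', glove=True)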
def test_multiple_text_filters():
    stim = TextStim(text='testing the filtering features')
    filt1 = TokenizingFilter()
    filt2 = WordStemmingFilter()
    stemmed_tokens = filt2.transform(filt1.transform(stim))
    full_text = ' '.join([s.text for s in stemmed_tokens])
    assert full_text == 'test the filter featur'
def test_tfhub_text():
    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
    ext = TFHubTextExtractor(SENTENC_URL, output_key=None)
    df = ext.transform(stim).to_df()
    assert all([f'feature_{i}' in df.columns for i in range(512)])
    true = hub.KerasLayer(SENTENC_URL)([stim.text])[0, 10].numpy()
    assert np.isclose(df['feature_10'][0], true)
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
def test_indico_api_text_extractor():
    ext = IndicoAPITextExtractor(api_key=os.environ['INDICO_APP_KEY'],
                                 models=['emotion', 'personality'])

    # With ComplexTextStim input
    srtfile = join(get_test_data_path(), 'text', 'wonderful.srt')
    srt_stim = ComplexTextStim(srtfile, onset=4.2)
    result = merge_results(ext.transform(srt_stim), extractor_names=False)
    outdfKeysCheck = {
        'onset', 'duration', 'order', 'object_id',
        'emotion_anger', 'emotion_fear', 'emotion_joy',
        'emotion_sadness', 'emotion_surprise',
        'personality_openness', 'personality_extraversion',
        'personality_agreeableness', 'personality_conscientiousness'
    }
    meta_columns = {'source_file', 'history', 'class', 'filename'}
    assert set(result.columns) - {'stim_name'} == outdfKeysCheck | meta_columns
    assert result['onset'][1] == 92.622

    # With TextStim input
    ts = TextStim(text="It's a wonderful life.")
    result = ext.transform(ts).to_df(object_id=True)
    assert set(result.columns) == outdfKeysCheck
    assert len(result) == 1
def test_validation_levels(caplog):
    cache_default = config.get_option('cache_transformers')
    config.set_option('cache_transformers', False)

    ext = BrightnessExtractor()
    stim = TextStim(text='hello world')
    with pytest.raises(TypeError):
        ext.transform(stim)
    res = ext.transform(stim, validation='warn')
    log_message = caplog.records[0].message
    assert log_message == (
        "Transformers of type BrightnessExtractor can "
        "only be applied to stimuli of type(s) <class 'pliers"
        ".stimuli.image.ImageStim'> (not type TextStim), and no "
        "applicable Converter was found.")
    assert not res
    res = ext.transform(stim, validation='loose')
    assert not res
    stim2 = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))
    res = ext.transform([stim, stim2], validation='loose')
    assert len(res) == 1
    assert np.isclose(res[0].to_df()['brightness'][0], 0.88784294, 1e-5)

    config.set_option('cache_transformers', cache_default)
def test_spacy_token_extractor():
    stim = TextStim(text='This is a test.')
    ext = SpaCyExtractor(extractor_type='token')
    assert ext.model is not None

    ext2 = SpaCyExtractor(model='en_core_web_sm')
    assert isinstance(ext2.model, spacy.lang.en.English)

    result = ext.transform(stim).to_df()

    # Per-token expectations: text, lemma, POS, fine-grained tag,
    # dependency relation, orthographic shape, and stop-word status
    expected = [
        ('This', 'this', 'DET', 'DT', 'nsubj', 'Xxxx', 'True'),
        ('is', 'be', 'VERB', 'VBZ', 'ROOT', 'xx', 'True'),
        ('a', 'a', 'DET', 'DT', 'det', 'x', 'True'),
        ('test', 'test', 'NOUN', 'NN', 'attr', 'xxxx', 'False'),
    ]
    for i, (text, lemma, pos, tag, dep, shape, is_stop) in enumerate(expected):
        assert result['text'][i] == text
        assert result['lemma_'][i].lower() == lemma
        assert result['pos_'][i] == pos
        assert result['tag_'][i] == tag
        assert result['dep_'][i] == dep
        assert result['shape_'][i] == shape
        assert result['is_stop'][i] == is_stop
        # These hold for every token in the sentence
        assert result['is_alpha'][i] == 'True'
        assert result['is_punct'][i] == 'False'
        assert result['is_ascii'][i] == 'True'
        assert result['is_digit'][i] == 'False'
        assert result['sentiment'][i] == '0.0'
def test_indico_api_text_extractor():
    ext = IndicoAPITextExtractor(api_key=os.environ['INDICO_APP_KEY'],
                                 models=['emotion', 'personality'])

    # With ComplexTextStim input
    srtfile = join(get_test_data_path(), 'text', 'wonderful.srt')
    srt_stim = ComplexTextStim(srtfile, onset=4.2)
    result = ext.transform(srt_stim).to_df()
    outdfKeysCheck = {
        'onset', 'duration',
        'emotion_anger', 'emotion_fear', 'emotion_joy',
        'emotion_sadness', 'emotion_surprise',
        'personality_openness', 'personality_extraversion',
        'personality_agreeableness', 'personality_conscientiousness'
    }
    assert set(result.columns) == outdfKeysCheck
    assert result['onset'][1] == 92.622

    # With TextStim input
    ts = TextStim(text="It's a wonderful life.")
    result = ext.transform(ts).to_df()
    assert set(result.columns) == outdfKeysCheck
    assert len(result) == 1
def clean_transcript(input_transcript, input_media, onset=None, offset=None):
    stim = load_stims([input_media])[0]
    if not isinstance(stim, AudioStim):
        conv = VideoToAudioConverter()
        stim = conv.transform(stim)
        input_media = '/tmp/input_audio.wav'
        stim.save(input_media)

    _, extension = splitext(input_transcript)
    clean_transcript = '/tmp/clean_transcript.txt'
    with open(clean_transcript, 'w') as new_file:
        # splitext() returns the extension with its leading dot
        if extension == '.srt':
            txt = ComplexTextStim(input_transcript)
            for el in txt.elements:
                _clean_save(el.text, new_file, el.onset, el.duration)
        else:
            # Treat as a single block of text
            if onset is None or offset is None:
                raise ValueError("Onset and offset must be declared")
            txt = TextStim(input_transcript)
            _clean_save(txt.text, new_file, onset, stim.duration - offset)

    return clean_transcript, input_media
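# Hedged example of how clean_transcript might be invoked; 'speech.srt',
# 'speech.mp4', 'speech.txt', and 'speech.wav' are hypothetical input files.
# For a plain-text transcript, onset and offset are required to trim the
# media's start and end.
if __name__ == '__main__':
    # Timed .srt transcript paired with a video (converted to audio internally)
    transcript, audio = clean_transcript('speech.srt', 'speech.mp4')
    # Plain-text variant: start 5 s in, ignore the final 10 s
    transcript, audio = clean_transcript('speech.txt', 'speech.wav',
                                         onset=5.0, offset=10.0)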
def test_predefined_dictionary_extractor():
    stim = TextStim(text='enormous')
    td = PredefinedDictionaryExtractor(['aoa/Freq_pm'])
    result = td.transform(stim).to_df()
    assert result.shape == (1, 5)
    assert 'aoa_Freq_pm' in result.columns
    assert np.isclose(result['aoa_Freq_pm'][0], 10.313725, 1e-5)
def test_dictionary_extractor():
    td = DictionaryExtractor(join(TEXT_DIR, 'test_lexical_dictionary.txt'),
                             variables=['length', 'frequency'])
    assert td.data.shape == (7, 2)

    stim = TextStim(text='annotation')
    result = td.transform(stim).to_df()
    assert np.isnan(result['onset'][0])
    assert 'length' in result.columns
    assert result['length'][0] == 10

    stim2 = TextStim(text='some')
    result = td.transform(stim2).to_df()
    assert np.isnan(result['onset'][0])
    assert 'frequency' in result.columns
    assert np.isnan(result['frequency'][0])
def test_token_removal_filter():
    stim = TextStim(text='this is not a very long sentence')
    filt = TokenRemovalFilter()
    assert filt.transform(stim).text == 'long sentence'

    filt2 = TokenRemovalFilter(tokens=['a', 'the', 'is'])
    assert filt2.transform(stim).text == 'this not very long sentence'

    stim2 = TextStim(text='More. is Real, sentence that\'ll work')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    from nltk.corpus import stopwords
    tokens = set(stopwords.words('english')) | set(string.punctuation)
    filt3 = TokenRemovalFilter(tokens=tokens)
    assert filt3.transform(stim2).text == 'More Real sentence \'ll work'
def test_text_length_extractor():
    stim = TextStim(text='hello world', onset=4.2, duration=1)
    ext = LengthExtractor()
    result = ext.transform(stim).to_df()
    assert 'text_length' in result.columns
    assert result['text_length'][0] == 11
    assert result['onset'][0] == 4.2
    assert result['duration'][0] == 1
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter, fetching NLTK data if needed
    try:
        nltk.data.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
def test_metric_er_as_stim():
    stim = TextStim(text='This is a test')
    ext_sent = VADERSentimentExtractor()
    ext_metric = MetricExtractor(
        functions='numpy.sum',
        subset_idx=[f'sentiment_{d}' for d in ['neg', 'pos', 'neu']])
    r = ext_metric.transform(ext_sent.transform(stim))
    df = merge_results(r, extractor_names=False)
    assert np.isclose(df['sum'][0], 1)
def test_tfhub_text_one_feature():
    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
    cstim = ComplexTextStim(join(TEXT_DIR, 'wonderful.txt'))
    ext = TFHubTextExtractor(GNEWS_URL, output_key=None,
                             features='embedding')
    df = merge_results(ext.transform(cstim), extractor_names=False)
    assert df.shape[0] == len(cstim.elements)
    true = hub.KerasLayer(GNEWS_URL)([cstim.elements[3].text])[0, 2].numpy()
    assert np.isclose(df['embedding'][3][2], true)

    with pytest.raises(ValueError) as err:
        TFHubTextExtractor(GNEWS_URL, output_key='key').transform(stim)
    assert 'not a dictionary' in str(err.value)
def test_google_language_api_category_extractor():
    verify_dependencies(['googleapiclient'])
    ext = GoogleLanguageAPITextCategoryExtractor()
    stim = TextStim(join(TEXT_DIR, 'sample_text_with_entities.txt'))
    result = ext.transform(stim).to_df(timing=False, object_id='auto')
    assert result.shape == (1, 4)
    assert 'category_/Computers & Electronics' in result.columns
    assert result['category_/Computers & Electronics'][0] > 0.3
    assert 'category_/News' in result.columns
    assert result['category_/News'][0] > 0.3
    assert result['language'][0] == 'en'
def test_google_language_api_entity_sentiment_extractor():
    verify_dependencies(['googleapiclient'])
    ext = GoogleLanguageAPIEntitySentimentExtractor()
    stim = TextStim(join(TEXT_DIR, 'sample_text_with_entities.txt'))
    result = ext.transform(stim).to_df(timing=False, object_id='auto')

    # Produces same result as entity extractor with sentiment columns
    assert result.shape == (10, 11)
    assert result['text'][8] == 'phones'
    assert result['type'][8] == 'CONSUMER_GOOD'
    assert 'sentiment_score' in result.columns
    assert result['sentiment_score'][8] > 0.6  # 'love their ... phones'
def test_multiple_text_filters():
    stim = TextStim(text='testing the filtering features')
    filt1 = TokenizingFilter()
    filt2 = WordStemmingFilter()
    stemmed_tokens = filt2.transform(filt1.transform(stim))
    full_text = ' '.join([s.text for s in stemmed_tokens])
    assert full_text == 'test the filter featur'

    stim = TextStim(text='ARTICLE ONE: Rights')
    filt1 = LowerCasingFilter()
    filt2 = PunctuationRemovalFilter()
    filt3 = TokenizingFilter()
    final_texts = filt3.transform(filt2.transform(filt1.transform(stim)))
    assert len(final_texts) == 3
    assert final_texts[0].text == 'article'
    assert final_texts[0].order == 0
    assert final_texts[1].text == 'one'
    assert final_texts[2].text == 'rights'
    assert final_texts[2].order == 2
def test_vectorizer_extractor():
    pytest.importorskip('sklearn')
    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
    result = TextVectorizerExtractor().transform(stim).to_df()
    assert 'woman' in result.columns
    assert result['woman'][0] == 3

    # Test with a custom vectorizer instance
    from sklearn.feature_extraction.text import TfidfVectorizer
    custom_vectorizer = TfidfVectorizer()
    ext = TextVectorizerExtractor(vectorizer=custom_vectorizer)
    stim2 = TextStim(join(TEXT_DIR, 'simple_text.txt'))
    result = merge_results(ext.transform([stim, stim2]))
    assert ('TextVectorizerExtractor', 'woman') in result.columns
    # 'in' on a pandas Series checks the index, so test the values instead
    assert np.isclose(result[('TextVectorizerExtractor', 'woman')],
                      0.129568189476).any()

    # Test with a vectorizer named by string, plus extra init arguments
    ext = TextVectorizerExtractor(vectorizer='CountVectorizer',
                                  analyzer='char_wb', ngram_range=(2, 2))
    result = ext.transform(stim).to_df()
    assert 'wo' in result.columns
    assert result['wo'][0] == 6
def test_google_language_api_extractor():
    verify_dependencies(['googleapiclient'])
    ext = GoogleLanguageAPIExtractor(features=['classifyText',
                                               'extractEntities'])
    stim = TextStim(text='hello world')

    # Should fail because the document has too few tokens
    with pytest.raises(googleapiclient.errors.HttpError):
        ext.transform(stim)

    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
    result = ext.transform(stim).to_df(timing=False, object_id='auto')
    assert result.shape == (43, 10)
    assert 'category_/Books & Literature' in result.columns
    assert result['category_/Books & Literature'][0] > 0.5

    irene = result[result['text'] == 'Irene Adler']
    assert (irene['type'] == 'PERSON').all()
    assert not irene['metadata_wikipedia_url'].isna().any()

    # Document row shouldn't have entity features, and vice versa
    assert np.isnan(result.iloc[0]['text'])
    assert np.isnan(result.iloc[1]['category_/Books & Literature']).all()
def test_google_language_api_sentiment_extractor():
    verify_dependencies(['googleapiclient'])
    ext = GoogleLanguageAPISentimentExtractor()
    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
    result = ext.transform(stim).to_df(timing=False, object_id='auto')
    assert result.shape == (12, 7)
    assert 'sentiment_magnitude' in result.columns
    assert 'text' in result.columns
    doc_sentiment = result['sentiment_score'][11]
    assert -0.3 < doc_sentiment < 0.3
    assert result['begin_char_index'][7] == 565.0
    assert result['end_char_index'][7] == 672.0
    assert result['sentiment_magnitude'][7] > 0.6
def test_google_language_api_entity_extractor():
    verify_dependencies(['googleapiclient'])
    ext = GoogleLanguageAPIEntityExtractor()
    stim = TextStim(join(TEXT_DIR, 'sample_text_with_entities.txt'))
    result = ext.transform(stim).to_df(timing=False, object_id='auto')
    assert result.shape == (10, 9)
    assert result['text'][0] == 'Google'
    assert result['type'][0] == 'ORGANIZATION'
    assert 0.0 < result['salience'][0] < 0.5
    assert result['begin_char_index'][4] == 165.0
    assert result['end_char_index'][4] == 172.0
    assert result['text'][4] == 'Android'
    assert result['type'][4] == 'CONSUMER_GOOD'
def test_save():
    text_dir = join(get_test_data_path(), 'text')
    complextext_stim = ComplexTextStim(join(text_dir,
                                            'complex_stim_no_header.txt'),
                                       columns='ot', default_duration=0.2)
    text_stim = TextStim(text='hello')
    video_stim = VideoStim(join(get_test_data_path(), 'video', 'small.mp4'))
    audio_stim = AudioStim(join(get_test_data_path(), 'audio', 'crowd.mp3'))
    image_stim = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))
    stims = [complextext_stim, text_stim, video_stim, audio_stim, image_stim]
    for s in stims:
        path = tempfile.mktemp() + s._default_file_extension
        s.save(path)
        assert exists(path)
        os.remove(path)
def test_save():
    cts_file = join(get_test_data_path(), 'text',
                    'complex_stim_no_header.txt')
    complextext_stim = ComplexTextStim(cts_file, columns='ot',
                                       default_duration=0.2)
    text_stim = TextStim(text='hello')
    audio_stim = AudioStim(join(get_test_data_path(), 'audio', 'crowd.mp3'))
    image_stim = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))

    # Video gives travis problems
    stims = [complextext_stim, text_stim, audio_stim, image_stim]
    for s in stims:
        path = tempfile.mktemp() + s._default_file_extension
        s.save(path)
        assert exists(path)
        os.remove(path)
def parse_textgrid(transcript_path):
    with open(transcript_path) as f:
        start_parse = False  # Indicates we are on the 'word' portion of output
        all_lines = f.readlines()
        texts = []
        for i, line in enumerate(all_lines):
            if line == '\titem [2]:\n':
                start_parse = True
            if start_parse and line.startswith('\t\t\ti'):
                onset = float(all_lines[i + 1].split()[-1])
                duration = float(all_lines[i + 2].split()[-1]) - onset
                text = str(all_lines[i + 3].split()[-1])[1:-1].lower()
                if text != 'sp':  # Space/hesitation in audio
                    texts.append(
                        TextStim(text=text, onset=onset, duration=duration))
    return texts
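# A short sketch of feeding the parsed word stims into a pliers extractor;
# 'aligned.TextGrid' is a hypothetical forced-alignment output file, and
# LengthExtractor/merge_results are the pliers utilities used elsewhere in
# this suite.
if __name__ == '__main__':
    word_stims = parse_textgrid('aligned.TextGrid')
    ext = LengthExtractor()
    df = merge_results(ext.transform(word_stims), extractor_names=False)
    print(df[['onset', 'duration', 'text_length']].head())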
def test_remote_stims():
    url = 'http://www.obamadownloads.com/videos/iran-deal-speech.mp4'
    video = VideoStim(url=url)
    assert video.fps == 12

    url = 'http://www.bobainsworth.com/wav/simpsons/themodyn.wav'
    audio = AudioStim(url=url)
    assert round(audio.duration) == 3

    url = 'https://www.whitehouse.gov/sites/whitehouse.gov/files/images/twitter_cards_potus.jpg'
    image = ImageStim(url=url)
    assert image.data.shape == (240, 240, 3)

    url = 'https://github.com/tyarkoni/pliers/blob/master/README.md'
    text = TextStim(url=url)
    assert len(text.text) > 1