def text_featurize(filename, jsondump):

    # transcribe with sphinx
    # (the original hard-coded 'test.wav' here; using the filename argument instead)
    transcript = ts.transcribe_sphinx(filename)

    # now put the transcript through various feature engines
    nltk_featureset, nltk_labels = nf.nltk_featurize(transcript)
    spacy_featureset, spacy_labels = spf.spacy_featurize(transcript)

    # make gensim embedding on the Alice in Wonderland text
    # (or any text corpus you'd like)
    modelname = 'alice.pickle'
    if modelname not in os.listdir():
        text = open('alice.txt').read()
        gf.w2v_train(text, 100, modelname)
    gensim_featureset = gf.sentence_embedding(transcript, 100, modelname)

    data = {
        'transcript': transcript,
        'transcript type': 'sphinx',
        'nltk': np.array(nltk_featureset).tolist(),
        'spacy': np.array(spacy_featureset).tolist(),
        'gensim': np.array(gensim_featureset).tolist(),
    }

    if jsondump:
        jsonfilename = filename[0:-4] + '.json'
        jsonfile = open(jsonfilename, 'w')
        json.dump(data, jsonfile)
        jsonfile.close()

    return data
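# A minimal usage sketch (not from the original source), assuming the
# featurization modules used above (ts, nf, spf, gf) are importable and
# 'alice.txt' sits in the working directory:
data = text_featurize('test.wav', jsondump=True)
print(data['transcript'])
print(len(data['nltk']), len(data['spacy']), len(data['gensim']))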
def audio_featurize(feature_set, audiofile, transcript):

    # long conditional covering all the supported feature sets;
    # dispatches to the matching feature engine
    if feature_set == 'librosa_features':
        features, labels = lf.librosa_featurize(audiofile, False)
    elif feature_set == 'standard_features':
        features, labels = sf.standard_featurize(audiofile)
    elif feature_set == 'audioset_features':
        features, labels = af.audioset_featurize(audiofile, basedir, foldername)
    elif feature_set == 'sox_features':
        features, labels = soxf.sox_featurize(audiofile)
    elif feature_set == 'sa_features':
        features, labels = saf.sa_featurize(audiofile)
    elif feature_set == 'pyaudio_features':
        features, labels = pf.pyaudio_featurize(audiofile, basedir)
    elif feature_set == 'spectrogram_features':
        features, labels = specf.spectrogram_featurize(audiofile)
    elif feature_set == 'meta_features':
        features, labels = mf.meta_featurize(audiofile, cur_dir, help_dir)
    elif feature_set == 'opensmile_features':
        features, labels = osm.opensmile_featurize(audiofile, basedir, 'GeMAPSv01a.conf')
    elif feature_set == 'praat_features':
        features, labels = prf.praat_featurize(audiofile)
    elif feature_set == 'pspeech_features':
        features, labels = psf.pspeech_featurize(audiofile)
    elif feature_set == 'specimage_features':
        features, labels = sif.specimage_featurize(audiofile, cur_dir, haar_dir)
    elif feature_set == 'specimage2_features':
        features, labels = sif2.specimage2_featurize(audiofile, cur_dir, haar_dir)
    elif feature_set == 'myprosody_features':
        print('Myprosody features are coming soon!! Currently debugging this feature set.')
        # features, labels = mpf.myprosody_featurize(audiofile, cur_dir, help_dir)
    elif feature_set == 'nltk_features':
        features, labels = nf.nltk_featurize(transcript)
    elif feature_set == 'mixed_features':
        features, labels = mixf.mixed_featurize(audiofile, transcript, help_dir)
    elif feature_set == 'audiotext_features':
        features, labels = atf.audiotext_featurize(audiofile, transcript)
    elif feature_set == 'prosody_features':
        features, labels = prosf.prosody_featurize(audiofile, 20)
    elif feature_set == 'pyworld_features':
        features, labels = pywf.pyworld_featurize(audiofile)

    # make sure the features do not contain any infinity or NaN values
    features = np.nan_to_num(np.array(features))
    features = features.tolist()

    return features, labels
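# A minimal dispatch sketch (an assumption, not from the original source):
# transcribe once, then route the same file through a chosen feature engine.
transcript = ts.transcribe_sphinx('test.wav')
features, labels = audio_featurize('librosa_features', 'test.wav', transcript)
print(len(features), len(labels))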
def audiotext_featurize(wavfile, transcript):

    # get features
    # librosa_features, librosa_labels = lf.librosa_featurize(wavfile, False)
    nltk_features, nltk_labels = nf.nltk_featurize(transcript)
    textacy_features, textacy_labels = tfe.textacy_featurize(transcript)
    spacy_features, spacy_labels = spf.spacy_featurize(transcript)
    text_features, text_labels = tfea.text_featurize(transcript)

    # note: text_features/text_labels are computed but never appended here;
    # the revised version below fixes this
    # features = np.append(np.array(librosa_features), np.array(nltk_features))
    features = np.append(np.array(nltk_features), np.array(textacy_features))
    features = np.append(features, np.array(spacy_features))
    labels = nltk_labels + textacy_labels + spacy_labels
    # labels = librosa_labels + nltk_labels + textacy_labels + spacy_labels

    return features, labels
def audiotext_featurize(wavfile, transcript):

    # get features
    nltk_features, nltk_labels = nf.nltk_featurize(transcript)
    textacy_features, textacy_labels = tfe.textacy_featurize(transcript)
    spacy_features, spacy_labels = spf.spacy_featurize(transcript)
    text_features, text_labels = tfea.text_featurize(transcript)

    # concatenate feature arrays
    features = np.append(np.array(nltk_features), np.array(textacy_features))
    features = np.append(features, np.array(spacy_features))
    features = np.append(features, np.array(text_features))

    # concatenate labels
    labels = nltk_labels + textacy_labels + spacy_labels + text_labels

    return features, labels
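# Since np.append flattens everything into a single 1-D vector, a quick
# sanity check (not in the original source) is that the feature vector and
# label list stay the same length after concatenation:
transcript = ts.transcribe_sphinx('test.wav')
features, labels = audiotext_featurize('test.wav', transcript)
assert len(features) == len(labels)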
def mixed_featurize(wavfile, transcript, help_dir):
    print(os.getcwd())
    g = json.load(open(help_dir + '/mixed/mixed_feature_0.json'))
    labels = g['labels']
    inds = g['mixed_inds']

    # get features
    librosa_features, librosa_labels = lf.librosa_featurize(wavfile, False)
    nltk_features, nltk_labels = nf.nltk_featurize(transcript)

    features = list()
    for j in range(len(inds)):
        # inds stores [nltk index, librosa index] pairs; look up the
        # underlying feature values (the original divided the raw indices
        # themselves, which appears to be a bug)
        nltk_feature = nltk_features[inds[j][0]]
        librosa_feature = librosa_features[inds[j][1]]
        try:
            feature = nltk_feature / librosa_feature
        except:
            # put zero value if the feature is not available
            feature = 0
        features.append(feature)

    return features, labels
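# Hypothetical illustration of the structure mixed_featurize() expects in
# help_dir + '/mixed/mixed_feature_0.json' (inferred from the code above,
# not copied from the original source): one label per engineered ratio and
# one [nltk index, librosa index] pair per label.
example = {
    'labels': ['nltk_feature_3 / librosa_feature_17'],
    'mixed_inds': [[3, 17]],
}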
def audiotext_featurize(wavfile, transcript):
    try:
        # get features, falling back to a zero-filled vector (with the
        # engine's known label set) whenever an individual engine fails
        try:
            nltk_features, nltk_labels = nf.nltk_featurize(transcript)
        except:
            nltk_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                           'space', 'numbers', 'capletters', 'cc', 'cd', 'dt', 'ex', 'in',
                           'jj', 'jjr', 'jjs', 'ls', 'md', 'nn', 'nnp', 'nns', 'pdt', 'pos',
                           'prp', 'prp2', 'rbr', 'rbs', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg',
                           'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb', 'polarity', 'subjectivity', 'repeat']
            nltk_features = list(np.zeros(len(nltk_labels)))
        try:
            textacy_features, textacy_labels = tfe.textacy_featurize(transcript)
        except:
            textacy_labels = ['uniquewords', 'n_sents', 'n_words', 'n_chars', 'n_syllables',
                              'n_unique_words', 'n_long_words', 'n_monosyllable_words',
                              'n_polysyllable_words', 'flesch_kincaid_grade_level',
                              'flesch_kincaid_grade_level', 'flesch_reading_ease', 'smog_index',
                              'gunning_fog_index', 'coleman_liau_index',
                              'automated_readability_index', 'lix', 'gulpease_index',
                              'wiener_sachtextformel']
            textacy_features = list(np.zeros(len(textacy_labels)))
        try:
            spacy_features, spacy_labels = spf.spacy_featurize(transcript)
        except:
            spacy_labels = ['PROPN', 'ADP', 'DET', 'NUM', 'PUNCT', 'SPACE', 'VERB', 'NOUN', 'ADV',
                            'CCONJ', 'PRON', 'ADJ', 'SYM', 'PART', 'INTJ', 'X', 'pos_other',
                            'NNP', 'IN', 'DT', 'CD', 'NNPS', ',', '_SP', 'VBZ', 'NN', 'RB', 'CC', '',
                            'NNS', '.', 'PRP', 'MD', 'VB', 'HYPH', 'VBD', 'JJ', ':', '-LRB-', '$',
                            '-RRB-', 'VBG', 'VBN', 'NFP', 'RBR', 'POS', 'VBP', 'RP', 'JJS', 'PRP$',
                            'EX', 'JJR', 'WP', 'WDT', 'TO', 'WRB', "''", '``', 'PDT', 'AFX', 'RBS',
                            'UH', 'WP$', 'FW', 'XX', 'SYM', 'LS', 'ADD', 'tag_other',
                            'compound', 'ROOT', 'prep', 'det', 'pobj', 'nummod', 'punct', '',
                            'nsubj', 'advmod', 'cc', 'conj', 'aux', 'dobj', 'nmod', 'acl', 'appos',
                            'npadvmod', 'amod', 'agent', 'case', 'intj', 'prt', 'pcomp', 'ccomp',
                            'attr', 'dep', 'acomp', 'poss', 'auxpass', 'expl', 'mark', 'nsubjpass',
                            'quantmod', 'advcl', 'relcl', 'oprd', 'neg', 'xcomp', 'csubj', 'predet',
                            'parataxis', 'dative', 'preconj', 'csubjpass', 'meta', 'dep_other',
                            '\ufeffXxx', 'Xxxxx', 'XXxxx', 'xx', 'X', 'Xxxx', 'Xxx', ',', '\n\n',
                            'xXxxx', 'xxx', 'xxxx', '\n', '.', ' ', '-', 'xxx.xxxx.xxx', '\n\n\n',
                            ':', '\n ', 'dddd', '[', '#', 'dd', ']', 'd', 'XXX-d', '*', 'XXXX',
                            'XX', 'XXX', '\n\n\n\n', 'Xx', '\n\n\n ', '--', '\n\n ', ' ', ' ', ' ',
                            "'x", 'x', 'X.', 'xxx--', ';', 'Xxx.', '(', ')', "'", '“', '”', 'Xx.',
                            '!', "'xx", 'xx!--Xxx', "x'xxxx", '?', '_', "x'x", "x'xx", "Xxx'xxxx",
                            'Xxxxx--', 'xxxx--', '--xxxx', 'X--', 'xx--', 'xxxx”--xxx', 'xxx--“xxxx',
                            "Xxx'x", ';--', 'xxx--_xxx', "xxx'x", 'xxx!--xxxx', 'xxxx?--_Xxx',
                            "Xxxxx'x", 'xxxx--“xxxx', "xxxx'xxx", '--Xxxxx', ',--', '?--', 'xx--“xx',
                            'xx!--X', '.--', 'xxx--“xxx', ':--', 'Xxxxx--“xxxx', 'xxxx!--xxxx',
                            'xx”--xxx', 'xxxx--_xxx', 'xxxx--“xxx', '--xx', '--X', 'xxxx!--Xxx',
                            '--xxx', 'xxx_.', 'xxxx--_xx', 'xxxx--_xx_xxxx', 'xx!--xxxx',
                            'xxxx!--xx', "X'xx", "xxxx'x", "X_'x", "xxx'xxx", '--Xxxx', "X'Xxxxx",
                            "Xx'xxxx", '--Xxx', 'xxxx”--xxxx', 'xxxx!--', 'xxxx--“x', 'Xxxx!--Xxxx',
                            'xxx!--Xxx', 'Xxxxx.', 'xxxx_.', 'xx--“Xxxx', '\n\n ', 'Xxxxx”--xxx',
                            'xxxx”--xx', 'xxxx--“xx', "Xxxxx!--Xxx'x", "X'xxxx", 'Xxxxx?--', '--Xx',
                            'xxxx!”--Xx', "xxxx--“X'x", "xxxx'", 'xxx.--“Xxxx', 'xxxx--“X',
                            'xxxx!--X', 'Xxx”--xx', 'xxx”--xxx', 'xxx-_xxx', "x'Xxxxx", 'Xxxxx!--X',
                            'Xxxxx!--Xxx', 'dd-d.xxx', 'xxxx://xxx.xxxx.xxx/d/dd/', 'xXxxxx',
                            'xxxx://xxxx.xxx/xxxx', 'd.X.', '/', 'd.X.d', 'd.X', '%', 'Xd',
                            'xxxx://xxx.xxxx.xxx', 'ddd(x)(d', 'X.X.', 'ddd', '*****@*****.**',
                            'xxxx://xxxx.xxx', '$', 'd,ddd', 'shape_other',
                            'mean sentence polarity', 'std sentence polarity',
                            'max sentence polarity', 'min sentence polarity',
                            'median sentence polarity', 'mean sentence subjectivity',
                            'std sentence subjectivity', 'max sentence subjectivity',
                            'min sentence subjectivity', 'median sentence subjectivity',
                            'character count', 'word count', 'sentence number',
                            'words per sentence', 'unique chunk noun text',
                            'unique chunk root text', 'unique chunk root head text',
                            'chunkdep ROOT', 'chunkdep pobj', 'chunkdep nsubj', 'chunkdep dobj',
                            'chunkdep conj', 'chunkdep appos', 'chunkdep attr',
                            'chunkdep nsubjpass', 'chunkdep dative', 'chunkdep pcomp',
                            'number of named entities', 'PERSON', 'NORP', 'FAC', 'ORG', 'GPE',
                            'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE',
                            'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
            spacy_features = list(np.zeros(len(spacy_labels)))
        try:
            text_features, text_labels = tfea.text_featurize(transcript)
        except:
            text_labels = ['filler ratio', 'type token ratio', 'standardized word entropy',
                           'question ratio', 'number ratio', 'Brunets Index', 'Honores statistic',
                           'datewords freq', 'word number', 'five word count', 'max word length',
                           'min word length', 'variance of vocabulary', 'std of vocabulary',
                           'sentencenum', 'periods', 'questions', 'interjections', 'repeatavg']
            text_features = list(np.zeros(len(text_labels)))

        # concatenate feature arrays
        features = np.append(np.array(nltk_features), np.array(textacy_features))
        features = np.append(features, np.array(spacy_features))
        features = np.append(features, np.array(text_features))

        # concatenate labels
        labels = nltk_labels + textacy_labels + spacy_labels + text_labels
    except:
        # if everything fails, return a zero-filled vector over the full
        # combined label set (nltk + textacy + spacy + text, in that order)
        labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                  'space', 'numbers', 'capletters', 'cc', 'cd', 'dt', 'ex', 'in',
                  'jj', 'jjr', 'jjs', 'ls', 'md', 'nn', 'nnp', 'nns', 'pdt', 'pos',
                  'prp', 'prp2', 'rbr', 'rbs', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg',
                  'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb', 'polarity', 'subjectivity', 'repeat',
                  'uniquewords', 'n_sents', 'n_words', 'n_chars', 'n_syllables',
                  'n_unique_words', 'n_long_words', 'n_monosyllable_words',
                  'n_polysyllable_words', 'flesch_kincaid_grade_level',
                  'flesch_kincaid_grade_level', 'flesch_reading_ease', 'smog_index',
                  'gunning_fog_index', 'coleman_liau_index', 'automated_readability_index',
                  'lix', 'gulpease_index', 'wiener_sachtextformel',
                  'PROPN', 'ADP', 'DET', 'NUM', 'PUNCT', 'SPACE', 'VERB', 'NOUN', 'ADV',
                  'CCONJ', 'PRON', 'ADJ', 'SYM', 'PART', 'INTJ', 'X', 'pos_other',
                  'NNP', 'IN', 'DT', 'CD', 'NNPS', ',', '_SP', 'VBZ', 'NN', 'RB', 'CC', '',
                  'NNS', '.', 'PRP', 'MD', 'VB', 'HYPH', 'VBD', 'JJ', ':', '-LRB-', '$',
                  '-RRB-', 'VBG', 'VBN', 'NFP', 'RBR', 'POS', 'VBP', 'RP', 'JJS', 'PRP$',
                  'EX', 'JJR', 'WP', 'WDT', 'TO', 'WRB', "''", '``', 'PDT', 'AFX', 'RBS',
                  'UH', 'WP$', 'FW', 'XX', 'SYM', 'LS', 'ADD', 'tag_other',
                  'compound', 'ROOT', 'prep', 'det', 'pobj', 'nummod', 'punct', '',
                  'nsubj', 'advmod', 'cc', 'conj', 'aux', 'dobj', 'nmod', 'acl', 'appos',
                  'npadvmod', 'amod', 'agent', 'case', 'intj', 'prt', 'pcomp', 'ccomp',
                  'attr', 'dep', 'acomp', 'poss', 'auxpass', 'expl', 'mark', 'nsubjpass',
                  'quantmod', 'advcl', 'relcl', 'oprd', 'neg', 'xcomp', 'csubj', 'predet',
                  'parataxis', 'dative', 'preconj', 'csubjpass', 'meta', 'dep_other',
                  '\ufeffXxx', 'Xxxxx', 'XXxxx', 'xx', 'X', 'Xxxx', 'Xxx', ',', '\n\n',
                  'xXxxx', 'xxx', 'xxxx', '\n', '.', ' ', '-', 'xxx.xxxx.xxx', '\n\n\n',
                  ':', '\n ', 'dddd', '[', '#', 'dd', ']', 'd', 'XXX-d', '*', 'XXXX',
                  'XX', 'XXX', '\n\n\n\n', 'Xx', '\n\n\n ', '--', '\n\n ', ' ', ' ', ' ',
                  "'x", 'x', 'X.', 'xxx--', ';', 'Xxx.', '(', ')', "'", '“', '”', 'Xx.',
                  '!', "'xx", 'xx!--Xxx', "x'xxxx", '?', '_', "x'x", "x'xx", "Xxx'xxxx",
                  'Xxxxx--', 'xxxx--', '--xxxx', 'X--', 'xx--', 'xxxx”--xxx', 'xxx--“xxxx',
                  "Xxx'x", ';--', 'xxx--_xxx', "xxx'x", 'xxx!--xxxx', 'xxxx?--_Xxx',
                  "Xxxxx'x", 'xxxx--“xxxx', "xxxx'xxx", '--Xxxxx', ',--', '?--', 'xx--“xx',
                  'xx!--X', '.--', 'xxx--“xxx', ':--', 'Xxxxx--“xxxx', 'xxxx!--xxxx',
                  'xx”--xxx', 'xxxx--_xxx', 'xxxx--“xxx', '--xx', '--X', 'xxxx!--Xxx',
                  '--xxx', 'xxx_.', 'xxxx--_xx', 'xxxx--_xx_xxxx', 'xx!--xxxx',
                  'xxxx!--xx', "X'xx", "xxxx'x", "X_'x", "xxx'xxx", '--Xxxx', "X'Xxxxx",
                  "Xx'xxxx", '--Xxx', 'xxxx”--xxxx', 'xxxx!--', 'xxxx--“x', 'Xxxx!--Xxxx',
                  'xxx!--Xxx', 'Xxxxx.', 'xxxx_.', 'xx--“Xxxx', '\n\n ', 'Xxxxx”--xxx',
                  'xxxx”--xx', 'xxxx--“xx', "Xxxxx!--Xxx'x", "X'xxxx", 'Xxxxx?--', '--Xx',
                  'xxxx!”--Xx', "xxxx--“X'x", "xxxx'", 'xxx.--“Xxxx', 'xxxx--“X',
                  'xxxx!--X', 'Xxx”--xx', 'xxx”--xxx', 'xxx-_xxx', "x'Xxxxx", 'Xxxxx!--X',
                  'Xxxxx!--Xxx', 'dd-d.xxx', 'xxxx://xxx.xxxx.xxx/d/dd/', 'xXxxxx',
                  'xxxx://xxxx.xxx/xxxx', 'd.X.', '/', 'd.X.d', 'd.X', '%', 'Xd',
                  'xxxx://xxx.xxxx.xxx', 'ddd(x)(d', 'X.X.', 'ddd', '*****@*****.**',
                  'xxxx://xxxx.xxx', '$', 'd,ddd', 'shape_other',
                  'mean sentence polarity', 'std sentence polarity', 'max sentence polarity',
                  'min sentence polarity', 'median sentence polarity',
                  'mean sentence subjectivity', 'std sentence subjectivity',
                  'max sentence subjectivity', 'min sentence subjectivity',
                  'median sentence subjectivity', 'character count', 'word count',
                  'sentence number', 'words per sentence', 'unique chunk noun text',
                  'unique chunk root text', 'unique chunk root head text', 'chunkdep ROOT',
                  'chunkdep pobj', 'chunkdep nsubj', 'chunkdep dobj', 'chunkdep conj',
                  'chunkdep appos', 'chunkdep attr', 'chunkdep nsubjpass', 'chunkdep dative',
                  'chunkdep pcomp', 'number of named entities', 'PERSON', 'NORP', 'FAC',
                  'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE',
                  'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL',
                  'filler ratio', 'type token ratio', 'standardized word entropy',
                  'question ratio', 'number ratio', 'Brunets Index', 'Honores statistic',
                  'datewords freq', 'word number', 'five word count', 'max word length',
                  'min word length', 'variance of vocabulary', 'std of vocabulary',
                  'sentencenum', 'periods', 'questions', 'interjections', 'repeatavg']
        features = list(np.zeros(len(labels)))

    return features, labels
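# The zero-fill fallbacks above keep the output length fixed even when a
# feature engine crashes, so feature arrays stay aligned across samples.
# A minimal check (an assumption, not from the original source):
features, labels = audiotext_featurize('test.wav', '')
assert len(features) == len(labels)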
def tesseract_featurize(imgfile):
    # can stitch across an entire length of video frames too
    transcript = transcribe_image(imgfile)
    features, labels = nf.nltk_featurize(transcript)
    return transcript, features, labels
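# transcribe_image() is referenced above but not shown; a minimal sketch of
# one possible implementation using pytesseract (an assumption, not
# necessarily the original helper):
import pytesseract
from PIL import Image

def transcribe_image(imgfile):
    # run Tesseract OCR on the image file and return the recognized text
    return pytesseract.image_to_string(Image.open(imgfile))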
def prev_dir(directory):
    # strip the last folder from a directory path
    # (function header reconstructed from the surrounding fragment)
    g = directory.split('/')
    dir_ = ''
    for i in range(len(g)):
        if i != len(g) - 1:
            if i == 0:
                dir_ = dir_ + g[i]
            else:
                dir_ = dir_ + '/' + g[i]
    # print(dir_)
    return dir_

directory = os.getcwd()
prevdir = prev_dir(directory)
sys.path.append(prevdir + '/text_features')
import nltk_features as nf

# get features
librosa_features, librosa_labels = lf.librosa_featurize('test.wav', False)
transcript = ts.transcribe_sphinx('test.wav')
nltk_features, nltk_labels = nf.nltk_featurize(transcript)

# relate some features to each other:
# engineer random features by dividing one by another and making new labels
# (the loop below collects mixed labels until 100 have been generated)
mixed_features = list()
mixed_labels = list()
mixed_inds = list()

for i in range(5):
    while len(mixed_labels) < 100:
        # get some random features from both text and audio
        i1 = random.randint(0, len(librosa_features) - 1)
        label_1 = librosa_labels[i1]
        feature_1 = librosa_features[i1]
        i2 = random.randint(0, len(nltk_features) - 1)
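# The script is truncated above; a hypothetical sketch (not from the
# original source) of how the engineered labels and index pairs could be
# serialized so mixed_featurize() can reload them — the filename matches
# the one read in mixed_featurize():
data = {'labels': mixed_labels, 'mixed_inds': mixed_inds}
with open('mixed_feature_0.json', 'w') as f:
    json.dump(data, f)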