def text_featurize(filename, jsondump):
    """Transcribe an audio file with PocketSphinx and featurize the
    transcript with the NLTK, spaCy and gensim feature engines.

    Args:
        filename: path to the audio file to transcribe; also used to name
            the optional JSON dump (4-char extension replaced by '.json').
        jsondump: if truthy, write the feature dict to that JSON file.

    Returns:
        dict with the transcript, transcript type ('sphinx') and the
        nltk / spacy / gensim feature lists.
    """
    # transcribe with sphinx
    # (was hard-coded to 'test.wav', silently ignoring the filename argument)
    transcript = ts.transcribe_sphinx(filename)
    # now put transcript through various feature engines
    nltk_featureset, nltk_labels = nf.nltk_featurize(transcript)
    spacy_featureset, spacy_labels = spf.spacy_featurize(transcript)
    # make gensim embedding on alice and wonderland text
    # (or any text corpus you'd like); train once and reuse the pickle
    modelname = 'alice.pickle'
    if modelname not in os.listdir():
        with open('alice.txt') as corpus:
            text = corpus.read()
        gf.w2v_train(text, 100, modelname)
    gensim_featureset = gf.sentence_embedding(transcript, 100, modelname)

    data = {
        'transcript': transcript,
        'transcript type': 'sphinx',
        'nltk': np.array(nltk_featureset).tolist(),
        'spacy': np.array(spacy_featureset).tolist(),
        'gensim': np.array(gensim_featureset).tolist(),
    }

    if jsondump:
        jsonfilename = filename[0:-4] + '.json'
        # context manager closes the file even if json.dump raises
        with open(jsonfilename, 'w') as jsonfile:
            json.dump(data, jsonfile)

    return data
def audio_featurize(feature_set, audiofile, transcript):
    """Featurize an audio file (and, for text-based sets, its transcript)
    with the requested feature engine.

    Args:
        feature_set: name of the engine, e.g. 'librosa_features'.
        audiofile: path to the audio file to featurize.
        transcript: transcript text, used by the text/mixed feature sets.

    Returns:
        (features, labels): feature values with NaN/inf replaced by finite
        numbers (via np.nan_to_num), and the matching label names.

    Raises:
        ValueError: if feature_set is not a recognized engine name.

    NOTE(review): several branches read module-level globals (basedir,
    foldername, cur_dir, help_dir, haar_dir) defined elsewhere in this
    module -- confirm they are set before calling.
    """
    # long conditional on all the types of features that can happen and
    # featurizes accordingly.
    if feature_set == 'librosa_features':
        features, labels = lf.librosa_featurize(audiofile, False)
    elif feature_set == 'standard_features':
        features, labels = sf.standard_featurize(audiofile)
    elif feature_set == 'audioset_features':
        features, labels = af.audioset_featurize(audiofile, basedir, foldername)
    elif feature_set == 'sox_features':
        features, labels = soxf.sox_featurize(audiofile)
    elif feature_set == 'sa_features':
        features, labels = saf.sa_featurize(audiofile)
    elif feature_set == 'pyaudio_features':
        features, labels = pf.pyaudio_featurize(audiofile, basedir)
    elif feature_set == 'spectrogram_features':
        features, labels = specf.spectrogram_featurize(audiofile)
    elif feature_set == 'meta_features':
        features, labels = mf.meta_featurize(audiofile, cur_dir, help_dir)
    elif feature_set == 'opensmile_features':
        features, labels = osm.opensmile_featurize(audiofile, basedir, 'GeMAPSv01a.conf')
    elif feature_set == 'praat_features':
        features, labels = prf.praat_featurize(audiofile)
    elif feature_set == 'pspeech_features':
        features, labels = psf.pspeech_featurize(audiofile)
    elif feature_set == 'specimage_features':
        features, labels = sif.specimage_featurize(audiofile, cur_dir, haar_dir)
    elif feature_set == 'specimage2_features':
        features, labels = sif2.specimage2_featurize(audiofile, cur_dir, haar_dir)
    elif feature_set == 'myprosody_features':
        print('Myprosody features are coming soon!! Currently debugging this feature set.')
        # features, labels = mpf.myprosody_featurize(audiofile, cur_dir, help_dir)
        # BUG FIX: this branch used to fall through with 'features' unbound,
        # crashing with NameError below; return empty results until the
        # engine is ready.
        features, labels = [], []
    elif feature_set == 'nltk_features':
        features, labels = nf.nltk_featurize(transcript)
    elif feature_set == 'mixed_features':
        features, labels = mixf.mixed_featurize(audiofile, transcript, help_dir)
    elif feature_set == 'audiotext_features':
        features, labels = atf.audiotext_featurize(audiofile, transcript)
    elif feature_set == 'prosody_features':
        features, labels = prosf.prosody_featurize(audiofile, 20)
    elif feature_set == 'pyworld_features':
        features, labels = pywf.pyworld_featurize(audiofile)
    else:
        # BUG FIX: an unrecognized name previously crashed with NameError;
        # fail loudly and clearly instead.
        raise ValueError('unknown feature_set: %s' % feature_set)

    # make sure all the features do not have any infinity or NaN
    features = np.nan_to_num(np.array(features))
    features = features.tolist()

    return features, labels
# Example #3 (scraped-listing artifact; commented out to keep the file parseable)
def audiotext_featurize(wavfile, transcript):
    """Featurize a transcript with the NLTK, textacy and spaCy engines.

    Args:
        wavfile: path to the audio file (currently unused here; the librosa
            branch below is commented out).
        transcript: transcript text to featurize.

    Returns:
        (features, labels): concatenated numpy feature vector and the
        matching list of label names.
    """
    # get features
    # librosa_features, librosa_labels=lf.librosa_featurize(wavfile, False)
    nltk_features, nltk_labels = nf.nltk_featurize(transcript)
    textacy_features, textacy_labels = tfe.textacy_featurize(transcript)
    spacy_features, spacy_labels = spf.spacy_featurize(transcript)
    # NOTE(review): a tfea.text_featurize(transcript) call used to run here
    # but its result was never appended to features/labels; removed the
    # wasted work -- the returned values are unchanged.

    # features=np.append(np.array(librosa_features), np.array(nltk_features))
    features = np.append(np.array(nltk_features), np.array(textacy_features))
    features = np.append(features, np.array(spacy_features))
    labels = nltk_labels + textacy_labels + spacy_labels
    # labels=librosa_labels+nltk_labels+textacy_labels+spacy_features

    return features, labels
# Example #4 (scraped-listing artifact; commented out to keep the file parseable)
def audiotext_featurize(wavfile, transcript):
    """Combine NLTK, textacy, spaCy and text-statistic features for one
    transcript into a single feature vector with matching labels.

    Args:
        wavfile: path to the audio file (not used by these text engines).
        transcript: transcript text to featurize.

    Returns:
        (features, labels): 1-D numpy array of all feature values and the
        concatenated label names, in nltk/textacy/spacy/text order.
    """
    # run each text-based feature engine on the transcript
    nltk_features, nltk_labels = nf.nltk_featurize(transcript)
    textacy_features, textacy_labels = tfe.textacy_featurize(transcript)
    spacy_features, spacy_labels = spf.spacy_featurize(transcript)
    text_features, text_labels = tfea.text_featurize(transcript)

    # stitch the per-engine vectors into one flat array
    features = np.concatenate([
        np.array(nltk_features),
        np.array(textacy_features),
        np.array(spacy_features),
        np.array(text_features),
    ])

    # label order mirrors the feature order above
    labels = nltk_labels + textacy_labels + spacy_labels + text_labels

    return features, labels
# Example #5 (scraped-listing artifact; commented out to keep the file parseable)
def mixed_featurize(wavfile, transcript, help_dir):
    """Build 'mixed' features by dividing NLTK text features by librosa
    audio features, using the index pairs stored in the helper JSON.

    Args:
        wavfile: path to the audio file for the librosa featurizer.
        transcript: transcript text for the NLTK featurizer.
        help_dir: directory containing mixed/mixed_feature_0.json, whose
            'labels' and 'mixed_inds' entries define the output schema --
            each ind is assumed to be a (nltk index, librosa index) pair.

    Returns:
        (features, labels): one ratio per index pair, and the label names
        read from the JSON file.
    """
    print(os.getcwd())
    # close the helper file instead of leaking the handle
    with open(help_dir + '/mixed/mixed_feature_0.json') as fh:
        g = json.load(fh)

    labels = g['labels']
    inds = g['mixed_inds']

    # get features
    librosa_features, librosa_labels = lf.librosa_featurize(wavfile, False)
    nltk_features, nltk_labels = nf.nltk_featurize(transcript)
    features = list()

    for pair in inds:
        nltk_ind = pair[0]
        librosa_ind = pair[1]
        try:
            # BUG FIX: the original divided the stored *indices* themselves
            # and never used the computed feature arrays; look up the actual
            # feature values instead.
            feature = nltk_features[nltk_ind] / librosa_features[librosa_ind]
        except (ZeroDivisionError, IndexError):
            # put zero value if the feature is not available
            feature = 0
        features.append(feature)

    return features, labels
def audiotext_featurize(wavfile, transcript):
    """Featurize a transcript with NLTK, textacy, spaCy and text-statistic
    engines, falling back to zero-filled vectors with a fixed label schema
    whenever an engine fails, so the output length is always the same.

    Args:
        wavfile: path to the audio file (not used by these text engines).
        transcript: transcript text to featurize.

    Returns:
        (features, labels): concatenated feature values (numpy array, or a
        plain list of zeros if the outer fallback fires) and label names,
        in nltk/textacy/spacy/text order.
    """

    try:
        # get features
        # each engine is wrapped individually: on failure, substitute a
        # zero vector matching that engine's known label schema
        try:
            nltk_features, nltk_labels = nf.nltk_featurize(transcript)
        except:
            # fallback schema for the NLTK engine
            nltk_labels = [
                'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                'y', 'z', 'space', 'numbers', 'capletters', 'cc', 'cd', 'dt',
                'ex', 'in', 'jj', 'jjr', 'jjs', 'ls', 'md', 'nn', 'nnp', 'nns',
                'pdt', 'pos', 'prp', 'prp2', 'rbr', 'rbs', 'rp', 'to', 'uh',
                'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb',
                'polarity', 'subjectivity', 'repeat'
            ]
            nltk_features = list(np.zeros(len(nltk_labels)))

        try:
            textacy_features, textacy_labels = tfe.textacy_featurize(
                transcript)
        except:
            # fallback schema for the textacy readability/statistics engine
            # NOTE(review): 'flesch_kincaid_grade_level' appears twice --
            # presumably mirroring the engine's real output; confirm.
            textacy_labels = [
                'uniquewords', 'n_sents', 'n_words', 'n_chars', 'n_syllables',
                'n_unique_words', 'n_long_words', 'n_monosyllable_words',
                'n_polysyllable_words', 'flesch_kincaid_grade_level',
                'flesch_kincaid_grade_level', 'flesch_reading_ease',
                'smog_index', 'gunning_fog_index', 'coleman_liau_index',
                'automated_readability_index', 'lix', 'gulpease_index',
                'wiener_sachtextformel'
            ]
            textacy_features = list(np.zeros(len(textacy_labels)))

        try:
            spacy_features, spacy_labels = spf.spacy_featurize(transcript)
        except:
            # fallback schema for the spaCy engine: POS tags, fine-grained
            # tags, dependency relations, token shapes, sentence-level
            # polarity/subjectivity stats and named-entity counts
            spacy_labels = [
                'PROPN', 'ADP', 'DET', 'NUM', 'PUNCT', 'SPACE', 'VERB', 'NOUN',
                'ADV', 'CCONJ', 'PRON', 'ADJ', 'SYM', 'PART', 'INTJ', 'X',
                'pos_other', 'NNP', 'IN', 'DT', 'CD', 'NNPS', ',', '_SP',
                'VBZ', 'NN', 'RB', 'CC', '', 'NNS', '.', 'PRP', 'MD', 'VB',
                'HYPH', 'VBD', 'JJ', ':', '-LRB-', '$', '-RRB-', 'VBG', 'VBN',
                'NFP', 'RBR', 'POS', 'VBP', 'RP', 'JJS', 'PRP$', 'EX', 'JJR',
                'WP', 'WDT', 'TO', 'WRB', "''", '``', 'PDT', 'AFX', 'RBS',
                'UH', 'WP$', 'FW', 'XX', 'SYM', 'LS', 'ADD', 'tag_other',
                'compound', 'ROOT', 'prep', 'det', 'pobj', 'nummod', 'punct',
                '', 'nsubj', 'advmod', 'cc', 'conj', 'aux', 'dobj', 'nmod',
                'acl', 'appos', 'npadvmod', 'amod', 'agent', 'case', 'intj',
                'prt', 'pcomp', 'ccomp', 'attr', 'dep', 'acomp', 'poss',
                'auxpass', 'expl', 'mark', 'nsubjpass', 'quantmod', 'advcl',
                'relcl', 'oprd', 'neg', 'xcomp', 'csubj', 'predet',
                'parataxis', 'dative', 'preconj', 'csubjpass', 'meta',
                'dep_other', '\ufeffXxx', 'Xxxxx', 'XXxxx', 'xx', 'X', 'Xxxx',
                'Xxx', ',', '\n\n', 'xXxxx', 'xxx', 'xxxx', '\n', '.', ' ',
                '-', 'xxx.xxxx.xxx', '\n\n\n', ':', '\n    ', 'dddd', '[', '#',
                'dd', ']', 'd', 'XXX-d', '*', 'XXXX', 'XX', 'XXX', '\n\n\n\n',
                'Xx', '\n\n\n    ', '--', '\n\n    ', '    ', '   ', '  ',
                "'x", 'x', 'X.', 'xxx--', ';', 'Xxx.', '(', ')', "'", '“', '”',
                'Xx.', '!', "'xx", 'xx!--Xxx', "x'xxxx", '?', '_', "x'x",
                "x'xx", "Xxx'xxxx", 'Xxxxx--', 'xxxx--', '--xxxx', 'X--',
                'xx--', 'xxxx”--xxx', 'xxx--“xxxx', "Xxx'x", ';--',
                'xxx--_xxx', "xxx'x", 'xxx!--xxxx', 'xxxx?--_Xxx', "Xxxxx'x",
                'xxxx--“xxxx', "xxxx'xxx", '--Xxxxx', ',--', '?--', 'xx--“xx',
                'xx!--X', '.--', 'xxx--“xxx', ':--', 'Xxxxx--“xxxx',
                'xxxx!--xxxx', 'xx”--xxx', 'xxxx--_xxx', 'xxxx--“xxx', '--xx',
                '--X', 'xxxx!--Xxx', '--xxx', 'xxx_.', 'xxxx--_xx',
                'xxxx--_xx_xxxx', 'xx!--xxxx', 'xxxx!--xx', "X'xx", "xxxx'x",
                "X_'x", "xxx'xxx", '--Xxxx', "X'Xxxxx", "Xx'xxxx", '--Xxx',
                'xxxx”--xxxx', 'xxxx!--', 'xxxx--“x', 'Xxxx!--Xxxx',
                'xxx!--Xxx', 'Xxxxx.', 'xxxx_.', 'xx--“Xxxx', '\n\n   ',
                'Xxxxx”--xxx', 'xxxx”--xx', 'xxxx--“xx', "Xxxxx!--Xxx'x",
                "X'xxxx", 'Xxxxx?--', '--Xx', 'xxxx!”--Xx', "xxxx--“X'x",
                "xxxx'", 'xxx.--“Xxxx', 'xxxx--“X', 'xxxx!--X', 'Xxx”--xx',
                'xxx”--xxx', 'xxx-_xxx', "x'Xxxxx", 'Xxxxx!--X', 'Xxxxx!--Xxx',
                'dd-d.xxx', 'xxxx://xxx.xxxx.xxx/d/dd/', 'xXxxxx',
                'xxxx://xxxx.xxx/xxxx', 'd.X.', '/', 'd.X.d', 'd.X', '%', 'Xd',
                'xxxx://xxx.xxxx.xxx', 'ddd(x)(d', 'X.X.', 'ddd',
                '*****@*****.**', 'xxxx://xxxx.xxx', '$', 'd,ddd',
                'shape_other', 'mean sentence polarity',
                'std sentence polarity', 'max sentence polarity',
                'min sentence polarity', 'median sentence polarity',
                'mean sentence subjectivity', 'std sentence subjectivity',
                'max sentence subjectivity', 'min sentence subjectivity',
                'median sentence subjectivity', 'character count',
                'word count', 'sentence number', 'words per sentence',
                'unique chunk noun text', 'unique chunk root text',
                'unique chunk root head text', 'chunkdep ROOT',
                'chunkdep pobj', 'chunkdep nsubj', 'chunkdep dobj',
                'chunkdep conj', 'chunkdep appos', 'chunkdep attr',
                'chunkdep nsubjpass', 'chunkdep dative', 'chunkdep pcomp',
                'number of named entities', 'PERSON', 'NORP', 'FAC', 'ORG',
                'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW',
                'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY',
                'ORDINAL', 'CARDINAL'
            ]
            spacy_features = list(np.zeros(len(spacy_labels)))

        try:
            text_features, text_labels = tfea.text_featurize(transcript)
        except:
            # fallback schema for the text-statistics engine
            text_labels = [
                'filler ratio', 'type token ratio',
                'standardized word entropy', 'question ratio', 'number ratio',
                'Brunets Index', 'Honores statistic', 'datewords freq',
                'word number', 'five word count', 'max word length',
                'min word length', 'variance of vocabulary',
                'std of vocabulary', 'sentencenum', 'periods', 'questions',
                'interjections', 'repeatavg'
            ]
            text_features = list(np.zeros(len(text_labels)))

        # concatenate feature arrays
        features = np.append(np.array(nltk_features),
                             np.array(textacy_features))
        features = np.append(features, np.array(spacy_features))
        features = np.append(features, np.array(text_features))

        # concatenate labels
        labels = nltk_labels + textacy_labels + spacy_labels + text_labels
    except:
        # last-resort fallback: the full combined schema (all four engines'
        # labels in order) with an all-zero feature list of matching length
        labels = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
            'space', 'numbers', 'capletters', 'cc', 'cd', 'dt', 'ex', 'in',
            'jj', 'jjr', 'jjs', 'ls', 'md', 'nn', 'nnp', 'nns', 'pdt', 'pos',
            'prp', 'prp2', 'rbr', 'rbs', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg',
            'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb', 'polarity',
            'subjectivity', 'repeat', 'uniquewords', 'n_sents', 'n_words',
            'n_chars', 'n_syllables', 'n_unique_words', 'n_long_words',
            'n_monosyllable_words', 'n_polysyllable_words',
            'flesch_kincaid_grade_level', 'flesch_kincaid_grade_level',
            'flesch_reading_ease', 'smog_index', 'gunning_fog_index',
            'coleman_liau_index', 'automated_readability_index', 'lix',
            'gulpease_index', 'wiener_sachtextformel', 'PROPN', 'ADP', 'DET',
            'NUM', 'PUNCT', 'SPACE', 'VERB', 'NOUN', 'ADV', 'CCONJ', 'PRON',
            'ADJ', 'SYM', 'PART', 'INTJ', 'X', 'pos_other', 'NNP', 'IN', 'DT',
            'CD', 'NNPS', ',', '_SP', 'VBZ', 'NN', 'RB', 'CC', '', 'NNS', '.',
            'PRP', 'MD', 'VB', 'HYPH', 'VBD', 'JJ', ':', '-LRB-', '$', '-RRB-',
            'VBG', 'VBN', 'NFP', 'RBR', 'POS', 'VBP', 'RP', 'JJS', 'PRP$',
            'EX', 'JJR', 'WP', 'WDT', 'TO', 'WRB', "''", '``', 'PDT', 'AFX',
            'RBS', 'UH', 'WP$', 'FW', 'XX', 'SYM', 'LS', 'ADD', 'tag_other',
            'compound', 'ROOT', 'prep', 'det', 'pobj', 'nummod', 'punct', '',
            'nsubj', 'advmod', 'cc', 'conj', 'aux', 'dobj', 'nmod', 'acl',
            'appos', 'npadvmod', 'amod', 'agent', 'case', 'intj', 'prt',
            'pcomp', 'ccomp', 'attr', 'dep', 'acomp', 'poss', 'auxpass',
            'expl', 'mark', 'nsubjpass', 'quantmod', 'advcl', 'relcl', 'oprd',
            'neg', 'xcomp', 'csubj', 'predet', 'parataxis', 'dative',
            'preconj', 'csubjpass', 'meta', 'dep_other', '\ufeffXxx', 'Xxxxx',
            'XXxxx', 'xx', 'X', 'Xxxx', 'Xxx', ',', '\n\n', 'xXxxx', 'xxx',
            'xxxx', '\n', '.', ' ', '-', 'xxx.xxxx.xxx', '\n\n\n', ':',
            '\n    ', 'dddd', '[', '#', 'dd', ']', 'd', 'XXX-d', '*', 'XXXX',
            'XX', 'XXX', '\n\n\n\n', 'Xx', '\n\n\n    ', '--', '\n\n    ',
            '    ', '   ', '  ', "'x", 'x', 'X.', 'xxx--', ';', 'Xxx.', '(',
            ')', "'", '“', '”', 'Xx.', '!', "'xx", 'xx!--Xxx', "x'xxxx", '?',
            '_', "x'x", "x'xx", "Xxx'xxxx", 'Xxxxx--', 'xxxx--', '--xxxx',
            'X--', 'xx--', 'xxxx”--xxx', 'xxx--“xxxx', "Xxx'x", ';--',
            'xxx--_xxx', "xxx'x", 'xxx!--xxxx', 'xxxx?--_Xxx', "Xxxxx'x",
            'xxxx--“xxxx', "xxxx'xxx", '--Xxxxx', ',--', '?--', 'xx--“xx',
            'xx!--X', '.--', 'xxx--“xxx', ':--', 'Xxxxx--“xxxx', 'xxxx!--xxxx',
            'xx”--xxx', 'xxxx--_xxx', 'xxxx--“xxx', '--xx', '--X',
            'xxxx!--Xxx', '--xxx', 'xxx_.', 'xxxx--_xx', 'xxxx--_xx_xxxx',
            'xx!--xxxx', 'xxxx!--xx', "X'xx", "xxxx'x", "X_'x", "xxx'xxx",
            '--Xxxx', "X'Xxxxx", "Xx'xxxx", '--Xxx', 'xxxx”--xxxx', 'xxxx!--',
            'xxxx--“x', 'Xxxx!--Xxxx', 'xxx!--Xxx', 'Xxxxx.', 'xxxx_.',
            'xx--“Xxxx', '\n\n   ', 'Xxxxx”--xxx', 'xxxx”--xx', 'xxxx--“xx',
            "Xxxxx!--Xxx'x", "X'xxxx", 'Xxxxx?--', '--Xx', 'xxxx!”--Xx',
            "xxxx--“X'x", "xxxx'", 'xxx.--“Xxxx', 'xxxx--“X', 'xxxx!--X',
            'Xxx”--xx', 'xxx”--xxx', 'xxx-_xxx', "x'Xxxxx", 'Xxxxx!--X',
            'Xxxxx!--Xxx', 'dd-d.xxx', 'xxxx://xxx.xxxx.xxx/d/dd/', 'xXxxxx',
            'xxxx://xxxx.xxx/xxxx', 'd.X.', '/', 'd.X.d', 'd.X', '%', 'Xd',
            'xxxx://xxx.xxxx.xxx', 'ddd(x)(d', 'X.X.', 'ddd', '*****@*****.**',
            'xxxx://xxxx.xxx', '$', 'd,ddd', 'shape_other',
            'mean sentence polarity', 'std sentence polarity',
            'max sentence polarity', 'min sentence polarity',
            'median sentence polarity', 'mean sentence subjectivity',
            'std sentence subjectivity', 'max sentence subjectivity',
            'min sentence subjectivity', 'median sentence subjectivity',
            'character count', 'word count', 'sentence number',
            'words per sentence', 'unique chunk noun text',
            'unique chunk root text', 'unique chunk root head text',
            'chunkdep ROOT', 'chunkdep pobj', 'chunkdep nsubj',
            'chunkdep dobj', 'chunkdep conj', 'chunkdep appos',
            'chunkdep attr', 'chunkdep nsubjpass', 'chunkdep dative',
            'chunkdep pcomp', 'number of named entities', 'PERSON', 'NORP',
            'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART',
            'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY',
            'ORDINAL', 'CARDINAL', 'filler ratio', 'type token ratio',
            'standardized word entropy', 'question ratio', 'number ratio',
            'Brunets Index', 'Honores statistic', 'datewords freq',
            'word number', 'five word count', 'max word length',
            'min word length', 'variance of vocabulary', 'std of vocabulary',
            'sentencenum', 'periods', 'questions', 'interjections', 'repeatavg'
        ]

        features = list(np.zeros(len(labels)))

    return features, labels
# Example #7 (scraped-listing artifact; commented out to keep the file parseable)
def tesseract_featurize(imgfile):
	"""OCR an image with transcribe_image() and run the NLTK featurizer
	on the recognized text.

	Args:
		imgfile: path to the image to transcribe.

	Returns:
		(transcript, features, labels): the OCR text plus its NLTK
		feature values and label names.
	"""
	# could also be stitched across an entire length of video frames
	ocr_text = transcribe_image(imgfile)
	feats, feat_labels = nf.nltk_featurize(ocr_text)

	return ocr_text, feats, feat_labels
                dir_ = dir_ + g[i]
            else:
                dir_ = dir_ + '/' + g[i]
    # print(dir_)
    return dir_


# script body: compute audio + text features for 'test.wav' and prepare
# accumulators for randomly engineered "mixed" features
directory = os.getcwd()
prevdir = prev_dir(directory)
# make the sibling text_features package importable before importing from it
sys.path.append(prevdir + '/text_features')
import nltk_features as nf

# get features
librosa_features, librosa_labels = lf.librosa_featurize('test.wav', False)
transcript = ts.transcribe_sphinx('test.wav')
nltk_features, nltk_labels = nf.nltk_featurize(transcript)

# relate some features to each other
# engineer 10 random features by dividing them and making new labels
# accumulators: engineered values, their labels, and the
# (nltk index, librosa index) pairs they were built from
mixed_features = list()
mixed_labels = list()
mixed_inds = list()

for i in range(5):
    while len(mixed_labels) < 100:

        # get some random features from both text and audio
        i1 = random.randint(0, len(librosa_features) - 1)
        label_1 = librosa_labels[i1]
        feature_1 = librosa_features[i1]
        i2 = random.randint(0, len(nltk_features) - 1)