Ejemplo n.º 1
0
def trainRanker(training_data_file):
    """Extract training features and fit a ranking model with svm-rank.

    Reads the context/question list from ``training_data_file`` with
    ``readFile``, lets ``FE.extractFeatures`` turn it into a feature
    matrix, hands both to ``writeFeatureMatrixToFile`` under the
    ``'train'`` tag, and finally runs the external ``svm_rank_learn``
    executable through the shell.

    Nothing is returned; the exit status of the learner is not inspected.
    """
    samples = readFile(training_data_file)
    features = FE.extractFeatures(samples)
    writeFeatureMatrixToFile(features, samples, 'train')

    # Train the model using svm-rank.
    # NOTE(review): presumably writeFeatureMatrixToFile(..., 'train') is what
    # produces models/train.dat -- confirm against that helper.
    learner = "svm_rank/svm_rank_learn"
    train_vectors = "models/train.dat"
    model_out = "models/model.dat"

    # The doubled space before "-t" is intentional: it keeps the command
    # line character-for-character what it has always been.
    command = "%s -c 0.001  -t 1 %s %s" % (learner, train_vectors, model_out)

    os.system(command)
Ejemplo n.º 2
0
def answerQues(doc, nlp):
    """Answer the question held in ``doc``, printing per-stage timings.

    The stages run in a fixed order: question classification, keyword
    extraction, document ranking and answer search. Each stage prints its
    wall-clock duration; the first three also print their result.

    Args:
        doc: The question object passed to ``classifyQuestion`` and
            ``extractFeatures``.
        nlp: Forwarded unchanged to ``possibleAnswers``.

    Returns:
        Whatever ``possibleAnswers`` returns.
    """
    # Stage 1: question classification (the timing includes the print).
    began = time()
    qclass = classifyQuestion(doc)
    print("\nQclass: ", qclass)
    print("Classification Total Time Taken: ", time() - began)

    # Stage 2: keyword extraction.
    began = time()
    keywords = extractFeatures(doc)
    print("\nKeywords: ", keywords)
    print("Feature exraction Total Time Taken: ", time() - began)

    # Stage 3: document ranking; only the first three entries are shown.
    began = time()
    docRank = rankDocs(keywords)
    print("\nDocRank: ", docRank[:3])
    print("Doc scoring Total Time Taken: ", time() - began)

    # Stage 4: answer search.
    began = time()
    answers = possibleAnswers(keywords, docRank, nlp)
    print("\nFinding Answers Total Time Taken: ", time() - began)

    return answers
Ejemplo n.º 3
0
def answerQues(question):
    """Answer a question given as plain text.

    Loads the ``en_core_web_sm`` pipeline, parses ``question`` with it and
    runs classification, keyword extraction, document ranking and answer
    search on the parsed result.

    Args:
        question: The question text; it is concatenated onto ``u''``
            before parsing.

    Returns:
        Whatever ``possibleAnswers`` returns.
    """
    nlp = en_core_web_sm.load()
    parsed = nlp(u'' + question)

    # The classification result is not consulted below; the call is kept
    # so the sequence of helper calls stays exactly as it was.
    classifyQuestion(parsed)

    keywords = extractFeatures(parsed)
    ranking = rankDocs(keywords)
    return possibleAnswers(keywords, ranking, nlp)
Ejemplo n.º 4
0
def main():
    """Classify the bundled test recording with the stored DNN and SVM.

    Loads the pickled DNN, SVM and scaler that sit next to this script,
    extracts segment features from ``testWavs/keaton.wav``, turns the
    per-segment DNN class probabilities into one utterance-level feature
    vector (per-class max, min, mean and thresholded sum) and prints the
    SVM's class probabilities for that vector.
    """
    baseDir = os.path.dirname(os.path.realpath(__file__))

    # load dnn params
    dnn = joblib.load(os.path.join(baseDir, "dnnParameters.sav"))
    print("DNN loaded...")

    # load svm params
    svm = joblib.load(os.path.join(baseDir, "svmParameters.sav"))

    # load test file
    testFile = "keaton.wav"
    testFilePath = os.path.join(baseDir, "testWavs", testFile)
    audioRate, audioData = wavfile.read(testFilePath)
    frameFeatureMatrix = extractFeatures(audioData, audioRate)
    topSegmentIndices = getTopEnergySegmentsIndices(audioData, audioRate)
    topSegmentFeatureMatrix = getSegmentFeaturesUsingIndices(
        frameFeatureMatrix, 25, topSegmentIndices)
    print("Segments generated...")

    # normalize the data
    scaler = joblib.load(os.path.join(baseDir, "scalerParameters.sav"))
    topSegmentFeatureMatrix = scaler.transform(topSegmentFeatureMatrix)
    print("Data normalized...")

    # for each segment generate the probability distribution
    # (one row per segment, one column per class)
    segmentProbabilities = dnn.predict_proba(topSegmentFeatureMatrix)

    # utterance-level statistics, taken per class across all segments
    feat1 = np.amax(segmentProbabilities, axis=0)
    feat2 = np.amin(segmentProbabilities, axis=0)
    feat3 = np.mean(segmentProbabilities, axis=0)

    # Per class: sum of that class's own probabilities above the threshold,
    # divided by the number of segments. Each class is masked with its OWN
    # column (the earlier code masked class 1's column with the masks of
    # classes 2 and 3).
    # NOTE(review): this sums the probabilities rather than counting the
    # segments; left as is because the stored SVM was presumably trained on
    # the same definition -- confirm against the training script.
    threshold = 0.2
    numSegments = len(segmentProbabilities)
    feat4 = np.array([
        np.sum(classProbs[classProbs > threshold]) / numSegments
        for classProbs in segmentProbabilities.T
    ])

    featureVector = np.hstack([feat1, feat2, feat3, feat4])
    print("feature vector : ")
    print(featureVector)

    # One utterance is ONE sample with many features, so the row shape is
    # (1, n_features); reshape(-1, 1) would present every feature as its
    # own single-feature sample.
    probs = svm.predict_proba(featureVector.reshape(1, -1))
    print(probs)
Ejemplo n.º 5
0
def main():
    """Plot the DNN's per-segment class confidences for one test wav.

    The wav name is taken from ``sys.argv[1]`` and looked up in the
    ``testWavs`` directory next to this script. Segment features are
    scaled with the stored scaler, scored by the stored DNN, converted to
    percentages, printed, and drawn as one line per class.
    """
    baseDir = os.path.dirname(os.path.realpath(__file__))

    # Stored network parameters
    dnn = joblib.load(baseDir + "/dnnParameters.sav")
    print("DNN loaded...")

    # Test recording named on the command line
    wavName = sys.argv[1]
    wavPath = baseDir + "/testWavs/" + wavName
    audioRate, audioData = wavfile.read(wavPath)
    frameFeatures = extractFeatures(audioData, audioRate)
    segmentIndices = getTopEnergySegmentsIndices(audioData, audioRate)
    segmentFeatures = getSegmentFeaturesUsingIndices(
        frameFeatures, 25, segmentIndices)
    print("Segments generated...")

    # Scale with the stored scaler
    scaler = joblib.load(baseDir + "/scalerParameters.sav")
    segmentFeatures = scaler.transform(segmentFeatures)
    print("Data normalized...")

    # Per-segment class probabilities, expressed in percent
    percentages = dnn.predict_proba(segmentFeatures) * 100

    print(
        str(percentages) + ", samples : " +
        str(len(percentages)))

    # Slice out the four class columns before any drawing starts
    columns = [percentages[:, classIndex] for classIndex in range(4)]
    styles = (
        ("neu", "red"),
        ("sad_fea", "blue"),
        ("ang_fru", "green"),
        ("hap_exc_sur", "black"),
    )

    # One curve per class, x axis numbered from 1
    plt.xlabel("Samples")
    plt.ylabel("Confidence")
    for column, (label, color) in zip(columns, styles):
        plt.plot(range(1, len(column) + 1), column, label=label, color=color)
    plt.legend(loc="upper left")
    plt.show()
Ejemplo n.º 6
0
def predict():
    """Score the question pair in the request and return it as JSON.

    Reads ``ques1`` and ``ques2`` from the query string, runs
    ``extractFeatures`` on them, then loads ``quora_features_test.csv``
    from ``APP_DATA`` and reports selected features from its first row
    next to the output of ``predictProbabilityForDifferentFeatures``.

    Returns:
        ``jsonify`` of a two-element list: the prediction object and a
        dict mapping feature name to its value as a string. Values below
        1.0 are multiplied by 100 first.
    """
    first_question = request.args.get('ques1')
    second_question = request.args.get('ques2')
    # NOTE(review): presumably this call is what (re)writes the CSV read
    # just below -- confirm in extractFeatures.
    extractFeatures(first_question, second_question)

    features_frame = pd.read_csv(
        os.path.join(APP_DATA, 'quora_features_test.csv'))
    prediction = predictProbabilityForDifferentFeatures()

    reported = (
        'fuzz_qratio', 'fuzz_WRatio', 'wmd', 'norm_wmd', 'cosine_distance',
        'jaccard_distance', 'euclidean_distance', 'braycurtis_distance',
        'cosSim',
    )

    first_row = features_frame.iloc[0]
    feature_values = {}
    for name in reported:
        value = first_row[name]
        # Values under 1.0 are scaled by 100; everything else is passed
        # through untouched.
        feature_values[name] = str(value * 100 if value < 1.0 else value)

    return jsonify([prediction, feature_values])
Ejemplo n.º 7
0
def main():
    """Print the predicted emotion for every file in ``testWavs``.

    For each file, segment features are extracted, scaled with the stored
    scaler and scored by the stored DNN. The reported emotion is the class
    with the highest mean probability across the file's segments.
    """
    emotions = ['neu', 'sad_fea', 'ang_fru', 'hap_exc_sur']

    DIR = os.path.dirname(os.path.realpath(__file__))

    # load wav
    WAVS_DIR = os.path.join(DIR, "testWavs")
    testWavs = glob.glob(WAVS_DIR + "/*")

    # The scaler and the DNN are the same for every file, so they are read
    # from disk once rather than once per loop iteration.
    scaler = joblib.load(DIR + "/scalerParameters.sav")
    dnn = joblib.load(DIR + "/dnnParameters.sav")

    threshold = 0.5

    for wavPath in testWavs:
        # detect features
        audioRate, audioData = wavfile.read(wavPath)
        frameFeatureMatrix = extractFeatures(audioData, audioRate)
        topSegmentIndices = getTopEnergySegmentsIndices(audioData, audioRate)
        topSegmentFeatureMatrix = getSegmentFeaturesUsingIndices(
            frameFeatureMatrix, 25, topSegmentIndices)

        # normalize data
        topSegmentFeatureMatrix = scaler.transform(topSegmentFeatureMatrix)

        # generate probabilities with DNN (row per segment, column per class)
        segmentProbabilities = dnn.predict_proba(topSegmentFeatureMatrix)

        # create high level features
        feat1 = np.amax(segmentProbabilities, axis=0)
        feat2 = np.amin(segmentProbabilities, axis=0)
        feat3 = np.mean(segmentProbabilities, axis=0)

        # Each class is masked with its OWN probability column (the earlier
        # code reused class 1's column for classes 2 and 3).
        numSegments = len(segmentProbabilities)
        feat4 = np.array([
            np.sum(classProbs[classProbs > threshold]) / numSegments
            for classProbs in segmentProbabilities.T
        ])

        # Only needed by the SVM path, which is currently disabled. To
        # enable it, load svmParameters.sav and call
        # svm.predict(featureVector.reshape(1, -1)).
        featureVector = np.hstack([feat1, feat2, feat3, feat4])

        # pick the class with the highest mean probability
        emotionLabelNum = np.argmax(feat3)

        # display result; basename/splitext cope with any path separator
        # and any extension length
        wavName = os.path.splitext(os.path.basename(wavPath))[0]
        print(wavName + " ---> " + emotions[emotionLabelNum])
Ejemplo n.º 8
0
def testRanker(test_data_file):
    """Extract test features and score them with the trained svm-rank model.

    Reads the context/question list from ``test_data_file`` with
    ``readFile``, lets ``FE.extractFeatures`` build the feature matrix,
    hands both to ``writeFeatureMatrixToFile`` under the ``"test"`` tag,
    and runs the external ``svm_rank_classify`` executable through the
    shell.

    Nothing is returned; the exit status of the classifier is not
    inspected.
    """
    samples = readFile(test_data_file)
    features = FE.extractFeatures(samples)
    writeFeatureMatrixToFile(features, samples, "test")

    # Argument order on the command line: feature vectors, model, output
    command = " ".join([
        "svm_rank/svm_rank_classify",
        "models/test.dat",
        "models/model.dat",
        "result/prediction.txt",
    ])

    os.system(command)
Ejemplo n.º 9
0
def main():
    """Build ``datasetForDNN.csv`` from the wav files under ``dataset/``.

    Every sub-directory of ``dataset/`` is treated as one emotion label and
    is given the next integer label number in the order ``glob`` returns
    the directories. For each wav file the top-energy segment features are
    written one segment per row as::

        <utterance file name>,<feature 1>,...,<feature n>,<label number>

    A 50-character progress bar and the number of segments written are
    printed per emotion.
    """
    baseDir = os.path.dirname(os.path.realpath(__file__))

    # path to audio file dataset
    AUDIO_DATASET = baseDir + "/dataset/"

    # path to feature extracted dataset for DNN
    CSV_DATASET = baseDir + "/datasetForDNN.csv"

    # maintain a dictionary for emotion label and labelNumber
    emotions = {}
    emotionLabelNum = 0  # this will be needed to set the target in csv file

    # NOTE(review): glob order is filesystem dependent, so the label
    # numbers are too. Deliberately not sorted here, since consumers may
    # rely on the numbering an earlier run produced -- confirm before
    # changing.
    emotionDirPaths = glob.glob(AUDIO_DATASET + "*")

    progressBarWidth = 50

    # "with" guarantees the csv is flushed and closed even if a wav file
    # fails to load part-way through.
    with open(CSV_DATASET, "w") as csv_dataset:
        for emotionDirPath in emotionDirPaths:
            # keep a count of segment per emotions
            countSegmentsPerEmotion = 0

            # emotionLabel is also the directory name in the dataset directory
            emotionLabel = os.path.basename(emotionDirPath)

            # set the emotion label
            emotions[emotionLabel] = emotionLabelNum

            # for all files in the emotionLabel directory, generate csv data
            print("Generating csv data for : " + emotionLabel)

            wavFilesPath = os.path.join(AUDIO_DATASET, emotionLabel, "*")

            # setup progressBar
            sys.stdout.write("[%s]" % (" " * progressBarWidth))
            sys.stdout.flush()
            # return to start of line, after '['
            sys.stdout.write("\b" * (progressBarWidth + 1))

            wavFiles = glob.glob(wavFilesPath)
            numberOfWavFiles = len(wavFiles)
            # At least 1: with fewer files than the bar is wide the integer
            # division gives 0 and the modulo below would divide by zero.
            progressBarUpdatePerFiles = max(
                1, numberOfWavFiles // progressBarWidth)
            countFiles = 0

            for wavFile in wavFiles:
                utteranceName = os.path.basename(wavFile)
                audioRate, audioData = scipy.io.wavfile.read(wavFile)
                frameFeatureMatrix = extractFeatures(audioData, audioRate)
                topSegmentIndices = getTopEnergySegmentsIndices(
                    audioData, audioRate)
                topSegmentFeatureMatrix = getSegmentFeaturesUsingIndices(
                    frameFeatureMatrix, 25, topSegmentIndices)

                # for each top segment in audioData, write the feature vector
                # into csv_dataset along with target emotionLabelNum
                for segmentFeatures in topSegmentFeatureMatrix:
                    featureVector = ",".join(
                        '%.8f' % num for num in segmentFeatures)
                    csv_dataset.write(
                        utteranceName + "," + featureVector + "," +
                        str(emotionLabelNum) + "\n")
                    countSegmentsPerEmotion += 1

                # update the progressBar, never drawing more than
                # progressBarWidth marks
                countFiles += 1
                if (countFiles % progressBarUpdatePerFiles == 0
                        and countFiles // progressBarUpdatePerFiles
                        <= progressBarWidth):
                    sys.stdout.write("#")
                    sys.stdout.flush()

            sys.stdout.write("\n")
            # print the count for each emotion
            print("Number of segments for emotion : " + emotionLabel +
                  " [ " + str(emotionLabelNum) + " ] : " +
                  str(countSegmentsPerEmotion))

            emotionLabelNum += 1
Ejemplo n.º 10
0
def _averageSegmentProbabilities(samples, rate, scaler, dnn):
    """Return the DNN class probabilities averaged over one utterance's segments."""
    frameFeatureMatrix = extractFeatures(samples, rate)
    topSegmentIndices = getTopEnergySegmentsIndices(samples, rate)
    topSegmentFeatureMatrix = getSegmentFeaturesUsingIndices(
        frameFeatureMatrix, 25, topSegmentIndices)

    # normalize data, then score every segment with the DNN
    topSegmentFeatureMatrix = scaler.transform(topSegmentFeatureMatrix)
    segmentProbabilities = dnn.predict_proba(topSegmentFeatureMatrix)

    return np.mean(segmentProbabilities, axis=0)


def main():
    """Detect the emotion of each 5-second utterance in a test video.

    The video ``testMp4s/<name>.mp4`` (``<name>`` is ``sys.argv[1]``, or
    ``"keaton"`` when no argument is given) is converted to a mono 16 kHz
    wav with ffmpeg. The wav is read in 5-second utterances; each one is
    scored by the stored scaler and DNN, and its emotion (the class with
    the highest mean segment probability) is printed. The temporary wav is
    deleted afterwards and the per-utterance probabilities are plotted
    over time.
    """
    emotions = ['neu', 'sad_fea', 'ang_fru', 'hap_exc_sur']
    colors = ['red', 'blue', 'green', 'black']

    DIR = os.path.dirname(os.path.realpath(__file__))

    # default parameters
    RATE = 16000
    CHUNK = 1024
    UTTERANCE_SECONDS = 5

    # load FILE; only a missing argument falls back to the default
    try:
        FILE = sys.argv[1]
    except IndexError:
        FILE = "keaton"

    MP4_DIR = os.path.join(DIR, "testMp4s")
    MP4_FILE = os.path.join(MP4_DIR, FILE + ".mp4")
    WAV_IN = os.path.join(MP4_DIR, FILE + ".wav")

    print("Extracting audio from video : ",
          end=" ")  # end=" " suppresses new line
    # An argument list (no shell) keeps file names with spaces or shell
    # metacharacters from being reinterpreted.
    command = [
        "ffmpeg", "-i", MP4_FILE, "-ac", "1", "-ar", str(RATE), "-vn", WAV_IN
    ]
    print(" ".join(command))

    returnCode = subprocess.call(command)
    if returnCode != 0:
        sys.exit("ffmpeg failed with exit code " + str(returnCode))
    print("DONE")

    # The scaler and the DNN are the same for every utterance: load once.
    scaler = joblib.load(DIR + "/scalerParameters.sav")
    dnn = joblib.load(DIR + "/dnnParameters.sav")

    # aggregate 5 seconds of frames, process each 5 second utterance
    # NOTE : it is possible to directly read frames for 5 seconds
    # i.e. (RATE*UTTERANCE_SECONDS), instead of reading them CHUNK by CHUNK
    # and aggregating them, but we are using a for loop on CHUNKS, to
    # keep it consistent with pyaudio stream input, which will be added later
    chunksPerUtterance = int(RATE * UTTERANCE_SECONDS / CHUNK)
    utteranceProbabilities = []
    utteranceCount = 0

    try:
        with wave.open(WAV_IN, "r") as testWav:
            while True:
                utterance = b"".join(
                    testWav.readframes(CHUNK)
                    for _ in range(chunksPerUtterance))
                if not utterance:
                    break  # end of file: nothing left to read

                # frombuffer replaces the deprecated binary np.fromstring;
                # the copy keeps the array writable as fromstring's was.
                samples = np.frombuffer(utterance, dtype=np.int16).copy()

                try:
                    avgSegmentProbabilities = _averageSegmentProbabilities(
                        samples, RATE, scaler, dnn)
                except Exception as err:
                    # Best effort, as before (e.g. a trailing utterance
                    # too short to analyse), but the reason for stopping
                    # is no longer hidden.
                    print("Stopping at utterance " + str(utteranceCount) +
                          " : " + repr(err))
                    break

                # determine emotionLabelNum
                emotionLabelNum = np.argmax(avgSegmentProbabilities)

                # display result
                print("Probabilities : " + str(avgSegmentProbabilities))
                utteranceProbabilities.append(
                    avgSegmentProbabilities)  # save for plotting
                print(MP4_FILE + " : " + str(utteranceCount) + " ---> " +
                      emotions[emotionLabelNum])

                # update the utterance count, for the next CHUNKs read
                utteranceCount += 1
    finally:
        # remove wav file, even when processing failed
        if os.path.exists(WAV_IN):
            os.remove(WAV_IN)

    if not utteranceProbabilities:
        # An empty list cannot be indexed by column; report it instead.
        print("No utterance could be processed, nothing to plot")
        return

    utteranceProbabilities = np.array(utteranceProbabilities)
    # One x position per utterance: 1, 1 + UTTERANCE_SECONDS, ...
    seconds = np.arange(1,
                        len(utteranceProbabilities) * UTTERANCE_SECONDS,
                        UTTERANCE_SECONDS)

    # plot the data
    plt.xlabel("Seconds")
    plt.ylabel("Confidence")
    for classIndex, (label, color) in enumerate(zip(emotions, colors)):
        plt.plot(seconds,
                 utteranceProbabilities[:, classIndex],
                 label=label,
                 color=color)
    plt.legend(loc="upper left")
    plt.show()