def process_speaker(db):
    """Worker loop: train and score one speaker-vs-impostor SVM per queued name.

    Connects a zmq PULL socket to tcp://127.0.0.1:5555 and a PUSH socket to
    tcp://127.0.0.1:5556, then loops forever: each JSON message received is
    used as a speaker name, a classifier is trained on one stratified split
    of that speaker's SVs plus randomly drawn impostor SVs, and the accuracy
    statistics are pushed to the output socket with ``send_pyobj``.

    NOTE(review): a second ``process_speaker`` with the same logic is defined
    later in this module and rebinds this name.

    Args:
        db: storage handle; must provide ``get(collection, query)`` and is
            also handed to ``stack_SVs_random``.

    Returns:
        Never returns; the ``while True`` loop has no exit condition.
    """
    fold_count = 2
    # Impostor ratio is the ratio of impostor records in the training and
    # testing population. For ratio=N, subject is 1/(N+1) and impostor
    # N/(N+1) of the population.
    impostor_ratio = 3
    subject_limit = 30

    # Set up the inbound (work) and outbound (results) queues.
    context = zmq.Context()
    inbound = context.socket(zmq.PULL)
    inbound.connect("tcp://127.0.0.1:5555")
    outbound = context.socket(zmq.PUSH)
    outbound.connect("tcp://127.0.0.1:5556")

    while True:
        speaker_name = inbound.recv_json()
        print('Received %s' % speaker_name)

        # Collect the SVs stored for the current subject (capped by limit).
        subject_query = {'speaker_name': speaker_name}
        subject_svs = stack_SVs(db.get('sv', subject_query),
                                limit=subject_limit)
        num_subject = np.size(subject_svs, 0)
        print(num_subject)
        if num_subject < 20:
            # Too few subject rows; move on to the next queued speaker.
            continue

        # Draw random SVs from the rest of the database as impostors.
        impostor_svs = stack_SVs_random(db, speaker_name,
                                        num_subject * impostor_ratio)
        num_impostor = np.size(impostor_svs, 0)
        print('Subject: %i, Impostor: %i' % (num_subject, num_impostor))

        # Full observation matrix X with class labels y
        # (1 = subject, 0 = impostor).
        X = np.vstack((subject_svs, impostor_svs))
        y = np.array([1] * num_subject + [0] * num_impostor)

        # Only the first stratified split is used. For three or more folds
        # the two halves are swapped so the smaller one is used for training.
        first_part, second_part = next(iter(StratifiedKFold(y, fold_count)))
        if fold_count < 3:
            train, test = first_part, second_part
        else:
            train, test = second_part, first_part

        # Cross-validated SVM training on the training split.
        clf = train_svm_crossvalidated(X[train], y[train])

        # Classification statistics on the held-out split.
        accuracy = test_svm_accuracy(X[test], y[test], clf)
        held_out_labels = y[test]
        num_subject_test = np.sum(held_out_labels)
        num_impostor_test = len(held_out_labels) - num_subject_test
        subject_hit_rate = (float(accuracy['correct_subject'])
                            / float(num_subject_test))
        print('Accuracy: %f' % subject_hit_rate)

        msg = {
            'speaker_name': speaker_name,
            'accuracy': accuracy,
            'num_subject': num_subject,
            'num_subject_test': num_subject_test,
            'num_impostor_test': num_impostor_test,
        }
        outbound.send_pyobj(msg)
def process_speaker(db):
    """Consume speaker names from a zmq queue and publish SVM accuracy stats.

    For every JSON message pulled from tcp://127.0.0.1:5555 the function
    stacks the speaker's SVs, draws impostor SVs from the rest of the
    database, trains a cross-validated SVM on one stratified split and pushes
    a result dict to tcp://127.0.0.1:5556 via ``send_pyobj``.

    NOTE(review): this rebinds the ``process_speaker`` name already defined
    earlier in this module; the two bodies perform the same steps.

    Args:
        db: storage handle; must provide ``get(collection, query)`` and is
            also passed through to ``stack_SVs_random``.

    Returns:
        Never returns; the receive loop runs until the process is stopped.
    """
    n_folds = 2
    # Impostor ratio is the ratio of impostor records in the training and
    # testing population. For ratio=N, subject is 1/(N+1) and impostor
    # N/(N+1) of the population.
    ratio = 3
    max_subject_rows = 30

    # Set up queues: PULL for incoming names, PUSH for outgoing results.
    zmq_ctx = zmq.Context()
    work_socket = zmq_ctx.socket(zmq.PULL)
    work_socket.connect("tcp://127.0.0.1:5555")
    result_socket = zmq_ctx.socket(zmq.PUSH)
    result_socket.connect("tcp://127.0.0.1:5556")

    while True:
        speaker_name = work_socket.recv_json()
        print('Received %s' % speaker_name)

        # Find the SVs for the current subject.
        sv_subject = stack_SVs(
            db.get('sv', {'speaker_name': speaker_name}),
            limit=max_subject_rows,
        )
        num_subject = np.size(sv_subject, 0)
        print(num_subject)

        if not num_subject < 20:
            # Get random SVs from the rest of the database as impostors.
            sv_impostor = stack_SVs_random(db, speaker_name,
                                           num_subject * ratio)
            num_impostor = np.size(sv_impostor, 0)
            print('Subject: %i, Impostor: %i' % (num_subject, num_impostor))

            # Observations X with class labels y (1 = subject, 0 = impostor).
            X = np.vstack((sv_subject, sv_impostor))
            y = np.array([1] * num_subject + [0] * num_impostor)

            # Take the first stratified split; with 3+ folds the halves are
            # swapped so that the smaller one is always used for training.
            part_a, part_b = next(iter(StratifiedKFold(y, n_folds)))
            train, test = (part_a, part_b) if n_folds < 3 else (part_b, part_a)

            # Perform cross-validated SVM training.
            clf = train_svm_crossvalidated(X[train], y[train])

            # Collect classification statistics on the test split.
            accuracy = test_svm_accuracy(X[test], y[test], clf)
            num_subject_test = np.sum(y[test])
            num_impostor_test = len(y[test]) - num_subject_test
            print('Accuracy: %f' % (float(accuracy['correct_subject'])
                                    / float(num_subject_test)))

            msg = dict([
                ('speaker_name', speaker_name),
                ('accuracy', accuracy),
                ('num_subject', num_subject),
                ('num_subject_test', num_subject_test),
                ('num_impostor_test', num_impostor_test),
            ])
            result_socket.send_pyobj(msg)
def test_accuracy_concurrent(worker, concurrency):
    """Train and score a speaker-vs-impostor SVM per speaker, logging to CSV.

    For every speaker returned by ``Speaker.objects()`` that has at least 20
    stacked SVs, random impostor SVs are drawn from the rest of the database,
    a cross-validated SVM is trained on one stratified split, and the
    statistics from the other split are printed and appended as one row to
    ``hist<worker>.csv``. Worker 0 additionally writes the header row.

    The CSV file is opened in a ``with`` block so it is closed even when an
    iteration raises.

    Args:
        worker: index of this worker; used in the CSV file name, to decide
            whether the header row is written, and passed to
            ``Concurrent_cursor.set_worker``.
        concurrency: passed to ``Concurrent_cursor.set_concurrency``.
    """
    fold_count = 2
    # Impostor ratio is the ratio of impostor records in the training and
    # testing population. For ratio=N, subject is 1/(N+1) and impostor
    # N/(N+1) of the population.
    impostor_ratio = 3

    db = connect_to_database()
    db.add_son_manipulator(TransformToBinary())
    # (original note) take this out later when testing complete

    hist_path = ('/home/ubuntu/project/backend-search/src/spkrec/utils/hist'
                 + str(worker) + '.csv')
    # 'wb' is the mode the Python 2 csv module expects.
    with open(hist_path, 'wb') as fid:
        csv_writer = csv.writer(fid)
        if worker == 0:
            csv_writer.writerow(['name', 'num speaker SVs', 'test subjects',
                                 'test impostors', 'correct subject',
                                 'correct impostor', 'false neg',
                                 'false pos'])

        # NOTE(review): this cursor is configured but never iterated; the
        # loop below walks Speaker.objects() directly, so every worker
        # appears to process every speaker. Presumably the loop was meant to
        # iterate the cursor -- confirm against Concurrent_cursor's API.
        cursor = Concurrent_cursor(Speaker.objects())
        cursor.set_concurrency(concurrency)
        cursor.set_worker(worker)

        for speaker in Speaker.objects():
            # Find all SVs for the current subject.
            print('Speaker Name: %s' % speaker.name)
            sv_subject = stack_SVs(db.sv.find({'speaker_name': speaker.name}))
            num_subject = np.size(sv_subject, 0)
            if num_subject < 20:
                # Too few subject rows; skip this speaker.
                continue

            # Get random SVs from the rest of the database as impostors.
            sv_impostor = stack_SVs_random(db, speaker.name,
                                           num_subject * impostor_ratio)
            num_impostor = np.size(sv_impostor, 0)
            print('Subject: %i, Impostor: %i' % (num_subject, num_impostor))

            # Observations X with class labels y (1 = subject, 0 = impostor).
            X = np.vstack((sv_subject, sv_impostor))
            y = np.array([1] * num_subject + [0] * num_impostor)

            # Only the first stratified split is used. With 3+ folds the two
            # halves are swapped so the smaller one is used for training.
            first_part, second_part = next(
                iter(StratifiedKFold(y, fold_count)))
            if fold_count < 3:
                train, test = first_part, second_part
            else:
                train, test = second_part, first_part

            # Perform cross-validated SVM training.
            clf = train_svm_crossvalidated(X[train], y[train])

            # Collect classification statistics on the held-out split.
            accuracy = test_svm_accuracy(X[test], y[test], clf)
            num_subject_test = np.sum(y[test])
            num_impostor_test = len(y[test]) - num_subject_test

            print('Accuracy: %f' % (float(accuracy['correct_subject'])
                                    / float(num_subject_test)))
            print('Sub: %i/%i Imp: %i/%i' % (accuracy['correct_subject'],
                                             num_subject_test,
                                             accuracy['correct_impostor'],
                                             num_impostor_test))
            print('False Neg: %i False Pos: %i' % (accuracy['false_neg'],
                                                   accuracy['false_pos']))

            csv_writer.writerow([speaker.name, num_subject, num_subject_test,
                                 num_impostor_test,
                                 accuracy['correct_subject'],
                                 accuracy['correct_impostor'],
                                 accuracy['false_neg'],
                                 accuracy['false_pos']])

    print("Complete")