def scan_data():
    '''
    Walk the data directory and register anything new found on disk:
    clients, their projects, each project's csv (rel) files, grammars,
    and the wav files referenced by each grammar's rel file.

    Assumed layout: <DJANGO_ROOT>/data/<client>/<project>/...
    Only newly-created clients/projects/grammars are scanned further,
    so re-running the function is cheap and idempotent.
    '''
    data_dir = os.path.join(settings.DJANGO_ROOT, 'data')
    for name in os.listdir(data_dir):
        client, created = Client.objects.get_or_create(name=name)
        if not created:
            continue  # client already known; nothing new to scan
        client.client_path = os.path.join(data_dir, name)
        client.save()
        print('created client: ' + str(client))
        # each sub-directory of the client directory is a project
        project_names = [dir_i for dir_i in os.listdir(client.client_path)
                         if os.path.isdir(os.path.join(client.client_path, dir_i))]
        for project_name in project_names:
            project, created = client.projects.get_or_create(name=project_name)
            if not created:
                continue
            project.id_token = generate_id_token(Project)
            project.project_path = os.path.join(client.client_path, project_name)
            project.save()
            print('created project: ' + str(project))
            # Index the project tree: register csv files, remember wav paths.
            wav_file_dictionary = {}
            for sup, subs, file_list in os.walk(project.project_path):
                for file_name in file_list:
                    # endswith() avoids false hits like 'foo.csv.bak'
                    # that a plain substring test would accept
                    if (file_name.endswith('.csv')
                            and 'Unsorted' not in sup and 'save' not in sup):
                        root, ext = os.path.splitext(file_name)
                        project.csv_files.get_or_create(client=client, name=root,
                                                        file_name=file_name, path=sup)
                    elif file_name.endswith('.wav'):
                        wav_file_dictionary[file_name] = os.path.join(sup, file_name)
            csv_count = project.csv_files.count()  # hoisted: one COUNT query, not one per wav
            for i, csv_file in enumerate(project.csv_files.all()):
                grammar, created = project.grammars.get_or_create(client=client,
                                                                  name=csv_file.name)
                if not created:
                    continue
                grammar.csv_file = csv_file
                grammar.id_token = generate_id_token(Grammar)
                print('created grammar ' + str(grammar))
                # '|'-delimited rel file; first token is the audio file path
                with open(os.path.join(csv_file.path, csv_file.file_name)) as open_rel_file:
                    lines = open_rel_file.readlines()
                for j, line in enumerate(lines):
                    tokens = line.split('|')
                    transcription_audio_file_name = os.path.basename(tokens[0])
                    wav_path = wav_file_dictionary.get(transcription_audio_file_name)
                    if wav_path is None:
                        # rel file references a wav missing on disk:
                        # warn and keep going instead of raising KeyError
                        print('missing wav file: ' + transcription_audio_file_name)
                        continue
                    grammar.wav_files.get_or_create(client=client, project=project,
                                                    path=wav_path,
                                                    file_name=transcription_audio_file_name)
                    print('grammar %d/%d, wav %d/%d' % (i+1, csv_count, j+1, len(lines)),
                          end='\r' if j < len(lines)-1 else '\n')
                grammar.save()
                csv_file.save()
def process_words(self):
    '''
    Split the utterance into whitespace-separated words and attach each
    well-formed word (many-to-many) to this object, creating project-level
    Word rows on first sight.
    '''
    for word in self.utterance.split():
        has_open = '[' in word
        has_close = ']' in word
        # reject malformed words carrying exactly one bracket
        if has_open != has_close:
            continue
        # Word rows are unique per project by character content.
        w, created = self.job.project.words.get_or_create(char=word)
        if created:
            w.client = self.transcription.client
            w.grammar = self.transcription.grammar
            w.id_token = generate_id_token(Word)
            # tagged iff the word carries both brackets
            w.tag = has_open and has_close
        self.words.add(w)
        w.save()
def action_register(request):
    '''
    AJAX endpoint: record a user Action against a transcription at a
    given audio time.

    POST params: transcription_id, job_id, action_name, audio_time.
    Only authenticated users create actions; always returns an empty
    HttpResponse.
    '''
    if request.user.is_authenticated:
        user = User.objects.get(email=request.user)
        transcription = Transcription.objects.get(
            id_token=request.POST['transcription_id'])
        # create the action object attached to the transcription
        transcription.actions.create(
            client=transcription.client,
            job=Job.objects.get(id_token=request.POST['job_id']),
            user=user,
            id_token=generate_id_token(Action),
            char=request.POST['action_name'],
            audio_time=float(request.POST['audio_time']),
        )
    # Always return a response: a Django view that returns None for
    # unauthenticated requests raises ValueError at dispatch time.
    return HttpResponse('')
def process_words(self):
    '''
    Populate this object's word set from the utterance, keeping only
    tagged words that carry a complete (matched) pair of brackets.
    Does nothing when words have already been processed.
    '''
    if self.words.count() != 0:
        return  # already processed
    for word in self.utterance.split():
        opened = '[' in word
        closed = ']' in word
        # untagged word: no brackets at all
        if not (opened or closed):
            continue
        # malformed tag: a single stray bracket
        if opened != closed:
            continue
        # Word rows are unique per project by character content.
        w, created = self.project.words.get_or_create(char=word)
        if created:
            w.client = self.client
            w.grammar = self.grammar
            w.id_token = generate_id_token(Word)
            w.tag = True
        self.words.add(w)
        w.save()
def create_jobs(self):
    '''
    Partition the available transcriptions (ordered by grammar name) into
    jobs of settings.NUMBER_OF_TRANSCRIPTIONS_PER_JOB each, slicing from
    the top of the set downwards.

    Idempotent: does nothing when jobs already exist.
    '''
    if self.jobs.count() != 0:
        return
    print('creating jobs...')
    filter_set = self.transcriptions.filter(is_available=True).order_by('grammar__name')
    per_job = settings.NUMBER_OF_TRANSCRIPTIONS_PER_JOB  # hoisted loop invariant
    total = filter_set.count()  # single COUNT query instead of two
    # NOTE(review): the '- 1' means filter_set[total-1] (the last ordered
    # transcription) is never placed in any job slice; behavior preserved
    # from the original -- confirm whether this off-by-one is intentional.
    counter = total - 1 if total else 0
    while counter:
        print('available: %d' % (counter), end='\r')
        job = self.jobs.create(client=self.client, id_token=generate_id_token(Job))
        # take up to per_job transcriptions below the current counter
        lower_bound = max(counter - per_job, 0)
        job.get_transcription_set(filter_set[lower_bound:counter])
        job.save()
        counter = lower_bound
    print('available: 0')
def update_revision(request):
    '''
    AJAX endpoint: create or update the current user's Revision of a
    transcription, then re-derive its words and refresh the parent job.

    POST params: transcription_id, job_id, utterance.
    Always returns an empty HttpResponse.
    '''
    if request.user.is_authenticated:
        transcription = Transcription.objects.get(
            id_token=request.POST['transcription_id'])
        revision, created = transcription.revisions.get_or_create(
            user=User.objects.get(email=request.user),
            job=Job.objects.get(id_token=request.POST['job_id']),
        )
        if created:
            revision.id_token = generate_id_token(Revision)
        # overwrite the utterance on every call, not just on creation
        revision.utterance = request.POST['utterance']
        revision.save()
        # derive Word objects from the new utterance, then refresh job state
        revision.process_words()
        revision.job.update()
    # Always return a response: a Django view that returns None for
    # unauthenticated requests raises ValueError at dispatch time.
    return HttpResponse('')