Example #1
# The models and helpers used below (Client, Project, Grammar, generate_id_token)
# come from this project's own apps and are assumed to be imported at module level.
import os

from django.conf import settings

def scan_data():
  '''
  Walk the data directory, find new grammars, and attach them to the right clients and projects.
  '''

  #1. discover client and project directories under the data dir
  #2. collect each project's .csv files and a lookup of .wav paths
  #3. create a grammar per .csv file and link its .wav files

  data_dir = os.path.join(settings.DJANGO_ROOT, 'data')
  for name in os.listdir(data_dir):
    client_path = os.path.join(data_dir, name)
    if not os.path.isdir(client_path): #ignore stray files in the data dir
      continue
    client, created = Client.objects.get_or_create(name=name)

    if created: #new client: record where its data lives on disk
      client.client_path = client_path
      client.save()
      print('created client: ' + str(client))

    project_dirs = [dir_i for dir_i in os.listdir(client.client_path)
                    if os.path.isdir(os.path.join(client.client_path, dir_i))]
    for project_name in project_dirs:
      project, created = client.projects.get_or_create(name=project_name)

      if created:
        project.id_token = generate_id_token(Project)
        project.project_path = os.path.join(client.client_path, project_name)
        project.save()
        print('created project: ' + str(project))

      #generate a list of .csv files and a dictionary mapping .wav file names to their paths
      csv_file_list = []
      wav_file_dictionary = {}
      for sup, subs, file_list in os.walk(project.project_path):
        for file_name in file_list:
          if file_name.endswith('.csv') and 'Unsorted' not in sup and 'save' not in sup:
            csv_file_list.append(file_name)
            root, ext = os.path.splitext(file_name)
            project.csv_files.get_or_create(client=client, name=root, file_name=file_name, path=sup)
          elif file_name.endswith('.wav'):
            wav_file_dictionary[file_name] = os.path.join(sup, file_name)

      for i, csv_file in enumerate(project.csv_files.all()):
        grammar, created = project.grammars.get_or_create(client=client, name=csv_file.name)

        if created:
          grammar.csv_file = csv_file
          grammar.id_token = generate_id_token(Grammar)
          print('created grammar ' + str(grammar))

          with open(os.path.join(csv_file.path, csv_file.file_name)) as open_rel_file:
            lines = open_rel_file.readlines()
            for j, line in enumerate(lines):
              tokens = line.split('|') #this could become part of a rel-file parser object with delimiter '|'
              transcription_audio_file_name = os.path.basename(tokens[0])
              wav_path = wav_file_dictionary.get(transcription_audio_file_name)
              if wav_path is not None: #skip lines whose audio file was not found on disk
                grammar.wav_files.get_or_create(client=client, project=project, path=wav_path, file_name=transcription_audio_file_name)
              print('grammar %d/%d, wav %d/%d'%(i+1,project.csv_files.count(),j+1,len(lines)), end='\r' if j<len(lines)-1 else '\n')

          grammar.save()
          csv_file.save()
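
For orientation, here is a minimal sketch of the models scan_data() appears to assume, inferred only from the attribute and related-manager names used above (client.projects, project.csv_files, project.grammars, grammar.wav_files); the project's real model definitions will differ in field types and options.

#Hypothetical skeleton only; field names come from the code above, field types and lengths are guesses.
from django.db import models

class Client(models.Model):
  name = models.CharField(max_length=255)
  client_path = models.CharField(max_length=1024, blank=True)

class Project(models.Model):
  client = models.ForeignKey(Client, related_name='projects', on_delete=models.CASCADE)
  name = models.CharField(max_length=255)
  id_token = models.CharField(max_length=64, blank=True)
  project_path = models.CharField(max_length=1024, blank=True)

class CSVFile(models.Model):
  client = models.ForeignKey(Client, related_name='csv_files', on_delete=models.CASCADE)
  project = models.ForeignKey(Project, related_name='csv_files', on_delete=models.CASCADE)
  name = models.CharField(max_length=255)
  file_name = models.CharField(max_length=255)
  path = models.CharField(max_length=1024)

class Grammar(models.Model):
  client = models.ForeignKey(Client, related_name='grammars', on_delete=models.CASCADE)
  project = models.ForeignKey(Project, related_name='grammars', on_delete=models.CASCADE)
  csv_file = models.ForeignKey(CSVFile, null=True, blank=True, on_delete=models.SET_NULL)
  name = models.CharField(max_length=255)
  id_token = models.CharField(max_length=64, blank=True)

class WavFile(models.Model):
  client = models.ForeignKey(Client, related_name='wav_files', on_delete=models.CASCADE)
  project = models.ForeignKey(Project, related_name='wav_files', on_delete=models.CASCADE)
  grammar = models.ForeignKey(Grammar, related_name='wav_files', on_delete=models.CASCADE)
  file_name = models.CharField(max_length=255)
  path = models.CharField(max_length=1024)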
Example #2
  def process_words(self):
    words = self.utterance.split()
    for word in words:
      #reject tokens with a single unmatched bracket
      if not (('[' in word and ']' not in word) or (']' in word and '[' not in word)):
        #many-to-many relationship; unique by char within the project
        w, created = self.job.project.words.get_or_create(char=word)
        if created:
          w.client = self.transcription.client
          w.grammar = self.transcription.grammar
          w.id_token = generate_id_token(Word)
          w.tag = ('[' in word and ']' in word)
          w.save()

        self.words.add(w)
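
A quick way to sanity-check the bracket condition above in isolation (plain Python, no models involved; the helper name is ours, not from the codebase):

def has_unmatched_bracket(word):
  #same predicate process_words() uses to reject malformed tag tokens
  return ('[' in word and ']' not in word) or (']' in word and '[' not in word)

assert not has_unmatched_bracket('hello')    #plain word: accepted, tag=False
assert not has_unmatched_bracket('[noise]')  #fully bracketed tag: accepted, tag=True
assert has_unmatched_bracket('[cough')       #lone bracket: rejected
assert has_unmatched_bracket('um]')          #lone bracket: rejected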
Example #3
def action_register(request):
  if request.user.is_authenticated:
    #get the user and the transcription the action refers to
    user = User.objects.get(email=request.user)
    transcription = Transcription.objects.get(id_token=request.POST['transcription_id'])

    #record the action against the transcription
    transcription.actions.create(client=transcription.client,
                                 job=Job.objects.get(id_token=request.POST['job_id']),
                                 user=user,
                                 id_token=generate_id_token(Action),
                                 char=request.POST['action_name'],
                                 audio_time=float(request.POST['audio_time']))

  #always hand an HttpResponse back to the caller, even when not authenticated
  return HttpResponse('')
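
How the view might be wired and called, as a hedged sketch: the URL path, URL name, and the user/transcription/job fixture objects below are placeholders for illustration, not taken from the project's actual urls.py; only the POST field names come from the view above.

#urls.py (hypothetical entry)
#  path('action_register/', views.action_register, name='action_register'),

#exercising the view with Django's test client
from django.test import Client as TestClient

c = TestClient()
c.force_login(user)  #user: any existing, authenticated User fixture (hypothetical)
c.post('/action_register/', {
  'transcription_id': transcription.id_token,
  'job_id': job.id_token,
  'action_name': 'play',      #placeholder action name
  'audio_time': '1.25',       #seconds into the audio; parsed with float() in the view
})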
Example #4
  def process_words(self):
    if self.words.count()==0:
      words = self.utterance.split()
      for word in words:
        tag = ('[' in word or ']' in word)

        #only keep bracketed tag words, rejecting tokens with an unmatched bracket
        if tag and not (('[' in word and ']' not in word) or (']' in word and '[' not in word)):
          #many-to-many relationship; unique by char within the project
          w, created = self.project.words.get_or_create(char=word)
          if created:
            w.client = self.client
            w.grammar = self.grammar
            w.id_token = generate_id_token(Word)
            w.tag = True
            w.save()
          self.words.add(w) #link the word even when it already existed on the project
Example #5
  def create_jobs(self):
    '''
    Create all possible jobs from the set of available transcriptions.
    '''
    if self.jobs.count()==0:
      print('creating jobs...')
      filter_set = self.transcriptions.filter(is_available=True).order_by('grammar__name')
      counter = filter_set.count() #count down from the end so every transcription is assigned to a job
      while counter:
        print('available: %d'%(counter), end='\r')
        job = self.jobs.create(client=self.client, id_token=generate_id_token(Job))
        lower_bound = counter-settings.NUMBER_OF_TRANSCRIPTIONS_PER_JOB if counter>=settings.NUMBER_OF_TRANSCRIPTIONS_PER_JOB else 0
        job_set = filter_set[lower_bound:counter]
        job.get_transcription_set(job_set)
        job.save()
        counter = lower_bound
      print('available: 0')
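
The count-down slicing above packs transcriptions into jobs from the end of the ordered queryset; the same pattern on a plain list (with 4 standing in for settings.NUMBER_OF_TRANSCRIPTIONS_PER_JOB) looks like this:

#Illustration only: a plain list stands in for the queryset.
items = list(range(10))
per_job = 4
counter = len(items)
while counter:
  lower_bound = counter - per_job if counter >= per_job else 0
  print(items[lower_bound:counter])  #-> [6, 7, 8, 9], then [2, 3, 4, 5], then [0, 1]
  counter = lower_bound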
Example #6
def update_revision(request):
  if request.user.is_authenticated:
    #get or create this user's revision of the transcription for the current job
    transcription = Transcription.objects.get(id_token=request.POST['transcription_id'])
    revision, created = transcription.revisions.get_or_create(user=User.objects.get(email=request.user),
                                                              job=Job.objects.get(id_token=request.POST['job_id']))

    if created:
      revision.id_token = generate_id_token(Revision)

    #store the submitted utterance
    revision.utterance = request.POST['utterance']
    revision.save()

    #processing: split the utterance into words and refresh the job
    revision.process_words()
    revision.job.update()

  #always hand an HttpResponse back to the caller, even when not authenticated
  return HttpResponse('')
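
And a hedged sketch of driving update_revision end to end with the test client; as in the earlier example, the URL path and the user/transcription/job fixtures are assumptions, while the POST field names come from the view itself.

from django.test import Client as TestClient

c = TestClient()
c.force_login(user)  #user: an existing User fixture (hypothetical)
c.post('/update_revision/', {            #placeholder path; see the project's actual urls.py
  'transcription_id': transcription.id_token,
  'job_id': job.id_token,
  'utterance': 'hello [noise] world',
})
revision = transcription.revisions.get(user=user, job=job)
#with the process_words() shown in Example #2, all three tokens are linked and '[noise]' is created with tag=True
print(revision.words.count())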