def test_set_then_get_sample_rate_hertz():
    """A sample rate stored via the setter is handed back by the getter."""
    from model.configuration import Configuration

    sample_rate = 16000
    configuration = Configuration()
    configuration.set_sample_rate_hertz(sample_rate)

    assert configuration.get_sample_rate_hertz() == sample_rate
def test_set_then_get_model():
    """The model name written with ``set_model`` is read back by ``get_model``."""
    from model.configuration import Configuration

    model_name = 'command_and_search'
    configuration = Configuration()
    configuration.set_model(model_name)

    assert configuration.get_model() == model_name
def test_set_then_get_enable_separate_recognition_per_channel():
    """The per-channel recognition flag round-trips through setter and getter."""
    from model.configuration import Configuration

    flag = True
    configuration = Configuration()
    configuration.set_enable_separate_recognition_per_channel(flag)

    assert configuration.get_enable_separate_recognition_per_channel() == flag
def test_set_then_get_language_code():
    """The language code round-trips through setter and getter."""
    from model.configuration import Configuration

    language_code = 'fr-FR'
    configuration = Configuration()
    configuration.set_language_code(language_code)

    assert configuration.get_language_code() == language_code
def test_set_get_alternative_language_codes():
    """The list of alternative language codes round-trips unchanged."""
    from model.configuration import Configuration

    codes = ['gu-IN', 'ru-RU']
    configuration = Configuration()
    configuration.set_alternative_language_codes(codes)

    assert configuration.get_alternative_language_codes() == codes
def test_set_get_encoding():
    """The audio encoding round-trips through setter and getter."""
    from model.configuration import Configuration

    encoding = 'FLAC'
    configuration = Configuration()
    configuration.set_encoding(encoding)

    assert configuration.get_encoding() == encoding
def test_set_get_speech_context():
    """The first speech context exposes the phrases and boost that were set."""
    from model.configuration import Configuration

    phrases = ['foo', 'bar', 'baz']
    boost = 3
    configuration = Configuration()
    configuration.set_speech_context(phrases, boost)

    first_context = configuration.get_speech_context()[0]
    assert first_context['boost'] == boost
    assert first_context['phrases'] == phrases
def test_get_other_configuration_value(self):
    """A key beyond the base configuration map is readable via ``get_configuration_value``."""
    cls = ConfigurationTests

    # Work on a copy so the shared base map is left untouched.
    config_map = copy.deepcopy(cls.BASE_VALID_CONFIGURATION_MAP)
    config_map[cls.OTHER_KEY] = cls.OTHER_VALUE

    configuration = Configuration(json.dumps(config_map))
    actual = configuration.get_configuration_value(cls.OTHER_KEY)

    self.assertEqual(cls.OTHER_VALUE, actual)
def test_get_hypothesis():
    """``SpeechToText.get_hypothesis`` returns the expected transcript for the sample audio URI."""
    from model.configuration import Configuration
    from utilities.speech_to_text import SpeechToText

    audio_uri = 'gs://brb/test_audio_n_truth/1.wav'
    expected_transcript = ' testing 1 2 3 hello hello testing one two three'

    settings = Configuration()
    settings.set_language_code('en-US')
    settings.set_encoding('LINEAR16')
    settings.set_sample_rate_hertz(44100)
    settings.set_model('default')

    hypothesis = SpeechToText().get_hypothesis(audio_uri, settings)

    assert hypothesis == expected_transcript
class SpeechToText(object):
    """Wrapper around the speech client's long-running recognition call."""

    # Class-level default instance; get_hypothesis itself only uses the
    # configuration that is passed in as an argument.
    configuration = Configuration()

    # Polling limits for the long-running operation.
    _POLL_INTERVAL_SECONDS = 15
    _MAX_POLLS = 30000
    _RESULT_TIMEOUT_SECONDS = 1200

    @staticmethod
    def _progress_percent(operation):
        """Return the operation's reported progress, or 0 when no metadata is available yet."""
        return getattr(operation.metadata, "progress_percent", 0)

    def get_hypothesis(self, uri, configuration):
        """Asynchronously transcribe the audio at ``uri`` and return the transcript.

        Args:
            uri: Audio location handed to the recognizer as ``{"uri": uri}``.
            configuration: Object providing the ``get_*`` accessors used to
                build the recognition config below.

        Returns:
            The top alternative of every result, each prefixed with a single
            space and concatenated; an empty string when there are no results.

        Raises:
            TimeoutError: If the operation is still not done after
                ``_MAX_POLLS`` polls.
        """
        import time

        client = speech.SpeechClient()
        config = {
            "model": configuration.get_model(),
            "use_enhanced": configuration.get_use_enhanced(),
            "encoding": configuration.get_encoding(),
            "sample_rate_hertz": configuration.get_sample_rate_hertz(),
            "language_code": configuration.get_language_code(),
            "alternative_language_codes": configuration.get_alternative_language_codes(),
            "audio_channel_count": configuration.get_audio_channel_count(),
            "enable_separate_recognition_per_channel": configuration.get_enable_separate_recognition_per_channel(),
            "enable_speaker_diarization": configuration.get_enableSpeakerDiarization(),
            "diarization_speaker_count": configuration.get_diarizationSpeakerCount(),
            "enable_automatic_punctuation": configuration.get_enableAutomaticPunctuation(),
            "speech_contexts": configuration.get_speech_context(),
        }
        audio = {"uri": uri}

        # Keyword arguments keep the call valid for client versions whose
        # first positional parameter is a request object.
        operation = client.long_running_recognize(config=config, audio=audio)

        sleep_time = self._POLL_INTERVAL_SECONDS
        polls = 0
        while not operation.done():
            print(
                f"{self._progress_percent(operation)}% complete - updates every {sleep_time} seconds"
            )
            polls += 1
            if polls >= self._MAX_POLLS:
                raise TimeoutError("Time out processing audio")
            time.sleep(sleep_time)

        print(
            f"{self._progress_percent(operation)}% complete - updates every {sleep_time} seconds"
        )

        response = operation.result(timeout=self._RESULT_TIMEOUT_SECONDS)

        # First alternative is the most probable result; results without any
        # alternative are skipped instead of raising IndexError.
        transcript = "".join(
            " " + result.alternatives[0].transcript
            for result in response.results
            if result.alternatives
        )

        if not transcript:
            logger.debug('No transcript returned')

        return transcript
class ConfigurationTests(unittest.TestCase):
    """Unit tests for a ``Configuration`` built from a JSON document."""

    CITY = "Seattle"
    COUNTRY = "USA"
    BASE_VALID_CONFIGURATION_MAP = {
        Configuration.CONFIGURATION_KEY_CITY: CITY,
        Configuration.CONFIGURATION_KEY_COUNTRY: COUNTRY
    }
    OTHER_KEY = "otherKey"
    OTHER_VALUE = "otherValue"
    BASE_VALID_CONFIGURATION_JSON = json.dumps(BASE_VALID_CONFIGURATION_MAP)
    INVALID_CONFIGURATION_JSON = json.dumps({})

    def setUp(self):
        """Create a configuration from the base valid JSON before each test."""
        valid_json = ConfigurationTests.BASE_VALID_CONFIGURATION_JSON
        self.configuration = Configuration(valid_json)

    def test_invalid_configuration(self):
        """Constructing from an empty JSON object raises ``KeyError``."""
        with self.assertRaises(KeyError):
            Configuration(self.INVALID_CONFIGURATION_JSON)

    def test_get_city(self):
        """``get_city`` returns the configured city."""
        actual_city = self.configuration.get_city()
        self.assertEqual(self.CITY, actual_city)

    def test_get_country(self):
        """``get_country`` returns the configured country."""
        actual_country = self.configuration.get_country()
        self.assertEqual(self.COUNTRY, actual_country)

    def test_get_other_configuration_value(self):
        """A key beyond the base map is readable via ``get_configuration_value``."""
        cls = ConfigurationTests

        # Work on a copy so the shared base map is left untouched.
        config_map = copy.deepcopy(cls.BASE_VALID_CONFIGURATION_MAP)
        config_map[cls.OTHER_KEY] = cls.OTHER_VALUE

        configuration = Configuration(json.dumps(config_map))
        actual = configuration.get_configuration_value(cls.OTHER_KEY)

        self.assertEqual(cls.OTHER_VALUE, actual)
def test_update_csv():
    """``IOHandler.update_csv`` appends a row containing the URI, language and stemming flag.

    The results are written into a temporary directory that is removed when
    the test finishes, so nothing is left in the working directory even if
    writing or reading the file raises.
    """
    import tempfile

    from utilities.io_handler import IOHandler
    from model.configuration import Configuration
    from model.nlp import NLPModel

    expected_uri = 'gs://foo/bar/baz/test.flac'
    expected_lang = 'fr-FR'

    configuration = Configuration()
    configuration.set_language_code(expected_lang)
    nlp_model = NLPModel()
    nlp_model.set_apply_stemming(True)

    with tempfile.TemporaryDirectory() as results_dir:
        io = IOHandler()
        result_file_name = io._result_file_name
        io.set_result_path(results_dir)
        io.write_csv_header()
        io.update_csv(expected_uri, configuration, nlp_model)

        # Same path construction the handler itself uses.
        full_path = f'{io.get_result_path()}/{result_file_name}'
        with open(full_path, 'r') as file:
            contents = file.read()

    assert expected_uri in contents
    assert expected_lang in contents
    assert 'True' in contents
def test_get_audio_channel_count():
    """Smoke test: ``get_audio_channel_count`` runs on a fresh ``Configuration``."""
    from model.configuration import Configuration

    Configuration().get_audio_channel_count()
def test_get_use_enhanced():
    """A fresh ``Configuration`` reports ``use_enhanced`` as ``False``."""
    from model.configuration import Configuration

    default_value = False
    use_enhanced = Configuration().get_use_enhanced()

    assert use_enhanced == default_value
def test_set_model():
    """Smoke test: ``set_model`` accepts ``'default'`` without raising."""
    from model.configuration import Configuration

    Configuration().set_model('default')
def test_set_use_enhanced():
    """Smoke test: ``set_use_enhanced`` accepts ``True`` without raising."""
    from model.configuration import Configuration

    Configuration().set_use_enhanced(True)
def setUp(self):
    """Create a configuration from the base valid JSON before each test."""
    valid_json = ConfigurationTests.BASE_VALID_CONFIGURATION_JSON
    self.configuration = Configuration(valid_json)
def test_set_language_code():
    """Smoke test: ``set_language_code`` accepts ``'en-US'`` without raising."""
    from model.configuration import Configuration

    Configuration().set_language_code('en-US')
def test_get_language_code():
    """Smoke test: ``get_language_code`` runs on a fresh ``Configuration``."""
    from model.configuration import Configuration

    Configuration().get_language_code()
def test_str():
    """The string form of a ``Configuration`` mentions each value that was set."""
    from model.configuration import Configuration

    model = 'phone_call'
    language_code = 'hi-IN'
    use_enhanced = True
    sample_rate_hertz = 48000
    audio_channel_count = 5
    separate_recognition_per_channel = False
    boost = 6
    phrases = ['testing', '$ADDRESSNUM']

    configuration = Configuration()
    configuration.set_model(model)
    configuration.set_language_code(language_code)
    configuration.set_enable_separate_recognition_per_channel(
        separate_recognition_per_channel)
    configuration.set_audio_channel_count(audio_channel_count)
    configuration.set_use_enhanced(use_enhanced)
    configuration.set_sample_rate_hertz(sample_rate_hertz)
    configuration.set_speech_context(phrases, boost)

    rendered = configuration.__str__()

    assert isinstance(rendered, str)
    # Each configured value must show up in the rendered text.
    expected_fragments = (
        model,
        language_code,
        str(use_enhanced),
        str(sample_rate_hertz),
        str(audio_channel_count),
        str(separate_recognition_per_channel),
        str(boost),
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_get_alternative_language_codes():
    """Smoke test: ``get_alternative_language_codes`` runs on a fresh ``Configuration``."""
    from model.configuration import Configuration

    Configuration().get_alternative_language_codes()
def test_set_encoding():
    """Smoke test: ``set_encoding`` accepts ``'MP3'`` without raising."""
    from model.configuration import Configuration

    encoding = 'MP3'
    Configuration().set_encoding(encoding)
def test_get_encoding():
    """Smoke test: ``get_encoding`` runs on a fresh ``Configuration``."""
    from model.configuration import Configuration

    Configuration().get_encoding()
class SpeechToText(object):
    """Wrapper around the speech client's long-running and streaming recognition calls."""

    # Class-level default instance; the methods below only use the
    # configuration that is passed in as an argument.
    configuration = Configuration()

    # Polling limits for the long-running operation.
    _POLL_INTERVAL_SECONDS = 5
    _MAX_POLLS = 30000
    _RESULT_TIMEOUT_SECONDS = 1200

    @staticmethod
    def _recognition_config(configuration):
        """Build the recognition config dict shared by both recognition methods."""
        return {
            "model": configuration.get_model(),
            "use_enhanced": configuration.get_use_enhanced(),
            "encoding": configuration.get_encoding(),
            "sample_rate_hertz": configuration.get_sample_rate_hertz(),
            "language_code": configuration.get_language_code(),
            "alternative_language_codes": configuration.get_alternative_language_codes(),
            "audio_channel_count": configuration.get_audio_channel_count(),
            "enable_separate_recognition_per_channel": configuration.get_enable_separate_recognition_per_channel(),
            "enable_speaker_diarization": configuration.get_enableSpeakerDiarization(),
            "diarization_speaker_count": configuration.get_diarizationSpeakerCount(),
            "enable_automatic_punctuation": configuration.get_enableAutomaticPunctuation(),
            "speech_contexts": configuration.get_speech_context(),
        }

    @staticmethod
    def _progress_percent(operation):
        """Return the operation's reported progress, or 0 when no metadata is available yet."""
        return getattr(operation.metadata, "progress_percent", 0)

    def get_hypothesis(self, uri, configuration):
        """Asynchronously transcribe the audio at ``uri`` and return a normalized transcript.

        Args:
            uri: Audio location handed to the recognizer as ``{"uri": uri}``.
            configuration: Object providing the ``get_*`` accessors used by
                ``_recognition_config``.

        Returns:
            The top alternative of every result, each prefixed with a single
            space and concatenated, then passed through
            ``Utilities.strip_puc`` and lower-cased.

        Raises:
            TimeoutError: If the operation is still not done after
                ``_MAX_POLLS`` polls.
        """
        import time

        client = speech.SpeechClient()
        config = self._recognition_config(configuration)
        audio = {"uri": uri}

        # Errors from the client (e.g. invalid arguments) propagate unchanged.
        operation = client.long_running_recognize(config=config, audio=audio)

        sleep_time = self._POLL_INTERVAL_SECONDS
        polls = 0
        while not operation.done():
            print(
                f"{self._progress_percent(operation)}% complete - updates every {sleep_time} seconds"
            )
            polls += 1
            if polls >= self._MAX_POLLS:
                raise TimeoutError("Time out processing audio")
            time.sleep(sleep_time)

        print(
            f"{self._progress_percent(operation)}% complete - updates every {sleep_time} seconds"
        )

        response = operation.result(timeout=self._RESULT_TIMEOUT_SECONDS)

        # First alternative is the most probable result; results without any
        # alternative are skipped instead of raising IndexError.
        transcript = "".join(
            " " + result.alternatives[0].transcript
            for result in response.results
            if result.alternatives
        )

        if not transcript:
            logger.debug('No transcript returned')

        utilities = Utilities()
        t = utilities.strip_puc(text=transcript)
        return t.lower()

    def transcribe_streaming(self, stream_file, configuration):
        """Stream the given audio file to the recognizer and return its transcript.

        Args:
            stream_file: Path of a local audio file; it is read fully and sent
                as a single streaming request.
            configuration: Object providing the ``get_*`` accessors used by
                ``_recognition_config``.

        Returns:
            The top alternative of every final result, concatenated in arrival
            order; an empty string when no final result is received.
        """
        import io

        client = speech.SpeechClient()

        with io.open(stream_file, 'rb') as audio_file:
            audio_content = audio_file.read()

        streaming_config = speech.types.StreamingRecognitionConfig(
            config=self._recognition_config(configuration),
            interim_results=True)

        # The client expects an iterable of requests, not a single request.
        stream = [audio_content]
        requests = (speech.types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in stream)

        responses = client.streaming_recognize(streaming_config, requests)

        # Interim results are superseded by later ones, so only settled
        # (final) results contribute. Alternatives are ordered from most to
        # least likely, so the first one is kept.
        final_transcripts = []
        for response in responses:
            for result in response.results:
                if result.is_final and result.alternatives:
                    final_transcripts.append(result.alternatives[0].transcript)

        return ''.join(final_transcripts)
help='Path to file containing comma separated phrases') parser.add_argument('-b', '--boosts', default=list(), nargs='+', required=False, help=('Space separated list of boost values to evaluate for speech adaptation')) parser.add_argument('-ch', '--multi', required=False, type=int, help='Integer indicating the number of channels if more than one') parser.add_argument('-q', '--random_queue', required=False, action='store_true', help='Replaces default queue.txt with randomly named queue file') parser.add_argument('-fake', '--fake_hyp', required=False, action='store_true', help='Use a fake hypothesis for testing') parser.add_argument('-limit', '--limit', required=False, default=None,type= int, help = 'Limit to X number of audio files') parser.add_argument('-nzb', '--no_zeros_boost', required=False, action='store_true', help='skip boost of 0' ) parser.add_argument('-single', '--single_word', required=False, action='store_true', help='process each letter rather than whole words') parser.add_argument('-lf','--local_files_path', required=False, type=str, help='process local files', default=None) nlp_model = NLPModel() io_handler = IOHandler() nlp_options = NLPOptions() configuration = Configuration() # Turn on punctuation .. why not.. no bearing on WER configuration.set_enableAutomaticPunctuation(True) args = parser.parse_args() no_zeros_for_boost = args.no_zeros_boost process_each_letter = args.single_word local_files_path = args.local_files_path limit = args.limit cloud_store_uri = args.cloud_store_uri io_handler.set_result_path(args.local_results_path) only_transcribe = args.transcriptions_only nlp_model.set_n2w(args.numbers_to_words) nlp_model.set_apply_stemming(args.stem) nlp_model.set_remove_stop_words(args.remove_stop_words) nlp_model.set_expand_contractions(args.expand)
def test_get_enable_separate_recognition_per_channel():
    """Smoke test: the per-channel recognition getter runs on a fresh ``Configuration``."""
    from model.configuration import Configuration

    Configuration().get_enable_separate_recognition_per_channel()
def run_train():
    """Train a ``MaskRcnnNet`` from a saved checkpoint, logging and checkpointing as it goes.

    Steps, in order:
      1. Create the ``checkpoint``/``train``/``backup`` folders under
         ``RESULTS_DIR``, back up the project as a zip and open the log.
      2. Build the network and load ``initial_checkpoint`` into it.
      3. Create the SGD optimizer and restore the iteration/epoch counters
         from the matching ``*_optimizer.pth`` file.
      4. Build the training and validation loaders.
      5. Loop over the training loader until ``num_iters`` is reached,
         validating every ``iter_valid`` iterations and saving checkpoints at
         the iterations listed in ``iter_save``.
      6. Save a final checkpoint.

    All settings are hard-coded in the body; the function takes no arguments
    and returns nothing.
    """
    out_dir = RESULTS_DIR
    # ``out_dir`` is combined with ``/`` below (path-like), which does not
    # support ``+``; the string form is used wherever paths are concatenated.
    out_dir_str = str(out_dir)
    # NOTE(review): the file name starts with a space - confirm that is intended.
    initial_checkpoint = RESULTS_DIR / ' 00072200_model.pth'
    # Defined so the ``elif`` below cannot raise NameError.
    pretrain_file = None  # imagenet pretrain

    ## setup -----------------
    (out_dir / 'checkpoint').mkdir(exist_ok=True)
    (out_dir / 'train').mkdir(exist_ok=True)
    (out_dir / 'backup').mkdir(exist_ok=True)
    backup_project_as_zip(
        PROJECT_PATH,
        str(out_dir / 'backup' / ' code.train.%s.zip') % IDENTIFIER)

    log = Logger()
    log.open(out_dir_str + '/log.train.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('** some experiment setting **\n')
    log.write('\tSEED = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % PROJECT_PATH)
    log.write('\tout_dir = %s\n' % out_dir)
    log.write('\n')

    ## net ----------------------
    log.write('** net setting **\n')
    cfg = Configuration()
    net = MaskRcnnNet(cfg).cuda()

    if initial_checkpoint is not None:
        log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)
        net.load_state_dict(
            torch.load(initial_checkpoint,
                       map_location=lambda storage, loc: storage))
    elif pretrain_file is not None:
        log.write('\tpretrained_file = %s\n' % pretrain_file)
        # load_pretrain_file(net, pretrain_file)

    log.write('%s\n\n' % (type(net)))
    log.write('\n')

    ## optimiser ----------------------------------
    iter_accum = 1
    batch_size = 4  ##NUM_CUDA_DEVICES*512 #256//iter_accum #512 #2*288//iter_accum

    num_iters = 1000 * 1000
    iter_smooth = 20
    iter_log = 50
    iter_valid = 100
    # A set keeps the per-iteration membership test cheap.
    iter_save = {0, num_iters - 1, *range(0, num_iters, 100)}  # 1*1000

    LR = None  # LR = StepLR([ (0, 0.01), (200, 0.001), (300, -1)])
    optimizer = SGD(filter(lambda p: p.requires_grad, net.parameters()),
                    lr=0.001 / iter_accum,
                    momentum=0.9,
                    weight_decay=0.0001)

    start_iter = 0
    start_epoch = 0.
    if initial_checkpoint is not None:
        # str() first: ``.replace`` on a path object renames a file instead of
        # substituting text.
        optimizer_checkpoint = str(initial_checkpoint).replace(
            '_model.pth', '_optimizer.pth')
        checkpoint = torch.load(optimizer_checkpoint)
        start_iter = checkpoint['iter']
        start_epoch = checkpoint['epoch']
        # optimizer.load_state_dict(checkpoint['optimizer'])

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    train_dataset = ScienceDataset(
        # 'train1_ids_gray_only1_500', mode='train',
        'valid1_ids_gray_only1_43',
        mode='train',
        transform=train_augment)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        # sampler = ConstantSampler(train_dataset,list(range(16))),
        batch_size=batch_size,
        drop_last=True,
        num_workers=4,
        pin_memory=True,
        collate_fn=train_collate)

    valid_dataset = ScienceDataset(
        'valid1_ids_gray_only1_43',
        mode='train',
        # 'debug1_ids_gray_only1_10', mode='train',
        transform=valid_augment)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=batch_size,
                              drop_last=False,
                              num_workers=4,
                              pin_memory=True,
                              collate_fn=train_collate)

    log.write('\ttrain_dataset.split = %s\n' % (train_dataset.split))
    log.write('\tvalid_dataset.split = %s\n' % (valid_dataset.split))
    log.write('\tlen(train_dataset) = %d\n' % (len(train_dataset)))
    log.write('\tlen(valid_dataset) = %d\n' % (len(valid_dataset)))
    log.write('\tlen(train_loader) = %d\n' % (len(train_loader)))
    log.write('\tlen(valid_loader) = %d\n' % (len(valid_loader)))
    log.write('\tbatch_size = %d\n' % (batch_size))
    log.write('\titer_accum = %d\n' % (iter_accum))
    log.write('\tbatch_size*iter_accum = %d\n' % (batch_size * iter_accum))
    log.write('\n')

    # log.write(inspect.getsource(train_augment)+'\n')
    # log.write(inspect.getsource(valid_augment)+'\n')
    # log.write('\n')

    if 0:  # <debug> disabled: draws ground-truth boxes and contours per image
        for inputs, truth_boxes, truth_labels, truth_instances, indices in valid_loader:
            batch_size, C, H, W = inputs.size()
            print(batch_size)

            images = inputs.cpu().numpy()
            for b in range(batch_size):
                image = (images[b].transpose((1, 2, 0)) * 255)
                image = np.clip(image.astype(np.float32) * 3, 0, 255)
                image1 = image.copy()

                truth_box = truth_boxes[b]
                truth_label = truth_labels[b]
                truth_instance = truth_instances[b]
                if truth_box is not None:
                    for box, label, instance in zip(truth_box, truth_label,
                                                    truth_instance):
                        x0, y0, x1, y1 = box.astype(np.int32)
                        cv2.rectangle(image, (x0, y0), (x1, y1), (0, 0, 255), 1)
                        print(label)

                        thresh = instance > 0.5
                        contour = thresh_to_inner_contour(thresh)
                        contour = contour.astype(np.float32) * 0.5

                        image1 = contour[:, :, np.newaxis] * np.array(
                            (0, 255, 0)) + (1 - contour[:, :, np.newaxis]) * image1

                    print('')
                    image_show('image', image)
                    image_show('image1', image1)
                    cv2.waitKey(0)

    ## start training here! ##############################################
    log.write('** start training here! **\n')
    log.write(' optimizer=%s\n' % str(optimizer))
    log.write(' momentum=%f\n' % optimizer.param_groups[0]['momentum'])
    log.write(' LR=%s\n\n' % str(LR))

    log.write(' images_per_epoch = %d\n\n' % len(train_dataset))
    log.write(
        ' rate iter epoch num | valid_loss | train_loss | batch_loss | time \n'
    )
    log.write(
        '------------------------------------------------------------------------------------------------------------------------------------------------------------------\n'
    )

    def _stats_row(rate, i, epoch, num_products, valid_loss, train_loss,
                   batch_loss):
        """Return the numeric fields shared by the log line and the console line."""
        row = [rate, i / 1000, epoch, num_products / 1000000]
        for losses in (valid_loss, train_loss, batch_loss):
            row.extend(losses[k] for k in range(6))
        return tuple(row)

    train_loss = np.zeros(6, np.float32)
    train_acc = 0.0
    valid_loss = np.zeros(6, np.float32)
    valid_acc = 0.0
    batch_loss = np.zeros(6, np.float32)
    batch_acc = 0.0
    rate = 0

    start = time.time()
    j = 0
    i = 0
    stop_training = False  # set when the LR schedule returns a negative rate

    while i < num_iters and not stop_training:  # loop over the dataset multiple times
        sum_train_loss = np.zeros(6, np.float32)
        sum_train_acc = 0.0
        num_batches = 0

        net.set_mode('train')
        optimizer.zero_grad()
        for inputs, truth_boxes, truth_labels, truth_instances, indices in train_loader:
            batch_size = len(indices)
            i = j / iter_accum + start_iter
            epoch = (i - start_iter) * batch_size * iter_accum / len(
                train_dataset) + start_epoch
            num_products = epoch * len(train_dataset)

            if i % iter_valid == 0:
                net.set_mode('valid')
                valid_loss, valid_acc = evaluate(net, valid_loader)
                net.set_mode('train')

                print('\r', end='', flush=True)
                log.write(
                    '%0.4f %5.1f k %6.2f %4.1f m | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %s\n'
                    % (_stats_row(rate, i, epoch, num_products, valid_loss,
                                  train_loss, batch_loss) +
                       (time_to_str((time.time() - start) / 60), )))
                time.sleep(0.01)

            # if 1:
            if i in iter_save:
                torch.save(net.state_dict(),
                           out_dir_str + '/checkpoint/%08d_model.pth' % (i))
                torch.save(
                    {
                        'optimizer': optimizer.state_dict(),
                        'iter': i,
                        'epoch': epoch,
                    }, out_dir_str + '/checkpoint/%08d_optimizer.pth' % (i))

            # learning rate schduler -------------
            if LR is not None:
                lr = LR.get_rate(i)
                if lr < 0:
                    # Leave both loops; a bare break would only exit the
                    # inner loop and the outer one would never terminate.
                    stop_training = True
                    break
                adjust_learning_rate(optimizer, lr / iter_accum)
            rate = get_learning_rate(optimizer)[0] * iter_accum

            # one iteration update -------------
            inputs = Variable(inputs).cuda()
            net(inputs, truth_boxes, truth_labels, truth_instances)
            loss = net.loss(inputs, truth_boxes, truth_labels, truth_instances)

            if 1:  # <debug>
                debug_and_draw(net,
                               inputs,
                               truth_boxes,
                               truth_labels,
                               truth_instances,
                               mode='test')

            # masks = (probs>0.5).float()
            # acc = dice_loss(masks, labels)

            # accumulated update
            loss.backward()
            if j % iter_accum == 0:
                # torch.nn.utils.clip_grad_norm(net.parameters(), 1)
                optimizer.step()
                optimizer.zero_grad()

            # print statistics ------------
            batch_acc = 0  # acc[0][0]
            batch_loss = np.array((
                loss.cpu().data.numpy()[0],
                net.rpn_cls_loss.cpu().data.numpy()[0],
                net.rpn_reg_loss.cpu().data.numpy()[0],
                net.rcnn_cls_loss.cpu().data.numpy()[0],
                net.rcnn_reg_loss.cpu().data.numpy()[0],
                net.mask_cls_loss.cpu().data.numpy()[0],
            ))
            sum_train_loss += batch_loss
            sum_train_acc += batch_acc
            num_batches += 1
            if i % iter_smooth == 0:
                train_loss = sum_train_loss / num_batches
                train_acc = sum_train_acc / num_batches
                sum_train_loss = np.zeros(6, np.float32)
                sum_train_acc = 0.
                num_batches = 0

            print(
                '\r%0.4f %5.1f k %6.2f %4.1f m | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %s %d,%d,%s'
                % (_stats_row(rate, i, epoch, num_products, valid_loss,
                              train_loss, batch_loss) +
                   (time_to_str((time.time() - start) / 60), i, j,
                    str(inputs.size()))),
                end='',
                flush=True)
            j = j + 1
        # -- end of one data loader --
    # -- end of all iterations --

    if 1:  # save last
        torch.save(net.state_dict(),
                   out_dir_str + '/checkpoint/%d_model.pth' % (i))
        torch.save(
            {
                'optimizer': optimizer.state_dict(),
                'iter': i,
                'epoch': epoch,
            }, out_dir_str + '/checkpoint/%d_optimizer.pth' % (i))

    log.write('\n')
def test_set_sample_rate_hertz():
    """Smoke test: ``set_sample_rate_hertz`` accepts ``16000`` without raising."""
    from model.configuration import Configuration

    Configuration().set_sample_rate_hertz(16000)
class IOHandler(object):
    """Read and write the results CSV, HTML diagnostics, queue file and hypothesis files."""

    _result_path = ''
    _result_file_name = 'results.csv'
    _csv_header = 'AUDIO_FILE, MODEL, ENHANCED, LANGUAGE, ALTERNATIVE_LANGS, PHRASE_HINTS_APPLIED, BOOST, REF_WORD_COUNT, REF_ERROR_COUNT , WER,STEMMING_APPLIED , STOP_WORDS_REMOVED, NUMBER_TO_WORD_CONVERSION, CONTRACTIONS_EXPANDED, INSERTIONS, DELETIONS, SUBSTITUTIONS, DELETED_WORDS, INSERTED_WORDS, SUBSTITUTE_WORDS\n'
    _csv_header_written = False
    configuration = Configuration()
    nlp_model = NLPModel()
    _queue_file_name = 'queue.txt'

    def set_queue_file_name(self, name):
        """Set the name of the queue file used by the queue read/write methods."""
        self._queue_file_name = name

    def get_queue_file_name(self):
        """Return the name of the queue file."""
        return self._queue_file_name

    def set_result_path(self, result_path):
        """Set the directory that result files are written into."""
        self._result_path = result_path

    def get_result_path(self):
        """Return the directory that result files are written into."""
        return self._result_path

    def write_csv_header(self):
        """Create the results CSV (and its directory) and write the header once.

        Failures to open or write the file are reported on stdout; the header
        is only marked as written when the write succeeded, so a later call
        retries.
        """
        import os

        if self._csv_header_written:
            return

        result_path = self.get_result_path()
        full_path = f'{result_path}/{self._result_file_name}'
        # exist_ok avoids the check-then-create race.
        os.makedirs(result_path, exist_ok=True)

        try:
            with open(full_path, 'w') as file:
                file.write(self._csv_header)
        # FileNotFoundError is a subclass of IOError, so it has to be listed
        # first to be reachable.
        except FileNotFoundError as x:
            print(f'Can not find csv file: {x}')
        except IOError as i:
            print(f'Can not write csv header: {i}')
        else:
            self._csv_header_written = True

    def update_csv(self,
                   uri,
                   configuration,
                   nlp_model,
                   word_count_list=None,
                   ref_total_word_count=0,
                   ref_error_count=0,
                   word_error_rate=0,
                   ins=0,
                   deletions=0,
                   subs=0):
        """Append one result row to the results CSV.

        Args:
            uri: Audio file identifier written in the first column.
            configuration: Provides model, enhanced flag, language code,
                alternative language codes, phrases and boost.
            nlp_model: Provides the stemming, stop-word, number-to-word and
                contraction flags.
            word_count_list: Optional sequence whose first three items are
                word-to-count mappings for deleted, inserted and substituted
                words; each is written sorted by count.
            ref_total_word_count, ref_error_count, word_error_rate, ins,
            deletions, subs: Values written verbatim into the row.
        """
        import logging
        logging.basicConfig(filename='wer_app.log')
        logger = logging.getLogger(__name__)
        from collections import OrderedDict

        # Deleted, inserted and substituted words, in that order.
        word_dicts = [dict(), dict(), dict()]
        if word_count_list:
            try:
                word_dicts = [
                    OrderedDict(
                        sorted(word_count_list[index].items(),
                               key=lambda x: x[1])) for index in range(3)
                ]
            except TypeError as t:
                string = f'{t}'
                logger.debug(string)
                print(string)
                word_dicts = [None, None, None]

        deleted_words, inserted_words, substitute_words = (
            ''.join(f'{k}:{v}, ' for k, v in words.items()) if words else ''
            for words in word_dicts)

        full_path = f'{self.get_result_path()}/{self._result_file_name}'
        alts = ''.join(
            item + ' '
            for item in configuration.get_alternative_language_codes())

        string = f'{uri}, {configuration.get_model()}, {configuration.get_use_enhanced()}, {configuration.get_language_code()},' \
                 f'{alts}, {bool(configuration.get_phrases())},' \
                 f'{configuration.get_boost()}, {ref_total_word_count}, {ref_error_count}, {word_error_rate}, {nlp_model.get_apply_stemming()},' \
                 f'{nlp_model.get_remove_stop_words()}, {nlp_model.get_n2w()}, {nlp_model.get_expand_contractions()}, {ins}, {deletions}, {subs}, ' \
                 f'{deleted_words}, {inserted_words}, {substitute_words}\n'

        with open(full_path, 'a+') as file:
            try:
                file.write(string)
            except IOError as i:
                print(f'Can not update csv file: {i}')
        print(f'UPDATED: {full_path}')

    def write_html_diagnostic(self, wer_obj, unique_root, result_path):
        """Write ``wer_obj.aligned_htmls`` joined by ``<br>`` to ``<result_path>/<unique_root>.html``."""
        aligned_html = '<br>'.join(wer_obj.aligned_htmls)
        result_file = unique_root + '.html'
        write_path = f'{result_path}/{result_file}'
        with open(write_path, 'w') as f:
            try:
                f.write(aligned_html)
            except IOError as i:
                print(f'Can not write html diagnostic {write_path}: {i}')
            else:
                # Only report success when the write actually happened.
                print(f'WROTE: diagnostic file: {write_path} ')

    def write_queue_file(self, data):
        """Append items to the queue file, each followed by a comma.

        Args:
            data: Either a string, which is split on whitespace, or an
                iterable of strings.
        """
        try:
            with open(self._queue_file_name, 'a+') as f:
                info = data.split() if isinstance(data, str) else data
                f.writelines(item + ',' for item in info)
        except IOError as e:
            print(f'Can not write queue file: {e}')

    def read_queue_file(self):
        """Return the full contents of the queue file.

        Raises:
            IOError: If the file is missing, unreadable or empty.
        """
        result = None
        try:
            with open(self._queue_file_name, 'r') as f:
                result = f.read()
        # FileNotFoundError is a subclass of IOError, so it has to be listed
        # first to be reachable.
        except FileNotFoundError as x:
            print(f'Queue file not found: {x}')
        except IOError as e:
            print(f'Can not read queue file: {e}')
        if not result:
            raise IOError('No contents found in queue')
        return result

    def write_hyp(self, file_name, text):
        """Write ``text`` to ``file_name`` inside the result directory, creating the directory if needed."""
        import os

        result_path = self.get_result_path()
        os.makedirs(result_path, exist_ok=True)
        p = f'{result_path}/{file_name}'
        with open(p, 'w+') as f:
            f.write(text)
def test_set_alternative_language_codes():
    """Smoke test: ``set_alternative_language_codes`` accepts a list of codes without raising."""
    from model.configuration import Configuration

    language_codes = ['gu-IN', 'ru-RU']
    Configuration().set_alternative_language_codes(language_codes)