def compute_accurate_group_status(cls):
    from Dashboard.models import LANGUAGE_CODES_AND_NAMES

    # Collect completed 'TGT' annotations, keyed by annotator.
    user_status = defaultdict(list)
    qs = cls.objects.filter(completed=True)

    value_names = ('createdBy', 'item__itemType', 'task__id')
    for result in qs.values_list(*value_names):
        if result[1].lower() != 'tgt':
            continue
        annotatorID = result[0]
        taskID = result[2]
        user_status[annotatorID].append(taskID)

    # Group annotators by their non-language group memberships.
    group_status = defaultdict(list)
    for annotatorID in user_status:
        user = User.objects.get(pk=annotatorID)
        usergroups = ';'.join([
            x.name for x in user.groups.all()
            if x.name not in LANGUAGE_CODES_AND_NAMES.keys()
        ])
        if not usergroups:
            usergroups = 'NoGroupInfo'
        group_status[usergroups].extend(user_status[annotatorID])

    # A task counts as completed for a group once it has collected at
    # least 70 'TGT' annotations.
    group_hits = {}
    for group_name in group_status:
        task_ids = set(group_status[group_name])
        completed_tasks = 0
        for task_id in task_ids:
            if group_status[group_name].count(task_id) >= 70:
                completed_tasks += 1
        group_hits[group_name] = (completed_tasks, len(task_ids))

    return group_hits
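# A minimal reporting sketch for the group status returned above: the dict
# maps a group name to (tasks_with_at_least_70_annotations, total_tasks).
# The helper name and output format are illustrative only, not part of the
# existing Appraise API.
def _print_group_status_sketch(group_hits):
    for group_name, (completed, total) in sorted(group_hits.items()):
        print('{0}: {1}/{2} tasks fully annotated'.format(
            group_name, completed, total))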
def dump_all_results_to_csv_file(cls, csv_file):
    from Dashboard.models import LANGUAGE_CODES_AND_NAMES

    system_scores = defaultdict(list)
    user_data = {}
    qs = cls.objects.filter(completed=True)

    value_names = (
        'item__target1ID', 'score1', 'item__target2ID', 'score2',
        'start_time', 'end_time', 'createdBy', 'item__itemID',
        'item__metadata__market__sourceLanguageCode',
        'item__metadata__market__targetLanguageCode',
        'item__metadata__market__domainName', 'item__itemType',
        'task__id', 'task__campaign__campaignName'
    )
    for result in qs.values_list(*value_names):
        system1ID = result[0]
        score1 = result[1]
        system2ID = result[2]
        score2 = result[3]
        start_time = result[4]
        end_time = result[5]
        duration = round(float(end_time) - float(start_time), 1)
        annotatorID = result[6]
        segmentID = result[7]
        marketID = '{0}-{1}'.format(result[8], result[9])
        domainName = result[10]
        itemType = result[11]
        taskID = result[12]
        campaignName = result[13]

        # Cache user lookups, as each annotator appears in many results.
        if annotatorID in user_data:
            username = user_data[annotatorID][0]
            useremail = user_data[annotatorID][1]
            usergroups = user_data[annotatorID][2]
        else:
            user = User.objects.get(pk=annotatorID)
            username = user.username
            useremail = user.email
            usergroups = ';'.join([
                x.name for x in user.groups.all()
                if x.name not in LANGUAGE_CODES_AND_NAMES.keys()
            ])
            if not usergroups:
                usergroups = 'NoGroupInfo'
            user_data[annotatorID] = (username, useremail, usergroups)

        system_scores[marketID + '-' + domainName].append(
            (taskID, segmentID, username, useremail, usergroups, system1ID,
             score1, system2ID, score2, start_time, end_time, duration,
             itemType, campaignName))

    # TODO: this is very opaque... and needs to be fixed!
    csv_rows = [
        'taskID,segmentID,username,email,groups,system1ID,score1,'
        'system2ID,score2,startTime,endTime,durationInSeconds,'
        'itemType,campaignName'
    ]
    for market_domain in system_scores:
        for row in system_scores[market_domain]:
            csv_rows.append(','.join([str(value) for value in row]))

    from os.path import join
    from Appraise.settings import BASE_DIR
    media_file_path = join(BASE_DIR, 'media', csv_file)
    with open(media_file_path, 'w') as outfile:
        for row in csv_rows:
            outfile.write(row)
            outfile.write('\n')
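# A minimal sketch of a safer way to write the rows collected above,
# assuming `header` is the list of column names and `rows` the tuples
# stored in system_scores. The hand-rolled ','.join() breaks as soon as a
# username, group name or campaign name contains a comma; Python's csv
# module handles the quoting. The helper name and signature are
# illustrative only, not part of the existing Appraise API.
def _write_rows_as_csv_sketch(path, header, rows):
    import csv
    with open(path, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        for row in rows:
            writer.writerow([str(value) for value in row])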
def marketTargetLanguageCode(self):
    tokens = str(self.items.first().metadata.market).split('_')
    if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES.keys():
        return tokens[1]
    return None
def marketSourceLanguage(self):
    tokens = str(self.items.first().metadata.market).split('_')
    if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES.keys():
        return LANGUAGE_CODES_AND_NAMES[tokens[0]]
    return None
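# A minimal sketch of a shared helper for the two accessors above, assuming
# that market identifiers follow the three-token 'source_target_domain'
# pattern implied by the len(tokens) == 3 checks. The helper name is
# illustrative only, not part of the existing model API.
def _parse_market_codes_sketch(market):
    """Return (source_code, target_code), or (None, None) if the market
    identifier does not parse as 'source_target_domain'."""
    tokens = str(market).split('_')
    if len(tokens) == 3 \
            and tokens[0] in LANGUAGE_CODES_AND_NAMES \
            and tokens[1] in LANGUAGE_CODES_AND_NAMES:
        return tokens[0], tokens[1]
    return None, None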
def handle(self, *args, **options):
    # Validate source and target language codes
    _all = list(set([x.lower() for x in LANGUAGE_CODES_AND_NAMES.keys()]))
    _all.sort()

    _src = options['source_language'].lower()
    if _src not in _all:
        self.stdout.write('Unknown source language: {0}!'.format(_src))
        self.stdout.write('Known languages: {0}'.format(', '.join(_all)))
        return

    _tgt = options['target_language'].lower()
    if _tgt not in _all:
        self.stdout.write('Unknown target language: {0}!'.format(_tgt))
        self.stdout.write('Known languages: {0}'.format(', '.join(_all)))
        return

    # Initialize random number generator
    # Extract batch size number of pairs, randomizing order if requested
    # Serialize pairs into JSON format
    # Write out JSON output file
    batch_size = options['batch_size']
    block_size = 10
    block_annotations = 7
    block_redundants = 1
    block_references = 1
    block_badrefs = 1

    # IF BLOCK DEF IS GIVEN, DO SOMETHING WITH IT
    if options['block_definition'] is not None:
        print("WOOHOO")

    if (batch_size % block_size) > 0:
        self.stdout.write('Batch size needs to be divisible by block size!')
        return

    # CHECK THAT WE END UP WITH EVEN NUMBER OF BLOCKS
    print('We will create {0} blocks'.format(int(batch_size / block_size)))

    # TODO: add parameter to set encoding
    # TODO: need to use OrderedDict to preserve segment IDs' order!
    source_file = Command._load_text_from_file(
        options['source_file'], 'utf8')
    print('Loaded {0} source segments'.format(len(source_file.keys())))

    reference_file = Command._load_text_from_file(
        options['reference_file'], 'utf8')
    print('Loaded {0} reference segments'.format(
        len(reference_file.keys())))

    systems_files = []
    systems_path = options['systems_path']
    from glob import iglob
    import os.path
    for system_file in iglob('{0}{1}{2}'.format(
            systems_path, os.path.sep, "*.txt")):
        systems_files.append(system_file)

    random_seed_value = 123456

    systems_files.sort()
    seed(random_seed_value)
    shuffle(systems_files)  # ADD RANDOMIZED SHUFFLING HERE?
    import hashlib
    hashed_text = {}

    for system_path in systems_files:
        system_txt = Command._load_text_from_file(system_path, 'utf8')
        system_bad = Command._load_text_from_file(
            system_path.replace('.txt', '.bad'), 'utf8')
        system_ids = Command._load_text_from_file(
            system_path.replace('.txt', '.ids'), 'utf8')
        system_url = Command._load_text_from_file(
            system_path.replace('.txt', '.url'), 'utf8')

        for segment_id, segment_text in system_txt.items():
            md5hash = hashlib.new(
                'md5', segment_text.encode('utf8')).hexdigest()

            if md5hash not in hashed_text.keys():
                hashed_text[md5hash] = {
                    'segment_id': segment_id,
                    'segment_text': segment_text,
                    'segment_bad': system_bad[segment_id],
                    'segment_ref': reference_file[segment_id],
                    'segment_src': source_file[segment_id],
                    'segment_url': system_url[segment_id],
                    'systems': [os.path.basename(system_path)]
                }
            else:
                hashed_text[md5hash]['systems'].append(
                    os.path.basename(system_path))

        print('Loaded {0} system {1} segments'.format(
            len(system_txt.keys()), os.path.basename(system_path)))

    all_keys = list(hashed_text.keys())
    all_keys.sort()
    shuffle(all_keys)

    items_per_batch = 10 * 7

    missing_items = items_per_batch - len(all_keys) % items_per_batch
    print('Missing items is {0}/{1}'.format(missing_items, items_per_batch))

    all_keys.extend(all_keys[0:missing_items])
    print('Added {0} missing items rotating keys'.format(missing_items))

    total_batches = int(floor(len(all_keys) / items_per_batch))
    print('Total number of batches is {0}'.format(total_batches))

    batch_no = options['batch_no']
    all_batches = options['all_batches']
    source_based = options['source_based']

    # If we don't produce all batches, our batch_id will be batch_no-1.
    # This is because batch numbers are one-based, ids zero-indexed.
    #
    # If we produce all batches, we just use range(total_batches).
    # This implicitly gives us zero-indexed ids already.
    batch_nos = [batch_no - 1] if not all_batches \
        else list(range(total_batches))

    json_data = []
    for batch_id in batch_nos:  # range(batch_no):
        block_data = []
        block_offset = batch_id * 10 * 7

        num_blocks = int(batch_size / block_size)
        for block_id in range(num_blocks):
            # Human readable ids are one-based, hence +1
            print('Creating batch {0:05}/{1:05}, block {2:02}'.format(
                batch_id + 1, total_batches, block_id + 1))

            # Get 7 random system outputs
            block_start = block_offset + 7 * (block_id)
            block_end = block_start + 7
            block_hashes = all_keys[block_start:block_end]

            current_block = {'systems': block_hashes}
            block_data.append(current_block)

        # Compute redundant, reference, bad reference bits
        for block_id in range(num_blocks):
            check_id = int((block_id + (num_blocks / 2)) % num_blocks)

            # Human readable ids are one-based, hence +1
            print('Add checks for batch {0:05}/{1:05}, '
                  'block {2:02} to block {3:02}'.format(
                      batch_id + 1, total_batches, check_id + 1,
                      block_id + 1))

            check_systems = block_data[check_id]['systems']
            check_systems.sort()
            shuffle(check_systems)

            block_data[block_id]['redundant'] = check_systems[0]
            block_data[block_id]['reference'] = check_systems[1]
            block_data[block_id]['badref'] = check_systems[2]

        # Direct assessment is reference-based for WMT17
        sourceID = basename(options['reference_file'])

        # Remember, batch numbers are one-based
        taskData = OrderedDict({
            'batchNo': batch_id + 1,
            'batchSize': options['batch_size'],
            'sourceLanguage': options['source_language'],
            'targetLanguage': options['target_language'],
            'requiredAnnotations': 1,
            'randomSeed': random_seed_value
        })

        itemsData = []
        _item = 0

        for block_id in range(num_blocks):
            all_items = [(x, 'TGT')
                         for x in block_data[block_id]['systems']]
            all_items.append((block_data[block_id]['redundant'], 'CHK'))
            all_items.append((block_data[block_id]['reference'], 'REF'))
            all_items.append((block_data[block_id]['badref'], 'BAD'))
            shuffle(all_items)

            for current_item, current_type in all_items:
                item_data = hashed_text[current_item]

                item_id = item_data['segment_id']
                item_text = item_data['segment_text']
                item_bad = item_data['segment_bad']
                item_ref = item_data['segment_ref']
                item_src = item_data['segment_src']
                item_url = item_data['segment_url']
                item_systems = item_data['systems']

                targetID = '+'.join(set(item_systems))
                targetText = item_text
                if current_type == 'REF':
                    targetID = basename(options['reference_file'])
                    targetText = item_ref
                elif current_type == 'BAD':
                    targetText = item_bad

                obj = OrderedDict()
                obj['_item'] = _item
                obj['_block'] = block_id + (10 * batch_id)
                obj['sourceID'] = sourceID
                obj['sourceText'] = item_ref if not source_based \
                    else item_src
                obj['targetID'] = targetID
                obj['targetText'] = targetText
                obj['itemID'] = item_id
                obj['itemType'] = current_type
                obj['imageURL'] = item_url

                itemsData.append(obj)
                _item += 1

        outputData = OrderedDict({'task': taskData, 'items': itemsData})
        json_data.append(outputData)

    print(json.dumps(json_data, indent=2))
    json_data = json.dumps(json_data, indent=2)

    with open(options['output_json_file'], mode='w',
              encoding='utf8') as output_file:
        self.stdout.write('Creating {0} ... '.format(
            options['output_json_file']), ending='')
        output_file.write(str(json_data))
        self.stdout.write('OK')
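# A quick sanity-check sketch of the batch layout produced by handle()
# above, assuming the default batch_size of 100: each batch holds 10
# blocks, and every block shows 7 genuine system outputs ('TGT') plus one
# repeated item ('CHK'), one reference ('REF') and one bad reference
# ('BAD') drawn from the block opposite it, so 70 unique hashes feed 100
# annotation items per batch. The function name is illustrative only.
def _check_batch_layout_sketch(batch_size=100, block_size=10,
                               block_annotations=7):
    num_blocks = batch_size // block_size
    unique_hashes = num_blocks * block_annotations       # 70 with the defaults
    items_shown = num_blocks * (block_annotations + 3)   # + CHK, REF, BAD per block
    assert items_shown == batch_size
    return unique_hashes, items_shown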
def handle(self, *args, **options):
    # Validate source and target language codes
    _all = list(set([x.lower() for x in LANGUAGE_CODES_AND_NAMES.keys()]))
    _all.sort()

    _src = options['source_language'].lower()
    if _src not in _all:
        self.stdout.write('Unknown source language: {0}!'.format(_src))
        self.stdout.write('Known languages: {0}'.format(', '.join(_all)))
        return

    _tgt = options['target_language'].lower()
    if _tgt not in _all:
        self.stdout.write('Unknown target language: {0}!'.format(_tgt))
        self.stdout.write('Known languages: {0}'.format(', '.join(_all)))
        return

    # Initialize random number generator
    # Extract batch size number of pairs, randomizing order if requested
    # Serialize pairs into JSON format
    # Write out JSON output file
    batch_size = options['batch_size']
    unicode_enc = options['unicode']
    use_local_src = options['local_src']
    use_local_ref = options['local_ref']
    create_ids = options['create_ids']
    source_based = options['source_based']

    block_size = 10
    block_annotations = 7
    block_redundants = 1
    block_references = 1
    block_badrefs = 1

    # IF BLOCK DEF IS GIVEN, DO SOMETHING WITH IT
    if options['block_definition'] is not None:
        print("WOOHOO")

    if (batch_size % block_size) > 0:
        self.stdout.write('Batch size needs to be divisible by block size!')
        return

    # CHECK THAT WE END UP WITH EVEN NUMBER OF BLOCKS
    print('We will create {0} blocks'.format(int(batch_size / block_size)))

    # TODO: add parameter to set encoding
    # TODO: need to use OrderedDict to preserve segment IDs' order!
    encoding = 'utf16' if unicode_enc else 'utf8'

    source_file = []
    if not use_local_src:
        source_file = Command._load_text_from_file(
            options['source_file'], encoding)
        print('Loaded {0} source segments'.format(len(source_file.keys())))

    reference_file = []
    if not use_local_ref:
        reference_file = Command._load_text_from_file(
            options['reference_file'], encoding)
        print('Loaded {0} reference segments'.format(
            len(reference_file.keys())))

    systems_files = []
    systems_path = options['systems_path']
    from glob import iglob
    import os.path
    for system_file in iglob('{0}{1}{2}'.format(
            systems_path, os.path.sep, "*.txt")):
        if '+' in basename(system_file):
            print('Cannot use system files with + in names '
                  'as this breaks multi-system meta systems:\n'
                  '{0}'.format(system_file))
            sys_exit(-1)
        systems_files.append(system_file)

    random_seed_value = 123456

    systems_files.sort()
    seed(random_seed_value)
    shuffle(systems_files)  # ADD RANDOMIZED SHUFFLING HERE?

    import hashlib
    hashed_text = {}
    hashes_by_ids = defaultdict(list)

    character_based = _tgt == 'zho' or _tgt == 'jpn' \
        or options['character_based']

    for system_path in systems_files:
        system_txt = Command._load_text_from_file(system_path, encoding)

        # Generate bad references on the fly
        #
        # To do so, we will load a random source segment to fill in a
        # randomly positioned phrase in the given candidate translation.
        #
        # system_bad = Command._load_text_from_file(
        #     system_path.replace('.txt', '.bad'), encoding)

        if not create_ids:
            system_ids = Command._load_text_from_file(
                system_path.replace('.txt', '.ids'), encoding)
        else:
            system_ids = [x + 1 for x in range(len(system_txt))]

        # BASICALLY: add support for local system_src and system_ref files
        # here. If such files are present, this will overwrite the global
        # src/ref values. However, this does not fully resolve the issue as
        # we still have to give a source text file, which is assumed to be
        # shared...
        #
        # IN A SENSE, using these local files makes better sense. It is
        # wasteful, though. MAYBE, it is better to simply generate a simple
        # JSON config file?!
        local_src = []
        local_ref = []

        if use_local_src:
            local_src_path = system_path.replace('.txt', '.src')
            if os.path.exists(local_src_path):
                local_src = Command._load_text_from_file(
                    local_src_path, encoding)

        if use_local_ref:
            local_ref_path = system_path.replace('.txt', '.ref')
            if os.path.exists(local_ref_path):
                local_ref = Command._load_text_from_file(
                    local_ref_path, encoding)

        for segment_id, segment_text in system_txt.items():
            _src = local_src[segment_id] if use_local_src \
                else source_file[segment_id]
            _ref = local_ref[segment_id] if use_local_ref \
                else reference_file[segment_id]

            md5hash = hashlib.new(
                'md5',
                segment_text.encode(encoding) + _src.encode(encoding)
                + _ref.encode(encoding)).hexdigest()

            # Determine length of bad phrase, relative to segment length
            #
            # This follows WMT17:
            # - http://statmt.org/wmt17/pdf/WMT17.pdf
            #
            # (A standalone sketch of this length rule follows after
            # handle() below.)
            _bad_len = 1
            _tokens = segment_text \
                if character_based \
                else segment_text.split(' ')
            if len(_tokens) == 1:
                _bad_len = 1
            elif len(_tokens) > 1 and len(_tokens) <= 5:
                _bad_len = 2
            elif len(_tokens) > 5 and len(_tokens) <= 8:
                _bad_len = 3
            elif len(_tokens) > 8 and len(_tokens) <= 15:
                _bad_len = 4
            elif len(_tokens) > 15 and len(_tokens) <= 20:
                _bad_len = 5
            else:
                _bad_len = len(_tokens) // 4

            # If dealing with Chinese or Japanese, use double the amount
            # of characters for the bad replacement phrase.
            if character_based:
                _bad_len = 2 * _bad_len

            # Choose random src/ref segment
            _bad_tokens = []
            while len(_bad_tokens) <= _bad_len:
                _bad_id = randrange(0, len(local_ref)) + 1 \
                    if use_local_ref \
                    else randrange(0, len(reference_file)) + 1
                if source_based:
                    _bad_id = randrange(0, len(local_src)) + 1 \
                        if use_local_src \
                        else randrange(0, len(source_file)) + 1

                _bad_text = None
                # if source_based:
                #     _bad_text = local_src[_bad_id] if use_local_src else source_file[_bad_id]
                # else:
                #
                # We are currently forcing reference-based bad reference
                # generation. If no reference is available, then a copy
                # of the source file will work just fine.
                #
                if True:
                    _bad_text = local_ref[_bad_id] \
                        if use_local_ref else reference_file[_bad_id]

                _bad_tokens = _bad_text \
                    if character_based \
                    else _bad_text.split(' ')

            _bad_phrase = None
            _index = randrange(0, len(_bad_tokens) - _bad_len) \
                if len(_bad_tokens) - _bad_len > 0 else 0
            _bad_phrase = _bad_tokens[_index:_index + _bad_len]

            _index = randrange(0, len(_tokens) - _bad_len) \
                if len(_tokens) - _bad_len > 0 else 0
            _bad = _tokens[:_index] + _bad_phrase \
                + _tokens[_index + _bad_len:]

            segment_bad = ''.join(_bad) \
                if character_based \
                else ' '.join(_bad)

            if md5hash not in hashed_text.keys():
                hashed_text[md5hash] = {
                    'segment_id': segment_id,
                    'segment_text': segment_text,
                    'segment_bad': segment_bad,
                    'segment_ref': _ref,
                    'segment_src': _src,
                    'systems': [os.path.basename(system_path)]
                }
                hashes_by_ids[segment_id].append(md5hash)
            else:
                hashed_text[md5hash]['systems'].append(
                    os.path.basename(system_path))

        print('Loaded {0} system {1} segments'.format(
            len(system_txt.keys()), os.path.basename(system_path)))

    # Dump deduplicated segment data to JSON file.
    json_data = json.dumps(hashed_text, indent=2, sort_keys=True)
    with open(options['output_json_file'] + '.segments', mode='w',
              encoding='utf8') as output_file:
        self.stdout.write('Creating {0} ... '.format(
            options['output_json_file'] + '.segments'), ending='')
        output_file.write(str(json_data))
        self.stdout.write('OK')

    all_keys = list(hashed_text.keys())
    all_keys.sort()
    shuffle(all_keys)

    # If --full-coverage is specified, we want to collect annotations for
    # all unique translations for any given segment ID.
    # To do so, we loop over the all_keys list and for each MD5 hash we
    # have not consumed, we add not only the MD5 hash itself but also all
    # other MD5 hashes matching the respective segment ID.
    full_coverage = options['full_coverage']
    if full_coverage:
        _sorted_keys = []
        for key in all_keys:
            if key not in _sorted_keys:
                segment_id = hashed_text[key]['segment_id']
                matching_keys = hashes_by_ids[segment_id]
                matching_keys.sort()
                _sorted_keys.extend(matching_keys)
        all_keys = _sorted_keys

    items_per_batch = 10 * 7

    missing_items = items_per_batch - len(all_keys) % items_per_batch
    print('Missing items is {0}/{1}'.format(missing_items, items_per_batch))

    all_keys.extend(all_keys[0:missing_items])
    print('Added {0} missing items rotating keys'.format(missing_items))

    total_batches = int(floor(len(all_keys) / items_per_batch))
    print('Total number of batches is {0}'.format(total_batches))

    batch_no = options['batch_no']
    max_batches = options['max_batches']
    all_batches = options['all_batches']

    # If we don't produce all batches, our batch_id will be batch_no-1.
    # This is because batch numbers are one-based, ids zero-indexed.
    #
    # If we produce all batches, we just use range(total_batches).
    # This implicitly gives us zero-indexed ids already.
    batch_nos = [batch_no - 1] if not all_batches \
        else list(range(total_batches))

    if max_batches:
        batch_nos = batch_nos[:max_batches]

    json_data = []
    for batch_id in batch_nos:  # range(batch_no):
        block_data = []
        block_offset = batch_id * 10 * 7

        num_blocks = int(batch_size / block_size)
        for block_id in range(num_blocks):
            # Human readable ids are one-based, hence +1
            print('Creating batch {0:05}/{1:05}, block {2:02}'.format(
                batch_id + 1, total_batches, block_id + 1))

            # Get 7 random system outputs
            block_start = block_offset + 7 * (block_id)
            block_end = block_start + 7
            block_hashes = all_keys[block_start:block_end]

            current_block = {'systems': block_hashes}
            block_data.append(current_block)

        # Compute redundant, reference, bad reference bits
        for block_id in range(num_blocks):
            check_id = int((block_id + (num_blocks / 2)) % num_blocks)

            # Human readable ids are one-based, hence +1
            print('Add checks for batch {0:05}/{1:05}, '
                  'block {2:02} to block {3:02}'.format(
                      batch_id + 1, total_batches, check_id + 1,
                      block_id + 1))

            check_systems = block_data[check_id]['systems']
            check_systems.sort()
            shuffle(check_systems)

            block_data[block_id]['redundant'] = check_systems[0]
            block_data[block_id]['reference'] = check_systems[1]
            block_data[block_id]['badref'] = check_systems[2]

        # Direct assessment is reference-based for WMT17
        if source_based:
            sourceID = 'LOCAL_SRC' if use_local_src \
                else basename(options['source_file'])
        else:
            sourceID = 'LOCAL_REF' if use_local_ref \
                else basename(options['reference_file'])

        # Remember, batch numbers are one-based
        taskData = OrderedDict({
            'batchNo': batch_id + 1,
            'batchSize': options['batch_size'],
            'sourceLanguage': options['source_language'],
            'targetLanguage': options['target_language'],
            'requiredAnnotations': options['required_annotations'],
            'randomSeed': random_seed_value
        })

        itemsData = []
        _item = 0

        for block_id in range(num_blocks):
            all_items = [(x, 'TGT')
                         for x in block_data[block_id]['systems']]
            all_items.append((block_data[block_id]['redundant'], 'CHK'))
            all_items.append((block_data[block_id]['reference'], 'REF'))
            all_items.append((block_data[block_id]['badref'], 'BAD'))
            shuffle(all_items)

            for current_item, current_type in all_items:
                item_data = hashed_text[current_item]

                item_id = item_data['segment_id']
                item_text = item_data['segment_text']
                item_bad = item_data['segment_bad']
                item_ref = item_data['segment_ref']
                item_src = item_data['segment_src']
                item_systems = item_data['systems']

                targetID = '+'.join(sorted(set(item_systems)))
                targetText = item_text
                if current_type == 'REF':
                    targetID = basename(options['reference_file'])
                    targetText = item_ref
                elif current_type == 'BAD':
                    targetText = item_bad

                obj = OrderedDict()
                obj['_item'] = _item
                obj['_block'] = block_id + (10 * batch_id)
                obj['sourceID'] = sourceID
                obj['sourceText'] = item_ref if not source_based \
                    else item_src
                obj['targetID'] = targetID
                obj['targetText'] = targetText
                obj['itemID'] = item_id
                obj['itemType'] = current_type

                itemsData.append(obj)
                _item += 1

        outputData = OrderedDict({'task': taskData, 'items': itemsData})
        json_data.append(outputData)

    json_data = json.dumps(json_data, indent=2, sort_keys=True)
    print(json_data)

    with open(options['output_json_file'], mode='w',
              encoding='utf8') as output_file:
        self.stdout.write('Creating {0} ... '.format(
            options['output_json_file']), ending='')
        output_file.write(str(json_data))
        self.stdout.write('OK')
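# A standalone sketch of the bad-reference length rule used inside handle()
# above (the WMT17-style schedule referenced there): the number of replaced
# tokens grows with segment length and is doubled for character-based
# target languages. The helper name is illustrative only, not part of the
# existing command.
def _bad_phrase_length_sketch(num_tokens, character_based=False):
    if num_tokens <= 1:
        length = 1
    elif num_tokens <= 5:
        length = 2
    elif num_tokens <= 8:
        length = 3
    elif num_tokens <= 15:
        length = 4
    elif num_tokens <= 20:
        length = 5
    else:
        length = num_tokens // 4
    return 2 * length if character_based else length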