class TextSegment(EvalItem):
    """
    Models a single text segment.
    """
    segmentID = models.CharField(
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Segment ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH))
    )

    segmentText = models.TextField(
        max_length=MAX_SEGMENTTEXT_LENGTH,
        verbose_name=_('Segment text'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTTEXT_LENGTH))
    )

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextSegment instance, checking text.
        """
        if not isinstance(self.segmentText, str):
            return False

        _len = len(self.segmentText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        return super(TextSegment, self).is_valid()
class ObjectID(models.Model):
    """
    Encodes an object type and ID for retrieval.
    """
    typeName = models.CharField(
        db_index=True,
        max_length=MAX_TYPENAME_LENGTH,
        verbose_name=_('Type name'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_TYPENAME_LENGTH))
    )

    primaryID = models.CharField(
        db_index=True,
        max_length=MAX_PRIMARYID_LENGTH,
        verbose_name=_('Primary ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_PRIMARYID_LENGTH))
    )

    def get_object_instance(self):
        """
        Returns the actual object instance for the current ObjectID instance.
        """
        instance = None
        try:
            # TODO: add registry of type names to models.py and ensure only
            #   those are used for typeName. Furthermore, verify that the
            #   given primaryID does not contain ')'.
            _code = '{0}.objects.get(id={1})'.format(
                self.typeName, self.primaryID
            )

            # Hack for Python 3.5.2
            from EvalData.models import (
                DataAssessmentTask,
                DirectAssessmentTask,
                DirectAssessmentContextTask,
                DirectAssessmentDocumentTask,
                MultiModalAssessmentTask,
                PairwiseAssessmentTask,
            )
            instance = eval(_code)

        except Exception:
            _msg = 'ObjectID {0}.{1} invalid'.format(
                self.typeName, self.primaryID
            )
            LOGGER.warning(_msg)
            LOGGER.warning(format_exc())

        finally:
            return instance

    def __str__(self):
        return str(self.id) + '.' + self.typeName + '.' + self.primaryID
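# A minimal, hedged sketch of an eval()-free alternative to
# ObjectID.get_object_instance() above, addressing the TODO about a type
# registry. It uses Django's apps.get_model() API; the 'EvalData' app
# label is an assumption for illustration and would need to match the
# app that actually defines the referenced models.
def _resolve_object_id_sketch(type_name, primary_id):
    from django.apps import apps

    model_cls = apps.get_model('EvalData', type_name)  # assumed app label
    # filter().first() returns None for bad IDs instead of raising.
    return model_cls.objects.filter(id=primary_id).first()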
class TextPairWithImage(EvalItem):
    """
    Models a pair of two text segments and an image.
    """
    sourceID = models.CharField(
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Source ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH))
    )

    sourceText = models.CharField(
        max_length=MAX_SEGMENTTEXT_LENGTH,
        verbose_name=_('Source text'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTTEXT_LENGTH))
    )

    targetID = models.CharField(
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Target ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH))
    )

    targetText = models.CharField(
        max_length=MAX_SEGMENTTEXT_LENGTH,
        verbose_name=_('Target text'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTTEXT_LENGTH))
    )

    imageURL = models.URLField(
        verbose_name=_('Image URL')
    )

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextPairWithImage instance, checking text.
        """
        if not isinstance(self.sourceText, str):
            return False

        _len = len(self.sourceText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        if not isinstance(self.targetText, str):
            return False

        _len = len(self.targetText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        # This does not implement validation for image URLs yet.
        return super(TextPairWithImage, self).is_valid()
class TextPair(EvalItem):
    """
    Models a pair of two text segments.
    """
    sourceID = models.CharField(
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Source ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH))
    )

    sourceText = models.TextField(
        blank=True,
        verbose_name=_('Source text'),
    )

    targetID = models.CharField(
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Target ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH))
    )

    targetText = models.TextField(
        blank=True,
        verbose_name=_('Target text'),
    )

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextPair instance, checking text.
        """
        if not isinstance(self.sourceText, str):
            return False

        _len = len(self.sourceText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        if not isinstance(self.targetText, str):
            return False

        _len = len(self.targetText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        return super(TextPair, self).is_valid()
class TextPairWithContext(TextPair):
    """
    Models a pair of two text segments and corresponding context.
    """
    documentID = models.CharField(
        max_length=MAX_DOCUMENTID_LENGTH,
        verbose_name=_('Document ID'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_DOCUMENTID_LENGTH))
    )

    isCompleteDocument = models.BooleanField(
        blank=True,
        db_index=True,
        default=False,
        verbose_name=_('Complete document?')
    )

    sourceContextLeft = models.TextField(
        blank=True,
        null=True,
        verbose_name=_('Source context (left)')
    )

    sourceContextRight = models.TextField(
        blank=True,
        null=True,
        verbose_name=_('Source context (right)')
    )

    targetContextLeft = models.TextField(
        blank=True,
        null=True,
        verbose_name=_('Target context (left)')
    )

    targetContextRight = models.TextField(
        blank=True,
        null=True,
        verbose_name=_('Target context (right)')
    )

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextPairWithContext instance, checking text.
        """
        return super(TextPairWithContext, self).is_valid()
class Metadata(BaseMetadata):
    """
    Models metadata associated to tasks.
    """
    market = models.ForeignKey(
        Market,
        db_index=True,
        on_delete=models.PROTECT
    )

    corpusName = models.CharField(
        max_length=MAX_CORPUSNAME_LENGTH,
        verbose_name=_('Corpus name'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_CORPUSNAME_LENGTH))
    )

    versionInfo = models.CharField(
        max_length=MAX_VERSIONINFO_LENGTH,
        verbose_name=_('Version info'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_VERSIONINFO_LENGTH))
    )

    source = models.CharField(
        max_length=MAX_SOURCE_LENGTH,
        verbose_name=_('Source'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SOURCE_LENGTH))
    )

    class Meta:
        ordering = ['_str_name']
        verbose_name = 'Metadata record'

    def _generate_str_name(self):
        return '{0}->{1}/{2}["{3}"]'.format(
            self.market.sourceLanguageCode,
            self.market.targetLanguageCode,
            self.corpusName,
            self.versionInfo
        )
class TextSegmentWithTwoTargets(TextSegment):
    """
    Models a text segment with one or two sub-segments.
    """
    target1ID = models.CharField(
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Item ID (1)'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH))
    )

    target1Text = models.TextField(
        blank=True,
        verbose_name=_('Text (1)'),
    )

    target2ID = models.CharField(
        null=True,
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Item ID (2)'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH))
    )

    target2Text = models.TextField(
        blank=True,
        null=True,
        verbose_name=_('Text (2)'),
    )

    contextLeft = models.TextField(
        blank=True,
        null=True,
        verbose_name=_('Context (left)')
    )

    contextRight = models.TextField(
        blank=True,
        null=True,
        verbose_name=_('Context (right)')
    )

    def has_context(self):
        """Checks if the current segment has context provided."""
        return self.contextLeft or self.contextRight

    def context_left(self, last=5, separator=' '):
        """
        Returns the last `last` sentences (5 by default) of the left
        context, joined by `separator`. Use separator='<br>' to show one
        sentence per line.
        """
        return (separator.join(self.contextLeft.split('\n')[-last:])
                if self.contextLeft else '')

    def context_right(self, first=5, separator=' '):
        """
        Returns the first `first` sentences (5 by default) of the right
        context, joined by `separator`. Use separator='<br>' to show one
        sentence per line.
        """
        return (separator.join(self.contextRight.split('\n')[:first])
                if self.contextRight else '')

    def target_texts_with_diffs(self):
        """
        Returns the pair of texts with HTML tags highlighting token
        differences. Both texts must be non-empty.

        For example, 'a b c d e' and 'a B c e f' will become:
            'a <span class="diff diff-sub">b</span> c <span class="diff diff-del">d</span> e',
            'a <span class="diff diff-sub">B</span> c e <span class="diff diff-ins">f</span>'
        """
        if not self.target1Text or not self.target2Text:
            return (self.target1Text, self.target2Text)

        toks1 = self.target1Text.split()
        toks2 = self.target2Text.split()
        matcher = SequenceMatcher(None, toks1, toks2)

        text1 = ''
        text2 = ''
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == 'equal':
                text1 += ' ' + ' '.join(toks1[i1:i2])
                text2 += ' ' + ' '.join(toks2[j1:j2])
            elif tag == 'replace':
                text1 += ' <span class="diff diff-sub">' \
                    + ' '.join(toks1[i1:i2]) + '</span>'
                text2 += ' <span class="diff diff-sub">' \
                    + ' '.join(toks2[j1:j2]) + '</span>'
            elif tag == 'insert':
                text2 += ' <span class="diff diff-ins">' \
                    + ' '.join(toks2[j1:j2]) + '</span>'
            elif tag == 'delete':
                text1 += ' <span class="diff diff-del">' \
                    + ' '.join(toks1[i1:i2]) + '</span>'

        return (text1.strip(), text2.strip())

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextSegmentWithTwoTargets instance,
        checking text.
        """
        if not isinstance(self.target1Text, str):
            return False

        _len = len(self.target1Text)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        if self.target2Text:
            if not isinstance(self.target2Text, str):
                return False

            _len = len(self.target2Text)
            if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
                return False

            # Texts must be different.
            if self.target1Text == self.target2Text:
                return False

        return super(TextSegmentWithTwoTargets, self).is_valid()
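# A self-contained sketch of the token-diff technique used by
# target_texts_with_diffs() above, runnable without Django. It shows the
# opcodes ('equal'/'replace'/'delete'/'insert') that difflib's
# SequenceMatcher emits for the docstring example, which map onto the
# diff-sub/diff-del/diff-ins spans. The function name is illustrative.
def _demo_token_diff():
    from difflib import SequenceMatcher

    toks1 = 'a b c d e'.split()
    toks2 = 'a B c e f'.split()
    matcher = SequenceMatcher(None, toks1, toks2)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        # Prints equal/replace/equal/delete/equal/insert for this pair.
        print(tag, toks1[i1:i2], toks2[j1:j2])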
class PairwiseAssessmentTask(BaseMetadata):
    """
    Models a pairwise assessment evaluation task.
    """
    campaign = models.ForeignKey(
        'Campaign.Campaign',
        db_index=True,
        on_delete=models.PROTECT,
        related_name='%(app_label)s_%(class)s_campaign',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Campaign')
    )

    items = models.ManyToManyField(
        TextSegmentWithTwoTargets,
        related_name='%(app_label)s_%(class)s_items',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Items')
    )

    requiredAnnotations = models.PositiveSmallIntegerField(
        verbose_name=_('Required annotations'),
        help_text=_(f('(value in range=[1,{value}])',
                      value=MAX_REQUIREDANNOTATIONS_VALUE))
    )

    assignedTo = models.ManyToManyField(
        User,
        blank=True,
        db_index=True,
        related_name='%(app_label)s_%(class)s_assignedTo',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Assigned to'),
        help_text=_('(users working on this task)')
    )

    batchNo = models.PositiveIntegerField(
        verbose_name=_('Batch number'),
        help_text=_('(1-based)')
    )

    batchData = models.ForeignKey(
        'Campaign.CampaignData',
        on_delete=models.PROTECT,
        blank=True,
        db_index=True,
        null=True,
        related_name='%(app_label)s_%(class)s_batchData',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Batch data')
    )

    def dataName(self):
        return str(self.batchData)

    def marketName(self):
        return str(self.items.first().metadata.market)

    def marketSourceLanguage(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES.keys():
            return LANGUAGE_CODES_AND_NAMES[tokens[0]]
        return None

    def marketSourceLanguageCode(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES.keys():
            return tokens[0]
        return None

    def marketTargetLanguage(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES.keys():
            return LANGUAGE_CODES_AND_NAMES[tokens[1]]
        return None

    def marketTargetLanguageCode(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES.keys():
            return tokens[1]
        return None

    def completed_items_for_user(self, user):
        results = PairwiseAssessmentResult.objects.filter(
            task=self,
            activated=False,
            completed=True,
            createdBy=user
        ).values_list('item_id', flat=True)

        return len(set(results))

    def is_trusted_user(self, user):
        from Campaign.models import TrustedUser
        trusted_user = TrustedUser.objects.filter(
            user=user, campaign=self.campaign
        )
        return trusted_user.exists()

    def next_item_for_user(self, user, return_completed_items=False):
        trusted_user = self.is_trusted_user(user)

        next_item = None
        completed_items = 0
        for item in self.items.all().order_by('id'):
            result = PairwiseAssessmentResult.objects.filter(
                item=item,
                activated=False,
                completed=True,
                createdBy=user
            )

            if not result.exists():
                print('Identified next item: {0}/{1} for trusted={2}'.format(
                    item.id, item.itemType, trusted_user))
                if not trusted_user or item.itemType.startswith('TGT'):
                    next_item = item
                    print(' - got it')
                    break

            completed_items += 1

        if not next_item:
            LOGGER.info('No next item found for task {0}'.format(self.id))
            annotations = PairwiseAssessmentResult.objects.filter(
                task=self,
                activated=False,
                completed=True
            ).values_list('item_id', flat=True)

            uniqueAnnotations = len(set(annotations))

            required_user_results = 100
            if trusted_user:
                required_user_results = 70

            _total_required = self.requiredAnnotations * required_user_results
            LOGGER.info('Unique annotations={0}/{1}'.format(
                uniqueAnnotations, _total_required))
            if uniqueAnnotations >= _total_required:
                LOGGER.info('Completing task {0}'.format(self.id))
                self.complete()
                self.save()

                # Not sure why I would complete the batch here?
                # self.batchData.complete()
                # self.batchData.save()

        if return_completed_items:
            return (next_item, completed_items)

        return next_item

    @classmethod
    def get_task_for_user(cls, user):
        for active_task in cls.objects.filter(
                assignedTo=user,
                activated=True,
                completed=False).order_by('-id'):
            next_item = active_task.next_item_for_user(user)
            if next_item is not None:
                return active_task

        return None

    @classmethod
    def get_next_free_task_for_language(cls, code, campaign=None, user=None):
        print(' Looking for next free task for language: {0}'.format(code))
        print(' Campaign: {0}'.format(campaign))
        print(' User: {0}'.format(user))

        active_tasks = cls.objects.filter(
            activated=True,
            completed=False,
            items__metadata__market__targetLanguageCode=code
        )
        print(' Number of active tasks: ({0})'.format(len(active_tasks)))

        if campaign:
            active_tasks = active_tasks.filter(campaign=campaign)

        for active_task in active_tasks.order_by('id'):
            active_users = active_task.assignedTo.count()
            if active_users < active_task.requiredAnnotations:
                if user and user not in active_task.assignedTo.all():
                    return active_task

        print(' No next free task available')
        return None

        # NOTE: the code below is unreachable and kept for reference only.
        # It seems that assignedTo is converted to an integer count.
        active_tasks = active_tasks.order_by('id') \
            .values_list('id', 'requiredAnnotations', 'assignedTo')

        for active_task in active_tasks:
            print(active_task)
            active_users = active_task[2] or 0
            if active_users < active_task[1]:
                return cls.objects.get(pk=active_task[0])

        return None

        # TODO: this needs to be removed.
        for active_task in active_tasks:
            market = active_task.items.first().metadata.market
            if not market.targetLanguageCode == code:
                continue

            active_users = active_task.assignedTo.count()
            if active_users < active_task.requiredAnnotations:
                return active_task

        return None

    @classmethod
    def get_next_free_task_for_language_and_campaign(cls, code, campaign):
        return cls.get_next_free_task_for_language(code, campaign)

    @classmethod
    def import_from_json(cls, campaign, batch_user, batch_data, max_count):
        """
        Creates new PairwiseAssessmentTask instances based on JSON input.
        """
        batch_meta = batch_data.metadata
        batch_name = batch_data.dataFile.name
        batch_file = batch_data.dataFile
        batch_json = None

        if batch_name.endswith('.zip'):
            if not is_zipfile(batch_file):
                _msg = 'Batch {0} not a valid ZIP archive'.format(batch_name)
                LOGGER.warning(_msg)
                return

            batch_zip = ZipFile(batch_file)
            batch_json_files = [
                x for x in batch_zip.namelist() if x.endswith('.json')
            ]
            # TODO: implement proper support for multiple json files in
            #   archive. Currently, only the last one read is used.
            for batch_json_file in batch_json_files:
                batch_content = batch_zip.read(batch_json_file).decode('utf-8')
                batch_json = loads(batch_content)

        else:
            batch_json = loads(str(batch_file.read(), encoding='utf-8'))

        from datetime import datetime
        t1 = datetime.now()

        current_count = 0
        max_length_id = 0
        max_length_text = 0
        for batch_task in batch_json:
            if max_count > 0 and current_count >= max_count:
                _msg = 'Stopping after max_count={0} iterations'.format(
                    max_count)
                LOGGER.info(_msg)

                t2 = datetime.now()
                print(t2 - t1)
                return

            print('Loading batch:', batch_name, batch_task['task']['batchNo'])

            new_items = []
            count_items = 0
            for item in batch_task['items']:
                count_items += 1

                # TODO: check if target1 + target2 should be used here
                current_length_id = len(item['sourceID'])
                current_length_text = len(item['sourceText'])

                if current_length_id > max_length_id:
                    print(current_length_id, item['sourceID'])
                    max_length_id = current_length_id

                if current_length_text > max_length_text:
                    print(current_length_text,
                          item['sourceText'].encode('utf-8'))
                    max_length_text = current_length_text

                item_targets = item['targets']
                # TODO: check if 'targets' is empty or has more elements
                #   than 2
                item_tgt1_idx = item_targets[0]['targetID']
                item_tgt1_txt = item_targets[0]['targetText']
                item_tgt2_idx = None
                item_tgt2_txt = None
                if len(item_targets) > 1:
                    item_tgt2_idx = item_targets[1]['targetID']
                    item_tgt2_txt = item_targets[1]['targetText']

                context_left = item.get('contextLeft', None)
                context_right = item.get('contextRight', None)

                new_item = TextSegmentWithTwoTargets(
                    segmentID=item['sourceID'],
                    segmentText=item['sourceText'],
                    target1ID=item_tgt1_idx,
                    target1Text=item_tgt1_txt,
                    target2ID=item_tgt2_idx,
                    target2Text=item_tgt2_txt,
                    createdBy=batch_user,
                    itemID=item['itemID'],
                    itemType=item['itemType'],
                    contextLeft=context_left,
                    contextRight=context_right,
                )
                new_items.append(new_item)

            if len(new_items) != 100:
                _msg = 'Expected 100 items for task but found {0}'.format(
                    count_items)
                LOGGER.warning(_msg)
                continue

            current_count += 1

            # for new_item in new_items:
            #     new_item.metadata = batch_meta
            #     new_item.save()
            batch_meta.textsegment_set.add(*new_items, bulk=False)
            batch_meta.save()

            new_task = PairwiseAssessmentTask(
                campaign=campaign,
                requiredAnnotations=batch_task['task']['requiredAnnotations'],
                batchNo=batch_task['task']['batchNo'],
                batchData=batch_data,
                createdBy=batch_user,
            )
            new_task.save()

            # for new_item in new_items:
            #     new_task.items.add(new_item)
            new_task.items.add(*new_items)
            new_task.save()

            _msg = 'Success processing batch {0}, task {1}'.format(
                str(batch_data), batch_task['task']['batchNo'])
            LOGGER.info(_msg)

        _msg = 'Max length ID={0}, text={1}'.format(max_length_id,
                                                    max_length_text)
        LOGGER.info(_msg)

        t2 = datetime.now()
        print(t2 - t1)

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current task, checking that campaign and items exist.
        """
        if not hasattr(self, 'campaign') or not self.campaign.is_valid():
            return False

        if not hasattr(self, 'items'):
            return False

        for item in self.items.all():
            if not item.is_valid():
                return False

        return True

    def _generate_str_name(self):
        return '{0}.{1}[{2}]'.format(
            self.__class__.__name__, self.campaign, self.id
        )
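# A minimal sketch of the batch JSON layout that
# PairwiseAssessmentTask.import_from_json() above expects, inferred from
# the keys it accesses while parsing. All concrete values are
# illustrative placeholders.
_EXAMPLE_PAIRWISE_BATCH = [
    {
        'task': {'batchNo': 1, 'requiredAnnotations': 1},
        'items': [  # exactly 100 items are expected per task
            {
                'itemID': 1,
                'itemType': 'TGT',
                'sourceID': 'doc1:1',
                'sourceText': 'This is a test sentence.',
                'targets': [  # one or two targets
                    {'targetID': 'sysA', 'targetText': 'Translation A.'},
                    {'targetID': 'sysB', 'targetText': 'Translation B.'},
                ],
                'contextLeft': None,   # optional
                'contextRight': None,  # optional
            },
        ],
    },
]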
class TextPairWithDomain(TextPair):
    """
    Models a pair of two multi-line text segments with domain and URL.
    """
    SENTENCE_DELIMITER = '\n'

    documentDomain = models.CharField(
        max_length=MAX_SEGMENTID_LENGTH,
        verbose_name=_('Domain'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_SEGMENTID_LENGTH))
    )

    sourceURL = models.TextField(
        blank=True,
        verbose_name=_('Source URL'),
    )

    targetURL = models.TextField(
        blank=True,
        verbose_name=_('Target URL'),
    )

    def get_sentence_pairs(self):
        """
        Returns pairs of source and target sentences created from the
        source and target segments.
        """
        return zip(self.sourceText.split(self.SENTENCE_DELIMITER),
                   self.targetText.split(self.SENTENCE_DELIMITER))

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current TextPairWithDomain instance, checking text.
        """
        if not isinstance(self.sourceText, str):
            return False

        _len = len(self.sourceText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        if not isinstance(self.targetText, str):
            return False

        _len = len(self.targetText)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        # Check if multi-line segments are of the same length.
        _src_segs = self.sourceText.strip().split(self.SENTENCE_DELIMITER)
        _tgt_segs = self.targetText.strip().split(self.SENTENCE_DELIMITER)
        if len(_src_segs) != len(_tgt_segs):
            return False

        _len = len(self.sourceURL)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        _len = len(self.targetURL)
        if _len < 1 or _len > MAX_SEGMENTTEXT_LENGTH:
            return False

        return super(TextPairWithDomain, self).is_valid()
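# A small, self-contained illustration of get_sentence_pairs() above:
# source and target segments hold one sentence per line, and zip() pairs
# them up positionally. Runnable without Django; the sentences are
# placeholders.
def _demo_sentence_pairs():
    source = 'Erste Zeile.\nZweite Zeile.'
    target = 'First line.\nSecond line.'
    for src, tgt in zip(source.split('\n'), target.split('\n')):
        print(src, '=>', tgt)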
class DirectAssessmentDocumentTask(BaseMetadata):
    """
    Models a direct assessment document evaluation task.

    Note: this task is, similarly to other models, a shameless copy of
    DirectAssessmentContextTask, with one additional method for retrieving
    all items belonging to the same document in the task, called
    `next_document_for_user`, and a helper method
    `get_results_for_each_item`. The underlying model is the same as for
    DirectAssessmentContextTask.
    """
    campaign = models.ForeignKey(
        'Campaign.Campaign',
        db_index=True,
        on_delete=models.PROTECT,
        related_name='%(app_label)s_%(class)s_campaign',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Campaign')
    )

    items = models.ManyToManyField(
        TextPairWithContext,
        related_name='%(app_label)s_%(class)s_items',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Items')
    )

    requiredAnnotations = models.PositiveSmallIntegerField(
        verbose_name=_('Required annotations'),
        help_text=_(f('(value in range=[1,{value}])',
                      value=MAX_REQUIREDANNOTATIONS_VALUE))
    )

    assignedTo = models.ManyToManyField(
        User,
        blank=True,
        db_index=True,
        related_name='%(app_label)s_%(class)s_assignedTo',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Assigned to'),
        help_text=_('(users working on this task)')
    )

    batchNo = models.PositiveIntegerField(
        verbose_name=_('Batch number'),
        help_text=_('(1-based)')
    )

    batchData = models.ForeignKey(
        'Campaign.CampaignData',
        on_delete=models.PROTECT,
        blank=True,
        db_index=True,
        null=True,
        related_name='%(app_label)s_%(class)s_batchData',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Batch data')
    )

    def dataName(self):
        return str(self.batchData)

    def marketName(self):
        return str(self.items.first().metadata.market)

    def marketSourceLanguage(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES.keys():
            return LANGUAGE_CODES_AND_NAMES[tokens[0]]
        return None

    def marketSourceLanguageCode(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[0] in LANGUAGE_CODES_AND_NAMES.keys():
            return tokens[0]
        return None

    def marketTargetLanguage(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES.keys():
            return LANGUAGE_CODES_AND_NAMES[tokens[1]]
        return None

    def marketTargetLanguageCode(self):
        tokens = str(self.items.first().metadata.market).split('_')
        if len(tokens) == 3 and tokens[1] in LANGUAGE_CODES_AND_NAMES.keys():
            return tokens[1]
        return None

    def completed_items_for_user(self, user):
        results = DirectAssessmentDocumentResult.objects.filter(
            task=self,
            activated=False,
            completed=True,
            createdBy=user
        ).values_list('item_id', flat=True)

        return len(set(results))

    def is_trusted_user(self, user):
        from Campaign.models import TrustedUser
        trusted_user = TrustedUser.objects.filter(
            user=user, campaign=self.campaign
        )
        return trusted_user.exists()

    def next_item_for_user(self, user, return_completed_items=False):
        trusted_user = self.is_trusted_user(user)

        next_item = None
        completed_items = 0
        for item in self.items.all().order_by('id'):
            result = DirectAssessmentDocumentResult.objects.filter(
                item=item,
                activated=False,
                completed=True,
                createdBy=user
            )

            if not result.exists():
                print(
                    'Identified next item: {}/{} (itemID={}) for trusted={}'
                    .format(item.id, item.itemType, item.itemID, trusted_user)
                )
                if not trusted_user or item.itemType == 'TGT':
                    next_item = item
                    break

            completed_items += 1

        if not next_item:
            LOGGER.info('No next item found for task {0}'.format(self.id))
            annotations = DirectAssessmentDocumentResult.objects.filter(
                task=self,
                activated=False,
                completed=True
            ).values_list('item_id', flat=True)

            uniqueAnnotations = len(set(annotations))

            required_user_results = 100
            if trusted_user:
                required_user_results = 70

            _total_required = self.requiredAnnotations * required_user_results
            LOGGER.info('Unique annotations={0}/{1}'.format(
                uniqueAnnotations, _total_required))
            if uniqueAnnotations >= _total_required:
                LOGGER.info('Completing task {0}'.format(self.id))
                self.complete()
                self.save()

                # Not sure why I would complete the batch here?
                # self.batchData.complete()
                # self.batchData.save()

        if return_completed_items:
            return (next_item, completed_items)

        return next_item

    def next_document_for_user(self, user, return_statistics=True):
        """Returns the next item and all items from its document."""
        # Find the next unannotated item.
        (
            next_item,
            completed_items,
        ) = self.next_item_for_user(user, return_completed_items=True)

        if not next_item:
            if not return_statistics:
                return (next_item, [], [])
            return (next_item, completed_items, 0, 0, [], [], 0)

        # Retrieve all items from the document which next_item belongs to.
        _items = self.items.filter(
            documentID=next_item.documentID,
        ).order_by('id')

        block_items = []
        current_block = False
        for item in _items:
            block_items.append(item)
            if item.id == next_item.id:
                current_block = True
            if item.isCompleteDocument:
                if current_block:
                    break
                block_items.clear()

        # Get results for completed items in this block.
        block_results = self.get_results_for_each_item(block_items, user)

        if not return_statistics:
            return (next_item, block_items, block_results)

        # Collect statistics.
        completed_items_in_block = len(
            [res for res in block_results if res is not None])

        completed_blocks = DirectAssessmentDocumentResult.objects.filter(
            task=self,
            item__isCompleteDocument=True,
            completed=True,
            createdBy=user).count()

        total_blocks = self.items.filter(isCompleteDocument=True).count()

        print(
            'Completed {}/{} documents, {}/{} items in the current '
            'document, completed {} items in total'.format(
                completed_blocks, total_blocks, completed_items_in_block,
                len(block_items), completed_items)
        )

        return (
            next_item,  # the first unannotated item for the user
            completed_items,  # the number of completed items in the task
            completed_blocks,  # the number of completed documents in the task
            completed_items_in_block,  # completed items in the current document
            block_items,  # all items from the current document
            block_results,  # all score results from the current document
            total_blocks,  # the total number of documents in the task
        )

    def get_results_for_each_item(self, block_items, user):
        """Returns the latest result object for each item, or None."""
        # TODO: optimize, this possibly makes too many individual queries
        block_results = []
        for item in block_items:
            result = DirectAssessmentDocumentResult.objects.filter(
                item__id=item.id,
                completed=True,
                createdBy=user,  # TODO: is passing user as an argument needed?
                task=self).order_by('item__id', 'dateModified').first()
            block_results.append(result)

        # Sanity checks for items and results.
        if len(block_items) != len(block_results):
            print('Warning: incorrect number of retrieved results!')
        for item, result in zip(block_items, block_results):
            # print(f'  >> item={item} result={result}')
            if result and item.id != result.item.id:
                print('Warning: incorrect order of items and results!')

        return block_results

    @classmethod
    def get_task_for_user(cls, user):
        for active_task in cls.objects.filter(
                assignedTo=user,
                activated=True,
                completed=False).order_by('-id'):
            next_item = active_task.next_item_for_user(user)
            if next_item is not None:
                return active_task

        return None

    @classmethod
    def get_next_free_task_for_language(cls, code, campaign=None, user=None):
        active_tasks = cls.objects.filter(
            activated=True,
            completed=False,
            items__metadata__market__targetLanguageCode=code
        )

        if campaign:
            active_tasks = active_tasks.filter(campaign=campaign)

        for active_task in active_tasks.order_by('id'):
            active_users = active_task.assignedTo.count()
            if active_users < active_task.requiredAnnotations:
                if user and user not in active_task.assignedTo.all():
                    return active_task

        return None

        # NOTE: the code below is unreachable and kept for reference only.
        # It seems that assignedTo is converted to an integer count.
        active_tasks = active_tasks.order_by('id') \
            .values_list('id', 'requiredAnnotations', 'assignedTo')

        for active_task in active_tasks:
            print(active_task)
            active_users = active_task[2] or 0
            if active_users < active_task[1]:
                return cls.objects.get(pk=active_task[0])

        return None

        # TODO: this needs to be removed.
        for active_task in active_tasks:
            market = active_task.items.first().metadata.market
            if not market.targetLanguageCode == code:
                continue

            active_users = active_task.assignedTo.count()
            if active_users < active_task.requiredAnnotations:
                return active_task

        return None

    @classmethod
    def get_next_free_task_for_language_and_campaign(cls, code, campaign):
        return cls.get_next_free_task_for_language(code, campaign)

    @classmethod
    def import_from_json(cls, campaign, batch_user, batch_data, max_count):
        """
        Creates new DirectAssessmentDocumentTask instances based on JSON
        input.
        """
        batch_meta = batch_data.metadata
        batch_name = batch_data.dataFile.name
        batch_file = batch_data.dataFile
        batch_json = None

        if batch_name.endswith('.zip'):
            if not is_zipfile(batch_file):
                _msg = 'Batch {0} not a valid ZIP archive'.format(batch_name)
                LOGGER.warning(_msg)
                return

            batch_zip = ZipFile(batch_file)
            batch_json_files = [
                x for x in batch_zip.namelist() if x.endswith('.json')
            ]
            # TODO: implement proper support for multiple json files in
            #   archive. Currently, only the last one read is used.
            for batch_json_file in batch_json_files:
                batch_content = batch_zip.read(batch_json_file).decode('utf-8')
                batch_json = loads(batch_content)

        else:
            batch_json = loads(str(batch_file.read(), encoding='utf-8'))

        from datetime import datetime
        t1 = datetime.now()

        current_count = 0
        max_length_id = 0
        max_length_text = 0
        for batch_task in batch_json:
            if max_count > 0 and current_count >= max_count:
                _msg = 'Stopping after max_count={0} iterations'.format(
                    max_count)
                LOGGER.info(_msg)

                t2 = datetime.now()
                print(t2 - t1)
                return

            print(batch_name, batch_task['task']['batchNo'])

            doc_items = 0
            new_items = []
            for item in batch_task['items']:
                current_length_id = len(item['targetID'])
                current_length_text = len(item['targetText'])

                if current_length_id > max_length_id:
                    print(current_length_id, item['targetID'])
                    max_length_id = current_length_id

                if current_length_text > max_length_text:
                    print(current_length_text,
                          item['targetText'].encode('utf-8'))
                    max_length_text = current_length_text

                new_item = TextPairWithContext(
                    sourceID=item['sourceID'],
                    sourceText=item['sourceText'],
                    sourceContextLeft=item.get('sourceContextLeft', None),
                    sourceContextRight=item.get('sourceContextRight', None),
                    targetID=item['targetID'],
                    targetText=item['targetText'],
                    targetContextLeft=item.get('targetContextLeft', None),
                    targetContextRight=item.get('targetContextRight', None),
                    createdBy=batch_user,
                    itemID=item['itemID'],
                    itemType=item['itemType'],
                    documentID=item['documentID'],
                    isCompleteDocument=item['isCompleteDocument'],
                )
                new_items.append(new_item)
                if item['isCompleteDocument']:
                    doc_items += 1

            if (len(new_items) - doc_items) != 100:
                _msg = 'Expected 100 items for task but found {0}'.format(
                    len(new_items) - doc_items)
                LOGGER.warning(_msg)
                continue

            current_count += 1

            for new_item in new_items:
                new_item.metadata = batch_meta
                new_item.save()
            # batch_meta.textpairwithcontext_set.add(*new_items, bulk=False)
            # batch_meta.save()

            new_task = DirectAssessmentDocumentTask(
                campaign=campaign,
                requiredAnnotations=batch_task['task']['requiredAnnotations'],
                batchNo=batch_task['task']['batchNo'],
                batchData=batch_data,
                createdBy=batch_user,
            )
            new_task.save()

            # for new_item in new_items:
            #     new_task.items.add(new_item)
            new_task.items.add(*new_items)
            new_task.save()

            _msg = 'Success processing batch {0}, task {1}'.format(
                str(batch_data), batch_task['task']['batchNo'])
            LOGGER.info(_msg)

        _msg = 'Max length ID={0}, text={1}'.format(max_length_id,
                                                    max_length_text)
        LOGGER.info(_msg)

        t2 = datetime.now()
        print(t2 - t1)

    # pylint: disable=E1101
    def is_valid(self):
        """
        Validates the current DA task, checking that campaign and items
        exist.
        """
        if not hasattr(self, 'campaign') or not self.campaign.is_valid():
            return False

        if not hasattr(self, 'items'):
            return False

        for item in self.items.all():
            if not item.is_valid():
                return False

        return True

    def _generate_str_name(self):
        return '{0}.{1}[{2}]'.format(
            self.__class__.__name__, self.campaign, self.id
        )
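# A self-contained sketch of the document-grouping step inside
# next_document_for_user() above: items are scanned in order, collected
# into a block, and the block is reset at each end-of-document marker
# (isCompleteDocument=True); only the block containing the next
# unannotated item is kept. The function name is illustrative; items are
# any objects with .id and .isCompleteDocument attributes.
def _demo_document_block(items, next_item_id):
    block_items = []
    current_block = False
    for item in items:
        block_items.append(item)
        if item.id == next_item_id:
            current_block = True
        if item.isCompleteDocument:
            if current_block:
                return block_items  # the document containing next_item
            block_items = []
    return block_items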
class Campaign(BaseMetadata):
    """
    Models an evaluation campaign.
    """
    campaignName = models.CharField(
        max_length=MAX_CAMPAIGNNAME_LENGTH,
        verbose_name=_('Campaign name'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_CAMPAIGNNAME_LENGTH)),
    )

    teams = models.ManyToManyField(
        CampaignTeam,
        blank=True,
        related_name='%(app_label)s_%(class)s_teams',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Teams'),
    )

    batches = models.ManyToManyField(
        CampaignData,
        blank=True,
        related_name='%(app_label)s_%(class)s_batches',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Batches'),
    )

    packageFile = models.FileField(
        blank=True,
        null=True,
        verbose_name=_('Package file'),
        upload_to='Packages',
        validators=[_validate_package_file],
    )

    def _generate_str_name(self):
        return self.campaignName

    @classmethod
    def get_campaign_or_raise(cls, campaign_name):
        """
        Gets the campaign named campaign_name from the database.

        Returns a Campaign instance if it exists, otherwise raises
        LookupError.
        """
        _obj = Campaign.objects.filter(campaignName=campaign_name)
        if not _obj.exists():
            _msg = 'Failure to identify campaign {0}'.format(campaign_name)
            raise LookupError(_msg)

        return _obj.first()  # if multiple campaigns, return first

    def get_campaign_type(self) -> str:
        """
        Gets the campaign type based on the evaldata_{cls_name}_campaign
        QuerySet. For now, we assume that campaigns can only have a single
        type. We use the following check to identify the campaign's type:

            c.evaldata_directassessmentcontexttask_campaign.exists()

        Returns the name of a class which is a sub class of
        BaseAnnotationTask.
        """
        for cls_name in AnnotationTaskRegistry.get_types():
            qs_name = cls_name.lower()
            qs_attr = 'evaldata_{0}_campaign'.format(qs_name)
            qs_obj = getattr(self, qs_attr, None)
            if qs_obj and qs_obj.exists():
                return cls_name

        _msg = 'Unknown type for campaign {0}'.format(self.campaignName)
        raise LookupError(_msg)  # This should never happen, thus raise!
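# A brief usage sketch for the two lookup helpers above; the campaign
# name is a placeholder and requires a configured Django environment.
def _demo_campaign_lookup():
    try:
        campaign = Campaign.get_campaign_or_raise('example-campaign')
        print(campaign.get_campaign_type())  # e.g. 'DirectAssessmentTask'
    except LookupError as error:
        print(error)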
class CampaignTeam(BaseMetadata):
    """
    Models a campaign team.
    """
    teamName = models.CharField(
        max_length=MAX_TEAMNAME_LENGTH,
        verbose_name=_('Team name'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_TEAMNAME_LENGTH)),
    )

    owner = models.ForeignKey(
        User,
        limit_choices_to={'is_staff': True},
        on_delete=models.PROTECT,
        related_name='%(app_label)s_%(class)s_owner',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Team owner'),
        help_text=_('(must be staff member)'),
    )

    members = models.ManyToManyField(
        User,
        related_name='%(app_label)s_%(class)s_members',
        related_query_name="%(app_label)s_%(class)ss",
        verbose_name=_('Team members'),
    )

    requiredAnnotations = models.PositiveSmallIntegerField(
        verbose_name=_('Required annotations'),
        help_text=_(f('(value in range=[1,{value}])',
                      value=MAX_SMALLINTEGER_VALUE)),
    )

    requiredHours = models.PositiveSmallIntegerField(
        verbose_name=_('Required hours'),
        help_text=_(f('(value in range=[1,{value}])',
                      value=MAX_SMALLINTEGER_VALUE)),
    )

    # pylint: disable=C0111,R0903
    class Meta:
        ordering = ['_str_name']
        verbose_name = 'Team'
        verbose_name_plural = 'Teams'

    def _generate_str_name(self):
        return '{0} ({1})'.format(self.teamName, self.owner)

    def is_valid(self):
        """
        Validates the current CampaignTeam instance.
        """
        try:
            self.full_clean()
            return True

        except ValidationError:
            return False

    # pylint: disable=C0103,E1101
    def teamMembers(self):
        """
        Proxy method returning the number of team members.
        """
        return self.members.count()

    teamMembers.short_description = '# of team members'

    # TODO: Connect to actual data, producing correct completion status.
    # pylint: disable=no-self-use
    def completionStatus(self):
        """
        Proxy method returning completion status in percent.

        This is defined to be the minimum of:
        - # of completed annotations / # required annotations; and
        - # of completed hours / # required hours.
        """
        return '0%'

    completionStatus.short_description = 'Completion status'
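# A hedged sketch of how completionStatus() above could compute the
# value its docstring describes (the minimum of the annotation and hour
# completion ratios). The completed_* counters are assumptions: the
# model does not track them yet, which is why the method is still a
# stub returning '0%'.
def _demo_completion_status(completed_annotations, required_annotations,
                            completed_hours, required_hours):
    # required_* fields are positive integers in range [1, ...], so the
    # divisions are safe; cap the result at 100%.
    ratio = min(completed_annotations / required_annotations,
                completed_hours / required_hours)
    return '{0:.0f}%'.format(100 * min(ratio, 1.0))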
class Market(BaseMetadata):
    """
    Models a language/locale market.
    """
    ###
    # Each market has a unique ID composed of source, target language codes
    # and application domain name. This also acts as primary lookup key.
    #
    # By assumption, source language content has been produced natively.
    # For monolingual content, source and target codes are identical.
    ###
    marketID = models.CharField(
        max_length=2 * MAX_LANGUAGECODE_LENGTH + MAX_DOMAINNAME_LENGTH + 2,
        editable=False,
        unique=True
    )

    sourceLanguageCode = models.CharField(
        max_length=MAX_LANGUAGECODE_LENGTH,
        verbose_name=_('Source language'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_LANGUAGECODE_LENGTH))
    )

    targetLanguageCode = models.CharField(
        max_length=MAX_LANGUAGECODE_LENGTH,
        verbose_name=_('Target language'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_LANGUAGECODE_LENGTH))
    )

    domainName = models.CharField(
        max_length=MAX_DOMAINNAME_LENGTH,
        verbose_name=_('Domain name'),
        help_text=_(f('(max. {value} characters)',
                      value=MAX_DOMAINNAME_LENGTH))
    )

    def clean_fields(self, exclude=None):
        """
        Verifies that the desired marketID is still available.
        """
        _new_marketID = '{0}_{1}_{2}'.format(
            self.sourceLanguageCode,
            self.targetLanguageCode,
            self.domainName
        )

        _market_instance = Market.objects.filter(marketID=_new_marketID)
        if _market_instance.exists():
            raise ValidationError(
                _(f('Market with identical marketID ("{mID}") already exists.',
                    mID=_new_marketID))
            )

        super(Market, self).clean_fields(exclude)

    def save(self, *args, **kwargs):
        _new_marketID = '{0}_{1}_{2}'.format(
            self.sourceLanguageCode,
            self.targetLanguageCode,
            self.domainName
        )
        self.marketID = _new_marketID

        super(Market, self).save(*args, **kwargs)

    # TODO: what is this used for? Candidate for deprecation/removal.
    #
    # pylint: disable=E1101
    def my_is_valid(self):
        """
        Validates the current Market instance, checking marketID uniqueness.
        """
        _expected_marketID = '{0}_{1}_{2}'.format(
            self.sourceLanguageCode,
            self.targetLanguageCode,
            self.domainName
        )

        _market_instance = Market.objects.filter(marketID=_expected_marketID)
        if not hasattr(self, 'marketID') or self.marketID == '':
            if _market_instance.exists():
                return False

        else:
            _market_instance_obj = _market_instance.get()
            if _market_instance_obj is not None \
                    and self.id != _market_instance_obj.id:
                return False

        return super(Market, self).is_valid()

    def _generate_str_name(self):
        return self.marketID
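# A tiny illustration of the marketID scheme built in Market.save()
# above: source language code, target language code, and domain name
# joined by underscores. The concrete codes are placeholders.
def _demo_market_id():
    market_id = '{0}_{1}_{2}'.format('eng', 'deu', 'news')
    print(market_id)  # -> 'eng_deu_news'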