def extract_tags(text):
    """Extract tags from text using every group in ``pattern.tag_groups``.

    The text is passed through ``clean`` once; ``tagger.tag`` is then run
    on the cleaned text for each tag group in turn.

    :param text: Text to tag; handed to ``clean`` before tagging
    :return list: Tags from all groups in iteration order, each cast to a
        plain ``dict``

    """
    text_clean = clean(text)
    # Flatten in one pass: ``sum(lists, [])`` re-copies the accumulated list
    # for every group. ``values()`` is available on Python 2 and 3 alike,
    # whereas ``itervalues()`` exists only on Python 2.
    return [
        dict(tag)
        for tag_group in pattern.tag_groups.values()
        for tag in tagger.tag(tag_group, text_clean)
    ]
def check_taggers(taggers, input, expected):
    """Assert that tagging ``input`` with ``taggers`` yields ``expected``.

    The taggers are wrapped in a ``TagGroup`` (with an empty string as the
    second argument) and applied to ``input`` via ``tag``.

    :param taggers: Taggers handed to ``TagGroup``
    :param input: Text handed to ``tag``
    :param dict expected: Key/value pairs the extracted tag must contain,
        or ``None`` when no tag should be extracted
    :raises AssertionError: If the extracted tags do not match

    """
    tag_group = TagGroup(taggers, '')
    tags = tag(tag_group, input)

    # No tag expected: accept either ``None`` or an empty list
    if expected is None:
        assert (tags is None or tags == [])
        return

    assert tags is not None

    # A list result must hold exactly one tag, which is then unwrapped
    if isinstance(tags, list):
        assert len(tags) == 1
        tags = tags[0]

    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` is
    # Python-2-only
    for key, value in expected.items():
        assert tags[key] == value
def test():
    """Assert that tagging ``input`` with ``taggers`` yields ``output``.

    ``taggers``, ``input`` and ``output`` are not parameters; they are
    resolved from the surrounding scope (presumably an enclosing test
    factory -- confirm against the caller). ``output`` is either ``None``
    (no tag expected) or a mapping of key/value pairs the single extracted
    tag must contain.

    """
    tags = tag(taggers, input)

    # No tag expected: accept either ``None`` or an empty result
    if output is None:
        assert_true(tags is None or len(tags) == 0)
        return

    assert_true(tags is not None)

    # A list result must hold exactly one tag, which is then unwrapped
    if isinstance(tags, list):
        assert_equal(len(tags), 1)
        tags = tags[0]

    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` is
    # Python-2-only
    for key, value in output.items():
        assert_equal(tags[key], value)
def tag(self, tag_groups=None, overwrite=False, save=True):
    """Tag the article's verified documents and store the results.

    :param list tag_groups: TagGroup objects to apply; when empty or
        omitted, every group in ``pattern.tag_groups`` is used
    :param bool overwrite: Discard stored tags before tagging
    :param bool save: Call ``self.save()`` after updating
    :return list: Tags created by this call, plus stored tags that gained
        entries for a document type they did not cover before

    """
    # Tag fields that are stored per document type
    per_document_keys = ('context', 'group', 'span')

    tag_groups = tag_groups or pattern.tag_groups.values()

    if not overwrite:
        known_tags = [tagger.Tag(stored) for stored in self.tags]
    else:
        self.tags = []
        known_tags = []

    touched_tags = []

    # Run verification without saving; ``self.verified`` is read next
    self.verify(save=False)

    for doc_type in self.verified:

        source = getattr(self, DOCUMENT_TYPES_TO_FIELDS[doc_type])

        # Skip document types whose document is not set
        if source is None:
            continue

        raw_text = source.read()

        # Skip documents that read back empty
        if not raw_text:
            continue

        text = clean(raw_text)

        for group in tag_groups:
            for found in tagger.tag(group, text):

                # Key this match's per-document fields by document type
                keyed = {
                    key: {doc_type: found[key]}
                    for key in per_document_keys
                }

                if found in known_tags:
                    # Already known: only extend it if this document type
                    # has not contributed to the tag yet
                    match = known_tags[known_tags.index(found)]
                    if doc_type not in match['context']:
                        for key in per_document_keys:
                            match[key].update(keyed[key])
                        touched_tags.append(match)
                else:
                    # First sighting: store the keyed fields on the tag
                    for key in per_document_keys:
                        found[key] = keyed[key]
                    known_tags.append(found)
                    touched_tags.append(found)

    # Cast tags to dictionaries for ODM compatibility
    self.tags = [dict(known) for known in known_tags]

    # Record when tagging last ran
    self.date_last_tagged = datetime.datetime.utcnow()

    if save:
        self.save()

    return touched_tags
def tag(self, tag_groups=None, overwrite=False, save=True):
    """Tag the article's verified documents and store the results.

    :param list tag_groups: TagGroup objects to apply; when empty or
        omitted, every group in ``pattern.tag_groups`` is used
    :param bool overwrite: Discard stored tags before tagging
    :param bool save: Call ``self.save()`` after updating
    :return list: Tags created by this call, plus stored tags that gained
        entries for a document type they did not cover before

    """
    # Tag fields that are stored per document type
    per_document_keys = ('context', 'group', 'span')

    tag_groups = tag_groups or pattern.tag_groups.values()

    if not overwrite:
        known_tags = [tagger.Tag(stored) for stored in self.tags]
    else:
        self.tags = []
        known_tags = []

    touched_tags = []

    # Run verification without saving; ``self.verified`` is read next
    self.verify(save=False)

    for doc_type in self.verified:

        source = getattr(self, DOCUMENT_TYPES_TO_FIELDS[doc_type])

        # Skip document types whose document is not set
        if source is None:
            continue

        raw_text = source.read()

        # Skip documents that read back empty
        if not raw_text:
            continue

        text = clean(raw_text)

        for group in tag_groups:
            for found in tagger.tag(group, text):

                # Key this match's per-document fields by document type
                keyed = {
                    key: {doc_type: found[key]}
                    for key in per_document_keys
                }

                if found in known_tags:
                    # Already known: only extend it if this document type
                    # has not contributed to the tag yet
                    match = known_tags[known_tags.index(found)]
                    if doc_type not in match['context']:
                        for key in per_document_keys:
                            match[key].update(keyed[key])
                        touched_tags.append(match)
                else:
                    # First sighting: store the keyed fields on the tag
                    for key in per_document_keys:
                        found[key] = keyed[key]
                    known_tags.append(found)
                    touched_tags.append(found)

    # Cast tags to dictionaries for ODM compatibility
    self.tags = [dict(known) for known in known_tags]

    # Record when tagging last ran
    self.date_last_tagged = datetime.datetime.utcnow()

    if save:
        self.save()

    return touched_tags
def extract_tags(text):
    """Extract tags from text using every group in ``pattern.tag_groups``.

    The text is passed through ``clean`` once; ``tagger.tag`` is then run
    on the cleaned text for each tag group in turn.

    :param text: Text to tag; handed to ``clean`` before tagging
    :return list: Tags from all groups in iteration order, each cast to a
        plain ``dict``

    """
    text_clean = clean(text)
    # Flatten in one pass: ``sum(lists, [])`` re-copies the accumulated list
    # for every group. ``values()`` is available on Python 2 and 3 alike,
    # whereas ``itervalues()`` exists only on Python 2.
    return [
        dict(tag)
        for tag_group in pattern.tag_groups.values()
        for tag in tagger.tag(tag_group, text_clean)
    ]