def extract_tags(text):
    """Extract tags from text using every group in ``pattern.tag_groups``.

    The text is cleaned once with ``clean`` and the cleaned text is then
    passed to ``tagger.tag`` for each tag group in turn.

    :param text: Text to clean and tag
    :return list: Flat list of extracted tags, each cast to a plain ``dict``
    """
    text_clean = clean(text)
    # Flatten in a single pass: ``sum(lists, [])`` re-copies the accumulated
    # list for every group. ``values()`` is used instead of ``itervalues()``
    # because the latter does not exist on Python 3 dicts.
    return [
        dict(tag)
        for tag_group in pattern.tag_groups.values()
        for tag in tagger.tag(tag_group, text_clean)
    ]
def tag(self, tag_groups=None, overwrite=False, save=True):
    """Tag the verified documents of this record and store the results.

    :param list tag_groups: List of TagGroup objects; when empty or omitted,
        ``pattern.tag_groups.values()`` is used
    :param bool overwrite: Overwrite existing tags
    :param bool save: Save record after update
    :return list: New or modified extracted tags
    """
    if not tag_groups:
        tag_groups = pattern.tag_groups.values()

    if overwrite:
        self.tags = []
        known_tags = []
    else:
        known_tags = [tagger.Tag(stored) for stored in self.tags]

    changed_tags = []
    # Per-document details that are stored keyed by document type
    detail_fields = ('context', 'group', 'span')

    self.verify(save=False)

    for document_type in self.verified:
        document = getattr(self, DOCUMENT_TYPES_TO_FIELDS[document_type])

        # Skip document types that have no document set
        if document is None:
            continue

        raw_text = document.read()

        # Skip documents with no content
        if not raw_text:
            continue

        cleaned_text = clean(raw_text)

        for tag_group in tag_groups:
            for found in tagger.tag(tag_group, cleaned_text):
                # Wrap each detail as {document_type: value}
                details = [
                    (name, {document_type: found[name]})
                    for name in detail_fields
                ]

                if found not in known_tags:
                    # Unseen tag: store its details keyed by document type
                    for name, data in details:
                        found[name] = data
                    known_tags.append(found)
                    changed_tags.append(found)
                    continue

                # Known tag: merge details only if this document type has
                # not been recorded for it yet
                current = known_tags[known_tags.index(found)]
                if document_type not in current['context']:
                    for name, data in details:
                        current[name].update(data)
                    changed_tags.append(current)

    # Cast tags to dictionaries for ODM compatibility
    self.tags = [dict(entry) for entry in known_tags]

    # Update tagged date
    self.date_last_tagged = datetime.datetime.utcnow()

    if save:
        self.save()

    return changed_tags
def tag(self, tag_groups=None, overwrite=False, save=True):
    """Tag the verified documents of this record and store the results.

    :param list tag_groups: List of TagGroup objects; when empty or omitted,
        ``pattern.tag_groups.values()`` is used
    :param bool overwrite: Overwrite existing tags
    :param bool save: Save record after update
    :return list: New or modified extracted tags
    """
    if not tag_groups:
        tag_groups = pattern.tag_groups.values()

    if overwrite:
        self.tags = []
        known_tags = []
    else:
        known_tags = [tagger.Tag(stored) for stored in self.tags]

    changed_tags = []
    # Per-document details that are stored keyed by document type
    detail_fields = ('context', 'group', 'span')

    self.verify(save=False)

    for document_type in self.verified:
        document = getattr(self, DOCUMENT_TYPES_TO_FIELDS[document_type])

        # Skip document types that have no document set
        if document is None:
            continue

        raw_text = document.read()

        # Skip documents with no content
        if not raw_text:
            continue

        cleaned_text = clean(raw_text)

        for tag_group in tag_groups:
            for found in tagger.tag(tag_group, cleaned_text):
                # Wrap each detail as {document_type: value}
                details = [
                    (name, {document_type: found[name]})
                    for name in detail_fields
                ]

                if found not in known_tags:
                    # Unseen tag: store its details keyed by document type
                    for name, data in details:
                        found[name] = data
                    known_tags.append(found)
                    changed_tags.append(found)
                    continue

                # Known tag: merge details only if this document type has
                # not been recorded for it yet
                current = known_tags[known_tags.index(found)]
                if document_type not in current['context']:
                    for name, data in details:
                        current[name].update(data)
                    changed_tags.append(current)

    # Cast tags to dictionaries for ODM compatibility
    self.tags = [dict(entry) for entry in known_tags]

    # Update tagged date
    self.date_last_tagged = datetime.datetime.utcnow()

    if save:
        self.save()

    return changed_tags
def test_clean():
    """Check that ``misc.clean`` drops the U+2212 character from the text."""
    raw = u"bold\u2212 signal"
    expected = "bold signal"
    assert misc.clean(raw) == expected
def test_clean():
    """Check that ``misc.clean`` drops the U+2212 character from the text."""
    raw = u'bold\u2212 signal'
    expected = 'bold signal'
    assert misc.clean(raw) == expected
def extract_tags(text):
    """Extract tags from text using every group in ``pattern.tag_groups``.

    The text is cleaned once with ``clean`` and the cleaned text is then
    passed to ``tagger.tag`` for each tag group in turn.

    :param text: Text to clean and tag
    :return list: Flat list of extracted tags, each cast to a plain ``dict``
    """
    text_clean = clean(text)
    # Flatten in a single pass: ``sum(lists, [])`` re-copies the accumulated
    # list for every group. ``values()`` is used instead of ``itervalues()``
    # because the latter does not exist on Python 3 dicts.
    return [
        dict(tag)
        for tag_group in pattern.tag_groups.values()
        for tag in tagger.tag(tag_group, text_clean)
    ]