Python extract_tagの例、utils.extract_tag Pythonの例

コード例 #1

0

ファイルを表示

ファイル: pfif_validator.py プロジェクト: google/personfinder

 def validate_field_order(self, records, field_type):
   """Validates that the subnodes of each record are in the correct order.

   For version 1.1, all fields must be in the order specified in the spec (the
   same as the FIELD_ORDER data structure) or omitted.  For version 1.2,
   person_record_id must appear first and notes must appear last in a person,
   and note_record_id and person_record_id must appear first in notes.
   Versions 1.3 and above do not constrain field order.

   Args:
     records: iterable of record elements whose children are checked.
     field_type: key into FIELD_ORDER for this version (either person or
       note).

   Returns:
     A list holding one message for each record that has an out-of-order
     field; empty when nothing is out of order.
   """
   messages = []
   # 1.3 and above don't have order
   if self.version < 1.3:
     field_order = PfifValidator.FIELD_ORDER[self.version][field_type]
     for record in records:
       # A field whose position is lower than the highest position seen so far
       # represents an invalid order.
       curr_max = 0
       # Iterate the element itself rather than calling getchildren(), which
       # was removed from xml.etree.ElementTree in Python 3.9.
       for field in record:
         tag = utils.extract_tag(field.tag)
         if tag not in field_order:
           continue
         field_order_num = field_order[tag]
         if field_order_num < curr_max:
           messages.append(self.make_message(
               'One of your fields was out of order.',
               record=record, element=field))
           # Only the first violation in a record is reported.
           break
         curr_max = field_order_num
   return messages

コード例 #2

0

ファイルを表示

 def validate_field_order(self, records, field_type):
     """Validates that the subnodes of each record are in the correct order.

     For version 1.1, all fields must be in the order specified in the spec
     (the same as the FIELD_ORDER data structure) or omitted.  For version
     1.2, person_record_id must appear first and notes must appear last in a
     person, and note_record_id and person_record_id must appear first in
     notes.  Versions 1.3 and above do not constrain field order.

     Args:
         records: iterable of record elements whose children are checked.
         field_type: key into FIELD_ORDER for this version (either person
             or note).

     Returns:
         A list holding one message for each record that has an
         out-of-order field; empty when nothing is out of order.
     """
     messages = []
     # 1.3 and above don't have order
     if self.version < 1.3:
         field_order = PfifValidator.FIELD_ORDER[self.version][field_type]
         for record in records:
             # A field whose position is lower than the highest position
             # seen so far represents an invalid order.
             curr_max = 0
             # Iterate the element itself rather than calling
             # getchildren(), which was removed from
             # xml.etree.ElementTree in Python 3.9.
             for field in record:
                 tag = utils.extract_tag(field.tag)
                 if tag not in field_order:
                     continue
                 field_order_num = field_order[tag]
                 if field_order_num < curr_max:
                     messages.append(
                         self.make_message(
                             'One of your fields was out of order.',
                             record=record,
                             element=field))
                     # Only the first violation in a record is reported.
                     break
                 curr_max = field_order_num
     return messages

コード例 #3

0

ファイルを表示

ファイル: pfif_validator.py プロジェクト: google/personfinder

 def validate_notes_belong_to_persons(self):
   """Validates that notes are correctly associated with persons.

   Every note at the top level must contain a person_record_id, and every
   note inside a person that has a person_record_id must either omit its own
   person_record_id or match the id of the parent person.

   Returns:
     A list of messages: one for each top level note that is missing a
     person_record_id and one for each nested note whose person_record_id
     does not match its parent person.
   """
   messages = []
   # Computed once; assumes add_namespace_to_tag only formats the tag name.
   person_id_tag = self.tree.add_namespace_to_tag('person_record_id')
   note_tag = self.tree.add_namespace_to_tag('note')
   for note in self.tree.get_top_level_notes():
     # Test identity with None: an element without children is falsy, so
     # truthiness (or ==) is not a reliable "was it found" check.
     if note.find(person_id_tag) is None:
       messages.append(self.make_message(
           'A top level note (a note not contained within a person) is '
           'missing a person_record_id.', record=note, element=note))
   for person in self.tree.get_all_persons():
     person_id = person.find(person_id_tag)
     if person_id is None:
       continue
     for note in person.findall(note_tag):
       note_person_id = note.find(person_id_tag)
       if note_person_id is None or note_person_id.text == person_id.text:
         continue
       # Built directly rather than through make_message because the message
       # combines ids taken from two different records (person and note).
       messages.append(utils.Message(
           'You have a note that has a person_record_id that does not '
           'match the person_record_id of the person that owns the note.',
           xml_line_number=self.tree.line_numbers[note_person_id],
           xml_tag=utils.extract_tag(note_person_id.tag),
           xml_text=note_person_id.text,
           person_record_id=self.tree.get_field_text(person,
                                                     'person_record_id'),
           note_record_id=self.tree.get_field_text(note,
                                                   'note_record_id')))
   return messages

コード例 #4

0

ファイルを表示

 def make_message(self,
                  category,
                  record,
                  element=None,
                  xml_tag=None,
                  is_error=True):
     """Wrapper for initializing a Message from a record and an element.

     Extracts the person_record_id and note_record_id, if present, from the
     record, and the tag, text and line number from the element.

     Args:
         category: passed through as the Message's first argument.
         record: element whose person_record_id and note_record_id are
             looked up through the tree.
         element: optional element supplying the tag, text and line number.
         xml_tag: tag to report when no element is given; when an element
             is given its own tag is used instead.
         is_error: forwarded unchanged to the Message.

     Returns:
         The constructed utils.Message.
     """
     person_record_id = self.tree.get_field_text(record, 'person_record_id')
     note_record_id = self.tree.get_field_text(record, 'note_record_id')
     tag, text, line = xml_tag, None, None
     # Identity check: an element without children is falsy, so only
     # "is not None" reliably tells whether an element was supplied.
     if element is not None:
         tag = utils.extract_tag(element.tag)
         text = element.text
         line = self.tree.line_numbers[element]
     return utils.Message(category,
                          is_error=is_error,
                          xml_line_number=line,
                          xml_tag=tag,
                          xml_text=text,
                          person_record_id=person_record_id,
                          note_record_id=note_record_id)

コード例 #5

0

ファイルを表示

ファイル: create_json.py プロジェクト: sanskrit-today/kosha

def create_json(content):
    """Print the details and text of every verse found in content.

    Lines starting with ';' are tag lines: utils.extract_tag splits them into
    a (tag, value) pair that updates the page number ('p'), kanda ('k'),
    varga ('v') or subvarga ('vv') of the running VerseInfo.  Every other
    line is appended to the current verse followed by '<BR>'; a line that
    contains a verse number between '॥' marks completes the verse, whose
    verse details, page details and text are then printed.

    Args:
        content: the whole text to scan, as one newline-separated string.

    Returns:
        None.  All output goes to stdout.
    """
    verseDetails = VerseInfo()
    lines = content.split('\n')
    print(len(lines))
    verse = ''
    for lin in lines:
        if lin.startswith(';'):
            (tag, value) = utils.extract_tag(lin)
            if tag == 'p':
                verseDetails.update_pageNum(value)
            elif tag == 'k':
                verseDetails.update_kanda(value)
            elif tag == 'v':
                verseDetails.update_varga(value)
            elif tag == 'vv':
                verseDetails.update_subvarga(value)
            continue
        verse += lin + '<BR>'
        # Raw strings keep the regex escapes (\-, \/, \g<1>) from being read
        # as invalid string escapes, which newer Python versions warn about.
        if not re.search(r'॥ ([०१२३४५६७८९\-\/]+) ॥', lin):
            continue
        # Make sure a danda is always followed by a space.
        verse = re.sub(r'।([^ ])', r'। \g<1>', verse)
        verseDetails.update_verseNum(verse)
        verseNumDetails = verseDetails.give_verse_details()
        pageNumDetails = verseDetails.give_page_details()
        # Drop the trailing '<BR>' markers.  str.rstrip('<BR>') would instead
        # strip any trailing run of the characters '<', 'B', 'R' and '>',
        # which can eat into the verse text itself.
        while verse.endswith('<BR>'):
            verse = verse[:-len('<BR>')]
        print(verseNumDetails)
        print(pageNumDetails)
        print(verse)
        verse = ''

コード例 #6

0

ファイルを表示

def objectify_parents(parents,
                      is_person,
                      object_map,
                      tree,
                      parent_person_record_id=None,
                      ignore_fields=None,
                      omit_blank_fields=False):
    """Adds the object representation of each parent in parents to object_map.

    Args:
        parents: iterable of record elements.  If is_person, all of them are
            assumed to be persons (else, notes).
        is_person: whether parents holds persons rather than notes.
        object_map: dict that is filled in place, keyed by
            record_id_to_key(record_id, is_person), with a field name to
            field text dict for each record.
        tree: a PfifXmlTree.
        parent_person_record_id: used for recursive calls when a person has
            a note as a child; the id of that person.
        ignore_fields: fields that will not be added to object_map.
        omit_blank_fields: if true, fields with no text are left out.

    Returns:
        None.  A record that is missing its record id is reported with
        print and skipped.
    """
    if ignore_fields is None:
        ignore_fields = []
    record_id_tag = 'person_record_id' if is_person else 'note_record_id'
    for parent in parents:
        record_id = tree.get_field_text(parent, record_id_tag)
        if record_id is None:
            # TODO(samking): better handling of this error?
            print('Invalid PFIF XML: a record is missing its ' + record_id_tag)
            continue
        record_map = object_map.setdefault(
            record_id_to_key(record_id, is_person), {})
        # If this note is a child of a person, it isn't required to have a
        # person_record_id, but it's easier to deal with notes that have
        # person_record_ids, so we force-add it.
        if (not is_person and parent_person_record_id is not None
                and 'person_record_id' not in ignore_fields):
            record_map['person_record_id'] = parent_person_record_id
        # Iterate the element itself: Element.getchildren() was removed from
        # xml.etree.ElementTree in Python 3.9.
        for child in parent:
            field_name = utils.extract_tag(child.tag)
            # Don't add any ignored fields.  Also, we'll deal with notes
            # together, so skip them.
            if field_name in ignore_fields:
                continue
            if is_person and field_name == 'note':
                continue
            # if there is no text in the node, use the empty string, not None
            field_value = child.text or ''
            # Add the record unless field_value is blank and omit_blank_fields
            if field_value or not omit_blank_fields:
                record_map[field_name] = field_value
        if is_person:
            sub_notes = parent.findall(tree.add_namespace_to_tag('note'))
            objectify_parents(sub_notes,
                              False,
                              object_map,
                              tree,
                              parent_person_record_id=record_id,
                              ignore_fields=ignore_fields,
                              omit_blank_fields=omit_blank_fields)

コード例 #7

0

ファイルを表示

ファイル: pfif_validator.py プロジェクト: google/personfinder

 def validate_root_has_mandatory_children(self):
   """Validates that the root has at least one mandatory child node.

   In 1.1, the root must have at least a person node.  In 1.2+, the root
   must have either a person or a note node.  Extraneous nodes are not
   reported here, but in a later test, so a root that has both a person and
   a note node in version 1.1 passes this check.

   Returns:
     An empty list if the root has the required node, otherwise a list
     holding a single message.
   """
   # Iterate the root element itself: Element.getchildren() was removed from
   # xml.etree.ElementTree in Python 3.9.
   for child in self.tree.getroot():
     tag = utils.extract_tag(child.tag)
     if tag == 'person' or (self.version >= 1.2 and tag == 'note'):
       return []
   return [utils.Message('Having a person tag (or a note tag in PFIF 1.2+) as '
                         'one of the children of the root node is mandatory.')]

コード例 #8

0

ファイルを表示

 def validate_root_has_mandatory_children(self):
     """Validates that the root has at least one mandatory child node.

     In 1.1, the root must have at least a person node.  In 1.2+, the root
     must have either a person or a note node.  Extraneous nodes are not
     reported here, but in a later test, so a root that has both a person
     and a note node in version 1.1 passes this check.

     Returns:
         An empty list if the root has the required node, otherwise a list
         holding a single message.
     """
     # Iterate the root element itself: Element.getchildren() was removed
     # from xml.etree.ElementTree in Python 3.9.
     for child in self.tree.getroot():
         tag = utils.extract_tag(child.tag)
         if tag == 'person' or (self.version >= 1.2 and tag == 'note'):
             return []
     return [
         utils.Message(
             'Having a person tag (or a note tag in PFIF 1.2+) as '
             'one of the children of the root node is mandatory.')
     ]

コード例 #9

0

ファイルを表示

ファイル: search_in_dict.py プロジェクト: sanskrit-today/kosha

def search_in_dict(query, code):
    """Return every verse of a dictionary text that contains query.

    The dictionary is read from ../<fullName>/slp/<bookName>.txt, where
    fullName comes from utils.code_to_dict(code) and has the form
    '<bookName>_<author>'.  Lines starting with ';' are tag lines that update
    the running page number, kanda, varga and subvarga; lines starting with
    '$' or '#' are skipped; every other line belongs to the current verse,
    which is complete once a line containing '..' is seen.

    Args:
        query: substring looked for in each verse line.
        code: dictionary code, e.g. ENSK.

    Returns:
        A list of dicts with the keys 'verse', 'page', 'versenum', 'kanda',
        'varga' and 'adhyaya', one per verse with a line containing query.
    """
    # ENSK -> ekaksharanamamala_sadhukalashagani
    fullName = utils.code_to_dict(code)
    # ekaksharanamamala, sadhukalashagani (the author part is not needed)
    bookName, _author = fullName.split('_')
    # Read the .txt file
    filein = os.path.join('..', fullName, 'slp', bookName + '.txt')
    result = []
    verseDetails = utils.VerseInfo()
    verse = ''
    writeVerse = False
    # The context manager closes the file even if parsing raises; the
    # original never closed it.
    with codecs.open(filein, 'r', 'utf-8') as fin:
        for lin in fin:
            if lin.startswith(';'):
                (tag, value) = utils.extract_tag(lin)
                if tag == 'p':
                    verseDetails.update_pageNum(value)
                elif tag == 'k':
                    verseDetails.update_kanda(value)
                elif tag == 'v':
                    verseDetails.update_varga(value)
                elif tag == 'vv':
                    verseDetails.update_subvarga(value)
                continue
            if lin.startswith(('$', '#')):
                continue
            verse += lin
            if query in lin:
                writeVerse = True
            if '..' not in lin:
                continue
            # The verse is complete: record it if any of its lines matched.
            verseDetails.update_verseNum(verse)
            if writeVerse:
                result.append({
                    'verse': verse,
                    'page': verseDetails.give_page_details(),
                    'versenum': verseDetails.verseNum,
                    'kanda': verseDetails.kanda,
                    'varga': verseDetails.varga,
                    'adhyaya': verseDetails.subvarga
                })
            writeVerse = False
            verse = ''
    return result

コード例 #10

0

ファイルを表示

ファイル: pfif_diff.py プロジェクト: google/personfinder

def objectify_parents(parents, is_person, object_map, tree,
                      parent_person_record_id=None, ignore_fields=None,
                      omit_blank_fields=False):
  """Adds the object representation of each parent in parents to object_map.

  Args:
    parents: iterable of record elements.  If is_person, all of them are
      assumed to be persons (else, notes).
    is_person: whether parents holds persons rather than notes.
    object_map: dict that is filled in place, keyed by
      record_id_to_key(record_id, is_person), with a field name to field text
      dict for each record.
    tree: a PfifXmlTree.
    parent_person_record_id: used for recursive calls when a person has a
      note as a child; the id of that person.
    ignore_fields: fields that will not be added to object_map.
    omit_blank_fields: if true, fields with no text are left out.

  Returns:
    None.  A record that is missing its record id is reported with print and
    skipped.
  """
  if ignore_fields is None:
    ignore_fields = []
  record_id_tag = 'person_record_id' if is_person else 'note_record_id'
  for parent in parents:
    record_id = tree.get_field_text(parent, record_id_tag)
    if record_id is None:
      # TODO(samking): better handling of this error?
      print('Invalid PFIF XML: a record is missing its ' + record_id_tag)
      continue
    record_map = object_map.setdefault(
        record_id_to_key(record_id, is_person), {})
    # If this note is a child of a person, it isn't required to have a
    # person_record_id, but it's easier to deal with notes that have
    # person_record_ids, so we force-add it.
    if (not is_person and parent_person_record_id is not None and
        'person_record_id' not in ignore_fields):
      record_map['person_record_id'] = parent_person_record_id
    # Iterate the element itself: Element.getchildren() was removed from
    # xml.etree.ElementTree in Python 3.9.
    for child in parent:
      field_name = utils.extract_tag(child.tag)
      # Don't add any ignored fields.  Also, we'll deal with notes together,
      # so skip them.
      if field_name in ignore_fields:
        continue
      if is_person and field_name == 'note':
        continue
      # if there is no text in the node, use the empty string, not None
      field_value = child.text or ''
      # Add the record unless field_value is blank and omit_blank_fields
      if field_value or not omit_blank_fields:
        record_map[field_name] = field_value
    if is_person:
      sub_notes = parent.findall(tree.add_namespace_to_tag('note'))
      objectify_parents(sub_notes, False, object_map, tree,
                        parent_person_record_id=record_id,
                        ignore_fields=ignore_fields,
                        omit_blank_fields=omit_blank_fields)

コード例 #11

0

ファイルを表示

ファイル: pfif_validator.py プロジェクト: google/personfinder

 def make_message(self, category, record, element=None,
                  xml_tag=None, is_error=True):
   """Wrapper for initializing a Message from a record and an element.

   Extracts the person_record_id and note_record_id, if present, from the
   record, and the tag, text and line number from the element.

   Args:
     category: passed through as the Message's first argument.
     record: element whose person_record_id and note_record_id are looked up
       through the tree.
     element: optional element supplying the tag, text and line number.
     xml_tag: tag to report when no element is given; when an element is
       given its own tag is used instead.
     is_error: forwarded unchanged to the Message.

   Returns:
     The constructed utils.Message.
   """
   person_record_id = self.tree.get_field_text(record, 'person_record_id')
   note_record_id = self.tree.get_field_text(record, 'note_record_id')
   tag, text, line = xml_tag, None, None
   # Identity check: an element without children is falsy, so only
   # "is not None" reliably tells whether an element was supplied.
   if element is not None:
     tag = utils.extract_tag(element.tag)
     text = element.text
     line = self.tree.line_numbers[element]
   return utils.Message(category, is_error=is_error, xml_line_number=line,
                        xml_tag=tag, xml_text=text,
                        person_record_id=person_record_id,
                        note_record_id=note_record_id)

コード例 #12

0

ファイルを表示

ファイル: pfif_validator.py プロジェクト: google/personfinder

 def validate_extraneous_children(self, parents, approved_tags):
   """For each parent in parents, ensures that every child's tag is in
   approved_tags and is not a duplicate (except for notes and persons).

   Args:
     parents: iterable of elements whose direct children are checked.
     approved_tags: collection of the tag names allowed as children.

   Returns:
     A list with one 'Duplicate Tag.' or 'Extraneous Tag.' message for each
     offending child.
   """
   messages = []
   for parent in parents:
     # Tags already accepted for this parent; a set keeps lookups cheap.
     used_tags = set()
     # Iterate the element itself: Element.getchildren() was removed from
     # xml.etree.ElementTree in Python 3.9.
     for child in parent:
       tag = utils.extract_tag(child.tag)
       if tag in used_tags and tag not in ('note', 'person'):
         messages.append(self.make_message('Duplicate Tag.', record=parent,
                                           element=child))
       elif tag not in approved_tags:
         messages.append(self.make_message('Extraneous Tag.', record=parent,
                                           element=child))
       else:
         used_tags.add(tag)
   return messages

コード例 #13

0

ファイルを表示

ファイル: pfif_validator.py プロジェクト: google/personfinder

 def validate_personal_data_removed(self, record):
   """Validates that an expired record holds no personal data.

   After expiration, a person can only contain placeholder data, which is the
   data in the PLACEHOLDER_FIELDS.  Text in any other field is personal data.
   A note with text is descended into and judged by its own children.

   Args:
     record: the element whose children are checked.

   Returns:
     A list with one message for each child that still has personal data.
   """
   messages = []
   # Iterate the element itself: Element.getchildren() was removed from
   # xml.etree.ElementTree in Python 3.9.
   for child in record:
     tag = utils.extract_tag(child.tag)
     if tag in PfifValidator.PLACEHOLDER_FIELDS or not child.text:
       continue
     # notes with text are okay as long as none of their children have text
     if tag == 'note':
       messages.extend(self.validate_personal_data_removed(child))
     else:
       messages.append(self.make_message(
           'An expired record still has personal data.',
           record=record, element=child))
   return messages

コード例 #14

0

ファイルを表示

 def validate_notes_belong_to_persons(self):
     """Validates that notes are correctly associated with persons.

     Every note at the top level must contain a person_record_id, and every
     note inside a person that has a person_record_id must either omit its
     own person_record_id or match the id of the parent person.

     Returns:
         A list of messages: one for each top level note that is missing a
         person_record_id and one for each nested note whose
         person_record_id does not match its parent person.
     """
     messages = []
     # Computed once; assumes add_namespace_to_tag only formats the tag name.
     person_id_tag = self.tree.add_namespace_to_tag('person_record_id')
     note_tag = self.tree.add_namespace_to_tag('note')
     for note in self.tree.get_top_level_notes():
         # Test identity with None: an element without children is falsy,
         # so truthiness (or ==) is not a reliable "was it found" check.
         if note.find(person_id_tag) is None:
             messages.append(
                 self.make_message(
                     'A top level note (a note not contained within a person) is '
                     'missing a person_record_id.',
                     record=note,
                     element=note))
     for person in self.tree.get_all_persons():
         person_id = person.find(person_id_tag)
         if person_id is None:
             continue
         for note in person.findall(note_tag):
             note_person_id = note.find(person_id_tag)
             if (note_person_id is None
                     or note_person_id.text == person_id.text):
                 continue
             # Built directly rather than through make_message because the
             # message combines ids taken from two different records.
             messages.append(
                 utils.Message(
                     'You have a note that has a person_record_id that does not '
                     'match the person_record_id of the person that owns the note.',
                     xml_line_number=self.tree.line_numbers[note_person_id],
                     xml_tag=utils.extract_tag(note_person_id.tag),
                     xml_text=note_person_id.text,
                     person_record_id=self.tree.get_field_text(
                         person, 'person_record_id'),
                     note_record_id=self.tree.get_field_text(
                         note, 'note_record_id')))
     return messages

コード例 #15

0

ファイルを表示

 def validate_personal_data_removed(self, record):
     """Validates that an expired record holds no personal data.

     After expiration, a person can only contain placeholder data, which is
     the data in the PLACEHOLDER_FIELDS.  Text in any other field is
     personal data.  A note with text is descended into and judged by its
     own children.

     Args:
         record: the element whose children are checked.

     Returns:
         A list with one message for each child that still has personal
         data.
     """
     messages = []
     # Iterate the element itself: Element.getchildren() was removed from
     # xml.etree.ElementTree in Python 3.9.
     for child in record:
         tag = utils.extract_tag(child.tag)
         if tag in PfifValidator.PLACEHOLDER_FIELDS or not child.text:
             continue
         # notes with text are okay as long as none of their children have
         # text
         if tag == 'note':
             messages.extend(self.validate_personal_data_removed(child))
         else:
             messages.append(
                 self.make_message(
                     'An expired record still has personal data.',
                     record=record,
                     element=child))
     return messages

コード例 #16

0

ファイルを表示

 def validate_extraneous_children(self, parents, approved_tags):
     """For each parent in parents, ensures that every child's tag is in
     approved_tags and is not a duplicate (except for notes and persons).

     Args:
         parents: iterable of elements whose direct children are checked.
         approved_tags: collection of the tag names allowed as children.

     Returns:
         A list with one 'Duplicate Tag.' or 'Extraneous Tag.' message for
         each offending child.
     """
     messages = []
     for parent in parents:
         # Tags already accepted for this parent; a set keeps lookups cheap.
         used_tags = set()
         # Iterate the element itself: Element.getchildren() was removed
         # from xml.etree.ElementTree in Python 3.9.
         for child in parent:
             tag = utils.extract_tag(child.tag)
             if tag in used_tags and tag not in ('note', 'person'):
                 messages.append(
                     self.make_message('Duplicate Tag.',
                                       record=parent,
                                       element=child))
             elif tag not in approved_tags:
                 messages.append(
                     self.make_message('Extraneous Tag.',
                                       record=parent,
                                       element=child))
             else:
                 used_tags.add(tag)
     return messages

コード例 #17

0

ファイルを表示

ファイル: test_utils.py プロジェクト: yashLadha/personfinder

 def test_tag(self):
     """A string that carries no namespace prefix should come back from
     extract_tag unchanged."""
     extracted = utils.extract_tag("foo")
     self.assertEqual(extracted, "foo")

コード例 #18

0

ファイルを表示

ファイル: test_utils.py プロジェクト: yashLadha/personfinder

 def test_tag_and_namespace(self):
     """When the string begins with a namespace, extract_tag should hand
     back only the local tag."""
     extracted = utils.extract_tag("{foo}bar")
     self.assertEqual(extracted, "bar")

コード例 #19

0

ファイルを表示

ファイル: test_utils.py プロジェクト: yashLadha/personfinder

 def test_blank_input(self):
     """Blank input to extract_tag should produce an empty string."""
     extracted = utils.extract_tag("")
     self.assertEqual(extracted, "")

コード例 #20

0

ファイルを表示

ファイル: test_utils.py プロジェクト: google/personfinder

 def test_tag(self):
   """A string that carries no namespace prefix should come back from
   extract_tag unchanged."""
   extracted = utils.extract_tag("foo")
   self.assertEqual(extracted, "foo")

コード例 #21

0

ファイルを表示

ファイル: test_utils.py プロジェクト: google/personfinder

 def test_tag_and_namespace(self):
   """When the string begins with a namespace, extract_tag should hand back
   only the local tag."""
   extracted = utils.extract_tag("{foo}bar")
   self.assertEqual(extracted, "bar")

コード例 #22

0

ファイルを表示

def homonymic_list_generator(content):
    """Prepare the list of entries for a homonymic dictionary text.

    The content is read line by line:
      * `$headword;gender` starts a new headword,
      * `#meaning1,meaning2,...` lists the meanings of the current headword,
      * `;end` marks the end of the text,
      * any other `;` line is a tag line (p, k, v, vv) that updates the page
        number, kanda, varga or subvarga,
      * unmarked lines are verse lines, which may span several lines.

    Pending (headword, meanings) pairs are handed to putVerse together with
    the verse that follows them; putVerse produces the entries of the result
    list (described by the original author as headword, meanings, verse,
    verseNumDetails and pageNumDetails tuples).

    Args:
        content: the whole text, as one newline-separated string.

    Returns:
        The result list as returned by the last putVerse call, or an empty
        list if putVerse was never called.
    """
    # Making it global so that it can be used in other functions too.
    global verseDetails
    # Initialize a VerseInfo class instance.
    verseDetails = utils.VerseInfo()
    # Result will store the entries built by putVerse.
    result = []
    # Initialize blank verse
    verse = ''
    # lineType list holds 'h', 'm', 'v' for headword, meaning and verse lines.
    lineType = []
    # A temporary placeholder which will be emptied into result list
    # whenever the verse is allocated to it.
    wordsOnHand = []
    for line in content.split('\n'):
        # If the line is headword line,
        if line.startswith('$'):
            # If the preceding line was a verse, and current a headword,
            # time to add to result list.  lineType is empty when the very
            # first line is a headword, so guard the [-1] lookup.
            if lineType and lineType[-1] == 'v':
                verseDetails.update_verseNum(verse)
                (verse, wordsOnHand,
                 result) = putVerse(verse, wordsOnHand, result)
            # Extract the headword and gender from headword line.
            # Typical headword line is `$headword;gender`
            headword, gender = line.rstrip().lstrip('$').split(';')
            # lineType is appended with 'h' for headword.
            lineType.append('h')
        # If the line is a meaning line,
        elif line.startswith('#'):
            # typical meaning line is `#meaning1,meaning2,meaning3,...`
            meanings = line.rstrip().lstrip('#').split(',')
            # Store the (headword, meaning) tuples in temporary wordsOnHand
            # list.  They will keep on waiting for the verse.  Once verse is
            # added, and a new headword starts, this will be added to result.
            wordsOnHand.append((headword, meanings))
            # lineType is marked 'm' for meaning.
            lineType.append('m')
        # The end marker must be tested before the generic ';' tag branch;
        # in the opposite order it could never be reached.
        elif line.startswith(';end'):
            # Put the last verse, as there will not be any next headword.
            # Mirrors the headword branch: refresh the verse number first and
            # keep what putVerse returns.
            if lineType and lineType[-1] == 'v':
                verseDetails.update_verseNum(verse)
            (verse, wordsOnHand,
             result) = putVerse(verse, wordsOnHand, result)
        # Other markers like ;p for page, ;k for kanda, ;v for varga etc.
        elif line.startswith(';'):
            (tag, value) = utils.extract_tag(line)
            if tag == 'p':
                verseDetails.update_pageNum(value)
            elif tag == 'k':
                verseDetails.update_kanda(value)
            elif tag == 'v':
                verseDetails.update_varga(value)
            elif tag == 'vv':
                verseDetails.update_subvarga(value)
        # Lines which are unmarked are verses.
        # The verses may span more than one line too. Therefore adding them up.
        else:
            verse += line + '<BR>'
            # Mark lineType 'v' for verse.
            lineType.append('v')
    return result

コード例 #23

0

ファイルを表示

ファイル: test_utils.py プロジェクト: google/personfinder

 def test_blank_input(self):
   """Blank input to extract_tag should produce an empty string."""
   extracted = utils.extract_tag("")
   self.assertEqual(extracted, "")