def validate_field_order(self, records, field_type):
    """Validates that the subnodes of every record are in the correct order.

    field_type (either person or note) selects the ordering table out of
    PfifValidator.FIELD_ORDER for the current version. For version 1.1, this
    means that all fields must be in the order specified in the spec (the
    same as the FIELD_ORDER data structure) or omitted. For version 1.2,
    person_record_id must appear first and notes must appear last in a
    person, and note_record_id and person_record_id must appear first in
    notes.

    Returns a list of messages. Checking of a record stops at its first
    out-of-order field, so each record contributes at most one message.
    """
    # 1.3 and above don't have order
    if self.version >= 1.3:
        return []

    field_order = PfifValidator.FIELD_ORDER[self.version][field_type]
    messages = []
    for record in records:
        # A field whose rank is lower than the highest rank seen so far in
        # this record represents an invalid order.
        curr_max = 0
        # Iterate the element itself: Element.getchildren() was removed from
        # xml.etree.ElementTree in Python 3.9.
        for field in record:
            tag = utils.extract_tag(field.tag)
            if tag not in field_order:
                # Tags without a rank do not take part in the ordering.
                continue
            field_order_num = field_order[tag]
            if field_order_num < curr_max:
                messages.append(self.make_message(
                    'One of your fields was out of order.',
                    record=record, element=field))
                break
            curr_max = field_order_num
    return messages
def validate_field_order(self, records, field_type):
    """Validates that the subnodes of every record are in the correct order.

    field_type (either person or note) selects the ordering table out of
    PfifValidator.FIELD_ORDER for the current version. For version 1.1, this
    means that all fields must be in the order specified in the spec (the
    same as the FIELD_ORDER data structure) or omitted. For version 1.2,
    person_record_id must appear first and notes must appear last in a
    person, and note_record_id and person_record_id must appear first in
    notes.

    Returns a list of messages. Checking of a record stops at its first
    out-of-order field, so each record contributes at most one message.
    """
    # 1.3 and above don't have order
    if self.version >= 1.3:
        return []

    field_order = PfifValidator.FIELD_ORDER[self.version][field_type]
    messages = []
    for record in records:
        # A field whose rank is lower than the highest rank seen so far in
        # this record represents an invalid order.
        curr_max = 0
        # Iterate the element itself: Element.getchildren() was removed from
        # xml.etree.ElementTree in Python 3.9.
        for field in record:
            tag = utils.extract_tag(field.tag)
            if tag not in field_order:
                # Tags without a rank do not take part in the ordering.
                continue
            field_order_num = field_order[tag]
            if field_order_num < curr_max:
                messages.append(
                    self.make_message(
                        'One of your fields was out of order.',
                        record=record,
                        element=field))
                break
            curr_max = field_order_num
    return messages
def validate_notes_belong_to_persons(self):
    """Validates that notes are tied to the person they belong to.

    Two checks are made:
      * every note at the top level contains a person_record_id;
      * every note inside a person that has a person_record_id either omits
        its own person_record_id or carries the same text as the parent's.

    Returns a list of messages, one per offending note.
    """
    messages = []

    for note in self.tree.get_top_level_notes():
        person_id = note.find(
            self.tree.add_namespace_to_tag('person_record_id'))
        # Identity check on purpose: a found element that has no children is
        # falsy, so only `is None` reliably means "not found".
        if person_id is None:
            messages.append(self.make_message(
                'A top level note (a note not contained within a person) is '
                'missing a person_record_id.', record=note, element=note))

    for person in self.tree.get_all_persons():
        person_id = person.find(
            self.tree.add_namespace_to_tag('person_record_id'))
        if person_id is None:
            # Nothing to compare the nested notes against.
            continue
        for note in person.findall(self.tree.add_namespace_to_tag('note')):
            note_person_id = note.find(
                self.tree.add_namespace_to_tag('person_record_id'))
            if note_person_id is None or note_person_id.text == person_id.text:
                continue
            messages.append(utils.Message(
                'You have a note that has a person_record_id that does not '
                'match the person_record_id of the person that owns the note.',
                xml_line_number=self.tree.line_numbers[note_person_id],
                xml_tag=utils.extract_tag(note_person_id.tag),
                xml_text=note_person_id.text,
                person_record_id=self.tree.get_field_text(
                    person, 'person_record_id'),
                note_record_id=self.tree.get_field_text(
                    note, 'note_record_id')))
    return messages
def make_message(self, category, record, element=None, xml_tag=None,
                 is_error=True):
    """Wrapper for initializing a Message from a record and an element.

    The person_record_id and note_record_id are read from record through
    self.tree.get_field_text. When element is given, the tag, text and line
    number are taken from it, and its tag replaces any xml_tag that was
    passed in. Without an element, xml_tag is used as the tag and the text
    and line number are None.

    Returns the utils.Message that was built.
    """
    person_record_id = self.tree.get_field_text(record, 'person_record_id')
    note_record_id = self.tree.get_field_text(record, 'note_record_id')

    tag = xml_tag
    text = None
    line = None
    # Identity check on purpose: an element without children is falsy, so a
    # truthiness test would wrongly skip it.
    if element is not None:
        tag = utils.extract_tag(element.tag)
        text = element.text
        line = self.tree.line_numbers[element]

    return utils.Message(category, is_error=is_error,
                         xml_line_number=line, xml_tag=tag, xml_text=text,
                         person_record_id=person_record_id,
                         note_record_id=note_record_id)
def create_json(content):
    """Walk the marked-up text in content and print each completed verse.

    Lines starting with ';' are marker lines: they are split with
    utils.extract_tag and the p / k / v / vv tags update the page number,
    kanda, varga and subvarga of a VerseInfo tracker. Every other line is
    appended to the current verse followed by '<BR>'. When a line carries a
    verse-number marker (Devanagari digits between double dandas) the verse
    is complete: its verse details, page details and text are printed and a
    new verse is started.

    Nothing is returned; all output goes to stdout.
    """
    # Raw strings keep the backslashes for the regex engine instead of
    # relying on Python passing unknown escapes through.
    verse_end = re.compile(r'॥ ([०१२३४५६७८९\-\/]+) ॥')
    danda_without_space = re.compile(r'।([^ ])')
    line_break = '<BR>'

    verseDetails = VerseInfo()
    lines = content.split('\n')
    print(len(lines))
    verse = ''
    for lin in lines:
        if lin.startswith(';'):
            (tag, value) = utils.extract_tag(lin)
            if tag == 'p':
                verseDetails.update_pageNum(value)
            if tag == 'k':
                verseDetails.update_kanda(value)
            if tag == 'v':
                verseDetails.update_varga(value)
            if tag == 'vv':
                verseDetails.update_subvarga(value)
            continue

        verse += lin + line_break
        if not verse_end.search(lin):
            # The verse continues on the next line.
            continue

        # Make sure a danda is always followed by a space.
        verse = danda_without_space.sub(r'। \g<1>', verse)
        verseDetails.update_verseNum(verse)
        verseNumDetails = verseDetails.give_verse_details()
        pageNumDetails = verseDetails.give_page_details()
        # Remove only the trailing line-break markers. str.rstrip('<BR>')
        # would strip any trailing '<', 'B', 'R' or '>' characters instead.
        while verse.endswith(line_break):
            verse = verse[:-len(line_break)]
        print(verseNumDetails)
        print(pageNumDetails)
        print(verse)
        verse = ''
def objectify_parents(parents, is_person, object_map, tree,
                      parent_person_record_id=None, ignore_fields=None,
                      omit_blank_fields=False):
    """Adds the object representation of each parent in parents to object_map.

    If is_person, all parents are assumed to be persons (else, notes). Tree
    is a PfifXmlTree. Specifying parent_person_record_id is used for
    recursive calls when a person has a note as a child. Any fields in
    ignore_fields will not be added to object_map. When omit_blank_fields is
    set, fields without text are left out instead of being stored as ''.

    object_map is modified in place; nothing is returned. A parent that has
    no record id is reported on stdout and skipped.
    """
    if ignore_fields is None:
        ignore_fields = []
    record_id_tag = 'person_record_id' if is_person else 'note_record_id'

    for parent in parents:
        record_id = tree.get_field_text(parent, record_id_tag)
        if record_id is None:
            # TODO(samking): better handling of this error?
            print('Invalid PFIF XML: a record is missing its ' + record_id_tag)
            continue

        record_map = object_map.setdefault(
            record_id_to_key(record_id, is_person), {})

        # If this note is a child of a person, it isn't required to have a
        # person_record_id, but it's easier to deal with notes that have
        # person_record_ids, so we force-add it.
        if (not is_person and parent_person_record_id is not None and
                'person_record_id' not in ignore_fields):
            record_map['person_record_id'] = parent_person_record_id

        # Iterate the element itself: Element.getchildren() was removed from
        # xml.etree.ElementTree in Python 3.9.
        for child in parent:
            field_name = utils.extract_tag(child.tag)
            # Don't add any ignored fields.
            if field_name in ignore_fields:
                continue
            # We'll deal with a person's notes together below, so skip them.
            if is_person and field_name == 'note':
                continue
            # If there is no text in the node, use the empty string, not None.
            field_value = child.text or ''
            # Add the field unless it is blank and omit_blank_fields is set.
            if field_value or not omit_blank_fields:
                record_map[field_name] = field_value

        if is_person:
            sub_notes = parent.findall(tree.add_namespace_to_tag('note'))
            objectify_parents(sub_notes, False, object_map, tree,
                              parent_person_record_id=record_id,
                              ignore_fields=ignore_fields,
                              omit_blank_fields=omit_blank_fields)
def validate_root_has_mandatory_children(self):
    """Validates that the root has one of the mandatory child nodes.

    In 1.1, the root must have at least a person node. In 1.2+, the root
    must either have a person or note node. Note that extraneous nodes will
    not be reported here, but in a later test, so if the root has a person
    and a note node in version 1.1, that still passes.

    Returns an empty list if the tree has the required node, otherwise a
    list holding a single Message.
    """
    # Iterate the root itself: Element.getchildren() was removed from
    # xml.etree.ElementTree in Python 3.9.
    for child in self.tree.getroot():
        tag = utils.extract_tag(child.tag)
        if tag == 'person' or (self.version >= 1.2 and tag == 'note'):
            return []
    return [utils.Message('Having a person tag (or a note tag in PFIF 1.2+) as '
                          'one of the children of the root node is mandatory.')]
def validate_root_has_mandatory_children(self):
    """Validates that the root has one of the mandatory child nodes.

    In 1.1, the root must have at least a person node. In 1.2+, the root
    must either have a person or note node. Note that extraneous nodes will
    not be reported here, but in a later test, so if the root has a person
    and a note node in version 1.1, that still passes.

    Returns an empty list if the tree has the required node, otherwise a
    list holding a single Message.
    """
    # Iterate the root itself: Element.getchildren() was removed from
    # xml.etree.ElementTree in Python 3.9.
    for child in self.tree.getroot():
        tag = utils.extract_tag(child.tag)
        if tag == 'person' or (self.version >= 1.2 and tag == 'note'):
            return []
    return [
        utils.Message(
            'Having a person tag (or a note tag in PFIF 1.2+) as '
            'one of the children of the root node is mandatory.')
    ]
def search_in_dict(query, code):
    """Return every verse of the dictionary `code` that contains `query`.

    code is resolved to a '<book>_<author>' name with utils.code_to_dict and
    the text is read from ../<book>_<author>/slp/<book>.txt. Lines starting
    with ';' are marker lines that update the page / kanda / varga / subvarga
    of a VerseInfo tracker, lines starting with '$' or '#' are skipped, and
    all remaining lines are accumulated into the current verse. A line
    containing '..' closes the verse.

    Returns a list of dicts with the keys 'verse', 'page', 'versenum',
    'kanda', 'varga' and 'adhyaya', one per verse in which query occurred on
    at least one line.
    """
    # ENSK -> ekaksharanamamala_sadhukalashagani
    fullName = utils.code_to_dict(code)
    # ekaksharanamamala, sadhukalashagani
    # (the unpacking also insists on exactly one '_' in the name)
    bookName, _author = fullName.split('_')
    filein = os.path.join('..', fullName, 'slp', bookName + '.txt')

    result = []
    verseDetails = utils.VerseInfo()
    verse = ''
    writeVerse = False
    # The context manager closes the file even if a line fails to parse.
    with codecs.open(filein, 'r', 'utf-8') as fin:
        for lin in fin:
            if lin.startswith(';'):
                (tag, value) = utils.extract_tag(lin)
                if tag == 'p':
                    verseDetails.update_pageNum(value)
                if tag == 'k':
                    verseDetails.update_kanda(value)
                if tag == 'v':
                    verseDetails.update_varga(value)
                if tag == 'vv':
                    verseDetails.update_subvarga(value)
                continue
            if lin.startswith(('$', '#')):
                continue

            verse += lin
            if query in lin:
                writeVerse = True
            if '..' not in lin:
                # The verse continues on the next line.
                continue

            verseDetails.update_verseNum(verse)
            if writeVerse:
                result.append({
                    'verse': verse,
                    'page': verseDetails.give_page_details(),
                    'versenum': verseDetails.verseNum,
                    'kanda': verseDetails.kanda,
                    'varga': verseDetails.varga,
                    'adhyaya': verseDetails.subvarga,
                })
                writeVerse = False
            verse = ''
    return result
def validate_extraneous_children(self, parents, approved_tags):
    """Checks the children of each parent for unapproved or repeated tags.

    For each parent in parents, ensures that every child's tag is in
    approved_tags and is not a duplicate (except for notes and persons,
    which may repeat).

    Returns a list of messages, one per duplicate or extraneous child.
    """
    messages = []
    for parent in parents:
        # Approved tags already seen under this parent.
        used_tags = set()
        # Iterate the element itself: Element.getchildren() was removed from
        # xml.etree.ElementTree in Python 3.9.
        for child in parent:
            tag = utils.extract_tag(child.tag)
            if tag in used_tags and tag not in ('note', 'person'):
                messages.append(self.make_message('Duplicate Tag.',
                                                  record=parent,
                                                  element=child))
            elif tag not in approved_tags:
                messages.append(self.make_message('Extraneous Tag.',
                                                  record=parent,
                                                  element=child))
            else:
                used_tags.add(tag)
    return messages
def validate_personal_data_removed(self, record):
    """Checks that an expired record holds nothing but placeholder data.

    After expiration, a person can only contain placeholder data, which is
    the set of fields in PLACEHOLDER_FIELDS. All other data is personal
    data.

    Returns a list with one message for every non-placeholder child of
    record that still has text. A note child with text is not reported
    itself; its own children are checked recursively instead.
    """
    messages = []
    # Iterate the element itself: Element.getchildren() was removed from
    # xml.etree.ElementTree in Python 3.9.
    for child in record:
        tag = utils.extract_tag(child.tag)
        if tag in PfifValidator.PLACEHOLDER_FIELDS or not child.text:
            continue
        # notes with text are okay as long as none of their children have text
        # NOTE(review): a note whose own text is empty is never descended
        # into, so its children go unchecked -- confirm this is intended.
        if tag == 'note':
            messages.extend(self.validate_personal_data_removed(child))
        else:
            messages.append(self.make_message(
                'An expired record still has personal data.',
                record=record, element=child))
    return messages
def validate_notes_belong_to_persons(self):
    """Validates that notes are tied to the person they belong to.

    Two checks are made:
      * every note at the top level contains a person_record_id;
      * every note inside a person that has a person_record_id either omits
        its own person_record_id or carries the same text as the parent's.

    Returns a list of messages, one per offending note.
    """
    messages = []

    for note in self.tree.get_top_level_notes():
        person_id = note.find(
            self.tree.add_namespace_to_tag('person_record_id'))
        # Identity check on purpose: a found element that has no children is
        # falsy, so only `is None` reliably means "not found".
        if person_id is None:
            messages.append(
                self.make_message(
                    'A top level note (a note not contained within a person) is '
                    'missing a person_record_id.',
                    record=note,
                    element=note))

    for person in self.tree.get_all_persons():
        person_id = person.find(
            self.tree.add_namespace_to_tag('person_record_id'))
        if person_id is None:
            # Nothing to compare the nested notes against.
            continue
        for note in person.findall(self.tree.add_namespace_to_tag('note')):
            note_person_id = note.find(
                self.tree.add_namespace_to_tag('person_record_id'))
            if note_person_id is None or note_person_id.text == person_id.text:
                continue
            messages.append(
                utils.Message(
                    'You have a note that has a person_record_id that does not '
                    'match the person_record_id of the person that owns the note.',
                    xml_line_number=self.tree.line_numbers[note_person_id],
                    xml_tag=utils.extract_tag(note_person_id.tag),
                    xml_text=note_person_id.text,
                    person_record_id=self.tree.get_field_text(
                        person, 'person_record_id'),
                    note_record_id=self.tree.get_field_text(
                        note, 'note_record_id')))
    return messages
def validate_personal_data_removed(self, record):
    """Checks that an expired record holds nothing but placeholder data.

    After expiration, a person can only contain placeholder data, which is
    the set of fields in PLACEHOLDER_FIELDS. All other data is personal
    data.

    Returns a list with one message for every non-placeholder child of
    record that still has text. A note child with text is not reported
    itself; its own children are checked recursively instead.
    """
    messages = []
    # Iterate the element itself: Element.getchildren() was removed from
    # xml.etree.ElementTree in Python 3.9.
    for child in record:
        tag = utils.extract_tag(child.tag)
        if tag in PfifValidator.PLACEHOLDER_FIELDS or not child.text:
            continue
        # notes with text are okay as long as none of their children have text
        # NOTE(review): a note whose own text is empty is never descended
        # into, so its children go unchecked -- confirm this is intended.
        if tag == 'note':
            messages.extend(
                self.validate_personal_data_removed(child))
        else:
            messages.append(
                self.make_message(
                    'An expired record still has personal data.',
                    record=record,
                    element=child))
    return messages
def validate_extraneous_children(self, parents, approved_tags):
    """Checks the children of each parent for unapproved or repeated tags.

    For each parent in parents, ensures that every child's tag is in
    approved_tags and is not a duplicate (except for notes and persons,
    which may repeat).

    Returns a list of messages, one per duplicate or extraneous child.
    """
    messages = []
    for parent in parents:
        # Approved tags already seen under this parent.
        used_tags = set()
        # Iterate the element itself: Element.getchildren() was removed from
        # xml.etree.ElementTree in Python 3.9.
        for child in parent:
            tag = utils.extract_tag(child.tag)
            if tag in used_tags and tag not in ('note', 'person'):
                messages.append(
                    self.make_message('Duplicate Tag.',
                                      record=parent,
                                      element=child))
            elif tag not in approved_tags:
                messages.append(
                    self.make_message('Extraneous Tag.',
                                      record=parent,
                                      element=child))
            else:
                used_tags.add(tag)
    return messages
def test_tag(self):
    """A string without a namespace prefix comes back from extract_tag
    unchanged."""
    extracted = utils.extract_tag("foo")
    self.assertEqual(extracted, "foo")
def test_tag_and_namespace(self):
    """For a string that starts with a namespace, extract_tag yields only
    the local tag."""
    extracted = utils.extract_tag("{foo}bar")
    self.assertEqual(extracted, "bar")
def test_blank_input(self):
    """Blank input makes extract_tag return an empty string."""
    extracted = utils.extract_tag("")
    self.assertEqual(extracted, "")
def homonymic_list_generator(content):
    """Prepare a list with (headword, meanings, verse, verseNumDetails and
    pageNumDetails) tuple.

    content is the marked-up text of a homonymic dictionary. Line kinds:
      * '$headword;gender' - headword line;
      * '#meaning1,meaning2,...' - meanings of the preceding headword;
      * ';end' - end of data, the pending verse is put into the result;
      * ';tag...' - marker line, split with utils.extract_tag; the p / k /
        v / vv tags update page number, kanda, varga and subvarga;
      * anything else - part of a verse (verses may span several lines).

    Headword/meaning pairs wait in a temporary list until their verse has
    been read; putVerse then moves them into the result. The global
    verseDetails is (re)initialised here so other functions can use it.

    Returns the result list as built up by putVerse.
    """
    # Making it global so that it can be used in other functions too.
    global verseDetails
    # Initialize a VerseInfo class instance.
    verseDetails = utils.VerseInfo()
    # Result will store tuples (headword, meaning, verse)
    result = []
    # Initialize blank verse
    verse = ''
    # lineType list holds 'h', 'm', 'v' for headword, meaning and verse lines.
    lineType = []
    # A temporary placeholder which will be emptied into result list
    # whenever the verse is allocated to it.
    wordsOnHand = []

    for line in content.split('\n'):
        # If the line is headword line,
        if line.startswith('$'):
            # If the preceding line was a verse, and current a headword,
            # time to add to result list. lineType is still empty when a
            # headword comes before any recorded line, hence the guard.
            if lineType and lineType[-1] == 'v':
                verseDetails.update_verseNum(verse)
                (verse, wordsOnHand, result) = putVerse(verse, wordsOnHand,
                                                        result)
            # Extract the headword and gender from headword line.
            # Typical headword line is `$headword;gender`
            headword, gender = line.rstrip().lstrip('$').split(';')
            # lineType is appended with 'h' for headword.
            lineType.append('h')
        # If the line is a meaning line,
        elif line.startswith('#'):
            # typical meaning line is `#meaning1,meaning2,meaning3,...`
            meanings = line.rstrip().lstrip('#').split(',')
            # Store the (headword, meaning) tuples in temporary wordsOnHand
            # list. They will keep on waiting for the verse. Once verse is
            # added, and a new headword starts, this will be added to result.
            wordsOnHand.append((headword, meanings))
            # lineType is marked 'm' for meaning.
            lineType.append('m')
        # ';end' has to be tested before the generic ';' markers, otherwise
        # this branch can never be reached and the last verse is lost.
        elif line.startswith(';end'):
            # Put the last verse, as there will not be any next headword.
            # NOTE(review): unlike the headword branch, the verse number is
            # not refreshed here before putVerse -- confirm whether
            # verseDetails.update_verseNum(verse) is wanted as well.
            (verse, wordsOnHand, result) = putVerse(verse, wordsOnHand,
                                                    result)
        # Other markers like ;p for page, ;k for kanda, ;v for varga etc.
        elif line.startswith(';'):
            (tag, value) = utils.extract_tag(line)
            if tag == 'p':
                verseDetails.update_pageNum(value)
            if tag == 'k':
                verseDetails.update_kanda(value)
            if tag == 'v':
                verseDetails.update_varga(value)
            if tag == 'vv':
                verseDetails.update_subvarga(value)
        # Lines which are unmarked are verses.
        # The verses may span more than one line too. Therefore adding them up.
        else:
            verse += line + '<BR>'
            # Mark lineType 'v' for verse.
            lineType.append('v')
    return result