Example #1
 def __str__(self):
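     """Return the config as a string: cleaned header lines, then each key = value pair preceded by its comment."""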
     string = "\n".join([line.strip() for line in self.header.splitlines() if line.strip()]) + '\n\n'
     written_keys = []
     for key, value in self.sorted_defaults:
         if value["comment"]:
             string += "\n" + "\n".join(line.strip() for line in value["comment"].splitlines() if line.strip())
         # string += "\n%s = %s\n" % (key,repr(self.data[key]))
         string += "\n%s = %s\n" % (key, pretty_print(self.data[key]))
         written_keys.append(key)
     for key in self.data:
         if key not in written_keys:
             # string += "\n%s = %s\n" % (key,repr(self.data[key]))
             string += "\n%s = %s\n" % (key, pretty_print(self.data[key]))
             written_keys.append(key)
     return string
Example #2
 def parse_dc_header(self):
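     """Parse Dublin Core header"""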
     load_metadata = []
     for filename in self.list_files():
         data = {}
         fn = self.textdir + filename
         header = ""
         with open(fn) as fh:
             for line in fh:
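                 # copy lines from the header's opening tag through its closing tag, then stop reading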
                 start_scan = re.search(r"<teiheader>|<temphead>|<head>", line, re.IGNORECASE)
                 end_scan = re.search(r"</teiheader>|</?temphead>|</head>", line, re.IGNORECASE)
                 if start_scan:
                     header += line[start_scan.start():]
                 elif end_scan:
                     header += line[:end_scan.end()]
                     break
                 else:
                     header += line
         matches = re.findall(r'<meta name="DC\.([^"]+)" content="([^"]+)"', header)
         if not matches:
             matches = re.findall('<dc:([^>]+)>([^>]+)>', header)
         for metadata_name, metadata_value in matches:
             metadata_value = convert_entities(metadata_value)
             metadata_name = metadata_name.lower()
             data[metadata_name] = metadata_value
         data["filename"] = filename  # place at the end in case the value was in the header
         data = self.create_year_field(data)
         if self.debug:
             print(pretty_print(data))
         load_metadata.append(data)
     return load_metadata
Example #3
 def parse_tei_header(self):
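     """Parse header in TEI files"""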
     load_metadata = []
     metadata_xpaths = self.parser_config["doc_xpaths"]
     deleted_files = []
     for file in os.scandir(self.textdir):
         data = {"filename": file.name}
         header = ""
         with open(file.path) as text_file:
             try:
                 file_content = "".join(text_file.readlines())
             except UnicodeDecodeError:
                 deleted_files.append(file.name)
                 continue
         try:
             start_header_index = re.search(r'<teiheader', file_content, re.I).start()
             end_header_index = re.search(r'</teiheader', file_content, re.I).start()
         except AttributeError:  # tag not found
             deleted_files.append(file.name)
             continue
         header = file_content[start_header_index:end_header_index]
         header = convert_entities(header)
         if self.debug:
             print("parsing %s header..." % file.name)
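         # recover=True lets lxml keep parsing even if the extracted header is not well-formed XML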
         parser = etree.XMLParser(recover=True)
         try:
             tree = etree.fromstring(header, parser)
             trimmed_metadata_xpaths = []
             for field in metadata_xpaths:
                 for xpath in metadata_xpaths[field]:
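                     # XPaths ending in an attribute (@name) are handled by finding the element, then reading the attribute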
                     attr_pattern_match = re.search(r"@([^\/\[\]]+)$", xpath)
                     if attr_pattern_match:
                         xp_prefix = xpath[:attr_pattern_match.start(0)]
                         attr_name = attr_pattern_match.group(1)
                         elements = tree.findall(xp_prefix)
                         for el in elements:
                             if el is not None and el.get(attr_name, ""):
                                 data[field] = el.get(attr_name, "")
                                 break
                     else:
                         el = tree.find(xpath)
                         if el is not None and el.text is not None:
                             data[field] = el.text
                             break
             trimmed_metadata_xpaths = [
                 (metadata_type, xpath, field)
                 for metadata_type in ["div", "para", "sent", "word", "page"]
                 if metadata_type in metadata_xpaths for field in metadata_xpaths[metadata_type]
                 for xpath in metadata_xpaths[metadata_type][field]
             ]
             data = self.create_year_field(data)
             if self.debug:
                 print(pretty_print(data))
             data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
             load_metadata.append(data)
         except etree.XMLSyntaxError:
             deleted_files.append(file.name)
     if deleted_files:
         for f in deleted_files:
             print("%s has no valid TEI header or contains invalid data: removing from database load..." % f)
     return load_metadata
Example #4
 def parse_dc_header(self):
     """Parse Dublin Core header"""
     load_metadata = []
     for file in os.scandir(self.textdir):
         data = {}
         header = ""
         with open(file.path) as fh:
             for line in fh:
                 start_scan = re.search(r"<teiheader>|<temphead>|<head>",
                                        line, re.IGNORECASE)
                 end_scan = re.search(r"</teiheader>|<\/?temphead>|</head>",
                                      line, re.IGNORECASE)
                 if start_scan:
                     header += line[start_scan.start():]
                 elif end_scan:
                     header += line[:end_scan.end()]
                     break
                 else:
                     header += line
         matches = re.findall(r'<meta name="DC\.([^"]+)" content="([^"]+)"',
                              header)
         if not matches:
             matches = re.findall(r"<dc:([^>]+)>([^>]+)>", header)
         for metadata_name, metadata_value in matches:
             metadata_value = convert_entities(metadata_value)
             metadata_name = metadata_name.lower()
             data[metadata_name] = metadata_value
         data["filename"] = file.name  # place at the end in case the value was in the header
         data = self.create_year_field(data)
         if self.debug:
             print(pretty_print(data))
         load_metadata.append(data)
     return load_metadata
Example #5
 def __str__(self):
     string = "\n".join([
         line.strip() for line in self.header.splitlines() if line.strip()
     ]) + "\n\n"
     written_keys = []
     for key, value in self.defaults.items():
         if value["comment"]:
             string += "\n" + "\n".join(
                 line.strip()
                 for line in value["comment"].splitlines() if line.strip())
         string += "\n%s = %s\n" % (key, pretty_print(self.data[key]))
         written_keys.append(key)
     for key in self.data:
         if key not in written_keys:
             string += "\n%s = %s\n" % (key, pretty_print(self.data[key]))
             written_keys.append(key)
     return string
Example #6
 def parse_tei_header(self):
     """Parse header in TEI files"""
     load_metadata = []
     metadata_xpaths = self.parser_config["doc_xpaths"]
     doc_count = len(os.listdir(self.textdir))
     for pos, file in enumerate(os.scandir(self.textdir)):
         data = {"filename": file.name}
         header = ""
         with open(file.path) as text_file:
             try:
                 file_content = "".join(text_file.readlines())
             except UnicodeDecodeError:
                 self.deleted_files.append(file.name)
                 continue
         try:
             start_header_index = re.search(r"<teiheader", file_content,
                                            re.I).start()
             end_header_index = re.search(r"</teiheader", file_content,
                                          re.I).start()
         except AttributeError:  # tag not found
             self.deleted_files.append(file.name)
             continue
         header = file_content[start_header_index:end_header_index]
         header = convert_entities(header)
         if self.debug:
             print("parsing %s header..." % file.name)
         parser = lxml.etree.XMLParser(recover=True)
         try:
             tree = lxml.etree.fromstring(header, parser)
             trimmed_metadata_xpaths = []
             for field in metadata_xpaths:
                 for xpath in metadata_xpaths[field]:
                     xpath = xpath.rstrip("/")  # make sure there are no trailing slashes which make lxml die
                     try:
                         elements = tree.xpath(xpath)
                     except lxml.etree.XPathEvalError:
                         continue
                     for element in elements:
                         if element is not None:
                             value = ""
                             if isinstance(element, lxml.etree._Element) and element.text is not None:
                                 value = element.text.strip()
                             elif isinstance(element, lxml.etree._ElementUnicodeResult):
                                 value = str(element).strip()
                             if value:
                                 data[field] = value
                                 break
                     else:  # only continue looping over xpaths if no break in inner loop
                         continue
                     break
             trimmed_metadata_xpaths = [
                 (metadata_type, xpath, field) for metadata_type in
                 ["div", "para", "sent", "word", "page"]
                 if metadata_type in metadata_xpaths
                 for field in metadata_xpaths[metadata_type]
                 for xpath in metadata_xpaths[metadata_type][field]
             ]
             data = self.create_year_field(data)
             if self.debug:
                 print(pretty_print(data))
             data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
             load_metadata.append(data)
         except lxml.etree.XMLSyntaxError:
             self.deleted_files.append(file.name)
         print(
             f"\r{time.ctime()}: Parsing document level metadata: {pos+1}/{doc_count} done...",
             flush=True,
             end="")
     if self.deleted_files:
         for f in self.deleted_files:
             print(
                 "%s has no valid TEI header or contains invalid data: removing from database load..."
                 % f)
     return load_metadata
Example #7
 def __str__(self):
     """String representation of parsed loader config."""
     return pretty_print(self.values)
Example #8
 def parse_tei_header(self):
     """Parse header in TEI files"""
     load_metadata = []
     metadata_xpaths = self.parser_config["doc_xpaths"]
     self.deleted_files = []
     for file in os.scandir(self.textdir):
         data = {"filename": file.name}
         header = ""
         with open(file.path) as text_file:
             try:
                 file_content = "".join(text_file.readlines())
             except UnicodeDecodeError:
                 self.deleted_files.append(file.name)
                 continue
         try:
             start_header_index = re.search(r"<teiheader", file_content, re.I).start()
             end_header_index = re.search(r"</teiheader", file_content, re.I).start()
         except AttributeError:  # tag not found
             self.deleted_files.append(file.name)
             continue
         header = file_content[start_header_index:end_header_index]
         header = convert_entities(header)
         if self.debug:
             print("parsing %s header..." % file.name)
         parser = etree.XMLParser(recover=True)
         try:
             tree = etree.fromstring(header, parser)
             trimmed_metadata_xpaths = []
             for field in metadata_xpaths:
                 for xpath in metadata_xpaths[field]:
                     attr_pattern_match = re.search(r"@([^\/\[\]]+)$", xpath)
                     if attr_pattern_match:
                         xp_prefix = xpath[: attr_pattern_match.start(0)]
                         attr_name = attr_pattern_match.group(1)
                         elements = tree.findall(xp_prefix)
                         for el in elements:
                             if el is not None and el.get(attr_name, ""):
                                 data[field] = el.get(attr_name, "")
                                 break
                     else:
                         el = tree.find(xpath)
                         if el is not None and el.text is not None:
                             data[field] = el.text
                             break
             trimmed_metadata_xpaths = [
                 (metadata_type, xpath, field)
                 for metadata_type in ["div", "para", "sent", "word", "page"]
                 if metadata_type in metadata_xpaths
                 for field in metadata_xpaths[metadata_type]
                 for xpath in metadata_xpaths[metadata_type][field]
             ]
             data = self.create_year_field(data)
             if self.debug:
                 print(pretty_print(data))
             data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
             load_metadata.append(data)
         except etree.XMLSyntaxError:
             self.deleted_files.append(file.name)
     if self.deleted_files:
         for f in self.deleted_files:
             print("%s has no valid TEI header or contains invalid data: removing from database load..." % f)
     return load_metadata
Example #9
 def parse_tei_header(self):
     """Parse header in TEI files"""
     load_metadata = []
     metadata_xpaths = self.parser_config["doc_xpaths"]
     for file in os.scandir(self.textdir):
         data = {"filename": file.name}
         header = ""
         with open(file.path) as text_file:
             try:
                 file_content = "".join(text_file.readlines())
             except UnicodeDecodeError:
                 self.deleted_files.append(file.name)
                 continue
         try:
             start_header_index = re.search(r"<teiheader", file_content, re.I).start()
             end_header_index = re.search(r"</teiheader", file_content, re.I).start()
         except AttributeError:  # tag not found
             self.deleted_files.append(file.name)
             continue
         header = file_content[start_header_index:end_header_index]
         header = convert_entities(header)
         if self.debug:
             print("parsing %s header..." % file.name)
         parser = lxml.etree.XMLParser(recover=True)
         try:
             tree = lxml.etree.fromstring(header, parser)
             trimmed_metadata_xpaths = []
             for field in metadata_xpaths:
                 for xpath in metadata_xpaths[field]:
                     xpath = xpath.rstrip("/") # make sure there are no trailing slashes which make lxml die
                     try:
                         elements = tree.xpath(xpath)
                     except lxml.etree.XPathEvalError:
                         continue
                     for element in elements:
                         if element is not None:
                             value = ""
                             if isinstance(element, lxml.etree._Element) and element.text is not None:
                                 value = element.text.strip()
                             elif isinstance(element, lxml.etree._ElementUnicodeResult):
                                 value = str(element).strip()
                             if value:
                                 data[field] = value
                                 break
                     else:  # only continue looping over xpaths if no break in inner loop
                         continue
                     break
             trimmed_metadata_xpaths = [
                 (metadata_type, xpath, field)
                 for metadata_type in ["div", "para", "sent", "word", "page"]
                 if metadata_type in metadata_xpaths
                 for field in metadata_xpaths[metadata_type]
                 for xpath in metadata_xpaths[metadata_type][field]
             ]
             data = self.create_year_field(data)
             if self.debug:
                 print(pretty_print(data))
             data["options"] = {"metadata_xpaths": trimmed_metadata_xpaths}
             load_metadata.append(data)
         except lxml.etree.XMLSyntaxError:
             self.deleted_files.append(file.name)
     if self.deleted_files:
         for f in self.deleted_files:
             print("%s has no valid TEI header or contains invalid data: removing from database load..." % f)
     return load_metadata