def minidom_pretty_print(content):
    """Return ``content`` re-indented by ``xml.dom.minidom``.

    The text is flattened to a single line, passed through the style and
    spacing helpers, separated from the prefix reported by
    ``split_prefix``, parsed with minidom and pretty-printed. Blank lines
    are then dropped, the style helpers are applied in reverse and the
    prefix is put back in front of the result.

    Errors are not propagated: the exception is printed, the current value
    of ``content`` is written to ``./pretty_print.xml`` and the function
    returns whatever ``pretty`` held at that moment (``None`` if the failure
    happened before pretty-printing).
    """
    pretty = None
    try:
        content = content.replace('\r', '')
        content = ' '.join(content.split('\n'))
        content = preserve_styles(content)
        content = remove_exceeding_spaces_in_all_tags(content)
        prefix, content = split_prefix(content)
        if isinstance(content, unicode):
            # minidom is fed encoded bytes, not a unicode object.
            content = content.encode('utf-8')
        # A leftover debugging print (hard-coded to the text 'Stern et al')
        # used to live here; it has been removed.
        doc = xml.dom.minidom.parseString(content)
        pretty = doc.toprettyxml().strip()
        if not isinstance(pretty, unicode):
            pretty = pretty.decode('utf-8')
        ign, pretty = split_prefix(pretty)
        pretty = '\n'.join(
            [item for item in pretty.split('\n') if item.strip() != ''])
        pretty = remove_break_lines_off_element_content(pretty)
        pretty = restore_styles(pretty)
        pretty = prefix + remove_exceding_style_tags(pretty).strip()
    except Exception as e:
        # Best effort: report the problem and keep a copy of the input that
        # failed so it can be inspected afterwards.
        print('ERROR in pretty')
        print(e)
        fs_utils.write_file('./pretty_print.xml', content)
    return pretty
def java_xml_utils_style_validation(xml_filename, doctype, report_filename, xsl_prep_report, xsl_report):
    """Generate the style-checker report and tell whether it is clean.

    ``xml_filename`` is transformed with ``xsl_prep_report`` into an
    intermediate XML file (``report_filename`` with ``.html`` replaced by
    ``.xml``), which is then transformed with ``xsl_report`` into
    ``report_filename``. If the first transformation fails, a fatal-error
    message is written to ``report_filename`` instead.

    Returns True when the report contains ``Total of errors = 0`` and either
    contains ``Total of warnings = 0`` or no ``Total of warnings =`` text.
    """
    # STYLE CHECKER REPORT
    register_log('java_xml_utils_style_validation: inicio')
    style_ok = False
    xml_report = report_filename.replace('.html', '.xml')

    # Start from a clean slate: drop reports left by a previous run.
    for stale in (xml_report, report_filename):
        if os.path.exists(stale):
            os.unlink(stale)

    parameters = {}
    bkp_xml_filename = xml_utils.apply_dtd(xml_filename, doctype)

    prepared = java_xml_utils.xml_transform(
        xml_filename, xsl_prep_report, xml_report, parameters)
    if not prepared:
        message = validation_status.STATUS_FATAL_ERROR + ': ' + _('Unable to create') + ' ' + report_filename
        fs_utils.write_file(report_filename, message)
    else:
        java_xml_utils.xml_transform(
            xml_report, xsl_report, report_filename, parameters)

    if os.path.isfile(report_filename):
        report_text = fs_utils.read_file(report_filename)
        no_errors = 'Total of errors = 0' in report_text
        no_warnings = (
            'Total of warnings = 0' in report_text or
            'Total of warnings =' not in report_text)
        style_ok = no_errors and no_warnings

    # Put the original XML back and discard the intermediate report.
    if os.path.isfile(bkp_xml_filename):
        xml_utils.restore_xml_file(xml_filename, bkp_xml_filename)
    if os.path.isfile(xml_report):
        os.unlink(xml_report)

    register_log('java_xml_utils_style_validation: fim')
    return style_ok
def minidom_pretty_print(content):
    """Pretty-print ``content`` with ``xml.dom.minidom`` and return it.

    The input is flattened to one line, run through the style/spacing
    helpers and stripped of the prefix found by ``split_prefix`` before
    being parsed. The pretty-printed text has its blank lines removed, the
    helpers are applied in reverse and the prefix is prepended again.

    On any exception the error and the current ``content`` are printed,
    ``content`` is written to ``./pretty_print.xml`` and the exception is
    re-raised.
    """
    pretty = None
    try:
        flattened = content.replace("\r", "")
        flattened = " ".join(flattened.split("\n"))
        content = flattened
        content = preserve_styles(content)
        content = remove_exceeding_spaces_in_all_tags(content)
        prefix, content = split_prefix(content)
        if isinstance(content, unicode):
            # The parser receives encoded bytes.
            content = content.encode("utf-8")

        document = xml.dom.minidom.parseString(content)
        pretty = document.toprettyxml().strip()
        if not isinstance(pretty, unicode):
            pretty = pretty.decode("utf-8")

        _ignored, pretty = split_prefix(pretty)
        kept_lines = []
        for line in pretty.split("\n"):
            if line.strip() != "":
                kept_lines.append(line)
        pretty = "\n".join(kept_lines)

        pretty = remove_break_lines_off_element_content(pretty)
        pretty = restore_styles(pretty)
        pretty = prefix + remove_exceding_style_tags(pretty).strip()
    except Exception as error:
        print("ERROR in pretty")
        print(error)
        print(content)
        # Keep a copy of the failing input for later inspection.
        fs_utils.write_file("./pretty_print.xml", content)
        raise
    return pretty
def validate(self, xml_filename, dtd_report_filename, style_report_filename):
    """Run DTD and style validation, logging every step.

    The wrapped validator is set up for ``xml_filename``, its XML content is
    loaded with ``xml_utils.load_xml`` and the DTD validation report is
    produced. The style validation only runs when loading reported no
    error; otherwise a fatal-error message is written to
    ``style_report_filename``.

    Returns ``(xml, is_valid_dtd, (f, e, w))`` where ``xml`` comes from
    ``load_xml`` and ``(f, e, w)`` are the three values returned by
    ``style_checker_statistics`` for the style report text.
    """
    self.logger.register('XMLValidator.validate - inicio')

    self.logger.register('XMLValidator.validate - self.validator.setup()')
    self.validator.logger = self.logger
    self.validator.setup(xml_filename)

    self.logger.register('XMLValidator.validate - xml_utils.load_xml')
    xml, load_error = xml_utils.load_xml(self.validator.xml.content)

    self.logger.register('XMLValidator.validate - self.validator.dtd_validation')
    is_valid_dtd = self.validator.dtd_validation(dtd_report_filename)

    if load_error is not None:
        # The XML could not be loaded: the style report only carries the
        # failure message.
        self.logger.register('XMLValidator.validate - e is not None')
        heading = validation_status.STATUS_FATAL_ERROR + ': '
        heading = heading + _('Unable to load {xml}. ').format(xml=xml_filename)
        content = heading + '\n' + load_error
        fs_utils.write_file(style_report_filename, content)
    else:
        self.logger.register('XMLValidator.validate - self.validator.style_validation')
        self.validator.style_validation(style_report_filename)
        self.logger.register('XMLValidator.validate - fs_utils.read_file')
        content = fs_utils.read_file(style_report_filename)

    self.logger.register('XMLValidator.validate - style_checker_statistics')
    f, e, w = style_checker_statistics(content)

    self.logger.register('XMLValidator.validate - self.validator.finish()')
    self.validator.finish()

    self.logger.register('XMLValidator.validate - fim')
    return (xml, is_valid_dtd, (f, e, w))
def packtools_dtd_validation(xml_filename, report_filename):
    """Validate ``xml_filename`` with packtools and write the error report.

    The ``message`` of every error returned by
    ``packtools.stylechecker.XMLValidator.validate`` is written, one per
    line, to ``report_filename``. Returns the validity flag reported by
    packtools.
    """
    # Imported here so the dependency is only needed when this is called.
    import packtools

    validator = packtools.stylechecker.XMLValidator(xml_filename)
    is_valid, errors = validator.validate()

    messages = []
    for error in errors:
        messages.append(error.message)
    fs_utils.write_file(report_filename, '\n'.join(messages))
    return is_valid
def _change_doctype(self):
    """Replace or insert the DOCTYPE in ``self.content`` and save the file.

    ``\\r\\n`` line ends are normalised to ``\\n`` first. Then:

    * if the content has a ``<!DOCTYPE ...>`` declaration, it is replaced by
      ``self.doctype``; when ``self.doctype`` is empty the declaration is
      removed together with the line break that follows it (if there is
      one);
    * otherwise, if the content starts with an XML declaration
      (``<?xml ... ?>``), ``self.doctype`` is inserted on its own line
      between the declaration and the next ``<``.

    The resulting content is always written to ``self.xml_filename``.
    """
    if self.logger is not None:
        self.logger.register('XML._change_doctype - inicio')

    self.content = self.content.replace('\r\n', '\n')

    if '<!DOCTYPE' in self.content:
        find_text = self.content[self.content.find('<!DOCTYPE'):]
        # Up to and including the first '>' (empty when there is none).
        find_text = find_text[0:find_text.find('>') + 1]
        if len(find_text) > 0:
            if len(self.doctype) > 0:
                self.content = self.content.replace(find_text, self.doctype)
            elif find_text + '\n' in self.content:
                self.content = self.content.replace(
                    find_text + '\n', self.doctype)
    elif self.content.startswith('<?xml ') and '?>' in self.content:
        # The '?>' check guards the whole branch: without a terminated XML
        # declaration there is no ``xml_proc`` to build the new content
        # from (this used to surface as a NameError), so the content is
        # left as it is.
        xml_proc = self.content[0:self.content.find('?>') + 2]
        # Skip the '<' of '<?xml' to locate the next '<'.
        xml = self.content[1:]
        if '<' in xml:
            xml = xml[xml.find('<'):]
        if len(self.doctype) > 0:
            self.content = xml_proc + '\n' + self.doctype + '\n' + xml
        else:
            self.content = xml_proc + '\n' + xml

    fs_utils.write_file(self.xml_filename, self.content)

    if self.logger is not None:
        self.logger.register('XML._change_doctype - fim')
def update_wayta_orgname_location_country(source, wayta_normalized_aff, wayta_orgname_location_country):
    """Download the wayta list and save it as sorted, de-duplicated rows.

    The text returned by ``fs_utils.get_downloaded_data`` has its ``;``
    replaced by tabs and is passed through ``remove_exceding_blank_spaces``.
    Every line that splits into exactly six tab-separated fields is
    re-ordered and kept; the unique rows are written, sorted, to
    ``wayta_orgname_location_country``. Line counts are printed along the
    way as progress information.
    """
    items = fs_utils.get_downloaded_data(source, wayta_normalized_aff)
    print('wayta normalized aff')
    print(len(items.split('\n')))
    items = items.replace(';', '\t')
    print(1)
    print(len(items.split('\n')))
    items = remove_exceding_blank_spaces(items)
    print(2)
    print(len(items.split('\n')))
    items = items.split('\n')
    print(3)
    print(len(items))
    results = []
    for item in items:
        # A leading quote followed later by '"<tab>' is treated as a quoted
        # first field: the opening quote and the quote before the tab are
        # dropped.
        if item.startswith('"') and '"\t' in item:
            item = item[1:].replace('"\t', '\t')
            # NOTE(review): nesting rebuilt from a flattened source line;
            # confirm that un-doubling '""' belongs inside this branch.
            item = item.replace('""', '"')
        parts = item.split('\t')
        # Lines that do not have exactly six fields are silently skipped.
        if len(parts) == 6:
            bad, correct, country_name, country_code, state, city = parts
            # The first field is discarded and the remaining ones re-ordered.
            results.append('\t'.join([correct, city, state, country_code, country_name]))
    # Remove duplicated rows before saving.
    results = list(set(results))
    print('downloaded:')
    print(len(results))
    fs_utils.write_file(wayta_orgname_location_country, '\n'.join(sorted(results)))
def temp_xml_filename(self):
    """Write the temporary PubMed XML file and return its path.

    The file is created in ``self.issue_stuff.temp_path`` with the name
    ``pubmed_tmp_`` + base name of ``self.pubmed_filename``. Its content is
    an XML declaration followed by a ``<root>`` element wrapping
    ``self.articles_filenames_xml_content`` and
    ``self.articles_pids_xml_content``.
    """
    base_name = os.path.basename(self.pubmed_filename)
    temp_filename = self.issue_stuff.temp_path + '/pubmed_tmp_' + base_name

    pieces = [
        '<?xml version="1.0" encoding="utf-8"?>\n',
        '<root>',
        self.articles_filenames_xml_content,
        self.articles_pids_xml_content,
        '</root>',
    ]
    fs_utils.write_file(temp_filename, ''.join(pieces))
    return temp_filename
def xml_content_transform(content, xsl_filename):
    """Apply ``xsl_filename`` to ``content`` and return the result.

    ``content`` is written to a temporary file, transformed with
    ``xml_transform`` into a second temporary file and read back. When the
    transformation reports failure the original ``content`` is returned
    unchanged.

    Both temporary files are always removed, including when the
    transformation fails or a step raises an exception.
    """
    source_file = tempfile.NamedTemporaryFile(delete=False)
    source_file.close()
    result_file = tempfile.NamedTemporaryFile(delete=False)
    result_file.close()
    try:
        fs_utils.write_file(source_file.name, content)
        if xml_transform(source_file.name, xsl_filename, result_file.name):
            content = fs_utils.read_file(result_file.name)
    finally:
        # The files were created with delete=False, so they must be removed
        # explicitly; the existence check covers files already moved or
        # deleted by the transformation itself.
        for name in (source_file.name, result_file.name):
            if os.path.exists(name):
                os.unlink(name)
    return content
def validate_article_xml(xml_filename, dtd_files, dtd_report_filename, style_report_filename):
    """Validate an article XML against the DTD and the style checker.

    The DTD validation always runs. The style validation runs only when
    ``xml_utils.load_xml`` reports no error; otherwise a fatal-error message
    is written to ``style_report_filename``.

    Returns ``(xml, is_valid_dtd, (f, e, w))`` where ``xml`` comes from
    ``load_xml`` and ``(f, e, w)`` are the three values returned by
    ``style_checker_statistics`` for ``style_report_filename``.
    """
    register_log('validate_article_xml: inicio')
    register_log('validate_article_xml: inicio')

    xml, load_error = xml_utils.load_xml(xml_filename)
    is_valid_dtd = dtd_validation(
        xml_filename,
        dtd_report_filename,
        dtd_files.doctype_with_local_path,
        dtd_files.database_name)

    if load_error is not None:
        # Unloadable XML: the style report only carries the failure text.
        text = validation_status.STATUS_FATAL_ERROR + ': ' + _('Unable to load') + ' ' + xml_filename
        text = text + '\n' + load_error
        fs_utils.write_file(style_report_filename, text)
    else:
        # The returned flag is not part of this function's result; the call
        # is made for the report it generates.
        style_validation(
            xml_filename,
            dtd_files.doctype_with_local_path,
            style_report_filename,
            dtd_files.xsl_prep_report,
            dtd_files.xsl_report,
            dtd_files.database_name)

    f, e, w = style_checker_statistics(style_report_filename)
    register_log('validate_article_xml: fim')
    return (xml, is_valid_dtd, (f, e, w))
def format_reports_for_web(report_path, pkg_path, issue_path):
    """Publish the reports of ``report_path`` under the web application.

    The destination is ``<local_web_app_path>/htdocs/reports/<issue_path>``
    and is created when missing. ``.zip`` files and ``xml_converter.txt``
    are deleted from ``report_path``; every other file is copied to the
    destination with its ``file:///`` references to ``pkg_path`` and
    ``report_path`` rewritten to ``/img/revistas/<issue_path>`` and
    ``/reports/<issue_path>``.
    """
    web_reports_path = converter_env.local_web_app_path + '/htdocs/reports/' + issue_path
    if not os.path.isdir(web_reports_path):
        os.makedirs(web_reports_path)

    for name in os.listdir(report_path):
        source = report_path + '/' + name
        if name.endswith('.zip') or name == 'xml_converter.txt':
            os.unlink(source)
            continue

        text = fs_utils.read_file(source)
        text = text.replace('file:///' + pkg_path, '/img/revistas/' + issue_path)
        text = text.replace('file:///' + report_path, '/reports/' + issue_path)
        if isinstance(text, unicode):
            text = text.encode('utf-8')
        fs_utils.write_file(web_reports_path + '/' + name, text)
def report_differences(old, new, deleted_report, added_report, fixed_report, replaced_report):
    """Compare two line-based files and write four difference reports.

    Lines of ``old`` missing from ``new`` are candidates for deletion and
    lines of ``new`` missing from ``old`` are candidates for addition. For
    each deletion candidate a similar added line is searched with
    ``found_similar`` and, failing that, with ``found_similar_2`` among the
    added lines of the same length. Candidates with a match are reported as
    replaced (old line, new line, blank line) and their match as fixed; the
    others are reported as deleted. Added lines that were not used as a
    match are reported as added.

    Returns ``[len(deleted), len(added), len(fixed)]``.
    """
    old_items = fs_utils.read_file(old).split('\n')
    print('current:')
    print(len(old_items))

    new_items = fs_utils.read_file(new).split('\n')
    print('new:')
    print(len(new_items))

    # Membership is tested against sets so the comparison stays linear; the
    # lists keep the original order and any repeated lines.
    old_lookup = set(old_items)
    new_lookup = set(new_items)
    maybe_deleted = [item for item in old_items if item not in new_lookup]
    maybe_added = [item for item in new_items if item not in old_lookup]

    print('=>')
    print([len(maybe_deleted), len(maybe_added)])

    organized_items = classify_items_by_len(maybe_added)
    deleted = []
    replaced = []
    fixed = []
    total = '/' + str(len(maybe_deleted))
    i = 0
    for item in maybe_deleted:
        i += 1
        # Progress feedback every 500 items.
        if str(i).endswith('500') or str(i).endswith('000'):
            print(str(i) + total)
        similar = found_similar(item, maybe_added)
        if similar is None:
            similar = found_similar_2(
                item, organized_items.get(len(item), []))
        if similar is None:
            deleted.append(item)
        else:
            replaced.append(item + '\n' + similar + '\n')
            fixed.append(similar)

    fixed_lookup = set(fixed)
    added = [item for item in maybe_added if item not in fixed_lookup]

    fs_utils.write_file(replaced_report, '\n'.join(replaced))
    fs_utils.write_file(fixed_report, '\n'.join(fixed))
    fs_utils.write_file(deleted_report, '\n'.join(deleted))
    fs_utils.write_file(added_report, '\n'.join(added))
    return [len(deleted), len(added), len(fixed)]
def transform_content(self, xsl_filename):
    """Transform ``self.content`` with ``xsl_filename`` and return the text.

    ``self.content`` is written to a temporary file, ``self.transform_file``
    is asked to write the result to a second temporary file, and that file
    is read back. An empty string is returned when the transformation
    reports failure.

    Both temporary files are removed before returning, also when a step
    raises an exception.
    """
    if self.logger is not None:
        self.logger.register('XML.transform_content - inicio')

    f = tempfile.NamedTemporaryFile(delete=False)
    f.close()
    f2 = tempfile.NamedTemporaryFile(delete=False)
    f2.close()

    content = ''
    try:
        fs_utils.write_file(f.name, self.content)
        # NOTE(review): confirm this argument order against the signature
        # of transform_file in this class.
        if self.transform_file(f.name, xsl_filename, f2.name):
            content = fs_utils.read_file(f2.name)
    finally:
        # Remove each temporary file. The loop previously unlinked f.name
        # on every pass, which failed on the second pass and never removed
        # f2.
        for item in (f.name, f2.name):
            if os.path.exists(item):
                os.unlink(item)

    if self.logger is not None:
        self.logger.register('XML.transform_content - fim')
    return content
def xml_validate(xml_filename, result_filename, doctype=None):
    """Validate ``xml_filename`` with the Java XMLCheck tool.

    ``doctype`` is applied to the file through ``xml_utils.apply_dtd``
    before the check; when it is given, the tool is run with
    ``--validate``. The tool output is captured in a file under ``TMP_DIR``
    and finally moved to ``result_filename``. If the output contains
    ``ERROR`` (case-insensitive), a numbered listing of the XML file is
    appended to it. If the tool produced no output file, a generic error
    message followed by the command line is reported instead.

    The backup returned by ``apply_dtd`` is moved back over
    ``xml_filename`` before returning.

    Returns True when the final result text has no ``ERROR`` in it.
    """
    validation_type = ''
    if doctype is None:
        doctype = ''
    else:
        validation_type = '--validate'

    bkp_xml_filename = xml_utils.apply_dtd(xml_filename, doctype)
    temp_result_filename = TMP_DIR + '/' + os.path.basename(result_filename)
    if os.path.isfile(result_filename):
        os.unlink(result_filename)
    if not os.path.isdir(os.path.dirname(result_filename)):
        os.makedirs(os.path.dirname(result_filename))

    cmd = JAVA_PATH + ' -cp "' + JAR_VALIDATE + '" br.bireme.XMLCheck.XMLCheck "' + xml_filename + '" ' + validation_type + '>"' + temp_result_filename + '"'
    cmd = cmd.encode(encoding=sys.getfilesystemencoding())
    os.system(cmd)

    if os.path.exists(temp_result_filename):
        result = fs_utils.read_file(temp_result_filename, sys.getfilesystemencoding())
        if 'ERROR' in result.upper():
            # Append the XML with line numbers. The first line of the file
            # is left out, so the second line is labelled 1.
            numbered = []
            # The context manager closes the file handle, which used to be
            # left open.
            with open(xml_filename, 'r') as xml_file:
                for n, line in enumerate(xml_file):
                    if n > 0:
                        numbered.append(str(n) + ':' + line)
            result += '\n' + ''.join(numbered).decode('utf-8')
            fs_utils.write_file(temp_result_filename, result)
    else:
        result = 'ERROR: Not valid. Unknown error.\n' + cmd
        fs_utils.write_file(temp_result_filename, result)

    shutil.move(temp_result_filename, result_filename)
    # Restore the XML as it was before the doctype was applied.
    shutil.move(bkp_xml_filename, xml_filename)
    return 'ERROR' not in result.upper()
def transform_file(self, xsl_filename, result_filename, parameters={}):
    """Transform ``self.xml_filename`` with ``xsl_filename`` via Java.

    The transformer writes to the temporary path given by
    ``self.prepare(result_filename)``; that file is then moved to
    ``result_filename``. When the transformer leaves no output, an error
    message followed by the command line is written in its place.

    Returns True when the transformer created its output file.
    """
    def _log(message):
        # Logging is optional: the instance may have no logger.
        if self.logger is not None:
            self.logger.register(message)

    _log('XML.transform_file - inicio')
    temp_result_filename = self.prepare(result_filename)

    _log('XML.transform_file - command - inicio')
    cmd = JAVA_PATH + ' -jar "' + JAR_TRANSFORM + '" -novw -w0 -o "' + temp_result_filename + '"'
    cmd = cmd + ' "' + self.xml_filename + '" "' + xsl_filename + '" ' + format_parameters(parameters)
    cmd = cmd.encode(encoding=sys.getfilesystemencoding())
    os.system(cmd)
    _log('XML.transform_file - command - fim')

    succeeded = os.path.exists(temp_result_filename)
    if not succeeded:
        fs_utils.write_file(
            temp_result_filename, 'ERROR: transformation error.\n' + cmd)
    shutil.move(temp_result_filename, result_filename)

    _log('XML.transform_file - fim')
    return succeeded
def style_validation(self, report_filename):
    """Create the style report and tell whether it is free of problems.

    ``self.xml`` is transformed with ``self.xsl_prep_report`` into an
    intermediate XML report (``report_filename`` with ``.html`` replaced by
    ``.xml``); when that works, the intermediate report is transformed with
    ``self.xsl_report`` into ``report_filename``. If ``report_filename``
    does not exist at the end, an error message is written to it.

    Returns True when the report text contains ``Total of errors = 0`` and
    either contains ``Total of warnings = 0`` or has no
    ``Total of warnings =`` at all.
    """
    xml_report = report_filename.replace('.html', '.xml')

    # Remove reports left over from a previous run.
    for previous in (xml_report, report_filename):
        if os.path.exists(previous):
            os.unlink(previous)

    parameters = {}
    if self.xml.transform_file(self.xsl_prep_report, xml_report, parameters):
        report_transformer = java_xml_utils.XML(xml_report, None)
        report_transformer.logger = self.logger
        report_transformer.transform_file(
            self.xsl_report, report_filename, parameters)
        result = fs_utils.read_file(report_filename)

    if os.path.isfile(xml_report):
        os.unlink(xml_report)

    if not os.path.isfile(report_filename):
        result = 'ERROR: ' + _('Unable to create') + ' ' + report_filename
        fs_utils.write_file(report_filename, result)

    no_errors = 'Total of errors = 0' in result
    no_warnings = (
        'Total of warnings = 0' in result or
        'Total of warnings =' not in result)
    return no_errors and no_warnings
def xml_transform(xml_filename, xsl_filename, result_filename, parameters={}):
    """Transform ``xml_filename`` with ``xsl_filename`` via Java.

    A temporary copy of the XML (from ``create_temp_xml_filename``) is
    transformed into a file under ``TMP_DIR``, which is then moved to
    ``result_filename``. Any previous result is deleted first and the
    destination folder is created when missing. When the transformer
    leaves no output, an error message followed by the command line is
    written in its place.

    Returns True when the transformer created its output file.
    """
    result_dir = os.path.dirname(result_filename)
    temp_result_filename = TMP_DIR + '/' + os.path.basename(result_filename)
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)

    for stale in (result_filename, temp_result_filename):
        if os.path.isfile(stale):
            os.unlink(stale)

    tmp_xml_filename = create_temp_xml_filename(xml_filename)
    cmd = JAVA_PATH + ' -jar "' + JAR_TRANSFORM + '" -novw -w0 -o "' + temp_result_filename + '"'
    cmd = cmd + ' "' + tmp_xml_filename + '" "' + xsl_filename + '" ' + format_parameters(parameters)
    cmd = cmd.encode(encoding=sys.getfilesystemencoding())
    os.system(cmd)

    succeeded = os.path.exists(temp_result_filename)
    if not succeeded:
        fs_utils.write_file(
            temp_result_filename, 'ERROR: transformation error.\n' + cmd)
    shutil.move(temp_result_filename, result_filename)
    fs_utils.delete_file_or_folder(tmp_xml_filename)
    return succeeded
def xml_validate(self, result_filename):
    """Validate ``self.xml_filename`` with the Java XMLCheck tool.

    The tool runs with ``--validate`` unless ``self.doctype`` is an empty
    string. Its output is captured in the temporary path given by
    ``self.prepare(result_filename)``. If the output contains ``ERROR``
    (case-insensitive), a numbered listing of the XML file is appended and
    the text is written to ``result_filename``; otherwise the output file
    is moved there as it is. If the tool produced no output file, a generic
    error message followed by the command line is written instead.

    Returns True when the final result text has no ``ERROR`` in it.
    """
    if self.logger is not None:
        self.logger.register('XML.xml_validate - inicio')

    validation_type = '' if self.doctype == '' else '--validate'
    temp_result_filename = self.prepare(result_filename)

    if self.logger is not None:
        self.logger.register('XML.transform_file - command - inicio')
    cmd = JAVA_PATH + ' -cp "' + JAR_VALIDATE + '" br.bireme.XMLCheck.XMLCheck "' + self.xml_filename + '" ' + validation_type + '>"' + temp_result_filename + '"'
    cmd = cmd.encode(encoding=sys.getfilesystemencoding())
    os.system(cmd)
    if self.logger is not None:
        self.logger.register('XML.transform_file - command - fim')

    if os.path.exists(temp_result_filename):
        result = fs_utils.read_file(temp_result_filename, sys.getfilesystemencoding())
        if 'ERROR' in result.upper():
            # Append the XML with line numbers. The first line of the file
            # is left out, so the second line is labelled 1.
            numbered = []
            # The context manager closes the file handle, which used to be
            # left open.
            with open(self.xml_filename, 'r') as xml_file:
                for n, line in enumerate(xml_file):
                    if n > 0:
                        numbered.append(str(n) + ':' + line)
            result += '\n' + ''.join(numbered).decode('utf-8')
            fs_utils.write_file(result_filename, result)
            os.unlink(temp_result_filename)
        else:
            shutil.move(temp_result_filename, result_filename)
    else:
        result = 'ERROR: Not valid. Unknown error.\n' + cmd
        fs_utils.write_file(result_filename, result)

    # NOTE(review): this second "command - fim" entry is kept as found;
    # its nesting was rebuilt from a flattened source line.
    if self.logger is not None:
        self.logger.register('XML.transform_file - command - fim')
    if self.logger is not None:
        self.logger.register('XML.xml_validate - fim')
    return 'ERROR' not in result.upper()
def apply_dtd(xml_filename, doctype):
    """Rewrite ``xml_filename`` with ``doctype`` and return a backup path.

    The file is first copied, under its own base name, into a new temporary
    directory. Its content is then passed through ``replace_doctype`` and
    written back in place. The returned path is the untouched copy.
    """
    backup_dir = tempfile.mkdtemp()
    backup_filename = backup_dir + '/' + os.path.basename(xml_filename)
    shutil.copyfile(xml_filename, backup_filename)

    original = fs_utils.read_file(xml_filename)
    updated = replace_doctype(original, doctype)
    fs_utils.write_file(xml_filename, updated)
    return backup_filename
def dtd_validation(self, report_filename):
    """Write the stored DTD validation text and return ``self.is_valid``.

    The text kept in ``self._dtd_validation`` is saved to
    ``report_filename``.
    """
    report_text = self._dtd_validation
    fs_utils.write_file(report_filename, report_text)
    return self.is_valid
if len(parts) == 6: bad, correct, country_name, country_code, state, city = parts results.append('\t'.join([correct, city, state, country_code, country_name])) results = list(set(results)) print('downloaded:') print(len(results)) fs_utils.write_file(wayta_orgname_location_country, '\n'.join(sorted(results))) execute_update = False if len(sys.argv) == 1: update_wayta_orgname_location_country(source, wayta_normalized_aff, wayta_orgname_location_country) counts = report_differences(local_orgname_location_country, wayta_orgname_location_country, deleted_report, added_report, fixed_report, replaced_report) print('->') print(counts) print(sum(counts)) elif len(sys.argv) == 2: execute_update = (sys.argv[1] == 'update') if sys.argv[1] == 'fix_local': fs_utils.write_file(local_orgname_location_country, remove_exceding_blank_spaces(fs_utils.read_file(local_orgname_location_country))) if execute_update is True: import institutions_service a = institutions_service.OrgManager() a.create_db() print('db updated') else: print('No update')
def fix_endoflines(filename, destination):
    """Write the lines of ``filename``, stripped and sorted, to ``destination``.

    Each line has its leading and trailing whitespace removed (which also
    drops stray ``\\r`` characters), and the lines are written in sorted
    order, separated by ``\\n``.
    """
    text = fs_utils.read_file(filename)
    lines = [line.strip() for line in text.split('\n')]
    # Sort the cleaned lines. Sorting the raw text itself, as was done
    # before, sorts its individual characters and discards the stripped
    # lines.
    fs_utils.write_file(destination, '\n'.join(sorted(lines)))