def read_xml(path):
    """Parse an XML file and return the parsed tree."""
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(path, parser)
    return tree
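# Hedged usage sketch (not part of the snippet above): "settings.xml" is a
# hypothetical path, and the import mirrors what read_xml assumes is in scope.
from lxml import etree

tree = read_xml("settings.xml")
print(etree.tostring(tree, pretty_print=True).decode("utf-8"))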
def to_dict(self):
    """Return a mapping representing this POM."""
    return OrderedDict([
        ('group_id', self.group_id),
        ('artifact_id', self.artifact_id),
        ('version', str(self.version) if self.version else None),
        ('classifier', self.classifier),
        ('type', self.type),
    ])


POM_PARSER = etree.XMLParser(
    recover=True,
    remove_comments=True,
    remove_pis=True,
    remove_blank_text=True,
    resolve_entities=False,
)

STRIP_NAMESPACE_RE = re.compile(r"<project(.|\s)*?>", re.UNICODE)


class MavenPom(pom.Pom):
    def __init__(self, location):
        # NOTE: most of this is copied over from Pom.__init__
        try:
            with codecs.open(location, 'rb', encoding='UTF-8') as fh:
                xml = fh.read()
        except UnicodeDecodeError as _a:
            xml = analysis.unicode_text(location)
from lxml import etree

schemadoc = etree.parse("../xsd/top_artistsPT.xsd")
schema = etree.XMLSchema(schemadoc)
parser = etree.XMLParser(schema=schema)
tree = etree.parse("../top_artistsPT.xml")
schema.assertValid(tree)
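# Hedged variation (not from the snippet above): the same XMLSchema object can
# also report problems without raising, which is handy for logging. The file
# names are the ones used above.
from lxml import etree

schemadoc = etree.parse("../xsd/top_artistsPT.xsd")
schema = etree.XMLSchema(schemadoc)
tree = etree.parse("../top_artistsPT.xml")
if not schema.validate(tree):
    for error in schema.error_log:
        print(error.line, error.message)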
def to_metadata(browser, log, entry_, timeout): # {{{ from lxml import etree # total_results = XPath('//openSearch:totalResults') # start_index = XPath('//openSearch:startIndex') # items_per_page = XPath('//openSearch:itemsPerPage') entry = XPath('//atom:entry') entry_id = XPath('descendant::atom:id') url = XPath('descendant::atom:link[@rel="self"]/@href') creator = XPath('descendant::dc:creator') identifier = XPath('descendant::dc:identifier') title = XPath('descendant::dc:title') date = XPath('descendant::dc:date') publisher = XPath('descendant::dc:publisher') subject = XPath('descendant::dc:subject') description = XPath('descendant::dc:description') language = XPath('descendant::dc:language') # print(etree.tostring(entry_, pretty_print=True)) def get_text(extra, x): try: ans = x(extra) if ans: ans = ans[0].text if ans and ans.strip(): return ans.strip() except: log.exception('Programming error:') return None id_url = entry_id(entry_)[0].text google_id = id_url.split('/')[-1] details_url = url(entry_)[0] title_ = ': '.join([x.text for x in title(entry_)]).strip() authors = [x.text.strip() for x in creator(entry_) if x.text] if not authors: authors = [_('Unknown')] if not id_url or not title: # Silently discard this entry return None mi = Metadata(title_, authors) mi.identifiers = {'google': google_id} try: raw = get_details(browser, details_url, timeout) feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) extra = entry(feed)[0] except: log.exception('Failed to get additional details for', mi.title) return mi mi.comments = get_text(extra, description) lang = canonicalize_lang(get_text(extra, language)) if lang: mi.language = lang mi.publisher = get_text(extra, publisher) # ISBN isbns = [] for x in identifier(extra): t = type('')(x.text).strip() if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'): if t[:5].upper() == 'ISBN:': t = check_isbn(t[5:]) if t: isbns.append(t) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns # Tags try: btags = [x.text for x in subject(extra) if x.text] tags = [] for t in btags: atags = [y.strip() for y in t.split('/')] for tag in atags: if tag not in tags: tags.append(tag) except: log.exception('Failed to parse tags:') tags = [] if tags: mi.tags = [x.replace(',', ';') for x in tags] # pubdate pubdate = get_text(extra, date) if pubdate: from calibre.utils.date import parse_date, utcnow try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: log.error('Failed to parse pubdate %r' % pubdate) # Cover mi.has_google_cover = None for x in extra.xpath( '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]' ): mi.has_google_cover = x.get('href') break return mi
stk_path = os.path.join(os.path.realpath(os.path.dirname(__file__)), os.pardir, os.pardir)
sys.path.insert(0, stk_path)
from stk.supertree_toolkit import _check_uniqueness, _check_taxa, _check_data, get_all_characters, data_independence, add_weights
from stk.supertree_toolkit import get_fossil_taxa, get_publication_years, data_summary, get_character_numbers, get_analyses_used
from stk.supertree_toolkit import data_overlap, read_matrix, subs_file_from_str, clean_data, obtain_trees, get_all_source_names
from stk.supertree_toolkit import add_historical_event, _sort_data, _parse_xml, _check_sources, _swap_tree_in_XML, replace_genera
from stk.supertree_toolkit import get_all_taxa, _get_all_siblings, _parse_tree, get_characters_used, _trees_equal, get_weights
from stk.supertree_toolkit import get_outgroup, set_all_tree_names, create_tree_name, taxonomic_checker, load_taxonomy, load_equivalents
from stk.supertree_toolkit import create_taxonomy, create_taxonomy_from_tree, get_all_tree_names
from lxml import etree
from util import *
from stk.stk_exceptions import *
from collections import defaultdict
import tempfile

parser = etree.XMLParser(remove_blank_text=True)

import re


# Class to test all those loverly internal methods
# or stuff that doesn't fit within the other tests
class TestSTK(unittest.TestCase):

    def test_check_uniqueness(self):
        non_unique_names = etree.parse("data/input/non_unique_names.phyml")
        try:
            _check_uniqueness(etree.tostring(non_unique_names))
        except NotUniqueError:
            self.assert_(True)
            return
from . import pycompat
import odoo

# get_encodings, ustr and exception_to_unicode were originally from tools.misc.
# They are moved to loglevels until we refactor tools.
from odoo.loglevels import get_encodings, ustr, exception_to_unicode  # noqa

_logger = logging.getLogger(__name__)

# List of etree._Element subclasses that we choose to ignore when parsing XML.
# We include the *Base ones just in case, currently they seem to be subclasses of the _* ones.
SKIPPED_ELEMENT_TYPES = (etree._Comment, etree._ProcessingInstruction,
                         etree.CommentBase, etree.PIBase, etree._Entity)

# Configure default global parser
etree.set_default_parser(etree.XMLParser(resolve_entities=False))

#----------------------------------------------------------
# Subprocesses
#----------------------------------------------------------

def find_in_path(name):
    path = os.environ.get('PATH', os.defpath).split(os.pathsep)
    if config.get('bin_path') and config['bin_path'] != 'None':
        path.append(config['bin_path'])
    return which(name, path=os.pathsep.join(path))


def _exec_pipe(prog, args, env=None):
    cmd = (prog, ) + args
def parseXML(xml_in, params, state): """ parse the document XML """ # import pdb; pdb.set_trace() # if two fragments of text are within LINE_TOLERANCE of each other they're # on the same line text_margin_left = 57 text_margin_right = 289 indentation_bound_left = 71 indentation_bound_right = 305 NO_INTERJECTION = re.compile(r'^.{1,3}' + re.escape(params['closing_mark'])) debug = False # get the page elements parser = etree.XMLParser(recover=True) tree = etree.parse(xml_in, parser=parser) #tree = ET.ElementTree(file=xml_in) pages = tree.getroot() if pages.tag != "pages": sys.exit("ERROR: pages.tag is %s instead of pages!" % pages.tag) if (int(xml_in[12:14]) >= 15 and int(xml_in[14:17]) > 20) or (int(xml_in[12:14]) == 16): text_margin_right = 270 indentation_bound_right = 312 text = [] # step through the pages for page in pages: # gets page_id page_id = page.attrib['id'] # get all the textline elements textboxes = page.findall("./textbox") #print "found %s textlines" % len(textlines) # step through the textlines page_text = [] interjection = False # import pdb; pdb.set_trace() left = [round(float(textbox.attrib["bbox"].split(',')[0:1][0])) for textbox in textboxes] line_half = int((indentation_bound_right - indentation_bound_left)/2) identation = [e for e in Counter(left).keys() if (e > indentation_bound_left + 3 and e < indentation_bound_right - 3) or (e > indentation_bound_right + 3 and e < indentation_bound_right + line_half)] if state != 'BE': if any([e in range(params["identation_bound_left_1"], params["identation_bound_left_1"] + line_half) for e in identation]): identation_bounds = 'first' elif any([e in range(params["identation_bound_left_1"], params["identation_bound_left_1"] + line_half) for e in identation]): identation_bounds = 'second' else: identation_bounds = None if not identation: logging.warning('no x0 values within specified ranges' + page.attrib['id']) #import pdb; pdb.set_trace() for textbox in textboxes: # get the boundaries of the textline #import pdb; pdb.set_trace() textbox_bounds = [float(s) for s in textbox.attrib["bbox"].split(',')] #print "line_bounds: %s" % line_bounds # get all the texts in this textline lines = list(textbox) #print("found %s characters in this line." % len(chars)) # combine all the characters into a single string textbox_text = "" poi = False issue = False for line, has_more in lookahead(lines): chars = list(line) for char in chars: if poi: if char.attrib: if "Bold" not in char.attrib['font']: #import pdb; pdb.set_trace() textbox_text = textbox_text + '<poi_end>' poi = False elif char.attrib: if "Bold" in char.attrib['font']: #import pdb; pdb.set_trace() textbox_text = textbox_text + '<poi_begin>' poi = True try: textbox_text = textbox_text + char.text except TypeError: print('===============================> Attention! You have a TypeError here!', page_id) if not has_more and poi: textbox_text = textbox_text + '<poi_end>' #if re.compile(r'.+\n<poi_end>').match(textbox_text): #import pdb; pdb.set_trace() textbox_text = textbox_text.replace('\n<poi_end>', '<poi_end>\n') # if 'Beifall' in textbox_text: # import pdb; pdb.set_trace() # strip edge & multiple spaces textbox_text = re.sub(' +', ' ', textbox_text.strip()) # removes header/footer if textbox_bounds[1]>params['header_bound'] and page_id not in ['1']: #import pdb; pdb.set_trace() print('removed header ' + textbox_text) continue # if '(Alterspräsident Dr. 
Uwe Lehmann-Brauns)' in textbox_text: # import pdb; pdb.set_trace() # save a description of the line textbox = {'left': textbox_bounds[0], 'top': textbox_bounds[1], 'text': textbox_text} # if state != 'BE': # if identation_bounds=='first': # if textbox['left']>46 and textbox['left']<290 or textbox['left']>316: # if textbox_text.lstrip().startswith('(') and not NO_INTERJECTION.match(textbox_text): # textbox['text'] = '<interjection_begin>' + textbox['text'] + '<interjection_end>' # else: # textbox['text'] = '<identation_begin>' + textbox['text'] + '<identation_end>' # elif identation_bounds=='second': # if textbox['left']>75 and textbox['left']<320 or textbox['left']>344: # if textbox_text.lstrip().startswith('(') and not NO_INTERJECTION.match(textbox_text): # textbox['text'] = '<interjection_begin>' + textbox['text'] + '<interjection_end>' # else: # textbox['text'] = '<identation_begin>' + textbox['text'] + '<identation_end>' # else: # logging.info('no ordinary text boxes on page' + page_id) # else: # if textbox['left']>params['identation_bound_left_1'] + 3 and textbox['left']<params['identation_bound_right_1'] - 3 or textbox['left']>params['identation_bound_right_1'] + 3: # if textbox_text.lstrip().startswith(params['opening_mark']) and not NO_INTERJECTION.match(textbox_text): # textbox['text'] = '<interjection_begin>' + textbox['text'] + '<interjection_end>' # else: # textbox['text'] = '<identation_begin>' + textbox['text'] + '<identation_end>' if textbox['left'] > indentation_bound_left - 5 and textbox['left'] < text_margin_right - 5: textbox['text'] = '<interjection_begin>' + textbox['text'].replace('\n', '<interjection_end>\n<interjection_begin>') + '<interjection_end>' elif textbox['left'] > indentation_bound_right - 5: textbox['text'] = '<interjection_begin>' + textbox['text'].replace('\n', '<interjection_end>\n<interjection_begin>') + '<interjection_end>' if textbox['left'] < text_margin_right - 5: textbox['left'] = 30 else: textbox['left'] = 30 textbox['top'] = textbox['top']-1000 page_text.append(textbox) #print "page %s has %s lines" % (page.attrib["id"], len(lines)) # sort the lines by left, then top position # if debug: # import pdb; pdb.set_trace() page_text.sort(key=itemgetter('left')) page_text.sort(key=itemgetter('top'), reverse=True) # consolidate lines that have the same top (within tolerance) # consolidated_lines = [] # line_segments = [] # line_top = lines[0]['top'] # for line in lines: # if abs(line['top'] - line_top) < LINE_TOLERANCE: # line_segments.append(line) # else: # # assure that text segments appear in the correct order # line_segments.sort(key=itemgetter('left')) # # create a new line object combining partial texts, preserving the left-most text position # merged_line = dict(line_segments[0]) # merged_line['text'] = "" # for item in line_segments: # merged_line['text'] = merged_line['text'] + " " + item['text'] # consolidated_lines.append(merged_line) # # reset # line_segments = [line] # line_top = line['top'] #import pdb; pdb.set_trace() page_text = '\n\n'.join([e['text'] for e in page_text]) text.append(page_text + '\n\n') #import pdb; pdb.set_trace() return text
def process_xml(self, xml):
    '''
    Parse tool configuration data out of the Common Cartridge LTI link XML.
    '''
    root = objectify.fromstring(xml, parser=etree.XMLParser())
    # Parse all children of the root node
    for child in root.getchildren():
        if 'title' in child.tag:
            self.title = child.text
        if 'description' in child.tag:
            self.description = child.text
        if 'secure_launch_url' in child.tag:
            self.secure_launch_url = child.text
        elif 'launch_url' in child.tag:
            self.launch_url = child.text
        if 'icon' in child.tag:
            self.icon = child.text
        if 'secure_icon' in child.tag:
            self.secure_icon = child.text
        if 'cartridge_bundle' in child.tag:
            self.cartridge_bundle = child.attrib['identifierref']
        if 'cartridge_icon' in child.tag:
            self.cartridge_icon = child.attrib['identifierref']
        if 'vendor' in child.tag:
            # Parse vendor tag
            for v_child in child.getchildren():
                if 'code' in v_child.tag:
                    self.vendor_code = v_child.text
                if 'description' in v_child.tag:
                    self.vendor_description = v_child.text
                if 'name' in v_child.tag:
                    self.vendor_name = v_child.text
                if 'url' in v_child.tag:
                    self.vendor_url = v_child.text
                if 'contact' in v_child.tag:
                    # Parse contact tag for email and name
                    for c_child in v_child:
                        if 'name' in c_child.tag:
                            self.vendor_contact_name = c_child.text
                        if 'email' in c_child.tag:
                            self.vendor_contact_email = c_child.text
        if 'custom' in child.tag:
            # Parse custom tags
            for custom_child in child.getchildren():
                self.custom_params[custom_child.attrib['name']] = \
                    custom_child.text
        if 'extensions' in child.tag:
            platform = child.attrib['platform']
            properties = {}
            # Parse extension tags
            for ext_child in child.getchildren():
                if 'property' in ext_child.tag:
                    properties[ext_child.attrib['name']] = ext_child.text
                elif 'options' in ext_child.tag:
                    opt_name = ext_child.attrib['name']
                    options = {}
                    for option_child in ext_child.getchildren():
                        options[option_child.attrib['name']] = \
                            option_child.text
                    properties[opt_name] = options
            self.set_ext_params(platform, properties)
from lxml import etree as ElementTree
import htmlentitydefs
import csv
import operator
import re
# import gzip

generateLog = True

parser = ElementTree.XMLParser(attribute_defaults=True, load_dtd=True)

# Papers must be at least 4 pages long to count.
pageCountThreshold = 4
# Match ordinary page numbers (as in 10-17).
pageCounterNormal = re.compile('(\d+)-(\d+)')
# Match page number in the form volume:page (as in 12:140-12:150).
pageCounterColon = re.compile('[0-9]+:([1-9][0-9]*)-[0-9]+:([1-9][0-9]*)')


def pagecount(input):
    pageCounterMatcher1 = pageCounterNormal.match(input)
    pageCounterMatcher2 = pageCounterColon.match(input)
    start = 0
    end = 0
    count = 0
    if (not (pageCounterMatcher1 is None)):
        start = int(pageCounterMatcher1.group(1))
        end = int(pageCounterMatcher1.group(2))
        count = end - start + 1
infile = open("../.repopath.pickle", 'rb') new_dict = pickle.load(infile) infile.close() repopath = new_dict.get("repository") print("Repository path saved is {}".format(repopath)) else: repopath = input("Enter the path to the repository: ") variable = {"repository":repopath} f = open(".repopath.pickle",'wb') pickle.dump(variable,f) f.close() pluginName = input("Please enter the plugin name: ") # profileXml = '/home/sthummala/workspace/repo/centina/sa/profiles/' + pluginName + '.xml' profileXml = repopath+'/centina/sa/profiles/' + pluginName + '.xml' parser = etree.XMLParser(strip_cdata=False) root = etree.parse(profileXml, parser) meta = root.find("meta") protocol = meta.find("protocol").get("name") dependencies = root.find("dependencies") file = dependencies.findall("file") for a in file: if a.get("path").startswith("pm/templates"): if a.get("path").endswith(".dtd"): continue pmtemplate = repopath+"/centina/sa/profiles/" + a.get("path") print("Found pm template ", pmtemplate) # try: # template = getParser(pmtemplate,"template") parsertemplate = etree.XMLParser(strip_cdata=False)
def main():
    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
    parser = argparse.ArgumentParser(
        "VCF2cytosure - convert SV vcf files to cytosure")
    group = parser.add_argument_group('Filtering')
    group.add_argument('--size', default=1000, type=int,
                       help='Minimum variant size. Default: %(default)s')
    group.add_argument('--frequency', default=0.01, type=float,
                       help='Maximum frequency. Default: %(default)s')
    group.add_argument(
        '--frequency_tag', default='FRQ', type=str,
        help='Frequency tag of the info field. Default: %(default)s')
    group.add_argument('--no-filter', dest='do_filtering', action='store_false',
                       default=True, help='Disable any filtering')
    group = parser.add_argument_group('Input')
    group.add_argument(
        '--genome', required=False, default=37,
        help='Human genome version. Use 37 for GRCh37/hg19, 38 for GRCh38 template.')
    group.add_argument('--sex', required=False, default='female',
                       help='Sample sex male/female. Default: %(default)s')
    group.add_argument('--vcf', required=True, help='VCF file')
    group.add_argument(
        '--bins', type=int, default=20,
        help='the number of coverage bins per probes default=20')
    group.add_argument('--coverage', help='Coverage file')
    group.add_argument(
        '--cn', type=str,
        help='add probes using cnvkit cn file (cannot be used together with --coverage)')
    group.add_argument(
        '--snv', type=str,
        help='snv vcf file, use coverage annotation to position the height of the probes '
             '(cannot be used together with --coverage)')
    group.add_argument(
        '--dp', type=str, default="DP",
        help='read depth tag of snv vcf file. This option is only used if you use snv to set '
             'the height of the probes. The dp tag is a tag which is used to retrieve the '
             'depth of coverage across the snv (default=DP)')
    group.add_argument(
        '--maxbnd', type=int, default=10000,
        help='Maximum BND size, BND events exceeding this size are discarded')
    group.add_argument(
        '--out', help='output file (default = the prefix of the input vcf)')
    group.add_argument(
        '--blacklist',
        help='Blacklist bed format file to exclude completely contained variants.')
    group.add_argument('-V', '--version', action='version',
                       version="%(prog)s " + __version__,
                       help='Print program version and exit.')
    # parser.add_argument('xml', help='CytoSure design file')

    args = parser.parse_args()
    logger.info('vcf2cytosure %s', __version__)

    if (args.coverage and args.cn) or (args.coverage and args.snv) or (args.snv and args.cn):
        print("Choose one of --coverage, --snv and --cn. They cannot be combined.")
        quit()

    if int(args.genome) == 38:
        CGH_TEMPLATE = CGH_TEMPLATE_38
        CONTIG_LENGTHS = CONTIG_LENGTHS_38
        N_INTERVALS = N_INTERVALS_38
    else:
        CGH_TEMPLATE = CGH_TEMPLATE_37
        CONTIG_LENGTHS = CONTIG_LENGTHS_37
        N_INTERVALS = N_INTERVALS_37

    if not args.out:
        args.out = ".".join(args.vcf.split(".")[0:len(args.vcf.split(".")) - 1]) + ".cgh"

    parser = etree.XMLParser(remove_blank_text=True)
    sex_male = "false"
    promega_sex = 'Female'
    if args.sex == "male":
        sex_male = 'true'
        promega_sex = 'Male'
    vcf = VCF(args.vcf)
    sample_id = retrieve_sample_id(vcf, args.vcf)
    tree = etree.parse(
        StringIO(
            CGH_TEMPLATE.format(sample_id, sample_id, sample_id, sample_id,
                                sex_male, promega_sex, sex_male)), parser)
    segmentation = tree.xpath('/data/cgh/segmentation')[0]
    probes = tree.xpath('/data/cgh/probes')[0]
    submission = tree.xpath('/data/cgh/submission')[0]

    if args.blacklist:
        blacklist = [
            r for r in read_blacklist(args.blacklist)
            if r.chrom in CONTIG_LENGTHS
        ]

    chr_intervals = defaultdict(list)
    if args.do_filtering:
        vcf = variant_filter(vcf,
                             min_size=args.size,
                             max_frequency=args.frequency,
                             frequency_tag=args.frequency_tag)
    n = 0
    for event in events(vcf, CONTIG_LENGTHS):
        height = ABERRATION_HEIGHTS[event.type]
        end = event.end
        make_segment(segmentation, event.chrom, event.start, end, height)

        comment = format_comment(event.info)
        if "rankScore" in event.info:
            rank_score = int(event.info['RankScore'].partition(':')[2])
        else:
            rank_score = 0
        #occ=0
        #if args.frequency_tag in event.info:
        #    occ=event.info[args.frequency_tag]
        occ = 0
        if "OCC" in event.info:
            occ = event.info["OCC"]

        if event.type in ("INV", 'INS', 'BND', "TRA") and not event.end:
            continue
            #pass
        elif event.type in ("INV", 'INS', 'BND', "TRA") and (
                abs(event.start - event.end) > args.maxbnd):
            #pass
            continue
        elif args.blacklist:
            if contained_by_blacklist(event, blacklist):
                continue

        make_aberration(submission, event.chrom, event.start, end,
                        confirmation=event.type, comment=comment,
                        n_probes=occ, copy_number=rank_score)
        chr_intervals[event.chrom].append((event.start, event.end))
        # show probes at slightly different height than segments
        for pos in spaced_probes(event.start, event.end - 1):
            make_probe(probes, event.chrom, pos, pos + 60, height, event.type)
        n += 1

    if args.coverage or args.snv or args.cn:
        add_coverage_probes(probes, args.coverage, args, CONTIG_LENGTHS, N_INTERVALS)
    else:
        add_probes_between_events(probes, chr_intervals, CONTIG_LENGTHS)

    tree.write(args.out, pretty_print=True)
    logger.info('Wrote %d variants to CGH', n)
def process(type, db, config): location = '%s/%s' % (config.ACT_DIR, type) count = 0 with db.cursor() as cur: parser = etree.XMLParser(resolve_entities=False, huge_tree=True) print location for dirpath, dirs, files in os.walk(location): files = [f for f in files if f.endswith('.xml')] if len(files): path = os.path.join(dirpath.replace(config.ACT_DIR + '/', ''), files[0]) try: print path tree = etree.parse(os.path.join(dirpath, files[0]), parser) objectify.deannotate(tree, cleanup_namespaces=True) for elem in tree.iter(): if not hasattr(elem.tag, 'find'): continue i = elem.tag.find('}') if i >= 0: elem.tag = elem.tag[i + 1:] attrib = tree.getroot().attrib if attrib.get('id'): title = etree.tostring( tree.xpath('.//billref|.//title')[0], method="text", encoding="UTF-8") #TODO title = title.replace('\n', '').strip() query = """INSERT INTO instruments (id, govt_id, version, title, path, number, date_as_at,date_assent, type, date_first_valid, date_gazetted, date_terminated, date_imprint, year, repealed, in_amend, pco_suffix, raised_by, official, subtype, terminated, stage, date_signed, imperial, instructing_office, attributes) VALUES (%(id)s, %(govt_id)s, %(version)s, %(title)s, %(path)s, %(number)s, %(date_as_at)s,%(date_assent)s, %(type)s, %(date_first_valid)s, %(date_gazetted)s, %(date_terminated)s, %(date_imprint)s, %(year)s, %(repealed)s, %(in_amend)s, %(pco_suffix)s, %(raised_by)s, %(official)s, %(subtype)s, %(terminated)s, %(stage)s, %(date_signed)s, %(imperial)s, %(instructing_office)s, %(attr)s); """ with open(os.path.join(dirpath, files[0])) as r: cur.execute( """ INSERT INTO documents (document, type) VALUES (%(document)s, 'xml') returning id""", {'document': r.read()}) document_id = cur.fetchone()[0] values = { 'id': document_id, 'govt_id': attrib.get('id'), 'title': title, 'version': int(float(dirpath.split('/')[-1])), 'path': path, 'number': attrib.get( 'sr.no', attrib.get( 'sop.no', attrib.get('act.no', attrib.get('bill.no')))), 'date_first_valid': safe_date(attrib.get('date.first.valid')), 'date_gazetted': safe_date(attrib.get('date.date_gazetted')), 'date_terminated': safe_date(attrib.get('date.terminated')), 'date_imprint': safe_date(attrib.get('date.imprint')), 'date_as_at': safe_date(attrib.get('date.as.at')), 'date_assent': safe_date(attrib.get('date.assent')), 'year': int(attrib.get('year')), 'repealed': attrib.get('terminated') == "repealed", 'in_amend': attrib.get('in.amend') != 'false', 'pco_suffix': attrib.get('pco.suffix'), 'raised_by': attrib.get('raised.by'), 'official': attrib.get('official'), 'type': type, 'subtype': attrib.get( 'act.type', attrib.get('sr.type', attrib.get('bill.type'))), 'terminated': attrib.get('terminated'), 'stage': attrib.get('stage'), 'date_signed': safe_date(attrib.get('date.signed')), 'imperial': attrib.get('imperial') == 'yes', 'instructing_office': attrib.get('instructing_office'), 'attr': json.dumps(dict(attrib)) } cur.execute(query, values) except etree.XMLSyntaxError, e: print 'ERROR', e, path
def train_data(datapath): # define sign and stop words sign = [ "!", ",", ".", ":", ";", "'", "#", "$", "%", "&", "(", ")", "*", "[", "]", "?", "@", "_", "/", "{", "|", "}", "~", "--" ] f_stopword = open("/home/iialab/TREC2018/stopwords.txt") stopwords = f_stopword.readlines() stop_sign = [] for stopword in stopwords: stopword = stopword.strip("\n") stop_sign.append(stopword) # 解析xml文件 files = os.listdir(datapath) keys = ["clean_background", "clean_title", "doc_id"] clean_docs = [] for file in files: # print(file) Doc_id = file.strip(".xml") filepath = os.path.join(datapath, file) # print(filepath) parser = etree.XMLParser(recover=True) text = open(filepath, encoding="utf-8").read() root = ET.fromstring(text, parser=parser) # print(root.tag) clean_title = [] clean_background = [] for child in root: if child.tag == "Doc_title": title = child.text for c in sign: title = title.replace(c, "").lower().strip("\n") sentence = nltk.word_tokenize(title, language='english') # print(sentence) for item in sentence: if item in stop_sign: continue # print(item) else: clean_title.append( nltk.stem.SnowballStemmer('english').stem(item)) print(clean_title) elif child.tag == "Background": abstract = child.text for c in sign: abstract = abstract.replace(c, "").lower().strip("\n") sentence = nltk.word_tokenize(abstract, language='english') # print(sentence) for item in sentence: if item in stop_sign: continue # print(item) else: clean_background.append( nltk.stem.SnowballStemmer('english').stem(item)) print(clean_background) dictionary = dict(zip(keys, [clean_background, clean_title, Doc_id])) print(dictionary) clean_docs.append(dictionary) new_dic = dict(zip(["numFound", "docs"], [265, clean_docs])) print(new_dic) with open( "/home/iialab/TREC2018/training_data_2017/train_proceedings.json", "w") as f: json.dump(new_dic, f) print("finish!")
def button_reformat_callback(): """ what to do when the "Reformat" button is pressed """ xmlfile = entry.get() if xmlfile.rsplit(".")[-1] != "xml": statusText.set("Filename must have a .xml extension!") message.configure(fg="red") return IOH_xmlfile = get_IOH_filename(xmlfile) copyfile(xmlfile, IOH_xmlfile) """ make it pretty """ parser = etree.XMLParser(resolve_entities=False, strip_cdata=False) document = etree.parse(IOH_xmlfile, parser) document.write(IOH_xmlfile, pretty_print=True, encoding='utf-8') """ identify all the speaker tags """ q = etree.parse(IOH_xmlfile) speaker_tags = q.findall('.//speaker') speakers = dict() num = 1 for tag in speaker_tags: if tag.text: full = tag.text.strip() if ' ' not in full: first = full else: first, rest = full.split(' ', 1) first = first.strip() if first not in speakers: speakers[first] = { 'number': num, 'class': "<span class='oh_speaker_" + str(num) + "'>", 'full_name': full } num += 1 """ examine each cue, identify THE speaker and modify the cue accordingly """ cue_tags = q.findall('.//cue') speakers_found = [] for tag in cue_tags: s = tag.find('speaker') if ' ' not in s.text.strip(): first = s.text.strip() else: first, rest = s.text.strip().split(' ', 1) first = first.strip() if first not in speakers_found: speakers_found.append(first) t = tag.find('transcript') if t.text is None: statusText.set("Transcript has no text at source line " + str(t.sourceline) + "!") message.configure(fg="red") return text = t.text.replace('\n', ' ').replace(' ', ' ').replace( ' :', ':').replace(' |', '|') t.text = '' try: t.text += speakers[first][ 'class'] + first + ": " + "<span class='oh_speaker_text'>" + text + '</span></span>' except KeyError: statusText.set("Transcript 'KeyError' at source line " + str(t.sourceline) + "! Please investigate.") message.configure(fg="red") return q.write(IOH_xmlfile) entry.delete(0, END) entry.insert(0, IOH_xmlfile) statusText.set( "Speaker reformatting for transcript `{}' is complete.".format( IOH_xmlfile)) message.configure(fg="dark green")
def xml(self):
    if '_xml' not in self.__dict__:
        self._xml = etree.fromstring(self.bill_xml.encode('utf-8'),
                                     etree.XMLParser(recover=True))
    return self._xml
def get_tile_prototypes():
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse('tiles.svg', parser)
    root = tree.getroot()
    tile_defs = root.find('{http://www.w3.org/2000/svg}defs')
    return tile_defs
def read_message_name_in_message_file(xml_file: str) -> str:
    xml = BytesIO(xml_file.encode())
    tree = etree.parse(xml, etree.XMLParser())
    node = tree.find('//ns:GrpHdr/ns:MsgId', namespaces=XML_NAMESPACE)
    return node.text
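# Hedged usage sketch (not from the snippet above): XML_NAMESPACE is assumed to
# map the 'ns' prefix to the document's default namespace, and the sample below
# is a made-up ISO 20022-style fragment.
from io import BytesIO
from lxml import etree

XML_NAMESPACE = {'ns': 'urn:iso:std:iso:20022:tech:xsd:pain.001.001.03'}

sample = (
    '<Document xmlns="urn:iso:std:iso:20022:tech:xsd:pain.001.001.03">'
    '<CstmrCdtTrfInitn><GrpHdr><MsgId>MSG-0001</MsgId></GrpHdr></CstmrCdtTrfInitn>'
    '</Document>'
)
print(read_message_name_in_message_file(sample))  # expected: MSG-0001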
# -*- coding: utf-8 -*-
from lxml import etree as ET
from openpyxl import load_workbook
from operator import itemgetter
import os
import copy
import requests
import datetime

# lxml parser for parsing XML files from strings
parser = ET.XMLParser(remove_blank_text=True)

if os.name == "nt":
    # Windows Directory Names
    # Finding Aid Directory
    faDir = "g:\WebArch"
    # Collection and Subject spreadsheets directory
    spreadDir = "g:\WebArch"

# parse Collection List spreadsheet
collectionListFile = os.path.join(spreadDir, "collectionList.xlsx")
collectionWorkbook = load_workbook(filename=collectionListFile)
collectionList = collectionWorkbook.active

# Parse List of Collections to list of lists
rowIndex = 0
collections = []
for row in collectionList.rows:
    rowIndex = rowIndex + 1
    if rowIndex > 1:
def fetch_information(HTML, requrl):
    global evtnamePattern
    global evtdescPattern
    global starttimePattern
    global startdatePattern
    global endtimePattern
    global enddatePattern
    global timePattern
    global locationPattern
    global dateAndTimePattern
    global evtsource
    global datePattern
    global picurlPattern
    global tagsPattern
    global additionalTags
    global specificLocation
    global evtsourceCommunityDict
    global evtsourceYearDict

    currentTime = datetime.datetime.now()
    currentDate = currentTime.strftime('%Y-%m-%d')
    currentDate = datetime.datetime.strptime(currentDate, '%Y-%m-%d')
    formerDate = currentDate + datetime.timedelta(days=-1)

    parser = etree.XMLParser(recover=True)
    tree = etree.fromstring(HTML, parser)

    evtnameList = []
    evtdescList = []
    starttimeList = []
    startdateList = []
    endtimeList = []
    enddateList = []
    timeList = []
    dateAndTimeList = []
    locationList = []
    dateList = []
    picurlList = []
    tagsList = []

    # raw_input(requrl)
    # print HTML
    # raw_input(123)
    eventCount = len(tree.xpath(evtnamePattern))
    i = 0
    while i < eventCount:
        evtnameList.append("")
        evtdescList.append("")
        starttimeList.append("")
        startdateList.append("")
        endtimeList.append("")
        enddateList.append("")
        timeList.append("")
        dateAndTimeList.append("")
        locationList.append("")
        dateList.append("")
        picurlList.append("")
        tagsList.append([])
        i += 1

    evtnameLxmlItemList = tree.xpath(evtnamePattern)
    evtnameList = []
    for evtnameLxmlItem in evtnameLxmlItemList:
        evtnameList.append(get_text(evtnameLxmlItem))

    evtdescLxmlItemList = tree.xpath(evtdescPattern)
    evtdescList = []
    for evtdescLxmlItem in evtdescLxmlItemList:
        evtdescList.append(get_text(evtdescLxmlItem))

    if locationPattern != "":
        locationLxmlItemList = tree.xpath(locationPattern)
        locationList = []
        for locationLxmlItem in locationLxmlItemList:
            locationList.append(get_text(locationLxmlItem))
    if specificLocation != "":
        locationList = []
        i = 0
        while i < eventCount:
            locationList.append(specificLocation)
            i += 1

    if picurlPattern != "":
        picurlLxmlItemList = tree.xpath(picurlPattern)
        picurlList = []
        for picurlLxmlItem in picurlLxmlItemList:
            picurl = get_picurl(picurlLxmlItem)
            if picurl != "" and picurl[0] == "/" and picurl[1] != "/":
                picurl = evtsource + picurl
            elif picurl != "" and picurl[0] == "/" and picurl[1] == "/":
                picurl = picurl[2:]
            picurlList.append(picurl)

    if tagsPattern != "":
        tagsLxmlItemList = tree.xpath(tagsPattern)
        tagsList = []
        for tagLxmlItem in tagsLxmlItemList:
            tags = get_text(tagLxmlItem)
            tags = analyze_tags(tags)
            tagsList.append(tags)

    if dateAndTimePattern != "":
        dateAndTimeLxmlItemList = tree.xpath(dateAndTimePattern)
        dateAndTimeList = []
        for dateAndTimeLxmlItem in dateAndTimeLxmlItemList:
            dateAndTime = get_text(dateAndTimeLxmlItem)
            dateAndTimeList.append(dateAndTime)

    if datePattern != "":
        dateLxmlItemList = tree.xpath(datePattern)
        dateList = []
        for dateLxmlItem in dateLxmlItemList:
            date = get_text(dateLxmlItem)
            dateList.append(date)

    if timePattern != "":
        timeLxmlItemList = tree.xpath(timePattern)
        for timeLxmlItem in timeLxmlItemList:
            time = get_text(timeLxmlItem)
            timeList.append(time)

    if starttimePattern != "":
        starttimeLxmlItemList = tree.xpath(starttimePattern)
        for starttimeLxmlItem in starttimeLxmlItemList:
            starttime = get_text(starttimeLxmlItem)
            starttimeList.append(starttime)

    if endtimePattern != "":
        endtimeLxmlItemList = tree.xpath(endtimePattern)
        for endtimeLxmlItem in endtimeLxmlItemList:
            endtime = get_text(endtimeLxmlItem)
            endtimeList.append(endtime)

    if startdatePattern != "":
        startdateLxmlItemList = tree.xpath(startdatePattern)
        for startdateLxmlItem in startdateLxmlItemList:
            startdate = get_text(startdateLxmlItem)
            startdateList.append(startdate)

    if enddatePattern != "":
        enddateLxmlItemList = tree.xpath(enddatePattern)
        for enddateLxmlItem in enddateLxmlItemList:
            enddate = get_text(enddateLxmlItem)
            enddateList.append(enddate)

    url = requrl

    # decode as unicode and analyze text
    i = 0
    while i < eventCount:
        evtname = evtnameList[i]
        evtdesc = evtdescList[i]
        location = locationList[i]
        dateAndTime = dateAndTimeList[i]
        date = dateList[i]
        time = timeList[i]
        starttime = starttimeList[i]
        endtime = endtimeList[i]
        startdate = startdateList[i]
        enddate = enddateList[i]
        tags = tagsList[i]
        picurl = picurlList[i]

        evtname = analyze_text(unidecode.unidecode(evtname))
        evtdesc = analyze_text(unidecode.unidecode(evtdesc))
        location = analyze_text(location)
        dateAndTime = analyze_text(dateAndTime)
        date = analyze_text(date)
        time = analyze_text(time)
        starttime = analyze_text(starttime)
        endtime = analyze_text(endtime)
        starttime, endtime = analyze_time(dateAndTime, date, time, starttime,
                                          endtime, startdate, enddate)

        if evtname == "":
            print "Can't crawl evtname information: ",
            print requrl
            i += 1
            continue
        if starttime == "":
            print "Can't crawl time information: ",
            print requrl
            i += 1
            continue
        if location == "":
            print "Can't crawl location information: ",
            print requrl
            i += 1
            continue

        community = evtsourceCommunityDict[evtsource]
        year = evtsourceYearDict[evtsource]
        fetch_data(url, evtname, evtdesc, starttime, endtime, location,
                   community, evtsource, formerDate, tags, additionalTags,
                   picurl, year)
        i += 1
Note that the data we're dealing with seems to suffer from this tesseract bug:
https://groups.google.com/forum/#!topic/tesseract-ocr/UiyIMUWMzsU
so we're assuming it's actually latin-1 encoded.
"""

import unicodedata
from lxml import etree
from lxml.etree import tostring
from StringIO import StringIO

from hocr_parser.document_parser import document_parser
from hocr_parser.parse_utils import get_words_from_page, get_words_with_lines_from_page, get_annotated_bbox

flexible_parser = etree.XMLParser(encoding='utf-8', recover=True)

file_name = "58-1723645_990_201204"
file_path = "hocr_parser/test_hocr/" + file_name + ".html"

parser = document_parser(file_path, encoding='latin-1')

page_num = 0
while True:
    this_page = parser.read_page()
    if not this_page:
        break
    page_num += 1
    print "Processing page %s" % page_num

    outfile = "../display/hocr_pages/" + file_name + "p" + str(page_num) + ".html"
def identify( # {{{ self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): from lxml import etree entry = XPath('//atom:entry') query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if not query: log.error('Insufficient metadata to construct query') return br = self.browser log('Making query:', query) try: raw = br.open_novisit(query, timeout=timeout).read() except Exception as e: log.exception('Failed to make identify query: %r' % query) return as_unicode(e) try: feed = etree.fromstring( xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) entries = entry(feed) except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) if not entries and title and not abort.is_set(): if identifiers: log('No results found, retrying without identifiers') return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) ntitle = cleanup_title(title) if ntitle and ntitle != title: log('No results found, retrying without sub-title') return self.identify(log, result_queue, abort, title=ntitle, authors=authors, timeout=timeout) # There is no point running these queries in threads as google # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout)
def __init__(self, dataset, root=None, force_reparse=False): """ Given a IATI dataset, prepare an IATI parser """ if settings.IATI_PARSER_DISABLED: raise ParserDisabledError( "The parser is disabled on this instance of OIPA") self.dataset = dataset self.url = dataset.source_url self.force_reparse = force_reparse self.hash_changed = True self.valid_dataset = True if root is not None: self.root = root self.parser = self._prepare_parser(self.root, dataset) return response = None headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X ' '10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' } # NOQA: E501 # This was rolled back to how it was before adding dataset failed # pickups in commit e5d0ba6454b180d8b4e27820184007fdacb6c3b4 try: response = requests.get(self.url, headers=headers, timeout=30) response.raise_for_status() except requests.exceptions.SSLError: try: response = requests.get(self.url, verify=False, timeout=30) except requests.exceptions.SSLError: pass except requests.exceptions.Timeout: try: response = requests.get(self.url, verify=False, timeout=30) except requests.exceptions.Timeout: pass except requests.exceptions.RequestException: pass # We do not add a generic exception, because that would mean that # an internal datastore error would show up in the API. finally: pass if not response or response.status_code != 200: self.valid_dataset = False note = DatasetNote(dataset=self.dataset, iati_identifier="n/a", model="n/a", field="n/a", message="Cannot access the URL", exception_type='UrlError', line_number=None) note.save() self.dataset.note_count = 1 # If not a XML file them sha1 should blank self.dataset.sha1 = '' self.dataset.save() return # 1. Turn bytestring into string (treat it using specified encoding): try: iati_file = smart_text(response.content, 'utf-8') # XXX: some files contain non utf-8 characters: # FIXME: this is hardcoded: except UnicodeDecodeError: iati_file = smart_text(response.content, 'latin-1') # 2. Encode the string to use for hashing: hasher = hashlib.sha1() hasher.update(iati_file.encode('utf-8')) sha1 = hasher.hexdigest() if dataset.sha1 == sha1: # dataset did not change, no need to reparse normally self.hash_changed = False else: dataset.sha1 = sha1 # Save a sha1 in the first time of the process parse dataset.save() try: parser = etree.XMLParser(huge_tree=True) tree = etree.parse(BytesIO(response.content), parser) self.root = tree.getroot() self.parser = self._prepare_parser(self.root, dataset) if settings.ERROR_LOGS_ENABLED: self.xsd_validate() # TODO: when moving error messages to frontend, create a separate error # for wrong file type: except etree.XMLSyntaxError as e: self.valid_dataset = False DatasetNote.objects.filter(dataset=self.dataset).delete() note = DatasetNote( dataset=self.dataset, iati_identifier="n/a", model="n/a", field="n/a", message="This file contains XML syntax errors or it's not an " "XML file", exception_type='XMLSyntaxError', line_number=None) note.save() self.dataset.note_count = 1 # If not the XML should not have a sha1 self.dataset.sha1 = '' self.dataset.save() return
class ParseRssNews(object):

    def __init__(self):
        print('---- Principio del archivo')

    def start(self, tag, attrib):
        if tag == "item":
            self.numeroElementos += 1
        if tag == "enclosure":
            if attrib["type"] == "image/jpeg":
                self.numeroImagenes += 1
                print attrib["url"]
                #urllib.urlretrieve(attrib["url"], "imagenesDescargadas/" + str(self.numeroImagenes) + ".jpg")

    def data(self, data):
        if len(sys.argv) > 1:
            encuentra = re.compile('\s' + self.termino + '\s')
            if encuentra.search(data) != None:
                self.encontrado = True

    def close(self):
        print('---- Fin del archivo')
        print "Numero de elementos: " + str(self.numeroElementos)
        print "Numero de imagenes: " + str(self.numeroImagenes)
        if len(sys.argv) > 1:
            if self.encontrado:
                print 'El termino ' + self.termino + ' esta.'
            else:
                print 'El termino ' + self.termino + ' no esta.'


parser = etree.XMLParser(target=ParseRssNews())
etree.parse('portada.xml', parser)
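# Minimal, self-contained sketch of the same target-parser technique used
# above (element names and counts are hypothetical, not from the original).
from lxml import etree

class CountItems(object):
    def __init__(self):
        self.count = 0
    def start(self, tag, attrib):
        if tag == "item":
            self.count += 1
    def end(self, tag):
        pass
    def data(self, data):
        pass
    def close(self):
        # With a target parser, parse()/fromstring() return close()'s result.
        return self.count

counting_parser = etree.XMLParser(target=CountItems())
items = etree.fromstring("<rss><item/><item/></rss>", counting_parser)
print(items)  # expected: 2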
def read_xml_file(path, log=None): """ Read an xml file and return the root node as for lxml.etree """ def remove_utf8_from_xml(fileContent): """ Removes the header from the file content. <?xml version="1.0" encoding="UTF-8"?> """ indexStart = fileContent.find('<?xml') if indexStart < 0: return fileContent indexStart = fileContent.find('<', indexStart + 2) if indexStart < 0: return fileContent return fileContent[indexStart:] def remove_xmlns_from_xml(fileContent): """ Removes the "xmlns=" part from file content because lxml api supports this part only by specifying exactly its value whenever we want to access a part of xml content, and its value can change between xml files. <web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://java.sun.com/xml/ns/javaee" xmlns:web="http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd" xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd" id="WebApp_ID" version="2.5"> </web-app> """ if not 'xmlns=' in fileContent: return fileContent indexStart = fileContent.find('xmlns=') indexValueStart = fileContent.find('"', indexStart) if indexValueStart < 0: return fileContent indexValueEnd = fileContent.find('"', indexValueStart + 1) if indexValueEnd < 0: return fileContent return fileContent.replace(fileContent[indexStart:indexValueEnd + 1], '') ''' def get_root(text): class LineNumberingParser(ET.XMLParser): def _start(self, *args, **kwargs): # Here we assume the default XML parser which is expat # and copy its element position attributes into output Elements element = super(self.__class__, self)._start(*args, **kwargs) element.start_line_number = self.parser.CurrentLineNumber element.start_column_number = self.parser.CurrentColumnNumber element._start_byte_index = self.parser.CurrentByteIndex return element def _end(self, *args, **kwargs): element = super(self.__class__, self)._end(*args, **kwargs) element.end_line_number = self.parser.CurrentLineNumber element.end_column_number = self.parser.CurrentColumnNumber element._end_byte_index = self.parser.CurrentByteIndex return element return ET.fromstring(text, LineNumberingParser()) ''' with open_source_file(path) as f: file_content = f.read() file_content = remove_utf8_from_xml(file_content) file_content = remove_xmlns_from_xml(file_content) parser = ET.XMLParser(recover=True) return ET.fromstring(file_content, parser)
def update_ikfast_package(args):
    # Copy the source code generated by IKFast into our src folder
    src_path = args.ikfast_plugin_pkg_path + '/src/'
    solver_file_path = src_path + args.robot_name + '_' + args.planning_group_name + '_ikfast_solver.cpp'
    if not os.path.exists(solver_file_path) or not os.path.samefile(
            args.ikfast_output_path, solver_file_path):
        shutil.copy2(args.ikfast_output_path, solver_file_path)
    if not os.path.exists(solver_file_path):
        raise Exception(
            "Failed to copy IKFast source code from '%s' to '%s'\n"
            "Manually copy the source file generated by IKFast to this location and re-run"
            % (args.ikfast_output_path, solver_file_path))

    # Remember ikfast solver file for update of MoveIt package
    args.ikfast_output_path = solver_file_path

    # Get template folder location
    template_dir = find_template_dir()

    # namespace for the plugin
    setattr(args, 'namespace', args.robot_name + "_" + args.planning_group_name)
    replacements = dict(_ROBOT_NAME_=args.robot_name,
                        _GROUP_NAME_=args.planning_group_name,
                        _SEARCH_MODE_=args.search_mode,
                        _EEF_LINK_=args.eef_link_name,
                        _BASE_LINK_=args.base_link_name,
                        _PACKAGE_NAME_=args.ikfast_plugin_pkg,
                        _NAMESPACE_=args.namespace)

    # Copy ikfast header file
    copy_file(template_dir + '/ikfast.h',
              args.ikfast_plugin_pkg_path + "/include/ikfast.h",
              "ikfast header file")
    # Create ikfast plugin template
    copy_file(
        template_dir + '/ikfast' + str(args.template_version) + '_moveit_plugin_template.cpp',
        args.ikfast_plugin_pkg_path + "/src/" + args.robot_name + '_' +
        args.planning_group_name + "_ikfast_moveit_plugin.cpp",
        "ikfast plugin file", replacements)

    # Create plugin definition .xml file
    ik_library_name = args.namespace + "_moveit_ikfast_plugin"
    plugin_def = etree.Element("library", path="lib/lib" + ik_library_name)
    setattr(args, 'plugin_name', args.namespace + '/IKFastKinematicsPlugin')
    cl = etree.SubElement(plugin_def, "class",
                          name=args.plugin_name,
                          type=args.namespace + "::IKFastKinematicsPlugin",
                          base_class_type="kinematics::KinematicsBase")
    desc = etree.SubElement(cl, "description")
    desc.text = 'IKFast{template} plugin for closed-form kinematics of {robot} {group}' \
        .format(template=args.template_version, robot=args.robot_name, group=args.planning_group_name)

    # Write plugin definition to file
    plugin_file_name = ik_library_name + "_description.xml"
    plugin_file_path = args.ikfast_plugin_pkg_path + "/" + plugin_file_name
    with open(plugin_file_path, 'w') as f:
        etree.ElementTree(plugin_def).write(f, xml_declaration=True,
                                            pretty_print=True, encoding="UTF-8")
    print("Created plugin definition at '%s'" % plugin_file_path)

    # Create CMakeLists file
    replacements.update(dict(_LIBRARY_NAME_=ik_library_name))
    copy_file(template_dir + "/CMakeLists.txt",
              args.ikfast_plugin_pkg_path + '/CMakeLists.txt',
              "cmake file", replacements)

    # Add plugin export to package manifest
    parser = etree.XMLParser(remove_blank_text=True)
    package_file_name = args.ikfast_plugin_pkg_path + "/package.xml"
    package_xml = etree.parse(package_file_name, parser).getroot()

    # Make sure at least all required dependencies are in the depends lists
    build_deps = [
        "liblapack-dev", "moveit_core", "pluginlib", "rclcpp", "tf2_kdl", "tf2_eigen"
    ]
    run_deps = ["liblapack-dev", "moveit_core", "pluginlib", "rclcpp"]
    update_deps(build_deps, "build_depend", package_xml)
    update_deps(run_deps, "exec_depend", package_xml)

    # Check that plugin definition file is in the export list
    new_export = etree.Element("moveit_core", plugin="${prefix}/" + plugin_file_name)
    export_element = package_xml.find("export")
    if export_element is None:
        export_element = etree.SubElement(package_xml, "export")
    found = False
    for el in export_element.findall("moveit_core"):
        found = (etree.tostring(new_export) == etree.tostring(el))
        if found:
            break
    if not found:
        export_element.append(new_export)

    # Always write the package xml file, even if there are no changes, to ensure
    # proper encodings are used in the future (UTF-8)
    with open(package_file_name, "w") as f:
        etree.ElementTree(package_xml).write(f, xml_declaration=True,
                                             pretty_print=True, encoding="UTF-8")
    print("Wrote package.xml at '%s'" % package_file_name)

    # Create a script for easily updating the plugin in the future in case the plugin needs to be updated
    easy_script_file_path = args.ikfast_plugin_pkg_path + "/update_ikfast_plugin.sh"
    with open(easy_script_file_path, 'w') as f:
        f.write("search_mode=" + args.search_mode + "\n" +
                "srdf_filename=" + args.srdf_filename + "\n" +
                "robot_name_in_srdf=" + args.robot_name_in_srdf + "\n" +
                "moveit_config_pkg=" + args.moveit_config_pkg + "\n" +
                "robot_name=" + args.robot_name + "\n" +
                "planning_group_name=" + args.planning_group_name + "\n" +
                "ikfast_plugin_pkg=" + args.ikfast_plugin_pkg + "\n" +
                "base_link_name=" + args.base_link_name + "\n" +
                "eef_link_name=" + args.eef_link_name + "\n" +
                "ikfast_output_path=" + args.ikfast_output_path + "\n\n" +
                "rosrun moveit_kinematics create_ikfast_moveit_plugin.py\\\n" +
                "  --search_mode=$search_mode\\\n" +
                "  --srdf_filename=$srdf_filename\\\n" +
                "  --robot_name_in_srdf=$robot_name_in_srdf\\\n" +
                "  --moveit_config_pkg=$moveit_config_pkg\\\n" +
                "  $robot_name\\\n" +
                "  $planning_group_name\\\n" +
                "  $ikfast_plugin_pkg\\\n" +
                "  $base_link_name\\\n" +
                "  $eef_link_name\\\n" +
                "  $ikfast_output_path\n")
    print("Created update plugin script at '%s'" % easy_script_file_path)
def fields_view_get(cls, view_id=None, view_type='form'): ''' Return a view definition. If view_id is None the first one will be used of view_type. The definition is a dictionary with keys: - model: the model name - type: the type of the view - view_id: the id of the view - arch: the xml description of the view - fields: a dictionary with the definition of each field in the view - field_childs: the name of the childs field for tree ''' key = (cls.__name__, view_id, view_type) result = cls._fields_view_get_cache.get(key) if result: return result result = {'model': cls.__name__} pool = Pool() View = pool.get('ir.ui.view') view = None inherit_view_id = None if view_id: view = View(view_id) else: domain = [ ('model', '=', cls.__name__), ('type', '=', view_type), [ 'OR', ('inherit', '=', None), ('inherit.model', '!=', cls.__name__), ], ] views = View.search(domain) if views: view = views[0] if view: if view.inherit: inherit_view_id = view.id view = view.inherit view_id = view.id # if a view was found if view: result['type'] = view.type result['view_id'] = view_id result['arch'] = view.arch result['field_childs'] = view.field_childs # Check if view is not from an inherited model if view.model != cls.__name__: Inherit = pool.get(view.model) result['arch'] = Inherit.fields_view_get( result['view_id'])['arch'] view_id = inherit_view_id # get all views which inherit from (ie modify) this view views = View.search([ 'OR', [ ('inherit', '=', view_id), ('model', '=', cls.__name__), ], [ ('id', '=', view_id), ('inherit', '!=', None), ], ]) raise_p = False while True: try: views.sort(key=lambda x: cls._modules_list.index(x.module or None)) break except ValueError: if raise_p: raise # There is perhaps a new module in the directory ModelView._reset_modules_list() raise_p = True for view in views: if view.domain: if not PYSONDecoder({ 'context': Transaction().context }).decode(view.domain): continue if not view.arch or not view.arch.strip(): continue result['arch'] = _inherit_apply(result['arch'], view.arch) # otherwise, build some kind of default view else: if view_type == 'form': res = cls.fields_get() xml = '''<?xml version="1.0"?>''' \ '''<form string="%s" col="4">''' % (cls.__doc__,) for i in res: if i in ('create_uid', 'create_date', 'write_uid', 'write_date', 'id', 'rec_name'): continue if res[i]['type'] not in ('one2many', 'many2many'): xml += '<label name="%s"/>' % (i, ) xml += '<field name="%s"/>' % (i, ) if res[i]['type'] == 'text': xml += "<newline/>" else: xml += '<field name="%s" colspan="4"/>' % (i, ) xml += "</form>" elif view_type == 'tree': field = 'id' if cls._rec_name in cls._fields: field = cls._rec_name xml = '''<?xml version="1.0"?>''' \ '''<tree string="%s"><field name="%s"/></tree>''' \ % (cls.__doc__, field) else: xml = '' result['type'] = view_type result['arch'] = xml result['field_childs'] = None result['view_id'] = 0 # Update arch and compute fields from arch parser = etree.XMLParser(remove_blank_text=True) tree = etree.fromstring(result['arch'], parser) xarch, xfields = cls._view_look_dom_arch(tree, result['type'], result['field_childs']) result['arch'] = xarch result['fields'] = xfields cls._fields_view_get_cache.set(key, result) return result
namespace = current_app.config["XML_NAMESPACE"] api_endpoint = current_app.config["MTD_API_ENDPOINT"] NOMENCLATURE_MAPPING = { "id_nomenclature_data_type": "DATA_TYP", "id_nomenclature_dataset_objectif": "JDD_OBJECTIFS", "id_nomenclature_data_origin": "DS_PUBLIQUE", "id_nomenclature_source_status": "STATUT_SOURCE", } # get the root logger log = logging.getLogger() gunicorn_error_logger = logging.getLogger("gunicorn.error") xml_parser = ET.XMLParser(ns_clean=True, recover=True, encoding="utf-8") def get_acquisition_framework(uuid_af): """ Fetch a AF from the MTD WS with the uuid of the AD Parameters: - uuid_af (str): the uuid of the AF Returns: byte: the xml of the AF as byte """ url = "{}/cadre/export/xml/GetRecordById?id={}" try: r = utilsrequests.get(url.format(api_endpoint, uuid_af)) except AssertionError:
def text_blob(self):
    xml_parser = et.XMLParser(recover=True)
    tr_tree = et.fromstring('<xml>' + self.transcription + '</xml>', xml_parser)
    return et.tostring(tr_tree, encoding='utf8', method='text').decode('utf-8')
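# Hedged, self-contained sketch of the same idea (strip markup from a text
# fragment by wrapping it in a dummy root); the sample string and the `et`
# alias mirror the method above but are not from the original code.
from lxml import etree as et

transcription = 'Hello <hi rend="italic">world</hi> &amp; friends'
xml_parser = et.XMLParser(recover=True)
tree = et.fromstring('<xml>' + transcription + '</xml>', xml_parser)
print(et.tostring(tree, encoding='utf8', method='text').decode('utf-8'))
# expected: Hello world & friends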
from xmodule.modulestore.xml_exporter import DEFAULT_CONTENT_FIELDS
from xmodule.tabs import CourseTabList
from xmodule.modulestore.keys import UsageKey
from xmodule.modulestore.locations import SlashSeparatedCourseKey
from xblock.field_data import DictFieldData
from xblock.runtime import DictKeyValueStore, IdGenerator

from . import ModuleStoreReadBase, Location, XML_MODULESTORE_TYPE
from .exceptions import ItemNotFoundError
from .inheritance import compute_inherited_metadata, inheriting_field_data
from xblock.fields import ScopeIds, Reference, ReferenceList, ReferenceValueDict

edx_xml_parser = etree.XMLParser(dtd_validation=False, load_dtd=False,
                                 remove_comments=True, remove_blank_text=True)

etree.set_default_parser(edx_xml_parser)

log = logging.getLogger(__name__)


# VS[compat]
# TODO (cpennington): Remove this once all fall 2012 courses have been imported
# into the cms from xml
def clean_out_mako_templating(xml_string):
    xml_string = xml_string.replace('%include', 'include')
    xml_string = re.sub(r"(?m)^\s*%.*$", '', xml_string)
    return xml_string
def remove_whitespace_from_xml(xmlstr):
    parser = etree.XMLParser(remove_blank_text=True)
    elem = etree.XML(xmlstr, parser=parser)
    return etree.tostring(elem)
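# Hedged usage sketch (the sample document is made up, not from the original):
# remove_blank_text drops ignorable whitespace-only text nodes, so the
# re-serialized output comes back compacted.
from lxml import etree

compact = remove_whitespace_from_xml(b"""
<root>
    <child>value</child>
</root>
""")
print(compact)  # expected: b'<root><child>value</child></root>'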