def read_xml(path):
    """ parse xml and return tree """
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(path, parser)
    return tree
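A quick usage sketch for the helper above (the import and the file name are assumptions added for illustration):

from lxml import etree

tree = read_xml("example.xml")   # hypothetical input file
print(etree.tostring(tree, pretty_print=True).decode())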
Example No. 2
    def to_dict(self):
        """
        Return a mapping representing this POM
        """
        return OrderedDict([
            ('group_id', self.group_id),
            ('artifact_id', self.artifact_id),
            ('version', str(self.version) if self.version else None),
            ('classifier', self.classifier),
            ('type', self.type),
        ])


POM_PARSER = etree.XMLParser(recover=True,
                             remove_comments=True,
                             remove_pis=True,
                             remove_blank_text=True,
                             resolve_entities=False)

STRIP_NAMESPACE_RE = re.compile(r"<project(.|\s)*?>", re.UNICODE)
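
# A hedged sketch of how STRIP_NAMESPACE_RE might be used: the pattern matches
# the opening <project ...> tag, so substituting a bare <project> drops the
# namespace declarations before parsing. The substitution below is an
# assumption based on the constant's name, not code from the original module:
#
#     pom_text = STRIP_NAMESPACE_RE.sub('<project>', pom_text, count=1)
#     root = etree.fromstring(pom_text, POM_PARSER)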


class MavenPom(pom.Pom):
    def __init__(self, location):
        # NOTE: most of this is copied over from Pom.__init__
        try:
            with codecs.open(location, 'rb', encoding='UTF-8') as fh:
                xml = fh.read()
        except UnicodeDecodeError as _a:
            xml = analysis.unicode_text(location)
Example No. 3
from lxml import etree
schemadoc = etree.parse("../xsd/top_artistsPT.xsd")
schema = etree.XMLSchema(schemadoc)
parser = etree.XMLParser(schema=schema)
tree = etree.parse("../top_artistsPT.xml")
schema.assertValid(tree)
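Note that the schema-aware parser built above is never actually passed to etree.parse; validation only happens through the explicit assertValid call. A hedged sketch of the alternative, where validation happens at parse time and an invalid document raises etree.XMLSyntaxError (same file names as above):

parser = etree.XMLParser(schema=schema)
tree = etree.parse("../top_artistsPT.xml", parser)  # raises XMLSyntaxError if the document is invalid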
Example No. 4
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree

    # total_results  = XPath('//openSearch:totalResults')
    # start_index    = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    url = XPath('descendant::atom:link[@rel="self"]/@href')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')

    # print(etree.tostring(entry_, pretty_print=True))

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    details_url = url(entry_)[0]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, details_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                                               strip_encoding_pats=True)[0],
                                parser=etree.XMLParser(recover=True,
                                                       no_network=True,
                                                       resolve_entities=False))
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = type('')(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'
    ):
        mi.has_google_cover = x.get('href')
        break

    return mi
Example No. 5
import os
import sys
import unittest

stk_path = os.path.join(os.path.realpath(os.path.dirname(__file__)), os.pardir,
                        os.pardir)
sys.path.insert(0, stk_path)
from stk.supertree_toolkit import _check_uniqueness, _check_taxa, _check_data, get_all_characters, data_independence, add_weights
from stk.supertree_toolkit import get_fossil_taxa, get_publication_years, data_summary, get_character_numbers, get_analyses_used
from stk.supertree_toolkit import data_overlap, read_matrix, subs_file_from_str, clean_data, obtain_trees, get_all_source_names
from stk.supertree_toolkit import add_historical_event, _sort_data, _parse_xml, _check_sources, _swap_tree_in_XML, replace_genera
from stk.supertree_toolkit import get_all_taxa, _get_all_siblings, _parse_tree, get_characters_used, _trees_equal, get_weights
from stk.supertree_toolkit import get_outgroup, set_all_tree_names, create_tree_name, taxonomic_checker, load_taxonomy, load_equivalents
from stk.supertree_toolkit import create_taxonomy, create_taxonomy_from_tree, get_all_tree_names
from lxml import etree
from util import *
from stk.stk_exceptions import *
from collections import defaultdict
import tempfile
parser = etree.XMLParser(remove_blank_text=True)
import re

# Class to test all those loverly internal methods
# or stuff that doesn't fit within the other tests


class TestSTK(unittest.TestCase):
    def test_check_uniqueness(self):
        non_unique_names = etree.parse("data/input/non_unique_names.phyml")
        try:
            _check_uniqueness(etree.tostring(non_unique_names))
        except NotUniqueError:
            self.assert_(True)
            return
Example No. 6
from . import pycompat

import odoo
# get_encodings, ustr and exception_to_unicode were originally from tools.misc.
# There are moved to loglevels until we refactor tools.
from odoo.loglevels import get_encodings, ustr, exception_to_unicode  # noqa

_logger = logging.getLogger(__name__)

# List of etree._Element subclasses that we choose to ignore when parsing XML.
# We include the *Base ones just in case, currently they seem to be subclasses of the _* ones.
SKIPPED_ELEMENT_TYPES = (etree._Comment, etree._ProcessingInstruction,
                         etree.CommentBase, etree.PIBase, etree._Entity)

# Configure default global parser
etree.set_default_parser(etree.XMLParser(resolve_entities=False))
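
# A hedged usage sketch of SKIPPED_ELEMENT_TYPES: skip comments, processing
# instructions and entity references while walking a parsed tree:
#
#     for node in etree.fromstring(b"<r><!-- note --><a/></r>").iter():
#         if isinstance(node, SKIPPED_ELEMENT_TYPES):
#             continue
#         # only real elements reach this point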

#----------------------------------------------------------
# Subprocesses
#----------------------------------------------------------


def find_in_path(name):
    path = os.environ.get('PATH', os.defpath).split(os.pathsep)
    if config.get('bin_path') and config['bin_path'] != 'None':
        path.append(config['bin_path'])
    return which(name, path=os.pathsep.join(path))


def _exec_pipe(prog, args, env=None):
    cmd = (prog, ) + args
Example No. 7
def parseXML(xml_in, params, state):
    """
    parse the document XML
    """
    # import pdb; pdb.set_trace()
    # if two fragments of text are within LINE_TOLERANCE of each other they're
    # on the same line
    text_margin_left = 57
    text_margin_right = 289
    indentation_bound_left = 71
    indentation_bound_right = 305

    NO_INTERJECTION = re.compile(r'^.{1,3}' + re.escape(params['closing_mark']))

    debug = False

    # get the page elements
    parser = etree.XMLParser(recover=True)
    tree = etree.parse(xml_in, parser=parser)  
    #tree = ET.ElementTree(file=xml_in)
    pages = tree.getroot()

    if pages.tag != "pages":
        sys.exit("ERROR: pages.tag is %s instead of pages!" % pages.tag)

    if (int(xml_in[12:14]) >= 15 and int(xml_in[14:17]) > 20) or (int(xml_in[12:14]) == 16):
        text_margin_right = 270
        indentation_bound_right = 312
    
    text = []
    # step through the pages
    for page in pages:
        # gets page_id
        page_id = page.attrib['id']

        # get all the textline elements
        textboxes = page.findall("./textbox")

        #print "found %s textlines" % len(textlines)
        # step through the textlines
        page_text = []
        interjection = False
        # import pdb; pdb.set_trace()
        left = [round(float(textbox.attrib["bbox"].split(',')[0])) for textbox in textboxes]

        line_half = int((indentation_bound_right -
                      indentation_bound_left)/2)

        identation = [e for e in Counter(left).keys()
                      if (e > indentation_bound_left + 3
                      and e < indentation_bound_right - 3)
                      or
                      (e > indentation_bound_right + 3
                      and e < indentation_bound_right +
                              line_half)]

        if state != 'BE':
            if any([e in range(params["identation_bound_left_1"], params["identation_bound_left_1"] + line_half) for e in identation]):
                identation_bounds = 'first'
            elif any([e in range(params["identation_bound_left_1"], params["identation_bound_left_1"] + line_half) for e in identation]):
                identation_bounds = 'second'
            else:
                identation_bounds = None

        if not identation:
            logging.warning('no x0 values within specified ranges' + page.attrib['id'])

        #import pdb; pdb.set_trace()
        for textbox in textboxes:
            # get the boundaries of the textline
            #import pdb; pdb.set_trace()
            textbox_bounds = [float(s) for s in textbox.attrib["bbox"].split(',')]
            #print "line_bounds: %s" % line_bounds

            # get all the texts in this textline
            lines = list(textbox)
            #print("found %s characters in this line." % len(chars))

            # combine all the characters into a single string
            textbox_text = ""
            poi = False
            issue = False
            for line, has_more in lookahead(lines):
                chars = list(line)
                for char in chars:
                    if poi:
                        if char.attrib:
                            if "Bold" not in char.attrib['font']:
                                #import pdb; pdb.set_trace()
                                textbox_text = textbox_text + '<poi_end>'
                                poi = False
                    elif char.attrib:
                        if "Bold" in char.attrib['font']:
                            #import pdb; pdb.set_trace()
                            textbox_text = textbox_text + '<poi_begin>'
                            poi = True
                    try:
                        textbox_text = textbox_text + char.text
                    except TypeError:
                        print('===============================> Attention! You have a TypeError here!', page_id)
                if not has_more and poi:
                    textbox_text = textbox_text + '<poi_end>'

            #if re.compile(r'.+\n<poi_end>').match(textbox_text):
                #import pdb; pdb.set_trace()

            textbox_text = textbox_text.replace('\n<poi_end>', '<poi_end>\n')
            # if 'Beifall' in textbox_text:
            #    import pdb; pdb.set_trace()
            # strip edge & multiple spaces
            textbox_text = re.sub(' +', ' ', textbox_text.strip())

            # removes header/footer
            if textbox_bounds[1]>params['header_bound'] and page_id not in ['1']:
                #import pdb; pdb.set_trace()
                print('removed header ' + textbox_text)
                continue

            # if '(Alterspräsident Dr. Uwe Lehmann-Brauns)' in textbox_text:
               # import pdb; pdb.set_trace()

            # save a description of the line
            textbox = {'left': textbox_bounds[0], 'top': textbox_bounds[1], 'text': textbox_text}

            # if state != 'BE':
            #     if identation_bounds=='first':
            #         if textbox['left']>46 and textbox['left']<290 or textbox['left']>316:
            #             if textbox_text.lstrip().startswith('(') and not NO_INTERJECTION.match(textbox_text):
            #                 textbox['text'] = '<interjection_begin>' + textbox['text'] + '<interjection_end>'
            #             else:
            #                 textbox['text'] = '<identation_begin>' + textbox['text'] + '<identation_end>'
            #     elif identation_bounds=='second':
            #         if textbox['left']>75 and textbox['left']<320 or textbox['left']>344:
            #             if textbox_text.lstrip().startswith('(') and not NO_INTERJECTION.match(textbox_text):
            #                 textbox['text'] = '<interjection_begin>' + textbox['text'] + '<interjection_end>'
            #             else:
            #                 textbox['text'] = '<identation_begin>' + textbox['text'] + '<identation_end>'
            #     else:
            #         logging.info('no ordinary text boxes on page' + page_id)
            # else:
            #     if textbox['left']>params['identation_bound_left_1'] + 3 and textbox['left']<params['identation_bound_right_1'] - 3 or textbox['left']>params['identation_bound_right_1'] + 3:
            #         if textbox_text.lstrip().startswith(params['opening_mark']) and not NO_INTERJECTION.match(textbox_text):
            #             textbox['text'] = '<interjection_begin>' + textbox['text'] + '<interjection_end>'
            #         else:
            #             textbox['text'] = '<identation_begin>' + textbox['text'] + '<identation_end>'

            if textbox['left'] > indentation_bound_left - 5 and textbox['left'] < text_margin_right - 5:
                textbox['text'] = '<interjection_begin>' + textbox['text'].replace('\n', '<interjection_end>\n<interjection_begin>') + '<interjection_end>'
            elif textbox['left'] > indentation_bound_right - 5:
                textbox['text'] = '<interjection_begin>' +  textbox['text'].replace('\n', '<interjection_end>\n<interjection_begin>') + '<interjection_end>'

            if textbox['left'] < text_margin_right - 5:
                textbox['left'] = 30
            else:
                textbox['left'] = 30
                textbox['top'] = textbox['top']-1000
                
            page_text.append(textbox)

        #print "page %s has %s lines" % (page.attrib["id"], len(lines))

        # sort the lines by left, then top position
        # if debug:
        #     import pdb; pdb.set_trace()
        page_text.sort(key=itemgetter('left'))
        page_text.sort(key=itemgetter('top'), reverse=True)

        # consolidate lines that have the same top (within tolerance)
        # consolidated_lines = []
        # line_segments = []
        # line_top = lines[0]['top']
        # for line in lines:
        #   if abs(line['top'] - line_top) < LINE_TOLERANCE:
        #       line_segments.append(line)

        #   else:
        #       # assure that text segments appear in the correct order
        #       line_segments.sort(key=itemgetter('left'))
        #       # create a new line object combining partial texts, preserving the left-most text position
        #       merged_line = dict(line_segments[0])
        #       merged_line['text'] = ""
        #       for item in line_segments:
        #           merged_line['text'] = merged_line['text'] + " " + item['text']

        #       consolidated_lines.append(merged_line)

        #       # reset
        #       line_segments = [line]
        #       line_top = line['top']
        #import pdb; pdb.set_trace()
        page_text = '\n\n'.join([e['text'] for e in page_text])

        text.append(page_text + '\n\n')

    #import pdb; pdb.set_trace()
    return text
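lookahead is not defined in this excerpt; a plausible implementation, assuming it yields each item paired with a flag that tells whether more items follow, could look like this (an assumption, not the original helper):

def lookahead(iterable):
    """Yield (item, has_more) pairs so callers can detect the last item."""
    iterator = iter(iterable)
    try:
        current = next(iterator)
    except StopIteration:
        return
    for upcoming in iterator:
        yield current, True
        current = upcoming
    yield current, False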
Example No. 8
    def process_xml(self, xml):
        '''
        Parse tool configuration data out of the Common Cartridge LTI link XML.
        '''
        root = objectify.fromstring(xml, parser=etree.XMLParser())
        # Parse all children of the root node
        for child in root.getchildren():
            if 'title' in child.tag:
                self.title = child.text
            if 'description' in child.tag:
                self.description = child.text
            if 'secure_launch_url' in child.tag:
                self.secure_launch_url = child.text
            elif 'launch_url' in child.tag:
                self.launch_url = child.text
            if 'icon' in child.tag:
                self.icon = child.text
            if 'secure_icon' in child.tag:
                self.secure_icon = child.text
            if 'cartridge_bundle' in child.tag:
                self.cartridge_bundle = child.attrib['identifierref']
            if 'cartridge_icon' in child.tag:
                self.cartridge_icon = child.attrib['identifierref']

            if 'vendor' in child.tag:
                # Parse vendor tag
                for v_child in child.getchildren():
                    if 'code' in v_child.tag:
                        self.vendor_code = v_child.text
                    if 'description' in v_child.tag:
                        self.vendor_description = v_child.text
                    if 'name' in v_child.tag:
                        self.vendor_name = v_child.text
                    if 'url' in v_child.tag:
                        self.vendor_url = v_child.text
                    if 'contact' in v_child.tag:
                        # Parse contact tag for email and name
                        for c_child in v_child:
                            if 'name' in c_child.tag:
                                self.vendor_contact_name = c_child.text
                            if 'email' in c_child.tag:
                                self.vendor_contact_email = c_child.text

            if 'custom' in child.tag:
                # Parse custom tags
                for custom_child in child.getchildren():
                    self.custom_params[custom_child.attrib['name']] =\
                            custom_child.text

            if 'extensions' in child.tag:
                platform = child.attrib['platform']
                properties = {}

                # Parse extension tags
                for ext_child in child.getchildren():
                    if 'property' in ext_child.tag:
                        properties[ext_child.attrib['name']] = ext_child.text
                    elif 'options' in ext_child.tag:
                        opt_name = ext_child.attrib['name']
                        options = {}
                        for option_child in ext_child.getchildren():
                            options[option_child.attrib['name']] =\
                                    option_child.text
                        properties[opt_name] = options

                self.set_ext_params(platform, properties)
Example No. 9
from lxml import etree as ElementTree
import htmlentitydefs
import csv
import operator
import re

# import gzip

generateLog = True

parser = ElementTree.XMLParser(attribute_defaults=True, load_dtd=True)

# Papers must be at least 4 pages long to count.
pageCountThreshold = 4
# Match ordinary page numbers (as in 10-17).
pageCounterNormal = re.compile(r'(\d+)-(\d+)')
# Match page number in the form volume:page (as in 12:140-12:150).
pageCounterColon = re.compile(r'[0-9]+:([1-9][0-9]*)-[0-9]+:([1-9][0-9]*)')


def pagecount(input):
    pageCounterMatcher1 = pageCounterNormal.match(input)
    pageCounterMatcher2 = pageCounterColon.match(input)
    start = 0
    end = 0
    count = 0

    if (not (pageCounterMatcher1 is None)):
        start = int(pageCounterMatcher1.group(1))
        end = int(pageCounterMatcher1.group(2))
        count = end - start + 1
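A quick illustration of what the two patterns above match (the page strings are made-up examples):

m1 = pageCounterNormal.match('10-17')
print(m1.group(1), m1.group(2))      # 10 17   -> 17 - 10 + 1 = 8 pages
m2 = pageCounterColon.match('12:140-12:150')
print(m2.group(1), m2.group(2))      # 140 150 -> 150 - 140 + 1 = 11 pages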
Example No. 10
    infile = open("../.repopath.pickle", 'rb')
    new_dict = pickle.load(infile)
    infile.close()
    repopath = new_dict.get("repository")
    print("Repository path saved is {}".format(repopath))
else:
    repopath = input("Enter the path to the repository: ")
    variable = {"repository":repopath}
    f = open(".repopath.pickle",'wb')
    pickle.dump(variable,f)
    f.close()

pluginName = input("Please enter the plugin name: ")
# profileXml = '/home/sthummala/workspace/repo/centina/sa/profiles/' + pluginName + '.xml'
profileXml = repopath+'/centina/sa/profiles/' + pluginName + '.xml'
parser = etree.XMLParser(strip_cdata=False)
root = etree.parse(profileXml, parser)
meta = root.find("meta")
protocol = meta.find("protocol").get("name")
dependencies = root.find("dependencies")

file = dependencies.findall("file")
for a in file:
    if a.get("path").startswith("pm/templates"):
        if a.get("path").endswith(".dtd"):
            continue
        pmtemplate = repopath+"/centina/sa/profiles/" + a.get("path")
        print("Found pm template ", pmtemplate)
        # try:
        # template = getParser(pmtemplate,"template")
        parsertemplate = etree.XMLParser(strip_cdata=False)
Example No. 11
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(levelname)s: %(message)s')
    parser = argparse.ArgumentParser(
        "VCF2cytosure - convert SV vcf files to cytosure")

    group = parser.add_argument_group('Filtering')
    group.add_argument('--size',
                       default=1000,
                       type=int,
                       help='Minimum variant size. Default: %(default)s')
    group.add_argument('--frequency',
                       default=0.01,
                       type=float,
                       help='Maximum frequency. Default: %(default)s')
    group.add_argument(
        '--frequency_tag',
        default='FRQ',
        type=str,
        help='Frequency tag of the info field. Default: %(default)s')
    group.add_argument('--no-filter',
                       dest='do_filtering',
                       action='store_false',
                       default=True,
                       help='Disable any filtering')

    group = parser.add_argument_group('Input')
    group.add_argument(
        '--genome',
        required=False,
        default=37,
        help=
        'Human genome version. Use 37 for GRCh37/hg19, 38 for GRCh38 template.'
    )
    group.add_argument('--sex',
                       required=False,
                       default='female',
                       help='Sample sex male/female. Default: %(default)s')
    group.add_argument('--vcf', required=True, help='VCF file')
    group.add_argument(
        '--bins',
        type=int,
        default=20,
        help='Number of coverage bins per probe. Default: %(default)s')
    group.add_argument('--coverage', help='Coverage file')
    group.add_argument(
        '--cn',
        type=str,
        help=
        'add probes using cnvkit cn file (cannot be used together with --coverage)'
    )
    group.add_argument(
        '--snv',
        type=str,
        help=
        'snv vcf file, use coverage annotation to position the height of the probes (cannot be used together with --coverage)'
    )
    group.add_argument(
        '--dp',
        type=str,
        default="DP",
        help=
        'read depth tag of the snv vcf file. This option is only used when --snv is used to set the height of the probes; the dp tag is used to retrieve the depth of coverage across the snv (default=DP)'
    )
    group.add_argument(
        '--maxbnd',
        type=int,
        default=10000,
        help='Maximum BND size, BND events exceeding this size are discarded')
    group.add_argument(
        '--out', help='output file (default = the prefix of the input vcf)')

    group.add_argument(
        '--blacklist',
        help=
        'Blacklist bed format file to exclude completely contained variants.')

    group.add_argument('-V',
                       '--version',
                       action='version',
                       version="%(prog)s " + __version__,
                       help='Print program version and exit.')
    # parser.add_argument('xml', help='CytoSure design file')
    args = parser.parse_args()

    logger.info('vcf2cytosure %s', __version__)

    if (args.coverage and args.cn) or (args.coverage
                                       and args.snv) or (args.snv and args.cn):
        print(
            "Choose one of --coverage, --snv and --cn. They cannot be combined."
        )
        quit()

    if int(args.genome) == 38:
        CGH_TEMPLATE = CGH_TEMPLATE_38
        CONTIG_LENGTHS = CONTIG_LENGTHS_38
        N_INTERVALS = N_INTERVALS_38
    else:
        CGH_TEMPLATE = CGH_TEMPLATE_37
        CONTIG_LENGTHS = CONTIG_LENGTHS_37
        N_INTERVALS = N_INTERVALS_37

    if not args.out:
        args.out = ".".join(
            args.vcf.split(".")[0:len(args.vcf.split(".")) - 1]) + ".cgh"
    parser = etree.XMLParser(remove_blank_text=True)

    sex_male = "false"
    promega_sex = 'Female'
    if args.sex == "male":
        sex_male = 'true'
        promega_sex = 'Male'

    vcf = VCF(args.vcf)

    sample_id = retrieve_sample_id(vcf, args.vcf)
    tree = etree.parse(
        StringIO(
            CGH_TEMPLATE.format(sample_id, sample_id, sample_id, sample_id,
                                sex_male, promega_sex, sex_male)), parser)

    segmentation = tree.xpath('/data/cgh/segmentation')[0]
    probes = tree.xpath('/data/cgh/probes')[0]
    submission = tree.xpath('/data/cgh/submission')[0]

    if args.blacklist:
        blacklist = [
            r for r in read_blacklist(args.blacklist)
            if r.chrom in CONTIG_LENGTHS
        ]

    chr_intervals = defaultdict(list)
    if args.do_filtering:
        vcf = variant_filter(vcf,
                             min_size=args.size,
                             max_frequency=args.frequency,
                             frequency_tag=args.frequency_tag)
    n = 0
    for event in events(vcf, CONTIG_LENGTHS):
        height = ABERRATION_HEIGHTS[event.type]
        end = event.end
        make_segment(segmentation, event.chrom, event.start, end, height)
        comment = format_comment(event.info)
        if "rankScore" in event.info:
            rank_score = int(event.info['RankScore'].partition(':')[2])
        else:
            rank_score = 0

        #occ=0
        #if args.frequency_tag in event.info:
        #	occ=event.info[args.frequency_tag]
        occ = 0
        if "OCC" in event.info:
            occ = event.info["OCC"]

        if event.type in ("INV", 'INS', 'BND', "TRA") and not event.end:
            continue
            #pass
        elif event.type in ("INV", 'INS', 'BND', "TRA") and (
                abs(event.start - event.end) > args.maxbnd):
            #pass
            continue
        elif args.blacklist:
            if contained_by_blacklist(event, blacklist):
                continue

        make_aberration(submission,
                        event.chrom,
                        event.start,
                        end,
                        confirmation=event.type,
                        comment=comment,
                        n_probes=occ,
                        copy_number=rank_score)

        chr_intervals[event.chrom].append((event.start, event.end))
        # show probes at slightly different height than segments
        for pos in spaced_probes(event.start, event.end - 1):
            make_probe(probes, event.chrom, pos, pos + 60, height, event.type)
        n += 1
    if args.coverage or args.snv or args.cn:
        add_coverage_probes(probes, args.coverage, args, CONTIG_LENGTHS,
                            N_INTERVALS)

    else:
        add_probes_between_events(probes, chr_intervals, CONTIG_LENGTHS)

    tree.write(args.out, pretty_print=True)
    logger.info('Wrote %d variants to CGH', n)
Example No. 12
def process(type, db, config):
    location = '%s/%s' % (config.ACT_DIR, type)
    count = 0
    with db.cursor() as cur:

        parser = etree.XMLParser(resolve_entities=False, huge_tree=True)
        print location
        for dirpath, dirs, files in os.walk(location):
            files = [f for f in files if f.endswith('.xml')]

            if len(files):

                path = os.path.join(dirpath.replace(config.ACT_DIR + '/', ''),
                                    files[0])
                try:
                    print path
                    tree = etree.parse(os.path.join(dirpath, files[0]), parser)
                    objectify.deannotate(tree, cleanup_namespaces=True)
                    for elem in tree.iter():
                        if not hasattr(elem.tag, 'find'): continue
                        i = elem.tag.find('}')
                        if i >= 0:
                            elem.tag = elem.tag[i + 1:]

                    attrib = tree.getroot().attrib
                    if attrib.get('id'):
                        title = etree.tostring(
                            tree.xpath('.//billref|.//title')[0],
                            method="text",
                            encoding="UTF-8")
                        #TODO
                        title = title.replace('\n', '').strip()
                        query = """INSERT INTO instruments (id, govt_id, version, title, path, number, date_as_at,date_assent, type,
                                date_first_valid, date_gazetted, date_terminated, date_imprint, year, repealed, in_amend,
                                pco_suffix, raised_by, official, subtype, terminated, stage, date_signed, imperial, instructing_office, attributes)
                            VALUES (%(id)s, %(govt_id)s, %(version)s, %(title)s, %(path)s, %(number)s, %(date_as_at)s,%(date_assent)s, %(type)s,
                                %(date_first_valid)s, %(date_gazetted)s, %(date_terminated)s, %(date_imprint)s,
                                %(year)s, %(repealed)s, %(in_amend)s, %(pco_suffix)s, %(raised_by)s, %(official)s, %(subtype)s,
                                %(terminated)s, %(stage)s, %(date_signed)s, %(imperial)s, %(instructing_office)s, %(attr)s); """

                        with open(os.path.join(dirpath, files[0])) as r:
                            cur.execute(
                                """ INSERT INTO documents (document, type) VALUES (%(document)s, 'xml') returning id""",
                                {'document': r.read()})

                        document_id = cur.fetchone()[0]

                        values = {
                            'id':
                            document_id,
                            'govt_id':
                            attrib.get('id'),
                            'title':
                            title,
                            'version':
                            int(float(dirpath.split('/')[-1])),
                            'path':
                            path,
                            'number':
                            attrib.get(
                                'sr.no',
                                attrib.get(
                                    'sop.no',
                                    attrib.get('act.no',
                                               attrib.get('bill.no')))),
                            'date_first_valid':
                            safe_date(attrib.get('date.first.valid')),
                            'date_gazetted':
                            safe_date(attrib.get('date.date_gazetted')),
                            'date_terminated':
                            safe_date(attrib.get('date.terminated')),
                            'date_imprint':
                            safe_date(attrib.get('date.imprint')),
                            'date_as_at':
                            safe_date(attrib.get('date.as.at')),
                            'date_assent':
                            safe_date(attrib.get('date.assent')),
                            'year':
                            int(attrib.get('year')),
                            'repealed':
                            attrib.get('terminated') == "repealed",
                            'in_amend':
                            attrib.get('in.amend') != 'false',
                            'pco_suffix':
                            attrib.get('pco.suffix'),
                            'raised_by':
                            attrib.get('raised.by'),
                            'official':
                            attrib.get('official'),
                            'type':
                            type,
                            'subtype':
                            attrib.get(
                                'act.type',
                                attrib.get('sr.type',
                                           attrib.get('bill.type'))),
                            'terminated':
                            attrib.get('terminated'),
                            'stage':
                            attrib.get('stage'),
                            'date_signed':
                            safe_date(attrib.get('date.signed')),
                            'imperial':
                            attrib.get('imperial') == 'yes',
                            'instructing_office':
                            attrib.get('instructing_office'),
                            'attr':
                            json.dumps(dict(attrib))
                        }
                        cur.execute(query, values)

                except etree.XMLSyntaxError, e:
                    print 'ERROR', e, path
Example No. 13
def train_data(datapath):
    # define sign and stop words
    sign = [
        "!", ",", ".", ":", ";", "'", "#", "$", "%", "&", "(", ")", "*", "[",
        "]", "?", "@", "_", "/", "{", "|", "}", "~", "--"
    ]
    f_stopword = open("/home/iialab/TREC2018/stopwords.txt")
    stopwords = f_stopword.readlines()
    stop_sign = []
    for stopword in stopwords:
        stopword = stopword.strip("\n")
        stop_sign.append(stopword)

    # Parse the XML files
    files = os.listdir(datapath)
    keys = ["clean_background", "clean_title", "doc_id"]
    clean_docs = []
    for file in files:
        # print(file)
        Doc_id = file[:-len(".xml")] if file.endswith(".xml") else file  # drop the .xml suffix (str.strip removes characters, not a suffix)
        filepath = os.path.join(datapath, file)
        # print(filepath)
        parser = etree.XMLParser(recover=True)
        text = open(filepath, encoding="utf-8").read()
        root = ET.fromstring(text, parser=parser)
        # print(root.tag)
        clean_title = []
        clean_background = []
        for child in root:
            if child.tag == "Doc_title":
                title = child.text
                for c in sign:
                    title = title.replace(c, "").lower().strip("\n")
                    sentence = nltk.word_tokenize(title, language='english')
                # print(sentence)

                for item in sentence:
                    if item in stop_sign:
                        continue
                        # print(item)
                    else:
                        clean_title.append(
                            nltk.stem.SnowballStemmer('english').stem(item))
                print(clean_title)

            elif child.tag == "Background":
                abstract = child.text
                for c in sign:
                    abstract = abstract.replace(c, "").lower().strip("\n")
                    sentence = nltk.word_tokenize(abstract, language='english')
                # print(sentence)

                for item in sentence:
                    if item in stop_sign:
                        continue
                        # print(item)
                    else:
                        clean_background.append(
                            nltk.stem.SnowballStemmer('english').stem(item))
                print(clean_background)

        dictionary = dict(zip(keys, [clean_background, clean_title, Doc_id]))
        print(dictionary)
        clean_docs.append(dictionary)
    new_dic = dict(zip(["numFound", "docs"], [265, clean_docs]))
    print(new_dic)
    with open(
            "/home/iialab/TREC2018/training_data_2017/train_proceedings.json",
            "w") as f:
        json.dump(new_dic, f)
        print("finish!")
Example No. 14
    def button_reformat_callback():
        """ what to do when the "Reformat" button is pressed """

        xmlfile = entry.get()
        if xmlfile.rsplit(".")[-1] != "xml":
            statusText.set("Filename must have a .xml extension!")
            message.configure(fg="red")
            return

        IOH_xmlfile = get_IOH_filename(xmlfile)
        copyfile(xmlfile, IOH_xmlfile)
        """ make it pretty """
        parser = etree.XMLParser(resolve_entities=False, strip_cdata=False)
        document = etree.parse(IOH_xmlfile, parser)
        document.write(IOH_xmlfile, pretty_print=True, encoding='utf-8')
        """ identify all the speaker tags """
        q = etree.parse(IOH_xmlfile)
        speaker_tags = q.findall('.//speaker')
        speakers = dict()
        num = 1

        for tag in speaker_tags:
            if tag.text:
                full = tag.text.strip()
                if ' ' not in full:
                    first = full
                else:
                    first, rest = full.split(' ', 1)

                first = first.strip()
                if first not in speakers:
                    speakers[first] = {
                        'number': num,
                        'class': "<span class='oh_speaker_" + str(num) + "'>",
                        'full_name': full
                    }
                    num += 1
        """ examine each cue, identify THE speaker and modify the cue accordingly """
        cue_tags = q.findall('.//cue')
        speakers_found = []

        for tag in cue_tags:
            s = tag.find('speaker')
            if ' ' not in s.text.strip():
                first = s.text.strip()
            else:
                first, rest = s.text.strip().split(' ', 1)
            first = first.strip()
            if first not in speakers_found:
                speakers_found.append(first)
            t = tag.find('transcript')
            if t.text is None:
                statusText.set("Transcript has no text at source line " +
                               str(t.sourceline) + "!")
                message.configure(fg="red")
                return

            text = t.text.replace('\n', ' ').replace('  ', ' ').replace(
                ' :', ':').replace(' |', '|')
            t.text = ''
            try:
                t.text += speakers[first][
                    'class'] + first + ": " + "<span class='oh_speaker_text'>" + text + '</span></span>'
            except KeyError:
                statusText.set("Transcript 'KeyError' at source line " +
                               str(t.sourceline) + "! Please investigate.")
                message.configure(fg="red")
                return

        q.write(IOH_xmlfile)
        entry.delete(0, END)
        entry.insert(0, IOH_xmlfile)

        statusText.set(
            "Speaker reformatting for transcript `{}' is complete.".format(
                IOH_xmlfile))
        message.configure(fg="dark green")
Example No. 15
    def xml(self):
        if '_xml' not in self.__dict__:
            self._xml = etree.fromstring(self.bill_xml.encode('utf-8'),
                                         etree.XMLParser(recover=True))
        return self._xml
Example No. 16
def get_tile_prototypes():
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse('tiles.svg', parser)
    root = tree.getroot()
    tile_defs = root.find('{http://www.w3.org/2000/svg}defs')
    return tile_defs
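A hedged usage sketch for the helper above (it assumes tiles.svg exists and that each prototype element under defs carries an id attribute):

tile_defs = get_tile_prototypes()
for proto in tile_defs:
    print(proto.get('id'))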
Example No. 17
def read_message_name_in_message_file(xml_file: str) -> str:
    xml = BytesIO(xml_file.encode())
    tree = etree.parse(xml, etree.XMLParser())
    node = tree.find('//ns:GrpHdr/ns:MsgId', namespaces=XML_NAMESPACE)
    return node.text
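XML_NAMESPACE is not shown in this excerpt; for an ISO 20022 payment message it presumably maps the ns prefix to the document namespace. A self-contained sketch of the same lookup, with a namespace URI assumed purely for illustration:

from io import BytesIO
from lxml import etree

NS = {'ns': 'urn:iso:std:iso:20022:tech:xsd:pain.001.001.03'}  # assumed URI
sample = (b'<Document xmlns="urn:iso:std:iso:20022:tech:xsd:pain.001.001.03">'
          b'<CstmrCdtTrfInitn><GrpHdr><MsgId>MSG-0001</MsgId></GrpHdr>'
          b'</CstmrCdtTrfInitn></Document>')
tree = etree.parse(BytesIO(sample), etree.XMLParser())
print(tree.find('.//ns:GrpHdr/ns:MsgId', namespaces=NS).text)  # MSG-0001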
Example No. 18
# -*- coding: utf-8 -*-

from lxml import etree as ET
from openpyxl import load_workbook
from operator import itemgetter
import os
import copy
import requests
import datetime

#lxml parser for parsing XML files from strings
parser = ET.XMLParser(remove_blank_text=True)

if os.name == "nt":
    #Windows Directory Names

    #Finding Aid Directory
    faDir = "g:\WebArch"
    #Collection and Subject spreadsheets directory
    spreadDir = "g:\WebArch"
    #parse Collection List spreadsheet
collectionListFile = os.path.join(spreadDir, "collectionList.xlsx")
collectionWorkbook = load_workbook(filename=collectionListFile)
collectionList = collectionWorkbook.active

#Parse List of Collections to list of lists
rowIndex = 0
collections = []
for row in collectionList.rows:
    rowIndex = rowIndex + 1
    if rowIndex > 1:
Example No. 19
def fetch_information(HTML, requrl):
	global evtnamePattern
	global evtdescPattern
	global starttimePattern
	global startdatePattern
	global endtimePattern
	global enddatePattern
	global timePattern
	global locationPattern
	global dateAndTimePattern
	global evtsource
	global datePattern
	global picurlPattern
	global tagsPattern
	global additionalTags
	global specificLocation
	global evtsourceCommunityDict
	global evtsourceYearDict

	currentTime =  datetime.datetime.now()
	currentDate = currentTime.strftime('%Y-%m-%d')
	currentDate = datetime.datetime.strptime(currentDate, '%Y-%m-%d')
	formerDate = currentDate + datetime.timedelta(days=-1)

	parser = etree.XMLParser(recover = True)
	tree = etree.fromstring(HTML, parser)

	evtnameList = []
	evtdescList = []
	starttimeList = []
	startdateList = []
	endtimeList = []
	enddateList = []
	timeList = []
	dateAndTimeList = []
	locationList = []
	dateList = []
	picurlList = []
	tagsList = []

	# raw_input(requrl)
	# print HTML
	# raw_input(123)

	eventCount = len(tree.xpath(evtnamePattern))

	i = 0
	while i < eventCount:
		evtnameList.append("")
		evtdescList.append("")
		starttimeList.append("")
		startdateList.append("")
		endtimeList.append("")
		enddateList.append("")
		timeList.append("")
		dateAndTimeList.append("")
		locationList.append("")
		dateList.append("")
		picurlList.append("")
		tagsList.append([])
		i += 1

	evtnameLxmlItemList = tree.xpath(evtnamePattern)
	evtnameList = []
	for evtnameLxmlItem in evtnameLxmlItemList:
		evtnameList.append(get_text(evtnameLxmlItem))
	
	evtdescLxmlItemList = tree.xpath(evtdescPattern)
	evtdescList = []
	for evtdescLxmlItem in evtdescLxmlItemList:
		evtdescList.append(get_text(evtdescLxmlItem))

	if locationPattern != "":
		locationLxmlItemList = tree.xpath(locationPattern)
		locationList = []
		for locationLxmlItem in locationLxmlItemList:
			locationList.append(get_text(locationLxmlItem))

	if specificLocation != "":
		locationList = []
		i = 0
		while i < eventCount:
			locationList.append(specificLocation)
			i += 1

	if picurlPattern != "":
		picurlLxmlItemList = tree.xpath(picurlPattern)
		picurlList = []
		for picurlLxmlItem in picurlLxmlItemList:
			picurl = get_picurl(picurlLxmlItem)
			if picurl != "" and picurl[0] == "/" and picurl[1] != "/":
				picurl = evtsource + picurl
			elif picurl != "" and picurl[0] == "/" and picurl[1] == "/":
				picurl = picurl[2:]
			picurlList.append(picurl)


	if tagsPattern != "":
		tagsLxmlItemList = tree.xpath(tagsPattern)
		tagsList = []
		for tagLxmlItem in tagsLxmlItemList:
			tags = get_text(tagLxmlItem)
			tags = analyze_tags(tags)
			tagsList.append(tags)
	
	if dateAndTimePattern != "":
		dateAndTimeLxmlItemList = tree.xpath(dateAndTimePattern)
		dateAndTimeList = []
		for dateAndTimeLxmlItem in dateAndTimeLxmlItemList:
			dateAndTime = get_text(dateAndTimeLxmlItem)
			dateAndTimeList.append(dateAndTime)
	
	if datePattern != "":
		dateLxmlItemList = tree.xpath(datePattern)
		dateList = []
		for dateLxmlItem in dateLxmlItemList:
			date = get_text(dateLxmlItem)
			dateList.append(date)

	if timePattern != "":
		timeLxmlItemList = tree.xpath(timePattern)
		for timeLxmlItem in timeLxmlItemList:
			time = get_text(timeLxmlItem)
			timeList.append(time)

	if starttimePattern != "":
		starttimeLxmlItemList = tree.xpath(starttimePattern)
		for starttimeLxmlItem in starttimeLxmlItemList:
			starttime = get_text(starttimeLxmlItem)
			starttimeList.append(starttime)

	if endtimePattern != "":
		endtimeLxmlItemList = tree.xpath(endtimePattern)
		for endtimeLxmlItem in endtimeLxmlItemList:
			endtime = get_text(endtimeLxmlItem)
			endtimeList.append(endtime)

	if startdatePattern != "":
		startdateLxmlItemList = tree.xpath(startdatePattern)
		for startdateLxmlItem in startdateLxmlItemList:
			startdate = get_text(startdateLxmlItem)
			startdateList.append(startdate)

	if enddatePattern != "":
		enddateLxmlItemList = tree.xpath(enddatePattern)
		for enddateLxmlItem in enddateLxmlItemList:
			enddate = get_text(enddateLxmlItem)
			enddateList.append(enddate)

	url = requrl

	#decode as unicode and analyze text
	i = 0
	while i < eventCount:
		evtname = evtnameList[i]
		evtdesc = evtdescList[i]
		location = locationList[i]
		dateAndTime = dateAndTimeList[i]
		date = dateList[i]
		time = timeList[i]
		starttime = starttimeList[i]
		endtime = endtimeList[i]
		startdate = startdateList[i]
		enddate = enddateList[i]
		tags = tagsList[i]
		picurl = picurlList[i]

		evtname = analyze_text(unidecode.unidecode(evtname))
		evtdesc = analyze_text(unidecode.unidecode(evtdesc))
		location = analyze_text(location)
		dateAndTime = analyze_text(dateAndTime)
		date = analyze_text(date)
		time = analyze_text(time)
		starttime = analyze_text(starttime)
		endtime = analyze_text(endtime)
		
		starttime, endtime = analyze_time(dateAndTime, date, time, starttime, endtime, startdate, enddate)

		if evtname == "":
			print "Can't crawl evtname information: ",
			print requrl
			i += 1
			continue

		if starttime == "":
			print "Can't crawl time information: ",
			print requrl
			i += 1
			continue

		if location == "":
			print "Can't crawl location information: ",
			print requrl
			i += 1
			continue
			
		community = evtsourceCommunityDict[evtsource]
		year = evtsourceYearDict[evtsource]
		fetch_data(url, evtname, evtdesc, starttime, endtime, location, community, evtsource, formerDate, tags, additionalTags, picurl, year)
		i += 1
Example No. 20
"""
Note that the data we're dealing with seems to suffer from this tesseract bug:
https://groups.google.com/forum/#!topic/tesseract-ocr/UiyIMUWMzsU
so we're assuming it's actually latin-1 encoded.

"""

import unicodedata

from lxml import etree
from lxml.etree import tostring
from StringIO import StringIO

from hocr_parser.document_parser import document_parser
from hocr_parser.parse_utils import get_words_from_page, get_words_with_lines_from_page, get_annotated_bbox

flexible_parser = etree.XMLParser(encoding='utf-8', recover=True)

file_name = "58-1723645_990_201204"

file_path = "hocr_parser/test_hocr/" + file_name + ".html"
parser = document_parser(file_path, encoding='latin-1')

page_num = 0
while True:
    this_page = parser.read_page()
    if not this_page:
        break
    page_num += 1
    print "Processing page %s" % page_num
    outfile = "../display/hocr_pages/" + file_name + "p" + str(
        page_num) + ".html"
Example No. 21
    def identify(  # {{{
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,
            identifiers={},
            timeout=30):
        from lxml import etree
        entry = XPath('//atom:entry')

        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        log('Making query:', query)
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)

        try:
            feed = etree.fromstring(
                xml_to_unicode(clean_ascii_chars(raw),
                               strip_encoding_pats=True)[0],
                parser=etree.XMLParser(recover=True,
                                       no_network=True,
                                       resolve_entities=False))
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)

        if not entries and title and not abort.is_set():
            if identifiers:
                log('No results found, retrying without identifiers')
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=title,
                                     authors=authors,
                                     timeout=timeout)
            ntitle = cleanup_title(title)
            if ntitle and ntitle != title:
                log('No results found, retrying without sub-title')
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=ntitle,
                                     authors=authors,
                                     timeout=timeout)

        # There is no point running these queries in threads as google
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)
Example No. 22
    def __init__(self, dataset, root=None, force_reparse=False):
        """
        Given a IATI dataset, prepare an IATI parser
        """

        if settings.IATI_PARSER_DISABLED:
            raise ParserDisabledError(
                "The parser is disabled on this instance of OIPA")

        self.dataset = dataset
        self.url = dataset.source_url
        self.force_reparse = force_reparse
        self.hash_changed = True
        self.valid_dataset = True

        if root is not None:
            self.root = root
            self.parser = self._prepare_parser(self.root, dataset)
            return

        response = None
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X '
            '10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }  # NOQA: E501

        # This was rolled back to how it was before adding dataset failed
        # pickups in commit e5d0ba6454b180d8b4e27820184007fdacb6c3b4
        try:
            response = requests.get(self.url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.SSLError:
            try:
                response = requests.get(self.url, verify=False, timeout=30)
            except requests.exceptions.SSLError:
                pass
        except requests.exceptions.Timeout:
            try:
                response = requests.get(self.url, verify=False, timeout=30)
            except requests.exceptions.Timeout:
                pass
        except requests.exceptions.RequestException:
            pass
        # We do not add a generic exception, because that would mean that
        # an internal datastore error would show up in the API.
        finally:
            pass

        if not response or response.status_code != 200:
            self.valid_dataset = False
            note = DatasetNote(dataset=self.dataset,
                               iati_identifier="n/a",
                               model="n/a",
                               field="n/a",
                               message="Cannot access the URL",
                               exception_type='UrlError',
                               line_number=None)
            note.save()
            self.dataset.note_count = 1

            # If it's not an XML file, the sha1 should be blank
            self.dataset.sha1 = ''

            self.dataset.save()
            return

        # 1. Turn bytestring into string (treat it using specified encoding):
        try:
            iati_file = smart_text(response.content, 'utf-8')
        # XXX: some files contain non utf-8 characters:
        # FIXME: this is hardcoded:
        except UnicodeDecodeError:
            iati_file = smart_text(response.content, 'latin-1')

        # 2. Encode the string to use for hashing:
        hasher = hashlib.sha1()
        hasher.update(iati_file.encode('utf-8'))
        sha1 = hasher.hexdigest()

        if dataset.sha1 == sha1:
            # dataset did not change, no need to reparse normally
            self.hash_changed = False
        else:
            dataset.sha1 = sha1

            # Save a sha1 in the first time of the process parse
            dataset.save()

        try:
            parser = etree.XMLParser(huge_tree=True)
            tree = etree.parse(BytesIO(response.content), parser)
            self.root = tree.getroot()
            self.parser = self._prepare_parser(self.root, dataset)

            if settings.ERROR_LOGS_ENABLED:
                self.xsd_validate()

        # TODO: when moving error messages to frontend, create a separate error
        # for wrong file type:
        except etree.XMLSyntaxError as e:
            self.valid_dataset = False
            DatasetNote.objects.filter(dataset=self.dataset).delete()
            note = DatasetNote(
                dataset=self.dataset,
                iati_identifier="n/a",
                model="n/a",
                field="n/a",
                message="This file contains XML syntax errors or it's not an "
                "XML file",
                exception_type='XMLSyntaxError',
                line_number=None)
            note.save()
            self.dataset.note_count = 1

            # If the XML is invalid, it should not have a sha1
            self.dataset.sha1 = ''

            self.dataset.save()
            return
Example No. 23
class ParseRssNews(object):
    # NOTE: the original class also initialises numeroElementos, numeroImagenes,
    # termino and encontrado; those declarations are not part of this excerpt.
    def __init__(self):
        print('---- Principio del archivo')

    def start(self, tag, attrib):
        if tag == "item":
            self.numeroElementos += 1
        if tag == "enclosure":
            if attrib["type"] == "image/jpeg":
                self.numeroImagenes += 1
                print attrib["url"]
                #urllib.urlretrieve(attrib["url"], "imagenesDescargadas/" + str(self.numeroImagenes) + ".jpg")
    def data(self, data):
        if len(sys.argv) > 1:
            encuentra = re.compile(r'\s' + self.termino + r'\s')
            if encuentra.search(data) != None:
                self.encontrado = True

    def close(self):
        print('---- Fin del archivo')
        print "Numero de elementos: " + str(self.numeroElementos)
        print "Numero de imagenes: " + str(self.numeroImagenes)

        if len(sys.argv) > 1:
            if self.encontrado:
                print 'El termino ' + self.termino + ' esta.'
            else:
                print 'El termino ' + self.termino + ' no esta.'


parser = etree.XMLParser(target=ParseRssNews())
etree.parse('portada.xml', parser)
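With a target parser like the one above, etree.parse feeds parser events to the target's start/data/close methods and returns whatever close() returns. A minimal self-contained sketch of the same mechanism (the element names are made up):

from lxml import etree

class CountItems(object):
    def __init__(self):
        self.count = 0
    def start(self, tag, attrib):
        if tag == 'item':
            self.count += 1
    def end(self, tag):
        pass
    def data(self, text):
        pass
    def close(self):
        return self.count

result = etree.fromstring('<rss><item/><item/></rss>',
                          etree.XMLParser(target=CountItems()))
print(result)  # 2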
Example No. 24
def read_xml_file(path, log=None):
    """
    Read an xml file and return the root node as for lxml.etree
    """
    def remove_utf8_from_xml(fileContent):
        """
        Removes the header from the file content.
        
    <?xml version="1.0" encoding="UTF-8"?>
        """
        indexStart = fileContent.find('<?xml')
        if indexStart < 0:
            return fileContent

        indexStart = fileContent.find('<', indexStart + 2)
        if indexStart < 0:
            return fileContent

        return fileContent[indexStart:]

    def remove_xmlns_from_xml(fileContent):
        """
        Removes the "xmlns=" part from file content because lxml api supports this part only by specifying exactly
        its value whenever we want to access a part of xml content, and its value can change between xml files.
        
    <web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://java.sun.com/xml/ns/javaee" xmlns:web="http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd" xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd" id="WebApp_ID" version="2.5">
    </web-app>
        """
        if not 'xmlns=' in fileContent:
            return fileContent

        indexStart = fileContent.find('xmlns=')
        indexValueStart = fileContent.find('"', indexStart)
        if indexValueStart < 0:
            return fileContent
        indexValueEnd = fileContent.find('"', indexValueStart + 1)
        if indexValueEnd < 0:
            return fileContent

        return fileContent.replace(fileContent[indexStart:indexValueEnd + 1],
                                   '')

    '''
    def get_root(text):
    
    
        class LineNumberingParser(ET.XMLParser):
            def _start(self, *args, **kwargs):
                # Here we assume the default XML parser which is expat
                # and copy its element position attributes into output Elements
                element = super(self.__class__, self)._start(*args, **kwargs)
                element.start_line_number = self.parser.CurrentLineNumber
                element.start_column_number = self.parser.CurrentColumnNumber
                element._start_byte_index = self.parser.CurrentByteIndex
                return element
        
            def _end(self, *args, **kwargs):
                element = super(self.__class__, self)._end(*args, **kwargs)
                element.end_line_number = self.parser.CurrentLineNumber
                element.end_column_number = self.parser.CurrentColumnNumber
                element._end_byte_index = self.parser.CurrentByteIndex
                return element    
        
        return ET.fromstring(text, LineNumberingParser())    
    '''

    with open_source_file(path) as f:
        file_content = f.read()
        file_content = remove_utf8_from_xml(file_content)
        file_content = remove_xmlns_from_xml(file_content)

        parser = ET.XMLParser(recover=True)
        return ET.fromstring(file_content, parser)
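
# A small sketch of why remove_xmlns_from_xml() helps (the namespace URI is just
# an example): with a default namespace, bare tag names do not match in lxml, so
# every lookup would need the exact URI; once xmlns= is stripped, plain names work.
from lxml import etree

with_ns = b'<web-app xmlns="http://java.sun.com/xml/ns/javaee"><servlet/></web-app>'
root = etree.fromstring(with_ns)
print(root.find('servlet'))                                       # None
print(root.find('{http://java.sun.com/xml/ns/javaee}servlet'))    # <Element ...>

without_ns = b'<web-app><servlet/></web-app>'   # shape produced after stripping xmlns=
print(etree.fromstring(without_ns).find('servlet'))               # <Element servlet ...>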
Ejemplo n.º 25
0
def update_ikfast_package(args):
    # Copy the source code generated by IKFast into our src folder
    src_path = args.ikfast_plugin_pkg_path + '/src/'
    solver_file_path = src_path + args.robot_name + '_' + args.planning_group_name + '_ikfast_solver.cpp'
    if not os.path.exists(solver_file_path) or not os.path.samefile(
            args.ikfast_output_path, solver_file_path):
        shutil.copy2(args.ikfast_output_path, solver_file_path)

    if not os.path.exists(solver_file_path):
        raise Exception(
            "Failed to copy IKFast source code from '%s' to '%s'\n"
            "Manually copy the source file generated by IKFast to this location and re-run"
            % (args.ikfast_output_path, solver_file_path))
    # Remember ikfast solver file for update of MoveIt package
    args.ikfast_output_path = solver_file_path

    # Get template folder location
    template_dir = find_template_dir()

    # namespace for the plugin
    setattr(args, 'namespace',
            args.robot_name + "_" + args.planning_group_name)
    replacements = dict(_ROBOT_NAME_=args.robot_name,
                        _GROUP_NAME_=args.planning_group_name,
                        _SEARCH_MODE_=args.search_mode,
                        _EEF_LINK_=args.eef_link_name,
                        _BASE_LINK_=args.base_link_name,
                        _PACKAGE_NAME_=args.ikfast_plugin_pkg,
                        _NAMESPACE_=args.namespace)

    # Copy ikfast header file
    copy_file(template_dir + '/ikfast.h',
              args.ikfast_plugin_pkg_path + "/include/ikfast.h",
              "ikfast header file")
    # Create ikfast plugin template
    copy_file(
        template_dir + '/ikfast' + str(args.template_version) +
        '_moveit_plugin_template.cpp',
        args.ikfast_plugin_pkg_path + "/src/" + args.robot_name + '_' +
        args.planning_group_name + "_ikfast_moveit_plugin.cpp",
        "ikfast plugin file", replacements)

    # Create plugin definition .xml file
    ik_library_name = args.namespace + "_moveit_ikfast_plugin"
    plugin_def = etree.Element("library", path="lib/lib" + ik_library_name)
    setattr(args, 'plugin_name', args.namespace + '/IKFastKinematicsPlugin')
    cl = etree.SubElement(plugin_def,
                          "class",
                          name=args.plugin_name,
                          type=args.namespace + "::IKFastKinematicsPlugin",
                          base_class_type="kinematics::KinematicsBase")
    desc = etree.SubElement(cl, "description")
    desc.text = 'IKFast{template} plugin for closed-form kinematics of {robot} {group}' \
        .format(template=args.template_version, robot=args.robot_name, group=args.planning_group_name)

    # Write plugin definition to file
    plugin_file_name = ik_library_name + "_description.xml"
    plugin_file_path = args.ikfast_plugin_pkg_path + "/" + plugin_file_name
    # Open in binary mode because lxml writes encoded bytes when an encoding is given
    with open(plugin_file_path, 'wb') as f:
        etree.ElementTree(plugin_def).write(f,
                                            xml_declaration=True,
                                            pretty_print=True,
                                            encoding="UTF-8")
    print("Created plugin definition at '%s'" % plugin_file_path)

    # Create CMakeLists file
    replacements.update(dict(_LIBRARY_NAME_=ik_library_name))
    copy_file(template_dir + "/CMakeLists.txt",
              args.ikfast_plugin_pkg_path + '/CMakeLists.txt', "cmake file",
              replacements)

    # Add plugin export to package manifest
    parser = etree.XMLParser(remove_blank_text=True)
    package_file_name = args.ikfast_plugin_pkg_path + "/package.xml"
    package_xml = etree.parse(package_file_name, parser).getroot()

    # Make sure at least all required dependencies are in the depends lists
    build_deps = [
        "liblapack-dev", "moveit_core", "pluginlib", "rclcpp", "tf2_kdl",
        "tf2_eigen"
    ]
    run_deps = ["liblapack-dev", "moveit_core", "pluginlib", "rclcpp"]

    update_deps(build_deps, "build_depend", package_xml)
    update_deps(run_deps, "exec_depend", package_xml)

    # Check that plugin definition file is in the export list
    new_export = etree.Element("moveit_core",
                               plugin="${prefix}/" + plugin_file_name)

    export_element = package_xml.find("export")
    if export_element is None:
        export_element = etree.SubElement(package_xml, "export")

    found = False
    for el in export_element.findall("moveit_core"):
        found = (etree.tostring(new_export) == etree.tostring(el))
        if found:
            break

    if not found:
        export_element.append(new_export)

    # Always write the package xml file, even if there are no changes, to ensure
    # proper encodings are used in the future (UTF-8)
    # Open in binary mode because lxml writes encoded bytes when an encoding is given
    with open(package_file_name, 'wb') as f:
        etree.ElementTree(package_xml).write(f,
                                             xml_declaration=True,
                                             pretty_print=True,
                                             encoding="UTF-8")
    print("Wrote package.xml at '%s'" % package_file_name)

    # Create a script to easily regenerate the plugin later if it needs to be updated
    easy_script_file_path = args.ikfast_plugin_pkg_path + "/update_ikfast_plugin.sh"
    with open(easy_script_file_path, 'w') as f:
        f.write("search_mode=" + args.search_mode + "\n" + "srdf_filename=" +
                args.srdf_filename + "\n" + "robot_name_in_srdf=" +
                args.robot_name_in_srdf + "\n" + "moveit_config_pkg=" +
                args.moveit_config_pkg + "\n" + "robot_name=" +
                args.robot_name + "\n" + "planning_group_name=" +
                args.planning_group_name + "\n" + "ikfast_plugin_pkg=" +
                args.ikfast_plugin_pkg + "\n" + "base_link_name=" +
                args.base_link_name + "\n" + "eef_link_name=" +
                args.eef_link_name + "\n" + "ikfast_output_path=" +
                args.ikfast_output_path + "\n\n" +
                "rosrun moveit_kinematics create_ikfast_moveit_plugin.py\\\n" +
                "  --search_mode=$search_mode\\\n" +
                "  --srdf_filename=$srdf_filename\\\n" +
                "  --robot_name_in_srdf=$robot_name_in_srdf\\\n" +
                "  --moveit_config_pkg=$moveit_config_pkg\\\n" +
                "  $robot_name\\\n" + "  $planning_group_name\\\n" +
                "  $ikfast_plugin_pkg\\\n" + "  $base_link_name\\\n" +
                "  $eef_link_name\\\n" + "  $ikfast_output_path\n")

    print("Created update plugin script at '%s'" % easy_script_file_path)
Ejemplo n.º 26
0
    def fields_view_get(cls, view_id=None, view_type='form'):
        '''
        Return a view definition.
        If view_id is None, the first view of the requested view_type is used.
        The definition is a dictionary with keys:
           - model: the model name
           - type: the type of the view
           - view_id: the id of the view
           - arch: the xml description of the view
           - fields: a dictionary with the definition of each field in the view
           - field_childs: the name of the field holding the children for tree views
        '''
        key = (cls.__name__, view_id, view_type)
        result = cls._fields_view_get_cache.get(key)
        if result:
            return result
        result = {'model': cls.__name__}
        pool = Pool()
        View = pool.get('ir.ui.view')

        view = None
        inherit_view_id = None
        if view_id:
            view = View(view_id)
        else:
            domain = [
                ('model', '=', cls.__name__),
                ('type', '=', view_type),
                [
                    'OR',
                    ('inherit', '=', None),
                    ('inherit.model', '!=', cls.__name__),
                ],
            ]
            views = View.search(domain)
            if views:
                view = views[0]
        if view:
            if view.inherit:
                inherit_view_id = view.id
                view = view.inherit
            view_id = view.id

        # if a view was found
        if view:
            result['type'] = view.type
            result['view_id'] = view_id
            result['arch'] = view.arch
            result['field_childs'] = view.field_childs

            # Check if view is not from an inherited model
            if view.model != cls.__name__:
                Inherit = pool.get(view.model)
                result['arch'] = Inherit.fields_view_get(
                    result['view_id'])['arch']
                view_id = inherit_view_id

            # get all views which inherit from (i.e. modify) this view
            views = View.search([
                'OR',
                [
                    ('inherit', '=', view_id),
                    ('model', '=', cls.__name__),
                ],
                [
                    ('id', '=', view_id),
                    ('inherit', '!=', None),
                ],
            ])
            raise_p = False
            while True:
                try:
                    views.sort(key=lambda x: cls._modules_list.index(x.module
                                                                     or None))
                    break
                except ValueError:
                    if raise_p:
                        raise
                    # There is perhaps a new module in the directory
                    ModelView._reset_modules_list()
                    raise_p = True
            for view in views:
                if view.domain:
                    if not PYSONDecoder({
                            'context': Transaction().context
                    }).decode(view.domain):
                        continue
                if not view.arch or not view.arch.strip():
                    continue
                result['arch'] = _inherit_apply(result['arch'], view.arch)

        # otherwise, build some kind of default view
        else:
            if view_type == 'form':
                res = cls.fields_get()
                xml = '''<?xml version="1.0"?>''' \
                    '''<form string="%s" col="4">''' % (cls.__doc__,)
                for i in res:
                    if i in ('create_uid', 'create_date', 'write_uid',
                             'write_date', 'id', 'rec_name'):
                        continue
                    if res[i]['type'] not in ('one2many', 'many2many'):
                        xml += '<label name="%s"/>' % (i, )
                        xml += '<field name="%s"/>' % (i, )
                        if res[i]['type'] == 'text':
                            xml += "<newline/>"
                    else:
                        xml += '<field name="%s" colspan="4"/>' % (i, )
                xml += "</form>"
            elif view_type == 'tree':
                field = 'id'
                if cls._rec_name in cls._fields:
                    field = cls._rec_name
                xml = '''<?xml version="1.0"?>''' \
                    '''<tree string="%s"><field name="%s"/></tree>''' \
                    % (cls.__doc__, field)
            else:
                xml = ''
            result['type'] = view_type
            result['arch'] = xml
            result['field_childs'] = None
            result['view_id'] = 0

        # Update arch and compute fields from arch
        parser = etree.XMLParser(remove_blank_text=True)
        tree = etree.fromstring(result['arch'], parser)
        xarch, xfields = cls._view_look_dom_arch(tree, result['type'],
                                                 result['field_childs'])
        result['arch'] = xarch
        result['fields'] = xfields

        cls._fields_view_get_cache.set(key, result)
        return result
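
# Illustration only (toy field definitions, not the Tryton API): how the default
# form arch above is assembled when no view record exists for the model.
fields = {
    'name':  {'type': 'char'},
    'notes': {'type': 'text'},
    'lines': {'type': 'one2many'},
}
arch = '<?xml version="1.0"?><form string="Party" col="4">'
for name, definition in fields.items():
    if definition['type'] not in ('one2many', 'many2many'):
        arch += '<label name="%s"/>' % (name,)
        arch += '<field name="%s"/>' % (name,)
        if definition['type'] == 'text':
            arch += '<newline/>'
    else:
        arch += '<field name="%s" colspan="4"/>' % (name,)
arch += '</form>'
print(arch)  # a label/field pair per scalar field, colspan=4 for x2many fields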
Ejemplo n.º 27
0
namespace = current_app.config["XML_NAMESPACE"]
api_endpoint = current_app.config["MTD_API_ENDPOINT"]

NOMENCLATURE_MAPPING = {
    "id_nomenclature_data_type": "DATA_TYP",
    "id_nomenclature_dataset_objectif": "JDD_OBJECTIFS",
    "id_nomenclature_data_origin": "DS_PUBLIQUE",
    "id_nomenclature_source_status": "STATUT_SOURCE",
}

# get the root logger
log = logging.getLogger()
gunicorn_error_logger = logging.getLogger("gunicorn.error")

xml_parser = ET.XMLParser(ns_clean=True, recover=True, encoding="utf-8")


def get_acquisition_framework(uuid_af):
    """
        Fetch an AF from the MTD WS using the uuid of the AF

        Parameters:
            - uuid_af (str): the uuid of the AF
        Returns:
            bytes: the xml of the AF as bytes
    """
    url = "{}/cadre/export/xml/GetRecordById?id={}"
    try:
        r = utilsrequests.get(url.format(api_endpoint, uuid_af))
    except AssertionError:
Ejemplo n.º 28
0
    def text_blob(self):
        xml_parser = et.XMLParser(recover=True)
        tr_tree = et.fromstring('<xml>' + self.transcription + '</xml>',
                                xml_parser)
        return et.tostring(tr_tree, encoding='utf8',
                           method='text').decode('utf-8')
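
# A standalone sketch of the same text extraction (the transcription fragment is
# invented): recover=True tolerates imperfect markup inside the wrapper element,
# and method='text' keeps only the text nodes.
from lxml import etree as et

transcription = 'line one<lb/>line <hi rend="italic">two</hi>'
tree = et.fromstring('<xml>' + transcription + '</xml>', et.XMLParser(recover=True))
print(et.tostring(tree, encoding='utf8', method='text').decode('utf-8'))
# line oneline two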
Ejemplo n.º 29
0
from xmodule.modulestore.xml_exporter import DEFAULT_CONTENT_FIELDS
from xmodule.tabs import CourseTabList
from xmodule.modulestore.keys import UsageKey
from xmodule.modulestore.locations import SlashSeparatedCourseKey

from xblock.field_data import DictFieldData
from xblock.runtime import DictKeyValueStore, IdGenerator

from . import ModuleStoreReadBase, Location, XML_MODULESTORE_TYPE

from .exceptions import ItemNotFoundError
from .inheritance import compute_inherited_metadata, inheriting_field_data

from xblock.fields import ScopeIds, Reference, ReferenceList, ReferenceValueDict

edx_xml_parser = etree.XMLParser(dtd_validation=False, load_dtd=False,
                                 remove_comments=True, remove_blank_text=True)

etree.set_default_parser(edx_xml_parser)

log = logging.getLogger(__name__)


# VS[compat]
# TODO (cpennington): Remove this once all fall 2012 courses have been imported
# into the cms from xml
def clean_out_mako_templating(xml_string):
    xml_string = xml_string.replace('%include', 'include')
    xml_string = re.sub(r"(?m)^\s*%.*$", '', xml_string)
    return xml_string
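
# Quick illustration of the helper above: '%include' is rewritten first, then any
# remaining line that starts with '%' (Mako control flow) is blanked out.
raw = '%include "header.xml"\n<course>\n% if staff:\n<chapter/>\n</course>'
print(clean_out_mako_templating(raw))
# include "header.xml"
# <course>
#
# <chapter/>
# </course>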

Ejemplo n.º 30
0
def remove_whitespace_from_xml(xmlstr):
    parser = etree.XMLParser(remove_blank_text=True)
    elem = etree.XML(xmlstr, parser=parser)
    return etree.tostring(elem)
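
# Quick check of the helper above (assumes `from lxml import etree` is in scope,
# as in the function): whitespace-only text between elements is dropped, while
# text inside elements is kept.
xmlstr = "<root>\n    <item>  a  </item>\n</root>"
print(remove_whitespace_from_xml(xmlstr))
# b'<root><item>  a  </item></root>'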