Example #1
def ensure_elementtree_imported(verbosity, logfile):
    global ET, ET_has_iterparse
    if ET is not None:
        return
    if "IronPython" in sys.version:
        import xml.etree.ElementTree as ET
        #### 2.7.2.1: fails later with
        #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923)
    else:
        try: import xml.etree.cElementTree as ET
        except ImportError:
            try: import cElementTree as ET
            except ImportError:
                try: import lxml.etree as ET
                except ImportError:
                    try: import xml.etree.ElementTree as ET
                    except ImportError:
                        try: import elementtree.ElementTree as ET
                        except ImportError:
                            raise Exception("Failed to import an ElementTree implementation")
    if hasattr(ET, 'iterparse'):
        _dummy_stream = BYTES_IO(b'')
        try:
            ET.iterparse(_dummy_stream)
            ET_has_iterparse = True
        except NotImplementedError:
            pass
    if verbosity:
        etree_version = repr([
            (item, getattr(ET, item))
            for item in ET.__dict__.keys()
            if item.lower().replace('_', '') == 'version'
            ])
        print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile)
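The helper above fills in module-level globals that are assumed to exist elsewhere in the module; a minimal sketch of that surrounding state (the BYTES_IO alias and default values are assumptions):
import sys
from io import BytesIO as BYTES_IO  # assumed alias used by the dummy-stream probe

ET = None                 # populated lazily by ensure_elementtree_imported()
ET_has_iterparse = False  # flipped to True once iterparse() is known to work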
Example #2
def process_map(filename):
    st = set()
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":
            if element.attrib['k']=="cuisine":
                if element.attrib['v'].lower() in ('coffe_shop','coffee_shop'):
                    element.attrib['v']='coffee'
                if element.attrib['v'].lower() in ('steak_house','steaks'):
                    element.attrib['v']='steak' 
                if element.attrib['v'].lower() in ('mexican','mexcian_food'):
                    element.attrib['v']='mexican'                     
                st.add(element.attrib['v'].lower())
    dct={}
    for item in st:
        dct[item]=0
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":
            if element.attrib['k']=="cuisine":
                if element.attrib['v'].lower() in ('coffe_shop','coffee_shop'):
                    element.attrib['v']='coffee'
                if element.attrib['v'].lower() in ('steak_house','steaks'):
                    element.attrib['v']='steak'   
                if element.attrib['v'].lower() in ('mexican','mexcian_food'):
                    element.attrib['v']='mexican'                 
                dct[element.attrib['v'].lower()]=dct[element.attrib['v'].lower()]+1
    dctFinal={}
    for key in dct:
        if ";" not in key and "," not in key:
            dctFinal[key]=dct[key]
    
    return dctFinal
Example #3
 def _read (self):
     try:
         i = ET.iterparse(self.f, ('start', 'end'))
     except FileNotFoundError:
         self._create()
         i = ET.iterparse(self.f, ('start', 'end'))
     return i
Example #4
  def _GetSheetRows(self, filename):
    """Parses the contents of the first sheet of an XLSX document.

    Args:
      filename (str): The file path of the XLSX document to parse.

    Returns:
      list[list[str]]: A list of lists representing the rows of the first sheet.

    Raises:
      ValueError: if the sheet cannot be found, or a string cannot be read.
    """
    zip_file = zipfile.ZipFile(filename)

    # Fail if we can't find the expected first sheet.
    if self._SHEET1 not in zip_file.namelist():
      raise ValueError(
          'Unable to locate expected sheet: {0:s}'.format(self._SHEET1))

    # Generate a reference table of shared strings if available.
    strings = []
    if self._SHARED_STRINGS in zip_file.namelist():
      zip_file_object = zip_file.open(self._SHARED_STRINGS)
      for _, element in ElementTree.iterparse(zip_file_object):
        if element.tag.endswith(self._SHARED_STRING_TAG):
          strings.append(element.text)

    row = []
    rows = []
    value = ''
    zip_file_object = zip_file.open(self._SHEET1)
    for _, element in ElementTree.iterparse(zip_file_object):
      if (element.tag.endswith(self._VALUE_STRING_TAG) or
          element.tag.endswith(self._SHARED_STRING_TAG)):
        value = element.text

      if element.tag.endswith(self._COLUMN_TAG):
        # Grab value from shared string reference table if type shared string.
        if (strings and element.attrib.get(self._TYPE_ATTRIBUTE) ==
            self._SHARED_STRING_TYPE):
          try:
            value = strings[int(value)]
          except (IndexError, ValueError):
            raise ValueError(
                'Unable to successfully dereference shared string.')

        row.append(value)

      # If we see the end tag of the row, record row in rows and reset.
      if element.tag.endswith(self._ROW_TAG):
        rows.append(row)
        row = []

    return rows
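The class constants referenced in _GetSheetRows are defined elsewhere; plausible class-level values for a standard XLSX package layout (all of the following are assumptions):
  _SHEET1 = 'xl/worksheets/sheet1.xml'
  _SHARED_STRINGS = 'xl/sharedStrings.xml'
  _COLUMN_TAG = '}c'            # a cell
  _ROW_TAG = '}row'             # a row
  _SHARED_STRING_TAG = '}t'     # text of a shared string
  _VALUE_STRING_TAG = '}v'      # inline cell value
  _TYPE_ATTRIBUTE = 't'         # cell-type attribute
  _SHARED_STRING_TYPE = 's'     # cell type meaning "shared string reference"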
Example #5
def process(fn, options):
    if options.output_dir == '-':
        outdir = None    # use STDOUT
    else:
        outdir = make_output_directory(fn, options)

    if not fn.endswith('.gz'):
        process_stream(ET.iterparse(fn), fn, outdir, options)
    else:
        with gzip.GzipFile(fn) as stream:
            process_stream(ET.iterparse(stream), fn, outdir, options)
Example #6
def process_map(filename):
    st = set()
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":
            if element.attrib['k']=="power":
                st.add(element.attrib['v'])
    dct={}
    for item in st:
        dct[item]=0
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":
            if element.attrib['k']=="power":
                dct[element.attrib['v']]=dct[element.attrib['v']]+1
    return dct
Example #7
    def getDataFromExternal(self, date, progress_callback=None):
        if self.xmltvType == XMLTVSource.TYPE_LOCAL_FILE:
            f = FileWrapper(self.xmltvFile)
            context = ElementTree.iterparse(f, events=("start", "end"))
            size = f.size
        else:
            u = urllib2.urlopen(self.xmltvUrl, timeout=30)
            xml = u.read()
            u.close()

            f = StringIO.StringIO(xml)
            context = ElementTree.iterparse(f)
            size = len(xml)

        return self.parseXMLTV(context, f, size, self.logoFolder, progress_callback)
Example #8
def process_map(filename):
    users = set() #create empty set, stores each value once
    for event, element in ET.iterparse(filename): #looping through elements
        if 'uid' in element.attrib: #we search for UID attribute
            users.add(element.attrib['uid']) # we add to set the UID, 

    return users
Example #9
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if 'uid' in element.attrib and element.attrib['uid'] not in users:
            users.add(element.attrib['uid'])

    return users
Example #10
def xml_hunt(xml_file):
    """
    Gets list of all XML entries with "filename" attribute,
    and returns a dictionary of the file attributes keyed
    by a ":"-joined string of parent names.

    """
    root = ET.iterparse(xml_file, events=("start", "end"))
    parents = []
    matches = {}
    for event, element in root:
        if element.tag not in ["folder", "file"]:  # skip topmost categories
            continue
        if element.tag == "folder":
            if event == "start":  # add to parents
                parents.append(element.attrib["name"])
            elif event == "end":  # strip from parents
                del parents[-1]
            continue
        if event == "start" and element.tag == "file":
            parent_string = ":".join(parents)
            try:
                matches[parent_string].append(element.attrib)
            except KeyError:
                matches[parent_string] = [element.attrib]
    return matches
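A quick usage sketch for xml_hunt; the sample document and expected output are illustrative only:
import io
import xml.etree.ElementTree as ET

sample = io.BytesIO(b"""
<root>
  <folder name="top">
    <folder name="sub">
      <file filename="a.txt"/>
    </folder>
    <file filename="b.txt"/>
  </folder>
</root>
""")

print(xml_hunt(sample))
# {'top:sub': [{'filename': 'a.txt'}], 'top': [{'filename': 'b.txt'}]}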
Example #11
def reports(request):
    for element in ET.iterparse(request):
        # do stuff to parse this element
        # save a models.Report
        pass
    # return list of file hashes we need uploaded
    return HttpResponse()
Example #12
def read_xml_input(inputfile,outputfile):
	
	output = []

	tree = ET.iterparse(inputfile)
	
	for event, elem in tree:
		if event == "end" and elem.tag == "Sentence":
			story = elem

			# Check to make sure all the proper XML attributes are included
			attribute_check = all(key in story.attrib for key in ['date', 'id', 'sentence', 'source'])
			if not attribute_check:
				print('Need to properly format your XML...')
				break

			
			text = story.find('Text').text
			text = text.replace('\n', ' ').replace('  ', ' ').strip()

			output.append(text+"\n")
		

			elem.clear()

	ofile = open(outputfile,'w')
	for line in output:
		ofile.write(line.encode('utf8'))
	ofile.close()
Example #13
def main():

    parser = argparse.ArgumentParser(description='Get file paths for input/output')
    parser.add_argument('--in', dest='input_file', required=True)
    parser.add_argument('--out', dest='output_file')
    args = parser.parse_args()

    input_file = args.input_file
    if not args.output_file:
        output_file = input_file[:input_file.rfind('.')] + ".json"
    else:
        output_file = args.output_file

    # convert XML to json
    with open(output_file, 'a') as f:
        # parse input XML file
        for event, elem in Et.iterparse(input_file):
            if elem.tag == "row" and '_uuid' in elem.attrib:
                this_dict = dict()
                # Add _address
                this_dict['source_uri'] = elem.attrib['_address']
                for child in elem.findall('*'):
                    if child.tag == "location_1":
                        if 'latitude' in child.attrib:
                            this_dict['latitude'] = child.attrib['latitude']
                        if 'longitude' in child.attrib:
                            this_dict['longitude'] = child.attrib['longitude']
                    else:
                        this_dict[child.tag] = child.text

                # Write record to file as JSON
                json.dump(this_dict, f)
                print('', file=f)

                # this helps reduce mem usage but more can be done (see http://effbot.org/zone/element-iterparse.htm)
                elem.clear()
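The clearing strategy hinted at in the last comment can be pushed further by keeping a reference to the root element, as the linked effbot article describes; a minimal sketch (the file name and the "row" tag are placeholders):
import xml.etree.ElementTree as Et

context = Et.iterparse("input.xml", events=("start", "end"))
_, root = next(iter(context))            # first event is the start of the root
for event, elem in context:
    if event == "end" and elem.tag == "row":
        # ... process elem here ...
        root.clear()                     # also drops already-processed siblings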
Example #14
def count_tags(filename):
    tags={}
    for evt,elem in ET.iterparse(filename):
        if elem.tag in tags:
            tags[elem.tag]=tags[elem.tag]+1
        else:
            tags[elem.tag]=1
    return tags
Example #15
def parse_repomd_xml(repomd_file_path, skip_data_types=None):
    skip_data_types = skip_data_types or []

    if not os.access(repomd_file_path, os.F_OK | os.R_OK):
        return {}

    xml_parser = ElementTree.iterparse(repomd_file_path, events=("end",))
    xml_iterator = iter(xml_parser)

    data_type_dict = {}

    for event, element in xml_iterator:
        if element.tag != _DATA_TAG:
            continue

        if element.attrib["type"] in skip_data_types:
            continue

        data_type = copy.deepcopy(_DATA_SKEL)

        data_type["data_type"] = element.attrib["type"]

        location_element = element.find(_LOCATION_TAG)
        if location_element is not None:
            data_type["relative_path"] = location_element.attrib["href"]

        checksum_element = element.find(_CHECKSUM_TAG)
        if checksum_element is not None:
            data_type["checksum_type"] = checksum_element.attrib["type"]
            data_type["checksum"] = checksum_element.text

        data_type_dict[data_type["data_type"]] = data_type

    return data_type_dict
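The module constants used by parse_repomd_xml are not shown; plausible definitions in Clark notation (the namespace URI and the skeleton fields are assumptions):
_REPO_NS = "{http://linux.duke.edu/metadata/repo}"
_DATA_TAG = _REPO_NS + "data"
_LOCATION_TAG = _REPO_NS + "location"
_CHECKSUM_TAG = _REPO_NS + "checksum"
_DATA_SKEL = {
    "data_type": None,
    "relative_path": None,
    "checksum_type": None,
    "checksum": None,
}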
Example #16
    def getDataFromExternal(self, date, progress_callback=None):

        f = FileWrapper(self.xmltvFile)
        context = ElementTree.iterparse(f, events=("start", "end"))
        size = f.size

        return self.parseXMLTV(context, f, size, self.logoFolder, progress_callback)
Example #17
 def __iter__(self):
   if self.is_debug:
     fname = self.protxml + '.dump'
     logging.debug('Dumping protxml reads into ' + fname)
     self.debug_file = open(fname, 'w')
     self.debug_file.write('{\n')
   for event, elem in etree.iterparse(self.protxml, events=('end', 'start-ns')):
     if event == 'start-ns':
       self.nsmap.update({elem})
     if event == 'end':
       if elem.tag == parse.fixtag('', 'protein_group', self.nsmap):
         group = parse_protein_group(elem, self.nsmap)
         yield group
         if self.is_debug:
           pprint(group, stream=self.debug_file)
           self.debug_file.write(',\n')
         elem.clear()
       elif elem.tag == parse.fixtag('', 'proteinprophet_details', self.nsmap):
         self.distribution = parse_protein_probabilities(elem, self.nsmap)
         if self.is_debug:
           fname = self.protxml + '.distribution.dump'
           pprint(self.distribution, open(fname, 'w'))
         elem.clear()
   if self.is_debug:
     self.debug_file.write('}\n')
     self.debug_file.close()
Example #18
def process_map(filename):
    """
    Takes in an OSM file, and print information that is useful for auditing
    Calls a function that outputs a text file with all keys and a set of 
    their distinct values
    """
    tag_dictionary = {}
    entry_counts= {}
    street_types = {}
    
    keys_to_inspect = {"lower_colon": set(), "problemchars": set(), 
                                                                "other": set()}
    key_counts = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}

    for event, element in ET.iterparse(filename, events=("start",)):
        
        # Update key_counts, keys_to_inspect
        key_counts, keys_to_inspect= key_type(element, key_counts, 
                                                                keys_to_inspect)
        
        #Update tag_dcitionary, entry_counts, street_types
        tag_dictionary, entry_counts, street_types = build_dictionary(element, 
                                    tag_dictionary, entry_counts, street_types)

    # Create text file and print report
    create_text_file(tag_dictionary, "tag_key_values")    
    print_report(key_counts, keys_to_inspect, entry_counts, street_types)    
    return     
Example #19
 def __init__(self, file):
     """Initialize the class."""
     # Get an iterable context for XML parsing events
     context = iter(ElementTree.iterparse(file, events=('start', 'end')))
     event, root = next(context)
     self.root = root
     self.context = context
Example #20
def process_map(filename):
    users = set()

    # the following lines allow us
    # to access the root element from within
    # the iterator in order to clear the memory...
    ########
    ########
    # get an iterable
    context = ET.iterparse(filename, events=("start", "end"))
    # turn it into an iterator
    context = iter(context)
    # get the root element
    event, root = next(context)
    ########
    ########

    # iterate through the elements,
    # aggregating each distinct user
    # into the users set.
    for _, element in context:
        if 'uid' in element.attrib:
            user = element.attrib['uid']
            if user not in users:
                users.add(user)
        root.clear()

    return users
Example #21
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if "uid" in element.attrib:
            users.add(element.attrib["uid"])
    
    return users
Example #22
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}

    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys
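key_type is not included in this example; a minimal sketch in the spirit of the OSM auditing exercises (the regexes and category names are assumptions):
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    if element.tag == "tag":
        k = element.attrib['k']
        if lower.search(k):
            keys['lower'] += 1
        elif lower_colon.search(k):
            keys['lower_colon'] += 1
        elif problemchars.search(k):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys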
Example #23
def do_parse(opt, filename):

    ls = subprocess.Popen([ djvutoxml, filename], stdout=subprocess.PIPE, preexec_fn=setrlimits, close_fds = True)

    page_nr = 1
    for event, elem in etree.iterparse(XmlFile(ls.stdout)):
        if elem.tag.lower() == 'object':
            page = OcrPage()
            if not opt.silent:
                print >> sys.stderr, page_nr, '\r',
            page.start_page(elem)
            parse_page(page, elem, page_nr)
            page.end_page(elem)

            filename = opt.out_dir + 'page_%04d.html' % page_nr

            text = page.get_hocr_html().encode('utf-8')
            if opt.compress:
                utils.compress_file_data(filename, text, opt.compress)
            else:
                utils.write_file(filename, text)

            elem.clear()
            page_nr += 1

    if not opt.silent:
        print >> sys.stderr

    ls.wait()
    return ls.returncode
Example #24
  def __init__(self, filename):
    print >> log.v4, "Loading lexicon", filename
    lex_file = open(filename, 'rb')
    if filename.endswith(".gz"):
      lex_file = gzip.GzipFile(fileobj=lex_file)
    self.phonemes = {}
    self.lemmas = {}

    context = iter(etree.iterparse(lex_file, events=('start', 'end')))
    _, root = next(context)  # get root element
    tree = [root]
    for event, elem in context:
      if event == "start":
        tree += [elem]
      elif event == "end":
        assert tree[-1] is elem
        tree = tree[:-1]
        if elem.tag == "phoneme":
          symbol = elem.find("symbol").text.strip()  # should be unicode
          variation = elem.find("variation").text.strip()
          assert symbol not in self.phonemes
          assert variation in ["context", "none"]
          self.phonemes[symbol] = {"index": len(self.phonemes), "symbol": symbol, "variation": variation}
          root.clear()  # free memory
        elif elem.tag == "phoneme-inventory":
          print >> log.v4, "Finished phoneme inventory, %i phonemes" % len(self.phonemes)
          root.clear()  # free memory
        elif elem.tag == "lemma":
          orth = elem.find("orth").text.strip()
          phons = [{"phon": e.text.strip(), "score": float(e.attrib.get("score", 0))} for e in elem.findall("phon")]
          assert orth not in self.lemmas
          self.lemmas[orth] = {"orth": orth, "phons": phons}
          root.clear()  # free memory
    print >> log.v4, "Finished whole lexicon, %i lemmas" % len(self.lemmas)
Example #25
    def from_config_xml(config_xml):
        if not path.isfile(config_xml):
            raise Exception("Not a file: {}".format(config_xml))

        # Workaround for ET stripping some namespaces
        events = ("start", "start-ns", "end-ns")
        namespace_mapping = {}
        namespaces = []
        root = None
        for event, elem in Et.iterparse(config_xml, events=events):
            if event == "start-ns":
                namespaces.append(elem)
            elif event == "start":
                namespace_mapping[elem] = dict(namespaces)
                namespaces = []
                # Find root while we are here
                if elem.tag == "snapshot":
                    root = elem

        # Empty
        if root is None:
            return OdlConfig([], [])

        modules_elements = root.findall(OdlConfig.MODULE_PATH, OdlConfig.NAMESPACES)
        modules = OdlConfig.__parse_modules(modules_elements, namespace_mapping)
        services_elements = root.findall(OdlConfig.SERVICE_PATH, OdlConfig.NAMESPACES)
        services = OdlConfig.__parse_services(services_elements, namespace_mapping)

        if not modules and not services:
            return OdlConfig([], [])
        return OdlConfig(modules, services)
Example #26
def process_map(filename):
    #creates the dictionary keys and calls key_type
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
        element.clear() 
    return keys
Example #27
def jsonify(file_in, pretty = False):
    # processes file into JSON
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for event, element in ET.iterparse(file_in, events=("start",)):     
            if element.tag == "node" or element.tag == "way":               
                for tag in element.iter("tag"):                   
                    if is_street_name(tag):
                            street_name = tag.attrib['v']
                            m = street_type_re.search(street_name)
                            if m:
                                street_name = update_name(street_name, mapping)
                            m = directions_re.search(street_name)
                            if m:
                                street_name = update_direction(street_name, direction_mapping)
                            tag.set('v', street_name)
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
            element.clear()  
    #pprint.pprint(data)
    return data
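Two of the helpers referenced above, sketched minimally (the regex and the addr:street check are assumptions; update_name, update_direction and shape_element are not reproduced here):
import re

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

def is_street_name(tag):
    return tag.attrib.get('k') == "addr:street"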
Example #28
    def getCategories(self):        
        cat  = dict()
        path = os.path.join(datapath, 'cats.xml')
        dixie.log("Checking for category XML path at: "+path)
        xml = ''
        try:
            if sfile.exists(path):
                xml = sfile.read(path)
        except:
            dixie.log("### cats.xml does not exist")
#        xml = xml.replace('&', '&amp;')
        xml = StringIO.StringIO(xml)
        xml = ElementTree.iterparse(xml, events=("start", "end"))

        for event, elem in xml:
            try:
                if event == 'end':
                    if elem.tag == 'cats':
                        channel  = elem.findtext('channel')
                        category = elem.findtext('category')
                        if channel != '' and category != '':
                            cat[channel] = category
            except:
                pass

        return cat
Example #29
    def parse(self, xml_file, from_string=False):
        """Import .nessus file"""
        # Parse XML file
        if from_string:
            xml_file = StringIO(xml_file)

            # Iterate through each host scanned and create objects for each
        for event, elem in ET.iterparse(xml_file):

            # Grab the report name from the Report element
            if event == "end" and elem.tag == "Report":
                self.name = elem.attrib.get("name")
                continue

                # Only process ReportHost elements
            elif event == "end" and elem.tag != "ReportHost":
                continue

            rh_obj = ReportHost(elem)
            if rh_obj:
                self.targets.append(rh_obj)

                # Update Report dates
                if not self.scan_start and rh_obj.get("host_start"):
                    self.scan_start = rh_obj.host_start
                if not self.scan_end:
                    self.scan_end = rh_obj.host_end
                if rh_obj.get("host_start"):
                    if rh_obj.host_start < self.scan_start:
                        self.scan_start = rh_obj.host_start
                if rh_obj.host_end > self.scan_end:
                    self.scan_end = rh_obj.host_end
Example #30
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Generator function to parse UniProt XML as SeqRecord objects.

    parses an XML entry at a time from any UniProt XML file
    returns a SeqRecord for each iteration

    This generator can be used in Bio.SeqIO

    return_raw_comments = True --> comment fields are returned as complete XML to allow further processing
    skip_parsing_errors = True --> if parsing errors are found, skip to next entry
    """
    if isinstance(alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)
    if isinstance(alphabet, Alphabet.Gapped):
        if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet):
            raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if isinstance(handle, str):
            handle = StringIO(handle)
        else:
            raise Exception('An XML-containing handler or an XML string must be passed')

    if ElementTree is None:
        from Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
                "No ElementTree module was found. "
                "Use Python 2.5+, lxml or elementtree if you "
                "want to use Bio.SeqIO.UniprotIO.")

    for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
        if event == "end" and elem.tag == NS + "entry":
            yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse()
            elem.clear()
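In practice this generator is usually reached through Bio.SeqIO rather than called directly; a short usage sketch (the file name is a placeholder):
from Bio import SeqIO

for record in SeqIO.parse("uniprot_sprot.xml", "uniprot-xml"):
    print(record.id, len(record.seq))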
Example #31
def check_street(filename):
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
    print(street_types, "%s: %d")
    return (street_types)
Example #32
def check_amenity(filename):
    for event, elem in ET.iterparse(filename):
        if is_amenity_name(elem):
            audit_amenity_type(amenity_types, elem.attrib['v'])
    print(amenity_types, "%s: %d")
    return (amenity_types)
Example #33
#!/Applications/QGIS3.14.app/Contents/MacOS/bin/python3
#!/bin/env python3
#
#
# (c) 2020 - Alessandro Frigeri, Istituto Nazionale di Astrofisica
#
# symbolsplitter - splits qgis xml style file into n xml files containing one symbol each
#

import xml.etree.ElementTree as ET
import sys, string

infile = sys.argv[1]

context = ET.iterparse(infile, events=('end', 'start'))
depth = 0
for event, elem in context:
    if elem.tag == 'symbol':
        if event == 'end':
            depth -= 1
        if event == 'start':
            depth += 1
        name = elem.get('name')
        if depth == 0:
            filename = format(name.split(':')[0] + ".xml")
            with open(filename, 'wb') as f:
                #f.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
                f.write(b"<qgis_style version=1>\n<symbols>\n")
                f.write(ET.tostring(elem))
                f.write(b"\n</symbols>")
Example #34
'''
Created on June 1, 2016

@author: Михаил Булыгин <*****@*****.**>
'''
import xml.etree.ElementTree as ET
from pg import DB

if __name__ == '__main__':
    file = open("ruwiki.xml", encoding="utf-8")
    tree = ET.iterparse(file)

    db = DB(host="localhost", user="******", passwd="1234", dbname="wiki")
    db.query("TRUNCATE TABLE wiki")

    for i, line in enumerate(tree):
        event, element = line
        if element.tag == "page":
            pageid = element.find("id").text
            title = element.find("title").text
            timestamp = element.find("revision").find(
                "timestamp").text.replace("T", " ")
            username = element.find("revision").find("contributor").find(
                "username")
            if not username is None:
                author = username.text
            else:
                author = element.find("revision").find("contributor").find(
                    "ip").text
            text = element.find("revision").find("text").text
Example #35
    q.put((editid, timestamp, articleid, userid, quality, delta_edit,
           len(text_prev), len(text_final), future_edits))

print("//{}".format(args.xml_file))

# number of processes to use
NUMBER_PROCESSES = args.processes

# whether should 'cleanly' separate the computations into test/training sets
THRESHOLD_TIMESTAMP = args.threshold

# time start in seconds
time_start = time.time()

## open XML file
parse_iterator = ET.iterparse(args.xml_file, events=('end', 'start'))
parse_iterator = iter(parse_iterator)

# keep track of the root element
event, root = next(parse_iterator)

rev_count = 0
results = []
with mp.Pool(NUMBER_PROCESSES) as pool:
    manager = mp.Manager()
    q = manager.Queue()
    writer = csv.writer(sys.stdout, delimiter='#')

    edits11 = [""]
    ids11 = ["-"]
    users11 = [-1]
Example #36
def process_map(filename):
    """process file iternatively"""
    users = set()
    for _, element in ET.iterparse(filename):
        users.update(get_user(element))
    return users
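get_user is defined elsewhere; a minimal sketch consistent with the call above (returning an empty set when the element has no uid attribute):
def get_user(element):
    uid = element.attrib.get('uid')
    return {uid} if uid else set()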
Example #37
#!/usr/bin/python
import re
from xml.etree import ElementTree as ET

#parser = ET.iterparse('10000_new')

for event, element in ET.iterparse('10000_new', events=('start', 'end')):
    if element.tag == 'doc':
        print(element.text)
Example #38
    def parse(self, values_are_confidence=False, rooted=False):
        """Parse the text stream this object was initialized with."""
        nexml_doc = ElementTree.iterparse(self.handle, events=("end",))

        for event, node in nexml_doc:
            if node.tag == qUri("nex:tree"):
                node_dict = {}
                node_children = {}
                root = None

                child_tags = node.getchildren()
                nodes = []
                edges = []
                for child in child_tags:
                    if child.tag == qUri("nex:node"):
                        nodes.append(child)
                    if child.tag == qUri("nex:edge"):
                        edges.append(child)

                for node in nodes:
                    node_id = node.attrib["id"]
                    this_node = node_dict[node_id] = {}
                    if "otu" in node.attrib and node.attrib["otu"]:
                        this_node["name"] = node.attrib["otu"]
                    if "root" in node.attrib and node.attrib["root"] == "true":
                        root = node_id

                    for child in node.getchildren():
                        if child.tag == qUri("nex:meta"):
                            self.add_annotation(node_dict[node_id], child)

                srcs = set()
                tars = set()
                for edge in edges:
                    src, tar = edge.attrib["source"], edge.attrib["target"]
                    srcs.add(src)
                    tars.add(tar)
                    if src not in node_children:
                        node_children[src] = set()

                    node_children[src].add(tar)
                    if "length" in edge.attrib:
                        node_dict[tar]["branch_length"] = float(edge.attrib["length"])
                    if "property" in edge.attrib and edge.attrib["property"] in matches(
                        "cdao:has_Support_Value"
                    ):
                        node_dict[tar]["confidence"] = float(edge.attrib["content"])

                    for child in edge.getchildren():
                        if child.tag == qUri("nex:meta"):
                            self.add_annotation(node_dict[tar], child)

                if root is None:
                    # if no root specified, start the recursive tree creation function
                    # with the first node that's not a child of any other nodes
                    rooted = False
                    possible_roots = (
                        node.attrib["id"]
                        for node in nodes
                        if node.attrib["id"] in srcs and node.attrib["id"] not in tars
                    )
                    root = next(possible_roots)
                else:
                    rooted = True

                yield NeXML.Tree(
                    root=self._make_tree(root, node_dict, node_children), rooted=rooted
                )
Example #39
def parseMML(mmlinput):
    exppy = ""  # this is the python expression
    symvars = [
    ]  # these are symbolic variables which can eventually take part in the expression
    events = ("start", "end")
    level = 0
    context = ET.iterparse(mmlinput, events=events)
    for action, elem in context:
        if (action == 'start') and (elem.tag == 'mfrac'):
            level += 1
            tree = ET.ElementTree(elem[0])
            tree.write('output.xml')
            (a, b) = parseMML('output.xml')
            symvars.append(b)
            for index in a:
                exppy += index
            exppy += '/'
            tree = ET.ElementTree(elem[1])
            tree.write('output.xml')
            (a, b) = parseMML('output.xml')
            symvars.append(b)
            for index in a:
                exppy += index
        if (action == 'end') and (elem.tag == 'mfrac'):
            level -= 1
        if level:
            continue
        if (action == 'start') and (elem.tag == 'mrow'):
            exppy += '('
        if (action == 'end') and (elem.tag == 'mrow'):
            exppy += ')'
        if action == 'start' and elem.tag == 'msub':  # this is a power
            # level += 1
            # tree = ET.ElementTree(elem[0])
            # tree.write('output.xml')
            # (a, b) = parseMML('output.xml')
            # symvars.append(b)
            # for index in a:
            #     exppy += '['
            #     exppy += index
            #     exppy += ']'
            # exppy += '**'
            # tree = ET.ElementTree(elem[1])
            # tree.write('output.xml')
            # (a, b) = parseMML('output.xml')
            # symvars.append(b)
            # for index in a:
            #     exppy += index
            level += 1
            tree = ET.ElementTree(elem[0])
            tree.write('output.xml')
            (a, b) = parseMML('output.xml')
            symvars.append(b)
            for index in a:
                exppy += '['
                exppy += index
                exppy += ']'
            exppy += '**'
            tree = ET.ElementTree(elem[1])
            tree.write('output.xml')
            (a, b) = parseMML('output.xml')
            symvars.append(b)
            for index in a:
                exppy += index
                # exppy += ''
        if action == 'start' and elem.tag == 'mn':  # this is a number
            exppy += elem.text
        if action == 'start' and elem.tag == 'mi':  # this is a variable
            exppy += elem.text
            symvars.append(
                elem.text
            )  # we'll return the variable, so sympy can sympify it afterwards
        if action == 'start' and elem.tag == 'mo':  # this is a operation
            exppy += elem.text

    if exppy.startswith('(') and exppy.endswith(')'):
        exppy = exppy[1:-1]

    exppyarray = exppy.split("=")

    # for exppy in exppyarray :
    #     print(exppy)

    return exppyarray, symvars
Example #40
def appGmlToMetadataElementDict(gmlPath):
    """słownik metadataElementDict na podstawie pliku zbioru APP"""
    metadataElementDict = {}

    ns = {
        'gco': "http://www.isotc211.org/2005/gco",
        'app':
        "https://www.gov.pl/static/zagospodarowanieprzestrzenne/schemas/app/1.0",
        'gmd': "http://www.isotc211.org/2005/gmd",
        'gml': "http://www.opengis.net/gml/3.2",
        'wfs': "http://www.opengis.net/wfs/2.0",
        'xlink': "http://www.w3.org/1999/xlink",
        'xsi': "http://www.w3.org/2001/XMLSchema-instance"
    }

    root = ET.parse(gmlPath)

    # E1
    element = root.find('//app:AktPlanowaniaPrzestrzennego/app:typPlanu', ns)
    if element is not None:
        typPlanu = element.attrib['{%s}title' % ns['xlink']].replace(
            'miejscowy', 'miejscowych').replace('plan', 'planów').replace(
                'kier.',
                'kierunków').replace('zagosp.', 'zagospodarowania').replace(
                    'przestrz.', 'przestrzennego')
        metadataElementDict['e1'] = {
            'e1_lineEdit':
            "Zbiór danych przestrzennych dla %s <typ_jednostki> <nazwa_jednostki>"
            % typPlanu
        }

    # E5
    date = root.find('//app:AktPlanowaniaPrzestrzennego//app:przestrzenNazw',
                     ns)
    if date is None:
        utils.showPopup(
            "Błąd pliku",
            "wczytany plik nie jest poprawną definicją GML dla zbioru APP. Zwaliduj plik przed wczytaniem do formularza metadanych",
            QMessageBox.Warning)
        return False
    metadataElementDict['e5'] = [{'e5_lineEdit': date.text}]

    # E7 - encoding taken from the GML header
    with open(gmlPath, 'r') as file:
        line = file.readlines(1)[0]
        line.replace("'", '"')
        encoding = re.search('encoding="[a-zA-Z0-9\-]{3,10}"',
                             line)[0].split("=")[-1].strip('"').replace(
                                 ' ', '').replace('-', '').lower()
        if encoding == 'usascii':
            encoding = 'usAscii'
        metadataElementDict['e7'] = [{'e7_cmbbx': encoding}]

    # E9, E10 - keywords
    itemsList = []
    date = root.find('//app:AktPlanowaniaPrzestrzennego/app:poziomHierarchii',
                     ns)
    if date is not None:
        atrybut_title = date.attrib['{%s}title' % ns['xlink']]
        atrybut_href = date.attrib['{%s}href' % ns['xlink']]

    tekst = 'Regionalnym' if atrybut_title == 'regionalny' else 'Lokalne'

    # administrative level
    itemsList.append({
        'e9_lineEdit':
        tekst,
        'e10_cmbbx':
        'Data opublikowania',
        'e10_dateTimeEdit':
        QDateTime(2019, 5, 22, 0, 0),
        'e10_lineEdit':
        'Zakres przestrzenny',
        'xlink':
        "http://inspire.ec.europa.eu/metadata-codelist/SpatialScope"
    })

    # unit level
    itemsList.append({
        'e9_lineEdit':
        atrybut_title,
        'e10_cmbbx':
        'Data opublikowania',
        'e10_dateTimeEdit':
        QDateTime(2013, 12, 10, 0, 0),
        'e10_lineEdit':
        'Poziom planu zagospodarowania przestrzennego',
        'xlink':
        "http://inspire.ec.europa.eu/codelist/LevelOfSpatialPlanValue"
    })

    # add the default keyword values
    itemsList.extend(dictionaries.metadataListWidgetsDefaultItems['e9'])
    metadataElementDict['e9'] = itemsList

    # E11
    layer = QgsVectorLayer(gmlPath + '|layername=AktPlanowaniaPrzestrzennego',
                           "gml", 'ogr')
    # if not layer.isValid(): # check whether AktPlanowaniaPrzestrzennego is inside wfs:member or gml:featureMember (directly)
    #     layer = QgsVectorLayer(gmlPath + '|layername=featureMember', "gml", 'ogr')
    if layer.isValid():

        sourceCrs = layer.crs()
        extent = layer.extent()
        # because QGIS reads the GML extent incorrectly, the axes are swapped
        '''
        For QGIS <= 3.14, when loading GML
        # with a CRS definition
        # given as an opengis.net URI, e.g. http://www.opengis.net/def/crs/EPSG/0/2177
        # QGIS reads the extent with X and Y swapped
        # TODO: comment this out once the bug is fixed in a newer release
        # for older releases it stays
        '''
        extentInverted = QgsRectangle(extent.yMinimum(), extent.xMinimum(),
                                      extent.yMaximum(), extent.xMaximum())
        crsDest = QgsCoordinateReferenceSystem(4326)  # WGS84
        xform = QgsCoordinateTransform(sourceCrs, crsDest,
                                       QgsProject.instance())
        extent84 = xform.transform(extentInverted)
        metadataElementDict['e11'] = [{
            'e11_lineEdit':
            '%s,%s,%s,%s' % (extent84.xMinimum(), extent84.xMaximum(),
                             extent84.yMinimum(), extent84.yMaximum())
        }]

    # E12
    itemsList = []
    # search in the APP drawings
    for uklad in root.findall('//*/app:ukladOdniesieniaPrzestrzennego', ns):
        if {'e12_cmbbx': uklad.text} not in itemsList:
            itemsList.append({'e12_cmbbx': uklad.text})
    # search in the APP spatial extents
    for multiSurface in root.findall(
            '//*/app:zasiegPrzestrzenny/gml:MultiSurface', ns):
        if {'e12_cmbbx': multiSurface.attrib['srsName']} not in itemsList:
            itemsList.append({'e12_cmbbx': multiSurface.attrib['srsName']})
    metadataElementDict['e12'] = itemsList

    # E13
    dates = []
    for date in root.findall(
            '//app:AktPlanowaniaPrzestrzennego/app:poczatekWersjiObiektu', ns):
        dates.append(QDateTime.fromString(date.text, "yyyy-MM-dd'T'hh:mm:ss"))
    oldestDate = utils.oldestQDateTime(dates)
    if oldestDate is not None:
        metadataElementDict['e13'] = {'e13_dateTimeEdit': oldestDate}

    # E16
    itemsList = []
    for rozdzielczosc in root.findall('//*/app:rozdzielczoscPrzestrzenna', ns):
        if {'e16_lineEdit': rozdzielczosc.text} not in itemsList:
            itemsList.append({'e16_lineEdit': rozdzielczosc.text})
    metadataElementDict['e16'] = itemsList

    # E18, E19, E24 and E25
    itemsList = []
    inspire1 = "Rozporządzenie Komisji (UE) Nr 1089/2010 z dnia 23 listopada 2010 r. w sprawie wykonania dyrektywy 2007/2/WE Parlamentu Europejskiego i Rady w zakresie interoperacyjności zbiorów i usług danych przestrzennych"
    inspire2 = "D2.8.III.4 Data Specification on Land Use – Technical Guidelines"
    krajowy1 = "Rozporządzenie Ministra Rozwoju, Pracy i Technologii z dnia 26 października 2020 r. w sprawie zbiorów danych przestrzennych oraz metadanych w zakresie zagospodarowania przestrzennego"
    krajowy2 = "Planowanie przestrzenne: Specyfikacja danych"

    ifKrajowy = False
    ifInspire = False
    namespaces = dict(
        [node for _, node in ET.iterparse(gmlPath, events=['start-ns'])])
    for v in namespaces.values():
        if 'https://www.gov.pl/static/zagospodarowanieprzestrzenne' in v:
            ifKrajowy = True
            # ifInspire = False
            break
        if 'http://inspire.ec.europa.eu/schemas/plu/4.0/PlannedLandUse.xsd' in v:
            # ifKrajowy = False
            ifInspire = True
            break

    # E18 and E19 inspire1
    itemsList.append({
        'e18_lineEdit': inspire1,
        'e18_dateTimeEdit': QDateTime(2010, 12, 8, 0, 0),
        'e18_cmbbx': 'Data opublikowania',
        'e19_cmbbx':
        'Zgodny (conformant)' if ifInspire else 'Niezgodny (notConformant)',
        'xlink': "http://data.europa.eu/eli/reg/2010/1089"
    })
    # E18 and E19 inspire2
    itemsList.append({
        'e18_lineEdit':
        inspire2,
        'e18_dateTimeEdit':
        QDateTime(2013, 12, 10, 0, 0),
        'e18_cmbbx':
        'Data opublikowania',
        'e19_cmbbx':
        'Zgodny (conformant)' if ifInspire else 'Niezgodny (notConformant)'
    })
    # E18 and E19 krajowy1
    itemsList.append({
        'e18_lineEdit': krajowy1,
        'e18_dateTimeEdit': QDateTime(2020, 10, 31, 0, 0),
        'e18_cmbbx': 'Data opublikowania',
        'e19_cmbbx':
        'Zgodny (conformant)' if ifKrajowy else 'Niezgodny (notConformant)',
        'xlink': "https://dziennikustaw.gov.pl/DU/2020/1916"
    })
    # E18 and E19 krajowy2
    itemsList.append({
        'e18_lineEdit': krajowy2,
        'e18_dateTimeEdit': QDateTime(2020, 10, 31, 0, 0),
        'e18_cmbbx': 'Data opublikowania',
        'e19_cmbbx':
        'Zgodny (conformant)' if ifKrajowy else 'Niezgodny (notConformant)',
        'xlink': ""  # TODO: uaktualnić po publikacji
    })
    metadataElementDict['e18'] = itemsList

    # E24 and E25 krajowy (national)
    itemsList = []
    if ifKrajowy:
        itemsList.append({
            'e24_lineEdit': "Schemat aplikacyjny GML Planowanie przestrzenne",
            'e25_lineEdit': "1.0"
        })
    if ifInspire:
        itemsList.append({
            'e24_lineEdit': "Planned Land Use GML Application Schema",
            'e25_lineEdit': "4.0"
        })
    metadataElementDict['e24'] = itemsList

    return metadataElementDict
Example #41
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys
Example #42
 def getDataFromExternal(self, date, progress_callback = None):
     f = FileWrapper(self.xmltvFile)
     context = ElementTree.iterparse(f, events=("start", "end"))
     return parseXMLTV(context, f, f.size, self.logoFolder, progress_callback)
Example #43
    def parse(self, values_are_confidence=False, rooted=False):
        """Parse the text stream this object was initialized with."""

        nexml_doc = ElementTree.iterparse(self.handle, events=('end', ))

        for event, node in nexml_doc:
            if node.tag == qUri('nex:tree'):
                node_dict = {}
                node_children = {}
                root = None

                child_tags = node.getchildren()
                nodes = []
                edges = []
                for child in child_tags:
                    if child.tag == qUri('nex:node'):
                        nodes.append(child)
                    if child.tag == qUri('nex:edge'):
                        edges.append(child)

                for node in nodes:
                    node_id = node.attrib['id']
                    this_node = node_dict[node_id] = {}
                    if 'otu' in node.attrib and node.attrib['otu']:
                        this_node['name'] = node.attrib['otu']
                    if 'root' in node.attrib and node.attrib['root'] == 'true':
                        root = node_id

                    for child in node.getchildren():
                        if child.tag == qUri('nex:meta'):
                            self.add_annotation(node_dict[node_id], child)

                srcs = set()
                tars = set()
                for edge in edges:
                    src, tar = edge.attrib['source'], edge.attrib['target']
                    srcs.add(src)
                    tars.add(tar)
                    if src not in node_children:
                        node_children[src] = set()

                    node_children[src].add(tar)
                    if 'length' in edge.attrib:
                        node_dict[tar]['branch_length'] = float(
                            edge.attrib['length'])
                    if 'property' in edge.attrib and edge.attrib[
                            'property'] in matches('cdao:has_Support_Value'):
                        node_dict[tar]['confidence'] = float(
                            edge.attrib['content'])

                    for child in edge.getchildren():
                        if child.tag == qUri('nex:meta'):
                            self.add_annotation(node_dict[tar], child)

                if root is None:
                    # if no root specified, start the recursive tree creation function
                    # with the first node that's not a child of any other nodes
                    rooted = False
                    possible_roots = (node.attrib['id'] for node in nodes
                                      if node.attrib['id'] in srcs
                                      and not node.attrib['id'] in tars)
                    root = next(possible_roots)
                else:
                    rooted = True

                yield NeXML.Tree(root=self._make_tree(root, node_dict,
                                                      node_children),
                                 rooted=rooted)
Example #44
    def activity_summary(self, file_path):
        '''
        Main XML parsing script, streams in file and proccesses each branch into
        lists, which get converted to data frames and exported to csv files for database upload
        '''
        # activity data
        date = []
        energy_burned = []
        energy_burned_goal = []
        energy_burned_unit = []
        exercise_time = []
        exercise_time_goal = []
        stand_hours = []
        stand_hours_goal = []
        file = file_path + '/apple_health_export/export.xml'

        # exercise time
        exercise_time_type = []
        exercise_time_date = []
        exercise_time_duration = []
        exercise_time_durationUnit = []

        # workout data
        workoutActivityType = []
        duration = []
        durationUnit = []
        totalDistance = []
        totalDistanceUnit = []
        totalEnergyBurned = []
        totalEnergyBurnedUnit = []
        sourceName = []
        sourceVersion = []
        device = []
        creationDate = []
        startDate = []
        endDate = []

        # heartrate data
        record_type = []
        record_unit = []
        record_value = []
        record_sourceName = []
        record_sourceVersion = []
        record_device = []
        record_creationDate = []
        record_startDate = []
        record_endDate = []

        for event, elem in ET.iterparse(file, events=("start", "end")):
            if event == 'end':
                # process the tag
                if elem.tag == 'ActivitySummary':
                    # import pdb;pdb.set_trace()
                    for item in elem.items():
                        if item[0] == 'dateComponents':
                            date.append(item[1])
                        elif item[0] == 'activeEnergyBurned':
                            energy_burned.append(item[1])
                        elif item[0] == 'activeEnergyBurnedGoal':
                            energy_burned_goal.append(item[1])
                        elif item[0] == 'activeEnergyBurnedUnit':
                            energy_burned_unit.append(item[1])
                        elif item[0] == 'appleExerciseTime':
                            exercise_time.append(item[1])
                        elif item[0] == 'appleExerciseTimeGoal':
                            exercise_time_goal.append(item[1])
                        elif item[0] == 'appleStandHours':
                            stand_hours.append(item[1])
                        elif item[0] == 'appleStandHoursGoal':
                            stand_hours_goal.append(item[1])
                if elem.tag == 'WorkoutEvent':
                    for item in elem.items():
                        if item[0] == 'type':
                            exercise_time_type.append(item[1])
                        elif item[0] == 'date':
                            exercise_time_date.append(item[1])
                        elif item[0] == 'duration':
                            exercise_time_duration.append(item[1])
                        elif item[0] == 'durationUnit':
                            exercise_time_durationUnit.append(item[1])
                if elem.tag == 'Workout':
                    for item in elem.items():
                        if item[0] == 'workoutActivityType':
                            workoutActivityType.append(item[1])
                        if item[0] == 'duration':
                            duration.append(item[1])
                        if item[0] == 'durationUnit':
                            durationUnit.append(item[1])
                        if item[0] == 'totalDistance':
                            totalDistance.append(item[1])
                        if item[0] == 'totalDistanceUnit':
                            totalDistanceUnit.append(item[1])
                        if item[0] == 'totalEnergyBurned':
                            totalEnergyBurned.append(item[1])
                        if item[0] == 'totalEnergyBurnedUnit':
                            totalEnergyBurnedUnit.append(item[1])
                        if item[0] == 'sourceName':
                            sourceName.append(item[1])
                        if item[0] == 'sourceVersion':
                            sourceVersion.append(item[1])
                        if item[0] == 'device':
                            device.append(item[1])
                        if item[0] == 'creationDate':
                            creationDate.append(item[1])
                        if item[0] == 'startDate':
                            startDate.append(item[1])
                        if item[0] == 'endDate':
                            endDate.append(item[1])
                '''
                if elem.tag == 'Record':
                    for item in elem.items():
                        if item[0] == 'type':
                            record_type.append(item[1])
                        if item[0] == 'unit':
                            record_unit.append(item[1])
                        if item[0] == 'value':
                            record_value.append(item[1])
                        if item[0] == 'sourceName':
                            record_sourceName.append(item[1])
                        if item[0] == 'sourceVersion':
                            record_sourceVersion.append(item[1])
                        if item[0] == 'device':
                            record_device.append(item[1])
                        if item[0] == 'creationDate':
                            record_creationDate.append(item[1])
                        if item[0] == 'startDate':
                            record_startDate.append(item[1])
                        if item[0] == 'endDate':
                            record_endDate.append(item[1])
                '''
                # this is the key to memory management on the server
                elem.clear()

        # create activity data data frame
        print('Creating activity data...')
        li = list(
            zip(date, energy_burned, energy_burned_goal, energy_burned_unit,
                exercise_time, exercise_time_goal, stand_hours,
                stand_hours_goal))
        df = pd.DataFrame(li,
                          columns=[
                              'date', 'energy_burned', 'energy_burned_goal',
                              'energy_burned_unit', 'exercise_time',
                              'exercise_time_goal', 'stand_hours',
                              'stand_hours_goal'
                          ])
        # remove dates before 2000-01-01
        df['datetime'] = pd.to_datetime(df['date'])
        df = df[df['datetime'] > '2000-01-01']
        # drop datetime column
        df = df.drop(['datetime'], axis=1)
        # add created_at, last_updated_by
        df['created_at'] = pd.to_datetime('now')
        df['updated_at'] = pd.to_datetime('now')
        df.fillna(0, inplace=True)

        # create exercise time data frame
        print('Creating exercise time data...')
        li = list(
            zip(exercise_time_date, exercise_time_type, exercise_time_duration,
                exercise_time_durationUnit))
        exercise_time = pd.DataFrame(li,
                                     columns=[
                                         'date', 'exercise_time_type',
                                         'exercise_time_duration',
                                         'exercise_time_durationUnit'
                                     ])
        # remove dates before 2000-01-01
        exercise_time['datetime'] = pd.to_datetime(exercise_time['date'])
        exercise_time = exercise_time[exercise_time['datetime'] > '2000-01-01']
        # drop datetime column
        exercise_time = exercise_time.drop(['datetime'], axis=1)
        # add created_at, last_updated_by
        exercise_time['created_at'] = pd.to_datetime('now')
        exercise_time['updated_at'] = pd.to_datetime('now')
        exercise_time.fillna(0, inplace=True)

        # create workout data frame
        print('Creating workout data...')
        li = list(
            zip(workoutActivityType, duration, durationUnit, totalDistance,
                totalDistanceUnit, totalEnergyBurned, totalEnergyBurnedUnit,
                sourceName, sourceVersion, device, creationDate, startDate,
                endDate))
        workout = pd.DataFrame(
            li,
            columns=[
                'activity_type', 'duration', 'duration_unit', 'total_distance',
                'total_distance_unit', 'total_energy_burned',
                'total_energy_burned_unit', 'source_name', 'source_version',
                'device', 'creation_date', 'start_date', 'end_date'
            ])
        # remove dates before 2000-01-01
        workout['creation_datetime'] = pd.to_datetime(workout['creation_date'])
        workout = workout[workout['creation_datetime'] > '2000-01-01']
        workout['start_datetime'] = pd.to_datetime(workout['start_date'])
        workout = workout[workout['start_datetime'] > '2000-01-01']
        workout['end_datetime'] = pd.to_datetime(workout['end_date'])
        workout = workout[workout['end_datetime'] > '2000-01-01']
        workout['date'] = workout['start_datetime'].dt.date

        # drop datetime column
        workout = workout.drop(
            ['creation_datetime', 'start_datetime', 'end_datetime'], axis=1)
        # add created_at, last_updated_by
        workout['created_at'] = pd.to_datetime('now')
        workout['updated_at'] = pd.to_datetime('now')
        workout.fillna(0, inplace=True)

        # cleanup activity_type and device column text, date
        workout['gadget'] = np.where(
            workout['device'].str.contains('Apple Watch'), 'Apple Watch',
            'iPhone')
        # remove HKWorkoutActivityType from activity_type text
        workout['activity'] = (workout['activity_type'].str.replace(
            'HKWorkoutActivityType', ''))
        # keep only the necessary columns
        workout = workout[[
            'date', 'activity', 'duration', 'duration_unit', 'total_distance',
            'total_distance_unit', 'total_energy_burned',
            'total_energy_burned_unit', 'gadget', 'start_date', 'end_date',
            'created_at', 'updated_at'
        ]]
        '''
        # create heartrate data frame
        print('Creating heartrate data...')
        li = list(zip(record_type, record_unit, record_value, record_sourceName,
                    record_sourceVersion, record_device, record_creationDate,
                    record_startDate, record_endDate))
        record = pd.DataFrame(li,
                            columns=['type', 'unit',
                                    'value', 'source_name',
                                    'source_version', 'device',
                                    'creation_date', 'start_date',
                                    'end_date'])
        # remove dates before 2000-01-01
        record['creation_datetime'] = pd.to_datetime(record['creation_date'])
        record = record[record['creation_datetime'] > '2000-01-01']
        record['start_datetime'] = pd.to_datetime(record['start_date'])
        record = record[record['start_datetime'] > '2000-01-01']
        record['end_datetime'] = pd.to_datetime(record['end_date'])
        record = record[record['end_datetime'] > '2000-01-01']
        record['date'] = record['start_datetime'].dt.strftime('%Y-%m-%d')

        # drop datetime column
        record = record.drop(['creation_datetime', 'start_datetime',
                            'end_datetime'], axis=1)

        # filter down to heartrate
        record = record[record['type'] == 'HKQuantityTypeIdentifierHeartRate']
        # clean up device data (look for Apple Watch, iPhone)
        record['gadget'] = np.where(record['device'].str.contains('Apple Watch'),
                                    'Apple Watch', 'iPhone')
        # decrease columns to necessary info
        record = record[['date', 'gadget', 'value']]
        record['value'] = record['value'].astype(float)
        # aggregate this before adding to db (max, min, avg)
        record_avg = record.groupby(['date', 'gadget'], as_index=False).mean()
        record_max = record.groupby(['date', 'gadget'], as_index=False).max()
        record_min = record.groupby(['date', 'gadget'], as_index=False).min()
        # combine these into a single df for record
        record_avg.columns = ['date', 'gadget', 'avg']
        record_max.columns = ['date', 'gadget', 'max']
        record_min.columns = ['date', 'gadget', 'min']
        heartrate = record_avg.merge(record_max, on=['date', 'gadget'])
        heartrate = heartrate.merge(record_min, on=['date', 'gadget'])
        # add created_at, updated_at timestamps
        heartrate['created_at'] = pd.to_datetime('now')
        heartrate['updated_at'] = pd.to_datetime('now')
        heartrate.fillna(0, inplace=True)
        # import pdb; pdb.set_trace()
        '''
        # csv exports
        df.to_csv(file_path + 'activity_summary.csv', index=False)
        exercise_time.to_csv(file_path + 'exercise_time.csv', index=False)
        workout.to_csv(file_path + 'workout.csv', index=False)
        # heartrate.to_csv(file_path + 'heartrate.csv', index=False)
        return df
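
The snippet above assumes that parallel lists such as the activity summary fields and the workout fields (workoutActivityType, duration, durationUnit, startDate, endDate, ...) have already been collected from the Apple Health export. A minimal, hypothetical sketch of how the workout lists could be filled with ET.iterparse from an export.xml file; the file name and the Workout attribute names follow Apple's Health export format and are assumptions here, not part of the original snippet:

# Hypothetical collection step for the lists consumed above; 'export.xml' and
# the Workout attribute names are assumptions based on Apple's Health export.
import xml.etree.ElementTree as ET

workoutActivityType, duration, durationUnit = [], [], []
startDate, endDate = [], []
for _, elem in ET.iterparse('export.xml', events=('end',)):
    if elem.tag == 'Workout':
        workoutActivityType.append(elem.get('workoutActivityType'))
        duration.append(elem.get('duration'))
        durationUnit.append(elem.get('durationUnit'))
        startDate.append(elem.get('startDate'))
        endDate.append(elem.get('endDate'))
    elem.clear()  # keep memory flat while streaming the large export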
Example #45
0
    def iterconfigurations(self):
        """
        Create and return iterator for the available Configuration objects.
        The iterator loops over all Configurations in the dump file tree, in document order.
        """
        cfg = None
        cfg_arguments = [
        ]  # function arguments for Configuration node initialization
        cfg_function = None
        cfg_valueflow = None

        # Iterating <varlist> in a <scope>.
        iter_scope_varlist = False

        # Iterating <typedef-info>
        iter_typedef_info = False

        # Traverse the XML tree of the dump file incrementally with iterparse.
        # The iterative approach is required to avoid high memory consumption.
        # Calling .clear() on each element lets it be garbage collected.
        for event, node in ElementTree.iterparse(self.filename,
                                                 events=('start', 'end')):
            # Serialize new configuration node
            if node.tag == 'dump':
                if event == 'start':
                    cfg = Configuration(node.get('cfg'))
                    continue
                elif event == 'end':
                    cfg.setIdMap(cfg_arguments)
                    yield cfg
                    cfg = None
                    cfg_arguments = []

            elif node.tag == 'clang-warning' and event == 'start':
                cfg.clang_warnings.append({
                    'file': node.get('file'),
                    'line': int(node.get('line')),
                    'column': int(node.get('column')),
                    'message': node.get('message')
                })

            # Parse standards
            elif node.tag == "standards" and event == 'start':
                continue
            elif node.tag == 'c' and event == 'start':
                cfg.standards.set_c(node)
            elif node.tag == 'cpp' and event == 'start':
                cfg.standards.set_cpp(node)
            elif node.tag == 'posix' and event == 'start':
                cfg.standards.set_posix(node)

            # Parse directives list
            elif node.tag == 'directive' and event == 'start':
                cfg.directives.append(Directive(node))

            # Parse macro usage
            elif node.tag == 'macro' and event == 'start':
                cfg.macro_usage.append(MacroUsage(node))

            # Preprocessor #if/#elif condition
            elif node.tag == "if-cond" and event == 'start':
                cfg.preprocessor_if_conditions.append(
                    PreprocessorIfCondition(node))

            # Parse tokens
            elif node.tag == 'tokenlist' and event == 'start':
                continue
            elif node.tag == 'token' and event == 'start':
                cfg.tokenlist.append(Token(node))

            # Parse scopes
            elif node.tag == 'scopes' and event == 'start':
                continue
            elif node.tag == 'scope' and event == 'start':
                cfg.scopes.append(Scope(node))
            elif node.tag == 'varlist':
                if event == 'start':
                    iter_scope_varlist = True
                elif event == 'end':
                    iter_scope_varlist = False

            # Parse functions
            elif node.tag == 'functionList' and event == 'start':
                continue
            elif node.tag == 'function':
                if event == 'start':
                    cfg_function = Function(node, cfg.scopes[-1])
                    continue
                elif event == 'end':
                    cfg.functions.append(cfg_function)
                    cfg_function = None

            # Parse function arguments
            elif node.tag == 'arg' and event == 'start':
                arg_nr = int(node.get('nr'))
                arg_variable_id = node.get('variable')
                cfg_function.argumentId[arg_nr] = arg_variable_id

            # Parse variables
            elif node.tag == 'var' and event == 'start':
                if iter_scope_varlist:
                    cfg.scopes[-1].varlistId.append(node.get('id'))
                else:
                    var = Variable(node)
                    if var.nameTokenId:
                        cfg.variables.append(var)
                    else:
                        cfg_arguments.append(var)

            # Parse typedef info
            elif node.tag == 'typedef-info':
                iter_typedef_info = (event == 'start')
            elif iter_typedef_info and node.tag == 'info' and event == 'start':
                cfg.typedefInfo.append(TypedefInfo(node))

            # Parse valueflows (list of values)
            elif node.tag == 'valueflow' and event == 'start':
                continue
            elif node.tag == 'values':
                if event == 'start':
                    cfg_valueflow = ValueFlow(node)
                    continue
                elif event == 'end':
                    cfg.valueflow.append(cfg_valueflow)
                    cfg_valueflow = None

            # Parse values
            elif node.tag == 'value' and event == 'start':
                cfg_valueflow.values.append(Value(node))

            # Clear the element (children, attributes, text) so it can be garbage collected
            node.clear()
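
A hypothetical driver for iterconfigurations(); 'Data' stands in for whatever class this method belongs to and 'file.cpp.dump' for an actual dump file, neither of which is shown in this excerpt:

# Hypothetical usage sketch; the class name and dump file path are placeholders.
data = Data('file.cpp.dump')
for cfg in data.iterconfigurations():
    print(len(cfg.tokenlist), 'tokens,', len(cfg.functions), 'functions')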
Example #46
0
def parse_xml(file_name):
    events = ("start", "end")
    context = ET.iterparse(file_name, events=events)
    pt(context)
Example #47
0
 def getDataFromExternal(self, date, progress_callback = None):
     xml = self._downloadUrl(self.ontvUrl)
     io = StringIO.StringIO(xml)
     context = ElementTree.iterparse(io)
     return parseXMLTV(context, io, len(xml), None, progress_callback)
Example #48
0
    print("Processing ", sys.argv[1])
    #idx = 0
    #nsmap = {}
    #for event, elem in etree.iterparse(sys.argv[1], events=('start-ns', )):
    #    ns, url = elem
    #    print(ns, url)
    #    nsmap[ns] = url
    #    idx += 1
    #    if idx == 10:
    #        break
    #print(nsmap)

    #idx = 0
    data_dict = {}
    for event, elem in etree.iterparse(sys.argv[1], events=(
            'start',
            'end',
    )):
        tag = elem.tag.split('}')[1]
        if 'start' == event and 'page' == tag:
            data_dict = {}
        elif 'end' == event and 'page' == tag:
            elem.clear()
            #print(data_dict['ns'], '-', data_dict['title'])
            if data_dict['ns'] == '0' and len(data_dict['text']) > 500:
                # Save the article text to its own file
                filepath = sys.argv[2] + '/' + plain_name(data_dict['title'])
                with open(filepath, 'w') as out:
                    out.write(data_dict['text'])
                print("  wrote ", filepath)

        if 'title' == tag and 'end' == event:
            data_dict['title'] = elem.text
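
The page handler above reads data_dict['ns'] and data_dict['text'], but only the 'title' branch survives in this excerpt; the omitted branches presumably resemble the following sketch (the tag names follow the MediaWiki export schema and are an assumption here):

        # Sketch of the branches this excerpt omits (MediaWiki dump tag names,
        # treated as assumptions):
        if 'ns' == tag and 'end' == event:
            data_dict['ns'] = elem.text
        if 'text' == tag and 'end' == event:
            data_dict['text'] = elem.text or ''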
Example #49
0
    def init_cache(self) -> None:

        msg = f"Edit file {self.config_file} with AIXM directory"

        if self.aixm_path is None or self.cache_dir is None:
            raise RuntimeError(msg)

        self.full_dict: Dict[str, Any] = {}
        self.all_points: Dict[str, Point] = {}

        assert self.aixm_path.is_dir()
        self.ns: Dict[str, str] = dict()

        cache_file = self.cache_dir / "aixm.pkl"
        if cache_file.exists():
            with cache_file.open("rb") as fh:
                try:
                    elts = pickle.load(fh)
                    self.full_dict = elts[0]
                    self.all_points = elts[1]
                    self.tree = elts[2]
                    self.ns = elts[3]

                    self.initialized = True
                    return
                except Exception:
                    logging.warning("aixm files: rebuilding cache file")

        for filename in [
            "AirportHeliport.BASELINE",
            "Airspace.BASELINE",
            "DesignatedPoint.BASELINE",
            "Navaid.BASELINE",
            "StandardInstrumentArrival.BASELINE",
        ]:

            if not (self.aixm_path / filename).exists():
                zippath = zipfile.ZipFile(
                    self.aixm_path.joinpath(f"{filename}.zip").as_posix()
                )
                zippath.extractall(self.aixm_path.as_posix())

        # The versions for namespaces may be incremented and make everything
        # fail just for that reason!
        for _, (key, value) in ElementTree.iterparse(
            (self.aixm_path / "Airspace.BASELINE").as_posix(),
            events=["start-ns"],
        ):
            self.ns[key] = value

        self.tree = ElementTree.parse(
            (self.aixm_path / "Airspace.BASELINE").as_posix()
        )

        for airspace in self.tree.findall(
            "adrmsg:hasMember/aixm:Airspace", self.ns
        ):

            identifier = airspace.find("gml:identifier", self.ns)
            assert identifier is not None
            assert identifier.text is not None
            self.full_dict[identifier.text] = airspace

        points = ElementTree.parse(
            (self.aixm_path / "DesignatedPoint.BASELINE").as_posix()
        )

        for point in points.findall(
            "adrmsg:hasMember/aixm:DesignatedPoint", self.ns
        ):

            identifier = point.find("gml:identifier", self.ns)
            assert identifier is not None
            assert identifier.text is not None

            floats = point.find(
                "aixm:timeSlice/aixm:DesignatedPointTimeSlice/"
                "aixm:location/aixm:Point/gml:pos",
                self.ns,
            )
            assert floats is not None
            assert floats.text is not None

            designator = point.find(
                "aixm:timeSlice/aixm:DesignatedPointTimeSlice/aixm:designator",
                self.ns,
            )
            type_ = point.find(
                "aixm:timeSlice/aixm:DesignatedPointTimeSlice/aixm:type",
                self.ns,
            )

            name = designator.text if designator is not None else None
            type_str = type_.text if type_ is not None else None

            coords = tuple(float(x) for x in floats.text.split())
            self.all_points[identifier.text] = Point(
                coords[0], coords[1], name, type_str
            )

        points = ElementTree.parse(
            (self.aixm_path / "Navaid.BASELINE").as_posix()
        )

        for point in points.findall("adrmsg:hasMember/aixm:Navaid", self.ns):

            identifier = point.find("gml:identifier", self.ns)
            assert identifier is not None
            assert identifier.text is not None

            floats = point.find(
                "aixm:timeSlice/aixm:NavaidTimeSlice/"
                "aixm:location/aixm:ElevatedPoint/gml:pos",
                self.ns,
            )
            assert floats is not None
            assert floats.text is not None

            designator = point.find(
                "aixm:timeSlice/aixm:NavaidTimeSlice/aixm:designator", self.ns
            )
            type_ = point.find(
                "aixm:timeSlice/aixm:NavaidTimeSlice/aixm:type", self.ns
            )

            name = designator.text if designator is not None else None
            type_str = type_.text if type_ is not None else None

            coords = tuple(float(x) for x in floats.text.split())
            self.all_points[identifier.text] = Point(
                coords[0], coords[1], name, type_str
            )

        with cache_file.open("wb") as fh:
            pickle.dump(
                (self.full_dict, self.all_points, self.tree, self.ns), fh
            )

        self.initialized = True
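
The 'start-ns' loop in init_cache is the standard ElementTree idiom for harvesting namespace prefixes before calling findall with a prefix map; stripped of the surrounding class it amounts to the following sketch, where 'sample.xml' is a placeholder and the prefixes are the AIXM ones used above:

# Minimal illustration of the start-ns idiom used in init_cache.
from xml.etree import ElementTree

ns = {}
for _, (prefix, url) in ElementTree.iterparse('sample.xml', events=['start-ns']):
    ns[prefix] = url

tree = ElementTree.parse('sample.xml')
airspaces = tree.findall('adrmsg:hasMember/aixm:Airspace', ns)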
Example #50
0
		#print to_write
		offtet=mini+' '+str(offset)
		offset+=(len(to_write)+1)
		offtxt.write(offtet+'\n')
		outtxt.write(to_write+'\n');
		flag=0
		for i in range(chunknum):
			if com_read[i]==0:
				flag=1
		if flag==0:
			#pass
			break
	outtxt.close()
	offtxt.close()
	#print ind
for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
    tname = strip_tag_name(elem.tag)

    if event == 'start':
        if tname == 'page':
            title = ''
            hitext = ''
            rtitle = ''
            id = -1
            redirect = ''
            inrevision = False
            ns = 0
            l_hitext=[]
            dicw={}
        elif tname == 'revision':
            # Do not pick up on revision id's
Example #51
0
def main(argv):
    inputfolder = ''
    try:
        opts, args = getopt.getopt(argv, "hi:", ["ifile="])
    except getopt.GetoptError:
        print 'parse.py -i <inputfolder>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'parse.py -i <inputfolder>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfolder = arg

    if inputfolder == '':
        print 'parse.py -i <inputfolder>'
        sys.exit(2)

    if not inputfolder.endswith('/'):
        inputfolder = inputfolder + '/'

    files = []
    for entry in os.listdir(inputfolder):
        if entry.endswith('.xml'):
            files.append(inputfolder + entry)

    db = MySQLdb.connect(host="127.0.0.1",
                         user="******",
                         passwd="",
                         db="patent_research_base")
    cur = db.cursor()
    company_id = ''
    print '\n\n### Running matches ###\n'
    for inputfile in files:
        ignorecount = failcount = successcount = 0
        total_size = os.path.getsize(inputfile)
        file = open(inputfile, 'r')
        for event, elem in etree.iterparse(file):

            if event == 'end' and elem.tag.endswith('record'):
                pubyear = int(elem.getchildren()[8].text.split('/')[0])
                # if (pubyear <= 2012 and pubyear >= 1982):

                pat_id = str(elem.getchildren()[1].text)
                check_query = "SELECT EXISTS(SELECT 1 FROM processed_patents WHERE patent_id='%s')" % (
                    pat_id)
                cur.execute(check_query)
                if cur.fetchone()[0] != 1:
                    patent_id = capturepatid(pat_id)
                    inventors = splitinventors(elem.getchildren()[6],
                                               elem.getchildren()[7])

                    for owner in elem.getchildren()[2]:
                        if owner.text:
                            company_id = owner.text

                    try:
                        citing = int(elem.getchildren()[3].text)
                    except:
                        citing = 0

                    try:
                        cited = int(elem.getchildren()[4].text)
                    except:
                        cited = 0
                    classification = str(elem.getchildren()[5].text)

                    patent = {
                        'PubYear': pubyear,
                        'Pat_ID': pat_id,
                        'Patent_ID': patent_id,
                        'Inventors': inventors,
                        'Company_ID': company_id,
                        'Citing': citing,
                        'Cited': cited,
                        'Classification': classification
                    }

                    result = findpatent(patent, db)
                    if not result:
                        failcount += 1
                    else:
                        successcount += 1

                    update_query = "INSERT INTO processed_patents (patent_id) VALUES('%s')" % (
                        pat_id)
                    cur.execute(update_query)
                    db.commit()
                else:
                    ignorecount += 1
                # else:
                #     ignorecount += 1

                progress = float(file.tell()) / total_size
                sys.stdout.write(
                    '\rFilename: %s - Processed: %s\t| Failed: %d Success: %d Ignored: %d'
                    % (inputfile, "{:.0%}".format(progress), failcount,
                       successcount, ignorecount))
                sys.stdout.flush()

        move_destination = "./processed/" + inputfile.split('/')[-1]
        shutil.move(inputfile, move_destination)
        print ''
Example #52
0
def audit_st_tp(filename):
    problem_street_types = defaultdict(set)
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            expected_street_type(problem_street_types, elem.attrib['v'])
    return problem_street_types
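
audit_st_tp depends on two helpers that are not shown in this example. A plausible sketch of both, following the usual OpenStreetMap street-name auditing pattern; the regex and the expected list are assumptions, not taken from the original code:

import re

# Matches the last word of a street name, e.g. 'St.' in 'Main St.'
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
expected = ['Street', 'Avenue', 'Boulevard', 'Drive', 'Court', 'Place',
            'Square', 'Lane', 'Road', 'Trail', 'Parkway', 'Commons']

def is_street_name(elem):
    # OSM street names live in <tag k="addr:street" v="..."/> elements
    return elem.tag == 'tag' and elem.attrib.get('k') == 'addr:street'

def expected_street_type(problem_street_types, street_name):
    # Collect street names whose last word is not one of the expected types
    m = street_type_re.search(street_name)
    if m and m.group() not in expected:
        problem_street_types[m.group()].add(street_name)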
Example #53
0
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if 'uid' in element.attrib:
            users.add(element.attrib['uid'])
    return users
Example #54
0
    def __init__(self, uniprot, base_map, pdb_to_go, go_prop_map, login,
                 progress):
        self.uniprot = uniprot
        self.uniprot_qid = base_map[uniprot]['qid']
        self.ensp = set()
        self.ncbip = set()
        self.go_terms = set()
        self.login = login
        self.go_prop_map = go_prop_map
        self.entrez = base_map[uniprot]['entrez']['id']
        self.entrez_quid = base_map[uniprot]['entrez']['qid']
        self.res_id = base_map[uniprot]['entrez']['res_id']

        self.label = ''
        self.description = ''
        self.aliases = set()
        self.tax_id = ''
        self.annotation_type = ''

        self.statements = []

        self.res_prefixes = {x.split(':')[0] for x in res_id_to_entrez_qid}

        start = time.time()

        if not os.path.exists('./data/uniprot_raw'):
            os.makedirs('./data/uniprot_raw')

        # TODO: check whether the UniProt XML already exists locally and how old it is
        r = requests.get('http://www.uniprot.org/uniprot/{}.xml'.format(
            self.uniprot))

        # write the raw XML to disk, then reopen it for parsing
        with open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'w') as f:
            f.write(r.text)
        f = open('./data/uniprot_raw/{}.xml'.format(self.uniprot), 'r')

        # check if XML can be properly parsed, log obsolete items for permanent removal.
        try:
            for event, e in Et.iterparse(f, events=('start', 'end')):

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}entry':
                    if 'dataset' in e.attrib:
                        self.annotation_type = e.attrib['dataset']

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}protein':
                    tmp = e.find(
                        './{http://uniprot.org/uniprot}recommendedName/'
                        '{http://uniprot.org/uniprot}fullName')
                    if tmp is not None:
                        self.label = tmp.text
                    elif e.find('./{http://uniprot.org/uniprot}submittedName/'
                                '{http://uniprot.org/uniprot}fullName'
                                ) is not None:
                        self.label = e.find(
                            './{http://uniprot.org/uniprot}submittedName/'
                            '{http://uniprot.org/uniprot}fullName').text

                    for prop in e.findall(
                            './{http://uniprot.org/uniprot}alternativeName/'):
                        self.aliases.add(prop.text)

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}organism':
                    for prop in e.findall(
                            './{http://uniprot.org/uniprot}dbReference'):
                        if prop.attrib['type'] == 'NCBI Taxonomy':
                            self.tax_id = prop.attrib['id']

                # print(e)
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] == 'Ensembl':

                    for prop in e.findall(
                            './{http://uniprot.org/uniprot}property'):
                        if prop.attrib['type'] == 'protein sequence ID':
                            self.ncbip.add(prop.attrib['value'])
                            self.statements.append(
                                PBB_Core.WDString(
                                    value=prop.attrib['value'],
                                    prop_nr='P705',
                                    references=[self.create_reference()]))

                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] == 'RefSeq':
                    self.ncbip.add(e.attrib['id'])
                    self.statements.append(
                        PBB_Core.WDString(value=e.attrib['id'],
                                          prop_nr='P637',
                                          references=[self.create_reference()
                                                      ]))

                # get alternative identifiers for gene to protein mapping
                if event == 'end' and e.tag == '{http://uniprot.org/uniprot}dbReference' \
                        and 'type' in e.attrib and e.attrib['type'] in self.res_prefixes:
                    res_id = e.attrib['id']
                    if res_id in res_id_to_entrez_qid:
                        self.entrez_quid = res_id_to_entrez_qid[res_id][0]

        except Et.ParseError as e:
            print(
                'Error when parsing Uniprot {} XML file, item {} most likely obsolete'
                .format(self.uniprot, self.uniprot_qid))
            PBB_Core.WDItemEngine.log(
                'ERROR',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}'.format(self.uniprot),
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start))
            return

        # get GO annotations from QuickGO
        params = {'format': 'tsv', 'limit': '1000', 'protein': self.uniprot}

        url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation'

        try:
            itrt = iter(
                requests.get(url, params=params).text.strip('\n ').split('\n'))
            next(itrt)  # skip header line

            for line in itrt:
                cols = line.split('\t')
                go_id = cols[6]
                evidence_code = cols[9]
                go_aspect = cols[11][0]

                if self.uniprot not in pdb_to_go:
                    pdb_to_go[self.uniprot] = {
                        'go_terms': list(),
                        'evidence': list(),
                        'pdb': set()
                    }

                pdb_to_go[self.uniprot]['go_terms'].append(go_id)
                pdb_to_go[self.uniprot]['evidence'].append(evidence_code)

                if go_id in go_prop_map:
                    go_prop_map[go_id][
                        'go_class_prop'] = ProteinBot.get_go_class(
                            go_id, go_aspect)
        except requests.HTTPError:
            pass
        except IndexError:
            pass

        # set description according to the annotation the Uniprot entry is coming from
        self.description = self.descr_map[self.tax_id]['en']

        if self.annotation_type == 'TrEMBL':
            self.description += ' (annotated by UniProtKB/TrEMBL {})'.format(
                self.uniprot)
        elif self.annotation_type == 'Swiss-Prot':
            self.description += ' (annotated by UniProtKB/Swiss-Prot {})'.format(
                self.uniprot)

        # assign a GO term a GO subontology/OBO namespace
        if self.uniprot in pdb_to_go:
            for go in set(pdb_to_go[self.uniprot]['go_terms']):
                # check if a GO term is not yet in Wikidata
                # TODO: If a GO term is not in Wikidata, trigger OBO bot to add it
                if go not in go_prop_map:
                    PBB_Core.WDItemEngine.log(
                        'ERROR',
                        '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                        .format(
                            main_data_id='{}'.format(self.uniprot),
                            exception_type='GO term not in Wikidata exception',
                            message=
                            'GO term {} not found in Wikidata, skipping this one'
                            .format(go),
                            wd_id=self.uniprot_qid,
                            duration=time.time() - start))
                    print(
                        'GO term {} not found in Wikidata, skipping this one'.
                        format(go))
                    continue

                # search in the EBI OBO Lookup Service, for the rare case a GO term has not been assigned its class
                if not go_prop_map[go]['go_class_prop']:
                    go_class_prop = ProteinBot.get_go_class(go)
                    if not go_class_prop:
                        continue

                    go_prop_map[go]['go_class_prop'] = go_class_prop
                    print('added class code {} to {}'.format(
                        go_prop_map[go]['go_class_prop'], go))

                # create a set of WD QIDs representing GO evidence code items in WD
                evidence = {
                    self.go_evidence_codes[ev]
                    for count, ev in enumerate(pdb_to_go[self.uniprot]
                                               ['evidence'])
                    if pdb_to_go[self.uniprot]['go_terms'][count] == go
                }

                # iterate though the evidence code set and create a new qualifier for each one
                qualifiers = [
                    PBB_Core.WDItemID(value=ev,
                                      prop_nr='P459',
                                      is_qualifier=True) for ev in evidence
                    if ev
                ]

                # Create Wikidata GO term value
                prop_nr = self.go_prop_map[go]['go_class_prop']
                qid = self.go_prop_map[go]['qid']
                self.statements.append(
                    PBB_Core.WDItemID(value=qid,
                                      prop_nr=prop_nr,
                                      qualifiers=qualifiers,
                                      references=[self.create_reference()]))

            for pdb in pdb_to_go[self.uniprot]['pdb']:
                self.statements.append(
                    PBB_Core.WDString(value=pdb.upper(),
                                      prop_nr='P638',
                                      references=[self.create_reference()]))

        self.statements.append(
            PBB_Core.WDItemID(value='Q8054',
                              prop_nr='P279',
                              references=[self.create_reference()]))

        if self.entrez_quid != '':
            self.statements.append(
                PBB_Core.WDItemID(value=self.entrez_quid,
                                  prop_nr='P702',
                                  references=[self.create_reference()]))

        current_taxonomy_id = self.taxon_map[self.tax_id]
        self.statements.append(
            PBB_Core.WDItemID(value=current_taxonomy_id,
                              prop_nr='P703',
                              references=[self.create_reference()]))
        self.statements.append(
            PBB_Core.WDString(value=self.uniprot,
                              prop_nr='P352',
                              references=[self.create_reference()]))

        # remove all Wikidata properties where no data has been provided, but are handled by the bot
        all_stmnt_props = list(map(lambda x: x.get_prop_nr(), self.statements))
        for pr in ['P680', 'P681', 'P682', 'P705', 'P637', 'P702']:
            if pr not in all_stmnt_props:
                self.statements.append(
                    PBB_Core.WDBaseDataType.delete_statement(prop_nr=pr))

        try:
            new_msg = ''
            if self.uniprot_qid != '':
                wd_item = PBB_Core.WDItemEngine(wd_item_id=self.uniprot_qid,
                                                domain='proteins',
                                                data=self.statements)
            else:
                wd_item = PBB_Core.WDItemEngine(item_name=self.label,
                                                domain='proteins',
                                                data=self.statements)
                new_msg = 'new protein created'

            wd_item.set_label(self.label)
            wd_item.set_description(self.description)
            wd_item.set_aliases(aliases=self.aliases, append=False)

            self.uniprot_qid = wd_item.write(self.login)

            if self.entrez_quid != '':
                encodes = PBB_Core.WDItemID(
                    value=self.uniprot_qid,
                    prop_nr='P688',
                    references=[self.create_reference()])
                gene_item = PBB_Core.WDItemEngine(wd_item_id=self.entrez_quid,
                                                  data=[encodes],
                                                  append_value=['P688'])
                gene_item.write(login)

            progress[self.uniprot] = self.uniprot_qid

            PBB_Core.WDItemEngine.log(
                'INFO',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}'.format(self.uniprot),
                        exception_type='',
                        message='success{}'.format(new_msg),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start))

            # pprint.pprint(wd_item.get_wd_json_representation())
        except Exception as e:
            print(e)

            PBB_Core.WDItemEngine.log(
                'ERROR',
                '{main_data_id}, "{exception_type}", "{message}", {wd_id}, {duration}'
                .format(main_data_id='{}'.format(self.uniprot),
                        exception_type=type(e),
                        message=e.__str__(),
                        wd_id=self.uniprot_qid,
                        duration=time.time() - start))
            traceback.print_exc()

        print(self.label)
        print(self.aliases)
        print(self.tax_id)
Example #55
0
def count_rib(filename):
    cont = 0
    for _, element in ET.iterparse(filename):
        if 'user' in element.attrib:
            cont = cont + 1
    return cont
Example #56
0
def main():
    total_size = 0
    root = None
    question, topic = None, None
    with open(all_questions_file_path,
              'w') as qf, open(all_topics_file_path, 'w') as tf:
        for event, elem in etree.iterparse(input_xml_file,
                                           events=('start', 'end')):
            logger.debug("event: {}, elemtag: {}".format(event, elem.tag))

            # keep track of the root element
            if event == 'start' and elem.tag == 'ystfeed':
                root = elem

            # track the data elements needed for the dataset
            if event == 'end' and elem.tag == 'subject':
                question = elem.text
            if event == 'end' and elem.tag == 'maincat':
                topic = elem.text.strip()
                # write data to file
                if is_valid(question, topic):
                    qf.write("{}\n".format(clean_text(question)))
                    tf.write("{}\n".format(topic))
                    total_size += 1

            # when a data instance is completely read, clear root
            if event == 'end' and elem.tag == 'vespaadd':
                root.clear()

    logger.info("{} questions read".format(total_size))

    val_size = int(dev_proportion * total_size)
    test_size = int(test_proportion * total_size)
    logger.info("total_size: {}".format(total_size))
    logger.info("val_size: {}".format(val_size))
    logger.info("test_size: {}".format(test_size))

    logger.info("Creating dataset splits ...")
    indices = list(range(total_size))
    random.shuffle(indices)

    val_indices = set(indices[:val_size])
    test_indices = set(indices[val_size:val_size + test_size])

    with open(all_questions_file_path) as qf, open(all_topics_file_path) as tf, \
            open(val_questions_file_path, 'w') as qfval, open(val_topics_file_path, 'w') as tfval, \
            open(test_questions_file_path, 'w') as qftest, open(test_topics_file_path, 'w') as tftest, \
            open(train_questions_file_path, 'w') as qftrain, open(train_topics_file_path, 'w') as tftrain:

        for index, (question, topic) in enumerate(zip(qf, tf)):
            if index in val_indices:
                qfile = qfval
                tfile = tfval
            elif index in test_indices:
                qfile = qftest
                tfile = tftest
            else:
                qfile = qftrain
                tfile = tftrain

            qfile.write("{}".format(question))
            tfile.write("{}".format(topic))

    logger.info("Finished data preprocessing")
Example #57
0
 def __init__(self, handle):
     self.xml_iter = iter(
         ElementTree.iterparse(handle, events=('start', 'end')))
     self._meta, self._fallback = self._parse_preamble()
Example #58
0
def read_doc_input(inputxml, inputparsed, outputfile):
    '''
    Input:
        the input document XML file and the Stanford CoreNLP parsed output
    Output:
        1. a new XML file of split sentences
        2. a txt file with one sentence per line, for word segmentation in a later step
    '''

    # Read the input XML file and store documents in a dictionary.
    # The key is the text of the document; the value is information about the
    # document, e.g. date and id.
    docdict = {}
    doctexts = []
    output = []

    tree = ET.iterparse(inputxml)

    for event, elem in tree:
        if event == "end" and elem.tag == "Article":
            story = elem

            # Check to make sure all the proper XML attributes are included
            attribute_check = [
                key in story.attrib
                for key in ['date', 'id', 'mongoId', 'sentence', 'source']
            ]
            if not all(attribute_check):
                print('Need to properly format your XML...')
                break

            entry_id = story.attrib['id']
            mongoid = story.attrib['mongoId']
            date = story.attrib['date']
            date = date[0:date.find("T")].replace("-", "")
            sentence = story.attrib['sentence']
            source = story.attrib['source']

            text = story.find('Text').text
            if text is None:
                text = ""
            else:
                text = text.replace('\n', ' ').strip()

            if any(d['id'] == entry_id for d in docdict.values()):
                print(
                    'id must be unique, this article is in document dictionary :'
                    + entry_id)
                break

            docdict[text] = {
                'id': entry_id,
                'date': date,
                'mongoid': mongoid,
                'sentence': sentence,
                'source': source,
                'text': text
            }

            doctexts.append(text)

            elem.clear()

    #read Stanford CoreNLP parsed file
    parsed = open(inputparsed)
    parsedfile = parsed.readlines()
    parsedlines = []
    #for line in parsedfile:
    #if "Sentence #" in line or "[" in line:
    #	continue
    #else:
    #print(line)
    #parsedlines.append(line.replace("\n"," ").strip())
    i = 0
    while i < len(parsedfile):
        line = parsedfile[i]
        if "Sentence #" in line:
            i = i + 1
            continue
        elif not line.startswith('['):
            temp = line
            i = i + 1
            line = parsedfile[i]
            while (not line.startswith('[')):
                temp = temp + line
                i = i + 1
                line = parsedfile[i]
            #print(temp)
            parsedlines.append(temp.replace('\n', ' ').strip())
        i = i + 1

    #match CoreNLP parsed file with input xml file
    sents_dict = {}
    sents = []
    sentidx = 1
    #print(len(doctexts))
    #print(len(parsedlines))
    #raw_input("Press Enter to continue...")

    processed = 0
    for line in parsedlines:
        doc = doctexts[0]
        #print(doc)
        #print(line+"#")
        #print(isinstance(doc,str))
        #print(isinstance(line,str))
        #print(doc.encode('UTF-8').find(line))
        #break
        #'''
        line = line.replace("&gt;", ">").replace("&lt;",
                                                 "<").replace("&amp;", "&")
        if doc.encode('UTF-8').find(line) == -1:
            #print(processed)
            #if processed>=33223:
            #	print(line)
            #	print(doc)
            #raw_input("Press Enter to continue...")
            doctexts.remove(doc)
            sentidx = 1
            doc = doctexts[0]

        if doc.encode('UTF-8').find(line) != -1:
            #print(docdict[doc]['id']+"#"+line)
            key = docdict[doc]['id'] + "#" + line
            sents.append(key)
            output.append(line + "\n")
            sents_dict[key] = {}
            sents_dict[key]['sentence_id'] = str(sentidx)
            sents_dict[key].update(docdict[doc])
            #print(sents_dict[key]['sentence_id']+":"+key)
            sentidx = sentidx + 1

        processed = processed + 1
        #'''

    #print(len(parsedlines))
    #print(len(sents))
    #for sent in sents:
    #print(sent)
    #print(sents_dict.get(sent).get('sentence_id'))
    #print(sents_dict[sent]['sentence_id']+":"+key)

    create_sentence_xml(sents, sents_dict, inputxml + "-sent.xml")

    ofile = open(outputfile, 'w')
    for line in output:
        ofile.write(line)
    ofile.close()
Example #59
0
def iterparse(source, events=('end', ), remove_comments=True, **kw):
    """Thin wrapper around ElementTree.iterparse"""
    return ElementTree.iterparse(source, events, SourceLineParser(), **kw)
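
A hypothetical call to this wrapper; SourceLineParser is assumed to be a project-specific XMLParser subclass defined elsewhere (presumably one that records source positions, as the name suggests), and 'config.xml' is a placeholder path:

# Hypothetical usage of the wrapper defined above.
for event, elem in iterparse('config.xml', events=('start', 'end')):
    print(event, elem.tag)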
Example #60
0
def process_file_with_policy(file_name, policy_id, original_name):
    count = 0
    datadict = {}
    specialdict = {}
    valuedict = {}
    highdict = {}
    lowdict = {}
    newst = ''
    new_key = ''
    policy = Policy.objects.get(id=policy_id)
    operations = Operation.objects.filter(policy=policy)
    for op in operations:
        if op.op_name == 'average':
            datadict[op.signal_name] = 0
        elif op.op_name == 'exceeds':
            specialdict[op.signal_name] = 0
            valuedict[op.signal_name] = op.cut_off_number
        else:
            new_key = op.signal_name + "-" + op.second_signal_name
            datadict[new_key] = 0
            highdict[op.signal_name] = 0
            lowdict[op.second_signal_name] = 0

    yhigh = 0
    ylow = 0

    for event, elem in ET.iterparse(file_name, events=('start', 'end')):
        if event == 'start':
            if elem.tag == 'frame':
                count += 1
        if event == 'end':
            key = elem.get("key")
            if key is not None:
                # fetch the value once so every branch below sees this frame's value
                value = elem.get("value")
                if key in datadict:
                    datadict[key] += float(value)
                if key in specialdict and float(value) > valuedict[key]:
                    specialdict[key] += 1
                if key in highdict:
                    yhigh = float(value)
                if key in lowdict:
                    ylow = float(value)
                    diff = abs(yhigh - ylow)
                    datadict[new_key] += diff
            elem.clear()

    result_name = original_name + ".xml"
    current_time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    current_time = datetime.strptime(current_time_str,
                                     "%Y-%m-%d %H:%M:%S")
    new_result = Result(
        filename=result_name,
        policy_id=policy_id,
        policy_name=policy.policy_name,
        processed_time=current_time,
        task_id=1,
        status=True)
    new_result.save()
    result = Result.objects.get(filename=result_name,
                                processed_time=current_time_str)
    for k in datadict.keys():
        v = datadict[k]
        ave = v/count
        new_row = Row(
            result=result,
            signal_name=k,
            result_number=ave,
            op_name='average'
        )
        new_row.save()
    for k, v in specialdict.items():
        new_row = Row(
            result=result,
            signal_name=k,
            result_number=v,
            op_name='exceeded (out of {0} frames)'.format(count),
            cut_off_number=valuedict[k]
        )
        new_row.save()
    return result
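
A hypothetical call to process_file_with_policy; the file path, policy id and original name are placeholders, and the function expects a Django environment providing the Policy, Operation, Result and Row models it references:

# Hypothetical invocation; all argument values are placeholders.
result = process_file_with_policy('/tmp/drive_log.xml', policy_id=3,
                                  original_name='drive_log')
print(result.filename, result.status)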