def audit(osmfile, audit_value=''):
    osm_file = open(osmfile, "r")
    if audit_value == 'postal':
        zip_fixed = set()
        for event, elem in ET.iterparse(osm_file, events=("start",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_postal(tag):
                        zip_fixed.add(audit_postalcode(tag.attrib['v']))
        return zip_fixed

    elif audit_value == 'city':
        city_fixed = set()
        for event, elem in ET.iterparse(osm_file, events=("start",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_city(tag):
                        city_fixed.add(audit_city(tag.attrib['v']))
        return city_fixed                        
    
    else:
        street_types = defaultdict(set)
        for event, elem in ET.iterparse(osm_file, events=("start",)):

            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])

        return street_types
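
The predicate helpers this audit() depends on (is_postal, is_city, is_street_name) are elided from the snippet. A minimal sketch of what they might look like, assuming the usual OSM addr:* tag keys; the real implementations may differ:

# Hypothetical helpers assumed by audit() above
def is_postal(tag):
    return tag.attrib['k'] == "addr:postcode"

def is_city(tag):
    return tag.attrib['k'] == "addr:city"

def is_street_name(tag):
    return tag.attrib['k'] == "addr:street"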
Example #2
    def test_getAllNestedElementInformation(self):
        expectedResultPeaks = {'fullName': 'Proteomics Standards Initiative Mass Spectrometry Ontology', 'id': 'MS', 'tagName': '{http://psi.hupo.org/ms/mzml}cv', 
                               'URI': 'http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo'}
        expectedResultMzml = {'fullName': 'Proteomics Standards Initiative Mass Spectrometry Ontology', 'id': 'MS', 'tagName': '{http://psi.hupo.org/ms/mzml}cv', 
                              'URI': 'http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo', 'version':'2.26.0'}
        expectedResultFeatureXML = {'name': 'FeatureFinder', 'tagName': 'software', 'version': '1.8.0'}

        actualResultPeaks = {}
        elementFile = open(testFolder+'peaksMzmlTestfile.peaks.mzML')
        for event, element in cElementTree.iterparse(elementFile):
            actualResultPeaks = elementFunctions.getAllNestedElementInformation(element)
            # only doing one to test, break
            break
        
        actualResultMzml = {}
        elementFile = open(testFolder+'mzml_test_file_1.mzML')
        for event, element in cElementTree.iterparse(elementFile):
            actualResultMzml = elementFunctions.getAllNestedElementInformation(element)
            # only doing one to test, break
            break

        actualResultFeatureXML = {}
        elementFile = open(testFolder+'featurexmlTestFile_1.featureXML')
        for event, element in cElementTree.iterparse(elementFile):
            actualResultFeatureXML = elementFunctions.getAllNestedElementInformation(element)
            # only doing one to test, break
            break
            
        
        self.assertDictEqual(expectedResultPeaks, actualResultPeaks)
        self.assertDictEqual(expectedResultMzml, actualResultMzml)
        self.assertDictEqual(expectedResultFeatureXML, actualResultFeatureXML)
Example #3
def parse_blast_xml(args, cur):
    if args.input:
        for f in args.input:
            con = et.iterparse(f, events=('end', 'start'))
            _parse_blast_xml(args, cur, con)
    else:
        con = et.iterparse(sys.stdin, events=('end', 'start'))
        _parse_blast_xml(args, cur, con)
Example #4
 def __init__(self):
     # json.load (not json.loads) is needed here to read from a file object
     with open('all_level.json') as f:
         self.ATC_dict = json.load(f)
     self.result = []
     for event, drug in ET.iterparse('drugbank.xml'):
         if drug.tag != '{http://www.drugbank.ca}drug':
             continue
         for first_level in drug:
             if first_level.tag == '{http://www.drugbank.ca}calculated-properties':
                 self.result += self.calculated_properties(first_level)
Example #5
def _iterparse(xmlfile):
    """
    Avoid bug in python 3.{2,3}. See http://bugs.python.org/issue9257.

    :param xmlfile: XML file or file-like object
    """
    try:
        return ET.iterparse(xmlfile, events=("start-ns", ))
    except TypeError:
        return ET.iterparse(xmlfile, events=(b"start-ns", ))
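
A quick usage sketch for the workaround above: "start-ns" events yield (prefix, uri) pairs, so the declared namespaces can be collected into a dict (the sample XML is illustrative):

import io
import xml.etree.ElementTree as ET

xml = b'<root xmlns:ms="http://psi.hupo.org/ms/mzml"><ms:cv/></root>'
namespaces = {prefix: uri for _, (prefix, uri) in _iterparse(io.BytesIO(xml))}
print(namespaces)  # {'ms': 'http://psi.hupo.org/ms/mzml'}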
Example #6
def main():
  global args
  options = argparse.ArgumentParser(epilog="Example: \
%(prog)s dmarc-xml-file 1> outfile.log")
  options.add_argument("dmarcfile", help="dmarc file in XML format")
  args = options.parse_args()

  # get an iterable and turn it into an iterator
  meta_fields = get_meta(iter(etree.iterparse(args.dmarcfile, events=("start", "end"))))
  if not meta_fields:
    print >> sys.stderr, "Error: No valid 'policy_published' and 'report_metadata' xml tags found; File: " + args.dmarcfile
    sys.exit(1)

  print_record(iter(etree.iterparse(args.dmarcfile, events=("start", "end"))), meta_fields, args)
Example #7
def do_search(search):
    """
    Given any arbitrary string, return list of possible matching locations.
    """
    import StringIO
    from x84.bbs import echo, getch
    disp_msg(u'SEARChiNG')
    resp = requests.get(u'http://apple.accuweather.com'
                        + u'/adcbin/apple/Apple_find_city.asp',
                        params=(('location', search),))
    locations = list()
    if resp is None:
        disp_notfound()
    elif resp.status_code != 200:
        # todo: logger.error
        echo(u'\r\n' + u'Status Code: %s\r\n\r\n' % (resp.status_code,))
        echo(repr(resp.content))
        echo(u'\r\n\r\n' + 'Press any key')
        getch()
    else:
        # print resp.content
        xml_stream = StringIO.StringIO(resp.content)
        locations = list([dict(elem.attrib.items())
                          for _event, elem in ET.iterparse(xml_stream)
                          if elem.tag == 'location'])
        if 0 == len(locations):
            disp_notfound()
        else:
            disp_found(len(locations))
    return locations
Example #8
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename, events = ("start",)):
        if 'uid' in element.attrib:
            users.add(get_user(element))

    return users
Example #9
 def readPrimary(self):
     # If we have either a local cache of the primary.xml.gz file or if
     # it is already local (nfs or local file system) we calculate its
     # checksum and compare it with the one from repomd. If they are
     # the same we don't need to cache it again and can directly use it.
     if self.repomd.has_key("primary"):
         if not self.repomd["primary"].has_key("location"):
             return 0
         primary = self.repomd["primary"]["location"]
         (csum, destfile) = self.nc.checksum(primary, "sha")
         if self.repomd["primary"].has_key("checksum") and \
                csum == self.repomd["primary"]["checksum"]:
             filename = destfile
         else:
             filename = self.nc.cache(primary, 1)
         if not filename:
             return 0
         try:
             fd = PyGZIP(filename)
             ip = iterparse(fd, events=("start","end"))
             ip = iter(ip)
         except IOError:
             log.error("Couldn't parse primary.xml")
             return 0
         self._parse(ip)
     return 1
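
PyGZIP here is a project-specific wrapper, not a standard module. Assuming it only needs to expose a readable file object over the gzipped XML, a rough standard-library stand-in would be:

import gzip
from xml.etree.ElementTree import iterparse

def open_primary(filename):
    # gzip.open returns a file-like object that iterparse can read incrementally
    fd = gzip.open(filename, "rb")
    return iter(iterparse(fd, events=("start", "end")))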
Example #10
def parse_and_write(xml_file, outfile, fields, tag, n, interval=1):
    # get an iterable
    context = ET.iterparse(xml_file, events=("start", "end"))
    # turn it into an iterator
    context = iter(context)
    # get the root element
    event, root = context.next()
    i = 0
    with open(outfile, 'w') as f:
        for event, row in context:
            if event == "end" and row.tag == tag:
                if i % 100000 == 0:
                    pct = round((i * 1.0 / n) * 100, 1)
                    Printer("Processed {0} records. ~ {1}\% complete.".format(i, pct))
                if interval == 1 or i % interval == 0:
                    if all(map(lambda x: x in row.attrib, fields)):
                        field_data = []
                        for fd in fields:
                            if fd == 'Tags':
                                field_data.extend(parse_tags(row.attrib[fd].encode('ascii', 'ignore')))
                            else:
                                field_data.append(clean(row.attrib[fd].encode('ascii', 'ignore')))
                        text = " ".join(field_data) + "\n"
                        f.write(text)
                i += 1
                root.clear()
                if i >= n:
                    break
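
The context.next()/root.clear() pairing above is the standard iterparse memory trick: capture the root from the first event, then clear it after each processed record so finished elements can be garbage collected. A minimal self-contained Python 3 rendering of the same pattern (tag names are illustrative):

import io
import xml.etree.ElementTree as ET

xml = io.StringIO("<rows><row Id='1'/><row Id='2'/></rows>")
context = iter(ET.iterparse(xml, events=("start", "end")))
_, root = next(context)  # first event is the start of the root element
for event, elem in context:
    if event == "end" and elem.tag == "row":
        print(elem.attrib["Id"])
        root.clear()  # drop references to finished children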
Example #11
    def new_parse(self):
        """Generator using cElementTree iterparse function"""
        if self.filename.endswith('.bz2'):
            import bz2
            source = bz2.BZ2File(self.filename)
        elif self.filename.endswith('.gz'):
            import gzip
            source = gzip.open(self.filename)
        elif self.filename.endswith('.7z'):
            import subprocess
            source = subprocess.Popen('7za e -bd -so %s 2>/dev/null'
                                      % self.filename,
                                      shell=True,
                                      stdout=subprocess.PIPE,
                                      bufsize=65535).stdout
        else:
            # assume it's an uncompressed XML file
            source = open(self.filename)
        context = iterparse(source, events=("start", "end", "start-ns"))
        self.root = None

        for event, elem in context:
            if event == "start-ns" and elem[0] == "":
                self.uri = elem[1]
                continue
            if event == "start" and self.root is None:
                self.root = elem
                continue
            for rev in self._parse(event, elem):
                yield rev
Example #12
def parseOsm(source, handler):
  for event, elem in ElementTree.iterparse(source, events=('start', 'end')):
    if event == 'start':
      handler.startElement(elem.tag, elem.attrib)
    elif event == 'end':
      handler.endElement(elem.tag)
    elem.clear()
Example #13
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
#        print element.tag
        keys = key_type(element, keys)

    return keys
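
key_type() is not shown here; in the usual version of this OSM exercise it buckets each tag's 'k' attribute with three regexes. A hedged reconstruction:

import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    # only 'tag' elements carry a 'k' attribute worth classifying
    if element.tag == "tag":
        k = element.attrib['k']
        if lower.search(k):
            keys['lower'] += 1
        elif lower_colon.search(k):
            keys['lower_colon'] += 1
        elif problemchars.search(k):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys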
Example #14
def process_map(filename):
    users = set()
    for _, el in ET.iterparse(filename):
        if 'user' in el.attrib:
            users.add(el.attrib['user'])

    return users
Example #15
def list_country(filename):
    governorate_set = set()
    for _, element in ET.iterparse(filename):
        for tag in element.iter('tag'):
            if 'governorate' in tag.attrib['k'] and tag.attrib['v'] not in governorate_set:
                governorate_set.add(tag.attrib['v'])
    return governorate_set
Example #16
def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        # audit street names in "nodes" containing "addr:street"
        if elem.tag == "node":
            for tag in elem.iter("tag"):
                if is_street_name(tag) and tag.attrib['v'] != "":
                    street_type, street_name = audit_street_type(
                        tag.attrib['v'])
                    add_street_type(street_types, street_type, street_name)
        # audit street names in "ways" containing "highway"
        elif elem.tag == "way":
            highway = 0
            # check if way matches an included highway type
            for tag in elem.iter("tag"):
                if (tag.attrib['k'] == "highway") and (tag.attrib['v'] 
                in highway_types):
                    highway = 1
            if highway == 1:
                for tag in elem.iter("tag"):
                    if is_name(tag) and tag.attrib['v'] != "":
                        street_type, street_name = audit_street_type(
                            tag.attrib['v'])
                        if street_type is not None:
                            add_street_type(street_types, street_type,
                                            street_name)
                            
    return street_types
Example #17
    def _parse_results(self, stream):
        """Parse results and messages out of *stream*."""
        result = None
        values = None
        try:
            for event, elem in et.iterparse(stream, events=('start', 'end')):
                if elem.tag == 'results' and event == 'start':
                    # The wrapper element is a <results preview="0|1">. We
                    # don't care about it except to tell us whether these
                    # are preview results, or the final results from the
                    # search.
                    is_preview = elem.attrib['preview'] == '1'
                    self.is_preview = is_preview
                if elem.tag == 'result':
                    if event == 'start':
                        result = OrderedDict()
                    elif event == 'end':
                        yield result
                        result = None
                        elem.clear()

                elif elem.tag == 'field' and result is not None:
                    # We need the 'result is not None' check because
                    # 'field' is also the element name in the <meta>
                    # header that gives field order, which is not what we
                    # want at all.
                    if event == 'start':
                        values = []
                    elif event == 'end':
                        field_name = elem.attrib['k'].encode('utf8')
                        if len(values) == 1:
                            result[field_name] = values[0]
                        else:
                            result[field_name] = values
                        # Calling .clear() is necessary to let the
                        # element be garbage collected. Otherwise
                        # arbitrarily large results sets will use
                        # arbitrarily large memory instead of
                        # streaming.
                        elem.clear()

                elif elem.tag in ('text', 'v') and event == 'end':
                    text = "".join(elem.itertext())
                    values.append(text.encode('utf8'))
                    elem.clear()

                elif elem.tag == 'msg':
                    if event == 'start':
                        msg_type = elem.attrib['type']
                    elif event == 'end':
                        text = elem.text if elem.text is not None else ""
                        yield Message(msg_type, text.encode('utf8'))
                        elem.clear()
        except SyntaxError as pe:
            # This is here to handle the same incorrect return from
            # splunk that is described in __init__.
            if 'no element found' in pe.msg:
                return
            else:
                raise
Example #18
def process_osm(file_in):
	with open(file_in) as file:
		for _, element in ET.iterparse(file):
			el = shape_data(element)
			if el:
				#pprint.pprint(el)
				way_node_collection.insert(el)
Example #19
    def parseFile(self, f):
        """ Parses a single Doxygen XML file
            :param f: XML file path
        """
        documentable_members = 0
        documented_members = 0

        # Wrap everything in a try, as sometimes Doxygen XML is malformed
        try:
            for event, elem in ET.iterparse(f):
                if event == 'end' and elem.tag == 'compounddef':
                    if self.elemIsPublicClass(elem):
                        # store documentation status
                        members, documented, undocumented, bindable, has_brief_description, found_version_added = self.parseClassElem(elem)
                        documentable_members += members
                        documented_members += documented
                        class_name = elem.find('compoundname').text
                        acceptable_missing = self.acceptable_missing.get(class_name, [])

                        if not self.hasGroup(class_name) and class_name not in self.acceptable_missing_group:
                            self.classes_missing_group.append(class_name)
                        if class_name not in self.acceptable_missing_brief and not has_brief_description:
                            self.classes_missing_brief.append(class_name)
                        if class_name not in self.acceptable_missing_added_note and not found_version_added:
                            self.classes_missing_version_added.append(class_name)

                        # GEN LIST
                        # if len(undocumented) > 0:
                        #     print('"%s": [%s],' % (class_name, ", ".join(['"%s"' % e.replace('"', '\\"') for e in undocumented])))

                        unacceptable_undocumented = undocumented - set(acceptable_missing)

                        # do a case insensitive check too
                        unacceptable_undocumented_insensitive = set([u.lower() for u in undocumented]) - set([u.lower() for u in acceptable_missing])

                        if len(unacceptable_undocumented_insensitive) > 0:
                            self.undocumented_members[class_name] = {}
                            self.undocumented_members[class_name]['documented'] = documented
                            self.undocumented_members[class_name]['members'] = members
                            self.undocumented_members[class_name]['missing_members'] = unacceptable_undocumented

                        # store bindable members
                        if self.classElemIsBindable(elem):
                            for m in bindable:
                                self.bindable_members.append(m)

                    elem.clear()
        except ET.ParseError as e:
            # sometimes Doxygen generates malformed xml (e.g., for < and > operators)
            line_num, col = e.position
            with open(f, 'r') as xml_file:
                for i, l in enumerate(xml_file):
                    if i == line_num - 1:
                        line = l
                        break
            caret = '{:=>{}}'.format('^', col)
            print(('ParseError in {}\n{}\n{}\n{}'.format(f, e, line, caret)))

        self.documentable_members += documentable_members
        self.documented_members += documented_members
Example #20
def parse(stream):
    for event, element in et.iterparse(stream):
        if element.tag != 'row':
            continue
        yield {
            x.get('name'): x.text.strip() if x.text else None for x in element
        }
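
A usage sketch for the generator above, assuming rows whose children carry name attributes (the sample data is invented for illustration):

import io
import xml.etree.ElementTree as et

sample = io.StringIO(
    "<export>"
    "<row><field name='id'>1</field><field name='title'> First </field></row>"
    "<row><field name='id'>2</field><field name='title'/></row>"
    "</export>")
for record in parse(sample):
    print(record)  # {'id': '1', 'title': 'First'} then {'id': '2', 'title': None}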
Example #21
def count_tags(filename):
    # YOUR CODE HERE
    d = defaultdict(int)
    for events, elem in ET.iterparse(filename):
        d[elem.tag] += 1

    return d
Example #22
def parse_mediawiki_xml(xml_fo, dump_func):
    """
    Process mediawiki xml dump of wiktionary.
    """
    # create tag names in advance (setting default xmlns doesn't help)
    page_tag = tag("page")
    text_tag = tag("text")
    title_tag = tag("title")

    # using event based eltree parser
    itree = ET.iterparse(xml_fo, events=("start","end"))

    word_name = None
    etymology_list = None

    for event, elem in itree:
        # reset data for new word entry
        if event == "start":
            if elem.tag == page_tag:
                word_name = None
            continue
        # get data for current word entry (event == end)
        if elem.tag == title_tag:
            if not elem.text.startswith("Wiktionary:"):
                word_name = elem.text
        elif word_name is not None:
            if elem.tag == text_tag:
                etymology_list = get_etymology(elem.text)
            elif elem.tag == page_tag:
                # all data for current word entry has been processed
                dump_func(word_name, etymology_list)
Example #23
def examine_tags(osmfile, tag_range, item_limit):
    assert len(tag_range) == 2
    # use pre-loaded tag_keys list of tuples, if exists
    if TAG_KEYS:
        tag_keys = TAG_KEYS
    # else call mapparser count_tags method to pull sorted list of top tags
    else:
        _, tag_keys = count_tags(osmfile)
    # list comprehension for producing a list of tag_keys in string format
    tag_keys = [tag_key[0] for tag_key in tag_keys][tag_range[0]:tag_range[1]]
    print "Examining tag keys: {}".format(tag_keys)

    # open osm file
    osm_file = open(osmfile, "r")

    # initialize data with default set data structure
    data = defaultdict(set)

    # iterate through elements
    for _, elem in ET.iterparse(osm_file, events=("start",)):
        # check if the element is a node or way
        if elem.tag == "node" or elem.tag == "way":
            # iterate through children matching `tag`
            for tag in elem.iter("tag"):
                # skip if does not contain key-value pair
                if 'k' not in tag.attrib or 'v' not in tag.attrib:
                    continue
                key = tag.get('k')
                val = tag.get('v')
                # add to set if in tag keys of interest and is below the item limit
                if key in tag_keys and len(data[key]) < item_limit:
                    data[key].add(val)
    return data
Example #24
def parse(xmlfile, element_names, element_attrs={}, attr_conversions={}):
    """
    Parses the given element_names from xmlfile and yield compound objects for
    their xml subtrees (no extra objects are returned if element_names appear in
    the subtree) The compound objects provide all element attributes of
    the root of the subtree as attributes unless attr_names are supplied. In this
    case attr_names maps element names to a list of attributes which are
    supplied. If attr_conversions is not empty it must map attribute names to
    callables which will be called upon the attribute value before storing under
    the attribute name. 
    The compound objects gives dictionary style access to list of compound
    objects o for any children with the given element name 
    o['child_element_name'] = [osub0, osub1, ...]
    As a shorthand, attribute style access to the list of child elements is
    provided unless an attribute with the same name as the child elements
    exists (i.e. o.child_element_name = [osub0, osub1, ...])
    @Note: All elements with the same name must have the same type regardless of
    the subtree in which they occur
    @Note: Attribute names may be modified to avoid name clashes
    with python keywords.
    @Note: The element_names may be either a single string or a list of strings.
    @Example: parse('plain.edg.xml', ['edge'])
    """
    if isinstance(element_names, str):
        element_names = [element_names]
    elementTypes = {}
    for event, parsenode in ET.iterparse(xmlfile):
        if parsenode.tag in element_names:
            yield _get_compound_object(parsenode, elementTypes,
                                       parsenode.tag, element_attrs, attr_conversions)
            parsenode.clear()
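
Following the docstring's own example, a hedged usage sketch; the edge file content is invented, and the attribute access assumes _get_compound_object exposes XML attributes as described above:

import io

edg = io.StringIO("<edges><edge id='e1'/><edge id='e2'/></edges>")
for edge in parse(edg, ['edge']):
    print(edge.id)  # prints e1, then e2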
Example #25
    def elements(self):
        
        if not self.parser:
            reader = self.stream.reader
            class f(object):
                def read(self, n):
                    if reader.buffer.remaining == 0:
                        #read more data into buffer
                        reader._read_more()
                    return reader.buffer.read_bytes(min(n, reader.buffer.remaining))

            self.parser = iter(iterparse(f(), events=("start", "end")))
            event, self.root = self.parser.next()

        # level must be initialised on every call, not only when the parser
        # is first created; otherwise later calls raise a NameError
        level = 0

        for event, element in self.parser:
            if event == 'start':
                level += 1
            elif event == 'end':
                level -= 1
                if level == 0:
                    yield element
                #TODO clear root
            else:
                assert False, "unexpected event"
Example #26
def iterparse(text, interested_path_handlers):
    '''
    interested_path_handlers => {'start': ((interested_path, handler), (interested_path, handler), ...),
                                 'end':((interested_path, handler), (interested_path, handler), ...)}
    interested_path => (tag1, tag2, tag3, ...)
    An incremental XML parser. ElementTree.findall has too high CPU/Memory footprint when data set is big
    '''
    strf = StringIO()
    strf.write(text)
    strf.seek(0)

    context = ElementTree.iterparse(strf, events=('start', 'end'))
    context = iter(context)
    all_start_handlers = interested_path_handlers.get('start', ())
    all_end_handlers = interested_path_handlers.get('end', ())
    current_path = []

    for ev, elem in context:
        tag, value = elem.tag, elem.text
        if ev == 'start':
            current_path.append(tag)
            if all_start_handlers:
                _do_handlers(ev, elem, current_path, all_start_handlers)
        elif ev == 'end':
            if all_end_handlers:
                _do_handlers(ev, elem, current_path, all_end_handlers)
            current_path.pop()
            elem.clear()
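
_do_handlers is not shown. A minimal sketch, assuming an interested_path matches when it equals the tail of the element path being tracked:

def _do_handlers(ev, elem, current_path, handlers):
    # Hypothetical reconstruction; the real matching rule may differ.
    for interested_path, handler in handlers:
        n = len(interested_path)
        if n and tuple(current_path[-n:]) == tuple(interested_path):
            handler(elem)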
Example #27
def count_tags(filename, limit=-1, verbose=False):
    """
    Parses the OSM file and counts the tags by type.
    """
    # initialize dict objects and counter
    tag_count = {}
    tag_keys = {}
    counter = 0

    # iterate through elements
    for _, element in ET.iterparse(filename, events=("start",)):
        # add to tag count
        add_tag(element.tag, tag_count)

        # if tag and has key, add the tag key to tag_keys dict
        if element.tag == 'tag' and 'k' in element.attrib:
            add_tag(element.get('k'), tag_keys)

        # print if verbose output enabled
        if verbose:
            print "{0}: {1}".format(counter, element.tag)

        # break if exceed limit
        if limit > 0 and counter >= limit:
            break
        counter += 1

    # produces a sorted-by-decreasing list of tag key-count pairs
    tag_keys = sorted(tag_keys.items(), key=operator.itemgetter(1))[::-1]

    # return values
    return tag_count, tag_keys
Example #28
 def _xml_namespaces(self):
     for _, e in iterparse(self._path, events=('start-ns',)):
         lcode, uri = e
         if 1 > Namespace.objects.filter(resource__name=uri).count():
             r = Resource(name=uri)
             yield r
             yield Namespace(code=lcode, resource=r)
Example #29
def iterArticles(f):
    pmid = None
    title = None
    abstract = None
    journal = None
    mesh_list = []
    for event, elem in ET.iterparse(f, events=("start", "end")):
        if event == 'start':
            if elem.tag == 'PubmedArticle':
                pmid, title, abstract, journal, mesh_list = None, None, None, None, []
        elif event == 'end':
            if elem.tag == 'PubmedArticle':
                yield pmid, title, abstract, journal, mesh_list
            elif elem.tag == 'PMID':
                pmid = elem.text
            elif elem.tag == 'ArticleTitle':
                title = elem.text
            elif elem.tag == 'AbstractText':
                abstract = elem.text
            elif elem.tag == 'Title':
                journal = elem.text
            elif elem.tag == 'KeywordList':
                keyword_list = elem.findall("Keyword")
                for aa in keyword_list:
                    mesh_list.append(aa.text)
            elif elem.tag == 'MeshHeadingList':
                mhlist = elem.findall("MeshHeading")
                for child in mhlist:
                    if child.findtext('DescriptorName'):
                        mesh_list.append(child.findtext('DescriptorName'))
                    if child.findtext('QualifierName'):
                        mesh_list.append(child.findtext('QualifierName'))
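
A runnable usage sketch with a minimal PubMed-like document (the structure is flattened for brevity; real PubMed XML nests these tags much more deeply):

import io

sample = io.StringIO(
    "<PubmedArticleSet><PubmedArticle>"
    "<PMID>12345</PMID>"
    "<ArticleTitle>An example title</ArticleTitle>"
    "<AbstractText>An example abstract.</AbstractText>"
    "<Title>An Example Journal</Title>"
    "</PubmedArticle></PubmedArticleSet>")
for pmid, title, abstract, journal, mesh in iterArticles(sample):
    print(pmid, title, journal)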
Example #30
    def _get_annotations(self, source, start_offset=0):
        """It returns the annotations found in the document.

        It follows the following format:
           [
            ('TAG', {ATTRIBUTES}, (start_offset, end_offset)),
            ('TAG', {ATTRIBUTES}, (start_offset, end_offset)),
            ...
            ('TAG', {ATTRIBUTES}, (start_offset, end_offset))
           ]

        """
        annotations = []
        for event, element in etree.iterparse(
                StringIO(source), events=('start', 'end')):
            if event == 'start':
                if element.tag in self.tags_to_spot:
                    try:
                        end_offset = start_offset + len(element.text)
                    except TypeError:
                        continue
                    annotations.append((element.tag, element.attrib,
                                        (start_offset, end_offset)))
                start_offset += len(element.text)
            elif event == 'end':
                if element.text is not None and element.tail is not None:
                    start_offset += len(element.tail)
        return annotations
Example #31
def convert(blastxml_filename, output_handle):
    blast_program = None
    # get an iterable
    try:
        context = ElementTree.iterparse(blastxml_filename, events=("start", "end"))
    except Exception:
        sys.exit("Invalid data format.")
    # turn it into an iterator
    context = iter(context)
    # get the root element
    try:
        event, root = context.next()
    except Exception:
        sys.exit("Invalid data format.")
    for event, elem in context:
        if event == "end" and elem.tag == "BlastOutput_program":
            blast_program = elem.text
        # for every <Iteration> tag
        if event == "end" and elem.tag == "Iteration":
            # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
            # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
            # <Iteration_query-def>Endoplasmic reticulum resident protein 44
            # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
            # <Iteration_query-len>406</Iteration_query-len>
            # <Iteration_hits></Iteration_hits>
            #
            # Or, from BLAST 2.2.24+ run online
            # <Iteration_query-ID>Query_1</Iteration_query-ID>
            # <Iteration_query-def>Sample</Iteration_query-def>
            # <Iteration_query-len>516</Iteration_query-len>
            # <Iteration_hits>...
            qseqid = elem.findtext("Iteration_query-ID")
            if re_default_query_id.match(qseqid):
                # Place holder ID, take the first word of the query definition
                qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0]
            qlen = int(elem.findtext("Iteration_query-len"))

            # for every <Hit> within <Iteration>
            for hit in elem.findall("Iteration_hits/Hit"):
                # Expecting either this,
                # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
                # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
                # <Hit_accession>P56514</Hit_accession>
                # or,
                # <Hit_id>Subject_1</Hit_id>
                # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
                # <Hit_accession>Subject_1</Hit_accession>
                #
                # apparently depending on the parse_deflines switch
                #
                # Or, with a local database not using -parse_seqids can get this,
                # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
                # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis
                # thaliana chromosome 3, complete sequence</Hit_def>
                # <Hit_accession>2</Hit_accession>
                sseqid = hit.findtext("Hit_id").split(None, 1)[0]
                hit_def = sseqid + " " + hit.findtext("Hit_def")
                if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"):
                    # Place holder ID, take the first word of the subject definition
                    hit_def = hit.findtext("Hit_def")
                    sseqid = hit_def.split(None, 1)[0]
                if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
                    # Alternative place holder ID, again take the first word of hit_def
                    hit_def = hit.findtext("Hit_def")
                    sseqid = hit_def.split(None, 1)[0]
                # for every <Hsp> within <Hit>
                for hsp in hit.findall("Hit_hsps/Hsp"):
                    nident = hsp.findtext("Hsp_identity")
                    length = hsp.findtext("Hsp_align-len")
                    # As of NCBI BLAST+ 2.4.0 this is given to 3dp (not 2dp)
                    pident = "%0.3f" % (100 * float(nident) / float(length))

                    q_seq = hsp.findtext("Hsp_qseq")
                    h_seq = hsp.findtext("Hsp_hseq")
                    m_seq = hsp.findtext("Hsp_midline")
                    assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
                    gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 +
                                  len(h_seq.replace('-', ' ').split()) - 1)

                    mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-')
                    # TODO - Remove this alternative mismatch calculation and test
                    # once satisfied there are no problems
                    expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq)
                                                         if q == h or q == "-" or h == "-")
                    xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X")
                    if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
                        sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"
                                 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
                                    int(mismatch), expected_mismatch))

                    # TODO - Remove this alternative identity calculation and test
                    # once satisfied there are no problems
                    expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h)
                    if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
                        sys.exit("%s vs %s identities, expected %i <= %i <= %i"
                                 % (qseqid, sseqid, expected_identity, int(nident),
                                    expected_identity + q_seq.count("X")))

                    evalue = hsp.findtext("Hsp_evalue")
                    if evalue == "0":
                        evalue = "0.0"
                    else:
                        evalue = "%0.0e" % float(evalue)

                    bitscore = float(hsp.findtext("Hsp_bit-score"))
                    if bitscore < 100:
                        # Seems to show one decimal place for lower scores
                        bitscore = "%0.1f" % bitscore
                    else:
                        # Note BLAST does not round to nearest int, it truncates
                        bitscore = "%i" % bitscore

                    values = [qseqid,
                              sseqid,
                              pident,
                              length,  # hsp.findtext("Hsp_align-len")
                              str(mismatch),
                              gapopen,
                              hsp.findtext("Hsp_query-from"),  # qstart,
                              hsp.findtext("Hsp_query-to"),  # qend,
                              hsp.findtext("Hsp_hit-from"),  # sstart,
                              hsp.findtext("Hsp_hit-to"),  # send,
                              evalue,  # hsp.findtext("Hsp_evalue") in scientific notation
                              bitscore,  # hsp.findtext("Hsp_bit-score") rounded
                              ]

                    if extended:
                        try:
                            sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >"))
                            salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >"))
                        except IndexError as e:
                            sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
                        # print hit_def, "-->", sallseqid
                        positive = hsp.findtext("Hsp_positive")
                        ppos = "%0.2f" % (100 * float(positive) / float(length))
                        qframe = hsp.findtext("Hsp_query-frame")
                        sframe = hsp.findtext("Hsp_hit-frame")
                        if blast_program == "blastp":
                            # Probably a bug in BLASTP that they use 0 or 1 depending on format
                            if qframe == "0":
                                qframe = "1"
                            if sframe == "0":
                                sframe = "1"
                        slen = int(hit.findtext("Hit_len"))
                        values.extend([sallseqid,
                                       hsp.findtext("Hsp_score"),  # score,
                                       nident,
                                       positive,
                                       hsp.findtext("Hsp_gaps"),  # gaps,
                                       ppos,
                                       qframe,
                                       sframe,
                                       # NOTE - for blastp, XML shows original seq, tabular uses XXX masking
                                       q_seq,
                                       h_seq,
                                       str(qlen),
                                       str(slen),
                                       salltitles,
                                       ])
                    if cols:
                        # Only a subset of the columns are needed
                        values = [values[colnames.index(c)] for c in cols]
                    # print "\t".join(values)
                    output_handle.write("\t".join(values) + "\n")
            # prevents ElementTree from growing large datastructure
            root.clear()
            elem.clear()

Example #32
#        elif ep[0].text == "Monoisotopic Weight":
#            pass
#        else:
#            pass

    return result

with open("json_dict.txt") as f:
    atc_dict = f.read()
d = json.loads(atc_dict)
count = 0
flag = 0
output_file = open('output', 'ab')
for event, drug in ET.iterparse('drugbank.xml'):
    if drug.tag != '{http://www.drugbank.ca}drug':
        continue
    if not drug.get('type'):
        continue

    result = [0] * 5
    result.append([0.11, 0.01, 0, 0.99, 0.5])
    if drug.get('type') == 'small molecule':
        result[0] = [1]
    else:
        result[0] = [0]
    items = 0
    for first_level in drug:
        flag = 0
Example #33
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 19 20:51:11 2017

@author: Burky
"""

import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict

filename = 'cleveland_ohio.osm'

zip_codes = defaultdict(int)
for event, elem in ET.iterparse(filename):
    if elem.tag == 'way':
        for tag in elem.findall('tag'):
            if tag.attrib['k'] == 'addr:postcode':
                zip_codes[tag.attrib['v']] += 1
pprint.pprint(zip_codes)
Example #34
import xml.etree.cElementTree as cElementTree
import csv

folder = 'D:/Will/GIS Data/Raw OSM Data/EP/'
raw_osm_file = folder + 'arizona_8-12-2016.osm'
outfile = folder + 'parking_lots.csv'

writer = csv.writer(open(outfile, 'wb'))
writer.writerow(['OSMID'])

context = cElementTree.iterparse(raw_osm_file, events=("start", "end"))
context = iter(context)
event, root = next(context)

for event, elem in context:
    if event == 'end' and elem.tag == 'way':
        is_parking = False
        iterator = iter(elem)
        for child in iterator:
            if child.get('k') == 'service' and (
                    child.get('v') == 'parking_aisle'
                    or child.get('v') == 'parking'):
                is_parking = True
            if child.get('k') == 'amenity' and child.get('v') == 'parking':
                is_parking = True

        if is_parking:
            writer.writerow([elem.get('id')])
    root.clear()
Example #35
def audit():
    for event, elem in ET.iterparse(filename):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
    print_sorted_dict(street_types)
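
This audit() leans on module-level state and helpers that are elided. In the usual version of this exercise the missing pieces look roughly like the sketch below (the file name and regex are assumptions, not the author's code):

import re
from collections import defaultdict

filename = "map.osm"  # hypothetical input file
street_types = defaultdict(int)
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

def is_street_name(elem):
    return elem.attrib.get('k') == "addr:street"

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_types[m.group()] += 1

def print_sorted_dict(d):
    for key in sorted(d.keys(), key=lambda s: s.lower()):
        print(key, d[key])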
Example #36
    def parse(self, file_obj):
        nodes = []
        ways = []
        print('----------------')
        context = iter(iterparse(file_obj, events=('start', 'end')))
        event, root = next(context)
        
        for (event, elem) in context:
            name = elem.tag
            attrs = elem.attrib
            
            if 'start' == event:
                """Parse the XML element at the start"""
                if name == 'node':
                    record = self.fillDefault(attrs)
                    loc = [float(attrs['lon']),
                           float(attrs['lat'])]
                    record['loc'] = loc
                    record['geometry'] = {'type':'Point', 'coordinates': loc}
                elif name == 'tag':
                    k = attrs['k']
                    v = attrs['v']
                    # MongoDB doesn't let us have dots in the key names.
                    #k = k.replace('.', ',,')
                    record['tg'].append((k, v))
                    record['ky'].append(k)
                elif name == 'way':
                    # Insert remaining nodes
                    if len(nodes) > 0:
                        self.client.osm2.nodes.insert_many(nodes)
                        nodes = []

                    record = self.fillDefault(attrs)
                    record['nd'] = []
                elif name == 'relation':
                    # Insert remaining ways
                    if len(ways) > 0:
                        self.client.osm2.ways.insert_many(ways)
                        ways = []

                    record = self.fillDefault(attrs)
                    record['mm'] = []
                elif name == 'nd':
                    ref = int(attrs['ref'])
                    record['nd'].append(ref)
                elif name == 'member':
                    # parse ref here; previously the stale 'ref' from the
                    # last <nd> element was reused in the lookups below
                    ref = int(attrs['ref'])
                    record['mm'].append(dict(type=attrs['type'],
                                             ref=ref,
                                             role=attrs['role']))

                    if attrs['type'] == 'way':
                        ways2relations = self.client.osm2.ways.find_one({ '_id' : ref})
                        if ways2relations:
                            if 'relations' not in ways2relations:
                                ways2relations['relations'] = []
                            ways2relations['relations'].append(record['_id'])
                            self.client.osm2.ways.save(ways2relations)
                    elif attrs['type'] == 'node':
                        nodes2relations = self.client.osm2.nodes.find_one({ '_id' : ref})
                        if nodes2relations:
                            if 'relations' not in nodes2relations:
                                nodes2relations['relations'] = []
                            nodes2relations['relations'].append(record['_id'])
                            self.client.osm2.nodes.save(nodes2relations)
            elif 'end' == event:
                """Finish parsing an element
                (only really used with nodes, ways and relations)"""
                if name == 'node':
                    if len(record['tg']) == 0:
                        del record['tg']
                    if len(record['ky']) == 0:
                        del record['ky']
                    nodes.append(record)
                    if len(nodes) > 2500:
                        self.client.osm2.nodes.insert_many(nodes)
                        nodes = []
                        self.writeStatsToScreen()

                    record = {}
                    self.stat_nodes = self.stat_nodes + 1
                elif name == 'way':
                    if len(record['tg']) == 0:
                        del record['tg']
                    if len(record['ky']) == 0:
                        del record['ky']
                    nds = dict((rec['_id'], rec) for rec in self.client.osm2.nodes.find({ '_id': { '$in': record['nd'] } }, { 'loc': 1, '_id': 1 }))
                    record['loc'] = []
                    record['geometry'] = dict()
                    locs = []
                    for node in record['nd']:
                        if node in nds:
                            record['loc'].append(nds[node]['loc'])
                            locs.append(nds[node]['loc'])
                        else:
                            print('node not found: '+ str(node))
                    record['geometry'] = {'type':'LineString', 'coordinates': locs}

                    ways.append(record)
                    if len(ways) > 2000:
                        self.client.osm2.ways.insert_many(ways)
                        ways = []

                    record = {}
                    self.statsCount = self.statsCount + 1
                    if self.statsCount > 1000:
                        self.writeStatsToScreen()
                        self.statsCount = 0
                    self.stat_ways = self.stat_ways + 1
                elif name == 'relation':
                    if len(record['tg']) == 0:
                        del record['tg']
                    if len(record['ky']) == 0:
                        del record['ky']
                    self.client.osm2.relations.save(record)
                    record = {}
                    self.statsCount = self.statsCount + 1
                    if self.statsCount > 10:
                        self.writeStatsToScreen()
                        self.statsCount = 0
                    self.stat_relations = self.stat_relations + 1
            elem.clear()
            root.clear()
Example #37
# invoked with: 7za e -so eowiki-20091128-pages-meta-history.xml.7z | python stubmetahistory-fetch-celementtree.py eo

lang = 'es'  # language to be analysed
if len(sys.argv) >= 2:
    lang = sys.argv[1]

rawtotalrevisions = 0.0
site = wikipedia.Site(lang, 'wikipedia')
data = site.getUrl("/wiki/Special:Statistics?action=raw")
rawtotalrevisions += float(data.split("edits=")[1].split(";")[0])

source = sys.stdin
outputfile = "/mnt/user-store/dump/%swiki-fetched.txt.bz" % (lang)
g = bz2.BZ2File(outputfile, "w")

context = iterparse(source, events=("start", "end"))
context = iter(context)

r_newlines = re.compile(ur"(?im)[\n\r\t\s]")
r_redirect = re.compile(
    ur"(?i)^\s*#\s*(REDIRECCIÓN|REDIRECT)\s*\[\[[^\]]+?\]\]")
r_disambig = re.compile(
    ur"(?i)\{\{\s*(d[ei]sambig|desambiguaci[oó]n|des|desamb)\s*[\|\}]")
r_links = re.compile(ur"\[\[\s*[^\]]+?\s*[\]\|]")
r_categories = re.compile(
    ur"(?i)\[\[\s*(category|categoría)\s*\:\s*[^\]\|]+\s*[\]\|]")
r_sections = re.compile(ur"(?im)^(\=+)[^\=]+?\1")
r_templates = ""
r_interwikis = re.compile(ur"(?i)\[\[\s*[a-z]{2,3}(-[a-z]{2,3})?\s*\:")
r_externallinks = re.compile(ur"://")
r_bold = ""
Example #38
def audit(osmfile, options=None):
    '''
    Audits the OSM file using the different audit functions defined herein.
    
    osmfile: str. Filepath to the OSM file being audited
    options: list of str. Dictates what types of audits are run. Allowed options values:
                        'counting'
                        'zips'
                        'county/state counting'
                        'county/state reporting'
                        'lat/long'
                        'amenities'
                        'property types'
                        'property type counts'
    
    '''

    with open(osmfile, "rb") as fileIn:
        if options:
            #Setting up the necessary beginning parameters for each function
            if 'counting' in options:
                tag_counts = {}
            if 'zips' in options:
                zipLength = 5
                zipLengthDict = {zipLength: 0, "Non-number": 0}
                known_zips = set()
                knownZipTags = set()
                zip_tags_ignored = []
            if 'county/state counting' in options:
                county_tags = {}
                state_tags = {}
                state_tags_ignored = [
                    'state_capital', 'source:hgv:state_network',
                    'hgv:state_network'
                ]
            if 'county/state reporting' in options:
                counties_found = set()
                states_found = set()
                countyKeys = [
                    'gnis:County', 'gnis:County_num', 'gnis:county_id',
                    'gnis:county_name', 'is_in:county', 'tiger:county'
                ]
                stateKeys = [
                    'addr:state', 'gnis:ST_alpha', 'gnis:state_id',
                    'nist:state_fips', 'ST_num'
                ]
            if 'lat/long' in options:
                badNodes = defaultdict(
                    list
                )  #ensures that each new key will automatically have an empty list value
            if 'amenities' in options:
                known_amenities = defaultdict(set)
            if 'property types' in options:
                propTypes = defaultdict(set)
            if 'property type counts' in options:
                propRecords = defaultdict(int)
                allowed_propTypes = {
                    'landuse': [
                        'residential', 'village_green', 'recreation_ground',
                        'allotments', 'commercial', 'depot', 'industrial',
                        'landfill', 'orchard', 'plant_nursery', 'port',
                        'quarry', 'retail'
                    ],
                    'building': [
                        'apartments', 'farm', 'house', 'detached',
                        'residential', 'dormitory', 'houseboat', 'bungalow',
                        'static_caravan', 'cabin', 'hotel', 'commercial',
                        'industrial', 'retail', 'warehouse', 'kiosk',
                        'hospital', 'stadium'
                    ]
                }

            #----------------------------------------------------------------------
            # Iterating through the XML file
            for _, elem in ET.iterparse(fileIn):
                if 'counting' in options:
                    tag_counts = count_tags(elem, tag_counts)

                if 'zips' in options:
                    zipLengthDict, known_zips, knownZipTags = zipCheck(elem, zipLengthDict, known_zips, \
                                                                       knownZipTags, zip_tags_ignored, digits=zipLength)

                if 'county/state counting' in options:
                    county_tags, state_tags = countyStateTypeCounter(
                        elem, county_tags, state_tags, state_tags_ignored)

                if 'county/state reporting' in options:
                    counties_found, states_found = countyStateReporter(
                        elem, countyKeys, stateKeys, counties_found,
                        states_found)

                if 'lat/long' in options:
                    badNodes = lat_long_checker(elem, badNodes)

                if 'amenities' in options:
                    known_amenities = amenityFinder(elem, known_amenities)

                if 'property types' in options:
                    propTypes = propertyType(elem, propTypes)

                if 'property type counts' in options:
                    propRecords = propertyCounter(elem, allowed_propTypes,
                                                  propRecords)

            #----------------------------------------------------------------------
            # Printing everything once done iterating
            if 'counting' in options:
                print("Tags Found")
                pprint.pprint(tag_counts)
            if 'zips' in options:
                print("\nZip Lengths")
                pprint.pprint(zipLengthDict)
                print("\nUnique Zip Codes")
                pprint.pprint(known_zips)
                print("\nZip Code Tag Keys Found")
                pprint.pprint(knownZipTags)
            if 'county/state counting' in options:
                print("\nTypes of County Tags")
                pprint.pprint(county_tags)
                print("\nTypes of State Tags")
                pprint.pprint(state_tags)
            if 'county/state reporting' in options:
                print("\nStates Identified")
                pprint.pprint(states_found)
                print("\nCounties Identified")
                pprint.pprint(counties_found)
            if 'lat/long' in options:
                print("\nNodes with Incorrect Latitudes and/or Longitudes")
                pprint.pprint(badNodes)
            if 'amenities' in options:
                print("\nUnique Amenity and Shop Types Identified")
                pprint.pprint(known_amenities)
            if 'property types' in options:
                print("\nUnique Landuse Types")
                pprint.pprint(propTypes)
            if 'property type counts' in options:
                print("\nCounts of Relevant Landuse Types")
                pprint.pprint(propRecords)
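
A hypothetical invocation, with the file name invented for illustration:

audit('some_city.osm', options=['counting', 'zips', 'lat/long'])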
Example #39
def test_saving_network_with_bonkers_attributes_with_geometry(tmpdir):
    # attributes are assumed to be a nested dictionary of a very specific format. Because the user can
    # do virtually anything to edge attributes, or due to calculation error, this may not be the case. If it's not
    # of the correct format, we don't expect it to get saved to the matsim network.xml
    network = Network('epsg:27700')
    network.add_node('0',
                     attribs={
                         'id': '0',
                         'x': 1,
                         'y': 2,
                         'lat': 1,
                         'lon': 2
                     })
    network.add_node('1',
                     attribs={
                         'id': '1',
                         'x': 2,
                         'y': 2,
                         'lat': 2,
                         'lon': 2
                     })

    link_attribs = {
        'id': '0',
        'from': '0',
        'to': '1',
        'length': 1,
        'freespeed': 1,
        'capacity': 20,
        'permlanes': 1,
        'oneway': '1',
        'modes': ['car'],
        'geometry': LineString([(1, 2), (2, 3), (3, 4)]),
        'attributes': float('nan')
    }

    network.add_link('0', '0', '1', attribs=link_attribs)
    network.write_to_matsim(tmpdir)

    assert_semantically_equal(dict(network.links()), {'0': link_attribs})

    assert_semantically_equal(
        matsim_xml_writer.check_link_attributes(link_attribs), {
            'id': '0',
            'from': '0',
            'to': '1',
            'length': 1,
            'freespeed': 1,
            'capacity': 20,
            'permlanes': 1,
            'oneway': '1',
            'modes': ['car'],
            'geometry': LineString([(1, 2), (2, 3), (3, 4)])
        })

    found_geometry_attrib = False
    for event, elem in ET.iterparse(os.path.join(tmpdir, 'network.xml'),
                                    events=('start', 'end')):
        if event == 'start':
            if elem.tag == 'attribute':
                if elem.attrib['name'] == 'geometry':
                    assert elem.text == '_ibE_seK_ibE_ibE_ibE_ibE'
                    found_geometry_attrib = True
    assert found_geometry_attrib
Example #40
    def _parse_wikipedia(self, maximum_number_of_documents=None):
        assert self.__path.exists(), "Wikipedia data does not exist"

        # Determine size of file
        compressed_size = self.__path.stat().st_size

        # Initialise container for documents
        documents = []

        with open(self.__path, mode="rb") as compressed_file:
            with bz2.BZ2File(compressed_file, mode="rb") as uncompressed_file:

                total_compressed_bytes_read_at_last_batch = 0
                tag_prefix = ""
                namespaces = []
                article_namespace_key = None
                in_page = False

                with tqdm(desc="",
                          total=compressed_size,
                          unit="B",
                          unit_scale=True) as progress_bar:

                    for event_number, (event, element) in enumerate(
                            ElementTree.iterparse(
                                uncompressed_file,
                                events=["start", "end", "start-ns",
                                        "end-ns"])):

                        if event == "start-ns":
                            namespaces.append(element)
                            namespace_id, namespace_uri = element
                            if namespace_id == "":
                                tag_prefix = f"{{{namespace_uri}}}"

                        elif event == "end-ns":
                            namespace = namespaces.pop()
                            namespace_id, namespace_uri = namespace
                            if namespace_id == "":
                                tag_prefix = ""

                        elif event == "start":
                            if element.tag == f"{tag_prefix}page":
                                in_page = True
                                title = None
                                text = None
                                page_namespace_keys = []
                                page_redirect = False

                        elif event == "end":

                            tag = element.tag

                            if tag.startswith(tag_prefix):
                                tag = tag.replace(tag_prefix, "", 1)

                            if tag == "namespace":
                                if element.text is None:
                                    article_namespace_key = element.attrib[
                                        "key"]

                            elif in_page and tag == "title":
                                if not title:
                                    title = element.text
                                else:
                                    progress_bar.write(
                                        "Multiple titles found for article "
                                        f"\"{title}\". First one used.")

                            elif in_page and tag == "text":
                                if not text:
                                    text = element.text
                                else:
                                    progress_bar.write(
                                        "Multiple text sections found for article "
                                        f"\"{title}\". First one used.")

                            elif in_page and tag == "ns":
                                page_namespace_keys.append(element.text)

                            elif in_page and tag == "redirect":
                                page_redirect = True

                            elif in_page and tag == "page":

                                in_page = False

                                if article_namespace_key not in page_namespace_keys \
                                        or page_redirect:
                                    # clear skipped pages too, so they do not accumulate in memory
                                    element.clear()
                                    continue

                                url = self.__page_base_url \
                                    + title.replace(" ", "_")

                                abstract = self._parse_wikipedia_article(
                                    article_text=text,
                                    sections="first paragraph",
                                    include_header_image_captions=False,
                                    include_header_infoboxes=False)

                                fulltext = self._parse_wikipedia_article(
                                    article_text=text,
                                    sections="all",
                                    include_header_image_captions=False,
                                    include_header_infoboxes=False)

                                document = {
                                    "title": title,
                                    "url": url,
                                    "abstract": abstract,
                                    "text": fulltext
                                }

                                documents.append(document)

                            element.clear()

                        if maximum_number_of_documents and \
                                len(documents) >= maximum_number_of_documents:
                            break

                        if event_number % 1000 == 0:
                            total_compressed_bytes_read = \
                                compressed_file.tell()
                            compressed_bytes_read_for_batch = \
                                total_compressed_bytes_read \
                                - total_compressed_bytes_read_at_last_batch
                            total_compressed_bytes_read_at_last_batch = \
                                total_compressed_bytes_read
                            progress_bar.update(
                                compressed_bytes_read_for_batch)

        return documents
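# A self-contained sketch of the "start-ns"/"end-ns" events used above:
# iterparse yields a (prefix, uri) tuple for each namespace scope, from
# which a "{uri}" tag prefix can be built. (Namespace URI is illustrative.)
import io
import xml.etree.ElementTree as ElementTree

sample = b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/"><page/></mediawiki>'
tag_prefix = ""
for event, payload in ElementTree.iterparse(io.BytesIO(sample),
                                            events=["start-ns", "start"]):
    if event == "start-ns":
        prefix, uri = payload  # the default namespace has prefix ""
        if prefix == "":
            tag_prefix = f"{{{uri}}}"
    elif event == "start" and payload.tag == f"{tag_prefix}page":
        print("found a page element")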
Example #41
0
def process_to_csv(file_in):
    """ Main process to clean data and save to CSV"""
    LOGGER.info("Processing elements in %s", file_in)
    nodes = []
    nodes_tags = []
    addresses = []
    ways = []
    ways_nodes = []
    ways_tags = []

    for _, elem in ET.iterparse(file_in):
        # Process the nodes
        if elem.tag == 'node':
            node = {}
            node_id = 0
            if elem.keys():
                for name, value in elem.items():
                    if name == 'id':
                        node_id = value
                    node[name] = value

                # Process any tags
                if len(elem):
                    address = {'id': node_id}
                    for tag in elem.iter('tag'):
                        # Build a separate table for real addresses
                        if 'addr' in tag.attrib['k']:
                            address = add_address(tag, address)
                        else:
                            newtag = {'id': node_id}
                            newtag['key'] = tag.attrib['k'].lower()
                            newtag['value'] = tag.attrib['v']
                            nodes_tags.append(newtag)

                    if len(address) > 1:
                        address = audit_address(address)
                        addresses.append(address)

                nodes.append(node)

        # Process ways
        elif elem.tag == 'way':
            position = 0
            way = {}
            way_id = 0
            if elem.keys():
                for name, value in elem.items():
                    if name == 'id':
                        way_id = value
                    way[name] = value

                # Process any Children Found
                if len(elem):
                    # Process Tags
                    for tag in elem.iter('tag'):
                        way_tag = {'id': way_id}
                        way_tag['key'] = tag.attrib['k'].lower()
                        way_tag['value'] = tag.attrib['v']
                        ways_tags.append(way_tag)
                    # Process Node Relations
                    for ndr in elem.iter('nd'):
                        position += 1
                        way_node = {'id': way_id}
                        way_node['node_id'] = ndr.attrib['ref']
                        way_node['position'] = position
                        ways_nodes.append(way_node)

            ways.append(way)

    write_csv(nodes, 'output/nodes.csv', NODES_FIELDS)
    write_csv(nodes_tags, 'output/nodes_tags.csv', TAGS_FIELDS)
    write_csv(addresses, 'output/node_addresses.csv', ADDRESS_FIELDS)
    write_csv(ways, 'output/ways.csv', WAYS_FIELDS)
    write_csv(ways_tags, 'output/ways_tags.csv', TAGS_FIELDS)
    write_csv(ways_nodes, 'output/ways_nodes.csv', WAYS_NODES_FIELDS)
    return
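# write_csv() and the *_FIELDS constants are defined elsewhere in this
# module; a minimal sketch of what write_csv is assumed to do, using
# csv.DictWriter and ignoring keys outside the given field list:
import csv

def write_csv(rows, path, fieldnames):
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)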
Example #42
0
def processPMCFile(source):
    # Skip to the article element in the file
    for event, elem in etree.iterparse(source,
                                       events=('start', 'end', 'start-ns',
                                               'end-ns')):
        if (event == 'end' and elem.tag == 'article'):
            pmidText, pmcidText, doiText, pubYear, pubMonth, pubDay, journal, journalISO = getMetaInfoForPMCArticle(
                elem)

            # We're going to process the main article along with any subarticles
            # And if any of the subarticles have distinguishing IDs (e.g. PMID), then
            # that'll be used, otherwise the parent article IDs will be used
            subarticles = [elem] + elem.findall('./sub-article')

            for articleElem in subarticles:
                if articleElem == elem:
                    # This is the main parent article. Just use its IDs
                    subPmidText, subPmcidText, subDoiText, subPubYear, subPubMonth, subPubDay, subJournal, subJournalISO = pmidText, pmcidText, doiText, pubYear, pubMonth, pubDay, journal, journalISO
                else:
                    # Check if this subarticle has any distinguishing IDs and use them instead
                    subPmidText, subPmcidText, subDoiText, subPubYear, subPubMonth, subPubDay, subJournal, subJournalISO = getMetaInfoForPMCArticle(
                        articleElem)
                    if subPmidText == '' and subPmcidText == '' and subDoiText == '':
                        subPmidText, subPmcidText, subDoiText = pmidText, pmcidText, doiText
                    if subPubYear is None:
                        subPubYear = pubYear
                        subPubMonth = pubMonth
                        subPubDay = pubDay
                    if subJournal is None:
                        subJournal = journal
                        subJournalISO = journalISO

                # Extract the title of paper
                title = articleElem.findall(
                    './front/article-meta/title-group/article-title'
                ) + articleElem.findall(
                    './front-stub/title-group/article-title')
                assert len(title) <= 1
                titleText = extractTextFromElemList(title)
                titleText = [
                    removeWeirdBracketsFromOldTitles(t) for t in titleText
                ]

                # Get the subtitle (if it's there)
                subtitle = articleElem.findall(
                    './front/article-meta/title-group/subtitle'
                ) + articleElem.findall('./front-stub/title-group/subtitle')
                subtitleText = extractTextFromElemList(subtitle)
                subtitleText = [
                    removeWeirdBracketsFromOldTitles(t) for t in subtitleText
                ]

                # Extract the abstract from the paper
                abstract = articleElem.findall(
                    './front/article-meta/abstract') + articleElem.findall(
                        './front-stub/abstract')
                abstractText = extractTextFromElemList(abstract)

                # Extract the full text from the paper as well as supplementaries and floating blocks of text
                articleText = extractTextFromElemList(
                    articleElem.findall('./body'))
                backText = extractTextFromElemList(
                    articleElem.findall('./back'))
                floatingText = extractTextFromElemList(
                    articleElem.findall('./floats-group'))

                document = {
                    'pmid': subPmidText,
                    'pmcid': subPmcidText,
                    'doi': subDoiText,
                    'pubYear': subPubYear,
                    'pubMonth': subPubMonth,
                    'pubDay': subPubDay,
                    'journal': subJournal,
                    'journalISO': subJournalISO
                }

                textSources = {}
                textSources['title'] = titleText
                textSources['subtitle'] = subtitleText
                textSources['abstract'] = abstractText
                textSources['article'] = articleText
                textSources['back'] = backText
                textSources['floating'] = floatingText

                for k in textSources.keys():
                    tmp = textSources[k]
                    tmp = [t for t in tmp if len(t) > 0]
                    tmp = [html.unescape(t) for t in tmp]
                    tmp = [removeBracketsWithoutWords(t) for t in tmp]
                    textSources[k] = tmp

                document['textSources'] = textSources
                yield document

            # Less important here (compared to abstracts) as each article file is not too big
            elem.clear()
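# Usage sketch: processPMCFile is a generator, so a full-text corpus can
# be streamed to JSON lines without holding it in memory. (The input file
# name below is illustrative.)
import json

with open('article.nxml', 'rb') as source, open('docs.jsonl', 'w') as out:
    for doc in processPMCFile(source):
        out.write(json.dumps(doc) + '\n')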
Example #43
0
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if 'user' in element.attrib:
            users.add(get_user(element))
    return users
Example #44
0
    def parseFile(self, f):
        """ Parses a single Doxygen XML file
            :param f: XML file path
        """
        documentable_members = 0
        documented_members = 0

        # Wrap everything in a try, as sometimes Doxygen XML is malformed
        try:
            for event, elem in ET.iterparse(f):
                if event == 'end' and elem.tag == 'compounddef':
                    if self.elemIsPublicClass(elem):
                        # store documentation status
                        members, documented, undocumented, bindable, has_brief_description, found_version_added = self.parseClassElem(
                            elem)
                        documentable_members += members
                        documented_members += documented
                        class_name = elem.find('compoundname').text
                        acceptable_missing = self.acceptable_missing.get(
                            class_name, [])

                        if not self.hasGroup(class_name) \
                                and class_name not in self.acceptable_missing_group:
                            self.classes_missing_group.append(class_name)
                        if class_name not in self.acceptable_missing_brief and not has_brief_description:
                            self.classes_missing_brief.append(class_name)
                        if class_name not in self.acceptable_missing_added_note and not found_version_added:
                            self.classes_missing_version_added.append(
                                class_name)

                        # GEN LIST
                        # if len(undocumented) > 0:
                        #     print('"%s": [%s],' % (class_name, ", ".join(['"%s"' % e.replace('"', '\\"') for e in undocumented])))

                        unacceptable_undocumented = undocumented - set(
                            acceptable_missing)

                        # do a case insensitive check too
                        unacceptable_undocumented_insensitive = set([
                            u.lower() for u in undocumented
                        ]) - set([u.lower() for u in acceptable_missing])

                        if len(unacceptable_undocumented_insensitive) > 0:
                            self.undocumented_members[class_name] = {}
                            self.undocumented_members[class_name][
                                'documented'] = documented
                            self.undocumented_members[class_name][
                                'members'] = members
                            self.undocumented_members[class_name][
                                'missing_members'] = unacceptable_undocumented

                        # store bindable members
                        if self.classElemIsBindable(elem):
                            for m in bindable:
                                self.bindable_members.append(m)

                    elem.clear()
        except ET.ParseError as e:
            # sometimes Doxygen generates malformed xml (eg for < and > operators)
            line_num, col = e.position
            with open(f, 'r') as xml_file:
                for i, l in enumerate(xml_file):
                    if i == line_num - 1:
                        line = l
                        break
            caret = '{:=>{}}'.format('^', col)
            print(('ParseError in {}\n{}\n{}\n{}'.format(f, e, line, caret)))

        self.documentable_members += documentable_members
        self.documented_members += documented_members
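# Self-contained illustration of the ParseError handling above:
# e.position is a (line, column) tuple, and '{:=>{}}'.format('^', col)
# right-aligns a caret at that column, padded with '=' characters.
import xml.etree.ElementTree as ET

try:
    ET.fromstring('<root><bad></root>')
except ET.ParseError as e:
    line_num, col = e.position
    print('error at line {}, column {}'.format(line_num, col))
    print('{:=>{}}'.format('^', col))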
Example #45
0
# first data import
import xml.etree.cElementTree as ET
from consts import *


file_name = r'C:\Users\Meital\Desktop\Posts.xml'
questions_cnt = 0
answers_cnt = 0

if __name__ == "__main__":
    DB = client.sof_new
    POSTS_DB = DB.posts
    #POSTS_DB.drop()
    POSTS_DB.ensure_index("Id", unique=True)

    for event, elem in ET.iterparse(file_name, events=("start", "end")):
        if event == 'start':
            post_type = elem.get('PostTypeId')
            if post_type == '1':
                title = elem.get('Title')
                tags = elem.get('Tags')
                post_id = elem.get('Id')
                if POSTS_DB.find({"Id": post_id}).count() == 0:
                    rep_id = POSTS_DB.insert(elem.attrib, w=0)
                    questions_cnt += 1
                else:
                    print 'id ' + post_id + ' already exists'
            elem.clear()

    print 'inserted:' + str(questions_cnt)
Example #46
0
# Reconstructed from the surviving body below (the function name and loop
# guard are assumed): yields the tag names out of a Stack Overflow "Tags"
# string such as "<python><xml>".
def parse_tags(str):
    index = 0
    while index < len(str):
        if str[index] == '<':
            try:
                end_tag = str[index:].index('>')
                yield str[(index + 1):(index + end_tag)]
                index += end_tag + 1
            except ValueError:
                raise Exception("Tag parsing error in \"%s\"" % str)
        else:
            raise Exception("Tag parsing error in \"%s\"" % str)


if len(sys.argv) != 2:
    raise Exception("Usage: %s so-files-directory" % sys.argv[0])

os.chdir(sys.argv[1])

filename = "Posts.xml"
posts = ElementTree.iterparse(filename)
tags = {}
tag_id = 1
print "COPY posts (id, type, creation, score, viewcount, title, body, userid, lastactivity, tags, answercount, commentcount) FROM stdin;"

for event, post in posts:
    if event == "end" and post.tag == "row":
        id = int(post.attrib["Id"])

        if post.attrib.has_key("PostTypeId"):
            type = int(post.attrib["PostTypeId"])
        else:
            type = "\N"

        creation = post.attrib["CreationDate"]
Example #47
0
def processArticleFiles(filelist, outFile, processFunction):
    if not isinstance(filelist, list):
        filelist = [filelist]

    # Go through the list of filenames and open each one
    for filename in filelist:
        with open(filename, 'r') as openfile:

            # Skip to the article element in the file
            for event, elem in etree.iterparse(openfile,
                                               events=('start', 'end',
                                                       'start-ns', 'end-ns')):
                if (event == 'end' and elem.tag == 'article'):

                    pmidText, pmcidText, doiText, pubYear = getMetaInfoForPMCArticle(
                        elem)

                    # We're going to process the main article along with any subarticles
                    # And if any of the subarticles have distinguishing IDs (e.g. PMID), then
                    # that'll be used, otherwise the parent article IDs will be used
                    subarticles = [elem] + elem.findall('./sub-article')

                    for articleElem in subarticles:
                        if articleElem == elem:
                            # This is the main parent article. Just use its IDs
                            subPmidText, subPmcidText, subDoiText, subPubYear = pmidText, pmcidText, doiText, pubYear
                        else:
                            # Check if this subarticle has any distinguishing IDs and use them instead
                            subPmidText, subPmcidText, subDoiText, subPubYear = getMetaInfoForPMCArticle(
                                articleElem)
                            if subPmidText == '' and subPmcidText == '' and subDoiText == '':
                                subPmidText, subPmcidText, subDoiText = pmidText, pmcidText, doiText
                            if subPubYear == '':
                                subPubYear = pubYear

                        # Information about the source of this text
                        textSourceInfo = {
                            'pmid': subPmidText,
                            'pmcid': subPmcidText,
                            'doi': subDoiText,
                            'pubYear': subPubYear
                        }

                        # Extract the title of paper
                        title = articleElem.findall(
                            './front/article-meta/title-group/article-title'
                        ) + articleElem.findall(
                            './front-stub/title-group/article-title')
                        assert len(title) <= 1
                        titleText = extractTextFromElemList(title)
                        titleText = [
                            removeWeirdBracketsFromOldTitles(t)
                            for t in titleText
                        ]

                        # Get the subtitle (if it's there)
                        subtitle = articleElem.findall(
                            './front/article-meta/title-group/subtitle'
                        ) + articleElem.findall(
                            './front-stub/title-group/subtitle')
                        subtitleText = extractTextFromElemList(subtitle)
                        subtitleText = [
                            removeWeirdBracketsFromOldTitles(t)
                            for t in subtitleText
                        ]

                        # Extract the abstract from the paper
                        abstract = articleElem.findall(
                            './front/article-meta/abstract'
                        ) + articleElem.findall('./front-stub/abstract')
                        abstractText = extractTextFromElemList(abstract)

                        # Extract the full text from the paper as well as supplementaries and floating blocks of text
                        articleText = extractTextFromElemList(
                            articleElem.findall('./body'))
                        backText = extractTextFromElemList(
                            articleElem.findall('./back'))
                        floatingText = extractTextFromElemList(
                            articleElem.findall('./floats-group'))

                        # Combine all the text we want to process
                        allText = titleText + subtitleText + abstractText + articleText + backText + floatingText
                        allText = [t for t in allText if len(t) > 0]
                        allText = [htmlUnescape(t) for t in allText]
                        allText = [
                            removeBracketsWithoutWords(t) for t in allText
                        ]

                        # Get the co-occurrences using a single list
                        processFunction(outFile, allText, textSourceInfo)

                    # Less important here (compared to abstracts) as each article file is not too big
                    elem.clear()
Example #48
0
def audit(osm_file):
    audit_results = {}
    field_types = {'node': {}, 'node_tags': {}, 'way': {}, 'way_tags': {}, 'way_nodes': {}}
    field_validity = {'node': {}, 'node_tags': {}, 'way': {}, 'way_tags': {}, 'way_nodes': {}}

    name_en_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
    expected = ['Road', 'Street', 'Expressway', 'Bridge', 'Highway', 'River', 'Lake', "Hutong",
                'Park', 'Zone', 'Area', 'Alley', 'Market', 'Campus', 'Gate', 'Hall', 'Engineering',
                'China', 'Elegance', 'Avenue', 'Mansion', 'Square', 'Palace', 'Hotel', 'Rail',
                'Quarter', "Building", "Line", "Apartment", "Airport", "Institute", "College"]

    # for a given tag, update the set of its field types based on current element
    def update_field_types(e, tag):
        for field in FIELDS[tag]:
            if field not in field_types[tag]:
                field_types[tag][field] = set()
            field_types[tag][field].add(detect_type(e.attrib[field]))

    # validate the timestamp field
    def validate_timestamp(e, tag):
        timestamp = parse_datetime(e.attrib['timestamp'])
        if timestamp is not None:
            if timestamp < field_validity[tag]['timestamp'][0]:
                field_validity[tag]['timestamp'][0] = timestamp
            if timestamp > field_validity[tag]['timestamp'][1]:
                field_validity[tag]['timestamp'][1] = timestamp

    # validate that latitude/longitude and timestamp values are in range
    def validate_node(e, tag):
        if field_validity[tag] == {}:
            field_validity[tag] = {
                'lat': [90, -90],  # [running min, running max]
                'lon': [180, -180],
                'timestamp': [datetime.now(), datetime(1970, 1, 1, 0, 0, 0)]
            }

        lat = parse_float(e.attrib['lat'])
        if lat is not None:
            if lat < field_validity[tag]['lat'][0]:
                field_validity[tag]['lat'][0] = lat
            if lat > field_validity[tag]['lat'][1]:
                field_validity[tag]['lat'][1] = lat

        lon = parse_float(e.attrib['lon'])
        if lon is not None:
            if lon < field_validity[tag]['lon'][0]:
                field_validity[tag]['lon'][0] = lon
            if lon > field_validity[tag]['lon'][1]:
                field_validity[tag]['lon'][1] = lon
        validate_timestamp(e, tag)

    # validate that a way's timestamp is in range
    def validate_way(e, tag):
        if field_validity[tag] == {}:
            field_validity[tag] = {
                'timestamp': [datetime.now(), datetime(1970, 1, 1, 0, 0, 0)]
            }
        validate_timestamp(e, tag)

    # validate that the postcode format is correct
    def validate_postcode(e, tag):
        if e.attrib['k'] == 'addr:postcode':
            postcode = e.attrib['v']
            if not check_postcode(postcode):
                field_validity[tag]['postcode'].add(postcode)


    def validate_node_tags(e, tag):
        if field_validity[tag] == {}:
            field_validity[tag] = {
                'postcode': set()
            }
        validate_postcode(e, tag)

    # validate name:en values
    def validate_way_name_en(e, tag):
        if e.attrib['k'] == 'name:en':
            name_en = e.attrib['v']
            m = name_en_re.search(name_en)
            if m:
                way_type = m.group()
                if way_type not in expected:
                    field_validity[tag]['name_en'][way_type].add(name_en)
    
    # validate a way's 'name_en' and 'postcode'
    def validate_way_tags(e, tag):
        if field_validity[tag] == {}:
            field_validity[tag] = {
                'name_en': defaultdict(set),
                'postcode': set()
            }
        validate_way_name_en(e, tag)
        validate_postcode(e, tag)


    for _, ele in ET.iterparse(osm_file):
        if ele.tag == 'node':
            update_field_types(ele, 'node')
            validate_node(ele, 'node')

            for e_tag in ele.iter('tag'):
                update_field_types(e_tag, 'node_tags')
                validate_node_tags(e_tag, 'node_tags')

        if ele.tag == 'way':
            update_field_types(ele, 'way')
            validate_way(ele, 'way')

            for e_tag in ele.iter('tag'):
                update_field_types(e_tag, 'way_tags')
                validate_way_tags(e_tag, 'way_tags')

            for e_nd in ele.iter('nd'):
                update_field_types(e_nd, 'way_nodes')


    audit_results['field_types'] = field_types
    audit_results['field_validity'] = field_validity

    return audit_results
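# Usage sketch: the audit returns two nested dicts, so pprint gives a
# quick overview of inferred field types and any out-of-range values.
# (The OSM file name is illustrative.)
import pprint

results = audit('beijing_china.osm')
pprint.pprint(results['field_types'])
pprint.pprint(results['field_validity'])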
Example #49
0
def process_medline_file(
    source: Union[str, TextIO],
    tag_handlers: Dict[str,
                       TagHandlerFunction] = {}) -> Iterable[MedlineArticle]:
    """
    Args:
        source: path to the MEDLINE xml file
    """
    for event, elem in etree.iterparse(source,
                                       events=("start", "end", "start-ns",
                                               "end-ns")):
        if event == "end" and elem.tag == "PubmedArticle":  # MedlineCitation'):
            # Try to extract the pmid_id
            pmid_field = elem.find("./MedlineCitation/PMID")
            assert pmid_field is not None
            pmid = pmid_field.text

            journal_year, journal_month, journal_day = get_journal_date_for_medline_file(
                elem, pmid)
            entry_year, entry_month, entry_day = get_pubmed_entry_date(
                elem, pmid)

            jComparison = tuple(
                9999 if d is None else d
                for d in [journal_year, journal_month, journal_day])
            eComparison = tuple(9999 if d is None else d
                                for d in [entry_year, entry_month, entry_day])
            if (
                    jComparison < eComparison
            ):  # The PubMed entry has been delayed for some reason so let's try the journal data
                pub_year, pub_month, pub_day = journal_year, journal_month, journal_day
            else:
                pub_year, pub_month, pub_day = entry_year, entry_month, entry_day

            # Extract the authors
            author_elems = elem.findall(
                "./MedlineCitation/Article/AuthorList/Author")
            authors = []
            for author_elem in author_elems:
                forename = author_elem.find("./ForeName")
                lastname = author_elem.find("./LastName")
                collectivename = author_elem.find("./CollectiveName")

                name = None
                if (forename is not None and lastname is not None
                        and forename.text is not None
                        and lastname.text is not None):
                    name = "%s %s" % (forename.text, lastname.text)
                elif lastname is not None and lastname.text is not None:
                    name = lastname.text
                elif forename is not None and forename.text is not None:
                    name = forename.text
                elif collectivename is not None and collectivename.text is not None:
                    name = collectivename.text
                else:
                    raise RuntimeError(
                        "Unable to find authors in Pubmed citation (PMID=%s)" %
                        pmid)
                authors.append(name)

            chemicals = []
            chemical_elems = elem.findall(
                "./MedlineCitation/ChemicalList/Chemical/NameOfSubstance")
            for chemical_elem in chemical_elems:
                chem_id = chemical_elem.attrib["UI"]
                name = chemical_elem.text
                # chemicals.append((chem_id,name))
                chemicals.append("%s|%s" % (chem_id, name))
            chemicals_txt = "\t".join(chemicals)

            mesh_headings = []
            mesh_elems = elem.findall(
                "./MedlineCitation/MeshHeadingList/MeshHeading")
            for mesh_elem in mesh_elems:
                descriptor_elem = mesh_elem.find("./DescriptorName")
                mesh_id = descriptor_elem.attrib["UI"]
                major_topic_yn = descriptor_elem.attrib["MajorTopicYN"]
                name = descriptor_elem.text

                assert "|" not in mesh_id and "~" not in mesh_id, "Found delimiter in %s" % mesh_id
                assert "|" not in major_topic_yn and "~" not in major_topic_yn, (
                    "Found delimiter in %s" % major_topic_yn)
                assert "|" not in name and "~" not in name, "Found delimiter in %s" % name

                mesh_heading = "Descriptor|%s|%s|%s" % (mesh_id,
                                                        major_topic_yn, name)

                qualifier_elems = mesh_elem.findall("./QualifierName")
                for qualifier_elem in qualifier_elems:
                    mesh_id = qualifier_elem.attrib["UI"]
                    major_topic_yn = qualifier_elem.attrib["MajorTopicYN"]
                    name = qualifier_elem.text

                    assert "|" not in mesh_id and "~" not in mesh_id, (
                        "Found delimiter in %s" % mesh_id)
                    assert "|" not in major_topic_yn and "~" not in major_topic_yn, (
                        "Found delimiter in %s" % major_topic_yn)
                    assert "|" not in name and "~" not in name, "Found delimiter in %s" % name

                    mesh_heading += "~Qualifier|%s|%s|%s" % (
                        mesh_id, major_topic_yn, name)

                mesh_headings.append(mesh_heading)
            mesh_headings_txt = "\t".join(mesh_headings)

            supplementary_concepts = []
            concept_elems = elem.findall(
                "./MedlineCitation/SupplMeshList/SupplMeshName")
            for concept_elem in concept_elems:
                concept_id = concept_elem.attrib["UI"]
                concept_type = concept_elem.attrib["Type"]
                concept_name = concept_elem.text
                # supplementary_concepts.append((concept_id,concept_type,concept_name))
                supplementary_concepts.append(
                    "%s|%s|%s" % (concept_id, concept_type, concept_name))
            supplementary_concepts_txt = "\t".join(supplementary_concepts)

            doi_elems = elem.findall(
                "./PubmedData/ArticleIdList/ArticleId[@IdType='doi']")
            dois = [
                doi_elem.text for doi_elem in doi_elems
                if doi_elem.text and doi_regex.match(doi_elem.text)
            ]

            doi = None
            if dois:
                doi = dois[0]  # We'll just use the first DOI provided

            pmc_elems = elem.findall(
                "./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']")
            assert len(pmc_elems
                       ) <= 1, "Found more than one PMCID with PMID: %s" % pmid
            pmcid = None
            if len(pmc_elems) == 1:
                pmcid = pmc_elems[0].text

            pub_type_elems = elem.findall(
                "./MedlineCitation/Article/PublicationTypeList/PublicationType"
            )
            pub_type = [
                e.text for e in pub_type_elems if e.text not in pub_type_skips
            ]
            pub_type_txt = "|".join(pub_type)

            # Extract the title of paper
            title = elem.findall("./MedlineCitation/Article/ArticleTitle")
            title_text = extract_text_chunks(title, tag_handlers=tag_handlers)
            title_text = [
                remove_weird_brackets_from_old_titles(chunk.text)
                for chunk in title_text if chunk.text
            ]
            title_text = [t for t in title_text if len(t) > 0]
            title_text = [html.unescape(t) for t in title_text]
            title_text = [remove_brackets_without_words(t) for t in title_text]

            # Extract the abstract from the paper
            abstract = elem.findall(
                "./MedlineCitation/Article/Abstract/AbstractText")
            abstract_text = extract_text_chunks(abstract,
                                                tag_handlers=tag_handlers)
            abstract_text = [
                chunk.text for chunk in abstract_text if len(chunk.text) > 0
            ]
            abstract_text = [html.unescape(t) for t in abstract_text]
            abstract_text = [
                remove_brackets_without_words(t) for t in abstract_text
            ]

            journal_title_fields = elem.findall(
                "./MedlineCitation/Article/Journal/Title")
            journal_title_iso_fields = elem.findall(
                "./MedlineCitation/Article/Journal/ISOAbbreviation")

            journal_title, journal_iso_title = "", ""
            assert len(journal_title_fields) <= 1, "Error with pmid=%s" % pmid
            assert len(
                journal_title_iso_fields) <= 1, "Error with pmid=%s" % pmid
            if journal_title_fields:
                journal_title = journal_title_fields[0].text
            if journal_title_iso_fields:
                journal_iso_title = journal_title_iso_fields[0].text

            document = {}
            document["pmid"] = pmid
            document["pmcid"] = pmcid
            document["doi"] = doi
            document["pubYear"] = pub_year
            document["pubMonth"] = pub_month
            document["pubDay"] = pub_day
            document["title"] = title_text
            document["abstract"] = abstract_text
            document["journal"] = journal_title
            document["journalISO"] = journal_iso_title
            document["authors"] = authors
            document["chemicals"] = chemicals_txt
            document["meshHeadings"] = mesh_headings_txt
            document["supplementaryMesh"] = supplementary_concepts_txt
            document["publicationTypes"] = pub_type_txt

            yield MedlineArticle(document)

            # Important: clear the current element from memory to keep memory usage low
            elem.clear()
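# Usage sketch: the parser is a generator over PubmedArticle records, so
# an entire baseline file can be processed with constant memory. (The
# file name is illustrative.)
article_count = 0
for article in process_medline_file("pubmed24n0001.xml"):
    article_count += 1
print(article_count, "articles parsed")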
Example #50
0
def audit():
	for event, elem in ET.iterparse(osm_file):
		if is_street_name(elem):
			audit_street_type(street_types, elem.attrib['v'])
	print_sorted_dict(street_types)

if __name__ == '__main__':
	audit()



#mapparser.py

import xml.etree.cElementTree as ET # import the ElementTree library as ET
import pprint

def count_tags(filename):
	elem_dict = dict.fromkeys(('bounds','member','nd','node','osm','relation','tag','way'),0)
	# create a dictionary pre-populated with the expected keys, each counting from 0
	for _, elem in ET.iterparse(filename, events=("start",)): # iterate over the data file
		if elem.tag in elem_dict: # check whether this tag is one of the expected keys
			elem_dict[elem.tag] += 1 # increment its count
		else:
			elem_dict[elem.tag] = 1 # note: elem.tag itself is the key, not a quoted literal
	return elem_dict

def test():

	tags = count_tags('example.osm')
	pprint.pprint(tags)
	assert tags == {'bounds': 1,
	                'member': 3,
	                'nd': 4,
	                'node': 20,
	                'osm': 1,
	                'relation': 1,
	                'tag': 7,
	                'way': 1}

if __name__ =="__main__":
	test()

# Iterating through Ways Tags

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint


# Exercise: Tag Types

import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into your database, you should check the
"k" value for each "<tag>" and see if there are any potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data
model and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with
problematic characters.

Please complete the function 'key_type', such that we have a count of each of
four tag categories in a dictionary:
  "lower", for tags that contain only lowercase letters and are valid,
  "lower_colon", for otherwise valid tags with a colon in their names,
  "problemchars", for tags with problematic characters, and
  "other", for other tags that do not fall into the other three categories.
See the 'process_map' and 'test' functions for examples of the expected format.
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    
    if element.tag == "tag":
        if lower.match(element.attrib['k']):
            keys["lower"] +=1
        elif lower_colon.match(element.attrib['k']):
            keys["lower_colon"] +=1
        elif problemchars.search(element.attrib['k']):
            keys["problemchars"] +=1
        else:
            keys["other"] +=1
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys
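# Quick check of the three regexes on representative "k" values: plain
# lowercase, namespaced lowercase, a key with a problem character (the
# space), and one that falls through to "other".
for k in ['highway', 'addr:street', 'name 1', 'fixme:NOTE']:
    if lower.match(k):
        print(k, '-> lower')
    elif lower_colon.match(k):
        print(k, '-> lower_colon')
    elif problemchars.search(k):
        print(k, '-> problemchars')
    else:
        print(k, '-> other')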

# Exercise: Exploring Users

import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    if 'uid' in element.attrib:
        return element.attrib['uid']


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if get_user(element):
            users.add(get_user(element))

    return users


def test():

    users = process_map('example.osm')
    pprint.pprint(users)
    assert len(users) == 6



if __name__ == "__main__":
    test()



def test():
    # You can use another testfile 'map.osm' to look at your solution
    # Note that the assertion below will be incorrect then.
    # Note as well that the test function here is only used in the Test Run;
    # when you submit, your code will be checked against a different dataset.
    keys = process_map('example.osm')
    pprint.pprint(keys)
    assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1}


if __name__ == "__main__":
    test()
# Helps to identify important tag keys in the sample data.
# Here we will create a dictionary keyed by tag key name, with values counting how many times each key has appeared.
import xml.etree.cElementTree as ET
import pprint
import re

filename = 'sample20.osm'
node_tag_keys = dict()
for _, element in ET.iterparse(filename):
    if element.tag == 'node':
        for e in element:
            if e.attrib['k'] in node_tag_keys:
                node_tag_keys[e.attrib['k']] += 1
            else:
                node_tag_keys[e.attrib['k']] = 1
for nodes in node_tag_keys:
    # if a key has appeared at least 50 times, print its name and count
    if node_tag_keys[nodes] >= 50:
        print('{}->{}'.format(nodes, node_tag_keys[nodes]))
Example #52
0
def process_map(filename):
    check = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for event, element in ET.iterparse(filename, events=("start", )):
        check = key_check(element, check)
    return check
Example #53
0
                kwargs['failfunc'] = failfunc
                kwargs['async'] = True
            try:
                mdpath[repo] = repo._retrieveMD(name, **kwargs)
            except RepoError, e:
                failfunc(e)
        if async:
            grabber.parallel_wait()

        # parse metadata, create DeltaPackage instances
        for repo, cpath in mdpath.items():
            pinfo_repo = pinfo[repo]
            path = repo_gen_decompress(cpath,
                                       'prestodelta.xml',
                                       cached=repo.cache)
            for ev, el in iterparse(path):
                if el.tag != 'newpackage': continue
                name = el.get('name')
                arch = el.get('arch')
                new = name, arch, el.get('epoch'), el.get('version'), el.get(
                    'release')
                index = pinfo_repo.get(new)
                if index is not None:
                    po = pkgs[index]
                    perc = repo.deltarpm_percentage
                    if perc is None:
                        perc = ayum.conf.deltarpm_percentage
                    best = po.size * (perc / 100.0)
                    have = oldrpms.get(repo, {}).get((name, arch), {})
                    for el in el.findall('delta'):
                        size = int(el.find('size').text)
Example #54
0
# Do we have what we need?
if not file_ref or not file_hyp:
    print help_str
    exit(0)

# Totals
# System
t_counts_sys  = Counter({ D:Counter({TP:0,TN:0,FP:0,FN:0,FPN:0}), 
                          C:Counter({TP:0,TN:0,FP:0,FN:0,FPN:0}) })
# Baseline
t_counts_base = Counter({ D:Counter({TP:0,TN:0,FP:0,FN:0,FPN:0}),
                          C:Counter({TP:0,TN:0,FP:0,FN:0,FPN:0}) })

cg = CG.CandidateGenerator()
f_hyp = open(file_hyp,"r")
context = ET.iterparse(file_ref, events=("start", "end"))
context = iter(context)
event, root = context.next()

print_info(file_ref, file_hyp, max_a, max_m, optimise, b, w)

# Show results per sentence?
if per_sent: 
    print "\nSENTENCE RESULTS"
    print_header(per_sent)

# Read gold standard and process each sentence
for event, elem in context:
    if event == "end":
        if elem.tag == "sentence":
            sid = elem.get("id") # Sentence ID
Example #55
0
def xml_parse(xm_file, ifilter, tfilter, nfilter, list):
    """
    Function for parsing XML files created by DNSRecon and apply filters.
    """
    iplist = []
    for event, elem in cElementTree.iterparse(xm_file):
        # Check if it is a record
        if elem.tag == "record":
            # Check that it is a RR Type that has an IP Address
            if "address" in elem.attrib:
                # Check if the IP is in the filter list of IPs to ignore
                if (len(ifilter) == 0 or IPAddress(elem.attrib['address']) in ifilter) and (elem.attrib['address'] != "no_ip"):
                    # Check if the RR Type against the types
                    if re.match(tfilter, elem.attrib['type'], re.I):
                        # Process A, AAAA and PTR Records
                        if re.search(r'PTR|^[A]$|AAAA', elem.attrib['type']) \
                        and re.search(nfilter, elem.attrib['name'], re.I):
                            if list:
                                if elem.attrib['address'] not in iplist:
                                    iplist.append(elem.attrib['address'])
                            else:
                                print_good("{0} {1} {2}".format(elem.attrib['type'], elem.attrib['name'], elem.attrib['address']))

                        # Process NS Records
                        elif re.search(r'NS', elem.attrib['type']) and \
                        re.search(nfilter, elem.attrib['target'], re.I):
                            if list:
                                if elem.attrib['address'] not in iplist:
                                    iplist.append(elem.attrib['address'])
                            else:
                                print_good("{0} {1} {2}".format(elem.attrib['type'], elem.attrib['target'], elem.attrib['address']))

                        # Process SOA Records
                        elif re.search(r'SOA', elem.attrib['type']) and \
                        re.search(nfilter, elem.attrib['mname'], re.I):
                            if list:
                                if elem.attrib['address'] not in iplist:
                                    iplist.append(elem.attrib['address'])
                            else:
                                print_good("{0} {1} {2}".format(elem.attrib['type'], elem.attrib['mname'], elem.attrib['address']))

                        # Process MS Records
                        elif re.search(r'MX', elem.attrib['type']) and \
                        re.search(nfilter, elem.attrib['exchange'], re.I):
                            if list:
                                if elem.attrib['address'] not in iplist:
                                    iplist.append(elem.attrib['address'])
                            else:
                                print_good("{0} {1} {2}".format(elem.attrib['type'], elem.attrib['exchange'], elem.attrib['address']))

                        # Process SRV Records
                        elif re.search(r'SRV', elem.attrib['type']) and \
                        re.search(nfilter, elem.attrib['target'], re.I):
                            if list:
                                if elem.attrib['address'] not in iplist:
                                    iplist.append(elem.attrib['address'])
                            else:
                                print_good("{0} {1} {2} {3}".format(elem.attrib['type'], elem.attrib['name'], elem.attrib['address'], elem.attrib['target'], elem.attrib['port']))
            else:
                if re.match(tfilter, elem.attrib['type'], re.I):
                    # Process TXT and SPF Records
                    if re.search(r'TXT|SPF', elem.attrib['type']):
                        if not list:
                            print_good("{0} {1}".format(elem.attrib['type'], elem.attrib['strings']))
    # Process IPs in list
    if len(iplist) > 0:
        try:
            for ip in filter(None, iplist):
                print_line(ip)
        except IOError:
            sys.exit(0)
Example #56
0
 def parse(self):
     for event, elem in ElementTree.iterparse(self.dump):
         if elem.tag == "artist":
             artist = self.proc_artist(elem)
             self.download_artist(artist)
Example #57
0
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys
Example #58
0
def process_users(filename):
    users = []
    for _, element in ET.iterparse(filename):
        if "user" in element.attrib:
            users.append(element.attrib["user"])
    return users
Example #59
0
def iterative_files_changelog_parser(file_extension, filelists_xml_path,
                                     other_xml_path):
    """
    Iteratively parse filelists.xml and other.xml, to avoid over-use of memory.

    createrepo_c parses everything in bulk, into memory. For large repositories such as
    RHEL 7 or OL 7, this can require more than 5gb of memory. That isn't acceptable, especially
    when many repositories are being synced at once. The main offenders are other.xml (changelogs)
    and filelists.xml (list of owned files). These also happen to be relatively easy to parse.

    This function, ported from Pulp 2, takes a path to filelists.xml and other.xml, creates
    a streaming parser for each, and then yields one package worth of data from each file.
    """
    # it's basically always gzip, but we'll cover our bases with all the possibilities
    if file_extension == "gz":
        open_func = gzip.open
    elif file_extension == "xz":
        open_func = lzma.open
    elif file_extension == "bz2":
        open_func = bz2.open
    elif file_extension == "xml":
        open_func = open
    else:
        raise TypeError("Unknown metadata compression type")
    # TODO: zstd

    with open_func(filelists_xml_path) as filelists_xml, open_func(
            other_xml_path) as other_xml:
        filelists_parser = iterparse(filelists_xml, events=("start", "end"))
        filelists_xml_iterator = iter(filelists_parser)

        other_parser = iterparse(other_xml, events=("start", "end"))
        other_xml_iterator = iter(other_parser)

        # get a hold of the root element so we can clear it
        # this prevents the entire parsed document from building up in memory
        try:
            filelists_root_element = next(filelists_xml_iterator)[1]
            other_root_element = next(other_xml_iterator)[1]
        # I know. This is a terrible misuse of SyntaxError. Don't blame the messenger.
        except SyntaxError:
            log.error("failed to parse XML metadata file")
            raise

        while True:
            for event, filelists_element in filelists_xml_iterator:
                # if we're not at a fully parsed package element, keep going
                if event != "end":
                    continue
                # make this work whether the file has namespace as part of the tag or not
                if not (filelists_element.tag == "package" or re.sub(
                        NS_STRIP_RE, "", filelists_element.tag) == "package"):
                    continue

                break

            for event, other_element in other_xml_iterator:
                # if we're not at a fully parsed package element, keep going
                if event != "end":
                    continue
                # make this work whether the file has namespace as part of the tag or not
                if not (other_element.tag == "package" or re.sub(
                        NS_STRIP_RE, "", other_element.tag) == "package"):
                    continue

                break

            (filelists_pkgid,
             files) = process_filelists_package_element(filelists_element)
            (other_pkgid,
             changelogs) = process_other_package_element(other_element)

            filelists_root_element.clear(
            )  # clear all previously parsed ancestors of the root
            other_root_element.clear()

            assert (
                filelists_pkgid == other_pkgid
            ), "Package id for filelists.xml ({}) and other.xml ({}) do not match".format(
                filelists_pkgid, other_pkgid)

            yield filelists_pkgid, files, changelogs
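# Usage sketch: both metadata files are consumed in lock-step, yielding
# one package's worth of files and changelogs per iteration. (Paths are
# illustrative.)
parser = iterative_files_changelog_parser(
    "gz", "repodata/filelists.xml.gz", "repodata/other.xml.gz")
for pkgid, files, changelogs in parser:
    print(pkgid, len(files), len(changelogs))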
Example #60
0
def audit():
    osm_file = open(OSMFILE, encoding='utf8')
    for event, elem in ET.iterparse(osm_file):
        if is_phone_number(elem):
            audit_phone_numbers(elem.attrib["v"])
    osm_file.close()
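# Sketches of the helpers assumed by audit() above: is_phone_number()
# selects "phone" tags and audit_phone_numbers() flags values that don't
# match a simple international format. (Both are assumptions, not shown.)
import re

PHONE_RE = re.compile(r'^\+\d{1,3}[\s\-]?\d{2,4}[\s\-]?\d{6,8}$')

def is_phone_number(elem):
    return elem.tag == 'tag' and elem.attrib.get('k') == 'phone'

def audit_phone_numbers(value):
    if not PHONE_RE.match(value):
        print('unexpected phone format:', value)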