def ETIteratorFromObj(obj, events=None, parser=None):
    """Iterate over the XML elements reachable from ``obj``.

    ``obj`` can be
    1) a path string that does not end with ``.gz`` -> the file is opened and
       parsed incrementally
    2) a path string that ends with ``.gz`` -> the file is decompressed on the
       fly and parsed incrementally
    3) an open input stream -> the stream is parsed incrementally
    4) an ElementTree or an Element -> the in-memory tree is walked

    For cases 1-3 the generator yields the ``(event, element)`` pairs produced
    by ``ElementTree.iterparse(source, events)``.  For case 4 it yields
    ``("memory", element)`` for the root and each of its descendants, and
    ``events`` is ignored.

    ``parser`` is accepted for interface compatibility but is not used.

    A file opened by this function is closed when the generator is exhausted
    or closed; a stream supplied by the caller is left open.
    """
    try:
        string_types = (str, unicode)
    except NameError:
        # Python 3 has no separate ``unicode`` type.
        string_types = (str,)

    if isinstance(obj, string_types):
        # Binary mode lets the XML parser apply the document's own encoding
        # declaration rather than a platform default.
        if obj.endswith(".gz"):
            fStream = GzipFile(obj, "rb")
        else:
            fStream = open(obj, "rb")
        try:
            for rv in ElementTree.iterparse(fStream, events):
                yield rv
        finally:
            # Runs on exhaustion, on error and on generator.close().
            fStream.close()
    elif isinstance(obj,
                    ElementTree.ElementTree) or ElementTree.iselement(obj):
        if ElementTree.iselement(obj):
            root = obj
        else:
            root = obj.getroot()
        # Element.iter() replaces the deprecated getiterator().
        for element in root.iter():
            yield ("memory", element)
    else:
        #not a string, not a tree, not an element, should be a stream
        #let's parse it
        for rv in ElementTree.iterparse(obj, events):
            yield rv
Exemple #2
0
def ETIteratorFromObj(obj, events=None, parser=None):
    """Yield XML elements from a path, a stream or an in-memory tree.

    ``obj`` can be
    1) a path string that does not end with ``.gz`` -> the file is opened and
       parsed incrementally
    2) a path string that ends with ``.gz`` -> the file is decompressed on the
       fly and parsed incrementally
    3) an open input stream -> the stream is parsed incrementally
    4) an ElementTree or an Element -> the in-memory tree is walked

    Cases 1-3 yield whatever ``ElementTree.iterparse(source, events)``
    produces, i.e. ``(event, element)`` pairs.  Case 4 yields
    ``("memory", element)`` for the root and each descendant and does not
    look at ``events``.

    ``parser`` is part of the signature but is not used.

    Only files opened here are closed (when the generator finishes or is
    closed); caller-supplied streams stay open.
    """
    try:
        text_types = (str, unicode)
    except NameError:
        # No ``unicode`` builtin on Python 3.
        text_types = (str,)

    if isinstance(obj, text_types):
        # Open in binary mode so the XML parser decodes the bytes itself.
        opener = GzipFile if obj.endswith(".gz") else open
        fStream = opener(obj, "rb")
        try:
            for rv in ElementTree.iterparse(fStream, events):
                yield rv
        finally:
            # Also reached when the consumer abandons the generator early.
            fStream.close()
    elif isinstance(obj, ElementTree.ElementTree) or ElementTree.iselement(obj):
        if ElementTree.iselement(obj):
            root = obj
        else:
            root = obj.getroot()
        # Element.iter() replaces the deprecated getiterator().
        for element in root.iter():
            yield ("memory", element)
    else:
        # not a string, not a tree, not an element, should be a stream
        # let's parse it
        for rv in ElementTree.iterparse(obj, events):
            yield rv
Exemple #3
0
 def parse(self):
     """Incrementally parse ``self.stream`` and yield one tuple per result.

     The first pass collects the ``name`` attribute of every ``_VARIABLE``
     element until the ``_HEAD`` element closes.  The second pass fills one
     slot per variable from the ``_URI`` / ``_BNODE`` / ``_LITERAL`` element
     that closes after a ``_BINDING`` opened, and yields the row as a tuple
     each time a ``_RESULT`` element closes.  Slots that were never filled
     stay ``None``.
     """
     names = []
     row = []
     stream = iter(ET.iterparse(self.stream, events=('start', 'end')))

     # Pass 1: variable names, in document order, up to the end of the head.
     for kind, el in stream:
         if kind == 'end' and el.tag == _HEAD:
             break
         if kind == 'start' and el.tag == _VARIABLE:
             names.append(el.get('name'))

     # Pass 2: continue on the same iterator and emit rows as they complete.
     for kind, el in stream:
         tag = el.tag
         if kind == 'start':
             if tag == _BINDING:
                 # Remember which column the upcoming value belongs to.
                 slot = names.index(el.get('name'))
             elif tag == _RESULT:
                 row = [None] * len(names)
             continue
         if kind != 'end':
             continue
         if tag == _URI:
             row[slot] = URIRef(el.text)
         elif tag == _BNODE:
             row[slot] = BNode(el.text)
         elif tag == _LITERAL:
             row[slot] = Literal(el.text or '',
                                 datatype=el.get('datatype'),
                                 lang=el.get(_LANG))
         elif tag == _RESULT:
             # Drop the finished subtree before handing the row out.
             el.clear()
             yield tuple(row)
def iterparse(file, elementName, callback, limit=-1):
    """Stream an XML file and hand each matching element to a callback.

    Wraps ``ElementTree.iterparse`` in its common usage pattern: every
    completed element whose tag equals ``elementName`` is passed to
    ``callback``, after which the root element is cleared.

    Keyword arguments:
    file -- (file) file or file-like object to parse
    elementName -- (string) matching elements are passed to the callback
    callback -- (function) called when parser has parsed an element
                of name elementName
    limit -- (int) stop after reading "limit" elements. If -1, read
             until end of file. This is mostly useful when debugging
             programs that parse large files.
    """
    remaining = limit
    top = None

    for kind, node in ElementTree.iterparse(file, events=("start", "end")):
        if remaining == 0:
            return

        # The element of the first "start" event is the document root.
        if top is None and kind == "start":
            top = node

        if kind != "end" or node.tag != elementName:
            continue

        callback(node)
        # Clearing the root discards everything parsed so far.
        top.clear()
        if remaining != -1:
            remaining -= 1
Exemple #5
0
def iterparse(file, elementName, callback, limit=-1):
    """Run ``callback`` on every ``elementName`` element of an XML stream.

    Convenience wrapper around ``ElementTree.iterparse``.  After each
    matching element has been passed to ``callback`` the root element is
    cleared.

    Keyword arguments:
    file -- (file) file or file-like object to parse
    elementName -- (string) matching elements are passed to the callback
    callback -- (function) called when parser has parsed an element
                of name elementName
    limit -- (int) stop after reading "limit" elements. If -1, read
             until end of file. This is mostly useful when debugging
             programs that parse large files.
    """
    budget = limit
    document_root = None
    events = ElementTree.iterparse(file, events=("start", "end"))

    for kind, element in events:
        if budget == 0:
            return

        if kind == "start" and document_root is None:
            # the first element is root
            document_root = element
        elif kind == "end" and element.tag == elementName:
            callback(element)
            # Release everything hanging off the root so far.
            document_root.clear()
            if budget != -1:
                budget -= 1
def iterparse2(file, events=("start", "end")):
    """Iteratively parse an XML file, clearing the root as elements finish.

    Generator wrapper around ``ElementTree.iterparse`` that re-yields every
    ``(event, element)`` pair.  The element of the first event is remembered
    as the root (with the default ``events`` this is the document root), and
    ``root.clear()`` is called after each "end" event has been yielded.

    Keyword arguments:
    file -- (file) file or file-like object to parse
    events -- (sequence of strings) events to report; passed unchanged to
              ElementTree.iterparse

    Because the root is cleared once the consumer asks for the next pair
    after an "end" event, finish working with an element before advancing
    the generator.
    """
    # get an iterable and turn it into an iterator
    context = iter(ElementTree.iterparse(file, events=events))

    # get the root element; the next() builtin works on Python 2.6+ and 3
    try:
        event, root = next(context)
    except StopIteration:
        # No events at all: end quietly instead of leaking StopIteration
        # out of the generator body.
        return
    yield (event, root)

    for event, elem in context:
        yield (event, elem)
        if event == "end":
            root.clear()
Exemple #7
0
def iterparse2(file, events=("start", "end")):
    """Stream ``(event, element)`` pairs from an XML file with low memory use.

    Every pair produced by ``ElementTree.iterparse(file, events=events)`` is
    yielded unchanged.  The element delivered with the very first event is
    kept as the root (the document root when "start" events are requested),
    and it is cleared after each "end" event has been yielded.

    Keyword arguments:
    file -- (file) file or file-like object to parse
    events -- (sequence of strings) events to report; passed unchanged to
              ElementTree.iterparse

    The clearing happens when the consumer resumes the generator, so an
    element should be fully processed before the next pair is requested.
    """
    # get an iterable and turn it into an iterator
    context = iter(ElementTree.iterparse(file, events=events))

    # get the root element; next() is portable where .next() is Python 2 only
    try:
        first_event, root = next(context)
    except StopIteration:
        # Nothing was reported; finish without raising from the generator.
        return
    yield (first_event, root)

    for event, elem in context:
        yield (event, elem)
        if event == "end":
            root.clear()
Exemple #8
0
 def parse(self):
     """Generate result rows from the XML document in ``self.stream``.

     Reads ``_VARIABLE`` names until ``_HEAD`` closes, then walks the rest
     of the document.  Each ``_BINDING`` start selects a column by variable
     name; the following ``_URI``, ``_BNODE`` or ``_LITERAL`` end event
     stores the converted value there.  A tuple with one entry per variable
     (``None`` where nothing was stored) is yielded when ``_RESULT`` closes.
     """
     columns = []
     current_row = []
     parse_events = iter(ET.iterparse(self.stream, events=('start', 'end')))

     # lets gather up the variable names in head
     for what, item in parse_events:
         head_done = what == 'end' and item.tag == _HEAD
         if what == 'start' and item.tag == _VARIABLE:
             columns.append(item.get('name'))
         elif head_done:
             break

     # now let's yield each result as we parse them
     for what, item in parse_events:
         opening = what == 'start'
         closing = what == 'end'
         if opening and item.tag == _BINDING:
             column = columns.index(item.get('name'))
         elif opening and item.tag == _RESULT:
             current_row = [None for _unused in columns]
         elif closing and item.tag == _URI:
             current_row[column] = URIRef(item.text)
         elif closing and item.tag == _BNODE:
             current_row[column] = BNode(item.text)
         elif closing and item.tag == _LITERAL:
             current_row[column] = Literal(item.text or '',
                                           datatype=item.get('datatype'),
                                           lang=item.get(_LANG))
         elif closing and item.tag == _RESULT:
             # Free the finished result subtree, then emit the row.
             item.clear()
             yield tuple(current_row)
    def handle(self, *args, **options):
        """Download the Icecat supplier list and create missing suppliers.

        Fetches ``SuppliersList.xml.gz`` with the credentials from
        ``settings``, stores it under ``settings.TMP_PATH``, then streams the
        gzipped XML.  For every ``Supplier`` element,
        ``Supplier.objects.get_or_create`` is called with its ``Name`` and
        ``ID`` attributes, and the name of each newly created supplier is
        printed.  ``args`` and ``options`` are not used.
        """
        # download file
        r = requests.get('https://data.icecat.biz/export/level4/refs/SuppliersList.xml.gz', auth=(settings.API_USERNAME, settings.API_PASSWORD))

        tmp_filename = settings.TMP_PATH + '/suppliers.xml.gz'
        with open(tmp_filename, "wb") as f:
            f.write(r.content)

        # the suppliers list is gzipped
        f = gzip.open(tmp_filename, 'rb')
        try:
            # use cElement, it's faaaaaaaaast
            from cElementTree import iterparse

            # turn it into an iterator
            context = iter(iterparse(f, events=("start", "end")))

            # get the root element
            event, root = next(context)

            # loop through suppliers
            for event, elem in context:
                if event == "end" and elem.tag == "Supplier":
                    values = dict(elem.items())

                    # get or update the supplier
                    supplier, created = Supplier.objects.get_or_create(name=values['Name'], pk=values['ID'])
                    if created:
                        print(supplier.name + ' added')
                    # discard what has been processed to keep memory flat
                    root.clear()
        finally:
            # close gzip file, even when parsing or the database call fails
            f.close()
def read(f, ignore_tags=None):
    """
    Generator for reading a wikiprep XML file from a file object.

    Elements are processed as they close.  Text of ``title`` and ``text``
    elements, the integer lists in ``links`` and ``categories``, and
    ``[url, text]`` pairs of ``link`` elements that carry a ``url`` attribute
    are collected into a dict.  When a ``page`` element closes, its ``id``
    and ``newlength`` attributes are stored as ``_id`` and ``length``, any of
    the ``stub`` / ``disambig`` / ``image`` / ``category`` attributes that
    are non-empty are stored as booleans (True when the value is "1"), and
    the dict is yielded.

    f -- file object (or anything ``cElementTree.iterparse`` accepts)
    ignore_tags -- optional collection of tag names to skip entirely

    Progress messages are written to ``sys.stderr``.
    """
    sys.stderr.write("Reading %s...\n" % f)
    doc = {}
    cnt = 0

    if not ignore_tags:
        ignore_tags = set()

    for event, elem in cElementTree.iterparse(f):
        tag = elem.tag

        if tag in ignore_tags:
            continue

        if tag == "title":
            doc["title"] = "".join(elem.itertext())
        elif tag == "text":
            doc["text"] = "".join(elem.itertext())
        elif tag == "link":
            # Skip internal links
            url = elem.get("url")
            if url is None:
                continue

            doc.setdefault("external links", []).append(
                [url, "".join(elem.itertext())])
        elif tag == "links":
            doc["links"] = [int(i) for i in "".join(elem.itertext()).split()]
        elif tag == "categories":
            doc["categories"] = [int(i) for i in "".join(elem.itertext()).split()]
        elif tag == "page":
            doc["_id"] = int(elem.get("id"))
            doc["length"] = int(elem.get("newlength"))
            # Flags are only recorded when the attribute is present and
            # non-empty.
            for flag in ("stub", "disambig", "image", "category"):
                value = elem.get(flag)
                if value:
                    doc[flag] = value == "1"

            cnt += 1
            yield doc
            doc = {}

            # Free the memory of the building tree
            elem.clear()
            if cnt % 20000 == 0:
                sys.stderr.write("Read %d articles from %s\n" % (cnt, f))
    sys.stderr.write("...done reading %s\n" % f)
Exemple #11
0
    def __init__(self, filename):
        """Build ``self.deltainfo`` from the XML file at ``filename``.

        The file is read through ``gzip.open`` when the name ends with
        ``.gz`` and through ``open`` otherwise.  Each ``newpackage`` element
        is wrapped in a ``NewPackage`` and stored under the key returned by
        its ``nevra()`` method.
        """
        self.deltainfo = {}

        if filename.endswith(".gz"):
            fo = gzip.open(filename)
        else:
            fo = open(filename, "rt")
        try:
            for event, elem in iterparse(fo):
                if elem.tag == "newpackage":
                    p = NewPackage(elem)
                    self.deltainfo[p.nevra()] = p
        finally:
            # The handle was previously left for the garbage collector.
            fo.close()
Exemple #12
0
    def __init__(self, filename):
        """Populate ``self.deltainfo`` from a (possibly gzipped) XML file.

        ``filename`` is opened with ``gzip.open`` if it ends with ``.gz``,
        otherwise with ``open``.  Every ``newpackage`` element becomes a
        ``NewPackage`` stored in ``self.deltainfo`` under ``p.nevra()``.
        """
        self.deltainfo = {}

        if filename.endswith(".gz"):
            fo = gzip.open(filename)
        else:
            fo = open(filename, 'rt')
        try:
            for event, elem in iterparse(fo):
                if elem.tag == "newpackage":
                    p = NewPackage(elem)
                    self.deltainfo[p.nevra()] = p
        finally:
            # Release the file even if parsing or NewPackage() raises.
            fo.close()
Exemple #13
0
	def __init__(self, filename):
		"""Parse the plist XML at ``filename`` and store it in ``self.plist``.

		Every completed element is looked up by tag in ``self.unmarshallers``.
		When a converter is found, the element is cleared and its ``text`` is
		replaced by the converted value.  Any other tag except ``plist``
		raises ``IOError``.  The result is the ``text`` of the root's first
		child.
		"""
		events = ET.iterparse(filename)
		for _, node in events:
			convert = self.unmarshallers.get(node.tag)
			if not convert:
				if node.tag != "plist":
					raise IOError("unknown plist type: %r" % node.tag)
				continue
			value = convert(node)
			# Drop the children, then park the converted value on the node.
			node.clear()
			node.text = value
		self.plist = events.root[0].text
 def __init__(self, file):
     """Initialise the factory and load definitions from an XML ``file``.

     While streaming the document, ``molecule`` elements that have an
     ``id`` attribute are passed to ``self.makeGroup`` and then cleared,
     ``templates`` elements are cleared, and ``universe`` elements are
     passed to ``self.makeUniverse``.
     """
     MoleculeFactory.__init__(self)
     for _, node in iterparse(file):
         kind = node.tag
         if kind == 'molecule':
             # Molecules without an id are left untouched.
             if node.attrib.get('id') is not None:
                 self.makeGroup(node)
                 node.clear()
         elif kind == 'templates':
             node.clear()
         elif kind == 'universe':
             self.makeUniverse(node)
Exemple #15
0
 def __init__(self, file):
     """Set up the base factory, then consume the XML in ``file``.

     Completed elements are dispatched on their tag: a ``molecule`` with an
     ``id`` attribute goes to ``self.makeGroup`` and is cleared, a
     ``templates`` element is cleared, and a ``universe`` element goes to
     ``self.makeUniverse``.  Everything else is ignored.
     """
     MoleculeFactory.__init__(self)
     for _event, item in iterparse(file):
         name = item.tag
         identifier = item.attrib.get('id', None)
         has_id = identifier is not None
         if name == 'universe':
             self.makeUniverse(item)
             continue
         # Both remaining cases end by releasing the element's contents.
         if name == 'templates' or (name == 'molecule' and has_id):
             if name == 'molecule':
                 self.makeGroup(item)
             item.clear()
def read_mast_output(mastfile, output, genes):
    """Feed a MAST XML file into a ``MastResult`` and return its summary.

    ``model`` and ``sequences`` elements of ``mastfile`` are passed to the
    matching ``MastResult`` methods as they complete.  Returns a tuple
    ``(get_pe_values(), getAnnotD())``; the annotations are fetched before
    ``combine_p_values()`` is called and the p/e values after it.
    ``output`` is not used.
    """
    mast = MastResult(genes)
    for _, node in cET.iterparse(mastfile):
        name = node.tag
        if name == "model":
            mast.model(node)
            continue
        if name == "sequences":
            mast.sequences(node)
    annotations = mast.getAnnotD()
    mast.combine_p_values()
    return (mast.get_pe_values(), annotations)
Exemple #17
0
    def handle(self, *args, **options):
        """Download the Icecat category list and upsert ``Category`` rows.

        Fetches ``CategoriesList.xml.gz`` with the credentials from
        ``settings``, stores it under ``settings.TMP_PATH`` and streams the
        gzipped XML.  For each ``Category`` element the row with that ``ID``
        is fetched or created; its description and name are taken from the
        ``Description`` / ``Name`` children whose ``langid`` equals
        ``settings.API_LANGUAGE``, and ``parent_id`` from the
        ``ParentCategory`` child's ``ID``.  ``args`` and ``options`` are not
        used.
        """
        # download file
        r = requests.get('https://data.icecat.biz/export/level4/refs/CategoriesList.xml.gz', auth=(settings.API_USERNAME, settings.API_PASSWORD))

        tmp_filename = settings.TMP_PATH + '/categories.xml.gz'
        with open(tmp_filename, "wb") as f:
            f.write(r.content)

        # the categories list is gzipped
        f = gzip.open(tmp_filename, 'rb')
        try:
            # use cElement, it's faaaaaaaaast
            from cElementTree import iterparse

            # turn it into an iterator
            context = iter(iterparse(f, events=("start", "end")))

            # get the root element
            event, root = next(context)

            # loop through categories
            for event, elem in context:
                if event == "end" and elem.tag == "Category":
                    values = dict(elem.items())
                    category, created = Category.objects.get_or_create(pk=values['ID'])

                    # find description
                    for descr in elem.findall('Description'):
                        descr_values = dict(descr.items())
                        if descr_values['langid'] == str(settings.API_LANGUAGE):
                            category.description = descr_values['Value']

                    # find name
                    for name in elem.findall('Name'):
                        name_values = dict(name.items())
                        if name_values['langid'] == str(settings.API_LANGUAGE):
                            category.name = name_values['Value']

                    # see if we have a parent category
                    parent = elem.find('ParentCategory')
                    # An Element with no children is falsy, so compare with
                    # None to detect "not found".
                    if parent is not None:
                        parent = dict(parent.items())
                        category.parent_id = parent['ID']

                    category.save()

                    root.clear()
        finally:
            # The gzip handle was previously never closed.
            f.close()

        # All done!
def termExtraction(appid, context, query=None):
    """Request term extraction for ``context`` and return the result texts.

    ``appid``, the UTF-8 encoded ``context`` and, when given, the UTF-8
    encoded ``query`` are sent as URL parameters to ``URI``.  The text of
    every ``{urn:yahoo:cate}Result`` element in the response is collected
    into a list.  If the response cannot be parsed (``SyntaxError``), an
    empty list is returned, discarding anything collected so far.
    """
    d = dict(appid=appid, context=context.encode("utf-8"))
    if query:
        d["query"] = query.encode("utf-8")
    result = []
    f = urllib.urlopen(URI + '?' + urllib.urlencode(d))
    try:
        for event, elem in ElementTree.iterparse(f):
            if elem.tag == "{urn:yahoo:cate}Result":
                result.append(elem.text)
    except SyntaxError:
        return []
    finally:
        # Close the connection on success and on parse failure alike.
        f.close()
    return result
Exemple #19
0
 def loads(data):
     """Decode an XML payload into ``(params, method)``.

     Each completed element is looked up by tag in ``unmarshallers``.  When
     a converter exists, the element is cleared and its ``text`` replaced
     by the converted value.  Otherwise ``methodName`` supplies the method
     and ``params`` supplies a tuple of its children's ``text``.  Either
     part is ``None`` when the corresponding element is absent.
     """
     method = None
     params = None
     for _, node in iterparse(StringIO(data)):
         tag = node.tag
         convert = unmarshallers.get(tag)
         if convert:
             value = convert(node)
             # Replace the subtree by its decoded value.
             node.clear()
             node.text = value
         elif tag == "methodName":
             method = node.text
         elif tag == "params":
             params = tuple([child.text for child in node])
     return params, method
Exemple #20
0
 def loads(data):
     """Parse ``data`` (an XML string) and return ``(params, method)``.

     Elements whose tag has an entry in ``unmarshallers`` are converted in
     place: the element is cleared and the converted value is stored in
     its ``text``.  A ``methodName`` element sets the method; a ``params``
     element sets the parameters to a tuple of its children's ``text``.
     Missing parts are returned as ``None``.
     """
     call_name = None
     call_args = None
     source = StringIO(data)
     for _action, element in iterparse(source):
         decoder = unmarshallers.get(element.tag)
         if not decoder:
             if element.tag == "methodName":
                 call_name = element.text
             elif element.tag == "params":
                 call_args = tuple([value.text for value in element])
             continue
         decoded = decoder(element)
         element.clear()
         # The parent element reads this back through ``.text`` later on.
         element.text = decoded
     return call_args, call_name
def parse_troves(filename):
    """Parse given fm troves

    Streams the XML at ``filename``, remembers the integer value of each
    ``id`` element and, when the following ``name`` element completes,
    records ``{'id': ..., 'name': ...}`` under that id.  The resulting dict
    is pickled to the path in ``TROVES_DICT``.  Nothing is returned.
    """
    f = {}
    with open(filename, "r") as source:
        for event, elem in iterparse(source):
            if elem.tag == "id":
                trove_id = int(elem.text)
                elem.clear()
            elif elem.tag == "name":
                name = "%s" % elem.text
                elem.clear()
                # Relies on an <id> having been seen before this <name>.
                f[trove_id] = {'id': trove_id, 'name': name}
    # Closing explicitly guarantees the pickle is flushed to disk.
    with open(TROVES_DICT, 'w') as out:
        cPickle.dump(f, out)
def load(file):
    """
    Loads an Apple Property List (XML) and parses it
    Source: http://effbot.org/zone/element-iterparse.htm

    Each completed element is converted by the entry for its tag in
    ``unmarshallers``; the element is then cleared and the converted value
    stored in its ``text``.  A tag without a converter, other than
    ``plist``, raises ``IOError``.  Returns the ``text`` of the root's
    first child.
    """
    events = ET.iterparse(file)
    for _, node in events:
        convert = unmarshallers.get(node.tag)
        if not convert:
            if node.tag != "plist":
                raise IOError("Unknown plist type: %r" % node.tag)
            continue
        value = convert(node)
        # Collapse the subtree into its converted value.
        node.clear()
        node.text = value
    return events.root[0].text
def termExtraction(appid, context, query=None):
    """Call the term-extraction service at ``URI`` and return its results.

    The request carries ``appid``, ``context`` (UTF-8 encoded) and, if
    truthy, ``query`` (UTF-8 encoded) as URL parameters.  The return value
    is the list of ``text`` values of all ``{urn:yahoo:cate}Result``
    elements in the response, or ``[]`` when parsing raises
    ``SyntaxError``.
    """
    d = dict(
        appid=appid,
        context=context.encode("utf-8")
        )
    if query:
        d["query"] = query.encode("utf-8")
    result = []
    f = urllib.urlopen(URI + '?' + urllib.urlencode(d))
    try:
        for event, elem in ElementTree.iterparse(f):
            if elem.tag == "{urn:yahoo:cate}Result":
                result.append(elem.text)
    except SyntaxError:
        return []
    finally:
        # The response handle used to be left open.
        f.close()
    return result
def parse_troves(filename):
    """Parse given fm troves

    Reads the XML file ``filename`` incrementally.  The integer text of
    each ``id`` element is paired with the text of the next ``name``
    element and stored as ``{'id': ..., 'name': ...}`` keyed by the id.
    The mapping is then pickled to ``TROVES_DICT``.  Returns ``None``.
    """
    f = {}
    with open(filename, "r") as xml_file:
        for event, elem in iterparse(xml_file):
            if elem.tag == "id":
                current_id = int(elem.text)
                elem.clear()
            elif elem.tag == "name":
                name = "%s" % elem.text
                elem.clear()
                # Assumes the matching <id> closed before this <name>.
                f[current_id] = {'id': current_id,
                                 'name': name
                                 }
    # ``with`` closes the file so the pickle is fully written.
    with open(TROVES_DICT, 'w') as pickle_file:
        cPickle.dump(f, pickle_file)
def parse_rdf(filename):
    """Parse given fm rdf

    Streams the XML at ``filename`` and remembers the latest value of each
    per-project field as its element completes.  When a ``descriminators``
    element completes, the texts of its children are joined into a
    space-prefixed string and, if the first four characters of the release
    date are in ``get_last_four_years()``, a project record is stored under
    the short project name.  The collected dict is pickled to ``FM_DICT``.
    Returns ``None``.
    """
    f = {}
    #Number of years to check back in time.
    years = get_last_four_years()
    with open(filename, "r") as source:
        for event, elem in iterparse(source):
            if elem.tag == "project_id":
                project_id = int(elem.text)
                elem.clear()
            elif elem.tag == "projectname_short":
                projectname_short = "%s" % elem.text
                elem.clear()
            elif elem.tag == "desc_short":
                desc_short = "%s" % elem.text
                elem.clear()
            elif elem.tag == "latest_release_version":
                latest_release_version = "%s" % elem.text
                elem.clear()
            elif elem.tag == "url_homepage":
                url_homepage = "%s" % elem.text
                elem.clear()
            elif elem.tag == "url_changelog":
                url_changelog = "%s" % elem.text
                elem.clear()
            elif elem.tag == "latest_release_date":
                # Keep only the first ten characters of the timestamp.
                latest_release_date = "%s" % elem.text[0:10]
                elem.clear()
            elif elem.tag == "descriminators":
                t = ""
                for trove in elem:
                    t = "%s %s" % (t, trove.text)
                #Skip projects whose release year is not in ``years``.
                if latest_release_date[0:4] in years:
                    f[projectname_short] = {
                        'id': project_id,
                        'descShort': desc_short,
                        'fmName': projectname_short,
                        'latestReleaseVersion': latest_release_version,
                        'urlHomepage': url_homepage,
                        'urlChangelog': url_changelog,
                        'latestReleaseDate': latest_release_date,
                        'troveId': t
                    }
                elem.clear()
    # Closing explicitly guarantees the pickle reaches the disk.
    with open(FM_DICT, 'w') as out:
        cPickle.dump(f, out)
Exemple #26
0
 def _parseInternal(self,fileLike):
     """Collect a ``ResponseData`` for every ``request`` element.

     For each completed ``request`` element in ``fileLike`` the ``id``,
     ``status`` and ``reason`` attributes are read (``reason`` defaults to
     an empty string), a ``ResponseData`` is appended to ``self.requests``,
     and the text of every ``value`` element inside it is stored in the
     response's ``params`` under that element's ``name`` attribute.

     Raises ``ParseException`` when ``id`` or ``status`` is missing.
     """
     for event,elem in et.iterparse(fileLike):
         if elem.tag != 'request':
             continue
         request_id = elem.attrib.get('id')
         status = elem.attrib.get('status')
         reason = elem.attrib.get('reason')
         if request_id is None:
             raise ParseException('missing id')
         if status is None:
             raise ParseException('missing status')
         if reason is None:
             reason = ''
         cval = ResponseData(request_id,status,reason)
         self.requests.append(cval)
         # Element.iter() replaces the deprecated getiterator().
         for itemel in elem.iter():
             if itemel.tag == 'value':
                 cval.params[itemel.attrib['name']] = itemel.text
Exemple #27
0
    def processResults(self,data):
        """Extract one item per eBay ``Item`` element from an XML string.

        For every completed ``Item`` element in the
        ``urn:ebay:apis:eBLBaseComponents`` namespace a new object from
        ``self.ebayItem()`` is appended to the result list.  Its ``url`` is
        set from a nested ``ViewItemURL`` element and its ``price`` from a
        nested ``CurrentPrice`` element when those are present.

        Returns the list of items in document order.
        """
        ns = '{urn:ebay:apis:eBLBaseComponents}'
        results = []

        for event,elem in et.iterparse(StringIO(data)):
            if elem.tag != ns + 'Item':
                continue
            citem = self.ebayItem()
            results.append(citem)
            # Element.iter() replaces the deprecated getiterator().
            for itemel in elem.iter():
                if itemel.tag == ns + 'ViewItemURL':
                    citem.url = itemel.text
                elif itemel.tag == ns + 'CurrentPrice':
                    citem.price = itemel.text
        return results
def parse_rdf(filename):
    """Parse given fm rdf

    Reads the XML file ``filename`` incrementally, tracking the most recent
    value of each project field.  On every completed ``descriminators``
    element the children's texts are concatenated (each preceded by a
    space) and, when the year part of the release date is contained in
    ``get_last_four_years()``, a record for the project is stored under its
    short name.  The records are pickled to ``FM_DICT``; nothing is
    returned.
    """
    f = {}
    #Number of years to check back in time.
    years = get_last_four_years()
    with open(filename, "r") as rdf_file:
        for event, elem in iterparse(rdf_file):
            if elem.tag == "project_id":
                project_id = int(elem.text)
                elem.clear()
            elif elem.tag == "projectname_short":
                projectname_short = "%s" % elem.text
                elem.clear()
            elif elem.tag == "desc_short":
                desc_short = "%s" % elem.text
                elem.clear()
            elif elem.tag == "latest_release_version":
                latest_release_version = "%s" % elem.text
                elem.clear()
            elif elem.tag == "url_homepage":
                url_homepage = "%s" % elem.text
                elem.clear()
            elif elem.tag == "url_changelog":
                url_changelog = "%s" % elem.text
                elem.clear()
            elif elem.tag == "latest_release_date":
                # Only the leading ten characters of the value are kept.
                latest_release_date = "%s" % elem.text[0:10]
                elem.clear()
            elif elem.tag == "descriminators":
                t = ""
                for trove in elem:
                    t = "%s %s" % (t, trove.text)
                #Projects with a release year outside ``years`` are dropped.
                if latest_release_date[0:4] in years:
                    f[projectname_short] = {'id': project_id,
                                            'descShort': desc_short,
                                            'fmName': projectname_short,
                                            'latestReleaseVersion': latest_release_version,
                                            'urlHomepage': url_homepage,
                                            'urlChangelog': url_changelog,
                                            'latestReleaseDate': latest_release_date,
                                            'troveId': t
                                            }
                elem.clear()
    # ``with`` closes the output so the pickle is completely written.
    with open(FM_DICT, 'w') as pickle_file:
        cPickle.dump(f, pickle_file)
Exemple #29
0
    def parse_sparql_xml(self, data):
        """ New cElementTree SPARQL Results format parser, Copyright (c) 2011 Daniel A. Smith (written 2011-02-23)

        Parses the XML string ``data`` and returns a list with one dict per
        ``result`` element.  Each dict maps a binding name to
        ``{'value': text, 'type': 'uri' | 'literal' | 'bnode'}``, plus an
        ``'xml:lang'`` entry when the value element (or the binding itself)
        carries an ``xml:lang`` attribute.
        """
        ns = '{http://www.w3.org/2005/sparql-results#}'
        # ElementTree exposes ``xml:lang`` under its expanded namespace name,
        # so a lookup of the literal key 'xml:lang' never matches.
        xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
        value_types = {
            ns + 'uri': 'uri',
            ns + 'literal': 'literal',
            ns + 'bnode': 'bnode',
        }
        binding_tag = ns + 'binding'
        result_tag = ns + 'result'

        current_type = None
        current_name = ""
        current_chars = ""
        current_lang = None
        results = []
        current = {}

        logging.debug(data)

        for event, elem in cElementTree.iterparse(StringIO(data),
                                                  events=("start", "end")):
            tag = elem.tag
            if event == "start":
                if tag in value_types:
                    current_type = value_types[tag]
                elif tag == binding_tag:
                    current_name = elem.attrib['name']

            elif event == "end":
                if tag in value_types:
                    current_chars = elem.text
                    # Capture the language here: the value element is cleared
                    # before its enclosing binding closes.
                    lang = elem.attrib.get(xml_lang)
                    if lang is not None:
                        current_lang = lang
                elif tag == binding_tag:
                    current[current_name] = {
                        'value': current_chars,
                        'type': current_type
                    }
                    lang = elem.attrib.get(xml_lang, current_lang)
                    if lang is not None:
                        current[current_name]['xml:lang'] = lang
                    current_chars = ""
                    current_lang = None
                elif tag == result_tag:
                    results.append(current)
                    current = {}

                elem.clear()

        return results
Exemple #30
0
    def handle(self, *args, **options):
        # TODO: make this configurable?
        filename = args[0]

        with open(filename, "r") as f:
            # use cElement, it's faaaaaaaaast
            from cElementTree import iterparse
            context = iterparse(f, events=("start", "end"))

            # turn it into an iterator
            context = iter(context)

            # # get the root element
            event, root = context.next()

            # loop through suppliers
            for event, elem in context:
                if event == "end" and elem.tag == "file":
                    values = dict(elem.items())

                    supplier, created = Supplier.objects.get_or_create(
                        pk=values['Supplier_id'])
                    category, created = Category.objects.get_or_create(
                        pk=values['Catid'])

                    product = Product()
                    product.pk = values['Product_ID']
                    product.supplier = supplier
                    product.category = category
                    product.model_name = values['Model_Name']
                    product.part = values['Prod_ID']
                    product.created_at = datetime.strptime(
                        values['Date_Added'], '%Y%m%d%H%M%S')
                    product.updated_at = datetime.strptime(
                        values['Updated'], '%Y%m%d%H%M%S')
                    product.thumbnail = values['HighPic']
                    if values['On_Market'] == '1':
                        product.on_market = True
                    product.save()

                    print product.model_name, product.part
                    root.clear()
    def parse_sparql_xml(self, data):
        """ New cElementTree SPARQL Results format parser, Copyright (c) 2011 Daniel A. Smith (written 2011-02-23)

        Turns the XML string ``data`` into a list of dicts, one per
        ``result`` element.  Keys are binding names; values are
        ``{'value': text, 'type': 'uri' | 'literal' | 'bnode'}`` with an
        additional ``'xml:lang'`` entry when the value element (or the
        binding itself) has an ``xml:lang`` attribute.
        """
        sparql = '{http://www.w3.org/2005/sparql-results#}'
        # The parser reports ``xml:lang`` under this expanded name; the plain
        # key 'xml:lang' is never present in ``attrib``.
        lang_key = '{http://www.w3.org/XML/1998/namespace}lang'
        kinds = {
            sparql + 'uri': 'uri',
            sparql + 'literal': 'literal',
            sparql + 'bnode': 'bnode',
        }

        current_type = None
        current_name = ""
        current_chars = ""
        current_lang = None
        results = []
        current = {}

        logging.debug(data)

        for event, elem in cElementTree.iterparse(StringIO(data), events=("start","end")):
            if event == "start":
                if elem.tag in kinds:
                    current_type = kinds[elem.tag]
                elif elem.tag == sparql + 'binding':
                    current_name = elem.attrib['name']

            elif event == "end":
                if elem.tag in kinds:
                    current_chars = elem.text
                    # Read the language before this element is cleared below;
                    # it is gone by the time the binding closes.
                    if lang_key in elem.attrib:
                        current_lang = elem.attrib[lang_key]
                elif elem.tag == sparql + 'binding':
                    current[current_name] = {'value': current_chars, 'type': current_type}
                    lang = elem.attrib.get(lang_key, current_lang)
                    if lang is not None:
                        current[current_name]['xml:lang'] = lang
                    current_chars = ""
                    current_lang = None
                elif elem.tag == sparql + 'result':
                    results.append(current)
                    current = {}

                elem.clear()

        return results
Exemple #32
0
    def handle(self, *args, **options):
        # TODO: make this configurable?
        filename = args[0]

        with open(filename, "r") as f:
            # use cElement, it's faaaaaaaaast
            from cElementTree import iterparse
            context = iterparse(f, events=("start", "end"))

            # turn it into an iterator
            context = iter(context)

            # # get the root element
            event, root = context.next()

            # loop through suppliers
            for event, elem in context:
                if event == "end" and elem.tag == "file":
                    values = dict(elem.items())

                    supplier, created = Supplier.objects.get_or_create(pk=values['Supplier_id'])
                    category, created = Category.objects.get_or_create(pk=values['Catid'])

                    product = Product()
                    product.pk = values['Product_ID']
                    product.supplier = supplier
                    product.category = category
                    product.model_name = values['Model_Name']
                    product.part = values['Prod_ID']
                    product.created_at = datetime.strptime(values['Date_Added'], '%Y%m%d%H%M%S')
                    product.updated_at = datetime.strptime(values['Updated'], '%Y%m%d%H%M%S')
                    product.thumbnail = values['HighPic']
                    if values['On_Market'] == '1':
                        product.on_market = True
                    product.save()

                    print product.model_name, product.part
                    root.clear()
Exemple #33
0
    def parseRequest(self,data):
        """Scan an eBay category XML string and dump it to /tmp for debugging.

        A pretty-printed copy of ``data`` is written to ``/tmp/output.xml``.
        The XML is then streamed: for each ``Category`` element the most
        recent ``CategoryID`` text is remembered, and each lower-cased
        ``CategoryName`` not seen before is mapped to it in a local dict.
        Any exception raised while streaming is printed and swallowed.

        NOTE(review): the name-to-id mapping is built but neither returned
        nor stored on ``self`` in the visible code -- confirm whether that
        is intended.
        """
        ns = '{urn:ebay:apis:eBLBaseComponents}'
        currentCat = None
        tempcat = {}

        # save off the file. temporary hack!!
        domtree = parseString(data)
        # ``with`` closes the file even if the write fails; binary mode
        # matches the encoded bytes being written.
        with open('/tmp/output.xml','wb') as f:
            f.write(domtree.toprettyxml().encode('utf-8'))

        try:
            for event,elem in et.iterparse(StringIO(data)):
                if elem.tag != ns + 'Category':
                    continue
                # Element.iter() replaces the deprecated getiterator().
                for itemel in elem.iter():
                    if itemel.tag == ns + 'CategoryID':
                        currentCat = itemel.text
                    if itemel.tag == ns + 'CategoryName':
                        if itemel.text == 'Cycling':
                            print('cycling found,value is %s %s' % (currentCat, itemel.text.lower() in tempcat))
                        if itemel.text.lower() not in tempcat:
                            tempcat[itemel.text.lower()] = currentCat
        except Exception as e:
            # Deliberate best-effort: report the problem and carry on.
            print('error loading catalog; keeping existing entries %s' % (e,))
Exemple #34
0
def inlinkData(appid, # see http://developer.yahoo.net/faq/index.html#appid
               query, #  The domain or path to get inlink data for.
               results=50, # The number of results to return. 
               start=1, #  The starting result position to return (1-based). 
                        # The finishing position (start + results - 1) cannot 
                        # exceed 1000.
               entire_site=None # Specifies whether to provide results for the 
                                # entire site, or just the page referenced by 
                                # the query. If the query is not a domain URL 
                                # (i.e. it contains path information, such as 
                                # http://smallbusiness.yahoo.com/webhosting/), 
                                # this parameter has no effect.
               ):
    """Query the inlink web service at URI and yield one dict per result.

    The response is parsed incrementally.  The text of the ``ClickUrl``,
    ``Url`` and ``Title`` elements (namespace ``urn:yahoo:srch``) is
    remembered as it is seen; as soon as all three are known and any other
    element ends, ``dict(title=..., url=..., clickurl=...)`` is yielded and
    the three values are reset.

    This is a generator: the request is only sent when iteration starts,
    and the response is closed when the generator finishes or is closed.
    """
    d = dict(appid=appid, 
             query=unicodify(query).encode('utf-8'),
             results=int(results),
             start=int(start)
             )
    if entire_site:
        # xxx perhaps this parameter should be automatically 
        # set based on the 'query' having a path
        d['entire_site'] = 1

    u = URI+'?'+urllib.urlencode(d)
    f = urllib.urlopen(u)
    try:
        Title = Url = ClickUrl = None
        for event, elem in ElementTree.iterparse(f):
            if elem.tag == '{urn:yahoo:srch}ClickUrl':
                ClickUrl = elem.text
            elif elem.tag == '{urn:yahoo:srch}Url':
                Url = elem.text
            elif elem.tag == '{urn:yahoo:srch}Title':
                Title = elem.text
            elif not (Title is None or Url is None or ClickUrl is None):
                yield dict(title=Title, url=Url, clickurl=ClickUrl)
                Title = Url = ClickUrl = None
    finally:
        # release the connection even if parsing fails or the caller
        # stops iterating early
        f.close()
Exemple #35
0
    def handle(self, *args, **options):
        """Download the Icecat supplier list and create missing suppliers.

        The gzipped XML list is fetched with the credentials from
        ``settings``, saved under ``settings.TMP_PATH`` and then parsed
        incrementally.  For every ``Supplier`` element a ``Supplier`` row is
        looked up or created from its ``Name`` and ``ID`` attributes; newly
        created suppliers are announced on stdout.

        NOTE(review): get_or_create() is called with both name and pk as
        lookup fields -- confirm that a renamed supplier with an existing
        ID is handled as intended.
        """
        # download file
        r = requests.get(
            'https://data.icecat.biz/export/level4/refs/SuppliersList.xml.gz',
            auth=(settings.API_USERNAME, settings.API_PASSWORD))

        tmp_filename = settings.TMP_PATH + '/suppliers.xml.gz'
        with open(tmp_filename, "wb") as f:
            f.write(r.content)

        # use cElement, it's faaaaaaaaast
        from cElementTree import iterparse

        # the suppliers list is gzipped
        f = gzip.open(tmp_filename, 'rb')
        try:
            # turn the parser into an iterator so the root can be pulled first
            context = iter(iterparse(f, events=("start", "end")))

            # get the root element
            event, root = next(context)

            # loop through suppliers
            for event, elem in context:
                if event == "end" and elem.tag == "Supplier":
                    values = dict(elem.items())

                    # get or create the supplier
                    supplier, created = Supplier.objects.get_or_create(
                        name=values['Name'], pk=values['ID'])
                    if created:
                        print(supplier.name + ' added')
                    # drop everything parsed so far to keep memory flat
                    root.clear()
        finally:
            # close gzip file, also when parsing or the database call fails
            f.close()
Exemple #36
0
 def parseFile(self, file):
     """Stream the XML file and build atoms, residues, chains and molecules.

     Elements are handled as ET.iterparse() reports them.  When
     self.prefix is None, the namespace prefix is taken from the first
     tag seen.

     atom_site elements are converted with self.parseAtom(); an atom is
     kept only when its alt_id is None or equals self.alternate and its
     model equals self.model.  A change of asym_id starts a new peptide
     chain, nucleotide chain or (for non-polymer entities) a new
     molecule; a change of comp_id or seq_id starts a new residue.
     Several *Category elements are stored on self.

     file -- passed unchanged to ET.iterparse().

     Raises ValueError when a polymer or a residue has a component id
     that is in neither amino_acids nor nucleic_acids.
     """
     current_comp_id = None
     current_seq_id = None
     current_asym_id = None
     current_chain = None
     current_residue = None
     for event, element in ET.iterparse(file):
         tag = element.tag
         if self.prefix is None:
             # everything up to and including '}' is the namespace prefix
             self.prefix = tag[:tag.find('}')+1]
         if tag == self.prefix+"atom_site":
             atom_spec = self.parseAtom(element)
             if (atom_spec['alt_id'] is None or
                 atom_spec['alt_id'] == self.alternate) \
                    and atom_spec['model'] == self.model:
                 atom = Atom(atom_spec['name'], atom_spec['position'],
                             element=atom_spec['element'],
                             occupancy=atom_spec['occupancy'],
                             temperature_factor=atom_spec['beta'])
                 self.atoms[atom_spec['atom_id']] = atom
                 if atom_spec['asym_id'] != current_asym_id:
                     # start new chain or molecule
                     # (self.entities is filled by the entityCategory
                     # branch below)
                     entity = self.entities[atom_spec['entity_id']]
                     residue_name = atom_spec['comp_id']
                     if entity['type'] == 'polymer':
                         if residue_name in amino_acids:
                             current_chain = MMTK.PDB.PDBPeptideChain(chain_id = atom_spec['asym_id'], segment_id = '')
                             self.peptide_chains.append(current_chain)
                         elif residue_name in nucleic_acids:
                             current_chain = MMTK.PDB.PDBNucleotideChain(chain_id = atom_spec['asym_id'], segment_id = '')
                             self.nucleotide_chains.append(current_chain)
                         else:
                             raise ValueError('Unknown polymer type ' +
                                              'containing residue ' +
                                              residue_name)
                         # force the residue check below to open a residue
                         current_comp_id = None
                         current_residue = None
                     else:
                         current_chain = None
                         current_residue = MMTK.PDB.PDBMolecule(residue_name)
                         current_comp_id = residue_name
                         current_seq_id = atom_spec['seq_id']
                         mol_list = self.molecules.get(residue_name, [])
                         mol_list.append(current_residue)
                         self.molecules[residue_name] = mol_list
                         self.residues.append(current_residue)
                     current_asym_id = atom_spec['asym_id']
                 if atom_spec['comp_id'] != current_comp_id or \
                        atom_spec['seq_id'] != current_seq_id:
                     # start a new residue
                     current_comp_id = atom_spec['comp_id']
                     current_seq_id = atom_spec['seq_id']
                     if current_comp_id in amino_acids:
                         current_residue = AminoAcidResidue(current_comp_id,
                                                            [],
                                                            current_seq_id)
                     elif current_comp_id in nucleic_acids:
                         current_residue = NucleotideResidue(current_comp_id,
                                                             [],
                                                             current_seq_id)
                     else:
                         # report the residue that actually failed;
                         # residue_name is only set when a chain starts
                         raise ValueError('Unknown residue ' +
                                          current_comp_id)
                     current_chain.addResidue(current_residue)
                     self.residues.append(current_residue)
                 if current_residue is not None:
                     # Ultimately, this should never be None, but for
                     # now we skip whatever we can't handle yet
                     current_residue.addAtom(atom)
             # the atom has been converted; free the XML element
             element.clear()
         elif tag == self.prefix+'chem_compCategory':
             self.chem_comp = element
         elif tag == self.prefix+'pdbx_entity_nameCategory':
             self.entity_names = element
         elif tag == self.prefix+'entityCategory':
             # entity id -> {field name without prefix: field text}
             self.entities = {}
             for e in element:
                 entity_id = e.attrib['id']
                 entity_def = {}
                 for field in e:
                     entity_def[field.tag[len(self.prefix):]] = field.text
                 self.entities[entity_id] = entity_def
         elif tag == self.prefix+'entity_poly_seqCategory':
             self.chain_entities = None
         elif tag == self.prefix+'pdbx_poly_seq_schemeCategory':
             self.chains = element
         elif tag == self.prefix+'pdbx_nonpoly_schemeCategory':
             self.nonchains = element
         elif tag == self.prefix+'atom_siteCategory':
             # This event happens when all atoms have been treated
             element.clear()
    def __init__(self, source):
        """Infer the element types of an XML document by streaming `source`.

        Every distinct element name (converted from Clark notation by
        _clark_to_orig) gets one ElementType, collected in self.elements in
        order of first appearance.  While parsing, each type records its
        occurrences, its attributes, its child elements (optional,
        repeatable, and whether they keep a fixed sequence) and whether it
        carries non-whitespace character data.
        """
        self.elements = []
        # element name -> ElementType
        typeByName = {}
        # one _StackEntry per element that is currently open
        openEntries = []
        # (parent name, child name) -> _ChildInfo
        childInfoByPair = {}
        namespaces = deque([('xml','http://www.w3.org/XML/1998/namespace')])
        pendingNamespaces = []
        wantedEvents = ('start', 'end', 'start-ns', 'end-ns')
        for event,elem in iterparse(source, events=wantedEvents):
            if event == 'start-ns':
                # elem is the (prefix, url) pair being declared
                namespaces.appendleft(elem)
                pendingNamespaces.append(elem)
                continue

            if event == 'end-ns':
                namespaces.popleft()
                continue

            if event == 'end':
                closed = openEntries.pop()
                elemType = closed.elemType
                texts = (elem.text, elem.tail)
                if any(t is not None and not t.isspace() for t in texts):
                    elemType._hasCharacterContent = True
                # child groups known from earlier instances of this element
                # but missing from this one become optional
                if elemType._sequenced:
                    for missing in elemType._children[closed.sequenceNumber+1:]:
                        missing.optional = True
                elem.clear()
                continue

            if event != 'start':
                continue

            # attach the pending namespace declarations as attributes
            for prefix,url in pendingNamespaces:
                if prefix:
                    declName = 'xmlns:%s' % prefix
                else:
                    declName = 'xmlns'
                elem.attrib[declName] = escape_attrib(url)
            del pendingNamespaces[:]

            # Clark notation -> prefix:local
            name = _clark_to_orig(elem.tag,namespaces)
            elemType = typeByName.get(name)
            if elemType is None:
                elemType = ElementType(name)
                typeByName[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1

            # update attribute declarations (names converted like the tag)
            for attrName,attrValue in elem.items():
                elemType.updateAttribute(_clark_to_orig(attrName,namespaces),attrValue)

            # track nesting and the sequence of child elements
            if openEntries:
                parentEntry = openEntries[-1]
                # consecutive children of the same type form one group
                startsGroup = parentEntry.latestChild != name
                if startsGroup:
                    parentEntry.latestChild = name
                    parentEntry.sequenceNumber += 1
                parent = parentEntry.elemType
                pair = (parent.name,name)
                childInfo = childInfoByPair.get(pair)
                if childInfo is None:
                    # first time this child is seen under this parent; if
                    # the parent has occurred before, the child is optional
                    childInfo = _ChildInfo(name, parent._occurrences>1)
                    childInfoByPair[pair] = childInfo
                    parent._children.append(childInfo)
                else:
                    position = parentEntry.sequenceNumber
                    if parent._occurrences == 1 and startsGroup:
                        # known child starts a second group in the first
                        # instance of the parent
                        outOfSequence = True
                    elif len(parent._children) <= position:
                        outOfSequence = True
                    else:
                        # group position differs from earlier instances
                        outOfSequence = parent._children[position].name != name
                    if outOfSequence:
                        parent._sequenced = False
                # a second consecutive child of the same type
                if not startsGroup:
                    childInfo.repeatable = True
            openEntries.append(_StackEntry(elemType))
Exemple #38
0
	def parse_intact_xml_file(self, curs, input_fname, expt_table, interaction_table, acc_tax_id2gene_id_list):
		"""
		12-28-05
			xmlns="net:sf:psidev:mi" causes a namespace header to be added for each element
		12-29-05
			add acc_tax_id2gene_id_list
		"""		
		sys.stderr.write("Parsing %s...\n"%input_fname)
		namespace = '{net:sf:psidev:mi}'
		for event, elem in ElementTree.iterparse(input_fname):
			if elem.tag == '%sentry'%namespace:
				interactor_intact_id2uniprot_id_tax_id = {}	#later used in parsing interactionList
				for sub_elem in elem:
					if sub_elem.tag == '%sexperimentList'%namespace:
						for expt_desc_elem in sub_elem:
							expt_attrib = expt_attribute()
							expt_attrib.expt_id = expt_desc_elem.get("id")
							expt_attrib.short_label = expt_desc_elem.findtext('%snames/%sshortLabel'%(namespace, namespace))
							expt_attrib.full_name = expt_desc_elem.findtext("%snames/%sfullName"%(namespace, namespace))
							pubmed_ref_elem = expt_desc_elem.find("%sbibref/%sxref/%sprimaryRef"%(namespace, namespace, namespace))
							if pubmed_ref_elem.get("db")=="pubmed":
								expt_attrib.pubmed_id = int(pubmed_ref_elem.get("id"))
							else:
								expt_attrib.pubmed_id = -1
							self.submit_expt_table(curs, expt_attrib, expt_table)
							expt_desc_elem.clear()	#release memory
					if sub_elem.tag == '%sinteractorList'%namespace:
						for interactor_elem in sub_elem:
							interactor_intact_id = interactor_elem.get("id")
							uniprot_id_elem = interactor_elem.find('%sxref/%sprimaryRef'%(namespace, namespace))
							uniprot_id = uniprot_id_elem.get("id")
							tax_id_elem = interactor_elem.find("%sorganism"%namespace)
							if not tax_id_elem:
								interactor_intact_id2uniprot_id_tax_id[interactor_intact_id] = (uniprot_id.upper(), None, None)
							else:
								tax_id = int(tax_id_elem.get("ncbiTaxId"))
								#12-29-05
								key = (uniprot_id.upper(), tax_id)
								gene_id_list = acc_tax_id2gene_id_list.get(key)
								if gene_id_list and len(gene_id_list)==1:
									interactor_intact_id2uniprot_id_tax_id[interactor_intact_id] = (uniprot_id.upper(), int(gene_id_list[0]), tax_id)
								else:
									#sys.stderr.write("\t Warning: %s gets entrez gene_id_list: %s\n"%(uniprot_id, gene_id_list))
									interactor_intact_id2uniprot_id_tax_id[interactor_intact_id] = (uniprot_id.upper(), None, tax_id)	#12-29-05 use None
									#10-25-06 mrinal
									if not self.gene_symbol2gene_id.has_key(str(tax_id)):
										print "Getting gene symbol mappings for tax_id",tax_id
										self.gene_symbol2gene_id[str(tax_id)] = get_gene_symbol2gene_id(curs, tax_id)
									gs2gid = self.gene_symbol2gene_id[str(tax_id)]
	
									names_elem = interactor_elem.find('%snames'%namespace)
									gene_names = Set()
									gene_ids = Set()
									for alias_elem in names_elem:
										if alias_elem.tag == '%salias'%namespace:
											alias_type = alias_elem.get("type")
											if alias_type =="gene name":
												gene_names.add(alias_elem.text)
											elif alias_type =="gene name synonym":
												gene_names.add(alias_elem.text)
									for gene_name in gene_names:
										if gs2gid.has_key(gene_name):
											gene_ids.add(gs2gid[gene_name])
									if len(gene_ids)==1:
										interactor_intact_id2uniprot_id_tax_id[interactor_intact_id] = (uniprot_id.upper(), int(gene_ids.pop()), tax_id)
									else:
										sys.stderr.write("\t Warning: Couldn't find gene ids for interactor id %s\n"%(interactor_intact_id))
							interactor_elem.clear()	#release memory
					if sub_elem.tag == "%sinteractionList"%namespace:
						skipped=0
						wrote=0
						for interaction_elem in sub_elem:
							interaction_attrib = interaction_attribute()
							interaction_attrib.expt_id_array = [expt_ref_elem.text \
								for expt_ref_elem in interaction_elem.find("%sexperimentList"%namespace)]
							interaction_attrib.interaction_type_id = \
								interaction_elem.find('%sinteractionType/%sxref/%sprimaryRef'%(namespace, namespace, namespace)).get("id")
							interaction_attrib.intact_id = interaction_elem.find("%sxref/%sprimaryRef"%(namespace, namespace)).get("id")
							for prot_part_elem in interaction_elem.find("%sparticipantList"%namespace):
								prot_intact_id = prot_part_elem.find("%sinteractorRef"%namespace).text
								#12-29-05
								uniprot_id, gene_id, tax_id = interactor_intact_id2uniprot_id_tax_id[prot_intact_id]
								interaction_attrib.uniprot_id_array.append(uniprot_id)
								
								if interaction_attrib.tax_id and tax_id!=interaction_attrib.tax_id:
									sys.stderr.write("\t Warning: interaction %s has >1 tax_id: %s, %s(ignored).\n"%\
										(interaction_attrib.intact_id, interaction_attrib.tax_id, tax_id))
									interaction_attrib.is_cross_species = 1	#interaction not just within one species
								else:
									interaction_attrib.tax_id = tax_id

								#10-25-06 (mrinal)
								if gene_id:
									interaction_attrib.gene_id_array.append(gene_id)
								else:
									sys.stderr.write("\t Warning: prot_intact_id %s doesn't have proper NCBI gene id.\n"%prot_intact_id)
									interaction_attrib.is_cross_species = 1 # tag it as bad
								
							if interaction_attrib.uniprot_id_array and interaction_attrib.gene_id_array:	#12-29-05 not empty
								self.submit_interaction_table(curs, interaction_attrib, interaction_table)
								wrote+=1
							else:
								skipped+=1
							interaction_elem.clear()	#release memory
					
					sub_elem.clear()	#release the sub_elem
		sys.stderr.write("Done.\n")
		print "wrote:",wrote,"skipped:",skipped
        print "all.xml is fresh"
except IOError:
    print "Creating all.xml"
    open('update', 'w').write(update)
    x = urllib2.urlopen(all).read()
    open('all.xml', 'w').write(x)

try:
    os.mkdir('./backup')
except OSError:
    pass
os.chdir('./backup')

gmail = gmail_connect(gmail_user, gmail_pw)

for e, post in ce.iterparse('../all.xml'):
    if post.tag != "post":
        continue
    data = dict(post.items())
    #turn any utf-8 data into binary strings
    name = data['href'][7:].encode('utf-8')
    desc = data['description'].encode('utf-8')
    ext = data.get('extended', '').encode('utf-8')
    tags = data.get('tag', '').encode('utf-8')
    #make the url into a valid filename
    #\W = not([a-zA-Z0-9_]), except unicode-aware
    name = re.sub('\W', '_', name)[-200:]
    try:
        if not os.path.isfile(name):
            print "getting %s as %s" % (data['href'], name)
            f = open(name, 'w')
Exemple #40
0
# Script: set each XML file's modification time to the latest <time> stamp
# found inside it.
# Size-unit suffixes (not used in the lines visible here).
SFX = ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
# Wall-clock time at script start.
t0 = time.time()
# Number of files processed and their cumulative size.
count = 0
size = 0
# Every command-line argument is treated as a file to process.
files = sys.argv[1:]
# NOTE(review): files[0] raises IndexError when no arguments are given.
if os.path.isdir(files[0]):
    print "No dirs please, use wildcards"
    sys.exit()
for file in files:
    if os.path.isfile(os.path.abspath(file)):
        count += 1
        size += os.path.getsize(file)
        print "processing %s ..." % file
        # Latest timestamp found in this file, or None if there was none.
        date = None
        try:
            for event, elem in cElementTree.iterparse(file):
                # Compare the local name, i.e. the part after "{namespace}".
                # NOTE(review): a tag without "}" makes [1] raise IndexError,
                # which the except clause below does not catch.
                if elem.tag.split("}")[1] == "time":
                    if date is None:
                        date = isodate.parse_datetime(elem.text)
                    else:
                        newdate = isodate.parse_datetime(elem.text)
                        if newdate > date:
                            date = newdate
                # Drop the element's content once it has been inspected.
                elem.clear()
        except SyntaxError:
            print "\tnot valid xml"
            pass
        if date is not None:
            # Access time becomes "now"; modification time becomes the
            # latest timestamp found in the file.
            t = time.mktime(date.timetuple())
            os.utime(file, (time.time(), t))
        else:
Exemple #41
0
def programmes(input_file):
    """Lazily yield a Programme for each <programme> element of input_file.

    input_file is handed to ET.iterparse(); an element is wrapped as soon
    as its end event is reported.
    """
    for kind, node in ET.iterparse(input_file):
        if kind != "end" or node.tag != 'programme':
            continue
        yield Programme(node)
Exemple #42
0
    def __init__(self, source):
        """Build the list of element types found while streaming `source`.

        One ElementType is created per distinct element name (names are
        converted from Clark notation by _clark_to_orig) and appended to
        self.elements on first sight.  For each type the parse records how
        often it occurs, its attributes, which children it has (optional,
        repeatable, fixed sequence or not) and whether it contains
        non-whitespace character data.
        """
        self.elements = []
        # element name -> ElementType
        seenTypes = {}
        # stack of _StackEntry objects, innermost open element last
        stack = []
        # (parent name, child name) -> _ChildInfo
        seenChildren = {}
        namespaces = deque([('xml', 'http://www.w3.org/XML/1998/namespace')])
        pendingNamespaces = []
        allEvents = ('start', 'end', 'start-ns', 'end-ns')
        for event, elem in iterparse(source, events=allEvents):
            if event == 'start-ns':
                # elem is the declared (prefix, url) pair
                namespaces.appendleft(elem)
                pendingNamespaces.append(elem)
                continue

            if event == 'end-ns':
                namespaces.popleft()
                continue

            if event == 'end':
                finished = stack.pop()
                elemType = finished.elemType
                candidates = (elem.text, elem.tail)
                if any(s is not None and not s.isspace() for s in candidates):
                    elemType._hasCharacterContent = True
                # If this instance had fewer child groups than earlier
                # ones, the groups that did not show up become optional.
                if elemType._sequenced:
                    absent = elemType._children[finished.sequenceNumber + 1:]
                    for info in absent:
                        info.optional = True
                elem.clear()
                continue

            if event != 'start':
                continue

            # expose the namespace declarations as ordinary attributes
            for prefix, url in pendingNamespaces:
                if prefix:
                    declaration = 'xmlns:%s' % prefix
                else:
                    declaration = 'xmlns'
                elem.attrib[declaration] = escape_attrib(url)
            del pendingNamespaces[:]

            # Clark notation -> prefix:local
            name = _clark_to_orig(elem.tag, namespaces)
            elemType = seenTypes.get(name)
            if elemType is None:
                elemType = ElementType(name)
                seenTypes[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1

            # attribute names are converted the same way as the tag
            for key, value in elem.items():
                elemType.updateAttribute(_clark_to_orig(key, namespaces),
                                         value)

            if stack:
                parentEntry = stack[-1]
                # runs of the same child type count as a single group
                opensGroup = parentEntry.latestChild != name
                if opensGroup:
                    parentEntry.latestChild = name
                    parentEntry.sequenceNumber += 1
                parent = parentEntry.elemType
                lookup = (parent.name, name)
                childInfo = seenChildren.get(lookup)
                if childInfo is None:
                    # New child for this parent.  When the parent has been
                    # seen before without it, the child is optional.
                    childInfo = _ChildInfo(name, parent._occurrences > 1)
                    seenChildren[lookup] = childInfo
                    parent._children.append(childInfo)
                else:
                    index = parentEntry.sequenceNumber
                    if parent._occurrences == 1 and opensGroup:
                        # a known child starts another group inside the
                        # first instance of the parent
                        broken = True
                    elif len(parent._children) <= index:
                        broken = True
                    else:
                        # group sits at a different position than before
                        broken = parent._children[index].name != name
                    if broken:
                        parent._sequenced = False
                # same child type twice in a row
                if not opensGroup:
                    childInfo.repeatable = True
            stack.append(_StackEntry(elemType))
Exemple #43
0
def channels(input_file):
    """Lazily yield a Channel for each <channel> element of input_file.

    input_file is handed to ET.iterparse(); an element is wrapped as soon
    as its end event is reported.
    """
    for kind, node in ET.iterparse(input_file):
        if kind != "end" or node.tag != 'channel':
            continue
        yield Channel(node)
    def __init__(self, source):
        """Infer the element types of an XML document by streaming `source`.

        One ElementType per distinct element name (converted from Clark
        notation by _clark_to_orig) is appended to self.elements on first
        sight.  Each type records its occurrences and attributes; child
        groups are registered on the parent type through setChildInfo() /
        getChildInfo(), and non-whitespace text marks the type as having
        character content.
        """
        self.elements = []
        # element name -> ElementType
        typeByName = {}
        # one _StackEntry per element that is currently open
        openEntries = []
        namespaces = deque([('xml','http://www.w3.org/XML/1998/namespace')])
        pendingNamespaces = []
        wantedEvents = ('start', 'end', 'start-ns', 'end-ns')
        for event,elem in iterparse(source, events=wantedEvents):
            if event == 'start-ns':
                # elem is the (prefix, url) pair being declared
                namespaces.appendleft(elem)
                pendingNamespaces.append(elem)
                continue

            if event == 'end-ns':
                namespaces.popleft()
                continue

            if event == 'end':
                closed = openEntries.pop()
                elemType = closed.elemType
                texts = (elem.text, elem.tail)
                if any(t is not None and not t.isspace() for t in texts):
                    elemType._hasCharacterContent = True
                # child groups beyond the last one seen in this instance
                # were absent here, so they become optional
                for missing in elemType.iterChildInfo(closed.groupIndex+1,None):
                    missing.optional = True
                elem.clear()
                continue

            if event != 'start':
                continue

            # attach the pending namespace declarations as attributes
            for prefix,url in pendingNamespaces:
                if prefix:
                    declName = 'xmlns:%s' % prefix
                else:
                    declName = 'xmlns'
                elem.attrib[declName] = escape_attrib(url)
            del pendingNamespaces[:]

            # Clark notation -> prefix:local
            name = _clark_to_orig(elem.tag,namespaces)
            elemType = typeByName.get(name)
            if elemType is None:
                elemType = ElementType(name)
                typeByName[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1

            # update attribute declarations (names converted like the tag)
            for attrName,attrValue in elem.items():
                elemType.updateAttribute(_clark_to_orig(attrName,namespaces),attrValue)

            # track nesting and the sequence of child elements
            if openEntries:
                parentEntry = openEntries[-1]
                parent = parentEntry.elemType
                # consecutive children of the same type form one group
                if parentEntry.latestChild != name:
                    parentEntry.latestChild = name
                    parentEntry.groupIndex += 1
                    parent.setChildInfo(name,parentEntry.groupIndex)
                else:
                    # same child type again within the current group
                    repeated = parent.getChildInfo(name,parentEntry.groupIndex)
                    repeated.repeatable = True
            openEntries.append(_StackEntry(elemType))
    def __init__(self, source):
        """Build the list of element types found while streaming `source`.

        Each distinct element name (converted from Clark notation by
        _clark_to_orig) gets one ElementType, appended to self.elements when
        first seen.  Occurrences and attributes are recorded per type; child
        groups are registered on the parent type via setChildInfo() and
        getChildInfo(), and non-whitespace text marks the type as having
        character content.
        """
        self.elements = []
        # element name -> ElementType
        seenTypes = {}
        # stack of _StackEntry objects, innermost open element last
        stack = []
        namespaces = deque([("xml", "http://www.w3.org/XML/1998/namespace")])
        pendingNamespaces = []
        allEvents = ("start", "end", "start-ns", "end-ns")
        for event, elem in iterparse(source, events=allEvents):
            if event == "start-ns":
                # elem is the declared (prefix, url) pair
                namespaces.appendleft(elem)
                pendingNamespaces.append(elem)
                continue

            if event == "end-ns":
                namespaces.popleft()
                continue

            if event == "end":
                finished = stack.pop()
                elemType = finished.elemType
                candidates = (elem.text, elem.tail)
                if any(s is not None and not s.isspace() for s in candidates):
                    elemType._hasCharacterContent = True
                # Groups after the last one seen in this instance did not
                # show up here, so they are marked optional.
                for info in elemType.iterChildInfo(finished.groupIndex + 1, None):
                    info.optional = True
                elem.clear()
                continue

            if event != "start":
                continue

            # expose the namespace declarations as ordinary attributes
            for prefix, url in pendingNamespaces:
                if prefix:
                    declaration = "xmlns:%s" % prefix
                else:
                    declaration = "xmlns"
                elem.attrib[declaration] = escape_attrib(url)
            del pendingNamespaces[:]

            # Clark notation -> prefix:local
            name = _clark_to_orig(elem.tag, namespaces)
            elemType = seenTypes.get(name)
            if elemType is None:
                elemType = ElementType(name)
                seenTypes[name] = elemType
                self.elements.append(elemType)
            elemType._occurrences += 1

            # attribute names are converted the same way as the tag
            for key, value in elem.items():
                elemType.updateAttribute(_clark_to_orig(key, namespaces), value)

            if stack:
                parentEntry = stack[-1]
                parent = parentEntry.elemType
                # runs of the same child type count as a single group
                if parentEntry.latestChild != name:
                    parentEntry.latestChild = name
                    parentEntry.groupIndex += 1
                    parent.setChildInfo(name, parentEntry.groupIndex)
                else:
                    # same child type twice in a row
                    again = parent.getChildInfo(name, parentEntry.groupIndex)
                    again.repeatable = True
            stack.append(_StackEntry(elemType))
Exemple #46
0
def parseFile(path, source):
    """Convert the ``row`` elements of an XML dump into delimited .dat output.

    Every ``row`` element found in ``path + source`` becomes one record: the
    attributes listed in ``FIELD_KEYS`` are joined with ``FIELD_DELIMITER``
    (an absent attribute is written as ``"NULL"``) and the record is
    terminated with ``LINE_DELIMITER``.

    When ``source`` contains ``"posts.xml"`` the rows are split over two
    outputs: rows whose ``PostTypeId`` is ``"2"`` go to ``answers.xml.dat``
    and every other row goes to ``questions.xml.dat``.  Any other source is
    written to ``path + source + ".dat"``.

    If the target file already exists a notice is printed and nothing is
    converted.

    Args:
        path: Directory prefix.  It is concatenated as-is with the file
            names, so it has to end with a path separator.
        source: Name of the XML file inside ``path``; also the
            ``FIELD_KEYS`` lookup key for non-posts files.
    """
    if os.path.exists(path + source + ".dat"):
        print(".dat already exists")
        return

    initOutput()
    postsFile = "posts.xml" in source
    if postsFile:
        if os.path.exists(path + "questions.xml.dat"):
            print("questions.xml.dat already exists")
            return
        # Questions are registered first and answers second; the rows below
        # are appended to buffer 0 and buffer 1 respectively (presumably the
        # buffer index follows registration order -- confirm in addHandle).
        addHandle(open(path + "questions.xml.dat", "wb+"))
        addHandle(open(path + "answers.xml.dat", "wb+"))
    else:
        addHandle(open(path + source + ".dat", "wb+"))

    # Despite its name this counts row *events* (start and end), not lines.
    lineCounter = 1

    context = iter(iterparse(path + source, events=("start", "end")))
    try:
        # The first event is the start of the root element; keep a reference
        # so that finished rows can be detached from it.
        event, root = next(context)

        for event, elem in context:
            if lineCounter % 100 == 0:
                flushOutput()
            if elem.tag != "row":
                continue

            if event == "end":
                # The row was already emitted on its start event; clearing
                # the root drops it so memory use stays bounded.
                root.clear()
            else:
                if postsFile and elem.get("PostTypeId") == "2":
                    # Answer post
                    index = 1
                    keys = FIELD_KEYS["answers.xml"]
                elif postsFile:
                    index = 0
                    keys = FIELD_KEYS["questions.xml"]
                else:
                    index = 0
                    keys = FIELD_KEYS[source]

                record = FIELD_DELIMITER.join(
                    [elem.get(key, "NULL") for key in keys])
                appendBuffer(index, record)
                appendBuffer(index, LINE_DELIMITER)

            lineCounter += 1

        flushOutput()
    finally:
        # Runs even when parsing fails so the output is not left dangling
        # (assumes closeOutput() closes the registered handles).
        closeOutput()
Exemple #47
0
#!/usr/bin/env python
import cElementTree as ce
import re, os

# Split ted.xml into one text file per <item>: the file is named after the
# item's <title> and stored in a directory named after its <category>.
infile = open("ted.xml")
outdir = "."

# \W matches every character outside [a-zA-Z0-9_]; each such character is
# replaced so that a title can be used as a file name.
mkfilename = re.compile(r'\W').sub

# Longest path that is generated, extension included.
max_path_length = 200
extension = '.txt'

try:
    for e, item in ce.iterparse(infile):
        if item.tag != 'item':
            continue
        title = item.findtext('title')
        category = item.findtext('category')
        if title is None or category is None:
            raise ValueError("item without a title or a category")

        # Create the category directory below outdir, i.e. in the same
        # place the output file is written to.
        category_dir = os.path.join(outdir, category)
        if not os.path.isdir(category_dir):
            os.makedirs(category_dir)

        # Truncate before appending the extension so that files for long
        # titles still end in '.txt'.
        stem = os.path.join(category_dir, mkfilename('_', title))
        filename = stem[:max_path_length - len(extension)] + extension
        if os.path.isfile(filename):
            # Never overwrite the output of an earlier item or run.
            raise IOError("file %s already exists" % filename)

        outfile = open(filename, 'w')
        try:
            outfile.write(title + '\n')
            for n in item:
                # 'content' is matched as a substring, presumably to catch
                # namespaced tags as well -- confirm against the feed.
                if 'content' in n.tag or n.tag == 'description':
                    # An empty element has text None; emit a blank line
                    # instead of the literal string "None".
                    outfile.write((n.text or '') + '\n')
        finally:
            outfile.close()

        # The item has been written out; drop its children so the tree
        # built by iterparse does not keep growing.
        item.clear()
finally:
    infile.close()
Exemple #48
0
        print "all.xml is fresh"
except IOError:
    print "Creating all.xml"
    open('update', 'w').write(update)
    x = urllib2.urlopen(all).read()
    open('all.xml', 'w').write(x)

try:
    os.mkdir('./backup')
except OSError:
    pass
os.chdir('./backup')

gmail = gmail_connect(gmail_user, gmail_pw)

for e, post in ce.iterparse('../all.xml'):
    if post.tag != "post":
        continue
    data = dict(post.items())
    #turn any utf-8 data into binary strings
    name = data['href'][7:].encode('utf-8')
    desc = data['description'].encode('utf-8')
    ext = data.get('extended', '').encode('utf-8')
    tags = data.get('tag', '').encode('utf-8')
    #make the url into a valid filename
    #\W = not([a-zA-Z0-9_]), except unicode-aware
    name = re.sub('\W', '_', name)[-200:]
    try:
        if not os.path.isfile(name):
            print "getting %s as %s" % (data['href'], name)
            f = open(name, 'w')