Example #1
    def parse_positions(self):
        doc_context = etree.iterparse(self.filename, events=('end',), tag='{http://earth.google.com/kml/2.2}Document',
                                      encoding='utf-8')
        context = etree.iterparse(self.filename, events=('end',), tag='{http://earth.google.com/kml/2.2}Placemark',
                                  encoding='utf-8')

        collarname = None
        xp = etree.XPath("//*[local-name()='name']/text()")
        for action, elem in doc_context:
            collarname_result = xp(elem)
            if len(collarname_result) > 0:
                collarname = collarname_result[0]
                break
            else:
                raise Exception("Error getting collarname")

        if not collarname:
            raise Exception("No collarname found")

        try:
            collar = Collar.objects.get(serial=collarname)
        except ObjectDoesNotExist:
            collar = Collar.objects.create(serial=collarname)
            collar.save()
        except Exception, e:
            raise Exception("DB Error: %s" % e)
Example #2
def users_xmldata():
    """
    Extracts data from XML file and groups it by user_name.
    """
    locale.setlocale(locale.LC_COLLATE, 'pl_PL.utf-8')
    data = {}
    with open(app.config['DATA_XML'], 'r') as xml_file:
        for event, element in etree.iterparse(xml_file, tag='server'):
            protocol = element.findtext('protocol')
            host = element.findtext('host')
            port = element.findtext('port')
        xml_file.seek(0)
        for event, element in etree.iterparse(xml_file, tag='user'):
            if element.tag == 'user':
                user_id = element.attrib.get('id')
            image = element.findtext('avatar')
            link_to_avatar = '{}://{}:{}{}'.format(protocol, host, port, image)
            user_name = element.findtext('name')
            user_name = user_name.encode('utf-8')

            data[user_id] = {
                'user_name': user_name,
                'link_to_avatar': link_to_avatar,
            }

    return sorted(
        data.iteritems(),
        key=lambda x: x[1]['user_name'],
        cmp=locale.strcoll
    )
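Note that dict.iteritems() and the cmp= keyword argument of sorted() used above exist only in Python 2. A minimal Python 3 sketch of the same locale-aware sort, assuming the same data dictionary layout and an installed pl_PL.UTF-8 locale, could look like this (in Python 3 the user_name would also stay a str, with no .encode('utf-8'), since locale.strcoll expects text):

import locale
from functools import cmp_to_key

locale.setlocale(locale.LC_COLLATE, 'pl_PL.UTF-8')

# Hypothetical data in the same shape as produced by users_xmldata() above.
data = {
    '1': {'user_name': 'Łukasz', 'link_to_avatar': 'http://host:80/a.png'},
    '2': {'user_name': 'Anna', 'link_to_avatar': 'http://host:80/b.png'},
}

# locale.strcoll compares according to the Polish collation rules;
# cmp_to_key adapts it to the key= interface used by sorted() in Python 3.
sorted_users = sorted(
    data.items(),
    key=cmp_to_key(lambda a, b: locale.strcoll(a[1]['user_name'], b[1]['user_name'])),
)
print(sorted_users)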
Example #3
def dump_tables(table_names, anatomy, xml_path, dump_path, dump_database_name, log_filename='dump.log'):
    logging.basicConfig(filename=os.path.join(dump_path, log_filename), level=logging.INFO)
    db = sqlite3.connect(os.path.join(dump_path, dump_database_name))

    for table_name in table_names:
        print "Opening {0}.xml".format(table_name)
        with open(os.path.join(xml_path, table_name + '.xml')) as xml_file:
            tree = etree.iterparse(xml_file)

            sql_create = CREATE_QUERY.format(
                    table=table_name,
                    fields=", ".join(['{0} {1}'.format(name, type) for name, type in anatomy[table_name].items()]))
            print 'Creating table {0}'.format(table_name)

            try:
                logging.info(sql_create)
                db.execute(sql_create)
            except Exception, e:
                logging.warning(e)

            for _, row in etree.iterparse(xml_file, tag="row"):
                try:
                    logging.debug(row.attrib.keys())
                    db.execute(INSERT_QUERY.format(
                        table=table_name,
                        columns=', '.join(row.attrib.keys()),
                        values=('?, ' * len(row.attrib.keys()))[:-2]),
                        row.attrib.values())
                    print ".",
                except Exception, e:
                    logging.warning(e)
                    print "x",
                finally:
                    row.clear()  # free the processed <row> element (likely intent; the original snippet is truncated here)
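CREATE_QUERY and INSERT_QUERY are module-level templates that this snippet does not show; a plausible sketch of what they might look like, matching the .format() calls above (an assumption, not the original definitions), is:

# Hypothetical query templates matching the placeholders used in dump_tables().
CREATE_QUERY = "CREATE TABLE IF NOT EXISTS {table} ({fields})"
INSERT_QUERY = "INSERT INTO {table} ({columns}) VALUES ({values})"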
Example #4
    def check(self, filename='FORCE_CONSTANTS'):
        ref = io.read('SPOSCAR')
        files = shell_exec("ls dirs").split('\n')
        fc2 = readfc2(filename)
        np.set_printoptions(precision=2, suppress=True)
        vasprunxml = "dir_SPOSCAR/vasprun.xml"
        if exists(vasprunxml):
            vasprun = etree.iterparse(vasprunxml, tag='varray')
            forces0 = parseVasprun(vasprun, 'forces')
            print(forces0.max())
        else:
            forces0 = 0.0
        for file in files:
            print(file)
            POSCAR = 'dirs/%s/POSCAR' % file
            vasprunxml = "dirs/%s/vasprun.xml" % file
            atoms = io.read(POSCAR)
            u = atoms.positions - ref.positions
            f = -np.einsum('ijkl,jl', fc2, u)

            vasprun = etree.iterparse(vasprunxml, tag='varray')
            forces = parseVasprun(vasprun, 'forces') - forces0
            print(np.abs(f).max(), "\n")
            print(np.abs(forces - f).max())
            print(np.allclose(f, forces, atol=1e-2))
Example #5
 def UserUpdate(self, *args, **kw):
   if not kw.has_key('data'):
     raise ValueError, "No data passed to UserUpdate, got %s / %s" %(args, kw)
   # Must check xml validity here
   # parse the data to retrieve the user id
   context = etree.iterparse(StringIO(str(kw['data'])), events=('end',), tag="OxID")
   for event, element in context:
     user_id = element.text
   # retrieve the person inside the test module
   person_list = self.context.getPortalObject().oxatis_test_module.searchFolder(reference=user_id,
                                                                                portal_type="Oxatis Test Person")
   if len(person_list) != 1:
     raise KeyError(user_id)
   else:
     person = person_list[0].getObject()
   context = etree.iterparse(StringIO(str(kw['data'])), events=('end',))
   person_dict = {}
   for event, element in context:
     if element.text is None:
       person_dict[element.tag.lower()] = ""
     else:
       person_dict[element.tag.lower()] = element.text
   person_dict.pop('oxid')
   LOG("editing person %s with %s" %(person.getPath(), person_dict,), 300, "\n")
   person.edit(**person_dict)
   transaction.commit()
   # Return default xml
   root = self.generateResultHeader()
   xml = etree.tostring(root, pretty_print=True)
   return "", xml
Example #6
def load_data(ksj_file, ksj_name, ksj_parser, host='192.168.1.10', port=27017, db='test', collection='geo'):
    mongo = pymongo.MongoClient(host=host, port=port)
    geo_collection = mongo[db][collection]
    
    point_context = etree.iterparse(
        ksj_file,
        events={"end",},
        tag="{http://www.opengis.net/gml/3.2}Point",
        recover=True
    )

    point_dict = {}
    for _, point in point_context:
        point_id = point.get("{http://www.opengis.net/gml/3.2}id")
        point_loc = point.find("gml:pos", namespaces=schema.namespaces).text
        point_dict[point_id] = [float(p) for p in point_loc.split()]

    ksj_context = etree.iterparse(
        ksj_file,
        events={"end",},
        tag="{http://nlftp.mlit.go.jp/ksj/schemas/ksj-app}%s" % ksj_name,
        recover=True
    )
    
    for _, ksj in ksj_context:
        geo_collection.insert_one(ksj_parser.parse(ksj, point_dict))
Example #7
def write_FORCE_SETS_vasp(forces_filenames,
                          displacements,
                          filename='FORCE_SETS',
                          is_zero_point=False,
                          verbose=True):
    try:
        from lxml import etree
    except ImportError:
        print "You need to install python-lxml."
        sys.exit(1)

    if verbose:
        print "counter (file index):",
        
    num_atom = displacements['natom']
    count = 0
    are_files_correct = True
        
    if is_zero_point:
        force_files = forces_filenames[1:]
        if vasp.is_version528(forces_filenames[0]):
            zero_forces = vasp.get_forces_vasprun_xml(etree.iterparse(
                vasp.VasprunWrapper(forces_filenames[0]), tag='varray'))
        else:
            zero_forces = vasp.get_forces_vasprun_xml(
                etree.iterparse(forces_filenames[0], tag='varray'))

        if verbose:
            print "%d" % (count + 1),
        count += 1
            
        if not check_forces(zero_forces, num_atom, forces_filenames[0]):
            are_files_correct = False
    else:
        force_files = forces_filenames
        zero_forces = None

    for i, disp in enumerate(displacements['first_atoms']):
        if vasp.is_version528(force_files[i]):
            disp['forces'] = vasp.get_forces_vasprun_xml(etree.iterparse(
                vasp.VasprunWrapper(force_files[i]), tag='varray'))
        else:
            disp['forces'] = vasp.get_forces_vasprun_xml(
                etree.iterparse(force_files[i], tag='varray'))

        if verbose:
            print "%d" % (count + 1),
        count += 1
        
        if not check_forces(disp['forces'], num_atom, force_files[i]):
            are_files_correct = False

    if verbose:
        print
        
    write_FORCE_SETS(displacements,
                     filename=filename,
                     zero_forces=zero_forces)

    return are_files_correct
Example #8
	def __init__(self, users, badges, posts):
		self.users = users
		self.badges = badges
		self.posts = posts
		self.summary = dict.fromkeys(["epic", "famous", "questions", "answers", "accepted", "users"],0)
		self.userContext = etree.iterparse(self.users)
		self.badgeContext = etree.iterparse(self.badges)
		self.postContext = etree.iterparse(self.posts)
Example #9
def visualize(iabook):
#    scandata = objectify.parse(iabook.get_scandata_path()).getroot()
    scandata = iabook.get_scandata()
    if opts.source == 'abbyy':
        context = etree.iterparse(iabook.get_abbyy(), tag=abbyyns+'page')
    elif opts.source == 'pdfxml':
        context = etree.iterparse(iabook.get_pdfxml_xml(), tag='PAGE')
    elif opts.source == 'djvu':
        context = etree.iterparse(iabook.get_djvu_xml(), tag='OBJECT')
    info = scan_pages(context, scandata, iabook)
Example #10
def write_FORCES( lattice,
                  forces_filenames,
                  displacements,
                  filename='FORCE_SETS',
                  amplitude=0.01,
                  mode='vasp',
                  is_zero_point=False,
                  is_fropho_disp=False ):

    if mode == "vasp":
        try:
            from lxml import etree
        except ImportError:
            print "You need to install python-lxml."
            sys.exit(1)

    if is_zero_point:
        force_files = forces_filenames[1:]
        if mode == "wien2k":
            zero_forces = wien2k.get_forces_wien2k(forces_filenames[0], lattice)
        else: # "vasp" case
            zero_forces = \
                vasp.get_forces_vasprun_xml(etree.iterparse( 
                    vasp.VasprunWrapper( forces_filenames[0] ),
                    tag='varray' ) )
    else:
        force_files = forces_filenames
        zero_forces = None

    displacements = sort_displacements( displacements )
    forces = []

    # Show progress 
    print >> sys.stderr, "counter (file index):",
    for i in range( len( displacements ) ):
        if mode == "wien2k": # wien2k
            forces.append( wien2k.get_forces_wien2k( force_files[i], lattice ) )
        else: # vasp
            forces.append(
                vasp.get_forces_vasprun_xml( etree.iterparse(
                        vasp.VasprunWrapper( force_files[i] ),
                        tag='varray') ) )

    write_FORCES_from_forces( lattice,
                              forces,
                              displacements,
                              amplitude,
                              filename,
                              zero_forces,
                              is_fropho_disp,
                              verbose=True )

    # Show progress 
    print >> sys.stderr, "\n"
Example #11
def exampleIterativeParsing():
    some_file_like = BytesIO(b"<root><a>data</a></root>")
    for event, element in etree.iterparse(some_file_like):
        print("%s, %4s, %s" % (event, element.tag, element.text))
    some_file_like.close()
    some_file_like = BytesIO(b"<root><a>data</a></root>")
    for event, element in etree.iterparse(some_file_like, events=("start", "end")):                                  
        print("%s, %4s, %s" % (event, element.tag, element.text))
    some_file_like.close()
    some_file_like = BytesIO(b"<root><a>data</a></root>")
    tree = etree.parse(some_file_like)
    root = tree.getroot()
    return root
Example #12
	def main(self):
		queued_docs = []
		process_f = {
			'membership': self.process_membership_doc,
			'person': self.process_person_doc
		}.get(self.args.element, self.process_doc)
		target_db = self.couchdb_client(self.args.db)
		datasource = None
		datetime = None

		# Start with scanning specifically for the properties element once.
		# The properties element appears at the top of the file for PeopleSoft data,
		# but at the end of the file for Destiny One data.
		context = etree.iterparse(self.args.file, events=('end',), tag='properties')
		for event, elem in context:
			assert elem.tag == 'properties'
			properties = self.etree_to_dict(elem)
			datasource = properties['datasource']
			datetime = properties['datetime']
			try:
				datetime_ = dt.strptime(datetime, '%Y-%m-%d %H:%M:%S')
				datetime_ = timezone('Canada/Mountain').localize(datetime_)
				datetime = datetime_.isoformat()
			except:
				raise

		# Then, scan for and parse the user specified elements,
		# batching up to batch elements before processing
		with open(self.args.file, 'r') as f:
			progress = progressbar.ProgressBar(maxval=os.path.getsize(self.args.file))
			progress.start()

			context = etree.iterparse(f, events=('end',), tag=self.args.element)
			for event, elem in context:
				assert elem.tag == self.args.element
				progress.update(f.tell())
				doc = self.etree_to_dict(elem)
				if not 'datasource' in doc:
					doc['datasource'] = datasource
				if not 'datetime' in doc:
					doc['datetime'] = datetime
				queued_docs.append(doc)

				if len(queued_docs) >= self.args.batch:
					self.process_documents(queued_docs, target_db, process_f)
					queued_docs = []

			progress.finish()

		if len(queued_docs) > 0:
			self.process_documents(queued_docs, target_db, process_f)
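The etree_to_dict() helper used above is defined elsewhere in the project. A minimal sketch, assuming the <properties> element (and the user-specified elements) contain only flat children with text content, might be:

from lxml import etree

def etree_to_dict(elem):
    # Map each direct child tag to its stripped text; nested structure is ignored.
    return {child.tag: (child.text or '').strip() for child in elem}

root = etree.fromstring(
    '<properties><datasource>PS</datasource>'
    '<datetime>2020-01-01 00:00:00</datetime></properties>')
print(etree_to_dict(root))  # {'datasource': 'PS', 'datetime': '2020-01-01 00:00:00'}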
Example #13
def read_force_constant_vasprun_xml(filename):

    import sys
    try:
        from lxml import etree
    except ImportError:
        print "You need to install python-lxml."
        sys.exit(1)

    if vasp.is_version528(filename):
        vasprun = etree.iterparse(vasp.VasprunWrapper(filename))
    else:
        vasprun = etree.iterparse(filename)
    return vasp.get_force_constants_vasprun_xml(vasprun)
Example #14
def repomdmetadata_from_xml_factory(xmlpath):
    rm_obj = RepomdMetadata(xmlpath)

    for _, elements in etree.iterparse(xmlpath):
        elements = MyElement(elements)
        for elem in elements:
            elem = MyElement(elem)

            # Get revision
            if elem.tag.endswith("revision"):
                rm_obj.revision = elem.text

            # Parse tags
            if elem.tag.endswith("tags"):
                for subelem in elem:
                    if subelem.tag.endswith("content"):
                        rm_obj.tags.setdefault("content", set()).add(subelem.text)
                    if subelem.tag.endswith("repo"):
                        rm_obj.tags.setdefault("repo", set()).add(subelem.text)
                    if subelem.tag.endswith("distro"):
                        rm_obj.tags.setdefault("distro", set()).add((subelem.get("cpeid"),
                                                                     subelem.text))

    # Iter over data elements  (<data type="primary">, ...)
    for _, elements in etree.iterparse(xmlpath, tag="%sdata" % MD_NS):
        elements = MyElement(elements)
        re = RepomdItem()
        re.name = elements.get("type")
        for elem in elements:
            elem = MyElement(elem)
            if elem.tag.endswith("location"):
                re.location_href = elem.get("href")
            elif elem.tag.endswith("open-size"):
                re.open_size = elem.text
            elif elem.tag.endswith("open-checksum"):
                re.open_checksum_type = elem.get("type")
                re.open_checksum = elem.text
            elif elem.tag.endswith("checksum"):
                re.checksum_type = elem.get("type")
                re.checksum = elem.text
            elif elem.tag.endswith("timestamp"):
                re.timestamp = elem.text
            elif elem.tag.endswith("size"):
                re.size = elem.text

            elif elem.tag.endswith("database_version"):
                re.database_version = elem.text
        elements.clear()
        rm_obj.append(re.name, re)
    return rm_obj
Example #15
        def get_codelist_data(elem=None, name=None):

            if not name:
                name = self.return_first_exist(elem.xpath('name/text()'))

                description = self.return_first_exist(elem.xpath('description/text()'))
                count = self.return_first_exist(elem.xpath('count/text()'))
                fields = self.return_first_exist(elem.xpath('fields/text()'))
                date_updated = datetime.datetime.now()

                if (Codelist.objects.filter(name=name).exists()):
                    current_codelist = Codelist.objects.get(name=name)
                    current_codelist.date_updated = date_updated
                    current_codelist.description = description
                    current_codelist.count = count
                    current_codelist.fields = fields
                    current_codelist.save()
                else:
                    new_codelist = Codelist(name=name, description=description, count=count, fields=fields, date_updated=date_updated)
                    new_codelist.save()

            cur_downloaded_xml = "http://www.iatistandard.org/105/codelists/downloads/clv1/codelist/" + name + ".xml"
            cur_file_opener = urllib2.build_opener()
            cur_xml_file = cur_file_opener.open(cur_downloaded_xml)

            context2 = etree.iterparse(cur_xml_file, tag=name)
            fast_iter(context2, add_code_list_item)
Example #16
def download(file_name):
    log.info("Downloading Regexp")
    # res = requests.get(URL, stream=True)
    # res.raw.decode_content = True
    fh = open(file_name, "rb")
    for evt, el in etree.iterparse(fh):
        if evt != "end" or el.tag != NS + "group":
            continue
        xml = etree.tostring(el)
        group = parse_group(el)
        prov = data_table.find_one(group=group.get("id"))
        if prov is not None:
            prov["last_seen"] = datetime.utcnow()
        else:
            prov = {
                "group": group.get("id"),
                "name": group.get("name"),
                "first_seen": datetime.utcnow(),
                "last_seen": datetime.utcnow(),
                "xml": xml
                # 'json': json.dumps(group, default=json_default)
            }
            log.info("Importing %s" % group.get("name"))
            store_group(group)
        data_table.upsert(prov, ["group"])
        el.clear()
Example #17
    def write_csv(self, root, header):
        """
        Walk the document from the root, numbering each entry with modulnr,
        extract the information and write it to the CSV with the given header.
        """
        modulnr = 1
        csvfile = self.filename + '.csv'

        with open(csvfile, 'wb') as f:
            f.write(codecs.BOM_UTF8)
            f_csv = csv.writer(f)
            f_csv.writerow(header)
            for _, modul in ET.iterparse(self.filename, tag='modul'):
                # FIXME: Define Dialect and Encoding='UTF-8'
                # Python3: with open(csvfile, 'w', newline='') as f:

                ueberschriften = modul.find('ueberschriften')
                positionen = [siblings[0] for siblings in
                              ueberschriften.itersiblings()
                              if siblings.tag == 'positionen']

                for uberschrift in ueberschriften.findall('uberschrift'):
                    for pos in positionen:
                        output = [modulnr]
                        modulnr += 1
                        for item in header:
                            value = pos.findtext(item)
                            if value is not None:
                                output.append(value.encode('utf-8'))
                            else:
                                output.append('')
                        print(output)
                        f_csv.writerow(output)
Example #18
def test_real(jmdict_path, examples_path):
    i = 0
    errs = 0

    ef = open('errors.txt', 'wb')
    out = open('jmdict-importable.xml', 'wb')

    jmdict_total_size = os.path.getsize(jmdict_path)
    examples_total_size = os.path.getsize(examples_path)
    widgets = ['Converting: ', pb.Percentage(), ' ', pb.Bar(),
               ' ', pb.Timer(), ' ']
    pbar = pb.ProgressBar(widgets=widgets, maxval=jmdict_total_size).start()

    example_dict = load_examples(examples_path)

    with open(jmdict_path, 'rb') as f:
        with etree.xmlfile(out, encoding='utf-8') as xf:
            xf.write_declaration()
            context = etree.iterparse(f, tag=('entry'), resolve_entities=False)

            with xf.element(NAMESPACE_PREFIX+'dictionary', nsmap=NSMAP,
                            attrib={XSI_PREFIX+'schemaLocation': SCHEMA_LOCATION,
                                    'schema_version': __schema_version__}): 
                xf.write("\n")
                xml_meta = create_meta(jmdict_path)
                xf.write(xml_meta, pretty_print=True)

                for action, elem in context:
                    xml_entry = convert_entry(elem, example_dict)
                    xf.write(xml_entry, pretty_print=True)
                    pbar.update(f.tell())
                    elem.clear()

    pbar.finish()
Example #19
    def iterparser(self):
        iterparser = etree.iterparse(self.osm_file, events=("start", "end",))

        item = None
        for action, element in iterparser:
            if action == "start":
                if item is None:
                    if element.tag == "node":
                        item = Node(**element.attrib)
                    elif element.tag == "way":
                        item = Way(**element.attrib)
                    elif element.tag == "relation":
                        item = Relation(**element.attrib)
                else:
                    if element.tag == "nd":
                        item.nodes.append(element.get("ref"))
                    elif element.tag == "tag":
                        item.tags[element.get("k")] = element.get("v")
                    elif element.tag == "member":
                        item.members.append((
                            element.get("type"), element.get("ref"),
                            element.get("role")
                        ))
                    else:
                        print("Tag %s under item %s" % (element.tag, item))
            else:
                if element.tag in ("node", "way", "relation"):
                    yield item
                    item = None

            element.clear()
            while element.getprevious() is not None:
                del element.getparent()[0]
        del iterparser
Example #20
def read_pan_data(fn:str, gender_names=GENDER_NAMES, age_names=AGE_NAMES):
    #Read blog data from xml in PAN13 format.
    gender_dic = {v:i for i,v in enumerate(gender_names)}
    age_dic = {v:i for i,v in enumerate(age_names)}

    texts = []
    genders = []
    ages = []
    logging.info('Read PAN13 format data from {}'.format(fn))

    elements = let.iterparse(fn, events=["end"])
    n_authors = 0
    for event, el in elements:
        if el.tag == 'conversation':
            t = el.text or ''
            texts.append(_preprocess(t))
            el.clear()
        elif el.tag == 'author':
            n_authors += 1
            gender = gender_dic[el.attrib['gender']]
            age = age_dic[el.attrib['age_group']]
            genders.extend([gender] * (len(texts) - len(genders)))
            ages.extend([age] * (len(texts) - len(ages)))
        elif el.tag == 'file':
            del el.getparent()[0]
        else:
            continue
    logging.info('{} authors'.format(n_authors))
    return texts, genders, ages
Example #21
def sentence_generator(filename,separate=True,gzipped=True):
    """Returns metadata and the sentence: [(words),(tags),(lemmas)]
    
    Arguments
    ---------
    filename: filename
    separate: if False, changes sentence format to [(w1,t1,l1),(w2,t2,l2),...]
    gzipped : assumes the file is gzipped. Change to False for unpacked files
    """
    source = gzip.GzipFile(filename) if gzipped else filename
    parser = etree.iterparse(source,html=True)
    for x,y in parser:
        try:
            # Trips is a list of the word, part-of-speech and the lemma.
            # by zipping that list, you get a format that I prefer (see details above)
            # The good thing about it is that you can search for sub-sequences
            # in the POS list. E.g. using the contains() function that I included
            # for convenience.
            trips = [w.split('\t') for w in y.text.strip().split('\n')]
            # y.attrib contains the sentence metadata.
            yield y.attrib, zip(*trips) if separate else trips
        except AttributeError:
            print 'No text for this element!'
            pass
        y.clear() # Save memory
        # Save more memory by deleting references to previous sentences
        for ancestor in y.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
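The contains() function mentioned in the comments is not part of the snippet; a minimal sub-sequence check in that spirit, together with a usage sketch of the generator (the corpus path and POS tags are hypothetical), could be:

def contains(subseq, seq):
    # True if subseq occurs as a contiguous run inside seq.
    n = len(subseq)
    return any(tuple(seq[i:i + n]) == tuple(subseq) for i in range(len(seq) - n + 1))

# for metadata, (words, tags, lemmas) in sentence_generator('corpus.xml.gz'):
#     if contains(('JJ', 'NN'), tags):      # adjective followed by a noun
#         print(metadata.get('id'), ' '.join(words))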
Example #22
def findTerms(threadName, XMLFILE, list_terms, queue):
	print "Running thread %d" % threadName

	list_count_terms = queue.get()

	context = etree.iterparse(XMLFILE, events=("end",), tag="page")

	# for each article (each <page> tag), check whether each term or pair of terms appears in it
	count = 0
	for event, elem in context:
		for each_term in list_terms:
			#If it's a pair of terms
			if each_term.find("_") > -1:
				words = each_term.split(":")[0]
				if findPair(words.split("_")[0], words.split("_")[1])(elem.text) or findPair(words.split("_")[1], words.split("_")[0])(elem.text):
					index = list_terms.index(each_term)
					list_count_terms[index] += 1	
			#If it's only one term
			elif each_term.find("_") == -1 and findWholeWord(each_term.split(":")[0])(elem.text):
				index = list_terms.index(each_term)
				list_count_terms[index] += 1
		count += 1
		if count % 1000 == 0:
			print "Thread %d processed %d articles." % (threadName, count)
		# after verifying all the terms, clear the element from memory; otherwise we end up with the whole file in memory, and THAT IS BAD
		elem.clear()

	queue.put(list_count_terms)
	print "Thread %d finished" % threadName
Example #23
def convert(xml_path, csv_path):
    with open(xml_path) as fobj:
        with open(csv_path, 'w') as outfile:
            pos_writer = csv.writer(outfile)

            context = etree.iterparse(fobj)

            for _, elem in context:
                if elem.tag == 'FrameSet':
                    # print elem.attrib
                    player = player_id(elem)
                    team = team_id(elem)
                    
                    match = convert_id(elem.get('MatchId') or elem.get('Match'))
                    period = 1+ (elem.get('GameSection') == 'secondHalf')

                    for n, frame in enumerate(elem):
                        #to centi seconds (1/100 s)
                        time = n*4
                        x = frame.get('X')
                        y = frame.get('Y')

                        #last 0 for velocity
                        pos_writer.writerow([match, period, time, player, team, x, y, 0])

                    elem.clear()
Example #24
def view(identifier):
    item=get_item(identifier)
    host, path = locate(identifier)
    url = 'http://%s/~edward/read_abbyy.php?item_id=%s&doc=%s&path=%s' % (host, identifier, identifier, path)
    f = urlopen(url)
    page_count = 0
    body = []
    for eve, page in etree.iterparse(f):
        if page.tag != page_tag:
            continue
        for block in page:
            if block.attrib['blockType'] != 'Text':
                continue
            region, text = block
            for par in text:
                cur_par = ''
                if len(par) == 0 or len(par[0]) == 0 or len(par[0][0]) == 0:
                    continue
                for line in par:
                    chars = []
                    for fmt in line:
                        chars += [c.text for c in fmt]
                    cur_par += ''.join(chars)
                body.append(cur_par)
        if page_count == 20:
            break
        page_count += 1

    return render_template('view.html', item=item, int=int, body=body)
Example #25
def get_glyph_pngs(id):
    # Make a png from the xml file.
    # Server side path to xml file:
    # This won't be in home... it'll be at /classifier/[UUID]$
    encoded_glyphs = []
    for event, element in etree.iterparse(UPLOADS + "projects/1/classifiers/" + str(id) + "/" + str(id) + ".xml"):
        if (element.tag == "data"):  # Maybe a better way in lxml to get to the data element
            ncols = int(element.getparent().get("ncols"))
            nrows = int(element.getparent().get("nrows"))
            # Make an iterable that yields each row in boxed row flat pixel format.*
            # *http://pypng.googlecode.com/svn/trunk/code/png.py
            # Plan: make a list of length nrows * ncols * 3 then make sublists of length ncols * 3.
            # The *3 is for RGB: (0,0,0) is black and (255,255,255) is white
            pixels = []
            white_or_black = True
            for n in re.findall("\d+", element.text):
                pixels.extend([255 * white_or_black] * int(n))
                white_or_black = not(white_or_black)
            png_writer = png.Writer(width=ncols, height=nrows, greyscale=True)
            pixels_2D = []
            for i in xrange(nrows):
                pixels_2D.append(pixels[i*ncols: (i+1)*ncols])  # Index one row of pixels
            # StringIO.StringIO lets you write to strings as files: it gives you a file descriptor.
            # (pypng expects a file descriptor)
            buf = StringIO.StringIO()
            #image = png.from_array(pixels_2D,mode='L')
            #image.save(buf) # Hopefully this doesn't write to a file
            png_writer.write(buf, pixels_2D)
            my_png = buf.getvalue()
            encoded_png = base64.b64encode(my_png)  # not sure why
            encoded_glyphs.append(encoded_png)
    return encoded_glyphs
Example #26
File: parse.py  Project: natano/misc
    def iterpages(self):
        context = etree.iterparse(
            self.file_, events=('start', 'end'),
            tag='{}page'.format(self.NAMESPACE),
            encoding=self.ENCODING)
        context = iter(context)

        _, root = next(context)

        for event, page in context:
            if event == 'start':
                continue
            pagedata = {}
            for key in ('id', 'title'):
                pagedata[key] = self.findtext(page, key)

            revisions = []
            for revision in page.iterfind('{}revision'.format(self.NAMESPACE)):
                revisiondata = {}
                for key in ('id', 'comment', 'text'):
                    revisiondata[key] = self.findtext(revision, key)
                timestamp = self.findtext(revision, 'timestamp')
                revisiondata['timestamp'] = self.parse_iso8601(timestamp)
                revisions.append(revisiondata)

            revisions.sort(key=lambda rev: rev['timestamp'])

            # XXX: why is root.clear() not sufficient?
            page.clear()
            while page.getprevious() is not None:
                del page.getparent()[0]
            root.clear()

            yield pagedata, revisions
Example #27
def LossCurveParser(input_file):

    refs = []
    longitude = []
    latitude = []
    losses = []
    poes = []
    meta_info = {}

    for _, element in etree.iterparse(input_file):
        if element.tag == '%slossCurves' % xmlNRML:
            meta_info = parse_metadata(element)
        elif element.tag == '%slossCurve' % xmlNRML:
            lon, lat, ref, poe, loss = parse_single_loss_curve(element)
            longitude.append(lon)
            latitude.append(lat)
            refs.append(ref)
            poes.append(poe)
            losses.append(loss)
        else:
            continue
    longitude = np.array(longitude)
    latitude = np.array(latitude)
    
    return refs, longitude, latitude, poes, losses
Example #28
def process_data(inputdump, outputdir, maxfilesize, compress, outformat):
    
    # we expect large dumps, so we use the iterparse method
    context = etree.iterparse(inputdump)
    context = iter(context)
    
    # discover prefix from the xml dump file
    # /mediawiki/siteinfo/base
    prefix = None    
    for event, elem in context:
        if event == "end" and elem.tag.endswith("base"):
            prefix =  elem.text[:elem.text.rfind("/")]
            break    
    print "base url: %s" % prefix
    
    # initialize wiki page queue
    queue = Queue.Queue(maxsize=1024)

    # start worker threads    
    for _ in range(multiprocessing.cpu_count()):
        cleaner = WikiCleanerThread(queue, outputdir, maxfilesize, prefix, compress, outformat)
        cleaner.setDaemon(False)
        cleaner.start()
    
    # put element pages in the queue to be processed by the cleaner threads
    for event, elem in context:
        if event == "end" and elem.tag.endswith("page"):
            queue.put(elem)
    
    print "finishing..."
Example #29
def sentence_generator(filename, gzipped=True, structure=False):
    """Returns metadata, optionally the sentence structure, and the sentence itself.
    Each sentence is represented as a list of Token objects. Tokens are named tuples,
    with the following values: ['token', 'POS', 'lemma', 'depid', 'dephead', 'deprel']
    
    Arguments
    ---------
    filename:  filename
    gzipped:   assumes the file is gzipped. Change to False for unpacked files
    structure: assumes we don't need information about sentence structure.
               change to True to get this info.
    """
    source = gzip.GzipFile(filename) if gzipped else filename
    parser = etree.iterparse(source, html=True, events=('start','end',), tag='s')
    # get_full_sentence_data() returns the structure and a list of tokens
    # get_sentence_data() returns a list of tokens
    data_func = get_full_sentence_data if structure else get_sentence_data
    for event, element in parser:
        if event == 'start':
            # element.attrib is a dictionary with metadata for the sentence.
            yield (element.attrib, data_func(element))
            opening_element = element
        elif event == 'end':
            clear_references(opening_element)
            clear_references(element)
    # Aggressively keep memory load down
    del parser
Example #30
def augment_with_region(in_file='../personlist.xml', out_file='personlist_with_region_iterparse.xml', etree=etree):
    """
    Try to minimise memory overhead by serialising as the data comes in.
    """
    region_element = etree.Element('region')

    region_index = build_region_index(etree=etree)

    context = etree.iterparse(in_file, tag='person')

    with open(out_file, 'w') as out:
        out.write('<personlist>\n')

        for _, person in context:
            # find city and country of each person
            city = person.findtext('address/city')
            country = person.findtext('address/country')
            if not city or not country:
                continue

            # insert region tag
            region_element.text = region_index.get((city,country))
            city_el = person.find('address/city')
            city_el.addnext(deepcopy(region_element))

            # serialise into target file
            out.write(etree.tostring(person))

            # clear processed content
            person.clear()

        out.write('\n</personlist>')
Example #31
def context_iter(dblp_path):
    """Create a dblp data iterator of (event, element) pairs for processing"""
    return etree.iterparse(source=dblp_path,
                           dtd_validation=True,
                           load_dtd=True)  # required dtd
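A short usage sketch of the iterator returned by context_iter() (the dblp path is a placeholder; dblp.dtd must be resolvable next to it because the parser validates against the DTD):

for event, elem in context_iter('dblp.xml'):
    if elem.tag == 'article':        # only 'end' events are emitted by default
        print(elem.findtext('title'))
        elem.clear()                 # free the processed element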
Example #32
# Parse buildings from output of osmosis and output corresponding keys
# Apache License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0


from lxml.etree import XMLParser, parse , iterparse, tostring
p = XMLParser(huge_tree=True)
import sys
 
 
# write to disk 
fo = open("osm-buildings.tsv","w")
 
# Documentation on iterparse
# http://effbot.org/zone/element-iterparse.htm
 
context = iterparse(sys.stdin, events=("start", "end"), huge_tree=True)
context = iter(context)
event, root = context.next()
 
print "wayid\ttimestamp\tuid\tuser\tchangeset\tkey\tvalue" 
for event, elem in context:
    if event == "end" and elem.tag == "way":
        wayid = unicode(elem.get("id"))
        timestamp = unicode(elem.get("timestamp"))
        uid = unicode(elem.get("uid")) 
        user = unicode(elem.get("user"))
        changeset = unicode(elem.get("changeset"))

        td = {}
        for i in elem.findall("tag"):
            if isinstance(i.get("v"), str):
                td[i.get("k")] = i.get("v")  # likely intent; the original snippet is truncated here
Example #33
    def deserialize(self, source: Union[IO, str], typesystem: TypeSystem):
        # namespaces
        NS_XMI = "{http://www.omg.org/XMI}"
        NS_CAS = "{http:///uima/cas.ecore}"

        TAG_XMI = NS_XMI + "XMI"
        TAG_CAS_NULL = NS_CAS + "NULL"
        TAG_CAS_SOFA = NS_CAS + "Sofa"
        TAG_CAS_VIEW = NS_CAS + "View"

        OUTSIDE_FS = 1
        INSIDE_FS = 2
        INSIDE_ARRAY = 3

        sofas = []
        views = {}
        feature_structures = {}
        children = defaultdict(list)

        context = etree.iterparse(source, events=("start", "end"))

        state = OUTSIDE_FS

        for event, elem in context:
            if elem.tag == TAG_XMI or elem.tag == TAG_CAS_NULL:
                pass
                # Ignore the 'xmi:XMI' and 'cas:NULL' elements
            elif elem.tag == TAG_CAS_SOFA:
                if event == "end":
                    sofa = self._parse_sofa(elem)
                    sofas.append(sofa)
            elif elem.tag == TAG_CAS_VIEW:
                if event == "end":
                    proto_view = self._parse_view(elem)
                    views[proto_view.sofa] = proto_view
            else:
                """
                In XMI, array element features can be encoded as
                
                <cas:StringArray>
                    <elements>LNC</elements>
                    <elements>MTH</elements>
                    <elements>SNOMEDCT_US</elements>
                </cas:StringArray>
                
                In order to parse this with an incremental XML parser, we need to employ 
                a simple state machine. It is depicted in the following.
                            
                                   "start"               "start"
                     +-----------+-------->+-----------+-------->+--------+
                     | Outside   |         | Inside    |         | Inside |
                +--->+ feature   |         | feature   |         | array  |
                     | structure |         | structure |         | element|
                     +-----------+<--------+-----------+<--------+--------+
                                    "end"                 "end"                                
                """
                if event == "start":
                    if state == OUTSIDE_FS:
                        # We saw the opening tag of a new feature structure
                        state = INSIDE_FS
                    elif state == INSIDE_FS:
                        # We saw the opening tag of an array element
                        state = INSIDE_ARRAY
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'start'".format(
                                state))
                elif event == "end":
                    if state == INSIDE_FS:
                        # We saw the closing tag of a new feature
                        state = OUTSIDE_FS
                        fs = self._parse_feature_structure(
                            typesystem, elem, children)
                        feature_structures[fs.xmiID] = fs

                        children.clear()
                    elif state == INSIDE_ARRAY:
                        # We saw the closing tag of an array element
                        children[elem.tag].append(elem.text)
                        state = INSIDE_FS
                    else:
                        raise RuntimeError(
                            "Invalid state transition: [{0}] 'end'".format(
                                state))
                else:
                    raise RuntimeError(
                        "Invalid XML event: [{0}]".format(event))

            # Free already processed elements from memory
            if event == "end":
                self._clear_elem(elem)

        if len(sofas) != len(views):
            raise RuntimeError("Number of views and sofas is not equal!")

        # Post-process feature values
        for xmi_id, fs in feature_structures.items():
            t = typesystem.get_type(fs.type)

            for feature in t.all_features:
                feature_name = feature.name

                if feature_name == "sofa":
                    continue

                if typesystem.is_primitive(
                        feature.rangeTypeName
                ) or typesystem.is_primitive_collection(feature.rangeTypeName):
                    # TODO: Parse feature values to their real type here, e.g. parse ints or floats
                    continue

                # Resolve references here
                value = getattr(fs, feature_name)
                if value is None:
                    continue

                # Resolve references
                if typesystem.is_collection(feature.rangeTypeName):
                    # A collection of references is a list of integers separated
                    # by single spaces, e.g. <foo:bar elements="1 2 3 42" />
                    targets = []
                    for ref in value.split():
                        target_id = int(ref)
                        target = feature_structures[target_id]
                        targets.append(target)
                    setattr(fs, feature_name, targets)
                else:
                    target_id = int(value)
                    target = feature_structures[target_id]
                    setattr(fs, feature_name, target)

        cas = Cas(typesystem)
        for sofa in sofas:
            proto_view = views[sofa.xmiID]

            if sofa.sofaID == "_InitialView":
                view = cas.get_view("_InitialView")
            else:
                view = cas.create_view(sofa.sofaID)

            view.sofa_string = sofa.sofaString
            view.sofa_mime = sofa.mimeType

            for member_id in proto_view.members:
                annotation = feature_structures[member_id]

                view.add_annotation(annotation)

        return cas
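The array encoding described in the docstring above can be exercised on its own; a minimal sketch of the same three-state walk over a tiny in-memory document (independent of the typesystem machinery) is:

from io import BytesIO
from lxml import etree

XML = (b'<cas:StringArray xmlns:cas="http:///uima/cas.ecore">'
       b'<elements>LNC</elements><elements>MTH</elements>'
       b'</cas:StringArray>')

OUTSIDE_FS, INSIDE_FS, INSIDE_ARRAY = 1, 2, 3
state = OUTSIDE_FS
values = []

for event, elem in etree.iterparse(BytesIO(XML), events=("start", "end")):
    if event == "start":
        # Opening tag: either the feature structure or one of its array elements.
        state = INSIDE_FS if state == OUTSIDE_FS else INSIDE_ARRAY
    else:
        if state == INSIDE_ARRAY:
            values.append(elem.text)   # closing tag of an <elements> child
            state = INSIDE_FS
        else:
            state = OUTSIDE_FS         # closing tag of the feature structure itself

print(values)  # ['LNC', 'MTH']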
Example #34
def xml2bin(infilename, outfilename, type='pickle', insist_on_living=True):
    genes = list()
    # list of dicts

    xf = open(infilename, 'rb')
    #xf=open('short1.xml','r') ;
    if not isfile(outfilename):
        outfile = open(outfilename, 'wb')
        outfile.close()

    outfile = ''
    if type == 'pickle':
        outfile = open(outfilename, 'ab')
    elif type == 'tsv':
        outfile = open(outfilename, 'at')
    pp = pprint.PrettyPrinter(indent=4)
    #tree=etree.parse(xf) ;
    #root=tree.getroot() ;
    count = 0
    for event, element in etree.iterparse(xf,
                                          events=('end', ),
                                          tag='Entrezgene'):
        #for element in root.iter("Entrezgene") :
        #element=prelement ;
        #element=etree.fromstring(etree.tostring(prelement))

        if event != 'end':
            continue

        try:
            genetype = eseek(element, 'Entrezgene_type').get(
                'value')  #  == 'protein-coding' and
            if (not insist_on_living
                    or element[0][0][1].get('value') == 'live'):
                newGene = dict()
                extDBrefs = list()
                newGene.update({'External': extDBrefs})
                newGene.update({'genetype': genetype})
            else:

                element.clear()
                for ancestor in element.xpath('ancestor-or-self::*'):
                    while ancestor.getprevious() is not None:
                        del ancestor.getparent()[0]

                continue
        except KeyError:
            sys.stderr.write('Unclear type.\n')

        newGene.update({'EID': element[0][0][0].text})

        # 3 is the Entrezgene_gene child
        newGene.update({'Symbol': element[3][0][0].text})

        try:
            newGene.update({'Location': element[3][0][2].text})
        except IndexError:
            newGene.update({'Location': None})

        newGene.update(
            {'Taxon': gettext(element[2][0].find(".//Object-id_id"))})
        #<Entrezgene_source> 2
        #    <BioSource> 0
        #      <BioSource_genome value="genomic">1</BioSource_genome>
        #      <BioSource_origin value="natural">1</BioSource_origin>
        #      <BioSource_org> 2
        #        <Org-ref> 0
        #          <Org-ref_taxname>Homo sapiens</Org-ref_taxname>
        #          <Org-ref_common>human</Org-ref_common>
        #          <Org-ref_db> 2
        #            <Dbtag>  0
        #              <Dbtag_db>taxon</Dbtag_db>
        #              <Dbtag_tag> 1
        #                <Object-id> 0
        #                  <Object-id_id>9606</Object-id_id> 0
        #                </Object-id>

        try:
            # gene-ref_db
            for c in list(eseek(element[3][0], 'Gene-ref_db')):
                # 3-0-3 : Gene-ref_db  (i tried, anyway)
                #newGene['External'].update({ c[0].text : c[1][0][0].text }) ;
                newGene['External'].append(c[0].text + ":" + c[1][0][0].text)
        except (IndexError, KeyError):
            newGene.update({'External': None})

        try:
            syns = list()
            for c in list(element[3][0][4]):
                syns.append(c.text)

            newGene.update({'Synonym': syns})
        except IndexError:
            newGene.update({'Synonym': None})

        # save this one for later-- we may need to do a sub-iteration
        try:
            for dbitem in element.iterfind('.//Dbtag_db'):
                if dbitem.text == 'UniProtKB/Swiss-Prot':
                    if 'SwissProt' not in newGene:  # get the first one
                        newGene.update(
                            {'SwissProt': dbitem.getparent()[1][0][0].text})
                    break
            else:
                newGene.update({'SwissProt': None})
        except IndexError:
            newGene.update({'SwissProt': None})

        try:
            for dbitem in element.iterfind('.//Dbtag_db'):
                if dbitem.text == 'UniProtKB/TrEMBL':
                    newGene.update(
                        {'TrEMBL': dbitem.getparent()[1][0][0].text})
                    break
            else:
                newGene.update({'TrEMBL': None})
        except IndexError:
            newGene.update({'TrEMBL': None})

        # changed to list
        try:
            newGene.update(
                {'Pubmed': [e.text for e in element.findall('.//PubMedId')]})
        except KeyError:
            newGene.update({'Pubmed': None})

        try:
            #newGene.update({ 'Peptide' : \
            # getProtAccs(eseek(element,'Entrezgene_locus').findall(".//Gene-commentary_type"))  +
            # getProtAccs(eseek(element,'Entrezgene_comments').findall(".//Gene-commentary_type")) }) ;

            newGene.update({
                'Peptide':
                getProtAccs(element.findall(".//Gene-commentary_type"))
            })

        except KeyError:
            newGene.update({'Peptide': None})
        # 8-0-5 : Entrexgene_comments, Gene-commentary, Gene-commentary_products

        try:
            newGene.update({
                'mRNA':
                getRNAAccs(element.findall(".//Gene-commentary_type"))
            })
        except KeyError:
            newGene.update({'mRNA': None})

        try:
            newGene.update(
                {'Summary': eseek(element, 'Entrezgene_summary').text})
        except KeyError:
            newGene.update({'Summary': None})

        ### NEW AND EXTREMELY BOSS : CDD DOMAINS !!!!1111ONE!!
        try:
            CDDentries = set()
            for dbitem in element.iterfind('.//Dbtag_db'):
                if dbitem.text == 'CDD':
                    CDDentries.add(dbitem.getparent()[1][0][0].text)

            if not CDDentries:
                newGene.update({'CDD': None})
            else:
                newGene.update({'CDD': list(CDDentries)})
        except IndexError:
            # not super sure why this would ever happen here
            newGene.update({'CDD': None})
            raise ValueError

        element.clear()
        for ancestor in element.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]

        if type == 'pickle':
            pickle.dump(newGene, outfile, protocol=2)
        elif type == 'tsv':
            line = ''
            counter = 0
            for k, v in sorted(newGene.items()):
                if isinstance(v, list):
                    v = ";".join(v)
                elif v == None:
                    v = ''
                if counter == 0:
                    line = v
                else:
                    line = line + "\t" + v
                counter = counter + 1
            line = re.sub(r'\n', '', line)
            outfile.write(line + "\n")

        outfile.flush()

    outfile.close()
Example #35
    def collect_entities_from_dump(
            self,
            limit_per_query,  # for consistent API
            n_queries=None,
            # no effect, for consistent API only
            include_wikipedia=True,
            delay_wikipedia_retrieval=True,
            pre_filter=None,
            **kwargs):
        """
        iteratively parse a xml-dump (with embedded json entities) for entities
        of interest, using bz2file.
        Note: the pure JSON does not contain all relevant meta-info (e. g.
        timestamps and revision IDs)
        :param pre_filter:
        :param n_queries:
        :param limit_per_query: maximum items to be read in (for debugging/testing)
        :type limit_per_query: int
        :return: list of entities to be updated
        :rtype: list
        """
        if self.dump_path is None:
            raise ValueError('Dump path required!')

        if pre_filter is None:
            pre_filter = [(lambda entity, entity_type: True, {})]

        def best_guess_open(file_name):
            """
            Use bz2file to iterate over a compressed file,
            regular open otherwise."""
            if file_name.endswith('.bz2'):
                return BZ2File(file_name)
            elif file_name.endswith('.gz'):
                return gzip.open(file_name)
            else:
                return open(file_name)

        dump_path = self.dump_path
        try:
            if not self.all_relevant_categories and not self.process_all:
                self.all_relevant_categories = self.get_relevant_category_ids(
                    self.entity_types)
        except Exception as e:
            raise e
        with best_guess_open(dump_path) as xml_file:
            parser = et.iterparse(xml_file, events=('end', ))
            try:
                for events, elem in parser:
                    if elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}timestamp':
                        timestamp = elem.text
                    elif elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}text':

                        if not elem.text:
                            del elem
                            del events
                            continue

                        try:
                            elem_content = ujson.loads(elem.text)
                            assert isinstance(elem_content, dict)
                        except (ValueError, AssertionError):
                            del elem
                            del events
                            continue
                        try:
                            elem_content['timestamp'] = timestamp
                            del timestamp
                        except NameError:
                            logger.warning(
                                "Item %s cannot be assigned a timestamp!",
                                elem_content['id'])
                        try:
                            category = self.determine_relevant_category(
                                elem_content)
                            assert category
                        except (
                                ValueError,  # if the JSON is empty
                                AssertionError
                                # if the entity doesn't fit search categories
                        ):
                            del elem
                            del events
                            continue
                        pre_filter_result = all([
                            filter_function(entity=elem_content,
                                            entity_type=category,
                                            **filter_params)
                            for filter_function, filter_params in pre_filter
                        ])
                        if pre_filter_result:
                            try:
                                for entity in collect_attributes_from_wp_and_wd(
                                        elem_content,
                                        include_wikipedia=include_wikipedia,
                                        delay_wikipedia_retrieval=
                                        delay_wikipedia_retrieval,
                                        entity_type=category,
                                        **kwargs):
                                    entity['category'] = category
                                    if include_wikipedia and not delay_wikipedia_retrieval:
                                        for language_result in merge_with_wikipedia_by_language(
                                                entity=entity,
                                                languages=kwargs['languages']):
                                            yield language_result
                                    else:
                                        yield entity

                            except (DoesNotMatchFilterError, ValueError) as e:
                                # this probably means no Wikipedia page in any
                                # of our languages, or failure to match the
                                # filter criteria; we have no use for such entities.
                                del elem
                                del events
                                continue
                            except Exception as e:
                                raise e

                    del elem
                    del events
            except (EOFError, IOError) as e:
                logger.warning('Error parsing file %s: %s',
                               dump_path, e,
                               exc_info=True)
Example #36
def load_data(data_type, min_usr_freq=5):
    question_dict, question_order = {}, []
    usr_idx_dict = {}  # usr
    post_file = PROJECT_PATH + 'Posts.xml'
    active_usrs, temp_usrs = get_active_usrs(post_file,
                                             min_usr_freq=min_usr_freq)
    temp_question_ids, temp_usr_ids = [], [
    ]  # questions used for temporal test

    parser = etree.iterparse(post_file, events=('end', ), tag='row')
    for i, (_, elem) in enumerate(parser):
        attr = dict(elem.attrib)

        # Output to separate files
        if attr['PostTypeId'] == '1':  # question post
            id, title, content, date, ans_id = parse_question(attr)
            # if question doesn't contain the accepted answer, skip the question
            if not ans_id:
                continue
            question_dict[id] = {
                "id": id,
                "title": title,
                "content": content,
                "date": date,
                "answers": [],
                "accept_ans": ans_id
            }
            question_order.append(id)
        elif attr['PostTypeId'] == '2':  # answer post
            parent_id, id, content, usr_name, score, date = parse_answer(attr)

            if (parent_id not in question_dict) or (not usr_name):
                continue

            if (usr_name not in active_usrs) and (usr_name not in temp_usrs):
                continue

            # assign user id
            if usr_name not in usr_idx_dict:
                usr_idx = len(usr_idx_dict.keys())
                usr_idx_dict[usr_name] = usr_idx
            else:
                usr_idx = usr_idx_dict[usr_name]

            answer_tuple = (id, content, usr_idx, score, date)
            question_dict[parent_id]["answers"].append(answer_tuple)

            if usr_name in temp_usrs:
                temp_question_ids.append(parent_id)

    # filter questions without any answers
    question_dict, num_removed = filter_unanswer_question(question_dict)

    # add data by its different data orders
    if data_type == "rand":
        shuffle(question_order)

    questions, temp_questions = [], []
    for id in question_order:
        if id not in question_dict:
            continue

        if id in temp_question_ids:
            temp_questions.append(question_dict[id])
        else:
            questions.append(question_dict[id])

    temp_usr_ids = [usr_idx_dict[i] for i in temp_usrs if i in usr_idx_dict]
    print("Total questions: ",
          str(len(questions)),
          " Users: ",
          str(len(usr_idx_dict.keys())),
          sep="")
    print("Temp Users IDs:", " ".join([str(i) for i in temp_usr_ids]))
    return questions, usr_idx_dict, temp_questions, temp_usr_ids
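Note that load_data above keeps every parsed <row> element alive, so a large Posts.xml can exhaust memory. A minimal sketch of the usual lxml clean-up idiom (function and path names here are hypothetical):

from lxml import etree

def iter_rows(xml_path):
    # Stream <row> elements and release each one after it has been handled.
    for _, elem in etree.iterparse(xml_path, events=('end', ), tag='row'):
        yield dict(elem.attrib)
        elem.clear()
        # Drop references kept alive by already-processed siblings.
        while elem.getprevious() is not None:
            del elem.getparent()[0]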
Example no. 37
0
cluster = MongoClient(
    'mongodb://localhost:27017/?readPreference=primary&appname=MongoDB%20Compass&ssl=false'
)

path_to_file = 'dblp-2021-04-01.xml'
dtd_path = 'dblp.dtd'

dtd = etree.DTD(dtd_path)

count = 0

db = cluster["dblp"]
coll = db['data']
context = etree.iterparse(path_to_file,
                          dtd_validation=True,
                          tag="phdthesis",
                          events=('start', 'end'))
for event, element in context:
    # Children are only fully parsed on the 'end' event; skipping 'start'
    # also avoids inserting each thesis twice.
    if event != 'end':
        continue
    record = {}
    ee = []
    for child in element:
        if child.tag == 'ee':
            ee.append(child.text)
        else:
            record[child.tag] = child.text
    record['ee'] = ee
    coll.insert_one(record)
    element.clear()
    del record
    del ee
    count = count + 1
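The dtd object above is built but never consulted; dtd_validation=True already makes iterparse validate against the DTD referenced in the file's DOCTYPE. If explicit validation against dblp.dtd is preferred, a hedged sketch using the same file and tag:

from lxml import etree

dtd = etree.DTD('dblp.dtd')
for _, element in etree.iterparse('dblp-2021-04-01.xml',
                                  load_dtd=True,  # needed to resolve DTD-declared entities
                                  tag='phdthesis'):
    if not dtd.validate(element):
        print(dtd.error_log.filter_from_errors())
    element.clear()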
Example no. 38
0
def restore_tags(SOURCENOTAGSTOKSP, SOURCETAGSTOKSP, SELECTEDALIGNMENT,
                 TARGETNOTAGSTOKSP):
    relations = {}
    for t in SELECTEDALIGNMENT.split(" "):
        camps = t.split("-")
        if not int(camps[0]) in relations:
            relations[int(camps[0])] = []
        relations[int(camps[0])].append(int(camps[1]))
    SOURCETAGSTOKSPMOD = "<s> " + SOURCETAGSTOKSP + " </s>"
    f = io.BytesIO(SOURCETAGSTOKSPMOD.encode('utf-8'))
    events = ("start", "end")
    context = ET.iterparse(f, events=events, recover=True)
    cont_g = -1
    tags = []
    tagpairs = []
    LISTSOURCETAGSTOKSP = splitTags(SOURCETAGSTOKSP)
    #LISTSOURCETAGSTOK=removeSpeChar(LISTSOURCETAGSTOKSP,spechar)
    LISTSOURCETAGSTOK = LISTSOURCETAGSTOKSP
    LISTSOURCENOTAGSTOKSP = splitTags(SOURCENOTAGSTOKSP)
    LISTTARGETNOTAGSTOKSP = splitTags(TARGETNOTAGSTOKSP)
    TEMPLIST = LISTSOURCETAGSTOKSP
    charbefore = {}
    charafter = {}
    print("SOURCETAGSTOKSP", SOURCETAGSTOKSP)
    for event, elem in context:
        if not elem.tag == "s":
            tag = elem.tag
            attr = elem.items()
            if event == "start":
                if len(attr) == 0:
                    xmltag = "<" + tag + ">"
                    if SOURCETAGSTOKSP.find(xmltag) > -1: tags.append(xmltag)
                else:
                    lat = []
                    for at in attr:
                        cadena = at[0] + "='" + str(at[1]) + "'"
                        lat.append(cadena)
                    cat = " ".join(lat)
                    xmltag1 = "<" + tag + " " + cat + ">"
                    if SOURCETAGSTOKSP.find(xmltag1) > -1:
                        tags.append(xmltag1)
                        xmltag = xmltag1

                    lat = []
                    for at in attr:
                        cadena = at[0] + '="' + str(at[1]) + '"'
                        lat.append(cadena)
                    cat = " ".join(lat)
                    xmltag2 = "<" + tag + " " + cat + ">"
                    if SOURCETAGSTOKSP.find(xmltag2) > -1:
                        tags.append(xmltag2)
                        xmltag = xmltag2

                closingtag = "</" + tag + ">"
                if SOURCETAGSTOKSP.find(closingtag) > -1:
                    tripleta = (xmltag, closingtag, elem.text)
                    tagpairs.append(tripleta)

            elif event == "end":
                xmltag = "</" + tag + ">"
                if SOURCETAGSTOKSP.find(xmltag) > -1:
                    tags.append(xmltag)

                xmltag = "<" + tag + "/>"
                if SOURCETAGSTOKSP.find(xmltag) > -1:
                    tags.append(xmltag)

    preTags = []
    postTags = []
    for xmltag in tags:
        if SOURCETAGSTOKSP.find(xmltag) > -1:
            chbf = SOURCETAGSTOKSP[SOURCETAGSTOKSP.index(xmltag) - 1]
            charbefore[xmltag] = chbf
            chaf = SOURCETAGSTOKSP[SOURCETAGSTOKSP.index(xmltag) + len(xmltag)]
            charafter[xmltag] = chaf

    tagsCHAR = {}

    for tag in tags:
        tagC = tag
        if tag in charbefore:
            cb = charbefore[tag].strip()
            tagC = cb + tag

        if tag in charafter:
            ca = charafter[tag].strip()
            tagC = tagC + ca
        tagsCHAR[tag] = tagC

    for i in range(0, len(LISTTARGETNOTAGSTOKSP) + 1):
        preTags.insert(i, None)
        postTags.insert(i, None)

    for tripleta in tagpairs:
        #source positions
        sourcepositions = []
        for ttrip in tripleta[2].strip().split(" "):
            try:
                postemp = LISTSOURCENOTAGSTOKSP.index(ttrip.strip())
                sourcepositions.append(postemp)
            except:
                pass
        try:
            tags.remove(tripleta[0])
            TEMPLIST.remove(tripleta[0])

        except:
            pass
        try:
            tags.remove(tripleta[1])
            TEMPLIST.remove(tripleta[1])
        except:
            pass
        #target positions
        targetpositions = []
        for position in sourcepositions:
            if position in relations:
                targetpositions.extend(relations[position])

        if targetpositions:
            preTags[min(targetpositions)] = tagsCHAR[tripleta[0]]
            postTags[max(targetpositions)] = tagsCHAR[tripleta[1]]

    #isolated tags
    for tag in tags:
        try:
            preTags[relations[TEMPLIST.index(tag)][0]] = tag
            TEMPLIST.remove(tag)
        except:
            pass

    LISTTARGETTAGSTOKSP = []
    for i in range(0, len(LISTTARGETNOTAGSTOKSP)):
        try:
            if preTags[i]:
                LISTTARGETTAGSTOKSP.append(preTags[i])
            LISTTARGETTAGSTOKSP.append(LISTTARGETNOTAGSTOKSP[i])
            if postTags[i]:
                LISTTARGETTAGSTOKSP.append(postTags[i])
        except:
            pass
    translationTagsSP = " ".join(LISTTARGETTAGSTOKSP)
    return (translationTagsSP)
Ejemplo n.º 39
0
    def _validate_event_files(self):
        """
        Validates all event files in the currently active project.

        The following tasks are performed:
            * Validate against QuakeML 1.2 scheme.
            * Check for duplicate ids amongst all QuakeML files.
            * Make sure they contain at least one origin, magnitude and focal
              mechanism object.
            * Some simple sanity checks so that the event depth is reasonable
              and the moment tensor values as well. This is rather fragile and
              mainly intended to detect values specified in wrong units.
            * Events that are too close in time. Events that are less than one
              hour apart can in general not be used for adjoint tomography.
              This will naturally also detect duplicate events.
        """
        import collections
        import itertools
        import math
        from obspy import read_events
        from obspy.io.quakeml.core import _validate as validate_quakeml
        from lxml import etree

        print "Validating %i event files ..." % self.comm.events.count()

        # Start with the schema validation.
        print "\tValidating against QuakeML 1.2 schema ",
        all_valid = True
        for event in self.comm.events.get_all_events().values():
            filename = event["filename"]
            self._flush_point()
            if validate_quakeml(filename) is not True:
                all_valid = False
                msg = (
                    "ERROR: "
                    "The QuakeML file '{basename}' did not validate against "
                    "the QuakeML 1.2 schema. Unfortunately the error messages "
                    "delivered by lxml are not useful at all. To get useful "
                    "error messages make sure jing is installed "
                    "('brew install jing' (OSX) or "
                    "'sudo apt-get install jing' (Debian/Ubuntu)) and "
                    "execute the following command:\n\n"
                    "\tjing http://quake.ethz.ch/schema/rng/QuakeML-1.2.rng "
                    "{filename}\n\n"
                    "Alternatively you could also use the "
                    "'lasif add_spud_event' command to redownload the event "
                    "if it is in the GCMT "
                    "catalog.\n\n").format(basename=os.path.basename(filename),
                                           filename=os.path.relpath(filename))
                self._add_report(msg)
        if all_valid is True:
            self._print_ok_message()
        else:
            self._print_fail_message()

        # Now check for duplicate public IDs.
        print "\tChecking for duplicate public IDs ",
        ids = collections.defaultdict(list)
        for event in self.comm.events.get_all_events().values():
            filename = event["filename"]
            self._flush_point()
            # Now walk all files and collect all public ids. Each should be
            # unique!
            with open(filename, "rt") as fh:
                for event, elem in etree.iterparse(fh, events=("start", )):
                    if "publicID" not in elem.keys() or \
                            elem.tag.endswith("eventParameters"):
                        continue
                    ids[elem.get("publicID")].append(filename)
        ids = {
            key: list(set(value))
            for (key, value) in ids.iteritems() if len(value) > 1
        }
        if not ids:
            self._print_ok_message()
        else:
            self._print_fail_message()
            self._add_report(
                "Found the following duplicate publicIDs:\n" + "\n".join([
                    "\t%s in files: %s" % (id_string, ", ".join(
                        [os.path.basename(i) for i in faulty_files]))
                    for id_string, faulty_files in ids.iteritems()
                ]),
                error_count=len(ids))

        def print_warning(filename, message):
            self._add_report("WARNING: File '{event_name}' "
                             "contains {msg}.\n".format(
                                 event_name=os.path.basename(filename),
                                 msg=message))

        # Performing simple sanity checks.
        print "\tPerforming some basic sanity checks ",
        all_good = True
        for event in self.comm.events.get_all_events().values():
            filename = event["filename"]
            self._flush_point()
            cat = read_events(filename)
            filename = os.path.basename(filename)
            # Check that all files contain exactly one event!
            if len(cat) != 1:
                all_good = False
                print_warning(filename,
                              "%i events instead of only one." % len(cat))
            event = cat[0]

            # Sanity checks related to the origin.
            if not event.origins:
                all_good = False
                print_warning(filename, "no origin")
                continue
            origin = event.preferred_origin() or event.origins[0]
            if (origin.depth % 100.0):
                all_good = False
                print_warning(
                    filename, "a depth of %.1f meters. This kind of accuracy "
                    "seems unrealistic. The depth in the QuakeML "
                    "file has to be specified in meters. Checking "
                    "all other QuakeML files for the correct units "
                    "might be a good idea" % origin.depth)
            if (origin.depth > (800.0 * 1000.0)):
                all_good = False
                print_warning(
                    filename, "a depth of more than 800 km. This is"
                    " likely wrong.")

            # Sanity checks related to the magnitude.
            if not event.magnitudes:
                all_good = False
                print_warning(filename, "no magnitude")
                continue

            # Sanity checks related to the focal mechanism.
            if not event.focal_mechanisms:
                all_good = False
                print_warning(filename, "no focal mechanism")
                continue

            focmec = event.preferred_focal_mechanism() or \
                event.focal_mechanisms[0]
            if not hasattr(focmec, "moment_tensor") or \
                    not focmec.moment_tensor:
                all_good = False
                print_warning(filename, "no moment tensor")
                continue

            mt = focmec.moment_tensor
            if not hasattr(mt, "tensor") or \
                    not mt.tensor:
                all_good = False
                print_warning(filename, "no actual moment tensor")
                continue
            tensor = mt.tensor

            # Convert the moment tensor to a magnitude and see if it is
            # reasonable.
            mag_in_file = event.preferred_magnitude() or event.magnitudes[0]
            mag_in_file = mag_in_file.mag
            M_0 = 1.0 / math.sqrt(2.0) * math.sqrt(tensor.m_rr**2 +
                                                   tensor.m_tt**2 +
                                                   tensor.m_pp**2)
            magnitude = 2.0 / 3.0 * math.log10(M_0) - 6.0
            # Use some buffer to account for different magnitudes.
            if not (mag_in_file - 1.0) < magnitude < (mag_in_file + 1.0):
                all_good = False
                print_warning(
                    filename, "a moment tensor that would result in a moment "
                    "magnitude of %.2f. The magnitude specified in "
                    "the file is %.2f. Please check that all "
                    "components of the tensor are in Newton * meter" %
                    (magnitude, mag_in_file))

        if all_good is True:
            self._print_ok_message()
        else:
            self._print_fail_message()

        # Collect event times
        event_infos = self.comm.events.get_all_events().values()

        # Now check the time distribution of events.
        print "\tChecking for duplicates and events too close in time %s" % \
              (self.comm.events.count() * "."),
        all_good = True
        # Sort the events by time.
        event_infos = sorted(event_infos, key=lambda x: x["origin_time"])
        # Loop over adjacent indices.
        a, b = itertools.tee(event_infos)
        next(b, None)
        for event_1, event_2 in itertools.izip(a, b):
            time_diff = abs(event_2["origin_time"] - event_1["origin_time"])
            # If time difference is under one hour, it could be either a
            # duplicate event or interfering events.
            if time_diff <= 3600.0:
                all_good = False
                self._add_report(
                    "WARNING: "
                    "The time difference between events '{file_1}' and "
                    "'{file_2}' is only {diff:.1f} minutes. This could "
                    "be either due to a duplicate event or events that have "
                    "interfering waveforms.\n".format(
                        file_1=event_1["filename"],
                        file_2=event_2["filename"],
                        diff=time_diff / 60.0))
        if all_good is True:
            self._print_ok_message()
        else:
            self._print_fail_message()

        # Check that all events fall within the chosen boundaries.
        print "\tAssure all events are in chosen domain %s" % \
              (self.comm.events.count() * "."),
        all_good = True
        domain = self.comm.project.domain
        for event in event_infos:
            if domain.point_in_domain(latitude=event["latitude"],
                                      longitude=event["longitude"]):
                continue
            all_good = False
            self._add_report(
                "\nWARNING: "
                "Event '{filename}' is out of bounds of the chosen domain."
                "\n".format(filename=event["filename"]))
        if all_good is True:
            self._print_ok_message()
        else:
            self._print_fail_message()
Example no. 40
0
def element_generator(input_file, template, root_tag, is_whole_element):
    root_tag_with_namespace = "{*}" + root_tag
    for event, elem in etree.iterparse(input_file,
                                       tag=root_tag_with_namespace):
        yield (etree.tostring(elem), template.get(root_tag), is_whole_element)
        elem.clear()
Example no. 41
0
stats = {}
stats['addresses'] = 0
stats['ways'] = {}
stats['nodes'] = {}
# Prepare changesets and stats to hold changes by tag name
for tag in tags:
    stats['nodes'][tag] = 0
    stats['ways'][tag] = 0
    changesets[tag] = {}

sys.stderr.write('finding points\n')

# ------------------------------------------
# Find nodes that fall within specified area
# ------------------------------------------
context = iter(etree.iterparse(osc_file, events=('start', 'end')))
event, root = context.next()
for event, n in context:
    if event == 'start':
        if n.tag == 'node':
            lon = float(n.get('lon', 0))
            lat = float(n.get('lat', 0))
            if point_in_box(lon, lat, aoi_box) and point_in_poly(lon, lat, aoi_poly):
                cid = n.get('changeset')
                nid = n.get('id', -1)
                nids.add(nid)
                ntags = n.findall(".//tag[@k]")
                addr_tags = getaddresstags(ntags)
                version = int(n.get('version'))

                # Capture address changes
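point_in_box and point_in_poly are helper functions defined elsewhere in this script; a minimal sketch of the bounding-box test, assuming aoi_box is a (min_lon, min_lat, max_lon, max_lat) tuple:

def point_in_box(x, y, box):
    # box assumed to be (min_lon, min_lat, max_lon, max_lat)
    return box[0] <= x <= box[2] and box[1] <= y <= box[3]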
Example no. 42
0
    def preprocess(
            self,
            xml_directory='RawXML',
            name_space='http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord',
            process_name=True,
            num_file_lines=10**6,
            show_progress=True):
        """
        Bulk preprocess of the Web of Science raw data.

        Parameters
        ----------
        process_name: bool, default True
            If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
            will be used to split author FullNames.

        xml_directory: str, default 'RawXML'
            The directory within the database path that holds the raw WOS XML files.

        num_file_lines: int, default 10**6
            The processed data will be saved into smaller DataFrames, each with `num_file_lines` rows.

        show_progress: bool, default True
            Show progress with processing of the data.

        """

        pub_column_names = [
            'PublicationId', 'Year', 'JournalId', 'Doi', 'ISSN', 'Title',
            'Date', 'Volume', 'Issue', 'Pages', 'DocType', 'TeamSize'
        ]
        author_column_names = ['AuthorId', 'FullName', 'FirstName', 'LastName']

        if show_progress:
            print("Starting to preprocess the WOS database.")

        for hier_dir_type in [
                'publication', 'author', 'publicationauthoraffiliation',
                'pub2field', 'pub2ref', 'affiliation'
        ]:

            if not os.path.exists(
                    os.path.join(self.path2database, hier_dir_type)):
                os.mkdir(os.path.join(self.path2database, hier_dir_type))

        pub2year = {}
        pub2doctype = {}

        found_aids = set([])

        found_affiliations = {}

        ns = {"ns": name_space}
        xmlfiles = sorted([
            fname for fname in os.listdir(
                os.path.join(self.path2database, xml_directory))
            if '.xml' in fname
        ])

        ifile = 0
        for xml_file_name in tqdm(xmlfiles,
                                  desc='WOS xml files',
                                  leave=True,
                                  disable=not show_progress):

            publication_df = []
            author_df = []
            paa_df = []
            pub2field_df = []
            pub2ref_df = []
            affiliation_df = []
            field_df = []

            name, extension = os.path.splitext(xml_file_name)

            if extension == '.gz':
                with gzip.open(
                        os.path.join(self.path2database, xml_directory,
                                     xml_file_name), 'r') as infile:
                    xml_file = infile.read()
                bytesxml = BytesIO(xml_file)

            elif extension == '.xml':
                with open(
                        os.path.join(self.path2database, xml_directory,
                                     xml_file_name), 'rb') as infile:
                    xml_file = infile.read()
                bytesxml = BytesIO(xml_file)

            # extract the desired fields from the XML tree  #

            xmltree = etree.iterparse(bytesxml,
                                      events=('end', ),
                                      tag="{{{0}}}REC".format(name_space))

            if show_progress:
                print("{} Xml tree parsed, iterating through elements.".format(
                    xml_file_name))

            last_position = 0

            for event, elem in xmltree:

                # scrape the publication information
                PublicationId = load_html_str(
                    elem.xpath('./ns:UID', namespaces=ns)[0].text)

                pub_record = self._blank_wos_publication(PublicationId)

                pub_record['Title'] = load_html_str(
                    load_xml_text(
                        elem.xpath(
                            './ns:static_data/ns:summary/ns:titles/ns:title[@type="item"]',
                            namespaces=ns)))
                pub_record['JournalId'] = load_html_str(
                    load_xml_text(
                        elem.xpath(
                            './ns:static_data/ns:summary/ns:titles/ns:title[@type="source"]',
                            namespaces=ns)))

                pub_info = elem.xpath(
                    './ns:static_data/ns:summary/ns:pub_info',
                    namespaces=ns)[0]
                pub_record['Year'] = load_int(pub_info.get('pubyear', ''))
                pub_record['Date'] = load_html_str(pub_info.get(
                    'sortdate', ''))
                pub_record['Volume'] = load_int(pub_info.get('vol', ''))
                pub_record['Issue'] = load_int(pub_info.get('issue', ''))

                pub2year[PublicationId] = pub_record['Year']

                pub_record['Pages'] = load_html_str(
                    load_xml_text(elem.xpath(
                        './ns:static_data/ns:summary/ns:pub_info/ns:page',
                        namespaces=ns),
                                  default=''))

                for ident in ['ISSN', 'Doi']:
                    identobject = elem.xpath(
                        './ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="{}"]'
                        .format(ident.lower()),
                        namespaces=ns)
                    if len(identobject) > 0:
                        pub_record[ident] = load_html_str(identobject[0].get(
                            'value', ''))

                #load_html_str(load_xml_text(elem.xpath('./ns:dynamic_data/ns:cluster_related/ns:identifiers/ns:identifier[@type="doi"]', namespaces=ns)))

                pub_record['DocType'] = load_html_str(
                    load_xml_text(
                        elem.xpath(
                            './ns:static_data/ns:summary/ns:doctypes/ns:doctype',
                            namespaces=ns)))

                pub2doctype[PublicationId] = pub_record['DocType']

                # now scrape the authors
                pub_authors = {}
                author_objects = elem.xpath(
                    './ns:static_data/ns:summary/ns:names/ns:name[@role="author"]',
                    namespaces=ns)
                pub_record['TeamSize'] = len(author_objects)

                for author_obj in author_objects:
                    author_record = self._blank_wos_author(None)
                    author_record['AuthorId'] = author_obj.get('dais_id', None)

                    author_record['FullName'] = load_html_str(
                        load_xml_text(
                            author_obj.xpath('./ns:full_name', namespaces=ns)))
                    author_record['FirstName'] = load_html_str(
                        load_xml_text(
                            author_obj.xpath('./ns:first_name',
                                             namespaces=ns)))
                    author_record['LastName'] = load_html_str(
                        load_xml_text(
                            author_obj.xpath('./ns:last_name', namespaces=ns)))

                    author_record['Affiliations'] = author_obj.get(
                        'addr_no', '')
                    author_record['Affiliations'] = [
                        int(single_addr_no) for single_addr_no in
                        author_record['Affiliations'].split(' ')
                        if len(single_addr_no) > 0
                    ]

                    author_record['AuthorOrder'] = int(
                        author_obj.get('seq_no', None))

                    pub_authors[author_record['AuthorOrder']] = author_record

                #contributor_objects = elem.xpath('./ns:static_data/ns:contributors/ns:contributor/ns:name[@role="researcher_id"]', namespaces=ns)

                address_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:addresses/ns:address_name/ns:address_spec',
                    namespaces=ns)
                for addr_obj in address_objects:
                    addr_record = self._blank_wos_affiliation()

                    organization_objects = addr_obj.xpath(
                        './ns:organizations/ns:organization[@pref="Y"]',
                        namespaces=ns)
                    if len(organization_objects) == 0:
                        organization_objects = addr_obj.xpath(
                            './ns:organizations/ns:organization',
                            namespaces=ns)

                    if len(organization_objects) == 0:
                        orgtext = ''
                    else:
                        orgtext = organization_objects[0].text

                    address_no = int(addr_obj.get('addr_no'))

                    affiliation_df.append([PublicationId, address_no, orgtext])

                    #if found_affiliations

                    #article['addresses'][address_no] = address_info

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:headings/ns:heading',
                    namespaces=ns)
                field_df.extend([[PublicationId, field_obj.text, 'heading']
                                 for field_obj in field_objects
                                 if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subheadings/ns:subheading',
                    namespaces=ns)
                field_df.extend([[PublicationId, field_obj.text, 'subheading']
                                 for field_obj in field_objects
                                 if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="traditional"]',
                    namespaces=ns)
                field_df.extend([[
                    PublicationId, field_obj.text, 'ASCA traditional subject'
                ] for field_obj in field_objects if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:category_info/ns:subjects/ns:subject[@ascatype="extended"]',
                    namespaces=ns)
                field_df.extend(
                    [[PublicationId, field_obj.text, 'ASCA extended subject']
                     for field_obj in field_objects if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:keywords/ns:keyword',
                    namespaces=ns)
                field_df.extend([[PublicationId, field_obj.text, 'keyword']
                                 for field_obj in field_objects
                                 if field_obj is not None])

                field_objects = elem.xpath(
                    './ns:static_data/ns:item/ns:keywords_plus/ns:keyword',
                    namespaces=ns)
                field_df.extend(
                    [[PublicationId, field_obj.text, 'keyword plus']
                     for field_obj in field_objects if field_obj is not None])

                reference_objects = elem.xpath(
                    './ns:static_data/ns:fullrecord_metadata/ns:references/ns:reference',
                    namespaces=ns)
                for ref_obj in reference_objects:
                    for ref_elem in ref_obj:
                        if ref_elem.tag == "{{{0}}}uid".format(name_space):
                            refid = load_html_str(
                                ref_elem.text.replace('WOS:', ''))
                            pub2ref_df.append([PublicationId, refid])
                        elif ref_elem.tag == "{{{0}}}year".format(name_space):
                            pub2year[refid] = load_int(ref_elem.text)

                publication_df.append(
                    [pub_record[k] for k in pub_column_names])

                for aorder, author_record in pub_authors.items():
                    if not author_record[
                            'AuthorId'] is None and not author_record[
                                'AuthorId'] in found_aids:
                        found_aids.add(author_record['AuthorId'])
                        author_df.append(
                            [author_record[k] for k in author_column_names])

                    paa_df.append([
                        PublicationId, author_record['AuthorId'], aorder,
                        author_record['FullName']
                    ])

            self._save_dataframes(ifile, publication_df, pub_column_names,
                                  author_df, author_column_names, paa_df,
                                  pub2ref_df, affiliation_df, field_df)
            ifile += 1

        with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'),
                       'w') as outfile:
            outfile.write(json.dumps(pub2year).encode('utf8'))

        with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'),
                       'w') as outfile:
            outfile.write(json.dumps(pub2doctype).encode('utf8'))
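The '.gz' branch above decompresses each file fully into memory before parsing. Since iterparse accepts any file-like object, a gzipped dump can also be streamed directly; a minimal sketch (the file name is a placeholder, and '{*}' matches REC elements in any namespace):

import gzip
from lxml import etree

with gzip.open('records.xml.gz', 'rb') as fh:
    for _, rec in etree.iterparse(fh, events=('end', ), tag='{*}REC'):
        # ... process rec ...
        rec.clear()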
Example no. 43
0
def read_marc_file(f):
    for event, elem in etree.iterparse(f, tag=record_tag):
        yield MarcXml(elem)
        elem.clear()
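record_tag and MarcXml come from the surrounding module and are not shown here. A hedged usage sketch, assuming record_tag holds the namespaced MARCXML record tag (both values below are assumptions):

record_tag = '{http://www.loc.gov/MARC21/slim}record'  # assumed value

with open('records.xml', 'rb') as f:  # hypothetical input file
    for marc in read_marc_file(f):
        pass  # each item wraps one <record> element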
Example no. 44
0
    def parse(self, languages, replace=False, bn_to_wn_mapping=None):
        """
        Parses sentences of the given languages.

        In case replace=False: for each sentence, returns its attributes, text, text attributes, annotations, annotations attributes.
        In case replace=True: for each sentence, returns the processed sentence according to the replacement rule.

        :param languages: languages to be considered (in ISO code), as List
        :param replace: True: replaces anchors with lemma_annotation; False: nothing (default)
        :param bn_to_wn_mapping: dictionary mapping BabelNet IDs to WordNet IDs
        :return: appropriate return, in a generator fashion
        """
        assert (replace is False) or (
            replace and bn_to_wn_mapping is not None
        ), "BabelNet to WordNet mapping must be provided if replace=True"

        for event, sentence in etree.iterparse(self.xml, tag="sentence"):
            proc_sentences = []
            sentence_attrs = sentence.attrib
            text_attrs, text, annotations, annotations_attrs = [], [], [], []

            if event == "end":
                for element in sentence:
                    if element.tag == "text" and element.get(
                            "lang") in languages:
                        text_attrs.append(element.attrib)
                        text.append(element.text)

                    if element.tag == "annotations":
                        for annotation in element:
                            if annotation.get("lang") in languages:
                                annotations_attrs.append(annotation.attrib)
                                annotations.append(annotation.text)

                if not replace:
                    yield sentence_attrs, text, text_attrs, annotations, annotations_attrs

                else:
                    # used to keep track of possible replacements in order to select the longest mention
                    Replacement = namedtuple("Replacement",
                                             "anchor synset lemma")

                    # iterate over all texts, one for each language selected
                    for single_text, single_attrs in zip(text, text_attrs):

                        # skip processing of null texts (i.e. sentence id = 2031)
                        if single_text is None:
                            continue

                        proc_sentence = []
                        for word in single_text.split(" "):
                            curr_replacement = Replacement(
                                anchor=[], synset="<NO_SYNSET>", lemma="")

                            for annotation, ann_attrs in zip(
                                    annotations, annotations_attrs):

                                # no need to parse annotations of another language than the text's
                                if ann_attrs["lang"] != single_attrs["lang"]:
                                    continue

                                curr_anchor = ann_attrs["anchor"].split(" ")
                                if word in curr_anchor:
                                    # longest mention is preferred in case of multiple annotations for the same word
                                    if len(curr_anchor) > len(
                                            curr_replacement.anchor):
                                        curr_replacement = Replacement(
                                            anchor=curr_anchor,
                                            synset=annotation,
                                            lemma=ann_attrs["lemma"])

                            # no annotation for this word
                            if curr_replacement.synset == "<NO_SYNSET>":
                                proc_sentence.append(word)

                            # annotation found and word is the last in its mention
                            elif curr_replacement.synset != "<NO_SYNSET>" and \
                                    word == curr_replacement.anchor[-1]:

                                # build the lemma_synset format for the whole mention
                                replacement_word = "%s_%s" % (
                                    curr_replacement.lemma.replace(
                                        " ", "_"), curr_replacement.synset)
                                proc_sentence.append(replacement_word)

                        # form a string concatenated by space
                        proc_sentences.append(" ".join(proc_sentence))

                    yield proc_sentences

                sentence.clear()
Example no. 45
0
 def _iterparse(source):
     for _, node in etree.iterparse(source, tag='document'):
         yield node
         node.clear()
Example no. 46
0
    def _parse_results(self, file_report):

        # Special thanks to:
        # http://codereview.stackexchange.com/questions/2449/parsing-huge-xml-file-with-lxml-etree-iterparse-in-python
        context = etree.iterparse(file_report,
                                  huge_tree=True,
                                  remove_blank_text=True,
                                  dtd_validation=False,
                                  events=("start", "end"))

        for hostCounter, element in enumerate(
                self._extract_host_elements(context)):

            item_info = {
                'scan_start': '',
                'scan_stop': '',
                'os': '',
                'hostname': '',
                'netbios_name': '',
                'mac_address': '',
                'ip': '',
            }

            # For some reason the "Report" element shows up as well;
            # skip it.
            if element.tag == 'Report':
                continue

            # make sure the element is formatted properly.
            ip = element.get('name')
            if ip is None:
                continue

            host_properties = element.find('HostProperties')
            if host_properties is not None:

                self._results[ip] = []
                host_tags = host_properties.findall('tag')
                if host_tags is not None:
                    for host_tag in host_tags:
                        if host_tag.get("name") == 'HOST_START':
                            item_info['scan_start'] = host_tag.text

                        if host_tag.get("name") == 'HOST_END':
                            item_info['scan_stop'] = host_tag.text

                        if host_tag.get("name") == 'operating-system':
                            item_info['os'] = host_tag.text

                        if host_tag.get("name") == 'host-fqdn':
                            item_info['hostname'] = host_tag.text

                        if host_tag.get("name") == 'netbios-name':
                            item_info['netbios_name'] = host_tag.text

                        if host_tag.get("name") == 'mac-address':
                            item_info['mac_address'] = host_tag.text

                        if host_tag.get("name") == 'host-ip':
                            item_info['ip'] = host_tag.text

                self._results[ip].append(item_info)
            else:
                if ip is not None:
                    # this means that, for some reason, etree was not able to parse the element.
                    print "I found IP:", ip, "but there was an empty element."

            report_items = element.findall('ReportItem')
            data_items = [
                'description', 'solution', 'plugin_type', 'cvss_base_score',
                'cvss_vector', 'exploit_available', 'exploitability_ease',
                'exploit_framework_metasploit', 'cve'
            ]

            if report_items is not None:
                for report_item in report_items:

                    vuln = {
                        'plugin_name': '',
                        'plugin_id': '',
                        'plugin_type': '',
                        'port': '',
                        'protocol': '',
                        'description': '',
                        'solution': '',
                        'service_name': '',
                        'cvss_base_score': '0.0',
                        'cvss_vector': '',
                        'exploit_available': '',
                        'metasploit': '',
                        'cve': '',
                    }
                    # Skip specific vulnerability if it is into a blacklist
                    if report_item.get('pluginID') in self._blacklist:
                        self._blacklist_hit += 1
                        continue

                    vuln['plugin_name'] = report_item.get('pluginName')
                    vuln['plugin_id'] = report_item.get('pluginID')
                    vuln['port'] = report_item.get('port')
                    vuln['protocol'] = report_item.get('protocol')
                    vuln['description'] = report_item.get('description')
                    vuln['service_name'] = report_item.get('svc_name')

                    for data_item in data_items:
                        data = report_item.find(data_item)
                        if data is not None:
                            # set the following to false initially.
                            vuln['exploit_framework_metasploit'] = 'false'
                            vuln['exploit_available'] = 'false'
                            vuln['patch_avail'] = 'false'

                            if data.tag == 'description':
                                vuln['description'] = data.text

                            if data.tag == 'solution':
                                vuln['solution'] = data.text

                            if data.tag == 'plugin_type':
                                vuln['plugin_type'] = data.text

                            if data.tag == 'cvss_base_score':
                                if data.text is not None:
                                    vuln['cvss_base_score'] = data.text

                            if data.tag == 'cvss_vector':
                                vuln['cvss_vector'] = data.text

                            if data.tag == 'exploit_available':
                                vuln['exploit_available'] = 'true'

                            if data.tag == 'exploitability_ease':
                                vuln['exploit_available'] = 'true'

                            if data.tag == 'exploit_framework_metasploit':
                                vuln['metasploit'] = 'true'

                            if data.tag == 'cve':
                                vuln['cve'] = data.text

                    self._results[ip].append(vuln)
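_extract_host_elements is referenced above but not shown. A minimal sketch of what such a helper might look like for a .nessus (v2) report, where each host is a ReportHost element (this is an assumption about the original method):

    def _extract_host_elements(self, context):
        # Yield each fully parsed ReportHost element, then release it (sketch).
        for event, element in context:
            if event == 'end' and element.tag == 'ReportHost':
                yield element
                element.clear()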
Example no. 47
0
def parse_unzipped(fp1):
    context = etree.iterparse(
        fp1,
        tag='{http://www.mediawiki.org/xml/export-0.10/}page',
        encoding='utf-8')
    fast_iter(context)
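fast_iter is not defined in this snippet. A hedged sketch of the usual helper from the codereview answer cited in example no. 46 (the project's real fast_iter may differ, e.g. by requiring a callback):

def fast_iter(context, func=None, *args, **kwargs):
    # Iterate, optionally apply func to each element, and free memory as we go.
    for event, elem in context:
        if func is not None:
            func(elem, *args, **kwargs)
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context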
Example no. 48
0
    def parse(self):
        results_base = os.path.join(self.data_path_base, 'results')
        if not os.path.isdir(results_base):
            os.makedirs(results_base)

        page_info_results_file = os.path.join(results_base, 'page_info.csv')
        revision_info_results_file = os.path.join(results_base,
                                                  'revisions.csv')
        no_text_error_results_file = os.path.join(results_base,
                                                  'no_text_error.csv')
        author_info_results_file = os.path.join(results_base,
                                                'author_info.csv')

        results_path = os.path.join(results_base,
                                    os.path.splitext(self.file_name)[0])
        if not os.path.isdir(results_path):
            os.makedirs(results_path)
        cat_results_file = os.path.join(results_path, 'cats.csv')
        link_results_file = os.path.join(results_path, 'links.csv')

        for file in glob.glob(self.data_path + '/*'):
            size = os.path.getsize(file)
            if size < 10485760000:
                for event, elem in etree.iterparse(
                        file,
                        tag='{http://www.mediawiki.org/xml/export-0.10/}page',
                        huge_tree=True):
                    for data in elem.iterchildren(
                            reversed=False,
                            tag='{http://www.mediawiki.org/xml/export-0.10/}ns'
                    ):
                        ns = data.text
                    if ns == '0' or ns == '14':
                        page_info, revision_info, no_text_error, author_info = self.get_data(
                            elem, cat_results_file, link_results_file)
                        page_info.to_csv(page_info_results_file,
                                         sep='\t',
                                         mode='a',
                                         header=False,
                                         index=False)
                        revision_info.to_csv(revision_info_results_file,
                                             sep='\t',
                                             mode='a',
                                             header=False,
                                             index=False)
                        no_text_error.to_csv(no_text_error_results_file,
                                             sep='\t',
                                             mode='a',
                                             header=False,
                                             index=False)
                        author_info.to_csv(author_info_results_file,
                                           sep='\t',
                                           mode='a',
                                           header=False,
                                           index=False)
                    else:
                        pass
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
                os.remove(file)
            else:
                too_large = os.path.join(self.data_path_base,
                                         'too_large_to_parse')
                if not os.path.isdir(too_large):
                    os.makedirs(too_large)
                try:
                    subprocess.call([
                        '7z', 'a',
                        os.path.join(os.getcwd(), file + '.7z'),
                        os.path.join(os.getcwd(), file)
                    ])
                    shutil.copy2(file + '.7z', too_large)
                    os.remove(file)
                    os.remove(file + '.7z')
                except:
                    pass
        return True
Example no. 49
0
def parseDBLP(facultydict):
    authlogs = {}
    interestingauthors = {}
    authorscores = {}
    authorscoresAdjusted = {}

    with gzip.open("dblp.xml.gz") as f:

        oldnode = None

        for (event, node) in ElementTree.iterparse(f, events=["start", "end"]):
            if oldnode is not None:
                oldnode.clear()
            oldnode = node

            foundArticle = True  # include all venues
            # foundArticle = False
            inRange = False
            authorsOnPaper = 0
            authorName = ""
            confname = ""
            year = -1
            pageCount = -1
            foundOneInDict = False
            volume = 0

            if node.tag == "inproceedings" or node.tag == "article":

                # First, check if this is one of the conferences we are looking for.

                for child in node:
                    if child.tag == "booktitle" or child.tag == "journal":
                        confname = child.text
                        if True:  # INCLUDE ALL VENUES
                            # was: if (confname in confdict):
                            foundArticle = True
                    if child.tag == "volume":
                        volume = child.text
                    if child.tag == "year":
                        if child.text is not None:
                            year = int(child.text)
                    if child.tag == "pages":
                        pageCount = csrankings.pagecount(child.text)
                    if child.tag == "author":
                        authorName = child.text
                        if authorName is not None:
                            authorName = authorName.strip()
                            authorsOnPaper += 1
                            if authorName in facultydict:
                                foundOneInDict = True

                if not foundArticle:
                    # Not one of our conferences.
                    continue

                if confname is None:
                    continue

                if confname not in csrankings.confdict:
                    areaname = "na"
                else:
                    areaname = csrankings.confdict[confname]

                # Check that dates are in the specified range.
                if (year >= startyear) and (year <= endyear):
                    inRange = True

                if year == -1:
                    # No year.
                    continue

                tooFewPages = False
                if (pageCount != -1) and (
                    pageCount < csrankings.pageCountThreshold
                ):
                    tooFewPages = True
                    exceptionConference = confname == "SC"
                    exceptionConference |= (
                        confname == "SIGSOFT FSE" and year == 2012
                    )
                    exceptionConference |= (
                        confname == "ACM Trans. Graph."
                        and int(volume) >= 26
                        and int(volume) <= 36
                    )
                    if exceptionConference:
                        tooFewPages = False

                if (not inRange) or (not foundOneInDict) or tooFewPages:
                    continue

                # If we got here, we have a winner.

                for child in node:
                    if child.tag == "author":
                        authorName = child.text
                        authorName = authorName.strip()
                        if authorName in facultydict:
                            print(
                                "here we go"
                                + authorName
                                + " "
                                + confname
                                + " "
                                + str(authorsOnPaper)
                                + " "
                                + str(year)
                            )
                            logstring = authorName.encode("utf-8")
                            logstring += " ; ".encode("utf-8")
                            logstring += confname.encode("utf-8")
                            logstring += " ".encode("utf-8")
                            logstring += str(year).encode("utf-8")
                            tmplist = authlogs.get(authorName, [])
                            tmplist.append(logstring)
                            authlogs[authorName] = tmplist
                            interestingauthors[authorName] = (
                                interestingauthors.get(authorName, 0) + 1
                            )
                            authorscores[(authorName, areaname, year)] = (
                                authorscores.get(
                                    (authorName, areaname, year), 0
                                )
                                + 1.0
                            )
                            authorscoresAdjusted[
                                (authorName, areaname, year)
                            ] = (
                                authorscoresAdjusted.get(
                                    (authorName, areaname, year), 0
                                )
                                + 1.0 / authorsOnPaper
                            )

    return (interestingauthors, authorscores, authorscoresAdjusted, authlogs)
Example no. 50
0
    fd.close()
del zfile

source = None
for f in os.listdir("."):
    if f.startswith("f_amp2_"):
        source = f

gc.collect()

now = datetime.datetime.now().isoformat()
print "Loading from ", source

items = []
data = {"_when": now, "_source": source}
for event, elem in etree.iterparse(source):
    if elem.tag == "AMP":
        if 'APID' in data:
            items.append(data.copy())
            data = {"_when": now, "_source": source}        
            if len(items) == 100:
                scraperwiki.sqlite.save(['APID'], items)
                items = []
        else:
            print 'No APID in %s and event was %s' %(data,event)
    if elem.text:
        data[elem.tag] = elem.text
    if elem.tag == "DESC":
        m = supplier_re.match(elem.text)
        if m:
            data['SUPPLIER'] = m.groups()[0]
Example no. 51
0
def createFullCSV(marcxml_file, output):
    writer = csv.writer(open(output, 'w'))
    header = getHeader(marcxml_file)
    writer.writerow(header)
    for event, record in etree.iterparse(open(marcxml_file, 'rb')):
        if record.tag == marcxmlNS + "record":
            nextrow = []
            for field in header:
                if field == 'leader':
                    field_value = record.xpath('marcxml:leader', namespaces=ns)
                    if len(field_value) == 1:
                        value = field_value[0].text.encode('utf8')
                    else:
                        value = None
                    nextrow.append(value)
                elif len(field) == 3:
                    field_value = record.xpath('marcxml:controlfield[@tag=' +
                                               field + ']', namespaces=ns)
                    if len(field_value) > 1:
                        value = ''
                        for n in range(len(field_value)):
                            value += field_value[n].text.encode('utf8') + ";"
                        value = value.strip(';')
                    elif len(field_value) == 1:
                        value = field_value[0].text.encode('utf8')
                    else:
                        value = None
                    nextrow.append(value)
                # elif field.strip('_') in fieldsHeadings:
                #     field_value = record.xpath('marcxml:datafield[@tag=' +
                #                                field.strip('_') + ']',
                #                                namespaces=ns)
                #     if len(field_value) > 1:
                #         value = ''
                #         for resp in field_value:
                #             resp_value = ''
                #             for subfield in resp.getchildren():
                #                 if subfield.get('code') not in headingsSkip:
                #                     resp_value += subfield.text.encode('utf8') + ' '
                #                 if middleInits_re.search(resp_value.strip()) is None:
                #                     resp_value = resp_value.strip().strip('.')
                #             value += resp_value + ';'
                #         value = value.strip(';')
                #         print(value)
                #     elif len(field_value) == 1:
                #         value = ''
                #         for resp in field_value:
                #             for subfield in resp.getchildren():
                #                 value += subfield.text.encode('utf8') + ' '
                #             value = value.strip()
                #         if middleInits_re.search(value) is None:
                #             value = value.strip('.')
                #     else:
                #         value = None
                #     nextrow.append(value)
                #     print(value)
                else:
                    tag = field[:3]
                    inds_subf = field.split('_', 1)[1]
                    ind1 = inds_subf.split("$", 1)[0][:1].replace('#', ' ')
                    ind2 = inds_subf.split("$", 1)[0][-1:].replace('#', ' ')
                    subfield = inds_subf.split("$", 1)[1]
                    if tag in fieldsNonFile1:
                        xpath = ('marcxml:datafield[@tag="' + tag +
                                 '"][@ind1="' + ind1 + '"]/' +
                                 'marcxml:subfield[@code="' + subfield + '"]')
                    elif tag in fieldsNonFile2:
                        xpath = ('marcxml:datafield[@tag="' + tag +
                                 '"][@ind2="' + ind2 + '"]/' +
                                 'marcxml:subfield[@code="' + subfield + '"]')
                    else:
                        xpath = ('marcxml:datafield[@tag="' + tag +
                                 '"][@ind1="' + ind1 + '"][@ind2="' + ind2 +
                                 '"]/' + 'marcxml:subfield[@code="' + subfield
                                 + '"]')
                    field_value = record.xpath(xpath, namespaces=ns)
                    if len(field_value) > 1:
                        value = ''
                        for n in range(len(field_value)):
                            value += field_value[n].text.encode('utf8') + ';'
                        print(value)
                        value = value.strip(';')
                    elif len(field_value) == 1:
                        value = field_value[0].text.encode('utf8')
                    else:
                        value = None
                    nextrow.append(value)
            writer.writerow(nextrow)
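Note that createFullCSV never releases the parsed records, so memory grows with the size of the MARCXML file. A common addition right after writer.writerow(nextrow), shown here only as a sketch rather than as part of the original function:

            # sketch: free the processed record so iterparse keeps memory flat
            record.clear()
            while record.getprevious() is not None:
                del record.getparent()[0]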
Ejemplo n.º 52
    def _set_up_context(self, xmlfile):
        return etree.iterparse(xmlfile, tag='token', events=('end',))
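A minimal sketch of how the returned context is typically consumed; the method name, the yielded values, and the clearing step are assumptions, not part of the snippet:

    def _iter_tokens(self, xmlfile):
        # walk the 'token' end events and release each element once handled
        for _, token in self._set_up_context(xmlfile):
            yield dict(token.attrib), token.text
            token.clear()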
Ejemplo n.º 53
def main(filein, hadvcode, fileout):
    """ Use lxml """
    recs = []
    #tree = etree.parse(filein)
    # better to use iterparse
    # elements can be of two kinds, 'f' or 'pv'.
    # We are interested only in 'f' elements
    # An '<f>' element is an 'inflected form entry'.
    # iterate through all the <f> elements.
    stemdict = {}
    stemlist = []
    n = 0
    ncaus = 0
    ndes = 0
    for _, element in etree.iterparse(filein, tag='f'):
        n = n + 1
        form = element.get("form")
        # "form" is the sole required attribute of an <f> element.  Its value is the inflected form.
        # most of the time, the element has exactly 2 children. However, sometimes there
        # are multiple children. The first instance in file SL_roots.xml is
        # <f form="akaRqayata">
        #  <v><cj><prim/></cj><sys><prs gn="10"><md><im/></md><atma/></prs></sys><np><sg/><trd/></np></v>
        #  <v><cj><prim/></cj><sys><prs gn="10"><md><im/></md><para/></prs></sys><np><pl/><snd/></np></v>
        # <s stem="kaRq"/></f>

        # The rest of the <f> element (its xml 'children') is used to describe the form.
        children = list(element)
        nchildren = len(children)
        # The xml '<s>' element is one of the children. "lexicon stem or root generating the form"
        # We assume '<s>' is the 'last' child of '<f>'
        s = children[-1]
        # The actual stem is the value of the "stem" attribute of <s>.
        # This stem may have a 'homonym' number, represented as '#1','#2' suffixing the stem value.
        stem = s.get("stem")
        assert form is not None and stem is not None
        # Remove homonym marker from stem, if present
        stem = re.sub(r'#.*$', '', stem)
        """
  if stem not in stemdict:
   stemdict[stem]=Huet_verb_prim_prs(stem)
   stemlist.append(stem)
  rec = stemdict[stem]
  """
        # the rest of the children describe details regarding how the inflected form arises from the stem.
        inflected_forms = children[0:-1]
        #tags = [inflected_form.tag for inflected_form in inflected_forms]
        for inflected_form in inflected_forms:
            tag = inflected_form.tag
            if tag != 'vu':  # indeclineable verbal form
                continue
            [cjelt, ivelt] = list(inflected_form)  # nominal, kridanta
            assert (cjelt.tag == 'cj'
                    ), "Unexpected cj tag %s for stem=%s" % (cjelt.tag, stem)
            [primelt] = list(cjelt)
            # can be primary or causal conjugation time
            assert (primelt.tag in [
                'prim', 'ca', 'des'
            ]), "Unexpected prim tag %s for stem=%s" % (primelt.tag, stem)
            if primelt.tag == 'ca':
                ncaus = ncaus + 1
                print "%s:%s:%s of %s" % (stem, form, hadvcode, primelt.tag)
                continue  # don't handle these for now
            elif primelt.tag == 'des':
                ndes = ndes + 1
                print "%s:%s:%s of %s" % (stem, form, hadvcode, primelt.tag)
                continue
            [elt] = list(ivelt)
            if elt.tag != hadvcode:
                continue
            if stem not in stemdict:  # stem is a root
                stemdict[stem] = Huet_adverb(stem)
                stemlist.append(stem)
            rec = stemdict[stem]
            rec.update_form(form)
        element.clear()  # for efficiency, free memory of this element
    print len(stemlist), "stems found in", filein
    fout = codecs.open(fileout, "w", "utf-8")  # utf-8 not required
    # sort stemlist in Sanskrit alphabetical order
    stemlist.sort(cmp=slp_cmp)
    for stem in stemlist:
        rec = stemdict[stem]
        forms = rec.forms
        forms.sort(cmp=slp_cmp)
        formstr = ','.join(forms)
        out = "%s:%s" % (stem, formstr)
        fout.write(out + "\n")
    fout.close()
    print ncaus, "causal forms skipped"
    print ndes, "desiderative forms skipped"
Ejemplo n.º 54
def readcity(file):
    context = etree.iterparse(file, events=('end', ))
    fast_iter(context)
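fast_iter is not defined in this snippet; the name usually refers to the standard lxml pattern of handing each element to a callback and then freeing it, along with siblings that have already been processed. A minimal sketch under that assumption:

# hypothetical fast_iter: callback per element, then release memory as parsing advances
def fast_iter(context, func=lambda elem: None):
    for event, elem in context:
        func(elem)
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
    del context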
Ejemplo n.º 55
    def on_epoch_end(self):
        self.parser_gen = iter(etree.iterparse(self.my_file_path, tag="s"))
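A sketch of how the regenerated parser might be drawn from elsewhere in the same class; the method name and the restart-on-exhaustion behaviour are assumptions:

    def _next_sentence(self):
        # take the next <s> element, restarting the parse once the file is exhausted
        try:
            _, elem = next(self.parser_gen)
        except StopIteration:
            self.on_epoch_end()
            _, elem = next(self.parser_gen)
        text = elem.text
        elem.clear()
        return text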
Ejemplo n.º 56
#!/usr/bin/env python

import urllib, sys, re
from lxml import etree


word = ' '.join(sys.argv[5:]).lower()

f = urllib.urlopen('http://kill-or-cure.heroku.com/a-z/%s' % word[0])

foundterm = False
msg = causetitle = preventtitle = None

ns = ''
for ev, el in etree.iterparse(f, events=['start-ns', 'end']):
    if ev == 'start-ns':
        prefix, ns = el
        continue

    if el.tag == '{%s}h2' % ns:
        if foundterm:
            break

        classes = el.get('class').split(' ')
        if 'termHeading' in classes:
            term = el.findtext('{%s}em' % ns)
            if term.lower() == word:
                foundterm = True
                result, = set(['both', 'cause', 'prevent']) & set(classes)
                msg = etree.tostring(el, method='text')
                msg = re.sub(r'\s+', ' ', msg).strip('# ')
Ejemplo n.º 57
    def _poll_collection(self, poll_service, begin, end):
        req = Taxii11.poll_request(collection_name=self.collection,
                                   exclusive_begin_timestamp=begin,
                                   inclusive_end_timestamp=end)
        reqhdrs = Taxii11.headers(protocol=poll_service.split(':', 1)[0])
        result = self._send_request(url=poll_service,
                                    headers=reqhdrs,
                                    data=req,
                                    stream=True)
        result.raw.decode_content = True

        while True:
            result_part_number = None
            result_id = None
            more = None
            tag_stack = collections.deque()  # type: ignore

            try:
                for action, element in etree.iterparse(result.raw,
                                                       events=('start', 'end'),
                                                       recover=True):
                    if action == 'start':
                        tag_stack.append(element.tag)

                    else:
                        last_tag = tag_stack.pop()
                        if last_tag != element.tag:
                            raise RuntimeError(
                                '{} - error parsing poll response, mismatched tags'
                                .format(INTEGRATION_NAME))

                    if action == 'end' and element.tag.endswith(
                            'Status_Message') and len(tag_stack) == 0:
                        self._raise_for_taxii_error(
                            BeautifulSoup(
                                etree.tostring(element, encoding='unicode'),
                                'xml'))
                        return

                    elif action == 'end' and element.tag.endswith(
                            'Poll_Response') and len(tag_stack) == 0:
                        result_id = element.get('result_id', None)
                        more = element.get('more', None)
                        result_part_number = element.get(
                            'result_part_number', None)
                        if result_part_number is not None:
                            result_part_number = int(result_part_number)

                    elif action == 'end' and element.tag.endswith(
                            'Content_Block') and len(tag_stack) == 1:
                        for c in element:
                            if c.tag.endswith('Content'):
                                if len(c) == 0:
                                    continue

                                content = etree.tostring(c[0],
                                                         encoding='unicode')
                                timestamp, indicators = StixDecode.decode(
                                    content)

                                for indicator in indicators:
                                    yield indicator
                                if timestamp:
                                    if self.last_stix_package_ts is None or timestamp > self.last_stix_package_ts:
                                        self.last_stix_package_ts = timestamp

                            elif c.tag.endswith('Timestamp_Label'):
                                timestamp = Taxii11.parse_timestamp_label(
                                    c.text)

                                if timestamp:
                                    if self.last_taxii_content_ts is None or timestamp > self.last_taxii_content_ts:
                                        self.last_taxii_content_ts = timestamp

                        element.clear()

            finally:
                result.close()

            if not more or more == '0' or more.lower() == 'false':
                break

            if result_id is None or result_part_number is None:
                break

            req = Taxii11.poll_fulfillment_request(
                collection_name=self.collection,
                result_id=result_id,
                result_part_number=result_part_number + 1)
            result = self._send_request(url=poll_service,
                                        headers=reqhdrs,
                                        data=req,
                                        stream=True)
Ejemplo n.º 58
    print(len(community))


def show_big_communities(communites, authors, authorship):
    ten_biggest_communities = get_biggest_communities(communites, 200)

    for i, community in enumerate(ten_biggest_communities):
        print('DETAILED INFO - COMMUNITY ', i)
        show_community_size(community)
        show_the_most_popular_journals(community, authors, authorship, 5)


authors = []
i = 0
max_authors = 0
for event, elem in etree.iterparse(source=xml, dtd_validation=False,
                                   load_dtd=True):  # ET.iterparse(xml, events=('start', 'end', 'start-ns', 'end-ns')):
    if i % 100000 == 0:
        print(i / 1000000, len(graph_edges), max_authors)
        max_authors = 0

    if event == 'end':
        i = i + 1

        if elem.tag == 'title':
            title = elem.text

        if elem.tag in ('article', 'inproceedings', 'incollection', 'proceedings',
                        'www', 'phdthesis', 'mastersthesis', 'book'):
            if len(authors) > 1:
                if elem.tag in ('article', 'inproceedings', 'incollection', 'proceedings'):
                    if len(authors) > max_authors:
                        max_authors = len(authors)
Ejemplo n.º 59
import csv
import sys

from lxml.etree import iterparse

writer = csv.writer(sys.stdout, quoting=csv.QUOTE_NONNUMERIC)

group_name = ''

parsing = iterparse('podcasts.opml', events=['start'])

for (event, node) in parsing:
    if node.tag != 'outline':
        continue
    if not node.attrib.get('xmlUrl'):
        group_name = node.attrib['text']
    else:
        writer.writerow(
            (group_name, node.attrib['text'], node.attrib['xmlUrl'],
             node.attrib.get('htmlUrl', '')))
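The loop above treats an outline element that carries only a text attribute as a group heading, and one that also carries xmlUrl as a feed entry. A made-up sample of that structure, written out from Python so the script can be tried end to end (the group, feed name, and URLs are placeholders):

# hypothetical sample input matching the structure the loop expects
SAMPLE_OPML = """<?xml version="1.0" encoding="utf-8"?>
<opml version="1.0">
  <body>
    <outline text="Science">
      <outline text="Example Cast" xmlUrl="http://example.com/feed.xml"
               htmlUrl="http://example.com/"/>
    </outline>
  </body>
</opml>
"""

with open('podcasts.opml', 'w') as f:
    f.write(SAMPLE_OPML)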
Ejemplo n.º 60

def make_tmpfile(pagenum, dir='tempdata'):
    '''returns a file object for a small chunk file; must close it yourself'''
    print("creates new file %d" % pagenum)
    import os
    if not os.path.exists(dir):
        os.mkdir(dir)

    fp = os.path.join(dir, 'chunk_%d.xml' % pagenum)
    return open(fp, mode='w')


# USAGE
context1 = etree.iterparse(
    "./file1",
    tag='{http://www.mediawiki.org/xml/export-0.10/}page',
    encoding='utf-8')
context2 = etree.iterparse(
    "file2",
    tag='{http://www.mediawiki.org/xml/export-0.10/}page',
    encoding='utf-8')

fast_iter(context1)
fast_iter(context2)

et = etree.parse('tempdata/chunk_20.xml')
root = et.getroot()
nsmap = {'ns': 'http://www.mediawiki.org/xml/export-0.10/'}

root.findall('ns:page', nsmap)  # find all pages
root.xpath('*/*/*/ns:username', namespaces=nsmap)  # extract all username tags
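The fast_iter called in the usage block is not shown here; given make_tmpfile and the later etree.parse('tempdata/chunk_20.xml'), it presumably writes batches of <page> elements into numbered, well-formed chunk files. A sketch of one way to wire that up (the chunk size, the wrapper element, and the serialization details are assumptions):

# hypothetical fast_iter that splits pages into the chunk files read back above
def fast_iter(context, pages_per_chunk=1000):
    pagenum, written = 0, 0
    out = make_tmpfile(pagenum)
    out.write('<pages>\n')  # wrapper root so each chunk is a valid XML document
    for event, page in context:
        out.write(etree.tostring(page, encoding='unicode'))
        written += 1
        if written % pages_per_chunk == 0:
            out.write('</pages>\n')
            out.close()
            pagenum += 1
            out = make_tmpfile(pagenum)
            out.write('<pages>\n')
        page.clear()
        parent = page.getparent()
        if parent is not None:
            while page.getprevious() is not None:
                del parent[0]
    out.write('</pages>\n')
    out.close()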