Example no. 1
def convert_values():
    # Load the table list produced by convert_tables().
    filename = 'refined/tables.csv'
    csvr = csv.DictReader(open(filename, 'r'),
                          delimiter='\t',
                          quoting=csv.QUOTE_NONE)
    # Load the region list produced by convert_regions().
    filename = 'refined/regions.csv'
    regs = csv.DictReader(open(filename, 'r'),
                          delimiter='\t',
                          quoting=csv.QUOTE_NONE)
    tables = []
    for o in csvr:
        tables.append(o)
    # Load the indicator list produced by convert_inds().
    inds = []
    filename = 'refined/indicators.csv'
    for o in csv.DictReader(open(filename, 'r'),
                            delimiter='\t',
                            quoting=csv.QUOTE_NONE):
        inds.append(o)

#    f = open('refined/values.csv', 'w')
#    f.write('\t'.join(['reg_date', 'reg_id', 'ind_id', 'val']) + '\n')
    for r in regs:
        rkey = r['reg_id']
        # One output directory per region.
        if not os.path.exists('refined/values/%s/' % rkey):
            os.makedirs('refined/values/%s' % rkey)
        for t in tables:
            tkey = t['ind_table']
            # Ids of all indicators that belong to this table.
            ires = []
            for i in inds:
                if i['ind_table'] == tkey:
                    ires.append(i['ind_id'])
            # Raw XML dump for this region/table combination.
            filepath = 'raw/values/%s/%s__%s.xml' % (rkey, rkey,
                                                     '_'.join(ires))
            if not os.path.exists(filepath): continue
            f1 = open(filepath, 'r')
            data = f1.read()
            f1.close()
            # Target TSV; skip it if it was already generated.
            filepath = 'refined/values/%s/%s__%s.csv' % (rkey, rkey,
                                                         '_'.join(ires))
            if os.path.exists(filepath):
                print 'Skipped', filepath
                continue
            f = open(filepath, 'w')
            f.write('\t'.join(['reg_date', 'reg_id', 'ind_id', 'val']) + '\n')
            # Extract every <reg_val> element and write one TSV row per value.
            soup = BeautifulStoneSoup(data)
            tp = Path('//reg_val')
            objs = tp.apply(soup)
            for o in objs:
                keys = ['regdate', 'reg_id', 'ind_id', 'val']
                vals = []
                for k in keys:
                    vals.append(str(o.find(k).string))
                f.write('\t'.join(vals) + '\n')
            print tkey, filepath
            f.close()
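
Note: the snippets on this page omit their module-level imports. Under Python 2 they need roughly the header sketched below; the module that provides Path (the XPath-style helper applied to the BeautifulSoup tree) is not shown anywhere in these examples, so that import is only a placeholder.

import csv
import os

from BeautifulSoup import BeautifulStoneSoup  # BeautifulSoup 3.x

# 'Path' is used as Path('//tag').apply(soup); its defining module does not
# appear in these snippets, so this import is a guess and must be adapted.
# from your_xpath_module import Path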
Example no. 2
def convert_tables():
    # Read the raw XML list of tables.
    f = open('raw/tables.xml', 'r')
    data = f.read()
    f.close()
    soup = BeautifulStoneSoup(data)
    # Extract every <regtables> element.
    tp = Path('//regtables')
    objs = tp.apply(soup)
    # Write one TSV row per table.
    f = open('refined/tables.csv', 'w')
    f.write('\t'.join(['ind_table', 'tbl_name', 'cont']) + '\n')
    for o in objs:
        rname = unicode(o.find('tbl_name').string).encode('utf8', 'ignore')
        f.write('\t'.join([str(o.find('ind_table').string), rname,
                           str(o.find('cont').string)]) + '\n')
    f.close()
Example no. 3
def convert_regions():
    # Read the raw XML list of regions.
    f = open('raw/regions.xml', 'r')
    data = f.read()
    f.close()
    soup = BeautifulStoneSoup(data)
    # Extract every <regions> element.
    tp = Path('//regions')
    objs = tp.apply(soup)
    # Write one TSV row per region: id and name.
    f = open('refined/regions.csv', 'w')
    f.write('\t'.join(['reg_id', 'name']) + '\n')
    for o in objs:
        rname = unicode(o.find('regname').string).encode('utf8', 'ignore')
        f.write('\t'.join([str(o.find('reg_id').string), rname]) + '\n')
    f.close()
Example no. 4
def convert_values():
    # Same conversion as in Example no. 1, but the raw and refined value
    # files use '$' instead of '__' in their names.
    filename = 'refined/tables.csv'
    csvr = csv.DictReader(open(filename, 'r'), delimiter='\t', quoting=csv.QUOTE_NONE)
    filename = 'refined/regions.csv'
    regs = csv.DictReader(open(filename, 'r'), delimiter='\t', quoting=csv.QUOTE_NONE)
    tables = []
    for o in csvr:
        tables.append(o)
    inds = []
    filename = 'refined/indicators.csv'
    for o in csv.DictReader(open(filename, 'r'), delimiter='\t', quoting=csv.QUOTE_NONE):
        inds.append(o)
#    f = open('refined/values.csv', 'w')
#    f.write('\t'.join(['reg_date', 'reg_id', 'ind_id', 'val']) + '\n')
    for r in regs:
        rkey = r['reg_id']
        if not os.path.exists('refined/values/%s/' % rkey):
            os.makedirs('refined/values/%s' % rkey)
        for t in tables:
            tkey = t['ind_table']
            # Ids of all indicators that belong to this table.
            ires = []
            for i in inds:
                if i['ind_table'] == tkey:
                    ires.append(i['ind_id'])
            filepath = 'raw/values/%s/%s$%s.xml' % (rkey, rkey, '_'.join(ires))
            if not os.path.exists(filepath): continue
            f1 = open(filepath, 'r')
            data = f1.read()
            f1.close()
            filepath = 'refined/values/%s/%s$%s.csv' % (rkey, rkey, '_'.join(ires))
            if os.path.exists(filepath):
                print 'Skipped', filepath
                continue
            f = open(filepath, 'w')
            f.write('\t'.join(['reg_date', 'reg_id', 'ind_id', 'val']) + '\n')
            soup = BeautifulStoneSoup(data)
            tp = Path('//reg_val')
            objs = tp.apply(soup)
            for o in objs:
                keys = ['regdate', 'reg_id', 'ind_id', 'val']
                vals = []
                for k in keys:
                    vals.append(str(o.find(k).string))
                f.write('\t'.join(vals) + '\n')
            print tkey, filepath
            f.close()
Example no. 5
def convert_inds():
    # For every table listed in refined/tables.csv, read its raw indicator
    # XML and collect all indicators into a single TSV.
    filename = 'refined/tables.csv'
    csvr = csv.DictReader(open(filename, 'r'), delimiter='\t', quoting=csv.QUOTE_NONE)
    f = open('refined/indicators.csv', 'w')
    f.write('\t'.join(['ind_id', 'ind_name', 'ind_izm', 'ind_table']) + '\n')
    for r in csvr:
        tkey = r['ind_table']
        filename = tkey + '.xml'
        f1 = open('raw/indicators/%s' % filename, 'r')
        data = f1.read()
        f1.close()
        soup = BeautifulStoneSoup(data)
        # Extract every <indlist> element.
        tp = Path('//indlist')
        objs = tp.apply(soup)
        print tkey
        for o in objs:
            rname = unicode(o.find('ind_name').string).encode('utf8', 'ignore')
            izm = unicode(o.find('ind_izm').string).encode('utf8', 'ignore')
            f.write('\t'.join([str(o.find('ind_id').string), rname, izm, tkey]) + '\n')
    f.close()
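
The four convert_* functions feed one another through the refined/ TSV files (convert_inds reads refined/tables.csv, and convert_values reads the tables, regions and indicators TSVs), so a plausible driver, not present in the original snippets, would run them in this order:

if __name__ == '__main__':
    # Presumed execution order, inferred from the file dependencies above.
    convert_tables()     # raw/tables.xml        -> refined/tables.csv
    convert_regions()    # raw/regions.xml       -> refined/regions.csv
    convert_inds()       # raw/indicators/*.xml  -> refined/indicators.csv
    convert_values()     # raw/values/...        -> refined/values/<reg_id>/*.csv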
Example no. 6
        configitem.put()
        return delta

    def parsebbsbyXpath(self, config, htmlstring):

        try:
            dom = BeautifulSoup(htmlstring)
        except Exception, e:
            logging.error(
                "failed to parse bbs by Xpath parser; schoolname= %s",
                config['bbsname'])
            raise
            #return;

        contentpath = Path(config['xpath'])
        domblock = contentpath.apply(dom)
        blockstring = self.convertdom2string(domblock)

        if blockstring is None:
            logging.error(
                "failed to parse bbs by xpath parser; schoolname= %s",
                config['bbsname'])
            return
        return self.parsebbsDomDetail(blockstring, config)

    def parsebbsbyRegularExpression(self, config, htmlstring):

        try:
            re_block = config['re_block']
            blockstring = re_block.search(htmlstring).group()