def dataframe_preprocess(file_path, sheetname=0):
    """
    Read the source file and reduce it to the relevant subset.
    :param file_path:
    :param sheetname:
    :return:
    """
    df = read_source(file_path, sheetname=sheetname)

    # First-pass filter: per the 2019-09-18 discussion with the grid center's
    # evaluation office, voided cases ('当前阶段' == '[作废]') are excluded
    df = df[df['当前阶段'] != '[作废]']

    # Keep only useful columns, to reduce too many dimensions
    df = df[[
        '问题来源', '问题类型', '大类名称', '小类名称', '小类明细', '微类名称',
        '街道', '上报时间', '当前阶段', '处置截止时间', '处置结束时间'
    ]]

    # Second-pass filter: the "垃圾不落地" and "垃圾分类" major categories (大类名称)
    # do not cover waste debris, so match it across the finer-grained columns
    attr_filter = "施工废弃料|废弃家具|生活垃圾|施工废料"
    df = df.loc[df["大类名称"].str.contains(attr_filter, na=False)
                | df["小类名称"].str.contains(attr_filter, na=False)
                | df["小类明细"].str.contains(attr_filter, na=False)]
    return df

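# Minimal sketch (added for illustration, not part of the original module) of the
# OR-combined substring filter above: str.contains with a '|' pattern is a regex
# OR, and na=False keeps rows with missing cells out of the match.
import pandas as pd

demo = pd.DataFrame({
    "大类名称": ["垃圾分类", "市容环境"],
    "小类名称": ["生活垃圾", "道路破损"],
    "小类明细": [None, None],
})
attr_filter = "施工废弃料|废弃家具|生活垃圾|施工废料"
mask = (demo["大类名称"].str.contains(attr_filter, na=False)
        | demo["小类名称"].str.contains(attr_filter, na=False)
        | demo["小类明细"].str.contains(attr_filter, na=False))
print(demo[mask])  # keeps only the 生活垃圾 row
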
import pandas as pd
from datetime import timedelta


def dataframe_preprocess(file_path, sheetname=0):
    """
    Read the source file and reduce it to the relevant subset.
    :param file_path:
    :param sheetname:
    :return:
    """
    df = read_source(file_path, sheetname=sheetname)

    # Keep only "社会服务管理" records
    df = df[df["问题类型"] == "社会服务管理"]
    # Drop voided cases ('当前阶段' == '[作废]'): per the 2019-09-18 discussion
    # with the grid center's evaluation office, voided cases are not considered
    df = df[df['当前阶段'] != '[作废]']
    # Filter by major category (大类名称)
    df = df[df["大类名称"].isin(["矛盾纠纷", "劳动与社会保障", "社会事业"])]

    # Keep only useful columns, to reduce too many dimensions
    df = df[[
        '案件号', '问题来源', '问题类型', '大类名称', '街道', '上报时间', '当前阶段',
        '处置截止时间', '处置结束时间', '结案时间', '立案时间', '强制结案时间'
    ]]

    # Time-delta features
    df["立案耗时"] = df["立案时间"].subtract(df["上报时间"])
    df["处置预计耗时"] = df["处置截止时间"].subtract(df["立案时间"])

    # Convert timedeltas to whole minutes (0 for missing values)
    # https://blog.csdn.net/liudinglong1989/article/details/78728683
    def f(x):
        return int(x / timedelta(minutes=1)) if isinstance(x, timedelta) else 0

    df["立案耗时"] = df["立案耗时"].apply(f)
    df["处置预计耗时"] = df["处置预计耗时"].apply(f)

    # Weight the case-filing delay
    bins = [0, 60, 360, 1440, 4320, 999999]  # minutes: [0-1h, 1h-6h, 6h-1d, 1d-3d, >3d]
    # bins = [timedelta(hours=0), timedelta(hours=1), timedelta(hours=6),
    #         timedelta(hours=24), timedelta(hours=72), timedelta(days=365)]
    df["W立案"] = pd.cut(df["立案耗时"], bins, labels=[1, 0, -1, -2, -3])

    # Weight forced closures (-pd.isnull yields -1 where 强制结案时间 is missing, else 0)
    df["W强结"] = -pd.isnull(df["强制结案时间"])
    return df

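# Minimal sketch (added for illustration) of the minute-binning above on hand-made
# timestamps; only the column names are taken from dataframe_preprocess.
import pandas as pd
from datetime import timedelta

demo = pd.DataFrame({
    "上报时间": pd.to_datetime(["2019-09-01 08:00", "2019-09-01 08:00"]),
    "立案时间": pd.to_datetime(["2019-09-01 08:30", "2019-09-03 09:00"]),
})
minutes = (demo["立案时间"] - demo["上报时间"]).apply(
    lambda x: int(x / timedelta(minutes=1)) if isinstance(x, timedelta) else 0)
bins = [0, 60, 360, 1440, 4320, 999999]
print(pd.cut(minutes, bins, labels=[1, 0, -1, -2, -3]))
# 30 min falls in (0, 60] -> weight 1; 2 days 1 h = 2940 min falls in (1440, 4320] -> weight -2
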
import uuid


def gen_source_ids(device, devdef=None):
    struct = utils.read_source(device.source_name)
    num = len(device.feed_names)
    driver = struct['driver']
    retval = []
    if driver == "SMAP":
        # Deterministic UUIDs derived from the SMAP namespace, device ID, and feed index
        for i in range(num):
            retval.append(str(uuid.uuid5(uuid.UUID(SMAP_UUID), "%s/r%d" % (device.IDstr, i))))
        return retval
    elif driver == "CSV":
        for i in range(num):
            retval.append('%s/%s.csv' % (device.IDstr, device.feed_names[i]))
        return retval
    else:
        print("ERROR Cannot generate source ID for %s b/c no %s driver" % (device.source_name, driver))
        return []

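# Hedged illustration (added; not part of the module): uuid.uuid5 is deterministic,
# so the SMAP branch above always maps the same device/feed pair to the same source
# ID. uuid.NAMESPACE_DNS stands in for the module's SMAP_UUID constant here.
import uuid

a = str(uuid.uuid5(uuid.NAMESPACE_DNS, "dev01/r0"))
b = str(uuid.uuid5(uuid.NAMESPACE_DNS, "dev01/r0"))
assert a == b  # identical inputs always yield the identical ID
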
def dataframe_preprocess(file_path, sheetname=0):
    """
    Read the source file and reduce it to the relevant subset.
    :param file_path:
    :param sheetname:
    :return:
    """
    df = read_source(file_path, sheetname=sheetname)

    # First-pass filter: per the 2019-09-18 discussion with the grid center's
    # evaluation office, voided cases ('当前阶段' == '[作废]') are excluded
    df = df[df['当前阶段'] != '[作废]']

    # Keep only useful columns, to reduce too many dimensions
    df = df[['问题来源', '问题类型', '大类名称', '小类名称', '小类明细', '微类名称',
             '街道', '上报时间', '当前阶段', '处置截止时间', '处置结束时间']]

    # Second-pass filter
    df = df[df["问题类型"] == "社会服务管理"]
    df = df[df["大类名称"] == "矛盾纠纷"]
    return df

def dataframe_preprocess(file_path, sheetname=0):
    """
    Read the source file and reduce it to the relevant subset.
    :param file_path:
    :param sheetname:
    :return:
    """
    df = read_source(file_path, sheetname=sheetname)

    # First-pass filter: keep "社会服务管理" records. Remaining 大类名称 values:
    # 服务项目, 劳动与社会保障 (covering 劳动关系与纠纷 and 社会福利与保障),
    # 矛盾纠纷, 社会事业, 特殊行业监管
    df = df[df["问题类型"] == "社会服务管理"]
    # Drop voided cases ('当前阶段' == '[作废]'): per the 2019-09-18 discussion
    # with the grid center's evaluation office, voided cases are not considered
    df = df[df['当前阶段'] != '[作废]']

    # Keep only useful columns, to reduce too many dimensions
    df = df[[
        '问题来源', '问题类型', '大类名称', '小类名称', '街道', '上报时间',
        '当前阶段', '处置截止时间', '处置结束时间'
    ]]
    return df

import pandas as pd


def get_partner_id(person_id, marriage):
    if marriage['MotherId'] == person_id:
        return marriage['FatherId']
    elif marriage['FatherId'] == person_id:
        return marriage['MotherId']
    else:
        raise AssertionError


def format_date(date):
    # strftime cannot handle dates before 1900 on older Python versions, so build
    # the string by hand instead of calling date.strftime('%d/%m/%Y'); e.g. a date
    # of 1850-03-07 formats as '07/03/1850'.
    # http://blog.sneawo.com/blog/2015/04/08/strftime-for-datetime-before-1900-year/
    if not pd.isnull(date):
        return '{0.day:02d}/{0.month:02d}/{0.year}'.format(date)


individus, couples = read_source()

# Remove marriages with an unknown mother or father
couples = couples[(couples.MotherId != 0) & (couples.FatherId != 0)]

final = individus.copy()
age_in_days = final.DEAT_DATE - final.ESTIM_BIRT_DATE
final['AGE_AT_DEATH'] = (age_in_days[age_in_days.notnull()].dt.days / 365).astype(int)

# Mothers' and fathers' marriages
for gender_groups in [couples.groupby('MotherId'), couples.groupby('FatherId')]:
    for person_id, group in gender_groups:
        sorted_group = group.sort_values(['MARR_DATE', 'EVEN_DATE', 'MARB_DATE'])
        birth_date = final.loc[person_id, 'ESTIM_BIRT_DATE']
        for index, (_, marriage) in enumerate(sorted_group.iterrows(), 1):

import os


def publish_data(id_str, time, data, feednum=None, devdef=None, device_type=None, source=None, dev=None):
    """
    Usage 1: time and data are scalars - one data point; feed # = feednum, or 0 if feednum is None.
    Usage 2: time and data are lists of scalars (time may also be a scalar) - one data point per feed;
             feed #s = feednum (list), or range(total feeds) if feednum is None.
    Usage 3: time and data are lists of scalars, feednum is a scalar - multiple data points for one feed.
    Usage 4: time and data are lists of lists of scalars (time may also be a list of scalars) - multiple
             data points per feed; feed #s = feednum (list), or range(total feeds) if feednum is None.
    """
    if not isinstance(data, list):  # Usage 1
        data = [data]
        if feednum is None:
            feednum = [0]
        else:
            feednum = [feednum]
    else:  # Usage 2, 3, 4
        if feednum is None:  # Usage 2, 4
            feednum = range(len(data))
        elif not isinstance(feednum, list):  # Usage 3
            feednum = [feednum]
            time = [time]
            data = [data]

    if not isinstance(time, list) or (not isinstance(time[0], list) and isinstance(data[0], list)):  # Usage 1, 2, 4
        time = [time] * len(feednum)

    if not devicedb.connected():
        devicedb.connect()

    id_str = id_str.replace('/', '_')
    postgresops.check_evil(id_str)

    dev = find_device(id_str, create_new=True, device_type=device_type, source=source, devdef=devdef, dev=dev)
    if dev is None:
        return

    source_struct = utils.read_source(dev.source_name)
    if devdef is None:
        devdef = utils.read_device(dev.device_type)
    driver = source_struct['driver']

    for i in range(len(feednum)):
        if feednum[i] >= len(dev.feed_names):
            print("ERROR cannot publish data for feed %d because it is not defined in the definition for %s" % (feednum[i], dev.device_type))
        elif feednum[i] >= len(dev.source_ids) or dev.source_ids[feednum[i]] is None or dev.source_ids[feednum[i]] == '':
            print("ERROR cannot publish data for feed %d of device %s because it is not defined" % (feednum[i], dev.IDstr))
        else:
            source_id = dev.source_ids[feednum[i]]
            if driver == 'SMAP':
                publish_smap(dev.source_name, source_struct, dev, devdef, feednum[i], source_id, time[i], data[i])
            elif driver == 'CSV':
                fname = source_id
                if fname[0] != '/':
                    fname = source_struct['path'] + '/' + fname
                try:
                    parentdir = fname[:fname.rfind('/')]
                    try:
                        os.makedirs(parentdir)
                    except OSError:
                        pass  # parent directory already exists
                    csvfile = open(fname, "ab")
                    # print("\t", time[i], data[i])
                    if isinstance(time[i], list):
                        for j in range(len(time[i])):
                            csvfile.write("%.12f,%.12f\n" % (time[i][j], data[i][j]))
                    else:
                        csvfile.write("%.12f,%.12f\n" % (time[i], data[i]))
                    csvfile.close()
                except OSError as e:
                    print("ERROR Cannot publish data to %s because %s" % (fname, e))
            else:
                print("ERROR Cannot publish data for %s b/c no %s driver" % (dev.source_name, driver))
    return []

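# Hedged usage sketch (added; "lamp_01" and all values are invented, and real calls
# need a live devicedb connection, so these lines stay commented out):
# publish_data("lamp_01", 1569801600.0, 12.5)                 # Usage 1: one point on feed 0
# publish_data("lamp_01", 1569801600.0, [12.5, 3.3])          # Usage 2: one point each on feeds 0 and 1
# publish_data("lamp_01", [1569801600.0, 1569801660.0],
#              [12.5, 12.7], feednum=2)                       # Usage 3: two points on feed 2
# publish_data("lamp_01", [[1569801600.0, 1569801660.0]] * 2,
#              [[1.0, 1.1], [2.0, 2.2]])                      # Usage 4: two points each on feeds 0 and 1
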
def run(self, time_from, time_to, source_name=None, source_id=None, pretend=False, use_cache=True, local=False, params=[]):
    if source_id is None and local:
        print("Error: Can only run 'local' test on one stream (source name & id pair) at a time")
        sys.exit(1)

    # Fan out over every source/id pair when none is given
    if source_name is None:
        names = utils.list_sources()
        for name in names:
            ids = utils.list_ids(name)
            for id in ids:
                self.run(time_from, time_to, name, id, pretend, use_cache, local)
        return
    elif source_id is None:
        ids = utils.list_ids(source_name)
        for id in ids:
            self.run(time_from, time_to, source_name, id, pretend, use_cache, local)
        return

    smap = utils.read_source(source_name)

    if not local and use_cache:
        # find any cached files
        import filedb
        filedb.connect()
        for file in self.files:
            stepchain = file.stepchain
            cfiles = filedb.get_files(
                where='time_from=%s and time_to=%s and source_name=%s and source_id=%s and steps=%s',
                params=(time_from, time_to, source_name, source_id, stepchain))
            for cfile in cfiles:
                if cfile.status == filedb.INVALID or os.path.exists(cfile.file_name):
                    file.cached = True
                    file.file_id = cfile.id
                    file.fname = cfile.file_name
                    file.deptask = cfile.task_id
                    file.stepchain = stepchain
                    print("Found cached file for output of " + file.src.name)
                    break

        # prune any tasks that don't need to be run
        for step in self.steps[:]:
            if len(step.outputs) == 0:
                print("Step %s will be run b/c it has no outputs" % step.name)
                continue
            canbepruned = True
            for f in step.outputs:
                if not f.cached:
                    canbepruned = False
                    break
            if canbepruned:
                print("Pruning step %s because the cache can supply all outputs" % step.name)
                self.steps.remove(step)
            else:
                for f in step.outputs:
                    if f.cached:
                        f.cached = False
                        print("Cached file %s.O%d will be regenerated b/c not all outputs were cached" % (step.name, f.index))

    # create all the files we'll need
    if local:
        dir = ''
    else:
        if self.use_tmp:
            dir = '/tmp/sensezilla/flowdata/' + self.name
        else:
            dir = config.map['global']['data_dir'] + '/flowdata/' + self.name
        if not pretend:
            try:
                os.makedirs(dir + '/outputs', 0o755)
            except OSError:
                pass  # directory already exists

    for file in self.files:
        if not file.cached:
            if local:
                file.fname = dir + 'testing_%s_%s.O%d' % (self.name, file.src.name, file.index)
                if not pretend:
                    if file.directory:
                        try:
                            os.mkdir(file.fname)
                        except OSError:
                            pass
                    else:
                        fout = open(file.fname, 'w')
                        fout.close()
            else:
                if 'OUTPUT' not in [v[0] for v in file.dests]:
                    if not pretend:
                        if file.directory:
                            file.fname = tempfile.mkdtemp(dir=dir)
                        else:
                            tfile = tempfile.NamedTemporaryFile('w', dir=dir, delete=False)
                            file.fname = tfile.name
                            tfile.close()
                    else:
                        file.fname = os.tempnam(dir)
                else:
                    file.fname = dir + '/outputs/%s.O%d_%s_%s_%d_to_%d' % (
                        file.src.name, file.index, source_name, source_id.replace('/', '.'),
                        utils.date_to_unix(time_from), utils.date_to_unix(time_to))
                    if file.directory:
                        if not pretend:
                            os.mkdir(file.fname)
                    else:
                        if not pretend:
                            fout = open(file.fname, 'w')
                            fout.close()
            if file.directory:
                print("Created directory : " + file.fname)
            else:
                print("Created file : " + file.fname)

    # generate dictionary of substitutions
    subs = {
        'TIME_FROM': int(utils.date_to_unix(time_from)),
        'TIME_TO': int(utils.date_to_unix(time_to)),
        'SOURCE': source_name,
        'ID': source_id,
    }
    subs.update(params)
    try:
        import devicedb
        devicedb.connect()
        plmeta, dev, pl_index = devicedb.find_plugload(source_name, source_id)
        subs['PLUGLOAD'] = plmeta.value
        subs['DEVID'] = dev.ID
        subs['DEVIDSTR'] = dev.IDstr
    except Exception as e:
        print("Cannot contact devicedb " + str(e))

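# Toy sketch (added; names invented) of the cache-pruning rule in run(): a step is
# dropped only when the cache can supply every one of its outputs; otherwise any
# cached outputs are marked for regeneration so the whole step reruns together.
class _Out(object):
    def __init__(self, cached):
        self.cached = cached

steps = {"stepA": [_Out(True), _Out(True)], "stepB": [_Out(True), _Out(False)]}
for name in list(steps):
    outs = steps[name]
    if outs and all(o.cached for o in outs):
        del steps[name]  # prune: every output is cached
    else:
        for o in outs:
            o.cached = False  # regenerate all outputs together
print(sorted(steps))  # -> ['stepB']
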