def dataframe_preprocess(file_path, sheetname=0):
    """
    Read the source file and reduce it to waste-related, non-voided cases.

    :param file_path: forwarded to ``read_source``
    :param sheetname: forwarded to ``read_source`` as ``sheetname``
    :return: DataFrame restricted to the retained columns and to rows whose
        category columns mention one of the waste keywords
    """
    raw = read_source(file_path, sheetname=sheetname)

    # First pass: per the 2019-09-18 discussion with the grid centre's
    # evaluation office, voided cases are not considered.
    active = raw[raw['当前阶段'] != '[作废]']

    # Keep only the useful columns to cut down on dimensions.
    wanted_columns = [
        '问题来源', '问题类型', '大类名称', '小类名称', '小类明细', '微类名称',
        '街道', '上报时间', '当前阶段', '处置截止时间', '处置结束时间',
    ]
    slim = active[wanted_columns]

    # Second pass (original author's note): the "垃圾不落地" and "垃圾分类"
    # top-level categories do not contain discarded-material items.
    # A row is kept when any of the three category columns matches the pattern;
    # missing values count as "no match".
    waste_pattern = "施工废弃料|废弃家具|生活垃圾|施工废料"
    matches = None
    for column in ("大类名称", "小类名称", "小类明细"):
        hit = slim[column].str.contains(waste_pattern, na=False)
        matches = hit if matches is None else matches | hit

    return slim.loc[matches]
def dataframe_preprocess(file_path, sheetname=0):
    """
    Read the source file, keep social-service cases and derive filing metrics.

    Rows are kept when ``问题类型`` is ``社会服务管理``, ``当前阶段`` is not
    ``[作废]`` and ``大类名称`` is one of the three selected categories.

    :param file_path: forwarded to ``read_source``
    :param sheetname: forwarded to ``read_source`` as ``sheetname``
    :return: DataFrame with the retained columns plus ``立案耗时`` and
        ``处置预计耗时`` (whole minutes), ``W立案`` (binned weight) and
        ``W强结`` (True where ``强制结案时间`` is present)
    """
    df = read_source(file_path, sheetname=sheetname)

    # Keep only rows whose problem type is "社会服务管理".
    df = df[df["问题类型"] == "社会服务管理"]
    # Drop voided cases: per the 2019-09-18 discussion with the grid centre's
    # evaluation office, voided cases are not considered.
    df = df[df['当前阶段'] != '[作废]']

    # Top-level category filter.
    df = df[df["大类名称"].isin(["矛盾纠纷", "劳动与社会保障", "社会事业"])]

    # Keep only the useful columns to cut down on dimensions. The explicit
    # copy makes the result an independent frame, so the column assignments
    # below do not write into a slice of the filtered data
    # (avoids SettingWithCopyWarning / ambiguous chained assignment).
    df = df[[
        '案件号', '问题来源', '问题类型', '大类名称', '街道', '上报时间', '当前阶段', '处置截止时间',
        '处置结束时间', '结案时间', '立案时间', '强制结案时间'
    ]].copy()

    # Durations: time from report to filing, and from filing to the handling
    # deadline.
    df["立案耗时"] = df["立案时间"].subtract(df["上报时间"])
    df["处置预计耗时"] = df["处置截止时间"].subtract(df["立案时间"])

    # Convert durations to whole minutes.
    # https://blog.csdn.net/liudinglong1989/article/details/78728683
    def to_minutes(x):
        # Anything that is not a timedelta (e.g. a missing value) becomes 0.
        return int(x / timedelta(minutes=1)) if isinstance(x, timedelta) else 0

    df["立案耗时"] = df["立案耗时"].apply(to_minutes)
    df["处置预计耗时"] = df["处置预计耗时"].apply(to_minutes)

    # Filing-delay weight; bin edges are in minutes:
    # [0-1h, 1h-6h, 6h-1d, 1d-3d, >3d]
    bins = [0, 60, 360, 1440, 4320, 999999]
    # NOTE(review): pd.cut bins are right-closed and exclude the lowest edge by
    # default, so a delay of exactly 0 minutes (including the 0 substituted for
    # missing timestamps above) and any negative delay get NaN here. Confirm
    # whether 0 should fall in the first bin (include_lowest=True).
    df["W立案"] = pd.cut(df["立案耗时"], bins, labels=[1, 0, -1, -2, -3])

    # Forced-closure weight. The original wrote ``-pd.isnull(...)``, which on a
    # boolean Series is a logical NOT; ``pd.notnull`` states that directly.
    df["W强结"] = pd.notnull(df["强制结案时间"])

    return df
def gen_source_ids(device, devdef=None):
    """
    Build one source ID per feed of ``device`` according to its source driver.

    For the "SMAP" driver each ID is a UUID5 string derived from ``SMAP_UUID``
    and ``"<IDstr>/r<feed index>"``; for the "CSV" driver each ID is
    ``"<IDstr>/<feed name>.csv"``. Any other driver prints an error and yields
    an empty list.

    :param device: object providing ``source_name``, ``feed_names`` and ``IDstr``
    :param devdef: unused in this function
    :return: list of source ID strings (empty when the driver is unknown)
    """
    source_struct = utils.read_source(device.source_name)
    feed_count = len(device.feed_names)
    driver = source_struct['driver']

    if driver == "SMAP":
        return [
            str(uuid.uuid5(uuid.UUID(SMAP_UUID), str("%s/r%d" % (device.IDstr, idx))))
            for idx in range(feed_count)
        ]

    if driver == "CSV":
        return [
            '%s/%s.csv' % (device.IDstr, device.feed_names[idx])
            for idx in range(feed_count)
        ]

    # Single parenthesised argument: prints the same text as the original
    # print statement.
    print("ERROR Cannot generate source ID for %s b/c no %s driver" % (device.source_name, driver))
    return []
def dataframe_preprocess(file_path, sheetname=0):
    """
    Read the source file and reduce it to non-voided "矛盾纠纷" cases.

    :param file_path: forwarded to ``read_source``
    :param sheetname: forwarded to ``read_source`` as ``sheetname``
    :return: DataFrame restricted to the retained columns, with problem type
        "社会服务管理" and top-level category "矛盾纠纷"
    """
    raw = read_source(file_path, sheetname=sheetname)

    # Keep only the useful columns to cut down on dimensions.
    wanted_columns = [
        '问题来源', '问题类型', '大类名称', '小类名称', '小类明细', '微类名称',
        '街道', '上报时间', '当前阶段', '处置截止时间', '处置结束时间',
    ]

    # First pass: per the 2019-09-18 discussion with the grid centre's
    # evaluation office, voided cases are not considered.
    not_voided = raw['当前阶段'] != '[作废]'
    slim = raw.loc[not_voided, wanted_columns]

    # Second pass: problem type and top-level category must both match.
    is_social_service = slim["问题类型"] == "社会服务管理"
    is_dispute = slim["大类名称"] == "矛盾纠纷"
    return slim[is_social_service & is_dispute]
def dataframe_preprocess(file_path, sheetname=0):
    """
    Read the source file and reduce it to non-voided "社会服务管理" cases.

    :param file_path: forwarded to ``read_source``
    :param sheetname: forwarded to ``read_source`` as ``sheetname``
    :return: DataFrame restricted to the retained columns
    """
    source = read_source(file_path, sheetname=sheetname)

    # First pass. Original author's note: the top-level categories remaining
    # after this filter are 服务项目, 劳动与社会保障 (including 劳动关系与纠纷 and
    # 社会福利与保障), 矛盾纠纷, 社会事业 and 特殊行业监管.
    social_service = source[source["问题类型"] == "社会服务管理"]

    # Drop voided cases: per the 2019-09-18 discussion with the grid centre's
    # evaluation office, voided cases are not considered.
    not_voided = social_service['当前阶段'] != '[作废]'

    # Keep only the useful columns to cut down on dimensions.
    wanted_columns = [
        '问题来源', '问题类型', '大类名称', '小类名称', '街道',
        '上报时间', '当前阶段', '处置截止时间', '处置结束时间',
    ]
    return social_service.loc[not_voided, wanted_columns]
Exemple #6
0
def get_partner_id(person_id, marriage):
    """
    Return the ID of the other spouse in ``marriage``.

    :param person_id: ID of one of the two spouses
    :param marriage: mapping with ``'MotherId'`` and ``'FatherId'`` entries
    :return: ``FatherId`` when ``person_id`` is the mother, ``MotherId`` when
        it is the father
    :raises AssertionError: when ``person_id`` is neither spouse
    """
    mother_id, father_id = marriage['MotherId'], marriage['FatherId']
    if mother_id == person_id:
        return father_id
    if father_id == person_id:
        return mother_id
    raise AssertionError


def format_date(date):
    """
    Format ``date`` as ``DD/MM/YYYY``; return None for a null date.

    The string is built with ``str.format`` on the day/month/year attributes
    instead of ``date.strftime('%d/%m/%Y')``; the linked post discusses
    strftime problems with years before 1900:
    http://blog.sneawo.com/blog/2015/04/08/strftime-for-datetime-before-1900-year/
    """
    if pd.isnull(date):
        return None
    return '{0.day:02d}/{0.month:02d}/{0.year}'.format(date)

# Load the two tables used by the rest of the script.
# NOTE(review): presumably one row per person and one row per marriage --
# confirm against read_source.
individus, couples = read_source()

# remove marriages with unknown mother or father
# (an ID of 0 is treated as "unknown" here)
couples = couples[(couples.MotherId != 0) & (couples.FatherId != 0)]

# Work on a copy so the columns added below do not modify `individus`.
final = individus.copy()

# Age at death in whole years, approximated as days / 365 and truncated by the
# int cast. Only rows where both dates are known are computed; the remaining
# rows receive no value from this assignment.
age_in_days = final.DEAT_DATE - final.ESTIM_BIRT_DATE
final['AGE_AT_DEATH'] = (age_in_days[age_in_days.notnull()].dt.days / 365).astype(int)

# mothers and fathers marriages
for gender_groups in [couples.groupby('MotherId'), couples.groupby('FatherId')]:
    for person_id, group in gender_groups:
        sorted_group = group.sort_values(['MARR_DATE', 'EVEN_DATE', 'MARB_DATE'])
        birth_date = final.loc[person_id, 'ESTIM_BIRT_DATE']
        for index, (_, marriage) in enumerate(sorted_group.iterrows(), 1):
def _append_csv_points(fname, times, values):
    """
    Append ``time,value`` rows to the CSV file ``fname``.

    ``times`` and ``values`` are either two scalars (one row is written) or two
    lists indexed in lockstep (one row per entry of ``times``). The parent
    directory of ``fname`` is created first on a best-effort basis.
    """
    parentdir = fname[:fname.rfind('/')]
    try:
        os.makedirs(parentdir)
    except OSError:
        # Best effort, as before: the directory usually exists already. A real
        # failure shows up when the file is opened just below.
        pass

    # ``with`` guarantees the handle is closed even when a write raises.
    with open(fname, "ab") as csvfile:
        if isinstance(times, list):
            for j, t in enumerate(times):
                csvfile.write("%.12f,%.12f\n" % (t, values[j]))
        else:
            csvfile.write("%.12f,%.12f\n" % (times, values))


def publish_data(id_str, time, data, feednum=None, devdef=None, device_type=None, source=None, dev=None):
    """
    Publish data points for one or more feeds of a device.

    Supported call shapes:

    * Usage 1: ``time`` and ``data`` are scalars - one data point; the feed is
      ``feednum``, or 0 when ``feednum`` is None.
    * Usage 2: ``time`` and ``data`` are lists of scalars (``time`` may also be
      a scalar) - one data point per feed; feeds are ``feednum`` (a list) or
      ``range(len(data))`` when ``feednum`` is None.
    * Usage 3: ``time`` and ``data`` are lists of scalars and ``feednum`` is a
      scalar - several data points for a single feed.
    * Usage 4: ``time`` and ``data`` are lists of lists of scalars (``time``
      may also be a list of scalars) - several data points per feed; feeds are
      ``feednum`` (a list) or ``range(len(data))`` when ``feednum`` is None.

    The device is looked up (or created) through ``find_device``. Each feed is
    then published through the driver named by the device's source definition:
    "SMAP" goes to ``publish_smap`` and "CSV" appends rows to a file. Problems
    are reported by printing an ERROR line.

    :return: None normally; ``[]`` when the source's driver is not supported
    """
    # Normalise the arguments so feednum, time and data are parallel lists.
    if not isinstance(data, list):  # Usage 1
        data = [data]
        feednum = [0] if feednum is None else [feednum]
    elif feednum is None:  # Usage 2, 4
        feednum = list(range(len(data)))
    elif not isinstance(feednum, list):  # Usage 3
        feednum = [feednum]
        time = [time]
        data = [data]

    # A single time (scalar, or one list shared by list-valued data) is
    # replicated so that every feed gets its own entry.  Usage 1, 2, 4
    if not isinstance(time, list) or (not isinstance(time[0], list) and isinstance(data[0], list)):
        time = [time] * len(feednum)

    if not devicedb.connected():
        devicedb.connect()

    id_str = id_str.replace('/', '_')
    postgresops.check_evil(id_str)

    dev = find_device(id_str, create_new=True, device_type=device_type, source=source, devdef=devdef, dev=dev)
    if dev is None:
        return

    source_struct = utils.read_source(dev.source_name)
    if devdef is None:
        devdef = utils.read_device(dev.device_type)

    driver = source_struct['driver']
    for i, feed in enumerate(feednum):
        if feed >= len(dev.feed_names):
            print("ERROR cannot publish data for feed %d because it is not defined in the definition for %s" % (feed, dev.device_type))
        elif feed >= len(dev.source_ids) or dev.source_ids[feed] is None or dev.source_ids[feed] == '':
            print("ERROR cannot publish data for feed %d of device %s because it is not defined" % (feed, dev.IDstr))
        else:
            source_id = dev.source_ids[feed]
            if driver == 'SMAP':
                publish_smap(dev.source_name, source_struct, dev, devdef, feed, source_id, time[i], data[i])
            elif driver == 'CSV':
                # Relative source IDs are resolved against the source's path.
                fname = source_id
                if fname[0] != '/':
                    fname = source_struct['path'] + '/' + fname
                try:
                    _append_csv_points(fname, time[i], data[i])
                except (IOError, OSError) as e:
                    # open() raises IOError on Python 2, so both types are
                    # caught. The two spaces before the final %s reproduce the
                    # output of the original two-item print statement.
                    print("ERROR Cannot publish data to %s because  %s" % (fname, e))
            else:
                print("ERROR Cannot publish data for %s b/c no %s driver" % (dev.source_name, driver))
                return []
    def run(self, time_from, time_to, source_name=None, source_id=None, pretend=False, use_cache=True, local=False, params=[]):
        if source_id == None and local:
            print "Error: Can only run 'local' test on one stream ( source name & id pair ) at a time"
            sys.exit(1)
        
        
        if source_name == None:
            names = utils.list_sources();
            for name in names:
                ids = utils.list_ids(name)
                for id in ids:
                    self.run(time_from, time_to, name, id, pretend, use_cache, local)
            return
                            
        elif source_id == None:
            ids = utils.list_ids(source_name)
            for id in ids:
                self.run(time_from, time_to, source_name, id, pretend, use_cache, local)
            return
        
        smap = utils.read_source(source_name)
        if not local and use_cache:
            # find any cached files
            import filedb
            filedb.connect()
            
            for file in self.files:
                stepchain = file.stepchain
                cfiles = filedb.get_files(where='time_from=%s and time_to=%s and source_name=%s and source_id=%s and steps=%s',params=(time_from,time_to,source_name,source_id,stepchain))
                for cfile in cfiles:
                    if ( cfile.status == filedb.INVALID or os.path.exists(cfile.file_name) ):
                        file.cached = True
                        file.file_id = cfile.id
                        file.fname = cfile.file_name
                        file.deptask = cfile.task_id
                        file.stepchain = stepchain
                        print "Found cached file for output of "+file.src.name
                        break
        
            # prune any tasks that don't need to be run
            for step in self.steps[:]:
                if ( len(step.outputs) == 0):
                    print "Step %s will be run b/c it has no outputs"%step.name
                    continue
                    
                canbepruned = True
                for f in step.outputs:
                    if not f.cached:
                        canbepruned = False
                        break 
                if canbepruned:
                    print "Pruning step %s because the cache can supply all outputs"%step.name
                    self.steps.remove(step)
                else:
                    for f in step.outputs:
                        if f.cached:
                            f.cached = False
                            print "Cached file %s.O%d will be regenerated b/c not all outputs were cached"%(step.name,f.index)
        
        # create all the files we'll need
        if local:
            dir = ''
        else:
            if ( self.use_tmp ):
                dir = '/tmp/sensezilla/flowdata/'+self.name
            else:
                dir = config.map['global']['data_dir']+'/flowdata/'+self.name
        
        
            if not pretend:
                try:
                    os.makedirs(dir+'/outputs',0755)
                except:pass
            
        
        for file in self.files:
            if not file.cached:
                if local:
                    file.fname = dir+'testing_%s_%s.O%d'%(self.name,file.src.name,file.index)
                    if not pretend:
                        if file.directory:
                            try:
                                os.mkdir(file.fname)
                            except:pass
                        else:
                            fout = open(file.fname,'w')
                            fout.close()
                else:
                    if 'OUTPUT' not in [v[0] for v in file.dests]:
                        if not pretend:
                            if file.directory:
                                file.fname = tempfile.mkdtemp(dir=dir)
                            else:
                                tfile = tempfile.NamedTemporaryFile('w', dir=dir, delete=False)
                                file.fname = tfile.name;
                                tfile.close()
                        else:
                            file.fname = os.tempnam(dir)
                    else:
                        file.fname = dir+'/outputs/%s.O%d_%s_%s_%d_to_%d'%(file.src.name,file.index,source_name,source_id.replace('/','.'),
                                                                           utils.date_to_unix(time_from),utils.date_to_unix(time_to))
                        if file.directory:
                            if not pretend:
                                os.mkdir(file.fname)
                        else:
                            if not pretend: 
                                fout = open(file.fname,'w')
                                fout.close()
                    
                if file.directory:
                    print "Created directory : "+file.fname
                else:
                    print "Created file : "+file.fname


        # generate dictionary of substitutions
        subs = {
                'TIME_FROM':int(utils.date_to_unix(time_from)),
                'TIME_TO':int(utils.date_to_unix(time_to)),
                'SOURCE':source_name,
                'ID':source_id                
        };
        subs.update(params)
        
        try:
            import devicedb
            devicedb.connect()
            plmeta,dev,pl_index = devicedb.find_plugload(source_name,source_id)
            subs['PLUGLOAD'] = plmeta.value;
            subs['DEVID'] = dev.ID
            subs['DEVIDSTR'] = dev.IDstr
            
        except Exception,e:
            print "Cannot contact devicedb "+str(e)