Example #1
0
def income_overall_analysis(year=2018):
    """Scatter-plot personal incomes per office of interest for *year*.

    Loads the declarations JSON, keeps entries from the given year and from a
    fixed set of offices, aggregates each person's own and relatives' incomes,
    prints the office legend, and shows the scatter plot.

    Parameters:
        year: declaration year to analyse (default 2018).
    """
    json_data = tools.get_json()
    json_data = tools.filter_data(json_data,
                                  lambda e: e['main']['year'] == year)

    # offices of interest
    # offices = [5, 4, 191, 4199, 482, 5963, 1397, 5953, 598, 979, 14, 607, 450]
    offices = [
        1, 3, 4, 5, 7, 14, 15, 17, 113, 146, 449, 450, 453, 456, 461, 467, 594,
        595, 596
    ]
    json_data = tools.filter_data(
        json_data, lambda e: int(e['main']['office']['id']) in offices)

    income_dtype = [('person_id', 'i8'), ('office_id', 'i8'),
                    ('income_self', 'float64'), ('income_rel', 'float64'),
                    ('income_total', 'float64')]

    # Collect rows in a plain list first: np.append copies the whole array on
    # every call, which made the original loop quadratic in the entry count.
    rows = []
    for entry in json_data:
        person_id = int(entry['main']['person']['id'])
        office_id = int(entry['main']['office']['id'])
        # 'relative' is None for the declarant's own income entries.
        income_self = sum(e['size'] for e in entry['incomes']
                          if e['relative'] is None)
        income_rel = sum(e['size'] for e in entry['incomes']
                         if e['relative'] is not None)
        rows.append((person_id, office_id, income_self, income_rel,
                     income_self + income_rel))
    income = np.array(rows, dtype=income_dtype)

    # List offices and abbreviations
    for o in offices:
        print(o, office_id2name[str(o)],
              abbreviate_name(office_id2name[str(o)]))

    # Plot 1: income in each office of interest
    N = len(offices)
    dx = []
    dy = []
    for x in range(N):
        income_slice = income[income['office_id'] == offices[x]]
        for y in income_slice:
            # NOTE(review): the outlier filter tests income_rel while the plot
            # shows income_self (original behaviour kept) -- confirm intent.
            if y['income_rel'] / 1000000.0 < 10:
                dx.append(x)
                dy.append(y['income_self'] / 1000000.0)

    plot1 = plt.scatter(x=dx, y=dy, s=5)
    plt.ylabel('income (rel), millions of RUB')
    plt.xticks(range(N),
               [abbreviate_name(office_id2name[str(o)]) for o in offices])
    plt.show()
Example #2
0
def income_clustering(year):
    """Cluster persons of office 14 by (own income, relatives' income).

    Runs HDBSCAN on the two income features, prints the income table and the
    membership of every non-noise cluster, then shows a scatter plot where
    each point is coloured by its cluster and desaturated by its HDBSCAN
    membership probability.

    Parameters:
        year: declaration year to analyse.
    """
    json_data = tools.get_json()
    json_data = tools.filter_data(json_data,
                                  lambda e: e['main']['year'] == year)
    json_data = tools.filter_data(json_data,
                                  lambda e: e['main']['office']['id'] == 14)

    data = []
    persons = []

    for entry in json_data:
        person_id = entry['main']['person']['id']
        # 'relative' is None for the declarant's own income entries.
        income_self = sum(
            [e['size'] for e in entry['incomes'] if e['relative'] is None])
        income_rel = sum(
            [e['size'] for e in entry['incomes'] if e['relative'] is not None])

        data.append([income_self, income_rel])
        persons.append(person_id)

    dframe = pd.DataFrame(data, index=persons)
    dframe.columns = ['income_self', 'income_rel']

    print(dframe)

    classifier = hdbscan.HDBSCAN(min_cluster_size=5).fit(dframe)
    classified = []
    # Labels >= 0 are real clusters; -1 marks noise points and is skipped.
    for label in set(filter(lambda x: x >= 0, classifier.labels_)):
        print('Cluster label: ', label)
        ids = [i for i, x in enumerate(classifier.labels_) if x == label]
        for i in ids:
            print(persons[i], person_id2name[str(persons[i])], data[i])
            # Keep the original sample index so per-sample attributes
            # (e.g. membership probability) can be looked up later.
            classified.append((label, persons[i], data[i][0], data[i][1], i))
        print('\n')

    colour_palette = sns.color_palette('deep', 20)
    cluster_colours = [colour_palette[x[0]] for x in classified]
    # BUG FIX: classifier.probabilities_ is indexed in the original sample
    # order, while `classified` is grouped by cluster. The original zip()
    # paired each point with another point's probability; use the stored
    # sample index instead.
    cluster_member_colours = [
        sns.desaturate(colour, classifier.probabilities_[x[4]])
        for colour, x in zip(cluster_colours, classified)
    ]
    dx = [x[2] for x in classified]
    dy = [x[3] for x in classified]
    plt.scatter(x=dx,
                y=dy,
                s=10,
                linewidth=0,
                c=cluster_member_colours,
                alpha=1)
    plt.show()
Example #3
0
def inventory_history(inventory, categories, brands, month_slider, frequency,
                      relative):
    """Build a stacked bar chart of inventory over time, split by size."""

    # Keep only stock rows: no sale recorded and a positive net quantity.
    in_stock = (inventory["Sales"] == 0) & (inventory["NetQuantity"] > 0)
    inventory = inventory[in_stock]
    inventory = filter_data(inventory,
                            filter_many={"Brand": brands},
                            month_slider=month_slider)

    grouped = inventory.groupby([pd.Grouper(freq=frequency), 'Size']).sum()
    pivoted = grouped.reset_index().pivot(index="posting_date",
                                          columns="Size",
                                          values="Quantity")

    if 'True' in relative:
        # Normalise each period so sizes are shown as fractions of the total.
        pivoted = pivoted.apply(lambda row: row / row.sum(), axis=1)

    traces = []
    for size in pivoted.columns:
        traces.append(go.Bar(x=pivoted.index, y=pivoted[size], name=size))

    layout = default_graph_layout()
    layout["barmode"] = 'stack'
    layout["title"] = "Inventory History Including Size Distribution"

    return dict(data=traces, layout=layout)
Example #4
0
def update_brand_text(data, sizes, brands, month_slider):
    """Return a label with the product count after brand/size filtering."""
    filtered = filter_data(data,
                           filter_many={
                               "Brand": brands,
                               "Size": sizes
                           },
                           month_slider=month_slider)
    row_count = filtered.shape[0]
    return "No of Products: {}".format(row_count)
def request_merapi(config, tStart, duration, verbatim=0):
    """ Request Merapi function.
    Gets the signature only, not full signal
    /!\ Signature of this function should not be modified and is similar for all applications (i,e, request_XXX)
    INPUT:
    - config: config dictionnary according to project formating
    - tStart: datetime object
    - duration: in sec
    - verbatim
    OUTPUT:
    - signature: numpy array containing the signal read
    - fs: sampling_rate
    NOTE(review): despite the order above, the function returns
    (fs, signature); on any failure it returns (0, []).
    """
    # Enable client-side debug output only at the highest verbosity level.
    if verbatim > 1:
        debug = True
    else:
        debug = False

    try:
        client = Client(
            user=config.data_to_analyze['reading_arguments']['user'],
            host=config.data_to_analyze['reading_arguments']['host'],
            port=config.data_to_analyze['reading_arguments']['port'],
            debug=debug)
    except Exception as inst:
        print('Impossible to reach client ')
        print('--', inst)
        return 0, []

    # SECURITY: eval() on a config string executes arbitrary code -- safe only
    # if the config file is fully trusted; float()/ast.literal_eval would be
    # safer if the value is a plain number.
    delta_t = eval(config.data_to_analyze['reading_arguments']['delta_t'])
    t = UTCDateTime(tStart)

    try:
        # Start the request delta_t seconds early, and cap the end time at
        # the configured 'max_duration' past the start.
        st = client.get_waveforms(
            config.data_to_analyze['reading_arguments']['network'],
            config.data_to_analyze['reading_arguments']['station'],
            config.data_to_analyze['reading_arguments']['location'],
            config.data_to_analyze['reading_arguments']['channel'],
            t - delta_t,
            min(
                t + duration, t +
                config.data_to_analyze['reading_arguments']['max_duration']))
    except Exception as inst:
        print('Reading not possible for data: ', t, duration)
        print('--', inst)
        return 0, []

    # Use the first trace of the returned stream.
    signature = st[0].data
    fs = st[0].stats['sampling_rate']

    # SECURITY: same eval()-on-config caveat as above.
    if eval(config.data_to_analyze['reading_arguments']['filtering']):
        signature = filter_data(
            signature, fs,
            config.data_to_analyze['reading_arguments']['filtering_frequency'])

    return fs, signature
Example #6
0
def pie_graph(sales, sizes, brands, month_slider, relative_selector):
    """Build two donut charts: sales by brand (left) and by size (right)."""
    sales = filter_data(sales,
                        filter_many={'Brand': brands},
                        month_slider=month_slider)

    by_brand = sales.groupby('Brand').sum()
    by_size = sales.groupby('Size').sum()
    if "True" in relative_selector:
        # Normalise each row to fractions of its total.
        by_brand = by_brand.apply(lambda row: row / row.sum(), axis=1)
        by_size = by_size.apply(lambda row: row / row.sum(), axis=1)

    brand_pie = dict(
        type='pie',
        labels=list(by_brand.index),
        values=list(by_brand['Quantity'].values),
        name='Brand Breakdown',
        hoverinfo="value+percent",
        textinfo="label+percent+name",
        hole=0.5,
        domain={
            "x": [0, .45],
            'y': [0.2, 0.8]
        },
    )
    size_pie = dict(
        type='pie',
        labels=list(by_size.index),
        values=list(by_size['Quantity'].values),
        name='Size Breakdown',
        hoverinfo="label+text+value+percent",
        textinfo="label+percent+name",
        hole=0.5,
        domain={
            "x": [0.55, 1],
            'y': [0.2, 0.8]
        },
    )

    layout = default_graph_layout()
    layout['title'] = "Sales Repartition by Brand"
    layout['font'] = dict(color='#777777')
    layout['legend'] = dict(font=dict(color='#CCCCCC', size='10'),
                            orientation='h',
                            bgcolor='rgba(0,0,0,0)')

    return dict(data=[brand_pie, size_pie], layout=layout)
Example #7
0
def size_gap(inventory, sizes, brands, month_slider):
    """Bar chart of the per-brand gap between inventory and sales size mix."""
    inventory = filter_data(inventory,
                            filter_many={"Brand": brands},
                            month_slider=month_slider)

    is_stock = (inventory["Sales"] == 0) & (inventory["NetQuantity"] > 0)
    stocks = inventory[is_stock]
    sales = inventory[inventory["Sales"] != 0]

    # Per-brand size distribution of stock, as fractions summing to 1.
    stock_mix = (stocks.reset_index()[["Brand", "Size", "NetQuantity"]]
                 .groupby(["Brand", "Size"]).sum()
                 .groupby("Brand").apply(lambda x: x / float(x.sum()))
                 .rename(columns={"NetQuantity": "Inventory"}))

    # Per-brand size distribution of sales.
    sales_mix = (sales.reset_index()[["Brand", "Size", "Sales"]]
                 .groupby(["Brand", "Size"]).sum()
                 .groupby("Brand").apply(lambda x: x / float(x.sum())))

    combined = pd.concat([stock_mix, sales_mix], axis=1,
                         join="outer").fillna(0)

    try:
        combined["gap"] = combined["Inventory"] - combined["Sales"]
    except KeyError:
        # One of the distributions is empty -- nothing to plot.
        return dict(data=[], layout={})

    # Sum of absolute per-size gaps, aggregated per brand.
    gap_summary = combined.groupby(level=0).agg(lambda s: abs(s).sum())

    x_data, y_data = gap_summary.index, gap_summary["gap"]

    traces = [
        go.Bar(x=x_data,
               y=y_data,
               text=list(map("{:.2f}".format, y_data)),
               textfont={"color": "#fff"},
               textposition='auto',
               marker=dict(color='rgb(255, 125, 0)'))
    ]

    layout = default_graph_layout()
    layout['title'] = "Size Gap per Brand"

    return dict(data=traces, layout=layout)
Example #8
0
def sales_history(sales, sizes, brands, month_slider, frequency, relative):
    """Build a stacked bar chart of sales over time, split by size."""

    sales = filter_data(sales,
                        filter_many={"Brand": brands},
                        month_slider=month_slider)
    grouped = sales.groupby([pd.Grouper(freq=frequency), 'Size']).sum()
    pivoted = grouped.reset_index().pivot(index="order_date",
                                          columns="Size",
                                          values="Quantity")

    if 'True' in relative:
        # Normalise each period so sizes are shown as fractions of the total.
        pivoted = pivoted.apply(lambda row: row / row.sum(), axis=1)

    traces = []
    for size in pivoted.columns:
        traces.append(go.Bar(x=pivoted.index, y=pivoted[size], name=size))

    layout = default_graph_layout()
    layout["barmode"] = 'stack'
    layout["title"] = "Sales History Including Size Distribution"

    return dict(data=traces, layout=layout)
Example #9
0
from scipy import stats

# 2019.11.01 This script demonstrates that the timing data follows a
# Gaussian distribution.
# Open questions: how to loop while skipping head/tail items, how to modify
# the source data in place instead of copying, how to transform every list
# element, and how to filter.

all_data = tools.old_read_data()  # all_data[user_id][feature 0..30] = list of samples for that feature
print("load all_data success.")

# Hand-picked (user, feature) pairs with fairly regular data, plotted for the paper.
examples = [("s003", 21), ("s004", 6), ("s008", 24), ("s002", 22),
            ("s007", 25), ("s010", 1)]
for example in examples:
    key = example[0]
    column = example[1]
    data_list = all_data[key][column]
    x = tools.filter_data(data_list, 3)  # filter outliers from the data
    mean = x.mean()
    std = x.std()
    # BUG FIX: this line was corrupted in the original source
    # (print("user="******" column=" ...) -- a syntax error; the intended
    # concatenated log message has been reconstructed.
    print("user=" + str(key) + " column=" + str(column) + " mean=" +
          str(mean) + " std=" + str(std) + " first=" + str(data_list[0]))
    n, bins, patches = plt.hist(x, bins=80, color='r', alpha=0.5,
                                density=True)  # histogram; alpha controls transparency
    plt.xlabel('time/s')
    plt.ylabel('normed frequency')
    # plt.title('histogram')
    # Overlay the fitted normal density on the histogram bins.
    y = stats.norm.pdf(bins, mean, std)
    plt.plot(bins, y, color='g', linewidth=1)
    plt.show()

# # iterate to inspect the data
# count = 0
Example #10
0
def mistrust_index(year=2018):
    """Score each declarant on heuristic "mistrust" criteria for *year*.

    Five red-flag heuristics, each adding 1 to a person's score:
      1. savings at least 5x the total income (or savings with zero income);
      2. relatives' income at least 5x the personal income (or relatives'
         income with zero personal income);
      3. zero total income;
      4. income below 1M RUB but more than 500 sq.m of real estate;
      5. ownership of a luxury-brand car.

    Parameters:
        year: declaration year to analyse (default 2018).

    Returns:
        defaultdict(int) mapping person_id -> accumulated score.
    """
    json_data = tools.get_json()
    json_data = tools.filter_data(json_data,
                                  lambda e: e['main']['year'] == year)

    # Luxury brand/model reference list. Hoisted out of the per-entry loop:
    # the original rebuilt this literal for every declaration. An item
    # without a 'name' key flags the whole brand. BUG FIX: the 7-series
    # entry said 'BWM' -- an obvious typo for 'BMW'.
    lux_cars = [
        {'parent_name': 'BMW', 'name': '3 series'},
        {'parent_name': 'BMW', 'name': '5 series'},
        {'parent_name': 'BMW', 'name': '7 series'},
        {'parent_name': 'Acura', 'name': 'Acura'},
        {'parent_name': 'Audi', 'name': 'A4'},
        {'parent_name': 'Audi', 'name': 'A6'},
        {'parent_name': 'Audi', 'name': 'A7'},
        {'parent_name': 'Audi', 'name': 'A8'},
        {'parent_name': 'Alfa Romeo', 'name': 'Giulietta'},
        {'parent_name': 'Bentley'},
        {'parent_name': 'Cadillac'},
        {'parent_name': 'Ferrari'},
        {'parent_name': 'Hummer'},
        # NOTE(review): 'Infinity' is likely meant to be 'Infiniti' --
        # confirm against the `carbrand` reference table before changing.
        {'parent_name': 'Infinity'},
        {'parent_name': 'Jaguar'},
        {'parent_name': 'Lamborghini'},
        {'parent_name': 'Land Rover'},
        {'parent_name': 'Lexus'},
        {'parent_name': 'Maserati'},
        {'parent_name': 'Mercedes-Benz', 'name': 'C-класс'},
        {'parent_name': 'Mercedes-Benz', 'name': 'E-класс'},
        {'parent_name': 'Mercedes-Benz', 'name': 'GL-класс'},
        {'parent_name': 'Mercedes-Benz', 'name': 'S-класс'},
        {'parent_name': 'Porsche'},
        {'parent_name': 'Rolls-Royce'},
        {'parent_name': 'Saab', 'name': '9-3'},
        {'parent_name': 'Saab', 'name': '9-5'},
        {'parent_name': 'Volkswagen', 'name': 'Phaeton'},
        {'parent_name': 'Volvo', 'name': 'S60'},
        {'parent_name': 'Volvo', 'name': 'S80'},
    ]

    rating = defaultdict(int)
    scored = defaultdict(int)  # how many declarations triggered each criterion

    for entry in json_data:
        person_id = entry['main']['person']['id']
        score = 0

        # 1: total income is low, but savings are huge
        income = sum(e['size'] for e in entry['incomes'])
        savings = 0
        for s in entry['savings']:
            # Savings are strings like "1 234,56 руб."; strip spaces and
            # switch the decimal comma to a dot before parsing.
            size = float(s.split('руб.')[0].replace(' ', '').replace(',', '.'))
            savings += size
        if (income > 0 and savings / income >= 5.0) or (income == 0
                                                        and savings > 0):
            score += 1
            scored[1] += 1

        # 2: personal income is low, but the relatives' income is huge
        # ('relative' is None for the declarant's own income entries)
        income_self = sum(e['size'] for e in entry['incomes']
                          if e['relative'] is None)
        income_rel = sum(e['size'] for e in entry['incomes']
                         if e['relative'] is not None)
        if (income_self > 0
                and income_rel / income_self >= 5.0) or (income_self == 0
                                                         and income_rel > 0):
            score += 1
            scored[2] += 1

        # 3: zero total income (can be due to an incorrectly submitted
        # declaration, but still not good)
        if income == 0:
            score += 1
            scored[3] += 1

        # 4: low income, but owns a lot in real estate
        estates_area = 0
        for estate in entry['real_estates']:  # shall we exclude relatives?
            if not estate['square']:
                continue
            total = float(estate['square'])
            if estate['share']:
                # Count only the declarant's owned share of the area.
                total *= float(estate['share'])
            estates_area += total
        if income / 1000000.0 < 1.0 and estates_area > 500.0:
            score += 1
            scored[4] += 1

        # 5: luxury cars -- match declared vehicles against lux_cars via the
        # external `carbrand` reference table (matched on parent_name, an
        # optional model name, and the brand id).
        has_lux = 0
        for vehicle in entry['vehicles']:
            if not vehicle['brand']:
                continue
            for item in lux_cars:
                for brand in carbrand:
                    if brand['parent_name'] != item['parent_name']:
                        continue
                    if 'name' in item and brand['name'] != item['name']:
                        continue
                    if brand['id'] == vehicle['brand']['id']:
                        has_lux = 1

        score += has_lux
        scored[5] += has_lux

        rating[person_id] += score

    return rating
Example #11
0
def request_merapi_mseedreq(config, tStart, duration, verbatim=0):
    """ Request Merapi function.
    Gets the signature only, not full signal
    /!\ Signature of this function should not be modified and is similar for all applications (i,e, request_XXX)
    INPUT:
    - config: config dictionnary according to project formating
    - tStart: datetime object
    - duration: in sec
    - verbatim
    OUTPUT:
    - signature: numpy array containing the signal read
    - fs: sampling_rate
    NOTE(review): despite the order above, the function returns
    (fs, signature); on any failure it returns (0, []).
    """

    try:
        # Split the start time into components for URL formatting.
        year_s = tStart.year
        month_s = tStart.month
        day_s = tStart.day
        hour_s = tStart.hour
        minute_s = tStart.minute
        second_s = tStart.second

        #using mseedreq
        if config.data_to_analyze['reading_arguments'][
                'request'] == 'mseedreq':
            # mseedreq expects "YYYY,MM,DD,hh,mm,ss" plus a duration in sec.
            t1 = str(year_s).zfill(4) + "," + str(month_s).zfill(
                2) + "," + str(day_s).zfill(2) + "," + str(hour_s).zfill(
                    2) + "," + str(minute_s).zfill(2) + "," + str(
                        second_s).zfill(2)
            streams = config.data_to_analyze['reading_arguments'][
                'network'] + "." + config.data_to_analyze['reading_arguments'][
                    'station'] + "." + config.data_to_analyze[
                        'reading_arguments'][
                            'location'] + "." + config.data_to_analyze[
                                'reading_arguments']['channel']
            # NOTE(review): host is hard-coded to localhost -- confirm this is
            # intentional for all deployments.
            url = "http://localhost/cgi-bin/mseedreq.pl?all=2&s3=SEFRAN3&streams=" + streams + "&t1=" + t1 + "&ds=" + str(
                int(duration))

        #using  fdsn
        if config.data_to_analyze['reading_arguments']['request'] == 'fdsn':
            # fdsnws expects ISO-like start/end timestamps instead.
            tEnd = tStart + timedelta(seconds=duration)
            year_e = tEnd.year
            month_e = tEnd.month
            day_e = tEnd.day
            hour_e = tEnd.hour
            minute_e = tEnd.minute
            second_e = tEnd.second
            start = str(year_s).zfill(4) + "-" + str(month_s).zfill(
                2) + "-" + str(day_s).zfill(2) + "T" + str(hour_s).zfill(
                    2) + ":" + str(minute_s).zfill(2) + ":" + str(
                        second_s).zfill(2)
            end = str(year_e).zfill(4) + "-" + str(month_e).zfill(
                2) + "-" + str(day_e).zfill(2) + "T" + str(hour_e).zfill(
                    2) + ":" + str(minute_e).zfill(2) + ":" + str(
                        second_e).zfill(2)
            stream = 'net=' + config.data_to_analyze['reading_arguments'][
                'network'] + '&sta=' + config.data_to_analyze[
                    'reading_arguments'][
                        'station'] + '&loc=' + config.data_to_analyze[
                            'reading_arguments'][
                                'location'] + '&cha=' + config.data_to_analyze[
                                    'reading_arguments']['channel']
            url = 'http://localhost:8080/fdsnws/dataselect/1/query?' + stream + '&start=' + start + '&end=' + end + '&nodata=404'

        # NOTE(review): if 'request' is neither 'mseedreq' nor 'fdsn', `url`
        # is never assigned; the NameError below is swallowed by the except
        # and the function returns (0, []).
        #if verbatim > 1:
        print("Query:", url)

        r = requests.get(url, auth=("wo", ""))
    except Exception as inst:
        print('Impossible to send request to client ')
        print('--', inst)
        return 0, []

    try:
        # Dump the miniSEED response to a temp file, then parse it back.
        filepath = config.general['project_root'] + config.application[
            'name'].upper(
            ) + config.data_to_analyze['reading_arguments']['tmpfilepath']
        print(r.status_code)
        if r.status_code == 200:
            with open(filepath, 'wb') as f:
                f.write(r.content)
        st = read(filepath)

    except Exception as inst:
        print('Reading not possible for data: ', tStart, duration)
        print('--', inst)
        return 0, []

    # Use the first trace of the returned stream.
    signature = st[0].data
    fs = st[0].stats['sampling_rate']

    # SECURITY: eval() on a config string executes arbitrary code -- safe
    # only if the config file is fully trusted.
    if eval(config.data_to_analyze['reading_arguments']['filtering']):
        signature = filter_data(
            signature, fs,
            config.data_to_analyze['reading_arguments']['filtering_frequency'])

    return fs, signature