def income_overall_analysis(year=2018):
    # Requires module-level: numpy as np, matplotlib.pyplot as plt, tools,
    # office_id2name, abbreviate_name.
    json_data = tools.get_json()
    json_data = tools.filter_data(json_data,
                                  lambda e: e['main']['year'] == year)

    # offices of interest
    # offices = [5, 4, 191, 4199, 482, 5963, 1397, 5953, 598, 979, 14, 607, 450]
    offices = [
        1, 3, 4, 5, 7, 14, 15, 17, 113, 146, 449, 450, 453, 456, 461, 467,
        594, 595, 596
    ]
    json_data = tools.filter_data(
        json_data, lambda e: int(e['main']['office']['id']) in offices)

    income = np.array([],
                      dtype=[('person_id', 'i8'), ('office_id', 'i8'),
                             ('income_self', 'float64'),
                             ('income_rel', 'float64'),
                             ('income_total', 'float64')])
    for entry in json_data:
        person_id = int(entry['main']['person']['id'])
        office_id = int(entry['main']['office']['id'])
        income_self = sum(
            [e['size'] for e in entry['incomes'] if e['relative'] is None])
        income_rel = sum(
            [e['size'] for e in entry['incomes'] if e['relative'] is not None])
        income = np.append(
            income,
            np.array([(person_id, office_id, income_self, income_rel,
                       income_self + income_rel)],
                     dtype=income.dtype))

    # List offices and abbreviations
    for o in offices:
        print(o, office_id2name[str(o)],
              abbreviate_name(office_id2name[str(o)]))

    # Plot 1: income in each office of interest
    N = len(offices)
    dx = []
    dy = []
    for x in range(N):
        income_slice = np.array(
            [item for item in income if int(item['office_id']) == offices[x]],
            dtype=income.dtype)
        for y in income_slice:
            # plot personal income, skipping entries where the relatives'
            # income exceeds 10 mln RUB
            if y['income_rel'] / 1000000.0 < 10:
                dx.append(x)
                dy.append(y['income_self'] / 1000000.0)
    plt.scatter(x=dx, y=dy, s=5)
    plt.ylabel('income (self), millions of RUB')
    plt.xticks(range(N),
               [abbreviate_name(office_id2name[str(o)]) for o in offices])
    plt.show()

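# --- Illustrative sketch (not from the original project) ---------------------
# income_overall_analysis() grows its structured array with np.append inside
# the loop, which copies the whole array on every row.  Collecting plain
# tuples and building the array once gives the same result; the helper name
# below is hypothetical.
import numpy as np

INCOME_DTYPE = [('person_id', 'i8'), ('office_id', 'i8'),
                ('income_self', 'float64'), ('income_rel', 'float64'),
                ('income_total', 'float64')]


def build_income_array(rows):
    """rows: iterable of (person_id, office_id, income_self, income_rel)."""
    return np.array([(p, o, s, r, s + r) for p, o, s, r in rows],
                    dtype=INCOME_DTYPE)


# e.g. build_income_array([(1, 14, 1200000.0, 300000.0)])
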
def income_clustering(year):
    # Requires module-level: pandas as pd, seaborn as sns,
    # matplotlib.pyplot as plt, hdbscan, tools, person_id2name.
    json_data = tools.get_json()
    json_data = tools.filter_data(json_data,
                                  lambda e: e['main']['year'] == year)
    json_data = tools.filter_data(json_data,
                                  lambda e: e['main']['office']['id'] == 14)

    data = []
    persons = []
    for entry in json_data:
        person_id = entry['main']['person']['id']
        income_self = sum(
            [e['size'] for e in entry['incomes'] if e['relative'] is None])
        income_rel = sum(
            [e['size'] for e in entry['incomes'] if e['relative'] is not None])
        data.append([income_self, income_rel])
        persons.append(person_id)

    dframe = pd.DataFrame(data, index=persons)
    dframe.columns = ['income_self', 'income_rel']
    print(dframe)

    classifier = hdbscan.HDBSCAN(min_cluster_size=5).fit(dframe)

    classified = []
    for label in set(filter(lambda x: x >= 0, classifier.labels_)):
        print('Cluster label: ', label)
        ids = [i for i, x in enumerate(classifier.labels_) if x == label]
        for i in ids:
            print(persons[i], person_id2name[str(persons[i])], data[i])
            # keep the membership probability with the point so the colours
            # below stay aligned with the selected (clustered) points
            classified.append((label, persons[i], data[i][0], data[i][1],
                               classifier.probabilities_[i]))
        print('\n')

    colour_palette = sns.color_palette('deep', 20)
    cluster_colours = [colour_palette[x[0]] for x in classified]
    cluster_member_colours = [
        sns.desaturate(c, x[4]) for c, x in zip(cluster_colours, classified)
    ]
    dx = [x[2] for x in classified]
    dy = [x[3] for x in classified]
    plt.scatter(x=dx,
                y=dy,
                s=10,
                linewidth=0,
                c=cluster_member_colours,
                alpha=1)
    plt.show()

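# --- Minimal HDBSCAN sketch on synthetic data (illustrative only) -------------
# Same clustering step as income_clustering(), on a toy two-column income
# frame; points labelled -1 are noise, exactly as filtered out above.
import numpy as np
import pandas as pd
import hdbscan

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'income_self': np.concatenate(
        [rng.normal(1.0e6, 1.0e5, 30), rng.normal(5.0e6, 2.0e5, 30)]),
    'income_rel': np.concatenate(
        [rng.normal(5.0e5, 5.0e4, 30), rng.normal(2.0e6, 1.0e5, 30)]),
})
toy_labels = hdbscan.HDBSCAN(min_cluster_size=5).fit_predict(toy)
# print(pd.Series(toy_labels).value_counts())
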
def inventory_history(inventory, categories, brands, month_slider, frequency,
                      relative):
    inventory = inventory[(inventory["Sales"] == 0)
                          & (inventory["NetQuantity"] > 0)]
    inventory = filter_data(inventory,
                            filter_many={"Brand": brands},
                            month_slider=month_slider)
    inventory = inventory.groupby([pd.Grouper(freq=frequency),
                                   'Size']).sum().reset_index()
    inventory = inventory.pivot(index="posting_date",
                                columns="Size",
                                values="Quantity")  # , 'Quantity Returned'

    if 'True' in relative:
        inventory = inventory.apply(lambda s: s / s.sum(), axis=1)

    traces = [
        go.Bar(x=inventory.index, y=inventory[category], name=category)
        for category in inventory.columns
    ]
    layout = default_graph_layout()
    layout["barmode"] = 'stack'
    layout["title"] = "Inventory History Including Size Distribution"
    return dict(data=traces, layout=layout)

def update_brand_text(data, sizes, brands, month_slider):
    dff = filter_data(data,
                      filter_many={
                          "Brand": brands,
                          "Size": sizes
                      },
                      month_slider=month_slider)
    return "No of Products: {}".format(dff.shape[0])

def request_merapi(config, tStart, duration, verbatim=0):
    r"""
    Request Merapi function. Gets the signature only, not the full signal.
    /!\ The signature of this function should not be modified and is similar
    for all applications (i.e. request_XXX).

    INPUT:
    - config: config dictionary following the project formatting
    - tStart: datetime object
    - duration: in sec
    - verbatim

    OUTPUT:
    - fs: sampling rate
    - signature: numpy array containing the signal read
    """
    debug = verbatim > 1

    try:
        client = Client(
            user=config.data_to_analyze['reading_arguments']['user'],
            host=config.data_to_analyze['reading_arguments']['host'],
            port=config.data_to_analyze['reading_arguments']['port'],
            debug=debug)
    except Exception as inst:
        print('Impossible to reach client ')
        print('--', inst)
        return 0, []

    delta_t = eval(config.data_to_analyze['reading_arguments']['delta_t'])
    t = UTCDateTime(tStart)
    try:
        st = client.get_waveforms(
            config.data_to_analyze['reading_arguments']['network'],
            config.data_to_analyze['reading_arguments']['station'],
            config.data_to_analyze['reading_arguments']['location'],
            config.data_to_analyze['reading_arguments']['channel'],
            t - delta_t,
            min(t + duration,
                t + config.data_to_analyze['reading_arguments']['max_duration']))
    except Exception as inst:
        print('Reading not possible for data: ', t, duration)
        print('--', inst)
        return 0, []

    signature = st[0].data
    fs = st[0].stats['sampling_rate']
    if eval(config.data_to_analyze['reading_arguments']['filtering']):
        signature = filter_data(
            signature, fs,
            config.data_to_analyze['reading_arguments']['filtering_frequency'])

    return fs, signature

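# --- Hypothetical filter_data sketch (assumption, not the project's code) -----
# filter_data(signature, fs, filtering_frequency) is called above but not
# defined in this excerpt.  Given that call signature, a plausible stand-in is
# a zero-phase Butterworth high-pass; the real implementation may differ.
import numpy as np
from scipy import signal


def highpass_filter_sketch(x, fs, freq, order=4):
    sos = signal.butter(order, freq, btype='highpass', fs=fs, output='sos')
    return signal.sosfiltfilt(sos, np.asarray(x, dtype=float))
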
def pie_graph(sales, sizes, brands, month_slider, relative_selector):
    sales = filter_data(sales,
                        filter_many={'Brand': brands},
                        month_slider=month_slider)

    brand_group = sales.groupby('Brand').sum()
    size_group = sales.groupby('Size').sum()

    if "True" in relative_selector:
        brand_group = brand_group.apply(lambda x: x / x.sum(), axis=1)
        size_group = size_group.apply(lambda x: x / x.sum(), axis=1)

    data = [
        dict(
            type='pie',
            labels=list(brand_group.index),
            values=list(brand_group['Quantity'].values),
            name='Brand Breakdown',
            # text=['Total Gas Produced (mcf)', 'Total Oil Produced (bbl)', 'Total Water Produced (bbl)'],  # noqa: E501
            hoverinfo="value+percent",
            textinfo="label+percent+name",
            hole=0.5,
            # marker=dict(
            #     colors=['#fac1b7', '#a9bb95', '#92d8d8']
            # ),
            domain={
                "x": [0, .45],
                'y': [0.2, 0.8]
            },
        ),
        dict(
            type='pie',
            labels=list(size_group.index),
            values=list(size_group['Quantity'].values),
            name='Size Breakdown',
            hoverinfo="label+text+value+percent",
            textinfo="label+percent+name",
            hole=0.5,
            # marker=dict(
            #     colors=[WELL_COLORS[i] for i in aggregate.index]
            # ),
            domain={
                "x": [0.55, 1],
                'y': [0.2, 0.8]
            },
        )
    ]

    layout = default_graph_layout()
    layout['title'] = "Sales Repartition by Brand"
    layout['font'] = dict(color='#777777')
    layout['legend'] = dict(font=dict(color='#CCCCCC', size='10'),
                            orientation='h',
                            bgcolor='rgba(0,0,0,0)')
    return dict(data=data, layout=layout)

def size_gap(inventory, sizes, brands, month_slider):
    inventory = filter_data(inventory,
                            filter_many={"Brand": brands},
                            month_slider=month_slider)
    stocks = inventory[(inventory["Sales"] == 0)
                       & (inventory["NetQuantity"] > 0)]
    sales = inventory[inventory["Sales"] != 0]

    # Per brand: the share of each size in stock vs. the share of each size in
    # sales; the gap is the summed absolute difference between the two.
    matenboog_stock = (stocks.reset_index()[["Brand", "Size", "NetQuantity"]]
                       .groupby(["Brand", "Size"]).sum()
                       .groupby("Brand").apply(lambda x: x / float(x.sum()))
                       .rename(columns={"NetQuantity": "Inventory"}))
    matenboog_sales = (sales.reset_index()[["Brand", "Size", "Sales"]]
                       .groupby(["Brand", "Size"]).sum()
                       .groupby("Brand").apply(lambda x: x / float(x.sum())))
    matenboog = pd.concat([matenboog_stock, matenboog_sales],
                          axis=1,
                          join="outer").fillna(0)

    try:
        matenboog["gap"] = matenboog["Inventory"] - matenboog["Sales"]
    except KeyError:
        return dict(data=[], layout={})

    gap_summary = matenboog.groupby(level=0).agg(lambda s: abs(s).sum())
    x_data, y_data = gap_summary.index, gap_summary["gap"]

    traces = [
        go.Bar(x=x_data,
               y=y_data,
               text=list(map("{:.2f}".format, y_data)),
               textfont={"color": "#fff"},
               textposition='auto',
               marker=dict(color='rgb(255, 125, 0)'))
    ]
    layout = default_graph_layout()
    layout['title'] = "Size Gap per Brand"
    return dict(data=traces, layout=layout)

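# --- Illustrative size-gap arithmetic on toy data (not the dashboard code) ----
# size_gap() measures, per brand, the summed absolute difference between the
# size distribution of what is in stock and the size distribution of what was
# sold.  A toy version of that computation:
import pandas as pd

idx = pd.MultiIndex.from_tuples([('A', 'S'), ('A', 'M'), ('A', 'L')],
                                names=['Brand', 'Size'])
stock_share = pd.Series([0.5, 0.3, 0.2], index=idx)
sales_share = pd.Series([0.2, 0.5, 0.3], index=idx)
gap = (stock_share - sales_share).abs().groupby(level='Brand').sum()
# gap['A'] == 0.6: brand A's stock is skewed towards size S relative to sales
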
def sales_history(sales, sizes, brands, month_slider, frequency, relative):
    sales = filter_data(sales,
                        filter_many={"Brand": brands},
                        month_slider=month_slider)
    sales = sales.groupby([pd.Grouper(freq=frequency),
                           'Size']).sum().reset_index()
    sales = sales.pivot(index="order_date", columns="Size",
                        values="Quantity")  # , 'Quantity Returned'

    if 'True' in relative:
        sales = sales.apply(lambda s: s / s.sum(), axis=1)

    traces = [
        go.Bar(x=sales.index, y=sales[category], name=category)
        for category in sales.columns
    ]
    layout = default_graph_layout()
    layout["barmode"] = 'stack'
    layout["title"] = "Sales History Including Size Distribution"
    return dict(data=traces, layout=layout)

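# --- Illustrative Grouper/pivot sketch on synthetic data ----------------------
# The same resample-and-pivot pattern used by sales_history() and
# inventory_history(): group a datetime-indexed frame by period and by Size,
# then pivot so each Size becomes one stacked-bar trace; the row-wise shares
# correspond to the 'relative' option.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
idx = pd.date_range('2019-01-01', periods=90, freq='D').rename('order_date')
toy = pd.DataFrame({'Size': rng.choice(['S', 'M', 'L'], size=90),
                    'Quantity': rng.integers(1, 10, size=90)},
                   index=idx)
weekly = (toy.groupby([pd.Grouper(freq='W'), 'Size'])['Quantity'].sum()
             .reset_index()
             .pivot(index='order_date', columns='Size', values='Quantity'))
weekly_shares = weekly.apply(lambda s: s / s.sum(), axis=1)
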
from scipy import stats

# 2019.11.01  This script is mainly used to show that the data follow a
# Gaussian distribution.
# Programming questions: how to start the loop from the middle and drop the
# head/tail, how to modify the source data in the loop rather than a copy, how
# to transform every element of a list, how to filter.
# (tools and matplotlib.pyplot as plt are assumed to be imported above this
# excerpt.)

all_data = tools.old_read_data()  # all_data[userid][31 feature dims] = [all samples of that dim]
print("load all_data success.")

# Plot some fairly regular data directly, for use in the paper
examples = [("s003", 21), ("s004", 6), ("s008", 24), ("s002", 22),
            ("s007", 25), ("s010", 1)]
for example in examples:
    key = example[0]
    column = example[1]
    data_list = all_data[key][column]
    x = tools.filter_data(data_list, 3)  # filter the data
    mean = x.mean()
    std = x.std()
    print("user=" + key + " column=" + str(column) + " mean=" + str(mean) +
          " std=" + str(std) + " first=" + str(data_list[0]))
    # histogram; alpha controls colour transparency
    n, bins, patches = plt.hist(x, bins=80, color='r', alpha=0.5, density=True)
    plt.xlabel('time/s')
    plt.ylabel('normed frequency')
    # plt.title('histogram')
    y = stats.norm.pdf(bins, mean, std)
    plt.plot(bins, y, color='g', linewidth=1)
    plt.show()

# # iterate through the data
# count = 0

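# --- Hypothetical tools.filter_data sketch -------------------------------------
# tools.filter_data(data_list, 3) is not defined in this excerpt.  Judging from
# the comment above and the argument 3, a plausible reading is a 3-sigma
# outlier filter that returns a numpy array; the real implementation may
# differ.
import numpy as np


def sigma_filter_sketch(values, n_sigma=3):
    x = np.asarray(values, dtype=float)
    keep = np.abs(x - x.mean()) <= n_sigma * x.std()
    return x[keep]
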
def mistrust_index(year=2018):
    # Assigns each person a heuristic mistrust score for the given year, based
    # on simple red flags in their declaration.
    # Requires module-level: tools, carbrand (car brand reference table) and
    # collections.defaultdict.
    json_data = tools.get_json()
    json_data = tools.filter_data(json_data,
                                  lambda e: e['main']['year'] == year)

    rating = defaultdict(int)
    scored = defaultdict(int)  # scored[k]: declarations that triggered rule k
    for entry in json_data:
        person_id = entry['main']['person']['id']
        score = 0

        # 1: total income is low, but savings are huge
        income = sum([e['size'] for e in entry['incomes']])
        savings = 0
        for s in entry['savings']:
            size = float(s.split('руб.')[0].replace(' ', '').replace(',', '.'))
            savings += size
        if (income > 0 and savings / income >= 5.0) or (income == 0
                                                        and savings > 0):
            score += 1
            scored[1] += 1

        # 2: personal income is low, but the relatives' income is huge
        income_self = sum(
            [e['size'] for e in entry['incomes'] if e['relative'] is None])
        income_rel = sum(
            [e['size'] for e in entry['incomes'] if e['relative'] is not None])
        if (income_self > 0 and income_rel / income_self >= 5.0) or (
                income_self == 0 and income_rel > 0):
            score += 1
            scored[2] += 1

        # 3: zero total income (can be due to an incorrectly submitted
        # declaration, but still not good)
        if income == 0:
            score += 1
            scored[3] += 1

        # 4: low income, but owns a lot of real estate
        estates_area = 0
        for estate in entry['real_estates']:
            # shall we exclude relatives?
            if not estate['square']:
                continue
            total = float(estate['square'])
            if estate['share']:
                total *= float(estate['share'])
            estates_area += total
        if income / 1000000.0 < 1.0 and estates_area > 500.0:
            score += 1
            scored[4] += 1

        # 5: lux cars
        lux_cars = [
            {'parent_name': 'BMW', 'name': '3 series'},
            {'parent_name': 'BMW', 'name': '5 series'},
            {'parent_name': 'BMW', 'name': '7 series'},
            {'parent_name': 'Acura', 'name': 'Acura'},
            {'parent_name': 'Audi', 'name': 'A4'},
            {'parent_name': 'Audi', 'name': 'A6'},
            {'parent_name': 'Audi', 'name': 'A7'},
            {'parent_name': 'Audi', 'name': 'A8'},
            {'parent_name': 'Alfa Romeo', 'name': 'Giulietta'},
            {'parent_name': 'Bentley'},
            {'parent_name': 'Cadillac'},
            {'parent_name': 'Ferrari'},
            {'parent_name': 'Hummer'},
            {'parent_name': 'Infinity'},
            {'parent_name': 'Jaguar'},
            {'parent_name': 'Lamborghini'},
            {'parent_name': 'Land Rover'},
            {'parent_name': 'Lexus'},
            {'parent_name': 'Maserati'},
            {'parent_name': 'Mercedes-Benz', 'name': 'C-класс'},
            {'parent_name': 'Mercedes-Benz', 'name': 'E-класс'},
            {'parent_name': 'Mercedes-Benz', 'name': 'GL-класс'},
            {'parent_name': 'Mercedes-Benz', 'name': 'S-класс'},
            {'parent_name': 'Porsche'},
            {'parent_name': 'Rolls-Royce'},
            {'parent_name': 'Saab', 'name': '9-3'},
            {'parent_name': 'Saab', 'name': '9-5'},
            {'parent_name': 'Volkswagen', 'name': 'Phaeton'},
            {'parent_name': 'Volvo', 'name': 'S60'},
            {'parent_name': 'Volvo', 'name': 'S80'},
        ]
        has_lux = 0
        for vehicle in entry['vehicles']:
            if not vehicle['brand']:
                continue
            for item in lux_cars:
                if 'name' in item:
                    # model-level match: brand, model name and id must agree
                    for brand in carbrand:
                        if (brand['parent_name'] == item['parent_name']
                                and brand['name'] == item['name']
                                and brand['id'] == vehicle['brand']['id']):
                            has_lux = 1
                else:
                    # brand-level match: any model of this make counts
                    for brand in carbrand:
                        if (brand['parent_name'] == item['parent_name']
                                and brand['id'] == vehicle['brand']['id']):
                            has_lux = 1
        score += has_lux
        scored[5] += has_lux

        rating[person_id] += score

    return rating

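# --- Usage sketch (illustrative; relies on the same module-level lookups) -----
# One way the returned rating could be consumed: print the people with the
# highest mistrust score.  person_id2name is the module-level lookup already
# used by income_clustering() above.
#
#   rating = mistrust_index(year=2018)
#   worst = sorted(rating.items(), key=lambda kv: kv[1], reverse=True)[:20]
#   for person_id, score in worst:
#       print(score, person_id2name.get(str(person_id), person_id))
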
def request_merapi_mseedreq(config, tStart, duration, verbatim=0):
    r"""
    Request Merapi function. Gets the signature only, not the full signal.
    /!\ The signature of this function should not be modified and is similar
    for all applications (i.e. request_XXX).

    INPUT:
    - config: config dictionary following the project formatting
    - tStart: datetime object
    - duration: in sec
    - verbatim

    OUTPUT:
    - fs: sampling rate
    - signature: numpy array containing the signal read
    """
    try:
        request_type = config.data_to_analyze['reading_arguments']['request']

        # using mseedreq
        if request_type == 'mseedreq':
            t1 = tStart.strftime('%Y,%m,%d,%H,%M,%S')
            streams = (
                config.data_to_analyze['reading_arguments']['network'] + "." +
                config.data_to_analyze['reading_arguments']['station'] + "." +
                config.data_to_analyze['reading_arguments']['location'] + "." +
                config.data_to_analyze['reading_arguments']['channel'])
            url = ("http://localhost/cgi-bin/mseedreq.pl?all=2&s3=SEFRAN3"
                   "&streams=" + streams + "&t1=" + t1 + "&ds=" +
                   str(int(duration)))

        # using fdsn
        if request_type == 'fdsn':
            tEnd = tStart + timedelta(seconds=duration)
            start = tStart.strftime('%Y-%m-%dT%H:%M:%S')
            end = tEnd.strftime('%Y-%m-%dT%H:%M:%S')
            stream = (
                'net=' +
                config.data_to_analyze['reading_arguments']['network'] +
                '&sta=' +
                config.data_to_analyze['reading_arguments']['station'] +
                '&loc=' +
                config.data_to_analyze['reading_arguments']['location'] +
                '&cha=' +
                config.data_to_analyze['reading_arguments']['channel'])
            url = ('http://localhost:8080/fdsnws/dataselect/1/query?' + stream +
                   '&start=' + start + '&end=' + end + '&nodata=404')

        # if verbatim > 1:
        print("Query:", url)
        r = requests.get(url, auth=("wo", ""))
    except Exception as inst:
        print('Impossible to send request to client ')
        print('--', inst)
        return 0, []

    try:
        filepath = (config.general['project_root'] +
                    config.application['name'].upper() +
                    config.data_to_analyze['reading_arguments']['tmpfilepath'])
        print(r.status_code)
        if r.status_code == 200:
            with open(filepath, 'wb') as f:
                f.write(r.content)
        st = read(filepath)
    except Exception as inst:
        print('Reading not possible for data: ', tStart, duration)
        print('--', inst)
        return 0, []

    signature = st[0].data
    fs = st[0].stats['sampling_rate']
    if eval(config.data_to_analyze['reading_arguments']['filtering']):
        signature = filter_data(
            signature, fs,
            config.data_to_analyze['reading_arguments']['filtering_frequency'])

    return fs, signature