def squash_segments(seg_pset):
    """Combine contiguous segments."""
    curr_chrom = None
    curr_start = None
    curr_end = None
    curr_genes = []
    curr_val = None
    curr_cnt = 0

    squashed_rows = []
    for row in seg_pset:
        if row.chromosome == curr_chrom and row.log2 == curr_val:
            # Continue the current segment
            curr_end = row.end
            curr_genes.append(row.gene)
            curr_cnt += 1
        else:
            # Segment break -- finish the current segment
            if curr_cnt:
                squashed_rows.append((curr_chrom, curr_start, curr_end,
                                      ",".join(pd.unique(curr_genes)),
                                      curr_val, curr_cnt))
            # Start a new segment, keeping this row's gene so it is not dropped
            curr_chrom = row.chromosome
            curr_start = row.start
            curr_end = row.end
            curr_genes = [row.gene]
            curr_val = row.log2
            curr_cnt = 1
    # Remainder
    squashed_rows.append((curr_chrom, curr_start, curr_end,
                          ",".join(pd.unique(curr_genes)),
                          curr_val, curr_cnt))
    return seg_pset.as_rows(squashed_rows)
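# Why pd.unique rather than set() or np.unique for the joined gene names: pd.unique keeps
# first-seen order, so a squashed segment lists its genes in the order they were appended.
# Minimal illustration with toy gene names (not taken from any real segment set):
import numpy as np
import pandas as pd

genes = ["TP53", "EGFR", "TP53", "BRCA1", "EGFR"]
print(",".join(pd.unique(genes)))   # TP53,EGFR,BRCA1 -- first-appearance order kept
print(",".join(np.unique(genes)))   # BRCA1,EGFR,TP53 -- sorted, original order lost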
def calculate_average_all_pair_distance(csv_file, hasConsensus=True):
    # df_in is the output csv from the median_swc plugin; it contains unique pair-wise distances.
    # The consensus is usually the last input to median_swc(), so it won't show up in the
    # "swc_file_name1" column. Returns the average distances per SWC file.
    df_out = pd.DataFrame()
    if not os.path.exists(csv_file):
        return df_out
    df_f = pd.read_csv(csv_file)
    if df_f.empty:
        return df_out

    # Remove invalid results
    df_in = df_f[df_f['sum_distance'] > 0]
    df_out = pd.DataFrame(columns=['swc_file_name', 'average_sum_distance',
                                   'average_structure_difference', 'average_max_distance'])
    dfg1 = df_in.groupby('swc_file_name1')
    dfg2 = df_in.groupby('swc_file_name2')
    swc_names = pd.unique(df_in['swc_file_name1'])
    swc_names_2 = pd.unique(df_in['swc_file_name2'])

    consensus_file_name = df_in['swc_file_name2'].tail(1).values[0]
    if 'consensus' not in consensus_file_name:
        # print "missing consensus"
        return df_out

    row = 0
    for swc_name in swc_names:
        a = dfg1.get_group(swc_name)
        a = a[a['swc_file_name2'] != consensus_file_name]
        b = pd.DataFrame(columns=['swc_file_name1', 'swc_file_name2', 'sum_distance',
                                  'structure_difference', 'max_distance'])  # empty
        if swc_name in swc_names_2:
            b = dfg2.get_group(swc_name)
        num_of_swcs = len(a) + len(b)
        df_out.loc[row, 'swc_file_name'] = swc_name.split('/')[-1]
        df_out.loc[row, 'average_sum_distance'] = (a['sum_distance'].sum() + b['sum_distance'].sum()) / num_of_swcs
        df_out.loc[row, 'average_structure_difference'] = (a['structure_difference'].sum() + b['structure_difference'].sum()) / num_of_swcs
        df_out.loc[row, 'average_max_distance'] = (a['max_distance'].sum() + b['max_distance'].sum()) / num_of_swcs
        row = row + 1

    df_out.loc[row, 'swc_file_name'] = consensus_file_name.split('/')[-1]
    consensus_group = dfg2.get_group(consensus_file_name)
    df_out.loc[row, 'average_sum_distance'] = consensus_group['sum_distance'].sum() / (num_of_swcs + 1)
    df_out.loc[row, 'average_structure_difference'] = consensus_group['structure_difference'].sum() / (num_of_swcs + 1)
    df_out.loc[row, 'average_max_distance'] = consensus_group['max_distance'].sum() / (num_of_swcs + 1)
    return df_out
def buildTable(self): sub_dict = {str(el):np.random.rand(self.origin_call) for el in pd.unique(self.df['ORIGIN_CALL'])} Data.lookupTable['origin_call'] = sub_dict sub_dict = {str(el):np.random.rand(self.origin_stand) for el in pd.unique(self.df['ORIGIN_STAND'])} Data.lookupTable['origin_stand'] = sub_dict sub_dict = {str(el):np.random.rand(self.taxi_id) for el in pd.unique(self.df['TAXI_ID'])} Data.lookupTable['taxi_id'] = sub_dict sub_dict = {str(el):np.random.rand(self.day_type) for el in pd.unique(self.df['DAY_TYPE'])} Data.lookupTable['day_type'] = sub_dict sub_dict = {str(el):np.random.rand(self.week) for el in range(1,54)} Data.lookupTable['week_of_year'] = sub_dict sub_dict = {str(el):np.random.rand(self.day) for el in range(1,8)} Data.lookupTable['day_of_week'] = sub_dict sub_dict = {str(el):np.random.rand(self.qhour) for el in range(1,100)} Data.lookupTable['qhour_of_day'] = sub_dict
def addMarkerSheet(book,marker,results,kea,markerdata,markernotes,colours,imagePath,validated): '''Adds a sheet with information about a given marker.''' sheet = book.create_sheet() sheet.title = marker writeGroupHeading(sheet,1,'Marker',marker,colours) notes = markernotes.getNotesForMarker(marker) writeMarkerNotes(sheet,1,notes,colours) m = 2 + len(notes) markerImagePath = os.path.join(imagePath,marker+'.jpg') if os.path.isfile(markerImagePath): writeMeltCurves(sheet,m+1,1,markerImagePath) m = m + 19 groups = markerdata.getGroupsForMarker(marker) writeGroupHeading(sheet,m-1,'Results summary','',colours) identifiers,groupings = 'All Samples',['All'] writeSummaryHeadings(sheet,m,identifiers,groups,colours) stats = results.getStatsTable(marker,groupings) grouping = pd.unique(stats[groupings[-1]].ravel()) grouping.sort() stats = stats.set_index(grouping) for c,g in enumerate(grouping): try: row = stats.loc[g] writeSummary(sheet,m+c*3+2,'all',row,marker,groups,markerdata,colours) except: pass m = m+c*3+5 identifiers,groupings = 'Plate',['Plate ID','Plate Name','Plate Label'] writeSummaryHeadings(sheet,m,identifiers,groups,colours) stats = results.getStatsTable(marker,groupings) grouping = pd.unique(stats[groupings[-1]].ravel()) grouping.sort() stats = stats.set_index(grouping) for c,g in enumerate(grouping): try: row = stats.loc[g] writeSummary(sheet,m+c*3+2,False,row,marker,groups,markerdata,colours) except: pass m = m+c*3+5 if kea: identifiers,groupings = 'Population',['Population'] writeSummaryHeadings(sheet,m,identifiers,groups,colours) stats = results.getStatsTable(marker,groupings) grouping = pd.unique(stats[groupings[-1]].ravel()) grouping.sort() stats = stats.set_index(grouping) for c,g in enumerate(grouping): try: row = stats.loc[g] writeSummary(sheet,m+c*3+2,'kea',row,marker,groups,markerdata,colours) except: pass m = m+c*3+5 writeGroupHeading(sheet,m,'Marker Validations','',colours) writeValidated(sheet,m+1,marker,markerdata,colours,validated)
def label_encode_train_test_sets (train, test) : " Label encode 'supplier' and 'bracket_pricing' features for both train and test set " test_suppliers = np.sort(pd.unique(test.supplier.ravel())) print ("Test suppliers shape & elements: ", test_suppliers.shape, test_suppliers) train_suppliers = np.sort(pd.unique(train.supplier.ravel())) print ("Train suppliers shape & elements: ", train_suppliers.shape, train_suppliers) ## Merge 'supplier' for both datasets first because we want encoding to be consistent across both # http://docs.scipy.org/doc/numpy/reference/generated/numpy.sort.html supplier_ids = [] supplier_ids.extend(train_suppliers) supplier_ids.extend(test_suppliers) supplier_ids = np.sort(np.unique(supplier_ids)) print ("Merged supplier_ids.shape: ", supplier_ids.shape) # print ("supplier_ids.elements: ", supplier_ids) ## Perform label encoding fit on the merged array and then individually transform for train and test sets print ("Performing label encoding on supplier column...") label_e = LabelEncoder() label_e.fit(supplier_ids) train['supplier'] = label_e.transform(train['supplier']) test['supplier'] = label_e.transform(test['supplier']) ## Perform label encoding on 'bracket_pricing' print ("Performing label encoding on bracket_pricing column...") train['bracket_pricing'] = label_e.fit_transform(train['bracket_pricing']) test['bracket_pricing'] = label_e.fit_transform(test['bracket_pricing']) return train, test
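# The point of fitting one LabelEncoder on the union of train and test suppliers is that the
# same supplier gets the same integer in both frames; fitting separately would not guarantee
# that. Minimal sketch of the pattern with made-up supplier codes (not the real data):
import numpy as np
from sklearn.preprocessing import LabelEncoder

train_suppliers = np.array(['S-0066', 'S-0041', 'S-0066'])
test_suppliers = np.array(['S-0041', 'S-0104'])

le = LabelEncoder()
le.fit(np.unique(np.concatenate([train_suppliers, test_suppliers])))
print(le.transform(train_suppliers))   # e.g. [1 0 1]
print(le.transform(test_suppliers))    # e.g. [0 2] -- 'S-0041' maps to 0 in both sets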
def create_table_POP(mimic_db): # Limit the population POP = mimic_db['ICUSTAY_DETAIL'] # Criterions 1: first time in ICU, Adult patient, between 12 - 96 hours of ICU stay POP = POP[(POP['ICUSTAY_SEQ'] == 1) & (POP['ICUSTAY_AGE_GROUP'] == 'adult') & (POP['ICUSTAY_LOS'] >= 12*60) & (POP['ICUSTAY_LOS'] <= 96*60)] # Criterion 2: 1) Exclude CMO, 2) Exclude DNR/DNI, 3) Include only Full Code, 4) No NSICU, CSICU # Merge the patient data with chartevents MERGED = POP.merge(mimic_db['CHARTEVENTS'], on='ICUSTAY_ID', how='left') # Find PACEMAKER data, Find RISK FOR FALLS data PACEMAKER = MERGED[MERGED['ITEMID'] == 1484][['ICUSTAY_ID', 'VALUE1']] RISKFALLS = MERGED[MERGED['ITEMID'] == 516][['ICUSTAY_ID', 'VALUE1']] PACEMAKER = PACEMAKER.groupby('ICUSTAY_ID', as_index=False).agg(lambda x: x.iloc[0]) RISKFALLS = RISKFALLS.groupby('ICUSTAY_ID', as_index=False).agg(lambda x: x.iloc[0]) PACEMAKER.rename(columns={'VALUE1': 'PACEMAKER'}, inplace=True) RISKFALLS.rename(columns={'VALUE1': 'RISKFALLS'}, inplace=True) all_ICUSTAY_ID = pd.unique(MERGED['ICUSTAY_ID'].values.ravel()) # Grab out only the events WITHOUT full code for care protocol MERGED = MERGED[(MERGED['ITEMID'] == 128) & (MERGED['VALUE1'] != 'Full Code')] bad_ICUSTAY_ID = pd.unique(MERGED['ICUSTAY_ID'].values.ravel()) # Subtract the two sets good_ICUSTAY_ID = np.array([i for i in all_ICUSTAY_ID if i not in bad_ICUSTAY_ID]) POP = POP[POP['ICUSTAY_ID'].isin(good_ICUSTAY_ID)] # Remove any NSICU Service or CSICU Service patients POP = POP[~POP['ICUSTAY_FIRST_SERVICE'].isin(['NSICU', 'CSICU'])] # Merge with the selection data POP = POP.merge(PACEMAKER, on='ICUSTAY_ID', how='left') POP['PACEMAKER'].fillna('No', inplace=True) POP = POP.merge(RISKFALLS, on='ICUSTAY_ID', how='left') POP['RISKFALLS'].fillna('None', inplace=True) return POP
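# The list comprehension above is an O(n*m) set difference. An alternative (not what the
# snippet uses) is numpy.setdiff1d, which is vectorized but returns a *sorted* array rather
# than preserving the original order. Sketch with toy ICUSTAY_IDs:
import numpy as np

all_ICUSTAY_ID = np.array([105, 101, 102, 103, 104])
bad_ICUSTAY_ID = np.array([102, 105])
good_ICUSTAY_ID = np.setdiff1d(all_ICUSTAY_ID, bad_ICUSTAY_ID)
print(good_ICUSTAY_ID)   # [101 103 104]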
def processVolumeData(aggregated):
    df_2013 = data_utils.parseFileWithIndex('data/2013/Medicare Volume Measures.csv',
                                            ['Diagnosis Related Group', 'Number Of Cases'])
    df_2012 = data_utils.parseFileWithIndex('data/2012/Medicare Payment and Volume Measures.csv',
                                            ['Diagnosis Related Group', 'Number Of Cases'])
    mincases = '10'
    missing_marker = '*'
    test_column = 'Chest Pain 2013'
    reformatted = []
    for df in [df_2013, df_2012]:
        df['Number Of Cases'][df['Number Of Cases'] == missing_marker] = mincases
        df['Number Of Cases'] = df['Number Of Cases'].str.replace(",", "")
        df['Number Of Cases'] = df['Number Of Cases'].astype(float)
        hospitals = pd.unique(df.index)
        cols = pd.unique(df['Diagnosis Related Group'])
        df2 = pd.DataFrame(data=0, index=hospitals, columns=cols)
        for col in cols:
            x = df['Number Of Cases'][df['Diagnosis Related Group'] == col]
            df2[col] = x
        reformatted.append(df2)
        # the pivoted frame should only carry DRG columns, not the raw input columns
        assert 'Number Of Cases' not in df2.columns
        assert 'Diagnosis Related Group' not in df2.columns
    reformatted[0].columns = reformatted[0].columns.map(lambda x: str(x) + ' 2013')
    volume = reformatted[0].join(reformatted[1], how='outer', rsuffix=' 2012')
    assert test_column in volume.columns
    volume[pd.isnull(volume)] = float(mincases)
    merged_final_data = aggregated.join(volume, how='left')
    merged_final_data = merged_final_data.fillna(float(mincases))
    return merged_final_data
def get_nutrient_profiles(df):
    '''
    Function to parse the depth-nutrient concentrations from a pandas.core.frame.DataFrame
    object ('df'). The df should be the tabular nutrient file imported from the nutrients
    data file. The data will be sorted into an OrderedDict structure with the following
    key-hierarchy:

        Stations (region specific, i.e. Calvert)
            Nutrients (SiO2, NO2+NO3, PO4)
                Dates Sampled
                    Nutrient concentration (with the sampling depth arranged as the indices)

    The end key-value will be the nutrient concentrations of the respective nutrients sampled
    (structured as a pandas.core.series.Series object) with the sampling depth as the Series
    indices.
    '''
    stations_sampled = np.sort(pd.unique(df['Site ID']))
    profiles = OrderedDict()
    nutrients_sampled = ['PO4', 'SiO2', 'NO2+NO3']
    for each_sta in stations_sampled:
        profiles[each_sta] = {}
        for each_nutrient in nutrients_sampled:
            profiles[each_sta][each_nutrient] = {}
            # unique sampling dates at this station
            for each_date in pd.unique(df.loc[df['Site ID'] == each_sta, 'Date']):
                profiles[each_sta][each_nutrient][each_date] = df.loc[
                    (df['Site ID'] == each_sta) & (df['Date'] == each_date), each_nutrient]
                profiles[each_sta][each_nutrient][each_date].index = df.loc[
                    (df['Site ID'] == each_sta) & (df['Date'] == each_date), 'Depth']
    return profiles
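# Minimal usage sketch, assuming get_nutrient_profiles and its imports (numpy, pandas,
# OrderedDict) are in scope. The station names, dates and concentrations below are invented
# purely for illustration:
import pandas as pd

toy = pd.DataFrame({
    'Site ID': ['KC1', 'KC1', 'QU39', 'QU39'],
    'Date':    ['2017-05-01', '2017-05-01', '2017-05-02', '2017-05-02'],
    'Depth':   [0, 5, 0, 5],
    'PO4':     [0.61, 0.75, 0.82, 0.90],
    'SiO2':    [10.2, 12.4, 15.1, 16.0],
    'NO2+NO3': [3.1, 4.0, 5.2, 5.9],
})
profiles = get_nutrient_profiles(toy)
print(profiles['KC1']['PO4']['2017-05-01'])   # PO4 concentrations indexed by Depth (0, 5)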
def for_seating(case): newframe=pd.DataFrame() ## the rearrange of the original data subtest=df[df.casenum==case].reset_index(drop=True) ## 'subtest' only take the records that have a specific case id num=subtest.shape[0] ## num will be 3, because usally there are 3 records for each case j1=(pd.unique((subtest.codej1).dropna()))[0] j2=(pd.unique((subtest.codej2).dropna()))[0] j3=(pd.unique((subtest.codej3).dropna()))[0] for j in range(num): copytest=deepcopy(subtest.ix[j]) if copytest.ids==j1: newframe=newframe.append(copytest) if copytest.ids==j2: copytest.codej2=j1 copytest.j2vote1=copytest.direct1 copytest.j2maj1=1 newframe=newframe.append(copytest) if copytest.ids==j3: copytest.codej3=j1 copytest.j3vote1=copytest.direct1 copytest.j3maj1=1 newframe=newframe.append(copytest) return newframe
def find_best_pars(df): """ Finds the 'best-fit' parameters for each original file and method :param df: :return: """ # First, get the maximum value of the ccf df['max_ccf'] = df['ccf'].map(np.max) methods = pd.unique(df.method) original_files = pd.unique(df.original) best_info = defaultdict(list) for original_filename in original_files: for method in methods: good = df.loc[(df.method == method) & (df.original == original_filename)] best = good.loc[good['max_ccf'] == good['max_ccf'].max()] # print 'File: {}\n\tmethod = {}\n\tT = {}\n\tlogg = {}\n\t[Fe/H] = {}'.format(original_filename, # method, # best['T'].item(), # best['logg'].item(), # best['metallicity'].item()) #print '\tvsini = {}'.format(best['vsini'].item()) best_info['original'].append(original_filename) best_info['method'].append(method) best_info['T'].append(best['T'].item()) best_info['logg'].append(best['logg'].item()) best_info['metallicity'].append(best['metallicity'].item()) best_info['vsini'].append(best['vsini'].item()) return pd.DataFrame(data=best_info)
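# Minimal usage sketch, assuming find_best_pars and its imports (numpy, pandas, defaultdict)
# are in scope. File names, method name and parameter values are invented; each 'ccf' entry
# is a toy cross-correlation array, and only the row with the higher peak should survive:
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'original': ['spec1.fits', 'spec1.fits'],
    'method': ['bruteforce', 'bruteforce'],
    'ccf': [np.array([0.1, 0.5, 0.2]), np.array([0.1, 0.9, 0.3])],
    'T': [5500, 6000],
    'logg': [4.0, 4.5],
    'metallicity': [0.0, -0.5],
    'vsini': [5.0, 10.0],
})
best = find_best_pars(toy)
print(best[['original', 'method', 'T', 'vsini']])   # keeps the T=6000 row (higher CCF peak)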
def encodeMut(filepath, output, mut_predictor=None): '''Generates a new dataframe with binarized mutation data''' df = pd.read_csv(filepath, header = 0) # Filter rows by SNP column (remove rows where SNP = y) df = df.loc[df['SNP'] != 'y', :] # # Remove silent mutations # df = df.loc[df['Mutation.Description'] != 'Substitution - coding silent'] # Filter by FATHMM.prediction column df = df.loc[df['FATHMM.prediction'] != 'PASSENGER/OTHER', :] # Filter by VEP predictions (SIFT/Polyphen scores) # if mut_predictor is None, does nothing if mut_predictor == 'sift': df = df.loc[df['SIFT'] != 'tolerated', :] elif mut_predictor == 'polyphen': df = df.loc[df['PolyPhen'] != 'benign', :] # doesn't predict effects of indels!! # Create new dataframe df2 = pd.DataFrame(index=pd.unique(df['cell_line_name'])) genes = pd.unique(df['Gene.name']) for gene in genes: df2[gene+'_mut'] = 0 for index, row in df.iterrows(): df2.set_value(row['cell_line_name'], row['Gene.name']+'_mut', 1) #Save to file df2.to_csv(output, index_label='CELL_LINE')
def daily_stats(self): """Overall right/wrong percent by date""" grouped = (self.data_base["correct"]==True).groupby(self.data_base["date"]) correct = grouped.mean().reset_index() correct["wrong"] = 1 - correct["correct"] correct["date"] = pd.to_datetime(correct["date"], dayfirst=True) correct = correct.sort_values("date") fig, ax = plt.subplots() left_limit = (datetime.datetime.strptime(pd.unique(self.data_base["date"])[0], '%d-%m-%Y' ) - datetime.timedelta(days= 1)).date() right_limit = (datetime.datetime.strptime(pd.unique(self.data_base["date"])[-1], '%d-%m-%Y' ) + datetime.timedelta(days=1)).date() ax.plot(correct['date'], correct['correct'], marker = '.', color='lightseagreen', ms = 15, lw = 2, linestyle= '-' , label="correct" ) ax.plot(correct['date'], correct['wrong'], color='coral', marker = '.', ms = 15, lw = 2, linestyle= '-', label="wrong" ) ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.yaxis.set_ticks_position('left') ax.xaxis.set_ticks_position('bottom') ax.grid(axis="y", zorder=0, color="#9698A1") ax.set_xticks((correct['date'].values)) ax.set_xlim([left_limit, right_limit]) ax.set_ylim([0., 1.]) ax.legend(loc='upper right').get_frame().set_alpha(0.3) ax.set_title('Daily Stats', fontsize=15) ax.set_xticklabels(correct['date'].map(lambda x: x.strftime("%d %b"))) ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%d \n %b'))
def __init__(self,target_label,DF,OUT_PATH,FILENAME,TEST_MODE,logger,eval_output_file,argv): self.target_label=target_label self.DF=DF self.reduced_matrix=np.load(OUT_PATH+FILENAME[:FILENAME.index(".")]+'.npy') self.blacklisted_estimators=[] self.es=None self.best_estimator=None self.valid_found=False self.test_predictions=None self.argv=argv if not TEST_MODE: self.labelled_indexes=np.array(self.DF[self.target_label+'_num_label'][self.DF[self.target_label+'_num_label'].notnull()].index) self.num_classes=dict(sorted(pd.unique(self.DF.loc[self.DF[self.target_label+'_num_label'].notnull(),[self.target_label+'_num_label',target_label]].values),key=lambda x:x[1])) else: self.labelled_indexes=np.arange(0,500) self.num_classes=dict(sorted(pd.unique(self.DF[[self.target_label+'_num_label',target_label]].values),key=lambda x:x[1])) logger.info(str(len(self.labelled_indexes))+" labelled instances detected as a SEED for modelling.") mask = np.ones(len(self.DF[self.target_label+'_num_label']), dtype=bool) mask[self.labelled_indexes] = False self.unlabelled_indexes=np.copy(self.DF.index)[mask] self.logger=logger self._eval_output_file=eval_output_file
def makehist(series,df,mincount=0,bins=[],title=""): rej = df.rejected == 1 app = df.rejected == 0 nrej = sum(rej)*1.0 napp = sum(app)*1.0 series_rej = pd.Series({count: sum(series[rej]==count) for count in pd.unique(series[rej])}) series_app = pd.Series({count: sum(series[app]==count) for count in pd.unique(series[app])}) rej_plot = series_rej[series_rej.index>=mincount]/nrej app_plot = series_app[series_app.index>=mincount]/napp plt.figure() if len(bins)>0: n1,bin1,_ = plt.hist(np.array(rej_plot.index),bins=bins,weights=np.array(rej_plot),label='rejected') n2,bin2,_ = plt.hist(np.array(app_plot.index),bins=bins,weights=np.array(app_plot),label='approved') else: n1,bin1,_ = plt.hist(np.array(rej_plot.index),weights=np.array(rej_plot),label='rejected') n2,bin2,_ = plt.hist(np.array(app_plot.index),weights=np.array(app_plot),label='approved') plt.legend() if mincount > 0: title = title + "(count >= " + str(mincount) + ")" plt.title(title) plt.show() df_freq = pd.concat([ pd.DataFrame(series_rej/nrej,columns=['rejected']), pd.DataFrame(series_app/napp,columns=['approved'])],axis=1) return df_freq
def app_activity_features(): train = pd.read_csv("gender_age_train.csv") test = pd.read_csv("gender_age_test.csv") train.drop(['gender','age','group'],axis=1,inplace=True) data = train.append(test) """ Merge with brand_model table""" device_table = pd.read_csv("phone_brand_device_model.csv") data = pd.merge(data,device_table,how='left',on='device_id') data = data.drop_duplicates() #drop duplicates #note: there is still one device associated with 2 brands/models del device_table print "data build" """ Create dataframe indicating for each device id, which app is present, and how much is it active - merge events and app_events on event_id - group by device_id and app_id, and take the mean of activity """ events = pd.read_csv("events.csv") events = events[events['device_id'].isin(list(data['device_id']))] apps = pd.read_csv("app_events.csv") apps = pd.merge(apps[['event_id','app_id','is_active']],events[['event_id','device_id']],on='event_id') apps = apps.groupby(['device_id','app_id'],as_index=False)['is_active'].mean() del events print "events build" """Reshape the dataframe so that each app is a new feature""" reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])),index=list(pd.unique(apps['device_id']))) reshaped[list(pd.unique(apps['app_id']))]=0 for app in list(pd.unique(apps['app_id'])): sliced = apps[apps['app_id']==app] reshaped[app].loc[list(sliced['device_id'])]=sliced['is_active'].values del apps return reshaped
def prepData():
    # load up files from disk
    training_data, kaggle_data = LoadData.load_data()
    features_in = ['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
                   'Resolution', 'Address', 'X', 'Y']
    # break dates into month, day, year, day of week, hour
    # categorize category, month, day, year, dow, hour, district
    # scale lat (y), long (x)
    training_data['Year'] = (pd.DatetimeIndex(training_data['Dates']).year)
    training_data['Month'] = (pd.DatetimeIndex(training_data['Dates']).month)
    training_data['Day'] = (pd.DatetimeIndex(training_data['Dates']).day)
    training_data['Hour'] = (pd.DatetimeIndex(training_data['Dates']).hour)
    training_data['Minute'] = (pd.DatetimeIndex(training_data['Dates']).minute)
    # cast date as unix time
    training_data['UnixTime'] = (pd.DatetimeIndex(training_data['Dates'])).astype(np.int64) / 10000000000

    # day of week to number
    sorted_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')

    def dayOfWeekNumber(d):
        return sorted_days.index(d)

    training_data['DayNumber'] = training_data['DayOfWeek'].apply(dayOfWeekNumber)

    # set up an id number for each category from alphabetical list
    # add to training_data
    categories = pd.unique(training_data['Category'])
    sorted_categories = (np.sort(categories)).tolist()

    def categoryNumber(category):
        return sorted_categories.index(category)

    training_data['CategoryNumber'] = training_data['Category'].apply(categoryNumber)

    districts = pd.unique(training_data['PdDistrict'])
    sorted_districts = (np.sort(districts)).tolist()

    def districtNumber(district):
        return sorted_districts.index(district)

    training_data['DistrictNumber'] = training_data['PdDistrict'].apply(districtNumber)

    # X is longitude, Y is latitude; set ones outside city to median values
    training_data.loc[training_data.X > -122.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.X < -123.0, 'X'] = training_data.X.median()
    training_data.loc[training_data.Y < 37.0, 'Y'] = training_data.Y.median()
    training_data.loc[training_data.Y > 38.0, 'Y'] = training_data.Y.median()

    return (training_data)
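# The Category/District encoding above is simply "position in the sorted list of unique
# values". Standalone sketch of that pattern with toy labels (not the SF crime data):
import numpy as np
import pandas as pd

labels = pd.Series(['THEFT', 'ASSAULT', 'THEFT', 'VANDALISM'])
sorted_unique = np.sort(pd.unique(labels)).tolist()   # ['ASSAULT', 'THEFT', 'VANDALISM']
codes = labels.apply(sorted_unique.index)
print(codes.tolist())   # [1, 0, 1, 2]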
def understand_data(input_list): for choice in input_list: if choice == 1: print raw_data.head(10) # print pd.unique(raw_data.TripType) print "Count of unique product Upc", int(pd.DataFrame(pd.unique(raw_data.Upc)).count()) print "Count of unique product department descriptions", int( pd.DataFrame(pd.unique(raw_data.DepartmentDescription)).count() ) elif choice == 2: share_of_trip_type = pd.DataFrame( raw_data.groupby(["TripType"], axis=0)["VisitNumber"].count() * 100 / len(raw_data) ) print share_of_trip_type products_departments = pd.DataFrame( raw_data.groupby(["DepartmentDescription"], axis=0)["Upc"].nunique() ) # http://stackoverflow.com/questions/15411158/pandas-countdistinct-equivalent print products_departments elif choice == 3: # http://pandas.pydata.org/pandas-docs/stable/reshaping.html # department_triptype_pivot = pd.pivot_table(raw_data, values='VisitNumber', index='DepartmentDescription', columns='TripType', aggfunc=np.size) # print department_triptype_pivot department_finelinenum_pivot = pd.pivot_table( raw_data, values="VisitNumber", index="FinelineNumber", columns="DepartmentDescription", aggfunc=np.size ) print department_finelinenum_pivot department_weekday_pivot = pd.pivot_table( raw_data, values="VisitNumber", index="DepartmentDescription", columns="Weekday", aggfunc=np.size ) print department_weekday_pivot Weekday_trip_type_pivot = pd.pivot_table( raw_data, values="VisitNumber", index="TripType", columns="Weekday", aggfunc=np.size ) print Weekday_trip_type_pivot elif choice == 10: # http://stackoverflow.com/questions/21654635/scatter-plots-in-pandas-pyplot-how-to-plot-by-category groups = raw_data.groupby("TripType") fig, ax = plt.subplots() for name, group in groups: # print name # print group.DepartmentDescription ax.plot(group.ScanCount, group.Weekday_num, marker="o", linestyle="", ms=5, label=name) ax.legend() plt.show() elif choice == 20: # (pd.DataFrame(pd.unique(raw_data.TripType))).to_csv('Unique_trip_types.csv',sep = ',',index = False) # share_of_trip_type.to_csv('TripType_percentage_share.csv',sep = ',') # products_departments.to_csv('Unique products per department.csv',sep = ',') # department_triptype_pivot.to_csv('DepartmentDescription trip Type visit number frequency pivot table.csv',sep=',') # department_finelinenum_pivot.to_csv('DepartmentDescription finelinenumber visit number frequency pivot table.csv',sep=',') # department_weekday_pivot.to_csv('DepartmentDescription Weekday visit number frequency pivot table.csv',sep=',') Weekday_trip_type_pivot.to_csv("TripType Weekday visit number frequency pivot table.csv", sep=",")
def get_rand_index(M, n): subset = [(randint(0,M.shape[0]-1), randint(0,M.shape[1]-1)) for _ in range(n)] subset = pd.unique(subset) while len(subset) < n: new_indices = [(randint(0,M.shape[0]-1), randint(0,M.shape[1]-1)) for _ in range(n-len(subset))] subset = list(subset) + new_indices subset = pd.unique(subset) return list(subset)
def schedule_to_timeslot(schedule, n_timeslot=15): """ Create personal schedule from list of schedule """ schedule_df = pd.DataFrame(schedule, columns=['person', 'person_to_meet']) person_to_meet_df = pd.DataFrame(schedule_df.person_to_meet.values.tolist(), columns=range(1, n_timeslot)) # schedule to dataframe schedule_df = pd.concat((schedule_df[['person']], person_to_meet_df), axis=1) # create person list and map to row/ column person_list = pd.unique(list(schedule_df['person'])) P_map = {v: k for k, v in enumerate(person_list)} timeslot_list = [] for i in range(1, n_timeslot): timeslot_df = schedule_df[['person', i]].dropna().astype(int).reset_index(drop=True) P = np.zeros((len(person_list), len(person_list)), dtype=int) # adding table number count = 1 for _, r in schedule_df.iterrows(): if not pd.isnull(r['person']) and not pd.isnull(r[i]) and P[P_map[r['person']], P_map[r[i]]] == 0 and P[P_map[r[i]], P_map[r['person']]] == 0: P[P_map[r['person']], P_map[r[i]]] = count P[P_map[r[i]], P_map[r['person']]] = count count += 1 # fill in pair of people (add random pair of people) left_person = list(set(person_list) - set(pd.unique(list(timeslot_df.person) + list(timeslot_df[i].dropna().astype(int))))) random.shuffle(left_person) random_pair = list(zip(left_person[0:int(len(left_person)/2)], left_person[int(len(left_person)/2)::])) for p1, p2 in random_pair: count += 1 P[P_map[p1], P_map[p2]] = count P[P_map[p2], P_map[p1]] = count additional_pair = \ [[p1, p2, int(P[P_map[p1], P_map[p2]])] for p1, p2 in random_pair] + \ [[p2, p1, int(P[P_map[p1], P_map[p2]])] for p1, p2 in random_pair] left_person_df = pd.DataFrame(additional_pair, columns=['person', i, 'table_number']) # concatenate table_number = [int(P[P_map[r['person']], P_map[r[i]]]) for _, r in timeslot_df.iterrows()] timeslot_df['table_number'] = table_number timeslot_df = pd.concat((timeslot_df, left_person_df)) timeslot_list.append(timeslot_df) # for all person, make schedule person_schedule_all = [] for p in person_list: person_schedule = [] for t_df in timeslot_list: person_schedule.append(t_df[t_df.person == p]) person_schedule_all.append(pd.concat(person_schedule)) return person_schedule_all # list of dataframe each contains schedule
def getASstats(): print('.... Statistics ....') df = pd.read_csv(INPUT_FILE_PATH, parse_dates=[1]) unique_src_as = len(pd.unique(df.src_ASN.ravel())) unique_dst_as = len(pd.unique(df.dst_ASN.ravel())) bytes_count = df.Bytes.sum() print('''No. of unique:\nSrc ASes: %d, Dst ASes: %d, Total Bytes: %d''' % (unique_src_as, unique_dst_as, bytes_count))
def set_unique_tag_values(df): unique_tag = set(pd.unique(df['tag1'])) unique_tag = unique_tag.union(pd.unique(df['tag2'])) unique_tag = unique_tag.union(pd.unique(df['tag3'])) unique_tag = unique_tag.union(pd.unique(df['tag4'])) unique_tag = unique_tag.union(pd.unique(df['tag5'])) unique_tag = [x for x in unique_tag if str(x) != 'nan'] global all_unique_tags all_unique_tags = all_unique_tags.union(unique_tag)
def assign_id(data): items = pd.unique(data['itemid']) vks = pd.unique(data['vk']) itemid = {items[i]:i for i in range(items.shape[0])} vkid = {vks[i]:i for i in range(vks.shape[0])} return itemid, vkid
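# Minimal usage sketch, assuming assign_id is in scope; the itemid/vk values are invented:
import pandas as pd

toy = pd.DataFrame({'itemid': [10, 20, 10, 30],
                    'vk': ['a', 'a', 'b', 'c']})
itemid, vkid = assign_id(toy)
print(itemid[10], itemid[30], vkid['c'])   # 0 2 2 -- dense 0-based ids in first-seen order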
def match_workers_assignments(worker_list, worker_result_df): """ Creates a dataframe with results only from specified workers. :param worker_list: workers to filter on :param worker_result_df: all worker results :return: results filtered by worker """ match_df = worker_result_df[worker_result_df['worker_id'].isin(worker_list)] return pd.unique(match_df['assignment_id']).tolist(), pd.unique(match_df['worker_id']).tolist()
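# Minimal usage sketch, assuming match_workers_assignments is in scope; worker and
# assignment ids are invented:
import pandas as pd

results = pd.DataFrame({'worker_id': ['w1', 'w2', 'w1', 'w3'],
                        'assignment_id': ['a1', 'a2', 'a3', 'a4']})
assignments, workers = match_workers_assignments(['w1', 'w3'], results)
print(assignments)   # ['a1', 'a3', 'a4']
print(workers)       # ['w1', 'w3']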
def _write_report(dframe, groups, sub_id=None, sc_split=False, condensed=True, out_file='report.pdf', dpi=DEFAULT_DPI): """ Generates the violin plots of each qctype """ columns = dframe.columns.ravel() headers = [] for group in groups: rem = [] for head in group: if head not in columns: rem.append(head) else: headers.append(head) for i in rem: group.remove(i) report = PdfPages(out_file) sessions = sorted(pd.unique(dframe.session_id.ravel())) for ssid in sessions: sesdf = dframe.copy().loc[dframe['session_id'] == ssid] scans = pd.unique(sesdf.run_id.ravel()) if sc_split: for scid in scans: subset = sesdf.loc[sesdf['run_id'] == scid] if len(subset.index) > 1: if sub_id is None: subtitle = '(session: %s other: %s)' % (ssid, scid) else: subtitle = '(Subject: %s, session: %s, other: %s)' % (sub_id, ssid, scid) if condensed: fig = plot_all(sesdf, groups, subject=sub_id, title='QC measures ' + subtitle) else: fig = plot_measures( sesdf, headers, subject=sub_id, title='QC measures ' + subtitle) report.savefig(fig, dpi=dpi) fig.clf() else: if len(sesdf.index) > 1: if sub_id is None: subtitle = '(session %s)' % (ssid) else: subtitle = '(subject %s, session %s)' % (sub_id, ssid) if condensed: fig = plot_all(sesdf, groups, subject=sub_id, title='QC measures ' + subtitle) else: fig = plot_measures( sesdf, headers, subject=sub_id, title='QC measures ' + subtitle) report.savefig(fig, dpi=dpi) fig.clf() report.close() plt.close() # print 'Written report file %s' % out_file return out_file
def choose_share_class(dataframe): """not working! """ #sets variables for commonly used column names rank = 'Rank' fee = 'Management Fee' ter = 'Annual Report Net Expense Ratio' ongoing_charge = 'Annual Report Ongoing Charge' income = 'Distribution Status' #finds unique fund indentifiers in rank column ranks = pd.unique(dataframe[rank]) #sets the boolean column to 0 dataframe['Chosen Share Class'] = 0 #sets a column to record difference from the target MER of .75 dataframe['Difference'] = 0 #loops through the fund identifiers for r in ranks: print 'Rank - ', r fund = dataframe[dataframe[rank]==r].copy() income_type = pd.DataFrame(pd.unique(dataframe[income])) #sorts out funds without accumulating share classes if income_type.isin(['Acc']).sum()[0] == 0: fund['Chosen Share Class'] = 'No Acc' print 'No Acc' else: fund_acc = fund[fund[income]=='Acc'].copy() for row in np.arange(len(fund_acc)): if fund_acc['Management Fee'].iloc[row,]!="": fund_acc['Difference'].iloc[row,] = np.absolute(\ fund_acc['Management Fee'].iloc[row,]-.75) print 'Difference - ',fund_acc['Difference'].iloc[row,] else: fund_acc['Chosen Share Class'].iloc[row,] = 'No MER' print 'No MER' try: minimum = fund_acc['Difference'].min() print 'Minimum - ',minimum fund_acc['Chosen Share Class'][fund_acc['Difference']==minimum] = 1 print 'Success' if len(fund_acc[fund_acc['Chosen Share Class']==1])>1: acc = fund_acc[fund_acc['Chosen Share Class']==1].copy() for row in np.arange(len(acc)): print 'Row - ', row print 'Name - ', acc['Name'].iloc[row,] print 'MER - ', acc[fee].iloc[row,] result = int(raw_input('Pick one!: ')) acc['Chosen Share Class'] = 0 acc['Chosen Share Class'].iloc[result,]=1 fund_acc[fund_acc['Chosen Share Class']==1] = acc except: result = 'error' fund_acc['Chosen Share Class'] = result print result fund[fund[income]=='Acc'] = fund_acc dataframe[dataframe[rank]==r] = fund return dataframe
def _write_report(df, groups, sub_id=None, sc_split=False, condensed=True, out_file='report.pdf'): columns = df.columns.ravel() headers = [] for g in groups: rem = [] for h in g: if h not in columns: rem.append(h) else: headers.append(h) for r in rem: g.remove(r) report = PdfPages(out_file) sessions = sorted(pd.unique(df.session.ravel())) for ss in sessions: sesdf = df.copy().loc[df['session'] == ss] scans = pd.unique(sesdf.scan.ravel()) if sc_split: for sc in scans: subset = sesdf.loc[sesdf['scan'] == sc] if len(subset.index) > 1: if sub_id is None: subtitle = '(%s_%s)' % (ss, sc) else: subtitle = '(subject %s_%s_%s)' % (sub_id, ss, sc) if condensed: fig = plot_all(sesdf, groups, subject=sub_id, title='QC measures ' + subtitle) else: fig = plot_measures( sesdf, headers, subject=sub_id, title='QC measures ' + subtitle) report.savefig(fig, dpi=300) fig.clf() else: if len(sesdf.index) > 1: if sub_id is None: subtitle = '(%s)' % (ss) else: subtitle = '(subject %s_%s)' % (sub_id, ss) if condensed: fig = plot_all(sesdf, groups, subject=sub_id, title='QC measures ' + subtitle) else: fig = plot_measures( sesdf, headers, subject=sub_id, title='QC measures ' + subtitle) report.savefig(fig, dpi=300) fig.clf() report.close() plt.close() # print 'Written report file %s' % out_file return out_file
def metric(self, numer, denom, numer_count=False, denom_count=False): numer_qty = float(self[numer].sum()) denom_qty = float(self[denom].sum()) if numer_count: numer_qty = float(len(pd.unique(self[numer]))) elif denom_count: denom_qty = float(len(pd.unique(self[denom]))) return numer_qty / denom_qty
def setUp(self): import os import pandas as pd import pkg_resources as p from qap.viz.plotting import plot_all self.plot_all = plot_all anat_spat_csv = \ p.resource_filename("qap", os.path.join("test_data", "qap_anatomical_spatial_5rows.csv")) func_spat_csv = \ p.resource_filename("qap", os.path.join("test_data", "qap_functional_spatial_5rows.csv")) func_temp_csv = \ p.resource_filename("qap", os.path.join("test_data", "qap_functional_temporal_5rows.csv")) self.anat_spat_df = pd.read_csv(anat_spat_csv) self.func_spat_df = pd.read_csv(func_spat_csv) self.func_temp_df = pd.read_csv(func_temp_csv) self.anat_spat_sessions = \ sorted(pd.unique(self.anat_spat_df.Session.ravel())) self.func_spat_sessions = \ sorted(pd.unique(self.func_spat_df.Session.ravel())) self.func_temp_sessions = \ sorted(pd.unique(self.func_temp_df.Session.ravel())) self.anat_spat_groups = [['CNR'], ['Cortical Contrast'], ['EFC'], ['FBER'], ['FWHM', 'FWHM_x', 'FWHM_y', 'FWHM_z'], ['Qi1'], ['SNR']] self.func_spat_groups = [['EFC'], ['FBER'], ['FWHM', 'FWHM_x', 'FWHM_y', 'FWHM_z'], ['Ghost_%s' % a for a in ['x', 'y', 'z']], ['SNR']] self.func_temp_groups = [['Fraction of Outliers (Mean)', 'Fraction of Outliers (Median)', 'Fraction of Outliers (Std Dev)', 'Fraction of Outliers IQR'], ['GCOR'], ['Quality (Mean)', 'Quality (Median)', 'Quality (Std Dev)', 'Quality IQR', 'Quality percent outliers'], ['RMSD (Mean)', 'RMSD (Median)', 'RMSD (Std Dev)', 'RMSD IQR'], ['Std. DVARS (Mean)', 'Std. DVARS (Median)', 'Std. DVARS percent outliers', 'Std. DVARs IQR']]
def homepage(): # Contenedores locales. selected_entity = [] se_subset = pd.DataFrame() asociated_words = pd.DataFrame() sources = pd.DataFrame() graph_data = [] # Las lineas que siguen son las acciones del lado del servidor que # corren cada vez que el usuario hace click en una entidad. if request.method == 'POST': selected_entity = request.form.get('entidades', None) se_subset = df[df.entidad == selected_entity] se_subset = se_subset.sort(['dateStamp']) last_week_days = se_subset['dateStamp'].iloc[-7] last_week_subset = se_subset[se_subset.dateStamp == last_week_days] asociated_words = (last_week_subset[['adjetivo', 'valor']] .groupby('adjetivo').sum() .sort('valor', ascending=False)) urls = list(pd.unique(last_week_subset.link.ravel())) titles = list(pd.unique(last_week_subset.titulo.ravel())) sources = pd.DataFrame(urls, titles) # Creacion del grafico con Pygal. custom_style = Style(background='transparent', plot_background='transparent', title_font_size=32) graph = pygal.Line(show_legend=False, x_label_rotation=20, width=1500, height=450, explicit_size=True, range=(-1.2, 1.2), background="transparent", foreground="transparent", plot_background="transparent", margin=0, style=custom_style, show_minor_x_labels = False) graph.title = "Sentimiento para '"+selected_entity+"'" agg = se_subset.groupby('dateStamp').mean() m_avg = pd.rolling_mean(agg, 3) m_avg = m_avg.fillna(0) # graph.add(selected_entity, list(agg['valor'])) graph.add(selected_entity, list(m_avg['valor'])) date = pd.DatetimeIndex(agg.index) graph.x_labels = map(str, date) graph.x_labels_major = map(str, date[0::5]) graph_data = graph.render_data_uri() return render_template('index.html', entities=entities, graph_data=graph_data, asociated_words=asociated_words, se_subset=se_subset, sources=sources)
def create_dict_of_team_ids(df): both_teams = {} for game_id in pd.unique(df['GAME_ID'].values.tolist()): df_curr = df[df['GAME_ID'] == game_id] curr_teams = pd.unique(df_curr['TEAM_ID'].values) if curr_teams.size != 2: print "ERROR" both_teams[game_id] = curr_teams return both_teams
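# Minimal usage sketch, assuming create_dict_of_team_ids is in scope; the GAME_ID/TEAM_ID
# values are invented and only illustrate the expected two-teams-per-game shape:
import pandas as pd

toy = pd.DataFrame({'GAME_ID': ['g1', 'g1', 'g1', 'g2', 'g2'],
                    'TEAM_ID': [100, 200, 100, 300, 400]})
both_teams = create_dict_of_team_ids(toy)
print(both_teams['g1'])   # the two TEAM_IDs seen in game g1, e.g. [100 200]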
to_date = '2019-09-12' sites_df = pdsql.mssql.rd_sql(server, database, 'TSDataNumericHourlySumm', col_names=['ExtSiteID', 'DatasetTypeID'], where_in={'DatasetTypeID': [38, 15]}) prec_ts_df = pdsql.mssql.rd_sql(server, database, 'TSDataNumericHourly', col_names=['ExtSiteID', 'DateTime', 'Value'], where_in={ 'DatasetTypeID': [38, 15], 'ExtSiteID': pd.unique(sites_df.ExtSiteID).tolist(), 'QualityCode': [600] }) prec_ts_df['DateTime'] = pd.to_datetime(prec_ts_df['DateTime']) prec_ts_df = prec_ts_df.loc[(prec_ts_df.DateTime >= pd.Timestamp(from_date)) & (prec_ts_df.DateTime <= pd.Timestamp(to_date))] prec_ts_df.to_csv( r'C:\Active\Projects\MetService_precip_analysis\Data\Stations\station_ts.csv', index=False) #-Get the locations of the sites and write to csv sites_xy = pdsql.mssql.rd_sql( server, database, 'ExternalSite', col_names=['ExtSiteID', 'NZTMX', 'NZTMY'],
def process_scene(ns_scene, env, nusc, data_path): scene_id = int(ns_scene['name'].replace('scene-', '')) data = pd.DataFrame(columns=['frame_id', 'type', 'node_id', 'robot', 'x', 'y', 'z', 'length', 'width', 'height', 'heading']) sample_token = ns_scene['first_sample_token'] sample = nusc.get('sample', sample_token) frame_id = 0 while sample['next']: annotation_tokens = sample['anns'] for annotation_token in annotation_tokens: annotation = nusc.get('sample_annotation', annotation_token) category = annotation['category_name'] if len(annotation['attribute_tokens']): attribute = nusc.get('attribute', annotation['attribute_tokens'][0])['name'] else: continue if 'pedestrian' in category and not 'stroller' in category and not 'wheelchair' in category: our_category = env.NodeType.PEDESTRIAN elif 'vehicle' in category and 'bicycle' not in category and 'motorcycle' not in category and 'parked' not in attribute: our_category = env.NodeType.VEHICLE else: continue data_point = pd.Series({'frame_id': frame_id, 'type': our_category, 'node_id': annotation['instance_token'], 'robot': False, 'x': annotation['translation'][0], 'y': annotation['translation'][1], 'z': annotation['translation'][2], 'length': annotation['size'][0], 'width': annotation['size'][1], 'height': annotation['size'][2], 'heading': Quaternion(annotation['rotation']).yaw_pitch_roll[0]}) data = data.append(data_point, ignore_index=True) # Ego Vehicle our_category = env.NodeType.VEHICLE sample_data = nusc.get('sample_data', sample['data']['CAM_FRONT']) annotation = nusc.get('ego_pose', sample_data['ego_pose_token']) data_point = pd.Series({'frame_id': frame_id, 'type': our_category, 'node_id': 'ego', 'robot': True, 'x': annotation['translation'][0], 'y': annotation['translation'][1], 'z': annotation['translation'][2], 'length': 4, 'width': 1.7, 'height': 1.5, 'heading': Quaternion(annotation['rotation']).yaw_pitch_roll[0], 'orientation': None}) data = data.append(data_point, ignore_index=True) sample = nusc.get('sample', sample['next']) frame_id += 1 if len(data.index) == 0: return None data.sort_values('frame_id', inplace=True) max_timesteps = data['frame_id'].max() x_min = np.round(data['x'].min() - 50) x_max = np.round(data['x'].max() + 50) y_min = np.round(data['y'].min() - 50) y_max = np.round(data['y'].max() + 50) data['x'] = data['x'] - x_min data['y'] = data['y'] - y_min scene = Scene(timesteps=max_timesteps + 1, dt=dt, name=str(scene_id), aug_func=augment) # Generate Maps map_name = nusc.get('log', ns_scene['log_token'])['location'] nusc_map = NuScenesMap(dataroot=data_path, map_name=map_name) type_map = dict() x_size = x_max - x_min y_size = y_max - y_min patch_box = (x_min + 0.5 * (x_max - x_min), y_min + 0.5 * (y_max - y_min), y_size, x_size) patch_angle = 0 # Default orientation where North is up canvas_size = (np.round(3 * y_size).astype(int), np.round(3 * x_size).astype(int)) homography = np.array([[3., 0., 0.], [0., 3., 0.], [0., 0., 3.]]) layer_names = ['lane', 'road_segment', 'drivable_area', 'road_divider', 'lane_divider', 'stop_line', 'ped_crossing', 'stop_line', 'ped_crossing', 'walkway'] map_mask = (nusc_map.get_map_mask(patch_box, patch_angle, layer_names, canvas_size) * 255.0).astype( np.uint8) map_mask = np.swapaxes(map_mask, 1, 2) # x axis comes first # PEDESTRIANS map_mask_pedestrian = np.stack((map_mask[9], map_mask[8], np.max(map_mask[:3], axis=0)), axis=0) type_map['PEDESTRIAN'] = GeometricMap(data=map_mask_pedestrian, homography=homography, description=', '.join(layer_names)) # VEHICLES map_mask_vehicle = 
np.stack((np.max(map_mask[:3], axis=0), map_mask[3], map_mask[4]), axis=0) type_map['VEHICLE'] = GeometricMap(data=map_mask_vehicle, homography=homography, description=', '.join(layer_names)) map_mask_plot = np.stack(((np.max(map_mask[:3], axis=0) - (map_mask[3] + 0.5 * map_mask[4]).clip( max=255)).clip(min=0).astype(np.uint8), map_mask[8], map_mask[9]), axis=0) type_map['VISUALIZATION'] = GeometricMap(data=map_mask_plot, homography=homography, description=', '.join(layer_names)) scene.map = type_map del map_mask del map_mask_pedestrian del map_mask_vehicle del map_mask_plot for node_id in pd.unique(data['node_id']): node_frequency_multiplier = 1 node_df = data[data['node_id'] == node_id] if node_df['x'].shape[0] < 2: continue if not np.all(np.diff(node_df['frame_id']) == 1): # print('Occlusion') continue # TODO Make better node_values = node_df[['x', 'y']].values x = node_values[:, 0] y = node_values[:, 1] heading = node_df['heading'].values if node_df.iloc[0]['type'] == env.NodeType.VEHICLE and not node_id == 'ego': # Kalman filter Agent vx = derivative_of(x, scene.dt) vy = derivative_of(y, scene.dt) velocity = np.linalg.norm(np.stack((vx, vy), axis=-1), axis=-1) filter_veh = NonlinearKinematicBicycle(dt=scene.dt, sMeasurement=1.0) P_matrix = None for i in range(len(x)): if i == 0: # initalize KF # initial P_matrix P_matrix = np.identity(4) elif i < len(x): # assign new est values x[i] = x_vec_est_new[0][0] y[i] = x_vec_est_new[1][0] heading[i] = x_vec_est_new[2][0] velocity[i] = x_vec_est_new[3][0] if i < len(x) - 1: # no action on last data # filtering x_vec_est = np.array([[x[i]], [y[i]], [heading[i]], [velocity[i]]]) z_new = np.array([[x[i + 1]], [y[i + 1]], [heading[i + 1]], [velocity[i + 1]]]) x_vec_est_new, P_matrix_new = filter_veh.predict_and_update( x_vec_est=x_vec_est, u_vec=np.array([[0.], [0.]]), P_matrix=P_matrix, z_new=z_new ) P_matrix = P_matrix_new curvature, pl, _ = trajectory_curvature(np.stack((x, y), axis=-1)) if pl < 1.0: # vehicle is "not" moving x = x[0].repeat(max_timesteps + 1) y = y[0].repeat(max_timesteps + 1) heading = heading[0].repeat(max_timesteps + 1) global total global curv_0_2 global curv_0_1 total += 1 if pl > 1.0: if curvature > .2: curv_0_2 += 1 node_frequency_multiplier = 3*int(np.floor(total/curv_0_2)) elif curvature > .1: curv_0_1 += 1 node_frequency_multiplier = 3*int(np.floor(total/curv_0_1)) vx = derivative_of(x, scene.dt) vy = derivative_of(y, scene.dt) ax = derivative_of(vx, scene.dt) ay = derivative_of(vy, scene.dt) if node_df.iloc[0]['type'] == env.NodeType.VEHICLE: v = np.stack((vx, vy), axis=-1) v_norm = np.linalg.norm(np.stack((vx, vy), axis=-1), axis=-1, keepdims=True) heading_v = np.divide(v, v_norm, out=np.zeros_like(v), where=(v_norm > 1.)) heading_x = heading_v[:, 0] heading_y = heading_v[:, 1] data_dict = {('position', 'x'): x, ('position', 'y'): y, ('velocity', 'x'): vx, ('velocity', 'y'): vy, ('velocity', 'norm'): np.linalg.norm(np.stack((vx, vy), axis=-1), axis=-1), ('acceleration', 'x'): ax, ('acceleration', 'y'): ay, ('acceleration', 'norm'): np.linalg.norm(np.stack((ax, ay), axis=-1), axis=-1), ('heading', 'x'): heading_x, ('heading', 'y'): heading_y, ('heading', '°'): heading, ('heading', 'd°'): derivative_of(heading, dt, radian=True)} node_data = pd.DataFrame(data_dict, columns=data_columns_vehicle) else: data_dict = {('position', 'x'): x, ('position', 'y'): y, ('velocity', 'x'): vx, ('velocity', 'y'): vy, ('acceleration', 'x'): ax, ('acceleration', 'y'): ay} node_data = pd.DataFrame(data_dict, 
columns=data_columns_pedestrian) node = Node(node_type=node_df.iloc[0]['type'], node_id=node_id, data=node_data, frequency_multiplier=node_frequency_multiplier) node.first_timestep = node_df['frame_id'].iloc[0] if node_df.iloc[0]['robot'] == True: node.is_robot = True scene.robot = node scene.nodes.append(node) return scene
file = read_args_tank_treading() video = getInputFile(settings_name="extract_cell_snippets.py", video=file) print(video) config = getConfig(video) config["channel_width_m"] = 0.00019001261833616293 data = getData(video) getVelocity(data, config) correctCenter(data, config) data = data[(data.solidity > 0.96) & (data.irregularity < 1.06)] data.reset_index(drop=True, inplace=True) ids = pd.unique(data["cell_id"]) image_reader = CachedImageReader(video) results = [] for id in tqdm.tqdm(ids): d = data[data.cell_id == id] crops, shifts, valid = getCroppedImages(image_reader, d) if len(crops) <= 1: continue crops = crops[valid] shifts = shifts[valid]
def main(dirs, config_filename, map_filename=None, summary_filename=None, with_aid=True, with_target=True, phenotype=False, id_prefix='CID', output_format='.pkl.gz'): aids = set() targets = set() total = 0 config = pd.read_csv(config_filename) summary = [] sid_cid = None if map_filename is not None: sid_cid = read_sid_cid_map(map_filename) if 'aid' not in config.columns: raise ValueError('Configuration file must contain "aid" column.') assert len(config) == len(pd.unique(config['aid'])) for this_dir in dirs: for filename in glob.glob(os.path.join(this_dir, '*.json.gz')): # get AID from filename so we only have to load relevant assays aid = int(os.path.basename(filename).split('.')[0]) if aid not in config['aid'].values: continue # get configuration for this AID this_config = config[config['aid'] == aid].iloc[0] if not with_aid and 'aid' in this_config: del this_config['aid'] if not with_target and 'target' in this_config: del this_config['target'] # get data try: extractor = PcbaDataExtractor(filename, this_config, with_aid=with_aid) except NotImplementedError as e: warnings.warn(e.message) continue if phenotype and 'phenotype' not in extractor.config: warnings.warn('{} has no phenotype'.format(aid)) continue assert aid == extractor.parser.get_aid( ) # sanity check for AID match aids.add(aid) target = extractor.config.get('target') targets.add(target) data = extractor.get_data(sid_cid=sid_cid) total += len(data) # add generic molecule ID column if id_prefix == 'CID': col = 'cid' elif id_prefix == 'SID': col = 'sid' else: raise NotImplementedError( 'Unrecognized ID prefix "{}"'.format(id_prefix)) ids = [] for i, mol_id in enumerate(data[col]): try: ids.append(id_prefix + str(int(mol_id))) except (TypeError, ValueError): warnings.warn('No ID for the following row:\n{}'.format( data.loc[i])) ids.append(None) # can be found with pd.isnull # skip this assay if there are no valid IDs if np.all(pd.isnull(ids)): warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid)) continue data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index) # add generic assay ID column assay_id = 'PCBA-' + str(aid) if with_aid: data.loc[:, 'assay_id'] = assay_id # save dataframe output_filename = '{}.{}'.format(assay_id, output_format) print '{}\t{}\t{}\t{}'.format(aid, target, output_filename, len(data)) write_dataframe(data, output_filename) summary.append({ 'aid': aid, 'target': target, 'filename': output_filename, 'size': len(data) }) # make sure we found everything missing = set(config['aid']).difference(aids) if len(missing): warnings.warn('Missed AIDs {}'.format(missing)) # save a summary summary = pd.DataFrame(summary) if summary_filename is not None: write_dataframe(summary, summary_filename) warnings.warn( 'Found {} assays for {} targets ({} total data points)'.format( len(aids), len(targets), total))
def histAnimation(det_file, save_in=None, cam='c010', track_id=None): """ Input: movie_path If movie path is a movie - load and extract frame - # TODO: If movie path is a folder - load images det_file : xml,txt or pkl panda list with the following column names ['frame','track_id', 'xmax', 'xmin', 'ymax', 'ymin'] save_in folder to save output frames / movie -# TODO: """ # Get BBox detection from list df = ut.getBBox_from_gt(det_file) if track_id is None: track_id = pd.unique(df['track_id']) # Create folders if they don't exist if save_in is not None and not os.path.isdir(save_in): os.mkdir(save_in) df.sort_values(by=['frame']) # create trajectory of all track df_track = df.groupby('track_id') index = np.linspace(0, 255, 31) colors = plt.cm.hsv(index / float(max(index))) for id, tt in df_track: first_frame = True print('Track id {}'.format(id)) if id not in track_id: continue print('frames:') for t in tt.index.tolist(): # 1st frame - ts = tt.loc[t, 'time_stamp'] f = tt.loc[t, 'frame'] print(f) hist = tt.loc[t, 'histogram'] #print(hist) if first_frame: first_frame = False plt.ion() plt.show() fig = plt.figure() fig.suptitle('Camera {}, Track Id {}'.format(cam, id), fontsize=16) ax = fig.add_subplot(111) ax.bar(range(len(hist)), hist, color=colors) #ax.bar(index, hist) ax.set_xlabel('Hue', fontsize=5) ax.set_ylabel('Probability', fontsize=5) #ax.set_xticklabels(range(len(hist)),index) ax.set_title('time {}'.format(ts)) else: ax.clear() ax.bar(range(len(hist)), hist, color=colors) #ax.bar(index, hist) ax.set_xlabel('Hue', fontsize=5) ax.set_ylabel('Probability', fontsize=5) #ax.set_xticklabels(range(len(hist)),index) ax.set_title('time {}'.format(ts)) if save_in is not None: fig.savefig(os.path.join(save_in, 'tk{}_f{}.png').format(id, f), dpi=fig.dpi) return
y_train = train['target'] train = train.drop(['target'], axis=1) id_test = test['ID'] df_all = pd.concat((train, test), axis=0, ignore_index=True) df_all['null_count'] = df_all.isnull().sum(axis=1).tolist() df_all = df_all.fillna(-1) df_all_temp = df_all['ID'] df_all = df_all.drop(['ID'], axis=1) df_data_types = df_all.dtypes[:] #{'object':0,'int64':0,'float64':0,'datetime64':0} d_col_drops = [] for i in range(len(df_data_types)): if str(df_data_types[i]) == 'object': df_u = pd.unique(df_all[str(df_data_types.index[i])].ravel()) print("Column: ", str(df_data_types.index[i]), " Length: ", len(df_u)) d = {} j = 1000 for s in df_u: d[str(s)] = j j += 5 df_all[str(df_data_types.index[i]) + '_vect_'] = df_all[str( df_data_types.index[i])].map(lambda x: d[str(x)]) d_col_drops.append(str(df_data_types.index[i])) if len(df_u) < 150: dummies = pd.get_dummies(df_all[str( df_data_types.index[i])]).rename(columns=lambda x: str( df_data_types.index[i]) + '_' + str(x)) df_all_temp = pd.concat([df_all_temp, dummies], axis=1)
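# The loop above hand-rolls an ordinal encoding for object columns: each distinct string is
# mapped to an integer starting at 1000 in steps of 5, plus one-hot dummies when cardinality
# is low. Compact sketch of just the mapping step on a toy column:
import pandas as pd

col = pd.Series(['red', 'blue', 'red', 'green'])
d = {}
j = 1000
for s in pd.unique(col):
    d[str(s)] = j
    j += 5
print(col.map(lambda x: d[str(x)]).tolist())   # [1000, 1005, 1000, 1010]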
for j in range(len(corr_mtx)): if ((corr_mtx.iat[i,j] > 0.75 or corr_mtx.iat[i,j] < -0.75) and i > j): if(not(i in corr_mtx and j in corr_mtx) and not(j in corr_mtx)): indexes.append(i) correlated = correlated + 1 print(correlated) print(indexes) #Remove correlated variables data = data.drop(data.columns[indexes], axis=1) print(len(data.columns)) #### y: np.ndarray = data.pop('classification').values X: np.ndarray = data.values labels = pd.unique(y) trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y) allknn = AllKNN() nm = NearMiss() smt = SMOTE() ada = ADASYN(random_state=42) lst = [allknn, nm, smt, ada] gb = GradientBoostingClassifier(n_estimators=50, max_depth=10, learning_rate=0.5) for samp in lst: trnX, trnY = samp.fit_resample(trnX, trnY)
def robustOpr(): # make the multiplier of the IQR in the boxplot outlier calculation easily # accessible for hyperparameter tuning # for the 2020 data, 1.6 is optimal rather than the default 1.5 IQR_MULT = 1.6 # get the event key from the command-line argument try: event = sys.argv[1] except IndexError: sys.exit( 'ERROR - please specify an event key e.g. 2020scmb as the first argument' ) # optionally get the TBA API read key from the second argument if len(sys.argv) >= 3: tbaKey = sys.argv[2] else: try: # check for tba_key.json and read the TBA API key from the file with open('tba_key.json', 'r') as read_file: f = json.load(read_file) tbaKey = f['tba_key'] except OSError: sys.exit( 'ERROR - must provide TBA API read key in a tba_key.json file or as the second argument' ) # get data for a given event key from TheBlueAlliance API matches = getRawMatchData(event, tbaKey) # convert the match data to match-alliance data as needed by the OPR functions maData = matchToAlliance(matches) # truncate data to only qualification matches for OPR maData = maData[maData['comp_level'] == 'qm'] # get unique list of teams within the match data as needed for OPR calcs teams = pd.unique(maData[['team1', 'team2', 'team3']].values.ravel('K')).tolist() # sort the team numbers for easier perusing of the OPR output # note that the sort is alphabetical since the team numbers are strings teams.sort() # initialize an index counter for an outlier removal loop i = 1 # loop over OPR calculations - removing outlier match-alliance records in each # iteration and recomputing robust OPR - stop when no match-alliance records # are identified as outliers while True: # compute OPR - maData may be the original data for the full event or # could be on a truncated set of match-alliances after outlier removal # assuming that outlier removal does not remove all matches for any team # that the teams list needs to be recomputed during iteration opr = calcOPR(teams, maData) if i == 1: # save the OPR before outlier removal oprAll = opr.copy() # compute prediction errors for the OPR dataset maData = predictionError(maData, opr) # identify outlier match-alliance records using non-parametric boxplot # outlier computations - values more extreme than IQR_MULT times the # interquartile range outside the respective quartile are identified # as outliers # upper quartile is the 75th percentile of the data q3 = maData['score.errorS'].quantile(0.75) # lower quartile is the 25th percentile of the data q1 = maData['score.errorS'].quantile(0.25) # interquartile range is the difference between the upper and lower quartiles iqr = q3 - q1 # high outlier limit is IQR_MULT * iqr above the upper quartile lim_hi = q3 + IQR_MULT * iqr # low outlier limit is IQR_MULT * iqr below the lower quartile lim_lo = q1 - IQR_MULT * iqr # look for outliers where the match-alliance prediction error is beyond # the outlier limits just calculated outliers = maData[(maData['score.errorS'] > lim_hi) | (maData['score.errorS'] < lim_lo)] # if there are no outlier records, break out of the loop - the last OPR # calculated is the robust OPR if len(outliers) == 0: break # print to console if outliers are found print(f'Outlier(s) found on iteration {i}') for index, row in outliers.iterrows(): print( f'Match {row.key} {row.color} ({row.team1} {row.team2} {row.team3}) - score {row.score} - pred {round(row["score.predS"], 1)}' ) # find the indexes of the outlier records toDrop = list(outliers.index.values) # remove the outlier records from the qualification match-alliance dataset # 
before re-computing robust OPR on the next iteration maData.drop(toDrop, axis=0, inplace=True) # update the loop counter i += 1 # run another iteration after outlier removal # prepare the OPR results for export # add event into the dataframe oprAll.insert(loc=0, column='event', value=event) # get needed columns from oprAll which is "standard" OPR oprAll = oprAll[['event', 'opr']] # give columns better names for export oprAll.columns = ['event', 'oprStd'] # get needed columns from opr which is robust OPR opr = opr[['opr']] # give columns better names for export opr.columns = ['oprRobust'] # combine standard and robust OPR data opr = pd.concat([oprAll, opr], axis=1, sort=False) # rearrange columns for easier human interpretation of the exported data opr = opr[['event', 'oprStd', 'oprRobust']] # round numbers in the dataframe to 1 place opr = opr.round(1) # export the results to CSV try: filename = 'robustOpr_' + event + '.csv' opr.to_csv(filename) except OSError: sys.exit( f'ERROR - output file could not be written - is {filename} open for editing?' )
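# The outlier rule in the loop above is the standard boxplot fence: anything more than
# IQR_MULT interquartile ranges beyond the quartiles is dropped. Standalone sketch of the
# computation on toy prediction errors:
import pandas as pd

IQR_MULT = 1.6
errors = pd.Series([-4.0, -2.0, -1.0, 0.5, 1.0, 2.5, 30.0])
q3 = errors.quantile(0.75)
q1 = errors.quantile(0.25)
iqr = q3 - q1
lim_hi = q3 + IQR_MULT * iqr   # roughly 6.95 for this toy series
lim_lo = q1 - IQR_MULT * iqr   # roughly -6.7
outliers = errors[(errors > lim_hi) | (errors < lim_lo)]
print(outliers.tolist())   # [30.0] would be flagged and removed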
    'j11vote1', 'j11vote2', 'j11maj1', 'j11maj2', 'codej12', 'j12vote1',
    'j12vote2', 'j12maj1', 'j12maj2', 'codej13', 'j13vote1', 'j13vote2',
    'j13maj1', 'j13maj2', 'codej14', 'j14vote1', 'j14vote2', 'j14maj1',
    'j14maj2', 'codej15', 'j15vote1', 'j15vote2', 'j15maj1', 'j15maj2',
    'j16maj1', 'j16vote1'
]

# # to_dummies = ['month','day','method','state','district','origin','source','distjudg',
#     'applfrom','adminrev','opinstat','treat','classact','crossapp','counsel1','counsel2','sanction',
#     'initiate','numappel','appnatpr','appbus','appnonp','appfed','appsubst','appstate',
#     'appfiduc','ap_stid','genapel1','bank_ap1','genapel2','bank_ap2','appel1','appel2',]

print(df.shape)
df.drop(labels=del_cols, axis=1, inplace=True)

# drop columns that hold a single constant value
moredropcolumns = df.columns.tolist()
for i in moredropcolumns:
    if len(pd.unique(df[i])) == 1:
        df.drop(labels=i, axis=1, inplace=True)

caseList = pd.unique(df['casenum'])
caseList = caseList[pd.notnull(caseList)].tolist()
print(len(caseList))

num_cores = multiprocessing.cpu_count()
print("num_cores is:", num_cores)

def do_to_case(case):
    newframe = pd.DataFrame()  # rearranged copy of the original data
    output = []  # alignment of judge 1 and judge 2: yes = 1, no = -1
    subtest = df[df.casenum == case].reset_index(drop=True)  # only the records for this case id
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Initialisation:
a = pd.read_csv("/Users/michaelwehbe/Desktop/q.csv")
b = pd.read_csv("/Users/michaelwehbe/Desktop/w.csv")
c = pd.read_csv("/Users/michaelwehbe/Desktop/x.csv")

data_temp = pd.concat([a, b, c], axis=0)

n = len(data_temp['ID'])
n_secs = len(pd.unique(data_temp['ID']))
n_dates = len(pd.unique(data_temp['date']))

# Checking for NAs
if data_temp['R'].isnull().sum() == 0:
    print("No missing values in the dataset")
else:
    print(data_temp['R'].isnull().sum(), "missing values in the dataset")

# Reshape the data so that each column is a security and each row is a date:
data_temp_2 = np.zeros((n_dates, n_secs))
for i in range(0, n_dates):
    # fill row i with the block of returns for date i (column position 2 is taken as the return column)
    data_temp_2[i, :] = data_temp.iloc[i * n_secs:n_secs * (i + 1), 2]

data = pd.DataFrame(data_temp_2,
def lagged_strat(lag): signal_lag, signal_temp_lag = lagged_signal(lag) signal_temp_lag_2 = pd.DataFrame(signal_temp_lag, columns=pd.unique(data_temp['ID'])) #Strategy : Long top decile and short bottom decile #We will rank order our signals for each date: sorted_signal_lag = pd.DataFrame(np.sort(signal_lag, axis=1), index=pd.unique(data_temp['date'])) #We have 690 securities, so 690 returns per date. so the first decile would be the smallest 69 returns and the 10th decile would be the largest 69 returns #Let's create two matrices, each with the IDs of the securities we are shorting or longing at each time: long_temp_lag = np.zeros((n_dates, 69)) short_temp_lag = np.zeros((n_dates, 69)) for i in range(0, n_dates): long_temp_lag[i, :] = signal_temp_lag_2.sort_values( by=i, axis=1).columns[n_secs - 69:n_secs] short_temp_lag[i, :] = signal_temp_lag_2.sort_values( by=i, axis=1).columns[0:69] long_positions_lag = pd.DataFrame(long_temp_lag.astype(int), index=pd.unique(data_temp['date'])) short_positions_lag = pd.DataFrame(short_temp_lag.astype(int), index=pd.unique(data_temp['date'])) #We want all the longs to have equal weight in our portfolio, and same for all the shorts #Hence each long security in the portfolio has weight 1/69 and short has -1/69, which satisfies all the given conditions #For simplicity of computations let's design a weight matrix: weights_lag = pd.DataFrame(np.zeros((n_dates, n_secs)), index=pd.unique(data_temp['date']), columns=pd.unique(data_temp['ID'])) for i in range(0, n_dates): for j in range(0, 69): weights_lag[long_positions_lag.iloc[i, j]][i] = 1 / 69 weights_lag[short_positions_lag.iloc[i, j]][i] = -1 / 69 #Let's now compute the returns of our portfolio: portfolio_rets_temp_lag = np.array( weights_lag)[:n_dates - 1, :] * np.array(data)[1:, :] portfolio_rets_temp_lag_2 = [] for i in range(0, n_dates - 1): portfolio_rets_temp_lag_2.append(np.sum(portfolio_rets_temp_lag[i, :])) portfolio_rets_lag = pd.DataFrame(portfolio_rets_temp_lag_2, columns=['Portfolio Returns'], index=pd.unique(data_temp['date'])[1:]) #a #Let's compute the annualized mean return, volatility and sharpe ratio of the strategy and of the market portfolio: ann_mean_ret_strat_lag = np.mean(portfolio_rets_lag) * 252 ann_vol_strat_lag = np.std(portfolio_rets_lag) * np.sqrt(252) ann_SR_strat_lag = ann_mean_ret_strat_lag / ann_vol_strat_lag #Since we don't know what the risk free rate is return float(ann_mean_ret_strat_lag), float(ann_vol_strat_lag), float( ann_SR_strat_lag)
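# --- Added sketch (not from the original strategy code) ---
# pandas' qcut offers a compact alternative for building equal-weight top/bottom
# decile weights from one date's signal. This is only an illustrative sketch with
# synthetic data and made-up security names, not a drop-in replacement for the
# sort_values loop above.
import numpy as np
import pandas as pd

def decile_weights(signal_row):
    """Long the top decile and short the bottom decile of one date's signal, equal-weighted."""
    deciles = pd.qcut(signal_row, 10, labels=False)  # 0 = bottom decile, 9 = top decile
    weights = pd.Series(0.0, index=signal_row.index)
    top = deciles == 9
    bottom = deciles == 0
    weights[top] = 1.0 / top.sum()
    weights[bottom] = -1.0 / bottom.sum()
    return weights

rng = np.random.default_rng(0)
signal = pd.Series(rng.normal(size=690), index=[f"sec_{i}" for i in range(690)])
w = decile_weights(signal)
print(w[w != 0].abs().sum())  # gross exposure: 69 longs at 1/69 plus 69 shorts at 1/69 = 2.0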
ofname = run(args, workload) if (not verify(ofname)): fail.append(workload + "_run.csv") continue okay.append(workload + "_load.csv") okay.append(workload + "_run.csv") df = stats(ofname) operations = [ '[OVERALL]', '[READ]', '[INSERT]', '[CLEANUP]', '[UPDATE]', '[READ-MODIFY-WRITE]' ] found_ops = pd.unique(df[0]) headers = [ 'Operations', 'MinLatency(us)', 'AverageLatency(us)', '95thPercentileLatency(us)', '99thPercentileLatency(us)', 'MaxLatency(us)' ] printable_headers = [ 'Operations', '#ofOperations', 'MinLatency(us)', 'AverageLatency(us)', '95thPercentileLatency(us)', '99thPercentileLatency(us)', 'MaxLatency(us)' ] overall_headers = [ 'CreatePmemPool(ms)', 'RunTime(ms)', 'Throughput(ops/sec)' ] printable_overall_headers = [ 'Overall', 'CreatePmemPool(ms)', 'RunTime(ms)',
# import the dataset into the dataframe, using pandas
# data = pd.read_csv('covtype.csv', sep=';')
# dataSample = data.sample(frac=0.1)

# data = pd.read_csv('Undersample.csv', sep=',')
# dataSample = data  # use 100% (it is only about 2k rows...)

# data = pd.read_csv('Oversample.csv', sep=',')
# dataSample = data.sample(frac=0.1)

data = pd.read_csv('SMOTE_sample.csv', sep=',')
dataSample = data.sample(frac=0.1)

# Data preparation for the classification models
y: np.ndarray = dataSample.pop('Cover_Type(Class)').values
X: np.ndarray = dataSample.values
labels = pd.unique(y)
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)


# Helper for printing model performance
def model_performance(tstY, prdY, n, d=' ', k=' ', n_name='n', d_name=' ', k_name=' '):
    # import warnings
    # warnings.filterwarnings('default')  # "error", "ignore", "always", "default", "module" or "once"
    accuracy = metrics.accuracy_score(tstY, prdY)
    precision = metrics.precision_score(tstY, prdY, average='macro')
    sensitivity = metrics.recall_score(tstY, prdY, average='macro')
    print('Accuracy :', str(accuracy)[:6],
          ' precision: ', str(precision)[:6],
          ' sensitivity: ' + str(sensitivity)[:6],
          n_name, n, d_name, d, k_name, k)
Created on Fri Feb 12 12:28:53 2016

@author: Jleach1
"""

# %% import packages
import pandas as pd

# %%
d2 = pd.read_csv('../../../../data/d2_firm_level_data.csv')
d3 = pd.read_csv('../../../../data/d3_patent_data.csv')

# %% Compute collaboration
teams = d3.inv_num
teams_sets = [x.split(';') for x in teams]
invs = pd.unique([y for x in teams_sets for y in x])

# %% Reshape d3 to inventor level
d3_inv = pd.concat([d3.pnum,
                    d3.inv_num.apply(lambda y: pd.Series(y.split(';')))],
                   axis=1)
d3_inv = pd.melt(d3_inv,
                 id_vars='pnum',
                 value_vars=list(range(26)),  # columns 0-25 created by the split above
                 value_name='inv_num')
d3_inv = d3_inv.drop('variable', axis=1)
d3_inv.to_csv("../../../../data/outputs/d3_inv.csv", index=False)

# %%
inv_list = pd.unique([inv for inv in d3_inv.inv_num])
header=None) motif_found = motif_found.sort_values([0, 1, 2]) motif_found = motif_found.values ### global Bonferroni over all pairs tested across all cluster pvalues = np.asarray(motif_found[:, 3]) # empty vectors for pvals pvalues_Gbonf = np.zeros(motif_found.shape[0]) for i in range(0, motif_found.shape[0]): pvalues_Gbonf[i] = pvalues[i] * motif_found.shape[0] if pvalues_Gbonf[i] > 1: pvalues_Gbonf[i] = 1 # performed corrections for each cluster in the motif_found file (designated by values in first column) # pd.unique retains order where np.unique does not for clust in pd.unique(motif_found[:, 0]): print(clust) motifs = motif_found[motif_found[:, 0] == clust] pvalues = np.asarray(motifs[:, 3]) # empty vectors for pvals pvalues_bonf = np.zeros(motifs.shape[0]) pvalues_bh = np.zeros(motifs.shape[0]) # calculate Bonferroni correction for i in range(0, motifs.shape[0]): pvalues_bonf[i] = pvalues[i] * motifs.shape[0] if pvalues_bonf[i] > 1: pvalues_bonf[i] = 1 #calculate BH correction pvalues_bh = bh(pvalues) motifs = np.column_stack( (motifs[:, [0, 1, 2, 4, 5, 6, 3]], pvalues_bh, pvalues_bonf))
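# --- Added sketch (not from the original analysis) ---
# The bh() helper above is presumably a Benjamini-Hochberg implementation defined
# elsewhere in the script. As a cross-check, both adjustments can be reproduced with
# statsmodels on a handful of made-up p-values.
import numpy as np
from statsmodels.stats.multitest import multipletests

pvals = np.array([0.001, 0.008, 0.039, 0.041, 0.20, 0.74])

_, p_bonf, _, _ = multipletests(pvals, method='bonferroni')  # multiply by m, clip at 1
_, p_bh, _, _ = multipletests(pvals, method='fdr_bh')        # Benjamini-Hochberg FDR

print(np.round(p_bonf, 3))
print(np.round(p_bh, 3))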
def generate_data(raw_data, output_dir, n_heldout_users, min_uc, min_sc): """Generates and writes train, validation and test data. The raw_data is first split into train, validation and test by user. For the validation set, each user's ratings are randomly partitioned into two subsets following a (80, 20) split (see split_train_test_proportion), and written to validation_tr.csv and validation_te.csv. A similar split is applied to the test set. Args: raw_data: a DataFrame of (userId, movieId, rating). output_dir: path to the output directory. n_heldout_users: this many users are held out for each of the validation and test sets. min_uc: filter out users with fewer than min_uc ratings. min_sc: filter out items with fewer than min_sc ratings. """ raw_data, user_activity, item_popularity = filter_triplets( raw_data, min_uc, min_sc) sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0]) print('After filtering, there are %d watching events from %d users and %d ' 'movies (sparsity: %.3f%%)' % (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100)) unique_uid = user_activity.index np.random.seed(98765) idx_perm = np.random.permutation(unique_uid.size) unique_uid = unique_uid[idx_perm] n_users = unique_uid.size tr_users = unique_uid[:(n_users - n_heldout_users * 2)] vd_users = unique_uid[(n_users - n_heldout_users * 2):(n_users - n_heldout_users)] te_users = unique_uid[(n_users - n_heldout_users):] train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)] unique_sid = pd.unique(train_plays['movieId']) show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid)) profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid)) def numerize(tp): uid = [profile2id[x] for x in tp['userId']] sid = [show2id[x] for x in tp['movieId']] return pd.DataFrame(data={ 'uid': uid, 'sid': sid }, columns=['uid', 'sid']) pro_dir = output_dir if not os.path.exists(pro_dir): os.makedirs(pro_dir) with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f: for sid in unique_sid: f.write('%s\n' % sid) vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)] vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)] vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays) test_plays = raw_data.loc[raw_data['userId'].isin(te_users)] test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)] test_plays_tr, test_plays_te = split_train_test_proportion(test_plays) train_data = numerize(train_plays) train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False) vad_data_tr = numerize(vad_plays_tr) vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False) vad_data_te = numerize(vad_plays_te) vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False) test_data_tr = numerize(test_plays_tr) test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False) test_data_te = numerize(test_plays_te) test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)
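# --- Added usage sketch (hypothetical, not from the original) ---
# How generate_data() might be invoked on a MovieLens-style ratings file; the file
# name, held-out user count, and thresholds below are placeholders, not values used
# by the author.
import pandas as pd

raw = pd.read_csv('ratings.csv')[['userId', 'movieId', 'rating']]
generate_data(raw, output_dir='pro_sg', n_heldout_users=10000, min_uc=5, min_sc=0)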
def merge_sub(sub1, sub2, bar, driver):
    if space_var.get() == 1:
        space_sub = '\n \n'
    else:
        space_sub = '\n'
    sub1_df = dataframe_sub(sub1, "en")
    sub2_df = dataframe_sub(sub2, "ru")
    df = pd.concat([sub1_df, sub2_df], axis=0)
    df['sum'] = df[['start', 'end']].sum(axis=1)
    df['plus'] = (df['start'] + df['end']) / 2
    df = df.sort_values(by='start', ascending=True)
    # agglomerative clustering
    if clusters_auto_var.get() == 1:
        clusters_list = []
        # assess clustering quality with the "silhouette" score
        silhouette = []
        for i in np.linspace(0.2, 1, 20):
            root.update()
            threshold = float(i) * 10000
            clustering = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=threshold).fit(df[['start', 'end']])
            clusters = clustering.labels_
            clusters_list.append(len(pd.unique(clusters)))
            score = silhouette_score(df[['start', 'end']], clusters)
            silhouette.append(score)
        max_silhouette = np.argmax(silhouette)
        clustering = AgglomerativeClustering(
            n_clusters=clusters_list[max_silhouette]).fit(df[['start', 'end']])
    else:
        threshold = float(clusters_manual_entry.get()) * 10000
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=threshold,
            linkage=clusters_method_combobox.get()).fit(df[['start', 'end']])
    clusters = clustering.labels_
    # attach the detected cluster labels
    df['cluster'] = clusters
    bar_subs = float(bar) / float(len(pd.unique(clusters)))
    # create a new subtitle file
    double_sub = pysrt.SubRipFile(encoding='utf-8')
    translate_list = pysrt.SubRipFile(encoding='utf-8')
    for n, i in enumerate(pd.unique(clusters)):
        root.update()
        progressBar['value'] += bar_subs
        df_en = df[(df['language'] == 'en') & (df['cluster'] == i)]
        df_ru = df[(df['language'] == 'ru') & (df['cluster'] == i)]
        df_group_en = df_en.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        df_group_ru = df_ru.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        df_group = df_group_en.merge(
            df_group_ru,
            on=['cluster', 'text', 'start', 'end', 'language'],
            how='outer').groupby('cluster').agg({
                'text': space_sub.join,
                'start': 'first',
                'end': 'first',
                'language': ''.join
            })
        sub = pysrt.SubRipItem(index=n + 1,
                               start=int(df_group.iloc[0]['start']),
                               end=int(df_group.iloc[0]['end']),
                               text=str(df_group.iloc[0]['text']))
        double_sub.append(sub)
        if translate_var.get() == 1 and df_group['language'].values == 'en':
            translate_list.append(sub)
    if translate_var.get() == 1 and translate_list:
        translate_sub(translate_list, bar, driver)
    # re-index the subtitles
    double_sub.clean_indexes()
    return double_sub
# Strings
vals = 'a,b, guido'
vals.split(",")

import pandas as pd
import matplotlib.pyplot as plt

values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(['apple', 'orange'])
values
dim

# Categorical data:
import numpy as np
import pandas as pd

values = pd.Series(['apple', 'orange', 'apple', 'apples'] * 2)
pd.unique(values)
pd.value_counts(values)

values = pd.Series([0, 1, 0, 0] * 2)
dim = pd.Series(["apple", "apples"])
dim.take(values)
dim2 = pd.Series(["apple", "apples", "orange"])
dim2.take(values)

# Categorical type in pandas:
fruits = ['apple', 'orange', 'apple', 'apples'] * 2
N = len(fruits)
df = pd.DataFrame({'fruit': fruits,
                   'basket_id': np.arange(N),
                   'count': np.random.randint(3, 15, size=N),
                   'weight': np.random.uniform(0, 4, size=N)},
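# --- Added sketch (not from the original notes) ---
# The pandas 'category' dtype that the comment above introduces: the distinct labels
# are stored once and each value becomes a small integer code.
import pandas as pd

fruit_cat = pd.Series(['apple', 'orange', 'apple', 'apples'] * 2).astype('category')
print(fruit_cat.cat.categories)      # Index(['apple', 'apples', 'orange'], dtype='object')
print(fruit_cat.cat.codes.tolist())  # [0, 2, 0, 1, 0, 2, 0, 1]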
        levels.append(levels[-1] + 1)
    elif prev[1:].replace('RL', 'rl').islower() and current[1:].replace('RL', 'rl').isupper():
        metas.append(prev)
        levels.append(levels[-1] + 1)
    elif prev[1:].replace('RL', 'rl').isupper() and current[1:].replace('RL', 'rl').isupper():
        metas.append(metas[-1])
        levels.append(levels[-1])
    elif prev[1:].replace('RL', 'rl').isupper() and current[1:].replace('RL', 'rl').islower():
        metas.append(pd.unique(metas)[-2])
        levels.append(levels[-1] - 1)
    else:
        # Debugging, just in case; execution should never reach this branch.
        print(prev)
        print(current)
    prev = current

df['meta'] = metas
dfs.append(df)

rikokset = pd.concat(dfs).iloc[:, :-1]
rikokset = rikokset.set_index('Alue')
def writeChemicalMatches(): all_list_names = extract_flows_for_chemical_matcher() if len(all_list_names) == 0: log.error('no local flows found, chemical matches can not be assessed, ' 'generate local inventories before continuing.') return # Determine whether to use the id or name to query SRS inventory_query_type = {"RCRAInfo": "list", "TRI": "list", "NEI": "list", "eGRID": "name", "DMR": "list", "GHGRP": "name"} # Create a df to store the results all_lists_srs_info = pd.DataFrame(columns=["FlowName", "SRS_ID", "SRS_CAS", "Source"]) errors_srs = pd.DataFrame(columns=["FlowName", "Source", "ErrorType"]) # Loop through sources, querying SRS by the query type defined for the # source, merge the results with the flows for that inventory. # Store errors in a separate dataframe sources = list(pd.unique(all_list_names['Source'])) for source in sources: log.info('accessing SRS for ' + source) # Get df with inventory flows inventory_flows = all_list_names[all_list_names['Source'] == source].reset_index(drop=True) if inventory_query_type[source] == 'list': # make sure flowid is a string inventory_flows['FlowID'] = inventory_flows['FlowID'].map(str) # query SRS to get entire list and then merge with it list_srs_info = get_SRSInfo_for_program_list(source) # merge this with the original list using FlowID list_srs_info = pd.merge(inventory_flows, list_srs_info, left_on='FlowID', right_on='PGM_ID', how='left') elif inventory_query_type[source] == 'name': # For names, query SRS one by one to get results list_srs_info = pd.DataFrame(columns=["FlowName", "SRS_ID", "SRS_CAS", "Source"]) errors_srs = pd.DataFrame(columns=["FlowName", "Source", "ErrorType"]) # Cycle through names one by one for index, row in inventory_flows.iterrows(): chemical_srs_info = pd.DataFrame(columns=["FlowName", "SRS_ID", "SRS_CAS", "Source"]) error_srs = pd.DataFrame(columns=["FlowName", "Source", "ErrorDescription"]) name = row["FlowName"] result = get_SRSInfo_for_substance_name(name) if isinstance(result, str): # This is an error error_srs.loc[0, 'FlowName'] = name #error_srs.loc[0, 'FlowID'] = id error_srs.loc[0, 'Source'] = source error_srs.loc[0, 'ErrorDescription'] = result else: chemical_srs_info = result chemical_srs_info.loc[0, "FlowName"] = name #chemical_srs_info.loc[0, "FlowID"] = name chemical_srs_info.loc[0, "Source"] = source errors_srs = pd.concat([errors_srs, error_srs], sort=False) list_srs_info = pd.concat([list_srs_info, chemical_srs_info], sort=False) all_lists_srs_info = pd.concat([all_lists_srs_info, list_srs_info], sort=False) # Remove waste code and PGM_ID all_lists_srs_info = all_lists_srs_info.drop(columns=['PGM_ID']) all_lists_srs_info = all_lists_srs_info.sort_values(['Source', 'FlowName', 'SRS_ID', 'FlowID']) # Add in manually found matches all_lists_srs_info = add_manual_matches(all_lists_srs_info) subset = ['FlowID', 'FlowName', 'Source'] # Write to csv all_lists_srs_info = all_lists_srs_info[['FlowID', 'FlowName', 'SRS_CAS', 'SRS_ID', 'Source']].drop_duplicates(subset) all_lists_srs_info.to_csv(OUTPUT_PATH .joinpath('ChemicalsByInventorywithSRS_IDS_forStEWI.csv'), index=False) #errors_srs.to_csv('work/ErrorsSRS.csv',index=False) # Write flows missing srs_ids to file for more inspection flows_missing_SRS_ID = all_lists_srs_info[all_lists_srs_info['SRS_ID'].isnull()] flows_missing_SRS_ID.to_csv(OUTPUT_PATH .joinpath('flows_missing_SRS_ID.csv'), index=False)
# PRE vs HIGH # data = data.loc[(data["Condition"] != "Low") & (data["Condition"] != "Post")] X = data.iloc[:, 15:] # PRE & POST vs LOW vs HIGH # y = (data["Condition"] == "Low").astype(int) + (data["Condition"] == "High").astype(int) * 2 # LOW vs HIGH or PRE vs HIGH y = (data["Condition"] == "High").astype(int) # PRE vs LOW # y = (data["Condition"] == "Low").astype(int) hashes = pd.unique(data["hash"]) # %% def cv_generator(): for hash in hashes: cond = data["hash"] == hash yield np.where(~cond)[0], np.where(cond)[0] parameters = { "svc__kernel": ["rbf"], "svc__gamma": [1000**i for i in np.linspace(-1, 1, 1000)], "svc__C": [1000**i for i in np.linspace(-1, 1, 1000)] }
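# --- Added sketch (assumption: the 'svc__*' keys refer to a Pipeline step named 'svc') ---
# One way the leave-one-recording-out generator above could feed a grid search; the
# estimator, scaler, and scoring choice are assumptions. Note that the grid defined
# above has 1000 x 1000 combinations, so a RandomizedSearchCV over the same ranges
# may be more practical.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

pipe = make_pipeline(StandardScaler(), SVC())  # make_pipeline names the final step 'svc'
search = GridSearchCV(pipe, parameters, cv=cv_generator(), scoring='accuracy', n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)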
# degreeTwo.py # Aakash Indurkhya import numpy as np from matplotlib import pyplot as plt import pandas as pd import itertools as itools import networkx as nx import sys import pickle data = pd.read_csv(sys.argv[1]) antenna_map = {} antenna_map_c = {} G = nx.Graph() antennas = list(pd.unique(data['antenna_id'])) for a in antennas: G.add_node(a) pairs = list(itools.combinations(antennas, 2)) for pair in pairs: # print pair id1, id2 = pair if id1 < id2: a = id1 b = id2 else: a = id2 b = id1 antenna_map[(a, b)] = 0 antenna_map_c[(a, b)] = 0
def get_column(self, features): return list(pd.unique(list(features.values())))
for district_assignment_col in district_column_names: agg_data = data.dissolve(by=district_assignment_col, aggfunc='sum') # ---- Geographic Metrics ------ # County Splits if cong: threshold_pop = cong_factor * cong_seat_pop else: total_pop = sum(data[pop_col]) threshold_pop = leg_factor * total_pop / num_dist num_splits = 0 lower_county_splits = 0 counties = list(filter(None, pd.unique(data[county_assignment_col]))) for county in counties: county_subset = data[data[county_assignment_col] == county] split_times = len(pd.unique(county_subset[district_assignment_col])) if split_times > 1: num_splits = num_splits + 1 county_pop = sum(county_subset[pop_col]) if county_pop > threshold_pop: lower_county_splits = lower_county_splits + 1 upper_county_splits = len(counties) county_splits_scores.append(num_splits) # --- Compactness -----
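# --- Added sketch (not from the original metrics script) ---
# The county-split count in the loop above can also be computed in one pass with a
# groupby; the column names here are illustrative and assume empty county labels
# have already been filtered out.
import pandas as pd

def count_county_splits(df, county_col, district_col):
    """Number of counties whose units fall in more than one district."""
    districts_per_county = df.groupby(county_col)[district_col].nunique()
    return int((districts_per_county > 1).sum())

toy = pd.DataFrame({'county': ['A', 'A', 'B', 'B', 'C'],
                    'district': [1, 1, 1, 2, 3]})
print(count_county_splits(toy, 'county', 'district'))  # county B is split, so 1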
# data = data[data['city'].isin(a)]

# a = ['Apartment','House','Townhouse','Condominium','Serviced apartment','Villa','Guesthouse','Guest suite','Bed and breakfast','Loft','Bungalow','Cottage']
# data = data[data['property_type'].isin(a)].reset_index(drop=True)

a = ['Entire home/apt', 'Private room']
data = data[data['room_type'].isin(a)].reset_index(drop=True)

a = ['strict_14_with_grace_period', 'moderate', 'flexible']
data = data[data['cancellation_policy'].isin(a)].reset_index(drop=True)

# a = ['Central Business District','Southbank','St Kilda','South Yarra','Docklands','Carlton','Richmond','Brunswick','Fitzroy','Collingwood','South Melbourne']
# data = data[data['neighborhood'].isin(a)].reset_index(drop=True)
# data = data.replace({'strict_14_with_grace_period': 'strict'})
# data = data.drop(columns = ['neighborhood','room_type','cancellation_policy'])

# one-hot encode each categorical feature, keeping the dummies only if a one-way ANOVA on price is significant
for c in category_features:
    one_hot = pd.get_dummies(data[c])
    # print(one_hot.shape[1])
    uniq = pd.unique(data[c])
    # print(c)
    # print(data[c].value_counts())
    # print(uniq)
    # print(len(uniq))
    F, p_value = stats.f_oneway(*(data[data[c] == u]['price'] for u in uniq))
    # print(c, F, p_value)
    data = data.drop(columns=c, axis=1)
    if p_value <= 0.05:
        # print(c)
        data = data.join(one_hot)

#######################################################################################################################
# calculate correlations between numerical variables and the target variable, to find the important ones
for c in numerical_features:
    corr = np.corrcoef(data['price'], data[c])
    print(c, corr[0, 1])
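# --- Added sketch (not from the original feature-selection code) ---
# The one-way ANOVA gate used in the loop above, shown on a tiny synthetic frame:
# the one-hot columns are kept only when price differs significantly across the
# category levels (the 5% threshold mirrors the check above).
import pandas as pd
from scipy import stats

toy = pd.DataFrame({'room_type': ['Entire home/apt'] * 4 + ['Private room'] * 4,
                    'price': [200, 220, 210, 230, 90, 100, 95, 105]})

groups = (toy.loc[toy['room_type'] == u, 'price'] for u in pd.unique(toy['room_type']))
F, p_value = stats.f_oneway(*groups)
print(round(F, 2), round(p_value, 4), p_value <= 0.05)  # significant, so keep the dummies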
for w in tokens: if w not in stop_words: filtered_string.append(w) stemmed_string = [] for w in filtered_string: stemmed_string.append(ps.stem(w)) return " ".join(stemmed_string) data = pd.read_csv("hack.csv") data = data[data["Tier"] == "Total"] columns = ["".join(process_str(i)) for i in list(data.columns)] states = [ps.stem(str(i)) for i in pd.unique(data["State"]).tolist()] districts = [ps.stem(str(i)) for i in pd.unique(data["District"]).tolist()] def analyze(string): try: our_str = process_str(string) val = max([[process.extractOne(our_str, i[0]), i[1]] for i in [[states, "State"], [districts, "District"]]], key=lambda x: x[0][1]) cat = val[1] if val[0][1] >= 85 else "Nation" if cat == "State": val = pd.unique(data["State"]).tolist()[states.index( val[0][0])] if val[0][1] >= 70 else "" elif cat == "District":
def print_overlap_of_algorithm(name, all_pairs_unsorted, co_changes_unsorted, include_class_level=True, include_package_level=True, calculate_chi_square=True, calculate_precede_values=True): print("--- Overlap ", name, " co-changes and smells: ---") all_pairs_unsorted = all_pairs_unsorted.drop(['file1Size', 'file2Size'], axis=1) co_changes_unsorted = co_changes_unsorted.drop(['startdate', 'enddate', 'Unnamed: 0'], axis=1) # Class level data all_pairs_no_package = all_pairs_unsorted.drop(['package1', 'package2'], axis=1) # This drops rows without both packages. May only be done for class-level analysis all_pairs_no_package = order_file1_and_file2(all_pairs_no_package) cc_pairs_no_package = co_changes_unsorted.drop(['package1', 'package2'], axis=1) # This drops rows without both packages. May only be done for class-level analysis cc_pairs_no_package = order_file1_and_file2(cc_pairs_no_package) class_smell_pairs_with_date = pd.DataFrame(columns=['file1', 'file2']) if include_class_level: class_smell_pairs_with_date = load_pickle("class_smell_pairs_with_date") if class_smell_pairs_with_date is None: class_smell_pairs_with_date = order_file1_and_file2(get_project_class_smells_in_range(calculate_precede_values)) # df: file1, file2 # Find file pairs that are part of the same class-level smell: class_smell_pairs_with_date = join_helper.perform_chunkified_pair_join(all_pairs_no_package, class_smell_pairs_with_date, level='file', compare_dates=False) save_pickle(class_smell_pairs_with_date, "class_smell_pairs_with_date") del all_pairs_no_package gc.collect() class_smell_pairs_with_date.info(verbose=False, memory_usage="deep") # Package level data all_pairs_unsorted.dropna(inplace=True) co_changes_unsorted.dropna(inplace=True) all_pairs_with_package = order_package1_and_package2(order_file1_and_file2(all_pairs_unsorted)) cc_pairs_with_package = order_package1_and_package2(order_file1_and_file2(co_changes_unsorted)) del all_pairs_unsorted del co_changes_unsorted gc.collect() package_smell_pairs_with_date = pd.DataFrame(columns=['file1', 'file2']) if include_package_level: package_smell_pairs_with_date = load_pickle("package_smell_pairs_with_date") if package_smell_pairs_with_date is None: package_smell_pairs_with_date = order_package1_and_package2(get_project_package_smells_in_range(calculate_precede_values)) # df: package1, package2 # We want to find file pairs whose package are part of the same smell: package_smell_pairs_with_date = join_helper.perform_chunkified_pair_join(all_pairs_with_package, package_smell_pairs_with_date, level='package', compare_dates=False) # Note: we are interested in (file1, file2) in package_smell_pairs save_pickle(package_smell_pairs_with_date, "package_smell_pairs_with_date") package_smell_pairs_with_date.info(verbose=False, memory_usage="deep") del all_pairs_with_package gc.collect() # Combine the pairs df_list = [class_smell_pairs_with_date, package_smell_pairs_with_date] smell_pairs_with_date = pd.concat(df_list) del class_smell_pairs_with_date del package_smell_pairs_with_date gc.collect() smell_pairs_with_date.info(verbose=False, memory_usage="deep") if include_class_level: # Overlapping pairs contains at least: file1, file2, parsedSmellFirstDate, parsedSmellLastDate, parsedStartDate, parsedEndDate overlapping_cc_smells = join_helper.perform_chunkified_pair_join(cc_pairs_no_package, smell_pairs_with_date) else: # Overlapping pairs contains at least: file1, file2, parsedSmellFirstDate, parsedSmellLastDate, parsedStartDate, parsedEndDate overlapping_cc_smells = 
join_helper.perform_chunkified_pair_join(cc_pairs_with_package, smell_pairs_with_date) overlapping_cc_smells.info(verbose=False, memory_usage="deep") del smell_pairs_with_date gc.collect() # RQ4: Are smells introduced before or after files start co-changing? if calculate_precede_values and len(overlapping_cc_smells) > 0: # Filter smells and co-changes which are already present at the start of the analysis. We are not sure what their real start date is. overlapping_cc_smells.drop(['parsedVersionDate', 'package1', 'package2'], axis=1, inplace=True) overlapping_cc_smells.info(verbose=False, memory_usage="deep") gc.collect() print("unfiltered:", len(overlapping_cc_smells)) overlapping_cc_smells = overlapping_cc_smells[overlapping_cc_smells['parsedSmellFirstDate'].dt.floor('d') != analysis_start_date.date()] gc.collect() print("after filtering smells: ", len(overlapping_cc_smells)) # Note: this counts joined rows overlapping_cc_smells = overlapping_cc_smells[overlapping_cc_smells['parsedStartDate'].dt.floor('d') != analysis_start_date.date()] gc.collect() print("filtered ccs: ", len(overlapping_cc_smells)) # Compare the two start dates and count which is earlier how often. Also count ties! # group by: file1, file2, smellId earlier_smell_rows = overlapping_cc_smells[ overlapping_cc_smells['parsedSmellFirstDate'].dt.floor('d') < overlapping_cc_smells[ 'parsedStartDate'].dt.floor('d')] earlier_smell_pairs = len(pd.unique(earlier_smell_rows[['file1', 'file2', 'uniqueSmellID']].values.ravel('K'))) add_result(project_name, name + "_earlier_smell_pairs", earlier_smell_pairs) del earlier_smell_rows gc.collect() earlier_ccs_rows = overlapping_cc_smells[ overlapping_cc_smells['parsedStartDate'].dt.floor('d') < overlapping_cc_smells[ 'parsedSmellFirstDate'].dt.floor('d')] earlier_ccs_pairs = len(pd.unique(earlier_ccs_rows[['file1', 'file2', 'uniqueSmellID']].values.ravel('K'))) add_result(project_name, name + "_earlier_ccs_pairs", earlier_ccs_pairs) del earlier_ccs_rows gc.collect() tied_rows = overlapping_cc_smells[ overlapping_cc_smells['parsedStartDate'].dt.floor('d') == overlapping_cc_smells[ 'parsedSmellFirstDate'].dt.floor('d')] tied_pairs = len(pd.unique(tied_rows[['file1', 'file2', 'uniqueSmellID']].values.ravel('K'))) add_result(project_name, name + "_tied_pairs", tied_pairs) elif calculate_precede_values and len(overlapping_cc_smells) == 0: add_result(project_name, name + "_earlier_smell_pairs", 0) add_result(project_name, name + "_earlier_ccs_pairs", 0) add_result(project_name, name + "_tied_pairs", 0)
s_cols = ['424'] for s_col in s_cols: temp[s_col] = temp[s_col].apply(lambda x : unit_transform_xt(str(x)) if pd.notnull(x) else x) temp[s_col] = temp[s_col].apply(lambda x : unit_transform_xt2(str(x)) if pd.notnull(x) else x) temp[s_col] = temp[s_col].apply(lambda x : unit_transform_xt3(str(x)) if pd.notnull(x) else x) for col in cols: if (np.array(temp[col]).dtype) == 'object': obj_list.append(col) try: temp[col] = temp[col].apply(lambda x : float(x) ) except: print (col) obj_list_4.append(col) print (pd.unique(temp[col])) dealcol=[1325, 425 , 437 ,3191 , 547 , 1321, 3203, 2233, 3485 , 30007 , 549, 424 ,459101 , 2229 ,901 ,1322 ,1326 ,3429 ,3430 , 459102 , 3194 ,3198 , 733, 212 , 2302] dealcol=[str(i) for i in dealcol] unhealth=temp[dealcol].select_dtypes(include=['object']) for i in unhealth.columns: print(i+'*****') print(unhealth[i].unique()) dropcol=['547', '2302', '733',] temp=temp.drop(dropcol,axis=1) num=data.select_dtypes(include=['float64']) result=pd.concat([temp, num], axis=1)
submit_df['msg_date'] = pd.to_datetime(submit_df['msg_date']) submit_date = submit_df['msg_date'].dt.weekday submit_date = submit_date.apply(convert_date) submit_date = submit_date.value_counts().reindex( ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']) channel_submit = submit_df.groupby('channel_name').size() channel_submit = channel_submit.reset_index().rename(columns={0: 'count'}) st.title("DataCracy Submit Dashboard") matplotlib.rcParams['font.family'] = "sans-serif" matplotlib.rcParams['font.sans-serif'] = "Open Sans" plt.rcParams['patch.edgecolor'] = 'black' select = list(pd.unique(submit_df['DataCracy_role'])) select.append('All Group') option = st.selectbox('Choose submit by group', sorted(select)) if option == 'Learner_Gr1': st.write(f'Group 1 submited {len_submit_1} assignments') st.write(f'Group 1 have {len_review_1} submit be reviewed') sizes_all = np.array([len_review, len_submit - len_review]) sizes_gr1 = np.array([len_review_1, len_submit_1 - len_review_1]) def func(pct, allvals): absolute = int(round(pct / 100. * np.sum(allvals))) return "{:.1f}%\n({:d})".format(pct, absolute) # sizes =[len_submit, len_submit-len_review]
def con3(query): sparql = SPARQLWrapper("http://dbtune.org/musicbrainz/sparql") print(query) construct_query = """ PREFIX mo: <http://purl.org/ontology/mo/> PREFIX mbz: <http://purl.org/ontology/mbz#> PREFIX owl: <http://www.w3.org/2002/07/owl#> PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> PREFIX bio: <http://purl.org/vocab/bio/0.1/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX tags: <http://www.holygoat.co.uk/owl/redwood/0.1/tags/> PREFIX geo: <http://www.geonames.org/ontology#> PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX lingvoj: <http://www.lingvoj.org/ontology#> PREFIX rel: <http://purl.org/vocab/relationship/> PREFIX vocab: <http://dbtune.org/musicbrainz/resource/vocab/> PREFIX event: <http://purl.org/NET/c4dm/event.owl#> PREFIX map: <file:/home/moustaki/work/motools/musicbrainz/d2r-server-0.4/mbz_mapping_raw.n3#> PREFIX db: <http://dbtune.org/musicbrainz/resource/> PREFIX foaf: <http://xmlns.com/foaf/0.1/> PREFIX dc: <http://purl.org/dc/elements/1.1/> SELECT DISTINCT ?v1 ?v2 ?v3 ?v4 WHERE { ?r rdfs:label \"""" construct_query = construct_query + query construct_query = construct_query + """\" . ?r dc:title ?v1 . ?r mo:track_number ?v2 . ?r foaf:maker ?v3 . ?v3 rdfs:label ?v4 } ORDER BY ?v2 ?v4 LIMIT 100""" sparql.setQuery(construct_query) sparql.setReturnFormat(JSON) a = sparql.query().convert() b = a["results"]["bindings"] Art = [] t_no = [] url = [] for m in b: Art.append(m["v4"]["value"]) t_no.append(m["v2"]["value"]) url.append(m["v3"]["value"]) d = [] d = pd.unique(Art) Albums = [] for i in range(len(d)): Temp = [] for j in range(len(Art)): if (Art[j] == d[i]): t = [] t.append(Art[j]) t.append(t_no[j]) t.append(url[j]) Temp.append(t) Albums.append(Temp) return Albums[0:19]