def get_infinite(tag_id, get_time):
    summed_month = []
    year_list = []
    dframe = load_data(tag_id)
    df1 = datetime.datetime.strptime(get_time + ' 19:00:00', "%Y-%m-%d %H:%M:%S")
    # sum weekly totals for every month of the requested year
    k = 1
    while k != 13:
        weeks_list = []
        df1 = time_tango(str(df1.year) + "-" + str(k) + "-" + "01")
        # week data for the 1st of the month (the non-Monday partial week)
        non_monday_weeks = json.loads(new_week(tag_id, str(df1.date())))
        b = [row[1] for row in non_monday_weeks]
        weeks_list.append(sum(b))
        # full weeks starting on each Monday of the month
        for i in allmondays(df1.year, df1.month):
            w = json.loads(new_week(tag_id, str(i)))
            lst2 = [item[1] for item in w]
            weeks_list.append(sum(lst2))
        summed_month.append(sum(weeks_list))
        k += 1
    year_list.append(sum(summed_month))
    return json.dumps(util.parse_data(year_list, tag_id))
def get_aggregated_day(tag_id, get_time):
    new_time = []
    df = datetime.datetime.strptime(get_time + ' 00:00:00', "%Y-%m-%d %H:%M:%S")
    # build 25 hourly boundaries (epoch milliseconds) covering the day
    for i in range(25):
        new_time.append(
            int(time.mktime(utc_return_nowtime(df).timetuple()) * 1000))
        df += datetime.timedelta(hours=1)
    # query the 24 hourly aggregates in parallel
    dt = util.parse_data_reon(
        Parallel(n_jobs=15, backend="threading")(
            delayed(qb.query_aggregated_func)(
                tag_id, new_time[j], new_time[j + 1], 's', 15, 'avg')
            for j in range(24)))
    dt = ast.literal_eval(dt)
    dft = []
    dt1 = []
    dt_final = []
    for i, v in dt.iteritems():
        dft = v['results']
        dft = [x for x in dft if x != []]
        # keep only the samples that fall exactly on the hour (minute == '00')
        for i in range(0, len(dft)):
            dt2 = dft[i][0]
            dt1.append(
                (datetime.datetime.fromtimestamp(dt2 / 1000)).strftime('%M'))
            if dt1[i] == '00':
                dt_final.append([dft[i][0], dft[i][1]])
    return json.dumps(util.parse_data(dt_final, tag_id))
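# A minimal usage sketch (the tag id and date below are placeholder values, not
# identifiers from this code base). get_aggregated_day returns a JSON string, so
# a caller would decode it to recover the on-the-hour [timestamp, value] pairs
# that util.parse_data packed for the tag.
def example_day_readings():
    """Hypothetical caller: fetch one day of hourly readings for a tag."""
    return json.loads(get_aggregated_day('TAG-001', '2018-06-15'))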
def validate(data, trace=False):
    items = {}
    for (li, tvdb_id, sep, alias_list, line) in parse_data(data):
        # Check line ends with a ','
        if not line.strip().endswith(','):
            if trace:
                print "line does not end with ','"
                print "=================="
                print_error(li, line)
            return False
        # Check tvdb_id is a valid integer
        if not validate_tvdb_id(tvdb_id, trace=trace):
            if trace:
                print_error(li, line)
            return False
        # Check for duplicates
        if tvdb_id in items:
            if trace:
                print 'item already exists'
                print 'items[' + str(tvdb_id) + '] = ' + str(alias_list)
                print '=================='
                print_error(li, line)
            return False
        items[tvdb_id] = alias_list
    if trace:
        print 'valid'
    return True
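# A minimal sketch of driving validate() from a file on disk. It assumes the
# get_text helper that merge() and remove_duplicates() rely on returns the file
# contents as a single string; the wrapper itself is hypothetical.
def validate_file(path, trace=True):
    """Hypothetical convenience wrapper: validate the alias data stored at path."""
    return validate(get_text(path), trace=trace)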
def get_aggregated_year(tag_id, get_time):
    summed_month = []
    dframe = load_data(tag_id)
    # sum weekly totals for each month of the year containing get_time
    for k in range(1, 13):
        weeks_list = []
        df1 = datetime.datetime.strptime(get_time + ' 19:00:00', "%Y-%m-%d %H:%M:%S")
        df1 = time_tango(str(df1.year) + "-" + str(k) + "-" + "01")
        # week data for the 1st of the month (the non-Monday partial week)
        non_monday_weeks = json.loads(new_week(tag_id, str(df1.date())))
        b = [row[1] for row in non_monday_weeks]
        weeks_list.append(sum(b))
        # full weeks starting on each Monday of the month
        for i in allmondays(df1.year, df1.month):
            week = json.loads(new_week(tag_id, str(i)))
            lst2 = [item[1] for item in week]
            weeks_list.append(sum(lst2))
        summed_month.append(sum(weeks_list))
    return json.dumps(util.parse_data(summed_month, tag_id))
def main():
    # read in the data set
    subdir = 'data/original_tags/'
    fname = 'dataset.csv'
    data = util.parse_data(subdir, fname, extract_features=True)

    # randomize the data cases
    random.shuffle(data)

    # split into training and testing data
    slice = math.trunc(len(data) * (.8))  # 80% train, 20% test
    train_data = data[:slice]
    test_data = data[slice:]

    # instantiating classifier
    ovr = OneVsRestClassifier()
    a = Analyzer(subdir, fname)

    # printing dataset statistics
    print 'Dataset Statistics\n---'
    print 'Total Tokens:', a.total_tokens()
    print 'Total Types:', a.total_types()
    print 'Total Label Types:', a.total_label_types()
    print 'Average number of tags per sample:', a.mean_tag_set_size()
    print

    # printing OVR training specific statistics
    print 'Training Statistics\n---'
    ovr.fit(train_data, threshold=200, print_stats=True)

    total_hamming_error = ovr.total_hamming_error(test_data)
    total_recall_error = ovr.total_recall_error(test_data)
    total_precision_error = ovr.total_precision_error(test_data)
    test_size = len(test_data)

    print
    print 'Model Accuracy\n---'
    print 'Total Hamming Error:', total_hamming_error
    print 'Mean Hamming Error:', total_hamming_error / test_size
    print 'Total Recall Error:', total_recall_error
    print 'Mean Recall Error:', total_recall_error / test_size
    print 'Total Precision Error:', total_precision_error
    print 'Mean Precision Error:', total_precision_error / test_size
    print

    # An example
    sample_str = 'How many numbers less than 70 are relatively prime to it?'
    sample = util.features(sample_str)
    gold_y = ovr.transform(['combinatorics', 'number-theory'])
    print 'An Example\n---'
    print sample_str
    pred_y = ovr.predict(sample)
    print 'Prediction:', ovr.inverse_transform(pred_y)
    print 'Actual:', ovr.inverse_transform(gold_y)
    print 'Hamming Error:', util.hamming_error(gold_y, pred_y)
    print 'Recall Error:', util.recall_error(gold_y, pred_y)
    print 'Precision Error:', util.precision_error(gold_y, pred_y)
def __init__(self, subdir, fname):
    # read in the stopwords file, stripping the trailing '\n' from each word
    stop_words_file = open(os.path.join('data/', 'stop_words'))
    self.stop_words = [line.rstrip('\n') for line in stop_words_file]
    self.data = util.parse_data(subdir, fname)
    # extract the unique labels present in the data
    self.label_set = self.get_label_set()
def main():
    # read in the data set
    subdir = 'data/original_tags/'
    fname = 'dataset.csv'
    data = util.parse_data(subdir, fname, extract_features=True)

    # grab number of trials from the cmd line input
    num_trials = int(sys.argv[1])

    # set up dictionary to hold results
    trial_results = {}
    trial_results['sum_total_hamming'] = 0
    trial_results['sum_total_precision'] = 0
    trial_results['sum_total_recall'] = 0
    trial_results['sum_mean_hamming'] = 0
    trial_results['sum_mean_precision'] = 0
    trial_results['sum_mean_recall'] = 0

    print 'Trial ',
    # run the trials
    for i in range(num_trials):
        print(i + 1),

        # randomize the data cases
        random.shuffle(data)

        # split into training and testing data
        slice = math.trunc(len(data) * (.8))  # 80% train, 20% test
        train_set = data[:slice]
        test_set = data[slice:]

        # train a new classifier
        ovr = OneVsRestClassifier()
        ovr.fit(train_set)

        # determine total error for each metric
        total_hamming_error = ovr.total_hamming_error(test_set)
        total_precision_error = ovr.total_precision_error(test_set)
        total_recall_error = ovr.total_recall_error(test_set)
        n = len(test_set)

        # update relevant error entries in the dictionary
        trial_results['sum_total_hamming'] += total_hamming_error
        trial_results['sum_total_precision'] += total_precision_error
        trial_results['sum_total_recall'] += total_recall_error
        trial_results['sum_mean_hamming'] += (total_hamming_error / n)
        trial_results['sum_mean_precision'] += (total_precision_error / n)
        trial_results['sum_mean_recall'] += (total_recall_error / n)

    # print the results
    print '\n---'
    print 'Number of trials:', num_trials
    for metric, value in trial_results.items():
        print metric, value
        print 'Average:', value / num_trials
        print
def get_parsed_data(features):
    parsed_features = []
    # parse each feature line
    for feature in features:
        parsed_data = parse_data(str(feature))
        # skip lines that failed to parse
        if parsed_data is not None:
            parsed_features.append(parsed_data)
    final_features = remove_inconsistencies(parsed_features)
    return final_features
def get_features(folderpath):
    features = []
    fileData = read_file(folderpath)
    # parse each line of the file
    for dataVar in fileData:
        parsed_data = parse_data(dataVar)
        # skip lines that failed to parse
        if parsed_data is not None:
            features.append(parsed_data)
    # impute missing data and drop inconsistent rows
    final_features = remove_inconsistencies(features)
    return final_features
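# A small usage sketch; the folder path below is a placeholder. get_features
# returns the parsed rows with inconsistent entries removed, ready for whatever
# model consumes them downstream.
def example_load_features():
    """Hypothetical caller: load and clean features from a data folder."""
    rows = get_features('data/raw/')
    print('parsed %d feature rows' % len(rows))
    return rows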
def merge(localPath, remotePath):
    items = {}
    key_order = []
    changes = {}
    for path in [localPath, remotePath]:
        for (li, tvdb_id, sep, alias_list, line) in parse_data(get_text(path)):
            if not validate_tvdb_id(tvdb_id):
                continue
            if tvdb_id not in items:
                items[tvdb_id] = []
                key_order.append(tvdb_id)
            for alias in alias_list:
                alias = alias.strip().replace("'", "\\'")
                if not find_match(alias, items[tvdb_id]):
                    items[tvdb_id].append(alias)
                    # track remote changes
                    if path == remotePath:
                        if tvdb_id not in changes:
                            changes[tvdb_id] = []
                        changes[tvdb_id].append(alias)
    print "----------------------------------------------------------"
    print "New Shows"
    print "----------------------------------------------------------"
    for ck, added in changes.items():
        if items[ck] == added:
            print str(ck) + '\tnew\t\t' + str(added)
    print "----------------------------------------------------------"
    print "New Aliases"
    print "----------------------------------------------------------"
    for ck, added in changes.items():
        if items[ck] != added:
            print str(ck) + '\tadd\t\t' + str(added)
            print '=============\t', items[ck]
            print
    return dict_to_data(items, key_order)
def remove_duplicates(path):
    items = {}
    key_order = []
    for (li, tvdb_id, sep, alias_list, line) in parse_data(get_text(path)):
        if not validate_tvdb_id(tvdb_id):
            continue
        if tvdb_id not in items:
            items[tvdb_id] = []
            key_order.append(tvdb_id)
        for alias in alias_list:
            alias = alias.strip().replace("'", "\\'")
            if not find_match(alias, items[tvdb_id]):
                items[tvdb_id].append(alias)
    return dict_to_data(items, key_order)
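# A short sketch of applying remove_duplicates in place. It assumes dict_to_data
# (used above) returns the serialised alias text, so the result can simply be
# written back over the original file; the helper itself is hypothetical.
def dedupe_file(path):
    """Hypothetical helper: rewrite the alias file at path with duplicates removed."""
    cleaned = remove_duplicates(path)
    with open(path, 'w') as f:
        f.write(cleaned)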
def get_aggregated_week(tag_id, get_time):
    new_time = []
    new_list = []
    dframe = load_data(tag_id)
    pars = dateutil.parser.parse(get_time).date()
    df = time_tango(pars)
    # rewind to the Monday of the requested week
    week_day = df.weekday()
    for k in range(week_day):
        df -= datetime.timedelta(days=1)
    # take one reading (the last non-zero entry) for each of the 8 day boundaries
    for i in range(8):
        try:
            if df.date() <= datetime.datetime.now().date():
                datevalues = dframe.Values[dframe.Date == df.date()].tolist()
                datevalues = [x for x in datevalues if x != 0]
                datevalues = datevalues[::-1]
                new_list.append(datevalues[0])
            else:
                new_list.append(0)
        except:
            new_list.append(0)
        df += datetime.timedelta(days=1)
    df = datetime.datetime.strptime(get_time + ' 19:00:00', "%Y-%m-%d %H:%M:%S")
    df = df - datetime.timedelta(days=df.weekday())
    # each day's value is the difference between consecutive day readings
    for i in range(len(new_list) - 1):
        mid = new_list[i + 1] - new_list[i]
        try:
            if df.date() == datetime.datetime.now().date():
                mid = datevalues[-1] - datevalues[0]
        except:
            mid = 0
        if mid < 0 or new_list[i] == 0:
            new_time.append(0)
        else:
            new_time.append(mid)
        df += datetime.timedelta(days=1)
    return json.dumps(util.parse_data(new_time, tag_id))
    R_gt_44 = np.matmul(R_tgt_first, np.linalg.inv(R_src_first))
    R_gt = R_gt_44[:3, :3]
else:
    R_src_first = R_src
    R_tgt_first = R_tgt
    R_gt_44 = np.matmul(R_tgt_first, np.linalg.inv(R_src_first))
    R_gt = R_gt_44[:3, :3]

# generate source/target scans, point cloud
depth_src, depth_tgt, _, _, color_src, color_tgt, pc_src, pc_tgt = util.parse_data(
    depth, rgb, norm, args.dataset, args.method)
if len(pc_src) == 0 or len(pc_tgt) == 0:
    print("this point cloud file contains no points")
    continue
if args.old_scannet:
    overlap_val, cam_dist_this, pc_dist_this, pc_nn = util.point_cloud_overlap(
        pc_src, pc_tgt, R_gt_44)
    overlap = '0-0.1' if overlap_val <= 0.1 else '0.1-0.5' if overlap_val <= 0.5 else '0.5-1.0'
    data_s = {'rgb': rgb[0, 0, :, :, :].transpose(1, 2, 0),
# use original-size scans for baselines on the scannet dataset
if 'scannet' in args.dataList and 'ours' not in args.method:
    rgb, depth = data['rgb_full'], data['depth_full']
R = torch_op.npy(R)
rgb = torch_op.npy(rgb * 255).clip(0, 255).astype('uint8')
norm = torch_op.npy(norm)
depth = torch_op.npy(depth)
segm = torch_op.npy(segm)
R_src = R[0, 0, :, :]
R_tgt = R[0, 1, :, :]
R_gt_44 = np.matmul(R_tgt, np.linalg.inv(R_src))
R_gt = R_gt_44[:3, :3]

# generate source/target scans, point cloud
depth_src, depth_tgt, normal_src, normal_tgt, color_src, color_tgt, pc_src, pc_tgt = util.parse_data(
    depth, rgb, norm, args.dataList, args.method)
if len(pc_src) == 0 or len(pc_tgt) == 0:
    print("this point cloud file contains no points")
    continue

# compute overlap and other stats
overlap_val, cam_dist_this, pc_dist_this, pc_nn = util.point_cloud_overlap(
    pc_src, pc_tgt, R_gt_44)
overlap = '0-0.1' if overlap_val <= 0.1 else '0.1-0.5' if overlap_val <= 0.5 else '0.5-1.0'

# do not test non-overlapping pairs with traditional methods since that makes no sense
if args.method in ['fgs', 'gs', 'super4pcs', 'cgs'] and overlap_val < 0.1:
    continue
import sys
import os
import math
import random

import util
from keyword_frequency_classifier import KeywordFrequencyClassifier
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from nltk import NaiveBayesClassifier

# analysis of the single label classification models
if __name__ == "__main__":
    # read in the data set
    subdir = 'data/single_tags/'
    fname = 'dataset.csv'
    data = util.parse_data(subdir, fname, single_label=True, extract_features=True)

    # randomize the data cases
    random.shuffle(data)

    # split into training and testing data
    slice = math.trunc(len(data) * (.8))  # 80% train, 20% test
    train_set = data[:slice]
    test_set = data[slice:]

    # train classification models
    print 'Training models on', len(train_set), 'data samples...'
    nb = NaiveBayesClassifier.train(train_set)
    lr = SklearnClassifier(LogisticRegression()).train(train_set)
    kwfc = KeywordFrequencyClassifier()
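    # A possible evaluation step (a sketch, not part of the original script):
    # nltk.classify.accuracy works with both the NLTK NaiveBayesClassifier and
    # the SklearnClassifier wrapper, and test_set holds the same (featureset,
    # label) pairs as train_set. The KeywordFrequencyClassifier is left out here
    # because its training/evaluation interface is not shown above.
    from nltk.classify import accuracy
    print 'Naive Bayes accuracy:', accuracy(nb, test_set)
    print 'Logistic Regression accuracy:', accuracy(lr, test_set)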