def process_read_by_user():
    """
    header is service,course_id,username,nid,id
    returns a dict with "username" -> forum entries count [float]
    """
    data = load_file("forum_readlist.csv")
    entries_data = load_file("forum_99_entries.csv")
    line_by_user = _lines_by_user(data, 2)
    course_id = "99"
    # keep only readlist lines belonging to course 99 (AlgoDat)
    new_line_by_user = {
        username: [line for line in lines if line[1] == course_id]
        for username, lines in line_by_user.items()
    }
    total_entries = len(entries_data.split("\n"))
    counts = _percent_count_by_user(new_line_by_user, total_entries)
    # counts = _counts_by_user(new_line_by_user)
    return counts
def load_classifier(classifier):
    """
    Load a classifier from local storage.

    :param classifier: String name of classifier.
    :return: classifier, vectorizer
    """
    clf_path = 'classifiers/{classifier}.clf'.format(classifier=classifier)
    loaded_classifier = load_file(clf_path)
    vectorizer = load_file('classifiers/vectorizer.vc')
    return loaded_classifier, vectorizer
def process_written_by_user():
    """
    header is service,course_id,user,nid,id,subject_length,text_length
    returns a dict with "username" -> (WrittenByUser, AvgSubjectLength, AvgTextLength)
    """
    USER_COL = 2
    SUBJ_COL = 5
    TEXT_COL = 6
    data = load_file("forum_99_entries.csv")
    line_by_user = _lines_by_user(data, USER_COL)
    counts = _percent_count_by_user(line_by_user)

    # "username" -> (WrittenByUser, AvgSubjectLength, AvgTextLength)
    result = {}
    for username in line_by_user:
        avg_subj = calculate_avg_length(
            collect_col_as_int(line_by_user, username, SUBJ_COL))
        avg_text = calculate_avg_length(
            collect_col_as_int(line_by_user, username, TEXT_COL))
        result[username] = (counts[username], avg_subj, avg_text)
    return result
def main():
    """Split customer order histories into X/Y sets and persist them."""
    parser = argparse.ArgumentParser()
    parser.add_argument("customers_file", nargs='?',
                        default="interim/customers-100.txt")
    parser.add_argument("-x", "--x-orders-out", nargs='?',
                        default="x-orders.pkl")
    parser.add_argument("-y", "--y-out", nargs='?', default="y-list.csv")
    parser.add_argument("-c", "--customers-out", nargs='?',
                        default="customers-split-used.out")
    parser.add_argument("-s", "--splitter", choices=['time', 'percent'],
                        default='percent')
    args = parser.parse_args()

    # lazily parse one customer id per line of the input file
    cst = (int(customer_id)
           for customer_id in common.load_file(args.customers_file))

    if args.splitter == 'percent':
        splitter = SplitXYByPer()
        min_x, min_y = 1, 1
    elif args.splitter == 'time':
        splitter = SplitXYByTime('2013-06-00')
        min_x, min_y = 1, 3

    X, Y, c_ids = get_split(splitter, cst, min_x, min_y)
    save(X, args.x_orders_out, Y, args.y_out, c_ids, args.customers_out)
def load_vectorizer():
    """
    Get the vectorizer from file.

    :return: vectorizer
    """
    vectorizer = load_file('classifiers/vectorizer.vc')
    return vectorizer
def read_original_log(filename):
    """Parse a ';'-separated log file, skipping the header line.

    Rows whose first field is the literal "''" get DEFAULT_DATETIME
    substituted for the missing timestamp.
    """
    raw_lines = load_file(filename).split("\n")[1:]
    result = []
    for row in (line.split(";") for line in raw_lines if line != ''):
        if row[0] == "''":
            row[0] = DEFAULT_DATETIME
        result.append(row)
    return result
def load(probas_filename):
    """Load per-customer probability dicts from a pipe-delimited file.

    Each non-blank line has the form ``customer|{json probabilities}``.

    :param probas_filename: path handed to common.load_file
    :return: dict mapping customer id (str) -> parsed probabilities
    """
    cust_probas = {}
    for line in common.load_file(probas_filename):
        if line.strip() == "":
            continue
        # Split only on the FIRST '|': the JSON payload may itself contain
        # '|' characters, and a plain split() would then yield more than
        # two parts and crash the 2-tuple unpack with a ValueError.
        customer, probas = line.split("|", 1)
        cust_probas[customer] = json.loads(probas)
    return cust_probas
def load_classifiers():
    """
    Load classifiers from file.

    :return: classifiers
    """
    return [
        load_file('classifiers/{classifier}.clf'.format(classifier=name))
        for name in c_name
    ]
def process_success_rate_by_user():
    """
    SuccessRate = count(SUCCESS) / (count(FAILURE) + count(ERROR))
    header is service,course_id,group_id,person_id,topic_id,date,success,warning,failure,error,info,timeout
    returns a dict with "username" -> success rate [float]
    """
    # NOTE(review): the indices below (12/14/15) do not match the header
    # listed above (success would be column 6) — presumably the real CSV
    # has additional columns; verify against the actual file.
    USER_COL = 3
    SUCCESS_COL = 12
    FAILURE_COL = 14
    ERROR_COL = 15

    lines_per_user = _lines_by_user(
        load_file("code_unittest_results.csv"), USER_COL)
    return calculate_success_rates(
        lines_per_user, SUCCESS_COL, [FAILURE_COL, ERROR_COL])
def main():
    """Aggregate per-user app/category statistics from CSV logs.

    argv: inputdir outputfile valid_user_file topdomainfile
          seconddomainfile blacklistdomainfile
    Reads every ``inputdir/*/*.csv`` whose uid is in the valid-user list,
    computes statistics via app_and_category_statistic, and appends the
    result to outputfile.
    """
    import sys
    import glob
    import time
    import os
    import redis

    start = time.time()
    inputdir = sys.argv[1]
    outputfile = sys.argv[2]
    valid_user_file = sys.argv[3]
    topdomainfile = sys.argv[4]
    seconddomainfile = sys.argv[5]
    blacklistdomainfile = sys.argv[6]

    def _first_col_set(path):
        # First comma-separated field of every line; 'with' guarantees the
        # handle is closed (the original leaked all four file handles).
        with open(path) as fh:
            return set(line.strip().split(',')[0] for line in fh)

    topdomain_set = _first_col_set(topdomainfile)
    seconddomain_set = _first_col_set(seconddomainfile)
    blacklistdomain_set = _first_col_set(blacklistdomainfile)
    with open(valid_user_file) as fh:
        valid_users = set(line.strip() for line in fh)

    split_func = split_by_5_minute
    get_domain_func = generate_get_right_domain(
        topdomain_set, seconddomain_set, blacklistdomain_set)
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    csvfiles = glob.glob(os.path.join(inputdir, '*', '*.csv'))
    n = len(csvfiles)
    # 'with' closes the output even when a CSV file raises mid-loop
    with open(outputfile, 'w') as outputobj:
        for idx, csvfile in enumerate(csvfiles):
            uid = get_uid(csvfile)
            if uid not in valid_users:
                continue
            f = load_file(r, csvfile)
            try:
                counter = app_and_category_statistic(
                    f, get_domain_func, split_func)
            finally:
                f.close()
            save_to_csv(outputobj, uid, counter)
            logging.info('[%d/%d]' % (idx + 1, n))
    logging.info('finish with time %s', str(time.time() - start))
def main():
    """Collect order records for every customer id listed in a file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("customers_file", nargs='?',
                        default="interim/customers-100.txt")
    parser.add_argument("-o", "--orders-out", default='all-orders.pkl')
    parser.add_argument("-c", "--customers-out", nargs='?',
                        default="customers-all-used.out")
    args = parser.parse_args()

    customer_ids = (int(cid) for cid in common.load_file(args.customers_file))

    all_records = []
    used_ids = []
    for cid in customer_ids:
        records = get_records(cid)
        # skip customers that have no records at all
        if records:
            all_records.append(records)
            used_ids.append(cid)

    save_records(all_records, args.orders_out)
    common.save_csv_i(args.customers_out, ([cid] for cid in used_ids))
def text(self, info):
    """Render the message file *info* onto self.screen: first line styled
    as a title, remaining lines as body text below it.

    NOTE: uses Python 2 ``unicode()`` to decode the UTF-8 message bytes.
    """
    messages = common.load_file(info)

    # Title
    title_font = pygame.font.SysFont(constants.font_title[0],
                                     constants.font_title[1])
    title_surface = title_font.render(
        unicode(messages[0], 'utf-8'), True, constants.font_title_color)
    self.screen.blit(title_surface, (20, 20))

    # Body lines start below the title, spaced by the body font's linesize
    body_font = pygame.font.SysFont(constants.font_default[0],
                                    constants.font_default[1])
    line_height = body_font.get_linesize()
    y = 35 + title_font.get_linesize()
    for message in messages[1:]:
        body_surface = body_font.render(
            unicode(message, 'utf-8'), True, constants.font_default_color)
        self.screen.blit(body_surface, (20, y))
        y += line_height
def obtain_best_classifier_in_folder(
        directory: Path, seed: int) -> List[Tuple[Any, float, Path]]:
    """Find the best classifier for every CSV file in *directory*.

    :param directory: folder scanned (non-recursively) for ``.csv`` files
    :param seed: random seed forwarded to get_best_classifier
    :return: list of (classifier, auc, file) tuples, one per CSV file
    """
    procs = get_config("INIT", "procs")
    # fall back to half the cores when the config value is blank/non-numeric
    if procs.isspace() or not procs.isnumeric():
        procs = math.floor(multiprocessing.cpu_count() / 2)
    else:
        procs = int(procs)

    files = [x for x in directory.iterdir() if x.suffix == ".csv"]
    classifiers = create_classifiers()
    result = []
    # The context manager tears the pool down even if processing a file
    # raises — the original only reached pool.close() on full success,
    # leaking the worker processes on any exception.
    with multiprocessing.Pool(procs) as pool:
        for file in files:
            df = clean_dataset(load_file(file))
            start = datetime.now()
            classifier, auc = get_best_classifier(df, classifiers, seed, pool)
            end = datetime.now()
            result.append((classifier, auc, file))
            print(
                f"Finished file {fg.blue}{file}{fg.rs}, took {format_time_difference(start.timestamp(), end.timestamp())}"
            )
    return result
def read_virtual_events(filename):
    """
    output: [datetime, label]
    """
    raw_lines = load_file(filename).split("\n")
    return [line.split(";") for line in raw_lines if line != '']