Example #1
0
def process_read_by_user():
    """
    header is

        service,course_id,username,nid,id

    returns a dict with

        "username" -> forum entries count [float]

    """
    readlist = load_file("forum_readlist.csv")
    entries_data = load_file("forum_99_entries.csv")
    line_by_user = _lines_by_user(readlist, 2)

    course_id = "99"
    # keep only the lines belonging to course_id 99 (AlgoDat)
    filtered_by_user = {
        username: [line for line in lines if line[1] == course_id]
        for username, lines in line_by_user.items()
    }

    # number of rows (incl. header/trailing blank) in the entries file
    total_entries = len(entries_data.split("\n"))

    return _percent_count_by_user(filtered_by_user, total_entries)
Example #2
0
def load_classifier(classifier):
    """
    Load a classifier from local storage.

    :param classifier: String name of classifier.
    :return: classifier, vectorizer
    """
    clf = load_file(
        'classifiers/{classifier}.clf'.format(classifier=classifier))
    vec = load_file('classifiers/vectorizer.vc')
    return clf, vec
Example #3
0
def process_written_by_user():
    """
    header is

        service,course_id,user,nid,id,subject_length,text_length

    returns a dict with

        "username" -> (WrittenByUser, AvgSubjectLength, AvgTextLength)
    """
    USER_COL = 2
    SUBJ_COL = 5
    TEXT_COL = 6

    data = load_file("forum_99_entries.csv")
    line_by_user = _lines_by_user(data, USER_COL)
    counts = _percent_count_by_user(line_by_user)

    # "username" -> (WrittenByUser, AvgSubjectLength, AvgTextLength)
    result = {}
    for user in line_by_user:
        avg_subj = calculate_avg_length(
            collect_col_as_int(line_by_user, user, SUBJ_COL))
        avg_text = calculate_avg_length(
            collect_col_as_int(line_by_user, user, TEXT_COL))
        result[user] = (counts[user], avg_subj, avg_text)

    return result
Example #4
0
def main():
    """Split customer orders into X/Y sets with the chosen splitter and save them."""
    parser = argparse.ArgumentParser()
    parser.add_argument("customers_file", nargs='?', default="interim/customers-100.txt")
    parser.add_argument("-x", "--x-orders-out", nargs='?', default="x-orders.pkl")
    parser.add_argument("-y", "--y-out", nargs='?', default="y-list.csv")
    parser.add_argument("-c", "--customers-out", nargs='?', default="customers-split-used.out")
    parser.add_argument("-s", "--splitter", choices=['time','percent'], default='percent')
    args = parser.parse_args()

    # lazily yield customer ids from the input file
    customer_ids = (int(cid) for cid in common.load_file(args.customers_file))

    # argparse's `choices` guarantees one of these two branches is taken
    if args.splitter == 'percent':
        splitter, min_x, min_y = SplitXYByPer(), 1, 1
    elif args.splitter == 'time':
        splitter, min_x, min_y = SplitXYByTime('2013-06-00'), 1, 3

    X, Y, c_ids = get_split(splitter, customer_ids, min_x, min_y)
    save(X, args.x_orders_out, Y, args.y_out, c_ids, args.customers_out)
Example #5
0
def load_vectorizer():
    """
    Get the vectorizer from file.

    :return: vectorizer
    """
    vectorizer = load_file('classifiers/vectorizer.vc')
    return vectorizer
Example #6
0
def read_original_log(filename):
    """Parse a ';'-separated log file (header skipped); rows whose first
    field is the literal "''" get DEFAULT_DATETIME substituted."""
    result = []
    for raw in load_file(filename).split("\n")[1:]:
        if raw == '':
            continue
        fields = raw.split(";")
        if fields[0] == "''":
            fields[0] = DEFAULT_DATETIME
        result.append(fields)
    return result
Example #7
0
def load(probas_filename):
    """Read 'customer|json' lines and return a dict customer -> parsed probas."""
    cust_probas = {}
    for raw in common.load_file(probas_filename):
        if not raw.strip():
            continue
        customer, probas_json = raw.split("|")
        cust_probas[customer] = json.loads(probas_json)
    return cust_probas
Example #8
0
def load_classifiers():
    """
    Load classifiers from file.
    :return: classifiers
    """
    # one classifier per name in the module-level c_name list
    return [
        load_file('classifiers/{classifier}.clf'.format(classifier=name))
        for name in c_name
    ]
Example #9
0
def process_success_rate_by_user():
    """
    SuccessRate = count(SUCCESS) / (count(FAILURE) + count(ERROR))

    header is

        service,course_id,group_id,person_id,topic_id,date,success,warning,failure,error,info,timeout

    returns a dict with

        "username" -> success rate [float]
    """
    USER_COL = 3
    # NOTE(review): these indices exceed the 12 fields listed in the header
    # above — confirm against the actual CSV layout.
    SUCCESS_COL = 12
    FAILURE_COL = 14
    ERROR_COL = 15

    line_by_user = _lines_by_user(load_file("code_unittest_results.csv"), USER_COL)
    return calculate_success_rates(line_by_user, SUCCESS_COL, [FAILURE_COL, ERROR_COL])
def main():
    """Aggregate per-user app/category statistics from CSV logs.

    Command line: inputdir outputfile valid_user_file topdomainfile
    seconddomainfile blacklistdomainfile.

    Fix: the original opened the four lookup files and the output file with
    bare open() and never closed the inputs (and leaked the output on error);
    all file handles now use ``with`` so they are always released.
    """
    import sys
    import glob
    import time
    import os
    import redis

    start = time.time()

    inputdir = sys.argv[1]
    outputfile = sys.argv[2]
    valid_user_file = sys.argv[3]
    topdomainfile = sys.argv[4]
    seconddomainfile = sys.argv[5]
    blacklistdomainfile = sys.argv[6]

    def first_columns(path):
        # set of the first comma-separated field of every line
        with open(path) as fh:
            return {line.strip().split(',')[0] for line in fh}

    topdomain_set = first_columns(topdomainfile)
    seconddomain_set = first_columns(seconddomainfile)
    blacklistdomain_set = first_columns(blacklistdomainfile)

    with open(valid_user_file) as fh:
        valid_users = {line.strip() for line in fh}
    split_func = split_by_5_minute
    get_domain_func = generate_get_right_domain(topdomain_set, seconddomain_set, blacklistdomain_set)

    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    csvfiles = glob.glob(os.path.join(inputdir, '*', '*.csv'))
    n = len(csvfiles)

    with open(outputfile, 'w') as outputobj:
        for idx, csvfile in enumerate(csvfiles):
            uid = get_uid(csvfile)
            if uid not in valid_users:
                continue
            f = load_file(r, csvfile)
            counter = app_and_category_statistic(f, get_domain_func, split_func)
            f.close()
            save_to_csv(outputobj, uid, counter)
            logging.info('[%d/%d]' % (idx + 1, n))

    logging.info('finish with time %s', str(time.time() - start))
Example #11
0
def main():
    """Collect order records for each customer id and save records + used ids."""
    parser = argparse.ArgumentParser()
    parser.add_argument("customers_file", nargs='?', default="interim/customers-100.txt")
    parser.add_argument("-o", "--orders-out", default='all-orders.pkl')
    parser.add_argument("-c", "--customers-out", nargs='?', default="customers-all-used.out")
    args = parser.parse_args()

    records = []
    used_ids = []
    for cid in (int(c) for c in common.load_file(args.customers_file)):
        cr = get_records(cid)
        # skip customers with no records
        if not cr:
            continue
        records.append(cr)
        used_ids.append(cid)

    save_records(records, args.orders_out)
    common.save_csv_i(args.customers_out, ([cid] for cid in used_ids))
Example #12
0
    def text(self, info):
        """Render a message file onto the screen: first line as the title,
        remaining lines as body text below it. (Python 2: uses unicode().)"""
        messages = common.load_file(info)

        # Title line in the title font
        title_font = pygame.font.SysFont(constants.font_title[0], constants.font_title[1])
        title = unicode(messages[0], 'utf-8')
        self.screen.blit(
            title_font.render(title, True, constants.font_title_color), (20, 20))

        # Body lines start below the title, in the default font
        body_font = pygame.font.SysFont(constants.font_default[0], constants.font_default[1])
        line_height = body_font.get_linesize()
        y = 35 + title_font.get_linesize()
        for message in messages[1:]:
            rendered = body_font.render(
                unicode(message, 'utf-8'), True, constants.font_default_color)
            self.screen.blit(rendered, (20, y))
            y += line_height
Example #13
0
def obtain_best_classifier_in_folder(
        directory: Path, seed: int) -> List[Tuple[Any, float, Path]]:
    """Find the best classifier for every ``.csv`` file in *directory*.

    :param directory: folder whose ``.csv`` files are evaluated.
    :param seed: random seed forwarded to ``get_best_classifier``.
    :return: list of (classifier, auc, file) tuples, one per csv file.

    Fix: the original only called ``pool.close()`` on the success path, so an
    exception while processing a file leaked the worker processes, and the
    pool was never ``join()``-ed; cleanup is now guaranteed via try/finally.
    """
    procs = get_config("INIT", "procs")
    if procs.isspace() or not procs.isnumeric():
        # unusable config value -> fall back to half the available cores
        procs = math.floor(multiprocessing.cpu_count() / 2)
    else:
        procs = int(procs)

    files = [x for x in directory.iterdir() if x.suffix == ".csv"]
    classifiers = create_classifiers()
    result = []
    pool = multiprocessing.Pool(procs)
    try:
        for file in files:
            df = clean_dataset(load_file(file))
            start = datetime.now()
            classifier, auc = get_best_classifier(df, classifiers, seed, pool)
            end = datetime.now()
            result.append((classifier, auc, file))
            print(
                f"Finished file {fg.blue}{file}{fg.rs}, took {format_time_difference(start.timestamp(), end.timestamp())}"
            )
    finally:
        # release worker processes even if a file fails
        pool.close()
        pool.join()
    return result
Example #14
0
def read_virtual_events(filename):
    """ output: [datetime, label] """
    rows = load_file(filename).split("\n")
    return [row.split(";") for row in rows if row != '']